meQTL.pl
# 导入 -> 系统 package $|=1; use warnings; use strict; use Encode qw/decode/; use File::Spec; use Getopt::Long; use Statistics::R; use Excel::Writer::XLSX; # 定义 -> 常量 use constant SCRIPTDIR => (File::Spec->splitpath(File::Spec->rel2abs($0)))[1]; use constant PWD => $ENV{"PWD"}; my $scriptdir = SCRIPTDIR; my $matrix_meQTL_r = "$scriptdir/matrix_meQTL.r"; my $boxplot_r = "$scriptdir/plot_boxplot.r"; my $Rscript = "/home/*/software/r/3.5.1/bin/Rscript"; # 检测 -> 脚本输入 my ($snp_genotype, $rs_ref_alt, $methyl_expression, $number, $output_path, $only, $skip, $list, $if_help); GetOptions( "only=s" => \$only, "skip=s" => \$skip, "snp_genotype|g=s" => \$snp_genotype, "rs_ref_alt|r=s" => \$rs_ref_alt, "methyl_expression|m=s" => \$methyl_expression, "number|n=s" => \$number, "output_path|o=s" => \$output_path, "help|h" => \$if_help, "list!" => \$list ); $number = 10 if (not defined $number); #默认绘制10个snp_target对; if ($list) { my $help =<<EOF; 1 snp基因型转换 2 meQTL计算 3 绘图 EOF print qq{$help\n}; exit; } die help() if(defined $if_help or (not defined $snp_genotype or not defined $rs_ref_alt or not defined $output_path)); my @run_steps = (); if ($only) { foreach my $x (split /,/, $only) { push @run_steps, $x; } } else { my @skips = split /,/, $skip if defined ($skip); foreach my $x (1..3) { next if $x ~~ @skips; push @run_steps, $x; } } ####检查$snp_genotype(基因分型文件)和$methyl_expression(甲基化位点表达量)两个文件的样本名称和顺序是否一致 check_samples($snp_genotype, $methyl_expression); #################################################主程序############################################## #################################################################### ####step 1:snp_genotype基因型转换 #################################################################### my $tmpdir = "$output_path/tmp"; mkdir $tmpdir if not -d $tmpdir; my $snp_type = "$output_path/tmp/snp_type.txt"; if (1 ~~ @run_steps) { get_snp_type($snp_genotype, $rs_ref_alt, $snp_type); #####基因型转换前、转换后、突变位点 ==》xlsx表格输出 my $report_snp_genotype = "$output_path/snp_genotype.xlsx"; my $workbook = Excel::Writer::XLSX->new($report_snp_genotype); my %format = format_excel($workbook); #获取Excel格式内容 ####sheet1:基因型转换前 my $txt1 = "$snp_genotype"; my $worksheet1 = $workbook->add_worksheet("Original_SNP_Result"); report_xlsx($txt1, $worksheet1, \%format); ####sheet2:基因型转换后 my $txt2 = "$snp_type"; my $worksheet2 = $workbook->add_worksheet("Transformed_SNP_Result"); report_xlsx($txt2, $worksheet2, \%format); ####sheet3:突变位点 my $txt3 = "$rs_ref_alt"; my $worksheet3 = $workbook->add_worksheet("Gene_Mutation_Site"); report_xlsx($txt3, $worksheet3, \%format); $workbook->close(); } #################################################################### ####step 2:meQTL_cis and meQTL_trans 结果 ==》xlsx表格输出 #################################################################### if (2 ~~ @run_steps) { system qq{$Rscript $matrix_meQTL_r -o $tmpdir/meQTL_output.txt -m $methyl_expression -s $tmpdir/snp_type.txt}; system qq{sed -i "s/gene/target/" $tmpdir/meQTL_output.txt}; system qq{sed -i "s/-/_/" $tmpdir/meQTL_output.txt}; my $report_meQTL = "$output_path/meQTL_output.xlsx"; my $workbook_meQTL = Excel::Writer::XLSX->new($report_meQTL); my %format_meQTL = format_excel($workbook_meQTL); ####sheet1:meQTL_output my $txt_meQTL = "$tmpdir/meQTL_output.txt"; my $worksheet_meQTL = $workbook_meQTL->add_worksheet("meQTL_Output"); report_xlsx($txt_meQTL, $worksheet_meQTL, \%format_meQTL); ####sheet2:readme my $txt_readme = "$scriptdir/readme.txt"; ReadMe($workbook_meQTL,\%format_meQTL,"$txt_readme"); $workbook_meQTL->close(); } #################################################################### ####step 3:对p值最小的前n个结果绘制boxplot图,横坐标是snp分型,纵坐标是表型 #################################################################### if (3 ~~ @run_steps) { my $meQTL_output = "$tmpdir/meQTL_output.txt"; my $count = $number + 1; my $extract_meQTL_output = "$tmpdir/extract_meQTL_output.txt"; system qq{head -n $count $meQTL_output >$extract_meQTL_output}; #画图 system qq{$Rscript $boxplot_r -g $snp_genotype -m $methyl_expression -i $extract_meQTL_output -o $output_path/boxplot.pdf}; } ##################################################子程序############################## sub check_samples{ my $snp_genotype = shift @_; my $methyl_expression = shift @_; open FILE, $snp_genotype ; my $head = <FILE>; $head =~ s/[\r\n]//g; my @array0 = split /\t/, $head, 2; my @array = split /\t/, $array0[1]; open FILE2, $methyl_expression; my $head2 = <FILE2>; $head2 =~ s/[\r\n]//g; my @array1 = split /\t/, $head2, 2; my @array2 = split /\t/, $array1[1]; die "\n[Error]: please check '$snp_genotype $methyl_expression' samples name and order!\n\n" if (not @array ~~ @array2); close FILE; close FILE2; } sub get_snp_type{ my $snp_genotype = shift @_; my $rs_ref_alt = shift @_; my $snp_type = shift @_; open RS, $rs_ref_alt or die "The rs_ref_alt file doesn't exist!\n"; my %hashRef; my %hashAlt; while (<RS>) { $_ =~ s/[\r\n]//g; next if not /^rs/; my @array = split /\t/; $hashRef{$array[0]} = $array[1]; #$hash{"rs"} = "ref"; $hashAlt{$array[0]}{$array[1]} = $array[2]; #$hashAlt{"rs"}{"ref"} = "alt"; } open GENOTYPE, $snp_genotype or die "The snp_genotype file doesn't exist!\n"; open SAVE, ">$snp_type"; my $head = <GENOTYPE>; $head =~ s/[\r\n]//g; print SAVE "$head\n"; while (<GENOTYPE>) { $_ =~ s/[\r\n]//g; my @array = split /\t/; my $ref = $hashRef{$array[0]}; my $alt = $hashAlt{$array[0]}{$ref}; foreach my $col (1..$#array){ $array[$col] = exists $array[$col] ? $array[$col] : ""; $array[$col] = 0 if ($array[$col] eq "$ref/$ref"); $array[$col] = 1 if ($array[$col] eq "$ref/$alt" or ($array[$col] eq "$alt/$ref")); $array[$col] = 2 if ($array[$col] eq "$alt/$alt"); } # 输出 print SAVE (join "\t", @array) . "\n"; } close RS; close GENOTYPE; close SAVE; } sub report_xlsx{ my $txt = shift; my $worksheet = shift; my $format = shift; my $row = 0; open TXT, $txt or die "Can't open $txt!\n"; while (<TXT>) { chomp; my @arr = split /\t/; if($row == 0){ $worksheet->write_row( 0, 0, \@arr, $format->{"title"}); }else{ $worksheet->write_row( $row, 0, \@arr, $format->{"normal"}); } $row++; } close TXT; } sub format_excel{ my $workbook = shift; my %format = (); #第一行(标题设置:黄色、上下左右居中) $format{"title"} = $workbook->add_format(); $format{"title"}->set_bg_color('yellow'); $format{"title"}->set_align('center'); $format{"title"}->set_align('vcenter'); $format{"title"}->set_border(); $format{"title"}->set_font("Times New Roman"); #其它行(上下左右居中) $format{"normal"} = $workbook->add_format(); $format{"normal"}->set_align('center'); $format{"normal"}->set_align('vcenter'); $format{"normal"}->set_border(); $format{"normal"}->set_font("Times New Roman"); return %format; } sub ReadMe{ my ($workbook, $format, $file)=@_; my $sheet= $workbook -> add_worksheet("Read Me"); $sheet->set_row(0, 50); $sheet->set_column('A:A', 35); $sheet->set_column('B:B', 25); $sheet->set_column('C:C', 110); my $row = 0; open FILE, $file; while (<FILE>){ $_ = ~ s/\^/\n/g; my @split_line = split /\t/,$_; my $col = 0; foreach (@split_line) { my $text = decode("utf8", $_); if($row == 0 and $col >= 0){ $sheet -> write($row, $col, $text, $format->{"title"}); }else{ $sheet -> write($row, $col, $text, $format->{"normal"}); } $col++; } $row++; } close FILE; } sub help{ my $info = " Program: meQTL_methylTarget's SNP type transportation, meQTL analysis and plot Version: 2020-02-28 Contact: *** Usage: perl ".(File::Spec->splitpath(File::Spec->rel2abs($0)))[2]." [options] Options: --snp_genotype/-g 基因分型文件,第一行为表头,分别为snpid编号,样本名1,样本名2...每一行表示一个snp位点,每一列表示一个样本的基因分型。注:snp_genotype与methyl_expression文件中样本顺序需保持一致。 --rs_ref_alt/-r SNP突变位点信息, 第一行为表头,三列内容:分别为snpid编号,ref,alt --methyl_expression/-m 甲基化位点的表达量,第一行为表头,分别为甲基化位点,样本名1,样本名1,样本名2...每一行表示一个甲基化位点,每一列表示一个样本的甲基化值。注:snp_genotype与methyl_expression文件中样本顺序需保持一致。 --number/-n 绘制snp_target_boxplot关系对个数 --output_path/-o 输出文件夹 --help/-h 查看帮助文档 --only only steps --skip skip steps --list 分析列表 \n"; return $info; }
matrix_meQTL.r
#!/home/*/software/r/3.5.1/bin/Rscript library(docopt) "Usage: MatrixEQTL_CIS.r -o <file> -m <file> -s <file> [-c <file>] Options: -o, --output_file <file> the eQTL result -m , --methyl_file <file> the input methyltarget expression file -s , --SNV_file <file> the input SNV type file -c , --cov_file <file> the covariates file [default: xlsx]" -> doc opts <- docopt(doc, version = 'Program : EQTL analysis based on methyltarget and SNV data\n') methyl_input <- opts$methyl_file SNV_input <- opts$SNV_file output_file_name <-opts$output_file cov_input <- opts$cov_file ######### .libPaths("/home/*/software/r/3.5.1/lib64/R/library/") library(MatrixEQTL) useModel = modelLINEAR # Genotype file name SNP_file_name <- SNV_input # Gene expression file name expression_file_name <- methyl_input # Covariates file name covariates_file_name <- cov_input # Output file name output_file_name <- output_file_name # Set to numeric() for identity errorCovariance = numeric() cisDist = 1e6 ## Load genotype data snps = SlicedData$new() snps$fileDelimiter = "\t" # the TAB character snps$fileOmitCharacters = "NA" # denote missing values; snps$fileSkipRows = 1 # one row of column labels snps$fileSkipColumns = 1 # one column of row labels snps$fileSliceSize = 2000 # read file in slices of 2,000 rows snps$LoadFile(SNP_file_name) ## Load gene expression data gene = SlicedData$new() gene$fileDelimiter = "\t" # the TAB character gene$fileOmitCharacters = "NA" # denote missing values; gene$fileSkipRows = 1 # one row of column labels gene$fileSkipColumns = 1 # one column of row labels gene$fileSliceSize = 2000 # read file in slices of 2,000 rows gene$LoadFile(expression_file_name) ## Load covariates cvrt = SlicedData$new() if(file.exists(covariates_file_name)) { cvrt$fileDelimiter = "\t" # the TAB character cvrt$fileOmitCharacters = "NA" # denote missing values; cvrt$fileSkipRows = 1 # one row of column labels cvrt$fileSkipColumns = 1 # one column of row labels if(length(covariates_file_name)>0) { cvrt$LoadFile(covariates_file_name) } } ## Run the analysis #snpspos = read.table(snps_location_file_name, header = TRUE, stringsAsFactors = FALSE) #genepos = read.table(gene_location_file_name, header = TRUE, stringsAsFactors = FALSE) me = Matrix_eQTL_main( snps = snps, gene = gene, cvrt = cvrt, output_file_name = output_file_name, pvOutputThreshold = 1, useModel = useModel, errorCovariance = errorCovariance, verbose = TRUE, cisDist = cisDist, pvalue.hist = TRUE, min.pv.by.genesnp = FALSE, noFDRsaveMemory = FALSE)
readme.txt
标题 说明
SNV位点 snp位点
target 甲基化片段位点
beta 线性回归分析系数
t_stat 线性回归分析t值
p_value 线性回归分析p值(小于0.05,标注橘黄色) FDR 线性回归分析修正后的p值
plot_boxplot.r
#!/home/*/software/r/3.5.1/bin/Rscript .libPaths("/home/*/software/r/3.5.1/lib64/R/library/") library(docopt) "Usage: boxplot.r -g <file> -m <file> -i <file> -o <file> [--width <int> --y_name <string>] Options: -g, --genotype <file> 输入文件,基因分型文件,第一行为表头,分别为snpid编号,样本名1,样本名2...每一行表示一个snp位点,每一列表示一个样本的基因分型。注:snp_genotype与methyl_expression文件中样本顺序需保持一致。 -m, --methyl_expression <file> 输入文件,甲基化位点的表达量,第一行为表头,分别为甲基化位点,样本名1,样本名1,样本名2...每一行表示一个甲基化位点,每一列表示一个样本的甲基化值。注:snp_genotype与methyl_expression文件中样本顺序需保持一致。 -i, --meQTL_result <file> 输入文件,meQTL_output数据 -o, --outdir <file> 输出路径 --y_name <string> y轴名称 [default: methyl_expression] --width <int> pdf绘图时,每一个分组的宽度 [default: 2.5]" -> doc opts <- docopt(doc, version = '点图绘制\n') genotype <- opts$genotype methyl_expression <- opts$methyl_expression meQTL_result <- opts$meQTL_result outdir <- opts$outdir y_name <- opts$y_name width <- opts$width # 加载R包 library(ggpubr, quietly = TRUE) #读入snp分型数据 message("loading data : ", genotype) data_snp_genotype <- read.table(genotype, sep='\t', header = F, check.names=F, stringsAsFactors=F) data_snp_genotype <- t(data_snp_genotype) #转置 data_snp_genotype <- data_snp_genotype[,-1] #删除第一列样本名称 colnames(data_snp_genotype) <- data_snp_genotype[1,] #第一行是标题行 #读入甲基化位点表达量数据 message("loading data : ", methyl_expression) data_methyl_expression <- read.table(methyl_expression, sep='\t', header = F, check.names=F, stringsAsFactors=F) data_methyl_expression <- t(data_methyl_expression) #转置 data_methyl_expression <- data_methyl_expression[,-1] #删除第一列样本名称 colnames(data_methyl_expression) <- data_methyl_expression[1,] #第一行是标题行 #读入meQTL_output数据 message("loading data : ", meQTL_result) data_meQTL_result <- read.table(meQTL_result, sep='\t', header = T, check.names=F, stringsAsFactors=F) compare <- data_meQTL_result[,c(1,2,5)] #提取snp_target对、p_value pdf(outdir, width = 3 * as.numeric(width), height = 7) for(row in 1:nrow(compare)) { snp = compare[row, 1] target = compare[row, 2] p_value = compare[row, 3] p_value <- format(p_value, scientific=TRUE, digit=3) data_plot <- cbind(data_snp_genotype[,snp], data_methyl_expression[,target]) #提取绘图数据 data_plot <- as.data.frame(data_plot) data_plot <- data_plot[-1,] #删除标题行,否则非数值的标题行在as.numeric()时报错“强制改变过程中产生了NA” data_plot[,2] <- as.character(data_plot[,2]) #factor -> character data_plot[,2] <- as.numeric(data_plot[,2]) #character ->numeric colnames(data_plot) = c(snp, target) rownames(data_plot) <- NULL data_plot <- data_plot[order(data_plot[,1],decreasing=T),] #根据genotype排序 data_plot = data_plot[complete.cases(data_plot), ] # 去掉缺失 legend_P <- paste("p_value","=",p_value,sep = "") plot_title = colnames(data_plot)[2] colnames(data_plot)[2] = 'GENE' # 变量不能以数字开头,故替换名称 p <- ggboxplot(data_plot, x=snp, y='GENE', color = snp, palette = "npg", #杂志nature的配色 # palette = "aaas", #杂志Science的配色 # palette = "jco", #按jco杂志配色方案 add = c("jitter"), # 增加点 add.params = list(color = snp), outlier.size = -1, # 隐藏离群点,否则离群点会在图上出现两次(boxplot绘制离群点,jitter再绘制所有点,从而使离群点绘制两次) )+ xlab(snp) + ylab(y_name) + guides(fill = FALSE, color = FALSE) + theme(plot.title = element_text(hjust = 0.5)) + labs(title = plot_title, subtitle = legend_P) # 去掉legend,标题居中 print(p) } dev.off()