meQTL.pl
# 導入 -> 系統 package $|=1; use warnings; use strict; use Encode qw/decode/; use File::Spec; use Getopt::Long; use Statistics::R; use Excel::Writer::XLSX; # 定義 -> 常量 use constant SCRIPTDIR => (File::Spec->splitpath(File::Spec->rel2abs($0)))[1]; use constant PWD => $ENV{"PWD"}; my $scriptdir = SCRIPTDIR; my $matrix_meQTL_r = "$scriptdir/matrix_meQTL.r"; my $boxplot_r = "$scriptdir/plot_boxplot.r"; my $Rscript = "/home/*/software/r/3.5.1/bin/Rscript"; # 檢測 -> 腳本輸入 my ($snp_genotype, $rs_ref_alt, $methyl_expression, $number, $output_path, $only, $skip, $list, $if_help); GetOptions( "only=s" => \$only, "skip=s" => \$skip, "snp_genotype|g=s" => \$snp_genotype, "rs_ref_alt|r=s" => \$rs_ref_alt, "methyl_expression|m=s" => \$methyl_expression, "number|n=s" => \$number, "output_path|o=s" => \$output_path, "help|h" => \$if_help, "list!" => \$list ); $number = 10 if (not defined $number); #默認繪制10個snp_target對; if ($list) { my $help =<<EOF; 1 snp基因型轉換 2 meQTL計算 3 繪圖 EOF print qq{$help\n}; exit; } die help() if(defined $if_help or (not defined $snp_genotype or not defined $rs_ref_alt or not defined $output_path)); my @run_steps = (); if ($only) { foreach my $x (split /,/, $only) { push @run_steps, $x; } } else { my @skips = split /,/, $skip if defined ($skip); foreach my $x (1..3) { next if $x ~~ @skips; push @run_steps, $x; } } ####檢查$snp_genotype(基因分型文件)和$methyl_expression(甲基化位點表達量)兩個文件的樣本名稱和順序是否一致 check_samples($snp_genotype, $methyl_expression); #################################################主程序############################################## #################################################################### ####step 1:snp_genotype基因型轉換 #################################################################### my $tmpdir = "$output_path/tmp"; mkdir $tmpdir if not -d $tmpdir; my $snp_type = "$output_path/tmp/snp_type.txt"; if (1 ~~ @run_steps) { get_snp_type($snp_genotype, $rs_ref_alt, $snp_type); #####基因型轉換前、轉換后、突變位點 ==》xlsx表格輸出 my $report_snp_genotype = "$output_path/snp_genotype.xlsx"; my $workbook = Excel::Writer::XLSX->new($report_snp_genotype); my %format = format_excel($workbook); #獲取Excel格式內容 ####sheet1:基因型轉換前 my $txt1 = "$snp_genotype"; my $worksheet1 = $workbook->add_worksheet("Original_SNP_Result"); report_xlsx($txt1, $worksheet1, \%format); ####sheet2:基因型轉換后 my $txt2 = "$snp_type"; my $worksheet2 = $workbook->add_worksheet("Transformed_SNP_Result"); report_xlsx($txt2, $worksheet2, \%format); ####sheet3:突變位點 my $txt3 = "$rs_ref_alt"; my $worksheet3 = $workbook->add_worksheet("Gene_Mutation_Site"); report_xlsx($txt3, $worksheet3, \%format); $workbook->close(); } #################################################################### ####step 2:meQTL_cis and meQTL_trans 結果 ==》xlsx表格輸出 #################################################################### if (2 ~~ @run_steps) { system qq{$Rscript $matrix_meQTL_r -o $tmpdir/meQTL_output.txt -m $methyl_expression -s $tmpdir/snp_type.txt}; system qq{sed -i "s/gene/target/" $tmpdir/meQTL_output.txt}; system qq{sed -i "s/-/_/" $tmpdir/meQTL_output.txt}; my $report_meQTL = "$output_path/meQTL_output.xlsx"; my $workbook_meQTL = Excel::Writer::XLSX->new($report_meQTL); my %format_meQTL = format_excel($workbook_meQTL); ####sheet1:meQTL_output my $txt_meQTL = "$tmpdir/meQTL_output.txt"; my $worksheet_meQTL = $workbook_meQTL->add_worksheet("meQTL_Output"); report_xlsx($txt_meQTL, $worksheet_meQTL, \%format_meQTL); ####sheet2:readme my $txt_readme = "$scriptdir/readme.txt"; ReadMe($workbook_meQTL,\%format_meQTL,"$txt_readme"); $workbook_meQTL->close(); } #################################################################### ####step 3:對p值最小的前n個結果繪制boxplot圖,橫坐標是snp分型,縱坐標是表型 #################################################################### if (3 ~~ @run_steps) { my $meQTL_output = "$tmpdir/meQTL_output.txt"; my $count = $number + 1; my $extract_meQTL_output = "$tmpdir/extract_meQTL_output.txt"; system qq{head -n $count $meQTL_output >$extract_meQTL_output}; #畫圖 system qq{$Rscript $boxplot_r -g $snp_genotype -m $methyl_expression -i $extract_meQTL_output -o $output_path/boxplot.pdf}; } ##################################################子程序############################## sub check_samples{ my $snp_genotype = shift @_; my $methyl_expression = shift @_; open FILE, $snp_genotype ; my $head = <FILE>; $head =~ s/[\r\n]//g; my @array0 = split /\t/, $head, 2; my @array = split /\t/, $array0[1]; open FILE2, $methyl_expression; my $head2 = <FILE2>; $head2 =~ s/[\r\n]//g; my @array1 = split /\t/, $head2, 2; my @array2 = split /\t/, $array1[1]; die "\n[Error]: please check '$snp_genotype $methyl_expression' samples name and order!\n\n" if (not @array ~~ @array2); close FILE; close FILE2; } sub get_snp_type{ my $snp_genotype = shift @_; my $rs_ref_alt = shift @_; my $snp_type = shift @_; open RS, $rs_ref_alt or die "The rs_ref_alt file doesn't exist!\n"; my %hashRef; my %hashAlt; while (<RS>) { $_ =~ s/[\r\n]//g; next if not /^rs/; my @array = split /\t/; $hashRef{$array[0]} = $array[1]; #$hash{"rs"} = "ref"; $hashAlt{$array[0]}{$array[1]} = $array[2]; #$hashAlt{"rs"}{"ref"} = "alt"; } open GENOTYPE, $snp_genotype or die "The snp_genotype file doesn't exist!\n"; open SAVE, ">$snp_type"; my $head = <GENOTYPE>; $head =~ s/[\r\n]//g; print SAVE "$head\n"; while (<GENOTYPE>) { $_ =~ s/[\r\n]//g; my @array = split /\t/; my $ref = $hashRef{$array[0]}; my $alt = $hashAlt{$array[0]}{$ref}; foreach my $col (1..$#array){ $array[$col] = exists $array[$col] ? $array[$col] : ""; $array[$col] = 0 if ($array[$col] eq "$ref/$ref"); $array[$col] = 1 if ($array[$col] eq "$ref/$alt" or ($array[$col] eq "$alt/$ref")); $array[$col] = 2 if ($array[$col] eq "$alt/$alt"); } # 輸出 print SAVE (join "\t", @array) . "\n"; } close RS; close GENOTYPE; close SAVE; } sub report_xlsx{ my $txt = shift; my $worksheet = shift; my $format = shift; my $row = 0; open TXT, $txt or die "Can't open $txt!\n"; while (<TXT>) { chomp; my @arr = split /\t/; if($row == 0){ $worksheet->write_row( 0, 0, \@arr, $format->{"title"}); }else{ $worksheet->write_row( $row, 0, \@arr, $format->{"normal"}); } $row++; } close TXT; } sub format_excel{ my $workbook = shift; my %format = (); #第一行(標題設置:黃色、上下左右居中) $format{"title"} = $workbook->add_format(); $format{"title"}->set_bg_color('yellow'); $format{"title"}->set_align('center'); $format{"title"}->set_align('vcenter'); $format{"title"}->set_border(); $format{"title"}->set_font("Times New Roman"); #其它行(上下左右居中) $format{"normal"} = $workbook->add_format(); $format{"normal"}->set_align('center'); $format{"normal"}->set_align('vcenter'); $format{"normal"}->set_border(); $format{"normal"}->set_font("Times New Roman"); return %format; } sub ReadMe{ my ($workbook, $format, $file)=@_; my $sheet= $workbook -> add_worksheet("Read Me"); $sheet->set_row(0, 50); $sheet->set_column('A:A', 35); $sheet->set_column('B:B', 25); $sheet->set_column('C:C', 110); my $row = 0; open FILE, $file; while (<FILE>){ $_ = ~ s/\^/\n/g; my @split_line = split /\t/,$_; my $col = 0; foreach (@split_line) { my $text = decode("utf8", $_); if($row == 0 and $col >= 0){ $sheet -> write($row, $col, $text, $format->{"title"}); }else{ $sheet -> write($row, $col, $text, $format->{"normal"}); } $col++; } $row++; } close FILE; } sub help{ my $info = " Program: meQTL_methylTarget's SNP type transportation, meQTL analysis and plot Version: 2020-02-28 Contact: *** Usage: perl ".(File::Spec->splitpath(File::Spec->rel2abs($0)))[2]." [options] Options: --snp_genotype/-g 基因分型文件,第一行為表頭,分別為snpid編號,樣本名1,樣本名2...每一行表示一個snp位點,每一列表示一個樣本的基因分型。注:snp_genotype與methyl_expression文件中樣本順序需保持一致。 --rs_ref_alt/-r SNP突變位點信息, 第一行為表頭,三列內容:分別為snpid編號,ref,alt --methyl_expression/-m 甲基化位點的表達量,第一行為表頭,分別為甲基化位點,樣本名1,樣本名1,樣本名2...每一行表示一個甲基化位點,每一列表示一個樣本的甲基化值。注:snp_genotype與methyl_expression文件中樣本順序需保持一致。 --number/-n 繪制snp_target_boxplot關系對個數 --output_path/-o 輸出文件夾 --help/-h 查看幫助文檔 --only only steps --skip skip steps --list 分析列表 \n"; return $info; }
matrix_meQTL.r
#!/home/*/software/r/3.5.1/bin/Rscript library(docopt) "Usage: MatrixEQTL_CIS.r -o <file> -m <file> -s <file> [-c <file>] Options: -o, --output_file <file> the eQTL result -m , --methyl_file <file> the input methyltarget expression file -s , --SNV_file <file> the input SNV type file -c , --cov_file <file> the covariates file [default: xlsx]" -> doc opts <- docopt(doc, version = 'Program : EQTL analysis based on methyltarget and SNV data\n') methyl_input <- opts$methyl_file SNV_input <- opts$SNV_file output_file_name <-opts$output_file cov_input <- opts$cov_file ######### .libPaths("/home/*/software/r/3.5.1/lib64/R/library/") library(MatrixEQTL) useModel = modelLINEAR # Genotype file name SNP_file_name <- SNV_input # Gene expression file name expression_file_name <- methyl_input # Covariates file name covariates_file_name <- cov_input # Output file name output_file_name <- output_file_name # Set to numeric() for identity errorCovariance = numeric() cisDist = 1e6 ## Load genotype data snps = SlicedData$new() snps$fileDelimiter = "\t" # the TAB character snps$fileOmitCharacters = "NA" # denote missing values; snps$fileSkipRows = 1 # one row of column labels snps$fileSkipColumns = 1 # one column of row labels snps$fileSliceSize = 2000 # read file in slices of 2,000 rows snps$LoadFile(SNP_file_name) ## Load gene expression data gene = SlicedData$new() gene$fileDelimiter = "\t" # the TAB character gene$fileOmitCharacters = "NA" # denote missing values; gene$fileSkipRows = 1 # one row of column labels gene$fileSkipColumns = 1 # one column of row labels gene$fileSliceSize = 2000 # read file in slices of 2,000 rows gene$LoadFile(expression_file_name) ## Load covariates cvrt = SlicedData$new() if(file.exists(covariates_file_name)) { cvrt$fileDelimiter = "\t" # the TAB character cvrt$fileOmitCharacters = "NA" # denote missing values; cvrt$fileSkipRows = 1 # one row of column labels cvrt$fileSkipColumns = 1 # one column of row labels if(length(covariates_file_name)>0) { cvrt$LoadFile(covariates_file_name) } } ## Run the analysis #snpspos = read.table(snps_location_file_name, header = TRUE, stringsAsFactors = FALSE) #genepos = read.table(gene_location_file_name, header = TRUE, stringsAsFactors = FALSE) me = Matrix_eQTL_main( snps = snps, gene = gene, cvrt = cvrt, output_file_name = output_file_name, pvOutputThreshold = 1, useModel = useModel, errorCovariance = errorCovariance, verbose = TRUE, cisDist = cisDist, pvalue.hist = TRUE, min.pv.by.genesnp = FALSE, noFDRsaveMemory = FALSE)
readme.txt
標題 說明
SNV位點 snp位點
target 甲基化片段位點
beta 線性回歸分析系數
t_stat 線性回歸分析t值
p_value 線性回歸分析p值(小於0.05,標注橘黃色) FDR 線性回歸分析修正后的p值
plot_boxplot.r
#!/home/*/software/r/3.5.1/bin/Rscript .libPaths("/home/*/software/r/3.5.1/lib64/R/library/") library(docopt) "Usage: boxplot.r -g <file> -m <file> -i <file> -o <file> [--width <int> --y_name <string>] Options: -g, --genotype <file> 輸入文件,基因分型文件,第一行為表頭,分別為snpid編號,樣本名1,樣本名2...每一行表示一個snp位點,每一列表示一個樣本的基因分型。注:snp_genotype與methyl_expression文件中樣本順序需保持一致。 -m, --methyl_expression <file> 輸入文件,甲基化位點的表達量,第一行為表頭,分別為甲基化位點,樣本名1,樣本名1,樣本名2...每一行表示一個甲基化位點,每一列表示一個樣本的甲基化值。注:snp_genotype與methyl_expression文件中樣本順序需保持一致。 -i, --meQTL_result <file> 輸入文件,meQTL_output數據 -o, --outdir <file> 輸出路徑 --y_name <string> y軸名稱 [default: methyl_expression] --width <int> pdf繪圖時,每一個分組的寬度 [default: 2.5]" -> doc opts <- docopt(doc, version = '點圖繪制\n') genotype <- opts$genotype methyl_expression <- opts$methyl_expression meQTL_result <- opts$meQTL_result outdir <- opts$outdir y_name <- opts$y_name width <- opts$width # 加載R包 library(ggpubr, quietly = TRUE) #讀入snp分型數據 message("loading data : ", genotype) data_snp_genotype <- read.table(genotype, sep='\t', header = F, check.names=F, stringsAsFactors=F) data_snp_genotype <- t(data_snp_genotype) #轉置 data_snp_genotype <- data_snp_genotype[,-1] #刪除第一列樣本名稱 colnames(data_snp_genotype) <- data_snp_genotype[1,] #第一行是標題行 #讀入甲基化位點表達量數據 message("loading data : ", methyl_expression) data_methyl_expression <- read.table(methyl_expression, sep='\t', header = F, check.names=F, stringsAsFactors=F) data_methyl_expression <- t(data_methyl_expression) #轉置 data_methyl_expression <- data_methyl_expression[,-1] #刪除第一列樣本名稱 colnames(data_methyl_expression) <- data_methyl_expression[1,] #第一行是標題行 #讀入meQTL_output數據 message("loading data : ", meQTL_result) data_meQTL_result <- read.table(meQTL_result, sep='\t', header = T, check.names=F, stringsAsFactors=F) compare <- data_meQTL_result[,c(1,2,5)] #提取snp_target對、p_value pdf(outdir, width = 3 * as.numeric(width), height = 7) for(row in 1:nrow(compare)) { snp = compare[row, 1] target = compare[row, 2] p_value = compare[row, 3] p_value <- format(p_value, scientific=TRUE, digit=3) data_plot <- cbind(data_snp_genotype[,snp], data_methyl_expression[,target]) #提取繪圖數據 data_plot <- as.data.frame(data_plot) data_plot <- data_plot[-1,] #刪除標題行,否則非數值的標題行在as.numeric()時報錯“強制改變過程中產生了NA” data_plot[,2] <- as.character(data_plot[,2]) #factor -> character data_plot[,2] <- as.numeric(data_plot[,2]) #character ->numeric colnames(data_plot) = c(snp, target) rownames(data_plot) <- NULL data_plot <- data_plot[order(data_plot[,1],decreasing=T),] #根據genotype排序 data_plot = data_plot[complete.cases(data_plot), ] # 去掉缺失 legend_P <- paste("p_value","=",p_value,sep = "") plot_title = colnames(data_plot)[2] colnames(data_plot)[2] = 'GENE' # 變量不能以數字開頭,故替換名稱 p <- ggboxplot(data_plot, x=snp, y='GENE', color = snp, palette = "npg", #雜志nature的配色 # palette = "aaas", #雜志Science的配色 # palette = "jco", #按jco雜志配色方案 add = c("jitter"), # 增加點 add.params = list(color = snp), outlier.size = -1, # 隱藏離群點,否則離群點會在圖上出現兩次(boxplot繪制離群點,jitter再繪制所有點,從而使離群點繪制兩次) )+ xlab(snp) + ylab(y_name) + guides(fill = FALSE, color = FALSE) + theme(plot.title = element_text(hjust = 0.5)) + labs(title = plot_title, subtitle = legend_P) # 去掉legend,標題居中 print(p) } dev.off()