snp基因型的轉換、與甲基化位點表達量meQTL分析、snp_target對繪圖

本文轉載自查看原文 2020-04-02 13:56 673

meQTL.pl

# 導入 -> 系統 package
$|=1;   
use warnings;
use strict;
use Encode qw/decode/;
use File::Spec;
use Getopt::Long;
use Statistics::R;
use Excel::Writer::XLSX;


# 定義 -> 常量
use constant SCRIPTDIR => (File::Spec->splitpath(File::Spec->rel2abs($0)))[1];
use constant PWD => $ENV{"PWD"};
my $scriptdir = SCRIPTDIR;
my $matrix_meQTL_r = "$scriptdir/matrix_meQTL.r";
my $boxplot_r = "$scriptdir/plot_boxplot.r";
my $Rscript = "/home/*/software/r/3.5.1/bin/Rscript";

# 檢測 -> 腳本輸入
my ($snp_genotype, $rs_ref_alt, $methyl_expression, $number, $output_path, $only, $skip, $list, $if_help);
GetOptions(
    "only=s"           => \$only,
    "skip=s"           => \$skip,
    "snp_genotype|g=s" => \$snp_genotype,
    "rs_ref_alt|r=s"   => \$rs_ref_alt,
    "methyl_expression|m=s" => \$methyl_expression,    
    "number|n=s"       => \$number,
    "output_path|o=s"  => \$output_path,
    "help|h"           => \$if_help,
    "list!"            => \$list
);

$number = 10 if (not defined $number); #默認繪制10個snp_target對；

 
if ($list) {
my $help =<<EOF;

1 snp基因型轉換
2 meQTL計算
3 繪圖
EOF
print qq{$help\n};
exit;    
}

die help() if(defined $if_help or (not defined $snp_genotype or not defined $rs_ref_alt or not defined $output_path));

my @run_steps = ();
if ($only) {
    foreach my $x (split /,/, $only) {
        push @run_steps, $x;
    }
} else {
    my @skips = split /,/, $skip if defined ($skip);
    foreach my $x (1..3) {
        next if $x ~~ @skips;
        push @run_steps, $x;
    }
}

####檢查$snp_genotype(基因分型文件)和$methyl_expression(甲基化位點表達量)兩個文件的樣本名稱和順序是否一致
check_samples($snp_genotype, $methyl_expression);

#################################################主程序##############################################

####################################################################
####step 1:snp_genotype基因型轉換
####################################################################
my $tmpdir = "$output_path/tmp"; 
mkdir $tmpdir if not -d $tmpdir;
my $snp_type = "$output_path/tmp/snp_type.txt";
if (1 ~~ @run_steps) {
    get_snp_type($snp_genotype, $rs_ref_alt, $snp_type);

    #####基因型轉換前、轉換后、突變位點 ==》xlsx表格輸出
    my $report_snp_genotype = "$output_path/snp_genotype.xlsx";
    my $workbook   = Excel::Writer::XLSX->new($report_snp_genotype);
    my %format = format_excel($workbook); #獲取Excel格式內容 
    ####sheet1:基因型轉換前
    my $txt1       = "$snp_genotype";
    my $worksheet1 = $workbook->add_worksheet("Original_SNP_Result");
    report_xlsx($txt1, $worksheet1, \%format);
    ####sheet2:基因型轉換后
    my $txt2       = "$snp_type";
    my $worksheet2 = $workbook->add_worksheet("Transformed_SNP_Result");
    report_xlsx($txt2, $worksheet2, \%format);
    ####sheet3:突變位點
    my $txt3       = "$rs_ref_alt";
    my $worksheet3 = $workbook->add_worksheet("Gene_Mutation_Site");
    report_xlsx($txt3, $worksheet3, \%format);
    $workbook->close();

}

####################################################################
####step 2:meQTL_cis and meQTL_trans 結果 ==》xlsx表格輸出
####################################################################
if (2 ~~ @run_steps) {
    system qq{$Rscript $matrix_meQTL_r -o $tmpdir/meQTL_output.txt  -m $methyl_expression  -s $tmpdir/snp_type.txt};
    system qq{sed -i "s/gene/target/" $tmpdir/meQTL_output.txt};
    system qq{sed -i "s/-/_/" $tmpdir/meQTL_output.txt};
    my $report_meQTL = "$output_path/meQTL_output.xlsx";
    my $workbook_meQTL = Excel::Writer::XLSX->new($report_meQTL);
    my %format_meQTL = format_excel($workbook_meQTL);

    ####sheet1:meQTL_output
    my $txt_meQTL = "$tmpdir/meQTL_output.txt";
    my $worksheet_meQTL = $workbook_meQTL->add_worksheet("meQTL_Output");
    report_xlsx($txt_meQTL, $worksheet_meQTL, \%format_meQTL);
    ####sheet2:readme
    my $txt_readme = "$scriptdir/readme.txt";
    ReadMe($workbook_meQTL,\%format_meQTL,"$txt_readme");
    $workbook_meQTL->close();
}

####################################################################
####step 3:對p值最小的前n個結果繪制boxplot圖，橫坐標是snp分型，縱坐標是表型
####################################################################
if (3 ~~ @run_steps) {
    my $meQTL_output = "$tmpdir/meQTL_output.txt";
    my $count = $number + 1;
    my $extract_meQTL_output = "$tmpdir/extract_meQTL_output.txt";
    system qq{head -n $count $meQTL_output >$extract_meQTL_output};
    #畫圖
    system qq{$Rscript $boxplot_r -g $snp_genotype -m $methyl_expression -i $extract_meQTL_output -o $output_path/boxplot.pdf};
}

##################################################子程序##############################
sub check_samples{
    my $snp_genotype = shift @_;
    my $methyl_expression = shift @_;
    
    open FILE, $snp_genotype ;
    my $head = <FILE>;
       $head =~ s/[\r\n]//g;
    my @array0 = split /\t/, $head, 2;
    my @array = split /\t/, $array0[1];

    open FILE2, $methyl_expression;
    my $head2 = <FILE2>;
       $head2 =~ s/[\r\n]//g;
    my @array1 = split /\t/, $head2, 2;
    my @array2 = split /\t/, $array1[1];
    
    die "\n[Error]: please check '$snp_genotype $methyl_expression' samples name and order！\n\n" if (not @array ~~ @array2);
    
    close FILE;
    close FILE2;
}

sub get_snp_type{
    my $snp_genotype = shift @_;
    my $rs_ref_alt = shift @_;
    my $snp_type = shift @_;
    
    open RS, $rs_ref_alt or die "The rs_ref_alt file doesn't exist!\n";
    my %hashRef;
    my %hashAlt;
    while (<RS>) {
        $_ =~ s/[\r\n]//g;
        next if not /^rs/;
        my @array = split /\t/;
        $hashRef{$array[0]} = $array[1]; #$hash{"rs"} = "ref";
        $hashAlt{$array[0]}{$array[1]} = $array[2]; #$hashAlt{"rs"}{"ref"} = "alt";
    }
    
    open GENOTYPE, $snp_genotype or die "The snp_genotype file doesn't exist!\n";
    open SAVE, ">$snp_type";
    my $head = <GENOTYPE>;
       $head =~ s/[\r\n]//g;
    print SAVE "$head\n";
    
    while (<GENOTYPE>) {
        $_ =~ s/[\r\n]//g;
        my @array = split /\t/;
        my $ref = $hashRef{$array[0]};
        my $alt = $hashAlt{$array[0]}{$ref};
        foreach my $col (1..$#array){
            $array[$col] = exists $array[$col] ? $array[$col] : "";            
            $array[$col] = 0 if ($array[$col] eq "$ref/$ref");
            $array[$col] = 1 if ($array[$col] eq "$ref/$alt" or ($array[$col] eq "$alt/$ref"));
            $array[$col] = 2 if ($array[$col] eq "$alt/$alt");
        }
        # 輸出
        print SAVE (join "\t", @array) . "\n";    
    }    
    close RS;
    close GENOTYPE;
    close SAVE;     
}

sub report_xlsx{
    my $txt       = shift;
    my $worksheet = shift;
    my $format      = shift;

    my $row = 0;
    open TXT, $txt or die "Can't open $txt!\n";
    while (<TXT>) {
        chomp;
        my @arr = split /\t/;
        if($row == 0){
            $worksheet->write_row( 0, 0, \@arr, $format->{"title"});
        }else{
            $worksheet->write_row( $row, 0, \@arr, $format->{"normal"});
        }
        $row++;
    }
    close TXT;
}

sub format_excel{
    my $workbook = shift;
    my %format = ();
    #第一行（標題設置:黃色、上下左右居中）
    $format{"title"} = $workbook->add_format();
    $format{"title"}->set_bg_color('yellow');
    $format{"title"}->set_align('center');
    $format{"title"}->set_align('vcenter');
    $format{"title"}->set_border();
    $format{"title"}->set_font("Times New Roman");
    #其它行（上下左右居中）
    $format{"normal"} = $workbook->add_format();
    $format{"normal"}->set_align('center');
    $format{"normal"}->set_align('vcenter');
    $format{"normal"}->set_border();
    $format{"normal"}->set_font("Times New Roman");

    return %format;
}

sub ReadMe{
    my ($workbook, $format, $file)=@_;
    my $sheet= $workbook -> add_worksheet("Read Me");
    $sheet->set_row(0, 50);
    $sheet->set_column('A:A', 35);
    $sheet->set_column('B:B', 25);
    $sheet->set_column('C:C', 110);
    my $row = 0;
    open FILE, $file;
    while (<FILE>){
        $_ = ~ s/\^/\n/g;
        my @split_line = split /\t/,$_;
        my $col = 0;
        foreach (@split_line)
        {
            my $text = decode("utf8", $_);
            if($row == 0 and $col >= 0){
                $sheet -> write($row, $col, $text, $format->{"title"});
            }else{
               $sheet -> write($row, $col, $text, $format->{"normal"});
            }
            $col++;
        }
        $row++;
    }
    close FILE;
}


sub help{
    my $info = " Program: meQTL_methylTarget's SNP type transportation, meQTL analysis and plot
 Version: 2020-02-28
 Contact: *** Usage: perl ".(File::Spec->splitpath(File::Spec->rel2abs($0)))[2]." [options] 


 Options:
         --snp_genotype/-g      基因分型文件,第一行為表頭，分別為snpid編號，樣本名1，樣本名2...每一行表示一個snp位點，每一列表示一個樣本的基因分型。注：snp_genotype與methyl_expression文件中樣本順序需保持一致。
         --rs_ref_alt/-r        SNP突變位點信息, 第一行為表頭，三列內容：分別為snpid編號,ref,alt
         --methyl_expression/-m 甲基化位點的表達量，第一行為表頭，分別為甲基化位點，樣本名1，樣本名1，樣本名2...每一行表示一個甲基化位點，每一列表示一個樣本的甲基化值。注：snp_genotype與methyl_expression文件中樣本順序需保持一致。
         --number/-n            繪制snp_target_boxplot關系對個數
         --output_path/-o       輸出文件夾
         --help/-h              查看幫助文檔
         --only                 only steps
         --skip                 skip steps
         --list                 分析列表
    \n";
    return $info;
}

matrix_meQTL.r

#!/home/*/software/r/3.5.1/bin/Rscript
library(docopt)
"Usage: MatrixEQTL_CIS.r -o <file>  -m <file>  -s <file> [-c <file>]
Options:
    -o, --output_file <file>        the eQTL result
    -m , --methyl_file <file>        the input methyltarget expression file
    -s , --SNV_file <file>            the input SNV type file
    -c , --cov_file <file>             the covariates file [default: xlsx]" -> doc

opts            <- docopt(doc, version = 'Program : EQTL analysis based on methyltarget and SNV data\n')
methyl_input    <- opts$methyl_file
SNV_input        <- opts$SNV_file
output_file_name <-opts$output_file
cov_input         <- opts$cov_file

#########
.libPaths("/home/*/software/r/3.5.1/lib64/R/library/")
library(MatrixEQTL)

useModel = modelLINEAR

# Genotype file name
SNP_file_name <- SNV_input

# Gene expression file name
expression_file_name <- methyl_input

# Covariates file name
covariates_file_name <- cov_input

# Output file name
output_file_name <- output_file_name

# Set to numeric() for identity
errorCovariance = numeric()

cisDist = 1e6

## Load genotype data
snps = SlicedData$new()
snps$fileDelimiter = "\t"      # the TAB character
snps$fileOmitCharacters = "NA" # denote missing values;
snps$fileSkipRows = 1          # one row of column labels
snps$fileSkipColumns = 1       # one column of row labels
snps$fileSliceSize = 2000      # read file in slices of 2,000 rows
snps$LoadFile(SNP_file_name)

## Load gene expression data
gene = SlicedData$new()
gene$fileDelimiter = "\t"      # the TAB character
gene$fileOmitCharacters = "NA" # denote missing values;
gene$fileSkipRows = 1          # one row of column labels
gene$fileSkipColumns = 1       # one column of row labels
gene$fileSliceSize = 2000      # read file in slices of 2,000 rows
gene$LoadFile(expression_file_name)

## Load covariates
cvrt = SlicedData$new()
if(file.exists(covariates_file_name)) {
    cvrt$fileDelimiter = "\t"      # the TAB character
    cvrt$fileOmitCharacters = "NA" # denote missing values;
    cvrt$fileSkipRows = 1          # one row of column labels
    cvrt$fileSkipColumns = 1       # one column of row labels
    if(length(covariates_file_name)>0) {
    cvrt$LoadFile(covariates_file_name)
    }
}

## Run the analysis
#snpspos = read.table(snps_location_file_name, header = TRUE, stringsAsFactors = FALSE)
#genepos = read.table(gene_location_file_name, header = TRUE, stringsAsFactors = FALSE)

me = Matrix_eQTL_main(
    snps = snps,
    gene = gene,
    cvrt = cvrt,
    output_file_name = output_file_name,
    pvOutputThreshold = 1,
    useModel = useModel,
    errorCovariance = errorCovariance,
    verbose = TRUE,
    cisDist = cisDist,
    pvalue.hist = TRUE,
    min.pv.by.genesnp = FALSE,
    noFDRsaveMemory = FALSE)

readme.txt

標題     說明
SNV位點    snp位點
target    甲基化片段位點
beta    線性回歸分析系數
t_stat    線性回歸分析t值
p_value    線性回歸分析p值（小於0.05，標注橘黃色） FDR 線性回歸分析修正后的p值

plot_boxplot.r

#!/home/*/software/r/3.5.1/bin/Rscript

.libPaths("/home/*/software/r/3.5.1/lib64/R/library/")
library(docopt)
"Usage: boxplot.r  -g <file> -m <file> -i <file>  -o <file> [--width <int> --y_name <string>]
Options:
   -g, --genotype <file>                   輸入文件，基因分型文件，第一行為表頭，分別為snpid編號，樣本名1，樣本名2...每一行表示一個snp位點，每一列表示一個樣本的基因分型。注：snp_genotype與methyl_expression文件中樣本順序需保持一致。
   -m, --methyl_expression <file>          輸入文件，甲基化位點的表達量，第一行為表頭，分別為甲基化位點，樣本名1，樣本名1，樣本名2...每一行表示一個甲基化位點，每一列表示一個樣本的甲基化值。注：snp_genotype與methyl_expression文件中樣本順序需保持一致。
   -i, --meQTL_result <file>               輸入文件，meQTL_output數據
   -o, --outdir <file>                     輸出路徑
   --y_name <string>                       y軸名稱 [default: methyl_expression]
   --width <int>                           pdf繪圖時，每一個分組的寬度 [default: 2.5]" -> doc

opts                     <- docopt(doc, version = '點圖繪制\n')
genotype                 <- opts$genotype
methyl_expression        <- opts$methyl_expression
meQTL_result             <- opts$meQTL_result
outdir                   <- opts$outdir
y_name                   <- opts$y_name
width                    <- opts$width

# 加載R包
library(ggpubr, quietly = TRUE)

#讀入snp分型數據
message("loading data : ", genotype)
data_snp_genotype <- read.table(genotype, sep='\t', header = F, check.names=F, stringsAsFactors=F)
data_snp_genotype <- t(data_snp_genotype) #轉置
data_snp_genotype <- data_snp_genotype[,-1] #刪除第一列樣本名稱
colnames(data_snp_genotype) <- data_snp_genotype[1,] #第一行是標題行


#讀入甲基化位點表達量數據
message("loading data : ", methyl_expression)
data_methyl_expression <- read.table(methyl_expression, sep='\t', header = F, check.names=F, stringsAsFactors=F)
data_methyl_expression <- t(data_methyl_expression) #轉置
data_methyl_expression <- data_methyl_expression[,-1] #刪除第一列樣本名稱
colnames(data_methyl_expression) <- data_methyl_expression[1,] #第一行是標題行


#讀入meQTL_output數據
message("loading data : ", meQTL_result)
data_meQTL_result <- read.table(meQTL_result, sep='\t', header = T, check.names=F, stringsAsFactors=F)
compare <- data_meQTL_result[,c(1,2,5)] #提取snp_target對、p_value


pdf(outdir, width = 3 * as.numeric(width), height = 7)
for(row in 1:nrow(compare))
{   
    snp = compare[row, 1]
    target = compare[row, 2]
    p_value = compare[row, 3]
    p_value <- format(p_value, scientific=TRUE, digit=3)

    data_plot <- cbind(data_snp_genotype[,snp], data_methyl_expression[,target]) #提取繪圖數據
    data_plot <- as.data.frame(data_plot) 
    data_plot <- data_plot[-1,] #刪除標題行，否則非數值的標題行在as.numeric()時報錯“強制改變過程中產生了NA”
    data_plot[,2] <- as.character(data_plot[,2]) #factor -> character
    data_plot[,2] <- as.numeric(data_plot[,2])   #character ->numeric
    colnames(data_plot) = c(snp, target)
    rownames(data_plot) <- NULL  
    data_plot <- data_plot[order(data_plot[,1],decreasing=T),] #根據genotype排序
    data_plot = data_plot[complete.cases(data_plot), ]  # 去掉缺失
    
    legend_P <- paste("p_value","=",p_value,sep = "")
    plot_title = colnames(data_plot)[2]

    colnames(data_plot)[2] = 'GENE'  # 變量不能以數字開頭，故替換名稱

    p <- ggboxplot(data_plot, x=snp, y='GENE', color = snp, 
         palette = "npg", #雜志nature的配色
         # palette = "aaas", #雜志Science的配色
         # palette = "jco", #按jco雜志配色方案
         add = c("jitter"),  # 增加點
         add.params = list(color = snp),
         outlier.size = -1, # 隱藏離群點，否則離群點會在圖上出現兩次（boxplot繪制離群點，jitter再繪制所有點，從而使離群點繪制兩次）
         )+ xlab(snp) + ylab(y_name) +  guides(fill = FALSE, color = FALSE) + theme(plot.title = element_text(hjust = 0.5)) + labs(title = plot_title, subtitle = legend_P)  # 去掉legend,標題居中
        print(p)    
}

dev.off()

免責聲明！

本站轉載的文章為個人學習借鑒使用，本站對版權不負任何法律責任。如果侵犯了您的隱私權益，請聯系本站郵箱yoyou2525@163.com刪除。

猜您在找 利用ONT測序檢測真核生物全基因組甲基化狀態甲基化問題錦集 DNA甲基化及其測量方法（轉）如何從vcf文件中批量提取一系列基因的SNP位點？使用 bcftools 進行基因型過濾（genotypes QC） DNA甲基化與癌症、泛癌早篩 | DNA methylation and pan-cancer NCBI上查看SNP位點在哪個基因座上（locus） eQTL | Expression quantitative trait loci | 表達數量性狀基因座 | QTL | 數量性狀位點 plink計算兩個SNP位點的連鎖不平衡值（LD） SNPEFF提取vcf指定的染色體和位置（position）, SNP位點