1)介紹
我們用SRAdb library來對SRA數據進行處理。 SRAdb 可以更方便更快的接入 metadata associated with submission, 包括study, sample, experiment, and run. SRAdb 包通過 NCBI SRA數據庫中的metadata信息 作用. 首先dbConnect ()接入 R system 中的local database systems, 所有的搜索就在本地文件的基礎上進行。
the queries we tried with the dbGetQuery function are passed in the form of SQL queries, which is a Select From Where framework. This part actually requires the
RSQLite package, which is installed when installing the SRAdb package, as a dependency. The getSRA function can actually do a full text search in the SRA data again via RSQLite and fetch the data in the selected fields for the query.
2)下載
source("http://bioconductor.org/biocLite.R") biocLite("SRAdb") library(SRAdb)
3)了解SRA database
#sqlFile <- getSRAdbFile() #在線獲取,太大了,不要這樣做。
sraCon <- dbConnect(SQLite(), 'SRAmetadb.sqlite') #於是我下載了這個文件,壓縮文件2個G(解壓后36個G),然后讀取了這個文件,相當於下載nr庫到本地。 sraTables <- dbListTables(sraCon) # investigate the content of the database dbListFields(sraCon,"study") #########關鍵詞keyword: embryo myHit <- dbGetQuery(sraCon, paste("select study_accession,study_title from study where","study_description like'%embryo'",sep=" ")) # myHit <- getSRA( search_terms = "brain", out_types = c('run','study'), sraCon) #free text收索 myHit <- getSRA( search_terms ='Alzheimers OR "EPILEPSY"', out_types = c('sample'), sraCon) #邏輯收索
4)從SRA database下載數據
myHit <- getSRA( search_terms ='ALZHEIMERS OR "EPILEPSY"', out_types = c('sample'), sraCon) #關鍵詞收索 conversion <- sraConvert( c('ERS354366','SRS266589'), sra_con = sraCon) #選擇其中的2個,查看信息 conversion rs <- getSRAinfo( c("SRX100465"), sraCon, sraType = "sra") #選擇其中一個看相應的信息,會顯示出ftp地址 getSRAfile( c("SRR351672", "SRR351673"), sraCon, fileType='fastq') ##下載感興趣的run
5)下載完fq文件后,用R進行讀取
install.packages("R.utils") library(R.utils) #下載數據用 download.file(url="ftp://ftp.ddbj.nig.ac.jp/ddbj_database/dra/fastq/SRA000/SRA000241/SRX000122/SRR000648.fastq.bz2", destfile = "SRR000648.fastq.bz2") bunzip2(list.files(pattern = ".fastq.bz2$")) #解壓 biocLite("ShortRead") library(ShortRead) #讀取fq文件 MyFastq <- readFastq(getwd(), pattern=".fastq") #小心運行,要至少8G內存 readLines("SRR000648.fastq", 4) # first four lines of the file
6)下載並讀取比對數據(bam)
download.file(url="http://genome.ucsc.edu/goldenPath/help/examples/bamExample.bam", destfile = "bamExample.bam") library(Rsamtools) bam <- scanBam("bamExample.bam") #讀取bam names(bam[[1]]) #查看bam的信息 countBam("bamExample.bam") #統計bam信息 what <- c("rname", "strand", "pos", "qwidth", "seq") #只讀取其中的幾列 param <- ScanBamParam(what=what) bam2 <- scanBam("bamExample.bam", param=param) names(bam2[[1]]) bam_df <- do.call("DataFrame", bam[[1]]) # Read the data as a DataFrame object head(bam_df) table(bam_df$rname == '21' & bam_df$flag == 16) #提取符合指定要求的sequences,即flag=16為reverse strands
7)對原始raw NGS data 的預處理
prefetch SRR000648 prefetch SRR000657 fastq-dump --split-3 -O ./ SRR000657 fastq-dump --split-3 -O ./ SRR000648 library(ShortRead) myFiles <- list.files(getwd(), "fastq", full=TRUE) myFQ <- lapply(myFiles, readFastq) myQual <- FastqQuality(quality(quality(myFQ[[1]]))) #讀取質量 readM <- as(myQual, "matrix") #將質控轉化為矩陣 boxplot(data.frame(readM), outline = FALSE, main="Per Cycle Read Quality", xlab="Cycle", ylab="Phred Quality") #畫箱型圖