f='GSE42872_eSet.Rdata' ##原文操作如此,是不是作者的習慣,先命名 # https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE42872
library(GEOquery) # 這個包需要注意兩個配置,一般來說自動化的配置是足夠的。 #Setting options('download.file.method.GEOquery'='auto') #Setting options('GEOquery.inmemory.gpl'=FALSE)
##因為安裝包的問題我把Rstudio關了,所以這里getGEO從本地讀取文件
##gset <- getGEO(filename = "GSE42872_series_matrix.txt.gz")
##問題出來了,用getGEO
> gset_42872 <- getGEO(filename = "GSE42872_series_matrix.txt.gz") Parsed with column specification: cols( ID_REF = col_double(), GSM1052615 = col_double(), GSM1052616 = col_double(), GSM1052617 = col_double(), GSM1052618 = col_double(), GSM1052619 = col_double(), GSM1052620 = col_double() ) |===================================================================| 100% 1 MB Using locally cached version of GPL6244 found here: C:\Users\Mark\AppData\Local\Temp\RtmpM1jsvM/GPL6244.soft > class(gset_42872) ##得到的gset_42872等同於下面程序中的對象a,是ExpressionSet類型,而不是原文中的list類型??? [1] "ExpressionSet" attr(,"package") [1] "Biobase"
##當本地存在GSE42872...gz這個文件時,getGEO會從本地讀取。
> gset <- getGEO("GSE42872",destdir = ".",AnnotGPL = F,getGPL = F) Found 1 file(s) GSE42872_series_matrix.txt.gz Using locally cached version: ./GSE42872_series_matrix.txt.gz
if(!file.exists(f)){ gset <- getGEO('GSE42872', destdir=".", AnnotGPL = F, ## 注釋文件 getGPL = F) ## 平台文件 save(gset,file=f) ## 保存到本地 } load('GSE42872_eSet.Rdata') ## 載入數據--載入數據干嘛?從后面看似乎並沒有用到這個。 class(gset)
> class(gset)
[1] "list"
length(gset) class(gset[[1]]) ##既然gset是個list,gset[[1]]表示訪問其第一個元素本身。返回的是ExpressionSet類型
> class(gset[[1]])
[1] "ExpressionSet"
attr(,"package")
[1] "Biobase"
gset
> gset $GSE42872_series_matrix.txt.gz ##列表以$符號開頭 ExpressionSet (storageMode: lockedEnvironment) assayData: 33297 features, 6 samples element names: exprs protocolData: none phenoData sampleNames: GSM1052615 GSM1052616 ... GSM1052620 (6 total) varLabels: title geo_accession ... cell type:ch1 (34 total) varMetadata: labelDescription featureData: none experimentData: use 'experimentData(object)' pubMedIds: 24469106 Annotation: GPL6244
# 因為這個GEO數據集只有一個GPL平台,所以下載到的是一個含有一個元素的list--所以有幾個注釋的GPL平台,該list就有幾個元素是嗎? a=gset[[1]] #gset[[1]]取第一個元素本身 dat=exprs(a) #a現在是一個對象,取a這個對象通過看說明書知道要用exprs這個函數 怎么看出來啊??還有exprs()是什么函數? dim(dat)#看一下dat這個矩陣的維度
> a = gset[[1]] > class(a) [1] "ExpressionSet" attr(,"package") [1] "Biobase" > dat = exprs(a) > class(dat) [1] "matrix" "array" > dim(dat) ##33697行 6列 [1] 33297 6 > head(dat) GSM1052615 GSM1052616 GSM1052617 GSM1052618 GSM1052619 GSM1052620 7892501 7.24559 6.80686 7.73301 6.18961 7.05335 7.20371 7892502 6.82711 6.70157 7.02471 6.20493 6.76554 6.24252 7892503 4.39977 4.50781 4.88250 4.36295 4.18137 4.73492 7892504 9.48025 9.67952 9.63074 9.69200 9.91324 9.65897 7892505 4.54734 4.45247 5.11753 4.87307 5.15505 3.99340 7892506 6.80701 6.90597 6.72472 6.77028 6.77058 6.77685
# GPL6244 dat[1:4,1:4] #查看dat這個矩陣的1至4行和1至4列,逗號前為行,逗號后為列 boxplot(dat,las=2) ##boxplot()表示畫箱線圖,類似K線。las = 2好像表示橫軸label與橫軸垂直,las=1表示平行

> boxplot(dat[,1:3],las = 1 )

pd=pData(a) #通過查看說明書知道取對象a里的臨床信息用pData?怎么看出來的啊?
> pd = pData(a) > class(pd) [1] "data.frame" > dim(pd) [1] 6 34 > head(pd[,c(1,34)],n =2) title cell type:ch1 GSM1052615 A375 cells 24h Control rep1 Melanoma cell line GSM1052616 A375 cells 24h Control rep2 Melanoma cell line
## 挑選一些感興趣的臨床表型。如何挑? library(stringr) group_list=str_split(pd$title,' ',simplify = T)[,4] ##單引號中間是有個空格的!表示以空格將取出的title這一列分隔開,然后取第四列 table(group_list)
> library(stringr) > group_list = str_split(pd$title," ",simplify = T)[,4] > class(group_list) [1] "character" > group_list [1] "Control" "Control" "Control" "Vemurafenib" "Vemurafenib" [6] "Vemurafenib" > table(group_list) group_list Control Vemurafenib 3 3
> x = str_split(pd$title," ",simplify = T) ##看看str_split()之后得到的到底是個啥,simplify = T,返回的是一個6行5列的字符矩陣。 > class(x) [1] "matrix" "array" > dim(x) [1] 6 5 > x [,1] [,2] [,3] [,4] [,5] [1,] "A375" "cells" "24h" "Control" "rep1" [2,] "A375" "cells" "24h" "Control" "rep2" [3,] "A375" "cells" "24h" "Control" "rep3" [4,] "A375" "cells" "24h" "Vemurafenib" "rep1" [5,] "A375" "cells" "24h" "Vemurafenib" "rep2" [6,] "A375" "cells" "24h" "Vemurafenib" "rep3" > dim(pd) ##pd是個6行34列的數據框, [1] 6 34
這里關於str_split(),simplify = F,默認情況下,返回的是含有6個元素-每個元素都是一個字符向量的的列表
> x = str_split(pd$title," ",simplify = F) > x [[1]] [1] "A375" "cells" "24h" "Control" "rep1" [[2]] [1] "A375" "cells" "24h" "Control" "rep2" [[3]] [1] "A375" "cells" "24h" "Control" "rep3" [[4]] [1] "A375" "cells" "24h" "Vemurafenib" "rep1" [[5]] [1] "A375" "cells" "24h" "Vemurafenib" "rep2" [[6]] [1] "A375" "cells" "24h" "Vemurafenib" "rep3"
dat[1:4,1:4] # GPL6244 if(F){ ##這里的if(F)應該是不對的吧,if(T)才能運行后面的一大堆呀 library(GEOquery) #Download GPL file, put it in the current directory, and load it: gpl <- getGEO('GPL6244', destdir=".") colnames(Table(gpl)) head(Table(gpl)[,c(1,15)]) ## you need to check this , which column do you need probe2gene=Table(gpl)[,c(1,15)] head(probe2gene) library(stringr) save(probe2gene,file='probe2gene.Rdata') } # # load(file='probe2gene.Rdata') # ids=probe2gene
> colnames(Table(gpl)) [1] "ID" "GB_LIST" "SPOT_ID" [4] "seqname" "RANGE_GB" "RANGE_STRAND" [7] "RANGE_START" "RANGE_STOP" "total_probes" [10] "gene_assignment" "mrna_assignment" "category" > class(Table(gpl)) [1] "data.frame" > ls() [1] "a" "dat" "f" "gpl" "group_list" [6] "gset" "pd" "x" > y <- Table(gpl) ##y似乎沒有被用到 > dim(y) ##12列 [1] 6428 12 > colnames(Table(gpl)) [1] "ID" "GB_LIST" "SPOT_ID" [4] "seqname" "RANGE_GB" "RANGE_STRAND" [7] "RANGE_START" "RANGE_STOP" "total_probes" [10] "gene_assignment" "mrna_assignment" "category" > head(Table(gpl)[,c(1,15)]) ##只有12列 Error in `[.data.frame`(Table(gpl), , c(1, 15)) : undefined columns selected
> head(Table(gpl)[,c(1,12)]) ##原文這里都是15,查看NCBI GEO數據框原始的GPL6244,只有12列。我猜這里是要取第一和最后一列,就自取c(1,12)了 ID category 1 7896736 main 2 7896738 main 3 7896740 main 4 7896742 main 5 7896744 main 6 7896746 main > probe2gene = Table(gpl)[,c(1,12)] > head(probe2gene) ID category 1 7896736 main 2 7896738 main 3 7896740 main 4 7896742 main 5 7896744 main 6 7896746 main > library(stringr) > save(probe2gene,file = "probe2gene.Rdata")
library(hugene10sttranscriptcluster.db) ids=toTable(hugene10sttranscriptclusterSYMBOL) #toTable這個函數:通過看hgu133plus2.db這個包的說明書知道提取probe_id(探針名)和symbol(基因名)的對應關系的表達矩陣的函數為toTable head(ids) #head為查看前六行
> library(hugene10sttranscriptcluster.db) ##這個包直接搜名字,然后在是Bioconductor的包 > ids = toTable(hugene10sttranscriptclusterSYMBOL) > class(ids) ##習慣性查看創建的對象類型 [1] "data.frame" > dim(ids) ##然后如果是數據框、矩陣就看行列數,懶得用nrow()和ncol()了 [1] 19814 2 > head(ids) ##這里看出這個ids是探針ID和symbol(基因名)一一對應的一個列表 probe_id symbol 1 7896759 LINC01128 2 7896761 SAMD11 3 7896779 KLHL17 4 7896798 PLEKHN1 5 7896817 ISG15 6 7896822 AGRN
colnames(ids)=c('probe_id','symbol') ids=ids[ids$symbol != '',]
> colnames(ids) = c("probe_id","symbol") ##很奇怪ids的列名本來就是probe_id symbol,為啥這里有重新賦值一次 > ids = ids[ids$symbol != "",] ##原文引號內應該是空字符,這里取原ids的symbol不為空的行,重新賦值給ids > dim(ids) [1] 19814 2 > head(ids) probe_id symbol 1 7896759 LINC01128 2 7896761 SAMD11 3 7896779 KLHL17 4 7896798 PLEKHN1 5 7896817 ISG15 6 7896822 AGRN
ids=ids[ids$probe_id %in% rownames(dat),]
> ids = ids[ids$probe_id %in% rownames(dat),] ##應該把probe_id這一列與dat行名一致的行取出來,再賦值給ids。那不一致的行呢? > dat = dat[ids$probe_id,] ##然后用上面一致的行名在dat中挑一遍,再賦值給dat
> dim(ids) [1] 19814 2 > dim(dat) ##dat原來是33297列,現在列數與ids相同 [1] 19814 6
dat[1:4,1:4] dat=dat[ids$probe_id,] ids$median=apply(dat,1,median) #ids新建median這一列,列名為median,同時對dat這個矩陣按行操作,取每一行的中位數,將結果給到median這一列的每一行
> ids$median = apply(dat,1,median) ##apply()求dat每行的median,1表示每行 > head(ids, n= 3) probe_id symbol median 1 7896759 LINC01128 8.534290 2 7896761 SAMD11 8.814165 3 7896779 KLHL17 8.172010
ids=ids[order(ids$symbol,ids$median,decreasing = T),]#對ids$symbol按照ids$median中位數從大到小排列的順序排序,將對應的行賦值為一個新的ids
> ids = ids[order(ids$symbol,ids$median,decreasing = T),] ##order()默認返回按升序排列的索引值
> head(ids,n = 3) probe_id symbol median 1397 7917103 ZZZ3 11.37165 7922 8011542 ZZEF1 9.79119 16662 8136918 ZYX 10.85015 > head(ids,n = 6) ##似乎並不是從大到小排列,要弄明白 probe_id symbol median 1397 7917103 ZZZ3 11.371650 7922 8011542 ZZEF1 9.791190 16662 8136918 ZYX 10.850150 349 7901479 ZYG11B 10.498800 350 7901497 ZYG11A 5.697845 13475 8090351 ZXDC 10.131450 > dim(ids) [1] 19814 3
ids=ids[!duplicated(ids$symbol),]#將symbol這一列取取出重復項,'!'為否,即取出不重復的項,去除重復的gene ,保留每個基因最大表達量結果s
> ids = ids[!duplicated(ids$symbol),] > dim(ids) ##列數減少了,也就是說有多個探針對應一個probe_id [1] 18821 3
dat=dat[ids$probe_id,] #新的ids取出probe_id這一列,將dat按照取出的這一列中的每一行組成一個新的dat rownames(dat)=ids$symbol#把ids的symbol這一列中的每一行給dat作為dat的行名 dat[1:4,1:4] #保留每個基因ID第一次出現的信息
> dat = dat[ids$probe_id,] ##用ids中symbol去重復之后的probe_id,再挑一次dat,組成新的dat > dim(dat) [1] 18821 6 > head(dat) ##可以看到,此時da的行名還是probe_id GSM1052615 GSM1052616 GSM1052617 GSM1052618 GSM1052619 GSM1052620 7917103 11.26970 11.12560 11.23260 11.47360 11.55680 11.47480 8011542 9.63412 9.44327 9.67075 9.96091 10.02970 9.91163 8136918 10.89980 10.93190 10.91850 10.71250 10.56110 10.80050 7901479 10.48080 10.32370 10.51680 10.74500 10.65980 10.38960 7901497 5.76104 5.89974 6.00656 5.56078 5.63465 5.56169 8090351 10.10980 10.10570 10.19240 10.24110 10.15310 10.06140 > rownames(dat) = ids$symbol ##可以直接這樣的原因在於用dat[ids$probe_id,]得到新的dat之后,行名順序與ids的probe_id列的是一樣的 > head(dat) ### GSM1052615 GSM1052616 GSM1052617 GSM1052618 GSM1052619 GSM1052620 ZZZ3 11.26970 11.12560 11.23260 11.47360 11.55680 11.47480 ZZEF1 9.63412 9.44327 9.67075 9.96091 10.02970 9.91163 ZYX 10.89980 10.93190 10.91850 10.71250 10.56110 10.80050 ZYG11B 10.48080 10.32370 10.51680 10.74500 10.65980 10.38960 ZYG11A 5.76104 5.89974 6.00656 5.56078 5.63465 5.56169 ZXDC 10.10980 10.10570 10.19240 10.24110 10.15310 10.06140
save(dat,group_list,file = 'step1-output.Rdata')
> save(dat,group_list,file = "step1-output.Rdata") ##.RData以二進制的方式保存了會話中的變量值 > ?save > ls() [1] "a" "dat" "f" "gpl" "group_list" [6] "gset" "ids" "pd" "probe2gene" "x" [11] "y" > save("gpl","group_list","gset","ids","pd","probe2gene","a","x","y",file = "step1-all-test-files.Rdata")#保留所有新建的變量
然后在新Rstudio窗口中打開這個.Rdata文件,運行如下
> load("D:/RStudy/RScript/test_GSE42872_first try/step1-all-test-files.Rdata") ##load()讀取.Rdata文件,絕對路徑 Loading required package: Biobase Loading required package: BiocGenerics Loading required package: parallel Attaching package: ‘BiocGenerics’ The following objects are masked from ‘package:parallel’: clusterApply, clusterApplyLB, clusterCall, clusterEvalQ, clusterExport, clusterMap, parApply, parCapply, parLapply, parLapplyLB, parRapply, parSapply, parSapplyLB The following objects are masked from ‘package:stats’: IQR, mad, sd, var, xtabs The following objects are masked from ‘package:base’: anyDuplicated, append, as.data.frame, basename, cbind, colnames, dirname, do.call, duplicated, eval, evalq, Filter, Find, get, grep, grepl, intersect, is.unsorted, lapply, Map, mapply, match, mget, order, paste, pmax, pmax.int, pmin, pmin.int, Position, rank, rbind, Reduce, rownames, sapply, setdiff, sort, table, tapply, union, unique, unsplit, which, which.max, which.min Welcome to Bioconductor Vignettes contain introductory material; view with 'browseVignettes()'. To cite Bioconductor, see 'citation("Biobase")', and for packages 'citation("pkgname")'. Loading required package: GEOquery Setting options('download.file.method.GEOquery'='auto') Setting options('GEOquery.inmemory.gpl'=FALSE) > group_list [1] "Control" "Control" "Control" "Vemurafenib" "Vemurafenib" [6] "Vemurafenib"
以上內容from https://github.com/jmzeng1314/GEO/blob/master/GSE42872_main/step1-download.R,在其基礎上有自己的思考。
> gset$GSE42872_series_matrix.txt.gzExpressionSet (storageMode: lockedEnvironment)assayData: 33297 features, 6 samples element names: exprs protocolData: nonephenoData sampleNames: GSM1052615 GSM1052616 ... GSM1052620 (6 total) varLabels: title geo_accession ... cell type:ch1 (34 total) varMetadata: labelDescriptionfeatureData: noneexperimentData: use 'experimentData(object)' pubMedIds: 24469106 Annotation: GPL6244