跟着jmzeng學習GEO數據分析-GEO42872_1

本文轉載自查看原文 2020-05-31 22:45 939 R練習

 f='GSE42872_eSet.Rdata' ##原文操作如此，是不是作者的習慣，先命名 # https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE42872
library(GEOquery) # 這個包需要注意兩個配置，一般來說自動化的配置是足夠的。 #Setting options('download.file.method.GEOquery'='auto') #Setting options('GEOquery.inmemory.gpl'=FALSE)

##因為安裝包的問題我把Rstudio關了，所以這里getGEO從本地讀取文件
##gset <- getGEO(filename = "GSE42872_series_matrix.txt.gz")
##問題出來了，用getGEO


> gset_42872 <- getGEO(filename = "GSE42872_series_matrix.txt.gz") Parsed with column specification: cols( ID_REF = col_double(), GSM1052615 = col_double(), GSM1052616 = col_double(), GSM1052617 = col_double(), GSM1052618 = col_double(), GSM1052619 = col_double(), GSM1052620 = col_double() ) |===================================================================| 100%    1 MB Using locally cached version of GPL6244 found here: C:\Users\Mark\AppData\Local\Temp\RtmpM1jsvM/GPL6244.soft > class(gset_42872) ##得到的gset_42872等同於下面程序中的對象a，是ExpressionSet類型，而不是原文中的list類型？？？ [1] "ExpressionSet" attr(,"package") [1] "Biobase"

##當本地存在GSE42872...gz這個文件時，getGEO會從本地讀取。

> gset <- getGEO("GSE42872",destdir = ".",AnnotGPL = F,getGPL = F)
Found 1 file(s)
GSE42872_series_matrix.txt.gz
Using locally cached version: ./GSE42872_series_matrix.txt.gz

if(!file.exists(f)){ gset <- getGEO('GSE42872', destdir=".", AnnotGPL = F, ## 注釋文件 getGPL = F) ## 平台文件 save(gset,file=f) ## 保存到本地 } load('GSE42872_eSet.Rdata') ## 載入數據--載入數據干嘛？從后面看似乎並沒有用到這個。 class(gset)

> class(gset)
[1] "list"

length(gset) class(gset[[1]]) ##既然gset是個list，gset[[1]]表示訪問其第一個元素本身。返回的是ExpressionSet類型

> class(gset[[1]])
[1] "ExpressionSet"
attr(,"package")
[1] "Biobase"

gset

> gset $GSE42872_series_matrix.txt.gz ##列表以$符號開頭 ExpressionSet (storageMode: lockedEnvironment) assayData: 33297 features, 6 samples element names: exprs protocolData: none phenoData sampleNames: GSM1052615 GSM1052616 ... GSM1052620 (6 total) varLabels: title geo_accession ... cell type:ch1 (34 total) varMetadata: labelDescription featureData: none experimentData: use 'experimentData(object)' pubMedIds: 24469106 Annotation: GPL6244

 # 因為這個GEO數據集只有一個GPL平台，所以下載到的是一個含有一個元素的list--所以有幾個注釋的GPL平台，該list就有幾個元素是嗎？ a=gset[[1]] #gset[[1]]取第一個元素本身 dat=exprs(a) #a現在是一個對象，取a這個對象通過看說明書知道要用exprs這個函數 怎么看出來啊？？還有exprs()是什么函數？ dim(dat)#看一下dat這個矩陣的維度

> a  = gset[[1]]
> class(a)
[1] "ExpressionSet"
attr(,"package")
[1] "Biobase"
> dat = exprs(a)
> class(dat)
[1] "matrix" "array" 
> dim(dat)  ##33697行 6列
[1] 33297     6
> head(dat)
        GSM1052615 GSM1052616 GSM1052617 GSM1052618 GSM1052619 GSM1052620
7892501    7.24559    6.80686    7.73301    6.18961    7.05335    7.20371
7892502    6.82711    6.70157    7.02471    6.20493    6.76554    6.24252
7892503    4.39977    4.50781    4.88250    4.36295    4.18137    4.73492
7892504    9.48025    9.67952    9.63074    9.69200    9.91324    9.65897
7892505    4.54734    4.45247    5.11753    4.87307    5.15505    3.99340
7892506    6.80701    6.90597    6.72472    6.77028    6.77058    6.77685

 # GPL6244 dat[1:4,1:4] #查看dat這個矩陣的1至4行和1至4列，逗號前為行，逗號后為列 boxplot(dat,las=2) ##boxplot()表示畫箱線圖，類似K線。las = 2好像表示橫軸label與橫軸垂直，las=1表示平行

> boxplot(dat[,1:3],las = 1 )

pd=pData(a) #通過查看說明書知道取對象a里的臨床信息用pData？怎么看出來的啊？

> pd = pData(a)
> class(pd)
[1] "data.frame"
> dim(pd)
[1]  6 34
> head(pd[,c(1,34)],n =2)
                                 title      cell type:ch1
GSM1052615 A375 cells 24h Control rep1 Melanoma cell line
GSM1052616 A375 cells 24h Control rep2 Melanoma cell line

## 挑選一些感興趣的臨床表型。如何挑？ library(stringr) group_list=str_split(pd$title,' ',simplify = T)[,4] ##單引號中間是有個空格的！表示以空格將取出的title這一列分隔開，然后取第四列 table(group_list)

> library(stringr)
> group_list = str_split(pd$title," ",simplify = T)[,4]  
> class(group_list)
[1] "character"
> group_list
[1] "Control"     "Control"     "Control"     "Vemurafenib" "Vemurafenib"
[6] "Vemurafenib"
> table(group_list)
group_list
    Control Vemurafenib 
          3           3

> x = str_split(pd$title," ",simplify  = T)  ##看看str_split()之后得到的到底是個啥，simplify = T，返回的是一個6行5列的字符矩陣。
> class(x)
[1] "matrix" "array" 
> dim(x)
[1] 6 5
> x
     [,1]   [,2]    [,3]  [,4]          [,5]  
[1,] "A375" "cells" "24h" "Control"     "rep1"
[2,] "A375" "cells" "24h" "Control"     "rep2"
[3,] "A375" "cells" "24h" "Control"     "rep3"
[4,] "A375" "cells" "24h" "Vemurafenib" "rep1"
[5,] "A375" "cells" "24h" "Vemurafenib" "rep2"
[6,] "A375" "cells" "24h" "Vemurafenib" "rep3"
> dim(pd) ##pd是個6行34列的數據框，
[1]  6 34

這里關於str_split(),simplify = F,默認情況下，返回的是含有6個元素-每個元素都是一個字符向量的的列表

> x = str_split(pd$title," ",simplify  = F)
> x
[[1]]
[1] "A375"    "cells"   "24h"     "Control" "rep1"   

[[2]]
[1] "A375"    "cells"   "24h"     "Control" "rep2"   

[[3]]
[1] "A375"    "cells"   "24h"     "Control" "rep3"   

[[4]]
[1] "A375"        "cells"       "24h"         "Vemurafenib" "rep1"       

[[5]]
[1] "A375"        "cells"       "24h"         "Vemurafenib" "rep2"       

[[6]]
[1] "A375"        "cells"       "24h"         "Vemurafenib" "rep3"

 dat[1:4,1:4] # GPL6244 if(F){ ##這里的if(F)應該是不對的吧，if(T)才能運行后面的一大堆呀 library(GEOquery) #Download GPL file, put it in the current directory, and load it: gpl <- getGEO('GPL6244', destdir=".") colnames(Table(gpl)) head(Table(gpl)[,c(1,15)]) ## you need to check this , which column do you need probe2gene=Table(gpl)[,c(1,15)] head(probe2gene) library(stringr) save(probe2gene,file='probe2gene.Rdata') } # # load(file='probe2gene.Rdata') # ids=probe2gene

> colnames(Table(gpl))
 [1] "ID"              "GB_LIST"         "SPOT_ID"        
 [4] "seqname"         "RANGE_GB"        "RANGE_STRAND"   
 [7] "RANGE_START"     "RANGE_STOP"      "total_probes"   
[10] "gene_assignment" "mrna_assignment" "category"       
> class(Table(gpl))
[1] "data.frame"
> ls()
[1] "a"          "dat"        "f"          "gpl"        "group_list"
[6] "gset"       "pd"         "x"         
> y <- Table(gpl) ##y似乎沒有被用到
> dim(y)  ##12列
[1] 6428   12
> colnames(Table(gpl))
 [1] "ID"              "GB_LIST"         "SPOT_ID"        
 [4] "seqname"         "RANGE_GB"        "RANGE_STRAND"   
 [7] "RANGE_START"     "RANGE_STOP"      "total_probes"   
[10] "gene_assignment" "mrna_assignment" "category"       
> head(Table(gpl)[,c(1,15)])  ##只有12列
Error in `[.data.frame`(Table(gpl), , c(1, 15)) : 
  undefined columns selected
> head(Table(gpl)[,c(1,12)]) ##原文這里都是15，查看NCBI GEO數據框原始的GPL6244，只有12列。我猜這里是要取第一和最后一列，就自取c(1,12)了
       ID category
1 7896736     main
2 7896738     main
3 7896740     main
4 7896742     main
5 7896744     main
6 7896746     main
> probe2gene = Table(gpl)[,c(1,12)]
> head(probe2gene)
       ID category
1 7896736     main
2 7896738     main
3 7896740     main
4 7896742     main
5 7896744     main
6 7896746     main
> library(stringr)
> save(probe2gene,file = "probe2gene.Rdata")

 library(hugene10sttranscriptcluster.db) ids=toTable(hugene10sttranscriptclusterSYMBOL) #toTable這個函數：通過看hgu133plus2.db這個包的說明書知道提取probe_id（探針名）和symbol（基因名）的對應關系的表達矩陣的函數為toTable head(ids) #head為查看前六行

> library(hugene10sttranscriptcluster.db)  ##這個包直接搜名字，然后在是Bioconductor的包
> ids = toTable(hugene10sttranscriptclusterSYMBOL)
> class(ids) ##習慣性查看創建的對象類型
[1] "data.frame"
> dim(ids)  ##然后如果是數據框、矩陣就看行列數，懶得用nrow()和ncol()了
[1] 19814     2
>  head(ids)   ##這里看出這個ids是探針ID和symbol（基因名）一一對應的一個列表
  probe_id    symbol
1  7896759 LINC01128
2  7896761    SAMD11
3  7896779    KLHL17
4  7896798   PLEKHN1
5  7896817     ISG15
6  7896822      AGRN

colnames(ids)=c('probe_id','symbol') ids=ids[ids$symbol != '',]

> colnames(ids) = c("probe_id","symbol") ##很奇怪ids的列名本來就是probe_id symbol，為啥這里有重新賦值一次
> ids = ids[ids$symbol != "",]  ##原文引號內應該是空字符，這里取原ids的symbol不為空的行，重新賦值給ids
> dim(ids)
[1] 19814     2
> head(ids)
  probe_id    symbol
1  7896759 LINC01128
2  7896761    SAMD11
3  7896779    KLHL17
4  7896798   PLEKHN1
5  7896817     ISG15
6  7896822      AGRN

ids=ids[ids$probe_id %in% rownames(dat),]

> ids = ids[ids$probe_id %in% rownames(dat),]  ##應該把probe_id這一列與dat行名一致的行取出來，再賦值給ids。那不一致的行呢？
> dat = dat[ids$probe_id,]  ##然后用上面一致的行名在dat中挑一遍，再賦值給dat
> dim(ids)
[1] 19814     2
> dim(dat)  ##dat原來是33297列，現在列數與ids相同
[1] 19814     6

dat[1:4,1:4] dat=dat[ids$probe_id,] ids$median=apply(dat,1,median) #ids新建median這一列，列名為median，同時對dat這個矩陣按行操作，取每一行的中位數，將結果給到median這一列的每一行

> ids$median = apply(dat,1,median) ##apply()求dat每行的median，1表示每行
> head(ids, n= 3)
  probe_id    symbol   median
1  7896759 LINC01128 8.534290
2  7896761    SAMD11 8.814165
3  7896779    KLHL17 8.172010

ids=ids[order(ids$symbol,ids$median,decreasing = T),]#對ids$symbol按照ids$median中位數從大到小排列的順序排序，將對應的行賦值為一個新的ids

> ids  = ids[order(ids$symbol,ids$median,decreasing = T),]  ##order()默認返回按升序排列的索引值
> head(ids,n = 3)
      probe_id symbol   median
1397   7917103   ZZZ3 11.37165
7922   8011542  ZZEF1  9.79119
16662  8136918    ZYX 10.85015
> head(ids,n = 6)    ##似乎並不是從大到小排列，要弄明白
      probe_id symbol    median
1397   7917103   ZZZ3 11.371650
7922   8011542  ZZEF1  9.791190
16662  8136918    ZYX 10.850150
349    7901479 ZYG11B 10.498800
350    7901497 ZYG11A  5.697845
13475  8090351   ZXDC 10.131450
> dim(ids)
[1] 19814     3

ids=ids[!duplicated(ids$symbol),]#將symbol這一列取取出重復項，'!'為否，即取出不重復的項，去除重復的gene ，保留每個基因最大表達量結果s

> ids  = ids[!duplicated(ids$symbol),]
> dim(ids)  ##列數減少了，也就是說有多個探針對應一個probe_id
[1] 18821     3

dat=dat[ids$probe_id,] #新的ids取出probe_id這一列，將dat按照取出的這一列中的每一行組成一個新的dat rownames(dat)=ids$symbol#把ids的symbol這一列中的每一行給dat作為dat的行名 dat[1:4,1:4] #保留每個基因ID第一次出現的信息

> dat = dat[ids$probe_id,]  ##用ids中symbol去重復之后的probe_id,再挑一次dat，組成新的dat
> dim(dat)
[1] 18821     6
> head(dat) ##可以看到，此時da的行名還是probe_id
        GSM1052615 GSM1052616 GSM1052617 GSM1052618 GSM1052619 GSM1052620
7917103   11.26970   11.12560   11.23260   11.47360   11.55680   11.47480
8011542    9.63412    9.44327    9.67075    9.96091   10.02970    9.91163
8136918   10.89980   10.93190   10.91850   10.71250   10.56110   10.80050
7901479   10.48080   10.32370   10.51680   10.74500   10.65980   10.38960
7901497    5.76104    5.89974    6.00656    5.56078    5.63465    5.56169
8090351   10.10980   10.10570   10.19240   10.24110   10.15310   10.06140
> rownames(dat) = ids$symbol  ##可以直接這樣的原因在於用dat[ids$probe_id,]得到新的dat之后，行名順序與ids的probe_id列的是一樣的
> head(dat)  ###
       GSM1052615 GSM1052616 GSM1052617 GSM1052618 GSM1052619 GSM1052620
ZZZ3     11.26970   11.12560   11.23260   11.47360   11.55680   11.47480
ZZEF1     9.63412    9.44327    9.67075    9.96091   10.02970    9.91163
ZYX      10.89980   10.93190   10.91850   10.71250   10.56110   10.80050
ZYG11B   10.48080   10.32370   10.51680   10.74500   10.65980   10.38960
ZYG11A    5.76104    5.89974    6.00656    5.56078    5.63465    5.56169
ZXDC     10.10980   10.10570   10.19240   10.24110   10.15310   10.06140

save(dat,group_list,file = 'step1-output.Rdata')

> save(dat,group_list,file = "step1-output.Rdata")  ##.RData以二進制的方式保存了會話中的變量值
> ?save
> ls()
 [1] "a"          "dat"        "f"          "gpl"        "group_list"
 [6] "gset"       "ids"        "pd"         "probe2gene" "x"         
[11] "y"         
> save("gpl","group_list","gset","ids","pd","probe2gene","a","x","y",file = "step1-all-test-files.Rdata")#保留所有新建的變量

然后在新Rstudio窗口中打開這個.Rdata文件，運行如下

> load("D:/RStudy/RScript/test_GSE42872_first try/step1-all-test-files.Rdata") ##load()讀取.Rdata文件，絕對路徑
Loading required package: Biobase
Loading required package: BiocGenerics
Loading required package: parallel

Attaching package: ‘BiocGenerics’

The following objects are masked from ‘package:parallel’:

    clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
    clusterExport, clusterMap, parApply, parCapply, parLapply,
    parLapplyLB, parRapply, parSapply, parSapplyLB

The following objects are masked from ‘package:stats’:

    IQR, mad, sd, var, xtabs

The following objects are masked from ‘package:base’:

    anyDuplicated, append, as.data.frame, basename, cbind, colnames,
    dirname, do.call, duplicated, eval, evalq, Filter, Find, get, grep,
    grepl, intersect, is.unsorted, lapply, Map, mapply, match, mget,
    order, paste, pmax, pmax.int, pmin, pmin.int, Position, rank,
    rbind, Reduce, rownames, sapply, setdiff, sort, table, tapply,
    union, unique, unsplit, which, which.max, which.min

Welcome to Bioconductor

    Vignettes contain introductory material; view with
    'browseVignettes()'. To cite Bioconductor, see
    'citation("Biobase")', and for packages 'citation("pkgname")'.

Loading required package: GEOquery
Setting options('download.file.method.GEOquery'='auto')
Setting options('GEOquery.inmemory.gpl'=FALSE)
> group_list
[1] "Control"     "Control"     "Control"     "Vemurafenib" "Vemurafenib"
[6] "Vemurafenib"

以上內容from https://github.com/jmzeng1314/GEO/blob/master/GSE42872_main/step1-download.R，在其基礎上有自己的思考。

> gset$GSE42872_series_matrix.txt.gzExpressionSet (storageMode: lockedEnvironment)assayData: 33297 features, 6 samples element names: exprs protocolData: nonephenoData sampleNames: GSM1052615 GSM1052616 ... GSM1052620 (6 total) varLabels: title geo_accession ... cell type:ch1 (34 total) varMetadata: labelDescriptionfeatureData: noneexperimentData: use 'experimentData(object)' pubMedIds: 24469106 Annotation: GPL6244

免責聲明！

本站轉載的文章為個人學習借鑒使用，本站對版權不負任何法律責任。如果侵犯了您的隱私權益，請聯系本站郵箱yoyou2525@163.com刪除。

猜您在找 芯片數據分析步驟1 芯片數據下載-GEO從GEO數據庫下載數據的方法 12、geo數據上傳 Python數據分析學習（一）機器學習之數據分析數據分析學習路線 Python數據分析之numpy學習 Python數據分析之pandas學習 Python數據分析之pandas學習 Python數據分析學習 Python數據分析之pandas學習