把以前在swirl課程學的基礎數據清洗操作重新整理一遍,主要包括:
- dplyr包中的select、filter、arrange、mutate、group_by、summarize函數,以及%>% 管道操算符(pip operation),“then”的意思。
- tidyr包中的gather、seperate、spread函數。
- lubridate包中date型變量記錄更新等操作。
1、dplyr包(select列選擇、filter行選擇、arrange排序、mutate新增、group_by分組統計、summarize匯總)
(1)首先讀入數據並查看數據基本信息:
1 library(dplyr) 2 3 ### Intro 4 path2csv <- file.path('2014-07-08.csv') 5 df <- read.csv(path2csv, as.is = TRUE) 6 dim(df) 7 head(df) 8 cran <- tbl_df(df) 9 cran
(2)利用幾個常用函數進行數據提取,篩選,排序等操作:
1 ##基本函數 2 select(cran,ip_id,package,country) #cran為數據集名稱,ip_id,package,country為列名 3 select(cran,r_arch:country) #r_arch:country表示從r_arch列取到country列 4 select(cran,country:r_arch) #反過來排序 5 select(cran,-time) #不取time列 6 select(cran,-(X:size)) #不取X到size的列 7 8 filter(cran,package=="swirl") #cran為數據集,篩選package為”swirl“的所有行 9 filter(cran,r_version=="3.1.1",country=="US") #兩個篩選條件 10 filter(cran,r_version<="3.0.2",country=="IN") 11 filter(cran,country=="US"|country=="IN") #添加正則表達式 12 filter(cran,!is.na(r_version)) #篩選所有r_version非空的行 13 14 arrange(cran2,ip_id) #按ip_id排升序(默認排序方式) 15 arrange(cran2,desc(ip_id)) #按ip_id排降序 16 arrange(cran2,package,ip_id) #先按package列排序,再對ip_id排序 17 18 mutate(cran3,size_mb=size/2^20) #利用已有列新建一列size_mb 19 mutate(cran3,size_mb=size/2^20,size_gb=size_mb/2^10) #新建倆列 20 21 summarise(cran3,avg_bytes=mean(size)) #匯總,查看整體統計數據
(3)分組統計並排序:
1 by_package<-group_by(cran,package) #按package列進行分組統計 2 summarise(by_package,mean(size)) #計算分組后每組記錄的平均size 3 pack_sum <- summarize(by_package, #對分組數據by_package進行匯總統計 4 count =n(), #n()統計每組頻數 5 unique = n_distinct(ip_id), #統計每組有多少不同的ip_id 6 countries = n_distinct(country), 7 avg_bytes = mean(size)) 8 9 quantile(pack_sum$unique, probs = 0.99) #計算unique的99%分位數 10 top_unique<-filter(pack_sum,unique>465) #篩選頻數大於465的記錄 11 top_unique_sorted<-arrange(top_unique,desc(unique)) 12 top_countries <- filter(pack_sum, countries > 60) 13 result1 <- arrange(top_countries, desc(countries), avg_bytes) 14 print(result1)
(4)利用嵌套精簡(3)的過程:
1 result2 <- 2 arrange( 3 filter( 4 summarize( #分組數據用summarize進行匯總 5 group_by(cran, 6 package 7 ), 8 count = n(), 9 unique = n_distinct(ip_id), 10 countries = n_distinct(country), 11 avg_bytes = mean(size) 12 ), 13 countries > 60 14 ), 15 desc(countries), 16 avg_bytes 17 ) 18 19 print(result2)
(5)利用%>%符號操作(3)中的過程:
1 # you read it, you can pronounce the %>% operator as 2 # the word 'then'. 3 result3 <- 4 cran %>% 5 group_by(package) %>% 6 summarize(count = n(), 7 unique = n_distinct(ip_id), 8 countries = n_distinct(country), 9 avg_bytes = mean(size) 10 ) %>% 11 filter(countries > 60) %>% 12 arrange(desc(countries), avg_bytes) 13 14 # Print result to console 15 print(result3)
(6)幾個函數及管道符號的一起使用:
1 cran %>% 2 select(ip_id, country, package, size) %>% 3 mutate(size_mb = size / 2^20) %>% 4 filter(size_mb <= 0.5) %>% 5 arrange(desc(size_mb)) %>% 6 print
2、tidyr包
(1)gather() 和 seperate():
1 library(tidyr) 2 library(readr) 3 library(dplyr) 4 5 students2 %>% 6 gather( sex_class,count ,-grade ) %>% 7 separate( sex_class, c("sex", "class")) %>% 8 print
下面的gather( students2,sex_class,count ,-grade )
#gather Gather columns into key-value pairs;students2為數據集,grade為第一列,不參與gather;剩下的列名及數據作為鍵值對放入sex_class(key)和count(value)下。
接下來seperate將sex_class列分成倆列。separate :Separate one column into multiple columns.
(2)spread(): # spread:Spread a key-value pair across multiple columns.
1 students3 %>% 2 gather(class, grade, class1:class5, na.rm = TRUE) %>% 3 #name未參與gather,因此照原格式輸出 4 spread(test, grade) %>% 5 # spread:Spread a key-value pair across multiple columns. 6 mutate(class=parse_number(class)) %>% 7 # parse_numeric Extract numeric component of variable. 8 print
(3)行合並以及列合並:
1 student_info <- students4 %>% 2 select(id, name, sex) %>% 3 unique() %>% #去除重復記錄 4 print 5 6 gradebook <- students4 %>% 7 select(id,class,midterm,final) %>% 8 print 9 10 merge.data.frame(gradebook,student_info,by="id") 11 #列合並不同行數的數據框 12 13 passed<-mutate(passed,status="passed") 14 failed<-mutate(failed,status="failed") 15 bind_rows(passed,failed) 16 #行合並倆個數據框
(4)幾個函數一起使用:
1 sat %>% 2 select(-contains("total")) %>% 3 gather(part_sex, count, -score_range) %>% 4 separate(part_sex, c("part", "sex")) %>% 5 group_by(part,sex) %>% 6 mutate( total = sum(count), #統計每組頻數 7 prop = count / total) %>% #統計每組里面各分數段比例 8 print
3、lubradate包
(1)獲取當前日期時間
1 library(lubridate)
wday(today()) #get the day of the week,such that 1 = Sunday, 2 = Monday
(2)時間數據的錄入
1 ymd("1989-05-17") 2 ymd("1989 May 17") 3 mdy("March 12,1975") 4 dmy(25081985) 5 ymd("1920/1/2") 6 7 hms("03:22:14") 8 ymd_hms(c("2010-04-14-04-35-59", "2010-01-01 12:00:00")) 9 ymd_hms(now())
(3)更新時間數據,計算時間間隔