數據清洗整理基本操作(R:dplyr、tidyr、lubridate)


把以前在swirl課程學的基礎數據清洗操作重新整理一遍,主要包括:

  1. dplyr包中的select、filter、arrange、mutate、group_by、summarize函數,以及%>% 管道運算符(pipe operator),即“then”的意思。
  2. tidyr包中的gather、separate、spread函數。
  3. lubridate包中date型變量記錄更新等操作。

1、dplyr包(select列選擇、filter行選擇、arrange排序、mutate新增、group_by分組統計、summarize匯總)

(1)首先讀入數據並查看數據基本信息:

1 library(dplyr)
2 
3 ### Intro
4 path2csv <- file.path('2014-07-08.csv')
5 df <- read.csv(path2csv, as.is = TRUE)
6 dim(df)
7 head(df)
8 cran <- tbl_df(df)
9 cran

 (2)利用幾個常用函數進行數據提取,篩選,排序等操作:

 1 ##基本函數
 2 select(cran,ip_id,package,country)  #cran為數據集名稱,ip_id,package,country為列名
 3 select(cran,r_arch:country)  #r_arch:country表示從r_arch列取到country列
 4 select(cran,country:r_arch)  #反過來排序
 5 select(cran,-time)  #不取time列
 6 select(cran,-(X:size))  #不取X到size的列
 7 
 8 filter(cran,package=="swirl")  #cran為數據集,篩選package為”swirl“的所有行
 9 filter(cran,r_version=="3.1.1",country=="US")  #兩個篩選條件
10 filter(cran,r_version<="3.0.2",country=="IN")
 11 filter(cran,country=="US"|country=="IN")  #用邏輯或(|)組合兩個條件,滿足其一即保留
12 filter(cran,!is.na(r_version))  #篩選所有r_version非空的行
13 
14 arrange(cran2,ip_id)  #按ip_id排升序(默認排序方式)
15 arrange(cran2,desc(ip_id))  #按ip_id排降序
16 arrange(cran2,package,ip_id)  #先按package列排序,再對ip_id排序
17 
18 mutate(cran3,size_mb=size/2^20)  #利用已有列新建一列size_mb
19 mutate(cran3,size_mb=size/2^20,size_gb=size_mb/2^10)  #新建倆列
20 
21 summarise(cran3,avg_bytes=mean(size))  #匯總,查看整體統計數據

(3)分組統計並排序:

 1 by_package<-group_by(cran,package)  #按package列進行分組統計
 2 summarise(by_package,mean(size))  #計算分組后每組記錄的平均size
 3 pack_sum <- summarize(by_package,  #對分組數據by_package進行匯總統計
 4                       count =n(),  #n()統計每組頻數  
 5                       unique = n_distinct(ip_id),  #統計每組有多少不同的ip_id
 6                       countries = n_distinct(country),
 7                       avg_bytes = mean(size))
 8 
 9 quantile(pack_sum$unique, probs = 0.99)  #計算unique的99%分位數
10 top_unique<-filter(pack_sum,unique>465)  #篩選unique(不同ip_id數)大於465的記錄
11 top_unique_sorted<-arrange(top_unique,desc(unique))
12 top_countries <- filter(pack_sum, countries > 60)
13 result1 <- arrange(top_countries, desc(countries), avg_bytes)
14 print(result1)

(4)利用嵌套精簡(3)的過程:

 1 result2 <-
 2   arrange(
 3     filter(
 4       summarize(  #分組數據用summarize進行匯總
 5         group_by(cran,
 6                  package
 7         ),
 8         count = n(),
 9         unique = n_distinct(ip_id),
10         countries = n_distinct(country),
11         avg_bytes = mean(size)
12       ),
13       countries > 60
14     ),
15     desc(countries),
16     avg_bytes
17   )
18 
19 print(result2)

(5)利用%>%符號操作(3)中的過程:

 1 # you read it, you can pronounce the %>% operator as
 2 # the word 'then'.
 3 result3 <-
 4   cran %>%
 5   group_by(package) %>%
 6   summarize(count = n(),
 7             unique = n_distinct(ip_id),
 8             countries = n_distinct(country),
 9             avg_bytes = mean(size)
10   ) %>%
11   filter(countries > 60) %>%
12   arrange(desc(countries), avg_bytes)
13 
14 # Print result to console
15 print(result3)

(6)幾個函數及管道符號的一起使用:

1 cran %>%
2   select(ip_id, country, package, size) %>%
3   mutate(size_mb = size / 2^20) %>%
4   filter(size_mb <= 0.5) %>%
5   arrange(desc(size_mb)) %>%
6   print

2、tidyr包

(1)gather() 和 separate():

1 library(tidyr)
2 library(readr)
3 library(dplyr)
4 
5 students2 %>%
6   gather( sex_class,count ,-grade ) %>%
7   separate( sex_class, c("sex", "class")) %>%
8   print

       

下面的gather( students2,sex_class,count ,-grade ) 
#gather Gather columns into key-value pairs;students2為數據集,grade為第一列,不參與gather;剩下的列名及數據作為鍵值對放入sex_class(key)和count(value)下。

接下來separate將sex_class列分成兩列。separate:Separate one column into multiple columns.

(2)spread():   # spread:Spread a key-value pair across multiple columns.

1 students3 %>%
2   gather(class, grade, class1:class5, na.rm = TRUE) %>%
3   #name未參與gather,因此照原格式輸出
4   spread(test, grade) %>%   
5   # spread:Spread a key-value pair across multiple columns.
6   mutate(class=parse_number(class)) %>%
7   # parse_number:Extract numeric component of variable.
8   print

(3)行合並以及列合並:

 1 student_info <- students4 %>%
 2   select(id, name, sex) %>%
 3   unique() %>%  #去除重復記錄
 4   print
 5 
 6 gradebook <- students4 %>%
 7   select(id,class,midterm,final) %>%
 8   print
 9 
10 merge.data.frame(gradebook,student_info,by="id")
11 #列合並不同行數的數據框
12 
13 passed<-mutate(passed,status="passed")
14 failed<-mutate(failed,status="failed")
15 bind_rows(passed,failed)
16 #行合並倆個數據框

  

 

(4)幾個函數一起使用:

1 sat %>%
2   select(-contains("total")) %>%
3   gather(part_sex, count, -score_range) %>%
4   separate(part_sex, c("part", "sex")) %>%
5   group_by(part,sex) %>%
6   mutate( total = sum(count),  #統計每組頻數
7           prop = count / total) %>%  #統計每組里面各分數段比例
8   print

3、lubridate包

(1)獲取當前日期時間

1 library(lubridate)

wday(today()) #get the day of the week,such that 1 = Sunday, 2 = Monday

(2)時間數據的錄入

1 ymd("1989-05-17")
2 ymd("1989 May 17")
3 mdy("March 12,1975")
4 dmy(25081985)
5 ymd("1920/1/2")
6 
7 hms("03:22:14")
8 ymd_hms(c("2010-04-14-04-35-59", "2010-01-01 12:00:00"))
9 ymd_hms(now())

(3)更新時間數據,計算時間間隔

 


免責聲明!

本站轉載的文章為個人學習借鑒使用,本站對版權不負任何法律責任。如果侵犯了您的隱私權益,請聯系本站郵箱yoyou2525@163.com刪除。



 
粵ICP備18138465號   © 2018-2025 CODEPRJ.COM