Data Manipulation with dplyr in R
select
select(data,变量名)
The filter and arrange verbs
arrange
# Keep only the columns needed to compare the different kinds of work
counties_selected <- select(
  counties,
  state, county, population, private_work, public_work, self_employed
)

# Print the rows ordered from the highest to the lowest public_work value
counties_selected %>%
  arrange(desc(public_work))
filter
# Narrow the data to the three columns used in this exercise
counties_selected <- select(counties, state, county, population)

# Keep California counties with a population above 1000000;
# comma-separated conditions inside filter() must all hold
counties_selected %>%
  filter(
    state == "California",
    population > 1000000
  )
#筛选多个变量
filter(id %in% c("a","b","c"...)) 存在
filter(!id %in% c("a","b","c"...)) 不存在
fct_relevel {forcats}
Reorder factor levels by hand
排序,order不好使的时候
# Example factor whose levels are stored in the order b, c, d, a
f <- factor(c("a", "b", "c", "d"), levels = c("b", "c", "d", "a"))
# No levels named: nothing is asked to move
fct_relevel(f)
# Named levels are moved to the front (the default position), in the order given
fct_relevel(f, "a")
fct_relevel(f, "b", "a")
# Move to the third position (i.e. place "a" after the second level)
fct_relevel(f, "a", after = 2)
# Relevel to the end: after = Inf works without knowing how many levels exist
fct_relevel(f, "a", after = Inf)
fct_relevel(f, "a", after = 3)
# Relevel with a function: it is applied to the current levels to produce the new order
fct_relevel(f, sort)
# sample shuffles the levels, so this result differs from run to run
fct_relevel(f, sample)
fct_relevel(f, rev)
Filtering and arranging
# Console transcript: build the six-column subset, then query it at the prompt
counties_selected <- counties %>%
select(state, county, population, private_work, public_work, self_employed)
>
> # Keep Texas counties with a population above 10000, then sort them from the
> # highest to the lowest private_work value
> counties_selected %>%filter(state=='Texas',population>10000)%>%arrange(desc(private_work))
# A tibble: 169 x 6
state county population private_work public_work self_employed
<chr> <chr> <dbl> <dbl> <dbl> <dbl>
1 Texas Gregg 123178 84.7 9.8 5.4
2 Texas Collin 862215 84.1 10 5.8
3 Texas Dallas 2485003 83.9 9.5 6.4
4 Texas Harris 4356362 83.4 10.1 6.3
5 Texas Andrews 16775 83.1 9.6 6.8
6 Texas Tarrant 1914526 83.1 11.4 5.4
7 Texas Titus 32553 82.5 10 7.4
8 Texas Denton 731851 82.2 11.9 5.7
9 Texas Ector 149557 82 11.2 6.7
10 Texas Moore 22281 82 11.7 5.9
# ... with 159 more rows
Mutate
# Work from a four-column subset of the counties data
counties_selected <- select(counties, state, county, population, public_work)

# Derive public_workers as public_work * population / 100, then list the rows
# with the largest public_workers value first
counties_selected %>%
  mutate(public_workers = public_work * population / 100) %>%
  arrange(desc(public_workers))
# Rank counties of at least 10,000 people by their proportion of men
counties %>%
  # Reduce the table to the five columns this question needs
  select(state, county, population, men, women) %>%
  # proportion_men is the men column divided by the population column
  mutate(proportion_men = men / population) %>%
  # Drop counties whose population is below 10,000
  filter(population >= 10000) %>%
  # Largest proportion of men first
  arrange(desc(proportion_men))
The count verb
# NOTE(review): these calls need `region` and `citizens` columns; the most
# recent counties_selected definition shown in these notes keeps neither, so
# confirm it was re-created with those columns before running this.

# Count the rows in each region, most frequent region first
count(counties_selected, region, sort = TRUE)

# Weight the count by citizens (sums citizens within each state), largest first
count(counties_selected, state, wt = citizens, sort = TRUE)
Summarizing
# Collapse the table into a single row holding the smallest population,
# the largest unemployment value, and the mean income
counties_selected %>%
  summarize(
    min_population = min(population),
    max_unemployment = max(unemployment),
    average_income = mean(income)
  )
# Total the land area and population of each state, derive density as
# total_population / total_area, and list the densest states first
counties_selected %>%
  group_by(state) %>%
  summarize(
    total_area = sum(land_area),
    total_population = sum(population),
    density = total_population / total_area
  ) %>%
  arrange(desc(density))
发现了,归根到底是一种函数关系,看看该怎样处理这个函数比较简单,如果写不出来,可能和小学的时候应用题写不出来有关系
top_n
按照优先级来筛选
# For each state, keep the state/metro row with the largest total population.
# (top_n() is superseded by slice_max() in dplyr >= 1.0.0; it is kept here
# because this section of the notes is about top_n().)
counties_selected %>%
  group_by(state, metro) %>%
  summarize(total_pop = sum(population)) %>%
  # The summary is still grouped by state, so the top row is chosen per state
  top_n(n = 1, wt = total_pop)
Selecting
Using the select verb, we can answer interesting questions about our dataset by focusing in on related groups of variables.
The colon (:) is useful for getting many columns at a time.
In the video you learned about the select helper starts_with(). Another select helper is ends_with(), which finds the columns that end with a particular string.
# Find counties where at least 50% of people are engaged in public work
counties %>%
  # ends_with("work") selects every column whose name ends in "work"
  select(state, county, population, ends_with("work")) %>%
  # Keep rows whose public_work value is 50 or more
  filter(public_work >= 50)
我觉得这种简单的逻辑关系不应该出错,但是老是出错。。是我真的不太适合做编程这一行嘛?
rename
rename()进行重命名
# count() stores its tally in a column named n; give it the clearer name num_counties
counties %>%
  count(state) %>%
  rename(num_counties = n)
也可以在select的时候直接重命名
# Select state and county, and bring in poverty under the new name poverty_rate
# (new_name = old_name inside select() renames while selecting)
> counties %>%select(state,county,poverty_rate=poverty)
# A tibble: 3,138 x 3
state county poverty_rate
<chr> <chr> <dbl>
1 Alabama Autauga 12.9
2 Alabama Baldwin 13.4
3 Alabama Barbour 26.7
4 Alabama Bibb 16.8
5 Alabama Blount 16.7
6 Alabama Bullock 24.6
7 Alabama Butler 25.4
8 Alabama Calhoun 20.5
9 Alabama Chambers 21.6
10 Alabama Cherokee 19.2
# ... with 3,128 more rows
transmute
transmute() is a combination of select() and mutate()
类似于mutate,添加新列但是只保留新列,删掉旧列
官方解释: use to calculate new columns while dropping other columns
# Build a reduced table with a computed density column, then list counties of
# more than one million people from the least to the most dense
counties %>%
  # transmute() keeps only the columns named here, including the new density
  transmute(state, county, population, density = population / land_area) %>%
  # Keep counties with a population greater than one million
  filter(population > 1000000) %>%
  # arrange() sorts in ascending order by default
  arrange(density)
这个解释挺好的
给出一个综合的例子
> # Rename the unemployment column to unemployment_rate; every other column is kept
> counties %>%
rename(unemployment_rate = unemployment)
# A tibble: 3,138 x 40
census_id state county region metro population men women hispanic white
<chr> <chr> <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
1 1001 Alab~ Autau~ South Metro 55221 26745 28476 2.6 75.8
2 1003 Alab~ Baldw~ South Metro 195121 95314 99807 4.5 83.1
3 1005 Alab~ Barbo~ South Nonm~ 26932 14497 12435 4.6 46.2
4 1007 Alab~ Bibb South Metro 22604 12073 10531 2.2 74.5
5 1009 Alab~ Blount South Metro 57710 28512 29198 8.6 87.9
6 1011 Alab~ Bullo~ South Nonm~ 10678 5660 5018 4.4 22.2
7 1013 Alab~ Butler South Nonm~ 20354 9502 10852 1.2 53.3
8 1015 Alab~ Calho~ South Metro 116648 56274 60374 3.5 73
9 1017 Alab~ Chamb~ South Nonm~ 34079 16258 17821 0.4 57.3
10 1019 Alab~ Chero~ South Nonm~ 26008 12975 13033 1.5 91.7
# ... with 3,128 more rows, and 30 more variables: black <dbl>, native <dbl>,
# asian <dbl>, pacific <dbl>, citizens <dbl>, income <dbl>, income_err <dbl>,
# income_per_cap <dbl>, income_per_cap_err <dbl>, poverty <dbl>,
# child_poverty <dbl>, professional <dbl>, service <dbl>, office <dbl>,
# construction <dbl>, production <dbl>, drive <dbl>, carpool <dbl>,
# transit <dbl>, walk <dbl>, other_transp <dbl>, work_at_home <dbl>,
# mean_commute <dbl>, employed <dbl>, private_work <dbl>, public_work <dbl>,
# self_employed <dbl>, family_work <dbl>, unemployment_rate <dbl>,
# land_area <dbl>
>
> # Keep state and county plus every column whose name contains "poverty"
> counties %>%
select(state, county, contains("poverty"))
# A tibble: 3,138 x 4
state county poverty child_poverty
<chr> <chr> <dbl> <dbl>
1 Alabama Autauga 12.9 18.6
2 Alabama Baldwin 13.4 19.2
3 Alabama Barbour 26.7 45.3
4 Alabama Bibb 16.8 27.9
5 Alabama Blount 16.7 27.2
6 Alabama Bullock 24.6 38.4
7 Alabama Butler 25.4 39.2
8 Alabama Calhoun 20.5 31.6
9 Alabama Chambers 21.6 37.2
10 Alabama Cherokee 19.2 30.1
# ... with 3,128 more rows
>
> # Add fraction_women (women divided by population) while keeping all existing columns
> counties %>%
mutate(fraction_women = women / population)
# A tibble: 3,138 x 41
census_id state county region metro population men women hispanic white
<chr> <chr> <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
1 1001 Alab~ Autau~ South Metro 55221 26745 28476 2.6 75.8
2 1003 Alab~ Baldw~ South Metro 195121 95314 99807 4.5 83.1
3 1005 Alab~ Barbo~ South Nonm~ 26932 14497 12435 4.6 46.2
4 1007 Alab~ Bibb South Metro 22604 12073 10531 2.2 74.5
5 1009 Alab~ Blount South Metro 57710 28512 29198 8.6 87.9
6 1011 Alab~ Bullo~ South Nonm~ 10678 5660 5018 4.4 22.2
7 1013 Alab~ Butler South Nonm~ 20354 9502 10852 1.2 53.3
8 1015 Alab~ Calho~ South Metro 116648 56274 60374 3.5 73
9 1017 Alab~ Chamb~ South Nonm~ 34079 16258 17821 0.4 57.3
10 1019 Alab~ Chero~ South Nonm~ 26008 12975 13033 1.5 91.7
# ... with 3,128 more rows, and 31 more variables: black <dbl>, native <dbl>,
# asian <dbl>, pacific <dbl>, citizens <dbl>, income <dbl>, income_err <dbl>,
# income_per_cap <dbl>, income_per_cap_err <dbl>, poverty <dbl>,
# child_poverty <dbl>, professional <dbl>, service <dbl>, office <dbl>,
# construction <dbl>, production <dbl>, drive <dbl>, carpool <dbl>,
# transit <dbl>, walk <dbl>, other_transp <dbl>, work_at_home <dbl>,
# mean_commute <dbl>, employed <dbl>, private_work <dbl>, public_work <dbl>,
# self_employed <dbl>, family_work <dbl>, unemployment <dbl>,
# land_area <dbl>, fraction_women <dbl>
>
> # Return only state, county, and the new employment_rate (employed divided by population)
> counties %>%
transmute(state, county, employment_rate = employed / population)
# A tibble: 3,138 x 3
state county employment_rate
<chr> <chr> <dbl>
1 Alabama Autauga 0.434
2 Alabama Baldwin 0.441
3 Alabama Barbour 0.319
4 Alabama Bibb 0.367
5 Alabama Blount 0.384
6 Alabama Bullock 0.362
7 Alabama Butler 0.384
8 Alabama Calhoun 0.406
9 Alabama Chambers 0.402
10 Alabama Cherokee 0.390
# ... with 3,128 more rows
貌似忘记%in%符号的使用了,复习一下啊
# %in% is TRUE for every name found in the vector on its right-hand side,
# so only the rows for Steven, Thomas, or Matthew are kept
selected_names <- filter(
  babynames,
  name %in% c("Steven", "Thomas", "Matthew")
)
Grouped mutates
这个就是两两组合之前的例子中有的