Select the following four columns from the counties variable:
You don’t need to save the result to a variable.
counties %>%
select("state", "county", "population", "poverty")
library(readr)
library(dplyr)
Registered S3 method overwritten by 'dplyr':
method from
print.rowwise_df
Attaching package: 㤼㸱dplyr㤼㸲
The following objects are masked from 㤼㸱package:stats㤼㸲:
filter, lag
The following objects are masked from 㤼㸱package:base㤼㸲:
intersect, setdiff, setequal, union
counties <- read_csv("acs2017_county_data.csv")
Parsed with column specification:
cols(
.default = col_double(),
State = [31mcol_character()[39m,
County = [31mcol_character()[39m
)
See spec(...) for full column specifications.
glimpse(counties)
Observations: 3,220
Variables: 37
$ CountyId [3m[38;5;246m<dbl>[39m[23m 1001, 1003, 1005, 1007, 1009, 1011, 1013, 1015, 1017, 1019, 1021, 1023, 1025,...
$ State [3m[38;5;246m<chr>[39m[23m "Alabama", "Alabama", "Alabama", "Alabama", "Alabama", "Alabama", "Alabama", ...
$ County [3m[38;5;246m<chr>[39m[23m "Autauga County", "Baldwin County", "Barbour County", "Bibb County", "Blount ...
$ TotalPop [3m[38;5;246m<dbl>[39m[23m 55036, 203360, 26201, 22580, 57667, 10478, 20126, 115527, 33895, 25855, 43805...
$ Men [3m[38;5;246m<dbl>[39m[23m 26899, 99527, 13976, 12251, 28490, 5616, 9416, 55593, 16320, 12862, 21554, 62...
$ Women [3m[38;5;246m<dbl>[39m[23m 28137, 103833, 12225, 10329, 29177, 4862, 10710, 59934, 17575, 12993, 22251, ...
$ Hispanic [3m[38;5;246m<dbl>[39m[23m 2.7, 4.4, 4.2, 2.4, 9.0, 0.3, 0.3, 3.6, 2.2, 1.6, 7.7, 0.5, 0.2, 3.1, 2.4, 6....
$ White [3m[38;5;246m<dbl>[39m[23m 75.4, 83.1, 45.7, 74.6, 87.4, 21.6, 52.2, 72.7, 56.2, 91.8, 80.4, 56.3, 53.0,...
$ Black [3m[38;5;246m<dbl>[39m[23m 18.9, 9.5, 47.8, 22.0, 1.5, 75.6, 44.7, 20.4, 39.3, 5.0, 9.5, 42.1, 45.7, 14....
$ Native [3m[38;5;246m<dbl>[39m[23m 0.3, 0.8, 0.2, 0.4, 0.3, 1.0, 0.1, 0.2, 0.3, 0.5, 0.4, 0.0, 0.1, 0.9, 0.3, 1....
$ Asian [3m[38;5;246m<dbl>[39m[23m 0.9, 0.7, 0.6, 0.0, 0.1, 0.7, 1.1, 1.0, 1.0, 0.1, 0.4, 0.1, 0.5, 0.0, 0.5, 1....
$ Pacific [3m[38;5;246m<dbl>[39m[23m 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.1, 0.0, 0.0, 0....
$ VotingAgeCitizen [3m[38;5;246m<dbl>[39m[23m 41016, 155376, 20269, 17662, 42513, 8212, 15459, 88383, 26259, 20620, 31776, ...
$ Income [3m[38;5;246m<dbl>[39m[23m 55317, 52562, 33368, 43404, 47412, 29655, 36326, 43686, 37342, 40041, 43501, ...
$ IncomeErr [3m[38;5;246m<dbl>[39m[23m 2838, 1348, 2551, 3431, 2630, 5376, 2701, 1491, 2011, 2316, 2877, 2797, 2336,...
$ IncomePerCap [3m[38;5;246m<dbl>[39m[23m 27824, 29364, 17561, 20911, 22021, 20856, 19004, 23638, 22002, 23010, 23368, ...
$ IncomePerCapErr [3m[38;5;246m<dbl>[39m[23m 2024, 735, 798, 1889, 850, 2355, 943, 793, 1205, 1354, 1925, 1307, 1203, 1553...
$ Poverty [3m[38;5;246m<dbl>[39m[23m 13.7, 11.8, 27.2, 15.2, 15.6, 28.5, 24.4, 18.6, 18.8, 16.1, 19.4, 22.3, 25.3,...
$ ChildPoverty [3m[38;5;246m<dbl>[39m[23m 20.1, 16.1, 44.9, 26.6, 25.4, 50.4, 34.8, 26.6, 29.1, 20.0, 27.8, 32.8, 30.7,...
$ Professional [3m[38;5;246m<dbl>[39m[23m 35.3, 35.7, 25.0, 24.4, 28.5, 19.7, 26.9, 29.0, 24.3, 28.8, 25.3, 23.6, 21.6,...
$ Service [3m[38;5;246m<dbl>[39m[23m 18.0, 18.2, 16.8, 17.6, 12.9, 17.1, 17.3, 17.5, 13.5, 14.8, 14.5, 15.4, 14.3,...
$ Office [3m[38;5;246m<dbl>[39m[23m 23.2, 25.6, 22.6, 19.7, 23.3, 18.6, 18.5, 23.7, 23.0, 18.1, 23.7, 22.0, 24.8,...
$ Construction [3m[38;5;246m<dbl>[39m[23m 8.1, 9.7, 11.5, 15.9, 15.8, 14.0, 11.6, 10.4, 11.6, 11.9, 15.5, 17.1, 13.7, 1...
$ Production [3m[38;5;246m<dbl>[39m[23m 15.4, 10.8, 24.1, 22.4, 19.5, 30.6, 25.7, 19.4, 27.6, 26.5, 21.0, 21.9, 25.6,...
$ Drive [3m[38;5;246m<dbl>[39m[23m 86.0, 84.7, 83.4, 86.4, 86.8, 73.1, 83.6, 85.0, 87.1, 85.0, 83.2, 81.8, 83.7,...
$ Carpool [3m[38;5;246m<dbl>[39m[23m 9.6, 7.6, 11.1, 9.5, 10.2, 15.7, 12.6, 9.2, 9.7, 12.1, 12.6, 13.7, 11.9, 6.0,...
$ Transit [3m[38;5;246m<dbl>[39m[23m 0.1, 0.1, 0.3, 0.7, 0.1, 0.3, 0.0, 0.2, 0.2, 0.4, 0.1, 0.0, 0.2, 0.0, 0.0, 0....
$ Walk [3m[38;5;246m<dbl>[39m[23m 0.6, 0.8, 2.2, 0.3, 0.4, 6.2, 0.9, 1.3, 0.6, 0.3, 0.6, 1.7, 0.7, 2.8, 0.9, 1....
$ OtherTransp [3m[38;5;246m<dbl>[39m[23m 1.3, 1.1, 1.7, 1.7, 0.4, 1.7, 0.9, 1.1, 0.5, 0.3, 1.8, 1.2, 2.7, 0.6, 0.1, 1....
$ WorkAtHome [3m[38;5;246m<dbl>[39m[23m 2.5, 5.6, 1.3, 1.5, 2.1, 3.0, 2.0, 3.2, 2.0, 2.0, 1.7, 1.6, 0.9, 3.0, 2.7, 2....
$ MeanCommute [3m[38;5;246m<dbl>[39m[23m 25.8, 27.0, 23.4, 30.0, 35.0, 29.8, 23.2, 24.8, 23.6, 26.5, 32.5, 32.7, 23.9,...
$ Employed [3m[38;5;246m<dbl>[39m[23m 24112, 89527, 8878, 8171, 21380, 4290, 7727, 47392, 14527, 9879, 17675, 4301,...
$ PrivateWork [3m[38;5;246m<dbl>[39m[23m 74.1, 80.7, 74.1, 76.0, 83.9, 81.4, 79.1, 74.9, 84.5, 74.8, 81.1, 79.9, 83.1,...
$ PublicWork [3m[38;5;246m<dbl>[39m[23m 20.2, 12.9, 19.1, 17.4, 11.9, 13.6, 15.3, 19.9, 11.8, 17.1, 14.0, 14.8, 11.8,...
$ SelfEmployed [3m[38;5;246m<dbl>[39m[23m 5.6, 6.3, 6.5, 6.3, 4.0, 5.0, 5.3, 5.1, 3.7, 8.1, 4.5, 4.9, 5.1, 7.7, 8.2, 5....
$ FamilyWork [3m[38;5;246m<dbl>[39m[23m 0.1, 0.1, 0.3, 0.3, 0.1, 0.0, 0.3, 0.1, 0.0, 0.0, 0.4, 0.4, 0.0, 0.0, 0.0, 0....
$ Unemployment [3m[38;5;246m<dbl>[39m[23m 5.2, 5.5, 12.4, 8.2, 4.9, 12.1, 7.6, 10.1, 6.4, 5.3, 6.7, 9.8, 15.2, 6.4, 7.8...
Here you see the counties_selected dataset with a few interesting variables selected. These variables: private_work, public_work, self_employed describe whether people work for the government, for private companies, or for themselves.
counties_selected <- counties %>%
select(State, County, TotalPop, PrivateWork, PublicWork, SelfEmployed, Walk)
# Add a verb to sort in descending order of public_work
counties_selected %>%
arrange(desc(PublicWork))
We sorted the counties in descending order according to public_work. What if we were interested in looking at observations in counties that have a large population or within a specific state?
You use the filter() verb to get only observations that match a particular condition, or match multiple conditions.
# Filter for counties with a population above 1000000
counties_selected %>%
filter(TotalPop > 1000000)
# Filter for counties in the state of California that have a population above 1000000
counties_selected %>%
filter(State == "California",
TotalPop > 1000000)
Now you know that there are 9 counties in the state of California with a population greater than one million. In the next exercise, you’ll practice filtering and then sorting a dataset to focus on specific observations!
We’re often interested in both filtering and sorting a dataset, to focus on observations of particular interest to you. Here, you’ll find counties that are extreme examples of what fraction of the population works in the private sector.
# Filter for Texas and more than 10000 people; sort in descending order of private_work
counties_selected %>%
filter(State == "Texas", TotalPop > 10000) %>%
arrange(desc(PrivateWork))
We can use mutate() to add columns
# Add a new column public_workers with the number of people employed in public work
counties_selected %>%
mutate(public_workers = PublicWork * TotalPop / 100) %>%
arrange(desc(public_workers))
it looks like Los Angeles is the county with the most government employees.
The dataset includes columns for the total number (not percentage) of men and women in each county. You could use this, along with the population variable, to compute the fraction of men (or women) within each county.
# Select the columns state, county, population, men, and women
counties_selected <- counties %>%
select(State, County, TotalPop, Men, Women, Walk)
# Calculate proportion_women as the fraction of the population made up of women
counties_selected %>%
mutate(proportion_women = Women/TotalPop)
We’ll put together everything you’ve learned in this chapter (select(), mutate(), filter() and arrange()), to find the counties with the highest proportion of men.
counties %>%
# Select the five columns
select(State, County, TotalPop, Men, Women) %>%
# Add the proportion_men variable
mutate(proportion_men = Men/TotalPop) %>%
# Filter for population of at least 10,000
filter(TotalPop >= 10000) %>%
# Arrange proportion of men in descending order
arrange(desc(proportion_men))
We can use count to count the number of counties in each state.
# Use count to find the number of counties in each state
counties_selected %>%
count(State, sort = TRUE)
Or we can add a weight to the count verb:
# Find number of counties per state, weighted by population
counties_selected %>%
count(State, wt=TotalPop, sort = TRUE)
You can combine multiple verbs together to answer increasingly complicated questions of your data. For example: “What are the US states where the most people walk to work?”
You’ll use the walk column, which offers a percentage of people in each county that walk to work, to add a new column and count based on it.
counties_selected <- counties %>%
select(region, state, population, walk)
counties_selected %>%
# Add population_walk containing the total number of people who walk to work
mutate(population_walk = Walk * TotalPop/100) %>%
# Count weighted by the new column
count(State, wt = population_walk, sort = TRUE)
We can see that while California had the largest total population, New York state has the largest number of people who walk to work.
The summarize() verb is very useful for collapsing a large dataset into a single observation.
counties_selected <- counties %>%
select(county, population, income, unemployment)
# Summarize to find minimum population, maximum unemployment, and average income
counties %>%
summarize(min_population = min(TotalPop), max_unemployment = max(Unemployment), average_income = mean(Income))
If we wanted to take this a step further, we could use filter() to determine the specific counties that returned the value for min_population and max_unemployment.
We can see for example the percentage of men per state.
counties_selected %>%
group_by(State) %>%
summarize(total_men = sum(Men),
total_population = sum(TotalPop)) %>%
mutate(percentage_men = total_men / total_population) %>%
arrange(desc(percentage_men))
counties_selected %>%
group_by(State) %>%
top_n(1, TotalPop)
counties_selected <- counties %>%
select(State, County, Unemployment)
counties_selected %>%
group_by(State) %>%
top_n(3, Unemployment)
When you group by multiple columns and then summarize, it’s important to remember that the summarize “peels off” one of the groups, but leaves the rest on. For example, if you group_by(X, Y) then summarize, the result will still be grouped by X.
counties_selected <- counties %>%
select(State, County, TotalPop, Income)
counties_selected %>%
group_by(County, State) %>%
# Calculate average income
summarize(average_income = mean(Income) ) %>%
# Find the highest income state in each region
top_n(1, average_income)
Five dplyr verbs related to aggregation: count(), group_by(), summarize(), ungroup(), and top_n(). We’ll use all of them to answer a question: In how many states women population is greater than men?
counties_selected <- counties %>%
select(State, County, Men, Women)
# Extract the most populated row for each state
counties_selected %>%
group_by(State, County) %>%
summarize(women_total = sum(Women),
men_total = sum(Men),
women_more_men = women_total > men_total) %>%
top_n(1, women_total) %>%
ungroup() %>%
count(women_more_men)
Notice that 44 states have more women than men.
Using the select verb, we can answer interesting questions about our dataset by focusing in on related groups of verbs. The colon (:) is useful for getting many columns at a time.
counties %>%
# Select state, county, population, and industry-related columns
select(State, County, TotalPop, Professional:Production) %>%
# Arrange service in descending order
arrange(desc(Service))
Below a list of useful helpers: - contains() - starts_with() - ends_with() - last_col()
You can see all typing ?select_helpers.
counties %>%
# Select the state, county, population, and those ending with "work"
select(State, County, TotalPop, ends_with("Work")) %>%
# Filter for counties that have at least 50% of people engaged in public work
filter(PublicWork >= 50)
It looks like only a few counties have more than half the population working for the government.
counties_selected <- counties %>%
select(State, County, TotalPop, Unemployment)
counties_selected %>%
rename(unemployment_rate = Unemployment)
The rename() verb is often useful for changing the name of a column that comes out of another verb, such as count().
# Rename the n column to num_counties
counties %>%
count(State) %>%
rename(num_counties = n)
rename() isn’t the only way you can choose a new name for a column: you can also choose a name as part of a select().
# Select state, county, and poverty as poverty_rate
counties %>%
select(State, County, poverty_rate = Poverty)
The transmute verb allows you to control which variables you keep, which variables you calculate, and which variables you drop.
counties %>%
# Keep the state, county, and populations columns, and add a active column
transmute(State, County, TotalPop, employment_rate = Employed/TotalPop) %>%
# Filter for counties with a population greater than one million
filter(TotalPop > 1000000) %>%
# Sort active in ascending order
arrange(employment_rate)
Examples using different verbs:
# Change the name of the unemployment column
counties %>%
rename(unemployment_rate = Unemployment)
# Keep the state and county columns, and the columns containing poverty
counties %>%
select(State, County, contains("Poverty"))
# Calculate the fraction_women column without dropping the other columns
counties %>%
mutate(fraction_women = Women / TotalPop)
# Keep only the state, county, and employment_rate columns
counties %>%
transmute(State, County, employment_rate = Employed / TotalPop)