## Load package
library(tidyverse)

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.2     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.2     ✔ tibble    3.2.1
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
## ✔ purrr     1.0.1     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

# load dataset
counties<-read_rds("/Users/anikalewis/Downloads/counties.rds")

Data Manipulation with dplyr; Transforming Data with dplyr

Exploring data with dplyr

included in Tidyverse
select(), filter(), arrange(), mutate()
select() verb extracts only particular variables from a dataset

Understanding your data

Take a look at the counties dataset using the glimpse() function.

What is the first value in the income variable?

51281

glimpse(counties)

## Rows: 3,138
## Columns: 40
## $ census_id          <chr> "1001", "1003", "1005", "1007", "1009", "1011", "10…
## $ state              <chr> "Alabama", "Alabama", "Alabama", "Alabama", "Alabam…
## $ county             <chr> "Autauga", "Baldwin", "Barbour", "Bibb", "Blount", …
## $ region             <chr> "South", "South", "South", "South", "South", "South…
## $ metro              <chr> "Metro", "Metro", "Nonmetro", "Metro", "Metro", "No…
## $ population         <dbl> 55221, 195121, 26932, 22604, 57710, 10678, 20354, 1…
## $ men                <dbl> 26745, 95314, 14497, 12073, 28512, 5660, 9502, 5627…
## $ women              <dbl> 28476, 99807, 12435, 10531, 29198, 5018, 10852, 603…
## $ hispanic           <dbl> 2.6, 4.5, 4.6, 2.2, 8.6, 4.4, 1.2, 3.5, 0.4, 1.5, 7…
## $ white              <dbl> 75.8, 83.1, 46.2, 74.5, 87.9, 22.2, 53.3, 73.0, 57.…
## $ black              <dbl> 18.5, 9.5, 46.7, 21.4, 1.5, 70.7, 43.8, 20.3, 40.3,…
## $ native             <dbl> 0.4, 0.6, 0.2, 0.4, 0.3, 1.2, 0.1, 0.2, 0.2, 0.6, 0…
## $ asian              <dbl> 1.0, 0.7, 0.4, 0.1, 0.1, 0.2, 0.4, 0.9, 0.8, 0.3, 0…
## $ pacific            <dbl> 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0…
## $ citizens           <dbl> 40725, 147695, 20714, 17495, 42345, 8057, 15581, 88…
## $ income             <dbl> 51281, 50254, 32964, 38678, 45813, 31938, 32229, 41…
## $ income_err         <dbl> 2391, 1263, 2973, 3995, 3141, 5884, 1793, 925, 2949…
## $ income_per_cap     <dbl> 24974, 27317, 16824, 18431, 20532, 17580, 18390, 21…
## $ income_per_cap_err <dbl> 1080, 711, 798, 1618, 708, 2055, 714, 489, 1366, 15…
## $ poverty            <dbl> 12.9, 13.4, 26.7, 16.8, 16.7, 24.6, 25.4, 20.5, 21.…
## $ child_poverty      <dbl> 18.6, 19.2, 45.3, 27.9, 27.2, 38.4, 39.2, 31.6, 37.…
## $ professional       <dbl> 33.2, 33.1, 26.8, 21.5, 28.5, 18.8, 27.5, 27.3, 23.…
## $ service            <dbl> 17.0, 17.7, 16.1, 17.9, 14.1, 15.0, 16.6, 17.7, 14.…
## $ office             <dbl> 24.2, 27.1, 23.1, 17.8, 23.9, 19.7, 21.9, 24.2, 26.…
## $ construction       <dbl> 8.6, 10.8, 10.8, 19.0, 13.5, 20.1, 10.3, 10.5, 11.5…
## $ production         <dbl> 17.1, 11.2, 23.1, 23.7, 19.9, 26.4, 23.7, 20.4, 24.…
## $ drive              <dbl> 87.5, 84.7, 83.8, 83.2, 84.9, 74.9, 84.5, 85.3, 85.…
## $ carpool            <dbl> 8.8, 8.8, 10.9, 13.5, 11.2, 14.9, 12.4, 9.4, 11.9, …
## $ transit            <dbl> 0.1, 0.1, 0.4, 0.5, 0.4, 0.7, 0.0, 0.2, 0.2, 0.2, 0…
## $ walk               <dbl> 0.5, 1.0, 1.8, 0.6, 0.9, 5.0, 0.8, 1.2, 0.3, 0.6, 1…
## $ other_transp       <dbl> 1.3, 1.4, 1.5, 1.5, 0.4, 1.7, 0.6, 1.2, 0.4, 0.7, 1…
## $ work_at_home       <dbl> 1.8, 3.9, 1.6, 0.7, 2.3, 2.8, 1.7, 2.7, 2.1, 2.5, 1…
## $ mean_commute       <dbl> 26.5, 26.4, 24.1, 28.8, 34.9, 27.5, 24.6, 24.1, 25.…
## $ employed           <dbl> 23986, 85953, 8597, 8294, 22189, 3865, 7813, 47401,…
## $ private_work       <dbl> 73.6, 81.5, 71.8, 76.8, 82.0, 79.5, 77.4, 74.1, 85.…
## $ public_work        <dbl> 20.9, 12.3, 20.8, 16.1, 13.5, 15.1, 16.2, 20.8, 12.…
## $ self_employed      <dbl> 5.5, 5.8, 7.3, 6.7, 4.2, 5.4, 6.2, 5.0, 2.8, 7.9, 4…
## $ family_work        <dbl> 0.0, 0.4, 0.1, 0.4, 0.4, 0.0, 0.2, 0.1, 0.0, 0.5, 0…
## $ unemployment       <dbl> 7.6, 7.5, 17.6, 8.3, 7.7, 18.0, 10.9, 12.3, 8.9, 7.…
## $ land_area          <dbl> 594.44, 1589.78, 884.88, 622.58, 644.78, 622.81, 77…

Selecting columns

Instructions
- Select the columns listed from the counties variable.

counties %>%
  # Select the columns
  select(state,county,population,poverty)

## # A tibble: 3,138 × 4
##    state   county   population poverty
##    <chr>   <chr>         <dbl>   <dbl>
##  1 Alabama Autauga       55221    12.9
##  2 Alabama Baldwin      195121    13.4
##  3 Alabama Barbour       26932    26.7
##  4 Alabama Bibb          22604    16.8
##  5 Alabama Blount        57710    16.7
##  6 Alabama Bullock       10678    24.6
##  7 Alabama Butler        20354    25.4
##  8 Alabama Calhoun      116648    20.5
##  9 Alabama Chambers      34079    21.6
## 10 Alabama Cherokee      26008    19.2
## # ℹ 3,128 more rows

The filter and arrange verbs

arrange() verbs sorts data based on 1+ variables
arrange(variable to sort by)
desc() function wrapped around variable = descending
filter() extract observations based on conditions
can filter based on logical operators (==,<,>)

Arranging observations

Instructions
- Add a verb to sort the observations of the public_work variable in descending order.

counties_selected <- counties %>%
  select(state, county, population, private_work, public_work, self_employed)

counties_selected %>%
  # Add a verb to sort in descending order of public_work
  arrange(desc(public_work))

## # A tibble: 3,138 × 6
##    state        county         population private_work public_work self_employed
##    <chr>        <chr>               <dbl>        <dbl>       <dbl>         <dbl>
##  1 Hawaii       Kalawao                85         25          64.1          10.9
##  2 Alaska       Yukon-Koyukuk…       5644         33.3        61.7           5.1
##  3 Wisconsin    Menominee            4451         36.8        59.1           3.7
##  4 North Dakota Sioux                4380         32.9        56.8          10.2
##  5 South Dakota Todd                 9942         34.4        55             9.8
##  6 Alaska       Lake and Peni…       1474         42.2        51.6           6.1
##  7 California   Lassen              32645         42.6        50.5           6.8
##  8 South Dakota Buffalo              2038         48.4        49.5           1.8
##  9 South Dakota Dewey                5579         34.9        49.2          14.7
## 10 Texas        Kenedy                565         51.9        48.1           0  
## # ℹ 3,128 more rows

Filtering for conditions

Instructions
- Find only the counties that have a population above one million (1000000).
- Find only the counties in the state of California that also have a population above one million (1000000).

counties_selected <- counties %>%
  select(state, county, population)

counties_selected %>%
  # Filter for counties with a population above 1000000
  filter(population>1000000)

## # A tibble: 41 × 3
##    state      county         population
##    <chr>      <chr>               <dbl>
##  1 Arizona    Maricopa          4018143
##  2 California Alameda           1584983
##  3 California Contra Costa      1096068
##  4 California Los Angeles      10038388
##  5 California Orange            3116069
##  6 California Riverside         2298032
##  7 California Sacramento        1465832
##  8 California San Bernardino    2094769
##  9 California San Diego         3223096
## 10 California Santa Clara       1868149
## # ℹ 31 more rows

counties_selected <- counties %>%
  select(state, county, population)

# Filter for counties in the state of California that have a population above 1000000
counties_selected %>%
filter(state == "California", population>1000000)

## # A tibble: 9 × 3
##   state      county         population
##   <chr>      <chr>               <dbl>
## 1 California Alameda           1584983
## 2 California Contra Costa      1096068
## 3 California Los Angeles      10038388
## 4 California Orange            3116069
## 5 California Riverside         2298032
## 6 California Sacramento        1465832
## 7 California San Bernardino    2094769
## 8 California San Diego         3223096
## 9 California Santa Clara       1868149

Filtering and arranging

Instructions
- Filter for counties in the state of Texas that have more than ten thousand people (10000), and sort them in descending order of the percentage of people employed in private work.

counties_selected <- counties %>%
  select(state, county, population, private_work, public_work, self_employed)

# Filter for Texas and more than 10000 people; sort in descending order of private_work
counties_selected %>%
  # Filter for Texas and more than 10000 people
  filter(state=="Texas",population>10000) %>%
  # Sort in descending order of private_work
  arrange(desc(private_work))

## # A tibble: 169 × 6
##    state county  population private_work public_work self_employed
##    <chr> <chr>        <dbl>        <dbl>       <dbl>         <dbl>
##  1 Texas Gregg       123178         84.7         9.8           5.4
##  2 Texas Collin      862215         84.1        10             5.8
##  3 Texas Dallas     2485003         83.9         9.5           6.4
##  4 Texas Harris     4356362         83.4        10.1           6.3
##  5 Texas Andrews      16775         83.1         9.6           6.8
##  6 Texas Tarrant    1914526         83.1        11.4           5.4
##  7 Texas Titus        32553         82.5        10             7.4
##  8 Texas Denton      731851         82.2        11.9           5.7
##  9 Texas Ector       149557         82          11.2           6.7
## 10 Texas Moore        22281         82          11.7           5.9
## # ℹ 159 more rows

The mutate() verb

adds new variables or changes existing variables

Calculating the number of government employees

Instructions
- Use mutate() to add a column called public_workers to the dataset, with the number of people employed in public (government) work.
- Sort in descending order of the public_workers column

counties_selected <- counties %>%
  select(state, county, population, public_work)

counties_selected %>%
  mutate(public_workers = public_work * population / 100) %>%
  # Sort in descending order of the public_workers column
  arrange(desc(public_workers))

## # A tibble: 3,138 × 5
##    state      county         population public_work public_workers
##    <chr>      <chr>               <dbl>       <dbl>          <dbl>
##  1 California Los Angeles      10038388        11.5       1154415.
##  2 Illinois   Cook              5236393        11.5        602185.
##  3 California San Diego         3223096        14.8        477018.
##  4 Arizona    Maricopa          4018143        11.7        470123.
##  5 Texas      Harris            4356362        10.1        439993.
##  6 New York   Kings             2595259        14.4        373717.
##  7 California San Bernardino    2094769        16.7        349826.
##  8 California Riverside         2298032        14.9        342407.
##  9 California Sacramento        1465832        21.8        319551.
## 10 California Orange            3116069        10.2        317839.
## # ℹ 3,128 more rows

Calculating the percentage of women in a county

Instructions
- Select the columns state, county, population, men, and women.
- Add a new variable called proportion_women with the fraction of the county’s population made up of women.

counties_selected <- counties %>%
  # Select the columns state, county, population, men, and women
 select(state, county, population, men, women)

counties_selected %>%
  # Calculate proportion_women as the fraction of the population made up of women
  mutate(proportion_women= women/population)

## # A tibble: 3,138 × 6
##    state   county   population   men women proportion_women
##    <chr>   <chr>         <dbl> <dbl> <dbl>            <dbl>
##  1 Alabama Autauga       55221 26745 28476            0.516
##  2 Alabama Baldwin      195121 95314 99807            0.512
##  3 Alabama Barbour       26932 14497 12435            0.462
##  4 Alabama Bibb          22604 12073 10531            0.466
##  5 Alabama Blount        57710 28512 29198            0.506
##  6 Alabama Bullock       10678  5660  5018            0.470
##  7 Alabama Butler        20354  9502 10852            0.533
##  8 Alabama Calhoun      116648 56274 60374            0.518
##  9 Alabama Chambers      34079 16258 17821            0.523
## 10 Alabama Cherokee      26008 12975 13033            0.501
## # ℹ 3,128 more rows

Select, mutate, filter, and arrange

Instruction
- Select only the columns state, county, population, men, and women.
- Add a variable proportion_men with the fraction of the county’s population made up of men.
- Filter for counties with a population of at least ten thousand (10000).
- Arrange counties in descending order of their proportion of men.

counties %>%
  # Select the five columns 
  select(state, county, population, men, women) %>%
  # Add the proportion_men variable
  mutate(proportion_men=men/population) %>%
  # Filter for population of at least 10,000
  filter(population > 10000) %>%
  # Arrange proportion of men in descending order 
  arrange(desc(proportion_men))

## # A tibble: 2,437 × 6
##    state      county         population   men women proportion_men
##    <chr>      <chr>               <dbl> <dbl> <dbl>          <dbl>
##  1 Virginia   Sussex              11864  8130  3734          0.685
##  2 California Lassen              32645 21818 10827          0.668
##  3 Georgia    Chattahoochee       11914  7940  3974          0.666
##  4 Louisiana  West Feliciana      15415 10228  5187          0.664
##  5 Florida    Union               15191  9830  5361          0.647
##  6 Texas      Jones               19978 12652  7326          0.633
##  7 Missouri   DeKalb              12782  8080  4702          0.632
##  8 Texas      Madison             13838  8648  5190          0.625
##  9 Virginia   Greensville         11760  7303  4457          0.621
## 10 Texas      Anderson            57915 35469 22446          0.612
## # ℹ 2,427 more rows

Data Manipulation with dplyr; Aggregating Data

The count verb

count() finds the number of observations
takes the argument, sort()
count(variable,sort=TRUE) the rows will be sorted by the most common observation to the least
count(variable, wt = ) will weight the count by the desired variable

Counting by region

Instructions
- Use count() to find the number of counties in each region, using a second argument to sort in descending order.

counties_selected <- counties %>%
  select(county, region, state, population, citizens)

# Use count to find the number of counties in each region
counties_selected %>%
  count(region, sort=TRUE)

## # A tibble: 4 × 2
##   region            n
##   <chr>         <int>
## 1 South          1420
## 2 North Central  1054
## 3 West            447
## 4 Northeast       217

Counting citizens by state

# Find number of counties per state, weighted by citizens, sorted in descending order
counties_selected %>%
  count(state, wt=citizens, sort=TRUE)

## # A tibble: 50 × 2
##    state                 n
##    <chr>             <dbl>
##  1 California     24280349
##  2 Texas          16864864
##  3 Florida        13933052
##  4 New York       13531404
##  5 Pennsylvania    9710416
##  6 Illinois        8979999
##  7 Ohio            8709050
##  8 Michigan        7380136
##  9 North Carolina  7107998
## 10 Georgia         6978660
## # ℹ 40 more rows

Mutating and counting

Instruction
- Use mutate() to calculate and add a column called population_walk, containing the total number of people who walk to work in a county.
- Use a (weighted and sorted) count() to find the total number of people who walk to work in each state.

counties_selected <- counties %>%
  select(county, region, state, population, walk)
counties_selected %>%
  # Add population_walk containing the total number of people who walk to work 
  mutate(population_walk=population*walk/100) %>%
  # Count weighted by the new column, sort in descending order
  count(state,wt=population_walk,sort=TRUE)

## # A tibble: 50 × 2
##    state                n
##    <chr>            <dbl>
##  1 New York      1237938.
##  2 California    1017964.
##  3 Pennsylvania   505397.
##  4 Texas          430783.
##  5 Illinois       400346.
##  6 Massachusetts  316765.
##  7 Florida        284723.
##  8 New Jersey     273047.
##  9 Ohio           266911.
## 10 Washington     239764.
## # ℹ 40 more rows

The group_by, summarize, and ungroup verbs

summarize() takes many observations and turns them into one observation.
sum(),mean(), median(), min(), max(), n()
group_by() allows you to aggregate within groups
arrange(desc()) allows you to sort by descending order, allowing you to see the most notable results first
group by multiple columns at the same times, passing both column names through group_by() +ungroup() use when you don’t want to keep a variable as a group

Summarizing

Instructions
- Summarize the counties dataset to find the following columns: min_population (with the smallest population), max_unemployment (with the maximum unemployment), and average_income (with the mean of the income variable).

counties_selected <- counties %>%
  select(county, population, income, unemployment)

counties_selected %>%
  # Summarize to find minimum population, maximum unemployment, and average income
  summarize(min_population=min(population), max_unemployment=max(unemployment), average_income=mean(income))

## # A tibble: 1 × 3
##   min_population max_unemployment average_income
##            <dbl>            <dbl>          <dbl>
## 1             85             29.4         46832.

Summarizing by state

Instructions
- Group the data by state, and summarize to create the columns total_area (with total area in square miles) and total_population (with total population).
- Add a density column with the people per square mile, then arrange in descending order.

counties_selected <- counties %>%
  select(state, county, population, land_area)

counties_selected %>%
  # Group by state 
  group_by(state) %>%
  # Find the total area and population
  summarise(total_area=sum(land_area), total_population=sum(population))

## # A tibble: 50 × 3
##    state       total_area total_population
##    <chr>            <dbl>            <dbl>
##  1 Alabama         50645.          4830620
##  2 Alaska         553560.           725461
##  3 Arizona        113594.          6641928
##  4 Arkansas        52035.          2958208
##  5 California     155779.         38421464
##  6 Colorado       103642.          5278906
##  7 Connecticut      4842.          3593222
##  8 Delaware         1949.           926454
##  9 Florida         53625.         19645772
## 10 Georgia         57514.         10006693
## # ℹ 40 more rows

# Add a density column, then sort in descending order
counties_selected %>%
  group_by(state) %>%
  summarize(total_area = sum(land_area),
            total_population = sum(population)) %>%
  mutate(density=total_population/total_area)%>%
  arrange(desc(density))

## # A tibble: 50 × 4
##    state         total_area total_population density
##    <chr>              <dbl>            <dbl>   <dbl>
##  1 New Jersey         7354.          8904413   1211.
##  2 Rhode Island       1034.          1053661   1019.
##  3 Massachusetts      7800.          6705586    860.
##  4 Connecticut        4842.          3593222    742.
##  5 Maryland           9707.          5930538    611.
##  6 Delaware           1949.           926454    475.
##  7 New York          47126.         19673174    417.
##  8 Florida           53625.         19645772    366.
##  9 Pennsylvania      44743.         12779559    286.
## 10 Ohio              40861.         11575977    283.
## # ℹ 40 more rows

Summarizing by state and region

Instructions
- Summarize to find the total population, as a column called total_pop, in each combination of region and state, grouped in that order
- Notice the tibble is still grouped by region; use another summarize() step to calculate two new columns: the average state population in each region (average_pop) and the median state population in each region (median_pop).

counties_selected <- counties %>%
  select(region, state, county, population)

counties_selected %>%
  # Group and summarize to find the total population
  group_by(region, state) %>%
  summarize(total_pop = sum(population)) %>%
  # Calculate the average_pop and median_pop columns 
  summarize(average_pop=mean(total_pop), median_pop=median(total_pop))

## `summarise()` has grouped output by 'region'. You can override using the
## `.groups` argument.

## # A tibble: 4 × 3
##   region        average_pop median_pop
##   <chr>               <dbl>      <dbl>
## 1 North Central    5627687.    5580644
## 2 Northeast        6221058.    3593222
## 3 South            7370486     4804098
## 4 West             5722755.    2798636

The slice_min and slice_max verbs

extract the most extreme observations in each group
slice_max(column we want to base the ordering on,the number of observations to extract from each group(n))
works with group_by() verb

Selecting a county from each region

Instructions
- Find the county in each region with the highest percentage of citizens who walk to work.

counties_selected <- counties %>%
  select(region, state, county, metro, population, walk)

counties_selected %>%
  # Group by region
  group_by(region) %>%
  # Find the county with the highest percentage of people who walk to work
  slice_max(walk,n=1)

## # A tibble: 4 × 6
## # Groups:   region [4]
##   region        state        county                 metro    population  walk
##   <chr>         <chr>        <chr>                  <chr>         <dbl> <dbl>
## 1 North Central North Dakota McIntosh               Nonmetro       2759  17.5
## 2 Northeast     New York     New York               Metro       1629507  20.7
## 3 South         Virginia     Lexington city         Nonmetro       7071  31.7
## 4 West          Alaska       Aleutians East Borough Nonmetro       3304  71.2

Finding the lowest-income state in each region

Instructions
- Calculate the average income (as average_income) of counties within each region and state (notice the group_by() has already been done for you).
- Find the state with the lowest average_income in each region.

counties_selected <- counties %>%
  select(region, state, county, population, income)

counties_selected %>%
  group_by(region, state) %>%
  # Calculate average income
  summarize(average_income=mean(income))%>%
  # Find the lowest income state in each region
  slice_min(average_income,n=1)

## `summarise()` has grouped output by 'region'. You can override using the
## `.groups` argument.

## # A tibble: 4 × 3
## # Groups:   region [4]
##   region        state       average_income
##   <chr>         <chr>                <dbl>
## 1 North Central Missouri            41755.
## 2 Northeast     Maine               46142.
## 3 South         Mississippi         34939.
## 4 West          New Mexico          40184.

Using summarize, slice_max, and count together

Instructions
- For each combination of state and metro, find the total population as total_pop.
- Extract the most populated row from each state, which will be either Metro or Nonmetro.
- Ungroup, then count how often Metro or Nonmetro appears to see how many states have more people living in those areas.

counties_selected <- counties %>%
  select(state, metro, population)

counties_selected %>%
  # Find the total population for each combination of state and metro
  group_by(state, metro) %>%
  summarize(total_pop = sum(population)) %>%
  # Extract the most populated row for each state
  slice_max(total_pop, n = 1) %>%
  # Count the states with more people in Metro or Nonmetro areas
  ungroup()%>%
  count(metro)

## `summarise()` has grouped output by 'state'. You can override using the
## `.groups` argument.

## # A tibble: 2 × 2
##   metro        n
##   <chr>    <int>
## 1 Metro       44
## 2 Nonmetro     6

Data Manipulation with dplyr; Selecting and Transforming Data

Selecting

select(variable, columnx:columny)select a range of columns
select helpers: functions that specifiy criteria for choosing columns
- take strings
- select(variable,” “) selects all columns containing a specific word
- select(variable,starts_with(““)) selects only the columns that start with a particular prefix
- ends_with()
- last_col()
- matches() selects columns with a specified patterns
select(-column) removes variables from a table

Selecting columns

Instructions
- Use glimpse() to examine all the variables in the counties table.
- Select the columns for state, county, population, and (using a colon) all five of those industry-related variables; there are five consecutive variables in the table related to the industry of people’s work: professional, service, office, construction, and production.
- Arrange the table in descending order of service to find which counties have the highest rates of working in the service industry.

# Glimpse the counties table
glimpse(counties)

## Rows: 3,138
## Columns: 40
## $ census_id          <chr> "1001", "1003", "1005", "1007", "1009", "1011", "10…
## $ state              <chr> "Alabama", "Alabama", "Alabama", "Alabama", "Alabam…
## $ county             <chr> "Autauga", "Baldwin", "Barbour", "Bibb", "Blount", …
## $ region             <chr> "South", "South", "South", "South", "South", "South…
## $ metro              <chr> "Metro", "Metro", "Nonmetro", "Metro", "Metro", "No…
## $ population         <dbl> 55221, 195121, 26932, 22604, 57710, 10678, 20354, 1…
## $ men                <dbl> 26745, 95314, 14497, 12073, 28512, 5660, 9502, 5627…
## $ women              <dbl> 28476, 99807, 12435, 10531, 29198, 5018, 10852, 603…
## $ hispanic           <dbl> 2.6, 4.5, 4.6, 2.2, 8.6, 4.4, 1.2, 3.5, 0.4, 1.5, 7…
## $ white              <dbl> 75.8, 83.1, 46.2, 74.5, 87.9, 22.2, 53.3, 73.0, 57.…
## $ black              <dbl> 18.5, 9.5, 46.7, 21.4, 1.5, 70.7, 43.8, 20.3, 40.3,…
## $ native             <dbl> 0.4, 0.6, 0.2, 0.4, 0.3, 1.2, 0.1, 0.2, 0.2, 0.6, 0…
## $ asian              <dbl> 1.0, 0.7, 0.4, 0.1, 0.1, 0.2, 0.4, 0.9, 0.8, 0.3, 0…
## $ pacific            <dbl> 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0…
## $ citizens           <dbl> 40725, 147695, 20714, 17495, 42345, 8057, 15581, 88…
## $ income             <dbl> 51281, 50254, 32964, 38678, 45813, 31938, 32229, 41…
## $ income_err         <dbl> 2391, 1263, 2973, 3995, 3141, 5884, 1793, 925, 2949…
## $ income_per_cap     <dbl> 24974, 27317, 16824, 18431, 20532, 17580, 18390, 21…
## $ income_per_cap_err <dbl> 1080, 711, 798, 1618, 708, 2055, 714, 489, 1366, 15…
## $ poverty            <dbl> 12.9, 13.4, 26.7, 16.8, 16.7, 24.6, 25.4, 20.5, 21.…
## $ child_poverty      <dbl> 18.6, 19.2, 45.3, 27.9, 27.2, 38.4, 39.2, 31.6, 37.…
## $ professional       <dbl> 33.2, 33.1, 26.8, 21.5, 28.5, 18.8, 27.5, 27.3, 23.…
## $ service            <dbl> 17.0, 17.7, 16.1, 17.9, 14.1, 15.0, 16.6, 17.7, 14.…
## $ office             <dbl> 24.2, 27.1, 23.1, 17.8, 23.9, 19.7, 21.9, 24.2, 26.…
## $ construction       <dbl> 8.6, 10.8, 10.8, 19.0, 13.5, 20.1, 10.3, 10.5, 11.5…
## $ production         <dbl> 17.1, 11.2, 23.1, 23.7, 19.9, 26.4, 23.7, 20.4, 24.…
## $ drive              <dbl> 87.5, 84.7, 83.8, 83.2, 84.9, 74.9, 84.5, 85.3, 85.…
## $ carpool            <dbl> 8.8, 8.8, 10.9, 13.5, 11.2, 14.9, 12.4, 9.4, 11.9, …
## $ transit            <dbl> 0.1, 0.1, 0.4, 0.5, 0.4, 0.7, 0.0, 0.2, 0.2, 0.2, 0…
## $ walk               <dbl> 0.5, 1.0, 1.8, 0.6, 0.9, 5.0, 0.8, 1.2, 0.3, 0.6, 1…
## $ other_transp       <dbl> 1.3, 1.4, 1.5, 1.5, 0.4, 1.7, 0.6, 1.2, 0.4, 0.7, 1…
## $ work_at_home       <dbl> 1.8, 3.9, 1.6, 0.7, 2.3, 2.8, 1.7, 2.7, 2.1, 2.5, 1…
## $ mean_commute       <dbl> 26.5, 26.4, 24.1, 28.8, 34.9, 27.5, 24.6, 24.1, 25.…
## $ employed           <dbl> 23986, 85953, 8597, 8294, 22189, 3865, 7813, 47401,…
## $ private_work       <dbl> 73.6, 81.5, 71.8, 76.8, 82.0, 79.5, 77.4, 74.1, 85.…
## $ public_work        <dbl> 20.9, 12.3, 20.8, 16.1, 13.5, 15.1, 16.2, 20.8, 12.…
## $ self_employed      <dbl> 5.5, 5.8, 7.3, 6.7, 4.2, 5.4, 6.2, 5.0, 2.8, 7.9, 4…
## $ family_work        <dbl> 0.0, 0.4, 0.1, 0.4, 0.4, 0.0, 0.2, 0.1, 0.0, 0.5, 0…
## $ unemployment       <dbl> 7.6, 7.5, 17.6, 8.3, 7.7, 18.0, 10.9, 12.3, 8.9, 7.…
## $ land_area          <dbl> 594.44, 1589.78, 884.88, 622.58, 644.78, 622.81, 77…

counties %>%
  # Select state, county, population, and industry-related columns
  select(state,county,population,professional:production) %>%
  # Arrange service in descending order 
  arrange(desc(service))

## # A tibble: 3,138 × 8
##    state   county population professional service office construction production
##    <chr>   <chr>       <dbl>        <dbl>   <dbl>  <dbl>        <dbl>      <dbl>
##  1 Missis… Tunica      10477         23.9    36.6   21.5          3.5       14.5
##  2 Texas   Kinney       3577         30      36.5   11.6         20.5        1.3
##  3 Texas   Kenedy        565         24.9    34.1   20.5         20.5        0  
##  4 New Yo… Bronx     1428357         24.3    33.3   24.2          7.1       11  
##  5 Texas   Brooks       7221         19.6    32.4   25.3         11.1       11.5
##  6 Colora… Fremo…      46809         26.6    32.2   22.8         10.7        7.6
##  7 Texas   Culbe…       2296         20.1    32.2   24.2         15.7        7.8
##  8 Califo… Del N…      27788         33.9    31.5   18.8          8.9        6.8
##  9 Minnes… Mahno…       5496         26.8    31.5   18.7         13.1        9.9
## 10 Virgin… Lanca…      11129         30.3    31.2   22.8          8.1        7.6
## # ℹ 3,128 more rows

Select helpers

Instructions
- Select the columns state, county, population, and all those that end with work.
- Filter just for the counties where at least 50% of the population is engaged in public work.

counties %>%
  # Select the state, county, population, and those ending with "work"
  select(state, county, population, ends_with("work")) %>%
  # Filter for counties that have at least 50% of people engaged in public work
  filter(public_work >= 50)

## # A tibble: 7 × 6
##   state        county            population private_work public_work family_work
##   <chr>        <chr>                  <dbl>        <dbl>       <dbl>       <dbl>
## 1 Alaska       Lake and Peninsu…       1474         42.2        51.6         0.2
## 2 Alaska       Yukon-Koyukuk Ce…       5644         33.3        61.7         0  
## 3 California   Lassen                 32645         42.6        50.5         0.1
## 4 Hawaii       Kalawao                   85         25          64.1         0  
## 5 North Dakota Sioux                   4380         32.9        56.8         0.1
## 6 South Dakota Todd                    9942         34.4        55           0.8
## 7 Wisconsin    Menominee               4451         36.8        59.1         0.4

The rename verb

rename(new column name= old column name)
while we can rename inside of select (…, new column name = old column name), we’d have to name all the columns we want keep along with it. With rename () we can just keep one column who’s name we want to change.

Renaming a column after count

Instructions
- Use count() to determine how many counties are in each state.
- Notice the n column in the output; use rename() to rename that to num_counties.

counties %>%
  # Count the number of counties in each state
  count(state) %>%
  # Rename the n column to num_counties
  rename(num_counties=n)

## # A tibble: 50 × 2
##    state       num_counties
##    <chr>              <int>
##  1 Alabama               67
##  2 Alaska                28
##  3 Arizona               15
##  4 Arkansas              75
##  5 California            58
##  6 Colorado              64
##  7 Connecticut            8
##  8 Delaware               3
##  9 Florida               67
## 10 Georgia              159
## # ℹ 40 more rows

Renaming a column as part of a select

Instructions
- Select the columns state, county, and poverty from the counties dataset; in the same step, rename the poverty column to poverty_rate.

counties %>%
  # Select state, county, and poverty as poverty_rate
  select(state, county, poverty,poverty_rate=poverty)

## # A tibble: 3,138 × 3
##    state   county   poverty_rate
##    <chr>   <chr>           <dbl>
##  1 Alabama Autauga          12.9
##  2 Alabama Baldwin          13.4
##  3 Alabama Barbour          26.7
##  4 Alabama Bibb             16.8
##  5 Alabama Blount           16.7
##  6 Alabama Bullock          24.6
##  7 Alabama Butler           25.4
##  8 Alabama Calhoun          20.5
##  9 Alabama Chambers         21.6
## 10 Alabama Cherokee         19.2
## # ℹ 3,128 more rows

The transmute verb

a combination of select() and mutate()
returns a subset of columns that are transformed and changed at the same time
keep the columns that you want, rename them and do calculations at the same time

Choosing among verbs

Which of the following verbs would you use to calculate new columns while dropping other columns? + transmute

Using transmute

Instructions
- Keep only the state, county, and population columns, and add a new column, density, that contains the population per land_area.
- Filter for only counties with a population greater than one million.
- Sort the table in ascending order of density.

counties %>%
  # Keep the state, county, and populations columns, and add a density column
  transmute(state,county,population,density=population/land_area) %>%
  # Filter for counties with a population greater than one million 
  filter(population>1000000) %>%
  # Sort density in ascending order 
  arrange(density)

## # A tibble: 41 × 4
##    state      county         population density
##    <chr>      <chr>               <dbl>   <dbl>
##  1 California San Bernardino    2094769    104.
##  2 Nevada     Clark             2035572    258.
##  3 California Riverside         2298032    319.
##  4 Arizona    Maricopa          4018143    437.
##  5 Florida    Palm Beach        1378806    700.
##  6 California San Diego         3223096    766.
##  7 Washington King              2045756    967.
##  8 Texas      Travis            1121645   1133.
##  9 Florida    Hillsborough      1302884   1277.
## 10 Florida    Orange            1229039   1360.
## # ℹ 31 more rows

Matching verbs to their definitions

rename
- Leaves the columns you don’t mention alone; doesn’t allow you to calculate or change values.
transmute
- Must mention all the columns you keep; allows you to calculate or change values.
mutate
- Leaves the columns you don’t mention alone; allows you to calculate or change values

Choosing among the four verbs

Instructions
- Choose the right verb for changing the name of the unemployment column to unemployment_rate
- Choose the right verb for keeping only the columns state, county, and the ones containing poverty.
- Calculate a new column called fraction_women with the fraction of the population made up of women, without dropping any columns.
- Keep only three columns: the state, county, and employed / population, which you’ll call employment_rate.

# Change the name of the unemployment column
counties %>%
  rename(unemployment_rate = unemployment)

## # A tibble: 3,138 × 40
##    census_id state   county   region metro population   men women hispanic white
##    <chr>     <chr>   <chr>    <chr>  <chr>      <dbl> <dbl> <dbl>    <dbl> <dbl>
##  1 1001      Alabama Autauga  South  Metro      55221 26745 28476      2.6  75.8
##  2 1003      Alabama Baldwin  South  Metro     195121 95314 99807      4.5  83.1
##  3 1005      Alabama Barbour  South  Nonm…      26932 14497 12435      4.6  46.2
##  4 1007      Alabama Bibb     South  Metro      22604 12073 10531      2.2  74.5
##  5 1009      Alabama Blount   South  Metro      57710 28512 29198      8.6  87.9
##  6 1011      Alabama Bullock  South  Nonm…      10678  5660  5018      4.4  22.2
##  7 1013      Alabama Butler   South  Nonm…      20354  9502 10852      1.2  53.3
##  8 1015      Alabama Calhoun  South  Metro     116648 56274 60374      3.5  73  
##  9 1017      Alabama Chambers South  Nonm…      34079 16258 17821      0.4  57.3
## 10 1019      Alabama Cherokee South  Nonm…      26008 12975 13033      1.5  91.7
## # ℹ 3,128 more rows
## # ℹ 30 more variables: black <dbl>, native <dbl>, asian <dbl>, pacific <dbl>,
## #   citizens <dbl>, income <dbl>, income_err <dbl>, income_per_cap <dbl>,
## #   income_per_cap_err <dbl>, poverty <dbl>, child_poverty <dbl>,
## #   professional <dbl>, service <dbl>, office <dbl>, construction <dbl>,
## #   production <dbl>, drive <dbl>, carpool <dbl>, transit <dbl>, walk <dbl>,
## #   other_transp <dbl>, work_at_home <dbl>, mean_commute <dbl>, …

# Keep the state and county columns, and the columns containing poverty
counties %>%
  select(state, county, contains("poverty"))

## # A tibble: 3,138 × 4
##    state   county   poverty child_poverty
##    <chr>   <chr>      <dbl>         <dbl>
##  1 Alabama Autauga     12.9          18.6
##  2 Alabama Baldwin     13.4          19.2
##  3 Alabama Barbour     26.7          45.3
##  4 Alabama Bibb        16.8          27.9
##  5 Alabama Blount      16.7          27.2
##  6 Alabama Bullock     24.6          38.4
##  7 Alabama Butler      25.4          39.2
##  8 Alabama Calhoun     20.5          31.6
##  9 Alabama Chambers    21.6          37.2
## 10 Alabama Cherokee    19.2          30.1
## # ℹ 3,128 more rows

# Calculate the fraction_women column without dropping the other columns
counties %>%
  mutate(fraction_women = women / population)

## # A tibble: 3,138 × 41
##    census_id state   county   region metro population   men women hispanic white
##    <chr>     <chr>   <chr>    <chr>  <chr>      <dbl> <dbl> <dbl>    <dbl> <dbl>
##  1 1001      Alabama Autauga  South  Metro      55221 26745 28476      2.6  75.8
##  2 1003      Alabama Baldwin  South  Metro     195121 95314 99807      4.5  83.1
##  3 1005      Alabama Barbour  South  Nonm…      26932 14497 12435      4.6  46.2
##  4 1007      Alabama Bibb     South  Metro      22604 12073 10531      2.2  74.5
##  5 1009      Alabama Blount   South  Metro      57710 28512 29198      8.6  87.9
##  6 1011      Alabama Bullock  South  Nonm…      10678  5660  5018      4.4  22.2
##  7 1013      Alabama Butler   South  Nonm…      20354  9502 10852      1.2  53.3
##  8 1015      Alabama Calhoun  South  Metro     116648 56274 60374      3.5  73  
##  9 1017      Alabama Chambers South  Nonm…      34079 16258 17821      0.4  57.3
## 10 1019      Alabama Cherokee South  Nonm…      26008 12975 13033      1.5  91.7
## # ℹ 3,128 more rows
## # ℹ 31 more variables: black <dbl>, native <dbl>, asian <dbl>, pacific <dbl>,
## #   citizens <dbl>, income <dbl>, income_err <dbl>, income_per_cap <dbl>,
## #   income_per_cap_err <dbl>, poverty <dbl>, child_poverty <dbl>,
## #   professional <dbl>, service <dbl>, office <dbl>, construction <dbl>,
## #   production <dbl>, drive <dbl>, carpool <dbl>, transit <dbl>, walk <dbl>,
## #   other_transp <dbl>, work_at_home <dbl>, mean_commute <dbl>, …

# Keep only the state, county, and employment_rate columns
counties %>%
  transmute(state, county, employment_rate = employed / population)

## # A tibble: 3,138 × 3
##    state   county   employment_rate
##    <chr>   <chr>              <dbl>
##  1 Alabama Autauga            0.434
##  2 Alabama Baldwin            0.441
##  3 Alabama Barbour            0.319
##  4 Alabama Bibb               0.367
##  5 Alabama Blount             0.384
##  6 Alabama Bullock            0.362
##  7 Alabama Butler             0.384
##  8 Alabama Calhoun            0.406
##  9 Alabama Chambers           0.402
## 10 Alabama Cherokee           0.390
## # ℹ 3,128 more rows

babynames<-read_rds("/Users/anikalewis/Downloads/babynames.rds")

Data Manipulation with dplyr; Case Study; The babynames Dataset

The babynames data + filter for multiple variations of a variable + filter(column %in% c(“variable1”, “variable2”))

Filtering and arranging for one year

Instructions
- Filter for only the year 1990.
- Sort the table in descending order of the number of babies born.

babynames %>%
  # Filter for the year 1990
  filter(year==1990) %>%
  # Sort the number column in descending order 
  arrange(desc(number))

## # A tibble: 21,223 × 3
##     year name        number
##    <dbl> <chr>        <int>
##  1  1990 Michael      65560
##  2  1990 Christopher  52520
##  3  1990 Jessica      46615
##  4  1990 Ashley       45797
##  5  1990 Matthew      44925
##  6  1990 Joshua       43382
##  7  1990 Brittany     36650
##  8  1990 Amanda       34504
##  9  1990 Daniel       33963
## 10  1990 David        33862
## # ℹ 21,213 more rows

Finding the most popular names each year

Instructions
- Use group_by() and slice_max() to find the most common name for US babies in each year.

babynames %>%
  # Find the most common name in each year
  group_by(year) %>%
  slice_max(number, n = 1)

## # A tibble: 28 × 3
## # Groups:   year [28]
##     year name  number
##    <dbl> <chr>  <int>
##  1  1880 John    9701
##  2  1885 Mary    9166
##  3  1890 Mary   12113
##  4  1895 Mary   13493
##  5  1900 Mary   16781
##  6  1905 Mary   16135
##  7  1910 Mary   22947
##  8  1915 Mary   58346
##  9  1920 Mary   71175
## 10  1925 Mary   70857
## # ℹ 18 more rows

Visualizing names with ggplot2

Instructions
- Filter for only the names Steven, Thomas, and Matthew, and assign it to an object called selected_names.
- Visualize those three names as a line plot over time, with each name represented by a different color.

selected_names <- babynames %>%
  # Filter for the names Steven, Thomas, and Matthew 
  filter(name %in% c("Steven", "Thomas", "Matthew"))

# Plot the names using a different color for each name
ggplot(selected_names, aes(x = year, y = number, color = name)) +
  geom_line()

Grouped mutates

group_by() %>%
- mutate(=)
- ungroup()
  - Notice from the header that the table is still grouped by year, which could affect other verbs we want to use in the future. In particular, it can make other mutates or filters slower to run, especially if there are a lot of groups in the table.

Finding the year and each name is most common

Instructions
- First, calculate the total number of people born in that year in this dataset as year_total.
- Next, use year_total to calculate the fraction of people born in each year that have each name.
- Now use your newly calculated fraction column, in combination with slice_max(), to identify the year each name was most common.

# Calculate the fraction of people born each year with the same name
babynames %>%
  group_by(year) %>%
  mutate(year_total = sum(number)) %>%
  ungroup() %>%
  mutate(fraction = number / year_total) %>%
  # Find the year each name is most common
  group_by(name) %>%
  slice_max(fraction, n = 1)

## # A tibble: 48,040 × 5
## # Groups:   name [48,040]
##     year name     number year_total   fraction
##    <dbl> <chr>     <int>      <int>      <dbl>
##  1  2015 Aaban        15    3648781 0.00000411
##  2  2015 Aadam        22    3648781 0.00000603
##  3  2010 Aadan        11    3672066 0.00000300
##  4  2015 Aadarsh      15    3648781 0.00000411
##  5  2010 Aaden       450    3672066 0.000123  
##  6  2015 Aadhav       31    3648781 0.00000850
##  7  2015 Aadhavan      5    3648781 0.00000137
##  8  2015 Aadhya      265    3648781 0.0000726 
##  9  2010 Aadi         54    3672066 0.0000147 
## 10  2005 Aadil        20    3828460 0.00000522
## # ℹ 48,030 more rows

Adding the total and maximum for each name

Instructions
- Use a grouped mutate to add two columns:
  - name_total: the sum of the number of babies born with that name in the entire dataset.
  - name_max: the maximum number of babies born with that name in any year.
  - Add another step to ungroup the table.
  - Add a column called fraction_max containing the number in the year divided by name_max.

babynames %>%
  # Add columns name_total and name_max for each name
  group_by(name) %>%
  mutate(name_total = sum(number),
         name_max = max(number)) %>%
  # Ungroup the table 
  ungroup() %>%
  # Add the fraction_max column containing the number by the name maximum 
  mutate(fraction_max=number/name_max)

## # A tibble: 332,595 × 6
##     year name    number name_total name_max fraction_max
##    <dbl> <chr>    <int>      <int>    <int>        <dbl>
##  1  1880 Aaron      102     114739    14635     0.00697 
##  2  1880 Ab           5         77       31     0.161   
##  3  1880 Abbie       71       4330      445     0.160   
##  4  1880 Abbott       5        217       51     0.0980  
##  5  1880 Abby         6      11272     1753     0.00342 
##  6  1880 Abe         50       1832      271     0.185   
##  7  1880 Abel         9      10565     3245     0.00277 
##  8  1880 Abigail     12      72600    15762     0.000761
##  9  1880 Abner       27       1552      199     0.136   
## 10  1880 Abraham     81      17882     2449     0.0331  
## # ℹ 332,585 more rows

Visualizing the normalized change in popularity

Instructions
- Filter the names_normalized table to limit it to the three names Steven, Thomas, and Matthew.
- Create a line plot to visualize fraction_max over time, colored by name.

names_normalized <- babynames %>%
                     group_by(name) %>%
                     mutate(name_total = sum(number),
                            name_max = max(number)) %>%
                     ungroup() %>%
                     mutate(fraction_max = number / name_max)

names_filtered <- names_normalized %>%
  # Filter for the names Steven, Thomas, and Matthew
  filter(name %in% c("Steven", "Thomas", "Matthew")) 

# Visualize these names over time
ggplot(data=names_filtered,aes(x=year,y=fraction_max, color=name)) + geom_line()

Window functions

window function takes a vector and returns another vector of the same length
lag() moves each iten to the right by 1. Can compare consecutive steps and calculate the changes.

Using ratios to describe the frequency of a name

Instructions
- Arrange the data in ascending order of name and then year.
- Group by name so that your mutate works within each name.
- Add a column ratio containing the ratio (not difference) of fraction between each year.

babynames_fraction<-babynames %>%
  group_by(year) %>%
  mutate(year_total = sum(number)) %>%
  ungroup() %>%
  mutate(fraction = number / year_total) 

babynames_fraction %>%
  # Arrange the data in order of name, then year 
  arrange(name, year) %>%
  # Group the data by name
  group_by(name) %>%
  # Add a ratio column that contains the ratio of fraction between each year 
  mutate(ratio = fraction / lag(fraction))

## # A tibble: 332,595 × 6
## # Groups:   name [48,040]
##     year name    number year_total   fraction  ratio
##    <dbl> <chr>    <int>      <int>      <dbl>  <dbl>
##  1  2010 Aaban        9    3672066 0.00000245 NA    
##  2  2015 Aaban       15    3648781 0.00000411  1.68 
##  3  1995 Aadam        6    3652750 0.00000164 NA    
##  4  2000 Aadam        6    3767293 0.00000159  0.970
##  5  2005 Aadam        6    3828460 0.00000157  0.984
##  6  2010 Aadam        7    3672066 0.00000191  1.22 
##  7  2015 Aadam       22    3648781 0.00000603  3.16 
##  8  2010 Aadan       11    3672066 0.00000300 NA    
##  9  2015 Aadan       10    3648781 0.00000274  0.915
## 10  2000 Aadarsh      5    3767293 0.00000133 NA    
## # ℹ 332,585 more rows

Biggest jumps in a name

Instructions
- From each name in the data, keep the observation (the year) with the largest ratio; note the data is already grouped by name.
- Sort the ratio column in descending order.
- Filter the babynames_ratios_filtered data further by filtering the fraction column to only display results greater than or equal to 0.001.

babynames_ratios_filtered <- babynames_fraction %>%
                     arrange(name, year) %>%
                     group_by(name) %>%
                     mutate(ratio = fraction / lag(fraction)) %>%
                     filter(fraction >= 0.00001)

babynames_ratios_filtered %>%
  # Extract the largest ratio from each name 
  slice_max(ratio,n=1) %>%
  # Sort the ratio column in descending order 
  arrange(desc(ratio)) %>%
  # Filter for fractions greater than or equal to 0.001
  filter(fraction>=0.001)

## # A tibble: 291 × 6
## # Groups:   name [291]
##     year name    number year_total fraction ratio
##    <dbl> <chr>    <int>      <int>    <dbl> <dbl>
##  1  1960 Tammy    14365    4152075  0.00346  70.1
##  2  2005 Nevaeh    4610    3828460  0.00120  45.8
##  3  1940 Brenda    5460    2301630  0.00237  37.5
##  4  1885 Grover     774     240822  0.00321  36.0
##  5  1945 Cheryl    8170    2652029  0.00308  24.9
##  6  1955 Lori      4980    4012691  0.00124  23.2
##  7  2010 Khloe     5411    3672066  0.00147  23.2
##  8  1950 Debra     6189    3502592  0.00177  22.6
##  9  2010 Bentley   4001    3672066  0.00109  22.4
## 10  1935 Marlene   4840    2088487  0.00232  16.8
## # ℹ 281 more rows

Data Manipulation with dplyr

Anika

2023-07-25

Data Manipulation with dplyr; Transforming Data with dplyr

Exploring data with dplyr

Understanding your data

Selecting columns

The filter and arrange verbs

Arranging observations

Filtering for conditions

Filtering and arranging

The mutate() verb

Calculating the number of government employees

Calculating the percentage of women in a county

Select, mutate, filter, and arrange

Data Manipulation with dplyr; Aggregating Data

The count verb

Counting by region

Counting citizens by state

Mutating and counting

The group_by, summarize, and ungroup verbs

Summarizing

Summarizing by state

Summarizing by state and region

The slice_min and slice_max verbs

Selecting a county from each region

Finding the lowest-income state in each region

Using summarize, slice_max, and count together

Data Manipulation with dplyr; Selecting and Transforming Data

Selecting

Selecting columns

Select helpers

The rename verb

Renaming a column after count

Renaming a column as part of a select

The transmute verb

Choosing among verbs

Using transmute

Matching verbs to their definitions

Choosing among the four verbs

Data Manipulation with dplyr; Case Study; The babynames Dataset

Filtering and arranging for one year

Finding the most popular names each year

Visualizing names with ggplot2

Grouped mutates

Finding the year and each name is most common

Adding the total and maximum for each name

Visualizing the normalized change in popularity

Window functions

Using ratios to describe the frequency of a name

Biggest jumps in a name