Dplyr Exercises

Harold Nelson

2/1/2022

Setup

library(tidyverse)
load("county.rda")
load("cdc.Rdata")

Exercise 1

Create a subset of cdc, cdc2 which contains only gender, height and weight. Use head() to look at it.

Solution

cdc2 = cdc %>%
  select(gender, height, weight) 
head(cdc2)
##   gender height weight
## 1      m     70    175
## 2      f     64    125
## 3      f     60    105
## 4      f     66    132
## 5      f     61    150
## 6      f     64    114

Exercise 2

Read the documentation on select in the dplyr package. Then create a dataframe cdc2 starting with cdc and eliminating the variable exerany.

Solution

cdc2 = cdc %>%
  select(-exerany) %>%
  glimpse()
## Rows: 20,000
## Columns: 8
## $ genhlth  <fct> good, good, good, good, very good, very good, very good, very…
## $ hlthplan <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1…
## $ smoke100 <dbl> 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0…
## $ height   <dbl> 70, 64, 60, 66, 61, 64, 71, 67, 65, 70, 69, 69, 66, 70, 69, 7…
## $ weight   <int> 175, 125, 105, 132, 150, 114, 194, 170, 150, 180, 186, 168, 1…
## $ wtdesire <int> 175, 115, 105, 124, 130, 114, 185, 160, 130, 170, 175, 148, 2…
## $ age      <int> 77, 33, 49, 42, 55, 55, 31, 45, 27, 44, 46, 62, 21, 69, 23, 7…
## $ gender   <fct> m, f, f, f, f, f, m, m, f, m, m, m, m, m, m, m, m, m, m, f, f…
cdc3 = cdc %>%
  select(!exerany) %>%
  glimpse()
## Rows: 20,000
## Columns: 8
## $ genhlth  <fct> good, good, good, good, very good, very good, very good, very…
## $ hlthplan <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1…
## $ smoke100 <dbl> 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0…
## $ height   <dbl> 70, 64, 60, 66, 61, 64, 71, 67, 65, 70, 69, 69, 66, 70, 69, 7…
## $ weight   <int> 175, 125, 105, 132, 150, 114, 194, 170, 150, 180, 186, 168, 1…
## $ wtdesire <int> 175, 115, 105, 124, 130, 114, 185, 160, 130, 170, 175, 148, 2…
## $ age      <int> 77, 33, 49, 42, 55, 55, 31, 45, 27, 44, 46, 62, 21, 69, 23, 7…
## $ gender   <fct> m, f, f, f, f, f, m, m, f, m, m, m, m, m, m, m, m, m, m, f, f…

Exercise 3

Create cdc2 by removing the variables exerany and genhlth from cdc.

Solution

cdc2 = cdc %>% 
  select(!c(exerany,genhlth)) %>% 
  glimpse()
## Rows: 20,000
## Columns: 7
## $ hlthplan <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1…
## $ smoke100 <dbl> 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0…
## $ height   <dbl> 70, 64, 60, 66, 61, 64, 71, 67, 65, 70, 69, 69, 66, 70, 69, 7…
## $ weight   <int> 175, 125, 105, 132, 150, 114, 194, 170, 150, 180, 186, 168, 1…
## $ wtdesire <int> 175, 115, 105, 124, 130, 114, 185, 160, 130, 170, 175, 148, 2…
## $ age      <int> 77, 33, 49, 42, 55, 55, 31, 45, 27, 44, 46, 62, 21, 69, 23, 7…
## $ gender   <fct> m, f, f, f, f, f, m, m, f, m, m, m, m, m, m, m, m, m, m, f, f…
cdc2 = cdc %>% 
  select(-c(exerany,genhlth)) %>% 
  glimpse()
## Rows: 20,000
## Columns: 7
## $ hlthplan <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1…
## $ smoke100 <dbl> 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0…
## $ height   <dbl> 70, 64, 60, 66, 61, 64, 71, 67, 65, 70, 69, 69, 66, 70, 69, 7…
## $ weight   <int> 175, 125, 105, 132, 150, 114, 194, 170, 150, 180, 186, 168, 1…
## $ wtdesire <int> 175, 115, 105, 124, 130, 114, 185, 160, 130, 170, 175, 148, 2…
## $ age      <int> 77, 33, 49, 42, 55, 55, 31, 45, 27, 44, 46, 62, 21, 69, 23, 7…
## $ gender   <fct> m, f, f, f, f, f, m, m, f, m, m, m, m, m, m, m, m, m, m, f, f…
# This doesn't work

# cdc2 = cdc %>% 
#  select(-exerany, -genhlth)) %>% 
# glimpse()

Exercise 4

Create a dataframe strong_couch which contains people who don’t exercise and do smoke but have excellent general health. Start with cdc. Use glimpse() to examine the results. Get a table of gender.

Solution

strong_couch = cdc %>% 
  filter(exerany == 0 & genhlth == "excellent" & smoke100 == 1 ) %>%
  glimpse()
## Rows: 304
## Columns: 9
## $ genhlth  <fct> excellent, excellent, excellent, excellent, excellent, excell…
## $ exerany  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ hlthplan <dbl> 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1…
## $ smoke100 <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
## $ height   <dbl> 70, 66, 66, 69, 70, 68, 72, 69, 67, 60, 67, 63, 73, 64, 64, 7…
## $ weight   <int> 235, 134, 138, 210, 180, 180, 248, 150, 199, 113, 135, 200, 2…
## $ wtdesire <int> 175, 134, 138, 185, 175, 180, 225, 165, 145, 113, 135, 150, 2…
## $ age      <int> 56, 60, 21, 45, 25, 56, 71, 35, 49, 68, 50, 69, 42, 31, 22, 3…
## $ gender   <fct> m, f, f, m, m, m, m, m, f, f, m, f, m, m, m, m, f, f, m, f, m…
table(strong_couch$gender)
## 
##   m   f 
## 153 151

Exercise 5

Create cdc2 from cdc by adding a factor version of smoke100 with labels “Non-smoker” and “Smoker”.

Solution

cdc4 = cdc %>% 
  mutate(smokef = factor(smoke100,labels=c("Non-smoker","Smoker"))) %>% glimpse()
## Rows: 20,000
## Columns: 10
## $ genhlth  <fct> good, good, good, good, very good, very good, very good, very…
## $ exerany  <dbl> 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1…
## $ hlthplan <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1…
## $ smoke100 <dbl> 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0…
## $ height   <dbl> 70, 64, 60, 66, 61, 64, 71, 67, 65, 70, 69, 69, 66, 70, 69, 7…
## $ weight   <int> 175, 125, 105, 132, 150, 114, 194, 170, 150, 180, 186, 168, 1…
## $ wtdesire <int> 175, 115, 105, 124, 130, 114, 185, 160, 130, 170, 175, 148, 2…
## $ age      <int> 77, 33, 49, 42, 55, 55, 31, 45, 27, 44, 46, 62, 21, 69, 23, 7…
## $ gender   <fct> m, f, f, f, f, f, m, m, f, m, m, m, m, m, m, m, m, m, m, f, f…
## $ smokef   <fct> Non-smoker, Smoker, Smoker, Non-smoker, Non-smoker, Non-smoke…

Exercise 6

Use mutate to create a variable odd. This variable is true if a man weighs less than 150 or a woman weighs more than 200. Use mean() and sum() in summarize() to get the fraction of odd people and the count of odd people in cdc.

Solution

cdc %>% 
  mutate(odd = (gender == "m" & weight < 150) |
               (gender == "f" & weight > 200)) %>% 
  summarize(mean(odd) , sum(odd))
##   mean(odd) sum(odd)
## 1   0.07895     1579

Exercise 7

Create a dataframe health_gender from cdc with one row for every combination of genhlth and gender. Use summarize to get the count of cases and the mean of weight. Don’t forget to ungroup(). Use glimpse to see the result.

Solution

health_gender = cdc %>% 
  group_by(genhlth,gender) %>% 
  summarize(count = n(), 
             mean_weight = mean(weight)) %>% 
  ungroup() %>% 
  glimpse()
## `summarise()` has grouped output by 'genhlth'. You can override using the
## `.groups` argument.
## Rows: 10
## Columns: 4
## $ genhlth     <fct> excellent, excellent, very good, very good, good, good, fa…
## $ gender      <fct> m, f, m, f, m, f, m, f, m, f
## $ count       <int> 2298, 2359, 3382, 3590, 2722, 2953, 884, 1135, 283, 394
## $ mean_weight <dbl> 182.8742, 142.1174, 189.1470, 150.4309, 193.1227, 154.8629…

Exercise 8

EXtend your code from the previous exercise to sort your dataframe in ascending order by count. Use head() to see the result.

Solution

health_gender = cdc %>% 
  group_by(genhlth,gender) %>% 
  summarize(count = n(), 
             mean_weight = mean(weight)) %>% 
  ungroup() %>% 
  arrange(count) 
## `summarise()` has grouped output by 'genhlth'. You can override using the
## `.groups` argument.
  head(health_gender)
## # A tibble: 6 × 4
##   genhlth   gender count mean_weight
##   <fct>     <fct>  <int>       <dbl>
## 1 poor      m        283        193.
## 2 poor      f        394        165.
## 3 fair      m        884        194.
## 4 fair      f       1135        162.
## 5 excellent m       2298        183.
## 6 excellent f       2359        142.

Exercise 9

Repeat the previous exercise, but sort in descending order by mean_weight.

Solution

health_gender = cdc %>% 
  group_by(genhlth,gender) %>% 
  summarize(count = n(), 
             mean_weight = mean(weight)) %>% 
  ungroup() %>% 
  arrange(desc(mean_weight)) 
## `summarise()` has grouped output by 'genhlth'. You can override using the
## `.groups` argument.
  head(health_gender)
## # A tibble: 6 × 4
##   genhlth   gender count mean_weight
##   <fct>     <fct>  <int>       <dbl>
## 1 fair      m        884        194.
## 2 poor      m        283        193.
## 3 good      m       2722        193.
## 4 very good m       3382        189.
## 5 excellent m       2298        183.
## 6 poor      f        394        165.

Exercise 10

Use the dataframe county from the file “county.rda”. Do a summary of the dataframe to look for anomalies.

Solution

summary(county)
##                 name           state         pop2000           pop2010       
##  Washington County:  30   Texas   : 254   Min.   :     67   Min.   :     82  
##  Jefferson County :  25   Georgia : 159   1st Qu.:  11224   1st Qu.:  11114  
##  Franklin County  :  24   Virginia: 133   Median :  24621   Median :  25872  
##  Jackson County   :  23   Kentucky: 120   Mean   :  89650   Mean   :  98262  
##  Lincoln County   :  23   Missouri: 115   3rd Qu.:  61775   3rd Qu.:  66780  
##  Madison County   :  19   Kansas  : 105   Max.   :9519338   Max.   :9818605  
##  (Other)          :2998   (Other) :2256   NA's   :3                          
##     pop2017           pop_change          poverty      homeownership  
##  Min.   :      88   Min.   :-33.6300   Min.   : 2.40   Min.   : 0.00  
##  1st Qu.:   10976   1st Qu.: -1.9700   1st Qu.:11.30   1st Qu.:69.50  
##  Median :   25857   Median : -0.0600   Median :15.20   Median :74.60  
##  Mean   :  103763   Mean   :  0.5339   Mean   :15.97   Mean   :73.27  
##  3rd Qu.:   67756   3rd Qu.:  2.3750   3rd Qu.:19.40   3rd Qu.:78.40  
##  Max.   :10163507   Max.   : 37.1900   Max.   :52.00   Max.   :91.30  
##  NA's   :3          NA's   :3          NA's   :2                      
##    multi_unit    unemployment_rate  metro             median_edu  
##  Min.   : 0.00   Min.   : 1.620    no  :1974   below_hs    :   2  
##  1st Qu.: 6.10   1st Qu.: 3.520    yes :1165   hs_diploma  :1397  
##  Median : 9.70   Median : 4.360    NA's:   3   some_college:1695  
##  Mean   :12.32   Mean   : 4.611                bachelors   :  46  
##  3rd Qu.:15.90   3rd Qu.: 5.355                NA's        :   2  
##  Max.   :98.50   Max.   :19.070                                   
##                  NA's   :3                                        
##  per_capita_income median_hh_income   smoking_ban  
##  Min.   :10467     Min.   : 19264   none    :1927  
##  1st Qu.:21772     1st Qu.: 41126   partial : 635  
##  Median :25445     Median : 48072   complete:   0  
##  Mean   :26093     Mean   : 49765   NA's    : 580  
##  3rd Qu.:29276     3rd Qu.: 55771                  
##  Max.   :69533     Max.   :129588                  
##  NA's   :2         NA's   :2

Exercise 11 NA?

Where are the NA values coming from?

Solution

county %>% 
  filter(is.na(pop2017) |
         is.na(pop2000) |
         is.na(pop_change) |
         is.na(median_edu) |
         is.na(median_hh_income) |
         is.na(unemployment_rate) |
         is.na(metro) |
         is.na(per_capita_income))
## # A tibble: 7 × 15
##   name     state pop2000 pop2010 pop2017 pop_c…¹ poverty homeo…² multi…³ unemp…⁴
##   <fct>    <fct>   <dbl>   <dbl>   <int>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>
## 1 Hoonah–… Alas…    3436    2150      NA   NA       11.1    64       8.6   12.6 
## 2 Skagway  Alas…      NA     968      NA   NA        5.6    59.1    27.2   10.6 
## 3 Kusilva… Alas…    7028    7459    8202    2.91    NA      64.8     4.1   NA   
## 4 Wrangell Alas…      NA    2369      NA   NA       11.7    78.7    11.9    7.71
## 5 Broomfi… Colo…      NA   55889   68341   13.8      5.2    74.4    21.7    2.63
## 6 Kalawao… Hawa…     147      90      88   -1.12    12.7     0      22.3   NA   
## 7 Oglala … Sout…   12466   13586   14354    1.59    NA      51.3     3.8   NA   
## # … with 5 more variables: metro <fct>, median_edu <fct>,
## #   per_capita_income <dbl>, median_hh_income <int>, smoking_ban <fct>, and
## #   abbreviated variable names ¹​pop_change, ²​homeownership, ³​multi_unit,
## #   ⁴​unemployment_rate

Exercise 12 Cleanup

Replace county with county_clean, from which all the bad data has been removed. Research: Remove cases with missing values using dplyr.

Repeat the summary and verify.

Solution

Research. Remove cases with missing values using dplyr.

county_clean = county %>% 
  select(-smoking_ban) %>% 
  drop_na() 

summary(county_clean)
##                 name           state         pop2000           pop2010       
##  Washington County:  30   Texas   : 254   Min.   :     67   Min.   :     82  
##  Jefferson County :  25   Georgia : 159   1st Qu.:  11260   1st Qu.:  11154  
##  Franklin County  :  24   Virginia: 133   Median :  24663   Median :  25910  
##  Jackson County   :  23   Kentucky: 120   Mean   :  89757   Mean   :  98455  
##  Lincoln County   :  23   Missouri: 115   3rd Qu.:  61896   3rd Qu.:  67016  
##  Madison County   :  19   Kansas  : 105   Max.   :9519338   Max.   :9818605  
##  (Other)          :2991   (Other) :2249                                      
##     pop2017           pop_change          poverty      homeownership 
##  Min.   :     134   Min.   :-33.6300   Min.   : 2.40   Min.   :20.7  
##  1st Qu.:   11004   1st Qu.: -1.9750   1st Qu.:11.35   1st Qu.:69.5  
##  Median :   25862   Median : -0.0700   Median :15.20   Median :74.6  
##  Mean   :  103867   Mean   :  0.5292   Mean   :15.98   Mean   :73.3  
##  3rd Qu.:   67756   3rd Qu.:  2.3700   3rd Qu.:19.40   3rd Qu.:78.4  
##  Max.   :10163507   Max.   : 37.1900   Max.   :52.00   Max.   :91.3  
##                                                                      
##    multi_unit    unemployment_rate metro             median_edu  
##  Min.   : 0.00   Min.   : 1.620    no :1971   below_hs    :   2  
##  1st Qu.: 6.10   1st Qu.: 3.520    yes:1164   hs_diploma  :1397  
##  Median : 9.70   Median : 4.360               some_college:1691  
##  Mean   :12.32   Mean   : 4.607               bachelors   :  45  
##  3rd Qu.:15.90   3rd Qu.: 5.350                                  
##  Max.   :98.50   Max.   :19.070                                  
##                                                                  
##  per_capita_income median_hh_income
##  Min.   :10467     Min.   : 19264  
##  1st Qu.:21765     1st Qu.: 41124  
##  Median :25442     Median : 48038  
##  Mean   :26074     Mean   : 49739  
##  3rd Qu.:29250     3rd Qu.: 55751  
##  Max.   :69533     Max.   :129588  
## 
glimpse(county_clean)
## Rows: 3,135
## Columns: 14
## $ name              <fct> Autauga County, Baldwin County, Barbour County, Bibb…
## $ state             <fct> Alabama, Alabama, Alabama, Alabama, Alabama, Alabama…
## $ pop2000           <dbl> 43671, 140415, 29038, 20826, 51024, 11714, 21399, 11…
## $ pop2010           <dbl> 54571, 182265, 27457, 22915, 57322, 10914, 20947, 11…
## $ pop2017           <int> 55504, 212628, 25270, 22668, 58013, 10309, 19825, 11…
## $ pop_change        <dbl> 1.48, 9.19, -6.22, 0.73, 0.68, -2.28, -2.69, -1.51, …
## $ poverty           <dbl> 13.7, 11.8, 27.2, 15.2, 15.6, 28.5, 24.4, 18.6, 18.8…
## $ homeownership     <dbl> 77.5, 76.7, 68.0, 82.9, 82.0, 76.9, 69.0, 70.7, 71.4…
## $ multi_unit        <dbl> 7.2, 22.6, 11.1, 6.6, 3.7, 9.9, 13.7, 14.3, 8.7, 4.3…
## $ unemployment_rate <dbl> 3.86, 3.99, 5.90, 4.39, 4.02, 4.93, 5.49, 4.93, 4.08…
## $ metro             <fct> yes, yes, no, yes, yes, no, no, yes, no, no, yes, no…
## $ median_edu        <fct> some_college, some_college, hs_diploma, hs_diploma, …
## $ per_capita_income <dbl> 27841.70, 27779.85, 17891.73, 20572.05, 21367.39, 15…
## $ median_hh_income  <int> 55317, 52562, 33368, 43404, 47412, 29655, 36326, 436…
save(county_clean,file = "county_clean.Rdata")

Exercise 13

Calculate the total population in each state for 2017. Sort the results in ascending order. Use head() and tail() to examine the results.

Solution

state_pop = county_clean %>% 
  select(state,pop2017) %>% 
  group_by(state) %>% 
  summarize(pop = sum(pop2017)) %>% 
  arrange(pop)

head(state_pop)
## # A tibble: 6 × 2
##   state                   pop
##   <fct>                 <int>
## 1 Wyoming              579315
## 2 Vermont              623657
## 3 District of Columbia 693972
## 4 Alaska               725770
## 5 North Dakota         755393
## 6 South Dakota         855312
tail(state_pop)
## # A tibble: 6 × 2
##   state             pop
##   <fct>           <int>
## 1 Illinois     12802023
## 2 Pennsylvania 12805537
## 3 New York     19849399
## 4 Florida      20984400
## 5 Texas        28304596
## 6 California   39536653

Exercise 14

What happens if we do this with county instead of county_clean?

Solution

state_pop = county %>% 
  select(state,pop2017) %>% 
  group_by(state) %>% 
  summarize(pop = sum(pop2017)) %>% 
  arrange(pop)

head(state_pop)
## # A tibble: 6 × 2
##   state                   pop
##   <fct>                 <int>
## 1 Wyoming              579315
## 2 Vermont              623657
## 3 District of Columbia 693972
## 4 North Dakota         755393
## 5 South Dakota         869666
## 6 Delaware             961939
tail(state_pop)
## # A tibble: 6 × 2
##   state             pop
##   <fct>           <int>
## 1 Pennsylvania 12805537
## 2 New York     19849399
## 3 Florida      20984400
## 4 Texas        28304596
## 5 California   39536653
## 6 Alaska             NA

Exercise 15

Callculate State per capita income for 2017 and arrange in ascending order. Examine the head and the tail.

Hint: Begin by calculating total income for each county.

Solution

state_per_cap_income = county_clean %>% 
  mutate(inc_17 = pop2017 * per_capita_income) %>% 
  group_by(state) %>% 
  summarize(total_pop = sum(pop2017),
            total_inc = sum(inc_17)) %>% 
  ungroup() %>% 
  mutate(state_per_cap = total_inc/total_pop) %>% 
  select(state,state_per_cap) %>% 
  arrange(state_per_cap)

head(state_per_cap_income)
## # A tibble: 6 × 2
##   state         state_per_cap
##   <fct>                 <dbl>
## 1 Mississippi          22305.
## 2 Arkansas             24335.
## 3 New Mexico           24375.
## 4 West Virginia        24623.
## 5 Idaho                25245.
## 6 Alabama              25344.
tail(state_per_cap_income)
## # A tibble: 6 × 2
##   state                state_per_cap
##   <fct>                        <dbl>
## 1 New Hampshire               37343.
## 2 New Jersey                  38598.
## 3 Maryland                    38662.
## 4 Massachusetts               40252.
## 5 Connecticut                 41442.
## 6 District of Columbia        50917.

Save the result

Solution

save(state_per_cap_income,file = "state_per_cap_income.Rdata")