Project 3

Project 3: Tidying the counties_complete dataset!

Packages etc

#install.packages("usdata")
#install.packages("openintro")

library(tidyverse)

── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.4     ✔ readr     2.1.5
✔ forcats   1.0.0     ✔ stringr   1.5.1
✔ ggplot2   3.4.4     ✔ tibble    3.2.1
✔ lubridate 1.9.3     ✔ tidyr     1.3.1
✔ purrr     1.0.2     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(openintro)

Loading required package: airports
Loading required package: cherryblossom
Loading required package: usdata

library(usdata)

names(county_complete)

  [1] "fips"                                          
  [2] "state"                                         
  [3] "name"                                          
  [4] "pop2000"                                       
  [5] "pop2010"                                       
  [6] "pop2011"                                       
  [7] "pop2012"                                       
  [8] "pop2013"                                       
  [9] "pop2014"                                       
 [10] "pop2015"                                       
 [11] "pop2016"                                       
 [12] "pop2017"                                       
 [13] "age_under_5_2010"                              
 [14] "age_under_5_2017"                              
 [15] "age_under_18_2010"                             
 [16] "age_over_65_2010"                              
 [17] "age_over_65_2017"                              
 [18] "median_age_2017"                               
 [19] "female_2010"                                   
 [20] "white_2010"                                    
 [21] "black_2010"                                    
 [22] "black_2017"                                    
 [23] "native_2010"                                   
 [24] "native_2017"                                   
 [25] "asian_2010"                                    
 [26] "asian_2017"                                    
 [27] "pac_isl_2010"                                  
 [28] "pac_isl_2017"                                  
 [29] "other_single_race_2017"                        
 [30] "two_plus_races_2010"                           
 [31] "two_plus_races_2017"                           
 [32] "hispanic_2010"                                 
 [33] "hispanic_2017"                                 
 [34] "white_not_hispanic_2010"                       
 [35] "white_not_hispanic_2017"                       
 [36] "speak_english_only_2017"                       
 [37] "no_move_in_one_plus_year_2010"                 
 [38] "foreign_born_2010"                             
 [39] "foreign_spoken_at_home_2010"                   
 [40] "women_16_to_50_birth_rate_2017"                
 [41] "hs_grad_2010"                                  
 [42] "hs_grad_2016"                                  
 [43] "hs_grad_2017"                                  
 [44] "some_college_2016"                             
 [45] "some_college_2017"                             
 [46] "bachelors_2010"                                
 [47] "bachelors_2016"                                
 [48] "bachelors_2017"                                
 [49] "veterans_2010"                                 
 [50] "veterans_2017"                                 
 [51] "mean_work_travel_2010"                         
 [52] "mean_work_travel_2017"                         
 [53] "broadband_2017"                                
 [54] "computer_2017"                                 
 [55] "housing_units_2010"                            
 [56] "homeownership_2010"                            
 [57] "housing_multi_unit_2010"                       
 [58] "median_val_owner_occupied_2010"                
 [59] "households_2010"                               
 [60] "households_2017"                               
 [61] "persons_per_household_2010"                    
 [62] "persons_per_household_2017"                    
 [63] "per_capita_income_2010"                        
 [64] "per_capita_income_2017"                        
 [65] "metro_2013"                                    
 [66] "median_household_income_2010"                  
 [67] "median_household_income_2016"                  
 [68] "median_household_income_2017"                  
 [69] "private_nonfarm_establishments_2009"           
 [70] "private_nonfarm_employment_2009"               
 [71] "percent_change_private_nonfarm_employment_2009"
 [72] "nonemployment_establishments_2009"             
 [73] "firms_2007"                                    
 [74] "black_owned_firms_2007"                        
 [75] "native_owned_firms_2007"                       
 [76] "asian_owned_firms_2007"                        
 [77] "pac_isl_owned_firms_2007"                      
 [78] "hispanic_owned_firms_2007"                     
 [79] "women_owned_firms_2007"                        
 [80] "manufacturer_shipments_2007"                   
 [81] "mercent_whole_sales_2007"                      
 [82] "sales_2007"                                    
 [83] "sales_per_capita_2007"                         
 [84] "accommodation_food_service_2007"               
 [85] "building_permits_2010"                         
 [86] "fed_spending_2009"                             
 [87] "area_2010"                                     
 [88] "density_2010"                                  
 [89] "smoking_ban_2010"                              
 [90] "poverty_2010"                                  
 [91] "poverty_2016"                                  
 [92] "poverty_2017"                                  
 [93] "poverty_age_under_5_2017"                      
 [94] "poverty_age_under_18_2017"                     
 [95] "civilian_labor_force_2007"                     
 [96] "employed_2007"                                 
 [97] "unemployed_2007"                               
 [98] "unemployment_rate_2007"                        
 [99] "civilian_labor_force_2008"                     
[100] "employed_2008"                                 
[101] "unemployed_2008"                               
[102] "unemployment_rate_2008"                        
[103] "civilian_labor_force_2009"                     
[104] "employed_2009"                                 
[105] "unemployed_2009"                               
[106] "unemployment_rate_2009"                        
[107] "civilian_labor_force_2010"                     
[108] "employed_2010"                                 
[109] "unemployed_2010"                               
[110] "unemployment_rate_2010"                        
[111] "civilian_labor_force_2011"                     
[112] "employed_2011"                                 
[113] "unemployed_2011"                               
[114] "unemployment_rate_2011"                        
[115] "civilian_labor_force_2012"                     
[116] "employed_2012"                                 
[117] "unemployed_2012"                               
[118] "unemployment_rate_2012"                        
[119] "civilian_labor_force_2013"                     
[120] "employed_2013"                                 
[121] "unemployed_2013"                               
[122] "unemployment_rate_2013"                        
[123] "civilian_labor_force_2014"                     
[124] "employed_2014"                                 
[125] "unemployed_2014"                               
[126] "unemployment_rate_2014"                        
[127] "civilian_labor_force_2015"                     
[128] "employed_2015"                                 
[129] "unemployed_2015"                               
[130] "unemployment_rate_2015"                        
[131] "civilian_labor_force_2016"                     
[132] "employed_2016"                                 
[133] "unemployed_2016"                               
[134] "unemployment_rate_2016"                        
[135] "uninsured_2017"                                
[136] "uninsured_age_under_6_2017"                    
[137] "uninsured_age_under_19_2017"                   
[138] "uninsured_age_over_74_2017"                    
[139] "civilian_labor_force_2017"                     
[140] "employed_2017"                                 
[141] "unemployed_2017"                               
[142] "unemployment_rate_2017"                        
[143] "age_over_18_2019"                              
[144] "age_over_65_2019"                              
[145] "age_over_85_2019"                              
[146] "age_under_5_2019"                              
[147] "asian_2019"                                    
[148] "avg_family_size_2019"                          
[149] "bachelors_2019"                                
[150] "black_2019"                                    
[151] "hispanic_2019"                                 
[152] "household_has_broadband_2019"                  
[153] "household_has_computer_2019"                   
[154] "household_has_smartphone_2019"                 
[155] "households_2019"                               
[156] "households_speak_asian_or_pac_isl_2019"        
[157] "households_speak_limited_english_2019"         
[158] "households_speak_other_2019"                   
[159] "households_speak_other_indo_euro_lang_2019"    
[160] "households_speak_spanish_2019"                 
[161] "housing_mobile_homes_2019"                     
[162] "housing_one_unit_structures_2019"              
[163] "housing_two_unit_structures_2019"              
[164] "hs_grad_2019"                                  
[165] "mean_household_income_2019"                    
[166] "mean_work_travel_2019"                         
[167] "median_age_2019"                               
[168] "median_household_income_2019"                  
[169] "median_individual_income_2019"                 
[170] "median_individual_income_age_25plus_2019"      
[171] "native_2019"                                   
[172] "other_single_race_2019"                        
[173] "pac_isl_2019"                                  
[174] "per_capita_income_2019"                        
[175] "persons_per_household_2019"                    
[176] "pop_2019"                                      
[177] "poverty_2019"                                  
[178] "poverty_65_and_over_2019"                      
[179] "poverty_under_18_2019"                         
[180] "two_plus_races_2019"                           
[181] "unemployment_rate_2019"                        
[182] "uninsured_2019"                                
[183] "uninsured_65_and_older_2019"                   
[184] "uninsured_under_19_2019"                       
[185] "uninsured_under_6_2019"                        
[186] "veterans_2019"                                 
[187] "white_2019"                                    
[188] "white_not_hispanic_2019"

Pre-pivoting cleaning

#county_short <- subset(county_complete, select = -c(per_capita_income, persons_per_household, median_age, sales_per_capita, unemployment_rate))

names(county_complete)[3] = "county"
names(county_complete)[4] = "pop_2000" 
names(county_complete)[5] = "pop_2010" 
names(county_complete)[6] = "pop_2011" 
names(county_complete)[7] = "pop_2012"
names(county_complete)[8] = "pop_2013"
names(county_complete)[9] = "pop_2014"
names(county_complete)[10] = "pop_2015"
names(county_complete)[11] = "pop_2016"
names(county_complete)[12] = "pop_2017"
names(county_complete)[136] = "uninsured_under_6_2017_pain_in_ass"
names(county_complete)[19] = "female_percent_2010"

I want to mutate a new gender column called “male_2010” (this may be unnecessary, who needs them anyway)

county_complete <- county_complete |> 
  mutate(male_percent_2010 = (100 - female_percent_2010))

Chaninging an inexplicable character column into numeric

county_complete <- county_complete |> 
  mutate(uninsured_age_under_6_2017 = as.numeric(uninsured_under_6_2017_pain_in_ass)) |> 
  select(-uninsured_under_6_2017_pain_in_ass)

Warning: There was 1 warning in `mutate()`.
ℹ In argument: `uninsured_age_under_6_2017 =
  as.numeric(uninsured_under_6_2017_pain_in_ass)`.
Caused by warning:
! NAs introduced by coercion

unique(county_complete$uninsured_under_6_2017_pain_in_ass)

NULL

tidy_year <- county_complete |> 
  pivot_longer(
  cols = -c(fips, state, county),
  names_to = c(".value", "Year"), 
  names_pattern = "(.+)_(\\d{4})", 
  values_drop_na = TRUE)

names(tidy_year)[5] = "population"

So year just became a character, that sucks, so will now fix that.

tidy_year <- tidy_year |> 
  mutate(year = as.numeric(Year)) |> 
  select(-Year) |>  
  relocate(year, .before = population)

tidy_year <- tidy_year |> 
  pivot_longer(
    cols = c(starts_with("other_single_race"), black, white, hispanic, native, asian, pac_isl, starts_with("two_plus_races"), starts_with("white_not_his"), starts_with("hs_grad"), starts_with("some_college"), starts_with("bachelors"), starts_with("age"), starts_with("broadband"), starts_with("computer"), starts_with("poverty"), starts_with("uninsured"), starts_with("households_speak"), starts_with("poverty"), starts_with("uninsured"), starts_with("no_move"), starts_with("uninsured"), starts_with("speak_english"), starts_with("foreign"), starts_with("male"), starts_with("female")),
  names_to = "demographic", 
  values_to = "percent_of_population",
  values_drop_na = TRUE) |> 
  relocate(demographic, .after = population) |> 
  relocate(percent_of_population, .after = demographic)

head(tidy_year)

# A tibble: 6 × 55
   fips state   county         year population demographic percent_of_population
  <dbl> <chr>   <chr>         <dbl>      <dbl> <chr>                       <dbl>
1  1001 Alabama Autauga Coun…  2010      54571 black                        17.7
2  1001 Alabama Autauga Coun…  2010      54571 white                        78.5
3  1001 Alabama Autauga Coun…  2010      54571 hispanic                      2.4
4  1001 Alabama Autauga Coun…  2010      54571 native                        0.4
5  1001 Alabama Autauga Coun…  2010      54571 asian                         0.9
6  1001 Alabama Autauga Coun…  2010      54571 two_plus_r…                   1.6
# ℹ 48 more variables: median_age <dbl>, women_16_to_50_birth_rate <dbl>,
#   veterans <dbl>, mean_work_travel <dbl>, housing_units <dbl>,
#   homeownership <dbl>, housing_multi_unit <dbl>,
#   median_val_owner_occupied <dbl>, households <dbl>,
#   persons_per_household <dbl>, per_capita_income <dbl>, metro <dbl>,
#   median_household_income <dbl>, private_nonfarm_establishments <dbl>,
#   private_nonfarm_employment <dbl>, …

tidy_year <- tidy_year |> 
  mutate(demographic = case_when(
    demographic == "hs_grad" ~ "high school",
    demographic == "pac_isl" ~ "pacific islander", 
    demographic == "male_percent " ~ "men",
    demographic == "female_percent" ~ "women",
    TRUE ~ demographic
  ))

Final outcome:

After much gnashing of teeth, here is what I have come up with. I do not know if it is according to the ideal, but the number of variables have been reduced, names have been cleaned up, things that aught not to be characters have been made into integers, and there are new columns that summarize the data more succinctly. I did perhaps go a little crazy with the column “demographic, but I found it gross that there would be so many percentages just floating about.

This exercise killed my memory so many times.

less_a_mess <- tidy_year |> 
  mutate(demographic = gsub("_", " ", demographic))

head(less_a_mess)

# A tibble: 6 × 55
   fips state   county         year population demographic percent_of_population
  <dbl> <chr>   <chr>         <dbl>      <dbl> <chr>                       <dbl>
1  1001 Alabama Autauga Coun…  2010      54571 black                        17.7
2  1001 Alabama Autauga Coun…  2010      54571 white                        78.5
3  1001 Alabama Autauga Coun…  2010      54571 hispanic                      2.4
4  1001 Alabama Autauga Coun…  2010      54571 native                        0.4
5  1001 Alabama Autauga Coun…  2010      54571 asian                         0.9
6  1001 Alabama Autauga Coun…  2010      54571 two plus r…                   1.6
# ℹ 48 more variables: median_age <dbl>, women_16_to_50_birth_rate <dbl>,
#   veterans <dbl>, mean_work_travel <dbl>, housing_units <dbl>,
#   homeownership <dbl>, housing_multi_unit <dbl>,
#   median_val_owner_occupied <dbl>, households <dbl>,
#   persons_per_household <dbl>, per_capita_income <dbl>, metro <dbl>,
#   median_household_income <dbl>, private_nonfarm_establishments <dbl>,
#   private_nonfarm_employment <dbl>, …