#install.packages("usdata")
#install.packages("openintro")Project 3
Project 3: Tidying the counties_complete dataset!
Packages etc
library(tidyverse)── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr 1.1.4 ✔ readr 2.1.5
✔ forcats 1.0.0 ✔ stringr 1.5.1
✔ ggplot2 3.4.4 ✔ tibble 3.2.1
✔ lubridate 1.9.3 ✔ tidyr 1.3.1
✔ purrr 1.0.2
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag() masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(openintro)Loading required package: airports
Loading required package: cherryblossom
Loading required package: usdata
library(usdata)names(county_complete) [1] "fips"
[2] "state"
[3] "name"
[4] "pop2000"
[5] "pop2010"
[6] "pop2011"
[7] "pop2012"
[8] "pop2013"
[9] "pop2014"
[10] "pop2015"
[11] "pop2016"
[12] "pop2017"
[13] "age_under_5_2010"
[14] "age_under_5_2017"
[15] "age_under_18_2010"
[16] "age_over_65_2010"
[17] "age_over_65_2017"
[18] "median_age_2017"
[19] "female_2010"
[20] "white_2010"
[21] "black_2010"
[22] "black_2017"
[23] "native_2010"
[24] "native_2017"
[25] "asian_2010"
[26] "asian_2017"
[27] "pac_isl_2010"
[28] "pac_isl_2017"
[29] "other_single_race_2017"
[30] "two_plus_races_2010"
[31] "two_plus_races_2017"
[32] "hispanic_2010"
[33] "hispanic_2017"
[34] "white_not_hispanic_2010"
[35] "white_not_hispanic_2017"
[36] "speak_english_only_2017"
[37] "no_move_in_one_plus_year_2010"
[38] "foreign_born_2010"
[39] "foreign_spoken_at_home_2010"
[40] "women_16_to_50_birth_rate_2017"
[41] "hs_grad_2010"
[42] "hs_grad_2016"
[43] "hs_grad_2017"
[44] "some_college_2016"
[45] "some_college_2017"
[46] "bachelors_2010"
[47] "bachelors_2016"
[48] "bachelors_2017"
[49] "veterans_2010"
[50] "veterans_2017"
[51] "mean_work_travel_2010"
[52] "mean_work_travel_2017"
[53] "broadband_2017"
[54] "computer_2017"
[55] "housing_units_2010"
[56] "homeownership_2010"
[57] "housing_multi_unit_2010"
[58] "median_val_owner_occupied_2010"
[59] "households_2010"
[60] "households_2017"
[61] "persons_per_household_2010"
[62] "persons_per_household_2017"
[63] "per_capita_income_2010"
[64] "per_capita_income_2017"
[65] "metro_2013"
[66] "median_household_income_2010"
[67] "median_household_income_2016"
[68] "median_household_income_2017"
[69] "private_nonfarm_establishments_2009"
[70] "private_nonfarm_employment_2009"
[71] "percent_change_private_nonfarm_employment_2009"
[72] "nonemployment_establishments_2009"
[73] "firms_2007"
[74] "black_owned_firms_2007"
[75] "native_owned_firms_2007"
[76] "asian_owned_firms_2007"
[77] "pac_isl_owned_firms_2007"
[78] "hispanic_owned_firms_2007"
[79] "women_owned_firms_2007"
[80] "manufacturer_shipments_2007"
[81] "mercent_whole_sales_2007"
[82] "sales_2007"
[83] "sales_per_capita_2007"
[84] "accommodation_food_service_2007"
[85] "building_permits_2010"
[86] "fed_spending_2009"
[87] "area_2010"
[88] "density_2010"
[89] "smoking_ban_2010"
[90] "poverty_2010"
[91] "poverty_2016"
[92] "poverty_2017"
[93] "poverty_age_under_5_2017"
[94] "poverty_age_under_18_2017"
[95] "civilian_labor_force_2007"
[96] "employed_2007"
[97] "unemployed_2007"
[98] "unemployment_rate_2007"
[99] "civilian_labor_force_2008"
[100] "employed_2008"
[101] "unemployed_2008"
[102] "unemployment_rate_2008"
[103] "civilian_labor_force_2009"
[104] "employed_2009"
[105] "unemployed_2009"
[106] "unemployment_rate_2009"
[107] "civilian_labor_force_2010"
[108] "employed_2010"
[109] "unemployed_2010"
[110] "unemployment_rate_2010"
[111] "civilian_labor_force_2011"
[112] "employed_2011"
[113] "unemployed_2011"
[114] "unemployment_rate_2011"
[115] "civilian_labor_force_2012"
[116] "employed_2012"
[117] "unemployed_2012"
[118] "unemployment_rate_2012"
[119] "civilian_labor_force_2013"
[120] "employed_2013"
[121] "unemployed_2013"
[122] "unemployment_rate_2013"
[123] "civilian_labor_force_2014"
[124] "employed_2014"
[125] "unemployed_2014"
[126] "unemployment_rate_2014"
[127] "civilian_labor_force_2015"
[128] "employed_2015"
[129] "unemployed_2015"
[130] "unemployment_rate_2015"
[131] "civilian_labor_force_2016"
[132] "employed_2016"
[133] "unemployed_2016"
[134] "unemployment_rate_2016"
[135] "uninsured_2017"
[136] "uninsured_age_under_6_2017"
[137] "uninsured_age_under_19_2017"
[138] "uninsured_age_over_74_2017"
[139] "civilian_labor_force_2017"
[140] "employed_2017"
[141] "unemployed_2017"
[142] "unemployment_rate_2017"
[143] "age_over_18_2019"
[144] "age_over_65_2019"
[145] "age_over_85_2019"
[146] "age_under_5_2019"
[147] "asian_2019"
[148] "avg_family_size_2019"
[149] "bachelors_2019"
[150] "black_2019"
[151] "hispanic_2019"
[152] "household_has_broadband_2019"
[153] "household_has_computer_2019"
[154] "household_has_smartphone_2019"
[155] "households_2019"
[156] "households_speak_asian_or_pac_isl_2019"
[157] "households_speak_limited_english_2019"
[158] "households_speak_other_2019"
[159] "households_speak_other_indo_euro_lang_2019"
[160] "households_speak_spanish_2019"
[161] "housing_mobile_homes_2019"
[162] "housing_one_unit_structures_2019"
[163] "housing_two_unit_structures_2019"
[164] "hs_grad_2019"
[165] "mean_household_income_2019"
[166] "mean_work_travel_2019"
[167] "median_age_2019"
[168] "median_household_income_2019"
[169] "median_individual_income_2019"
[170] "median_individual_income_age_25plus_2019"
[171] "native_2019"
[172] "other_single_race_2019"
[173] "pac_isl_2019"
[174] "per_capita_income_2019"
[175] "persons_per_household_2019"
[176] "pop_2019"
[177] "poverty_2019"
[178] "poverty_65_and_over_2019"
[179] "poverty_under_18_2019"
[180] "two_plus_races_2019"
[181] "unemployment_rate_2019"
[182] "uninsured_2019"
[183] "uninsured_65_and_older_2019"
[184] "uninsured_under_19_2019"
[185] "uninsured_under_6_2019"
[186] "veterans_2019"
[187] "white_2019"
[188] "white_not_hispanic_2019"
Pre-pivoting cleaning
#county_short <- subset(county_complete, select = -c(per_capita_income, persons_per_household, median_age, sales_per_capita, unemployment_rate))names(county_complete)[3] = "county"
names(county_complete)[4] = "pop_2000"
names(county_complete)[5] = "pop_2010"
names(county_complete)[6] = "pop_2011"
names(county_complete)[7] = "pop_2012"
names(county_complete)[8] = "pop_2013"
names(county_complete)[9] = "pop_2014"
names(county_complete)[10] = "pop_2015"
names(county_complete)[11] = "pop_2016"
names(county_complete)[12] = "pop_2017"
names(county_complete)[136] = "uninsured_under_6_2017_pain_in_ass"
names(county_complete)[19] = "female_percent_2010"I want to mutate a new gender column called “male_2010” (this may be unnecessary, who needs them anyway)
county_complete <- county_complete |>
mutate(male_percent_2010 = (100 - female_percent_2010))Chaninging an inexplicable character column into numeric
county_complete <- county_complete |>
mutate(uninsured_age_under_6_2017 = as.numeric(uninsured_under_6_2017_pain_in_ass)) |>
select(-uninsured_under_6_2017_pain_in_ass)Warning: There was 1 warning in `mutate()`.
ℹ In argument: `uninsured_age_under_6_2017 =
as.numeric(uninsured_under_6_2017_pain_in_ass)`.
Caused by warning:
! NAs introduced by coercion
unique(county_complete$uninsured_under_6_2017_pain_in_ass)NULL
tidy_year <- county_complete |>
pivot_longer(
cols = -c(fips, state, county),
names_to = c(".value", "Year"),
names_pattern = "(.+)_(\\d{4})",
values_drop_na = TRUE)
names(tidy_year)[5] = "population" So year just became a character, that sucks, so will now fix that.
tidy_year <- tidy_year |>
mutate(year = as.numeric(Year)) |>
select(-Year) |>
relocate(year, .before = population)tidy_year <- tidy_year |>
pivot_longer(
cols = c(starts_with("other_single_race"), black, white, hispanic, native, asian, pac_isl, starts_with("two_plus_races"), starts_with("white_not_his"), starts_with("hs_grad"), starts_with("some_college"), starts_with("bachelors"), starts_with("age"), starts_with("broadband"), starts_with("computer"), starts_with("poverty"), starts_with("uninsured"), starts_with("households_speak"), starts_with("poverty"), starts_with("uninsured"), starts_with("no_move"), starts_with("uninsured"), starts_with("speak_english"), starts_with("foreign"), starts_with("male"), starts_with("female")),
names_to = "demographic",
values_to = "percent_of_population",
values_drop_na = TRUE) |>
relocate(demographic, .after = population) |>
relocate(percent_of_population, .after = demographic)head(tidy_year)# A tibble: 6 × 55
fips state county year population demographic percent_of_population
<dbl> <chr> <chr> <dbl> <dbl> <chr> <dbl>
1 1001 Alabama Autauga Coun… 2010 54571 black 17.7
2 1001 Alabama Autauga Coun… 2010 54571 white 78.5
3 1001 Alabama Autauga Coun… 2010 54571 hispanic 2.4
4 1001 Alabama Autauga Coun… 2010 54571 native 0.4
5 1001 Alabama Autauga Coun… 2010 54571 asian 0.9
6 1001 Alabama Autauga Coun… 2010 54571 two_plus_r… 1.6
# ℹ 48 more variables: median_age <dbl>, women_16_to_50_birth_rate <dbl>,
# veterans <dbl>, mean_work_travel <dbl>, housing_units <dbl>,
# homeownership <dbl>, housing_multi_unit <dbl>,
# median_val_owner_occupied <dbl>, households <dbl>,
# persons_per_household <dbl>, per_capita_income <dbl>, metro <dbl>,
# median_household_income <dbl>, private_nonfarm_establishments <dbl>,
# private_nonfarm_employment <dbl>, …
tidy_year <- tidy_year |>
mutate(demographic = case_when(
demographic == "hs_grad" ~ "high school",
demographic == "pac_isl" ~ "pacific islander",
demographic == "male_percent " ~ "men",
demographic == "female_percent" ~ "women",
TRUE ~ demographic
))Final outcome:
After much gnashing of teeth, here is what I have come up with. I do not know if it is according to the ideal, but the number of variables have been reduced, names have been cleaned up, things that aught not to be characters have been made into integers, and there are new columns that summarize the data more succinctly. I did perhaps go a little crazy with the column “demographic, but I found it gross that there would be so many percentages just floating about.
This exercise killed my memory so many times.
less_a_mess <- tidy_year |>
mutate(demographic = gsub("_", " ", demographic))head(less_a_mess)# A tibble: 6 × 55
fips state county year population demographic percent_of_population
<dbl> <chr> <chr> <dbl> <dbl> <chr> <dbl>
1 1001 Alabama Autauga Coun… 2010 54571 black 17.7
2 1001 Alabama Autauga Coun… 2010 54571 white 78.5
3 1001 Alabama Autauga Coun… 2010 54571 hispanic 2.4
4 1001 Alabama Autauga Coun… 2010 54571 native 0.4
5 1001 Alabama Autauga Coun… 2010 54571 asian 0.9
6 1001 Alabama Autauga Coun… 2010 54571 two plus r… 1.6
# ℹ 48 more variables: median_age <dbl>, women_16_to_50_birth_rate <dbl>,
# veterans <dbl>, mean_work_travel <dbl>, housing_units <dbl>,
# homeownership <dbl>, housing_multi_unit <dbl>,
# median_val_owner_occupied <dbl>, households <dbl>,
# persons_per_household <dbl>, per_capita_income <dbl>, metro <dbl>,
# median_household_income <dbl>, private_nonfarm_establishments <dbl>,
# private_nonfarm_employment <dbl>, …