Cleaning 1.0:
# Point the working directory at the project folder so the relative CSV path
# used by read_csv() below resolves.
# NOTE(review): this absolute, user-specific path only exists on the author's
# machine; consider running the script from the project directory instead of
# calling setwd() — TODO confirm before removing.
setwd("/Users/ronaldohernandez/Desktop/Data 110 folder/Final Project")
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# Read the county dataset (includes the smoking_ban column) from the current
# working directory
meow <- readr::read_csv(file = "county_w_sm_banfinaldataset.csv")
## Rows: 5999 Columns: 54
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (3): name, state, smoking_ban
## dbl (51): fips, pop2010, pop2000, age_under_5, age_under_18, age_over_65, fe...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Print every column's name, type, and first few values
dplyr::glimpse(x = meow)
## Rows: 5,999
## Columns: 54
## $ name <chr> "Abbeville County", "Acadia …
## $ state <chr> "South Carolina", "Louisiana…
## $ fips <dbl> 45001, 22001, 51001, 16001, …
## $ pop2010 <dbl> 25417, 61773, 33164, 392365,…
## $ pop2000 <dbl> 26167, 58861, 38305, 300904,…
## $ age_under_5 <dbl> 6.0, 7.6, 5.9, 7.2, 5.6, 5.6…
## $ age_under_18 <dbl> 22.8, 27.3, 20.9, 26.4, 19.4…
## $ age_over_65 <dbl> 16.5, 12.8, 19.1, 10.5, 12.5…
## $ female <dbl> 51.5, 51.2, 51.3, 49.9, 52.3…
## $ white <dbl> 69.6, 79.5, 65.3, 90.3, 94.0…
## $ black <dbl> 28.3, 18.1, 28.1, 1.1, 1.6, …
## $ native <dbl> 0.2, 0.3, 0.4, 0.7, 0.3, 0.3…
## $ asian <dbl> 0.3, 0.2, 0.6, 2.4, 1.8, 1.8…
## $ pac_isl <dbl> NA, NA, 0.1, 0.2, 0.1, 0.1, …
## $ two_plus_races <dbl> 1.1, 1.3, 1.6, 2.8, 1.7, 1.7…
## $ hispanic <dbl> 1.0, 1.7, 8.6, 7.1, 2.0, 2.0…
## $ white_not_hispanic <dbl> 69.1, 78.6, 61.1, 86.5, 92.8…
## $ no_move_in_one_plus_year <dbl> 88.9, 87.9, 91.5, 79.2, 72.9…
## $ foreign_born <dbl> 1.4, 0.7, 6.0, 5.8, 2.5, 2.5…
## $ foreign_spoken_at_home <dbl> 2.9, 16.4, 8.7, 9.0, 3.9, 3.…
## $ hs_grad <dbl> 76.8, 69.5, 78.9, 92.9, 88.1…
## $ bachelors <dbl> 15.3, 10.9, 18.0, 35.0, 25.5…
## $ veterans <dbl> 2151, 3495, 3378, 32352, 172…
## $ mean_work_travel <dbl> 25.1, 28.9, 20.0, 19.3, 13.9…
## $ housing_units <dbl> 12079, 25387, 21002, 159471,…
## $ home_ownership <dbl> 77.4, 69.8, 74.1, 69.6, 61.1…
## $ housing_multi_unit <dbl> 7.7, 7.1, 5.6, 18.0, 24.3, 2…
## $ median_val_owner_occupied <dbl> 85900, 86700, 149800, 214500…
## $ households <dbl> 9875, 21984, 14085, 145584, …
## $ persons_per_household <dbl> 2.49, 2.75, 2.39, 2.55, 2.33…
## $ per_capita_income <dbl> 16653, 18116, 22766, 27915, …
## $ median_household_income <dbl> 33143, 37261, 41372, 55835, …
## $ poverty <dbl> 20.7, 20.1, 15.6, 10.2, 26.0…
## $ private_nonfarm_establishments <dbl> 356, 1114, 818, 12394, 700, …
## $ private_nonfarm_employment <dbl> 4713, 12485, 9424, 166239, 8…
## $ percent_change_private_nonfarm_employment <dbl> -29.4, 4.9, 8.6, 8.5, -5.4, …
## $ nonemployment_establishments <dbl> 1494, 3681, 2267, 29533, 146…
## $ firms <dbl> 1385, 4289, 2944, 42344, 206…
## $ black_owned_firms <dbl> 19.1, NA, 6.0, 0.4, NA, NA, …
## $ native_owned_firms <dbl> NA, NA, NA, 1.0, NA, NA, NA,…
## $ asian_owned_firms <dbl> NA, NA, NA, 1.3, 1.4, 1.4, 1…
## $ pac_isl_owned_firms <dbl> NA, NA, NA, NA, NA, NA, NA, …
## $ hispanic_owned_firms <dbl> NA, 1.4, NA, 2.1, NA, NA, NA…
## $ women_owned_firms <dbl> 33.4, 25.4, 23.1, 25.4, 28.2…
## $ manufacturer_shipments_2007 <dbl> 657498, NA, 526157, 4942388,…
## $ mercent_whole_sales_2007 <dbl> NA, NA, 59400, 6006918, 5474…
## $ sales <dbl> 71936, 525956, 298001, 58551…
## $ sales_per_capita <dbl> 2841, 8808, 7749, 15720, 125…
## $ accommodation_food_service <dbl> 10963, 40790, 48144, 795953,…
## $ building_permits <dbl> 19, 108, 89, 1285, 17, 17, 1…
## $ fed_spending <dbl> 169972, 459879, 449275, 3122…
## $ area <dbl> 490.48, 655.12, 449.50, 1052…
## $ density <dbl> 51.8, 94.3, 73.8, 372.8, 45.…
## $ smoking_ban <chr> "none", "partial", "none", "…
# Keep only the columns used in the analysis: county identifiers, smoking ban
# status, sales, density/population, age, income/poverty, education, and
# race/ethnicity
meow2.0 <- meow %>%
  select(
    name, state, smoking_ban,
    sales_per_capita, density, pop2010,
    age_under_18, age_over_65,
    median_household_income, poverty,
    bachelors, white, black, native, asian, hispanic,
    hs_grad
  )
# Normalize column names: lowercase, with spaces replaced by underscores
names(meow2.0) <- gsub(" ", "_", tolower(names(meow2.0)))
# Count missing cells across the selected columns
sum(is.na(meow2.0))
## [1] 186
# Drop every row that has a missing value in any of the selected columns
meow3.0 <- meow2.0 %>%
  drop_na()
# Confirm that no missing values remain
sum(is.na(meow3.0))
## [1] 0
# Rows and columns left after dropping incomplete rows
dim(meow3.0)
## [1] 5816 17
Initial exploration of the data:
How does sales_per_capita vary across smoking_ban types? Is there a
connection between smoking_ban and median_household_income or poverty
rates?
# Average the key economic, demographic, and racial metrics within each
# (state, name, smoking_ban) combination, giving one row per combination.
# Reference: https://dplyr.tidyverse.org/reference/summarise.html
#
# across() applies the same mean to every listed column, so each output column
# is always computed from its own input column. (The earlier hand-written list
# assigned `bachelors` twice, the second time from hs_grad, which silently
# replaced the bachelors mean with the hs_grad mean.)
meow3.0mean <- meow3.0 %>%
  group_by(state, name, smoking_ban) %>%
  summarize(
    across(
      c(
        sales_per_capita, density, pop2010,
        age_under_18, age_over_65,
        median_household_income, poverty,
        bachelors, white, black, native, asian, hispanic,
        hs_grad
      ),
      function(x) mean(x, na.rm = TRUE)
    )
  ) %>%
  # summarize() drops only the last grouping level, leaving the result grouped
  # by state and name; remove that leftover grouping
  ungroup()
## `summarise()` has grouped output by 'state', 'name'. You can override using the
## `.groups` argument.