# Loading the data first girly pop
# Read the childcare-costs data into a tibble.
# The full path is built here instead of calling setwd(), so the script does
# not change the session's working directory as a side effect.
data_dir <- path.expand("~/Desktop/my class stuff/Wednesday Class")
# readr is not attached anywhere in the visible script, so read_csv() is
# called through its namespace.
Wednesday_class <- readr::read_csv(file.path(data_dir, "childcare_costs.csv"))
## Rows: 34567 Columns: 61
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (61): county_fips_code, study_year, unr_16, funr_16, munr_16, unr_20to64...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Preview the first six rows to check that the file was read as expected.
head(Wednesday_class, n = 6L)
## # A tibble: 6 × 61
## county_fips_code study_year unr_16 funr_16 munr_16 unr_20to64 funr_20to64
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1001 2008 5.42 4.41 6.32 4.6 3.5
## 2 1001 2009 5.93 5.72 6.11 4.8 4.6
## 3 1001 2010 6.21 5.57 6.78 5.1 4.6
## 4 1001 2011 7.55 8.13 7.03 6.2 6.3
## 5 1001 2012 8.6 8.88 8.29 6.7 6.4
## 6 1001 2013 9.39 10.3 8.56 7.3 7.6
## # ℹ 54 more variables: munr_20to64 <dbl>, flfpr_20to64 <dbl>,
## # flfpr_20to64_under6 <dbl>, flfpr_20to64_6to17 <dbl>,
## # flfpr_20to64_under6_6to17 <dbl>, mlfpr_20to64 <dbl>, pr_f <dbl>,
## # pr_p <dbl>, mhi_2018 <dbl>, me_2018 <dbl>, fme_2018 <dbl>, mme_2018 <dbl>,
## # total_pop <dbl>, one_race <dbl>, one_race_w <dbl>, one_race_b <dbl>,
## # one_race_i <dbl>, one_race_a <dbl>, one_race_h <dbl>, one_race_other <dbl>,
## # two_races <dbl>, hispanic <dbl>, households <dbl>, …
# The variable I want to look at is flfpr_20to64_under6, which is the labor
# force participation rate of the female population aged 20–64 who have
# children under 6 years old. I felt like it would tie in perfectly with
# my ... (TODO: this sentence was left unfinished)
# Descriptive statistics (count, range, mean, standard error, variance, ...)
# for the participation rate of mothers with children under 6.
library(pastecs)
stat.desc(Wednesday_class[["flfpr_20to64_under6"]])
## nbr.val nbr.null nbr.na min max range
## 3.456700e+04 3.900000e+01 0.000000e+00 0.000000e+00 1.000000e+02 1.000000e+02
## sum median mean SE.mean CI.mean.0.95 var
## 2.378950e+06 6.960000e+01 6.882141e+01 6.324204e-02 1.239565e-01 1.382526e+02
## std.dev coef.var
## 1.175809e+01 1.708493e-01
# The summary above was produced with stat.desc() from the pastecs package.
# For my own sake: pastecs stands for "Package for Analysis of Space-Time
# Ecological Series".
library(dplyr)
# dplyr is part of the tidyverse; its verbs are used here to organize the data.
# Keep only the rows where the participation rate is not missing.
Wednesday_class_clean <- filter(
  Wednesday_class,
  !is.na(flfpr_20to64_under6)
)
library(ggplot2)
# Histogram of the participation rate for mothers with children under 6,
# using bins one percentage point wide.
# Each row of the data is one county in one study year (the same county FIPS
# code repeats across years), so the bars count county-year rows rather than
# distinct counties; the y-axis label reflects that.
ggplot(Wednesday_class_clean, aes(x = flfpr_20to64_under6)) +
  geom_histogram(binwidth = 1, color = "purple", fill = "lightblue") +
  labs(
    title = "Female Labor Force Participation (Mothers with Children Under 6)",
    x = "Labor Force Participation Rate (%)",
    y = "Number of County-Year Observations"
  )
# The histogram shows that most counties have female labor force participation
# rates between 60% and 80% for mothers with children under six. This suggests
# that many mothers with young children are working or looking for work. The
# graph is slightly left-skewed: the longer tail is on the low end, with fewer
# counties at very low participation rates.
# Add a square-root-transformed copy of the participation rate as a new column.
Wednesday_class_clean <- mutate(
  Wednesday_class_clean,
  sqrt_flfpr_under6 = sqrt(flfpr_20to64_under6)
)
# Histogram of the square-root-transformed participation rate (bins 0.5 wide).
# Rows are county-year observations (each county appears once per study year),
# so the y-axis is labelled as observations rather than counties.
# NOTE(review): the raw variable's mean (68.8) is below its median (69.6),
# which points to a longer lower tail; a square root compresses the upper
# tail, so confirm that this transform actually reduces the skew here.
ggplot(Wednesday_class_clean, aes(x = sqrt_flfpr_under6)) +
  geom_histogram(binwidth = 0.5, color = "pink", fill = "red") +
  labs(
    title = "Square Root Transformed Female Labor Force Participation",
    x = "√(Participation Rate)",
    y = "Number of County-Year Observations"
  )
#The square root transformation made the data look a tad more balanced and less skewed. Most counties now cluster between 7 and 9, showing a smoother, more normal-looking distribution of female labor force participation rates.
# Overall, I believe this analysis helped me visualize how female labor force
# participation among mothers with young children varies across U.S. counties.
# Note that the data cover multiple study years (the first rows shown run from
# 2008 to 2013), not only 2018, so each county appears once per year. The
# transformation helped normalize the data for future statistical testing or
# regression models related to childcare costs.
# Shout-out to my incredible classmate Karina, who shared and highlighted some
# notes from lectures 5 and 6 (which I missed because I was sick); that made
# completing this homework a lot easier!
# Note that the `echo = FALSE` parameter was added to the
# code chunk to prevent printing of the R code that generated the
# plot.