# Loading the data first girly pop
# Load the childcare cost data.
# The file is read through an explicit path instead of calling setwd(), so
# the script no longer changes the session's working directory as a side
# effect. read_csv() is namespaced because no library(readr) or
# library(tidyverse) call is visible in this script.
data_dir <- "~/Desktop/my class stuff/Wednesday Class"
Wednesday_class <- readr::read_csv(file.path(data_dir, "childcare_costs.csv"))
## Rows: 34567 Columns: 61
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (61): county_fips_code, study_year, unr_16, funr_16, munr_16, unr_20to64...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Preview the first six rows to confirm the file was parsed as expected.
head(Wednesday_class, n = 6L)
## # A tibble: 6 × 61
##   county_fips_code study_year unr_16 funr_16 munr_16 unr_20to64 funr_20to64
##              <dbl>      <dbl>  <dbl>   <dbl>   <dbl>      <dbl>       <dbl>
## 1             1001       2008   5.42    4.41    6.32        4.6         3.5
## 2             1001       2009   5.93    5.72    6.11        4.8         4.6
## 3             1001       2010   6.21    5.57    6.78        5.1         4.6
## 4             1001       2011   7.55    8.13    7.03        6.2         6.3
## 5             1001       2012   8.6     8.88    8.29        6.7         6.4
## 6             1001       2013   9.39   10.3     8.56        7.3         7.6
## # ℹ 54 more variables: munr_20to64 <dbl>, flfpr_20to64 <dbl>,
## #   flfpr_20to64_under6 <dbl>, flfpr_20to64_6to17 <dbl>,
## #   flfpr_20to64_under6_6to17 <dbl>, mlfpr_20to64 <dbl>, pr_f <dbl>,
## #   pr_p <dbl>, mhi_2018 <dbl>, me_2018 <dbl>, fme_2018 <dbl>, mme_2018 <dbl>,
## #   total_pop <dbl>, one_race <dbl>, one_race_w <dbl>, one_race_b <dbl>,
## #   one_race_i <dbl>, one_race_a <dbl>, one_race_h <dbl>, one_race_other <dbl>,
## #   two_races <dbl>, hispanic <dbl>, households <dbl>, …
# The variable I want to look at is flfpr_20to64_under6, which is the labor
# force participation rate of the female population aged 20–64 who have
# children under 6 years old. I felt like it would tie in perfectly with my
# [TODO: this sentence was left unfinished].
library(pastecs)
# Descriptive statistics (count, range, mean, spread) for the rate column.
participation_rate <- Wednesday_class[["flfpr_20to64_under6"]]
stat.desc(participation_rate)
##      nbr.val     nbr.null       nbr.na          min          max        range 
## 3.456700e+04 3.900000e+01 0.000000e+00 0.000000e+00 1.000000e+02 1.000000e+02 
##          sum       median         mean      SE.mean CI.mean.0.95          var 
## 2.378950e+06 6.960000e+01 6.882141e+01 6.324204e-02 1.239565e-01 1.382526e+02 
##      std.dev     coef.var 
## 1.175809e+01 1.708493e-01
# Summarize variable with pastecs
# For my own sake: pastecs stands for "Package for Analysis of Space-Time Ecological Series".
library(dplyr)
# dplyr comes from the tidyverse, and it's used to organize data!
# Keep only the rows where the participation rate is not missing.
Wednesday_class_clean <- dplyr::filter(
  Wednesday_class,
  !is.na(flfpr_20to64_under6)
)
library(ggplot2)

# Histogram of the raw participation rate, one bar per percentage point.
# Each row of the data is one county in one study year (the same
# county_fips_code appears under several study_year values), so the bar
# heights count county-year observations rather than distinct counties.
ggplot(Wednesday_class_clean, aes(x = flfpr_20to64_under6)) +
  geom_histogram(binwidth = 1, color = "purple", fill = "lightblue") +
  labs(
    title = "Female Labor Force Participation (Mothers with Children Under 6)",
    x = "Labor Force Participation Rate (%)",
    y = "Number of County-Year Observations"
  )

# The histogram shows that most observations have female labor force
# participation rates between 60% and 80% for mothers with children under six.
# It shows that many mothers with young children are working or looking for
# work. The graph is slightly left-skewed (the tail extends toward low values,
# and the median of 69.6 sits above the mean of 68.8), showing fewer counties
# with very low participation rates.
# Add a square-root-transformed copy of the participation rate as a new column.
Wednesday_class_clean <- dplyr::mutate(
  Wednesday_class_clean,
  sqrt_flfpr_under6 = sqrt(flfpr_20to64_under6)
)
# Histogram of the square-root-transformed participation rate.
# Each row of the data is one county in one study year (the same
# county_fips_code appears under several study_year values), so the bar
# heights count county-year observations rather than distinct counties.
ggplot(Wednesday_class_clean, aes(x = sqrt_flfpr_under6)) +
  geom_histogram(binwidth = 0.5, color = "pink", fill = "red") +
  labs(
    title = "Square Root Transformed Female Labor Force Participation",
    x = "√(Participation Rate)",
    y = "Number of County-Year Observations"
  )

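# The square root transformation made the data look a tad more balanced and
# less skewed. Most values now cluster between 7 and 9, showing a smoother,
# more normal-looking distribution of female labor force participation rates.
# NOTE: a square root transform mainly corrects right skew (a long tail toward
# high values). Because the raw rates have their tail toward low values, it is
# worth confirming with a skewness statistic that the transform actually helps.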


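# Overall, I believe this analysis helped me visualize how female labor force
# participation among mothers with young children varies across U.S. counties.
# Note that the data contain several study years per county (starting in 2008)
# and no year filter was applied, so the results describe all county-year
# observations rather than the year 2018 alone. The transformation helped
# normalize the data for future statistical testing or some regression models
# that are related to childcare costs.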

#shout out to my incredible classmate karina who shared/highlighted some notes from lecture 5 and 6 (the class I missed because I was sick) which made completing this homework a lot easier!

# Note that the `echo = FALSE` parameter was added to the code chunk to prevent printing of the R code that generated the plot.