library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.2     ✔ tibble    3.3.0
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.1.0     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# The data set I'm using includes childcare costs, family income, and labor force participation rates from 2000–2018.
setwd("~/Desktop/my class stuff/Wednesday Class")
# Import datasets
costs <- read_csv("childcare_costs.csv")
## Rows: 34567 Columns: 61
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (61): county_fips_code, study_year, unr_16, funr_16, munr_16, unr_20to64...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
counties <- read_csv("counties.csv")
## Rows: 3144 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (3): county_name, state_name, state_abbreviation
## dbl (1): county_fips_code
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

Merge datasets by county FIPS code to get county names

data <- left_join(costs, counties, by = "county_fips_code")

Next I need to filter my data to the most recent year that was available, which is 2018. I select 2018 to conduct a cross-sectional analysis (one observation per county in the latest year). Focusing on one year avoids the complexity of time-series or panel correlations (because not doing that bestie!), and 2018 provides asnapshot of the relationship.

# Filter for the year 2018
data2018 <- data %>% filter(study_year == 2018)
# Remove counties with missing cost or LFPR data
data2018 <- data2018 %>% filter(!is.na(mc_infant) & !is.na(flfpr_20to64_under6))
# Number of observations
nrow(data2018)
## [1] 2360
# Labor force participation rate stats
mean(data2018$flfpr_20to64_under6, na.rm = TRUE)
## [1] 70.44784
sd(data2018$flfpr_20to64_under6, na.rm = TRUE)
## [1] 13.29605
# Infant care cost stats
summary(data2018[, c("flfpr_20to64_under6", "mc_infant")])
##  flfpr_20to64_under6   mc_infant     
##  Min.   :  0.00      Min.   : 27.73  
##  1st Qu.: 63.88      1st Qu.:126.40  
##  Median : 72.00      Median :153.60  
##  Mean   : 70.45      Mean   :162.39  
##  3rd Qu.: 78.50      3rd Qu.:182.29  
##  Max.   :100.00      Max.   :470.00
# Compute Pearson correlation
cor(data2018$mc_infant, data2018$flfpr_20to64_under6, use = "complete.obs")
## [1] 0.1013143
# Perform a correlation test with p-value
cor.test(data2018$mc_infant, data2018$flfpr_20to64_under6)
## 
##  Pearson's product-moment correlation
## 
## data:  data2018$mc_infant and data2018$flfpr_20to64_under6
## t = 4.9452, df = 2358, p-value = 8.143e-07
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.06121555 0.14108649
## sample estimates:
##       cor 
## 0.1013143
#REGRESSION MODELLLLLS RAHHH
# Simple linear regression: LFPR and childcare cost only
model_simple <- lm(flfpr_20to64_under6 ~ mc_infant, data = data2018)
summary(model_simple)
## 
## Call:
## lm(formula = flfpr_20to64_under6 ~ mc_infant, data = data2018)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -70.282  -6.744   1.235   7.672  32.072 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 66.686548   0.807885  82.545  < 2e-16 ***
## mc_infant    0.023162   0.004684   4.945 8.14e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 13.23 on 2358 degrees of freedom
## Multiple R-squared:  0.01026,    Adjusted R-squared:  0.009845 
## F-statistic: 24.45 on 1 and 2358 DF,  p-value: 8.143e-07
# Multiple linear regression with childcare cost + controls
model <- lm(flfpr_20to64_under6 ~ mc_infant + mhi_2018 + funr_20to64, data = data2018)
summary(model)
## 
## Call:
## lm(formula = flfpr_20to64_under6 ~ mc_infant + mhi_2018 + funr_20to64, 
##     data = data2018)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -75.441  -6.380   1.227   7.505  38.665 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  6.769e+01  1.393e+00  48.581  < 2e-16 ***
## mc_infant    5.231e-03  6.391e-03   0.819  0.41313    
## mhi_2018     8.470e-05  2.832e-05   2.991  0.00281 ** 
## funr_20to64 -4.981e-01  1.021e-01  -4.877 1.15e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 13.09 on 2356 degrees of freedom
## Multiple R-squared:  0.03126,    Adjusted R-squared:  0.03002 
## F-statistic: 25.34 on 3 and 2356 DF,  p-value: 3.94e-16
#Assumption 1 : Linearity (Is the relationship a straight-line?)
plot(model, which = 1)  # Residuals vs Fitted

#Assumption 2: Independence of Observations
#I think I am good because I am only analayzing 2018, so theres no other code needed
#Assumption 3: Homoscedasticity (Are residuals spread out evenly?)
library(lmtest)
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
bptest(model)
## 
##  studentized Breusch-Pagan test
## 
## data:  model
## BP = 39.078, df = 3, p-value = 1.671e-08
#NAUR my residuals do not have equal variance.
library(sandwich)
library(lmtest)
coeftest(model, vcov = vcovHC(model, type = "HC1"))
## 
## t test of coefficients:
## 
##                Estimate  Std. Error t value  Pr(>|t|)    
## (Intercept)  6.7693e+01  1.5397e+00 43.9659 < 2.2e-16 ***
## mc_infant    5.2313e-03  6.0325e-03  0.8672 0.3859231    
## mhi_2018     8.4703e-05  2.9674e-05  2.8545 0.0043485 ** 
## funr_20to64 -4.9806e-01  1.3953e-01 -3.5695 0.0003648 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#Assumption 3: Normality of Residuals
plot(model, which = 2)       # Q-Q plot

shapiro.test(resid(model))   # Formal test
## 
##  Shapiro-Wilk normality test
## 
## data:  resid(model)
## W = 0.9547, p-value < 2.2e-16
#ASSUMPTION4:Multicollinearity
library(car)
## Loading required package: carData
## 
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
## 
##     recode
## The following object is masked from 'package:purrr':
## 
##     some
vif(model)
##   mc_infant    mhi_2018 funr_20to64 
##    1.900630    2.170791    1.198647
library(tidyverse)
library(ggplot2)

My Variables to make the graphing easy flfpr_20to64_under6 – % of moms with kids <6 in the labor force (DV) mc_infant – weekly infant care cost (IV) mhi_2018 – median household income funr_20to64 – female unemployment rate

#Histogram of Childcare Costs
# square-root rule for number of bins
sqrt(nrow(data2018))
## [1] 48.57983
bins_cost <- ceiling(sqrt(nrow(data2018)))
ggplot(data2018, aes(x = mc_infant)) +
  geom_histogram(bins = bins_cost) +
  labs(
    title    = "Distribution of Weekly Infant Care Costs",
    subtitle = "U.S. counties, 2018",
    x = "Median weekly infant care cost (USD)",
    y = "Number of counties"
  )

# Number of bins for LFPR histogram, using the same square root rule
bins_lfpr <- ceiling(sqrt(nrow(data2018)))
ggplot(data2018, aes(x = flfpr_20to64_under6)) +
  geom_histogram(bins = bins_lfpr) +
  labs(
    title    = "Distribution of Maternal Labor Force Participation",
    subtitle = "Women 20–64 with children under 6, by county (2018)",
    x = "Female labor force participation rate (%)",
    y = "Number of counties"
  )

#Scatterplot: Childcare Cost vs Maternal LFPR
ggplot(data2018,
       aes(x = mc_infant,
           y = flfpr_20to64_under6)) +
  geom_point(alpha = 0.4) +
  geom_smooth(method = "lm", se = TRUE) +
  labs(
    title    = "Childcare Costs and Maternal Labor Force Participation",
    subtitle = "Each point is a county (2018)",
    x = "Median weekly infant care cost (USD)",
    y = "Female LFPR (%) for mothers with children under 6"
  )
## `geom_smooth()` using formula = 'y ~ x'

#Scatterplot with Income as a Third Dimension
# LETSSSS Create income quartilessss
data2018 <- data2018 %>%
  mutate(income_quartile = ntile(mhi_2018, 4))

ggplot(data2018,
       aes(x   = mc_infant,
           y   = flfpr_20to64_under6,
           color = as.factor(income_quartile))) +
  geom_point(alpha = 0.5) +
  labs(
    title    = "Childcare Costs and Maternal LFPR by Income Quartile",
    subtitle = "Counties grouped into quartiles of median household income",
    x = "Median weekly infant care cost (USD)",
    y = "Female LFPR (%) for mothers with children under 6",
    color = "Income\nquartile"
  )

#Plot of Model Residuals with ggplot
data2018 <- data2018 %>%
  mutate(
    fitted = fitted(model),
    resid  = resid(model)
  )

ggplot(data2018, aes(x = fitted, y = resid)) +
  geom_point(alpha = 0.4) +
  geom_hline(yintercept = 0, linetype = "dashed") +
  labs(
    title    = "Residuals vs Fitted Values",
    subtitle = "Checking linearity and constant variance",
    x = "Fitted values (predicted LFPR)",
    y = "Residuals"
  )

#Regression Table?
library(stargazer)
## 
## Please cite as:
##  Hlavac, Marek (2022). stargazer: Well-Formatted Regression and Summary Statistics Tables.
##  R package version 5.2.3. https://CRAN.R-project.org/package=stargazer
stargazer(model, type = "text")
## 
## ===============================================
##                         Dependent variable:    
##                     ---------------------------
##                         flfpr_20to64_under6    
## -----------------------------------------------
## mc_infant                      0.005           
##                               (0.006)          
##                                                
## mhi_2018                     0.0001***         
##                              (0.00003)         
##                                                
## funr_20to64                  -0.498***         
##                               (0.102)          
##                                                
## Constant                     67.693***         
##                               (1.393)          
##                                                
## -----------------------------------------------
## Observations                   2,360           
## R2                             0.031           
## Adjusted R2                    0.030           
## Residual Std. Error     13.095 (df = 2356)     
## F Statistic          25.338*** (df = 3; 2356)  
## ===============================================
## Note:               *p<0.1; **p<0.05; ***p<0.01