library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.2 ✔ tibble 3.3.0
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.1.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# The data set I'm using includes childcare costs, family income, and labor force participation rates from 2000–2018.
setwd("~/Desktop/my class stuff/Wednesday Class")
# Import datasets
costs <- read_csv("childcare_costs.csv")
## Rows: 34567 Columns: 61
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (61): county_fips_code, study_year, unr_16, funr_16, munr_16, unr_20to64...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
counties <- read_csv("counties.csv")
## Rows: 3144 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (3): county_name, state_name, state_abbreviation
## dbl (1): county_fips_code
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Merge datasets by county FIPS code to get county names
data <- left_join(costs, counties, by = "county_fips_code")
Next I need to filter my data to the most recent year that was available, which is 2018. I select 2018 to conduct a cross-sectional analysis (one observation per county in the latest year). Focusing on one year avoids the complexity of time-series or panel correlations (because not doing that bestie!), and 2018 provides asnapshot of the relationship.
# Filter for the year 2018
data2018 <- data %>% filter(study_year == 2018)
# Remove counties with missing cost or LFPR data
data2018 <- data2018 %>% filter(!is.na(mc_infant) & !is.na(flfpr_20to64_under6))
# Number of observations
nrow(data2018)
## [1] 2360
# Labor force participation rate stats
mean(data2018$flfpr_20to64_under6, na.rm = TRUE)
## [1] 70.44784
sd(data2018$flfpr_20to64_under6, na.rm = TRUE)
## [1] 13.29605
# Infant care cost stats
summary(data2018[, c("flfpr_20to64_under6", "mc_infant")])
## flfpr_20to64_under6 mc_infant
## Min. : 0.00 Min. : 27.73
## 1st Qu.: 63.88 1st Qu.:126.40
## Median : 72.00 Median :153.60
## Mean : 70.45 Mean :162.39
## 3rd Qu.: 78.50 3rd Qu.:182.29
## Max. :100.00 Max. :470.00
# Compute Pearson correlation
cor(data2018$mc_infant, data2018$flfpr_20to64_under6, use = "complete.obs")
## [1] 0.1013143
# Perform a correlation test with p-value
cor.test(data2018$mc_infant, data2018$flfpr_20to64_under6)
##
## Pearson's product-moment correlation
##
## data: data2018$mc_infant and data2018$flfpr_20to64_under6
## t = 4.9452, df = 2358, p-value = 8.143e-07
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.06121555 0.14108649
## sample estimates:
## cor
## 0.1013143
#REGRESSION MODELLLLLS RAHHH
# Simple linear regression: LFPR and childcare cost only
model_simple <- lm(flfpr_20to64_under6 ~ mc_infant, data = data2018)
summary(model_simple)
##
## Call:
## lm(formula = flfpr_20to64_under6 ~ mc_infant, data = data2018)
##
## Residuals:
## Min 1Q Median 3Q Max
## -70.282 -6.744 1.235 7.672 32.072
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 66.686548 0.807885 82.545 < 2e-16 ***
## mc_infant 0.023162 0.004684 4.945 8.14e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 13.23 on 2358 degrees of freedom
## Multiple R-squared: 0.01026, Adjusted R-squared: 0.009845
## F-statistic: 24.45 on 1 and 2358 DF, p-value: 8.143e-07
# Multiple linear regression with childcare cost + controls
model <- lm(flfpr_20to64_under6 ~ mc_infant + mhi_2018 + funr_20to64, data = data2018)
summary(model)
##
## Call:
## lm(formula = flfpr_20to64_under6 ~ mc_infant + mhi_2018 + funr_20to64,
## data = data2018)
##
## Residuals:
## Min 1Q Median 3Q Max
## -75.441 -6.380 1.227 7.505 38.665
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.769e+01 1.393e+00 48.581 < 2e-16 ***
## mc_infant 5.231e-03 6.391e-03 0.819 0.41313
## mhi_2018 8.470e-05 2.832e-05 2.991 0.00281 **
## funr_20to64 -4.981e-01 1.021e-01 -4.877 1.15e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 13.09 on 2356 degrees of freedom
## Multiple R-squared: 0.03126, Adjusted R-squared: 0.03002
## F-statistic: 25.34 on 3 and 2356 DF, p-value: 3.94e-16
#Assumption 1 : Linearity (Is the relationship a straight-line?)
plot(model, which = 1) # Residuals vs Fitted
#Assumption 2: Independence of Observations
#I think I am good because I am only analayzing 2018, so theres no other code needed
#Assumption 3: Homoscedasticity (Are residuals spread out evenly?)
library(lmtest)
## Loading required package: zoo
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
bptest(model)
##
## studentized Breusch-Pagan test
##
## data: model
## BP = 39.078, df = 3, p-value = 1.671e-08
#NAUR my residuals do not have equal variance.
library(sandwich)
library(lmtest)
coeftest(model, vcov = vcovHC(model, type = "HC1"))
##
## t test of coefficients:
##
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.7693e+01 1.5397e+00 43.9659 < 2.2e-16 ***
## mc_infant 5.2313e-03 6.0325e-03 0.8672 0.3859231
## mhi_2018 8.4703e-05 2.9674e-05 2.8545 0.0043485 **
## funr_20to64 -4.9806e-01 1.3953e-01 -3.5695 0.0003648 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#Assumption 3: Normality of Residuals
plot(model, which = 2) # Q-Q plot
shapiro.test(resid(model)) # Formal test
##
## Shapiro-Wilk normality test
##
## data: resid(model)
## W = 0.9547, p-value < 2.2e-16
#ASSUMPTION4:Multicollinearity
library(car)
## Loading required package: carData
##
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
##
## recode
## The following object is masked from 'package:purrr':
##
## some
vif(model)
## mc_infant mhi_2018 funr_20to64
## 1.900630 2.170791 1.198647
library(tidyverse)
library(ggplot2)
My Variables to make the graphing easy flfpr_20to64_under6 – % of moms with kids <6 in the labor force (DV) mc_infant – weekly infant care cost (IV) mhi_2018 – median household income funr_20to64 – female unemployment rate
#Histogram of Childcare Costs
# square-root rule for number of bins
sqrt(nrow(data2018))
## [1] 48.57983
bins_cost <- ceiling(sqrt(nrow(data2018)))
ggplot(data2018, aes(x = mc_infant)) +
geom_histogram(bins = bins_cost) +
labs(
title = "Distribution of Weekly Infant Care Costs",
subtitle = "U.S. counties, 2018",
x = "Median weekly infant care cost (USD)",
y = "Number of counties"
)
# Number of bins for LFPR histogram, using the same square root rule
bins_lfpr <- ceiling(sqrt(nrow(data2018)))
ggplot(data2018, aes(x = flfpr_20to64_under6)) +
geom_histogram(bins = bins_lfpr) +
labs(
title = "Distribution of Maternal Labor Force Participation",
subtitle = "Women 20–64 with children under 6, by county (2018)",
x = "Female labor force participation rate (%)",
y = "Number of counties"
)
#Scatterplot: Childcare Cost vs Maternal LFPR
ggplot(data2018,
aes(x = mc_infant,
y = flfpr_20to64_under6)) +
geom_point(alpha = 0.4) +
geom_smooth(method = "lm", se = TRUE) +
labs(
title = "Childcare Costs and Maternal Labor Force Participation",
subtitle = "Each point is a county (2018)",
x = "Median weekly infant care cost (USD)",
y = "Female LFPR (%) for mothers with children under 6"
)
## `geom_smooth()` using formula = 'y ~ x'
#Scatterplot with Income as a Third Dimension
# LETSSSS Create income quartilessss
data2018 <- data2018 %>%
mutate(income_quartile = ntile(mhi_2018, 4))
ggplot(data2018,
aes(x = mc_infant,
y = flfpr_20to64_under6,
color = as.factor(income_quartile))) +
geom_point(alpha = 0.5) +
labs(
title = "Childcare Costs and Maternal LFPR by Income Quartile",
subtitle = "Counties grouped into quartiles of median household income",
x = "Median weekly infant care cost (USD)",
y = "Female LFPR (%) for mothers with children under 6",
color = "Income\nquartile"
)
#Plot of Model Residuals with ggplot
data2018 <- data2018 %>%
mutate(
fitted = fitted(model),
resid = resid(model)
)
ggplot(data2018, aes(x = fitted, y = resid)) +
geom_point(alpha = 0.4) +
geom_hline(yintercept = 0, linetype = "dashed") +
labs(
title = "Residuals vs Fitted Values",
subtitle = "Checking linearity and constant variance",
x = "Fitted values (predicted LFPR)",
y = "Residuals"
)
#Regression Table?
library(stargazer)
##
## Please cite as:
## Hlavac, Marek (2022). stargazer: Well-Formatted Regression and Summary Statistics Tables.
## R package version 5.2.3. https://CRAN.R-project.org/package=stargazer
stargazer(model, type = "text")
##
## ===============================================
## Dependent variable:
## ---------------------------
## flfpr_20to64_under6
## -----------------------------------------------
## mc_infant 0.005
## (0.006)
##
## mhi_2018 0.0001***
## (0.00003)
##
## funr_20to64 -0.498***
## (0.102)
##
## Constant 67.693***
## (1.393)
##
## -----------------------------------------------
## Observations 2,360
## R2 0.031
## Adjusted R2 0.030
## Residual Std. Error 13.095 (df = 2356)
## F Statistic 25.338*** (df = 3; 2356)
## ===============================================
## Note: *p<0.1; **p<0.05; ***p<0.01