# Childcare Prices, Regulation and Baumol's Cost Disease
# Dataset = "childcare_data_state"
library(ggplot2)

# Data Frame ----
# One row per US state, keyed by two-letter abbreviation:
#   ratio              children-to-staff ratio (toddler)
#   Hourly_Wage_2023   median hourly wage ($)
#   Center_Price_2023  center-based childcare price ($)
#   Home_Price_2023    home-based childcare price ($)
# The price columns are entered as text because the source marks missing
# values with "NR" (presumably "not reported") and writes some figures with a
# thousands separator (e.g. "6,422").
childcare_data_state <- data.frame(
  State = c(
    "AK", "AL", "AR", "AZ", "CA", "CO", "CT", "DE", "FL", "GA",
    "HI", "IA", "ID", "IL", "IN", "KS", "KY", "LA", "MA", "MD",
    "ME", "MI", "MN", "MO", "MS", "MT", "NC", "ND", "NE", "NH",
    "NJ", "NM", "NV", "NY", "OH", "OK", "OR", "PA", "RI", "SC",
    "SD", "TN", "TX", "UT", "VA", "VT", "WA", "WI", "WV", "WY"
  ),
  ratio = c(
    9, 8, 10.5, 10.5, 9, 8.5, 7, 9, 13, 12.5,
    10, 10, 9, 7.5, 7, 9.5, 11, 13, 6, 8,
    7, 7, 8.5, 13, 9, 8, 8, 11.5, 7, 8,
    11, 6, 12.5, 6, 9.5, 10, 7.5, 8, 7.5, 9,
    7.5, 8, 13, 9.5, 7.5, 9, 8.5, 9, 8, 9
  ),
  Hourly_Wage_2023 = c(
    26.99, 19.88, 18.78, 22.92, 25.98, 25.98, 26.98, 23.69, 21.67, 21.86,
    24.28, 22.34, 21.27, 23.43, 21.86, 21.76, 21.02, 19.87, 29.18, 26.83,
    22.88, 22.57, 24.46, 21.67, 18.03, 21.97, 21.85, 23.47, 22.33, 24.03,
    26.38, 20.97, 21.54, 27.33, 22.45, 19.94, 24.04, 22.81, 24.5, 20.3,
    21.0, 21.07, 22.1, 22.6, 24.0, 23.86, 28.81, 22.88, 19.12, 22.72
  ),
  Center_Price_2023 = c(
    "15076.57", "7919.43", "7162.37", "11488.62", "14224.55",
    "NR", "19175.76", "12415.38", "9184.25", "8669.60",
    "21186.70", "10980.39", "8096.16", "13160.25", "NR",
    "8490.01", "6,422", "7720.59", "21125.26", "11717.78",
    "11505.70", "11942.49", "15405.18", "10616.79", "4602.89",
    "9126.74", "10403.49", "10605.70", "10954.86", "13200.64",
    "13434.34", "NR", "14689.60", "17475.54", "11929.22",
    "8768.15", "13802.34", "11964.45", "14230.24", "9127.03",
    "7545.56", "8372.43", "9180.19", "9510.56", "12208.17",
    "13682.19", "15202.69", "13188.73", "8363.08", "8287.78"
  ),
  Home_Price_2023 = c(
    "9922.83", "6383.60", "5782.36", "6611.26", "12300.97",
    "NR", "12626.15", "8897.59", "8001.13", "7345.45",
    "10334.90", "7177.96", "6806.65", "9443.30", "NR",
    "5999.39", "5,706", "5765.00", "14105.71", "10086.25",
    "9270.99", "7916.18", "9585.73", "7329.25", "4232.97",
    "7457.64", "8763.18", "7967.24", "7901.77", "10500.45",
    "10341.55", "NR", "11314.82", "12417.73", "9253.85",
    "7626.55", "8075.61", "9439.29", "10642.39", "7109.80",
    "5689.37", "7067.90", "8351.70", "8120.18", "9321.90",
    "10430.66", "12309.99", "10345.16", "6570.99", "8124.86"
  )
)

# Convert a character vector of prices to numeric.
#   x              character vector of price strings
#   missing_codes  strings that denote a missing value (become NA)
# Returns a numeric vector the same length as `x`.
# Surrounding whitespace and thousands separators are removed before
# conversion; previously "6,422" and "5,706" (Kentucky) failed to parse and
# were turned into NA, which dropped the state from the analysis.
# Because the missing codes are handled explicitly, a remaining
# "NAs introduced by coercion" warning now signals a genuinely malformed value.
parse_price <- function(x, missing_codes = "NR") {
  x <- trimws(x)
  x[x %in% missing_codes] <- NA_character_
  as.numeric(gsub(",", "", x, fixed = TRUE))
}

# Convert the price columns to numeric, replacing "NR" with NA
price_cols <- c("Center_Price_2023", "Home_Price_2023")
childcare_data_state[price_cols] <- lapply(
  childcare_data_state[price_cols],
  parse_price
)

# Only complete cases for modelling and visualization: the states without a
# reported price (CO, IN, NM) are dropped, leaving 47 rows.
# NOTE: results recorded from earlier runs of this script (46 complete rows,
# i.e. 43 residual degrees of freedom) predate this fix, when Kentucky was
# silently dropped; re-run the models to refresh them.
data_clean <- na.omit(childcare_data_state)

# Model 1: Center-based Childcare Price
# Ordinary least squares of the center-based price on the hourly wage and the
# children-to-staff ratio, fitted on the complete-case rows in `data_clean`.
model_center <- lm(
  Center_Price_2023 ~ Hourly_Wage_2023 + ratio,
  data = data_clean
)
# Print explicitly (as the plots in this script do): a bare top-level
# expression is not auto-printed when the file is run through source().
print(summary(model_center))
## 
## Call:
## lm(formula = Center_Price_2023 ~ Hourly_Wage_2023 + ratio, data = data_clean)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -4417.9 -1322.5   -34.9   745.9  8116.2 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      -14070.43    3894.08  -3.613 0.000787 ***
## Hourly_Wage_2023   1146.83     131.36   8.731 4.49e-11 ***
## ratio               -70.41     175.47  -0.401 0.690225    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2080 on 43 degrees of freedom
## Multiple R-squared:  0.6765, Adjusted R-squared:  0.6614 
## F-statistic: 44.96 on 2 and 43 DF,  p-value: 2.901e-11
# Model 2: Home-based Childcare Price
# Ordinary least squares of the home-based price on the hourly wage and the
# children-to-staff ratio, fitted on the complete-case rows in `data_clean`.
model_home <- lm(
  Home_Price_2023 ~ Hourly_Wage_2023 + ratio,
  data = data_clean
)
# Print explicitly (as the plots in this script do): a bare top-level
# expression is not auto-printed when the file is run through source().
print(summary(model_home))
## 
## Call:
## lm(formula = Home_Price_2023 ~ Hourly_Wage_2023 + ratio, data = data_clean)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2020.1  -621.3  -168.0   789.9  3716.4 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      -7810.34    2063.59  -3.785 0.000472 ***
## Hourly_Wage_2023   724.84      69.61  10.413 2.49e-13 ***
## ratio              -16.35      92.99  -0.176 0.861215    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1102 on 43 degrees of freedom
## Multiple R-squared:  0.7443, Adjusted R-squared:  0.7324 
## F-statistic: 62.58 on 2 and 43 DF,  p-value: 1.847e-13
# Visualization ----

# Center-based price against hourly wage: one point per state in `data_clean`,
# shaded by the children-to-staff ratio, with a dashed linear fit and its
# confidence band.
# NOTE(review): a red-to-green gradient is hard to read with red-green
# colour-vision deficiency; consider scale_color_viridis_c() instead.
plot_center <- ggplot(
  data_clean,
  aes(x = Hourly_Wage_2023, y = Center_Price_2023, color = ratio)
) +
  geom_point(size = 3) +
  # The formula is spelled out so the fit is pinned to a simple linear model
  # and ggplot2 no longer emits its "using formula = 'y ~ x'" message.
  geom_smooth(
    method = "lm",
    formula = y ~ x,
    se = TRUE,
    color = "darkblue",
    linetype = "dashed"
  ) +
  labs(
    title = "Center-Based Childcare Price vs. Hourly Wage",
    subtitle = "Points colored by Children to Staff Ratio (toddler)",
    x = "Median Hourly Wage ($)",
    y = "Center Price ($)"
  ) +
  scale_color_gradient(low = "red", high = "green", name = "Ratio") +
  theme_minimal() +
  theme(plot.title = element_text(hjust = 0.5, face = "bold"))

print(plot_center)

# Home-based price against hourly wage: one point per state in `data_clean`,
# shaded by the children-to-staff ratio, with a dashed linear fit and its
# confidence band.
# NOTE(review): a red-to-green gradient is hard to read with red-green
# colour-vision deficiency; consider scale_color_viridis_c() instead.
plot_home <- ggplot(
  data_clean,
  aes(x = Hourly_Wage_2023, y = Home_Price_2023, color = ratio)
) +
  geom_point(size = 3) +
  # The formula is spelled out so the fit is pinned to a simple linear model
  # and ggplot2 no longer emits its "using formula = 'y ~ x'" message.
  geom_smooth(
    method = "lm",
    formula = y ~ x,
    se = TRUE,
    color = "darkblue",
    linetype = "dashed"
  ) +
  labs(
    title = "Home-Based Childcare Price vs. Hourly Wage",
    subtitle = "Points colored by Children to Staff Ratio (toddler)",
    x = "Median Hourly Wage ($)",
    y = "Home Price ($)"
  ) +
  scale_color_gradient(low = "red", high = "green", name = "Ratio") +
  theme_minimal() +
  theme(plot.title = element_text(hjust = 0.5, face = "bold"))

print(plot_home)