library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.2     ✔ tibble    3.3.0
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.1.0     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(lmtest)
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## 
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
library(MASS)
## 
## Attaching package: 'MASS'
## 
## The following object is masked from 'package:dplyr':
## 
##     select
# Set your working directory (adjust this path to wherever the data file lives)
setwd("~/Desktop/my class stuff/Wednesday Class")

childcare <- read_csv("childcare_costs.csv")
## Rows: 34567 Columns: 61
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (61): county_fips_code, study_year, unr_16, funr_16, munr_16, unr_20to64...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
names(childcare)
##  [1] "county_fips_code"          "study_year"               
##  [3] "unr_16"                    "funr_16"                  
##  [5] "munr_16"                   "unr_20to64"               
##  [7] "funr_20to64"               "munr_20to64"              
##  [9] "flfpr_20to64"              "flfpr_20to64_under6"      
## [11] "flfpr_20to64_6to17"        "flfpr_20to64_under6_6to17"
## [13] "mlfpr_20to64"              "pr_f"                     
## [15] "pr_p"                      "mhi_2018"                 
## [17] "me_2018"                   "fme_2018"                 
## [19] "mme_2018"                  "total_pop"                
## [21] "one_race"                  "one_race_w"               
## [23] "one_race_b"                "one_race_i"               
## [25] "one_race_a"                "one_race_h"               
## [27] "one_race_other"            "two_races"                
## [29] "hispanic"                  "households"               
## [31] "h_under6_both_work"        "h_under6_f_work"          
## [33] "h_under6_m_work"           "h_under6_single_m"        
## [35] "h_6to17_both_work"         "h_6to17_fwork"            
## [37] "h_6to17_mwork"             "h_6to17_single_m"         
## [39] "emp_m"                     "memp_m"                   
## [41] "femp_m"                    "emp_service"              
## [43] "memp_service"              "femp_service"             
## [45] "emp_sales"                 "memp_sales"               
## [47] "femp_sales"                "emp_n"                    
## [49] "memp_n"                    "femp_n"                   
## [51] "emp_p"                     "memp_p"                   
## [53] "femp_p"                    "mcsa"                     
## [55] "mfccsa"                    "mc_infant"                
## [57] "mc_toddler"                "mc_preschool"             
## [59] "mfcc_infant"               "mfcc_toddler"             
## [61] "mfcc_preschool"
head(childcare)
## # A tibble: 6 × 61
##   county_fips_code study_year unr_16 funr_16 munr_16 unr_20to64 funr_20to64
##              <dbl>      <dbl>  <dbl>   <dbl>   <dbl>      <dbl>       <dbl>
## 1             1001       2008   5.42    4.41    6.32        4.6         3.5
## 2             1001       2009   5.93    5.72    6.11        4.8         4.6
## 3             1001       2010   6.21    5.57    6.78        5.1         4.6
## 4             1001       2011   7.55    8.13    7.03        6.2         6.3
## 5             1001       2012   8.6     8.88    8.29        6.7         6.4
## 6             1001       2013   9.39   10.3     8.56        7.3         7.6
## # ℹ 54 more variables: munr_20to64 <dbl>, flfpr_20to64 <dbl>,
## #   flfpr_20to64_under6 <dbl>, flfpr_20to64_6to17 <dbl>,
## #   flfpr_20to64_under6_6to17 <dbl>, mlfpr_20to64 <dbl>, pr_f <dbl>,
## #   pr_p <dbl>, mhi_2018 <dbl>, me_2018 <dbl>, fme_2018 <dbl>, mme_2018 <dbl>,
## #   total_pop <dbl>, one_race <dbl>, one_race_w <dbl>, one_race_b <dbl>,
## #   one_race_i <dbl>, one_race_a <dbl>, one_race_h <dbl>, one_race_other <dbl>,
## #   two_races <dbl>, hispanic <dbl>, households <dbl>, …

Outcome (Y): mc_preschool (median center cost for preschool).

Predictors (X): mhi_2018 (median household income, 2018) and unr_20to64 (unemployment rate, age 20–64).
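
Before fitting anything, it can help to glance at these three variables. The sketch below just summarizes them (output not shown here; it only assumes the column names listed above).

# Quick look at the outcome and the two predictors before modeling
childcare |>
  dplyr::select(mc_preschool, mhi_2018, unr_20to64) |>
  summary()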

# Keep only the model variables and drop rows with missing values (NAs)
childcare_model1_df <- childcare |>
  dplyr::select(mc_preschool, mhi_2018, unr_20to64) |>
  tidyr::drop_na()
#Linear model
childcare_model1 <- lm(mc_preschool ~ mhi_2018 + unr_20to64,
                       data = childcare_model1_df)

summary(childcare_model1)
## 
## Call:
## lm(formula = mc_preschool ~ mhi_2018 + unr_20to64, data = childcare_model1_df)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -124.104  -19.218   -2.962   14.736  184.894 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 7.224e+00  1.063e+00   6.799 1.08e-11 ***
## mhi_2018    1.990e-03  1.575e-05 126.381  < 2e-16 ***
## unr_20to64  2.043e+00  6.089e-02  33.547  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 29.62 on 23590 degrees of freedom
## Multiple R-squared:  0.4095, Adjusted R-squared:  0.4094 
## F-statistic:  8179 on 2 and 23590 DF,  p-value: < 2.2e-16
# Residuals vs Fitted
plot(childcare_model1, which = 1)

library(lmtest)
raintest(childcare_model1)
## 
##  Rainbow test
## 
## data:  childcare_model1
## Rain = 1.4224, df1 = 11797, df2 = 11793, p-value < 2.2e-16
# Remember: p < .05 here suggests possible nonlinearity, so the linearity assumption may be violated
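# If the possible nonlinearity mattered, one way to probe it (just a sketch, not part of the
# assignment; childcare_model1_sq is a made-up name) is to add a squared income term:
childcare_model1_sq <- lm(mc_preschool ~ mhi_2018 + I(mhi_2018^2) + unr_20to64,
                          data = childcare_model1_df)
anova(childcare_model1, childcare_model1_sq)  # does the squared term improve the fit?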
# Independence of errors - checked with the Durbin-Watson test below
library(car)
## Loading required package: carData
## 
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
## 
##     recode
## The following object is masked from 'package:purrr':
## 
##     some
# We load car because it provides the durbinWatsonTest() function.
# The Durbin-Watson test checks whether the errors (residuals) are independent of each other,
# i.e., whether one observation's mistake doesn't influence the next one's.
# p-value > 0.05: the errors look independent
# p-value < 0.05: the model's errors are likely correlated with each other
durbinWatsonTest(childcare_model1)
##  lag Autocorrelation D-W Statistic p-value
##    1        0.852482     0.2949153       0
##  Alternative hypothesis: rho != 0
# Homoscedasticity - this one looks like it failed (see the Scale-Location plot)
plot(childcare_model1, which = 3)

# The Scale-Location plot shows how spread out the errors are across the predicted values.
# The Breusch-Pagan test (bptest()) is the formal check for equal variance in the errors
# shown in that plot, i.e., making sure my errors are evenly spread out.
bptest(childcare_model1)
## 
##  studentized Breusch-Pagan test
## 
## data:  childcare_model1
## BP = 1719.8, df = 2, p-value < 2.2e-16
# Normality of residuals - I think it failed
plot(childcare_model1, which = 2)

# If the residuals are roughly normal (the bell shape we talked about in class),
# then my confidence intervals and significance tests (p-values) are valid.

# In the Q-Q plot, the dots should fall roughly along the diagonal dotted line.

# Shapiro–Wilk test (shapiro.test() only accepts samples of up to 5,000, so test a random subset)
set.seed(123) 
resids <- residuals(childcare_model1)
resid_sample <- sample(resids, 5000)  # random subset of 5,000 residuals
shapiro.test(resid_sample)
## 
##  Shapiro-Wilk normality test
## 
## data:  resid_sample
## W = 0.96441, p-value < 2.2e-16
#No multicollinearity - passed
vif(childcare_model1)
##   mhi_2018 unr_20to64 
##   1.198966   1.198966
#Variance Inflation Factor checks if the predictors are too correlated with each other
#simple correlation matrix between median household income and unemployment rate

cor(childcare_model1_df[, c("mhi_2018", "unr_20to64")], 
    use = "complete.obs")
##              mhi_2018 unr_20to64
## mhi_2018    1.0000000 -0.4073669
## unr_20to64 -0.4073669  1.0000000
#VIF < 5 is good (no worrying collinearity)
#VIF > 10 is bad (strong overlap between predictors); values in between are borderline
#Correlation closer to 0 is good; closer to ±1 means the predictors are too similar

The linear regression model predicts median preschool cost based on median household income and unemployment rate. I believe the model met some of the main assumptions but not all of them.

The Durbin–Watson test (D–W statistic = 0.29, autocorrelation ≈ 0.85, p < .001) showed that the errors are not independent: the model's mistakes do follow a pattern. This makes sense because each county appears in the data once per study year, so the same county's errors are related from year to year, which violates the independence assumption.

The Breusch–Pagan test (p < 0.001) showed that the errors do not have equal spread: the model's predictions are more accurate for some income levels than for others. This violates the equal-variance (homoscedasticity) assumption.

The Shapiro–Wilk test (p < 0.001) and the Q–Q plot both showed that the residuals were not perfectly normal, meaning the prediction errors don’t follow a perfect bell-shaped curve.
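
As an extra visual check on that bell shape (just a sketch, output not shown), a histogram of the residuals tells the same story as the Q–Q plot.

# Sketch: histogram of residuals as another quick normality check
hist(residuals(childcare_model1), breaks = 50,
     main = "Histogram of residuals", xlab = "Residual")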

However, the VIF values (about 1.2) and the moderate correlation (r = –0.41) showed that there is no problematic collinearity between income and the unemployment rate.

Overall, I think the model works fairly well and shows a clear relationship between income and preschool cost, but it violates the assumptions of independent errors, equal variance, and normality.
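
Since the errors are both unevenly spread and correlated within counties, one common response (just an aside here; the next step tries a log transform instead) is to keep the same coefficients but compute cluster-robust standard errors. The sketch below assumes the sandwich package is installed, and the names childcare_cl_df and childcare_model1_cl are made up for illustration.

# Sketch: cluster-robust standard errors by county (assumes the sandwich package)
library(sandwich)
childcare_cl_df <- childcare |>
  dplyr::select(county_fips_code, mc_preschool, mhi_2018, unr_20to64) |>
  tidyr::drop_na()
childcare_model1_cl <- lm(mc_preschool ~ mhi_2018 + unr_20to64, data = childcare_cl_df)
coeftest(childcare_model1_cl,
         vcov = vcovCL(childcare_model1_cl, cluster = ~ county_fips_code))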

# Okay, let's try to fix these issues by log-transforming the dependent variable
childcare_model_log <- lm(log(mc_preschool) ~ mhi_2018 + unr_20to64,
                          data = childcare_model1_df)

summary(childcare_model_log)
## 
## Call:
## lm(formula = log(mc_preschool) ~ mhi_2018 + unr_20to64, data = childcare_model1_df)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.53670 -0.14600 -0.00292  0.14184  1.02072 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 3.945e+00  8.157e-03  483.63   <2e-16 ***
## mhi_2018    1.426e-05  1.209e-07  117.94   <2e-16 ***
## unr_20to64  1.368e-02  4.675e-04   29.27   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.2274 on 23590 degrees of freedom
## Multiple R-squared:  0.3779, Adjusted R-squared:  0.3779 
## F-statistic:  7166 on 2 and 23590 DF,  p-value: < 2.2e-16
# Re-check key assumptions
plot(childcare_model_log, which = 3)   # Scale-Location (equal variance)

plot(childcare_model_log, which = 2)   # Q-Q plot (normality)

bptest(childcare_model_log)            # BP test again
## 
##  studentized Breusch-Pagan test
## 
## data:  childcare_model_log
## BP = 257.05, df = 2, p-value < 2.2e-16
shapiro.test(sample(residuals(childcare_model_log), 5000))
## 
##  Shapiro-Wilk normality test
## 
## data:  sample(residuals(childcare_model_log), 5000)
## W = 0.99773, p-value = 9.583e-07

After taking the log of median preschool cost, the model looks like it behaves better. The Scale–Location plot showed that the errors were spread out more evenly, and the Q–Q plot looked straighter, so the errors were closer to normal. The Breusch–Pagan test still detected some uneven spread (p < .001), but the statistic dropped from about 1720 to about 257, so it wasn't nearly as bad as before. The Shapiro–Wilk test also improved (W went from 0.964 to 0.998), with the errors looking much more normal overall. The low overlap between the predictors stayed fine, although the log transform does not fix the year-to-year correlation in each county's errors. Overall, the log-transformed model comes closer to meeting the regression assumptions and gives a more trustworthy picture of how preschool cost relates to income and unemployment.
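
One nice thing about the log model (sketched below using the coefficients printed above) is that its effects can be read as approximate percent changes in preschool cost, holding the other predictor constant.

# Sketch: with a logged outcome, exp(coefficient * change) - 1 is roughly the percent change
exp(coef(childcare_model_log)["mhi_2018"] * 10000) - 1  # about 0.15, i.e. ~15% higher cost per extra $10,000 of income
exp(coef(childcare_model_log)["unr_20to64"]) - 1         # about 0.014, i.e. ~1.4% higher cost per 1-point rise in unemployment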