library(readr)
library(tidyr)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ purrr     1.1.0
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.2     ✔ tibble    3.3.0
## ✔ lubridate 1.9.4     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# Point R at the class folder so the CSV files can be read by bare file name.
# NOTE(review): this path is specific to one machine; anyone else running the
# script has to edit it first.
setwd(dir = "~/Desktop/my class stuff/Wednesday Class")

# Load the childcare cost data from the working directory.
childcare <- read_csv(file = "childcare_costs.csv")
## Rows: 34567 Columns: 61
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (61): county_fips_code, study_year, unr_16, funr_16, munr_16, unr_20to64...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Load the county lookup table from the working directory.
counties <- read_csv(file = "counties.csv")
## Rows: 3144 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (3): county_name, state_name, state_abbreviation
## dbl (1): county_fips_code
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Attach the county lookup columns to every childcare row, matching on the
# county FIPS code. Each childcare row should match at most one county, so
# `relationship = "many-to-one"` makes the join stop with an error (instead of
# silently duplicating childcare rows) if `counties` ever lists a FIPS code
# more than once.
childcare_named <- childcare %>%
  left_join(
    counties,
    by = "county_fips_code",
    relationship = "many-to-one"
  )

# Keep only the Texas rows. Rows whose state abbreviation is NA (no matching
# county) are dropped too, because filter() only keeps rows where the
# condition is TRUE.
childcare_tx <- childcare_named %>%
  filter(state_abbreviation == "TX")

# Peek at the first rows to confirm the join and the filter worked.
head(childcare_tx)
## # A tibble: 6 × 64
##   county_fips_code study_year unr_16 funr_16 munr_16 unr_20to64 funr_20to64
##              <dbl>      <dbl>  <dbl>   <dbl>   <dbl>      <dbl>       <dbl>
## 1            48001       2008   5.89    4.2     6.95        5.3         2.8
## 2            48001       2009   7.79    6.47    8.66        7.2         4.8
## 3            48001       2010   7.01    6.75    7.21        6.2         5.3
## 4            48001       2011   7.4     7.47    7.35        6.4         5.3
## 5            48001       2012   6.97    8.09    6.14        5.8         5.6
## 6            48001       2013   7.32    7.2     7.42        6.5         5.8
## # ℹ 57 more variables: munr_20to64 <dbl>, flfpr_20to64 <dbl>,
## #   flfpr_20to64_under6 <dbl>, flfpr_20to64_6to17 <dbl>,
## #   flfpr_20to64_under6_6to17 <dbl>, mlfpr_20to64 <dbl>, pr_f <dbl>,
## #   pr_p <dbl>, mhi_2018 <dbl>, me_2018 <dbl>, fme_2018 <dbl>, mme_2018 <dbl>,
## #   total_pop <dbl>, one_race <dbl>, one_race_w <dbl>, one_race_b <dbl>,
## #   one_race_i <dbl>, one_race_a <dbl>, one_race_h <dbl>, one_race_other <dbl>,
## #   two_races <dbl>, hispanic <dbl>, households <dbl>, …
#Purrrr everything ran without a problem

Selecting my variables for the model

# Build the modelling table: one dependent variable (Y) and four independent
# variables (X), keeping only the rows where all five columns are present.
#   Y = mothers' labor force participation rate (flfpr_20to64_under6)
#   X = income (mhi_2018) and childcare costs (mc_infant, mc_preschool,
#       mfcc_preschool)
data_model <- drop_na(
  select(
    childcare_tx,
    flfpr_20to64_under6,
    mhi_2018,
    mc_infant,
    mc_preschool,
    mfcc_preschool
  )
)

Create a linear model

# Fit a linear model (lm) of mothers' labor force participation on income and
# the three childcare cost variables, using the cleaned Texas data.
model <- lm(
  formula = flfpr_20to64_under6 ~
    mhi_2018 + mc_infant + mc_preschool + mfcc_preschool,
  data = data_model
)

# Print the residual summary, coefficient table, R-squared and F-test.
summary(model)
## 
## Call:
## lm(formula = flfpr_20to64_under6 ~ mhi_2018 + mc_infant + mc_preschool + 
##     mfcc_preschool, data = data_model)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -63.720  -6.424   0.739   7.445  39.368 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     5.518e+01  1.723e+00  32.023   <2e-16 ***
## mhi_2018        3.300e-05  2.698e-05   1.223   0.2214    
## mc_infant      -8.264e-02  9.597e-02  -0.861   0.3892    
## mc_preschool    2.077e-01  1.242e-01   1.673   0.0945 .  
## mfcc_preschool -6.939e-02  6.538e-02  -1.061   0.2886    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 14.21 on 2535 degrees of freedom
## Multiple R-squared:  0.006759,   Adjusted R-squared:  0.005192 
## F-statistic: 4.313 on 4 and 2535 DF,  p-value: 0.001768
# Lori Notes from Last Class
# R-squared: how much of the variation in the dependent variable the model explains; the closer to 1, the better
# P-values: < 0.05 = statistically significant
# Estimates (Coefficients): tell you the direction (+ or -) and strength of the relationship.

Residuals

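The smallest residual was -63.72 (the prediction was too high) and the biggest was +39.37 (the prediction was too low). The median residual was 0.74, and the middle 50% of the residuals fell between about -6.4 and +7.4 points, so a typical prediction was off by up to about 7 points; the middle was not perfect!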

Coefficients: Refresher from last week. Intercept - the baseline value of your dependent variable when all predictors are 0; basically, the starting point of the equation.

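Estimate - the slope, or direction, of the relationship: positive (+) means the two go up together, negative (-) means they move in opposite directions. Std. Error - how much uncertainty there is in the estimate; the smaller it is, the more precise the estimate. t value - the estimate divided by its standard error; the further it is from 0, the stronger the evidence for a relationship. R-squared - how much of the variation in the dependent variable (the thing I am trying to predict) is explained by the independent variables (the things I think are causing it). P value - since the overall model p-value of 0.0018 is less than 0.05, the model is statistically significant, but I think it is weak: R-squared is below 0.01, and the best individual predictor (preschool cost) is only significant at the 0.10 level.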

The linear regression model explained about 0.7% of the variation in mothers’ labor force participation. While the overall model was statistically significant (p = 0.0018), none of the individual variables were significant at the 0.05 level. The only variable that showed even a weak relationship was preschool childcare cost (p = 0.094, significant only at the 0.10 level), which had a small positive coefficient: as preschool costs increased, mothers’ participation slightly increased.

From what I understand, for every 1-unit increase in preschool childcare cost, the mothers’ labor force participation rate is predicted to increase by about 0.21 percentage points, assuming all other variables stay exactly the same. That’s a positive relationship, meaning that in areas where preschool care costs are higher, mothers tend to work slightly more. This could be because higher childcare prices often occur in regions with stronger job markets or higher household incomes.

# Draw only the first lm diagnostic plot (residuals against fitted values).
plot(model, which = 1L)

The residual plot showed that the points were fairly evenly scattered around the horizontal line, without a strong curved pattern. This suggests that the model mostly meets the assumption of linearity. There may be some minor variation, but there are no clear signs of any major violation.