library(readr)
library(tidyr)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ purrr 1.1.0
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.2 ✔ tibble 3.3.0
## ✔ lubridate 1.9.4
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# Make the class folder the working directory so the CSV files can be loaded
# by bare file name.
# NOTE: this path is machine-specific; the script will not run on another
# computer without editing it.
setwd("~/Desktop/my class stuff/Wednesday Class")
childcare <- read_csv(file = "childcare_costs.csv")
## Rows: 34567 Columns: 61
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (61): county_fips_code, study_year, unr_16, funr_16, munr_16, unr_20to64...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# County lookup table, read from the current working directory
counties <- read_csv(file = "counties.csv")
## Rows: 3144 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (3): county_name, state_name, state_abbreviation
## dbl (1): county_fips_code
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Merge the county lookup (county name, state name, state abbreviation) onto
# the childcare data by county FIPS code.
# The childcare data repeats each county across study years, so many childcare
# rows should match a single lookup row. Declaring the join as many-to-one
# makes dplyr raise an error if `counties` ever contains a duplicated
# county_fips_code, instead of silently duplicating childcare rows.
childcare_named <- childcare %>%
  left_join(
    counties,
    by = "county_fips_code",
    relationship = "many-to-one"
  )
# Keep only the Texas rows (state abbreviation "TX"), then peek at the
# first few rows of the result
childcare_tx <- filter(childcare_named, state_abbreviation == "TX")
head(childcare_tx)
## # A tibble: 6 × 64
## county_fips_code study_year unr_16 funr_16 munr_16 unr_20to64 funr_20to64
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 48001 2008 5.89 4.2 6.95 5.3 2.8
## 2 48001 2009 7.79 6.47 8.66 7.2 4.8
## 3 48001 2010 7.01 6.75 7.21 6.2 5.3
## 4 48001 2011 7.4 7.47 7.35 6.4 5.3
## 5 48001 2012 6.97 8.09 6.14 5.8 5.6
## 6 48001 2013 7.32 7.2 7.42 6.5 5.8
## # ℹ 57 more variables: munr_20to64 <dbl>, flfpr_20to64 <dbl>,
## # flfpr_20to64_under6 <dbl>, flfpr_20to64_6to17 <dbl>,
## # flfpr_20to64_under6_6to17 <dbl>, mlfpr_20to64 <dbl>, pr_f <dbl>,
## # pr_p <dbl>, mhi_2018 <dbl>, me_2018 <dbl>, fme_2018 <dbl>, mme_2018 <dbl>,
## # total_pop <dbl>, one_race <dbl>, one_race_w <dbl>, one_race_b <dbl>,
## # one_race_i <dbl>, one_race_a <dbl>, one_race_h <dbl>, one_race_other <dbl>,
## # two_races <dbl>, hispanic <dbl>, households <dbl>, …
#Purrrr everything ran without a problem
Selecting my variables for the model
# Build the modelling table: one dependent (Y) variable plus the independent
# (X) variables.
#   Y = flfpr_20to64_under6 (mothers' labor force participation rate)
#   X = income and childcare costs
#       (mhi_2018, mc_infant, mc_preschool, mfcc_preschool)
# Rows with a missing value in any of the selected columns are dropped.
data_model <- childcare_tx %>%
  select(
    flfpr_20to64_under6,
    mhi_2018,
    mc_infant,
    mc_preschool,
    mfcc_preschool
  ) %>%
  drop_na()
Create a linear model
# Fit a linear model (lm = Linear Model) of the outcome on all four
# predictors, then print the fit summary.
model <- lm(
  formula = flfpr_20to64_under6 ~ mhi_2018 + mc_infant + mc_preschool +
    mfcc_preschool,
  data = data_model
)
summary(model)
##
## Call:
## lm(formula = flfpr_20to64_under6 ~ mhi_2018 + mc_infant + mc_preschool +
## mfcc_preschool, data = data_model)
##
## Residuals:
## Min 1Q Median 3Q Max
## -63.720 -6.424 0.739 7.445 39.368
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.518e+01 1.723e+00 32.023 <2e-16 ***
## mhi_2018 3.300e-05 2.698e-05 1.223 0.2214
## mc_infant -8.264e-02 9.597e-02 -0.861 0.3892
## mc_preschool 2.077e-01 1.242e-01 1.673 0.0945 .
## mfcc_preschool -6.939e-02 6.538e-02 -1.061 0.2886
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 14.21 on 2535 degrees of freedom
## Multiple R-squared: 0.006759, Adjusted R-squared: 0.005192
## F-statistic: 4.313 on 4 and 2535 DF, p-value: 0.001768
# Lori Notes from Last Class
# R-squared - how much of the variation in the dependent variable the model explains; the closer to 1 the better
# P-values - < 0.05 = significant
# Estimates (Coefficients): tell you the direction (+ or -) and strength of the relationship.
Residuals
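The smallest (most negative) error was -63.72 (the prediction was too high). The biggest error was +39.37 (the prediction was too low). The median error was 0.74, and the middle 50% of predictions were within about ±7 points of the actual value (1Q = -6.42, 3Q = 7.45), so even the middle was not perfect!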
Coefficients: Refresher from last week. Intercept - the baseline value of your dependent variable when all predictors are 0; basically the starting point of the equation.
ESTIMATE - the slope or direction of the relationship: positive (+) = the variables go up together, negative (-) = they move in opposite directions. Std. Error - how much uncertainty there is in the estimate; if it's smaller, the estimate is more precise. T value - how strong the relationship is. R-Squared - how much of the dependent variable (the thing I am trying to predict) is explained by the independent variables (the things I think are causing it). P value - since the overall model p-value (0.0018) is less than 0.05, the model is statistically significant, but I think it is weak, since the only individual predictor that comes close to significance (mc_preschool, p = 0.0945) is only under 0.10.
The linear regression model explained about 0.7% of the variation in mothers’ labor force participation (R-squared = 0.0068). While the overall model was statistically significant (p = 0.0018), most individual variables were not. The only variable that showed a weak relationship was preschool childcare cost (p = 0.094), which had a small positive effect: as preschool costs increased, mothers’ participation slightly increased.
From what I understand, for every 1-unit increase in preschool childcare cost, the mothers’ labor force participation rate is predicted to increase by about 0.21 percentage points, assuming all other variables stay exactly the same. That’s a positive relationship, meaning that in areas where preschool care costs are higher, mothers tend to work slightly more. This could be because higher childcare prices often occur in regions with stronger job markets or higher household incomes.
# Diagnostic plot 1 for the fitted model (residuals vs fitted values), used
# to check the linearity assumption
plot(x = model, which = 1)
The residual plot showed that the points were fairly evenly scattered around the horizontal line, without a strong curved pattern. This shows that the model mostly meets the assumption of linearity. There may be some minor variation, but no clear signs of any major violation.