# Setup chunk: load the main libraries this analysis needs.
# - readr: to read CSV files (read_csv)
# - dplyr: for data cleaning and manipulation (pipe, joins, filter, select)
# - ggplot2: for visuals
# - pastecs: for extra descriptive stats
# echo = TRUE makes knitr show each chunk's code alongside its output.
knitr::opts_chunk$set(echo = TRUE)
library(readr)
library(dplyr)
library(ggplot2)
library(pastecs)
# STEP 1: Load my dataset
# The data set includes childcare costs, family income, and labor force
# participation rates by county and study year.
# NOTE(review): the original note said 2000–2018, but 34,567 rows over 3,144
# counties is about 11 years per county, which suggests 2008–2018 — confirm.
#
# Build full paths from one data directory instead of calling setwd(), so
# loading the data does not change the working directory for the rest of the
# session. Edit `data_dir` if the CSV files live somewhere else.
data_dir <- path.expand("~/Desktop/my class stuff/Wednesday Class")

# Childcare cost data, one row per county and study year.
# (Read once; a second identical read_csv() call was redundant.)
childcare <- read_csv(file.path(data_dir, "childcare_costs.csv"))
## Rows: 34567 Columns: 61
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (61): county_fips_code, study_year, unr_16, funr_16, munr_16, unr_20to64...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# County lookup table: FIPS code plus county name, state name, and state
# abbreviation.
counties <- read_csv(file.path(data_dir, "counties.csv"))
## Rows: 3144 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (3): county_name, state_name, state_abbreviation
## dbl (1): county_fips_code
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Attach county and state names to every childcare row. left_join() keeps all
# childcare rows, even those whose FIPS code has no match in `counties`.
childcare_named <- childcare %>%
  left_join(counties, by = "county_fips_code")

# Texas-only subset.
# NOTE(review): the part of the script visible here never uses childcare_tx;
# the correlations are run on childcare_named (all states) — confirm intended.
childcare_tx <- childcare_named %>%
  filter(state_abbreviation == "TX")
# Select key variables to study correlations
# mhi_2018 = median household income
# mc_infant / mc_preschool = childcare costs for infants / preschoolers (center-based)
# mfcc_preschool = childcare costs for preschoolers (family child care)
# flfpr_20to64_under6 = labor force participation for mothers with kids under 6
vars <- childcare_named %>%
  select(
    mhi_2018,                 # income
    mc_infant, mc_preschool,  # center prices
    mfcc_preschool,           # family child care price
    flfpr_20to64_under6       # moms' LFPR under age 6
  )

# Remove any rows with missing values so correlation can run properly.
# drop_na() is exported by tidyr, which this script never attaches, so use the
# base-R equivalent: keep only rows where every selected column is non-missing.
vars_complete <- vars[complete.cases(vars), ]
# Correlation matrices for the five selected variables, one per method.
# Pearson measures linear association; Spearman and Kendall are rank-based,
# so they may be better suited to skewed data.
cor_matrix_pearson <- vars_complete %>% cor(method = "pearson")
print(cor_matrix_pearson)
##                      mhi_2018  mc_infant mc_preschool mfcc_preschool
## mhi_2018            1.0000000 0.59947695    0.6122877     0.59690660
## mc_infant           0.5994769 1.00000000    0.9624722     0.89632931
## mc_preschool        0.6122877 0.96247223    1.0000000     0.90441285
## mfcc_preschool      0.5969066 0.89632931    0.9044129     1.00000000
## flfpr_20to64_under6 0.1569362 0.09861032    0.1211850     0.09222734
##                     flfpr_20to64_under6
## mhi_2018                     0.15693617
## mc_infant                    0.09861032
## mc_preschool                 0.12118504
## mfcc_preschool               0.09222734
## flfpr_20to64_under6          1.00000000
# Rank-based versions of the same matrix.
cor_matrix_spearman <- vars_complete %>% cor(method = "spearman")
cor_matrix_kendall  <- vars_complete %>% cor(method = "kendall")

print(cor_matrix_spearman)
##                      mhi_2018 mc_infant mc_preschool mfcc_preschool
## mhi_2018            1.0000000 0.5539651    0.5534705      0.5316485
## mc_infant           0.5539651 1.0000000    0.9629580      0.9078018
## mc_preschool        0.5534705 0.9629580    1.0000000      0.9062732
## mfcc_preschool      0.5316485 0.9078018    0.9062732      1.0000000
## flfpr_20to64_under6 0.2006453 0.1339412    0.1472661      0.1053287
##                     flfpr_20to64_under6
## mhi_2018                      0.2006453
## mc_infant                     0.1339412
## mc_preschool                  0.1472661
## mfcc_preschool                0.1053287
## flfpr_20to64_under6           1.0000000
print(cor_matrix_kendall)
##                      mhi_2018  mc_infant mc_preschool mfcc_preschool
## mhi_2018            1.0000000 0.38940161    0.3885025      0.3724460
## mc_infant           0.3894016 1.00000000    0.8429096      0.7463630
## mc_preschool        0.3885025 0.84290956    1.0000000      0.7492900
## mfcc_preschool      0.3724460 0.74636298    0.7492900      1.0000000
## flfpr_20to64_under6 0.1348935 0.08996004    0.0990518      0.0710031
##                     flfpr_20to64_under6
## mhi_2018                     0.13489352
## mc_infant                    0.08996004
## mc_preschool                 0.09905180
## mfcc_preschool               0.07100310
## flfpr_20to64_under6          1.00000000
# Scatterplot matrix: one panel for every pair of variables in vars_complete.
# Each panel shows how two variables move together; look for strong upward or
# downward patterns. Points are drawn small and solid (pch = 19, cex = 0.35).
pairs(
  x    = vars_complete,
  pch  = 19,
  cex  = 0.35,
  main = "Pairs Plot: Income, Prices, and Mothers' Labor Force Participation"
)

# Test the strongest-looking relationship
# Infant vs preschool childcare costs – these are highly positively correlated.

x <- vars_complete$mc_infant
y <- vars_complete$mc_preschool

# Pearson = linear correlation
pearson_res <- cor.test(x, y, method = "pearson")

# Spearman = rank correlation (monotonic, less sensitive to outliers).
# The data contain ties, so an exact p-value cannot be computed. exact = FALSE
# requests the asymptotic p-value directly instead of letting cor.test() warn
# ("Cannot compute exact p-value with ties") and fall back to it; the estimate
# and p-value are unchanged.
spearman_res <- cor.test(x, y, method = "spearman", exact = FALSE)
# Kendall = concordance-based rank correlation
kendall_res  <- cor.test(x, y, method = "kendall")

pearson_res
## 
##  Pearson's product-moment correlation
## 
## data:  x and y
## t = 541.83, df = 23340, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.9615156 0.9634055
## sample estimates:
##       cor 
## 0.9624722
spearman_res
## 
##  Spearman's rank correlation rho
## 
## data:  x and y
## S = 7.8516e+10, p-value < 2.2e-16
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
##      rho 
## 0.962958
kendall_res
## 
##  Kendall's rank correlation tau
## 
## data:  x and y
## z = 192.63, p-value < 2.2e-16
## alternative hypothesis: true tau is not equal to 0
## sample estimates:
##       tau 
## 0.8429096
# Test another meaningful relationship
# Household income vs preschool childcare cost
# Expecting a positive correlation (higher income = higher costs)

x2 <- vars_complete$mhi_2018
y2 <- vars_complete$mc_preschool

# Pearson = linear correlation
pearson_res2  <- cor.test(x2, y2, method = "pearson")
# Spearman = rank correlation. The data contain ties, so an exact p-value
# cannot be computed. exact = FALSE requests the asymptotic p-value directly
# instead of letting cor.test() warn ("Cannot compute exact p-value with ties")
# and fall back to it; the estimate and p-value are unchanged.
spearman_res2 <- cor.test(x2, y2, method = "spearman", exact = FALSE)
# Kendall = concordance-based rank correlation
kendall_res2  <- cor.test(x2, y2, method = "kendall")

pearson_res2
## 
##  Pearson's product-moment correlation
## 
## data:  x2 and y2
## t = 118.31, df = 23340, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.6042049 0.6202445
## sample estimates:
##       cor 
## 0.6122877
spearman_res2
## 
##  Spearman's rank correlation rho
## 
## data:  x2 and y2
## S = 9.4648e+11, p-value < 2.2e-16
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
##       rho 
## 0.5534705
kendall_res2
## 
##  Kendall's rank correlation tau
## 
## data:  x2 and y2
## z = 88.887, p-value < 2.2e-16
## alternative hypothesis: true tau is not equal to 0
## sample estimates:
##       tau 
## 0.3885025

Findings: When I looked at the correlation matrices, I found a very strong positive relationship between the different types of childcare prices, especially between infant care and preschool care (Pearson r ≈ 0.96). This means that where the cost of infant care is higher, the cost of preschool care is usually higher too. That makes sense, because the same factors—such as higher wages, rent, or overall cost of living—tend to raise prices for all childcare age groups in a community. I also found that median household income has a moderate positive correlation with childcare costs (Pearson r ≈ 0.61 with center-based preschool prices). In other words, counties with higher incomes also tend to have more expensive childcare. This could be because wealthier areas often have a higher overall cost of living, or because providers can charge more when families can afford to pay higher prices. By contrast, mothers' labor force participation was only weakly related to income and to childcare prices (correlations of roughly 0.1–0.2 across the three methods).

Method Justification: I used Pearson’s correlation first because it is best suited to linear relationships—that is, when the points in a scatterplot fall roughly along a straight line. The childcare price data looked like that, so Pearson made sense. I also ran Spearman and Kendall tests as a check, because those methods are based on ranks instead of exact values and are more robust when data are skewed or have outliers. All three methods agreed: both relationships were positive and statistically significant, with the infant–preschool price relationship strong and the income–price relationship moderate. This agreement gave me more confidence that the patterns I saw were real and not just random, although with more than 23,000 observations even weak correlations come out statistically significant, so the size of each correlation matters more than its p-value.