library(readxl)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
library(car)
## Loading required package: carData
##
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
##
## recode
# Read the Excel file
library(readxl)
df <- read_excel("Desktop/In class/eci_post_income_data.xls")
## New names:
## • `` -> `...8`
## • `NO` -> `NO...10`
## • `ME` -> `ME...12`
## • `NO` -> `NO...13`
## • `ME` -> `ME...15`
print(head(df))
## # A tibble: 6 × 15
## County B3P CSCS CSFA TS PPSC TPPS ...8 HOUSEHOLDS NO...10 TWT1
## <chr> <dbl> <chr> <chr> <chr> <chr> <chr> <lgl> <chr> <dbl> <dbl>
## 1 Anderson 2299 29 <NA> 29 0.012… 0.01… NA All house… 131200 70460
## 2 Andrews 1703 78 * * 0.045… * NA Type of H… NA NA
## 3 Angelina 5166 299 <NA> 299 0.057… 0.05… NA Family ho… 84270 90760
## 4 Aransas 1305 30 <NA> 30 0.022… 0.02… NA ...Marrie… 61440 104000
## 5 Archer 400 15 * * 0.037… * NA ...Female… 15620 57890
## 6 Armstrong 74 * <NA> * * * NA ...Male h… 7212 73630
## # ℹ 4 more variables: ME...12 <dbl>, NO...13 <dbl>, TWT2 <dbl>, ME...15 <dbl>
print(summary(df))
## County B3P CSCS CSFA
## Length:254 Min. : 2.0 Length:254 Length:254
## Class :character 1st Qu.: 383.0 Class :character Class :character
## Mode :character Median : 993.5 Mode :character Mode :character
## Mean : 6920.7
## 3rd Qu.: 2587.5
## Max. :316834.0
##
## TS PPSC TPPS ...8
## Length:254 Length:254 Length:254 Mode:logical
## Class :character Class :character Class :character NA's:254
## Mode :character Mode :character Mode :character
##
##
##
##
## HOUSEHOLDS NO...10 TWT1 ME...12
## Length:254 Min. : 6061 Min. : 36440 Min. : 628
## Class :character 1st Qu.: 19230 1st Qu.: 57890 1st Qu.: 828
## Mode :character Median : 28050 Median : 70770 Median : 963
## Mean : 44031 Mean : 69373 Mean :1181
## 3rd Qu.: 61440 3rd Qu.: 76150 3rd Qu.:1320
## Max. :131200 Max. :106200 Max. :3991
## NA's :217 NA's :217 NA's :217
## NO...13 TWT2 ME...15
## Min. : 6136 Min. :34160 Min. : 602
## 1st Qu.: 19320 1st Qu.:51200 1st Qu.: 825
## Median : 28280 Median :64490 Median : 984
## Mean : 44066 Mean :63109 Mean :1186
## 3rd Qu.: 62180 3rd Qu.:69590 3rd Qu.:1440
## Max. :131400 Max. :96340 Max. :3391
## NA's :217 NA's :217 NA's :217
library(dplyr)
# Clean the data
df_clean <- df %>%
# Convert B3P to numeric, removing any non-numeric characters
mutate(B3P = as.numeric(B3P),
# Convert PPSC to numeric, removing any non-numeric characters
PPSC = as.numeric(PPSC),
# Use TWT1 as household income (more complete than TWT2)
income = as.numeric(TWT1)) %>%
# Remove rows with NA or * in critical columns
filter(!is.na(B3P) & !is.na(PPSC) & !is.na(income))
## Warning: There was 1 warning in `mutate()`.
## ℹ In argument: `PPSC = as.numeric(PPSC)`.
## Caused by warning:
## ! NAs introduced by coercion
# Check for normality and create initial visualization
ggplot(df_clean, aes(x = income, y = PPSC)) +
geom_point() +
geom_smooth(method = "lm", se = TRUE) +
labs(x = "Household Income", y = "Proportion of Children Receiving ECI Services",
title = "Income vs ECI Services Utilization") +
theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'

# Create the linear model
model <- lm(PPSC ~ income, data = df_clean)
# Basic model summary
print(summary(model))
##
## Call:
## lm(formula = PPSC ~ income, data = df_clean)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.030971 -0.013890 -0.001617 0.015300 0.036326
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.562e-02 1.582e-02 4.149 0.000267 ***
## income -3.373e-07 2.192e-07 -1.539 0.134633
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.0201 on 29 degrees of freedom
## Multiple R-squared: 0.07551, Adjusted R-squared: 0.04363
## F-statistic: 2.369 on 1 and 29 DF, p-value: 0.1346
# Diagnostic plots
par(mfrow = c(2,2))
plot(model)

# Test for normality
shapiro.test(residuals(model))
##
## Shapiro-Wilk normality test
##
## data: residuals(model)
## W = 0.96034, p-value = 0.2979
# Test for heteroscedasticity
bptest <- car::ncvTest(model)
print(bptest)
## Non-constant Variance Score Test
## Variance formula: ~ fitted.values
## Chisquare = 0.05886381, Df = 1, p = 0.8083