library(readxl)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)
library(car)  
## Loading required package: carData
## 
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
## 
##     recode
# Read the Excel file
library(readxl)
df <- read_excel("Desktop/In class/eci_post_income_data.xls")
## New names:
## • `` -> `...8`
## • `NO` -> `NO...10`
## • `ME` -> `ME...12`
## • `NO` -> `NO...13`
## • `ME` -> `ME...15`
print(head(df))
## # A tibble: 6 × 15
##   County      B3P CSCS  CSFA  TS    PPSC   TPPS  ...8  HOUSEHOLDS NO...10   TWT1
##   <chr>     <dbl> <chr> <chr> <chr> <chr>  <chr> <lgl> <chr>        <dbl>  <dbl>
## 1 Anderson   2299 29    <NA>  29    0.012… 0.01… NA    All house…  131200  70460
## 2 Andrews    1703 78    *     *     0.045… *     NA    Type of H…      NA     NA
## 3 Angelina   5166 299   <NA>  299   0.057… 0.05… NA    Family ho…   84270  90760
## 4 Aransas    1305 30    <NA>  30    0.022… 0.02… NA    ...Marrie…   61440 104000
## 5 Archer      400 15    *     *     0.037… *     NA    ...Female…   15620  57890
## 6 Armstrong    74 *     <NA>  *     *      *     NA    ...Male h…    7212  73630
## # ℹ 4 more variables: ME...12 <dbl>, NO...13 <dbl>, TWT2 <dbl>, ME...15 <dbl>
print(summary(df))
##     County               B3P               CSCS               CSFA          
##  Length:254         Min.   :     2.0   Length:254         Length:254        
##  Class :character   1st Qu.:   383.0   Class :character   Class :character  
##  Mode  :character   Median :   993.5   Mode  :character   Mode  :character  
##                     Mean   :  6920.7                                        
##                     3rd Qu.:  2587.5                                        
##                     Max.   :316834.0                                        
##                                                                             
##       TS                PPSC               TPPS             ...8        
##  Length:254         Length:254         Length:254         Mode:logical  
##  Class :character   Class :character   Class :character   NA's:254      
##  Mode  :character   Mode  :character   Mode  :character                 
##                                                                         
##                                                                         
##                                                                         
##                                                                         
##   HOUSEHOLDS           NO...10            TWT1           ME...12    
##  Length:254         Min.   :  6061   Min.   : 36440   Min.   : 628  
##  Class :character   1st Qu.: 19230   1st Qu.: 57890   1st Qu.: 828  
##  Mode  :character   Median : 28050   Median : 70770   Median : 963  
##                     Mean   : 44031   Mean   : 69373   Mean   :1181  
##                     3rd Qu.: 61440   3rd Qu.: 76150   3rd Qu.:1320  
##                     Max.   :131200   Max.   :106200   Max.   :3991  
##                     NA's   :217      NA's   :217      NA's   :217   
##     NO...13            TWT2          ME...15    
##  Min.   :  6136   Min.   :34160   Min.   : 602  
##  1st Qu.: 19320   1st Qu.:51200   1st Qu.: 825  
##  Median : 28280   Median :64490   Median : 984  
##  Mean   : 44066   Mean   :63109   Mean   :1186  
##  3rd Qu.: 62180   3rd Qu.:69590   3rd Qu.:1440  
##  Max.   :131400   Max.   :96340   Max.   :3391  
##  NA's   :217      NA's   :217     NA's   :217
library(dplyr)

# Clean the data
df_clean <- df %>%
  # Convert B3P to numeric, removing any non-numeric characters
  mutate(B3P = as.numeric(B3P),
         # Convert PPSC to numeric, removing any non-numeric characters
         PPSC = as.numeric(PPSC),
         # Use TWT1 as household income (more complete than TWT2)
         income = as.numeric(TWT1)) %>%
  # Remove rows with NA or * in critical columns
  filter(!is.na(B3P) & !is.na(PPSC) & !is.na(income))
## Warning: There was 1 warning in `mutate()`.
## ℹ In argument: `PPSC = as.numeric(PPSC)`.
## Caused by warning:
## ! NAs introduced by coercion
# Check for normality and create initial visualization
ggplot(df_clean, aes(x = income, y = PPSC)) +
  geom_point() +
  geom_smooth(method = "lm", se = TRUE) +
  labs(x = "Household Income", y = "Proportion of Children Receiving ECI Services",
       title = "Income vs ECI Services Utilization") +
  theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'

# Create the linear model
model <- lm(PPSC ~ income, data = df_clean)

# Basic model summary
print(summary(model))
## 
## Call:
## lm(formula = PPSC ~ income, data = df_clean)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -0.030971 -0.013890 -0.001617  0.015300  0.036326 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  6.562e-02  1.582e-02   4.149 0.000267 ***
## income      -3.373e-07  2.192e-07  -1.539 0.134633    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.0201 on 29 degrees of freedom
## Multiple R-squared:  0.07551,    Adjusted R-squared:  0.04363 
## F-statistic: 2.369 on 1 and 29 DF,  p-value: 0.1346
# Diagnostic plots
par(mfrow = c(2,2))
plot(model)

# Test for normality
shapiro.test(residuals(model))
## 
##  Shapiro-Wilk normality test
## 
## data:  residuals(model)
## W = 0.96034, p-value = 0.2979
# Test for heteroscedasticity
bptest <- car::ncvTest(model)
print(bptest)
## Non-constant Variance Score Test 
## Variance formula: ~ fitted.values 
## Chisquare = 0.05886381, Df = 1, p = 0.8083