library(pastecs)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:pastecs':
## 
##     first, last
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
data <- read.csv("C:/Users/atg516/Desktop/Project/HHS_Unaccompanied_Alien_Children_Program.csv")

data_clean <- data %>%
  filter(!is.na(Children.discharged.from.HHS.Care) & !is.na(Children.in.HHS.Care))

data_clean <- data_clean %>%
  mutate(discharge_rate = Children.discharged.from.HHS.Care / Children.in.HHS.Care)
View(data_clean)
model <- lm(discharge_rate ~ Children.in.HHS.Care, data = data_clean)
summary(model)
## 
## Call:
## lm(formula = discharge_rate ~ Children.in.HHS.Care, data = data_clean)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -0.024568 -0.005341 -0.000029  0.006325  0.038167 
## 
## Coefficients:
##                       Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          2.411e-02  8.132e-04  29.643  < 2e-16 ***
## Children.in.HHS.Care 6.290e-07  8.218e-08   7.653 4.55e-14 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.009566 on 1013 degrees of freedom
## Multiple R-squared:  0.05466,    Adjusted R-squared:  0.05373 
## F-statistic: 58.57 on 1 and 1013 DF,  p-value: 4.554e-14
plot(model, which = 1)

A mild curvature indicates a violation, albeit mild, of linearity.

model_poly <- lm(discharge_rate ~ poly(Children.in.HHS.Care, 2), data = data_clean)
summary(model_poly)
## 
## Call:
## lm(formula = discharge_rate ~ poly(Children.in.HHS.Care, 2), 
##     data = data_clean)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -0.021895 -0.005768 -0.000965  0.005307  0.038477 
## 
## Coefficients:
##                                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                     0.0298902  0.0002573 116.179   <2e-16 ***
## poly(Children.in.HHS.Care, 2)1  0.0732095  0.0081966   8.932   <2e-16 ***
## poly(Children.in.HHS.Care, 2)2 -0.1571725  0.0081966 -19.175   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.008197 on 1012 degrees of freedom
## Multiple R-squared:  0.3066, Adjusted R-squared:  0.3052 
## F-statistic: 223.7 on 2 and 1012 DF,  p-value: < 2.2e-16
plot(model_poly, which = 1)

To mitigate this, I fitted a polynomial regression model and found that the red smoother line is nearly flat, hovering around 0 for most fitted values.There’s less of a wave or curve compared to the earlier linear model.

The variance still increases slightly as fitted values increase, but it’s much less structured, showing that the model form has improved.

library(lmtest)
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
dwtest(model)
## 
##  Durbin-Watson test
## 
## data:  model
## DW = 0.88977, p-value < 2.2e-16
## alternative hypothesis: true autocorrelation is greater than 0

A DW statistic below 2 indicates positive autocorrelation in residuals, violating the assumption of independence.

data_clean <- data_clean %>%
  mutate(lag_discharge = lag(discharge_rate, 1))
model_lag <- lm(discharge_rate ~ Children.in.HHS.Care + lag_discharge, data = data_clean)
dwtest(model_lag)
## 
##  Durbin-Watson test
## 
## data:  model_lag
## DW = 2.0198, p-value = 0.6053
## alternative hypothesis: true autocorrelation is greater than 0
summary(model_lag)
## 
## Call:
## lm(formula = discharge_rate ~ Children.in.HHS.Care + lag_discharge, 
##     data = data_clean)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -0.022415 -0.005998 -0.000447  0.005523  0.032253 
## 
## Coefficients:
##                       Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          1.039e-02  9.302e-04  11.168  < 2e-16 ***
## Children.in.HHS.Care 3.070e-07  6.985e-08   4.395 1.22e-05 ***
## lag_discharge        5.579e-01  2.606e-02  21.407  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.007935 on 1011 degrees of freedom
##   (1 observation deleted due to missingness)
## Multiple R-squared:  0.3498, Adjusted R-squared:  0.3485 
## F-statistic:   272 on 2 and 1011 DF,  p-value: < 2.2e-16

To fix the issue with autocorrelation in my first model, I added a lagged version of the discharge rate as a new variable. The results of the Durbin-Watson test gave me a value of 2.02 with a p-value of 0.6053, which means the problem with autocorrelation is resolved. In addition, the overall model got stronger as my adjusted R-squared jumped from about 0.05 to 0.35, showing that this version explains more of the variation in discharge rate.

Both the number of children in HHS care and the lagged discharge rate were statistically significant, meaning they both have an impact on the outcome.

bptest(model)
## 
##  studentized Breusch-Pagan test
## 
## data:  model
## BP = 9.1572, df = 1, p-value = 0.002477

Since p < 0.05, the model violates homoscedasticity—meaning the residuals have non-constant variance

library(sandwich)
library(lmtest)

coeftest(model, vcov = vcovHC(model, type = "HC1"))
## 
## t test of coefficients:
## 
##                        Estimate Std. Error t value  Pr(>|t|)    
## (Intercept)          2.4106e-02 1.1902e-03 20.2532 < 2.2e-16 ***
## Children.in.HHS.Care 6.2898e-07 1.2211e-07  5.1511 3.112e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

To address the issue of heteroscedasticity in my original model, I ran the regression again but this time using robust standard errors. The results still showed that the number of children in HHS care is statistically significant, with a p-value of less than 0.001. The t-value is strong too, which gives me more confidence in the relationship between the independent and dependent variables.

By using robust standard errors, I was able to correct for the non-constant variance with only mildly needing to change the model structure.

plot(model, which = 2)

shapiro.test(resid(model))
## 
##  Shapiro-Wilk normality test
## 
## data:  resid(model)
## W = 0.98868, p-value = 4.665e-07

The residuals are not normally distributed, especially at the extremes.

data_clean <- data_clean %>%
  mutate(sqrt_discharge = sqrt(discharge_rate))

model_sqrt <- lm(sqrt_discharge ~ Children.in.HHS.Care, data = data_clean)

plot(model_sqrt, which = 2)

shapiro.test(resid(model_sqrt))
## 
##  Shapiro-Wilk normality test
## 
## data:  resid(model_sqrt)
## W = 0.93408, p-value < 2.2e-16

To try fixing the normality issue in my residuals, I applied a square root transformation to the dependent variable (discharge rate). After running the Shapiro-Wilk test again, the p-value was still extremely small (p < 2.2e-16), meaning the residuals still aren’t normally distributed.

The Q-Q plot also showed some improvement but still had visible deviations, especially in the tails. So overall, the transformation didn’t fully fix the issue. That said, linear regression can still work decently even if this assumption isn’t perfectly met, especially with a large sample size like mine.