library(readxl)
district_data <- read_excel("~/Desktop/grad school/Research II/data selection/district.xls")
head(district_data)
## # A tibble: 6 × 137
## DISTNAME DISTRICT DZCNTYNM REGION DZRATING DZCAMPUS DPETALLC DPETBLAP DPETHISP
## <chr> <chr> <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl>
## 1 CAYUGA … 001902 001 AND… 07 A 3 574 4.4 11.5
## 2 ELKHART… 001903 001 AND… 07 A 4 1150 4 11.8
## 3 FRANKST… 001904 001 AND… 07 A 3 808 8.5 11.3
## 4 NECHES … 001906 001 AND… 07 A 2 342 8.2 13.5
## 5 PALESTI… 001907 001 AND… 07 B 6 3360 25.1 42.9
## 6 WESTWOO… 001908 001 AND… 07 B 4 1332 19.7 26.2
## # ℹ 128 more variables: DPETWHIP <dbl>, DPETINDP <dbl>, DPETASIP <dbl>,
## # DPETPCIP <dbl>, DPETTWOP <dbl>, DPETECOP <dbl>, DPETLEPP <dbl>,
## # DPETSPEP <dbl>, DPETBILP <dbl>, DPETVOCP <dbl>, DPETGIFP <dbl>,
## # DA0AT21R <dbl>, DA0912DR21R <dbl>, DAGC4X21R <dbl>, DAGC5X20R <dbl>,
## # DAGC6X19R <dbl>, DA0GR21N <dbl>, DA0GS21N <dbl>, DDA00A001S22R <dbl>,
## # DDA00A001222R <dbl>, DDA00A001322R <dbl>, DDA00AR01S22R <dbl>,
## # DDA00AR01222R <dbl>, DDA00AR01322R <dbl>, DDA00AM01S22R <dbl>, …
district_lm <- lm(DPETALLC ~ DZCAMPUS + DPETBLAP + DPETHISP + DPETWHIP, data = district_data)
# View the model summary
summary(district_lm)
##
## Call:
## lm(formula = DPETALLC ~ DZCAMPUS + DPETBLAP + DPETHISP + DPETWHIP,
## data = district_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -25041 -631 6 571 54253
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 11165.06 1633.61 6.835 1.30e-11 ***
## DZCAMPUS 708.28 5.67 124.921 < 2e-16 ***
## DPETBLAP -123.06 18.96 -6.491 1.25e-10 ***
## DPETHISP -125.61 16.58 -7.574 7.19e-14 ***
## DPETWHIP -125.94 17.49 -7.199 1.06e-12 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3177 on 1202 degrees of freedom
## Multiple R-squared: 0.9355, Adjusted R-squared: 0.9353
## F-statistic: 4358 on 4 and 1202 DF, p-value: < 2.2e-16
plot(district_lm, which = 1)
raintest(district_lm)
##
## Rainbow test
##
## data: district_lm
## Rain = 0.71642, df1 = 604, df2 = 598, p-value = 1
# Using the Durbin-Watson test, we can see if the residuals (errors) are correlated
dwtest(district_lm)
##
## Durbin-Watson test
##
## data: district_lm
## DW = 1.6975, p-value = 5.887e-08
## alternative hypothesis: true autocorrelation is greater than 0
#I think this means that this test fails the DW test. As in, since the number is close to two and the result was very significant, so we can assume that the different in residuals is correlated. Data would have to be adjusted to pass
#3. Homoscedasticity (bp test)
bptest(district_lm)
##
## studentized Breusch-Pagan test
##
## data: district_lm
## BP = 197.79, df = 4, p-value < 2.2e-16
#4 Normality of Residuals
# QQ Plot
qqnorm(residuals(district_lm))
qqline(residuals(district_lm))
# Shapiro-Wilk test
shapiro.test(residuals(district_lm))
##
## Shapiro-Wilk normality test
##
## data: residuals(district_lm)
## W = 0.45568, p-value < 2.2e-16
#5. No multicolinarity
# Variance Inflation Factor (VIF)
library(car)
## Loading required package: carData
vif_values <- vif(district_lm)
print(vif_values)
## DZCAMPUS DPETBLAP DPETHISP DPETWHIP
## 1.090816 9.297754 24.146300 27.729531
# Correlation matrix for independent variables
cor_matrix <- cor(district_data[, c("DZCAMPUS", "DPETBLAP", "DPETHISP", "DPETWHIP")])
print(cor_matrix)
## DZCAMPUS DPETBLAP DPETHISP DPETWHIP
## DZCAMPUS 1.00000000 0.09098361 0.1507146 -0.2304338
## DPETBLAP 0.09098361 1.00000000 -0.1847777 -0.3807074
## DPETHISP 0.15071455 -0.18477766 1.0000000 -0.8180747
## DPETWHIP -0.23043379 -0.38070737 -0.8180747 1.0000000
# log( )
d_model_transformed <- lm(log(DPETALLC) ~ DZCAMPUS + DPETBLAP + DPETHISP + DPETWHIP, data = district_data)
summary(d_model_transformed)
##
## Call:
## lm(formula = log(DPETALLC) ~ DZCAMPUS + DPETBLAP + DPETHISP +
## DPETWHIP, data = district_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -9.1412 -0.8090 0.0747 0.9074 2.7992
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 11.574277 0.604288 19.154 < 2e-16 ***
## DZCAMPUS 0.052715 0.002097 25.134 < 2e-16 ***
## DPETBLAP -0.051967 0.007014 -7.409 2.38e-13 ***
## DPETHISP -0.047583 0.006134 -7.757 1.85e-14 ***
## DPETWHIP -0.057332 0.006471 -8.860 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.175 on 1202 degrees of freedom
## Multiple R-squared: 0.4424, Adjusted R-squared: 0.4405
## F-statistic: 238.4 on 4 and 1202 DF, p-value: < 2.2e-16
#Run some of the same test,
bptest(d_model_transformed)
##
## studentized Breusch-Pagan test
##
## data: d_model_transformed
## BP = 357.56, df = 4, p-value < 2.2e-16
plot(d_model_transformed, which = 1)
# The BP number got bigger after the log() function, so this does still
pass homostacicity, and I am stumped on why it grew so much.The plot
also shows a curve, suggesting non-linearity.
#maybe sqrt
district_data$sqrt_DPETALLC <- sqrt(district_data$DPETALLC)
model_sqrt <- lm(sqrt_DPETALLC ~ DZCAMPUS + DPETBLAP + DPETHISP + DPETWHIP, data = district_data)
summary(model_sqrt)
##
## Call:
## lm(formula = sqrt_DPETALLC ~ DZCAMPUS + DPETBLAP + DPETHISP +
## DPETWHIP, data = district_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -250.404 -12.027 -3.704 9.778 90.498
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 142.79408 11.23150 12.714 <2e-16 ***
## DZCAMPUS 2.39997 0.03898 61.567 <2e-16 ***
## DPETBLAP -1.18008 0.13036 -9.053 <2e-16 ***
## DPETHISP -1.11123 0.11402 -9.746 <2e-16 ***
## DPETWHIP -1.29623 0.12027 -10.778 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 21.84 on 1202 degrees of freedom
## Multiple R-squared: 0.7968, Adjusted R-squared: 0.7961
## F-statistic: 1178 on 4 and 1202 DF, p-value: < 2.2e-16
bptest(model_sqrt)
##
## studentized Breusch-Pagan test
##
## data: model_sqrt
## BP = 627.57, df = 4, p-value < 2.2e-16
plot(model_sqrt, which = 1)
#not that either, almost similar results to the log() function