library(readxl)
district_data <- read_excel("~/Desktop/grad school/Research II/data selection/district.xls")

head(district_data)
## # A tibble: 6 × 137
##   DISTNAME DISTRICT DZCNTYNM REGION DZRATING DZCAMPUS DPETALLC DPETBLAP DPETHISP
##   <chr>    <chr>    <chr>    <chr>  <chr>       <dbl>    <dbl>    <dbl>    <dbl>
## 1 CAYUGA … 001902   001 AND… 07     A               3      574      4.4     11.5
## 2 ELKHART… 001903   001 AND… 07     A               4     1150      4       11.8
## 3 FRANKST… 001904   001 AND… 07     A               3      808      8.5     11.3
## 4 NECHES … 001906   001 AND… 07     A               2      342      8.2     13.5
## 5 PALESTI… 001907   001 AND… 07     B               6     3360     25.1     42.9
## 6 WESTWOO… 001908   001 AND… 07     B               4     1332     19.7     26.2
## # ℹ 128 more variables: DPETWHIP <dbl>, DPETINDP <dbl>, DPETASIP <dbl>,
## #   DPETPCIP <dbl>, DPETTWOP <dbl>, DPETECOP <dbl>, DPETLEPP <dbl>,
## #   DPETSPEP <dbl>, DPETBILP <dbl>, DPETVOCP <dbl>, DPETGIFP <dbl>,
## #   DA0AT21R <dbl>, DA0912DR21R <dbl>, DAGC4X21R <dbl>, DAGC5X20R <dbl>,
## #   DAGC6X19R <dbl>, DA0GR21N <dbl>, DA0GS21N <dbl>, DDA00A001S22R <dbl>,
## #   DDA00A001222R <dbl>, DDA00A001322R <dbl>, DDA00AR01S22R <dbl>,
## #   DDA00AR01222R <dbl>, DDA00AR01322R <dbl>, DDA00AM01S22R <dbl>, …

Create the lm

district_lm <- lm(DPETALLC ~ DZCAMPUS + DPETBLAP + DPETHISP + DPETWHIP, data = district_data)

# View the model summary
summary(district_lm)
## 
## Call:
## lm(formula = DPETALLC ~ DZCAMPUS + DPETBLAP + DPETHISP + DPETWHIP, 
##     data = district_data)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -25041   -631      6    571  54253 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 11165.06    1633.61   6.835 1.30e-11 ***
## DZCAMPUS      708.28       5.67 124.921  < 2e-16 ***
## DPETBLAP     -123.06      18.96  -6.491 1.25e-10 ***
## DPETHISP     -125.61      16.58  -7.574 7.19e-14 ***
## DPETWHIP     -125.94      17.49  -7.199 1.06e-12 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3177 on 1202 degrees of freedom
## Multiple R-squared:  0.9355, Adjusted R-squared:  0.9353 
## F-statistic:  4358 on 4 and 1202 DF,  p-value: < 2.2e-16

Check the Assumptions

1. Linearity

plot(district_lm, which = 1)

raintest(district_lm)
## 
##  Rainbow test
## 
## data:  district_lm
## Rain = 0.71642, df1 = 604, df2 = 598, p-value = 1

This data is not linear in nature, so it does not pass this assumption

2. Independence of Errors

# Using the Durbin-Watson test, we can see if the residuals (errors) are correlated

dwtest(district_lm)
## 
##  Durbin-Watson test
## 
## data:  district_lm
## DW = 1.6975, p-value = 5.887e-08
## alternative hypothesis: true autocorrelation is greater than 0

#I think this means that this test fails the DW test. As in, since the number is close to two and the result was very significant, so we can assume that the different in residuals is correlated. Data would have to be adjusted to pass

#3. Homoscedasticity (bp test)

bptest(district_lm)
## 
##  studentized Breusch-Pagan test
## 
## data:  district_lm
## BP = 197.79, df = 4, p-value < 2.2e-16

The significant p-value, means that the data is not homoscedastic but heterodcedastic. The variance is non-constant)

#4 Normality of Residuals

# QQ Plot
qqnorm(residuals(district_lm))
qqline(residuals(district_lm))

# Shapiro-Wilk test
shapiro.test(residuals(district_lm))
## 
##  Shapiro-Wilk normality test
## 
## data:  residuals(district_lm)
## W = 0.45568, p-value < 2.2e-16

This is not a normal distribution at all.On the plot, The points deviate from the line, especially at the tails (extreme values), hence non-normality.The number is not very close to one, with a significant p-value, so not normal

#5. No multicolinarity

# Variance Inflation Factor (VIF)
library(car)
## Loading required package: carData
vif_values <- vif(district_lm)
print(vif_values)
##  DZCAMPUS  DPETBLAP  DPETHISP  DPETWHIP 
##  1.090816  9.297754 24.146300 27.729531
# Correlation matrix for independent variables
cor_matrix <- cor(district_data[, c("DZCAMPUS", "DPETBLAP", "DPETHISP", "DPETWHIP")])
print(cor_matrix)
##             DZCAMPUS    DPETBLAP   DPETHISP   DPETWHIP
## DZCAMPUS  1.00000000  0.09098361  0.1507146 -0.2304338
## DPETBLAP  0.09098361  1.00000000 -0.1847777 -0.3807074
## DPETHISP  0.15071455 -0.18477766  1.0000000 -0.8180747
## DPETWHIP -0.23043379 -0.38070737 -0.8180747  1.0000000

The high VIF values for DPETHISP and DPETWHIP suggest they are highly correlated with other predictors, particularly DPETWHIP and DPETHISP. Strong Negative Correlation (DPETHISP and DPETWHIP): This indicates that as the percentage of Hispanic students (DPETHISP) increases, the percentage of White students (DPETWHIP) tends to decrease and vice versa.

This model does not meet the assumptions, and you would have to transform the data to help meet them. I am going to do so by using the log() function

Transformation of DATA to help meet assumptions

# log(  )
d_model_transformed <- lm(log(DPETALLC) ~ DZCAMPUS + DPETBLAP + DPETHISP + DPETWHIP, data = district_data)
summary(d_model_transformed)
## 
## Call:
## lm(formula = log(DPETALLC) ~ DZCAMPUS + DPETBLAP + DPETHISP + 
##     DPETWHIP, data = district_data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -9.1412 -0.8090  0.0747  0.9074  2.7992 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 11.574277   0.604288  19.154  < 2e-16 ***
## DZCAMPUS     0.052715   0.002097  25.134  < 2e-16 ***
## DPETBLAP    -0.051967   0.007014  -7.409 2.38e-13 ***
## DPETHISP    -0.047583   0.006134  -7.757 1.85e-14 ***
## DPETWHIP    -0.057332   0.006471  -8.860  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.175 on 1202 degrees of freedom
## Multiple R-squared:  0.4424, Adjusted R-squared:  0.4405 
## F-statistic: 238.4 on 4 and 1202 DF,  p-value: < 2.2e-16

#Run some of the same test,

bptest(d_model_transformed) 
## 
##  studentized Breusch-Pagan test
## 
## data:  d_model_transformed
## BP = 357.56, df = 4, p-value < 2.2e-16
plot(d_model_transformed, which = 1)  

# The BP number got bigger after the log() function, so this does still pass homostacicity, and I am stumped on why it grew so much.The plot also shows a curve, suggesting non-linearity.

#maybe sqrt
district_data$sqrt_DPETALLC <- sqrt(district_data$DPETALLC)
model_sqrt <- lm(sqrt_DPETALLC ~ DZCAMPUS + DPETBLAP + DPETHISP + DPETWHIP, data = district_data)
summary(model_sqrt)
## 
## Call:
## lm(formula = sqrt_DPETALLC ~ DZCAMPUS + DPETBLAP + DPETHISP + 
##     DPETWHIP, data = district_data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -250.404  -12.027   -3.704    9.778   90.498 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 142.79408   11.23150  12.714   <2e-16 ***
## DZCAMPUS      2.39997    0.03898  61.567   <2e-16 ***
## DPETBLAP     -1.18008    0.13036  -9.053   <2e-16 ***
## DPETHISP     -1.11123    0.11402  -9.746   <2e-16 ***
## DPETWHIP     -1.29623    0.12027 -10.778   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 21.84 on 1202 degrees of freedom
## Multiple R-squared:  0.7968, Adjusted R-squared:  0.7961 
## F-statistic:  1178 on 4 and 1202 DF,  p-value: < 2.2e-16
bptest(model_sqrt) 
## 
##  studentized Breusch-Pagan test
## 
## data:  model_sqrt
## BP = 627.57, df = 4, p-value < 2.2e-16
plot(model_sqrt, which = 1) 

#not that either, almost similar results to the log() function