library(readxl)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## âś” dplyr     1.1.4     âś” readr     2.1.5
## âś” forcats   1.0.0     âś” stringr   1.5.1
## âś” ggplot2   3.5.2     âś” tibble    3.3.0
## âś” lubridate 1.9.4     âś” tidyr     1.3.1
## âś” purrr     1.1.0     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## âś– dplyr::filter() masks stats::filter()
## âś– dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(dplyr)
library(lmtest)
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## 
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
district<-read_excel("district.xls")
numeric_clean_district_data<-district |> dplyr::select(where(is.numeric)) |> drop_na()
head(numeric_clean_district_data)
## # A tibble: 6 Ă— 128
##   DZCAMPUS DPETALLC DPETBLAP DPETHISP DPETWHIP DPETINDP DPETASIP DPETPCIP
##      <dbl>    <dbl>    <dbl>    <dbl>    <dbl>    <dbl>    <dbl>    <dbl>
## 1        6     3360     25.1     42.9     27.3      0.2      0.7      0.1
## 2        5     2799      7.2     27.9     60.6      0.3      1        0.1
## 3       17     7318     28.7     43.1     24        0.1      1.1      0.1
## 4        5     1612      2.4      6.6     87        0.3      0.1      0.2
## 5        4     3005      1.3     44.1     49.6      0.3      2        0.1
## 6        6     3374      0.7     69.6     27.6      0.4      0.5      0.1
## # ℹ 120 more variables: DPETTWOP <dbl>, DPETECOP <dbl>, DPETLEPP <dbl>,
## #   DPETSPEP <dbl>, DPETBILP <dbl>, DPETVOCP <dbl>, DPETGIFP <dbl>,
## #   DA0AT21R <dbl>, DA0912DR21R <dbl>, DAGC4X21R <dbl>, DAGC5X20R <dbl>,
## #   DAGC6X19R <dbl>, DA0GR21N <dbl>, DA0GS21N <dbl>, DDA00A001S22R <dbl>,
## #   DDA00A001222R <dbl>, DDA00A001322R <dbl>, DDA00AR01S22R <dbl>,
## #   DDA00AR01222R <dbl>, DDA00AR01322R <dbl>, DDA00AM01S22R <dbl>,
## #   DDA00AM01222R <dbl>, DDA00AM01322R <dbl>, DDA00AC01S22R <dbl>, …
districtmodel1<-lm(DDB00A001322R~DPSTBLFP+DPETBLAP+DPSTKIDR, data=numeric_clean_district_data)
plot(districtmodel1,which=1)

raintest(districtmodel1)
## 
##  Rainbow test
## 
## data:  districtmodel1
## Rain = 1.2594, df1 = 162, df2 = 157, p-value = 0.07346
library(car)
## Loading required package: carData
## 
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
## 
##     recode
## The following object is masked from 'package:purrr':
## 
##     some
durbinWatsonTest(districtmodel1)
##  lag Autocorrelation D-W Statistic p-value
##    1     -0.03194048      2.059369    0.65
##  Alternative hypothesis: rho != 0
plot(districtmodel1,which=3)

bptest(districtmodel1)
## 
##  studentized Breusch-Pagan test
## 
## data:  districtmodel1
## BP = 11.062, df = 3, p-value = 0.01139
plot(districtmodel1,which=2)

shapiro.test(districtmodel1$residuals)
## 
##  Shapiro-Wilk normality test
## 
## data:  districtmodel1$residuals
## W = 0.92188, p-value = 6.064e-12
vif(districtmodel1)
## DPSTBLFP DPETBLAP DPSTKIDR 
## 3.566432 3.484464 1.044194

Despite a high p-value from the Rainbow test, this model appears to be non-linear which violates the assumption of normality. Additionally, homoscedasticity was also violated according to the small p-value of the BP test and the visual evidence from the Q-Q plot. The assumption of normality was also violated according to a small p-value from the Shapiro-Wilk test. To mitigate this, I will use a log transformation on the model, specifically using square root.

districtmodel1log<-lm(sqrt(DDB00A001322R)~sqrt(DPSTBLFP)+sqrt(DPETBLAP)+sqrt(DPSTKIDR), data=numeric_clean_district_data)
## Warning in sqrt(DDB00A001322R): NaNs produced
## Warning in sqrt(DPSTKIDR): NaNs produced
plot(districtmodel1log, which=1)

This model looks to be much more linear and mostly fitted to the line after being transformed.

raintest(districtmodel1log)
## 
##  Rainbow test
## 
## data:  districtmodel1log
## Rain = 1.0897, df1 = 159, df2 = 155, p-value = 0.2958

#The p-value is now greater than .05 which means this model is likely linear.

library(car)
durbinWatsonTest(districtmodel1log)
##  lag Autocorrelation D-W Statistic p-value
##    1     -0.01492273      2.024141   0.916
##  Alternative hypothesis: rho != 0

#The p-value is greater than .05 so the model probably doesn’t contain correlation of residuals.

plot(districtmodel1log,which=3)

bptest(districtmodel1log)
## 
##  studentized Breusch-Pagan test
## 
## data:  districtmodel1log
## BP = 24.438, df = 3, p-value = 2.024e-05

#This p-value is less than .05 so the model is likely still heteroscedastic.

plot(districtmodel1log,which=2)

#The residuals seem to fit closer to the line.

shapiro.test(districtmodel1log$residuals)
## 
##  Shapiro-Wilk normality test
## 
## data:  districtmodel1log$residuals
## W = 0.95934, p-value = 9.764e-08

#The p-value is less than .05 so the model is most likely still not normally distributed.

vif(districtmodel1log)
## sqrt(DPSTBLFP) sqrt(DPETBLAP) sqrt(DPSTKIDR) 
##       4.012045       3.903713       1.055202

#These VIFs are all less than 10 meaning the variables are not strongly correlated with each other.

library(pastecs)
## 
## Attaching package: 'pastecs'
## The following objects are masked from 'package:dplyr':
## 
##     first, last
## The following object is masked from 'package:tidyr':
## 
##     extract
stat.desc(numeric_clean_district_data$DPETBLAP)
##      nbr.val     nbr.null       nbr.na          min          max        range 
##  323.0000000    0.0000000    0.0000000    0.1000000   73.2000000   73.1000000 
##          sum       median         mean      SE.mean CI.mean.0.95          var 
## 3545.2000000    7.3000000   10.9758514    0.6406495    1.2603873  132.5694771 
##      std.dev     coef.var 
##   11.5138819    1.0490195
stat.desc(numeric_clean_district_data$DDB00A001322R)
##      nbr.val     nbr.null       nbr.na          min          max        range 
##  323.0000000    6.0000000    0.0000000   -1.0000000   60.0000000   61.0000000 
##          sum       median         mean      SE.mean CI.mean.0.95          var 
## 4860.0000000   13.0000000   15.0464396    0.4955070    0.9748400   79.3052901 
##      std.dev     coef.var 
##    8.9053518    0.5918577
hist(numeric_clean_district_data$DPETBLAP)

hist(numeric_clean_district_data$DDB00A001322R)

summary(numeric_clean_district_data$DPETBLAP)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    0.10    2.20    7.30   10.98   16.05   73.20
summary(numeric_clean_district_data$DDB00A001322R)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   -1.00    9.00   13.00   15.05   19.00   60.00
head(numeric_clean_district_data$DPETBLAP)
## [1] 25.1  7.2 28.7  2.4  1.3  0.7
head(numeric_clean_district_data$DDB00A001322R)
## [1] 11 14 12 14 13 15

The descriptive statistics above provide more information on the percentage of African American students (DPETBLAP) and percentage of African American students who master their grade level standard on the STAAR test (DDB00A001322R).

summary(districtmodel1log)
## 
## Call:
## lm(formula = sqrt(DDB00A001322R) ~ sqrt(DPSTBLFP) + sqrt(DPETBLAP) + 
##     sqrt(DPSTKIDR), data = numeric_clean_district_data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.9421 -0.5909 -0.0418  0.5902  4.0787 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)   
## (Intercept)     2.470047   0.948360   2.605  0.00964 **
## sqrt(DPSTBLFP) -0.128933   0.076581  -1.684  0.09325 . 
## sqrt(DPETBLAP) -0.002587   0.075621  -0.034  0.97274   
## sqrt(DPSTKIDR)  0.416509   0.250434   1.663  0.09728 . 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.113 on 314 degrees of freedom
##   (5 observations deleted due to missingness)
## Multiple R-squared:  0.03786,    Adjusted R-squared:  0.02866 
## F-statistic: 4.118 on 3 and 314 DF,  p-value: 0.006928

There seems to be a small but negatuve effect which is significant enough to reject the null hypothesis. However, the negative effect of my model doesn’t support my hypothesis that the percentage of African American teachers within a district increases the percentage of African American students who master their grade level standard on the STAAR test.