library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.2     ✔ tibble    3.3.0
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.1.0     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(readxl)
library(pastecs)
## 
## Attaching package: 'pastecs'
## 
## The following objects are masked from 'package:dplyr':
## 
##     first, last
## 
## The following object is masked from 'package:tidyr':
## 
##     extract
library(lmtest)
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## 
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
library(ggplot2)
library(ggrepel)
## Warning: package 'ggrepel' was built under R version 4.5.2
library(readxl)
library(dplyr)
library(tidyr)
setwd("C:/Users/KaeRo/Desktop/R Studio/Reseach Data Selection")
district <- read_excel("district.xls")
DistrictTeacherPay<-district %>% select(DPSTTOSA,DPSTWHFP,DPETHISP,DZRVLOCP, DPSTADFP, DDA00A001222R)%>% drop_na()
Cleaned_Data<- DistrictTeacherPay %>%
  rename(avg_teacher_pay = DPSTTOSA,wht_teachers=DPSTWHFP,hisp_student=DPETHISP,local_taxes_rev=DZRVLOCP,teach_advdg=DPSTADFP,student_STAAR=DDA00A001222R)

Dependent Variable: Average Teacher Pay Independent Variables: % of white teachers, % of hispanic students, total renveue from local taxes, % of teachers with advanced degrees, % of students meeting standards on STAAR Scores

summary(Cleaned_Data)
##  avg_teacher_pay  wht_teachers     hisp_student    local_taxes_rev
##  Min.   :36081   Min.   :  0.00   Min.   :  0.00   Min.   :-6.20  
##  1st Qu.:50438   1st Qu.: 59.00   1st Qu.: 21.00   1st Qu.:21.30  
##  Median :53381   Median : 82.50   Median : 38.00   Median :35.40  
##  Mean   :53928   Mean   : 71.69   Mean   : 43.32   Mean   :38.03  
##  3rd Qu.:56917   3rd Qu.: 92.60   3rd Qu.: 61.90   3rd Qu.:53.80  
##  Max.   :75719   Max.   :100.00   Max.   :100.00   Max.   :97.60  
##   teach_advdg    student_STAAR  
##  Min.   : 0.00   Min.   : 0.00  
##  1st Qu.:15.00   1st Qu.:37.00  
##  Median :20.90   Median :46.00  
##  Mean   :20.88   Mean   :46.34  
##  3rd Qu.:26.20   3rd Qu.:55.00  
##  Max.   :65.00   Max.   :88.00

So average teacher pay is $53,928, the average percentage of white teachers is 71.69%, the average percentage of hispanic students is 43.32%, the average amount of revenue from local taxes is 38.03%, the average number of teachers with advanced degrees is 20.88%, and the average amount of students meeting STAAR standards or above is 46.00

Lets first look at the correlation between are 2 variables in our hypothesis: average teacher pay and percent of white teachers

cor(Cleaned_Data)
##                 avg_teacher_pay wht_teachers hisp_student local_taxes_rev
## avg_teacher_pay       1.0000000   -0.2091008   0.13995738      0.15816715
## wht_teachers         -0.2091008    1.0000000  -0.68963488      0.34090151
## hisp_student          0.1399574   -0.6896349   1.00000000     -0.16250887
## local_taxes_rev       0.1581671    0.3409015  -0.16250887      1.00000000
## teach_advdg           0.3060805   -0.1453769   0.06461044      0.02050455
## student_STAAR         0.1087580    0.4351362  -0.38374827      0.28291541
##                 teach_advdg student_STAAR
## avg_teacher_pay  0.30608051    0.10875801
## wht_teachers    -0.14537692    0.43513622
## hisp_student     0.06461044   -0.38374827
## local_taxes_rev  0.02050455    0.28291541
## teach_advdg      1.00000000    0.05568946
## student_STAAR    0.05568946    1.00000000

Before linear regression, let’s check the correlations more closely with a Kendall test:

cor.test(Cleaned_Data$avg_teacher_pay,Cleaned_Data$wht_teachers,method="kendall")
## 
##  Kendall's rank correlation tau
## 
## data:  Cleaned_Data$avg_teacher_pay and Cleaned_Data$wht_teachers
## z = -9.3871, p-value < 2.2e-16
## alternative hypothesis: true tau is not equal to 0
## sample estimates:
##        tau 
## -0.1815146

The p-value is very low, so there is some correlation.

sqrt(nrow(Cleaned_Data))
## [1] 34.59769
ggplot(Cleaned_Data,aes(x=avg_teacher_pay)) + geom_histogram(bins=34)

ggplot(Cleaned_Data,aes(x=avg_teacher_pay,y=wht_teachers,color=hisp_student,)) + geom_point() 

Significance in this: While no clear correlation between teacher pay and percentage of white teachers, there is a trend of lower precentage of hispanic students when you have a higher percentage of white teachers Check the following assumptions: a) Linearity (plot and raintest)

library(pastecs)
library(lmtest)
library(MASS)
## 
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
## 
##     select
lmCleaned_Data<-lm(avg_teacher_pay~wht_teachers+hisp_student+local_taxes_rev+teach_advdg+student_STAAR,data=Cleaned_Data)
plot(lmCleaned_Data,which=1)

raintest(lmCleaned_Data)
## 
##  Rainbow test
## 
## data:  lmCleaned_Data
## Rain = 1.0851, df1 = 599, df2 = 592, p-value = 0.1596
  1. Independence of errors (durbin-watson)
library(car)
## Loading required package: carData
## 
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
## 
##     recode
## The following object is masked from 'package:purrr':
## 
##     some
durbinWatsonTest(lmCleaned_Data)
##  lag Autocorrelation D-W Statistic p-value
##    1       0.2498611      1.499953       0
##  Alternative hypothesis: rho != 0
  1. Homoscedasticity (plot, bptest)
plot(lmCleaned_Data,which=3)

bptest(lmCleaned_Data)
## 
##  studentized Breusch-Pagan test
## 
## data:  lmCleaned_Data
## BP = 22.026, df = 5, p-value = 0.0005177
  1. Normality of residuals (QQ plot, shapiro test)
plot(lmCleaned_Data,which=2)

shapiro.test(lmCleaned_Data$residuals)
## 
##  Shapiro-Wilk normality test
## 
## data:  lmCleaned_Data$residuals
## W = 0.98354, p-value = 2.11e-10
  1. No multicolinarity (VIF, cor)
vif(lmCleaned_Data)
##    wht_teachers    hisp_student local_taxes_rev     teach_advdg   student_STAAR 
##        2.279147        1.975327        1.184999        1.045063        1.311643

Testing Linear Regression Assumptions 1) Linearity of variables:Somewhat, the line is fairly straight but the P-value is low (ideally, I would need to log transform the data to increase the p-value) 2) Independence of variables:Somewhat, the p-value is 0 and the D-W statistic is somewhat clost to 2 (1.4) 3) Homoscedasticity:Somewhat, the redline is fairly straight but the p-value is way below .05 (ideally, I would need to log transform the data to increase the p-value) 4) Normality of residuals: Somewhat, a lot of dots are on the line but the p-value is below .05 (ideally, I would beed to log transform the data to increase the p-value) 5) No multicollinearity: None of the VIF’s are above 10 (or 5), so the variable are not related to each other

So, to meet all assumptions, need to log transform. Let’s do that!

loglmCleaned_Data<-lm(log1p(avg_teacher_pay)~log1p(wht_teachers)+hisp_student+local_taxes_rev+teach_advdg+student_STAAR,data=Cleaned_Data)
  1. Linearity (plot and raintest)
plot(loglmCleaned_Data,which=1)

raintest(loglmCleaned_Data)
## 
##  Rainbow test
## 
## data:  loglmCleaned_Data
## Rain = 1.0762, df1 = 599, df2 = 592, p-value = 0.1853

Now the p-value is much higher and line is more striaght

  1. Homoscedasticity (plot, bptest)
plot(loglmCleaned_Data,which=3)

bptest(loglmCleaned_Data)
## 
##  studentized Breusch-Pagan test
## 
## data:  loglmCleaned_Data
## BP = 23.583, df = 5, p-value = 0.0002611

P-value is still quite low, so while the line is striaght, we will have to note the data is not completely homoscedastic (variance is not completely the same across the model)

  1. Normality of residuals (QQ plot, shapiro test)
plot(loglmCleaned_Data,which=2)

shapiro.test(loglmCleaned_Data$residuals)
## 
##  Shapiro-Wilk normality test
## 
## data:  loglmCleaned_Data$residuals
## W = 0.98709, p-value = 8.522e-09

Ok p-value still extremely low, so we will have to note that the residuals are not normal (not normally distributed)

Let’s now look at the transformed data

summary(loglmCleaned_Data)
## 
## Call:
## lm(formula = log1p(avg_teacher_pay) ~ log1p(wht_teachers) + hisp_student + 
##     local_taxes_rev + teach_advdg + student_STAAR, data = Cleaned_Data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.43229 -0.05270 -0.00140  0.05071  0.39090 
## 
## Coefficients:
##                       Estimate Std. Error t value Pr(>|t|)    
## (Intercept)         10.8221791  0.0205388 526.913  < 2e-16 ***
## log1p(wht_teachers) -0.0203940  0.0038576  -5.287 1.48e-07 ***
## hisp_student         0.0003821  0.0001124   3.399 0.000699 ***
## local_taxes_rev      0.0007692  0.0001124   6.843 1.24e-11 ***
## teach_advdg          0.0028838  0.0002802  10.292  < 2e-16 ***
## student_STAAR        0.0010036  0.0002091   4.799 1.80e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.08557 on 1191 degrees of freedom
## Multiple R-squared:  0.1689, Adjusted R-squared:  0.1654 
## F-statistic: 48.41 on 5 and 1191 DF,  p-value: < 2.2e-16

What does this mean? - It looks like each variable has a significant relationship with average teacher pay - The p-value is quite low, so the null hypothesis cannot be eliminated - The Adjusted R-squared let’s us know that these variables only account of 16% of change in average teacher pay - It looks like when teacher pay goes up, the percentage of white teachers goes down - But when teacher pay goes up, the percentage of hispanic students, total revenue from local taxes, teachers with advanced degrees, and student test scores go up.