library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.2 ✔ tibble 3.3.0
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.1.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(readxl)
library(pastecs)
##
## Attaching package: 'pastecs'
##
## The following objects are masked from 'package:dplyr':
##
## first, last
##
## The following object is masked from 'package:tidyr':
##
## extract
library(lmtest)
## Loading required package: zoo
##
## Attaching package: 'zoo'
##
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
library(ggplot2)
library(ggrepel)
## Warning: package 'ggrepel' was built under R version 4.5.2
library(readxl)
library(dplyr)
library(tidyr)
setwd("C:/Users/KaeRo/Desktop/R Studio/Reseach Data Selection")
district <- read_excel("district.xls")
DistrictTeacherPay<-district %>% select(DPSTTOSA,DPSTWHFP,DPETHISP,DZRVLOCP, DPSTADFP, DDA00A001222R)%>% drop_na()
Cleaned_Data<- DistrictTeacherPay %>%
rename(avg_teacher_pay = DPSTTOSA,wht_teachers=DPSTWHFP,hisp_student=DPETHISP,local_taxes_rev=DZRVLOCP,teach_advdg=DPSTADFP,student_STAAR=DDA00A001222R)
Dependent Variable: Average Teacher Pay Independent Variables: % of white teachers, % of hispanic students, total renveue from local taxes, % of teachers with advanced degrees, % of students meeting standards on STAAR Scores
summary(Cleaned_Data)
## avg_teacher_pay wht_teachers hisp_student local_taxes_rev
## Min. :36081 Min. : 0.00 Min. : 0.00 Min. :-6.20
## 1st Qu.:50438 1st Qu.: 59.00 1st Qu.: 21.00 1st Qu.:21.30
## Median :53381 Median : 82.50 Median : 38.00 Median :35.40
## Mean :53928 Mean : 71.69 Mean : 43.32 Mean :38.03
## 3rd Qu.:56917 3rd Qu.: 92.60 3rd Qu.: 61.90 3rd Qu.:53.80
## Max. :75719 Max. :100.00 Max. :100.00 Max. :97.60
## teach_advdg student_STAAR
## Min. : 0.00 Min. : 0.00
## 1st Qu.:15.00 1st Qu.:37.00
## Median :20.90 Median :46.00
## Mean :20.88 Mean :46.34
## 3rd Qu.:26.20 3rd Qu.:55.00
## Max. :65.00 Max. :88.00
So average teacher pay is $53,928, the average percentage of white teachers is 71.69%, the average percentage of hispanic students is 43.32%, the average amount of revenue from local taxes is 38.03%, the average number of teachers with advanced degrees is 20.88%, and the average amount of students meeting STAAR standards or above is 46.00
Lets first look at the correlation between are 2 variables in our hypothesis: average teacher pay and percent of white teachers
cor(Cleaned_Data)
## avg_teacher_pay wht_teachers hisp_student local_taxes_rev
## avg_teacher_pay 1.0000000 -0.2091008 0.13995738 0.15816715
## wht_teachers -0.2091008 1.0000000 -0.68963488 0.34090151
## hisp_student 0.1399574 -0.6896349 1.00000000 -0.16250887
## local_taxes_rev 0.1581671 0.3409015 -0.16250887 1.00000000
## teach_advdg 0.3060805 -0.1453769 0.06461044 0.02050455
## student_STAAR 0.1087580 0.4351362 -0.38374827 0.28291541
## teach_advdg student_STAAR
## avg_teacher_pay 0.30608051 0.10875801
## wht_teachers -0.14537692 0.43513622
## hisp_student 0.06461044 -0.38374827
## local_taxes_rev 0.02050455 0.28291541
## teach_advdg 1.00000000 0.05568946
## student_STAAR 0.05568946 1.00000000
Before linear regression, let’s check the correlations more closely with a Kendall test:
cor.test(Cleaned_Data$avg_teacher_pay,Cleaned_Data$wht_teachers,method="kendall")
##
## Kendall's rank correlation tau
##
## data: Cleaned_Data$avg_teacher_pay and Cleaned_Data$wht_teachers
## z = -9.3871, p-value < 2.2e-16
## alternative hypothesis: true tau is not equal to 0
## sample estimates:
## tau
## -0.1815146
The p-value is very low, so there is some correlation.
sqrt(nrow(Cleaned_Data))
## [1] 34.59769
ggplot(Cleaned_Data,aes(x=avg_teacher_pay)) + geom_histogram(bins=34)
ggplot(Cleaned_Data,aes(x=avg_teacher_pay,y=wht_teachers,color=hisp_student,)) + geom_point()
Significance in this: While no clear correlation between teacher pay and
percentage of white teachers, there is a trend of lower precentage of
hispanic students when you have a higher percentage of white teachers
Check the following assumptions: a) Linearity (plot and raintest)
library(pastecs)
library(lmtest)
library(MASS)
##
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
##
## select
lmCleaned_Data<-lm(avg_teacher_pay~wht_teachers+hisp_student+local_taxes_rev+teach_advdg+student_STAAR,data=Cleaned_Data)
plot(lmCleaned_Data,which=1)
raintest(lmCleaned_Data)
##
## Rainbow test
##
## data: lmCleaned_Data
## Rain = 1.0851, df1 = 599, df2 = 592, p-value = 0.1596
library(car)
## Loading required package: carData
##
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
##
## recode
## The following object is masked from 'package:purrr':
##
## some
durbinWatsonTest(lmCleaned_Data)
## lag Autocorrelation D-W Statistic p-value
## 1 0.2498611 1.499953 0
## Alternative hypothesis: rho != 0
plot(lmCleaned_Data,which=3)
bptest(lmCleaned_Data)
##
## studentized Breusch-Pagan test
##
## data: lmCleaned_Data
## BP = 22.026, df = 5, p-value = 0.0005177
plot(lmCleaned_Data,which=2)
shapiro.test(lmCleaned_Data$residuals)
##
## Shapiro-Wilk normality test
##
## data: lmCleaned_Data$residuals
## W = 0.98354, p-value = 2.11e-10
vif(lmCleaned_Data)
## wht_teachers hisp_student local_taxes_rev teach_advdg student_STAAR
## 2.279147 1.975327 1.184999 1.045063 1.311643
Testing Linear Regression Assumptions 1) Linearity of variables:Somewhat, the line is fairly straight but the P-value is low (ideally, I would need to log transform the data to increase the p-value) 2) Independence of variables:Somewhat, the p-value is 0 and the D-W statistic is somewhat clost to 2 (1.4) 3) Homoscedasticity:Somewhat, the redline is fairly straight but the p-value is way below .05 (ideally, I would need to log transform the data to increase the p-value) 4) Normality of residuals: Somewhat, a lot of dots are on the line but the p-value is below .05 (ideally, I would beed to log transform the data to increase the p-value) 5) No multicollinearity: None of the VIF’s are above 10 (or 5), so the variable are not related to each other
So, to meet all assumptions, need to log transform. Let’s do that!
loglmCleaned_Data<-lm(log1p(avg_teacher_pay)~log1p(wht_teachers)+hisp_student+local_taxes_rev+teach_advdg+student_STAAR,data=Cleaned_Data)
plot(loglmCleaned_Data,which=1)
raintest(loglmCleaned_Data)
##
## Rainbow test
##
## data: loglmCleaned_Data
## Rain = 1.0762, df1 = 599, df2 = 592, p-value = 0.1853
Now the p-value is much higher and line is more striaght
plot(loglmCleaned_Data,which=3)
bptest(loglmCleaned_Data)
##
## studentized Breusch-Pagan test
##
## data: loglmCleaned_Data
## BP = 23.583, df = 5, p-value = 0.0002611
P-value is still quite low, so while the line is striaght, we will have to note the data is not completely homoscedastic (variance is not completely the same across the model)
plot(loglmCleaned_Data,which=2)
shapiro.test(loglmCleaned_Data$residuals)
##
## Shapiro-Wilk normality test
##
## data: loglmCleaned_Data$residuals
## W = 0.98709, p-value = 8.522e-09
Ok p-value still extremely low, so we will have to note that the residuals are not normal (not normally distributed)
Let’s now look at the transformed data
summary(loglmCleaned_Data)
##
## Call:
## lm(formula = log1p(avg_teacher_pay) ~ log1p(wht_teachers) + hisp_student +
## local_taxes_rev + teach_advdg + student_STAAR, data = Cleaned_Data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.43229 -0.05270 -0.00140 0.05071 0.39090
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 10.8221791 0.0205388 526.913 < 2e-16 ***
## log1p(wht_teachers) -0.0203940 0.0038576 -5.287 1.48e-07 ***
## hisp_student 0.0003821 0.0001124 3.399 0.000699 ***
## local_taxes_rev 0.0007692 0.0001124 6.843 1.24e-11 ***
## teach_advdg 0.0028838 0.0002802 10.292 < 2e-16 ***
## student_STAAR 0.0010036 0.0002091 4.799 1.80e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.08557 on 1191 degrees of freedom
## Multiple R-squared: 0.1689, Adjusted R-squared: 0.1654
## F-statistic: 48.41 on 5 and 1191 DF, p-value: < 2.2e-16
What does this mean? - It looks like each variable has a significant relationship with average teacher pay - The p-value is quite low, so the null hypothesis cannot be eliminated - The Adjusted R-squared let’s us know that these variables only account of 16% of change in average teacher pay - It looks like when teacher pay goes up, the percentage of white teachers goes down - But when teacher pay goes up, the percentage of hispanic students, total revenue from local taxes, teachers with advanced degrees, and student test scores go up.