library(readr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(magrittr)
library(curl)
## Using libcurl 7.64.1 with LibreSSL/2.8.3
##
## Attaching package: 'curl'
## The following object is masked from 'package:readr':
##
## parse_date
library(ggplot2)
library(usmap)
library(reshape2)
# Load the state-level SAT data and the state name/abbreviation lookup table.
# NOTE(review): both URLs carry a `token` query parameter; if those tokens are
# time-limited the downloads will stop working -- confirm and refresh as needed.
scores <- read.csv(curl("https://raw.githubusercontent.com/brsingh7/R-Bridge/main/Guber99.csv?token=GHSAT0AAAAAABQGPEWODNQIQWPZ3JEPXJGAYPJWFGQ"))
states <- read.csv(curl("https://raw.githubusercontent.com/brsingh7/R-Bridge/main/State_Abbrev.csv?token=GHSAT0AAAAAABQGPEWOEBZUYHU2KKFRYMT6YPJWGCQ"))

# Join on the state name. all.y = TRUE keeps every row of `states`, including
# any state with no match in `scores`; such rows get NA in the score columns.
# TRUE is spelled out because the shorthand `T` is an ordinary, reassignable
# variable.
score_data <- merge(
  scores,
  states,
  by.x = "state",
  by.y = "State",
  all.y = TRUE
)
summary(score_data) # summarize data set
## state X expendpp ptratio
## Length:51 Min. : 1.00 Min. :3.656 Min. :13.80
## Class :character 1st Qu.:13.25 1st Qu.:4.882 1st Qu.:15.22
## Mode :character Median :25.50 Median :5.768 Median :16.60
## Mean :25.50 Mean :5.905 Mean :16.86
## 3rd Qu.:37.75 3rd Qu.:6.434 3rd Qu.:17.57
## Max. :50.00 Max. :9.774 Max. :24.30
## NA's :1 NA's :1 NA's :1
## tsalary perctakers verbal math
## Min. :25.99 Min. : 4.00 Min. :401.0 Min. :443.0
## 1st Qu.:30.98 1st Qu.: 9.00 1st Qu.:427.2 1st Qu.:474.8
## Median :33.29 Median :28.00 Median :448.0 Median :497.5
## Mean :34.83 Mean :35.24 Mean :457.1 Mean :508.8
## 3rd Qu.:38.55 3rd Qu.:63.00 3rd Qu.:490.2 3rd Qu.:539.5
## Max. :50.05 Max. :81.00 Max. :516.0 Max. :592.0
## NA's :1 NA's :1 NA's :1 NA's :1
## total Abbrev Code
## Min. : 844.0 Length:51 Length:51
## 1st Qu.: 897.2 Class :character Class :character
## Median : 945.5 Mode :character Mode :character
## Mean : 965.9
## 3rd Qu.:1032.0
## Max. :1107.0
## NA's :1
# Summarize the average and median math and verbal SAT scores across states,
# plus the relative gap between the average math and average verbal score.
# The merged table contains one row with no SAT data (see the NA counts in the
# summary above), so each statistic needs na.rm = TRUE; without it every value
# in the result is NA.
score_data %>%
  summarize(
    AvgMath = mean(math, na.rm = TRUE),
    MedMath = median(math, na.rm = TRUE),
    AvgVerbal = mean(verbal, na.rm = TRUE),
    MedVerbal = median(verbal, na.rm = TRUE),
    PctDiff = (AvgMath - AvgVerbal) / AvgVerbal
  )
## AvgMath MedMath AvgVerbal MedVerbal PctDiff
## 1 NA NA NA NA NA
# Overlay the distributions of the state-level math and verbal SAT scores.
# Each observation is one state's average, so the y-axis counts states.
# Both histograms are computed first (plot = FALSE) so the axes can be sized to
# cover the two of them: drawing the verbal bars with add = TRUE onto axes
# fitted only to the math scores clips the lowest verbal scores and any verbal
# bin taller than the tallest math bin.
math_hist <- hist(score_data$math, breaks = 10, plot = FALSE)
verbal_hist <- hist(score_data$verbal, breaks = 5, plot = FALSE)

# Semi-transparent fills keep the math bars visible underneath the verbal bars.
math_fill <- adjustcolor("red", alpha.f = 0.5)
verbal_fill <- adjustcolor("blue", alpha.f = 0.5)

plot(
  math_hist,
  col = math_fill,
  xlim = range(math_hist$breaks, verbal_hist$breaks),
  ylim = c(0, max(math_hist$counts, verbal_hist$counts)),
  xlab = "Score",
  ylab = "# of States",
  main = "SAT Math & Verbal Score Frequencies"
)
plot(verbal_hist, col = verbal_fill, add = TRUE)
legend(
  "topright",
  legend = c("Math", "Verbal"),
  fill = c(math_fill, verbal_fill),
  bty = "n"
)
In general, it appears that across the United States, test takers perform better on the math section of the SAT than on the verbal section: the average state math score is approximately 51 points, or 11.3%, higher than the average state verbal score.
# Build a presentation-ready table:
#   * drop the row that has no SAT data (District of Columbia, which has no
#     match in the scores file), selected by its missing values rather than by
#     a hard-coded row position;
#   * drop the "X" column (a 1-50 row counter carried over from the scores
#     file), selected by name rather than by column position.
has_scores <- !is.na(score_data$total)
score_data2 <- score_data[has_scores, colnames(score_data) != "X"]

# Rename columns by their source name so the labels cannot drift out of step
# with the column order.
display_names <- c(
  state = "State",
  expendpp = "Cost Per Pupil (in thous)",
  ptratio = "Avg Students per Teacher",
  tsalary = "Avg Teacher Salary (thous)",
  perctakers = "% Eligible",
  verbal = "Verbal Score",
  math = "Math Score",
  total = "Total",
  Abbrev = "Abbrev",
  Code = "State Code"
)
stopifnot(all(colnames(score_data2) %in% names(display_names)))
colnames(score_data2) <- unname(display_names[colnames(score_data2)])

# Move the state code to the first column, keeping the rest in order.
score_data2 <- score_data2[
  ,
  c("State Code", setdiff(colnames(score_data2), "State Code"))
]
head(score_data2, 10)
## State Code State Cost Per Pupil (in thous) Avg Students per Teacher
## 1 AL Alabama 4.405 17.2
## 2 AK Alaska 8.963 17.6
## 3 AZ Arizona 4.778 19.3
## 4 AR Arkansas 4.459 17.1
## 5 CA California 4.992 24.0
## 6 CO Colorado 5.443 18.4
## 7 CT Connecticut 8.817 14.4
## 8 DE Delaware 7.030 16.6
## 10 FL Florida 5.718 19.1
## 11 GA Georgia 5.193 16.3
## Avg Teacher Salary (thous) % Eligible Verbal Score Math Score Total Abbrev
## 1 31.144 8 491 538 1029 Ala.
## 2 47.951 47 445 489 934 Alaska
## 3 32.175 27 448 496 944 Ariz.
## 4 28.934 6 482 523 1005 Ark.
## 5 41.078 45 417 485 902 Calif.
## 6 34.571 29 462 518 980 Colo.
## 7 50.045 81 431 477 908 Conn.
## 8 39.076 68 429 468 897 Del.
## 10 32.588 48 420 469 889 Fla.
## 11 32.291 65 406 448 854 Ga.
# Scatter plot of average teacher salary against total SAT score, one point per
# state, with a lowess smoother drawn in red to show the overall trend.
x <- score_data2[["Avg Teacher Salary (thous)"]]
y <- score_data2[["Total"]]
plot(
  x,
  y,
  pch = 20,
  xlab = "Teacher Salary",
  ylab = "Total SAT Score",
  main = "Teacher Salaries & Student SAT Scores"
)
lines(lowess(x, y), col = "red")
# Choropleth map of the total SAT score by state.
# A `fips` column is derived from the state names and added to a copy of the
# table, which is then handed to plot_usmap().
# NOTE(review): assumes every value in `State` is a name fips() recognises --
# confirm there are no NA codes.
score_data3 <- score_data2
score_data3$fips <- fips(score_data3$State)
plot_usmap(
  data = score_data3,
  values = "Total",
  color = "blue",
  labels = TRUE
) +
  scale_fill_continuous(
    low = "white",
    high = "blue",
    name = "SAT Scores by State",
    # Spelled out in full: the original `label =` only worked through partial
    # argument matching.
    labels = scales::comma
  ) +
  labs(title = "Avg SAT Scores by State") +
  theme(legend.position = "right")
# Rank states by average teacher salary, highest first, and show the top 15.
salary_col <- "Avg Teacher Salary (thous)"
State_Salaries <- score_data3[, c("State", salary_col)]
State_Salaries <- State_Salaries[
  order(State_Salaries[[salary_col]], decreasing = TRUE),
]
head(State_Salaries, 15)
## State Avg Teacher Salary (thous)
## 7 Connecticut 50.045
## 2 Alaska 47.951
## 33 New York 47.612
## 31 New Jersey 46.087
## 39 Pennsylvania 44.510
## 23 Michigan 41.895
## 5 California 41.078
## 22 Massachusetts 40.795
## 40 Rhode Island 40.729
## 21 Maryland 40.661
## 14 Illinois 39.431
## 8 Delaware 39.076
## 38 Oregon 38.555
## 12 Hawaii 38.518
## 50 Wisconsin 37.746
From the data above, we can hypothesize that students’ performance on the SAT exam does not improve with the cost of education. As seen, there is actually a negative relationship between teachers’ salaries and students’ performance on the SAT. Moreover, the highest scores tend to be in the Midwest region, while those states do not fall within the top tier of teacher salaries. Further information would be necessary to determine why this is so. It could be because the cost of living in the region is lower and salaries are therefore lower; however, additional data would be needed to test this theory.
From the initial data, we can also conclude that, for the most part, students tend to perform better on the math section of the SAT than on the verbal section. In my own experience, this has also held true.