Justin Herman

NOTES

- Dataset documents State SAT scores along side several exploratory categories

  • Python blocks with double comments are executable code commented out after exploration and for the sake of a clean knit to html
  • Python blocks with single comments are meant to be inline comments

- Explanation of Column names

  • X                  Integer based index(will not be used)
  • state            State
  • expend        Total money spent by state on education
  • ratio             Student-teacher ratio
  • salary          Teacher salary
  • frac              Percentage of state population taking test(Participation Rate)
  • verbal          Score on verbal SAT
  • math            Score on math SAT
  • sat               Total score on SAT

Hypothesis

- SAT scores will have a positive correlation with total expenditure and teacher salary, as well as a negative correlation with teaher/student class ratio size.

Bonus

  • Save dataframe to disk, push to git, load dataframe through git
##sat_data <- read.csv("https://raw.githubusercontent.com/vincentarelbundock/Rdatasets/master/csv/mosaicData/SAT.csv",header=1, sep=",")
##setwd("C:/Users/justin/Documents/GitHub/Assignment_4")
##write.csv(sat_data, file="upload_final_project_to_git.csv",row.names=FALSE)
git_df <- read.csv("https://raw.githubusercontent.com/justinherman42/Assignment_4/master/upload_final_project_to_git.csv")
head(git_df)
##   X      state expend ratio salary frac verbal math  sat
## 1 1    Alabama  4.405  17.2 31.144    8    491  538 1029
## 2 2     Alaska  8.963  17.6 47.951   47    445  489  934
## 3 3    Arizona  4.778  19.3 32.175   27    448  496  944
## 4 4   Arkansas  4.459  17.1 28.934    6    482  523 1005
## 5 5 California  4.992  24.0 41.078   45    417  485  902
## 6 6   Colorado  5.443  18.4 34.571   29    462  518  980

General data exploration

sat_data <- read.csv("https://raw.githubusercontent.com/vincentarelbundock/Rdatasets/master/csv/mosaicData/SAT.csv",header=1, sep=",")
sat_data <- data.frame(sat_data)
print (summary(sat_data))
##        X                state        expend          ratio      
##  Min.   : 1.00   Alabama   : 1   Min.   :3.656   Min.   :13.80  
##  1st Qu.:13.25   Alaska    : 1   1st Qu.:4.882   1st Qu.:15.22  
##  Median :25.50   Arizona   : 1   Median :5.768   Median :16.60  
##  Mean   :25.50   Arkansas  : 1   Mean   :5.905   Mean   :16.86  
##  3rd Qu.:37.75   California: 1   3rd Qu.:6.434   3rd Qu.:17.57  
##  Max.   :50.00   Colorado  : 1   Max.   :9.774   Max.   :24.30  
##                  (Other)   :44                                  
##      salary           frac           verbal           math      
##  Min.   :25.99   Min.   : 4.00   Min.   :401.0   Min.   :443.0  
##  1st Qu.:30.98   1st Qu.: 9.00   1st Qu.:427.2   1st Qu.:474.8  
##  Median :33.29   Median :28.00   Median :448.0   Median :497.5  
##  Mean   :34.83   Mean   :35.24   Mean   :457.1   Mean   :508.8  
##  3rd Qu.:38.55   3rd Qu.:63.00   3rd Qu.:490.2   3rd Qu.:539.5  
##  Max.   :50.05   Max.   :81.00   Max.   :516.0   Max.   :592.0  
##                                                                 
##       sat        
##  Min.   : 844.0  
##  1st Qu.: 897.2  
##  Median : 945.5  
##  Mean   : 965.9  
##  3rd Qu.:1032.0  
##  Max.   :1107.0  
## 
print(str(sat_data))
## 'data.frame':    50 obs. of  9 variables:
##  $ X     : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ state : Factor w/ 50 levels "Alabama","Alaska",..: 1 2 3 4 5 6 7 8 9 10 ...
##  $ expend: num  4.41 8.96 4.78 4.46 4.99 ...
##  $ ratio : num  17.2 17.6 19.3 17.1 24 18.4 14.4 16.6 19.1 16.3 ...
##  $ salary: num  31.1 48 32.2 28.9 41.1 ...
##  $ frac  : int  8 47 27 6 45 29 81 68 48 65 ...
##  $ verbal: int  491 445 448 482 417 462 431 429 420 406 ...
##  $ math  : int  538 489 496 523 485 518 477 468 469 448 ...
##  $ sat   : int  1029 934 944 1005 902 980 908 897 889 854 ...
## NULL
colnames(sat_data)[colnames(sat_data)=="frac"] <- "Participation_rate"
#general histograms
hist_info <- hist(sat_data$sat,breaks=seq(840,1120,by=40))

#hists of columns with normalized curve
hist(sat_data$sat, freq=FALSE, main="Sat Scores Density plot")
curve(dnorm(x, mean=mean(sat_data$sat), sd=sd(sat_data$sat)), add=TRUE, col="darkblue", lwd=2) 

hist(sat_data$expend, freq= FALSE, main="expenditures density plot")
curve(dnorm(x, mean=mean(sat_data$expend), sd=sd(sat_data$expend)), add=TRUE, col="darkblue", lwd=2) 

hist(sat_data$Participation_rate, freq=FALSE, main="participation Density plot")
curve(dnorm(x, mean=mean(sat_data$Participation_rate), sd=sd(sat_data$Participation_rate)), add=TRUE, col="darkblue", lwd=2)

hist(sat_data$ratio, freq=FALSE, main="teacher/student ratio Density plot")
curve(dnorm(x, mean=mean(sat_data$ratio), sd=sd(sat_data$ratio)), add=TRUE, col="darkblue", lwd=2)

Initial thoughts on summary statistics

  • Extremely wide spread in columns frac(participation density)
  • Rightwards skew in teacher-student ratio and expenditures
  • Most of the data isn’t normally distributed
  • All columns other than state are numeric

Check Box plots for outliers

plot_expend <- subset(sat_data, select=expend)
plot_ratio<- subset(sat_data, select=ratio)
plot_salary <- subset(sat_data, select=salary)
plot_Participation_rate <- subset(sat_data, select=Participation_rate)
plot_verbal <- subset(sat_data, select=verbal)
plot_math<- subset(sat_data, select=math)
plot_sat<- subset(sat_data, select=sat)

# plot individually to get an accurate  look at each column
boxplot(plot_expend,las=2,show.names=TRUE)

##boxplot(plot_ratio,las=2,show.names=TRUE)
##boxplot(plot_salary,las=2,show.names=TRUE)
##boxplot(plot_Participation_rate,las=2,show.names=TRUE)
##boxplot(plot_verbal,las=2,show.names=TRUE)
##boxplot(plot_math,las=2,show.names=TRUE)
##boxplot(plot_sat,las=2,show.names=TRUE)

Attempt to normalize data to couple all Box plots

dat <- data.frame(sat_data[,3:9])
scaled.dat <- scale(dat)
boxplot(scaled.dat, las=2,col=c("mediumvioletred","lightseagreen", "yellow","lightslateblue","olivedrab3","powderblue","mistyrose"))

#checks to verify data is normalized below:

##colMeans(scaled.dat) 
##apply(scaled.dat, 2, sd)
##zVar <- (sat_data[,3] - mean(sat_data[,3])) / sd(sat_data[,3])
##((zVar)-(scaled.dat[,1]))

Conclusions from Box plots

  • Some outliers exist in expend, ratio, and salary columns
  • Math and verbal can likely be dropped from analysis as sat is there aggregate and should suffice

Sorted by columns

sorted_by_sat <- sat_data[order(sat_data$sat,decreasing=TRUE),]
head(sorted_by_sat,10)
##     X        state expend ratio salary Participation_rate verbal math  sat
## 34 34 North Dakota  4.775  15.3 26.327                  5    515  592 1107
## 15 15         Iowa  5.483  15.8 31.511                  5    516  583 1099
## 23 23    Minnesota  6.000  17.5 35.948                  9    506  579 1085
## 44 44         Utah  3.656  24.3 29.082                  4    513  563 1076
## 49 49    Wisconsin  6.930  15.9 37.746                  9    501  572 1073
## 41 41 South Dakota  4.775  14.4 25.994                  5    505  563 1068
## 16 16       Kansas  5.817  15.1 34.652                  9    503  557 1060
## 27 27     Nebraska  5.935  14.5 30.922                  9    494  556 1050
## 13 13     Illinois  6.136  17.3 39.431                 13    488  560 1048
## 25 25     Missouri  5.383  15.5 31.189                  9    495  550 1045
sorted_by_salary <- sat_data[order(sat_data$salary),]
head(sorted_by_salary,10)
##     X        state expend ratio salary Participation_rate verbal math  sat
## 41 41 South Dakota  4.775  14.4 25.994                  5    505  563 1068
## 34 34 North Dakota  4.775  15.3 26.327                  5    515  592 1107
## 18 18    Louisiana  4.761  16.8 26.461                  9    486  535 1021
## 24 24  Mississippi  4.080  17.5 26.818                  4    496  540 1036
## 36 36     Oklahoma  4.845  15.5 28.172                  9    491  536 1027
## 31 31   New Mexico  4.586  17.2 28.493                 11    485  530 1015
## 26 26      Montana  5.692  16.3 28.785                 21    473  536 1009
## 4   4     Arkansas  4.459  17.1 28.934                  6    482  523 1005
## 44 44         Utah  3.656  24.3 29.082                  4    513  563 1076
## 12 12        Idaho  4.210  19.1 29.783                 15    468  511  979
sorted_by_Participation_rate <- sat_data[order(sat_data$Participation_rate),]
head(sorted_by_Participation_rate,10)
##     X        state expend ratio salary Participation_rate verbal math  sat
## 24 24  Mississippi  4.080  17.5 26.818                  4    496  540 1036
## 44 44         Utah  3.656  24.3 29.082                  4    513  563 1076
## 15 15         Iowa  5.483  15.8 31.511                  5    516  583 1099
## 34 34 North Dakota  4.775  15.3 26.327                  5    515  592 1107
## 41 41 South Dakota  4.775  14.4 25.994                  5    505  563 1068
## 4   4     Arkansas  4.459  17.1 28.934                  6    482  523 1005
## 1   1      Alabama  4.405  17.2 31.144                  8    491  538 1029
## 16 16       Kansas  5.817  15.1 34.652                  9    503  557 1060
## 18 18    Louisiana  4.761  16.8 26.461                  9    486  535 1021
## 23 23    Minnesota  6.000  17.5 35.948                  9    506  579 1085

Observations from sorted data

  • It looks like Participation_rate and salary are correlated to SAT scores. Let’s explore this with Scatter plots and correlation numbers

Scatter plots after dropping Verbal and Math columns

##plot(sat_data$sat,sat_data$Participation_rate)
##plot.abline(lm(sat~Participation_rate, data=sat_data))
##plot(sat_data$sat,sat_data$ratio)
##plot(sat_data$sat,sat_data$expend)
##plot(sat_data$sat,sat_data$salary)

#lets drop non numeric as well as math and verbal

##pairs(sat_data[,c(3:6,9)])
##cor(sat_data$sat,sat_data$salary)
##cor(sat_data$sat,sat_data$Participation_rate)
##cor(sat_data$sat,sat_data$ratio)
##cor(sat_data$sat,sat_data$expend)
##cor(sat_data$Participation_rate,sat_data$expend)
##plot(sat_data$Participation_rate,sat_data$expend)
##cor(sat_data$Participation_rate,sat_data$ratio)
##cor(sat_data$Participation_rate,sat_data$salary)
##cor(sat_data$sat,sat_data$Participation_rate)



panel.cor <- function(x, y, digits = 2, prefix = "", cex.cor, ...)
{
    usr <- par("usr"); on.exit(par(usr))
    par(usr = c(0, 1, 0, 1))
    r <- (cor(x, y))
    txt <- format(c(r, 0.123456789), digits = digits)[1]
    txt <- paste0(prefix, txt)
    if(missing(cex.cor)) cex.cor <- 0.8/strwidth(txt)
    text(0.5, 0.5, txt, cex = cex.cor * abs(r))
}
pairs(~sat+expend+salary+ratio+Participation_rate,data=sat_data,lower.panel = panel.smooth, upper.panel = panel.cor)

Observations from Scatter plots

My hypothesis is almost entirely disproven from this data.

Hypothesis Conclusion
Correlation between SAT_scores & expenditure is positive the correlation is actually negative
Correlation between SAT_scores & Teacher Salary is positive the correlation is actually negative
Correlation between SAT_scores & Teacher/student ratio is negative the correlation is non existent

New hypothesis

  • Something is wrong with these correlations
    • Teacher/student ratio has been well documented as increasing student outcome
    • Perhaps teacher Salary and expenditures are not correlated, but a relatively strong negative correlation doesn’t make sense
  • Internet search for deeper understanding regarding the relationship of state SAT scores and spending led to this article
    https://blog.prepscholar.com/average-sat-and-act-scores-by-stated-adjusted-for-participation-rate
    • In the above link, the author is using an updated more robust version of my dataset. It includes ACT & SAT scores and participation rates. The author points out a big problem with the datasets, the effects of participation rate on scores.

What is wrong with the participation rate(Frac)

  • There is a -.89 correlation between SAT scores and participation rate
  • Nearly every single high SAT score is associated with participation rates under 10
  • Nearly all the low scores have high participation rates
color_df <- sat_data
color_df$Colour <- "black"
color_df$Colour[color_df$Participation_rate<10] <- "red"
color_df$Colour[color_df$Participation_rate>55] <- "blue"
plot(color_df$sat,color_df$Participation_rate, col=color_df$Colour,pch=7)
legend("topright", legend=c("participation_rate>55","participation_rate<10"),pch=7,col=c("blue","red"))

  • Some correlations in the dataset as a whole are strong correlations yet participation rate is the only one that biases the data
    • In states With participation rate under 10%, students taking the test aren’t doing so out of mandatory state regulations. They are taking the test with the intent of getting into specific colleges, colleges that require SAT over the ACT.
    • These students are much better performers as compared to the general population as a whole. Therefore our random sampling conditions are violated. We need to do something to adjust for the vast differences in participation rate population as a whole.

Options on how to proceed

  • Attempt to subset into similar participation rates and compare the correlations of these subsets
    • The problem with this approach may be that most of this data will not be useful do to the strong correlation between participation rate and all our other variables
  • Attempt through linear regression to figure out how our target-Sat scores is effected by our features(participation rate and others). We can then you this line to predict an SAT score column based on an average participation rate
    • I chose this approach

Attempt to run a machine learning linear regression with two features(Participation_rate,expend)

library(tidyr)
require(tidyr)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# create test/train split
smp_size <- floor(0.25 * nrow(sat_data))
set.seed(123)
train_ind <- sample(seq_len(nrow(sat_data)), size = smp_size)
train <- sat_data[train_ind, ]
test <- sat_data[-train_ind, ]

#run model and check its accuracy
linearMod <- lm(formula= sat~Participation_rate+expend, data=test)  
summary(linearMod)
## 
## Call:
## lm(formula = sat ~ Participation_rate + expend, data = test)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -82.995 -19.309   0.381  17.870  75.793 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)        984.8048    26.0426  37.815  < 2e-16 ***
## Participation_rate  -2.7492     0.2553 -10.769 1.18e-12 ***
## expend              12.5966     4.9092   2.566   0.0147 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 33.69 on 35 degrees of freedom
## Multiple R-squared:  0.7951, Adjusted R-squared:  0.7834 
## F-statistic: 67.91 on 2 and 35 DF,  p-value: 8.952e-13
AIC(linearMod)
## [1] 380.0143
predicted_sat_score <- predict(linearMod,sat_data)
actuals_preds <- data.frame(cbind(actuals=sat_data$sat, predicteds=predicted_sat_score))  
correlation_accuracy <- cor(actuals_preds)  #.905

What does the model show?

  • Test model isn’t showing a very significant p value for the feature(expend)
  • AIC seems very high, which undermines our results
  • May lead to faulty results but as this is my first experiment with machine learning, let’s see where it takes us

Apply coefficient values from model to a mean particiaptios rate column(34.2).

  • I am making an assumption that SAT scores predicted from the mean participation rate will be more accurate
# Had to comment out tidy function because I couldnt get the document to knit with it
# I set the values from TIDY to their own variables 
##coeffs <- tidy(linearMod)
##coeffs
##coeffs$estimate[1]

Participation_rate_coeff <- -2.7492          
expend_coeff <- 12.5966                         
y_inter <- 984.8048 
sat_data$equal_participation <- 34.2
# add new column with regressed sat scores and standardized particpation rate of mean participation rate
sat_data$regressed_sat_scores <- y_inter+ sat_data$equal_participation*(Participation_rate_coeff) + sat_data$expend*(expend_coeff)

Print out sorted by new Post Regression SAT scores and accounting for equal participation rates by state

regression_scores <-sat_data[,c("state","expend","ratio","salary","Participation_rate","sat","regressed_sat_scores")]
regression_scores <- regression_scores[order(regression_scores$regressed_sat_scores, decreasing = TRUE),]
head(regression_scores,15)
##            state expend ratio salary Participation_rate  sat
## 30    New Jersey  9.774  13.8 46.087                 70  898
## 32      New York  9.623  15.2 47.612                 74  892
## 2         Alaska  8.963  17.6 47.951                 47  934
## 7    Connecticut  8.817  14.4 50.045                 81  908
## 39  Rhode Island  7.469  14.7 40.729                 70  888
## 21 Massachusetts  7.287  14.8 40.795                 80  907
## 20      Maryland  7.245  17.0 40.661                 64  909
## 38  Pennsylvania  7.109  17.1 44.510                 70  880
## 8       Delaware  7.030  16.6 39.076                 68  897
## 22      Michigan  6.994  20.1 41.895                 11 1033
## 49     Wisconsin  6.930  15.9 37.746                  9 1073
## 45       Vermont  6.750  13.8 35.406                 68  901
## 37        Oregon  6.436  19.9 38.555                 51  947
## 19         Maine  6.428  13.8 31.972                 68  896
## 35          Ohio  6.162  16.6 36.802                 23  975
##    regressed_sat_scores
## 30            1013.9013
## 32            1011.9992
## 2             1003.6855
## 7             1001.8464
## 39             984.8662
## 21             982.5736
## 20             982.0445
## 38             980.3314
## 8              979.3363
## 22             978.8828
## 49             978.0766
## 45             975.8092
## 37             971.8539
## 19             971.7531
## 35             968.4024

New matrix of Scatter plots against the new Post Regression SAT scores

##plot(sat_data$regressed_sat_scores,sat_data$expend)
##plot(sat_data$regressed_sat_scores,sat_data$Participation_rate)
##pairs(~regressed_sat_scores+ratio+Participation_rate+salary,data=sat_data, 
##   main="Scatterplot Matrix with regressed data",col=c("red"))
panel.cor <- function(x, y, digits = 2, prefix = "", cex.cor, ...)
{
    usr <- par("usr"); on.exit(par(usr))
    par(usr = c(0, 1, 0, 1))
    r <- (cor(x, y))
    txt <- format(c(r, 0.123456789), digits = digits)[1]
    txt <- paste0(prefix, txt)
    if(missing(cex.cor)) cex.cor <- 0.8/strwidth(txt)
    text(0.5, 0.5, txt, cex = cex.cor * abs(r))
}
pairs(~regressed_sat_scores+expend+salary+ratio+Participation_rate,data=regression_scores,lower.panel = panel.smooth, upper.panel = panel.cor)

old_salary              <- cor(sat_data$sat,sat_data$salary)
old_Participation_rate  <- cor(sat_data$sat,sat_data$Participation_rate)
old_ratio               <- cor(sat_data$sat,sat_data$ratio)
old_expend              <- cor(sat_data$sat,sat_data$expend)
old_correlations        <- c(old_salary,old_Participation_rate,old_ratio,old_expend)

new_salary              <-cor(sat_data$regressed_sat_scores,sat_data$salary)
new_Participation_rate  <-cor(sat_data$regressed_sat_scores,sat_data$Participation_rate)
new_ratio               <-cor(sat_data$regressed_sat_scores,sat_data$ratio)
new_expend              <-cor(sat_data$regressed_sat_scores,sat_data$expend)
new_correlations        <-c(new_salary,new_Participation_rate,new_ratio,new_expend) 
comparison              <- data.frame(old_correlations,new_correlations)
row.names(comparison)   <- c("salary","participation_Rate","ratio","expend")
comparison
##                    old_correlations new_correlations
## salary                  -0.43988338        0.8698015
## participation_Rate      -0.88711868        0.5926274
## ratio                    0.08125382       -0.3710254
## expend                  -0.38053700        1.0000000

Scatter plot of teacher salary with new Post Regression SAT scores in blue and old SAT scores in red

plot(regression_scores$salary,regression_scores$sat,col="red",pch=2)
par(new=T)  
plot(regression_scores$salary,regression_scores$regressed_sat_scores, pch=9,col="blue", axes=F, ylab="")  # don't overwrite
par(new=T)  
axis(side=4)   # add axis
mtext(side=4,line=3.8,"regressed_sat_scores")   # add label
legend("topright", legend=c("sat","regressed_sat_scores"), pch=c(2,9),col=c("red","blue"))

Conclusion

  • This is an incredible example of how looking at data on the surface, can be extremely misleading
  • Initially all of my hypothesis were wrong. In fact they were so wrong that it led me to take a deeper look at the data
  • Upon further exploration I realized I had to figure out a way to go about unbiasing my samples(states).
  • My idea to create a column of the average participation rate and build state SAT scores from that column may have been an over-adjustment or a faulty assumption
  • Considering I received a correlation value of 1 between my regressed SAT scores and state expenditures, something must have went wrong with my model. Therefore my tentative observations based on a faulty model are as follows
    • Shows a significant difference that affirms my hypothesis. There is a negative correlation between SAT scores and teacher student ratio(-.37) and a positive correlation between SAT scores and teacher salary(.87), participation rate(.59), and expenditures(1)