##sat_data <- read.csv("https://raw.githubusercontent.com/vincentarelbundock/Rdatasets/master/csv/mosaicData/SAT.csv",header=1, sep=",")
##setwd("C:/Users/justin/Documents/GitHub/Assignment_4")
##write.csv(sat_data, file="upload_final_project_to_git.csv",row.names=FALSE)
git_df <- read.csv("https://raw.githubusercontent.com/justinherman42/Assignment_4/master/upload_final_project_to_git.csv")
head(git_df)
## X state expend ratio salary frac verbal math sat
## 1 1 Alabama 4.405 17.2 31.144 8 491 538 1029
## 2 2 Alaska 8.963 17.6 47.951 47 445 489 934
## 3 3 Arizona 4.778 19.3 32.175 27 448 496 944
## 4 4 Arkansas 4.459 17.1 28.934 6 482 523 1005
## 5 5 California 4.992 24.0 41.078 45 417 485 902
## 6 6 Colorado 5.443 18.4 34.571 29 462 518 980
sat_data <- read.csv("https://raw.githubusercontent.com/vincentarelbundock/Rdatasets/master/csv/mosaicData/SAT.csv",header=1, sep=",")
sat_data <- data.frame(sat_data)
print (summary(sat_data))
## X state expend ratio
## Min. : 1.00 Alabama : 1 Min. :3.656 Min. :13.80
## 1st Qu.:13.25 Alaska : 1 1st Qu.:4.882 1st Qu.:15.22
## Median :25.50 Arizona : 1 Median :5.768 Median :16.60
## Mean :25.50 Arkansas : 1 Mean :5.905 Mean :16.86
## 3rd Qu.:37.75 California: 1 3rd Qu.:6.434 3rd Qu.:17.57
## Max. :50.00 Colorado : 1 Max. :9.774 Max. :24.30
## (Other) :44
## salary frac verbal math
## Min. :25.99 Min. : 4.00 Min. :401.0 Min. :443.0
## 1st Qu.:30.98 1st Qu.: 9.00 1st Qu.:427.2 1st Qu.:474.8
## Median :33.29 Median :28.00 Median :448.0 Median :497.5
## Mean :34.83 Mean :35.24 Mean :457.1 Mean :508.8
## 3rd Qu.:38.55 3rd Qu.:63.00 3rd Qu.:490.2 3rd Qu.:539.5
## Max. :50.05 Max. :81.00 Max. :516.0 Max. :592.0
##
## sat
## Min. : 844.0
## 1st Qu.: 897.2
## Median : 945.5
## Mean : 965.9
## 3rd Qu.:1032.0
## Max. :1107.0
##
print(str(sat_data))
## 'data.frame': 50 obs. of 9 variables:
## $ X : int 1 2 3 4 5 6 7 8 9 10 ...
## $ state : Factor w/ 50 levels "Alabama","Alaska",..: 1 2 3 4 5 6 7 8 9 10 ...
## $ expend: num 4.41 8.96 4.78 4.46 4.99 ...
## $ ratio : num 17.2 17.6 19.3 17.1 24 18.4 14.4 16.6 19.1 16.3 ...
## $ salary: num 31.1 48 32.2 28.9 41.1 ...
## $ frac : int 8 47 27 6 45 29 81 68 48 65 ...
## $ verbal: int 491 445 448 482 417 462 431 429 420 406 ...
## $ math : int 538 489 496 523 485 518 477 468 469 448 ...
## $ sat : int 1029 934 944 1005 902 980 908 897 889 854 ...
## NULL
colnames(sat_data)[colnames(sat_data)=="frac"] <- "Participation_rate"
#general histograms
hist_info <- hist(sat_data$sat,breaks=seq(840,1120,by=40))
#hists of columns with normalized curve
hist(sat_data$sat, freq=FALSE, main="Sat Scores Density plot")
curve(dnorm(x, mean=mean(sat_data$sat), sd=sd(sat_data$sat)), add=TRUE, col="darkblue", lwd=2)
hist(sat_data$expend, freq= FALSE, main="expenditures density plot")
curve(dnorm(x, mean=mean(sat_data$expend), sd=sd(sat_data$expend)), add=TRUE, col="darkblue", lwd=2)
hist(sat_data$Participation_rate, freq=FALSE, main="participation Density plot")
curve(dnorm(x, mean=mean(sat_data$Participation_rate), sd=sd(sat_data$Participation_rate)), add=TRUE, col="darkblue", lwd=2)
hist(sat_data$ratio, freq=FALSE, main="teacher/student ratio Density plot")
curve(dnorm(x, mean=mean(sat_data$ratio), sd=sd(sat_data$ratio)), add=TRUE, col="darkblue", lwd=2)
plot_expend <- subset(sat_data, select=expend)
plot_ratio<- subset(sat_data, select=ratio)
plot_salary <- subset(sat_data, select=salary)
plot_Participation_rate <- subset(sat_data, select=Participation_rate)
plot_verbal <- subset(sat_data, select=verbal)
plot_math<- subset(sat_data, select=math)
plot_sat<- subset(sat_data, select=sat)
# plot individually to get an accurate look at each column
boxplot(plot_expend,las=2,show.names=TRUE)
##boxplot(plot_ratio,las=2,show.names=TRUE)
##boxplot(plot_salary,las=2,show.names=TRUE)
##boxplot(plot_Participation_rate,las=2,show.names=TRUE)
##boxplot(plot_verbal,las=2,show.names=TRUE)
##boxplot(plot_math,las=2,show.names=TRUE)
##boxplot(plot_sat,las=2,show.names=TRUE)
dat <- data.frame(sat_data[,3:9])
scaled.dat <- scale(dat)
boxplot(scaled.dat, las=2,col=c("mediumvioletred","lightseagreen", "yellow","lightslateblue","olivedrab3","powderblue","mistyrose"))
#checks to verify data is normalized below:
##colMeans(scaled.dat)
##apply(scaled.dat, 2, sd)
##zVar <- (sat_data[,3] - mean(sat_data[,3])) / sd(sat_data[,3])
##((zVar)-(scaled.dat[,1]))
sorted_by_sat <- sat_data[order(sat_data$sat,decreasing=TRUE),]
head(sorted_by_sat,10)
## X state expend ratio salary Participation_rate verbal math sat
## 34 34 North Dakota 4.775 15.3 26.327 5 515 592 1107
## 15 15 Iowa 5.483 15.8 31.511 5 516 583 1099
## 23 23 Minnesota 6.000 17.5 35.948 9 506 579 1085
## 44 44 Utah 3.656 24.3 29.082 4 513 563 1076
## 49 49 Wisconsin 6.930 15.9 37.746 9 501 572 1073
## 41 41 South Dakota 4.775 14.4 25.994 5 505 563 1068
## 16 16 Kansas 5.817 15.1 34.652 9 503 557 1060
## 27 27 Nebraska 5.935 14.5 30.922 9 494 556 1050
## 13 13 Illinois 6.136 17.3 39.431 13 488 560 1048
## 25 25 Missouri 5.383 15.5 31.189 9 495 550 1045
sorted_by_salary <- sat_data[order(sat_data$salary),]
head(sorted_by_salary,10)
## X state expend ratio salary Participation_rate verbal math sat
## 41 41 South Dakota 4.775 14.4 25.994 5 505 563 1068
## 34 34 North Dakota 4.775 15.3 26.327 5 515 592 1107
## 18 18 Louisiana 4.761 16.8 26.461 9 486 535 1021
## 24 24 Mississippi 4.080 17.5 26.818 4 496 540 1036
## 36 36 Oklahoma 4.845 15.5 28.172 9 491 536 1027
## 31 31 New Mexico 4.586 17.2 28.493 11 485 530 1015
## 26 26 Montana 5.692 16.3 28.785 21 473 536 1009
## 4 4 Arkansas 4.459 17.1 28.934 6 482 523 1005
## 44 44 Utah 3.656 24.3 29.082 4 513 563 1076
## 12 12 Idaho 4.210 19.1 29.783 15 468 511 979
sorted_by_Participation_rate <- sat_data[order(sat_data$Participation_rate),]
head(sorted_by_Participation_rate,10)
## X state expend ratio salary Participation_rate verbal math sat
## 24 24 Mississippi 4.080 17.5 26.818 4 496 540 1036
## 44 44 Utah 3.656 24.3 29.082 4 513 563 1076
## 15 15 Iowa 5.483 15.8 31.511 5 516 583 1099
## 34 34 North Dakota 4.775 15.3 26.327 5 515 592 1107
## 41 41 South Dakota 4.775 14.4 25.994 5 505 563 1068
## 4 4 Arkansas 4.459 17.1 28.934 6 482 523 1005
## 1 1 Alabama 4.405 17.2 31.144 8 491 538 1029
## 16 16 Kansas 5.817 15.1 34.652 9 503 557 1060
## 18 18 Louisiana 4.761 16.8 26.461 9 486 535 1021
## 23 23 Minnesota 6.000 17.5 35.948 9 506 579 1085
##plot(sat_data$sat,sat_data$Participation_rate)
##plot.abline(lm(sat~Participation_rate, data=sat_data))
##plot(sat_data$sat,sat_data$ratio)
##plot(sat_data$sat,sat_data$expend)
##plot(sat_data$sat,sat_data$salary)
#lets drop non numeric as well as math and verbal
##pairs(sat_data[,c(3:6,9)])
##cor(sat_data$sat,sat_data$salary)
##cor(sat_data$sat,sat_data$Participation_rate)
##cor(sat_data$sat,sat_data$ratio)
##cor(sat_data$sat,sat_data$expend)
##cor(sat_data$Participation_rate,sat_data$expend)
##plot(sat_data$Participation_rate,sat_data$expend)
##cor(sat_data$Participation_rate,sat_data$ratio)
##cor(sat_data$Participation_rate,sat_data$salary)
##cor(sat_data$sat,sat_data$Participation_rate)
panel.cor <- function(x, y, digits = 2, prefix = "", cex.cor, ...)
{
usr <- par("usr"); on.exit(par(usr))
par(usr = c(0, 1, 0, 1))
r <- (cor(x, y))
txt <- format(c(r, 0.123456789), digits = digits)[1]
txt <- paste0(prefix, txt)
if(missing(cex.cor)) cex.cor <- 0.8/strwidth(txt)
text(0.5, 0.5, txt, cex = cex.cor * abs(r))
}
pairs(~sat+expend+salary+ratio+Participation_rate,data=sat_data,lower.panel = panel.smooth, upper.panel = panel.cor)
My hypothesis is almost entirely disproven from this data.
| Hypothesis | Conclusion | ||
|---|---|---|---|
| Correlation between SAT_scores & expenditure is positive | the correlation is actually negative | ||
| Correlation between SAT_scores & Teacher Salary is positive | the correlation is actually negative | ||
| Correlation between SAT_scores & Teacher/student ratio is negative | the correlation is non existent |
color_df <- sat_data
color_df$Colour <- "black"
color_df$Colour[color_df$Participation_rate<10] <- "red"
color_df$Colour[color_df$Participation_rate>55] <- "blue"
plot(color_df$sat,color_df$Participation_rate, col=color_df$Colour,pch=7)
legend("topright", legend=c("participation_rate>55","participation_rate<10"),pch=7,col=c("blue","red"))
library(tidyr)
require(tidyr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# create test/train split
smp_size <- floor(0.25 * nrow(sat_data))
set.seed(123)
train_ind <- sample(seq_len(nrow(sat_data)), size = smp_size)
train <- sat_data[train_ind, ]
test <- sat_data[-train_ind, ]
#run model and check its accuracy
linearMod <- lm(formula= sat~Participation_rate+expend, data=test)
summary(linearMod)
##
## Call:
## lm(formula = sat ~ Participation_rate + expend, data = test)
##
## Residuals:
## Min 1Q Median 3Q Max
## -82.995 -19.309 0.381 17.870 75.793
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 984.8048 26.0426 37.815 < 2e-16 ***
## Participation_rate -2.7492 0.2553 -10.769 1.18e-12 ***
## expend 12.5966 4.9092 2.566 0.0147 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 33.69 on 35 degrees of freedom
## Multiple R-squared: 0.7951, Adjusted R-squared: 0.7834
## F-statistic: 67.91 on 2 and 35 DF, p-value: 8.952e-13
AIC(linearMod)
## [1] 380.0143
predicted_sat_score <- predict(linearMod,sat_data)
actuals_preds <- data.frame(cbind(actuals=sat_data$sat, predicteds=predicted_sat_score))
correlation_accuracy <- cor(actuals_preds) #.905
# Had to comment out tidy function because I couldnt get the document to knit with it
# I set the values from TIDY to their own variables
##coeffs <- tidy(linearMod)
##coeffs
##coeffs$estimate[1]
Participation_rate_coeff <- -2.7492
expend_coeff <- 12.5966
y_inter <- 984.8048
sat_data$equal_participation <- 34.2
# add new column with regressed sat scores and standardized particpation rate of mean participation rate
sat_data$regressed_sat_scores <- y_inter+ sat_data$equal_participation*(Participation_rate_coeff) + sat_data$expend*(expend_coeff)
regression_scores <-sat_data[,c("state","expend","ratio","salary","Participation_rate","sat","regressed_sat_scores")]
regression_scores <- regression_scores[order(regression_scores$regressed_sat_scores, decreasing = TRUE),]
head(regression_scores,15)
## state expend ratio salary Participation_rate sat
## 30 New Jersey 9.774 13.8 46.087 70 898
## 32 New York 9.623 15.2 47.612 74 892
## 2 Alaska 8.963 17.6 47.951 47 934
## 7 Connecticut 8.817 14.4 50.045 81 908
## 39 Rhode Island 7.469 14.7 40.729 70 888
## 21 Massachusetts 7.287 14.8 40.795 80 907
## 20 Maryland 7.245 17.0 40.661 64 909
## 38 Pennsylvania 7.109 17.1 44.510 70 880
## 8 Delaware 7.030 16.6 39.076 68 897
## 22 Michigan 6.994 20.1 41.895 11 1033
## 49 Wisconsin 6.930 15.9 37.746 9 1073
## 45 Vermont 6.750 13.8 35.406 68 901
## 37 Oregon 6.436 19.9 38.555 51 947
## 19 Maine 6.428 13.8 31.972 68 896
## 35 Ohio 6.162 16.6 36.802 23 975
## regressed_sat_scores
## 30 1013.9013
## 32 1011.9992
## 2 1003.6855
## 7 1001.8464
## 39 984.8662
## 21 982.5736
## 20 982.0445
## 38 980.3314
## 8 979.3363
## 22 978.8828
## 49 978.0766
## 45 975.8092
## 37 971.8539
## 19 971.7531
## 35 968.4024
##plot(sat_data$regressed_sat_scores,sat_data$expend)
##plot(sat_data$regressed_sat_scores,sat_data$Participation_rate)
##pairs(~regressed_sat_scores+ratio+Participation_rate+salary,data=sat_data,
## main="Scatterplot Matrix with regressed data",col=c("red"))
panel.cor <- function(x, y, digits = 2, prefix = "", cex.cor, ...)
{
usr <- par("usr"); on.exit(par(usr))
par(usr = c(0, 1, 0, 1))
r <- (cor(x, y))
txt <- format(c(r, 0.123456789), digits = digits)[1]
txt <- paste0(prefix, txt)
if(missing(cex.cor)) cex.cor <- 0.8/strwidth(txt)
text(0.5, 0.5, txt, cex = cex.cor * abs(r))
}
pairs(~regressed_sat_scores+expend+salary+ratio+Participation_rate,data=regression_scores,lower.panel = panel.smooth, upper.panel = panel.cor)
old_salary <- cor(sat_data$sat,sat_data$salary)
old_Participation_rate <- cor(sat_data$sat,sat_data$Participation_rate)
old_ratio <- cor(sat_data$sat,sat_data$ratio)
old_expend <- cor(sat_data$sat,sat_data$expend)
old_correlations <- c(old_salary,old_Participation_rate,old_ratio,old_expend)
new_salary <-cor(sat_data$regressed_sat_scores,sat_data$salary)
new_Participation_rate <-cor(sat_data$regressed_sat_scores,sat_data$Participation_rate)
new_ratio <-cor(sat_data$regressed_sat_scores,sat_data$ratio)
new_expend <-cor(sat_data$regressed_sat_scores,sat_data$expend)
new_correlations <-c(new_salary,new_Participation_rate,new_ratio,new_expend)
comparison <- data.frame(old_correlations,new_correlations)
row.names(comparison) <- c("salary","participation_Rate","ratio","expend")
comparison
## old_correlations new_correlations
## salary -0.43988338 0.8698015
## participation_Rate -0.88711868 0.5926274
## ratio 0.08125382 -0.3710254
## expend -0.38053700 1.0000000
plot(regression_scores$salary,regression_scores$sat,col="red",pch=2)
par(new=T)
plot(regression_scores$salary,regression_scores$regressed_sat_scores, pch=9,col="blue", axes=F, ylab="") # don't overwrite
par(new=T)
axis(side=4) # add axis
mtext(side=4,line=3.8,"regressed_sat_scores") # add label
legend("topright", legend=c("sat","regressed_sat_scores"), pch=c(2,9),col=c("red","blue"))