#Links to the Datasource
NYC sat scores for 2012-https://data.cityofnewyork.us/Education/2012-SAT-Results/f9bf-2cp4
school accountability- https://data.cityofnewyork.us/Education/2006-2012-School-Demographics-and-Accountability-S/ihfw-zy9j
NYC general education survey-https://data.cityofnewyork.us/Education/2012-NYC-General-Education-School-Survey/xiyj-m4sj
#include summary of dataset
#
#Explorotary data analysis
# How SAT is scored-(Ehtesham)
# check the code in the Joining the three datasets section to make sure that
# we have the nessesary variables + any accuracy issues (Ehtesham)
# add the survey response columns from all_joined to df dataframe (laknath)
# make characrter columns numeric that have SAT scores and percentage (Abinav)
# Based on the real data deternime how we combine scores/or whether to keep seperate (Abhinav)
# should include histogram also boxplot (abhinav)
# see if there is difference in aveage scores between just high scool vs those other types
# see if tere is difference between male and female (Ehtesham)
# see if there is a relationship between enrollment and SAT scores-scatterplot (Laknath)
# correlation plot (multi coleniarity) -Raj
# also include the numbers -Raj
# plot histogram of (distribution) of SAT scores -Raj/Kushboo
dfr = all_joined%>%select(DBN,
`SCHOOL NAME`,
`Num of SAT Test Takers`,
total_enrollment,
`SAT Critical Reading Avg. Score`,
`SAT Math Avg. Score`,
`SAT Writing Avg. Score`,
male_per,
female_per,
black_per,
white_per,
hispanic_per,
asian_per,
ell_percent,
frl_percent)
dfr[] <- lapply(dfr, function(x) as.character(gsub("s", NA, x))) #replacing "s" with NA's
#Char to Num
dfr[] <- lapply(dfr, function(x) type.convert(as.character(x), as.is = TRUE))
#Error check
dfr$totgen = dfr$male_per + dfr$female_per
dfr$totdiv = dfr$black_per + dfr$white_per + dfr$hispanic_per + dfr$asian_per
#Simpson Diversity index
dfr$div = 1 - (((dfr$black_per*(dfr$black_per-1))+(dfr$white_per*(dfr$white_per-1))+(dfr$hispanic_per*(dfr$hispanic_per-1))+(dfr$asian_per*(dfr$asian_per-1)))/9900)
# Missing data check
#summary(dfr)
#apply(dfr,2,function(x) sum(is.na(x)))
#apply(dfr,2,function(x) sum(is.na(x))*100/nrow(dfr)) #percentage of NAs
percentmiss = function(x){sum(is.na(x))/length(x)*100} #percent miss row
missing = apply(dfr,1,percentmiss)
table(missing)
## missing
## 0 5.55555555555556 22.2222222222222 66.6666666666667
## 407 5 38 8
## 72.2222222222222 88.8888888888889 94.4444444444444
## 1 18 1
replace = subset(dfr, missing <= 5)
missing1 = apply(replace,1,percentmiss)
table(missing1)
## missing1
## 0
## 407
dont = subset(dfr, missing > 5)
missing2 = apply(dont,1,percentmiss)
table(missing2)
## missing2
## 5.55555555555556 22.2222222222222 66.6666666666667 72.2222222222222
## 5 38 8 1
## 88.8888888888889 94.4444444444444
## 18 1
#apply(replace,2,percentmiss) #missing in column
# Outlier check
mah = mahalanobis(replace[,-c(1,2,16,17,18)],
colMeans(replace[,-c(1,2,16,17,18)], na.rm=TRUE),
cov(replace[,-c(1,2,16,17,18)], use ="pairwise.complete.obs")
)
#mah
cutoff = qchisq(1-.001,ncol(replace))
#print(cutoff)
summary(mah < cutoff)
## Mode FALSE TRUE
## logical 15 392
noout = subset(replace, mah < cutoff) #Eliminate outliers
#str(noout)
# Additivity check
corrplot(cor(noout[,-c(1,2,16,17)]))
cormat = cor(noout[,-c(1,2,16,17)]) #Correlation quantified
cormat %>%
as.data.frame() %>%
ztable() %>%
makeHeatmap(mycolor = gradientColor(low="red",mid="white",high="blue")) %>%
print(caption="Correlation Heatmap")
| Â | Num of SAT Test Takers | total_enrollment | SAT Critical Reading Avg. Score | SAT Math Avg. Score | SAT Writing Avg. Score | male_per | female_per | black_per | white_per | hispanic_per | asian_per | ell_percent | frl_percent | div |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Num of SAT Test Takers | 1.00 | 0.95 | 0.38 | 0.50 | 0.39 | -0.00 | 0.00 | -0.28 | 0.39 | -0.18 | 0.56 | -0.05 | -0.39 | 0.37 |
#Normality check
plot.new()
par(mfrow=c(3, 3)); hist(noout$`SAT Critical Reading Avg. Score`, breaks=15, main = "SAT Reading", xlab=NA, ylab=NA); hist(noout$`SAT Math Avg. Score`, breaks=15, main = "SAT Math", xlab=NA, ylab=NA); hist(noout$`SAT Writing Avg. Score`, breaks=15, main = "SAT Writing", xlab=NA, ylab=NA); hist(noout$ell_percent, breaks=15, main = "ELL", xlab=NA, ylab=NA); hist(noout$frl_percent, breaks=15, main = "FRL", xlab=NA, ylab=NA); hist(noout$female_per, breaks=15, main = "Female %", xlab=NA, ylab=NA); hist(noout$white_per, breaks=15, main = "White %", xlab=NA, ylab=NA); hist(noout$black_per, breaks=15, main = "Black %", xlab=NA, ylab=NA); hist(noout$asian_per, breaks=15, main = "Asian %", xlab=NA, ylab=NA)
#apply(noout[,-c(1,2,16,17)], 2, skewness, na.rm =TRUE)
#apply(noout[,-c(1,2,16,17)], 2, kurtosis, na.rm =TRUE)
hist(noout$div, breaks=25, main = "Diversity", xlab=NA, ylab=NA)
par(mfrow=c(2, 2)); plot(noout$white_per, noout$asian_per); plot(noout$white_per, noout$black_per); plot(noout$hispanic_per, noout$black_per); plot(noout$hispanic_per, noout$asian_per)
plot(noout$white_per, noout$`SAT Writing Avg. Score`)