#Links to the Datasource

NYC sat scores for 2012-https://data.cityofnewyork.us/Education/2012-SAT-Results/f9bf-2cp4
school accountability- https://data.cityofnewyork.us/Education/2006-2012-School-Demographics-and-Accountability-S/ihfw-zy9j
NYC general education survey-https://data.cityofnewyork.us/Education/2012-NYC-General-Education-School-Survey/xiyj-m4sj

#include summary of dataset 
 #

#Explorotary data analysis

# How SAT is scored-(Ehtesham)

# check the code in the Joining the three datasets section to make sure that 
 # we have the nessesary variables + any accuracy issues (Ehtesham)

# add the survey response columns from all_joined to df dataframe (laknath)

# make characrter columns numeric that have SAT scores and percentage (Abinav)

# Based on the real data deternime how we combine scores/or whether to keep seperate (Abhinav)
  # should include histogram also boxplot (abhinav)

# see if there is difference in aveage scores between just high scool vs those other types
  # see if tere is difference between male and female (Ehtesham)

# see if there is a relationship between enrollment and SAT scores-scatterplot (Laknath)

# correlation plot (multi coleniarity) -Raj
    # also include the numbers -Raj

# plot histogram of (distribution) of SAT scores -Raj/Kushboo

dfr = all_joined%>%select(DBN,
                      `SCHOOL NAME`,
                      `Num of SAT Test Takers`,
                      total_enrollment,
                      `SAT Critical Reading Avg. Score`,
                      `SAT Math Avg. Score`,
                      `SAT Writing Avg. Score`,
                      male_per,
                      female_per,
                      black_per,
                      white_per,
                      hispanic_per,
                      asian_per,
                      ell_percent,
                      frl_percent) 

dfr[] <- lapply(dfr, function(x) as.character(gsub("s", NA, x))) #replacing "s" with NA's

#Char to Num
dfr[] <- lapply(dfr, function(x) type.convert(as.character(x), as.is = TRUE))

#Error check
dfr$totgen = dfr$male_per + dfr$female_per
dfr$totdiv = dfr$black_per + dfr$white_per + dfr$hispanic_per + dfr$asian_per

#Simpson Diversity index
dfr$div = 1 - (((dfr$black_per*(dfr$black_per-1))+(dfr$white_per*(dfr$white_per-1))+(dfr$hispanic_per*(dfr$hispanic_per-1))+(dfr$asian_per*(dfr$asian_per-1)))/9900)

# Missing data check
#summary(dfr)
#apply(dfr,2,function(x) sum(is.na(x)))
#apply(dfr,2,function(x) sum(is.na(x))*100/nrow(dfr)) #percentage of NAs

percentmiss = function(x){sum(is.na(x))/length(x)*100} #percent miss row
missing = apply(dfr,1,percentmiss)
table(missing)

## missing
##                0 5.55555555555556 22.2222222222222 66.6666666666667 
##              407                5               38                8 
## 72.2222222222222 88.8888888888889 94.4444444444444 
##                1               18                1

replace = subset(dfr, missing <= 5)
missing1 = apply(replace,1,percentmiss)
table(missing1)

## missing1
##   0 
## 407

dont = subset(dfr, missing > 5)
missing2 = apply(dont,1,percentmiss)
table(missing2)

## missing2
## 5.55555555555556 22.2222222222222 66.6666666666667 72.2222222222222 
##                5               38                8                1 
## 88.8888888888889 94.4444444444444 
##               18                1

#apply(replace,2,percentmiss) #missing in column


# Outlier check
mah = mahalanobis(replace[,-c(1,2,16,17,18)],
                    colMeans(replace[,-c(1,2,16,17,18)], na.rm=TRUE),
                    cov(replace[,-c(1,2,16,17,18)], use ="pairwise.complete.obs")
                    )
#mah
cutoff = qchisq(1-.001,ncol(replace))
#print(cutoff)
summary(mah < cutoff)

##    Mode   FALSE    TRUE 
## logical      15     392

noout = subset(replace, mah < cutoff) #Eliminate outliers
#str(noout)

Correlation

# Additivity check
corrplot(cor(noout[,-c(1,2,16,17)]))

cormat = cor(noout[,-c(1,2,16,17)]) #Correlation quantified

cormat %>%
  as.data.frame() %>%
  ztable() %>% 
  makeHeatmap(mycolor = gradientColor(low="red",mid="white",high="blue")) %>%
  print(caption="Correlation Heatmap")

Correlation Heatmap
	Num of SAT Test Takers	total_enrollment	SAT Critical Reading Avg. Score	SAT Math Avg. Score	SAT Writing Avg. Score	male_per	female_per	black_per	white_per	hispanic_per	asian_per	ell_percent	frl_percent	div
Num of SAT Test Takers	1.00	0.95	0.38	0.50	0.39	-0.00	0.00	-0.28	0.39	-0.18	0.56	-0.05	-0.39	0.37
total_enrollment	0.95	1.00	0.28	0.41	0.29	0.05	-0.05	-0.26	0.35	-0.15	0.49	-0.03	-0.34	0.34
SAT Critical Reading Avg. Score	0.38	0.28	1.00	0.90	0.97	-0.15	0.15	-0.20	0.65	-0.35	0.44	-0.48	-0.73	0.42
SAT Math Avg. Score	0.50	0.41	0.90	1.00	0.91	-0.05	0.05	-0.36	0.68	-0.31	0.65	-0.26	-0.68	0.49
SAT Writing Avg. Score	0.39	0.29	0.97	0.91	1.00	-0.19	0.19	-0.26	0.68	-0.32	0.46	-0.44	-0.72	0.42
male_per	-0.00	0.05	-0.15	-0.05	-0.19	1.00	-1.00	0.02	-0.07	0.01	0.01	0.12	0.06	-0.05
female_per	0.00	-0.05	0.15	0.05	0.19	-1.00	1.00	-0.02	0.07	-0.01	-0.01	-0.12	-0.06	0.05
black_per	-0.28	-0.26	-0.20	-0.36	-0.26	0.02	-0.02	1.00	-0.42	-0.64	-0.42	-0.37	0.03	-0.46
white_per	0.39	0.35	0.65	0.68	0.68	-0.07	0.07	-0.42	1.00	-0.30	0.43	-0.13	-0.70	0.51
hispanic_per	-0.18	-0.15	-0.35	-0.31	-0.32	0.01	-0.01	-0.64	-0.30	1.00	-0.28	0.40	0.56	-0.05
asian_per	0.56	0.49	0.44	0.65	0.46	0.01	-0.01	-0.42	0.43	-0.28	1.00	0.14	-0.41	0.56
ell_percent	-0.05	-0.03	-0.48	-0.26	-0.44	0.12	-0.12	-0.37	-0.13	0.40	0.14	1.00	0.41	-0.17
frl_percent	-0.39	-0.34	-0.73	-0.68	-0.72	0.06	-0.06	0.03	-0.70	0.56	-0.41	0.41	1.00	-0.35
div	0.37	0.34	0.42	0.49	0.42	-0.05	0.05	-0.46	0.51	-0.05	0.56	-0.17	-0.35	1.00

#Normality check
plot.new()
par(mfrow=c(3, 3)); hist(noout$`SAT Critical Reading Avg. Score`, breaks=15, main = "SAT Reading", xlab=NA, ylab=NA); hist(noout$`SAT Math Avg. Score`, breaks=15, main = "SAT Math", xlab=NA, ylab=NA); hist(noout$`SAT Writing Avg. Score`, breaks=15, main = "SAT Writing", xlab=NA, ylab=NA); hist(noout$ell_percent, breaks=15, main = "ELL", xlab=NA, ylab=NA); hist(noout$frl_percent, breaks=15, main = "FRL", xlab=NA, ylab=NA); hist(noout$female_per, breaks=15, main = "Female %", xlab=NA, ylab=NA); hist(noout$white_per, breaks=15, main = "White %", xlab=NA, ylab=NA); hist(noout$black_per, breaks=15, main = "Black %", xlab=NA, ylab=NA); hist(noout$asian_per, breaks=15, main = "Asian %", xlab=NA, ylab=NA)

#apply(noout[,-c(1,2,16,17)], 2, skewness, na.rm =TRUE)
#apply(noout[,-c(1,2,16,17)], 2, kurtosis, na.rm =TRUE)

hist(noout$div, breaks=25, main = "Diversity", xlab=NA, ylab=NA)

par(mfrow=c(2, 2)); plot(noout$white_per, noout$asian_per); plot(noout$white_per, noout$black_per); plot(noout$hispanic_per, noout$black_per); plot(noout$hispanic_per, noout$asian_per)

plot(noout$white_per, noout$`SAT Writing Avg. Score`)

NYSAT

EKLAR

null

Correlation