#Links to the Datasource

#include summary of dataset 
 #   

#Explorotary data analysis

# How SAT is scored-(Ehtesham)

# check the code in the Joining the three datasets section to make sure that 
 # we have the nessesary variables + any accuracy issues (Ehtesham)

# add the survey response columns from all_joined to df dataframe (laknath)

# make characrter columns numeric that have SAT scores and percentage (Abinav)

# Based on the real data deternime how we combine scores/or whether to keep seperate (Abhinav)
  # should include histogram also boxplot (abhinav)

# see if there is difference in aveage scores between just high scool vs those other types
  # see if tere is difference between male and female (Ehtesham)

# see if there is a relationship between enrollment and SAT scores-scatterplot (Laknath)

# correlation plot (multi coleniarity) -Raj
    # also include the numbers -Raj

# plot histogram of (distribution) of SAT scores -Raj/Kushboo
dfr = all_joined%>%select(DBN,
                      `SCHOOL NAME`,
                      `Num of SAT Test Takers`,
                      total_enrollment,
                      `SAT Critical Reading Avg. Score`,
                      `SAT Math Avg. Score`,
                      `SAT Writing Avg. Score`,
                      male_per,
                      female_per,
                      black_per,
                      white_per,
                      hispanic_per,
                      asian_per,
                      ell_percent,
                      frl_percent) 

dfr[] <- lapply(dfr, function(x) as.character(gsub("s", NA, x))) #replacing "s" with NA's

#Char to Num
dfr[] <- lapply(dfr, function(x) type.convert(as.character(x), as.is = TRUE))

#Error check
dfr$totgen = dfr$male_per + dfr$female_per
dfr$totdiv = dfr$black_per + dfr$white_per + dfr$hispanic_per + dfr$asian_per

#Simpson Diversity index
dfr$div = 1 - (((dfr$black_per*(dfr$black_per-1))+(dfr$white_per*(dfr$white_per-1))+(dfr$hispanic_per*(dfr$hispanic_per-1))+(dfr$asian_per*(dfr$asian_per-1)))/9900)

# Missing data check
#summary(dfr)
#apply(dfr,2,function(x) sum(is.na(x)))
#apply(dfr,2,function(x) sum(is.na(x))*100/nrow(dfr)) #percentage of NAs

percentmiss = function(x){sum(is.na(x))/length(x)*100} #percent miss row
missing = apply(dfr,1,percentmiss)
table(missing)
## missing
##                0 5.55555555555556 22.2222222222222 66.6666666666667 
##              407                5               38                8 
## 72.2222222222222 88.8888888888889 94.4444444444444 
##                1               18                1
replace = subset(dfr, missing <= 5)
missing1 = apply(replace,1,percentmiss)
table(missing1)
## missing1
##   0 
## 407
dont = subset(dfr, missing > 5)
missing2 = apply(dont,1,percentmiss)
table(missing2)
## missing2
## 5.55555555555556 22.2222222222222 66.6666666666667 72.2222222222222 
##                5               38                8                1 
## 88.8888888888889 94.4444444444444 
##               18                1
#apply(replace,2,percentmiss) #missing in column


# Outlier check
mah = mahalanobis(replace[,-c(1,2,16,17,18)],
                    colMeans(replace[,-c(1,2,16,17,18)], na.rm=TRUE),
                    cov(replace[,-c(1,2,16,17,18)], use ="pairwise.complete.obs")
                    )
#mah
cutoff = qchisq(1-.001,ncol(replace))
#print(cutoff)
summary(mah < cutoff)
##    Mode   FALSE    TRUE 
## logical      15     392
noout = subset(replace, mah < cutoff) #Eliminate outliers
#str(noout)

Correlation

# Additivity check
corrplot(cor(noout[,-c(1,2,16,17)]))

cormat = cor(noout[,-c(1,2,16,17)]) #Correlation quantified

cormat %>%
  as.data.frame() %>%
  ztable() %>% 
  makeHeatmap(mycolor = gradientColor(low="red",mid="white",high="blue")) %>%
  print(caption="Correlation Heatmap")
Correlation Heatmap
  Num of SAT Test Takers total_enrollment SAT Critical Reading Avg. Score SAT Math Avg. Score SAT Writing Avg. Score male_per female_per black_per white_per hispanic_per asian_per ell_percent frl_percent div
Num of SAT Test Takers 1.00 0.95 0.38 0.50 0.39 -0.00 0.00 -0.28 0.39 -0.18 0.56 -0.05 -0.39 0.37
total_enrollment 0.95 1.00 0.28 0.41 0.29 0.05 -0.05 -0.26 0.35 -0.15 0.49 -0.03 -0.34 0.34
SAT Critical Reading Avg. Score 0.38 0.28 1.00 0.90 0.97 -0.15 0.15 -0.20 0.65 -0.35 0.44 -0.48 -0.73 0.42
SAT Math Avg. Score 0.50 0.41 0.90 1.00 0.91 -0.05 0.05 -0.36 0.68 -0.31 0.65 -0.26 -0.68 0.49
SAT Writing Avg. Score 0.39 0.29 0.97 0.91 1.00 -0.19 0.19 -0.26 0.68 -0.32 0.46 -0.44 -0.72 0.42
male_per -0.00 0.05 -0.15 -0.05 -0.19 1.00 -1.00 0.02 -0.07 0.01 0.01 0.12 0.06 -0.05
female_per 0.00 -0.05 0.15 0.05 0.19 -1.00 1.00 -0.02 0.07 -0.01 -0.01 -0.12 -0.06 0.05
black_per -0.28 -0.26 -0.20 -0.36 -0.26 0.02 -0.02 1.00 -0.42 -0.64 -0.42 -0.37 0.03 -0.46
white_per 0.39 0.35 0.65 0.68 0.68 -0.07 0.07 -0.42 1.00 -0.30 0.43 -0.13 -0.70 0.51
hispanic_per -0.18 -0.15 -0.35 -0.31 -0.32 0.01 -0.01 -0.64 -0.30 1.00 -0.28 0.40 0.56 -0.05
asian_per 0.56 0.49 0.44 0.65 0.46 0.01 -0.01 -0.42 0.43 -0.28 1.00 0.14 -0.41 0.56
ell_percent -0.05 -0.03 -0.48 -0.26 -0.44 0.12 -0.12 -0.37 -0.13 0.40 0.14 1.00 0.41 -0.17
frl_percent -0.39 -0.34 -0.73 -0.68 -0.72 0.06 -0.06 0.03 -0.70 0.56 -0.41 0.41 1.00 -0.35
div 0.37 0.34 0.42 0.49 0.42 -0.05 0.05 -0.46 0.51 -0.05 0.56 -0.17 -0.35 1.00
#Normality check
plot.new()
par(mfrow=c(3, 3)); hist(noout$`SAT Critical Reading Avg. Score`, breaks=15, main = "SAT Reading", xlab=NA, ylab=NA); hist(noout$`SAT Math Avg. Score`, breaks=15, main = "SAT Math", xlab=NA, ylab=NA); hist(noout$`SAT Writing Avg. Score`, breaks=15, main = "SAT Writing", xlab=NA, ylab=NA); hist(noout$ell_percent, breaks=15, main = "ELL", xlab=NA, ylab=NA); hist(noout$frl_percent, breaks=15, main = "FRL", xlab=NA, ylab=NA); hist(noout$female_per, breaks=15, main = "Female %", xlab=NA, ylab=NA); hist(noout$white_per, breaks=15, main = "White %", xlab=NA, ylab=NA); hist(noout$black_per, breaks=15, main = "Black %", xlab=NA, ylab=NA); hist(noout$asian_per, breaks=15, main = "Asian %", xlab=NA, ylab=NA)

#apply(noout[,-c(1,2,16,17)], 2, skewness, na.rm =TRUE)
#apply(noout[,-c(1,2,16,17)], 2, kurtosis, na.rm =TRUE)

hist(noout$div, breaks=25, main = "Diversity", xlab=NA, ylab=NA)

par(mfrow=c(2, 2)); plot(noout$white_per, noout$asian_per); plot(noout$white_per, noout$black_per); plot(noout$hispanic_per, noout$black_per); plot(noout$hispanic_per, noout$asian_per)

plot(noout$white_per, noout$`SAT Writing Avg. Score`)