OVERVIEW

The societal impact of income is well known. Can sports shape a childs life who otherwise would not be exposed to the positive influences and challenges that sports can bring.

Objective

In this data anaylsis we try to answer the question What is the future impact for children who play sports vs children that do not play sports?

Approach

We will use data from City of New York government site from 2013-2014. One dataset will list schools in NYC with their sports programs and the second dataset will show school grades

https://data.cityofnewyork.us/Education/DOE-High-School-Directory-2013-2014/u553-m549/data

DATA EXPLORATION

Import Data

After review of data, it was easier to deal with N/A and missing values during import rather than during data cleanup. Added na.strings=c("“,”N/A")

Data Prepartion

In data prepartion we are renaming DBN columns that were changed during the import. We get a subset of data of interest, renamed some columns to shorter names then merge the sports dataset with the grade dataset. We also convert the school grade columns to numerice by removing the % sign. Lastly, we creat a sports code column to show a 1 for schools with sports programs and 0 with schools wihtout sports programs.

#Set N/A rows for school sports column to NA
#nrow(subset(schlsprts,is.na(schlsprts$School.Sports)))
#is.na(schlsprts$School.Sports)<-schlsprts$School.Sports == "N/A"
#nrow(subset(schlsprts,is.na(schlsprts$School.Sports)))

#Rename DBA Columns
names(schlsprts)[names(schlsprts) == "ï..DBN"] <- "DBN"
names(schlgrds)[names(schlgrds) == "ï..DBN"] <- "DBN"

#Select subset columns and rename
sub_schlsprts<-subset(schlsprts, select = c(DBN,BN,Printed_Name,Total.Student.10.26,School_Type,Language.Classes,Advanced.Placement.Courses,Extra_Leadership.Support,School.Sports,Online.AP.Courses,Online.Language.Courses,School_Type2,neighborhood,postalCode,precinct,school_district,latitude,longitude,Location.1,Borough,Council.District,Census.Tract))

names(sub_schlsprts)<-c('DBN','BN','Printed_Name','TotStudnts','School_Type','LangClass','APCourses','LeaderSupport','Sports','OnlineAP','OnlineLang','School_Type2','neighborhood','postalCode','precinct','school_district','lat','lon','Location','Boro','CouncilDistrict','CensusTract')

sub_schlgrds<-subset(schlgrds, select = c(DBN,graduation.2010.11,college.enroll.2010.11,graduation.2011.12,college.enroll.2011.12))

names(sub_schlgrds)<-c('DBN','grad2011','collenroll2011','grad2012','collenroll2012')

#get subset of rows with School sports NA or N/A values
#subset(schlsprts,is.na(schlsprts$School.Sports)|schlsprts$School.Sports == "N/A")
#set sumarry columns from characters to numeric
sub_schlgrds[,-1]<-lapply(sub_schlgrds[,-1], function(x) as.numeric(sub("%","",x)))

#Merge School and sports details with summary data from performance details
mdata<-merge(sub_schlsprts, sub_schlgrds, by="DBN")

#Add column to show if a school as sports programs
mdata$sprtcode<- as.factor(ifelse(is.na(mdata$Sports), 0, 1))

Data Summary

The summary summarises the numeric coluns in the grade dataset and provides a glimpse of the merged dataset.

##      DBN               grad2011      collenroll2011      grad2012    
##  Length:422         Min.   : 41.20   Min.   : 14.10   Min.   : 25.5  
##  Class :character   1st Qu.: 62.52   1st Qu.: 39.10   1st Qu.: 63.5  
##  Mode  :character   Median : 72.60   Median : 48.90   Median : 73.5  
##                     Mean   : 73.80   Mean   : 53.12   Mean   : 73.7  
##                     3rd Qu.: 85.08   3rd Qu.: 65.60   3rd Qu.: 84.7  
##                     Max.   :100.00   Max.   :100.00   Max.   :100.0  
##                     NA's   :112      NA's   :131      NA's   :85     
##  collenroll2012  
##  Min.   :  8.90  
##  1st Qu.: 40.23  
##  Median : 50.35  
##  Mean   : 53.93  
##  3rd Qu.: 66.28  
##  Max.   :100.00  
##  NA's   :108
## Observations: 422
## Variables: 27
## $ DBN             <chr> "01M292", "01M448", "01M450", "01M509", "01M539", "...
## $ BN              <chr> "M292", "M448", "M450", "M509", "M539", "M696", "M0...
## $ Printed_Name    <chr> "Henry Street School for International Studies", "U...
## $ TotStudnts      <chr> "388", "331", "636", "367", "1,672", "601", "189", ...
## $ School_Type     <chr> NA, NA, NA, NA, NA, NA, NA, "CTE, New School", "CTE...
## $ LangClass       <chr> "Chinese (Mandarin), Spanish", "Chinese, Spanish", ...
## $ APCourses       <chr> "Psychology", "Calculus AB, Chinese Language and Cu...
## $ LeaderSupport   <chr> "Math through Card Play, Poetry Club, Drama Club, B...
## $ Sports          <chr> "Boxing, CHAMPS, Double Dutch, Table tennis, Track ...
## $ OnlineAP        <chr> "Chinese Language and Culture, Spanish Literature",...
## $ OnlineLang      <chr> "Chinese (Mandarin), Spanish", NA, "American Sign L...
## $ School_Type2    <chr> NA, NA, NA, NA, NA, NA, NA, "CTE, New School", "CTE...
## $ neighborhood    <chr> "Lower East Side", "Lower East Side", "East Village...
## $ postalCode      <chr> "10,002", "10,002", "10,009", "10,002", "10,002", "...
## $ precinct        <int> 7, 7, 9, 7, 7, 7, 13, 18, 5, 18, 7, 18, 5, 18, 18, ...
## $ school_district <int> 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, ...
## $ lat             <dbl> 40.71348, 40.71223, 40.72985, 40.72055, 40.71949, 4...
## $ lon             <dbl> -73.98532, -73.98382, -73.98315, -73.98568, -73.979...
## $ Location        <chr> "220 Henry Street\nNew York, NY 10002\n(40.7134809,...
## $ Boro            <chr> "MANHATTAN", "MANHATTAN", "MANHATTAN", "MANHATTAN",...
## $ CouncilDistrict <int> 1, 1, 2, 1, 2, 2, 2, 3, 1, 3, 1, 3, 1, 3, 3, 1, 1, ...
## $ CensusTract     <chr> "201", "202", "34", "3,001", "2,201", "1,002", "64"...
## $ grad2011        <dbl> 56.3, 70.7, 71.6, 56.4, 95.3, 97.6, 69.6, NA, NA, 8...
## $ collenroll2011  <dbl> 51.9, 36.3, 69.2, 47.7, 87.0, 95.7, 47.1, NA, NA, 6...
## $ grad2012        <dbl> 50.7, 74.4, 72.7, 59.5, 97.5, 96.6, 54.8, NA, NA, 7...
## $ collenroll2012  <dbl> 40.0, 67.3, 55.7, 47.9, 82.8, 96.8, 30.4, NA, NA, 5...
## $ sprtcode        <fct> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...

Modeling

We begin with a model of sports code vs all of the college grad columns

## 
## Call:
## glm(formula = sprtcode ~ grad2011 + collenroll2011 + grad2012 + 
##     collenroll2012, family = binomial(link = "logit"), data = mdata)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.7624  -1.4008   0.7847   0.8857   1.1624  
## 
## Coefficients:
##                Estimate Std. Error z value Pr(>|z|)   
## (Intercept)     2.33139    0.86454   2.697   0.0070 **
## grad2011       -0.01577    0.02202  -0.716   0.4738   
## collenroll2011 -0.01012    0.01410  -0.718   0.4729   
## grad2012       -0.02264    0.01938  -1.168   0.2426   
## collenroll2012  0.03427    0.01735   1.975   0.0483 * 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 359.24  on 289  degrees of freedom
## Residual deviance: 353.62  on 285  degrees of freedom
##   (132 observations deleted due to missingness)
## AIC: 363.62
## 
## Number of Fisher Scoring iterations: 4

Visualizations

In the visualizaitons we start by showing the columns with missing data, wiht some basic residual plots.

The comparison of graduate to college enroll for 2011 and 2012 shows there is an increase in percentage as more sports programs are added.

These qq density plots hint that there may be a link to a sports vs students that graduate and go to college where schools have 80% or more students in that category.

These histogram filter for >80% and split the sport vs no sport programs.

I need to do more modeling to see what other predictors should be added to show if a combination with sports would have more impact to a students probability to graduate and apply to college. Also, i would do more of an web interactive presentation. This is just a static draft

APPENDIX

Code used in analysis

knitr::opts_chunk$set(
    echo = FALSE,
    message = FALSE,
    warning = FALSE
)
require(knitr)
library(ggplot2)
library(tidyr)
library(MASS)
library(psych)
library(kableExtra)
library(dplyr)
library(faraway)
library(gridExtra)
library(reshape2)
library(leaps)
library(pROC)
library(caret)
library(naniar)
library(pander)
library(pROC)
library(plotly)
library(effects)
schlsprts <- read.csv("DOE_High_School_Directory_2013-2014.csv",na.strings=c("","N/A"),header=TRUE, stringsAsFactors = FALSE)
schlgrds <- read.csv("2013_-_2014_DOE_High_School_Performance-Directory.csv",na.strings=c("","N/A"),header=TRUE, stringsAsFactors = FALSE)

#Set N/A rows for school sports column to NA
#nrow(subset(schlsprts,is.na(schlsprts$School.Sports)))
#is.na(schlsprts$School.Sports)<-schlsprts$School.Sports == "N/A"
#nrow(subset(schlsprts,is.na(schlsprts$School.Sports)))

#Rename DBA Columns
names(schlsprts)[names(schlsprts) == "ï..DBN"] <- "DBN"
names(schlgrds)[names(schlgrds) == "ï..DBN"] <- "DBN"

#Select subset columns and rename
sub_schlsprts<-subset(schlsprts, select = c(DBN,BN,Printed_Name,Total.Student.10.26,School_Type,Language.Classes,Advanced.Placement.Courses,Extra_Leadership.Support,School.Sports,Online.AP.Courses,Online.Language.Courses,School_Type2,neighborhood,postalCode,precinct,school_district,latitude,longitude,Location.1,Borough,Council.District,Census.Tract))

names(sub_schlsprts)<-c('DBN','BN','Printed_Name','TotStudnts','School_Type','LangClass','APCourses','LeaderSupport','Sports','OnlineAP','OnlineLang','School_Type2','neighborhood','postalCode','precinct','school_district','lat','lon','Location','Boro','CouncilDistrict','CensusTract')

sub_schlgrds<-subset(schlgrds, select = c(DBN,graduation.2010.11,college.enroll.2010.11,graduation.2011.12,college.enroll.2011.12))

names(sub_schlgrds)<-c('DBN','grad2011','collenroll2011','grad2012','collenroll2012')

#get subset of rows with School sports NA or N/A values
#subset(schlsprts,is.na(schlsprts$School.Sports)|schlsprts$School.Sports == "N/A")
#set sumarry columns from characters to numeric
sub_schlgrds[,-1]<-lapply(sub_schlgrds[,-1], function(x) as.numeric(sub("%","",x)))

#Merge School and sports details with summary data from performance details
mdata<-merge(sub_schlsprts, sub_schlgrds, by="DBN")

#Add column to show if a school as sports programs
mdata$sprtcode<- as.factor(ifelse(is.na(mdata$Sports), 0, 1))
summary(sub_schlgrds)
glimpse(mdata)

glmod<-glm(sprtcode~grad2011+collenroll2011+grad2012+collenroll2012,data=mdata,family="binomial"(link="logit"))
summary(glmod)
vis_miss(mdata)
gg_miss_upset(mdata)
par(mfrow=(c(2,2)))
plot(glmod)

ggplot(mdata, aes(x=grad2011, y=collenroll2011, color = sprtcode)) +geom_point() +stat_smooth(method="lm", se=TRUE)
ggplot(mdata, aes(x=grad2012, y=collenroll2012, color = sprtcode)) +geom_point() +stat_smooth(method="lm", se=TRUE)


qplot(grad2011, data = mdata, geom = "density",
      fill = sprtcode)
qplot(grad2012, data = mdata, geom = "density",
      fill = sprtcode)
qplot(collenroll2011, data = mdata, geom = "density",
      fill = sprtcode)
qplot(collenroll2012, data = mdata, geom = "density",
      fill = sprtcode)

qplot(grad2011, data = subset(mdata,grad2011 > 80), geom = "histogram",fill = sprtcode)+facet_wrap(~ sprtcode) 
qplot(grad2012, data = subset(mdata,grad2012 > 80), geom = "histogram",
      fill = sprtcode)+facet_wrap(~ sprtcode) 
qplot(collenroll2011, data = subset(mdata,collenroll2011 > 80), geom = "histogram",
      fill = sprtcode)+facet_wrap(~ sprtcode) 
qplot(collenroll2012, data = subset(mdata,collenroll2012 > 80), geom = "histogram",
      fill = sprtcode)+facet_wrap(~ sprtcode)