- Research Question: Do environmental factors (like air quality) impact a student’s academic performance?
2022-11-23
dbn_url <- "https://raw.githubusercontent.com/andrewbowen19/cunyDATA607/main/projects/final_project/2006_-_2012__Math_Test_Results__-_All_Students.csv" tests <- read.csv(dbn_url) # head(tests)
Looking up school district GIS data from NYC Open Data. School point locations can be found here, we’ll download them and unzip the file to get the shp file
# Create temporary files temp <- tempfile() temp2 <- tempfile() # Download the zip file and save to 'temp' school_locs_url <- "https://data.cityofnewyork.us/download/jfju-ynrr/application%2Fzip" download.file(school_locs_url, temp) # Unzip the folder contents and then read in the shp file to a dataframe unzip(zipfile = temp, exdir = temp2) school_locs <- sf::read_sf(temp2)
## Simple feature collection with 6 features and 19 fields ## Geometry type: POINT ## Dimension: XY ## Bounding box: xmin: -74.01177 ymin: 40.58348 xmax: -73.95122 ymax: 40.65642 ## Geodetic CRS: WGS 84 ## # A tibble: 6 × 20 ## DBN BORO BORONUM LOC_CODE SCHOOL…¹ SCH_T…² MANAG…³ GEO_D…⁴ ADMIN…⁵ ADDRESS ## <chr> <chr> <dbl> <chr> <chr> <chr> <dbl> <dbl> <dbl> <chr> ## 1 15K001 K 2 K001 P.S. 00… Elemen… 1 15 15 309 47… ## 2 17K002 K 2 K002 M.S. 002 Junior… 1 17 17 655 PA… ## 3 21K095 K 2 K095 P.S. 09… K-8 1 21 21 345 VA… ## 4 21K096 K 2 K096 I.S. 09… Junior… 1 21 21 99 AVE… ## 5 21K097 K 2 K097 P.S. 97… Elemen… 1 21 21 1855 S… ## 6 21K098 K 2 K098 I.S. 98… Junior… 1 21 21 1401 E… ## # … with 10 more variables: STATE_CODE <chr>, ZIP <dbl>, PRINCIPAL <chr>, ## # PRIN_PH <chr>, FAX <chr>, GRADES <chr>, City <chr>, geometry <POINT [°]>, ## # lat <dbl>, lon <dbl>, and abbreviated variable names ¹SCHOOLNAME, ## # ²SCH_TYPE, ³MANAGED_BY, ⁴GEO_DISTRI, ⁵ADMIN_DIST
Let’s join our school_locs dataframe to our tests
# Adding in borough lookup as well
school_boro_lookup = data.frame(BORO=c("K", "M", "Q", "R", "X"),
borough=c("Brooklyn", "Manhattan", "Queens", "Staten Island", "Bronx")
)
school_data <- as.data.frame(left_join(tests, school_locs, by="DBN")) %>%
left_join(., school_boro_lookup, by="BORO") %>%
rename("year" = "Year") %>%
mutate(year = as.character(year))
head(school_data)
## DBN Grade year Demographic Number.Tested Mean.Scale.Score Num.Level.1 ## 1 01M015 3 2006 All Students 39 667 2 ## 2 01M015 3 2007 All Students 31 672 2 ## 3 01M015 3 2008 All Students 37 668 0 ## 4 01M015 3 2009 All Students 33 668 0 ## 5 01M015 3 2010 All Students 26 677 6 ## 6 01M015 3 2011 All Students 28 671 10 ## Pct.Level.1 Num.Level.2 Pct.Level.2 Num.Level.3 Pct.Level.3 Num.Level.4 ## 1 5.1 11 28.2 20 51.3 6 ## 2 6.5 3 9.7 22 71 4 ## 3 0 6 16.2 29 78.4 2 ## 4 0 4 12.1 28 84.8 1 ## 5 23.1 12 46.2 6 23.1 2 ## 6 35.7 13 46.4 5 17.9 0 ## Pct.Level.4 Num.Level.3.and.4 Pct.Level.3.and.4 BORO BORONUM LOC_CODE ## 1 15.4 26 66.7 M 1 M015 ## 2 12.9 26 83.9 M 1 M015 ## 3 5.4 31 83.8 M 1 M015 ## 4 3 29 87.9 M 1 M015 ## 5 7.7 8 30.8 M 1 M015 ## 6 0 5 17.9 M 1 M015 ## SCHOOLNAME SCH_TYPE MANAGED_BY GEO_DISTRI ADMIN_DIST ## 1 P.S. 015 ROBERTO CLEMENTE Elementary 1 1 1 ## 2 P.S. 015 ROBERTO CLEMENTE Elementary 1 1 1 ## 3 P.S. 015 ROBERTO CLEMENTE Elementary 1 1 1 ## 4 P.S. 015 ROBERTO CLEMENTE Elementary 1 1 1 ## 5 P.S. 015 ROBERTO CLEMENTE Elementary 1 1 1 ## 6 P.S. 015 ROBERTO CLEMENTE Elementary 1 1 1 ## ADDRESS STATE_CODE ZIP PRINCIPAL PRIN_PH FAX ## 1 333 EAST 4 STREET NY 10009 IRENE SANCHEZ 212-228-8730 212-477-0931 ## 2 333 EAST 4 STREET NY 10009 IRENE SANCHEZ 212-228-8730 212-477-0931 ## 3 333 EAST 4 STREET NY 10009 IRENE SANCHEZ 212-228-8730 212-477-0931 ## 4 333 EAST 4 STREET NY 10009 IRENE SANCHEZ 212-228-8730 212-477-0931 ## 5 333 EAST 4 STREET NY 10009 IRENE SANCHEZ 212-228-8730 212-477-0931 ## 6 333 EAST 4 STREET NY 10009 IRENE SANCHEZ 212-228-8730 212-477-0931 ## GRADES City geometry lat ## 1 PK,0K,01,02,03,04,05,SE MANHATTAN POINT (-73.97881 40.72189) 40.72189 ## 2 PK,0K,01,02,03,04,05,SE MANHATTAN POINT (-73.97881 40.72189) 40.72189 ## 3 PK,0K,01,02,03,04,05,SE MANHATTAN POINT (-73.97881 40.72189) 40.72189 ## 4 PK,0K,01,02,03,04,05,SE MANHATTAN POINT (-73.97881 40.72189) 40.72189 ## 5 PK,0K,01,02,03,04,05,SE MANHATTAN POINT (-73.97881 40.72189) 40.72189 ## 6 PK,0K,01,02,03,04,05,SE MANHATTAN POINT (-73.97881 40.72189) 40.72189 ## lon borough ## 1 -73.97881 Manhattan ## 2 -73.97881 Manhattan ## 3 -73.97881 Manhattan ## 4 -73.97881 Manhattan ## 5 -73.97881 Manhattan ## 6 -73.97881 Manhattan
Using this NYC Open Data Air Quality Dataset from their API. Going to call their API
borough column to our Air quality dataset# Getting data from API call
# https://data.cityofnewyork.us/Environment/Air-Quality/c3uy-2p5r
nyc_open_data_aq_url <- "https://data.cityofnewyork.us/resource/c3uy-2p5r.json"
nyc_aq <- as.data.frame(fromJSON(nyc_open_data_aq_url))
# Adding in borough lookup code to compare to school data
# Also adding year column to compare with testing data
borough_lookup <- data.frame(borough_code = c("1", "2", "3", "4", "5"),
borough=c("Bronx",
"Brooklyn",
"Manhattan",
"Queens",
"Staten Island"))
nyc_aq <- nyc_aq %>%
mutate(borough_code = str_extract(geo_join_id, "^\\d"),
year = str_extract(start_date, "^\\d{4}")) %>%
left_join(., borough_lookup, by="borough_code")
glimpse(nyc_aq)
## Rows: 1,000 ## Columns: 14 ## $ unique_id <chr> "216498", "216499", "219969", "219970", "164876", "1648… ## $ indicator_id <chr> "386", "386", "386", "386", "383", "383", "386", "386",… ## $ name <chr> "Ozone (O3)", "Ozone (O3)", "Ozone (O3)", "Ozone (O3)",… ## $ measure <chr> "Mean", "Mean", "Mean", "Mean", "Mean", "Mean", "Mean",… ## $ measure_info <chr> "ppb", "ppb", "ppb", "ppb", "ppb", "ppb", "ppb", "ppb",… ## $ geo_type_name <chr> "CD", "CD", "Borough", "Borough", "CD", "CD", "Borough"… ## $ geo_join_id <chr> "313", "313", "1", "1", "211", "212", "2", "2", "301", … ## $ geo_place_name <chr> "Coney Island (CD13)", "Coney Island (CD13)", "Bronx", … ## $ time_period <chr> "Summer 2013", "Summer 2014", "Summer 2013", "Summer 20… ## $ start_date <chr> "2013-06-01T00:00:00.000", "2014-06-01T00:00:00.000", "… ## $ data_value <chr> "34.64", "33.22", "31.25", "31.15", "5.89", "5.75", "26… ## $ borough_code <chr> "3", "3", "1", "1", "2", "2", "2", "2", "3", "3", "3", … ## $ year <chr> "2013", "2014", "2013", "2014", "2008", "2008", "2009",… ## $ borough <chr> "Manhattan", "Manhattan", "Bronx", "Bronx", "Brooklyn",…
df <- left_join(school_data, nyc_aq, by=c("borough", "year"))
glimpse(df)
## Rows: 307,403 ## Columns: 61 ## $ DBN <chr> … ## $ Grade <chr> … ## $ year <chr> … ## $ Demographic <chr> … ## $ Number.Tested <int> … ## $ Mean.Scale.Score <chr> … ## $ Num.Level.1 <chr> … ## $ Pct.Level.1 <chr> … ## $ Num.Level.2 <chr> … ## $ Pct.Level.2 <chr> … ## $ Num.Level.3 <chr> … ## $ Pct.Level.3 <chr> … ## $ Num.Level.4 <chr> … ## $ Pct.Level.4 <chr> … ## $ Num.Level.3.and.4 <chr> … ## $ Pct.Level.3.and.4 <chr> … ## $ BORO <chr> … ## $ BORONUM <dbl> … ## $ LOC_CODE <chr> … ## $ SCHOOLNAME <chr> … ## $ SCH_TYPE <chr> … ## $ MANAGED_BY <dbl> … ## $ GEO_DISTRI <dbl> … ## $ ADMIN_DIST <dbl> … ## $ ADDRESS <chr> … ## $ STATE_CODE <chr> … ## $ ZIP <dbl> … ## $ PRINCIPAL <chr> … ## $ PRIN_PH <chr> … ## $ FAX <chr> … ## $ GRADES <chr> … ## $ City <chr> … ## $ geometry <POINT [°]> … ## $ lat <dbl> … ## $ lon <dbl> … ## $ borough <chr> … ## $ unique_id <chr> … ## $ indicator_id <chr> … ## $ measure <chr> … ## $ measure_info <chr> … ## $ geo_type_name <chr> … ## $ geo_join_id <chr> … ## $ geo_place_name <chr> … ## $ time_period <chr> … ## $ start_date <chr> … ## $ borough_code <chr> … ## $ ozone <dbl> … ## $ sulfur_o2 <dbl> … ## $ `PM2.5-Attributable Deaths` <chr> … ## $ `Boiler Emissions- Total SO2 Emissions` <chr> … ## $ `Boiler Emissions- Total PM2.5 Emissions` <chr> … ## $ `Boiler Emissions- Total NOx Emissions` <chr> … ## $ `Air Toxics Concentrations- Average Benzene Concentrations` <chr> … ## $ `Air Toxics Concentrations- Average Formaldehyde Concentrations` <chr> … ## $ `PM2.5-Attributable Asthma Emergency Department Visits` <chr> … ## $ `PM2.5-Attributable Respiratory Hospitalizations (Adults 20 Yrs and Older)` <chr> … ## $ `PM2.5-Attributable Cardiovascular Hospitalizations (Adults 40 Yrs and Older)` <chr> … ## $ `Traffic Density- Annual Vehicle Miles Traveled` <chr> … ## $ `O3-Attributable Cardiac and Respiratory Deaths` <chr> … ## $ `O3-Attributable Asthma Emergency Department Visits` <chr> … ## $ `O3-Attributable Asthma Hospitalizations` <chr> …
Let’s get a sense of the distributions for our ozone variable
# SO2 distribution ggplot(df, aes(x=sulfur_o2)) + geom_histogram(binwidth=bin_width)
Running an Analysis of Variance (ANOVA) hypothesis test to see if there’s a statistically significant different in ozone levels between boroughs.We’ll conduct this test with significance level (\(\alpha = 0.05\)) and the following hypotheses:
\(H_0\): There is no difference in mean ozone (O3) levels between NYC boroughs
\(H_a\): There is a difference in mean ozone (O3) levels between NYC boroughs
ozone_aov <- aov(ozone ~ borough, data = df ) summary(ozone_aov)
## Df Sum Sq Mean Sq F value Pr(>F) ## borough 1 67576 67576 6310 <2e-16 *** ## Residuals 9710 103984 11 ## --- ## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 ## 297691 observations deleted due to missingness
\(H_0\): There is no difference in mean sulfur dioxide (SO2) levels between NYC boroughs
\(H_a\): There is a difference in mean sulfur dioxide (SO2) levels between NYC boroughs
so2_aov <- aov(sulfur_o2 ~ borough, data = df ) summary(so2_aov)
## Df Sum Sq Mean Sq F value Pr(>F) ## borough 1 22844 22844 6608 <2e-16 *** ## Residuals 20329 70277 3 ## --- ## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 ## 287072 observations deleted due to missingness
Use the built-in cor.test function to check if air quality measurements correlate with test scores
cor.test(x = df$ozone, y = df$mean_scaled_score)
## ## Pearson's product-moment correlation ## ## data: df$ozone and df$mean_scaled_score ## t = -2.7795, df = 307401, p-value = 0.005445 ## alternative hypothesis: true correlation is not equal to 0 ## 95 percent confidence interval: ## -0.008548011 -0.001478108 ## sample estimates: ## cor ## -0.005013122
cor.test(x = df$sulfur_o2, y = df$mean_scaled_score)
## ## Pearson's product-moment correlation ## ## data: df$sulfur_o2 and df$mean_scaled_score ## t = -11.434, df = 307401, p-value < 2.2e-16 ## alternative hypothesis: true correlation is not equal to 0 ## 95 percent confidence interval: ## -0.02415201 -0.01708493 ## sample estimates: ## cor ## -0.02061873
Some further work on this topic I’d like to incorporate
More granular air quality data
Try using different environmental factors (pollution, etc.) as inputs to our hypothesis testing
Updated academic data (our dataset only ran from 2006 - 2012)
Using academic data besides test scores. Tests are not always indiicative of a student’s academic skill. Grade point average could give a good sense of longer-term effects of environmental impacts on students’ performances