library(RSocrata) # for loading the data from NYC Open Data
library(tidyverse) # for data analysis and visualizations
library(ggcharts) # for easy visualizations based on ggplot2
library(ggblanket) # for easy visualizations based on ggplot2
library(knitr) # for printing tables
library(DT) # for interactive tables in html format
# Download the dataset from NYC Open Data (Socrata resource f6s7-vytj, CSV).
data <- read.socrata(
  url = "https://data.cityofnewyork.us/resource/f6s7-vytj.csv"
)

Selected variables (columns)

# Names of the columns kept for the rest of the analysis.
columns <- c(
  "district", "name", "borough", "latitude", "longitude",
  "coursepassrate", "elaprof", "mathprof", "surveysafety",
  "totalstudents", "gradespan", "tophs1", "tophs2", "tophs3",
  "acceleratedclasses", "electiveclasses", "languageclasses",
  "diversityinadmissions"
)

# Working data set: the raw data restricted to the columns listed above,
# in that order. all_of() makes the selection fail loudly if a name is missing.
mydata <- select(data, all_of(columns))

Exploratory Data Analysis with Tidyverse

# Preview: first seven rows of the first three columns, printed as a table.
mydata |>
  select(1:3) |>
  slice_head(n = 7) |>
  kable()
district name borough
1 P.S. 034 Franklin D. Roosevelt MANHATTAN
1 P.S. 140 Nathan Straus MANHATTAN
1 P.S. 184m Shuang Wen MANHATTAN
1 P.S. 188 The Island School MANHATTAN
1 University Neighborhood Middle School MANHATTAN
1 School for Global Leaders MANHATTAN
1 East Side Community School MANHATTAN

Filtering the data

# Ten Manhattan schools with the highest `mathprof`, shown with their
# course pass rate and ELA proficiency columns.
mydata |>
  filter(borough == "MANHATTAN") |>
  arrange(desc(mathprof)) |>
  select(name, coursepassrate, mathprof, elaprof) |>
  slice_head(n = 10) |>
  kable()
name coursepassrate mathprof elaprof
Special Music School 100 98 93
East Side Middle School 100 97 94
The Anderson School 100 97 97
Tag Young Scholars 99 96 92
New Explorations into Science, Technology & Math 99 95 94
New York City Lab Middle School for Collaborative Studies 100 94 91
The Clinton School 100 93 92
M.S. 255 Salk School of Science 100 92 94
M.S. 243 Center School 99 90 94
Columbia Secondary School 95 88 87

District Statistics

# Interactive table of the schools in districts 2 and 13, sorted by
# `mathprof` from highest to lowest. datatable() output is for HTML only.
mydata |>
  filter(district %in% c(13, 2)) |>
  select(district, name, mathprof) |>
  arrange(desc(mathprof)) |>
  datatable()
# Per-district summary of `mathprof`: median and mean, ignoring missing values.
district_stats <- summarize(
  group_by(mydata, district),
  med_mathprof = median(mathprof, na.rm = TRUE),
  avg_mathprof = mean(mathprof, na.rm = TRUE)
)
# Interactive table of the district summaries, highest median first.
# (kable() can be used here instead of datatable() for a static table.)
datatable(arrange(district_stats, desc(med_mathprof)))

# Bar chart of the median `mathprof` for each district.
bar_chart(district_stats, x = district, y = med_mathprof)

Borough Statistics

# Override the borough of two schools, matched by exact name
# (presumably correcting mislabelled records -- verify against the source data).
mydata <- mutate(
  mydata,
  borough = borough |>
    replace(name == "P.S. 046 Arthur Tappan", "MANHATTAN") |>
    replace(name == "M.S. 935", "BROOKLYN")
)
# Per-borough summary of `mathprof`: median and mean, ignoring missing values.
borough_stats <- summarize(
  group_by(mydata, borough),
  med_mathprof = median(mathprof, na.rm = TRUE),
  avg_mathprof = mean(mathprof, na.rm = TRUE)
)
# Interactive table of the borough summaries, highest median first.
# (kable() can be used here instead of datatable() for a static table.)
datatable(arrange(borough_stats, desc(med_mathprof)))

# Bar chart of the median `mathprof` for each borough.
bar_chart(borough_stats, x = borough, y = med_mathprof)

# Horizontal bar chart of the number of schools (rows) in each district.
mydata |>
  group_by(district) |>
  tally(name = "nschools") |>
  # Turn district into a factor ordered by school count so the bars are sorted.
  mutate(district = reorder(district, nschools)) |>
  ggplot(aes(x = district, y = nschools, fill = district)) +
  geom_col(show.legend = FALSE) +
  coord_flip()

# Standardize `mathprof` across all schools (z-score: centered on the overall
# mean and divided by the overall standard deviation), then drop the rows
# where no z-score could be computed.
#
# scale() returns a one-column matrix carrying centering/scaling attributes.
# as.numeric() strips that structure so `zscore` is a plain numeric column;
# otherwise is.na(zscore) is a one-column logical matrix, which filter()
# has deprecated since dplyr 1.1.0, and the matrix column leaks downstream.
mathprof_zscore <- mydata |>
  select(district, borough, name, mathprof) |>
  mutate(zscore = as.numeric(scale(mathprof))) |>
  filter(!is.na(zscore))
# Histograms of the z-scores, one facet per borough, limited to the four
# boroughs listed below and drawn with 12 bins.
mathprof_zscore |>
  filter(borough %in% c("QUEENS", "MANHATTAN", "BROOKLYN", "BRONX")) |>
  gg_histogram(
    x = zscore,
    facet = borough,
    bins = 12
  )

Created using code from: https://rpubs.com/bkostadi/data_analysis_cite2022