library(RSocrata) # for loading the data from NYC Open Data
library(tidyverse) # for data analysis and visualizations
library(ggcharts) # for easy visualizations based on ggplot2
library(ggblanket) # for easy visualizations based on ggplot2
library(knitr) # for printing tables
library(DT) # for interactive tables in html format
# Download the dataset behind this NYC Open Data CSV endpoint.
# NOTE(review): this fetches over the network each time it runs.
data <- read.socrata(
  url = "https://data.cityofnewyork.us/resource/f6s7-vytj.csv"
)
Selected variables (columns)
# Names of the columns kept for the rest of the analysis.
columns <- c(
  "district", "name", "borough", "latitude", "longitude",
  "coursepassrate", "elaprof", "mathprof", "surveysafety",
  "totalstudents", "gradespan", "tophs1", "tophs2", "tophs3",
  "acceleratedclasses", "electiveclasses", "languageclasses",
  "diversityinadmissions"
)

# Narrow the raw download to exactly those columns, in that order.
mydata <- select(data, all_of(columns))
Exploratory Data Analysis with Tidyverse
# Preview the first seven rows of the three leading columns
# (district, name, borough) as a static table.
mydata |>
  select(district, name, borough) |>
  slice(seq_len(7)) |>
  kable()
1 |
P.S. 034 Franklin D. Roosevelt |
MANHATTAN |
1 |
P.S. 140 Nathan Straus |
MANHATTAN |
1 |
P.S. 184m Shuang Wen |
MANHATTAN |
1 |
P.S. 188 The Island School |
MANHATTAN |
1 |
University Neighborhood Middle School |
MANHATTAN |
1 |
School for Global Leaders |
MANHATTAN |
1 |
East Side Community School |
MANHATTAN |
Filtering the data
# Ten Manhattan schools with the highest mathprof, printed together with
# their coursepassrate and elaprof.
mydata |>
  filter(borough == "MANHATTAN") |>
  arrange(desc(mathprof)) |>
  slice(seq_len(10)) |>
  select(name, coursepassrate, mathprof, elaprof) |>
  kable()
Special Music School |
100 |
98 |
93 |
East Side Middle School |
100 |
97 |
94 |
The Anderson School |
100 |
97 |
97 |
Tag Young Scholars |
99 |
96 |
92 |
New Explorations into Science, Technology &
Math |
99 |
95 |
94 |
New York City Lab Middle School for Collaborative
Studies |
100 |
94 |
91 |
The Clinton School |
100 |
93 |
92 |
M.S. 255 Salk School of Science |
100 |
92 |
94 |
M.S. 243 Center School |
99 |
90 |
94 |
Columbia Secondary School |
95 |
88 |
87 |
District Statistics
# Interactive table of mathprof for the schools in districts 2 and 13,
# highest value first.
mydata |>
  filter(district %in% c(2, 13)) |>
  select(district, name, mathprof) |>
  arrange(desc(mathprof)) |>
  datatable() # HTML output only
# Median and mean of mathprof per district, ignoring missing values.
district_stats <- summarise(
  group_by(mydata, district),
  med_mathprof = median(mathprof, na.rm = TRUE),
  avg_mathprof = mean(mathprof, na.rm = TRUE)
)

# Interactive table, highest median first.
# (Swap datatable() for kable() to get a static table instead.)
district_stats |>
  arrange(desc(med_mathprof)) |>
  datatable()

# Bar chart of the median mathprof for each district.
bar_chart(district_stats, x = district, y = med_mathprof)

Borough Statistics
# Overwrite the borough recorded for two specific schools, matched by name.
# The two assignments run in order inside the same mutate() call.
mydata <- mydata |>
  mutate(
    borough = replace(borough, name == "P.S. 046 Arthur Tappan", "MANHATTAN"),
    borough = replace(borough, name == "M.S. 935", "BROOKLYN")
  )
# Median and mean of mathprof per borough, ignoring missing values.
borough_stats <- summarise(
  group_by(mydata, borough),
  med_mathprof = median(mathprof, na.rm = TRUE),
  avg_mathprof = mean(mathprof, na.rm = TRUE)
)

# Interactive table, highest median first.
# (Swap datatable() for kable() to get a static table instead.)
borough_stats |>
  arrange(desc(med_mathprof)) |>
  datatable()

# Bar chart of the median mathprof for each borough.
bar_chart(borough_stats, x = borough, y = med_mathprof)

# Count the schools in each district and draw the counts as horizontal bars,
# with districts ordered by their number of schools.
mydata |>
  group_by(district) |>
  summarise(nschools = n()) |>
  mutate(district = reorder(district, nschools)) |>
  ggplot(mapping = aes(district, nschools, fill = district)) +
  geom_col(show.legend = FALSE) +
  coord_flip()

# Standardise mathprof across all schools (z-score) and drop the rows where
# the score is missing.
# scale() returns a one-column matrix, so the result is flattened with
# as.numeric(). Without that, zscore is stored as a matrix column and
# filter() is handed a matrix condition, which dplyr >= 1.1.0 deprecates.
mathprof_zscore <- mydata |>
  select(district, borough, name, mathprof) |>
  mutate(zscore = as.numeric(scale(mathprof))) |>
  filter(!is.na(zscore))

# Histogram of the z-scores (12 bins), one facet per listed borough.
mathprof_zscore |>
  filter(borough %in% c("QUEENS", "MANHATTAN", "BROOKLYN", "BRONX")) |>
  gg_histogram(x = zscore, facet = borough, bins = 12)

Created using code from: https://rpubs.com/bkostadi/data_analysis_cite2022