library(tidyverse)
library(RSocrata)
library(knitr)
library(ggblanket) # for easy visualizations based on ggplot2
library(DT)
library(ggcharts)
# Download the dataset straight from the NYC Open Data Socrata endpoint
# (CSV resource f6s7-vytj); no local copy of the file is required.
data <- read.socrata(
  url = "https://data.cityofnewyork.us/resource/f6s7-vytj.csv"
)
## Warning in
## read.socrata("https://data.cityofnewyork.us/resource/f6s7-vytj.csv"): Dates and
## currency fields will be converted to character
# Names of the variables (columns) kept for the analysis. The order given here
# is also the column order of `mydata`.
columns <- c(
  "district", "name", "borough", "latitude", "longitude",
  "coursepassrate", "elaprof", "mathprof", "surveysafety",
  "totalstudents", "gradespan", "tophs1", "tophs2", "tophs3",
  "acceleratedclasses", "electiveclasses", "languageclasses",
  "diversityinadmissions"
)

# all_of() makes the selection strict: a name missing from the download is an
# error rather than being silently skipped.
mydata <- select(data, all_of(columns))
# Quick look at the import: the first three columns (district, name, borough)
# of the first seven rows, rendered as a static table.
mydata |>
  select(district, name, borough) |>
  slice_head(n = 7) |>
  kable()
| district | name | borough |
|---|---|---|
| 1 | P.S. 034 Franklin D. Roosevelt | MANHATTAN |
| 1 | P.S. 140 Nathan Straus | MANHATTAN |
| 1 | P.S. 184m Shuang Wen | MANHATTAN |
| 1 | P.S. 188 The Island School | MANHATTAN |
| 1 | University Neighborhood Middle School | MANHATTAN |
| 1 | School for Global Leaders | MANHATTAN |
| 1 | East Side Community School | MANHATTAN |
# The ten Manhattan schools with the highest mathprof, shown alongside their
# coursepassrate and elaprof values.
mydata |>
  filter(borough == "MANHATTAN") |>
  arrange(desc(mathprof)) |>
  slice_head(n = 10) |>
  select(name, coursepassrate, mathprof, elaprof) |>
  kable()
| name | coursepassrate | mathprof | elaprof |
|---|---|---|---|
| Special Music School | 100 | 98 | 93 |
| East Side Middle School | 100 | 97 | 94 |
| The Anderson School | 100 | 97 | 97 |
| Tag Young Scholars | 99 | 96 | 92 |
| New Explorations into Science, Technology & Math | 99 | 95 | 94 |
| New York City Lab Middle School for Collaborative Studies | 100 | 94 | 91 |
| The Clinton School | 100 | 93 | 92 |
| M.S. 255 Salk School of Science | 100 | 92 | 94 |
| M.S. 243 Center School | 99 | 90 | 94 |
| Columbia Secondary School | 95 | 88 | 87 |
# Schools in districts 2 and 13, highest mathprof first.
# datatable() builds an interactive HTML widget, so this table only renders in
# HTML output.
mydata |>
  filter(district %in% c(2, 13)) |>
  select(district, name, mathprof) |>
  arrange(desc(mathprof)) |>
  datatable()
# Median and mean mathprof per district; schools with a missing mathprof are
# ignored by both statistics.
district_stats <- mydata |>
  group_by(district) |>
  summarise(
    med_mathprof = median(mathprof, na.rm = TRUE),
    avg_mathprof = mean(mathprof, na.rm = TRUE)
  )

# Interactive table, districts with the highest median first
# (use kable() in place of datatable() for non-HTML output).
datatable(arrange(district_stats, desc(med_mathprof)))

# Bar chart of the median mathprof by district.
bar_chart(district_stats, x = district, y = med_mathprof)

# Overwrite the borough of two schools, matched by exact name.
# NOTE(review): presumably the downloaded data lists these two schools under a
# wrong or missing borough -- confirm against the source.
mydata <- mydata |>
  mutate(
    borough = borough |>
      replace(name == "P.S. 046 Arthur Tappan", "MANHATTAN") |>
      replace(name == "M.S. 935", "BROOKLYN")
  )
# Median and mean mathprof per borough; schools with a missing mathprof are
# ignored by both statistics.
borough_stats <- mydata |>
  group_by(borough) |>
  summarise(
    med_mathprof = median(mathprof, na.rm = TRUE),
    avg_mathprof = mean(mathprof, na.rm = TRUE)
  )

# Interactive table, boroughs with the highest median first
# (use kable() in place of datatable() for non-HTML output).
datatable(arrange(borough_stats, desc(med_mathprof)))

# Bar chart of the median mathprof by borough.
bar_chart(borough_stats, x = borough, y = med_mathprof)

# Horizontal bar chart of the number of schools in each district, with the
# districts ordered by that count.
mydata |>
  # `name =` here is count()'s argument naming the output column, not the
  # school-name column of mydata.
  count(district, name = "nschools") |>
  mutate(district = reorder(district, nschools)) |>
  ggplot(aes(x = district, y = nschools, fill = district)) +
  geom_col(show.legend = FALSE) +
  coord_flip()

# Standardize mathprof across all schools (z-score: centered on the mean and
# divided by the standard deviation, NAs ignored) and drop the schools that
# have no mathprof value.
#
# scale() returns a one-column matrix. Left as is, `zscore` becomes a matrix
# column and filter(!is.na(zscore)) raises the dplyr 1.1.0 deprecation warning
# "Using one column matrices in `filter()`". as.numeric() flattens it to a
# plain numeric vector, so the filter gets a one-dimensional logical vector.
mathprof_zscore <- mydata |>
  select(district, borough, name, mathprof) |>
  mutate(zscore = as.numeric(scale(mathprof))) |>
  filter(!is.na(zscore))
# Histograms of the mathprof z-scores, one facet per borough, limited to the
# four boroughs listed below.
mathprof_zscore |>
  filter(
    borough %in% c("QUEENS", "MANHATTAN", "BROOKLYN", "BRONX")
  ) |>
  gg_histogram(
    x = zscore,
    facet = borough,
    bins = 12
  )
