library(tidyverse)
library(RSocrata)
library(knitr)
library(ggblanket) # for easy visualizations based on ggplot2
library(DT)
library(ggcharts)

# Download the dataset from the data.cityofnewyork.us Socrata endpoint
# (needs network access) and keep the raw result in `data`.
schools_url <- "https://data.cityofnewyork.us/resource/f6s7-vytj.csv"
data <- read.socrata(schools_url)
## Warning in
## read.socrata("https://data.cityofnewyork.us/resource/f6s7-vytj.csv"): Dates and
## currency fields will be converted to character
# Names of the variables (columns) kept for the analysis. all_of() makes
# select() fail loudly if any of them is missing from the download.
columns <- c(
  "district", "name", "borough", "latitude", "longitude",
  "coursepassrate", "elaprof", "mathprof", "surveysafety",
  "totalstudents", "gradespan",
  "tophs1", "tophs2", "tophs3",
  "acceleratedclasses", "electiveclasses", "languageclasses",
  "diversityinadmissions"
)
mydata <- select(data, all_of(columns))

# Preview: the first seven rows of the first three columns
# (district, name, borough), rendered as a static table.
mydata |>
  select(district, name, borough) |>
  slice_head(n = 7) |>
  kable()
## district name borough
## 1 P.S. 034 Franklin D. Roosevelt MANHATTAN
## 1 P.S. 140 Nathan Straus MANHATTAN
## 1 P.S. 184m Shuang Wen MANHATTAN
## 1 P.S. 188 The Island School MANHATTAN
## 1 University Neighborhood Middle School MANHATTAN
## 1 School for Global Leaders MANHATTAN
## 1 East Side Community School MANHATTAN
# Ten Manhattan schools with the highest math proficiency, shown next to
# their course pass rate and ELA proficiency.
mydata |>
  filter(borough == "MANHATTAN") |>
  select(name, coursepassrate, mathprof, elaprof) |>
  arrange(desc(mathprof)) |>
  slice_head(n = 10) |>
  kable()
## name coursepassrate mathprof elaprof
## Special Music School 100 98 93
## East Side Middle School 100 97 94
## The Anderson School 100 97 97
## Tag Young Scholars 99 96 92
## New Explorations into Science, Technology & Math 99 95 94
## New York City Lab Middle School for Collaborative Studies 100 94 91
## The Clinton School 100 93 92
## M.S. 255 Salk School of Science 100 92 94
## M.S. 243 Center School 99 90 94
## Columbia Secondary School 95 88 87
# Schools in districts 2 and 13, ordered from highest to lowest math
# proficiency, as an interactive table.
mydata |>
  filter(district %in% c(2, 13)) |>
  select(district, name, mathprof) |>
  arrange(desc(mathprof)) |>
  datatable() # for html only
# Median and mean math proficiency per district; missing values are ignored.
district_stats <- mydata |>
  group_by(district) |>
  summarise(
    med_mathprof = median(mathprof, na.rm = TRUE),
    avg_mathprof = mean(mathprof, na.rm = TRUE),
    .groups = "drop"
  )

# Districts ranked from highest to lowest median math proficiency, shown as
# an interactive table (kable() is the static alternative).
district_ranking <- arrange(district_stats, desc(med_mathprof))
datatable(district_ranking)

# Bar chart of the median math proficiency of each district.
bar_chart(data = district_stats, x = district, y = med_mathprof)

# Manual corrections: overwrite the borough recorded for two schools,
# identified by name; every other row keeps its current borough.
mydata <- mydata |>
  mutate(
    borough = case_when(
      name == "P.S. 046 Arthur Tappan" ~ "MANHATTAN",
      name == "M.S. 935" ~ "BROOKLYN",
      TRUE ~ borough
    )
  )

# Median and mean math proficiency per borough; missing values are ignored.
borough_stats <- mydata |>
  group_by(borough) |>
  summarise(
    med_mathprof = median(mathprof, na.rm = TRUE),
    avg_mathprof = mean(mathprof, na.rm = TRUE),
    .groups = "drop"
  )

# Boroughs ranked from highest to lowest median math proficiency, shown as
# an interactive table (kable() is the static alternative).
borough_ranking <- arrange(borough_stats, desc(med_mathprof))
datatable(borough_ranking)

# Bar chart of the median math proficiency of each borough.
bar_chart(data = borough_stats, x = borough, y = med_mathprof)

# Horizontal bar chart of the number of schools in each district, with the
# districts ordered by their school count.
mydata |>
  count(district, name = "nschools") |>
  mutate(district = reorder(district, nschools)) |>
  ggplot(aes(x = district, y = nschools, fill = district)) +
  geom_col(show.legend = FALSE) +
  coord_flip()

# Standardize math proficiency across all schools: z = (x - mean) / sd.
# scale() returns a one-column matrix, so as.numeric() flattens it to a plain
# vector. That gives filter() a one-dimensional logical (one-column matrices
# in filter() are deprecated since dplyr 1.1.0) and leaves `zscore` as an
# ordinary numeric column for plotting. Rows whose z-score is missing
# (e.g. no mathprof value) are dropped.
mathprof_zscore <- mydata |>
  select(district, borough, name, mathprof) |>
  mutate(zscore = as.numeric(scale(mathprof))) |>
  filter(!is.na(zscore))
## Warning: Using one column matrices in `filter()` was deprecated in dplyr 1.1.0.
## ℹ Please use one dimensional logical vectors instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
# Histogram of math-proficiency z-scores, one facet per borough. Only the
# four boroughs listed here are plotted; any other borough value is excluded.
plotted_boroughs <- c("QUEENS", "MANHATTAN", "BROOKLYN", "BRONX")
mathprof_zscore |>
  filter(borough %in% plotted_boroughs) |>
  gg_histogram(x = zscore, facet = borough, bins = 12)