Summary

In this excercise we will be playing with the NYC Public High School Program Data from the NYC OpenData Catalog.

The goal is to:

The Libraries

library(RCurl)
library(rjson)
library(knitr)
library(rmongodb)
library(plyr)
library(dplyr)

Connecting to MongoDB

mongo <- mongo.create()
coll <- "test.nyc_data_json"

Importing the data into MongoDB

Unfortunatelly the mongoimport command has it’s limitations. In the process of experimenting with the data I stumbled upon a couple of limitations:

Therefore, I decided to import the data into Mongo thru this R-script.

# url to data
json_data_url <- "https://data.cityofnewyork.us/api/views/mreg-rk5p/rows.json?accessType=DOWNLOAD"
# get the RAW JSON data
json_data <- getURL(json_data_url)

# Parse into a list
nyc_school_Programs <- fromJSON(json_data)

# Get the column names form the dataset
nyc_data_columns <- nyc_school_Programs[["meta"]][["view"]][["columns"]]

colnames <- c()
for (i in nyc_data_columns){
  colnames <- c(colnames,i[["name"]])
}

# Get the data from the dataset
nyc_data_data <- nyc_school_Programs[["data"]]

#turn data into dataframe

nyc_data <- data.frame(matrix(unlist(nyc_data_data), ncol=length(colnames), byrow=T))

colnames(nyc_data) <- colnames

#importing the data into JSON
# convert data back via DF -> JSON -> BSON -> MONGODB


nyc_data_json<-apply(nyc_data,1,function(x) toJSON(x))

#export to mongo
apply(nyc_data,1,function(x) {mongo.insert(mongo
                                                    , coll
                                                    ,mongo.bson.from.JSON(
                                                      toJSON(x)
                                                      )
                                                    )
                                       }
      )
##   [1] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
##  [15] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
##  [29] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
##  [43] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
##  [57] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
##  [71] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
##  [85] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
##  [99] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [113] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [127] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [141] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [155] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [169] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [183] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [197] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [211] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [225] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [239] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [253] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [267] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [281] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [295] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [309] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [323] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [337] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [351] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [365] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [379] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [393] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [407] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [421] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [435] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [449] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [463] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [477] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [491] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [505] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [519] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [533] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [547] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [561] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [575] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [589] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [603] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [617] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [631] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [645] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [659] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [673] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [687] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [701] TRUE

Programs by Borough

if(mongo.is.connected(mongo) == TRUE) {
  pipe_1 <- mongo.bson.from.JSON('
            {"$group":
              {"_id":"$Borough","Programs":{"$sum":1}}
            }')
  cmd_list <- list(pipe_1)
  totalPrograms <- mongo.aggregation(mongo, coll, cmd_list)
  
  ltotalPrograms <- mongo.bson.value(totalPrograms , "result")
  mtotalPrograms  <- sapply(ltotalPrograms , 
                    function(x) return( c(x$'_id',x$Programs)) )
  dtotalPrograms  <- as.data.frame(t(mtotalPrograms ))
  rownames(dtotalPrograms ) <- dtotalPrograms [,1]
  colnames(dtotalPrograms ) <- c("Borough", "Programs")
  head(dtotalPrograms)
  dtotalPrograms$Programs <- as.numeric(as.character(dtotalPrograms$Programs))

  plot(dtotalPrograms,main='High School Programs by Borough')

  kable(dtotalPrograms[c(2)])
}

Programs
Staten Island 53
Manhattan 143
Brooklyn 214
Queens 152
Bronx 139

Programs by Interest Area

if(mongo.is.connected(mongo) == TRUE) {
  pipe_1 <- mongo.bson.from.JSON('
            {"$group":
              {"_id":"$Interest Area","Programs":{"$sum":1}}
            }')
  cmd_list <- list(pipe_1)
  totalPrograms <- mongo.aggregation(mongo, coll, cmd_list)
  
  ltotalPrograms <- mongo.bson.value(totalPrograms , "result")
  mtotalPrograms  <- sapply(ltotalPrograms , 
                    function(x) return( c(x$'_id',x$Programs)) )
  dtotalPrograms  <- as.data.frame(t(mtotalPrograms ))
  rownames(dtotalPrograms ) <- dtotalPrograms [,1]
  colnames(dtotalPrograms ) <- c("Interest Area", "Programs")
  head(dtotalPrograms)
  dtotalPrograms$Programs <- as.numeric(as.character(dtotalPrograms$Programs))

  kable(dtotalPrograms[c(2)])
}
Programs
Hospitality, Travel and Tourism 3
Cosmetology 4
JROTC 5
Culinary Arts 7
Computer Science & Technology 40
Health Professions 41
Teaching 11
Animal Science 4
Humanities & Interdisciplinary 220
Visual Art & Design 43
Law & Government 42
Engineering 39
Performing Arts 69
Business 34
Science & Math 61
Zoned 29
Environmental Science 7
Architecture 6
Performing Arts/Visual Art & Design 6
Film/Video 8
Communications 22