In this excercise we will be playing with the NYC Public High School Program Data from the NYC OpenData Catalog.
The goal is to:
library(RCurl)
library(rjson)
library(knitr)
library(rmongodb)
library(plyr)
library(dplyr)
mongo <- mongo.create()
coll <- "test.nyc_data_json"
Unfortunatelly the mongoimport command has it’s limitations. In the process of experimenting with the data I stumbled upon a couple of limitations:
Therefore, I decided to import the data into Mongo thru this R-script.
# url to data
json_data_url <- "https://data.cityofnewyork.us/api/views/mreg-rk5p/rows.json?accessType=DOWNLOAD"
# get the RAW JSON data
json_data <- getURL(json_data_url)
# Parse into a list
nyc_school_Programs <- fromJSON(json_data)
# Get the column names form the dataset
nyc_data_columns <- nyc_school_Programs[["meta"]][["view"]][["columns"]]
colnames <- c()
for (i in nyc_data_columns){
colnames <- c(colnames,i[["name"]])
}
# Get the data from the dataset
nyc_data_data <- nyc_school_Programs[["data"]]
#turn data into dataframe
nyc_data <- data.frame(matrix(unlist(nyc_data_data), ncol=length(colnames), byrow=T))
colnames(nyc_data) <- colnames
#importing the data into JSON
# convert data back via DF -> JSON -> BSON -> MONGODB
nyc_data_json<-apply(nyc_data,1,function(x) toJSON(x))
#export to mongo
apply(nyc_data,1,function(x) {mongo.insert(mongo
, coll
,mongo.bson.from.JSON(
toJSON(x)
)
)
}
)
## [1] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [15] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [29] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [43] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [57] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [71] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [85] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [99] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [113] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [127] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [141] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [155] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [169] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [183] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [197] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [211] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [225] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [239] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [253] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [267] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [281] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [295] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [309] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [323] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [337] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [351] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [365] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [379] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [393] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [407] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [421] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [435] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [449] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [463] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [477] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [491] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [505] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [519] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [533] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [547] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [561] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [575] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [589] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [603] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [617] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [631] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [645] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [659] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [673] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [687] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [701] TRUE
if(mongo.is.connected(mongo) == TRUE) {
pipe_1 <- mongo.bson.from.JSON('
{"$group":
{"_id":"$Borough","Programs":{"$sum":1}}
}')
cmd_list <- list(pipe_1)
totalPrograms <- mongo.aggregation(mongo, coll, cmd_list)
ltotalPrograms <- mongo.bson.value(totalPrograms , "result")
mtotalPrograms <- sapply(ltotalPrograms ,
function(x) return( c(x$'_id',x$Programs)) )
dtotalPrograms <- as.data.frame(t(mtotalPrograms ))
rownames(dtotalPrograms ) <- dtotalPrograms [,1]
colnames(dtotalPrograms ) <- c("Borough", "Programs")
head(dtotalPrograms)
dtotalPrograms$Programs <- as.numeric(as.character(dtotalPrograms$Programs))
plot(dtotalPrograms,main='High School Programs by Borough')
kable(dtotalPrograms[c(2)])
}
| Programs | |
|---|---|
| Staten Island | 53 |
| Manhattan | 143 |
| Brooklyn | 214 |
| Queens | 152 |
| Bronx | 139 |
if(mongo.is.connected(mongo) == TRUE) {
pipe_1 <- mongo.bson.from.JSON('
{"$group":
{"_id":"$Interest Area","Programs":{"$sum":1}}
}')
cmd_list <- list(pipe_1)
totalPrograms <- mongo.aggregation(mongo, coll, cmd_list)
ltotalPrograms <- mongo.bson.value(totalPrograms , "result")
mtotalPrograms <- sapply(ltotalPrograms ,
function(x) return( c(x$'_id',x$Programs)) )
dtotalPrograms <- as.data.frame(t(mtotalPrograms ))
rownames(dtotalPrograms ) <- dtotalPrograms [,1]
colnames(dtotalPrograms ) <- c("Interest Area", "Programs")
head(dtotalPrograms)
dtotalPrograms$Programs <- as.numeric(as.character(dtotalPrograms$Programs))
kable(dtotalPrograms[c(2)])
}
| Programs | |
|---|---|
| Hospitality, Travel and Tourism | 3 |
| Cosmetology | 4 |
| JROTC | 5 |
| Culinary Arts | 7 |
| Computer Science & Technology | 40 |
| Health Professions | 41 |
| Teaching | 11 |
| Animal Science | 4 |
| Humanities & Interdisciplinary | 220 |
| Visual Art & Design | 43 |
| Law & Government | 42 |
| Engineering | 39 |
| Performing Arts | 69 |
| Business | 34 |
| Science & Math | 61 |
| Zoned | 29 |
| Environmental Science | 7 |
| Architecture | 6 |
| Performing Arts/Visual Art & Design | 6 |
| Film/Video | 8 |
| Communications | 22 |