1. Setup - Load the data and the required libraries. Data source: http://mysafeinfo.com/api/data?list=presidents&format=json Import the JSON file into the MongoDB database “rtest”: C:\Users\Suman>mongoimport -d rtest -c presidents < C:\Users\Suman\presidents.json
#install.packages("rmongodb")
library(rmongodb)
2. Analysis
# Open a connection to the MongoDB server on localhost. mongo.create() hands
# back the "mongo" connection object that the later rmongodb calls receive.
mgdb <- mongo.create(host = "127.0.0.1")
# Print the collections found in the "rtest" database.
print(mongo.get.database.collections(mgdb, "rtest"))
## [1] "rtest.presidents"
# Fetch only the presidents whose party field ("pp") equals "Democrat".
bson.query <- mongo.bson.from.JSON('{ "pp" : "Democrat" } ')
cur <- mongo.find(mgdb, "rtest.presidents", bson.query)
democrat.df <- mongo.cursor.to.data.frame(cur)
# Release the cursor now that its rows have been copied into the data frame;
# invisible() keeps the call's return value out of the printed output.
invisible(mongo.cursor.destroy(cur))
head(democrat.df)
## id nm pp tm
## 1 7 Andrew Jackson Democrat 1829-1837
## 2 8 Martin van Buren Democrat 1837-1841
## 3 11 James K. Polk Democrat 1845-1849
## 4 14 Franklin Pierce Democrat 1853-1857
## 5 15 James Buchanan Democrat 1857-1861
## 6 22 Grover Cleveland Democrat 1885-1889
Now let's try getting all the data from the MongoDB collection.
# Fetch every document in the collection: no query is passed to mongo.find().
cur.all <- mongo.find(mgdb, "rtest.presidents")
presidents.df <- mongo.cursor.to.data.frame(cur.all)
# Release the cursor once its rows are in the data frame, while the
# connection it belongs to is still open.
invisible(mongo.cursor.destroy(cur.all))
## Warning in mongo.cursor.to.data.frame(cur.all): This fails for most NoSQL
## data structures. I am working on a new solution
# All queries are finished, so close the MongoDB connection and free it.
mongo.destroy(mongo = mgdb)
## NULL
# Give the terse MongoDB field names readable labels. The mapping is keyed by
# the original field name rather than by column position, so a different
# column order in the converted data frame cannot mislabel a column. Columns
# that are not listed in the mapping keep their existing names.
field.labels <- c(id = "id", nm = "name", pp = "party", tm = "time")
raw.names <- names(presidents.df)
known <- raw.names %in% names(field.labels)
raw.names[known] <- unname(field.labels[raw.names[known]])
names(presidents.df) <- raw.names
head(presidents.df)
## id name party time
## 1 2 John Adams Federalist 1797-1801
## 2 4 James Madison Democratic-Republican 1809-1817
## 3 3 Thomas Jefferson Democratic-Republican 1801-1809
## 4 6 John Quincy Adams Democratic-Republican 1825-1829
## 5 7 Andrew Jackson Democrat 1829-1837
## 6 5 James Monroe Democratic-Republican 1817-1825
Quick visualization - Histogram of US Presidents by party
library(ggplot2)
# Bar chart of the number of presidents per party, with each count printed
# just above its bar. `party` is categorical, so both the bars and the labels
# use the count statistic (geom_bar() / stat = "count"); the binning statistic
# behind geom_histogram() and stat_bin() only accepts a continuous x.
ggplot(data = presidents.df, aes(x = party, fill = party)) +
  geom_bar() +
  geom_text(
    stat = "count",
    aes(label = after_stat(count)),
    vjust = -0.5
  ) +
  ggtitle("Count of Presidents from each party")
## ymax not defined: adjusting position using y instead