Loading Packages Used
knitr::opts_chunk$set(message = FALSE, echo = TRUE)
# Libraries Used : DT , RMySQL, dplyr
# Library for data display in tabular format
library(DT)
# Library for db operations
library(RMySQL)
# Library for data structure operations
library(dplyr)
library(jsonlite)
library(mongolite)MongoDb (64 bit version) was downloaded and installed in default directory under “C:\Program Files” The new mongolite package was used for client connectivity from R to MongoDB database.
To set up the MongoDB environment:
MongoDB requires a data directory to store all data. MongoDB’s default data directory path is \data\db. Hence directory for data and log is created
“F:\mongodb\data” “F:\mongodb\log”
The mongodb startup requires
The following command “C:\Program Files\MongoDB\Server\3.2\bin\mongod.exe” –dbpath F:\mongodb\data
To connect to MongoDB:
One needs to start the mongo.exe shell in another Command Prompt.
“C:\Program Files\MongoDB\Server\3.2\bin\mongo.exe”
The default database is ‘test’
In mongo sheel one can connect to test using following command at mongo shell prompt :
use test
One can check the collections ( equivalent to tables concept from relational databases) using :
show collections
The default collection available in ‘test’ database is ‘restaurants’
The MongoDB Manual : (https://docs.mongodb.com/manual/crud/
Tutorial Mongolite : https://cran.r-project.org/web/packages/mongolite/mongolite.pdf
A connection to MYSQL relational databse is done and the data from ‘entertainment’ schema and ‘tb’ schema are extracted to perform a migration to NoSQL database of MongoDB.
mysql_dbconn <- function(schema, uid, pwd) {
rmysql.settingsfile <- "C:/ProgramData/MySQL/MySQL Server 5.6/my.ini"
conn <- dbConnect(RMySQL::MySQL(), dbname = schema, username = uid, password = pwd)
return(conn)
}conntb <- mysql_dbconn("tb", "root", "WhiteLotus21")
# List the database tables in entertainment schema
entdbtables <- dbListTables(conntb)Loading tb and movie data in R
if (isIdCurrent(conntb)) {
tbdb <- dbGetQuery(conntb, "select country, year, sex, child, adult, elderly from tb where tb.year > 2010")
}
connent <- mysql_dbconn("entertainment", "root", "WhiteLotus21")
if (isIdCurrent(conntb)) {
moviesdb <- dbGetQuery(connent, "select movieid, moviename, genre, yearreleased, leadcast, dirrectedby, countryreleased, language, watchtimemin, certificate, awards, info from movies, genre, certificate where movies.genre = genre.genreid and movies.certificate = certificate.certificateid")
genredb <- dbGetQuery(connent, "select genreid, genretype from genre")
certificatedb <- dbGetQuery(connent, "select certificateid, certificatedesc from certificate")
surveydb <- dbGetQuery(connent, "select respondentid, movieid, rating, responsedt from survey")
}c = mongo(collection = "restaurants", db = "test")
c$count()## [1] 60219
# To take care of the insertion of 'tb' data during the re-run of this Rmd , we
# drop it initially if existing at all
tb = mongo(collection = "tb", db = "tbdatabase")
tb$count()## [1] 0
tb$insert(tbdb)##
Complete! Processed total of 600 rows.
## $nInserted
## [1] 600
##
## $nMatched
## [1] 0
##
## $nRemoved
## [1] 0
##
## $nUpserted
## [1] 0
##
## $writeErrors
## list()
tb$count()## [1] 600
# To take care of the insertion of 'tb' data during the re-run of this Rmd , we
# drop it at the end.
tb$drop()## [1] TRUE
mov = mongo(collection = "movies", db = "entertainment")
gen = mongo(collection = "genre", db = "entertainment")
cert = mongo(collection = "certificate", db = "entertainment")
survey = mongo(collection = "survey", db = "entertainment")# To take care of the insertion of all collections in 'entertainment' database
# during the re-run of this Rmd , we drop it at the end.
survey$drop()## [1] TRUE
gen$drop()## [1] TRUE
cert$drop()## [1] TRUE
mov$drop()## [1] TRUE
mov$count()## [1] 0
mov$insert(moviesdb)##
Complete! Processed total of 22 rows.
## $nInserted
## [1] 22
##
## $nMatched
## [1] 0
##
## $nRemoved
## [1] 0
##
## $nUpserted
## [1] 0
##
## $writeErrors
## list()
gen$count()## [1] 0
gen$insert(genredb)##
Complete! Processed total of 5 rows.
## $nInserted
## [1] 5
##
## $nMatched
## [1] 0
##
## $nRemoved
## [1] 0
##
## $nUpserted
## [1] 0
##
## $writeErrors
## list()
cert$count()## [1] 0
cert$insert(certificatedb)##
Complete! Processed total of 8 rows.
## $nInserted
## [1] 8
##
## $nMatched
## [1] 0
##
## $nRemoved
## [1] 0
##
## $nUpserted
## [1] 0
##
## $writeErrors
## list()
survey$count()## [1] 0
survey$insert(surveydb)##
Complete! Processed total of 90 rows.
## $nInserted
## [1] 90
##
## $nMatched
## [1] 0
##
## $nRemoved
## [1] 0
##
## $nUpserted
## [1] 0
##
## $writeErrors
## list()
mov$count()## [1] 22
gen$count()## [1] 5
cert$count()## [1] 8
survey$count()## [1] 90
# Selecting distinct respondents from Survey collection
survey$distinct("respondentid")## [1] 1 2 3 4 5 6 7 8 9
# Select respondent ids who did ot respond/ rate to MN1 movie code, ie Ice Age
survey$distinct("respondentid", "{\"movieid\" : \"MN1\", \"rating\": null }")## [1] 1 8
# Select respondent ids who respond/ rate to MN1 movie code, ie Ice Age , handle
# null
survey$find("{\"movieid\" : \"MN1\"}", handler = NULL)##
Found 9 records...
Imported 9 records. Simplifying into dataframe...
## respondentid movieid responsedt rating
## 1 1 MN1 2016-09-07 NA
## 2 2 MN1 2016-09-07 5
## 3 3 MN1 2016-09-07 3
## 4 4 MN1 2016-09-07 2
## 5 5 MN1 2016-09-07 4
## 6 6 MN1 2016-09-08 3
## 7 7 MN1 2016-09-08 5
## 8 8 MN1 2016-09-08 NA
## 9 9 MN1 2016-09-08 5
# Add index and sort By Rating
survey$index(add = "rating")## v key._id key.rating name ns
## 1 1 1 NA _id_ entertainment.survey
## 2 1 NA 1 rating_1 entertainment.survey
mysurveydata <- survey$find(sort = "{\"rating\":-1}")##
Found 90 records...
Imported 90 records. Simplifying into dataframe...
datatable(mysurveydata)survey$aggregate("[{\"$group\":{\"_id\" : \"$movieid\",\"avgRating\" : {\"$avg\" : \"$rating\"}}}]")##
Found 10 records...
Imported 10 records. Simplifying into dataframe...
## _id avgRating
## 1 MA1 4.444444
## 2 MN2 3.875000
## 3 MA2 3.333333
## 4 MV2 3.666667
## 5 MH2 3.375000
## 6 MH1 3.250000
## 7 MN1 3.857143
## 8 MV1 4.111111
## 9 MW1 3.142857
## 10 MW2 3.555556
# Logical And
western2016 <- mov$find("{\"genre\":\"W\",\"yearreleased\":2016}")##
Found 1 records...
Imported 1 records. Simplifying into dataframe...
datatable(western2016)# Movie Either Animated OR Action , Sorted By Movie Name Descending
actionanimation <- mov$find("{\"genre\": { \"$in\" : [\"N\",\"A\"]}}", fields = "{\"moviename\":1,\"leadcast\":1,\"info\":1} ",
sort = "{\"moviename\" : -1}")##
Found 9 records...
Imported 9 records. Simplifying into dataframe...
datatable(actionanimation)# Movies (Either Western OR Horror) And Released In 2016
westernorhorror2016 <- mov$find("{\"genre\": { \"$in\" : [\"W\",\"H\"]} , \"yearreleased\" : 2016 }",
fields = "{\"moviename\":1,\"leadcast\":1,\"info\":1}")##
Found 2 records...
Imported 2 records. Simplifying into dataframe...
datatable(westernorhorror2016)# The collection can be stored to file and restored back
# Dump to bson
dump <- tempfile()
mov$export(file(dump), bson = TRUE)##
Exported 22 lines...
Done! Exported a total of 22 lines.
# Remove the collection
mov$drop()## [1] TRUE
# Restore
mov$count()## [1] 0
mov$import(file(dump), bson = TRUE)##
Restored 22 records...
Done! Inserted total of 22 records.
## [1] 22
file(dump)## description
## "C:\\Users\\Kumudini\\AppData\\Local\\Temp\\RtmpwpPtiW\\file24c422a0456b"
## class
## "file"
## mode
## "r"
## text
## "text"
## opened
## "closed"
## can read
## "yes"
## can write
## "yes"
mov$count()## [1] 22
# Attempt to insert Json file in MongoDB
# donut <- mongo('zips', verbose=FALSE, db = 'entertainment')
# donutdf <-
# stream_in(url('https://raw.githubusercontent.com/DataDriven-MSDA/DATA607/master/Week12P4/donut.json'))
# donut$insert(donutdf)
# Alternatively
# stream_in(url('https://raw.githubusercontent.com/DataDriven-MSDA/DATA607/master/Week12P4/donut.json'),handler=function(df){donut$insert(df)})
# Could not import, This gives an error premature EOF
restaurants <- mongo("restaurants", verbose = FALSE, db = "entertainment")
restaurants$drop()## [1] TRUE
restaurants$count()## [1] 0
restaurants$import(url("https://raw.githubusercontent.com/mongodb/docs-assets/primer-dataset/primer-dataset.json"))## Warning: closing unused connection 5 (C:\Users\Kumudini\AppData\Local\Temp
## \RtmpwpPtiW\file24c422a0456b)
restaurants$count()## [1] 25359
MongoDB is an open source document-oriented NoSQL database which stores data in the form of JSON-like objects. Hence a collection in MongoDB is equivalent to Table in relational database And a document in a collection in MongoDB is equivalent to a row in relational database table.
NoSQL and relational databses differ in the manner they represent data. While a row stores data in columns, document stores in JSON like structure.
MongoDB documents support dynamic schema, i.e. documents in same collection can have different schema , one can have 5 fileds , while other can have more or less than five. This allows complex hierarchical /tree structured schemas is be efficiently stored in MongoDB.
Relationships in relational databse can be achieved through primary key /foreign key constraints ; No such mapping is present in MongoDB other than linking / emnbedding of documents .
However the document type representation and the unconventional way of querying seems to be bit hard to get used to syntactically. The plain english like SQL of relational database seems more easy to adapt to.