Package


library(tidyverse)
library(RMySQL)
library(DBI)
library(gridExtra)
library(mongolite)

Read/Write

Read Data from MySQL

# install.packages("RMySQL")

 con2 <- dbConnect(RMySQL::MySQL(), dbname = "flights", host="localhost", user="root",password= "")
 

dbListTables(con2)
## [1] "airlines"      "airports"      "flights"       "movies_master"
## [5] "movies_review" "planes"        "weather"
 # Database Operation 
con2 <- dbConnect(RMySQL::MySQL(), dbname = "flights", host="localhost", user="root",password= "")
query <- "SELECT * FROM movies_review"

tab1 <- dbSendQuery(con2,query)
suppressWarnings(rm(movies_review ))
movies_review <- dbFetch(tab1)

query <- "SELECT * FROM movies_master"

tab1 <- dbSendQuery(con2,query)
movies_master <- dbFetch(tab1)

DT::datatable(movies_master)
DT::datatable(movies_review)

Move to Mongo

We are creating two colelction for this data, one for Movie and Movei_Review

Upload

# Using plain-text
URI = sprintf("mongodb+srv://%s:%s@%s/", USER, PASS, HOST)
OpenCon<- mongo(url = URI)

#Deault is Test colelction 
OpenCon
## <Mongo collection> 'test' 
##  $aggregate(pipeline = "{}", options = "{\"allowDiskUse\":true}", handler = NULL, pagesize = 1000, iterate = FALSE) 
##  $count(query = "{}") 
##  $disconnect(gc = TRUE) 
##  $distinct(key, query = "{}") 
##  $drop() 
##  $export(con = stdout(), bson = FALSE, query = "{}", fields = "{}", sort = "{\"_id\":1}") 
##  $find(query = "{}", fields = "{\"_id\":0}", sort = "{}", skip = 0, limit = 0, handler = NULL, pagesize = 1000) 
##  $import(con, bson = FALSE) 
##  $index(add = NULL, remove = NULL) 
##  $info() 
##  $insert(data, pagesize = 1000, stop_on_error = TRUE, ...) 
##  $iterate(query = "{}", fields = "{\"_id\":0}", sort = "{}", skip = 0, limit = 0) 
##  $mapreduce(map, reduce, query = "{}", sort = "{}", limit = 0, out = NULL, scope = NULL) 
##  $remove(query, just_one = FALSE) 
##  $rename(name, db = NULL) 
##  $replace(query, update = "{}", upsert = FALSE) 
##  $run(command = "{\"ping\": 1}", simplify = TRUE) 
##  $update(query, update = "{\"$set\":{}}", filters = NULL, upsert = FALSE, multiple = FALSE)
# Lets create new colection MOvie

CON_movie <- mongo("movie", db = "MSDS_Review",url=URI)


#Check what is in collection Movie
# empty the colelction
CON_movie$drop()         #Refresh the collection
CON_movie$count()        # COunt collection
## [1] 0
CON_movie$insert(movies_master)
## List of 5
##  $ nInserted  : num 6
##  $ nMatched   : num 0
##  $ nRemoved   : num 0
##  $ nUpserted  : num 0
##  $ writeErrors: list()
CON_movie$count()        # COunt collection
## [1] 6
CON_movie$disconnect()
CON_movie$count()
## Connection lost. Trying to reconnect with mongo...
## [1] 6
# Lets create new colection MOvieR for Movie Reviews

CON_movieR <- mongo("movie_review", db = "MSDS_Review",url=URI)
CON_movieR$drop()      #Refresh the collection
CON_movieR$count()     # COunt collection
## [1] 0
CON_movieR$insert(movies_review)
## List of 5
##  $ nInserted  : num 149
##  $ nMatched   : num 0
##  $ nRemoved   : num 0
##  $ nUpserted  : num 0
##  $ writeErrors: list()
CON_movieR$count()     # COunt collection
## [1] 149

Query on Data

  • count
  • Find
  • Create Index
#View connection
CON_movie
## <Mongo collection> 'movie' 
##  $aggregate(pipeline = "{}", options = "{\"allowDiskUse\":true}", handler = NULL, pagesize = 1000, iterate = FALSE) 
##  $count(query = "{}") 
##  $disconnect(gc = TRUE) 
##  $distinct(key, query = "{}") 
##  $drop() 
##  $export(con = stdout(), bson = FALSE, query = "{}", fields = "{}", sort = "{\"_id\":1}") 
##  $find(query = "{}", fields = "{\"_id\":0}", sort = "{}", skip = 0, limit = 0, handler = NULL, pagesize = 1000) 
##  $import(con, bson = FALSE) 
##  $index(add = NULL, remove = NULL) 
##  $info() 
##  $insert(data, pagesize = 1000, stop_on_error = TRUE, ...) 
##  $iterate(query = "{}", fields = "{\"_id\":0}", sort = "{}", skip = 0, limit = 0) 
##  $mapreduce(map, reduce, query = "{}", sort = "{}", limit = 0, out = NULL, scope = NULL) 
##  $remove(query, just_one = FALSE) 
##  $rename(name, db = NULL) 
##  $replace(query, update = "{}", upsert = FALSE) 
##  $run(command = "{\"ping\": 1}", simplify = TRUE) 
##  $update(query, update = "{\"$set\":{}}", filters = NULL, upsert = FALSE, multiple = FALSE)
CON_movieR
## <Mongo collection> 'movie_review' 
##  $aggregate(pipeline = "{}", options = "{\"allowDiskUse\":true}", handler = NULL, pagesize = 1000, iterate = FALSE) 
##  $count(query = "{}") 
##  $disconnect(gc = TRUE) 
##  $distinct(key, query = "{}") 
##  $drop() 
##  $export(con = stdout(), bson = FALSE, query = "{}", fields = "{}", sort = "{\"_id\":1}") 
##  $find(query = "{}", fields = "{\"_id\":0}", sort = "{}", skip = 0, limit = 0, handler = NULL, pagesize = 1000) 
##  $import(con, bson = FALSE) 
##  $index(add = NULL, remove = NULL) 
##  $info() 
##  $insert(data, pagesize = 1000, stop_on_error = TRUE, ...) 
##  $iterate(query = "{}", fields = "{\"_id\":0}", sort = "{}", skip = 0, limit = 0) 
##  $mapreduce(map, reduce, query = "{}", sort = "{}", limit = 0, out = NULL, scope = NULL) 
##  $remove(query, just_one = FALSE) 
##  $rename(name, db = NULL) 
##  $replace(query, update = "{}", upsert = FALSE) 
##  $run(command = "{\"ping\": 1}", simplify = TRUE) 
##  $update(query, update = "{\"$set\":{}}", filters = NULL, upsert = FALSE, multiple = FALSE)
#Validation if data was moved successfuly to Mongo DB
ifelse(CON_movie$count()==nrow(movies_master),"movies_master All Data in Mongo","Something is not in Mongo")
## [1] "movies_master All Data in Mongo"
ifelse(CON_movieR$count()==nrow(movies_review),"movies_review All Data in Mongo","Something is not in Mongo")
## [1] "movies_review All Data in Mongo"
# Quick Way
CON_movieR$count()==nrow(movies_master)
## [1] FALSE
#The empty query {} means: select all data. 
# Get all records
CON_movieR$count('{}')
## [1] 149
## Read all the data back into R
CON_movieR$find()
CON_movieR$find('{}')
## To query all rows where User_Name == "TheTopDawg" AND Rating > 5 



CON_movieR$find('{"User_Name" : "TheTopDawg" ,"Rating":{"$gt":5}}')
#Read rating for Movie_ID = 1 and rating less than 5

subsetRevie  <- CON_movieR$find('{"Movie_ID" : 1 ,"Rating":{"$lt":5}}')

subsetRevie
#Using In , find all the movie rating for 1 and 2 where rating is less than 5  
subsetRevie12  <- CON_movieR$find('{"Movie_ID" :{"$in":[1,2]} ,"Rating":{"$lt":5}}')

subsetRevie12
# using Qury and Fields 
MR_Query_Fields <- CON_movieR$find(
  query = '{"Movie_ID" : 1 , "Rating" : {"$gt":5 }}', 
  fields = '{"ID" : true, "Rating" : true,"User_Name" : true,"Movie_ID":true}',
  limit = 5
)
print(MR_Query_Fields)
##                        _id ID Movie_ID        User_Name Rating
## 1 5cbd25d1364ca61aa0003858  1        1       TheTopDawg      8
## 2 5cbd25d1364ca61aa0003859  2        1          krice23      8
## 3 5cbd25d1364ca61aa000385a  3        1 celestinoavilajr      8
## 4 5cbd25d1364ca61aa000385b  4        1     dave-mcclain      8
## 5 5cbd25d1364ca61aa000385c  5        1       neener3707      7
#sort by rating IN dESCENDING ORDER for Movie_ID = 1

CON_movieR$find('{"Movie_ID" : 1}' , sort = '{"Rating": -1}')   
# Check time , since by default Mongo has _id is the only index, and other field would be costly to the db 

system.time(CON_movieR$find('{"Movie_ID" : 1}' , sort = '{"Rating": -1}') )
##    user  system elapsed 
##    0.00    0.00    0.06
system.time(CON_movieR$find('{"Movie_ID" : 1}' , sort = '{"_id": -1}') )
##    user  system elapsed 
##    0.00    0.00    0.05
#lets add Movie_ID as Index 

CON_movieR$index(add = '{"Movie_ID" : 1}')
# Lets see how many index we have
CON_movieR$index()
# Check sysetm time taken for excecution now
system.time(CON_movieR$find('{"Movie_ID" : 1}' , sort = '{"Rating": -1}') )
##    user  system elapsed 
##    0.00    0.00    0.05

iterate

# Lets use iterate to read data 
DT::datatable(CON_movieR$find())
Review_Iter <- CON_movieR$iterate('{"Movie_ID" : 1}', sort = '{"Rating": -1}', limit = 7)

# read records from  the iterator
while(!is.null(x <- Review_Iter$one())){
  cat(sprintf("Found Movie ID %s for USER : %s\n", x$Movie_ID, x$User_Name))
}
## Found Movie ID 1 for USER : bdwyphil
## Found Movie ID 1 for USER : terry_z
## Found Movie ID 1 for USER : vhinzajhltb
## Found Movie ID 1 for USER : godsgirlfrombluford
## Found Movie ID 1 for USER : Neymar
## Found Movie ID 1 for USER : maruugaa
## Found Movie ID 1 for USER : TheTopDawg

Remove

# Lets say I want to remove all the rating of 4 for Movie 1
# 1 list those record
CON_movieR$find('{"Movie_ID" : 1,"Rating":{"$eq":4}}') 
# 2 : Remvoe 
CON_movieR$remove('{"Movie_ID" : 1,"Rating":{"$eq":4}}')

# 3 : Valdiate 
CON_movieR$find('{"Movie_ID" : 1,"Rating":{"$eq":4}}') 
CON_movieR$find('{"Movie_ID" : 1,"Rating":{"$ne":4}}') # not equal to 4
CON_movieR$count() # Count got reduced .
## [1] 148
#Let update rating from 8 to 9

CON_movieR$find('{"Movie_ID" : 1,"User_Name":"krice23"}') 
CON_movieR$update('{"Movie_ID" : 1,"User_Name":"krice23"}','{"$set":{"Rating":9}}')
## List of 3
##  $ modifiedCount: int 1
##  $ matchedCount : int 1
##  $ upsertedCount: int 0
CON_movieR$find('{"Movie_ID" : 1,"User_Name":"krice23"}') 
# we should using multiple = TRUE if we want to udpate more that one record.


CON_movieR$find('{"User_Name":"TheTopDawg"}') 
CON_movieR$update('{"User_Name":"TheTopDawg"}','{"$set":{"Rating":9}}')
## List of 3
##  $ modifiedCount: int 1
##  $ matchedCount : int 1
##  $ upsertedCount: int 0
CON_movieR$find('{"User_Name":"TheTopDawg"}') 
# multiple = TRUE if we want to udpate more that one record.
CON_movieR$update('{"User_Name":"TheTopDawg"}','{"$set":{"Rating":7}}',multiple = TRUE)
## List of 3
##  $ modifiedCount: int 2
##  $ matchedCount : int 2
##  $ upsertedCount: int 0
CON_movieR$find('{"User_Name":"TheTopDawg"}') 

Export and other function

Mongo stores data in BSON , we can get BSON output by passing BSON = true

CON_movie$export()
## { "_id" : { "$oid" : "5cbd25d0364ca61aa0003852" }, "ID" : 1, "Movie_Name" : "Bad Samaritan", "Year" : "2018" }
## { "_id" : { "$oid" : "5cbd25d0364ca61aa0003853" }, "ID" : 2, "Movie_Name" : "Avengers: Infinity War", "Year" : "2018" }
## { "_id" : { "$oid" : "5cbd25d0364ca61aa0003854" }, "ID" : 3, "Movie_Name" : "Rampage", "Year" : "2018" }
## { "_id" : { "$oid" : "5cbd25d0364ca61aa0003855" }, "ID" : 4, "Movie_Name" : "Truth or Dare", "Year" : "2018" }
## { "_id" : { "$oid" : "5cbd25d0364ca61aa0003856" }, "ID" : 5, "Movie_Name" : "Incredibles 2", "Year" : "2018" }
## { "_id" : { "$oid" : "5cbd25d0364ca61aa0003857" }, "ID" : 6, "Movie_Name" : "A Star Is Born", "Year" : "2018" }
CON_movie$export(file("mr.bson"),bson = TRUE)


OpenCon$run('{"listCollections":1}')
## $cursor
## $cursor$id
## [1] 0
## 
## $cursor$ns
## [1] "test.$cmd.listCollections"
## 
## $cursor$firstBatch
##        name       type info.readOnly
## 1 week9msds collection         FALSE
## 2      book collection         FALSE
## 3     users collection         FALSE
## 4  fs.files collection         FALSE
## 5 fs.chunks collection         FALSE
##                                                        info.uuid idIndex.v
## 1 c7, c8, 84, 9c, 97, 98, 4c, a2, 92, e9, 7b, 86, ba, 19, 75, 99         2
## 2 b3, e0, 85, bb, 07, 0c, 43, 76, a2, 2e, f6, 33, ae, c0, 85, e5         2
## 3 18, 40, 84, 18, 57, c0, 4b, e3, 81, fe, 8d, bb, dc, 6c, 9e, 4a         2
## 4 7c, 33, 08, 49, fd, 51, 48, 06, b6, 92, e4, c5, c5, 62, 13, ba         2
## 5 7a, 0a, 2a, ac, 3b, cd, 43, 62, a3, 5d, 20, bb, 65, 36, 36, 3d         2
##   idIndex._id idIndex.name     idIndex.ns
## 1           1         _id_ test.week9msds
## 2           1         _id_      test.book
## 3           1         _id_     test.users
## 4           1         _id_  test.fs.files
## 5           1         _id_ test.fs.chunks
## 
## 
## $ok
## [1] 1
## 
## $operationTime
## $operationTime$t
## [1] 1555899862
## 
## $operationTime$i
## [1] 7
## 
## 
## $`$clusterTime`
## $`$clusterTime`$clusterTime
## $`$clusterTime`$clusterTime$t
## [1] 1555899862
## 
## $`$clusterTime`$clusterTime$i
## [1] 7
## 
## 
## $`$clusterTime`$signature
## $`$clusterTime`$signature$hash
##  [1] 24 0e 93 88 d5 fb 1f 67 01 cc 54 fa df 7a 36 12 28 97 23 fc
## attr(,"type")
## [1] 00
## 
## $`$clusterTime`$signature$keyId
## [1] 6.659713e+18
CON_admin <- admin <- mongo(db = "admin",url = URI)
CON_admin$run('{"listDatabases":1}')
## $databases
##           name sizeOnDisk empty
## 1   606Project      24576 FALSE
## 2           MS     229376 FALSE
## 3  MSDS_Review      32768 FALSE
## 4   MSProject3    3239936 FALSE
## 5        Week9      24576 FALSE
## 6         test     212992 FALSE
## 7   testSUNDAY      98304 FALSE
## 8    week9msds      24576 FALSE
## 9        admin     245760 FALSE
## 10       local 3534430208 FALSE
## 
## $totalSize
## [1] 3538563072
## 
## $ok
## [1] 1
## 
## $operationTime
## $operationTime$t
## [1] 1555899862
## 
## $operationTime$i
## [1] 7
## 
## 
## $`$clusterTime`
## $`$clusterTime`$clusterTime
## $`$clusterTime`$clusterTime$t
## [1] 1555899862
## 
## $`$clusterTime`$clusterTime$i
## [1] 7
## 
## 
## $`$clusterTime`$signature
## $`$clusterTime`$signature$hash
##  [1] 24 0e 93 88 d5 fb 1f 67 01 cc 54 fa df 7a 36 12 28 97 23 fc
## attr(,"type")
## [1] 00
## 
## $`$clusterTime`$signature$keyId
## [1] 6.659713e+18

Distinct

NOSQL_CON = mongo(collection = "movie", db = "MSDS_Review", url = URI) # create connection, database and collection
NOSQL_CON$index()
NOSQL_CON$count()
## [1] 6
NOSQL_CON$distinct("Movie_Name")
## [1] "Bad Samaritan"          "Avengers: Infinity War"
## [3] "Rampage"                "Truth or Dare"         
## [5] "Incredibles 2"          "A Star Is Born"
DT::datatable(NOSQL_CON$find())
NOSQL_CON$iterate()$one()
## $ID
## [1] 1
## 
## $Movie_Name
## [1] "Bad Samaritan"
## 
## $Year
## [1] "2018"
# Find Movie with name "Incredibles 2"
# USe count to get the count of matching record . 
# use find to get the data

NOSQL_CON$count('{"Movie_Name" : "Incredibles 2"  }')
## [1] 1
NOSQL_CON$count('{"Movie_Name" : "Incredibles2"  }') # Note space missing between name `Incredibles 2`.
## [1] 0
NOSQL_CON$find('{"Movie_Name" : "Incredibles 2"  }')
MOview_Rating <- NOSQL_CON$find('{"ID" : 1}')
print(MOview_Rating)
##   ID    Movie_Name Year
## 1  1 Bad Samaritan 2018

Compare Nosql and MYSQL

SQL Database NoSQL Database (MongoDB)
Relational database Non-relational database
Supports SQL query language Supports JSON query language
Table based Collection based and key-value pair
Row based Document based
Column based Field based
Support foreign key No support for foreign key
Support for triggers No Support for triggers
Contains schema which is predefined Contains dynamic schema
Not fit for hierarchical data storage Best fit for hierarchical data storage
Vertically scalable - increasing RAM Horizontally scalable - add more servers
Emphasizes on ACID properties (Atomicity, Consistency, Isolation and Durability) Emphasizes on CAP theorem (Consistency, Availability and Partition tolerance)

MongoDB is an open source document-oriented NoSQL database which stores data in the form of JSON-like objects. Hence a collection in MongoDB is equivalent to Table in relational database And a document in a collection in MongoDB is equivalent to a row in relational database table.

If we were using a relational database, we could perform a join on users and stores, and get all our objects in a single query. But MongoDB does not support joins and so, at times, requires bit of denormalization

NoSQL and relational databses differ in the manner they represent data. While a row stores data in columns, document stores in JSON like structure.

MongoDB documents support dynamic schema, i.e. documents in same collection can have different schema , one can have 5 fileds , while other can have more or less than five. This allows complex hierarchical /tree structured schemas is be efficiently stored in MongoDB.

Relationships in relational databse can be achieved through primary key /foreign key constraints ; No such mapping is present in MongoDB other than linking / emnbedding of documents .