D607 Wk13 - Migrating Data to MongoDB

Kyle Gilde

April 29, 2017

The Task

For this assignment, you should take information from a relational database and migrate it to a NoSQL database of your own choosing.

For the NoSQL database, you may use MongoDB (which we introduced in week 7), Neo4j, or another NoSQL database of your choosing.

Your migration process needs to be reproducible. R code is encouraged, but not required.

You should also briefly describe the advantages and disadvantages of storing the data in a relational database vs. your NoSQL database.

Loaded Packages

              
prettydoc TRUE
devtools  TRUE
rmongodb  TRUE
RMySQL    TRUE
knitr     TRUE
jsonlite  TRUE

Create the MySQL Database from CSVs

movieDB <- dbConnect(MySQL(), user = "root", password = "kmgkmg", dbname = "movie_ratings", 
    host = "localhost", port = 3306)

my.files <- c("friends", "movies", "movies_ratings_friends", "ratings")

root <- "https://raw.githubusercontent.com/kylegilde/D607-Data-Acquistion/master/data-sets/movie_ratings/"

# Load my files as DFs and then create tables
create_tables <- function(file_names, root, filetype, database) {
    for (i in 1:length(file_names)) {
        my.url <- paste0(root, file_names[i], filetype)
        x <- read.csv(my.url)
        dbWriteTable(database, file_names[i], x, overwrite = T, row.names = F)
    }
}

create_tables(my.files, root, ".csv", movieDB)
# dbGetQuery(movieDB, paste('describe', my.files[1])) dbGetQuery(movieDB,
# paste('describe', my.files[2])) dbGetQuery(movieDB, paste('describe',
# my.files[3])) dbGetQuery(movieDB, paste('describe', my.files[4]))


all_tables <- dbGetQuery(movieDB, "show tables")
kable(all_tables, caption = "Tables")
Tables
Tables_in_movie_ratings
friends
movies
movies_ratings_friends
ratings

Create the MongoDB Database and Collections from the created MySQL Database

my.mongo <- mongo.create(host = "localhost")

create_collections <- function(mysql_database, mongo_obj, mongo_db) {
    if (mongo.is.connected(mongo_obj)) {
        # get all tables in the DB
        table_list <- dbGetQuery(mysql_database, "show tables")[, 1]
        select_star <- "select * from"
        for (i in 1:length(table_list)) {
            # create a DF from the table
            df <- dbGetQuery(movieDB, paste(select_star, table_list[i]))
            # create bson list
            bson_list <- lapply(split(df, 1:nrow(df)), function(x) mongo.bson.from.JSON(toJSON(x)))
            # insert bson list into collection
            collection <- paste(mongo_db, table_list[i], sep = ".")
            mongo.insert.batch(mongo_obj, collection, bson_list)
            # print one record
            print(mongo.find.one(mongo_obj, collection))
        }
        # print(mongo.get.database.collections(my.mongo, mongo_db))
    } else {
        return("No connection")
    }
    
}


create_collections(movieDB, my.mongo, "movieDB")
##  _id : 7      59063b7f9394d40d27fed7bb
##  1 : 3    
##      id : 16      1
##      name : 2     Dave
## 
##  _id : 7      59063b7f9394d40d27fed7c1
##  1 : 3    
##      id : 16      1
##      movie : 2    Arrival
## 
##  _id : 7      59063b7f9394d40d27fed7ca
##  1 : 3    
##      id : 16      1
##      movie_id : 16    1
##      friend_id : 16   1
##      rating_id : 16   4
## 
##  _id : 7      59063b7f9394d40d27fed7f3
##  1 : 3    
##      id : 16      1
##      rating : 2   Great!
##      rating_score : 16    5
# mongo.get.database.collections(my.mongo, db = 'movieDB')
# mongo.get.databases(my.mongo) mongo.find.all(my.mongo, 'movieBD.friends')

The Advantages of Using MongoDB

According to Tutorialspoint.com, MongoDB is advantageous because

  • it is schema-less. A collection can hold more than one type of document.
  • it is very easy to scale.
dbDisconnect(movieDB)
## [1] TRUE
mongo.drop.database(my.mongo, db = "movieDB")
## [1] TRUE
mongo.destroy(my.mongo)
## NULL