DATA 607 Week 12 - NoSQL migration

Introduction

For this assignment, I take information from a relational database (MySQL) and migrate it to a NoSQL database (MongoDB). For the relational database I am using nycflights13 package. nycflights13 package has airline on-time data for all flights departing NYC in 2013. Also includes useful ‘metadata’ on airlines, airports, weather, and planes.

library(DBI)
library(RMySQL)
library(tidyverse)

## -- Attaching packages ------------------------------------------------------------------------------- tidyverse 1.2.1 --

## v ggplot2 3.2.1     v purrr   0.3.2
## v tibble  2.1.3     v dplyr   0.8.3
## v tidyr   0.8.3     v stringr 1.4.0
## v readr   1.3.1     v forcats 0.4.0

## -- Conflicts ---------------------------------------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()

library(knitr)
library(mongolite)

Inspect data from nycflights13

library(nycflights13)
dim(nycflights13::airlines)

## [1] 16  2

dim(nycflights13::airports)

## [1] 1458    8

head(nycflights13::airports)

## # A tibble: 6 x 8
##   faa   name                       lat   lon   alt    tz dst   tzone       
##   <chr> <chr>                    <dbl> <dbl> <dbl> <dbl> <chr> <chr>       
## 1 04G   Lansdowne Airport         41.1 -80.6  1044    -5 A     America/New~
## 2 06A   Moton Field Municipal A~  32.5 -85.7   264    -6 A     America/Chi~
## 3 06C   Schaumburg Regional       42.0 -88.1   801    -6 A     America/Chi~
## 4 06N   Randall Airport           41.4 -74.4   523    -5 A     America/New~
## 5 09J   Jekyll Island Airport     31.1 -81.4    11    -5 A     America/New~
## 6 0A9   Elizabethton Municipal ~  36.4 -82.2  1593    -5 A     America/New~

Export data into .csv files

In order to convert the datasets into slq database, first we convert them to .csv files. After we export nycflights13 datasets into local file, we can switch to MySQL to load them into relational database using table Table data import wizard.

Connecting to MySQL

sqldb <- dbConnect(MySQL(),
                  user = "root", 
                  password = "spring2019",
                  dbname = "nycflights",
                  host = "localhost"
                     )
tables <- dbListTables(sqldb)
tables

## [1] "airlines" "airports"

# Query to load the data from airline and airport tables
query <- sprintf("SELECT * FROM %s", tables)
query

## [1] "SELECT * FROM airlines" "SELECT * FROM airports"

temp1 <- rep(list(NA), length(tables))            

# load tables into temp dataframes
for (j in 1:length(tables)) {
    temp1[[j]] <- as_tibble(dbGetQuery(sqldb, query[j]))
}

temp1

## [[1]]
## # A tibble: 16 x 2
##    carrier name                       
##    <chr>   <chr>                      
##  1 9E      Endeavor Air Inc.          
##  2 AA      American Airlines Inc.     
##  3 AS      Alaska Airlines Inc.       
##  4 B6      JetBlue Airways            
##  5 DL      Delta Air Lines Inc.       
##  6 EV      ExpressJet Airlines Inc.   
##  7 F9      Frontier Airlines Inc.     
##  8 FL      AirTran Airways Corporation
##  9 HA      Hawaiian Airlines Inc.     
## 10 MQ      Envoy Air                  
## 11 OO      SkyWest Airlines Inc.      
## 12 UA      United Air Lines Inc.      
## 13 US      US Airways Inc.            
## 14 VX      Virgin America             
## 15 WN      Southwest Airlines Co.     
## 16 YV      Mesa Airlines Inc.         
## 
## [[2]]
## # A tibble: 38 x 8
##    faa   name                    lat    lon   alt    tz dst   tzone        
##    <chr> <chr>                 <dbl>  <dbl> <int> <int> <chr> <chr>        
##  1 04G   Lansdowne Airport      41.1  -80.6  1044    -5 A     America/New_~
##  2 06A   Moton Field Municipa~  32.5  -85.7   264    -6 A     America/Chic~
##  3 06C   Schaumburg Regional    42.0  -88.1   801    -6 A     America/Chic~
##  4 06N   Randall Airport        41.4  -74.4   523    -5 A     America/New_~
##  5 09J   Jekyll Island Airport  31.1  -81.4    11    -5 A     America/New_~
##  6 0A9   Elizabethton Municip~  36.4  -82.2  1593    -5 A     America/New_~
##  7 0G6   Williams County Airp~  41.5  -84.5   730    -5 A     America/New_~
##  8 0G7   Finger Lakes Regiona~  42.9  -76.8   492    -5 A     America/New_~
##  9 0P2   Shoestring Aviation ~  39.8  -76.6  1000    -5 U     America/New_~
## 10 0S9   Jefferson County Intl  48.1 -123.    108    -8 A     America/Los_~
## # ... with 28 more rows

# initialize temp tables
temp2 <- rep(list(NA), length(tables))            

# loop through each table and save into each collection
for (j in 1:length(tables)) {
    # connect to collection in mongodb 
    temp2[[j]] <- mongo(collection = tables[j], db = "nycflights13")

    # drop collection if already exists
    if (temp2[[j]]$count() > 0) {
        temp2[[j]]$drop() 
    }

    # insert data
    temp2[[j]]$insert(temp1[[j]])

    # stop if row counts are not identical
    stopifnot(temp2[[j]]$count() == nrow(temp1[[j]]))
}

Now we have two tables airline and airport loaded as dataframes in temp 1.

Migrating data into MongoDB

Save each dataframe as a separate collection in mongodb. And then inspect them to makesure all the data migrated correctly.

airlines <- mongo(collection = "airlines", db = "nycflights13")
airports <- mongo(collection = "airports", db = "nycflights13")

# inspect the data
( airlines_data <- as_tibble(airlines$find()) )

## # A tibble: 16 x 2
##    carrier name                       
##    <chr>   <chr>                      
##  1 9E      Endeavor Air Inc.          
##  2 AA      American Airlines Inc.     
##  3 AS      Alaska Airlines Inc.       
##  4 B6      JetBlue Airways            
##  5 DL      Delta Air Lines Inc.       
##  6 EV      ExpressJet Airlines Inc.   
##  7 F9      Frontier Airlines Inc.     
##  8 FL      AirTran Airways Corporation
##  9 HA      Hawaiian Airlines Inc.     
## 10 MQ      Envoy Air                  
## 11 OO      SkyWest Airlines Inc.      
## 12 UA      United Air Lines Inc.      
## 13 US      US Airways Inc.            
## 14 VX      Virgin America             
## 15 WN      Southwest Airlines Co.     
## 16 YV      Mesa Airlines Inc.

( airports_data <- as_tibble(airports$find()) )

## # A tibble: 38 x 8
##    faa   name                    lat    lon   alt    tz dst   tzone        
##    <chr> <chr>                 <dbl>  <dbl> <int> <int> <chr> <chr>        
##  1 04G   Lansdowne Airport      41.1  -80.6  1044    -5 A     America/New_~
##  2 06A   Moton Field Municipa~  32.5  -85.7   264    -6 A     America/Chic~
##  3 06C   Schaumburg Regional    42.0  -88.1   801    -6 A     America/Chic~
##  4 06N   Randall Airport        41.4  -74.4   523    -5 A     America/New_~
##  5 09J   Jekyll Island Airport  31.1  -81.4    11    -5 A     America/New_~
##  6 0A9   Elizabethton Municip~  36.4  -82.2  1593    -5 A     America/New_~
##  7 0G6   Williams County Airp~  41.5  -84.5   730    -5 A     America/New_~
##  8 0G7   Finger Lakes Regiona~  42.9  -76.8   492    -5 A     America/New_~
##  9 0P2   Shoestring Aviation ~  39.8  -76.6  1000    -5 U     America/New_~
## 10 0S9   Jefferson County Intl  48.1 -123.    108    -8 A     America/Los_~
## # ... with 28 more rows

airlines_rs = dbSendQuery(sqldb, 'select * from airlines')
airlines.df <- fetch(airlines_rs)
kable(head(airlines.df))

carrier	name
9E	Endeavor Air Inc.
AA	American Airlines Inc.
AS	Alaska Airlines Inc.
B6	JetBlue Airways
DL	Delta Air Lines Inc.
EV	ExpressJet Airlines Inc.

airports_rs = dbSendQuery(sqldb, 'select * from airports')
airports.df <- fetch(airports_rs)
kable(head(airports.df))

faa	name	lat	lon	alt	tz	dst	tzone
04G	Lansdowne Airport	41.13047	-80.61958	1044	-5	A	America/New_York
06A	Moton Field Municipal Airport	32.46057	-85.68003	264	-6	A	America/Chicago
06C	Schaumburg Regional	41.98934	-88.10124	801	-6	A	America/Chicago
06N	Randall Airport	41.43191	-74.39156	523	-5	A	America/New_York
09J	Jekyll Island Airport	31.07447	-81.42778	11	-5	A	America/New_York
0A9	Elizabethton Municipal Airport	36.37122	-82.17342	1593	-5	A	America/New_York

data607 <- mongo("data607")

data607

## <Mongo collection> 'data607' 
##  $aggregate(pipeline = "{}", options = "{\"allowDiskUse\":true}", handler = NULL, pagesize = 1000, iterate = FALSE) 
##  $count(query = "{}") 
##  $disconnect(gc = TRUE) 
##  $distinct(key, query = "{}") 
##  $drop() 
##  $export(con = stdout(), bson = FALSE, query = "{}", fields = "{}", sort = "{\"_id\":1}") 
##  $find(query = "{}", fields = "{\"_id\":0}", sort = "{}", skip = 0, limit = 0, handler = NULL, pagesize = 1000) 
##  $import(con, bson = FALSE) 
##  $index(add = NULL, remove = NULL) 
##  $info() 
##  $insert(data, pagesize = 1000, stop_on_error = TRUE, ...) 
##  $iterate(query = "{}", fields = "{\"_id\":0}", sort = "{}", skip = 0, limit = 0) 
##  $mapreduce(map, reduce, query = "{}", sort = "{}", limit = 0, out = NULL, scope = NULL) 
##  $remove(query, just_one = FALSE) 
##  $rename(name, db = NULL) 
##  $replace(query, update = "{}", upsert = FALSE) 
##  $run(command = "{\"ping\": 1}", simplify = TRUE) 
##  $update(query, update = "{\"$set\":{}}", filters = NULL, upsert = FALSE, multiple = FALSE)

insert data

data607$insert(airports.df)

## List of 5
##  $ nInserted  : num 38
##  $ nMatched   : num 0
##  $ nRemoved   : num 0
##  $ nUpserted  : num 0
##  $ writeErrors: list()

data607$insert(airlines.df)

## List of 5
##  $ nInserted  : num 16
##  $ nMatched   : num 0
##  $ nRemoved   : num 0
##  $ nUpserted  : num 0
##  $ writeErrors: list()

Number of Records inserted

data607$count()

## [1] 508

data607$iterate()$one()

## $faa
## [1] "04G"
## 
## $name
## [1] "Lansdowne Airport"
## 
## $lat
## [1] 41.13047
## 
## $lon
## [1] -80.61958
## 
## $alt
## [1] 1044
## 
## $tz
## [1] -5
## 
## $dst
## [1] "A"
## 
## $tzone
## [1] "America/New_York"

MySQL vs. MongoDB

It is clear that both MySQL and MongoDB have their own advantages and disadvantages.

Advantages of MongoDB:

Flexible Database
Sharding
High Speed
High Availability
Scalability
Ad-hoc Query support
Easy Environment Setup

Disadvantages of MongoDB:

Joins not supported
High memory usage
Limited data size
Limited nesting

Advantages of MySQL:

Affordability
Widely adopted (Extremely popular)
Increasing the performance of the application
Well supported software

Disadvantages of MySQL:

Not suitable for large sized data
Hard to scale