Introduction

For this assignment, I take information from a relational database (MySQL) and migrate it to a NoSQL database (MongoDB). For the relational database I am using nycflights13 package. nycflights13 package has airline on-time data for all flights departing NYC in 2013. Also includes useful ‘metadata’ on airlines, airports, weather, and planes.

library(DBI)
library(RMySQL)
library(tidyverse)
## -- Attaching packages ------------------------------------------------------------------------------- tidyverse 1.2.1 --
## v ggplot2 3.2.1     v purrr   0.3.2
## v tibble  2.1.3     v dplyr   0.8.3
## v tidyr   0.8.3     v stringr 1.4.0
## v readr   1.3.1     v forcats 0.4.0
## -- Conflicts ---------------------------------------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(knitr)
library(mongolite)

Inspect data from nycflights13

library(nycflights13)
dim(nycflights13::airlines)
## [1] 16  2
dim(nycflights13::airports)
## [1] 1458    8
head(nycflights13::airports)
## # A tibble: 6 x 8
##   faa   name                       lat   lon   alt    tz dst   tzone       
##   <chr> <chr>                    <dbl> <dbl> <dbl> <dbl> <chr> <chr>       
## 1 04G   Lansdowne Airport         41.1 -80.6  1044    -5 A     America/New~
## 2 06A   Moton Field Municipal A~  32.5 -85.7   264    -6 A     America/Chi~
## 3 06C   Schaumburg Regional       42.0 -88.1   801    -6 A     America/Chi~
## 4 06N   Randall Airport           41.4 -74.4   523    -5 A     America/New~
## 5 09J   Jekyll Island Airport     31.1 -81.4    11    -5 A     America/New~
## 6 0A9   Elizabethton Municipal ~  36.4 -82.2  1593    -5 A     America/New~

Export data into .csv files

In order to convert the datasets into slq database, first we convert them to .csv files. After we export nycflights13 datasets into local file, we can switch to MySQL to load them into relational database using table Table data import wizard.

Connecting to MySQL

sqldb <- dbConnect(MySQL(),
                  user = "root", 
                  password = "spring2019",
                  dbname = "nycflights",
                  host = "localhost"
                     )
tables <- dbListTables(sqldb)
tables
## [1] "airlines" "airports"
# Query to load the data from airline and airport tables
query <- sprintf("SELECT * FROM %s", tables)
query
## [1] "SELECT * FROM airlines" "SELECT * FROM airports"
temp1 <- rep(list(NA), length(tables))            

# load tables into temp dataframes
for (j in 1:length(tables)) {
    temp1[[j]] <- as_tibble(dbGetQuery(sqldb, query[j]))
}

temp1
## [[1]]
## # A tibble: 16 x 2
##    carrier name                       
##    <chr>   <chr>                      
##  1 9E      Endeavor Air Inc.          
##  2 AA      American Airlines Inc.     
##  3 AS      Alaska Airlines Inc.       
##  4 B6      JetBlue Airways            
##  5 DL      Delta Air Lines Inc.       
##  6 EV      ExpressJet Airlines Inc.   
##  7 F9      Frontier Airlines Inc.     
##  8 FL      AirTran Airways Corporation
##  9 HA      Hawaiian Airlines Inc.     
## 10 MQ      Envoy Air                  
## 11 OO      SkyWest Airlines Inc.      
## 12 UA      United Air Lines Inc.      
## 13 US      US Airways Inc.            
## 14 VX      Virgin America             
## 15 WN      Southwest Airlines Co.     
## 16 YV      Mesa Airlines Inc.         
## 
## [[2]]
## # A tibble: 38 x 8
##    faa   name                    lat    lon   alt    tz dst   tzone        
##    <chr> <chr>                 <dbl>  <dbl> <int> <int> <chr> <chr>        
##  1 04G   Lansdowne Airport      41.1  -80.6  1044    -5 A     America/New_~
##  2 06A   Moton Field Municipa~  32.5  -85.7   264    -6 A     America/Chic~
##  3 06C   Schaumburg Regional    42.0  -88.1   801    -6 A     America/Chic~
##  4 06N   Randall Airport        41.4  -74.4   523    -5 A     America/New_~
##  5 09J   Jekyll Island Airport  31.1  -81.4    11    -5 A     America/New_~
##  6 0A9   Elizabethton Municip~  36.4  -82.2  1593    -5 A     America/New_~
##  7 0G6   Williams County Airp~  41.5  -84.5   730    -5 A     America/New_~
##  8 0G7   Finger Lakes Regiona~  42.9  -76.8   492    -5 A     America/New_~
##  9 0P2   Shoestring Aviation ~  39.8  -76.6  1000    -5 U     America/New_~
## 10 0S9   Jefferson County Intl  48.1 -123.    108    -8 A     America/Los_~
## # ... with 28 more rows
# initialize temp tables
temp2 <- rep(list(NA), length(tables))            

# loop through each table and save into each collection
for (j in 1:length(tables)) {
    # connect to collection in mongodb 
    temp2[[j]] <- mongo(collection = tables[j], db = "nycflights13")

    # drop collection if already exists
    if (temp2[[j]]$count() > 0) {
        temp2[[j]]$drop() 
    }

    # insert data
    temp2[[j]]$insert(temp1[[j]])

    # stop if row counts are not identical
    stopifnot(temp2[[j]]$count() == nrow(temp1[[j]]))
}

Now we have two tables airline and airport loaded as dataframes in temp 1.

Migrating data into MongoDB

Save each dataframe as a separate collection in mongodb. And then inspect them to makesure all the data migrated correctly.

airlines <- mongo(collection = "airlines", db = "nycflights13")
airports <- mongo(collection = "airports", db = "nycflights13")

# inspect the data
( airlines_data <- as_tibble(airlines$find()) )
## # A tibble: 16 x 2
##    carrier name                       
##    <chr>   <chr>                      
##  1 9E      Endeavor Air Inc.          
##  2 AA      American Airlines Inc.     
##  3 AS      Alaska Airlines Inc.       
##  4 B6      JetBlue Airways            
##  5 DL      Delta Air Lines Inc.       
##  6 EV      ExpressJet Airlines Inc.   
##  7 F9      Frontier Airlines Inc.     
##  8 FL      AirTran Airways Corporation
##  9 HA      Hawaiian Airlines Inc.     
## 10 MQ      Envoy Air                  
## 11 OO      SkyWest Airlines Inc.      
## 12 UA      United Air Lines Inc.      
## 13 US      US Airways Inc.            
## 14 VX      Virgin America             
## 15 WN      Southwest Airlines Co.     
## 16 YV      Mesa Airlines Inc.
( airports_data <- as_tibble(airports$find()) )
## # A tibble: 38 x 8
##    faa   name                    lat    lon   alt    tz dst   tzone        
##    <chr> <chr>                 <dbl>  <dbl> <int> <int> <chr> <chr>        
##  1 04G   Lansdowne Airport      41.1  -80.6  1044    -5 A     America/New_~
##  2 06A   Moton Field Municipa~  32.5  -85.7   264    -6 A     America/Chic~
##  3 06C   Schaumburg Regional    42.0  -88.1   801    -6 A     America/Chic~
##  4 06N   Randall Airport        41.4  -74.4   523    -5 A     America/New_~
##  5 09J   Jekyll Island Airport  31.1  -81.4    11    -5 A     America/New_~
##  6 0A9   Elizabethton Municip~  36.4  -82.2  1593    -5 A     America/New_~
##  7 0G6   Williams County Airp~  41.5  -84.5   730    -5 A     America/New_~
##  8 0G7   Finger Lakes Regiona~  42.9  -76.8   492    -5 A     America/New_~
##  9 0P2   Shoestring Aviation ~  39.8  -76.6  1000    -5 U     America/New_~
## 10 0S9   Jefferson County Intl  48.1 -123.    108    -8 A     America/Los_~
## # ... with 28 more rows
airlines_rs = dbSendQuery(sqldb, 'select * from airlines')
airlines.df <- fetch(airlines_rs)
kable(head(airlines.df))
carrier name
9E Endeavor Air Inc.
AA American Airlines Inc.
AS Alaska Airlines Inc.
B6 JetBlue Airways
DL Delta Air Lines Inc.
EV ExpressJet Airlines Inc.
airports_rs = dbSendQuery(sqldb, 'select * from airports')
airports.df <- fetch(airports_rs)
kable(head(airports.df))
faa name lat lon alt tz dst tzone
04G Lansdowne Airport 41.13047 -80.61958 1044 -5 A America/New_York
06A Moton Field Municipal Airport 32.46057 -85.68003 264 -6 A America/Chicago
06C Schaumburg Regional 41.98934 -88.10124 801 -6 A America/Chicago
06N Randall Airport 41.43191 -74.39156 523 -5 A America/New_York
09J Jekyll Island Airport 31.07447 -81.42778 11 -5 A America/New_York
0A9 Elizabethton Municipal Airport 36.37122 -82.17342 1593 -5 A America/New_York
data607 <- mongo("data607")
data607
## <Mongo collection> 'data607' 
##  $aggregate(pipeline = "{}", options = "{\"allowDiskUse\":true}", handler = NULL, pagesize = 1000, iterate = FALSE) 
##  $count(query = "{}") 
##  $disconnect(gc = TRUE) 
##  $distinct(key, query = "{}") 
##  $drop() 
##  $export(con = stdout(), bson = FALSE, query = "{}", fields = "{}", sort = "{\"_id\":1}") 
##  $find(query = "{}", fields = "{\"_id\":0}", sort = "{}", skip = 0, limit = 0, handler = NULL, pagesize = 1000) 
##  $import(con, bson = FALSE) 
##  $index(add = NULL, remove = NULL) 
##  $info() 
##  $insert(data, pagesize = 1000, stop_on_error = TRUE, ...) 
##  $iterate(query = "{}", fields = "{\"_id\":0}", sort = "{}", skip = 0, limit = 0) 
##  $mapreduce(map, reduce, query = "{}", sort = "{}", limit = 0, out = NULL, scope = NULL) 
##  $remove(query, just_one = FALSE) 
##  $rename(name, db = NULL) 
##  $replace(query, update = "{}", upsert = FALSE) 
##  $run(command = "{\"ping\": 1}", simplify = TRUE) 
##  $update(query, update = "{\"$set\":{}}", filters = NULL, upsert = FALSE, multiple = FALSE)

insert data

data607$insert(airports.df)
## List of 5
##  $ nInserted  : num 38
##  $ nMatched   : num 0
##  $ nRemoved   : num 0
##  $ nUpserted  : num 0
##  $ writeErrors: list()
data607$insert(airlines.df)
## List of 5
##  $ nInserted  : num 16
##  $ nMatched   : num 0
##  $ nRemoved   : num 0
##  $ nUpserted  : num 0
##  $ writeErrors: list()

Number of Records inserted

data607$count()
## [1] 508
data607$iterate()$one()
## $faa
## [1] "04G"
## 
## $name
## [1] "Lansdowne Airport"
## 
## $lat
## [1] 41.13047
## 
## $lon
## [1] -80.61958
## 
## $alt
## [1] 1044
## 
## $tz
## [1] -5
## 
## $dst
## [1] "A"
## 
## $tzone
## [1] "America/New_York"

MySQL vs. MongoDB

It is clear that both MySQL and MongoDB have their own advantages and disadvantages.

Advantages of MongoDB:

Flexible Database
Sharding
High Speed
High Availability
Scalability
Ad-hoc Query support
Easy Environment Setup

Disadvantages of MongoDB:

Joins not supported
High memory usage
Limited data size
Limited nesting

Advantages of MySQL:

Affordability
Widely adopted (Extremely popular)
Increasing the performance of the application
Well supported software

Disadvantages of MySQL:

Not suitable for large sized data
Hard to scale