NoSQL migration

Data acquisition and management

CUNY MSDS DATA 607

Rose Koh

2018/04/25

The database used in this assignment is nycflight13, This package contains information about all flights that departed from NYC (e.g. EWR, JFK and LGA) in 2013: 336,776 flights in total. The migration was done from RDB to NoSQL(Neo4J, MongoDB). There is also a brief description of the advantages and disadvantages of storing the data in a relational database vs. NoSQL database.


Install | Load Packages

library(DBI)
library(RPostgres)
library(knitr)

Flights data

#install.packages("nycflights13")
library(nycflights13)

# check number of rows of all dataset
nrow(nycflights13::airlines)
## [1] 16
nrow(nycflights13::airports)
## [1] 1458
nrow(nycflights13::flights)
## [1] 336776
nrow(nycflights13::planes)
## [1] 3322
nrow(nycflights13::weather)
## [1] 26130
head(nycflights13::airlines)
## # A tibble: 6 x 2
##   carrier name                    
##   <chr>   <chr>                   
## 1 9E      Endeavor Air Inc.       
## 2 AA      American Airlines Inc.  
## 3 AS      Alaska Airlines Inc.    
## 4 B6      JetBlue Airways         
## 5 DL      Delta Air Lines Inc.    
## 6 EV      ExpressJet Airlines Inc.
head(nycflights13::airports)
## # A tibble: 6 x 8
##   faa   name                             lat   lon   alt    tz dst   tzone
##   <chr> <chr>                          <dbl> <dbl> <int> <dbl> <chr> <chr>
## 1 04G   Lansdowne Airport               41.1 -80.6  1044   -5. A     Amer…
## 2 06A   Moton Field Municipal Airport   32.5 -85.7   264   -6. A     Amer…
## 3 06C   Schaumburg Regional             42.0 -88.1   801   -6. A     Amer…
## 4 06N   Randall Airport                 41.4 -74.4   523   -5. A     Amer…
## 5 09J   Jekyll Island Airport           31.1 -81.4    11   -5. A     Amer…
## 6 0A9   Elizabethton Municipal Airport  36.4 -82.2  1593   -5. A     Amer…
head(nycflights13::flights)
## # A tibble: 6 x 19
##    year month   day dep_time sched_dep_time dep_delay arr_time
##   <int> <int> <int>    <int>          <int>     <dbl>    <int>
## 1  2013     1     1      517            515        2.      830
## 2  2013     1     1      533            529        4.      850
## 3  2013     1     1      542            540        2.      923
## 4  2013     1     1      544            545       -1.     1004
## 5  2013     1     1      554            600       -6.      812
## 6  2013     1     1      554            558       -4.      740
## # ... with 12 more variables: sched_arr_time <int>, arr_delay <dbl>,
## #   carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## #   air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>,
## #   time_hour <dttm>
head(nycflights13::planes)
## # A tibble: 6 x 9
##   tailnum  year type       manufacturer  model  engines seats speed engine
##   <chr>   <int> <chr>      <chr>         <chr>    <int> <int> <int> <chr> 
## 1 N10156   2004 Fixed win… EMBRAER       EMB-1…       2    55    NA Turbo…
## 2 N102UW   1998 Fixed win… AIRBUS INDUS… A320-…       2   182    NA Turbo…
## 3 N103US   1999 Fixed win… AIRBUS INDUS… A320-…       2   182    NA Turbo…
## 4 N104UW   1999 Fixed win… AIRBUS INDUS… A320-…       2   182    NA Turbo…
## 5 N10575   2002 Fixed win… EMBRAER       EMB-1…       2    55    NA Turbo…
## 6 N105UW   1999 Fixed win… AIRBUS INDUS… A320-…       2   182    NA Turbo…
head(nycflights13::weather)
## # A tibble: 6 x 15
##   origin  year month   day  hour  temp  dewp humid wind_dir wind_speed
##   <chr>  <dbl> <dbl> <int> <int> <dbl> <dbl> <dbl>    <dbl>      <dbl>
## 1 EWR    2013.    1.     1     0  37.0  21.9  54.0     230.       10.4
## 2 EWR    2013.    1.     1     1  37.0  21.9  54.0     230.       13.8
## 3 EWR    2013.    1.     1     2  37.9  21.9  52.1     230.       12.7
## 4 EWR    2013.    1.     1     3  37.9  23.0  54.5     230.       13.8
## 5 EWR    2013.    1.     1     4  37.9  24.1  57.0     240.       15.0
## 6 EWR    2013.    1.     1     6  39.0  26.1  59.4     270.       10.4
## # ... with 5 more variables: wind_gust <dbl>, precip <dbl>,
## #   pressure <dbl>, visib <dbl>, time_hour <dttm>
airlines <- na.omit(airlines)
airports <- na.omit(airports)
planes <- na.omit(planes)       
flights <- na.omit(flights)
flights <- flights[1:3000,]
weather <- na.omit(weather)
weather <- weather[1:3000,]

Connect to the AWS RDS

source("./source_lib/aws_rep_connect.R")
con <- dbConnect(RPostgres::Postgres(),
                 host = host,
                 port = port,
                 dbname = dbname,
                 user = user,
                 password = password)

RDB operations

# Connection check
is.null(con)
## [1] FALSE
# List tables in DB
dbListTables(con)
## character(0)
# Wrtie table
dbWriteTable(con, name='airlines', value=airlines)
dbWriteTable(con, name='airports', value=airports)
dbWriteTable(con, name='flights', value=flights)
dbWriteTable(con, name='planes', value=planes)
dbWriteTable(con, name='weather', value=weather)
# # Query the table
rs <- dbSendQuery(con, statement="select * from airlines limit 10;")

# Fetch all result
data <- dbFetch(rs, n= -1)

# Change it as data table
kable(head(data))
carrier name
9E Endeavor Air Inc.
AA American Airlines Inc.
AS Alaska Airlines Inc.
B6 JetBlue Airways
DL Delta Air Lines Inc.
EV ExpressJet Airlines Inc.

RNeo4j

#devtools::install_github("nicolewhite/RNeo4j")
library(RNeo4j)
source("./source_lib/neo4j_connect.R")

Connect

# To reproduce, please start your Neo4J desktop and replace your username and password in local database.
graph = startGraph("http://localhost:7474/db/data/", username = neo4j.username, password = neo4j.password)

RNeo4j Operations

# Clear the graph
clear(graph, input = FALSE)
# colnames(airlines)
# colnames(airports)
# colnames(flights)
# colnames(planes)
# colnames(weather)

Define unique constraints

addConstraint(graph, "Airlines", "carrier")
addConstraint(graph, "Airports", "faa")
addConstraint(graph, "Flights", "flight")
addConstraint(graph, "Planes", "tailnum")
addConstraint(graph, "Weather", "origin")

Build database

# Create node

airlines.n <- createNode(graph, "Airlines", carrier = airlines$carrier, names = airlines$name)


airports.n <- createNode(graph, "Airports", faa = airports$faa, name = airports$name, 
                            lat = airports$lat, lon = airports$lon, alt = airports$alt, 
                            tz = airports$tz, dst = airports$dst, tzone = airports$tzone)

flights.n <- createNode(graph, "Flights", flight = flights$year, month = flights$month, day = flights$day, dep_time = flights$dep_time,
                           sched_dep_time = flights$sched_dep_time, dep_delay = flights$dep_delay, arr_time = flights$arr_time,
                           sched_arr_time = flights$sched_arr_time, arr_delay = flights$arr_delay, carrier = flights$carrier,
                           tailnum = flights$tailnum, origin = flights$origin, dest = flights$dest, air_time = flights$air_time,
                           distance = flights$distance, hour = flights$hour, minute = flights$minute, time_hour = flights$time_hour)

planes.n <- createNode(graph, "Planes", tailnum = planes$tailnum, year = planes$year, type = planes$type,
                          manufacturer = planes$manufacturer, model = planes$model, engines = planes$engines, 
                          seats = planes$seats, speed = planes$speed, engine = planes$engine)

weather.n <- createNode(graph, "Weather", origin = weather$origin, year = weather$year, month = weather$month,
                           day = weather$day, hour = weather$hour, temp = weather$temp, dewp = weather$dewp, 
                           humid = weather$humid, wind_dir = weather$wind_dir, wind_speed = weather$wind_speed,
                           wind_gust = weather$wind_gust, precip = weather$precip, pressure = weather$pressure, 
                           visib = weather$visib, time_hour = weather$time_hour)
# Create relations

createRel(airports.n, "Airports_flights", flights.n)
## < Relationship > 
## Airports_flights
createRel(airports.n, "Airports_airlines", airlines.n)
## < Relationship > 
## Airports_airlines
createRel(airlines.n, "Airlines_flights", flights.n)
## < Relationship > 
## Airlines_flights
createRel(airports.n, "Airports_weather", weather.n)
## < Relationship > 
## Airports_weather
createRel(planes.n, "Planes_flights", flights.n)
## < Relationship > 
## Planes_flights
summary(graph)
##       This                To     That
## 1 Airlines  Airlines_flights  Flights
## 2 Airports  Airports_weather  Weather
## 3 Airports Airports_airlines Airlines
## 4 Airports  Airports_flights  Flights
## 5   Planes    Planes_flights  Flights

Mongodb

library(mongolite)

mon.a <- mongo(collection = "nycflight13", db = "airlines")
mon.b <- mongo(collection = "nycflight13", db = "airports")
mon.c <- mongo(collection = "nycflight13", db = "flights")
mon.d <- mongo(collection = "nycflight13", db = "planes")
mon.e <- mongo(collection = "nycflight13", db = "weather")

mon.a$insert(airlines)
## List of 5
##  $ nInserted  : num 16
##  $ nMatched   : num 0
##  $ nRemoved   : num 0
##  $ nUpserted  : num 0
##  $ writeErrors: list()
mon.b$insert(airports)
## List of 5
##  $ nInserted  : num 1458
##  $ nMatched   : num 0
##  $ nRemoved   : num 0
##  $ nUpserted  : num 0
##  $ writeErrors: list()
mon.c$insert(flights)
## List of 5
##  $ nInserted  : num 3000
##  $ nMatched   : num 0
##  $ nRemoved   : num 0
##  $ nUpserted  : num 0
##  $ writeErrors: list()
mon.d$insert(planes)
## List of 5
##  $ nInserted  : num 23
##  $ nMatched   : num 0
##  $ nRemoved   : num 0
##  $ nUpserted  : num 0
##  $ writeErrors: list()
mon.e$insert(weather)
## List of 5
##  $ nInserted  : num 3000
##  $ nMatched   : num 0
##  $ nRemoved   : num 0
##  $ nUpserted  : num 0
##  $ writeErrors: list()
mon.a$count()
## [1] 80
mon.a.result <- mon.a$find()
mon.a.result
##    carrier                        name
## 1       9E           Endeavor Air Inc.
## 2       AA      American Airlines Inc.
## 3       AS        Alaska Airlines Inc.
## 4       B6             JetBlue Airways
## 5       DL        Delta Air Lines Inc.
## 6       EV    ExpressJet Airlines Inc.
## 7       F9      Frontier Airlines Inc.
## 8       FL AirTran Airways Corporation
## 9       HA      Hawaiian Airlines Inc.
## 10      MQ                   Envoy Air
## 11      OO       SkyWest Airlines Inc.
## 12      UA       United Air Lines Inc.
## 13      US             US Airways Inc.
## 14      VX              Virgin America
## 15      WN      Southwest Airlines Co.
## 16      YV          Mesa Airlines Inc.
## 17      9E           Endeavor Air Inc.
## 18      AA      American Airlines Inc.
## 19      AS        Alaska Airlines Inc.
## 20      B6             JetBlue Airways
## 21      DL        Delta Air Lines Inc.
## 22      EV    ExpressJet Airlines Inc.
## 23      F9      Frontier Airlines Inc.
## 24      FL AirTran Airways Corporation
## 25      HA      Hawaiian Airlines Inc.
## 26      MQ                   Envoy Air
## 27      OO       SkyWest Airlines Inc.
## 28      UA       United Air Lines Inc.
## 29      US             US Airways Inc.
## 30      VX              Virgin America
## 31      WN      Southwest Airlines Co.
## 32      YV          Mesa Airlines Inc.
## 33      9E           Endeavor Air Inc.
## 34      AA      American Airlines Inc.
## 35      AS        Alaska Airlines Inc.
## 36      B6             JetBlue Airways
## 37      DL        Delta Air Lines Inc.
## 38      EV    ExpressJet Airlines Inc.
## 39      F9      Frontier Airlines Inc.
## 40      FL AirTran Airways Corporation
## 41      HA      Hawaiian Airlines Inc.
## 42      MQ                   Envoy Air
## 43      OO       SkyWest Airlines Inc.
## 44      UA       United Air Lines Inc.
## 45      US             US Airways Inc.
## 46      VX              Virgin America
## 47      WN      Southwest Airlines Co.
## 48      YV          Mesa Airlines Inc.
## 49      9E           Endeavor Air Inc.
## 50      AA      American Airlines Inc.
## 51      AS        Alaska Airlines Inc.
## 52      B6             JetBlue Airways
## 53      DL        Delta Air Lines Inc.
## 54      EV    ExpressJet Airlines Inc.
## 55      F9      Frontier Airlines Inc.
## 56      FL AirTran Airways Corporation
## 57      HA      Hawaiian Airlines Inc.
## 58      MQ                   Envoy Air
## 59      OO       SkyWest Airlines Inc.
## 60      UA       United Air Lines Inc.
## 61      US             US Airways Inc.
## 62      VX              Virgin America
## 63      WN      Southwest Airlines Co.
## 64      YV          Mesa Airlines Inc.
## 65      9E           Endeavor Air Inc.
## 66      AA      American Airlines Inc.
## 67      AS        Alaska Airlines Inc.
## 68      B6             JetBlue Airways
## 69      DL        Delta Air Lines Inc.
## 70      EV    ExpressJet Airlines Inc.
## 71      F9      Frontier Airlines Inc.
## 72      FL AirTran Airways Corporation
## 73      HA      Hawaiian Airlines Inc.
## 74      MQ                   Envoy Air
## 75      OO       SkyWest Airlines Inc.
## 76      UA       United Air Lines Inc.
## 77      US             US Airways Inc.
## 78      VX              Virgin America
## 79      WN      Southwest Airlines Co.
## 80      YV          Mesa Airlines Inc.
mon.b$count()
## [1] 5832
mon.b.result <- mon.b$find()
#mon.b.result

mon.c$count()
## [1] 12000
mon.c.result <- mon.c$find()
#mon.c.result

mon.d$count()
## [1] 92
mon.d.result <- mon.d$find()
#mon.d.result

mon.e$count()
## [1] 12000
mon.e.result <- mon.e$find()
#mon.e.result

Clear the result and Disconnect

# Remove, clear, disconnect
dbExistsTable(con, "airlines")
## Warning in result_create(conn@ptr, statement): Closing open result set,
## cancelling previous query
## [1] TRUE
dbRemoveTable(con, "airlines")
dbExistsTable(con, "airports")
## [1] TRUE
dbRemoveTable(con, "airports")
dbExistsTable(con, "flights")
## [1] TRUE
dbRemoveTable(con, "flights")
dbExistsTable(con, "planes")
## [1] TRUE
dbRemoveTable(con, "planes")
dbExistsTable(con, "weather")
## [1] TRUE
dbRemoveTable(con, "weather")

dbClearResult(rs)
## Warning: Expired, result set already closed
dbDisconnect(con)

Conclusion

Describe the advantages and disadvantages of storing the data in a relational database vs. your NoSQL database.

  • RDB
  • NoSQL

After all, Between SQL and NoSQL the high-level differences exists – SQL databases are vertically scalable whereas the NoSQL databases are horizontally scalable.

References

RNeo4j

RNeo4j