For this assignment, I take information from a relational database (MySQL) and migrate it to a NoSQL database (MongoDB). For the relational database I am using nycflights13 package. nycflights13 package has airline on-time data for all flights departing NYC in 2013. Also includes useful ‘metadata’ on airlines, airports, weather, and planes.
library(DBI)
library(RMySQL)
library(tidyverse)
## -- Attaching packages ------------------------------------------------------------------------------- tidyverse 1.2.1 --
## v ggplot2 3.2.1 v purrr 0.3.2
## v tibble 2.1.3 v dplyr 0.8.3
## v tidyr 0.8.3 v stringr 1.4.0
## v readr 1.3.1 v forcats 0.4.0
## -- Conflicts ---------------------------------------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(knitr)
library(mongolite)
library(nycflights13)
dim(nycflights13::airlines)
## [1] 16 2
dim(nycflights13::airports)
## [1] 1458 8
head(nycflights13::airports)
## # A tibble: 6 x 8
## faa name lat lon alt tz dst tzone
## <chr> <chr> <dbl> <dbl> <dbl> <dbl> <chr> <chr>
## 1 04G Lansdowne Airport 41.1 -80.6 1044 -5 A America/New~
## 2 06A Moton Field Municipal A~ 32.5 -85.7 264 -6 A America/Chi~
## 3 06C Schaumburg Regional 42.0 -88.1 801 -6 A America/Chi~
## 4 06N Randall Airport 41.4 -74.4 523 -5 A America/New~
## 5 09J Jekyll Island Airport 31.1 -81.4 11 -5 A America/New~
## 6 0A9 Elizabethton Municipal ~ 36.4 -82.2 1593 -5 A America/New~
In order to convert the datasets into slq database, first we convert them to .csv files. After we export nycflights13 datasets into local file, we can switch to MySQL to load them into relational database using table Table data import wizard.
sqldb <- dbConnect(MySQL(),
user = "root",
password = "spring2019",
dbname = "nycflights",
host = "localhost"
)
tables <- dbListTables(sqldb)
tables
## [1] "airlines" "airports"
# Query to load the data from airline and airport tables
query <- sprintf("SELECT * FROM %s", tables)
query
## [1] "SELECT * FROM airlines" "SELECT * FROM airports"
temp1 <- rep(list(NA), length(tables))
# load tables into temp dataframes
for (j in 1:length(tables)) {
temp1[[j]] <- as_tibble(dbGetQuery(sqldb, query[j]))
}
temp1
## [[1]]
## # A tibble: 16 x 2
## carrier name
## <chr> <chr>
## 1 9E Endeavor Air Inc.
## 2 AA American Airlines Inc.
## 3 AS Alaska Airlines Inc.
## 4 B6 JetBlue Airways
## 5 DL Delta Air Lines Inc.
## 6 EV ExpressJet Airlines Inc.
## 7 F9 Frontier Airlines Inc.
## 8 FL AirTran Airways Corporation
## 9 HA Hawaiian Airlines Inc.
## 10 MQ Envoy Air
## 11 OO SkyWest Airlines Inc.
## 12 UA United Air Lines Inc.
## 13 US US Airways Inc.
## 14 VX Virgin America
## 15 WN Southwest Airlines Co.
## 16 YV Mesa Airlines Inc.
##
## [[2]]
## # A tibble: 38 x 8
## faa name lat lon alt tz dst tzone
## <chr> <chr> <dbl> <dbl> <int> <int> <chr> <chr>
## 1 04G Lansdowne Airport 41.1 -80.6 1044 -5 A America/New_~
## 2 06A Moton Field Municipa~ 32.5 -85.7 264 -6 A America/Chic~
## 3 06C Schaumburg Regional 42.0 -88.1 801 -6 A America/Chic~
## 4 06N Randall Airport 41.4 -74.4 523 -5 A America/New_~
## 5 09J Jekyll Island Airport 31.1 -81.4 11 -5 A America/New_~
## 6 0A9 Elizabethton Municip~ 36.4 -82.2 1593 -5 A America/New_~
## 7 0G6 Williams County Airp~ 41.5 -84.5 730 -5 A America/New_~
## 8 0G7 Finger Lakes Regiona~ 42.9 -76.8 492 -5 A America/New_~
## 9 0P2 Shoestring Aviation ~ 39.8 -76.6 1000 -5 U America/New_~
## 10 0S9 Jefferson County Intl 48.1 -123. 108 -8 A America/Los_~
## # ... with 28 more rows
# initialize temp tables
temp2 <- rep(list(NA), length(tables))
# loop through each table and save into each collection
for (j in 1:length(tables)) {
# connect to collection in mongodb
temp2[[j]] <- mongo(collection = tables[j], db = "nycflights13")
# drop collection if already exists
if (temp2[[j]]$count() > 0) {
temp2[[j]]$drop()
}
# insert data
temp2[[j]]$insert(temp1[[j]])
# stop if row counts are not identical
stopifnot(temp2[[j]]$count() == nrow(temp1[[j]]))
}
Now we have two tables airline and airport loaded as dataframes in temp 1.
Save each dataframe as a separate collection in mongodb. And then inspect them to makesure all the data migrated correctly.
airlines <- mongo(collection = "airlines", db = "nycflights13")
airports <- mongo(collection = "airports", db = "nycflights13")
# inspect the data
( airlines_data <- as_tibble(airlines$find()) )
## # A tibble: 16 x 2
## carrier name
## <chr> <chr>
## 1 9E Endeavor Air Inc.
## 2 AA American Airlines Inc.
## 3 AS Alaska Airlines Inc.
## 4 B6 JetBlue Airways
## 5 DL Delta Air Lines Inc.
## 6 EV ExpressJet Airlines Inc.
## 7 F9 Frontier Airlines Inc.
## 8 FL AirTran Airways Corporation
## 9 HA Hawaiian Airlines Inc.
## 10 MQ Envoy Air
## 11 OO SkyWest Airlines Inc.
## 12 UA United Air Lines Inc.
## 13 US US Airways Inc.
## 14 VX Virgin America
## 15 WN Southwest Airlines Co.
## 16 YV Mesa Airlines Inc.
( airports_data <- as_tibble(airports$find()) )
## # A tibble: 38 x 8
## faa name lat lon alt tz dst tzone
## <chr> <chr> <dbl> <dbl> <int> <int> <chr> <chr>
## 1 04G Lansdowne Airport 41.1 -80.6 1044 -5 A America/New_~
## 2 06A Moton Field Municipa~ 32.5 -85.7 264 -6 A America/Chic~
## 3 06C Schaumburg Regional 42.0 -88.1 801 -6 A America/Chic~
## 4 06N Randall Airport 41.4 -74.4 523 -5 A America/New_~
## 5 09J Jekyll Island Airport 31.1 -81.4 11 -5 A America/New_~
## 6 0A9 Elizabethton Municip~ 36.4 -82.2 1593 -5 A America/New_~
## 7 0G6 Williams County Airp~ 41.5 -84.5 730 -5 A America/New_~
## 8 0G7 Finger Lakes Regiona~ 42.9 -76.8 492 -5 A America/New_~
## 9 0P2 Shoestring Aviation ~ 39.8 -76.6 1000 -5 U America/New_~
## 10 0S9 Jefferson County Intl 48.1 -123. 108 -8 A America/Los_~
## # ... with 28 more rows
airlines_rs = dbSendQuery(sqldb, 'select * from airlines')
airlines.df <- fetch(airlines_rs)
kable(head(airlines.df))
carrier | name |
---|---|
9E | Endeavor Air Inc. |
AA | American Airlines Inc. |
AS | Alaska Airlines Inc. |
B6 | JetBlue Airways |
DL | Delta Air Lines Inc. |
EV | ExpressJet Airlines Inc. |
airports_rs = dbSendQuery(sqldb, 'select * from airports')
airports.df <- fetch(airports_rs)
kable(head(airports.df))
faa | name | lat | lon | alt | tz | dst | tzone |
---|---|---|---|---|---|---|---|
04G | Lansdowne Airport | 41.13047 | -80.61958 | 1044 | -5 | A | America/New_York |
06A | Moton Field Municipal Airport | 32.46057 | -85.68003 | 264 | -6 | A | America/Chicago |
06C | Schaumburg Regional | 41.98934 | -88.10124 | 801 | -6 | A | America/Chicago |
06N | Randall Airport | 41.43191 | -74.39156 | 523 | -5 | A | America/New_York |
09J | Jekyll Island Airport | 31.07447 | -81.42778 | 11 | -5 | A | America/New_York |
0A9 | Elizabethton Municipal Airport | 36.37122 | -82.17342 | 1593 | -5 | A | America/New_York |
data607 <- mongo("data607")
data607
## <Mongo collection> 'data607'
## $aggregate(pipeline = "{}", options = "{\"allowDiskUse\":true}", handler = NULL, pagesize = 1000, iterate = FALSE)
## $count(query = "{}")
## $disconnect(gc = TRUE)
## $distinct(key, query = "{}")
## $drop()
## $export(con = stdout(), bson = FALSE, query = "{}", fields = "{}", sort = "{\"_id\":1}")
## $find(query = "{}", fields = "{\"_id\":0}", sort = "{}", skip = 0, limit = 0, handler = NULL, pagesize = 1000)
## $import(con, bson = FALSE)
## $index(add = NULL, remove = NULL)
## $info()
## $insert(data, pagesize = 1000, stop_on_error = TRUE, ...)
## $iterate(query = "{}", fields = "{\"_id\":0}", sort = "{}", skip = 0, limit = 0)
## $mapreduce(map, reduce, query = "{}", sort = "{}", limit = 0, out = NULL, scope = NULL)
## $remove(query, just_one = FALSE)
## $rename(name, db = NULL)
## $replace(query, update = "{}", upsert = FALSE)
## $run(command = "{\"ping\": 1}", simplify = TRUE)
## $update(query, update = "{\"$set\":{}}", filters = NULL, upsert = FALSE, multiple = FALSE)
data607$insert(airports.df)
## List of 5
## $ nInserted : num 38
## $ nMatched : num 0
## $ nRemoved : num 0
## $ nUpserted : num 0
## $ writeErrors: list()
data607$insert(airlines.df)
## List of 5
## $ nInserted : num 16
## $ nMatched : num 0
## $ nRemoved : num 0
## $ nUpserted : num 0
## $ writeErrors: list()
data607$count()
## [1] 508
data607$iterate()$one()
## $faa
## [1] "04G"
##
## $name
## [1] "Lansdowne Airport"
##
## $lat
## [1] 41.13047
##
## $lon
## [1] -80.61958
##
## $alt
## [1] 1044
##
## $tz
## [1] -5
##
## $dst
## [1] "A"
##
## $tzone
## [1] "America/New_York"
It is clear that both MySQL and MongoDB have their own advantages and disadvantages.
Advantages of MongoDB:
Flexible Database
Sharding
High Speed
High Availability
Scalability
Ad-hoc Query support
Easy Environment Setup
Disadvantages of MongoDB:
Joins not supported
High memory usage
Limited data size
Limited nesting
Advantages of MySQL:
Affordability
Widely adopted (Extremely popular)
Increasing the performance of the application
Well supported software
Disadvantages of MySQL:
Not suitable for large sized data
Hard to scale