NoSQL Data Store
Take information from a relational database and migrate it to a NoSQL database of your own choosing. For the relational database, you might use the flights database, the tb database, the “data skills” database your team created for Project 3, or another database of your own choosing or creation. For the NoSQL database, you may use MongoDB (which we introduced in week 7), Neo4j, or another NoSQL database of your choosing. Migration process needs to be reproducible. R code is encouraged, but not required. Briefly describe the advantages and disadvantages of storing the data in a relational database vs. your NoSQL database.
For this assignment I will use “nycflights13” database that was previously imported into PostgreSQL. For NoSQLdatabase I will use MongoDB
suppressWarnings({library(mongolite)})
suppressWarnings({library(RPostgreSQL)})
suppressWarnings({library(dplyr)})
suppressWarnings({library(tidyverse)})
drv <- dbDriver("PostgreSQL")
con <- dbConnect(
dbDriver('PostgreSQL'),
dbname = 'Flights',
host = 'localhost',
port = 5432,
user = 'postgres',
password = 'Ylua4786'
)
airlines <- dbGetQuery(con, "SELECT * FROM airlines")
airports <- dbGetQuery(con, "SELECT * FROM airports ")
flights <- dbGetQuery(con, "SELECT * FROM flights ")
planes <- dbGetQuery(con, "SELECT * FROM planes ")
weather <- dbGetQuery(con, "SELECT * FROM weather ")
head(airlines,3)
## carrier name
## 1 9E Endeavor Air Inc.
## 2 AA American Airlines Inc.
## 3 AS Alaska Airlines Inc.
head(airports,3)
## faa name lat lon alt tz dst
## 1 04G Lansdowne Airport 41.13047 -80.61958 1044 -5 A
## 2 06A Moton Field Municipal Airport 32.46057 -85.68003 264 -5 A
## 3 06C Schaumburg Regional 41.98934 -88.10124 801 -6 A
head(flights,3)
## year month day dep_time dep_delay arr_time arr_delay carrier tailnum
## 1 2013 1 2 1347 2 1509 -1 WN N917WN
## 2 2013 1 2 1352 42 1509 38 EV N14920
## 3 2013 1 2 1352 -3 1635 -34 B6 N712JB
## flight origin dest air_time distance hour minute
## 1 2967 LGA MDW 123 725 13 47
## 2 4497 EWR RIC 61 277 13 52
## 3 83 JFK SEA 323 2422 13 52
head(planes,3)
## tailnum year type manufacturer model engines
## 1 N10156 2004 Fixed wing multi engine EMBRAER EMB-145XR 2
## 2 N102UW 1998 Fixed wing multi engine AIRBUS INDUSTRIE A320-214 2
## 3 N103US 1999 Fixed wing multi engine AIRBUS INDUSTRIE A320-214 2
## seats speed engine
## 1 55 NA Turbo-fan
## 2 182 NA Turbo-fan
## 3 182 NA Turbo-fan
head(weather,3)
## origin year month day hour temp dewp humid wind_dir wind_speed
## 1 EWR 2013 1 1 0 37.04 21.92 53.97 230 10.35702
## 2 EWR 2013 1 1 1 37.04 21.92 53.97 230 13.80936
## 3 EWR 2013 1 1 2 37.94 21.92 52.09 230 12.65858
## wind_gust precip pressure visib
## 1 11.91865 0 1013.9 10
## 2 15.89154 0 1013.0 10
## 3 14.56724 0 1012.6 10
dbDisconnect(con)
## [1] TRUE
mongo_airlines <- mongo(collection = "Airlines")
mongo_airlines$insert(airlines)
## List of 5
## $ nInserted : num 16
## $ nMatched : num 0
## $ nRemoved : num 0
## $ nUpserted : num 0
## $ writeErrors: list()
head(mongo_airlines$find())
## carrier name
## 1 9E Endeavor Air Inc.
## 2 AA American Airlines Inc.
## 3 AS Alaska Airlines Inc.
## 4 B6 JetBlue Airways
## 5 DL Delta Air Lines Inc.
## 6 EV ExpressJet Airlines Inc.
mongo_aiport <- mongo(collection = "Airports")
mongo_aiport$insert(airports)
## List of 5
## $ nInserted : num 1397
## $ nMatched : num 0
## $ nRemoved : num 0
## $ nUpserted : num 0
## $ writeErrors: list()
head(mongo_aiport$find())
## faa name lat lon alt tz dst
## 1 04G Lansdowne Airport 41.13047 -80.61958 1044 -5 A
## 2 06A Moton Field Municipal Airport 32.46057 -85.68003 264 -5 A
## 3 06C Schaumburg Regional 41.98934 -88.10124 801 -6 A
## 4 06N Randall Airport 41.43191 -74.39156 523 -5 A
## 5 09J Jekyll Island Airport 31.07447 -81.42778 11 -4 A
## 6 0A9 Elizabethton Municipal Airport 36.37122 -82.17342 1593 -4 A
mongo_flights <- mongo(collection = "Flight")
mongo_flights$insert(flights)
## List of 5
## $ nInserted : num 336776
## $ nMatched : num 0
## $ nRemoved : num 0
## $ nUpserted : num 0
## $ writeErrors: list()
mongo_wth <- mongo(collection = "Weather")
mongo_wth$insert(weather)
## List of 5
## $ nInserted : num 8719
## $ nMatched : num 0
## $ nRemoved : num 0
## $ nUpserted : num 0
## $ writeErrors: list()
head(mongo_wth$find())
## origin year month day hour temp dewp humid wind_dir wind_speed
## 1 EWR 2013 1 1 0 37.04 21.92 53.97 230 10.35702
## 2 EWR 2013 1 1 1 37.04 21.92 53.97 230 13.80936
## 3 EWR 2013 1 1 2 37.94 21.92 52.09 230 12.65858
## 4 EWR 2013 1 1 3 37.94 23.00 54.51 230 13.80936
## 5 EWR 2013 1 1 4 37.94 24.08 57.04 240 14.96014
## 6 EWR 2013 1 1 6 39.02 26.06 59.37 270 10.35702
## wind_gust precip pressure visib
## 1 11.91865 0 1013.9 10
## 2 15.89154 0 1013.0 10
## 3 14.56724 0 1012.6 10
## 4 15.89154 0 1012.7 10
## 5 17.21583 0 1012.8 10
## 6 11.91865 0 1012.0 10
mongo_plns <- mongo(collection = "Airplanes")
mongo_plns$insert(planes)
## List of 5
## $ nInserted : num 3322
## $ nMatched : num 0
## $ nRemoved : num 0
## $ nUpserted : num 0
## $ writeErrors: list()
head(mongo_plns$find())
## tailnum year type manufacturer model engines
## 1 N10156 2004 Fixed wing multi engine EMBRAER EMB-145XR 2
## 2 N102UW 1998 Fixed wing multi engine AIRBUS INDUSTRIE A320-214 2
## 3 N103US 1999 Fixed wing multi engine AIRBUS INDUSTRIE A320-214 2
## 4 N104UW 1999 Fixed wing multi engine AIRBUS INDUSTRIE A320-214 2
## 5 N10575 2002 Fixed wing multi engine EMBRAER EMB-145LR 2
## 6 N105UW 1999 Fixed wing multi engine AIRBUS INDUSTRIE A320-214 2
## seats engine speed
## 1 55 Turbo-fan NA
## 2 182 Turbo-fan NA
## 3 182 Turbo-fan NA
## 4 182 Turbo-fan NA
## 5 55 Turbo-fan NA
## 6 182 Turbo-fan NA
Let’s see what the data looks like by displaying one record
head(mongo_flights$iterate()$one() )
## $year
## [1] 2013
##
## $month
## [1] 1
##
## $day
## [1] 1
##
## $dep_time
## [1] 517
##
## $dep_delay
## [1] 2
##
## $arr_time
## [1] 830
How many distinct “Carriers” do we have in Flight collection?
length(mongo_flights$distinct("carrier"))
## [1] 16
Choose all flights on January 1
jan1 <- mongo_flights$find('{"month":1, "day":1}')
head(jan1,10)
## year month day dep_time dep_delay arr_time arr_delay carrier tailnum
## 1 2013 1 1 517 2 830 11 UA N14228
## 2 2013 1 1 533 4 850 20 UA N24211
## 3 2013 1 1 542 2 923 33 AA N619AA
## 4 2013 1 1 544 -1 1004 -18 B6 N804JB
## 5 2013 1 1 554 -6 812 -25 DL N668DN
## 6 2013 1 1 554 -4 740 12 UA N39463
## 7 2013 1 1 555 -5 913 19 B6 N516JB
## 8 2013 1 1 557 -3 709 -14 EV N829AS
## 9 2013 1 1 557 -3 838 -8 B6 N593JB
## 10 2013 1 1 558 -2 753 8 AA N3ALAA
## flight origin dest air_time distance hour minute
## 1 1545 EWR IAH 227 1400 5 17
## 2 1714 LGA IAH 227 1416 5 33
## 3 1141 JFK MIA 160 1089 5 42
## 4 725 JFK BQN 183 1576 5 44
## 5 461 LGA ATL 116 762 5 54
## 6 1696 EWR ORD 150 719 5 54
## 7 507 EWR FLL 158 1065 5 55
## 8 5708 LGA IAD 53 229 5 57
## 9 79 JFK MCO 140 944 5 57
## 10 301 LGA ORD 138 733 5 58
How many flights were on January 1 ?
mongo_flights$count('{"month":1, "day":1}')
## [1] 22734
How many flights were on December 31 ?
mongo_flights$count('{"month":12, "day":31}')
## [1] 19400
Sort by distance
Dec31 <- mongo_flights$find('{"month":12,"day":31}', sort='{"distance":-1}')
head(Dec31,10)
## year month day dep_time dep_delay arr_time arr_delay carrier tailnum
## 1 2013 12 31 922 -8 1537 2 HA N395HA
## 2 2013 12 31 922 -8 1537 2 HA N395HA
## 3 2013 12 31 922 -8 1537 2 HA N395HA
## 4 2013 12 31 922 -8 1537 2 HA N395HA
## 5 2013 12 31 922 -8 1537 2 HA N395HA
## 6 2013 12 31 922 -8 1537 2 HA N395HA
## 7 2013 12 31 922 -8 1537 2 HA N395HA
## 8 2013 12 31 922 -8 1537 2 HA N395HA
## 9 2013 12 31 922 -8 1537 2 HA N395HA
## 10 2013 12 31 922 -8 1537 2 HA N395HA
## flight origin dest air_time distance hour minute
## 1 51 JFK HNL 655 4983 9 22
## 2 51 JFK HNL 655 4983 9 22
## 3 51 JFK HNL 655 4983 9 22
## 4 51 JFK HNL 655 4983 9 22
## 5 51 JFK HNL 655 4983 9 22
## 6 51 JFK HNL 655 4983 9 22
## 7 51 JFK HNL 655 4983 9 22
## 8 51 JFK HNL 655 4983 9 22
## 9 51 JFK HNL 655 4983 9 22
## 10 51 JFK HNL 655 4983 9 22
mongo_flights$aggregate('[{"$group":{"_id":"$carrier", "count": {"$sum":1}, "average":{"$avg":"$distance"}}}]')%>%
ggplot(aes(x=reorder(`_id`,count),y=count))+geom_bar(stat="identity")+coord_flip()+xlab("Carriers")+ theme_bw() + ggtitle("Average Distance")
There are many important features that have not yet been implemented. Simple queries require some programming knowledge.
NoSQL databases can be easily installed in cheap commodity hardware clusters as transaction and data volumes increase. This means that you can process and store more data at much less cost.
NoSQL databases require much less hands-on management, with data distribution and auto repair capabilities, simplified data models and fewer tuning and administration requirements. However, in practice, someone will always be needed to take care of performance and availability of databases.