Introduction

In this project information from flights database will be migrated from MySQL(relational database) to MongoDB, which is NoSQL .

Libraries

Loading the required libraries to perform the task using R studio.

library(knitr)
library(RMySQL)
library(RSQLite)
library(rmongodb)
library(mongolite)

Load Data

Load the flights dataset from MySQL to R environment.

Set password as mypassword:

Get data from MySQL.

mydrv <- dbDriver("MySQL")
mydb <- dbConnect(mydrv, user ='root', password = mypassword, dbname='flights')

dbListTables(mydb)
##  [1] "airlines"         "airports"         "birthdays"       
##  [4] "buildingengtype"  "buildings"        "energycategories"
##  [7] "energytype"       "flights"          "foodranking"     
## [10] "foodtypes"        "pbirthdays"       "pfoodranking"    
## [13] "pfoodtypes"       "planes"           "weather"
dbListFields(mydb, 'flights')
##  [1] "year"      "month"     "day"       "dep_time"  "dep_delay"
##  [6] "arr_time"  "arr_delay" "carrier"   "tailnum"   "flight"   
## [11] "origin"    "dest"      "air_time"  "distance"  "hour"     
## [16] "minute"
mydata <- dbGetQuery(mydb,"SELECT month, day, 
                     dep_time, arr_time, air_time,  
                     dep_delay, arr_delay, distance,
                     carrier FROM flights")

kable(head(mydata))
month day dep_time arr_time air_time dep_delay arr_delay distance carrier
1 1 517 830 227 2 11 1400 UA
1 1 533 850 227 4 20 1416 UA
1 1 542 923 160 2 33 1089 AA
1 1 544 1004 183 -1 -18 1576 B6
1 1 554 812 116 -6 -25 762 DL
1 1 554 740 150 -4 12 719 UA
kable(tail(mydata))
month day dep_time arr_time air_time dep_delay arr_delay distance carrier
336771 9 30 NA NA NA NA NA 764 EV
336772 9 30 NA NA NA NA NA 213 9E
336773 9 30 NA NA NA NA NA 198 9E
336774 9 30 NA NA NA NA NA 764 MQ
336775 9 30 NA NA NA NA NA 419 MQ
336776 9 30 NA NA NA NA NA 431 MQ

Observe the data structure of the imported object.

str(mydata)
## 'data.frame':    336776 obs. of  9 variables:
##  $ month    : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ day      : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ dep_time : int  517 533 542 544 554 554 555 557 557 558 ...
##  $ arr_time : int  830 850 923 1004 812 740 913 709 838 753 ...
##  $ air_time : int  227 227 160 183 116 150 158 53 140 138 ...
##  $ dep_delay: int  2 4 2 -1 -6 -4 -5 -3 -3 -2 ...
##  $ arr_delay: int  11 20 33 -18 -25 12 19 -14 -8 8 ...
##  $ distance : int  1400 1416 1089 1576 762 719 1065 229 944 733 ...
##  $ carrier  : chr  "UA" "UA" "AA" "B6" ...

Substitute NA with 0, since all Na are integers data type.

mydata[is.na(mydata)]<-0
kable(tail(mydata))
month day dep_time arr_time air_time dep_delay arr_delay distance carrier
336771 9 30 0 0 0 0 0 764 EV
336772 9 30 0 0 0 0 0 213 9E
336773 9 30 0 0 0 0 0 198 9E
336774 9 30 0 0 0 0 0 764 MQ
336775 9 30 0 0 0 0 0 419 MQ
336776 9 30 0 0 0 0 0 431 MQ

Disconnect the MySQL database.

dbDisconnect(mydb)
## [1] TRUE

Export Data

Export the R data frame to mongodb, the NoSQL database.

Establish connection with mongodb server:

mongodata = mongo.create(host = "localhost")
mongo.is.connected(mongodata)
## [1] TRUE

Create the database and then upload it into mongo.

# create the database in mongodb
mongodata <- mongo(collection="documents",db="flights")

# put column header into a list
feilds<-colnames(mydata)

#establish connection with mongodb
mongodata = mongo.create(host = "localhost")
mongo.is.connected(mongodata)
## [1] TRUE
# remove all record in database if it exist
for(i in 1:length(feilds)){
  st<- paste("flights",feilds[i], sep=".")
  print(mongo.remove(mongodata, ns=st))
}
## [1] TRUE
## [1] TRUE
## [1] TRUE
## [1] TRUE
## [1] TRUE
## [1] TRUE
## [1] TRUE
## [1] TRUE
## [1] TRUE
# Check if records are removed 
for(i in 1:length(feilds)){
  DBNS<-paste("flights", feilds[i], sep=".")
  print(mongo.count(mongodata,ns=DBNS))
}
## [1] 0
## [1] 0
## [1] 0
## [1] 0
## [1] 0
## [1] 0
## [1] 0
## [1] 0
## [1] 0
# insert the records to mongodb flights database
for(i in 1:length(mydata)){
  tmp<-as.list(mydata[i])
  bson=mongo.bson.from.list(tmp)
  st<- paste("flights", feilds[i], sep=".")
  mongo.insert(mongodata,st, bson)
}

Check if the dataset is loaded to mongodb server by reloading the dataset to R.

# if databse exists
mongo.get.databases(mongodata)
## [1] "flights"
# create dataframe and a list
df<-data.frame(stringsAsFactors = FALSE)
lst<-list()

# get the data into a list
for(i in 1:9){
  st<- paste("flights", feilds[i], sep=".")
  tmplst<-mongo.find.all(mongodata, ns= st, mongo.oid2character = T)
  tmplst[[1]][1]<-NULL
  lst<-c(lst,tmplst)
}
#convert list to datframe
df<-as.data.frame(lst,stringsAsFactors = FALSE)

str(df)
## 'data.frame':    336776 obs. of  9 variables:
##  $ month    : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ day      : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ dep_time : num  517 533 542 544 554 554 555 557 557 558 ...
##  $ arr_time : num  830 850 923 1004 812 ...
##  $ air_time : num  227 227 160 183 116 150 158 53 140 138 ...
##  $ dep_delay: num  2 4 2 -1 -6 -4 -5 -3 -3 -2 ...
##  $ arr_delay: num  11 20 33 -18 -25 12 19 -14 -8 8 ...
##  $ distance : int  1400 1416 1089 1576 762 719 1065 229 944 733 ...
##  $ carrier  : chr  "UA" "UA" "AA" "B6" ...
# display the data 
kable(head(df))
month day dep_time arr_time air_time dep_delay arr_delay distance carrier
1 1 517 830 227 2 11 1400 UA
1 1 533 850 227 4 20 1416 UA
1 1 542 923 160 2 33 1089 AA
1 1 544 1004 183 -1 -18 1576 B6
1 1 554 812 116 -6 -25 762 DL
1 1 554 740 150 -4 12 719 UA

Advantages of NoSQL over SQL

NoSQL can handle large volumes data, unstructured dataset, allow multiple data structures, and object oriented programming. in addition to these create benefits it is also efficient and scalable. When working in an evolving project, NoSQL will provide a more particle option; otherwise, SQL is more practical for structural and logical related data with a standard and proven technology.

References

[rmongodb][https://gist.github.com/Btib7751989ert3/]

[rmongo][https://www.r-bloggers.com/rmongo-accessing-mongodb-in-r/]