First we will take the Flights Database from MySQL and transfer it to a .csv file with the help of https://www.r-bloggers.com/dump-mysql-to-csv-using-r/
tables <- dbListTables(db)
tables
## [1] "airlines" "airports" "cities" "flights" "ontime" "planes"
## [7] "weather"
for (i in 1 : length(tables)){
temp<-(dbReadTable(db, tables[i]))
write.table(temp, tables[i], row.names=F, sep = ',')
}
flights <- data.frame(read.csv("~\\Dataset\\flights", stringsAsFactors = FALSE))
head(flights)
## year month day dep_time dep_delay arr_time arr_delay carrier tailnum
## 1 2013 1 1 517 2 830 11 UA N14228
## 2 2013 1 1 533 4 850 20 UA N24211
## 3 2013 1 1 542 2 923 33 AA N619AA
## 4 2013 1 1 544 -1 1004 -18 B6 N804JB
## 5 2013 1 1 554 -6 812 -25 DL N668DN
## 6 2013 1 1 554 -4 740 12 UA N39463
## flight origin dest air_time distance hour minute
## 1 1545 EWR IAH 227 1400 5 17
## 2 1714 LGA IAH 227 1416 5 33
## 3 1141 JFK MIA 160 1089 5 42
## 4 725 JFK BQN 183 1576 5 44
## 5 461 LGA ATL 116 762 6 54
## 6 1696 EWR ORD 150 719 6 54
ontime <- data.frame(read.csv("~\\Dataset\\ontime", stringsAsFactors = FALSE))
head(ontime)
## city_id air_line arrived n
## 1 1 Alaska ontime 497
## 2 1 Alaska delayed 62
## 3 2 Alaska ontime 221
## 4 2 Alaska delayed 12
## 5 3 Alaska ontime 212
## 6 3 Alaska delayed 20
planes <- data.frame(read.csv("~\\Dataset\\planes", stringsAsFactors = FALSE))
head(planes)
## tailnum year type manufacturer model engines
## 1 N10156 2004 Fixed wing multi engine EMBRAER EMB-145XR 2
## 2 N102UW 1998 Fixed wing multi engine AIRBUS INDUSTRIE A320-214 2
## 3 N103US 1999 Fixed wing multi engine AIRBUS INDUSTRIE A320-214 2
## 4 N104UW 1999 Fixed wing multi engine AIRBUS INDUSTRIE A320-214 2
## 5 N10575 2002 Fixed wing multi engine EMBRAER EMB-145LR 2
## 6 N105UW 1999 Fixed wing multi engine AIRBUS INDUSTRIE A320-214 2
## seats speed engine
## 1 55 NA Turbo-fan
## 2 182 NA Turbo-fan
## 3 182 NA Turbo-fan
## 4 182 NA Turbo-fan
## 5 55 NA Turbo-fan
## 6 182 NA Turbo-fan
weather <- data.frame(read.csv("~\\Dataset\\weather", stringsAsFactors = FALSE))
head(weather)
## origin year month day hour temp dewp humid wind_dir wind_speed
## 1 EWR 2013 1 1 0 37.04 21.92 53.97 230 10.35702
## 2 EWR 2013 1 1 1 37.04 21.92 53.97 230 13.80936
## 3 EWR 2013 1 1 2 37.94 21.92 52.09 230 12.65858
## 4 EWR 2013 1 1 3 37.94 23.00 54.51 230 13.80936
## 5 EWR 2013 1 1 4 37.94 24.08 57.04 240 14.96014
## 6 EWR 2013 1 1 6 39.02 26.06 59.37 270 10.35702
## wind_gust precip pressure visib
## 1 11.91865 0 1013.9 10
## 2 15.89154 0 1013.0 10
## 3 14.56724 0 1012.6 10
## 4 15.89154 0 1012.7 10
## 5 17.21583 0 1012.8 10
## 6 11.91865 0 1012.0 10
airlines <- data.frame(read.csv("~\\Dataset\\airlines", stringsAsFactors = FALSE))
head(airlines)
## carrier name
## 1 9E Endeavor Air Inc.\n
## 2 AA American Airlines Inc.\n
## 3 AS Alaska Airlines Inc.\n
## 4 B6 JetBlue Airways\n
## 5 DL Delta Air Lines Inc.\n
## 6 EV ExpressJet Airlines Inc.\n
airports <- data.frame(read.csv("~\\Dataset\\airports", stringsAsFactors = FALSE))
head(airports)
## faa name lat lon alt tz dst
## 1 04G Lansdowne Airport 41.13047 -80.61958 1044 -5 A
## 2 06A Moton Field Municipal Airport 32.46057 -85.68003 264 -5 A
## 3 06C Schaumburg Regional 41.98934 -88.10124 801 -6 A
## 4 06N Randall Airport 41.43191 -74.39156 523 -5 A
## 5 09J Jekyll Island Airport 31.07447 -81.42778 11 -4 A
## 6 0A9 Elizabethton Municipal Airport 36.37122 -82.17342 1593 -4 A
cities <- data.frame(read.csv("~\\Dataset\\cities", stringsAsFactors = FALSE))
head(cities)
## city_id city
## 1 1 Los Angeles
## 2 2 Phoenix
## 3 3 San Diego
## 4 4 San Francisco
## 5 5 Seatle
Now we will use the dataframes to populate the MongoDB.
suppressMessages(suppressWarnings(library(rmongodb)))
mongo = mongo.create(host = "localhost")
mongo.is.connected(mongo)
## [1] TRUE
b1=mongo.bson.from.df(flights)
b2=mongo.bson.from.df(ontime)
b3=mongo.bson.from.df(planes)
b4=mongo.bson.from.df(weather)
b5=mongo.bson.from.df(airlines)
b6=mongo.bson.from.df(airports)
b7=mongo.bson.from.df(cities)
b1[[1]]
## year : 16 2013
## month : 16 1
## day : 16 1
## dep_time : 16 517
## dep_delay : 16 2
## arr_time : 16 830
## arr_delay : 16 11
## carrier : 2 UA
## tailnum : 2 N14228
## flight : 16 1545
## origin : 2 EWR
## dest : 2 IAH
## air_time : 16 227
## distance : 16 1400
## hour : 16 5
## minute : 16 17
b2[[1]]
## city_id : 16 1
## air_line : 2 Alaska
## arrived : 2 ontime
## n : 16 497
b3[[1]]
## tailnum : 2 N10156
## year : 16 2004
## type : 2 Fixed wing multi engine
## manufacturer : 2 EMBRAER
## model : 2 EMB-145XR
## engines : 16 2
## seats : 16 55
## speed : 10 BSON_NULL
## engine : 2 Turbo-fan
b4[[1]]
## origin : 2 EWR
## year : 16 2013
## month : 16 1
## day : 16 1
## hour : 16 0
## temp : 1 37.040000
## dewp : 1 21.920000
## humid : 1 53.970000
## wind_dir : 16 230
## wind_speed : 1 10.357020
## wind_gust : 1 11.918651
## precip : 1 0.000000
## pressure : 1 1013.900000
## visib : 16 10
b5[[1]]
## carrier : 2 9E
## name : 2 Endeavor Air Inc.
b6[[1]]
## faa : 2 04G
## name : 2 Lansdowne Airport
## lat : 1 41.130472
## lon : 1 -80.619583
## alt : 16 1044
## tz : 16 -5
## dst : 2 A
b7[[1]]
## city_id : 16 1
## city : 2 Los Angeles
mongo.insert(mongo,"flights",b1)
## [1] FALSE
mongo.insert(mongo,"ontime",b2)
## [1] FALSE
mongo.insert(mongo,"planes",b3)
## [1] FALSE
mongo.insert(mongo,"weather",b4)
## [1] FALSE
mongo.insert(mongo,"airlines",b5)
## [1] FALSE
mongo.insert(mongo,"airports",b6)
## [1] FALSE
mongo.insert(mongo,"cities",b7)
## [1] FALSE
First I established a database called ‘Flights’ using the Neo4J desktop app. Neo4J assigned this database to an http port of 11001. To access this database in Chrome, I entered http:\\localhost:11001 note this is different than the default port of 7474.
To use the ‘LOAD CSV WITH HEADERS’ cypher command I had to move the csv files into Neo4J desktop import file or the following warning would appear “Couldn’t load the external resource at: file:/C:/Users/Nate/AppData/Roaming/Neo4j%20Desktop/Application/neo4jDatabases/database-c2becb82-5c85-430d-bb42-4ad6749f1f0b/current/import/airlines”
The successfull Cypher queries were as follows:
load csv with headers from “file:///airlines” as airlines create (n:Airlines {carrier: airlines.carrier, name: airlines.name})
MATCH (n:Airlines) RETURN n LIMIT 1
returned: { “name”: “Endeavor Air Inc.”, “carrier”: “9E” }
load csv with headers from “file:///airports” as airports create (n:Airports {faa: airports.faa, name: airports.name, lat: airports.lat, lon: airports.lon, alt: airports.alt, tz: airports.tz, dst: airports.dst})
MATCH (n:Airports) RETURN n LIMIT 1
returned:
{ “dst”: “A”, “tz”: “-5”, “faa”: “04G”, “alt”: “1044”, “name”: “Lansdowne Airport”, “lon”: “-80.6195833”, “lat”: “41.1304722” }
load csv with headers from “file:///planes” as planes create (n:Planes {tailnum: planes.tailnum, year: planes.year, type: planes.type, manufacturer: planes.manufacturer, model: planes.model,engines: planes.engines, seats: planes.seats, speed: planes.speed. engine: planes.engine})
MATCH (n:Planes) RETURN n LIMIT 1
returned:
{ “tailnum”: “N10156”, “year”: “2004”, “engine”: “Turbo-fan”, “engines”: “2”, “model”: “EMB-145XR”, “type”: “Fixed wing multi engine”, “seats”: “55”, “speed”: “NA”, “manufacturer”: “EMBRAER” }
load csv with headers from “file:///weather” as weather create (n:Weather {origin:weather.origin, year:weather.year, month:weather.month, day:weather.day, hour:weather.hour, temp:weather.temp, dewp:weather.dewp, humid:weather.humid, wind_dir:weather.wind_dir, wind_speed:weather.wind_speed, wind_gust:weather.wind_gust, precip:weather.precip, pressure:weather.pressure, visib:weather.visib})
MATCH (n:Weather) RETURN n LIMIT 1
returned:
{ “temp”: “37.04”, “year”: “2013”, “origin”: “EWR”, “visib”: “10”, “humid”: “53.97”, “wind_dir”: “230”, “pressure”: “1013.9”, “wind_gust”: “11.9186514756”, “precip”: “0”, “month”: “1”, “dewp”: “21.92”, “hour”: “0”, “wind_speed”: “10.35702”, “day”: “1” }
load csv with headers from “file:///flights” as flights create (n:Flights {year:flights.year, month:flights.month, day:flights.day, dep_time:flights.dep_time, dep_delay:flights.dep_delay, arr_time:flights.arr_time, arr_delay:flights.arr_delay, carrier:flights.carrier, tailnum:flights.tailnum, flight:flights.flight, origin:flights.origin, dest:flights.dest, air_time:flights.air_time, distance:flights.distance, hour:flights.hour, minute:flights.minute})
Note the retured values once completed: Added 336776 labels, created 336776 nodes, set 5388416 properties, completed after 14772 ms.
This required me to allot 8 Gb of RAM to Neo4J to complete.
MATCH (n:Flights) RETURN n LIMIT 1
{ “flight”: “1545”, “arr_delay”: “11”, “distance”: “1400”, “tailnum”: “N14228”, “year”: “2013”, “origin”: “EWR”, “dep_time”: “517”, “dep_delay”: “2”, “dest”: “IAH”, “minute”: “17”, “carrier”: “UA”, “month”: “1”, “hour”: “5”, “arr_time”: “830”, “air_time”: “227”, “day”: “1” }
These were from the Alsaka Air vs. AM West assignment:
load csv with headers from “file:///cities” as cities create (n:Cities {city_id:cities.city_id, city:cities.city})
MATCH (n:Cities) RETURN n LIMIT 1
{ “city”: “Los Angeles”, “city_id”: “1” }
load csv with headers from “file:///ontime” as ontime create (n:Ontime {city_id:ontime.city_id, air_line:ontime.air_line, arrived:ontime.arrived, n:ontime.n})
MATCH (n:Ontime) RETURN n LIMIT 1
{ “arrived”: “ontime”, “n”: “497”, “air_line”: “Alaska”, “city_id”: “1” }