library(dbConnect)
library(jsonlite)
library(mongolite)
setwd("~/Google Drive/CUNY/git/DATA607/Project4")
After loading the flights database into a local MySQL database, I then imported each of the tables into R data frames. Since I’m building the data around the flights table, I only selected those flights originating from EWR since the weather data was really only available for EWR. I also limited it to the first 100 flights since it othewise takes a significant amount of time to format the data and import it into MongoDB.
myDb = dbConnect(MySQL(), user=username, password=password, dbname=dbname, host=host)
flightsQry = dbSendQuery(myDb, 'SELECT * FROM flights WHERE origin = "EWR" LIMIT 100;')
flights = fetch(flightsQry, n = -1)
airportsQry = dbSendQuery(myDb, 'SELECT * FROM airports;')
airports = fetch(airportsQry, n = -1)
airlinesQry = dbSendQuery(myDb, 'SELECT * FROM airlines;')
airlines = fetch(airlinesQry, n = -1)
planesQry = dbSendQuery(myDb, 'SELECT * FROM planes;')
planes = fetch(planesQry, n = -1)
weatherQry = dbSendQuery(myDb, 'SELECT * FROM weather;')
weather = fetch(weatherQry, n = -1)
Then, in order to nest the airport, airline, plane and weather details, I looped through the flights data frame and nested the aforementioned details as embedded data frames. To verify that this nesting process would produce the JSON format I wanted, I used the prettify and toJSON functions to confirm.
I initially tried and failed to apply a vectorized approach to this procedure. Furthermore, I also tried and failed to use the apply family of functions to accomplish this. Any help would be much appreciated regarding this, as I seem to often struggle to find more efficient approaches to these types of challenges.
for (i in 1:nrow(flights)) {
flights$origin_detail[[i]] = airports[airports$faa == flights$origin[i],]
flights$dest_detail[[i]] = airports[airports$faa == flights$dest[i],]
flights$airline_detail[[i]] = airlines[airlines$carrier == flights$carrier[i],]
flights$plane_detail[[i]] = planes[planes$tailnum == flights$tailnum[i],]
# check if weather row is NA
weather_row = weather[weather$origin == flights$origin[i] &
weather$year == flights$year[i] &
weather$month == flights$month[i] &
weather$day == flights$day[i] &
weather$hour == flights$hour[i]
, ][1,]
if (!is.na(weather_row[1])) {
flights$weather_detail[[i]] = weather[weather$origin == flights$origin[i] &
weather$year == flights$year[i] &
weather$month == flights$month[i] &
weather$day == flights$day[i] &
weather$hour == flights$hour[i]
, ][1,]
}
}
# check JSON validity
prettify(toJSON(flights[1:3,]))
[
{
"year": 2013,
"month": 1,
"day": 1,
"dep_time": 517,
"dep_delay": 2,
"arr_time": 830,
"arr_delay": 11,
"carrier": "UA",
"tailnum": "N14228",
"flight": 1545,
"origin": "EWR",
"dest": "IAH",
"air_time": 227,
"distance": 1400,
"hour": 5,
"minute": 17,
"origin_detail": [
{
"faa": "EWR",
"name": "Newark Liberty Intl",
"lat": 40.6925,
"lon": -74.1687,
"alt": 18,
"tz": -5,
"dst": "A"
}
],
"dest_detail": [
{
"faa": "IAH",
"name": "George Bush Intercontinental",
"lat": 29.9844,
"lon": -95.3414,
"alt": 97,
"tz": -6,
"dst": "A"
}
],
"airline_detail": [
{
"carrier": "UA",
"name": "United Air Lines Inc.\r"
}
],
"plane_detail": [
{
"tailnum": "N14228",
"year": 1999,
"type": "Fixed wing multi engine",
"manufacturer": "BOEING",
"model": "737-824",
"engines": 2,
"seats": 149,
"engine": "Turbo-fan"
}
],
"weather_detail": {
}
},
{
"year": 2013,
"month": 1,
"day": 1,
"dep_time": 554,
"dep_delay": -4,
"arr_time": 740,
"arr_delay": 12,
"carrier": "UA",
"tailnum": "N39463",
"flight": 1696,
"origin": "EWR",
"dest": "ORD",
"air_time": 150,
"distance": 719,
"hour": 6,
"minute": 54,
"origin_detail": [
{
"faa": "EWR",
"name": "Newark Liberty Intl",
"lat": 40.6925,
"lon": -74.1687,
"alt": 18,
"tz": -5,
"dst": "A"
}
],
"dest_detail": [
{
"faa": "ORD",
"name": "Chicago Ohare Intl",
"lat": 41.9786,
"lon": -87.9048,
"alt": 668,
"tz": -6,
"dst": "A"
}
],
"airline_detail": [
{
"carrier": "UA",
"name": "United Air Lines Inc.\r"
}
],
"plane_detail": [
{
"tailnum": "N39463",
"year": 2012,
"type": "Fixed wing multi engine",
"manufacturer": "BOEING",
"model": "737-924ER",
"engines": 2,
"seats": 191,
"engine": "Turbo-fan"
}
],
"weather_detail": [
{
"origin": "EWR",
"year": 2013,
"month": 1,
"day": 1,
"hour": 6,
"temp": 39.02,
"dewp": 26.06,
"humid": 59.37,
"wind_dir": 270,
"wind_speed": 10.357,
"wind_gust": 11.9187,
"precip": 0,
"pressure": 1012,
"visib": 10
}
]
},
{
"year": 2013,
"month": 1,
"day": 1,
"dep_time": 555,
"dep_delay": -5,
"arr_time": 913,
"arr_delay": 19,
"carrier": "B6",
"tailnum": "N516JB",
"flight": 507,
"origin": "EWR",
"dest": "FLL",
"air_time": 158,
"distance": 1065,
"hour": 6,
"minute": 55,
"origin_detail": [
{
"faa": "EWR",
"name": "Newark Liberty Intl",
"lat": 40.6925,
"lon": -74.1687,
"alt": 18,
"tz": -5,
"dst": "A"
}
],
"dest_detail": [
{
"faa": "FLL",
"name": "Fort Lauderdale Hollywood Intl",
"lat": 26.0726,
"lon": -80.1527,
"alt": 9,
"tz": -5,
"dst": "A"
}
],
"airline_detail": [
{
"carrier": "B6",
"name": "JetBlue Airways\r"
}
],
"plane_detail": [
{
"tailnum": "N516JB",
"year": 2000,
"type": "Fixed wing multi engine",
"manufacturer": "AIRBUS INDUSTRIE",
"model": "A320-232",
"engines": 2,
"seats": 200,
"engine": "Turbo-fan"
}
],
"weather_detail": [
{
"origin": "EWR",
"year": 2013,
"month": 1,
"day": 1,
"hour": 6,
"temp": 39.02,
"dewp": 26.06,
"humid": 59.37,
"wind_dir": 270,
"wind_speed": 10.357,
"wind_gust": 11.9187,
"precip": 0,
"pressure": 1012,
"visib": 10
}
]
}
]
Having confirmed that the JSON output was valid, I then researched potential MongoDB packages for R and settled on mongolite as it builds on jsonlite, which I found to be more user-friendly than the alternatives. I also installed MongoDB locally and started a mongod service in order to import data and interact with them. Lastly, I run a few simple queries and then finally drop the collection.
m = mongo(collection = 'flights')
m$insert(flights)
Complete! Processed total of 100 rows.
$nInserted
[1] 100
$nMatched
[1] 0
$nRemoved
[1] 0
$nUpserted
[1] 0
$writeErrors
list()
# check record counts
m$count()
[1] 100
# perform query and retrieve data
m$find( '{ "carrier": "UA" }' )
Found 51 records...
Imported 51 records. Simplifying into dataframe...
m$find( ' { "plane_detail.manufacturer" : "BOEING" } ' )
Found 44 records...
Imported 44 records. Simplifying into dataframe...
# list distinct values
m$distinct("plane_detail.manufacturer")
[1] "BOEING" "AIRBUS INDUSTRIE" "AIRBUS" "EMBRAER"
[5] "HURLEY JAMES LARRY" "BOMBARDIER INC"
# remove the collection
m$drop()
[1] TRUE