options(width = 100)
# This is a standard setup I include so that my working directory is set correctly whether I work on
# one of my windows or linux machines.
if (Sys.info()["sysname"] == "Windows") {
setwd("~/Masters/DATA607/Week5/Assignment")
} else {
setwd("~/Documents/Masters/DATA607/Week5/Assignment")
}
library(RMySQL)
## Loading required package: DBI
mysqldriver <- dbDriver("MySQL")
db = dbConnect(mysqldriver, user = "data607", password = "password", dbname = "FlightsWk5", host = "127.0.0.1")
db_tables <- dbGetQuery(db, "SHOW TABLES")
if ("flights" %in% db_tables[, 1]) {
flights_db <- dbGetQuery(db, "DROP TABLE flights")
}
db_df <- data.frame(c("ALASKA", "ALASKA", "", "AM WEST", "AM WEST"), c("on time", "delayed", "", "on time",
"delayed"), c(497, 62, "", 694, 117), c(221, 12, "", 4840, 415), c(212, 20, "", 383, 65), c(503,
102, "", 320, 129), c(1841, 305, "", 201, 61))
names(db_df) <- c("company", "status", "los_angeles", "phoenix", "san_diego", "san_francisco", "seattle")
dbWriteTable(db, name = "flights", value = db_df)
## [1] TRUE
(flights_db <- dbGetQuery(db, "SELECT * FROM flights"))
## row_names company status los_angeles phoenix san_diego san_francisco seattle
## 1 1 ALASKA on time 497 221 212 503 1841
## 2 2 ALASKA delayed 62 12 20 102 305
## 3 3
## 4 4 AM WEST on time 694 4840 383 320 201
## 5 5 AM WEST delayed 117 415 65 129 61
In one step, I take the dataframe, that was loaded from the database, and do the following:
library(tidyr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
flights_clean <- (flights_db %>% filter(company != "") %>% gather("city", "count", 4:8) %>% mutate(count = as.numeric(count)))
(flights_clean)
## row_names company status city count
## 1 1 ALASKA on time los_angeles 497
## 2 2 ALASKA delayed los_angeles 62
## 3 4 AM WEST on time los_angeles 694
## 4 5 AM WEST delayed los_angeles 117
## 5 1 ALASKA on time phoenix 221
## 6 2 ALASKA delayed phoenix 12
## 7 4 AM WEST on time phoenix 4840
## 8 5 AM WEST delayed phoenix 415
## 9 1 ALASKA on time san_diego 212
## 10 2 ALASKA delayed san_diego 20
## 11 4 AM WEST on time san_diego 383
## 12 5 AM WEST delayed san_diego 65
## 13 1 ALASKA on time san_francisco 503
## 14 2 ALASKA delayed san_francisco 102
## 15 4 AM WEST on time san_francisco 320
## 16 5 AM WEST delayed san_francisco 129
## 17 1 ALASKA on time seattle 1841
## 18 2 ALASKA delayed seattle 305
## 19 4 AM WEST on time seattle 201
## 20 5 AM WEST delayed seattle 61
Here I make use of group and summarise to show aggregated data. The result shows the number of flights counted for each company and status type. Note, here I used summarise twice to show you can move up the chain of groups that are set.
flights_new <- flights_clean %>% group_by(company, status, city) %>% summarise(flight_count = sum(count),
flight_mean = mean(count), flight_max = max(count), flight_min = min(count)) %>% summarize(flight_count = sum(flight_count))
(flights_new)
## # A tibble: 4 x 3
## # Groups: company [?]
## company status flight_count
## <chr> <chr> <dbl>
## 1 ALASKA delayed 501
## 2 ALASKA on time 3274
## 3 AM WEST delayed 787
## 4 AM WEST on time 6438
In this step, I use spread to get the status types of flights (on time, delayed) to be separate columns. This is so that I can then calculate an on-time ratio using mutate.
flights_new2 <- flights_new %>% spread(status, flight_count) %>% mutate(ratio_on_time = round(`on time`/(`on time` +
delayed), 2))
(flights_new2)
## # A tibble: 2 x 4
## # Groups: company [2]
## company delayed `on time` ratio_on_time
## <chr> <dbl> <dbl> <dbl>
## 1 ALASKA 501 3274 0.87
## 2 AM WEST 787 6438 0.89
Here I perform a similar analysis as before, but start from the cleaned data and arrive to my final table in one line.
flights_new3 <- flights_clean %>% group_by(city, status) %>% summarise(flight_count = sum(count)) %>%
spread(status, flight_count) %>% mutate(ratio_by_city = round(`on time`/(`on time` + delayed), 2))
(flights_new3)
## # A tibble: 5 x 4
## # Groups: city [5]
## city delayed `on time` ratio_by_city
## <chr> <dbl> <dbl> <dbl>
## 1 los_angeles 179 1191 0.87
## 2 phoenix 427 5061 0.92
## 3 san_diego 85 595 0.88
## 4 san_francisco 231 823 0.78
## 5 seattle 366 2042 0.85