Below is a raw file of the SQL file.
https://raw.githubusercontent.com/jcp9010/MSDA/master/flight.sql
Load library RMySQL
library(RMySQL)
## Loading required package: DBI
Import the MySQL dataset.
con <- dbConnect(RMySQL::MySQL(), dbname = 'tidy', user = 'root', password = 'MSDA')
# The above code may need readjustment depending on your user name and password.
# List tables that exist in the database 'tidy'
dbListTables(con)
## [1] "flight"
# Assign the table 'flight' into d1
rs <- dbSendQuery(con, "SELECT * FROM flight;")
d1 <- dbFetch(rs, n = -1)
d1
## Airline Time_Status Los_Angeles Phoenix San_Diego San_Francisco Seattle
## 1 ALASKA on time 497 221 212 503 1841
## 2 ALASKA delayed 62 12 20 102 305
## 3 AM WEST on time 694 4840 383 320 201
## 4 AM WEST delayed 117 415 65 129 61
dbDisconnect(con)
## Warning: Closing open result sets
## [1] TRUE
Tidy and transform the data into ‘tidy’ data.
Load libraries.
library(tidyr)
## Warning: package 'tidyr' was built under R version 3.3.2
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
Currently the dataset above in untidy and needs to be formatted into a ‘tidy’ format.
flight <- gather(d1, City, Flight_Count, Los_Angeles:Seattle)
flight
## Airline Time_Status City Flight_Count
## 1 ALASKA on time Los_Angeles 497
## 2 ALASKA delayed Los_Angeles 62
## 3 AM WEST on time Los_Angeles 694
## 4 AM WEST delayed Los_Angeles 117
## 5 ALASKA on time Phoenix 221
## 6 ALASKA delayed Phoenix 12
## 7 AM WEST on time Phoenix 4840
## 8 AM WEST delayed Phoenix 415
## 9 ALASKA on time San_Diego 212
## 10 ALASKA delayed San_Diego 20
## 11 AM WEST on time San_Diego 383
## 12 AM WEST delayed San_Diego 65
## 13 ALASKA on time San_Francisco 503
## 14 ALASKA delayed San_Francisco 102
## 15 AM WEST on time San_Francisco 320
## 16 AM WEST delayed San_Francisco 129
## 17 ALASKA on time Seattle 1841
## 18 ALASKA delayed Seattle 305
## 19 AM WEST on time Seattle 201
## 20 AM WEST delayed Seattle 61
# Will use the pipe operators so the code is easier to read.
# Will compare the two airlines with the 'on-time' flight status.
# Will calculate the total number of flights for each airlines.
total <- flight %>% group_by(Airline) %>% summarise(Total_Flights = sum(Flight_Count))
total
## # A tibble: 2 × 2
## Airline Total_Flights
## <chr> <int>
## 1 ALASKA 3775
## 2 AM WEST 7225
# Total of flights from each airline that were on time
on.time <- flight %>% group_by(Airline) %>% filter(Time_Status == 'on time') %>% summarise(Flights_On_Time = sum(Flight_Count))
on.time
## # A tibble: 2 × 2
## Airline Flights_On_Time
## <chr> <int>
## 1 ALASKA 3274
## 2 AM WEST 6438
# Total of flights from each airline that were delayed.
delayed <- flight %>% group_by(Airline) %>% filter(Time_Status == 'delayed') %>% summarise(Flights_Delayed = sum(Flight_Count))
delayed
## # A tibble: 2 × 2
## Airline Flights_Delayed
## <chr> <int>
## 1 ALASKA 501
## 2 AM WEST 787
# Now will combine all the data set information (including new columns) into data.frame flights.summary
flights.summary <- cbind(on.time, Flights_Delayed = delayed$Flights_Delayed, Total_Flights = total$Total_Flights)
flights.summary <- flights.summary %>% mutate(Percent_On_Time = Flights_On_Time/Total_Flights, Percent_Delayed = Flights_Delayed/Total_Flights)
flights.summary
## Airline Flights_On_Time Flights_Delayed Total_Flights Percent_On_Time
## 1 ALASKA 3274 501 3775 0.8672848
## 2 AM WEST 6438 787 7225 0.8910727
## Percent_Delayed
## 1 0.1327152
## 2 0.1089273
Overall, it appears that AM_West seems to be doing a better job of staying on time. And not to mention, AM West flew more flights than Alaska.
The dataset is now pulled from MySQL and now have been created into a tidy data.frame in R. Now onto more data analysis.
Will also create two more data.frames where one is Alaska Airlines, and the other is AM_West Airlines.
Alaska <- flight %>% filter(Airline == 'ALASKA')
AM_West <- flight %>% filter(Airline == 'AM WEST')
Alaskan Airlines:
# Which city had by count, the most delays?
Alaska %>% filter(Time_Status == 'delayed') %>% select(City, Flight_Count) %>% arrange(desc(Flight_Count)) %>% slice(1)
## City Flight_Count
## 1 Seattle 305
# Which city had by percentage, the most delays?
Alaska.City.Totals<- Alaska %>% group_by(City) %>% summarise(Total = sum(Flight_Count))
Alaska.City.Delays <- Alaska %>% filter(Time_Status == 'delayed') %>% mutate(Total = Alaska.City.Totals$Total, Percent = round(Flight_Count/Total,2))
Alaska.City.Delays %>% select(City, Percent) %>% arrange(desc(Percent)) %>% slice(1)
## City Percent
## 1 San_Francisco 0.17
# Which city had by count, the most on time arrivals?
Alaska %>% filter(Time_Status == 'on time') %>% select(City, Flight_Count) %>% arrange(desc(Flight_Count)) %>% slice(1)
## City Flight_Count
## 1 Seattle 1841
# Again, which city had by percentage, the most on time arrivals?
Alaska.City.Arrivals <- Alaska %>% filter(Time_Status == 'on time') %>% mutate(Total = Alaska.City.Totals$Total, Percent = round(Flight_Count/Total,2))
Alaska.City.Arrivals %>% select(City, Percent) %>% arrange(desc(Percent)) %>% slice(1)
## City Percent
## 1 Phoenix 0.95
The cities differ in percentage and absolute count. If you look at the original data, the absolute count for the total amount of trips to Seattle is significantly larger than the other cities.
The same functions can be performed for the AM_West data as well. But given that it is simply a copy and paste, I like to move onto other graphical analysis for AM West Airlines.
Load ggplot2 (Though I have used a lot of ggplot2 in prior my homework assignments, it never hurts to continue to practice this library. In this example, I will use the ggplot geom_plot.)
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 3.3.2
AM West Airlines:
AM_West.Delays <- AM_West %>% filter(Time_Status == 'delayed') %>% select(City, Flight_Count)
ggplot(AM_West.Delays, aes(x = City, y = Flight_Count)) + geom_point(alpha = 0.5, size = 5, color = 'blue') + labs(title =" AM West Flight Delays", x = "City", y = "Flight Count")
Before I finish this assignment, I like to demonstrate the spread() function, which is important in the dplyr library.
AM_West.Delays <- AM_West.Delays %>% spread(City, Flight_Count)
AM_West.Delays
## Los_Angeles Phoenix San_Diego San_Francisco Seattle
## 1 117 415 65 129 61