library(RMySQL)
## Loading required package: DBI
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 3.4.3
## -- Attaching packages ---------------------------------- tidyverse 1.2.1 --
## v ggplot2 2.2.1 v purrr 0.2.4
## v tibble 1.4.2 v dplyr 0.7.4
## v tidyr 0.7.2 v stringr 1.2.0
## v readr 1.1.1 v forcats 0.2.0
## Warning: package 'tibble' was built under R version 3.4.3
## Warning: package 'stringr' was built under R version 3.4.3
## -- Conflicts ------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(tidyr)
library(dplyr)
library(tidyselect)
## Warning: package 'tidyselect' was built under R version 3.4.3
##
## Attaching package: 'tidyselect'
## The following objects are masked from 'package:dplyr':
##
## contains, ends_with, everything, matches, num_range, one_of,
## starts_with
library(ggplot2)
library(stringr)
library(zoo, warn.conflicts = FALSE)
Establishing connection between R and Database
loadsql <- dbDriver("MySQL")
openlink = dbConnect(MySQL(), user='root', password ='celeborn', dbname='arrivals', host='localhost')
Query to get Data
AirportDataset<-"SELECT * FROM arrivals"
arrivals <-dbGetQuery(openlink, AirportDataset)
dbDisconnect(openlink)
## [1] TRUE
Loading Queried Data into data frame
FlightData <- data.frame(arrivals)
FlightData
## Airline FlightStatus LosAngeles Phoenix SanDiego SanFrancisco Seattle
## 1 Alaska on time 497 221 212 503 1841
## 2 delayed 62 12 20 102 305
## 3 AM West on time 694 4840 383 320 201
## 4 delayed 117 415 65 129 61
Initial data is missing labels for 2 rows, will fix with “zoo” Library
FlightData$Airline[FlightData$Airline == ""] <- NA
FlightData$Airline <- na.locf(FlightData$Airline, option="locf")
FlightData <- FlightData
FlightData
## Airline FlightStatus LosAngeles Phoenix SanDiego SanFrancisco Seattle
## 1 Alaska on time 497 221 212 503 1841
## 2 Alaska delayed 62 12 20 102 305
## 3 AM West on time 694 4840 383 320 201
## 4 AM West delayed 117 415 65 129 61
Finally Tidying the data (Stacking)
NewFlightData<-gather(FlightData, "City", "FlightCount", 3:7)
tidied_up <- arrange(NewFlightData, FlightStatus)
tidied_up
## Airline FlightStatus City FlightCount
## 1 Alaska delayed LosAngeles 62
## 2 AM West delayed LosAngeles 117
## 3 Alaska delayed Phoenix 12
## 4 AM West delayed Phoenix 415
## 5 Alaska delayed SanDiego 20
## 6 AM West delayed SanDiego 65
## 7 Alaska delayed SanFrancisco 102
## 8 AM West delayed SanFrancisco 129
## 9 Alaska delayed Seattle 305
## 10 AM West delayed Seattle 61
## 11 Alaska on time LosAngeles 497
## 12 AM West on time LosAngeles 694
## 13 Alaska on time Phoenix 221
## 14 AM West on time Phoenix 4840
## 15 Alaska on time SanDiego 212
## 16 AM West on time SanDiego 383
## 17 Alaska on time SanFrancisco 503
## 18 AM West on time SanFrancisco 320
## 19 Alaska on time Seattle 1841
## 20 AM West on time Seattle 201
Summary Statistics to give us a general overview of what the numbers can vaguely describe to us
tidied_up %>% group_by(Airline) %>% filter(FlightStatus == "delayed") %>% summarise(mean = mean(FlightCount), min = min(FlightCount), max = max(FlightCount), median = median(FlightCount), stdev = sd(FlightCount), total = sum(FlightCount))
## # A tibble: 2 x 7
## Airline mean min max median stdev total
## <chr> <dbl> <dbl> <dbl> <int> <dbl> <int>
## 1 Alaska 100. 12. 305. 62 120. 501
## 2 AM West 157. 61. 415. 117 147. 787
We seek to find the last bit needed to reach a conclusion of any kind regarding this data. We create a field that describes the ratio of flights delayed/on time to the amount of flights scheduled.
NewTidied <- tidied_up %>% group_by(Airline, City) %>% arrange(Airline) %>% mutate(CityCounts = sum(FlightCount), NewRatio = FlightCount/CityCounts)
NewTidied
## # A tibble: 20 x 6
## # Groups: Airline, City [10]
## Airline FlightStatus City FlightCount CityCounts NewRatio
## <chr> <chr> <chr> <int> <int> <dbl>
## 1 Alaska delayed LosAngeles 62 559 0.111
## 2 Alaska delayed Phoenix 12 233 0.0515
## 3 Alaska delayed SanDiego 20 232 0.0862
## 4 Alaska delayed SanFrancisco 102 605 0.169
## 5 Alaska delayed Seattle 305 2146 0.142
## 6 Alaska on time LosAngeles 497 559 0.889
## 7 Alaska on time Phoenix 221 233 0.948
## 8 Alaska on time SanDiego 212 232 0.914
## 9 Alaska on time SanFrancisco 503 605 0.831
## 10 Alaska on time Seattle 1841 2146 0.858
## 11 AM West delayed LosAngeles 117 811 0.144
## 12 AM West delayed Phoenix 415 5255 0.0790
## 13 AM West delayed SanDiego 65 448 0.145
## 14 AM West delayed SanFrancisco 129 449 0.287
## 15 AM West delayed Seattle 61 262 0.233
## 16 AM West on time LosAngeles 694 811 0.856
## 17 AM West on time Phoenix 4840 5255 0.921
## 18 AM West on time SanDiego 383 448 0.855
## 19 AM West on time SanFrancisco 320 449 0.713
## 20 AM West on time Seattle 201 262 0.767
Finally we find The rations for the airlines in general (we only look at the Delayed flights) and we can see by a decent margin that AM West has more delayed flights than Alaska.
NewTidied %>% group_by(Airline) %>% filter(FlightStatus == "delayed") %>% summarise(mean = mean(NewRatio), min = min(NewRatio), max = max(NewRatio), median = median(NewRatio), standard_deviation = sd(NewRatio))
## # A tibble: 2 x 6
## Airline mean min max median standard_deviation
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Alaska 0.112 0.0515 0.169 0.111 0.0459
## 2 AM West 0.178 0.0790 0.287 0.145 0.0821