library(tidyr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
Data was entered in a csv file called flightsInfo.csv using the wide format given
CSV read into R
dfWide<-read.csv(file="flightsInfo.csv")
dfWide
## Airline Delayed Los.Angeles Phoenix San.Diego San.Francisco Seattle
## 1 Alaska on time 497 221 212 503 1841
## 2 Alaska delayed 62 12 20 102 305
## 3 AM West on time 694 4840 383 320 201
## 4 AM West delayed 117 415 65 129 61
tbWide<-tbl_df(dfWide)
tbWide
## # A tibble: 4 x 7
## Airline Delayed Los.Angeles Phoenix San.Diego San.Francisco Seattle
## <fct> <fct> <int> <int> <int> <int> <int>
## 1 Alaska on time 497 221 212 503 1841
## 2 Alaska delayed 62 12 20 102 305
## 3 AM West on time 694 4840 383 320 201
## 4 AM West delayed 117 415 65 129 61
We reshape the data by going from a wide table to a tidy table with variables as columns and observations as rows
tbTidy<-gather(dfWide,"Destinations","Flights",3:7)
tbTidy
## Airline Delayed Destinations Flights
## 1 Alaska on time Los.Angeles 497
## 2 Alaska delayed Los.Angeles 62
## 3 AM West on time Los.Angeles 694
## 4 AM West delayed Los.Angeles 117
## 5 Alaska on time Phoenix 221
## 6 Alaska delayed Phoenix 12
## 7 AM West on time Phoenix 4840
## 8 AM West delayed Phoenix 415
## 9 Alaska on time San.Diego 212
## 10 Alaska delayed San.Diego 20
## 11 AM West on time San.Diego 383
## 12 AM West delayed San.Diego 65
## 13 Alaska on time San.Francisco 503
## 14 Alaska delayed San.Francisco 102
## 15 AM West on time San.Francisco 320
## 16 AM West delayed San.Francisco 129
## 17 Alaska on time Seattle 1841
## 18 Alaska delayed Seattle 305
## 19 AM West on time Seattle 201
## 20 AM West delayed Seattle 61
We then group the data by airline and delayed fligths
tbTidyGroupAir<-group_by(tbTidy,Airline,Delayed)
tbTidyGroupAir
## # A tibble: 20 x 4
## # Groups: Airline, Delayed [4]
## Airline Delayed Destinations Flights
## <fct> <fct> <chr> <int>
## 1 Alaska on time Los.Angeles 497
## 2 Alaska delayed Los.Angeles 62
## 3 AM West on time Los.Angeles 694
## 4 AM West delayed Los.Angeles 117
## 5 Alaska on time Phoenix 221
## 6 Alaska delayed Phoenix 12
## 7 AM West on time Phoenix 4840
## 8 AM West delayed Phoenix 415
## 9 Alaska on time San.Diego 212
## 10 Alaska delayed San.Diego 20
## 11 AM West on time San.Diego 383
## 12 AM West delayed San.Diego 65
## 13 Alaska on time San.Francisco 503
## 14 Alaska delayed San.Francisco 102
## 15 AM West on time San.Francisco 320
## 16 AM West delayed San.Francisco 129
## 17 Alaska on time Seattle 1841
## 18 Alaska delayed Seattle 305
## 19 AM West on time Seattle 201
## 20 AM West delayed Seattle 61
Using the tidy table with groups we can compare both airlines arrival times performing a summary. From here we can see how AM West has more delayed flights than Alaska, but also has more on time. Overall AM West has more flights than Alaska, so a better way to do this comparison is using ratios.
summarise(tbTidyGroupAir,totalFligths=sum(Flights))
## # A tibble: 4 x 3
## # Groups: Airline [?]
## Airline Delayed totalFligths
## <fct> <fct> <int>
## 1 Alaska delayed 501
## 2 Alaska on time 3274
## 3 AM West delayed 787
## 4 AM West on time 6438
Before continuing, we can point out how all these operations can be done using pipies, which will be the method used going forward
dfWide %>% tbl_df() %>% gather("Destinations","Flights",3:7) %>% group_by(Airline) %>% summarise(totalFligths=sum(Flights))
## # A tibble: 2 x 2
## Airline totalFligths
## <fct> <int>
## 1 Alaska 3775
## 2 AM West 7225
dfWide %>% tbl_df() %>% gather("Destinations","Flights",3:7) %>% group_by(Airline,Delayed) %>% summarise(totalFligths=sum(Flights))
## # A tibble: 4 x 3
## # Groups: Airline [?]
## Airline Delayed totalFligths
## <fct> <fct> <int>
## 1 Alaska delayed 501
## 2 Alaska on time 3274
## 3 AM West delayed 787
## 4 AM West on time 6438
To calculate ratios we can mutate our final table from before to add a ratio column
dfWide %>% tbl_df() %>% gather("Destinations","Flights",3:7) %>% group_by(Airline,Delayed) %>% summarise(totalFligths=sum(Flights)) %>% mutate(Ratio=totalFligths/sum(totalFligths))
## # A tibble: 4 x 4
## # Groups: Airline [2]
## Airline Delayed totalFligths Ratio
## <fct> <fct> <int> <dbl>
## 1 Alaska delayed 501 0.133
## 2 Alaska on time 3274 0.867
## 3 AM West delayed 787 0.109
## 4 AM West on time 6438 0.891
From the ratio shown above we discover that Alaska has slightly more delayed flights, with 13.3% of its flights being delayed against 10.9% of AM West flights being delayed.
Instead of loading the data into R from a csv file, we could have also created a table in MySQL and imported it into R.
library("RMySQL")
## Loading required package: DBI
mydb<-dbConnect(MySQL(),user='root',password='123',dbname='flightDelays',host='localhost')
dbListTables(mydb)
## [1] "tablewide"
dbListFields(mydb,'tableWide')
## [1] "Airline" "Delays" "losAngeles" "phoenix"
## [5] "sanDiego" "sanFrancisco" "seattle"
tableWideDataQuery<-dbSendQuery(mydb,'select * from tableWide')
dfTableDbWide<-fetch(tableWideDataQuery,n=-1)
dfTableDbWide
## Airline Delays losAngeles phoenix sanDiego sanFrancisco seattle
## 1 Alaska on time 497 221 212 503 1841
## 2 Alaska delayed 62 12 20 102 305
## 3 AM West on time 694 4840 383 320 201
## 4 AM West delayed 117 415 65 129 61
tbWide<-tbl_df(dfTableDbWide)
tbWide
## # A tibble: 4 x 7
## Airline Delays losAngeles phoenix sanDiego sanFrancisco seattle
## <chr> <chr> <int> <int> <int> <int> <int>
## 1 Alaska on time 497 221 212 503 1841
## 2 Alaska delayed 62 12 20 102 305
## 3 AM West on time 694 4840 383 320 201
## 4 AM West delayed 117 415 65 129 61
The resulting tbWide table is the same table we obtained from the CSV import. The same analysis can be performed using this table as the starting point.