Overview: The assignment will consider a wide dataset describing arrival delays for two flightdata1s across five destinations. The purpose of this assignment is to use the tidyr and dplyr packages to tidy and transform the flights datatset and perform comparative analysis of the two flightdata1s.
library("tidyr")
library("dplyr")
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
#Import our data/read csv file into R
flight <- read.csv(file="C:\\Users\\26291\\Documents\\Data_607\\flight.csv", header =TRUE, stringsAsFactors = FALSE)
flight
##        X     X.1 Los.Angeles Phoenix San.Diego San.Fransico Seattle
## 1 ALASKA on time         497     221       212          503    1841
## 2 ALASKA delayed          62      12        20          102     305
## 3                         NA      NA        NA           NA      NA
## 4 AMWEST on time         694    4840       383          320     201
## 5 AMWEST delayed         117     415        65          129      61
str(flight)
## 'data.frame':    5 obs. of  7 variables:
##  $ X           : chr  "ALASKA" "ALASKA" "" "AMWEST" ...
##  $ X.1         : chr  "on time" "delayed" "" "on time" ...
##  $ Los.Angeles : int  497 62 NA 694 117
##  $ Phoenix     : int  221 12 NA 4840 415
##  $ San.Diego   : int  212 20 NA 383 65
##  $ San.Fransico: int  503 102 NA 320 129
##  $ Seattle     : int  1841 305 NA 201 61
Tidy Data
1. Fill first column with complete row names
library(tidyr)
flight <- flight %>% fill(X)
flight
##        X     X.1 Los.Angeles Phoenix San.Diego San.Fransico Seattle
## 1 ALASKA on time         497     221       212          503    1841
## 2 ALASKA delayed          62      12        20          102     305
## 3                         NA      NA        NA           NA      NA
## 4 AMWEST on time         694    4840       383          320     201
## 5 AMWEST delayed         117     415        65          129      61
2. Remove NAs
flight <- na.omit(flight)
3. Make City attribute data observation
flight <- gather(flight, "City", "n", 3:7)
flight
##         X     X.1         City    n
## 1  ALASKA on time  Los.Angeles  497
## 2  ALASKA delayed  Los.Angeles   62
## 3  AMWEST on time  Los.Angeles  694
## 4  AMWEST delayed  Los.Angeles  117
## 5  ALASKA on time      Phoenix  221
## 6  ALASKA delayed      Phoenix   12
## 7  AMWEST on time      Phoenix 4840
## 8  AMWEST delayed      Phoenix  415
## 9  ALASKA on time    San.Diego  212
## 10 ALASKA delayed    San.Diego   20
## 11 AMWEST on time    San.Diego  383
## 12 AMWEST delayed    San.Diego   65
## 13 ALASKA on time San.Fransico  503
## 14 ALASKA delayed San.Fransico  102
## 15 AMWEST on time San.Fransico  320
## 16 AMWEST delayed San.Fransico  129
## 17 ALASKA on time      Seattle 1841
## 18 ALASKA delayed      Seattle  305
## 19 AMWEST on time      Seattle  201
## 20 AMWEST delayed      Seattle   61
4. Turn “OnTime” and “Delayed” row variables into observations
flight <- spread(flight, "X.1", "n")
flight
##         X         City delayed on time
## 1  ALASKA  Los.Angeles      62     497
## 2  ALASKA      Phoenix      12     221
## 3  ALASKA    San.Diego      20     212
## 4  ALASKA San.Fransico     102     503
## 5  ALASKA      Seattle     305    1841
## 6  AMWEST  Los.Angeles     117     694
## 7  AMWEST      Phoenix     415    4840
## 8  AMWEST    San.Diego      65     383
## 9  AMWEST San.Fransico     129     320
## 10 AMWEST      Seattle      61     201
flight <- dplyr::rename(flight, Carrier = X)
flight <- dplyr::rename(flight, Delayed = delayed)
flight <- dplyr::rename(flight, On_Time = 'on time')
flight
##    Carrier         City Delayed On_Time
## 1   ALASKA  Los.Angeles      62     497
## 2   ALASKA      Phoenix      12     221
## 3   ALASKA    San.Diego      20     212
## 4   ALASKA San.Fransico     102     503
## 5   ALASKA      Seattle     305    1841
## 6   AMWEST  Los.Angeles     117     694
## 7   AMWEST      Phoenix     415    4840
## 8   AMWEST    San.Diego      65     383
## 9   AMWEST San.Fransico     129     320
## 10  AMWEST      Seattle      61     201
DATA ANALYSIS
dplyr::glimpse(flight)
## Observations: 10
## Variables: 4
## $ Carrier <chr> "ALASKA", "ALASKA", "ALASKA", "ALASKA", "ALASKA", "AMW...
## $ City    <chr> "Los.Angeles", "Phoenix", "San.Diego", "San.Fransico",...
## $ Delayed <int> 62, 12, 20, 102, 305, 117, 415, 65, 129, 61
## $ On_Time <int> 497, 221, 212, 503, 1841, 694, 4840, 383, 320, 201
Delay duration by days
select(flight, Carrier, Delayed)
##    Carrier Delayed
## 1   ALASKA      62
## 2   ALASKA      12
## 3   ALASKA      20
## 4   ALASKA     102
## 5   ALASKA     305
## 6   AMWEST     117
## 7   AMWEST     415
## 8   AMWEST      65
## 9   AMWEST     129
## 10  AMWEST      61
Mean and Median of all Delays
dplyr::summarise(flight, Mean = mean(Delayed), Median = median(Delayed))
##    Mean Median
## 1 128.8   83.5
Number of delays in Descending order
dplyr::arrange(flight, desc(Delayed))
##    Carrier         City Delayed On_Time
## 1   AMWEST      Phoenix     415    4840
## 2   ALASKA      Seattle     305    1841
## 3   AMWEST San.Fransico     129     320
## 4   AMWEST  Los.Angeles     117     694
## 5   ALASKA San.Fransico     102     503
## 6   AMWEST    San.Diego      65     383
## 7   ALASKA  Los.Angeles      62     497
## 8   AMWEST      Seattle      61     201
## 9   ALASKA    San.Diego      20     212
## 10  ALASKA      Phoenix      12     221
Number of delays in ascending order by Carrier
dplyr::arrange(flight, Carrier, Delayed)
##    Carrier         City Delayed On_Time
## 1   ALASKA      Phoenix      12     221
## 2   ALASKA    San.Diego      20     212
## 3   ALASKA  Los.Angeles      62     497
## 4   ALASKA San.Fransico     102     503
## 5   ALASKA      Seattle     305    1841
## 6   AMWEST      Seattle      61     201
## 7   AMWEST    San.Diego      65     383
## 8   AMWEST  Los.Angeles     117     694
## 9   AMWEST San.Fransico     129     320
## 10  AMWEST      Phoenix     415    4840
The mean and sum of delays by Carrier:
Carrier <- flight %>% group_by(Carrier) %>% 
  summarise(mean = mean(Delayed), sum = sum(Delayed), n = n())
Carrier
## # A tibble: 2 x 4
##   Carrier  mean   sum     n
##   <chr>   <dbl> <int> <int>
## 1 ALASKA    100   501     5
## 2 AMWEST    157   787     5
The mean and sum of delays by City:
City <- flight %>% group_by(City) %>% 
  summarise(mean = mean(Delayed), sum = sum(Delayed)) %>% 
  arrange(desc(mean))
City
## # A tibble: 5 x 3
##   City          mean   sum
##   <chr>        <dbl> <int>
## 1 Phoenix      214     427
## 2 Seattle      183     366
## 3 San.Fransico 116     231
## 4 Los.Angeles   89.5   179
## 5 San.Diego     42.5    85
VISUALIZATION
Table creation for data visualization:
CityCarrier <- flight %>% group_by(City, Carrier) %>% 
  summarise(mean = mean(Delayed), sum = sum(Delayed))
Carrier Delays By City
library(ggplot2)
LineGraph <- ggplot(CityCarrier, aes(x = City, y = mean))
LineGraph <- LineGraph + geom_line(aes(color=factor(Carrier), group = Carrier))
LineGraph <- LineGraph + scale_color_discrete(name = "Carrier")
LineGraph <- LineGraph + labs(title = "Carrier Delays by City", x = "City", y = "Total Number of Delays")
LineGraph