Reading
airline_data <- read_csv("Tidying and Transforming Data.csv")
## Warning: Missing column names filled in: 'X8' [8], 'X9' [9], 'X10' [10],
## 'X11' [11], 'X12' [12], 'X13' [13]
## Parsed with column specification:
## cols(
## AIRLINE = col_character(),
## STATUS = col_character(),
## `LOS ANGELES` = col_double(),
## PHOENIX = col_double(),
## `SAN DIEGO` = col_double(),
## `SAN FRANCISCO` = col_double(),
## SEATTLE = col_number(),
## X8 = col_logical(),
## X9 = col_logical(),
## X10 = col_logical(),
## X11 = col_logical(),
## X12 = col_logical(),
## X13 = col_logical()
## )
str(airline_data)
## Classes 'spec_tbl_df', 'tbl_df', 'tbl' and 'data.frame': 5 obs. of 13 variables:
## $ AIRLINE : chr "ALASKA" "ALASKA" NA "AM WEST" ...
## $ STATUS : chr "on time" "delayed" NA "on time" ...
## $ LOS ANGELES : num 497 62 NA 694 117
## $ PHOENIX : num 221 12 NA 4840 415
## $ SAN DIEGO : num 212 20 NA 383 65
## $ SAN FRANCISCO: num 503 102 NA 320 129
## $ SEATTLE : num 1841 305 NA 201 61
## $ X8 : logi NA NA NA NA NA
## $ X9 : logi NA NA NA NA NA
## $ X10 : logi NA NA NA NA NA
## $ X11 : logi NA NA NA NA NA
## $ X12 : logi NA NA NA NA NA
## $ X13 : logi NA NA NA NA NA
## - attr(*, "spec")=
## .. cols(
## .. AIRLINE = col_character(),
## .. STATUS = col_character(),
## .. `LOS ANGELES` = col_double(),
## .. PHOENIX = col_double(),
## .. `SAN DIEGO` = col_double(),
## .. `SAN FRANCISCO` = col_double(),
## .. SEATTLE = col_number(),
## .. X8 = col_logical(),
## .. X9 = col_logical(),
## .. X10 = col_logical(),
## .. X11 = col_logical(),
## .. X12 = col_logical(),
## .. X13 = col_logical()
## .. )
summary(airline_data)
## AIRLINE STATUS LOS ANGELES PHOENIX
## Length:5 Length:5 Min. : 62.0 Min. : 12.0
## Class :character Class :character 1st Qu.:103.2 1st Qu.: 168.8
## Mode :character Mode :character Median :307.0 Median : 318.0
## Mean :342.5 Mean :1372.0
## 3rd Qu.:546.2 3rd Qu.:1521.2
## Max. :694.0 Max. :4840.0
## NA's :1 NA's :1
## SAN DIEGO SAN FRANCISCO SEATTLE X8
## Min. : 20.00 Min. :102.0 Min. : 61 Mode:logical
## 1st Qu.: 53.75 1st Qu.:122.2 1st Qu.: 166 NA's:5
## Median :138.50 Median :224.5 Median : 253
## Mean :170.00 Mean :263.5 Mean : 602
## 3rd Qu.:254.75 3rd Qu.:365.8 3rd Qu.: 689
## Max. :383.00 Max. :503.0 Max. :1841
## NA's :1 NA's :1 NA's :1
## X9 X10 X11 X12
## Mode:logical Mode:logical Mode:logical Mode:logical
## NA's:5 NA's:5 NA's:5 NA's:5
##
##
##
##
##
## X13
## Mode:logical
## NA's:5
##
##
##
##
##
Tidying
table(airline_data$AIRLINE)
##
## ALASKA AM WEST
## 2 2
airline_data$AIRLINE <- factor(airline_data$AIRLINE)
table(airline_data$STATUS)
##
## delayed on time
## 2 2
airline_data$STATUS <- factor(airline_data$STATUS)
dim(airline_data)
## [1] 5 13
Preparation for Analysis - Missing values
airline_data_g <- gather(airline_data, CITY, MINUTES, -AIRLINE, -STATUS)
airline_data_g$CITY <- factor(airline_data_g$CITY)
dim(airline_data_g)
## [1] 55 4
head(airline_data_g)
## # A tibble: 6 x 4
## AIRLINE STATUS CITY MINUTES
## <fct> <fct> <fct> <dbl>
## 1 ALASKA on time LOS ANGELES 497
## 2 ALASKA delayed LOS ANGELES 62
## 3 <NA> <NA> LOS ANGELES NA
## 4 AM WEST on time LOS ANGELES 694
## 5 AM WEST delayed LOS ANGELES 117
## 6 ALASKA on time PHOENIX 221
airline_data_ready <- na.omit(airline_data_g)
airline_data_ready <- spread(airline_data_ready, STATUS, MINUTES)
glimpse(airline_data_ready)
## Observations: 10
## Variables: 4
## $ AIRLINE <fct> ALASKA, ALASKA, ALASKA, ALASKA, ALASKA, AM WEST, AM ...
## $ CITY <fct> LOS ANGELES, PHOENIX, SAN DIEGO, SAN FRANCISCO, SEAT...
## $ delayed <dbl> 62, 12, 20, 102, 305, 117, 415, 65, 129, 61
## $ `on time` <dbl> 497, 221, 212, 503, 1841, 694, 4840, 383, 320, 201
dim(airline_data_ready)
## [1] 10 4
head(airline_data_ready)
## # A tibble: 6 x 4
## AIRLINE CITY delayed `on time`
## <fct> <fct> <dbl> <dbl>
## 1 ALASKA LOS ANGELES 62 497
## 2 ALASKA PHOENIX 12 221
## 3 ALASKA SAN DIEGO 20 212
## 4 ALASKA SAN FRANCISCO 102 503
## 5 ALASKA SEATTLE 305 1841
## 6 AM WEST LOS ANGELES 117 694
tail(airline_data_ready)
## # A tibble: 6 x 4
## AIRLINE CITY delayed `on time`
## <fct> <fct> <dbl> <dbl>
## 1 ALASKA SEATTLE 305 1841
## 2 AM WEST LOS ANGELES 117 694
## 3 AM WEST PHOENIX 415 4840
## 4 AM WEST SAN DIEGO 65 383
## 5 AM WEST SAN FRANCISCO 129 320
## 6 AM WEST SEATTLE 61 201
Analysis-Compare delays across airlines
airline_delay_tbl <- select(airline_data_ready, AIRLINE, delayed)
glimpse(airline_delay_tbl)
## Observations: 10
## Variables: 2
## $ AIRLINE <fct> ALASKA, ALASKA, ALASKA, ALASKA, ALASKA, AM WEST, AM WE...
## $ delayed <dbl> 62, 12, 20, 102, 305, 117, 415, 65, 129, 61
summarize(filter(airline_delay_tbl, AIRLINE=='ALASKA'), Alasks_Avg_delay=mean(delayed))
## # A tibble: 1 x 1
## Alasks_Avg_delay
## <dbl>
## 1 100.
summarize(filter(airline_delay_tbl, AIRLINE=='AM WEST'), AM_Avg_delay=mean(delayed))
## # A tibble: 1 x 1
## AM_Avg_delay
## <dbl>
## 1 157.