library(readr)
library(tidyr)
library(dplyr)
## Warning: package 'dplyr' was built under R version 3.5.2
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(janitor)
numbersense <- read_csv("~/Google Drive/CUNY SPRING 19/COURSES/data 607/ASSIGNMENTS/numbersense.csv")
## Warning: Missing column names filled in: 'X1' [1], 'X2' [2]
## Parsed with column specification:
## cols(
## X1 = col_character(),
## X2 = col_character(),
## `Los Angeles` = col_double(),
## Phoenix = col_double(),
## `San Diego` = col_double(),
## `San Francisco` = col_double(),
## Seattle = col_double()
## )
numbersense
## # A tibble: 5 x 7
## X1 X2 `Los Angeles` Phoenix `San Diego` `San Francisco` Seattle
## <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 ALASKA on time 497 221 212 503 1841
## 2 ALASKA delayed 62 12 20 102 305
## 3 <NA> <NA> NA NA NA NA NA
## 4 AM WEST on time 694 4840 383 320 201
## 5 AM WEST delayed 117 415 65 129 61
Condensing the destinations(observations) to rows and removing empty data.
numbersense %>% remove_empty("rows")
## # A tibble: 4 x 7
## X1 X2 `Los Angeles` Phoenix `San Diego` `San Francisco` Seattle
## <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 ALASKA on time 497 221 212 503 1841
## 2 ALASKA delayed 62 12 20 102 305
## 3 AM WEST on time 694 4840 383 320 201
## 4 AM WEST delayed 117 415 65 129 61
numbersense_gather <- numbersense %>% gather(Destination, Flights, -X1, -X2, na.rm =TRUE)
numbersense_gather
## # A tibble: 20 x 4
## X1 X2 Destination Flights
## <chr> <chr> <chr> <dbl>
## 1 ALASKA on time Los Angeles 497
## 2 ALASKA delayed Los Angeles 62
## 3 AM WEST on time Los Angeles 694
## 4 AM WEST delayed Los Angeles 117
## 5 ALASKA on time Phoenix 221
## 6 ALASKA delayed Phoenix 12
## 7 AM WEST on time Phoenix 4840
## 8 AM WEST delayed Phoenix 415
## 9 ALASKA on time San Diego 212
## 10 ALASKA delayed San Diego 20
## 11 AM WEST on time San Diego 383
## 12 AM WEST delayed San Diego 65
## 13 ALASKA on time San Francisco 503
## 14 ALASKA delayed San Francisco 102
## 15 AM WEST on time San Francisco 320
## 16 AM WEST delayed San Francisco 129
## 17 ALASKA on time Seattle 1841
## 18 ALASKA delayed Seattle 305
## 19 AM WEST on time Seattle 201
## 20 AM WEST delayed Seattle 61
rename columns for clarity
names(numbersense_gather) <- c("AIRLINE", "Flight_Status", "Destination", "Flights")
spread function to focus on the target data
numbersense_spread <- numbersense_gather %>%
spread(Flight_Status, Flights)
rename columns for clarity
names(numbersense_spread) <- c("AIRLINE", "Destination", "Delayed", "On_Time")
Display the average number of delays for both airlines
numbersense_spread %>%
group_by(AIRLINE) %>%
summarise(aver_delay_per_airline = mean(Delayed))
## # A tibble: 2 x 2
## AIRLINE aver_delay_per_airline
## <chr> <dbl>
## 1 ALASKA 100.
## 2 AM WEST 157.
AM WEST experiences 57% more delays than ALASKA
Display the chance that a flight will be delayed based on the chosen Airline and Destination
numbersense_spread %>%
group_by(AIRLINE, Destination) %>%
select(Delayed, On_Time) %>%
mutate('(%) Chance_Delay' = (Delayed/(Delayed + On_Time)*100))
## Adding missing grouping variables: `AIRLINE`, `Destination`
## # A tibble: 10 x 5
## # Groups: AIRLINE, Destination [10]
## AIRLINE Destination Delayed On_Time `(%) Chance_Delay`
## <chr> <chr> <dbl> <dbl> <dbl>
## 1 ALASKA Los Angeles 62 497 11.1
## 2 ALASKA Phoenix 12 221 5.15
## 3 ALASKA San Diego 20 212 8.62
## 4 ALASKA San Francisco 102 503 16.9
## 5 ALASKA Seattle 305 1841 14.2
## 6 AM WEST Los Angeles 117 694 14.4
## 7 AM WEST Phoenix 415 4840 7.90
## 8 AM WEST San Diego 65 383 14.5
## 9 AM WEST San Francisco 129 320 28.7
## 10 AM WEST Seattle 61 201 23.3
With this information potential passengers will be see the probability of their flight being delayed before they book it.
Display the two destinations that experience the lowest number of delays per airline.
numbersense_spread %>%
group_by(AIRLINE,Destination) %>%
summarise(min_delay_per_dest = min(Delayed)) %>%
arrange(Destination) %>% slice(1:2)
## # A tibble: 4 x 3
## # Groups: AIRLINE [2]
## AIRLINE Destination min_delay_per_dest
## <chr> <chr> <dbl>
## 1 ALASKA Los Angeles 62
## 2 ALASKA Phoenix 12
## 3 AM WEST Los Angeles 117
## 4 AM WEST Phoenix 415
Both Airlines experience their lowest number of delays in LA and Phoenix However, ALASKA is 417% more likely to not experience a delay in Phoenix, while AM WEST is 255% more likely to not experience a delay in LA