library(readr)
library(tidyr)
library(dplyr)
## Warning: package 'dplyr' was built under R version 3.5.2
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(janitor)

(1) Create a .CSV file (or optionally, a MySQL database!) that includes all of the information above. You’re encouraged to use a “wide” structure similar to how the information appears above, so that you can practice tidying and transformations as described below.

numbersense <- read_csv("~/Google Drive/CUNY SPRING 19/COURSES/data 607/ASSIGNMENTS/numbersense.csv")
## Warning: Missing column names filled in: 'X1' [1], 'X2' [2]
## Parsed with column specification:
## cols(
##   X1 = col_character(),
##   X2 = col_character(),
##   `Los Angeles` = col_double(),
##   Phoenix = col_double(),
##   `San Diego` = col_double(),
##   `San Francisco` = col_double(),
##   Seattle = col_double()
## )
numbersense
## # A tibble: 5 x 7
##   X1      X2      `Los Angeles` Phoenix `San Diego` `San Francisco` Seattle
##   <chr>   <chr>           <dbl>   <dbl>       <dbl>           <dbl>   <dbl>
## 1 ALASKA  on time           497     221         212             503    1841
## 2 ALASKA  delayed            62      12          20             102     305
## 3 <NA>    <NA>               NA      NA          NA              NA      NA
## 4 AM WEST on time           694    4840         383             320     201
## 5 AM WEST delayed           117     415          65             129      61

(2) Read the information from your.CSV file into R,and use tidyr and dplyr as needed to tidy and transform your data.

Condensing the destinations(observations) to rows and removing empty data.

numbersense %>% remove_empty("rows")
## # A tibble: 4 x 7
##   X1      X2      `Los Angeles` Phoenix `San Diego` `San Francisco` Seattle
##   <chr>   <chr>           <dbl>   <dbl>       <dbl>           <dbl>   <dbl>
## 1 ALASKA  on time           497     221         212             503    1841
## 2 ALASKA  delayed            62      12          20             102     305
## 3 AM WEST on time           694    4840         383             320     201
## 4 AM WEST delayed           117     415          65             129      61
numbersense_gather <- numbersense %>% gather(Destination, Flights, -X1, -X2, na.rm =TRUE)
numbersense_gather
## # A tibble: 20 x 4
##    X1      X2      Destination   Flights
##    <chr>   <chr>   <chr>           <dbl>
##  1 ALASKA  on time Los Angeles       497
##  2 ALASKA  delayed Los Angeles        62
##  3 AM WEST on time Los Angeles       694
##  4 AM WEST delayed Los Angeles       117
##  5 ALASKA  on time Phoenix           221
##  6 ALASKA  delayed Phoenix            12
##  7 AM WEST on time Phoenix          4840
##  8 AM WEST delayed Phoenix           415
##  9 ALASKA  on time San Diego         212
## 10 ALASKA  delayed San Diego          20
## 11 AM WEST on time San Diego         383
## 12 AM WEST delayed San Diego          65
## 13 ALASKA  on time San Francisco     503
## 14 ALASKA  delayed San Francisco     102
## 15 AM WEST on time San Francisco     320
## 16 AM WEST delayed San Francisco     129
## 17 ALASKA  on time Seattle          1841
## 18 ALASKA  delayed Seattle           305
## 19 AM WEST on time Seattle           201
## 20 AM WEST delayed Seattle            61

rename columns for clarity

names(numbersense_gather) <- c("AIRLINE", "Flight_Status", "Destination", "Flights")

spread function to focus on the target data

numbersense_spread <- numbersense_gather %>%
  spread(Flight_Status, Flights)

rename columns for clarity

names(numbersense_spread) <- c("AIRLINE", "Destination", "Delayed", "On_Time")
(3) Perform analysis to compare the arrival delays for the two airlines.

Display the average number of delays for both airlines

numbersense_spread %>%
  group_by(AIRLINE) %>%
  summarise(aver_delay_per_airline = mean(Delayed))
## # A tibble: 2 x 2
##   AIRLINE aver_delay_per_airline
##   <chr>                    <dbl>
## 1 ALASKA                    100.
## 2 AM WEST                   157.

AM WEST experiences 57% more delays than ALASKA

Display the chance that a flight will be delayed based on the chosen Airline and Destination

numbersense_spread %>%
  group_by(AIRLINE, Destination) %>%
  select(Delayed, On_Time) %>%
  mutate('(%) Chance_Delay' = (Delayed/(Delayed + On_Time)*100))
## Adding missing grouping variables: `AIRLINE`, `Destination`
## # A tibble: 10 x 5
## # Groups:   AIRLINE, Destination [10]
##    AIRLINE Destination   Delayed On_Time `(%) Chance_Delay`
##    <chr>   <chr>           <dbl>   <dbl>              <dbl>
##  1 ALASKA  Los Angeles        62     497              11.1 
##  2 ALASKA  Phoenix            12     221               5.15
##  3 ALASKA  San Diego          20     212               8.62
##  4 ALASKA  San Francisco     102     503              16.9 
##  5 ALASKA  Seattle           305    1841              14.2 
##  6 AM WEST Los Angeles       117     694              14.4 
##  7 AM WEST Phoenix           415    4840               7.90
##  8 AM WEST San Diego          65     383              14.5 
##  9 AM WEST San Francisco     129     320              28.7 
## 10 AM WEST Seattle            61     201              23.3

With this information potential passengers will be see the probability of their flight being delayed before they book it.

Display the two destinations that experience the lowest number of delays per airline.

numbersense_spread %>%
  group_by(AIRLINE,Destination) %>%
  summarise(min_delay_per_dest = min(Delayed)) %>%
  arrange(Destination) %>% slice(1:2)
## # A tibble: 4 x 3
## # Groups:   AIRLINE [2]
##   AIRLINE Destination min_delay_per_dest
##   <chr>   <chr>                    <dbl>
## 1 ALASKA  Los Angeles                 62
## 2 ALASKA  Phoenix                     12
## 3 AM WEST Los Angeles                117
## 4 AM WEST Phoenix                    415

Both Airlines experience their lowest number of delays in LA and Phoenix However, ALASKA is 417% more likely to not experience a delay in Phoenix, while AM WEST is 255% more likely to not experience a delay in LA