#1.load csv file into r

flight <- read.csv (file ='/Users/joycealdrich/Documents/SPS Data Science/Data 607/Assignment_4/Flight.csv')
head(flight)
##         X     X.1 Los.Angeles Phoenix San.Diego San.Francisco Seattle
## 1  ALASKA on time         497     221       212           503    1841
## 2         delayed          62      12        20           102     305
## 3                          NA      NA        NA            NA      NA
## 4 AM WEST on time         694    4840       383           320     201
## 5         delayed         117     415        65           129      61
library(tidyr)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

#2 remove empty row

flight2 <- flight [-c(3),]
flight2
##         X     X.1 Los.Angeles Phoenix San.Diego San.Francisco Seattle
## 1  ALASKA on time         497     221       212           503    1841
## 2         delayed          62      12        20           102     305
## 4 AM WEST on time         694    4840       383           320     201
## 5         delayed         117     415        65           129      61

#3 Put the airline name in the x col

flight2$X <- c("ALASKA","ALASKA","AM WEST", "AM WEST")

#4 organize the table

flight2 <- gather(flight2, "Destination", "Count", 3:7)

#5 rename the 1st and 2nd col

flight2 <- rename(flight2, Airline=X, Status=X.1)

#6 reorder the col

flight2 <- select(flight2, Airline, Destination, Status,Count)

#6 cast status and count

flight3 <- flight2 %>%
  pivot_wider(names_from = "Status", values_from = "Count")
head(flight3)
## # A tibble: 6 × 4
##   Airline Destination `on time` delayed
##   <chr>   <chr>           <int>   <int>
## 1 ALASKA  Los.Angeles       497      62
## 2 AM WEST Los.Angeles       694     117
## 3 ALASKA  Phoenix           221      12
## 4 AM WEST Phoenix          4840     415
## 5 ALASKA  San.Diego         212      20
## 6 AM WEST San.Diego         383      65

#7 adding total col in the data.frame

flight4 <- flight3 %>%
  mutate(total=flight3$`on time`+flight3$delayed)

#8 adding delayed_rate col in the data.frame #Noted that AM WEST airline’s delayed rate in 5 different destination all higer than ALASKA airline.

flight5 <- flight4 %>%
  mutate(delayed_rate = flight4$delayed/flight4$total)
flight5
## # A tibble: 10 × 6
##    Airline Destination   `on time` delayed total delayed_rate
##    <chr>   <chr>             <int>   <int> <int>        <dbl>
##  1 ALASKA  Los.Angeles         497      62   559       0.111 
##  2 AM WEST Los.Angeles         694     117   811       0.144 
##  3 ALASKA  Phoenix             221      12   233       0.0515
##  4 AM WEST Phoenix            4840     415  5255       0.0790
##  5 ALASKA  San.Diego           212      20   232       0.0862
##  6 AM WEST San.Diego           383      65   448       0.145 
##  7 ALASKA  San.Francisco       503     102   605       0.169 
##  8 AM WEST San.Francisco       320     129   449       0.287 
##  9 ALASKA  Seattle            1841     305  2146       0.142 
## 10 AM WEST Seattle             201      61   262       0.233

#9 creating ggplot to compare delayed rate between two airlines and destination

library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.3.6     ✔ purrr   0.3.4
## ✔ tibble  3.1.8     ✔ stringr 1.4.1
## ✔ readr   2.1.2     ✔ forcats 0.5.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
flight5 %>%
  ggplot(aes(Airline,delayed_rate,color=Destination))+
  geom_point()