#1.load csv file into r
flight <- read.csv (file ='/Users/joycealdrich/Documents/SPS Data Science/Data 607/Assignment_4/Flight.csv')
head(flight)
## X X.1 Los.Angeles Phoenix San.Diego San.Francisco Seattle
## 1 ALASKA on time 497 221 212 503 1841
## 2 delayed 62 12 20 102 305
## 3 NA NA NA NA NA
## 4 AM WEST on time 694 4840 383 320 201
## 5 delayed 117 415 65 129 61
library(tidyr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
#2 remove empty row
flight2 <- flight [-c(3),]
flight2
## X X.1 Los.Angeles Phoenix San.Diego San.Francisco Seattle
## 1 ALASKA on time 497 221 212 503 1841
## 2 delayed 62 12 20 102 305
## 4 AM WEST on time 694 4840 383 320 201
## 5 delayed 117 415 65 129 61
#3 Put the airline name in the x col
flight2$X <- c("ALASKA","ALASKA","AM WEST", "AM WEST")
#4 organize the table
flight2 <- gather(flight2, "Destination", "Count", 3:7)
#5 rename the 1st and 2nd col
flight2 <- rename(flight2, Airline=X, Status=X.1)
#6 reorder the col
flight2 <- select(flight2, Airline, Destination, Status,Count)
#6 cast status and count
flight3 <- flight2 %>%
pivot_wider(names_from = "Status", values_from = "Count")
head(flight3)
## # A tibble: 6 × 4
## Airline Destination `on time` delayed
## <chr> <chr> <int> <int>
## 1 ALASKA Los.Angeles 497 62
## 2 AM WEST Los.Angeles 694 117
## 3 ALASKA Phoenix 221 12
## 4 AM WEST Phoenix 4840 415
## 5 ALASKA San.Diego 212 20
## 6 AM WEST San.Diego 383 65
#7 adding total col in the data.frame
flight4 <- flight3 %>%
mutate(total=flight3$`on time`+flight3$delayed)
#8 adding delayed_rate col in the data.frame #Noted that AM WEST airline’s delayed rate in 5 different destination all higer than ALASKA airline.
flight5 <- flight4 %>%
mutate(delayed_rate = flight4$delayed/flight4$total)
flight5
## # A tibble: 10 × 6
## Airline Destination `on time` delayed total delayed_rate
## <chr> <chr> <int> <int> <int> <dbl>
## 1 ALASKA Los.Angeles 497 62 559 0.111
## 2 AM WEST Los.Angeles 694 117 811 0.144
## 3 ALASKA Phoenix 221 12 233 0.0515
## 4 AM WEST Phoenix 4840 415 5255 0.0790
## 5 ALASKA San.Diego 212 20 232 0.0862
## 6 AM WEST San.Diego 383 65 448 0.145
## 7 ALASKA San.Francisco 503 102 605 0.169
## 8 AM WEST San.Francisco 320 129 449 0.287
## 9 ALASKA Seattle 1841 305 2146 0.142
## 10 AM WEST Seattle 201 61 262 0.233
#9 creating ggplot to compare delayed rate between two airlines and destination
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.3.6 ✔ purrr 0.3.4
## ✔ tibble 3.1.8 ✔ stringr 1.4.1
## ✔ readr 2.1.2 ✔ forcats 0.5.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
flight5 %>%
ggplot(aes(Airline,delayed_rate,color=Destination))+
geom_point()