Using “tidyr” and “dplyr” functions
library(stringr)
library(readr)
library(tidyr)
library(dplyr)
library(ggplot2)
library(plotly)
Read the input data:
# Load the airline arrival table from the CSV file in the home directory.
airline <- read.csv(file = "~/airlinedelays.csv")
## Warning in read.table(file = file, header = header, sep = sep, quote
## = quote, : incomplete final line found by readTableHeader on '~/
## airlinedelays.csv'
# Give the two auto-named leading columns (X, X.1) descriptive names and
# restore the spaces in the multi-word city names (Los.Angeles -> "Los Angeles").
airlinename <- airline %>%
  rename(
    airline = X,
    arrival = X.1,
    "Los Angeles" = Los.Angeles,
    "San Diego" = San.Diego,
    "San Francisco" = San.Francisco
  )
airlinename
## airline arrival Los Angeles Phoenix San Diego San Francisco Seattle
## 1 ALASKA on time 497 221 212 503 1841
## 2 delayed 62 12 20 102 305
## 3 AM WEST on time 694 4840 383 320 201
## 4 delayed 117 415 65 129 61
Filling in the blank airline names (each "delayed" row takes the airline of the row above it):
# Only the first row of each airline carries its name; the row below it has a
# blank airline cell. Convert blank cells to NA and carry the last seen name
# downward. Unlike an index loop over even rows, this fills only cells that are
# actually blank and is safe on an empty data frame.
airlinename <- airlinename %>%
  # as.character() keeps this working whether read.csv() produced a factor or
  # a character column; trimws() treats whitespace-only cells as blank.
  mutate(airline = na_if(trimws(as.character(airline)), "")) %>%
  fill(airline, .direction = "down")
airlinename
## airline arrival Los Angeles Phoenix San Diego San Francisco Seattle
## 1 ALASKA on time 497 221 212 503 1841
## 2 ALASKA delayed 62 12 20 102 305
## 3 AM WEST on time 694 4840 383 320 201
## 4 AM WEST delayed 117 415 65 129 61
# Reshape from wide to long: columns 3 to 7 (one per airport) are stacked into
# an `airport` key column and a `time` value column.
airline1 <- gather(airlinename, key = "airport", value = "time", 3:7)
airline1
## airline arrival airport time
## 1 ALASKA on time Los Angeles 497
## 2 ALASKA delayed Los Angeles 62
## 3 AM WEST on time Los Angeles 694
## 4 AM WEST delayed Los Angeles 117
## 5 ALASKA on time Phoenix 221
## 6 ALASKA delayed Phoenix 12
## 7 AM WEST on time Phoenix 4840
## 8 AM WEST delayed Phoenix 415
## 9 ALASKA on time San Diego 212
## 10 ALASKA delayed San Diego 20
## 11 AM WEST on time San Diego 383
## 12 AM WEST delayed San Diego 65
## 13 ALASKA on time San Francisco 503
## 14 ALASKA delayed San Francisco 102
## 15 AM WEST on time San Francisco 320
## 16 AM WEST delayed San Francisco 129
## 17 ALASKA on time Seattle 1841
## 18 ALASKA delayed Seattle 305
## 19 AM WEST on time Seattle 201
## 20 AM WEST delayed Seattle 61
# Keep only the "on time" rows. `arrival` is referenced as a bare column name so
# that filter() evaluates it inside the data being filtered, instead of pulling
# the column from a separate data frame with `$`.
airline2 <- airline1
airlineontime <- airline2 %>%
  filter(arrival == "on time")
airlineontime
## airline arrival airport time
## 1 ALASKA on time Los Angeles 497
## 2 AM WEST on time Los Angeles 694
## 3 ALASKA on time Phoenix 221
## 4 AM WEST on time Phoenix 4840
## 5 ALASKA on time San Diego 212
## 6 AM WEST on time San Diego 383
## 7 ALASKA on time San Francisco 503
## 8 AM WEST on time San Francisco 320
## 9 ALASKA on time Seattle 1841
## 10 AM WEST on time Seattle 201
# Average of the `time` column across all on-time rows.
mean(airlineontime[["time"]])
## [1] 971.2
# Summary statistics of the `time` column for the on-time rows. Columns are
# referenced by bare name so summarise() reads them from the piped data rather
# than from the global data frame via `$`.
airlineontime %>%
  summarise(
    Min = min(time, na.rm = TRUE),
    Median = median(time, na.rm = TRUE),
    Mean = mean(time, na.rm = TRUE),
    Var = var(time, na.rm = TRUE),
    SD = sd(time, na.rm = TRUE),
    Max = max(time, na.rm = TRUE),
    N = n()  # number of rows summarised
  )
## Min Median Mean Var SD Max N
## 1 201 440 971.2 2083057 1443.28 4840 10
# Keep only the "delayed" rows. `arrival` is referenced as a bare column name so
# that filter() evaluates it inside the data being filtered, instead of pulling
# the column from a separate data frame with `$`.
airline3 <- airline1
airlinedelayed <- airline3 %>%
  filter(arrival == "delayed")
airlinedelayed
## airline arrival airport time
## 1 ALASKA delayed Los Angeles 62
## 2 AM WEST delayed Los Angeles 117
## 3 ALASKA delayed Phoenix 12
## 4 AM WEST delayed Phoenix 415
## 5 ALASKA delayed San Diego 20
## 6 AM WEST delayed San Diego 65
## 7 ALASKA delayed San Francisco 102
## 8 AM WEST delayed San Francisco 129
## 9 ALASKA delayed Seattle 305
## 10 AM WEST delayed Seattle 61
# Average of the `time` column across all delayed rows.
mean(airlinedelayed[["time"]])
## [1] 128.8
# Summary statistics of the `time` column for the delayed rows. Columns are
# referenced by bare name so summarise() reads them from the piped data rather
# than from the global data frame via `$`.
airlinedelayed %>%
  summarise(
    Min = min(time, na.rm = TRUE),
    Median = median(time, na.rm = TRUE),
    Mean = mean(time, na.rm = TRUE),
    Var = var(time, na.rm = TRUE),
    SD = sd(time, na.rm = TRUE),
    Max = max(time, na.rm = TRUE),
    N = n()  # number of rows summarised
  )
## Min Median Mean Var SD Max N
## 1 12 83.5 128.8 16935.96 130.1382 415 10
# Dodged bar chart of the delayed rows: one group of bars per airport, one bar
# per airline, converted to an interactive plotly widget.
delayed_plot <- ggplot(airlinedelayed, aes(x = airport, y = time)) +
  geom_bar(aes(fill = airline), stat = "identity", position = "dodge") +
  ylab("Time of Delayed Flights") +
  ggtitle("Delayed Flights by Region")
ggplotly(delayed_plot)
```