Installing necessary packages
# install.packages("tidyr")
# install.packages("dplyr")
# install.packages("ggplot2")
library(tidyr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
library(stringr)
data<- read.csv("https://raw.githubusercontent.com/olgashiligin/Tidying_and_Transforming/master/delays.csv",sep=",")
data
## X X.1 Los.Angeles Phoenix San.Diego San.Francisco Seattle
## 1 ALASKA on time 497 221 212 503 1841
## 2 delayed 62 12 20 102 305
## 3 NA NA NA NA NA
## 4 AM WEST on time 694 4840 383 320 201
## 5 delayed 117 415 65 129 61
Removing one empty row
data<-data[-c(3), ]
data
## X X.1 Los.Angeles Phoenix San.Diego San.Francisco Seattle
## 1 ALASKA on time 497 221 212 503 1841
## 2 delayed 62 12 20 102 305
## 4 AM WEST on time 694 4840 383 320 201
## 5 delayed 117 415 65 129 61
Renaming first two columns
names(data) <- c('Airlines', 'Status', 'Los Angeles', 'Phoenix', 'San Diego','San Francisco', names(data)[7])
data
## Airlines Status Los Angeles Phoenix San Diego San Francisco Seattle
## 1 ALASKA on time 497 221 212 503 1841
## 2 delayed 62 12 20 102 305
## 4 AM WEST on time 694 4840 383 320 201
## 5 delayed 117 415 65 129 61
Replacing empty spaces with Airlines names
for (i in 1:nrow(data)) {
if (data[i, 1] == " ") {
data[i, 1] = data[i - 1, 1]
}
}
data
## Airlines Status Los Angeles Phoenix San Diego San Francisco Seattle
## 1 ALASKA on time 497 221 212 503 1841
## 2 ALASKA delayed 62 12 20 102 305
## 4 AM WEST on time 694 4840 383 320 201
## 5 AM WEST delayed 117 415 65 129 61
Transforming wide table format
data<-gather(data,Destinations,Number_of_Flights,3:7)
data
## Airlines Status Destinations Number_of_Flights
## 1 ALASKA on time Los Angeles 497
## 2 ALASKA delayed Los Angeles 62
## 3 AM WEST on time Los Angeles 694
## 4 AM WEST delayed Los Angeles 117
## 5 ALASKA on time Phoenix 221
## 6 ALASKA delayed Phoenix 12
## 7 AM WEST on time Phoenix 4840
## 8 AM WEST delayed Phoenix 415
## 9 ALASKA on time San Diego 212
## 10 ALASKA delayed San Diego 20
## 11 AM WEST on time San Diego 383
## 12 AM WEST delayed San Diego 65
## 13 ALASKA on time San Francisco 503
## 14 ALASKA delayed San Francisco 102
## 15 AM WEST on time San Francisco 320
## 16 AM WEST delayed San Francisco 129
## 17 ALASKA on time Seattle 1841
## 18 ALASKA delayed Seattle 305
## 19 AM WEST on time Seattle 201
## 20 AM WEST delayed Seattle 61
Splitting “Status” column into two columns: delayed and on_time. Renaming on time column.
data <- spread(data, Status, Number_of_Flights)
names(data)[names(data) == "on time"] <- "on_time"
data
## Airlines Destinations delayed on_time
## 1 ALASKA Los Angeles 62 497
## 2 ALASKA Phoenix 12 221
## 3 ALASKA San Diego 20 212
## 4 ALASKA San Francisco 102 503
## 5 ALASKA Seattle 305 1841
## 6 AM WEST Los Angeles 117 694
## 7 AM WEST Phoenix 415 4840
## 8 AM WEST San Diego 65 383
## 9 AM WEST San Francisco 129 320
## 10 AM WEST Seattle 61 201
Calculating percentage of flight delays by airlines.
overall_analysis<-data %>%
mutate(total_flights = delayed + on_time) %>%
group_by(Airlines) %>%
summarise(avg_delayed_rate = sum(delayed)/sum(total_flights))
overall_analysis
## # A tibble: 2 x 2
## Airlines avg_delayed_rate
## <fct> <dbl>
## 1 ALASKA 0.133
## 2 AM WEST 0.109
Overall analysis shows that Alaska airlines has higer rate of delays (0.133) compare to AM WEST (0.109)
analysis<-data %>%
mutate(total_flights = delayed + on_time, delayed_rate = delayed/(delayed+on_time)) %>%
arrange(desc(delayed_rate))
analysis
## Airlines Destinations delayed on_time total_flights delayed_rate
## 1 AM WEST San Francisco 129 320 449 0.28730512
## 2 AM WEST Seattle 61 201 262 0.23282443
## 3 ALASKA San Francisco 102 503 605 0.16859504
## 4 AM WEST San Diego 65 383 448 0.14508929
## 5 AM WEST Los Angeles 117 694 811 0.14426634
## 6 ALASKA Seattle 305 1841 2146 0.14212488
## 7 ALASKA Los Angeles 62 497 559 0.11091234
## 8 ALASKA San Diego 20 212 232 0.08620690
## 9 AM WEST Phoenix 415 4840 5255 0.07897241
## 10 ALASKA Phoenix 12 221 233 0.05150215
viz3<-ggplot(analysis, aes(x = Destinations, y = delayed_rate, fill = Airlines)) + geom_col( position = "dodge") + ylab("Delayed Flights, %")+ggtitle(" Flight Delays By Airlines and By Destinations")
viz3
AM WEST: the highest rate of delays - San Francisco (0.28), the lowest rate of delays - Phoenix (0.79)
Alaska: the highest rate of delays - San Francisco (0.17), the lowest rate of delays - Phoenix (0.05)