Installing necessary packages

# install.packages("tidyr")
# install.packages("dplyr")
# install.packages("ggplot2")
library(tidyr)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)
library(stringr)

Reading data from csv file:

data<- read.csv("https://raw.githubusercontent.com/olgashiligin/Tidying_and_Transforming/master/delays.csv",sep=",")
data
##         X     X.1 Los.Angeles Phoenix San.Diego San.Francisco Seattle
## 1  ALASKA on time         497     221       212           503    1841
## 2         delayed          62      12        20           102     305
## 3                          NA      NA        NA            NA      NA
## 4 AM WEST on time         694    4840       383           320     201
## 5         delayed         117     415        65           129      61

Tidying Data:

  1. Removing one empty row
  2. Renaming first two columns
  3. Replacing empty spaces with Airlines names

Removing one empty row

data<-data[-c(3), ] 
data
##         X     X.1 Los.Angeles Phoenix San.Diego San.Francisco Seattle
## 1  ALASKA on time         497     221       212           503    1841
## 2         delayed          62      12        20           102     305
## 4 AM WEST on time         694    4840       383           320     201
## 5         delayed         117     415        65           129      61

Renaming first two columns

names(data) <- c('Airlines', 'Status', 'Los Angeles', 'Phoenix', 'San Diego','San Francisco', names(data)[7])
data
##   Airlines  Status Los Angeles Phoenix San Diego San Francisco Seattle
## 1   ALASKA on time         497     221       212           503    1841
## 2          delayed          62      12        20           102     305
## 4  AM WEST on time         694    4840       383           320     201
## 5          delayed         117     415        65           129      61

Replacing empty spaces with Airlines names

for (i in 1:nrow(data)) {
  if (data[i, 1] == " ") {
    data[i, 1] = data[i - 1, 1]
  }
}

data
##   Airlines  Status Los Angeles Phoenix San Diego San Francisco Seattle
## 1   ALASKA on time         497     221       212           503    1841
## 2   ALASKA delayed          62      12        20           102     305
## 4  AM WEST on time         694    4840       383           320     201
## 5  AM WEST delayed         117     415        65           129      61

Prepairing data set for the analysis:

  1. Transforming wide table format
  2. Splitting “Status” column into two columns: delayed and on_time.

Transforming wide table format

data<-gather(data,Destinations,Number_of_Flights,3:7)
data
##    Airlines  Status  Destinations Number_of_Flights
## 1    ALASKA on time   Los Angeles               497
## 2    ALASKA delayed   Los Angeles                62
## 3   AM WEST on time   Los Angeles               694
## 4   AM WEST delayed   Los Angeles               117
## 5    ALASKA on time       Phoenix               221
## 6    ALASKA delayed       Phoenix                12
## 7   AM WEST on time       Phoenix              4840
## 8   AM WEST delayed       Phoenix               415
## 9    ALASKA on time     San Diego               212
## 10   ALASKA delayed     San Diego                20
## 11  AM WEST on time     San Diego               383
## 12  AM WEST delayed     San Diego                65
## 13   ALASKA on time San Francisco               503
## 14   ALASKA delayed San Francisco               102
## 15  AM WEST on time San Francisco               320
## 16  AM WEST delayed San Francisco               129
## 17   ALASKA on time       Seattle              1841
## 18   ALASKA delayed       Seattle               305
## 19  AM WEST on time       Seattle               201
## 20  AM WEST delayed       Seattle                61

Splitting “Status” column into two columns: delayed and on_time. Renaming on time column.

data <- spread(data, Status, Number_of_Flights)
names(data)[names(data) == "on time"] <- "on_time"
data
##    Airlines  Destinations delayed on_time
## 1    ALASKA   Los Angeles      62     497
## 2    ALASKA       Phoenix      12     221
## 3    ALASKA     San Diego      20     212
## 4    ALASKA San Francisco     102     503
## 5    ALASKA       Seattle     305    1841
## 6   AM WEST   Los Angeles     117     694
## 7   AM WEST       Phoenix     415    4840
## 8   AM WEST     San Diego      65     383
## 9   AM WEST San Francisco     129     320
## 10  AM WEST       Seattle      61     201

Performing analysis on flight delays by ALASKA and AM WEST airlines.

Calculating percentage of flight delays by airlines.

overall_analysis<-data %>% 
   mutate(total_flights = delayed + on_time) %>%
    group_by(Airlines) %>% 
     summarise(avg_delayed_rate = sum(delayed)/sum(total_flights))

overall_analysis
## # A tibble: 2 x 2
##   Airlines avg_delayed_rate
##   <fct>               <dbl>
## 1 ALASKA              0.133
## 2 AM WEST             0.109

Overall analysis shows that Alaska airlines has higer rate of delays (0.133) compare to AM WEST (0.109)

analysis<-data %>%
  mutate(total_flights = delayed + on_time, delayed_rate = delayed/(delayed+on_time)) %>%
  arrange(desc(delayed_rate))

analysis
##    Airlines  Destinations delayed on_time total_flights delayed_rate
## 1   AM WEST San Francisco     129     320           449   0.28730512
## 2   AM WEST       Seattle      61     201           262   0.23282443
## 3    ALASKA San Francisco     102     503           605   0.16859504
## 4   AM WEST     San Diego      65     383           448   0.14508929
## 5   AM WEST   Los Angeles     117     694           811   0.14426634
## 6    ALASKA       Seattle     305    1841          2146   0.14212488
## 7    ALASKA   Los Angeles      62     497           559   0.11091234
## 8    ALASKA     San Diego      20     212           232   0.08620690
## 9   AM WEST       Phoenix     415    4840          5255   0.07897241
## 10   ALASKA       Phoenix      12     221           233   0.05150215
viz3<-ggplot(analysis, aes(x = Destinations, y = delayed_rate, fill = Airlines)) + geom_col( position = "dodge") + ylab("Delayed Flights, %")+ggtitle("           Flight Delays By Airlines and By Destinations")
viz3

AM WEST: the highest rate of delays - San Francisco (0.28), the lowest rate of delays - Phoenix (0.79)

Alaska: the highest rate of delays - San Francisco (0.17), the lowest rate of delays - Phoenix (0.05)