1)Create a .CSV file that includes all of the information on the flights table. You’re encouraged to use a “wide” structure similar to how the information appears above, so that you can practice tidying and transformations.

library(tidyr)
library(dplyr)
library("ggplot2")
csv <- rbind(c("Airlines", "Status", "Los Angeles", "Phoenix", "San Diego", "San Francisco", "Seattle"),
             c("ALASKA", "On Time", 497, 221, 212, 503, 1841),
             c("ALASKA", "Delayed", 62, 12, 20, 102, 305),
             c("AM WEST", "On Time", 694, 4840, 383, 320, 201),
             c("AM WEST", "Delayed", 117, 415, 65, 129, 61))
write.table(csv, file = "flightss.csv", sep = ",", col.names=F, row.names=F)

2)Read the information from your .CSV file into R, and use tidyr and dplyr as needed to tidy and transform your data.

raw_data <- read.csv(paste0("https://raw.githubusercontent.com/ErindaB/Data607_Assignment/master/flights.csv"),stringsAsFactors = F); 
raw_data
##   Airlines  Status Los.Angeles Phoenix San.Diego San.Francisco Seattle
## 1   ALASKA On Time         497     221       212           503    1841
## 2   ALASKA Delayed          62      12        20           102     305
## 3  AM WEST On Time         694    4840       383           320     201
## 4  AM WEST Delayed         117     415        65           129      61

There are 5 columns for each city.Using gather from tidyr we can have them in only 1 column.

tidyr_data <- gather(raw_data, City, Total, Los.Angeles:Seattle) %>% 
              mutate(City = gsub( "\\.", " ", City)) %>%  ## using mutate to remove . from                                                                city names 
                   arrange(Airlines)
tidyr_data
##    Airlines  Status          City Total
## 1    ALASKA On Time   Los Angeles   497
## 2    ALASKA Delayed   Los Angeles    62
## 3    ALASKA On Time       Phoenix   221
## 4    ALASKA Delayed       Phoenix    12
## 5    ALASKA On Time     San Diego   212
## 6    ALASKA Delayed     San Diego    20
## 7    ALASKA On Time San Francisco   503
## 8    ALASKA Delayed San Francisco   102
## 9    ALASKA On Time       Seattle  1841
## 10   ALASKA Delayed       Seattle   305
## 11  AM WEST On Time   Los Angeles   694
## 12  AM WEST Delayed   Los Angeles   117
## 13  AM WEST On Time       Phoenix  4840
## 14  AM WEST Delayed       Phoenix   415
## 15  AM WEST On Time     San Diego   383
## 16  AM WEST Delayed     San Diego    65
## 17  AM WEST On Time San Francisco   320
## 18  AM WEST Delayed San Francisco   129
## 19  AM WEST On Time       Seattle   201
## 20  AM WEST Delayed       Seattle    61

Use of dplyr to tidy and transform the data

spread_data <- spread(tidyr_data, "Status", "Total")
spread_data
##    Airlines          City Delayed On Time
## 1    ALASKA   Los Angeles      62     497
## 2    ALASKA       Phoenix      12     221
## 3    ALASKA     San Diego      20     212
## 4    ALASKA San Francisco     102     503
## 5    ALASKA       Seattle     305    1841
## 6   AM WEST   Los Angeles     117     694
## 7   AM WEST       Phoenix     415    4840
## 8   AM WEST     San Diego      65     383
## 9   AM WEST San Francisco     129     320
## 10  AM WEST       Seattle      61     201
  1. Perform analysis to compare the arrival delays for the two airline
#arrange rows by City 
spread_data2 <- spread_data %>%
  arrange(City, Airlines)

spread_data2 
##    Airlines          City Delayed On Time
## 1    ALASKA   Los Angeles      62     497
## 2   AM WEST   Los Angeles     117     694
## 3    ALASKA       Phoenix      12     221
## 4   AM WEST       Phoenix     415    4840
## 5    ALASKA     San Diego      20     212
## 6   AM WEST     San Diego      65     383
## 7    ALASKA San Francisco     102     503
## 8   AM WEST San Francisco     129     320
## 9    ALASKA       Seattle     305    1841
## 10  AM WEST       Seattle      61     201
#calculate the percentages of delayed and on time flights for each airline and destination
spread_data3 <- spread_data2 %>% 
        mutate(total = Delayed + `On Time`, percentDelayed = round(Delayed / total * 100, 2), 
               percentOnTime = round(100 * `On Time` / total, 2)) 
        
spread_data3  
##    Airlines          City Delayed On Time total percentDelayed
## 1    ALASKA   Los Angeles      62     497   559          11.09
## 2   AM WEST   Los Angeles     117     694   811          14.43
## 3    ALASKA       Phoenix      12     221   233           5.15
## 4   AM WEST       Phoenix     415    4840  5255           7.90
## 5    ALASKA     San Diego      20     212   232           8.62
## 6   AM WEST     San Diego      65     383   448          14.51
## 7    ALASKA San Francisco     102     503   605          16.86
## 8   AM WEST San Francisco     129     320   449          28.73
## 9    ALASKA       Seattle     305    1841  2146          14.21
## 10  AM WEST       Seattle      61     201   262          23.28
##    percentOnTime
## 1          88.91
## 2          85.57
## 3          94.85
## 4          92.10
## 5          91.38
## 6          85.49
## 7          83.14
## 8          71.27
## 9          85.79
## 10         76.72
# Compare percentage of total delayed flights for each airline by creating a new summary table
sumDf <- spread_data3 %>% 
  group_by(Airlines) %>% 
  summarise(totalDelayed = sum(Delayed), totalOnTime = sum(`On Time`), totalFlights = sum(total)) %>% 
  mutate(percentDelayed = round(totalDelayed/totalFlights * 100, 2))

sumDf 
## # A tibble: 2 x 5
##   Airlines totalDelayed totalOnTime totalFlights percentDelayed
##   <chr>           <int>       <int>        <int>          <dbl>
## 1 ALASKA            501        3274         3775           13.3
## 2 AM WEST           787        6438         7225           10.9
#Total Percentage of Delayed Flights by Airline
TotPercDelayedFlight <- ggplot(sumDf, aes(Airlines, percentDelayed)) + 
  geom_bar(aes(fill = Airlines), position = "dodge", stat = "identity") + 
  geom_text(aes(label = percentDelayed), vjust = -.5) +
  ylab("Percentage of Delayed Flights") +
  ggtitle("Total Percentage of Delayed Flights by Airline")


TotPercDelayedFlight

#Compare the percentages of delayed flights for each airline by cities in a bar graph
PercDelayedAirCities <- ggplot(spread_data3, aes(y=percentDelayed, x = Airlines, color = Airlines, fill = Airlines)) + 
    geom_bar( stat = "identity") +
    geom_text(aes(label = percentDelayed), vjust = -.5) +
    facet_wrap(~City) +
    ylim(0, 55) +
    ylab("Percentage of Delayed Flights") +
    ggtitle("Percentage of Delayed Flights by Airline and Cities")
    
PercDelayedAirCities

 #Total number of Flights by Airline and Destination
PercDelayedAirCities1 <- ggplot(spread_data3, aes(y = total, x = Airlines, color = Airlines, fill = Airlines)) +     geom_bar( stat = "identity") +
    geom_text(aes(label = total), vjust = -.3) +
    facet_wrap(~City) +
    ylim(0, 5500) +
    ylab("Number of Flights") +
    ggtitle("Number of Fights by Airline and Cities")

PercDelayedAirCities1

#Compare the percentages of delayed flights for each airline by cities in a line graph
 PercDelayedAirCities1<- ggplot(spread_data3, aes(x = City, y = percentDelayed, group = Airlines, color = Airlines, shape = Airlines)) + 
  geom_point(aes(shape = Airlines)) + 
  geom_line() + 
  geom_text(aes(label = percentDelayed), vjust = -.4) +
  labs(x = "Destination", y = "Percentage of Delayed flights") + 
  theme(legend.title=element_blank()) +
  ggtitle("Percentage of Delayed Flights by Airline and Cities")

PercDelayedAirCities1

To sum up, AM West Airlines has a lower percentage of total delayed flights than Alaska Airlines. However, Alaska has lower percentages of delayed flights than AM West by destination. AM West has 5255 flights to Phoenix with 7.9% of those flights being delayed, while Alaska has only 233 Phoenix flights with 5.15% of flights delayed.Also AM West has 262 flights to Seattle with 23.28% of flights delayed, while Alaska has 2146 Seattle flights with 14.21% delayed.