Retrieve the data from a CSV file

fullData <- read.csv2(file = "https://raw.githubusercontent.com/san123i/CUNY/master/Semester1/607/Week5/data.csv", header = T, sep = ",",  na.strings = c("","NA"), blank.lines.skip = TRUE, stringsAsFactors = F)
names(fullData)[1] <- 'Airline'

Convert the data from ‘Wide’ into a ‘Long’ format for ease of analysis

fullData_Long <- gather(fullData, key="City", value="FlightCount", 3:7)
head(fullData_Long)
##   Airline  Status        City FlightCount
## 1  ALASKA on time Los.Angeles         497
## 2  ALASKA delayed Los.Angeles          62
## 3 AM WEST on time Los.Angeles         694
## 4 AM WEST delayed Los.Angeles         117
## 5  ALASKA on time     Phoenix         221
## 6  ALASKA delayed     Phoenix          12

Transform the data using tidyr and dplyr functions

1. gather -> Convert data from Wide format to Long format 
2. select -> Select only a limited set of columns
3. group_by -> Group the data in a tbl/df mentioning selected columns
4. summarize -> Generate sumarized stats
5. filter -> Filter the tbl/df by applying conditions on columns
6. mutate -> Modify the column data or create new columns
7. left_join  -> Apply left join on different datasets
airline_city_total <- fullData_Long %>% select(Airline, City, FlightCount) %>% group_by(Airline, City) %>% summarize(totalcount=sum(FlightCount))

fullData_long_totalcount <- left_join(fullData_Long, airline_city_total, by=c("Airline","City"))
head(fullData_long_totalcount)
##   Airline  Status        City FlightCount totalcount
## 1  ALASKA on time Los.Angeles         497        559
## 2  ALASKA delayed Los.Angeles          62        559
## 3 AM WEST on time Los.Angeles         694        811
## 4 AM WEST delayed Los.Angeles         117        811
## 5  ALASKA on time     Phoenix         221        233
## 6  ALASKA delayed     Phoenix          12        233
city_airline_ontimeRatio <- fullData_long_totalcount %>% select(Airline, Status, City, FlightCount, totalcount) %>% mutate(ratio=FlightCount/totalcount) %>% filter(Status=='on time') %>% group_by(City, Airline) %>% select(City, Airline, ratio)
city_airline_ontimeRatio
## # A tibble: 10 x 3
## # Groups:   City, Airline [10]
##    City          Airline ratio
##    <chr>         <chr>   <dbl>
##  1 Los.Angeles   ALASKA  0.889
##  2 Los.Angeles   AM WEST 0.856
##  3 Phoenix       ALASKA  0.948
##  4 Phoenix       AM WEST 0.921
##  5 San.Diego     ALASKA  0.914
##  6 San.Diego     AM WEST 0.855
##  7 San.Fransisco ALASKA  0.831
##  8 San.Fransisco AM WEST 0.713
##  9 Seattle       ALASKA  0.858
## 10 Seattle       AM WEST 0.767

Below is the transformed data with few summarized data elements

final_transformed_data <- mutate(city_airline_ontimeRatio, ontime_ratio=ratio, delay_ratio=1-ratio) %>% select(City, Airline, ontime_ratio, delay_ratio)
final_transformed_data
## # A tibble: 10 x 4
## # Groups:   City, Airline [10]
##    City          Airline ontime_ratio delay_ratio
##    <chr>         <chr>          <dbl>       <dbl>
##  1 Los.Angeles   ALASKA         0.889      0.111 
##  2 Los.Angeles   AM WEST        0.856      0.144 
##  3 Phoenix       ALASKA         0.948      0.0515
##  4 Phoenix       AM WEST        0.921      0.0790
##  5 San.Diego     ALASKA         0.914      0.0862
##  6 San.Diego     AM WEST        0.855      0.145 
##  7 San.Fransisco ALASKA         0.831      0.169 
##  8 San.Fransisco AM WEST        0.713      0.287 
##  9 Seattle       ALASKA         0.858      0.142 
## 10 Seattle       AM WEST        0.767      0.233

Number of operating flights by City and Airline

ggplot(fullData_long_totalcount, aes(x=City, y = FlightCount, fill=Airline)) + geom_bar(stat="identity", position = position_dodge())

Ratio of flights coming on time by City and Airline

ggplot(final_transformed_data, aes(x=City, y = ontime_ratio, fill=Airline)) + geom_bar(stat="identity", position = position_dodge())

Ratio of flights delayed by City and Airline

ggplot(final_transformed_data, aes(x=City, y = delay_ratio, fill=Airline)) + geom_bar(stat="identity", position = position_dodge())

Observations: