Week5 Assignment

(1) Create a .CSV file (or optionally, a MySQL database!) that includes all of the information above. You’re encouraged to use a “wide” structure similar to how the information appears below, so that you can practice tidying and transformations as described below.

Week5_Assignment

library(tidyr)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(stringr)
library(sqldf)
## Loading required package: gsubfn
## Loading required package: proto
## Loading required package: RSQLite
## Loading required package: DBI
library(ggplot2)
(2) Read the information from your .CSV file into R, and use tidyr and dplyr as needed to tidy and transform your data.
(2.a) Read flights.csv file and eliminate rows that have blank rows
flights = read.csv(file="flights.csv", header=TRUE, sep=",")
flights = filter(flights, X.1 != "")
print(flights)
##         X     X.1 Los.Angeles Phoenix San.Diego San.Francisco Seattle
## 1  Alaska on time         497     221       212           503    1841
## 2         delayed          62      12        20           102     305
## 3 AM West on time         694    4840       383           320     201
## 4         delayed         117     415        65           129      61
(2.b) If any row doesn’t have the airline name, copy it over from previous row
for (i in 1:nrow(flights)) {
  if (flights$X[i] == "") {
    flights$X[i] = flights$X[i-1]
  }
}
print(flights)
##         X     X.1 Los.Angeles Phoenix San.Diego San.Francisco Seattle
## 1  Alaska on time         497     221       212           503    1841
## 2  Alaska delayed          62      12        20           102     305
## 3 AM West on time         694    4840       383           320     201
## 4 AM West delayed         117     415        65           129      61
(2.c) Apply tidyr gather function and convert the data from wide to long structure so it can be used for analysis
flights_df = gather(flights, "City", "OnTime_Delayed", 3, 4, 5, 6, 7 , na.rm = FALSE, convert = FALSE)
(2.d) Label column as Airlines, OnTime_Delayed, City and Count
colnames(flights_df) = c("Airlines", "OnTime_Delayed", "City", "Count")
print (flights_df)
##    Airlines OnTime_Delayed          City Count
## 1    Alaska        on time   Los.Angeles   497
## 2    Alaska        delayed   Los.Angeles    62
## 3   AM West        on time   Los.Angeles   694
## 4   AM West        delayed   Los.Angeles   117
## 5    Alaska        on time       Phoenix   221
## 6    Alaska        delayed       Phoenix    12
## 7   AM West        on time       Phoenix  4840
## 8   AM West        delayed       Phoenix   415
## 9    Alaska        on time     San.Diego   212
## 10   Alaska        delayed     San.Diego    20
## 11  AM West        on time     San.Diego   383
## 12  AM West        delayed     San.Diego    65
## 13   Alaska        on time San.Francisco   503
## 14   Alaska        delayed San.Francisco   102
## 15  AM West        on time San.Francisco   320
## 16  AM West        delayed San.Francisco   129
## 17   Alaska        on time       Seattle  1841
## 18   Alaska        delayed       Seattle   305
## 19  AM West        on time       Seattle   201
## 20  AM West        delayed       Seattle    61
(2.e) Remove punctuation character (.) from City names using regular expression
flights_df$City = str_replace_all(flights_df$City, "[[.]]", " ")
print (flights_df)
##    Airlines OnTime_Delayed          City Count
## 1    Alaska        on time   Los Angeles   497
## 2    Alaska        delayed   Los Angeles    62
## 3   AM West        on time   Los Angeles   694
## 4   AM West        delayed   Los Angeles   117
## 5    Alaska        on time       Phoenix   221
## 6    Alaska        delayed       Phoenix    12
## 7   AM West        on time       Phoenix  4840
## 8   AM West        delayed       Phoenix   415
## 9    Alaska        on time     San Diego   212
## 10   Alaska        delayed     San Diego    20
## 11  AM West        on time     San Diego   383
## 12  AM West        delayed     San Diego    65
## 13   Alaska        on time San Francisco   503
## 14   Alaska        delayed San Francisco   102
## 15  AM West        on time San Francisco   320
## 16  AM West        delayed San Francisco   129
## 17   Alaska        on time       Seattle  1841
## 18   Alaska        delayed       Seattle   305
## 19  AM West        on time       Seattle   201
## 20  AM West        delayed       Seattle    61
(2.f) Apply tidyr spread function to spread the data in column OnTime_Delayed into 2 different columns: Delayed and OnTime
flights_df = spread(flights_df, OnTime_Delayed, Count)
colnames(flights_df) = c("Airlines", "City", "Delayed", "OnTime")
print (flights_df)
##    Airlines          City Delayed OnTime
## 1    Alaska   Los Angeles      62    497
## 2    Alaska       Phoenix      12    221
## 3    Alaska     San Diego      20    212
## 4    Alaska San Francisco     102    503
## 5    Alaska       Seattle     305   1841
## 6   AM West   Los Angeles     117    694
## 7   AM West       Phoenix     415   4840
## 8   AM West     San Diego      65    383
## 9   AM West San Francisco     129    320
## 10  AM West       Seattle      61    201
(2.g) Create a function to calculate the percentage and round it to 2 decimal digits
calcPercentage = function(value, totalValue) {
  round((value/totalValue)*100, 2)
}
(2.h) Apply dplyr mutate function to add 3 extra columns for TotalFlights, percentOnTime and percentDelayed
flights_df = mutate(flights_df, TotalFlights=(Delayed+OnTime), PercentDelayed=calcPercentage(Delayed, TotalFlights), PercentOnTime=calcPercentage(OnTime, TotalFlights))
print (flights_df)
##    Airlines          City Delayed OnTime TotalFlights PercentDelayed PercentOnTime
## 1    Alaska   Los Angeles      62    497          559          11.09         88.91
## 2    Alaska       Phoenix      12    221          233           5.15         94.85
## 3    Alaska     San Diego      20    212          232           8.62         91.38
## 4    Alaska San Francisco     102    503          605          16.86         83.14
## 5    Alaska       Seattle     305   1841         2146          14.21         85.79
## 6   AM West   Los Angeles     117    694          811          14.43         85.57
## 7   AM West       Phoenix     415   4840         5255           7.90         92.10
## 8   AM West     San Diego      65    383          448          14.51         85.49
## 9   AM West San Francisco     129    320          449          28.73         71.27
## 10  AM West       Seattle      61    201          262          23.28         76.72
(3) Perform analysis to compare the arrival delays for the two airlines.
(3.a) Draw a bar graph that represents the total number of flights for each city
flights_city_group_df=sqldf("select City, sum(TotalFlights) As TotalFlights from flights_df group by City")
## Loading required package: tcltk
print(flights_city_group_df)
##            City TotalFlights
## 1   Los Angeles         1370
## 2       Phoenix         5488
## 3     San Diego          680
## 4 San Francisco         1054
## 5       Seattle         2408
ggplot(flights_city_group_df, aes(x=City, y=TotalFlights)) + geom_bar(fill="#8877aa", color="black", stat="identity")

(3.b) Draw a bar graph that represents the total number of flights for each airline
flights_airline_group_df=sqldf("select Airlines, sum(TotalFlights) As TotalFlights from flights_df group by Airlines")
print(flights_airline_group_df)
##   Airlines TotalFlights
## 1  AM West         7225
## 2   Alaska         3775
ggplot(flights_airline_group_df, aes(x=Airlines, y=TotalFlights)) + geom_bar(fill="#87ffaa", color="black", stat="identity")

(3.c) Draw a bar graph that represents flights delayed for each city separated by each airline
ggplot(data=flights_df, aes(x=City, y=PercentDelayed, fill=Airlines)) + geom_bar(stat="identity", position="dodge") + 
  ylab("% of Flights Delayed") + ggtitle("% of Flights Delayed")

(3.d) Draw a bar graph that represents flights arrived on time for each city separated by each airline
ggplot(data=flights_df, aes(x=City, y=PercentOnTime, fill=Airlines)) + geom_bar(stat="identity", position="dodge") + 
  ylab("% of Flights OnTime") + ggtitle("% of Flights OnTime") 

(3.e) Draw a line graph that represents flights delayed for each city separated by each airline
ggplot(data=flights_df, aes(x=City, y=PercentDelayed, group=Airlines, color=Airlines)) +  geom_line() + 
  geom_point(color = "Red") + ylab("% of Flights Delayed") + ggtitle("% of Flights Delayed") 

(3.f) Draw a line graph that represents flights arrived on time for each city separated by each airline
ggplot(data=flights_df, aes(x=City, y=PercentOnTime, group=Airlines, color=Airlines)) +  geom_line() + 
  geom_point(color = "Blue") + ylab("% of Flights OnTime") +  ggtitle("% of Flights OnTime")

(4) Conclusion.

(4.a) From the graphs we can conclude that Alaska airlines has better OnTime performance compared to AMWest airlines though the total number of flights by AM West airlines is more than 2 times as many as Alaska airlines.

AM West airlines has a worse performance compared to Alaska Airlines as percentage of flights delayed for each airport is more than Alaska Airlines. Also, we notice that the highest delays by AM West are for San Franciso and the lowest are for Phoenix.