Step 1: Install packages

##install.packages("sqldf")
##install.packages("dplyr")
##install.packages("tidyr")
##install.packages("ggplot2")

Step 2: Load packages

library(sqldf)
## Loading required package: gsubfn
## Loading required package: proto
## Loading required package: RSQLite
## Loading required package: DBI
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(tidyr)
library(ggplot2)

Step 3: Create databse

Four Tables were created for the databse, “airlines” table containing airline information, “destination” table contining destination information, “on_time” table contaning on time flights information and finally “delayed” table contaning delyed flight information

airlines <- sqldf(c("DROP TABLE IF EXISTS airlines",
                   "CREATE TABLE airlines(
                   a_id INT NOT NULL,
                   a_name VARCHAR(100) NOT NULL,
                   PRIMARY KEY (a_id))",
                   "INSERT INTO airlines (a_id, a_name) VALUES (001, 'ALASKA'),                       (002, 'AMWEST');",
                   "SELECT * FROM airlines"))
## Loading required package: tcltk
airlines
##   a_id a_name
## 1    1 ALASKA
## 2    2 AMWEST
destination <- sqldf(c("DROP TABLE IF EXISTS destination",
                   "CREATE TABLE destination(
                   d_id INT NOT NULL,
                   d_name VARCHAR(100) NOT NULL,
                   PRIMARY KEY (d_id))",
                   "INSERT INTO destination (d_id, d_name) VALUES 
                    (001, 'Los Angeles'), (002, 'Phoenix'), (003, 'San Diego'),                       (004, 'San Francisco'), (005, 'Seattle');",
                   "SELECT * FROM destination"))
destination
##   d_id        d_name
## 1    1   Los Angeles
## 2    2       Phoenix
## 3    3     San Diego
## 4    4 San Francisco
## 5    5       Seattle
on_time <- sqldf(c("DROP TABLE IF EXISTS on_time",
                   "CREATE TABLE on_time(
                   ot_id INT NOT NULL,
                   a_id INT NOT NULL,
                   d_id INT NOT NULL,
                   ot_num INT NOT NULL,
                   PRIMARY KEY (ot_id))",
                   "INSERT INTO on_time (ot_id, a_id, d_id, ot_num) VALUES
                   (001, 001, 001, 497), (002, 001, 002, 221), 
                   (003, 001, 003, 212), (004, 001, 004, 503),
                   (005, 001, 005, 1841), (006, 002, 001, 694),
                   (007, 002, 002, 4840), (008, 002, 003, 383),
                   (009, 002, 004, 320), (010, 002, 005, 201);",
                   "Select * FROM on_time"))
on_time
##    ot_id a_id d_id ot_num
## 1      1    1    1    497
## 2      2    1    2    221
## 3      3    1    3    212
## 4      4    1    4    503
## 5      5    1    5   1841
## 6      6    2    1    694
## 7      7    2    2   4840
## 8      8    2    3    383
## 9      9    2    4    320
## 10    10    2    5    201
delayed <- sqldf(c("DROP TABLE IF EXISTS delayed",
                   "CREATE TABLE delayed(
                   del_id INT NOT NULL,
                   a_id INT NOT NULL,
                   d_id INT NOT NULL,
                   del_num INT NOT NULL,
                   PRIMARY KEY (del_id))",
                   "INSERT INTO delayed (del_id, a_id, d_id, del_num) VALUES
                   (001, 001, 001, 62), (002, 001, 002, 12), 
                   (003, 001, 003, 20), (004, 001, 004, 102),
                   (005, 001, 005, 305), (006, 002, 001, 117),
                   (007, 002, 002, 415), (008, 002, 003, 65),
                   (009, 002, 004, 129), (010, 002, 005, 61);",
                   "Select * FROM delayed"))
delayed
##    del_id a_id d_id del_num
## 1       1    1    1      62
## 2       2    1    2      12
## 3       3    1    3      20
## 4       4    1    4     102
## 5       5    1    5     305
## 6       6    2    1     117
## 7       7    2    2     415
## 8       8    2    3      65
## 9       9    2    4     129
## 10     10    2    5      61

Step 4: Create Wide format Data

Wide forma table was created by using dplyr and tidyr

##Long format table containing "on time" flight information only
tab1 <- left_join(destination, on_time)
## Joining, by = "d_id"
tab3 <- left_join(airlines,tab1 )
## Joining, by = "a_id"
tab_ot <- select(tab3, Airlines = a_name, Destination = d_name, on_time = ot_num)
tab_ot
##    Airlines   Destination on_time
## 1    ALASKA   Los Angeles     497
## 2    ALASKA       Phoenix     221
## 3    ALASKA     San Diego     212
## 4    ALASKA San Francisco     503
## 5    ALASKA       Seattle    1841
## 6    AMWEST   Los Angeles     694
## 7    AMWEST       Phoenix    4840
## 8    AMWEST     San Diego     383
## 9    AMWEST San Francisco     320
## 10   AMWEST       Seattle     201
##Long format table containing "delayed" flight information only
tab2 <- left_join(destination, delayed)
## Joining, by = "d_id"
tab4 <- left_join(airlines, tab2)
## Joining, by = "a_id"
tab_del <- select(tab4, Airlines = a_name, Destination = d_name, Delayed = del_num)
tab_del
##    Airlines   Destination Delayed
## 1    ALASKA   Los Angeles      62
## 2    ALASKA       Phoenix      12
## 3    ALASKA     San Diego      20
## 4    ALASKA San Francisco     102
## 5    ALASKA       Seattle     305
## 6    AMWEST   Los Angeles     117
## 7    AMWEST       Phoenix     415
## 8    AMWEST     San Diego      65
## 9    AMWEST San Francisco     129
## 10   AMWEST       Seattle      61
##Long format table contining "on time" flight information converted to wide format
wide_data1 <- spread(tab_ot, Destination, on_time)
wide_data1
##   Airlines Los Angeles Phoenix San Diego San Francisco Seattle
## 1   ALASKA         497     221       212           503    1841
## 2   AMWEST         694    4840       383           320     201
##Long format table contining "delayed" flight information converted to wide format
wide_data2 <- spread(tab_del, Destination, Delayed)
wide_data2
##   Airlines Los Angeles Phoenix San Diego San Francisco Seattle
## 1   ALASKA          62      12        20           102     305
## 2   AMWEST         117     415        65           129      61
##Wide format table were combined, added "status" column and data rearranged
wide_data <- bind_rows(wide_data1, wide_data2)

Status <- c("on_time", "on_time", "delayed", "delayed")

wide_datas <- mutate(wide_data, Status )

wide_dataar <- arrange(wide_datas, Airlines)

wide_dataf <- wide_dataar[c(1,7,2,3,4,5,6)]
wide_dataf
##   Airlines  Status Los Angeles Phoenix San Diego San Francisco Seattle
## 1   ALASKA on_time         497     221       212           503    1841
## 2   ALASKA delayed          62      12        20           102     305
## 3   AMWEST on_time         694    4840       383           320     201
## 4   AMWEST delayed         117     415        65           129      61

Step 5: Write .CSV file

write.csv(wide_dataf, "Airlines.csv", row.names=FALSE)

##.CSV file was uploaded to Github

Step 6: Load .CSV from Github

airline_data <- read.csv("https://raw.githubusercontent.com/choudhury1023/Data-607/gh-pages/Airlines.csv")
airline_data
##   Airlines  Status Los.Angeles Phoenix San.Diego San.Francisco Seattle
## 1   ALASKA on_time         497     221       212           503    1841
## 2   ALASKA delayed          62      12        20           102     305
## 3   AMWEST on_time         694    4840       383           320     201
## 4   AMWEST delayed         117     415        65           129      61

Step 7: Tidy data

##Converted data to long fromat to transfor and analyze
long_data <- gather(airline_data, "Destination", "Num", 3:7)
long_data1 <- spread(long_data, Status, Num)
long_data1
##    Airlines   Destination delayed on_time
## 1    ALASKA   Los.Angeles      62     497
## 2    ALASKA       Phoenix      12     221
## 3    ALASKA     San.Diego      20     212
## 4    ALASKA San.Francisco     102     503
## 5    ALASKA       Seattle     305    1841
## 6    AMWEST   Los.Angeles     117     694
## 7    AMWEST       Phoenix     415    4840
## 8    AMWEST     San.Diego      65     383
## 9    AMWEST San.Francisco     129     320
## 10   AMWEST       Seattle      61     201

Step 8: Transform data

##Added two new columns using mutate function, the two new columns gives us information about total number of flight by destination and percent of delayed flight to that destinaton
long_data2 <- mutate(long_data1, total_flights = delayed + on_time)
long_data3 <- mutate(long_data2 ,percent_delayed = round(delayed * 100 / total_flights, 2))
long_data3
##    Airlines   Destination delayed on_time total_flights percent_delayed
## 1    ALASKA   Los.Angeles      62     497           559           11.09
## 2    ALASKA       Phoenix      12     221           233            5.15
## 3    ALASKA     San.Diego      20     212           232            8.62
## 4    ALASKA San.Francisco     102     503           605           16.86
## 5    ALASKA       Seattle     305    1841          2146           14.21
## 6    AMWEST   Los.Angeles     117     694           811           14.43
## 7    AMWEST       Phoenix     415    4840          5255            7.90
## 8    AMWEST     San.Diego      65     383           448           14.51
## 9    AMWEST San.Francisco     129     320           449           28.73
## 10   AMWEST       Seattle      61     201           262           23.28

Step 9: Plot transformed data

ggplot(data = long_data3, aes(x = Destination, y = percent_delayed, fill = Airlines)) + geom_bar(stat="identity", position="dodge") + ggtitle("Percentage of Arrival Delays By Destination") + ylab("Percentage of Planes Delayed By Destination")

Step 10: More transformation

long_data4 <- group_by(long_data3, Airlines)
long_data4
## Source: local data frame [10 x 6]
## Groups: Airlines [2]
## 
##    Airlines   Destination delayed on_time total_flights percent_delayed
##      <fctr>         <chr>   <int>   <int>         <int>           <dbl>
## 1    ALASKA   Los.Angeles      62     497           559           11.09
## 2    ALASKA       Phoenix      12     221           233            5.15
## 3    ALASKA     San.Diego      20     212           232            8.62
## 4    ALASKA San.Francisco     102     503           605           16.86
## 5    ALASKA       Seattle     305    1841          2146           14.21
## 6    AMWEST   Los.Angeles     117     694           811           14.43
## 7    AMWEST       Phoenix     415    4840          5255            7.90
## 8    AMWEST     San.Diego      65     383           448           14.51
## 9    AMWEST San.Francisco     129     320           449           28.73
## 10   AMWEST       Seattle      61     201           262           23.28
long_data5 <-summarise(long_data4, total_flights_alldest = sum(total_flights), delayed_alldest = sum(delayed))
long_data5
## # A tibble: 2 × 3
##   Airlines total_flights_alldest delayed_alldest
##     <fctr>                 <int>           <int>
## 1   ALASKA                  3775             501
## 2   AMWEST                  7225             787
long_data6 <- mutate(long_data5, pct_delayed_alldest = round((delayed_alldest / total_flights_alldest) * 100, 2))
long_data6
## # A tibble: 2 × 4
##   Airlines total_flights_alldest delayed_alldest pct_delayed_alldest
##     <fctr>                 <int>           <int>               <dbl>
## 1   ALASKA                  3775             501               13.27
## 2   AMWEST                  7225             787               10.89

Step 11: Another plot

ggplot(data = long_data6, aes(x = Airlines, y= pct_delayed_alldest, fill = Airlines)) + geom_bar(stat = "identity", position = "dodge") + ggtitle("Percentage of Arrival Delays All Destination") + ylab("All Destination Delay Rate")

The two plots above points us towards two diffrent conclusions, if we look at first plot where it shows the arrival delay percentage by individual destination it seems Alaska performs better than AMWEST, but if we look at the second plot which shows combined percentage of arrival delays to all destination AMWEST perfoms better than Alaska, which is rather confusing. But if we look into the data we can see AMWEST operates majority of its flight to Phoenix which the lowest percentage of arrival delays for both airlines where as Alaska operates majority of its flight to San Francisco which has the second highest arrival delays after Seattle for both airlines. This indicates there other variable at work beside the Airlines like air traffic and weather.