##install.packages("sqldf")
##install.packages("dplyr")
##install.packages("tidyr")
##install.packages("ggplot2")
library(sqldf)
## Loading required package: gsubfn
## Loading required package: proto
## Loading required package: RSQLite
## Loading required package: DBI
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyr)
library(ggplot2)
airlines <- sqldf(c("DROP TABLE IF EXISTS airlines",
"CREATE TABLE airlines(
a_id INT NOT NULL,
a_name VARCHAR(100) NOT NULL,
PRIMARY KEY (a_id))",
"INSERT INTO airlines (a_id, a_name) VALUES (001, 'ALASKA'), (002, 'AMWEST');",
"SELECT * FROM airlines"))
## Loading required package: tcltk
airlines
## a_id a_name
## 1 1 ALASKA
## 2 2 AMWEST
destination <- sqldf(c("DROP TABLE IF EXISTS destination",
"CREATE TABLE destination(
d_id INT NOT NULL,
d_name VARCHAR(100) NOT NULL,
PRIMARY KEY (d_id))",
"INSERT INTO destination (d_id, d_name) VALUES
(001, 'Los Angeles'), (002, 'Phoenix'), (003, 'San Diego'), (004, 'San Francisco'), (005, 'Seattle');",
"SELECT * FROM destination"))
destination
## d_id d_name
## 1 1 Los Angeles
## 2 2 Phoenix
## 3 3 San Diego
## 4 4 San Francisco
## 5 5 Seattle
on_time <- sqldf(c("DROP TABLE IF EXISTS on_time",
"CREATE TABLE on_time(
ot_id INT NOT NULL,
a_id INT NOT NULL,
d_id INT NOT NULL,
ot_num INT NOT NULL,
PRIMARY KEY (ot_id))",
"INSERT INTO on_time (ot_id, a_id, d_id, ot_num) VALUES
(001, 001, 001, 497), (002, 001, 002, 221),
(003, 001, 003, 212), (004, 001, 004, 503),
(005, 001, 005, 1841), (006, 002, 001, 694),
(007, 002, 002, 4840), (008, 002, 003, 383),
(009, 002, 004, 320), (010, 002, 005, 201);",
"Select * FROM on_time"))
on_time
## ot_id a_id d_id ot_num
## 1 1 1 1 497
## 2 2 1 2 221
## 3 3 1 3 212
## 4 4 1 4 503
## 5 5 1 5 1841
## 6 6 2 1 694
## 7 7 2 2 4840
## 8 8 2 3 383
## 9 9 2 4 320
## 10 10 2 5 201
delayed <- sqldf(c("DROP TABLE IF EXISTS delayed",
"CREATE TABLE delayed(
del_id INT NOT NULL,
a_id INT NOT NULL,
d_id INT NOT NULL,
del_num INT NOT NULL,
PRIMARY KEY (del_id))",
"INSERT INTO delayed (del_id, a_id, d_id, del_num) VALUES
(001, 001, 001, 62), (002, 001, 002, 12),
(003, 001, 003, 20), (004, 001, 004, 102),
(005, 001, 005, 305), (006, 002, 001, 117),
(007, 002, 002, 415), (008, 002, 003, 65),
(009, 002, 004, 129), (010, 002, 005, 61);",
"Select * FROM delayed"))
delayed
## del_id a_id d_id del_num
## 1 1 1 1 62
## 2 2 1 2 12
## 3 3 1 3 20
## 4 4 1 4 102
## 5 5 1 5 305
## 6 6 2 1 117
## 7 7 2 2 415
## 8 8 2 3 65
## 9 9 2 4 129
## 10 10 2 5 61
##Long format table containing "on time" flight information only
tab1 <- left_join(destination, on_time)
## Joining, by = "d_id"
tab3 <- left_join(airlines,tab1 )
## Joining, by = "a_id"
tab_ot <- select(tab3, Airlines = a_name, Destination = d_name, on_time = ot_num)
tab_ot
## Airlines Destination on_time
## 1 ALASKA Los Angeles 497
## 2 ALASKA Phoenix 221
## 3 ALASKA San Diego 212
## 4 ALASKA San Francisco 503
## 5 ALASKA Seattle 1841
## 6 AMWEST Los Angeles 694
## 7 AMWEST Phoenix 4840
## 8 AMWEST San Diego 383
## 9 AMWEST San Francisco 320
## 10 AMWEST Seattle 201
##Long format table containing "delayed" flight information only
tab2 <- left_join(destination, delayed)
## Joining, by = "d_id"
tab4 <- left_join(airlines, tab2)
## Joining, by = "a_id"
tab_del <- select(tab4, Airlines = a_name, Destination = d_name, Delayed = del_num)
tab_del
## Airlines Destination Delayed
## 1 ALASKA Los Angeles 62
## 2 ALASKA Phoenix 12
## 3 ALASKA San Diego 20
## 4 ALASKA San Francisco 102
## 5 ALASKA Seattle 305
## 6 AMWEST Los Angeles 117
## 7 AMWEST Phoenix 415
## 8 AMWEST San Diego 65
## 9 AMWEST San Francisco 129
## 10 AMWEST Seattle 61
##Long format table contining "on time" flight information converted to wide format
wide_data1 <- spread(tab_ot, Destination, on_time)
wide_data1
## Airlines Los Angeles Phoenix San Diego San Francisco Seattle
## 1 ALASKA 497 221 212 503 1841
## 2 AMWEST 694 4840 383 320 201
##Long format table contining "delayed" flight information converted to wide format
wide_data2 <- spread(tab_del, Destination, Delayed)
wide_data2
## Airlines Los Angeles Phoenix San Diego San Francisco Seattle
## 1 ALASKA 62 12 20 102 305
## 2 AMWEST 117 415 65 129 61
##Wide format table were combined, added "status" column and data rearranged
wide_data <- bind_rows(wide_data1, wide_data2)
Status <- c("on_time", "on_time", "delayed", "delayed")
wide_datas <- mutate(wide_data, Status )
wide_dataar <- arrange(wide_datas, Airlines)
wide_dataf <- wide_dataar[c(1,7,2,3,4,5,6)]
wide_dataf
## Airlines Status Los Angeles Phoenix San Diego San Francisco Seattle
## 1 ALASKA on_time 497 221 212 503 1841
## 2 ALASKA delayed 62 12 20 102 305
## 3 AMWEST on_time 694 4840 383 320 201
## 4 AMWEST delayed 117 415 65 129 61
write.csv(wide_dataf, "Airlines.csv", row.names=FALSE)
##.CSV file was uploaded to Github
airline_data <- read.csv("https://raw.githubusercontent.com/choudhury1023/Data-607/gh-pages/Airlines.csv")
airline_data
## Airlines Status Los.Angeles Phoenix San.Diego San.Francisco Seattle
## 1 ALASKA on_time 497 221 212 503 1841
## 2 ALASKA delayed 62 12 20 102 305
## 3 AMWEST on_time 694 4840 383 320 201
## 4 AMWEST delayed 117 415 65 129 61
##Converted data to long fromat to transfor and analyze
long_data <- gather(airline_data, "Destination", "Num", 3:7)
long_data1 <- spread(long_data, Status, Num)
long_data1
## Airlines Destination delayed on_time
## 1 ALASKA Los.Angeles 62 497
## 2 ALASKA Phoenix 12 221
## 3 ALASKA San.Diego 20 212
## 4 ALASKA San.Francisco 102 503
## 5 ALASKA Seattle 305 1841
## 6 AMWEST Los.Angeles 117 694
## 7 AMWEST Phoenix 415 4840
## 8 AMWEST San.Diego 65 383
## 9 AMWEST San.Francisco 129 320
## 10 AMWEST Seattle 61 201
##Added two new columns using mutate function, the two new columns gives us information about total number of flight by destination and percent of delayed flight to that destinaton
long_data2 <- mutate(long_data1, total_flights = delayed + on_time)
long_data3 <- mutate(long_data2 ,percent_delayed = round(delayed * 100 / total_flights, 2))
long_data3
## Airlines Destination delayed on_time total_flights percent_delayed
## 1 ALASKA Los.Angeles 62 497 559 11.09
## 2 ALASKA Phoenix 12 221 233 5.15
## 3 ALASKA San.Diego 20 212 232 8.62
## 4 ALASKA San.Francisco 102 503 605 16.86
## 5 ALASKA Seattle 305 1841 2146 14.21
## 6 AMWEST Los.Angeles 117 694 811 14.43
## 7 AMWEST Phoenix 415 4840 5255 7.90
## 8 AMWEST San.Diego 65 383 448 14.51
## 9 AMWEST San.Francisco 129 320 449 28.73
## 10 AMWEST Seattle 61 201 262 23.28
ggplot(data = long_data3, aes(x = Destination, y = percent_delayed, fill = Airlines)) + geom_bar(stat="identity", position="dodge") + ggtitle("Percentage of Arrival Delays By Destination") + ylab("Percentage of Planes Delayed By Destination")
long_data4 <- group_by(long_data3, Airlines)
long_data4
## Source: local data frame [10 x 6]
## Groups: Airlines [2]
##
## Airlines Destination delayed on_time total_flights percent_delayed
## <fctr> <chr> <int> <int> <int> <dbl>
## 1 ALASKA Los.Angeles 62 497 559 11.09
## 2 ALASKA Phoenix 12 221 233 5.15
## 3 ALASKA San.Diego 20 212 232 8.62
## 4 ALASKA San.Francisco 102 503 605 16.86
## 5 ALASKA Seattle 305 1841 2146 14.21
## 6 AMWEST Los.Angeles 117 694 811 14.43
## 7 AMWEST Phoenix 415 4840 5255 7.90
## 8 AMWEST San.Diego 65 383 448 14.51
## 9 AMWEST San.Francisco 129 320 449 28.73
## 10 AMWEST Seattle 61 201 262 23.28
long_data5 <-summarise(long_data4, total_flights_alldest = sum(total_flights), delayed_alldest = sum(delayed))
long_data5
## # A tibble: 2 × 3
## Airlines total_flights_alldest delayed_alldest
## <fctr> <int> <int>
## 1 ALASKA 3775 501
## 2 AMWEST 7225 787
long_data6 <- mutate(long_data5, pct_delayed_alldest = round((delayed_alldest / total_flights_alldest) * 100, 2))
long_data6
## # A tibble: 2 × 4
## Airlines total_flights_alldest delayed_alldest pct_delayed_alldest
## <fctr> <int> <int> <dbl>
## 1 ALASKA 3775 501 13.27
## 2 AMWEST 7225 787 10.89
ggplot(data = long_data6, aes(x = Airlines, y= pct_delayed_alldest, fill = Airlines)) + geom_bar(stat = "identity", position = "dodge") + ggtitle("Percentage of Arrival Delays All Destination") + ylab("All Destination Delay Rate")