This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, Pflightdelay, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.
When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:
#install.packages("tidyr")
#install.packages("dplyr")
#install.packages("stringr")
library(tidyr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(stringr)
#Step1: # Reading data as is from Pflightdelay and writing it to CSV file on Physical server.
flightset <-
data.frame( header = c("","","Los Angeles","Phoenix","San Diego","San Francisco","Seattle"),
al_ontime = c("ALASKA","on time","497","221","212","503","1841"),
al_delayed = c("","delayed","62","12","20","102","305"),
amwest_ontime = c("AM WEST","on time","694","4840","383","320","201"),
amwest_delayed = c("","delayed","117","415","65","129","61") )
flight_delays <- data.frame(flightset$header,flightset$al_ontime,flightset$al_delayed,
flightset$amwest_ontime,flightset$amwest_delayed)
flight_delay <- t(flight_delays)
flight_delay
## [,1] [,2] [,3] [,4]
## flightset.header "" "" "Los Angeles" "Phoenix"
## flightset.al_ontime "ALASKA" "on time" "497" "221"
## flightset.al_delayed "" "delayed" "62" "12"
## flightset.amwest_ontime "AM WEST" "on time" "694" "4840"
## flightset.amwest_delayed "" "delayed" "117" "415"
## [,5] [,6] [,7]
## flightset.header "San Diego" "San Francisco" "Seattle"
## flightset.al_ontime "212" "503" "1841"
## flightset.al_delayed "20" "102" "305"
## flightset.amwest_ontime "383" "320" "201"
## flightset.amwest_delayed "65" "129" "61"
write.table(flight_delay, file = "~/Desktop/CUNY/5thWeek-TidyingData/flight_delay.csv", row.names = FALSE, col.names=FALSE, sep = ",")
#Step2: Tidy and Transform Data by Preparing Molten Data #1. Read csv file form Physical server #2. Fill the rows which are missing and name the columns #3. Gather data by merging destination cities into one variable destination and observations per destination -Used gather function of TIDYR package to get the dataset based on Destination and switched columns #4. Spread column Status observations into 2 columns on time and delayed and observations were added accordingly -Used spread function of TIDYR package to split Status columns into separate columns vased values in it.
#1
path <- "~/Desktop/CUNY/5thWeek-TidyingData/flight_delay.csv"
flightdelay <- data.frame(read.csv(path,stringsAsFactors = FALSE))
#2
names(flightdelay)[1] <- "Airline"
names(flightdelay)[2] <- "Status"
flightdelay$Airline[2] = flightdelay$Airline[1]
flightdelay$Airline[4] = flightdelay$Airline[3]
#3. Gather
flightdelay <- gather(flightdelay,"Destination","Total",3:7)
flightdelay <- flightdelay[c(1,3,2,4)]
flightdelay
## Airline Destination Status Total
## 1 ALASKA Los.Angeles on time 497
## 2 ALASKA Los.Angeles delayed 62
## 3 AM WEST Los.Angeles on time 694
## 4 AM WEST Los.Angeles delayed 117
## 5 ALASKA Phoenix on time 221
## 6 ALASKA Phoenix delayed 12
## 7 AM WEST Phoenix on time 4840
## 8 AM WEST Phoenix delayed 415
## 9 ALASKA San.Diego on time 212
## 10 ALASKA San.Diego delayed 20
## 11 AM WEST San.Diego on time 383
## 12 AM WEST San.Diego delayed 65
## 13 ALASKA San.Francisco on time 503
## 14 ALASKA San.Francisco delayed 102
## 15 AM WEST San.Francisco on time 320
## 16 AM WEST San.Francisco delayed 129
## 17 ALASKA Seattle on time 1841
## 18 ALASKA Seattle delayed 305
## 19 AM WEST Seattle on time 201
## 20 AM WEST Seattle delayed 61
#4. Spread
flightdelay <- spread(flightdelay,Status,Total)
flightdelay
## Airline Destination delayed on time
## 1 ALASKA Los.Angeles 62 497
## 2 ALASKA Phoenix 12 221
## 3 ALASKA San.Diego 20 212
## 4 ALASKA San.Francisco 102 503
## 5 ALASKA Seattle 305 1841
## 6 AM WEST Los.Angeles 117 694
## 7 AM WEST Phoenix 415 4840
## 8 AM WEST San.Diego 65 383
## 9 AM WEST San.Francisco 129 320
## 10 AM WEST Seattle 61 201
#Step3: Analysis to compare the arrival delays for the two airlines #1. Add acolumn Total adding delayed and on time and arrange flightdelat data frame by airline and Destination #2. Calculate the Arrival Delays of 2 airlines ALASKA and AM WEST without considering destinations. Step 1 helps performance while -summarizing(Arrange helps performance) #3. Calculate the Arrival Delays of 2 airlines ALASKA and AM WEST by considering destinations #4.Best Airlines by destination wise while considering delays: From #3 we get airline and destination based delay and we get a dataset - for min delay based on airline which has airline and delay. Perform inner join 2 data sets to get the Airline. -Airline_Delayed_Dest - Airline and Destination Wise Delay -Destination_mindelay - Destination wise Delay -BestAirlinebyDelay - Join of Airline_Delayed_Dest & Destination_mindelay which derices best performed airlines by Destination wise and -delay time #5. Airport with most of the delays (Ratio)
#1 Below
flightdelay <- mutate(flightdelay,Total = delayed + `on time`)
flightdelay <- arrange(flightdelay,desc(Airline))
#flightdelay
#2
Airline_Summary <- flightdelay %>%
group_by(Airline) %>%
summarize(Airline_Delayed_All = sum(delayed),Airline_Ontime_All = sum(`on time`),Airline_Total_All = sum(Total))
Airline_Delayed_Summary <- select(Airline_Summary,Airline,Airline_Delayed_All)
Airline_Delayed_Summary
## # A tibble: 2 x 2
## Airline Airline_Delayed_All
## <chr> <int>
## 1 ALASKA 501
## 2 AM WEST 787
#3
flightdelay <- arrange(flightdelay,Airline,Destination)
Airline_Dest_Summary <- flightdelay %>%
group_by(Airline,Destination) %>%
summarize(Airline_Delayed_Dest = sum(delayed),Airline_Ontime_Dest = sum(`on time`),Airline_Total_Dest = sum(Total))
Airline_Delayed_Dest <- arrange(select(Airline_Dest_Summary,Airline,Destination,Airline_Delayed_Dest),Destination,Airline_Delayed_Dest)
Airline_Delayed_Dest
## # A tibble: 10 x 3
## # Groups: Airline [2]
## Airline Destination Airline_Delayed_Dest
## <chr> <chr> <int>
## 1 ALASKA Los.Angeles 62
## 2 AM WEST Los.Angeles 117
## 3 ALASKA Phoenix 12
## 4 AM WEST Phoenix 415
## 5 ALASKA San.Diego 20
## 6 AM WEST San.Diego 65
## 7 ALASKA San.Francisco 102
## 8 AM WEST San.Francisco 129
## 9 AM WEST Seattle 61
## 10 ALASKA Seattle 305
#4
Destination_mindelay <- Airline_Delayed_Dest %>%
group_by(Destination) %>%
summarise(Airline_Delayed_Dest = min(Airline_Delayed_Dest)) %>%
select(Destination,Airline_Delayed_Dest)
Destination_mindelay
## # A tibble: 5 x 2
## Destination Airline_Delayed_Dest
## <chr> <int>
## 1 Los.Angeles 62
## 2 Phoenix 12
## 3 San.Diego 20
## 4 San.Francisco 102
## 5 Seattle 61
BestAirlinebyDelay <- inner_join(Airline_Delayed_Dest,Destination_mindelay,by = c("Destination","Airline_Delayed_Dest"))
BestAirlinebyDelay
## # A tibble: 5 x 3
## # Groups: Airline [2]
## Airline Destination Airline_Delayed_Dest
## <chr> <chr> <int>
## 1 ALASKA Los.Angeles 62
## 2 ALASKA Phoenix 12
## 3 ALASKA San.Diego 20
## 4 ALASKA San.Francisco 102
## 5 AM WEST Seattle 61
#5
#Destination wise Total Delayed and Total Ontime
Destination_Arr_Rates <- flightdelay %>% na.omit() %>%
group_by(Destination) %>%
summarise(delayed = sum(delayed), `on time` = sum(`on time`),Total = sum(Total) ) %>%
mutate(delayed_ratio = round((delayed * 100 / Total),2),ontime_ratio = round((`on time` * 100/Total),2)) %>%
arrange(desc(delayed_ratio))
Destination_Arr_Rates
## # A tibble: 5 x 6
## Destination delayed `on time` Total delayed_ratio ontime_ratio
## <chr> <int> <int> <int> <dbl> <dbl>
## 1 San.Francisco 231 823 1054 21.9 78.1
## 2 Seattle 366 2042 2408 15.2 84.8
## 3 Los.Angeles 179 1191 1370 13.1 86.9
## 4 San.Diego 85 595 680 12.5 87.5
## 5 Phoenix 427 5061 5488 7.78 92.2
Destination_Arr_Rates[1,1]
## # A tibble: 1 x 1
## Destination
## <chr>
## 1 San.Francisco