5thWeek_Untidy

R Markdown

This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, Pflightdelay, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.

When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:

Install Packages and Include Libraries

#install.packages("tidyr")
#install.packages("dplyr")
#install.packages("stringr")
library(tidyr)
library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(stringr)

#Step1: # Reading data as is from Pflightdelay and writing it to CSV file on Physical server.

flightset <- 
    data.frame( header = c("","","Los Angeles","Phoenix","San Diego","San Francisco","Seattle"),
              al_ontime = c("ALASKA","on time","497","221","212","503","1841"),
              al_delayed = c("","delayed","62","12","20","102","305"),
              amwest_ontime = c("AM WEST","on time","694","4840","383","320","201"),
              amwest_delayed = c("","delayed","117","415","65","129","61") )
flight_delays <- data.frame(flightset$header,flightset$al_ontime,flightset$al_delayed,
                            flightset$amwest_ontime,flightset$amwest_delayed)
flight_delay <- t(flight_delays)
flight_delay

##                          [,1]      [,2]      [,3]          [,4]     
## flightset.header         ""        ""        "Los Angeles" "Phoenix"
## flightset.al_ontime      "ALASKA"  "on time" "497"         "221"    
## flightset.al_delayed     ""        "delayed" "62"          "12"     
## flightset.amwest_ontime  "AM WEST" "on time" "694"         "4840"   
## flightset.amwest_delayed ""        "delayed" "117"         "415"    
##                          [,5]        [,6]            [,7]     
## flightset.header         "San Diego" "San Francisco" "Seattle"
## flightset.al_ontime      "212"       "503"           "1841"   
## flightset.al_delayed     "20"        "102"           "305"    
## flightset.amwest_ontime  "383"       "320"           "201"    
## flightset.amwest_delayed "65"        "129"           "61"

write.table(flight_delay, file = "~/Desktop/CUNY/5thWeek-TidyingData/flight_delay.csv", row.names = FALSE, col.names=FALSE,  sep = ",")

#Step2: Tidy and Transform Data by Preparing Molten Data #1. Read csv file form Physical server #2. Fill the rows which are missing and name the columns #3. Gather data by merging destination cities into one variable destination and observations per destination -Used gather function of TIDYR package to get the dataset based on Destination and switched columns #4. Spread column Status observations into 2 columns on time and delayed and observations were added accordingly -Used spread function of TIDYR package to split Status columns into separate columns vased values in it.

#1
path <- "~/Desktop/CUNY/5thWeek-TidyingData/flight_delay.csv"
flightdelay <- data.frame(read.csv(path,stringsAsFactors = FALSE))
#2
names(flightdelay)[1] <- "Airline"
names(flightdelay)[2] <- "Status"
flightdelay$Airline[2] = flightdelay$Airline[1]
flightdelay$Airline[4] = flightdelay$Airline[3]
#3. Gather 
flightdelay <- gather(flightdelay,"Destination","Total",3:7)
flightdelay <- flightdelay[c(1,3,2,4)]
flightdelay

##    Airline   Destination  Status Total
## 1   ALASKA   Los.Angeles on time   497
## 2   ALASKA   Los.Angeles delayed    62
## 3  AM WEST   Los.Angeles on time   694
## 4  AM WEST   Los.Angeles delayed   117
## 5   ALASKA       Phoenix on time   221
## 6   ALASKA       Phoenix delayed    12
## 7  AM WEST       Phoenix on time  4840
## 8  AM WEST       Phoenix delayed   415
## 9   ALASKA     San.Diego on time   212
## 10  ALASKA     San.Diego delayed    20
## 11 AM WEST     San.Diego on time   383
## 12 AM WEST     San.Diego delayed    65
## 13  ALASKA San.Francisco on time   503
## 14  ALASKA San.Francisco delayed   102
## 15 AM WEST San.Francisco on time   320
## 16 AM WEST San.Francisco delayed   129
## 17  ALASKA       Seattle on time  1841
## 18  ALASKA       Seattle delayed   305
## 19 AM WEST       Seattle on time   201
## 20 AM WEST       Seattle delayed    61

#4. Spread
flightdelay <- spread(flightdelay,Status,Total)
flightdelay

##    Airline   Destination delayed on time
## 1   ALASKA   Los.Angeles      62     497
## 2   ALASKA       Phoenix      12     221
## 3   ALASKA     San.Diego      20     212
## 4   ALASKA San.Francisco     102     503
## 5   ALASKA       Seattle     305    1841
## 6  AM WEST   Los.Angeles     117     694
## 7  AM WEST       Phoenix     415    4840
## 8  AM WEST     San.Diego      65     383
## 9  AM WEST San.Francisco     129     320
## 10 AM WEST       Seattle      61     201

#Step3: Analysis to compare the arrival delays for the two airlines #1. Add acolumn Total adding delayed and on time and arrange flightdelat data frame by airline and Destination #2. Calculate the Arrival Delays of 2 airlines ALASKA and AM WEST without considering destinations. Step 1 helps performance while -summarizing(Arrange helps performance) #3. Calculate the Arrival Delays of 2 airlines ALASKA and AM WEST by considering destinations #4.Best Airlines by destination wise while considering delays: From #3 we get airline and destination based delay and we get a dataset - for min delay based on airline which has airline and delay. Perform inner join 2 data sets to get the Airline. -Airline_Delayed_Dest - Airline and Destination Wise Delay -Destination_mindelay - Destination wise Delay -BestAirlinebyDelay - Join of Airline_Delayed_Dest & Destination_mindelay which derices best performed airlines by Destination wise and -delay time #5. Airport with most of the delays (Ratio)

#1 Below
flightdelay <- mutate(flightdelay,Total = delayed + `on time`)
flightdelay <- arrange(flightdelay,desc(Airline))
#flightdelay

#2
Airline_Summary <- flightdelay %>% 
                  group_by(Airline) %>% 
                summarize(Airline_Delayed_All = sum(delayed),Airline_Ontime_All = sum(`on time`),Airline_Total_All = sum(Total))
Airline_Delayed_Summary <- select(Airline_Summary,Airline,Airline_Delayed_All)
Airline_Delayed_Summary

## # A tibble: 2 x 2
##   Airline Airline_Delayed_All
##   <chr>                 <int>
## 1 ALASKA                  501
## 2 AM WEST                 787

#3 
flightdelay <- arrange(flightdelay,Airline,Destination)

Airline_Dest_Summary <- flightdelay %>% 
                        group_by(Airline,Destination) %>% 
              summarize(Airline_Delayed_Dest = sum(delayed),Airline_Ontime_Dest = sum(`on time`),Airline_Total_Dest = sum(Total))
Airline_Delayed_Dest <- arrange(select(Airline_Dest_Summary,Airline,Destination,Airline_Delayed_Dest),Destination,Airline_Delayed_Dest)
Airline_Delayed_Dest

## # A tibble: 10 x 3
## # Groups:   Airline [2]
##    Airline Destination   Airline_Delayed_Dest
##    <chr>   <chr>                        <int>
##  1 ALASKA  Los.Angeles                     62
##  2 AM WEST Los.Angeles                    117
##  3 ALASKA  Phoenix                         12
##  4 AM WEST Phoenix                        415
##  5 ALASKA  San.Diego                       20
##  6 AM WEST San.Diego                       65
##  7 ALASKA  San.Francisco                  102
##  8 AM WEST San.Francisco                  129
##  9 AM WEST Seattle                         61
## 10 ALASKA  Seattle                        305

#4
Destination_mindelay <- Airline_Delayed_Dest %>% 
                        group_by(Destination) %>% 
                        summarise(Airline_Delayed_Dest = min(Airline_Delayed_Dest))  %>% 
                        select(Destination,Airline_Delayed_Dest) 
Destination_mindelay

## # A tibble: 5 x 2
##   Destination   Airline_Delayed_Dest
##   <chr>                        <int>
## 1 Los.Angeles                     62
## 2 Phoenix                         12
## 3 San.Diego                       20
## 4 San.Francisco                  102
## 5 Seattle                         61

BestAirlinebyDelay <- inner_join(Airline_Delayed_Dest,Destination_mindelay,by = c("Destination","Airline_Delayed_Dest"))
BestAirlinebyDelay

## # A tibble: 5 x 3
## # Groups:   Airline [2]
##   Airline Destination   Airline_Delayed_Dest
##   <chr>   <chr>                        <int>
## 1 ALASKA  Los.Angeles                     62
## 2 ALASKA  Phoenix                         12
## 3 ALASKA  San.Diego                       20
## 4 ALASKA  San.Francisco                  102
## 5 AM WEST Seattle                         61

#5
#Destination wise Total Delayed and Total Ontime
Destination_Arr_Rates <- flightdelay %>% na.omit() %>% 
  group_by(Destination) %>% 
  summarise(delayed = sum(delayed), `on time` = sum(`on time`),Total = sum(Total) ) %>% 
  mutate(delayed_ratio = round((delayed * 100 / Total),2),ontime_ratio = round((`on time` * 100/Total),2)) %>%
  arrange(desc(delayed_ratio))
Destination_Arr_Rates

## # A tibble: 5 x 6
##   Destination   delayed `on time` Total delayed_ratio ontime_ratio
##   <chr>           <int>     <int> <int>         <dbl>        <dbl>
## 1 San.Francisco     231       823  1054         21.9          78.1
## 2 Seattle           366      2042  2408         15.2          84.8
## 3 Los.Angeles       179      1191  1370         13.1          86.9
## 4 San.Diego          85       595   680         12.5          87.5
## 5 Phoenix           427      5061  5488          7.78         92.2

Destination_Arr_Rates[1,1]

## # A tibble: 1 x 1
##   Destination  
##   <chr>        
## 1 San.Francisco

5thWeek_Untidy_Sets

Sarat Palle

9/27/2019

R Markdown

Install Packages and Include Libraries