Assignment #5

url <- "https://raw.githubusercontent.com/geeman1209/MSDATA2020/master/DATA607/Week%205/AirlineData.csv"

rawData <- read.csv(url, header = TRUE, stringsAsFactors = FALSE, na.strings = c("", "NA"))


##Get a quick overview of the downloaded content

#View summary of data
summary(rawData)
##      ï..                 X              Los.Angeles       Phoenix      
##  Length:5           Length:5           Min.   : 62.0   Min.   :  12.0  
##  Class :character   Class :character   1st Qu.:103.2   1st Qu.: 168.8  
##  Mode  :character   Mode  :character   Median :307.0   Median : 318.0  
##                                        Mean   :342.5   Mean   :1372.0  
##                                        3rd Qu.:546.2   3rd Qu.:1521.2  
##                                        Max.   :694.0   Max.   :4840.0  
##                                        NA's   :1       NA's   :1       
##    San.Diego      San.Francisco      Seattle    
##  Min.   : 20.00   Min.   :102.0   Min.   :  61  
##  1st Qu.: 53.75   1st Qu.:122.2   1st Qu.: 166  
##  Median :138.50   Median :224.5   Median : 253  
##  Mean   :170.00   Mean   :263.5   Mean   : 602  
##  3rd Qu.:254.75   3rd Qu.:365.8   3rd Qu.: 689  
##  Max.   :383.00   Max.   :503.0   Max.   :1841  
##  NA's   :1        NA's   :1       NA's   :1
#View head of data
head(rawData)
##       ï..       X Los.Angeles Phoenix San.Diego San.Francisco Seattle
## 1  ALASKA on time         497     221       212           503    1841
## 2    <NA> delayed          62      12        20           102     305
## 3    <NA>    <NA>          NA      NA        NA            NA      NA
## 4 AM West on time         694    4840       383           320     201
## 5    <NA> delayed         117     415        65           129      61
#View structure of data
str(rawData)
## 'data.frame':    5 obs. of  7 variables:
##  $ ï..          : chr  "ALASKA" NA NA "AM West" ...
##  $ X            : chr  "on time" "delayed" NA "on time" ...
##  $ Los.Angeles  : int  497 62 NA 694 117
##  $ Phoenix      : int  221 12 NA 4840 415
##  $ San.Diego    : int  212 20 NA 383 65
##  $ San.Francisco: int  503 102 NA 320 129
##  $ Seattle      : int  1841 305 NA 201 61
##Using Dplyr
glimpse(rawData)
## Observations: 5
## Variables: 7
## $ ï..           <chr> "ALASKA", NA, NA, "AM West", NA
## $ X             <chr> "on time", "delayed", NA, "on time", "delayed"
## $ Los.Angeles   <int> 497, 62, NA, 694, 117
## $ Phoenix       <int> 221, 12, NA, 4840, 415
## $ San.Diego     <int> 212, 20, NA, 383, 65
## $ San.Francisco <int> 503, 102, NA, 320, 129
## $ Seattle       <int> 1841, 305, NA, 201, 61
#Column Names 
names(rawData)
## [1] "ï.."           "X"             "Los.Angeles"   "Phoenix"      
## [5] "San.Diego"     "San.Francisco" "Seattle"

Taking a brief look at the raw data, tells us that row 3 has NA values and can be deleted. The first two column names need to be changed.

#Get rid of row 3
df <- rawData[-3, ]

#rename first 2 columns to use gather function
names(df)[1] <- "Airlines"
names(df)[2] <- "Status"

df2 <- gather(df, "Cities", "Flights", 3:7)

#glimpse df2
glimpse(df2)
## Observations: 20
## Variables: 4
## $ Airlines <chr> "ALASKA", NA, "AM West", NA, "ALASKA", NA, "AM West", NA, ...
## $ Status   <chr> "on time", "delayed", "on time", "delayed", "on time", "de...
## $ Cities   <chr> "Los.Angeles", "Los.Angeles", "Los.Angeles", "Los.Angeles"...
## $ Flights  <int> 497, 62, 694, 117, 221, 12, 4840, 415, 212, 20, 383, 65, 5...
#notice we have NA values every other row in the airlines column
#locate all NA values and since we know the previous value should be the next, we can use the na.locf function
df3 <- na.locf(df2)

df4 <- spread(df3, Status, Flights)

df4$Total_PerCity <- df4$delayed + df4$`on time`

head(df4)
##   Airlines        Cities delayed on time Total_PerCity
## 1   ALASKA   Los.Angeles      62     497           559
## 2   ALASKA       Phoenix      12     221           233
## 3   ALASKA     San.Diego      20     212           232
## 4   ALASKA San.Francisco     102     503           605
## 5   ALASKA       Seattle     305    1841          2146
## 6  AM West   Los.Angeles     117     694           811
str(df4)
## 'data.frame':    10 obs. of  5 variables:
##  $ Airlines     : chr  "ALASKA" "ALASKA" "ALASKA" "ALASKA" ...
##  $ Cities       : chr  "Los.Angeles" "Phoenix" "San.Diego" "San.Francisco" ...
##  $ delayed      : int  62 12 20 102 305 117 415 65 129 61
##  $ on time      : int  497 221 212 503 1841 694 4840 383 320 201
##  $ Total_PerCity: int  559 233 232 605 2146 811 5255 448 449 262
summary(df4)
##    Airlines            Cities             delayed          on time      
##  Length:10          Length:10          Min.   : 12.00   Min.   : 201.0  
##  Class :character   Class :character   1st Qu.: 61.25   1st Qu.: 245.8  
##  Mode  :character   Mode  :character   Median : 83.50   Median : 440.0  
##                                        Mean   :128.80   Mean   : 971.2  
##                                        3rd Qu.:126.00   3rd Qu.: 646.2  
##                                        Max.   :415.00   Max.   :4840.0  
##  Total_PerCity   
##  Min.   : 232.0  
##  1st Qu.: 308.5  
##  Median : 504.0  
##  Mean   :1100.0  
##  3rd Qu.: 759.5  
##  Max.   :5255.0

The values are now organized and the flight_total values are type int, so we can proceed with analysis of the data.

Data Analysis

#Calculate percentage of delayed flights vs on time flights
df4 <- mutate(df4, delay_Percent = (delayed/Total_PerCity) * 100)
df4 <- mutate(df4, on_time_percent = (100 - delay_Percent))
head(df4, 10)
##    Airlines        Cities delayed on time Total_PerCity delay_Percent
## 1    ALASKA   Los.Angeles      62     497           559     11.091234
## 2    ALASKA       Phoenix      12     221           233      5.150215
## 3    ALASKA     San.Diego      20     212           232      8.620690
## 4    ALASKA San.Francisco     102     503           605     16.859504
## 5    ALASKA       Seattle     305    1841          2146     14.212488
## 6   AM West   Los.Angeles     117     694           811     14.426634
## 7   AM West       Phoenix     415    4840          5255      7.897241
## 8   AM West     San.Diego      65     383           448     14.508929
## 9   AM West San.Francisco     129     320           449     28.730512
## 10  AM West       Seattle      61     201           262     23.282443
##    on_time_percent
## 1         88.90877
## 2         94.84979
## 3         91.37931
## 4         83.14050
## 5         85.78751
## 6         85.57337
## 7         92.10276
## 8         85.49107
## 9         71.26949
## 10        76.71756
#take average of delayed flights by airline

df5 <- df4 %>% group_by(Airlines) %>% summarize(avg_delays_pct = mean(delay_Percent))
head(df5)
## # A tibble: 2 x 2
##   Airlines avg_delays_pct
##   <chr>             <dbl>
## 1 ALASKA             11.2
## 2 AM West            17.8
df6 <- df4 %>% group_by(Cities) %>% summarize(avg_delaysPerCity = mean(delayed))
head(df6)
## # A tibble: 5 x 2
##   Cities        avg_delaysPerCity
##   <chr>                     <dbl>
## 1 Los.Angeles                89.5
## 2 Phoenix                   214. 
## 3 San.Diego                  42.5
## 4 San.Francisco             116. 
## 5 Seattle                   183
df7 <- df4 %>% group_by(Cities) %>% summarize(delay = sum(delayed))
head(df7)
## # A tibble: 5 x 2
##   Cities        delay
##   <chr>         <int>
## 1 Los.Angeles     179
## 2 Phoenix         427
## 3 San.Diego        85
## 4 San.Francisco   231
## 5 Seattle         366
df8 <- df4 %>% group_by(Cities) %>% summarize(on_time = sum(`on time`))

df8$Delayed <- df7[ ,2]
head(df8)
## # A tibble: 5 x 3
##   Cities        on_time Delayed$delay
##   <chr>           <int>         <int>
## 1 Los.Angeles      1191           179
## 2 Phoenix          5061           427
## 3 San.Diego         595            85
## 4 San.Francisco     823           231
## 5 Seattle          2042           366
ggplot(df4, aes(x=Cities, y=`on time`, group=Airlines, color=Airlines))+geom_line()+labs(x=" Cities", y="On Time Flights")

ggplot(df4, aes(x=Cities, y=delayed, group=Airlines, color=Airlines))+geom_line()+labs(x=" Cities", y="Delayed Flights")

ggplot(df7, aes(x=Cities, y=delay))+geom_bar(stat = "identity")