url <- "https://raw.githubusercontent.com/geeman1209/MSDATA2020/master/DATA607/Week%205/AirlineData.csv"
rawData <- read.csv(url, header = TRUE, stringsAsFactors = FALSE, na.strings = c("", "NA"))
##Get a quick overview of the downloaded content
#View summary of data
summary(rawData)
## ï.. X Los.Angeles Phoenix
## Length:5 Length:5 Min. : 62.0 Min. : 12.0
## Class :character Class :character 1st Qu.:103.2 1st Qu.: 168.8
## Mode :character Mode :character Median :307.0 Median : 318.0
## Mean :342.5 Mean :1372.0
## 3rd Qu.:546.2 3rd Qu.:1521.2
## Max. :694.0 Max. :4840.0
## NA's :1 NA's :1
## San.Diego San.Francisco Seattle
## Min. : 20.00 Min. :102.0 Min. : 61
## 1st Qu.: 53.75 1st Qu.:122.2 1st Qu.: 166
## Median :138.50 Median :224.5 Median : 253
## Mean :170.00 Mean :263.5 Mean : 602
## 3rd Qu.:254.75 3rd Qu.:365.8 3rd Qu.: 689
## Max. :383.00 Max. :503.0 Max. :1841
## NA's :1 NA's :1 NA's :1
#View head of data
head(rawData)
## ï.. X Los.Angeles Phoenix San.Diego San.Francisco Seattle
## 1 ALASKA on time 497 221 212 503 1841
## 2 <NA> delayed 62 12 20 102 305
## 3 <NA> <NA> NA NA NA NA NA
## 4 AM West on time 694 4840 383 320 201
## 5 <NA> delayed 117 415 65 129 61
#View structure of data
str(rawData)
## 'data.frame': 5 obs. of 7 variables:
## $ ï.. : chr "ALASKA" NA NA "AM West" ...
## $ X : chr "on time" "delayed" NA "on time" ...
## $ Los.Angeles : int 497 62 NA 694 117
## $ Phoenix : int 221 12 NA 4840 415
## $ San.Diego : int 212 20 NA 383 65
## $ San.Francisco: int 503 102 NA 320 129
## $ Seattle : int 1841 305 NA 201 61
##Using Dplyr
glimpse(rawData)
## Observations: 5
## Variables: 7
## $ ï.. <chr> "ALASKA", NA, NA, "AM West", NA
## $ X <chr> "on time", "delayed", NA, "on time", "delayed"
## $ Los.Angeles <int> 497, 62, NA, 694, 117
## $ Phoenix <int> 221, 12, NA, 4840, 415
## $ San.Diego <int> 212, 20, NA, 383, 65
## $ San.Francisco <int> 503, 102, NA, 320, 129
## $ Seattle <int> 1841, 305, NA, 201, 61
#Column Names
names(rawData)
## [1] "ï.." "X" "Los.Angeles" "Phoenix"
## [5] "San.Diego" "San.Francisco" "Seattle"
Taking a brief look at the raw data, tells us that row 3 has NA values and can be deleted. The first two column names need to be changed.
#Get rid of row 3
df <- rawData[-3, ]
#rename first 2 columns to use gather function
names(df)[1] <- "Airlines"
names(df)[2] <- "Status"
df2 <- gather(df, "Cities", "Flights", 3:7)
#glimpse df2
glimpse(df2)
## Observations: 20
## Variables: 4
## $ Airlines <chr> "ALASKA", NA, "AM West", NA, "ALASKA", NA, "AM West", NA, ...
## $ Status <chr> "on time", "delayed", "on time", "delayed", "on time", "de...
## $ Cities <chr> "Los.Angeles", "Los.Angeles", "Los.Angeles", "Los.Angeles"...
## $ Flights <int> 497, 62, 694, 117, 221, 12, 4840, 415, 212, 20, 383, 65, 5...
#notice we have NA values every other row in the airlines column
#locate all NA values and since we know the previous value should be the next, we can use the na.locf function
df3 <- na.locf(df2)
df4 <- spread(df3, Status, Flights)
df4$Total_PerCity <- df4$delayed + df4$`on time`
head(df4)
## Airlines Cities delayed on time Total_PerCity
## 1 ALASKA Los.Angeles 62 497 559
## 2 ALASKA Phoenix 12 221 233
## 3 ALASKA San.Diego 20 212 232
## 4 ALASKA San.Francisco 102 503 605
## 5 ALASKA Seattle 305 1841 2146
## 6 AM West Los.Angeles 117 694 811
str(df4)
## 'data.frame': 10 obs. of 5 variables:
## $ Airlines : chr "ALASKA" "ALASKA" "ALASKA" "ALASKA" ...
## $ Cities : chr "Los.Angeles" "Phoenix" "San.Diego" "San.Francisco" ...
## $ delayed : int 62 12 20 102 305 117 415 65 129 61
## $ on time : int 497 221 212 503 1841 694 4840 383 320 201
## $ Total_PerCity: int 559 233 232 605 2146 811 5255 448 449 262
summary(df4)
## Airlines Cities delayed on time
## Length:10 Length:10 Min. : 12.00 Min. : 201.0
## Class :character Class :character 1st Qu.: 61.25 1st Qu.: 245.8
## Mode :character Mode :character Median : 83.50 Median : 440.0
## Mean :128.80 Mean : 971.2
## 3rd Qu.:126.00 3rd Qu.: 646.2
## Max. :415.00 Max. :4840.0
## Total_PerCity
## Min. : 232.0
## 1st Qu.: 308.5
## Median : 504.0
## Mean :1100.0
## 3rd Qu.: 759.5
## Max. :5255.0
The values are now organized and the flight_total values are type int, so we can proceed with analysis of the data.
#Calculate percentage of delayed flights vs on time flights
df4 <- mutate(df4, delay_Percent = (delayed/Total_PerCity) * 100)
df4 <- mutate(df4, on_time_percent = (100 - delay_Percent))
head(df4, 10)
## Airlines Cities delayed on time Total_PerCity delay_Percent
## 1 ALASKA Los.Angeles 62 497 559 11.091234
## 2 ALASKA Phoenix 12 221 233 5.150215
## 3 ALASKA San.Diego 20 212 232 8.620690
## 4 ALASKA San.Francisco 102 503 605 16.859504
## 5 ALASKA Seattle 305 1841 2146 14.212488
## 6 AM West Los.Angeles 117 694 811 14.426634
## 7 AM West Phoenix 415 4840 5255 7.897241
## 8 AM West San.Diego 65 383 448 14.508929
## 9 AM West San.Francisco 129 320 449 28.730512
## 10 AM West Seattle 61 201 262 23.282443
## on_time_percent
## 1 88.90877
## 2 94.84979
## 3 91.37931
## 4 83.14050
## 5 85.78751
## 6 85.57337
## 7 92.10276
## 8 85.49107
## 9 71.26949
## 10 76.71756
#take average of delayed flights by airline
df5 <- df4 %>% group_by(Airlines) %>% summarize(avg_delays_pct = mean(delay_Percent))
head(df5)
## # A tibble: 2 x 2
## Airlines avg_delays_pct
## <chr> <dbl>
## 1 ALASKA 11.2
## 2 AM West 17.8
df6 <- df4 %>% group_by(Cities) %>% summarize(avg_delaysPerCity = mean(delayed))
head(df6)
## # A tibble: 5 x 2
## Cities avg_delaysPerCity
## <chr> <dbl>
## 1 Los.Angeles 89.5
## 2 Phoenix 214.
## 3 San.Diego 42.5
## 4 San.Francisco 116.
## 5 Seattle 183
df7 <- df4 %>% group_by(Cities) %>% summarize(delay = sum(delayed))
head(df7)
## # A tibble: 5 x 2
## Cities delay
## <chr> <int>
## 1 Los.Angeles 179
## 2 Phoenix 427
## 3 San.Diego 85
## 4 San.Francisco 231
## 5 Seattle 366
df8 <- df4 %>% group_by(Cities) %>% summarize(on_time = sum(`on time`))
df8$Delayed <- df7[ ,2]
head(df8)
## # A tibble: 5 x 3
## Cities on_time Delayed$delay
## <chr> <int> <int>
## 1 Los.Angeles 1191 179
## 2 Phoenix 5061 427
## 3 San.Diego 595 85
## 4 San.Francisco 823 231
## 5 Seattle 2042 366
ggplot(df4, aes(x=Cities, y=`on time`, group=Airlines, color=Airlines))+geom_line()+labs(x=" Cities", y="On Time Flights")
ggplot(df4, aes(x=Cities, y=delayed, group=Airlines, color=Airlines))+geom_line()+labs(x=" Cities", y="Delayed Flights")
ggplot(df7, aes(x=Cities, y=delay))+geom_bar(stat = "identity")