My approach for this assignment is: * Gather the data from the CSV file found on GitHub. * Clean and transform data * Perform analysis of the Airlines and Cities
dfFlights <- dfFlights %>%
rename(Airline = ï.., Status = X, 'Los Angeles' = Los.Angeles, 'San Diego' = San.Diego,
'San Francisco' = San.Francisco)
colnames(dfFlights)
## [1] "Airline" "Status" "Los Angeles" "Phoenix"
## [5] "San Diego" "San Francisco" "Seattle"
Handling the data
# removing the empty row
dfFlights <- dfFlights[-3,]
# Entering missing data inputs
dfFlights[2,1] <- 'Alaska'
dfFlights[4,1] <- 'AM West'
# formatting the data from char to int
dfFlights <- dfFlights %>%
mutate(Phoenix = as.integer(str_remove(Phoenix, ',')))
dfFlights <- dfFlights %>%
mutate(Seattle = as.integer(str_remove(Seattle, ',')))
dfFlights
## Airline Status Los Angeles Phoenix San Diego San Francisco Seattle
## 1 Alaska on time 497 221 212 503 1841
## 2 Alaska delayed 62 12 20 102 305
## 3 AM West on time 694 4840 383 320 201
## 4 AM West delayed 117 415 65 129 61
In order to perform the analysis, I needed to pivot the original data to calculate ratios.
# Pivoting the Alaska Airline rows
dfAlaska <- dfFlights[1:2,-1] %>%
pivot_longer(!Status, names_to = 'City', values_to = 'count') %>%
pivot_wider(names_from = Status, values_from = count)
dfAlaska['Airline'] <- 'Alaska'
# Pivoting the AM West Airline rows
dfAMWest <- dfFlights[3:4,-1] %>%
pivot_longer(!Status, names_to = 'City', values_to = 'count') %>%
pivot_wider(names_from = Status, values_from = count)
dfAMWest['Airline'] <- 'AM West'
# Combining the two data frames
df1 <- rbind(dfAlaska, dfAMWest)
df1 <- df1 %>%
rename(on_time = 'on time')
df1 <- df1 %>%
select(Airline, City, on_time, delayed)
# Calculating ratios
df1['on_time_ratio'] <- df1['on_time'] / (df1['delayed'] + df1['on_time'])
df1['delayed_ratio'] <- df1['delayed'] / (df1['delayed'] + df1['on_time'])
df1 <- df1 %>%
select(Airline, City, on_time, delayed, on_time_ratio, delayed_ratio)
df1
## # A tibble: 10 x 6
## Airline City on_time delayed on_time_ratio delayed_ratio
## <chr> <chr> <int> <int> <dbl> <dbl>
## 1 Alaska Los Angeles 497 62 0.889 0.111
## 2 Alaska Phoenix 221 12 0.948 0.0515
## 3 Alaska San Diego 212 20 0.914 0.0862
## 4 Alaska San Francisco 503 102 0.831 0.169
## 5 Alaska Seattle 1841 305 0.858 0.142
## 6 AM West Los Angeles 694 117 0.856 0.144
## 7 AM West Phoenix 4840 415 0.921 0.0790
## 8 AM West San Diego 383 65 0.855 0.145
## 9 AM West San Francisco 320 129 0.713 0.287
## 10 AM West Seattle 201 61 0.767 0.233
ggplot(data = df1) +
geom_bar(mapping = aes(x = Airline, y = on_time, fill = 'on_time'), stat = 'identity') +
geom_bar(mapping = aes(x = Airline, y = delayed, fill = 'delayed'), stat = 'identity') +
ylab('counts')
ggplot(data = df1) +
geom_bar(mapping = aes(x = City, y = on_time, fill = Airline), stat = 'identity', position = 'dodge') +
coord_flip()
ggplot(data = df1) +
geom_bar(mapping = aes(x = City, y = delayed_ratio, fill = Airline), stat = 'identity', position = 'dodge') +
coord_flip()
ggplot(data = df1) +
geom_bar(mapping = aes(x = City, y = on_time_ratio, fill = Airline), stat = 'identity', position = 'dodge') +
coord_flip()
ggplot(data = df1, mapping = aes(x = delayed, y = delayed_ratio, color = Airline, label = City)) +
geom_point() +
geom_text() +
geom_smooth(method=lm, aes(fill = Airline))
## `geom_smooth()` using formula 'y ~ x'
This last graph illustrates that for areas AM West has higher total delayed flights, they have a lower delayed ratio. The inverse is true for Alaska, where the higher total delayed flights have the higher delayed ratio.