Create the MySQL driver to connect to the database
con <- dbConnect(RMySQL::MySQL(),
dbname = "weekfive",
host = "localhost",
port = 3306,
user = "user",
password = "")
Query table and show results
q <- "SELECT * FROM airlines;"
airline.table <- dbGetQuery(con, q)
airline.table
## airline status Los_Angeles Phoenix San_Diego San_Francisco Seattle
## 1 Alaska on time 497 221 212 503 1841
## 2 <NA> delayed 62 12 20 102 305
## 3 <NA> <NA> NA NA NA NA NA
## 4 AM WEST on time 694 4840 383 320 210
## 5 <NA> delayed 117 415 65 129 61
Remove empty rows
airline.table <- filter(airline.table, status != " ")
airline.table
## airline status Los_Angeles Phoenix San_Diego San_Francisco Seattle
## 1 Alaska on time 497 221 212 503 1841
## 2 <NA> delayed 62 12 20 102 305
## 3 AM WEST on time 694 4840 383 320 210
## 4 <NA> delayed 117 415 65 129 61
Fill in the ’NA’s in the airline column with the correct values
i <- 1
j <- 1
#Check the 'airline' column. If the value is 'NA', replace it with the value in the row above it
while(i <= length(airline.table$airline)){
if(is.na(airline.table$airline[i])) {airline.table$airline[j] <- airline.table$airline[j-1]}
j <- j + 1
i <- i + 1
}
airline.table
## airline status Los_Angeles Phoenix San_Diego San_Francisco Seattle
## 1 Alaska on time 497 221 212 503 1841
## 2 Alaska delayed 62 12 20 102 305
## 3 AM WEST on time 694 4840 383 320 210
## 4 AM WEST delayed 117 415 65 129 61
Change from ‘wide’ to ‘long’ format
#Gather the columns of cities and put them in one column called 'city' and
#put the values of cities in a column called 'flights'
tidytable <- gather(airline.table, city, flights, Los_Angeles:Seattle)
tidytable
## airline status city flights
## 1 Alaska on time Los_Angeles 497
## 2 Alaska delayed Los_Angeles 62
## 3 AM WEST on time Los_Angeles 694
## 4 AM WEST delayed Los_Angeles 117
## 5 Alaska on time Phoenix 221
## 6 Alaska delayed Phoenix 12
## 7 AM WEST on time Phoenix 4840
## 8 AM WEST delayed Phoenix 415
## 9 Alaska on time San_Diego 212
## 10 Alaska delayed San_Diego 20
## 11 AM WEST on time San_Diego 383
## 12 AM WEST delayed San_Diego 65
## 13 Alaska on time San_Francisco 503
## 14 Alaska delayed San_Francisco 102
## 15 AM WEST on time San_Francisco 320
## 16 AM WEST delayed San_Francisco 129
## 17 Alaska on time Seattle 1841
## 18 Alaska delayed Seattle 305
## 19 AM WEST on time Seattle 210
## 20 AM WEST delayed Seattle 61
Filter the delayed flights
delays <- tidytable %>%
filter(status == "delayed") %>%
arrange(airline)
delays
## airline status city flights
## 1 Alaska delayed Los_Angeles 62
## 2 Alaska delayed Phoenix 12
## 3 Alaska delayed San_Diego 20
## 4 Alaska delayed San_Francisco 102
## 5 Alaska delayed Seattle 305
## 6 AM WEST delayed Los_Angeles 117
## 7 AM WEST delayed Phoenix 415
## 8 AM WEST delayed San_Diego 65
## 9 AM WEST delayed San_Francisco 129
## 10 AM WEST delayed Seattle 61
Find total flights grouped by city and airline
#Calculates the number of total flights grouped by airline and city
totalflights <- aggregate(flights ~ city+airline, tidytable, sum)
totalflights
## city airline flights
## 1 Los_Angeles Alaska 559
## 2 Phoenix Alaska 233
## 3 San_Diego Alaska 232
## 4 San_Francisco Alaska 605
## 5 Seattle Alaska 2146
## 6 Los_Angeles AM WEST 811
## 7 Phoenix AM WEST 5255
## 8 San_Diego AM WEST 448
## 9 San_Francisco AM WEST 449
## 10 Seattle AM WEST 271
delaypercent <- totalflights %>%
mutate(delayed_flight_percentage = round((delays$flights/flights), 4)*100) %>%
arrange(city)
delaypercent
## city airline flights delayed_flight_percentage
## 1 Los_Angeles Alaska 559 11.09
## 2 Los_Angeles AM WEST 811 14.43
## 3 Phoenix Alaska 233 5.15
## 4 Phoenix AM WEST 5255 7.90
## 5 San_Diego Alaska 232 8.62
## 6 San_Diego AM WEST 448 14.51
## 7 San_Francisco Alaska 605 16.86
## 8 San_Francisco AM WEST 449 28.73
## 9 Seattle Alaska 2146 14.21
## 10 Seattle AM WEST 271 22.51
We can see that if you want to be on time, your best chance is to fly Alaska airlines
plottable <- delaypercent %>%
select(city, airline, delayed_flight_percentage)
plottable
## city airline delayed_flight_percentage
## 1 Los_Angeles Alaska 11.09
## 2 Los_Angeles AM WEST 14.43
## 3 Phoenix Alaska 5.15
## 4 Phoenix AM WEST 7.90
## 5 San_Diego Alaska 8.62
## 6 San_Diego AM WEST 14.51
## 7 San_Francisco Alaska 16.86
## 8 San_Francisco AM WEST 28.73
## 9 Seattle Alaska 14.21
## 10 Seattle AM WEST 22.51
ggplot(plottable, aes(city, delayed_flight_percentage)) +
geom_col(aes(fill = airline), position = "dodge" ) +
labs(title = "Percentage of Delayed Flights", x = "City", y = "Percent")