# R Markdown ----

# Session setup ----
# NOTE(review): a hard-coded setwd() ties the script to one machine; prefer
# running it from the project directory. It is kept here so the relative CSV
# paths passed to read.csv() below still resolve.
setwd("/Users/ethan/documents/BANA4137")
library(dplyr)

# Load the three input tables ----
# na.strings = "" makes empty fields read as NA, so they are picked up by
# is.na() in the missing-value checks further down.
flights <- read.csv("cvg_flights.csv", header = TRUE, na.strings = "")
airlines <- read.csv("airlines.csv", header = TRUE, na.strings = "")
airports <- read.csv("airports.csv", header = TRUE, na.strings = "")

# Table dimensions (columns, then rows) ----
# One call per line: two calls on the same line without a separator do not
# parse.
ncol(flights)
nrow(flights)

ncol(airlines)
nrow(airlines)

ncol(airports)
nrow(airports)

# Merge flights with the airline and airport lookup tables ----
# Step 1: attach airline details by matching the flight's AIRLINE code to
# airlines$IATA_CODE.
# NOTE(review): if airlines.csv also contains a column named AIRLINE, dplyr
# may suffix the clashing columns (.x / .y) -- confirm names(merged.data).
merged.data <- left_join(flights, airlines, by = c("AIRLINE" = "IATA_CODE"))

# Step 2: attach airport details. The original joined on AIRLINE again, which
# compares airline codes against the airports table's IATA_CODE; the flight's
# origin airport is used as the key instead.
# NOTE(review): assumes airports$IATA_CODE holds airport codes comparable to
# ORIGIN_AIRPORT -- confirm against the data.
md.final <- left_join(
  merged.data,
  airports,
  by = c("ORIGIN_AIRPORT" = "IATA_CODE")
)
head(md.final)

# Missing values in the merged table ----
# is.na() on a data frame returns a logical matrix with one entry per cell.
# The missing count is therefore sum(), not length(): length() of that matrix
# is the total number of cells, and length() of a data frame is its number of
# columns, so the original ratio evaluated to nrow(md.final).
n_missing <- sum(is.na(md.final))
n_cells <- nrow(md.final) * ncol(md.final)

n_missing
n_cells

# Proportion of all cells that are missing.
n_missing / n_cells

# Share of flights with CANCELLED == 0 ----
# NOTE(review): presumably 0 = not cancelled and 1 = cancelled -- confirm
# against the data dictionary.
# na.rm = TRUE and %in% keep a missing CANCELLED value from turning the
# counts (and the ratio) into NA.
n_not_cancelled <- sum(md.final$CANCELLED == 0, na.rm = TRUE)
n_flagged <- sum(md.final$CANCELLED %in% c(0, 1))

n_not_cancelled
n_flagged

n_not_cancelled / n_flagged

# Number of rows whose DEPARTURE_TIME is missing.
with(md.final, sum(is.na(DEPARTURE_TIME)))

# Difference between scheduled and elapsed time, per flight ----
# Positive values mean ELAPSED_TIME was smaller than SCHEDULED_TIME.
TIME_DIFF <- md.final$SCHEDULED_TIME - md.final$ELAPSED_TIME
head(TIME_DIFF)

# Flights with AIRLINE "DL" and ORIGIN_AIRPORT "CVG" delayed over 30 ----
# filter() keeps only rows where every condition is TRUE, so rows with a
# missing DEPARTURE_DELAY are dropped rather than returned as all-NA rows.
# The original subset had no comma after the condition and so did not select
# rows.
filtered_data <- md.final %>%
  filter(
    AIRLINE == "DL",
    ORIGIN_AIRPORT == "CVG",
    DEPARTURE_DELAY > 30
  )

# First six flight numbers, kept as a one-column data frame. head() is a
# function in R; the original used the pandas method form `.head(6)`.
flight_number <- head(filtered_data["FLIGHT_NUMBER"], 6)

print(flight_number)

# Average departure delay per airline ----
# na.rm = TRUE ignores flights with no recorded DEPARTURE_DELAY.
average_delay <- md.final %>%
  group_by(AIRLINE) %>%
  summarize(avg_departure_delay = mean(DEPARTURE_DELAY, na.rm = TRUE))

# Rows holding the largest and smallest average delay.
longest_delay_airline <-
  average_delay[which.max(average_delay$avg_departure_delay), ]
shortest_delay_airline <-
  average_delay[which.min(average_delay$avg_departure_delay), ]

print(average_delay)
print(longest_delay_airline)
print(shortest_delay_airline)

# Six origin airports with the highest average departure delay ----
top_airports <- md.final %>%
  group_by(ORIGIN_AIRPORT) %>%
  summarize(avg_departure_delay = mean(DEPARTURE_DELAY, na.rm = TRUE)) %>%
  arrange(desc(avg_departure_delay)) %>%
  head(6)

print(top_airports)

# Largest and smallest average delay among the six rows kept above.
# NOTE(review): shortest_delay_airport is the minimum of the top six only,
# not of all origin airports -- confirm that this is the intended question.
longest_delay_airport <-
  top_airports[which.max(top_airports$avg_departure_delay), ]
shortest_delay_airport <-
  top_airports[which.min(top_airports$avg_departure_delay), ]

print(longest_delay_airport)
print(shortest_delay_airport)

# Number of flights per airline with ORIGIN_AIRPORT "CVG" ----
# count() returns one row per AIRLINE with the row count in column `n`.
flights_from_CVG <- md.final %>%
  filter(ORIGIN_AIRPORT == "CVG") %>%
  count(AIRLINE)

print(flights_from_CVG)