# Session setup: point R at the project folder and load dplyr, then read the
# three input tables.
# NOTE(review): the working directory is a machine-specific absolute path and
# the relative CSV paths below depend on it.
setwd("/Users/ethan/documents/BANA4137")
library(dplyr)

# Empty strings in the CSV files are read as NA.
flights <- read.csv("cvg_flights.csv", header = TRUE, na.strings = "")
airlines <- read.csv("airlines.csv", header = TRUE, na.strings = "")
airports <- read.csv("airports.csv", header = TRUE, na.strings = "")
# Report the number of columns, then rows, of each input table.
# One call per line: two calls on the same line do not parse in R.
ncol(flights)
nrow(flights)

ncol(airlines)
nrow(airlines)

ncol(airports)
nrow(airports)
# Attach airline details to every flight: flights$AIRLINE is matched against
# airlines$IATA_CODE. left_join() keeps every row of `flights`.
merged.data <- left_join(flights, airlines, by = c("AIRLINE" = "IATA_CODE"))

# Attach airport details. The airports table is keyed on the flight's
# ORIGIN_AIRPORT code; joining it on AIRLINE (a carrier code) can never match
# an airport code and would leave every airport column NA.
# NOTE(review): assumes origin-airport details are what is wanted here --
# confirm whether DESTINATION airport details are needed as well.
md.final <- left_join(
  merged.data,
  airports,
  by = c("ORIGIN_AIRPORT" = "IATA_CODE")
)
head(md.final)
# Missing-value summary for the merged table: number of NA cells, total number
# of cells, and the share of cells that are NA.
# is.na() on a data frame returns a logical matrix with one entry per cell, so
# sum() counts the NAs and length() counts every cell. (length() of the data
# frame itself would only give the number of columns.)
na_flags <- is.na(md.final)
sum(na_flags)
length(na_flags)

sum(na_flags) / length(na_flags)
# Count flights with CANCELLED == 0, count flights whose CANCELLED value is
# either 0 or 1, and report the ratio of the two.
# na.rm / %in% keep a missing CANCELLED value from turning the counts into NA.
not_cancelled <- sum(md.final$CANCELLED == 0, na.rm = TRUE)
known_status <- sum(md.final$CANCELLED %in% c(0, 1))
not_cancelled
known_status

not_cancelled / known_status
# Number of rows with no recorded DEPARTURE_TIME.
sum(is.na(md.final$DEPARTURE_TIME))

# Row-wise difference SCHEDULED_TIME - ELAPSED_TIME; positive values mean
# ELAPSED_TIME was below SCHEDULED_TIME. Rows with an NA in either column
# yield NA.
TIME_DIFF <- md.final$SCHEDULED_TIME - md.final$ELAPSED_TIME
head(TIME_DIFF)
# Flights with AIRLINE "DL" departing from "CVG" with a DEPARTURE_DELAY above
# 30. filter() keeps only rows where every condition is TRUE, so rows with a
# missing value in any of the three columns are dropped.
filtered_data <- md.final %>%
  filter(
    AIRLINE == "DL",
    ORIGIN_AIRPORT == "CVG",
    DEPARTURE_DELAY > 30
  )

# First six flight numbers among those flights.
flight_number <- head(filtered_data[["FLIGHT_NUMBER"]], 6)
print(flight_number)
# Mean DEPARTURE_DELAY per AIRLINE, ignoring missing delays.
average_delay <- md.final %>%
  group_by(AIRLINE) %>%
  summarize(avg_departure_delay = mean(DEPARTURE_DELAY, na.rm = TRUE))

# Rows of the summary with the largest and smallest mean delay.
# which.max()/which.min() return the first position in case of ties.
longest_delay_airline <-
  average_delay[which.max(average_delay$avg_departure_delay), ]
shortest_delay_airline <-
  average_delay[which.min(average_delay$avg_departure_delay), ]

print(average_delay)
print(longest_delay_airline)
print(shortest_delay_airline)
# Mean DEPARTURE_DELAY per ORIGIN_AIRPORT, ignoring missing delays, sorted
# from the largest mean delay to the smallest.
airport_delays <- md.final %>%
  group_by(ORIGIN_AIRPORT) %>%
  summarize(avg_departure_delay = mean(DEPARTURE_DELAY, na.rm = TRUE)) %>%
  arrange(desc(avg_departure_delay))

# The six origin airports with the largest mean delay.
top_airports <- head(airport_delays, 6)
print(top_airports)

# Extremes are taken from the full per-airport table: picking the minimum from
# `top_airports` would only return the smallest of the six largest delays, not
# the airport with the shortest delay overall.
longest_delay_airport <-
  airport_delays[which.max(airport_delays$avg_departure_delay), ]
shortest_delay_airport <-
  airport_delays[which.min(airport_delays$avg_departure_delay), ]

print(longest_delay_airport)
print(shortest_delay_airport)
# Number of flights per AIRLINE among rows whose ORIGIN_AIRPORT is "CVG".
# count() returns one row per AIRLINE with the tally in column `n`.
flights_from_CVG <- md.final %>%
  filter(ORIGIN_AIRPORT == "CVG") %>%
  count(AIRLINE)

print(flights_from_CVG)