flights = read.csv("CVG_Flights.csv", header = TRUE, na.strings = "")
airlines = read.csv("airlines.csv", header = TRUE, na.strings = "")
airports = read.csv("airports.csv", header = TRUE, na.strings = "")
glimpse(flights)
## Rows: 7,763
## Columns: 27
## $ FLIGHT_DATE <chr> "1/1/2015", "1/1/2015", "1/1/2015", "1/1/2015", "1…
## $ AIRLINE <chr> "MQ", "MQ", "OO", "DL", "F9", "DL", "MQ", "F9", "D…
## $ FLIGHT_NUMBER <int> 3355, 3597, 4811, 2315, 1308, 1502, 3491, 1290, 23…
## $ ORIGIN_AIRPORT <chr> "CVG", "CVG", "BNA", "CVG", "IAD", "CVG", "DFW", "…
## $ DESTINATION_AIRPORT <chr> "MIA", "DFW", "CVG", "ATL", "CVG", "TPA", "CVG", "…
## $ SCHEDULED_DEPARTURE <chr> "6:00:00 AM", "6:00:00 AM", "6:45:00 AM", "7:10:00…
## $ DEPARTURE_TIME <chr> "5:56:00 AM", NA, "6:37:00 AM", "7:08:00 AM", "7:0…
## $ DEPARTURE_DELAY <int> -4, NA, -8, -2, -10, -1, NA, -2, -9, 11, NA, -5, -…
## $ TAXI_OUT <int> 15, NA, 16, 36, 11, 9, NA, 10, 27, 9, NA, 20, 18, …
## $ WHEELS_OFF <chr> "6:11:00 AM", NA, "6:53:00 AM", "7:44:00 AM", "7:1…
## $ SCHEDULED_TIME <int> 153, 165, 72, 102, 95, 141, 128, 150, 153, 127, 17…
## $ ELAPSED_TIME <int> 150, NA, 66, 101, 91, 116, NA, 143, 150, 119, NA, …
## $ AIR_TIME <int> 123, NA, 45, 62, 75, 105, NA, 126, 117, 103, NA, 2…
## $ DISTANCE <int> 948, 812, 230, 373, 388, 773, 812, 932, 879, 532, …
## $ WHEELS_ON <chr> "8:14:00 AM", NA, "8:38:00 AM", "8:46:00 AM", "8:3…
## $ TAXI_IN <int> 12, NA, 5, 3, 5, 2, NA, 7, 6, 7, NA, 4, 5, 9, 10, …
## $ SCHEDULED_ARRIVAL <chr> "8:33:00 AM", "7:45:00 AM", "8:57:00 AM", "8:52:00…
## $ ARRIVAL_TIME <chr> "8:26:00 AM", NA, "8:43:00 AM", "8:49:00 AM", "8:3…
## $ ARRIVAL_DELAY <int> -7, NA, -14, -3, -14, -26, NA, -9, -12, 3, NA, -11…
## $ DIVERTED <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ CANCELLED <int> 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,…
## $ CANCELLATION_REASON <chr> NA, "B", NA, NA, NA, NA, "B", NA, NA, NA, "B", NA,…
## $ AIR_SYSTEM_DELAY <int> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
## $ SECURITY_DELAY <int> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
## $ AIRLINE_DELAY <int> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
## $ LATE_AIRCRAFT_DELAY <int> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
## $ WEATHER_DELAY <int> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
glimpse(airlines)
## Rows: 14
## Columns: 2
## $ IATA_CODE <chr> "UA", "AA", "US", "F9", "B6", "OO", "AS", "NK", "WN", "DL", …
## $ AIRLINE <chr> "United Air Lines Inc.", "American Airlines Inc.", "US Airwa…
glimpse(airports)
## Rows: 322
## Columns: 7
## $ IATA_CODE <chr> "ABE", "ABI", "ABQ", "ABR", "ABY", "ACK", "ACT", "ACV", "ACY…
## $ AIRPORT <chr> "Lehigh Valley International Airport", "Abilene Regional Air…
## $ CITY <chr> "Allentown", "Abilene", "Albuquerque", "Aberdeen", "Albany",…
## $ STATE <chr> "PA", "TX", "NM", "SD", "GA", "MA", "TX", "CA", "NJ", "AK", …
## $ COUNTRY <chr> "USA", "USA", "USA", "USA", "USA", "USA", "USA", "USA", "USA…
## $ LATITUDE <dbl> 40.65236, 32.41132, 35.04022, 45.44906, 31.53552, 41.25305, …
## $ LONGITUDE <dbl> -75.44040, -99.68190, -106.60919, -98.42183, -84.19447, -70.…
Flight_Airline_data <- left_join(flights, airlines, by=c("AIRLINE"="IATA_CODE"))
merged_data <- left_join(Flight_Airline_data, airports, by=c("ORIGIN_AIRPORT"="IATA_CODE"))
print(head(merged_data, 6))
## FLIGHT_DATE AIRLINE FLIGHT_NUMBER ORIGIN_AIRPORT DESTINATION_AIRPORT
## 1 1/1/2015 MQ 3355 CVG MIA
## 2 1/1/2015 MQ 3597 CVG DFW
## 3 1/1/2015 OO 4811 BNA CVG
## 4 1/1/2015 DL 2315 CVG ATL
## 5 1/1/2015 F9 1308 IAD CVG
## 6 1/1/2015 DL 1502 CVG TPA
## SCHEDULED_DEPARTURE DEPARTURE_TIME DEPARTURE_DELAY TAXI_OUT WHEELS_OFF
## 1 6:00:00 AM 5:56:00 AM -4 15 6:11:00 AM
## 2 6:00:00 AM <NA> NA NA <NA>
## 3 6:45:00 AM 6:37:00 AM -8 16 6:53:00 AM
## 4 7:10:00 AM 7:08:00 AM -2 36 7:44:00 AM
## 5 7:15:00 AM 7:05:00 AM -10 11 7:16:00 AM
## 6 7:20:00 AM 7:19:00 AM -1 9 7:28:00 AM
## SCHEDULED_TIME ELAPSED_TIME AIR_TIME DISTANCE WHEELS_ON TAXI_IN
## 1 153 150 123 948 8:14:00 AM 12
## 2 165 NA NA 812 <NA> NA
## 3 72 66 45 230 8:38:00 AM 5
## 4 102 101 62 373 8:46:00 AM 3
## 5 95 91 75 388 8:31:00 AM 5
## 6 141 116 105 773 9:13:00 AM 2
## SCHEDULED_ARRIVAL ARRIVAL_TIME ARRIVAL_DELAY DIVERTED CANCELLED
## 1 8:33:00 AM 8:26:00 AM -7 0 0
## 2 7:45:00 AM <NA> NA 0 1
## 3 8:57:00 AM 8:43:00 AM -14 0 0
## 4 8:52:00 AM 8:49:00 AM -3 0 0
## 5 8:50:00 AM 8:36:00 AM -14 0 0
## 6 9:41:00 AM 9:15:00 AM -26 0 0
## CANCELLATION_REASON AIR_SYSTEM_DELAY SECURITY_DELAY AIRLINE_DELAY
## 1 <NA> NA NA NA
## 2 B NA NA NA
## 3 <NA> NA NA NA
## 4 <NA> NA NA NA
## 5 <NA> NA NA NA
## 6 <NA> NA NA NA
## LATE_AIRCRAFT_DELAY WEATHER_DELAY AIRLINE.y
## 1 NA NA American Eagle Airlines Inc.
## 2 NA NA American Eagle Airlines Inc.
## 3 NA NA Skywest Airlines Inc.
## 4 NA NA Delta Air Lines Inc.
## 5 NA NA Frontier Airlines Inc.
## 6 NA NA Delta Air Lines Inc.
## AIRPORT CITY STATE COUNTRY
## 1 Cincinnati/Northern Kentucky International Airport Covington KY USA
## 2 Cincinnati/Northern Kentucky International Airport Covington KY USA
## 3 Nashville International Airport Nashville TN USA
## 4 Cincinnati/Northern Kentucky International Airport Covington KY USA
## 5 Washington Dulles International Airport Chantilly VA USA
## 6 Cincinnati/Northern Kentucky International Airport Covington KY USA
## LATITUDE LONGITUDE
## 1 39.04614 -84.66217
## 2 39.04614 -84.66217
## 3 36.12448 -86.67818
## 4 39.04614 -84.66217
## 5 38.94453 -77.45581
## 6 39.04614 -84.66217
sum(is.na.data.frame(merged_data))
## [1] 40270
map(merged_data, ~sum(is.na(.)))
## $FLIGHT_DATE
## [1] 0
##
## $AIRLINE
## [1] 0
##
## $FLIGHT_NUMBER
## [1] 0
##
## $ORIGIN_AIRPORT
## [1] 0
##
## $DESTINATION_AIRPORT
## [1] 0
##
## $SCHEDULED_DEPARTURE
## [1] 0
##
## $DEPARTURE_TIME
## [1] 370
##
## $DEPARTURE_DELAY
## [1] 370
##
## $TAXI_OUT
## [1] 378
##
## $WHEELS_OFF
## [1] 378
##
## $SCHEDULED_TIME
## [1] 0
##
## $ELAPSED_TIME
## [1] 392
##
## $AIR_TIME
## [1] 392
##
## $DISTANCE
## [1] 0
##
## $WHEELS_ON
## [1] 381
##
## $TAXI_IN
## [1] 381
##
## $SCHEDULED_ARRIVAL
## [1] 0
##
## $ARRIVAL_TIME
## [1] 381
##
## $ARRIVAL_DELAY
## [1] 392
##
## $DIVERTED
## [1] 0
##
## $CANCELLED
## [1] 0
##
## $CANCELLATION_REASON
## [1] 7385
##
## $AIR_SYSTEM_DELAY
## [1] 5814
##
## $SECURITY_DELAY
## [1] 5814
##
## $AIRLINE_DELAY
## [1] 5814
##
## $LATE_AIRCRAFT_DELAY
## [1] 5814
##
## $WEATHER_DELAY
## [1] 5814
##
## $AIRLINE.y
## [1] 0
##
## $AIRPORT
## [1] 0
##
## $CITY
## [1] 0
##
## $STATE
## [1] 0
##
## $COUNTRY
## [1] 0
##
## $LATITUDE
## [1] 0
##
## $LONGITUDE
## [1] 0
print((colMeans(is.na(merged_data)))*100)
## FLIGHT_DATE AIRLINE FLIGHT_NUMBER ORIGIN_AIRPORT
## 0.000000 0.000000 0.000000 0.000000
## DESTINATION_AIRPORT SCHEDULED_DEPARTURE DEPARTURE_TIME DEPARTURE_DELAY
## 0.000000 0.000000 4.766199 4.766199
## TAXI_OUT WHEELS_OFF SCHEDULED_TIME ELAPSED_TIME
## 4.869252 4.869252 0.000000 5.049594
## AIR_TIME DISTANCE WHEELS_ON TAXI_IN
## 5.049594 0.000000 4.907896 4.907896
## SCHEDULED_ARRIVAL ARRIVAL_TIME ARRIVAL_DELAY DIVERTED
## 0.000000 4.907896 5.049594 0.000000
## CANCELLED CANCELLATION_REASON AIR_SYSTEM_DELAY SECURITY_DELAY
## 0.000000 95.130748 74.893727 74.893727
## AIRLINE_DELAY LATE_AIRCRAFT_DELAY WEATHER_DELAY AIRLINE.y
## 74.893727 74.893727 74.893727 0.000000
## AIRPORT CITY STATE COUNTRY
## 0.000000 0.000000 0.000000 0.000000
## LATITUDE LONGITUDE
## 0.000000 0.000000
#Total number of flights in merged data
total_flights <- nrow(merged_data)
print(total_flights)
## [1] 7763
#Filter for Cancelled flights
all_cancelled_flights <- dplyr::filter( merged_data, CANCELLED %in% "1")
#Number of cancelled flights
num_cancelled_flights <- nrow(all_cancelled_flights)
print(num_cancelled_flights)
## [1] 378
#Number of non-cancelled flights
num_non_cancelled_flights <- total_flights - num_cancelled_flights
print(num_non_cancelled_flights)
## [1] 7385
#Proportion
print(num_cancelled_flights/total_flights)
## [1] 0.04869252
table(merged_data$CANCELLATION_REASON)
##
## A B C
## 71 180 127
sum(is.na(merged_data$DEPARTURE_TIME))
## [1] 370
# create the new variable
SCHEDULED_MINUS_ELAPSED <- merged_data$SCHEDULED_TIME - merged_data$ELAPSED_TIME
# add new variable to merged_data via a join
merged_data2 <- cbind(merged_data, SCHEDULED_MINUS_ELAPSED)
print(head(merged_data$SCHEDULED_MINUS_ELAPSED, 6))
## NULL
# Extract the observations and create a new data frame
DELTA_CVG_DEPARTURE_DELAY_30_MIN_PLUS <- subset(merged_data, AIRLINE == 'DL' & ORIGIN_AIRPORT == 'CVG' & DEPARTURE_DELAY > 30)
# Print the first 6 flight numbers in the new data frame.
print(head(DELTA_CVG_DEPARTURE_DELAY_30_MIN_PLUS$FLIGHT_NUMBER, 6))
## [1] 1741 1833 1854 1503 63 1934
avg_dep_delay_by_airline <- merged_data %>%
group_by(AIRLINE) %>%
summarise(avg_dep_delay = mean(DEPARTURE_DELAY, is.na = TRUE))
print(avg_dep_delay_by_airline)
## # A tibble: 5 × 2
## AIRLINE avg_dep_delay
## <chr> <dbl>
## 1 DL NA
## 2 EV NA
## 3 F9 NA
## 4 MQ NA
## 5 OO NA
origin_airport_avg_dep_delay_time <- merged_data %>%
group_by(ORIGIN_AIRPORT) %>%
summarise(avg_dep_delay = mean(DEPARTURE_DELAY, is.na = TRUE))
##( origin_airport_avg_dep_delay_time <- arrange(origin_airport_avg_dep_delay_time, avg_dep_delay = desc) )## I cant figure out whats causing the error here. I turned this line of code into a comment because it was preventing me from knitting the document.
print(head(origin_airport_avg_dep_delay_time, 6))
## # A tibble: 6 × 2
## ORIGIN_AIRPORT avg_dep_delay
## <chr> <dbl>
## 1 ATL NA
## 2 BDL -4
## 3 BNA NA
## 4 BOS NA
## 5 CLT NA
## 6 CVG NA
#Question 12 ##For flights departing from CVG airport, count how many flights are offered by each airline. Print the entire list.
# Filter for flights departing CVG and create a new data frame
Flights_Departing_CVG <- subset(merged_data, ORIGIN_AIRPORT == 'CVG')
print(count(Flights_Departing_CVG, AIRLINE))
## AIRLINE n
## 1 DL 1149
## 2 EV 776
## 3 F9 258
## 4 MQ 1041
## 5 OO 662