dim(flights)
## [1] 7763 27
sum(is.na(flights))
## [1] 40270
missing_rows <- which(apply(is.na(flights), 1, any))
head(flights[missing_rows, ])
## FLIGHT_DATE AIRLINE FLIGHT_NUMBER ORIGIN_AIRPORT DESTINATION_AIRPORT
## 1 1/1/2015 MQ 3355 CVG MIA
## 2 1/1/2015 MQ 3597 CVG DFW
## 3 1/1/2015 OO 4811 BNA CVG
## 4 1/1/2015 DL 2315 CVG ATL
## 5 1/1/2015 F9 1308 IAD CVG
## 6 1/1/2015 DL 1502 CVG TPA
## SCHEDULED_DEPARTURE DEPARTURE_TIME DEPARTURE_DELAY TAXI_OUT WHEELS_OFF
## 1 6:00:00 AM 5:56:00 AM -4 15 6:11:00 AM
## 2 6:00:00 AM <NA> NA NA <NA>
## 3 6:45:00 AM 6:37:00 AM -8 16 6:53:00 AM
## 4 7:10:00 AM 7:08:00 AM -2 36 7:44:00 AM
## 5 7:15:00 AM 7:05:00 AM -10 11 7:16:00 AM
## 6 7:20:00 AM 7:19:00 AM -1 9 7:28:00 AM
## SCHEDULED_TIME ELAPSED_TIME AIR_TIME DISTANCE WHEELS_ON TAXI_IN
## 1 153 150 123 948 8:14:00 AM 12
## 2 165 NA NA 812 <NA> NA
## 3 72 66 45 230 8:38:00 AM 5
## 4 102 101 62 373 8:46:00 AM 3
## 5 95 91 75 388 8:31:00 AM 5
## 6 141 116 105 773 9:13:00 AM 2
## SCHEDULED_ARRIVAL ARRIVAL_TIME ARRIVAL_DELAY DIVERTED CANCELLED
## 1 8:33:00 AM 8:26:00 AM -7 0 0
## 2 7:45:00 AM <NA> NA 0 1
## 3 8:57:00 AM 8:43:00 AM -14 0 0
## 4 8:52:00 AM 8:49:00 AM -3 0 0
## 5 8:50:00 AM 8:36:00 AM -14 0 0
## 6 9:41:00 AM 9:15:00 AM -26 0 0
## CANCELLATION_REASON AIR_SYSTEM_DELAY SECURITY_DELAY AIRLINE_DELAY
## 1 <NA> NA NA NA
## 2 B NA NA NA
## 3 <NA> NA NA NA
## 4 <NA> NA NA NA
## 5 <NA> NA NA NA
## 6 <NA> NA NA NA
## LATE_AIRCRAFT_DELAY WEATHER_DELAY
## 1 NA NA
## 2 NA NA
## 3 NA NA
## 4 NA NA
## 5 NA NA
## 6 NA NA
flights %>% summary()
## FLIGHT_DATE AIRLINE FLIGHT_NUMBER ORIGIN_AIRPORT
## Length:7763 Length:7763 Min. : 62 Length:7763
## Class :character Class :character 1st Qu.:1908 Class :character
## Mode :character Mode :character Median :3246 Mode :character
## Mean :3194
## 3rd Qu.:4536
## Max. :6297
##
## DESTINATION_AIRPORT SCHEDULED_DEPARTURE DEPARTURE_TIME DEPARTURE_DELAY
## Length:7763 Length:7763 Length:7763 Min. :-24.00
## Class :character Class :character Class :character 1st Qu.: -5.00
## Mode :character Mode :character Mode :character Median : -1.00
## Mean : 12.35
## 3rd Qu.: 12.00
## Max. :708.00
## NA's :370
## TAXI_OUT WHEELS_OFF SCHEDULED_TIME ELAPSED_TIME
## Min. : 1.00 Length:7763 Min. : 60.0 Min. : 49.0
## 1st Qu.: 12.00 Class :character 1st Qu.: 85.0 1st Qu.: 83.0
## Median : 15.00 Mode :character Median :126.0 Median :123.0
## Mean : 18.52 Mean :128.1 Mean :127.3
## 3rd Qu.: 21.00 3rd Qu.:156.0 3rd Qu.:152.0
## Max. :122.00 Max. :313.0 Max. :340.0
## NA's :378 NA's :392
## AIR_TIME DISTANCE WHEELS_ON TAXI_IN
## Min. : 36.0 Min. : 229.0 Length:7763 Min. : 1.000
## 1st Qu.: 56.0 1st Qu.: 308.0 Class :character 1st Qu.: 5.000
## Median : 99.0 Median : 589.0 Mode :character Median : 6.000
## Mean :100.4 Mean : 675.9 Mean : 8.448
## 3rd Qu.:125.0 3rd Qu.: 871.0 3rd Qu.: 9.000
## Max. :319.0 Max. :2036.0 Max. :128.000
## NA's :392 NA's :381
## SCHEDULED_ARRIVAL ARRIVAL_TIME ARRIVAL_DELAY DIVERTED
## Length:7763 Length:7763 Min. :-56.00 Min. :0.000000
## Class :character Class :character 1st Qu.:-12.00 1st Qu.:0.000000
## Mode :character Mode :character Median : -2.00 Median :0.000000
## Mean : 10.65 Mean :0.001803
## 3rd Qu.: 16.00 3rd Qu.:0.000000
## Max. :716.00 Max. :1.000000
## NA's :392
## CANCELLED CANCELLATION_REASON AIR_SYSTEM_DELAY SECURITY_DELAY
## Min. :0.00000 Length:7763 Min. : 0.00 Min. :0.00
## 1st Qu.:0.00000 Class :character 1st Qu.: 0.00 1st Qu.:0.00
## Median :0.00000 Mode :character Median : 10.00 Median :0.00
## Mean :0.04869 Mean : 16.35 Mean :0.01
## 3rd Qu.:0.00000 3rd Qu.: 23.00 3rd Qu.:0.00
## Max. :1.00000 Max. :258.00 Max. :9.00
## NA's :5814 NA's :5814
## AIRLINE_DELAY LATE_AIRCRAFT_DELAY WEATHER_DELAY
## Min. : 0.00 Min. : 0.00 Min. : 0.000
## 1st Qu.: 0.00 1st Qu.: 0.00 1st Qu.: 0.000
## Median : 0.00 Median : 2.00 Median : 0.000
## Mean : 17.06 Mean : 23.24 Mean : 3.873
## 3rd Qu.: 18.00 3rd Qu.: 30.00 3rd Qu.: 0.000
## Max. :435.00 Max. :435.00 Max. :638.000
## NA's :5814 NA's :5814 NA's :5814
ggplot(flights, aes(x = DEPARTURE_DELAY, y = ARRIVAL_DELAY)) +
geom_point(alpha=0.3) +
ggtitle("Departure Delay vs Arrival Delay") +
xlab("Departure Delay (minutes)") +
ylab("Arrival Delay (minutes)")
## Warning: Removed 392 rows containing missing values (geom_point).
#### I choose to visualize the association between 2 variables of
Arrival Delay abd Departure Delay. From the plot, we can see that there
is a clear positive relationship between departure delay and arrival
delay - as the departure delay increases, so does the arrival delay.
There also appear to be a few outliers with very high delays in both
directions. This relationship can be explored further by looking at the
airlines or destinations with the highest delays.
# Convert date to a date object
flights$date <- as.Date(flights$FLIGHT_DATE)
# Extract month from date
flights$month <- format(flights$date, "%m")
# Plot the scatterplot
flights %>% ggplot(aes(x = ARRIVAL_DELAY, y = DEPARTURE_DELAY)) +
geom_point(alpha = 0.4) +
facet_wrap(~ month, ncol = 3) +
xlab("Arrival delay (minutes)") +
ylab("Departure delay (minutes)") +
ggtitle("Association between departure delay and arrival, by month")
## Warning: Removed 392 rows containing missing values (geom_point).
merged_data <- left_join(flights, airlines, by=c("AIRLINE" = "IATA_CODE"))
merged_data <- left_join(merged_data, airports, by=c( "ORIGIN_AIRPORT" = "IATA_CODE"))
merged_data <- left_join(merged_data, airports, by=c("DESTINATION_AIRPORT" = "IATA_CODE"))
head(merged_data)
## FLIGHT_DATE AIRLINE FLIGHT_NUMBER ORIGIN_AIRPORT DESTINATION_AIRPORT
## 1 1/1/2015 MQ 3355 CVG MIA
## 2 1/1/2015 MQ 3597 CVG DFW
## 3 1/1/2015 OO 4811 BNA CVG
## 4 1/1/2015 DL 2315 CVG ATL
## 5 1/1/2015 F9 1308 IAD CVG
## 6 1/1/2015 DL 1502 CVG TPA
## SCHEDULED_DEPARTURE DEPARTURE_TIME DEPARTURE_DELAY TAXI_OUT WHEELS_OFF
## 1 6:00:00 AM 5:56:00 AM -4 15 6:11:00 AM
## 2 6:00:00 AM <NA> NA NA <NA>
## 3 6:45:00 AM 6:37:00 AM -8 16 6:53:00 AM
## 4 7:10:00 AM 7:08:00 AM -2 36 7:44:00 AM
## 5 7:15:00 AM 7:05:00 AM -10 11 7:16:00 AM
## 6 7:20:00 AM 7:19:00 AM -1 9 7:28:00 AM
## SCHEDULED_TIME ELAPSED_TIME AIR_TIME DISTANCE WHEELS_ON TAXI_IN
## 1 153 150 123 948 8:14:00 AM 12
## 2 165 NA NA 812 <NA> NA
## 3 72 66 45 230 8:38:00 AM 5
## 4 102 101 62 373 8:46:00 AM 3
## 5 95 91 75 388 8:31:00 AM 5
## 6 141 116 105 773 9:13:00 AM 2
## SCHEDULED_ARRIVAL ARRIVAL_TIME ARRIVAL_DELAY DIVERTED CANCELLED
## 1 8:33:00 AM 8:26:00 AM -7 0 0
## 2 7:45:00 AM <NA> NA 0 1
## 3 8:57:00 AM 8:43:00 AM -14 0 0
## 4 8:52:00 AM 8:49:00 AM -3 0 0
## 5 8:50:00 AM 8:36:00 AM -14 0 0
## 6 9:41:00 AM 9:15:00 AM -26 0 0
## CANCELLATION_REASON AIR_SYSTEM_DELAY SECURITY_DELAY AIRLINE_DELAY
## 1 <NA> NA NA NA
## 2 B NA NA NA
## 3 <NA> NA NA NA
## 4 <NA> NA NA NA
## 5 <NA> NA NA NA
## 6 <NA> NA NA NA
## LATE_AIRCRAFT_DELAY WEATHER_DELAY date month
## 1 NA NA 0001-01-20 01
## 2 NA NA 0001-01-20 01
## 3 NA NA 0001-01-20 01
## 4 NA NA 0001-01-20 01
## 5 NA NA 0001-01-20 01
## 6 NA NA 0001-01-20 01
## AIRLINE.y
## 1 American Eagle Airlines Inc.
## 2 American Eagle Airlines Inc.
## 3 Skywest Airlines Inc.
## 4 Delta Air Lines Inc.
## 5 Frontier Airlines Inc.
## 6 Delta Air Lines Inc.
## AIRPORT.x CITY.x STATE.x
## 1 Cincinnati/Northern Kentucky International Airport Covington KY
## 2 Cincinnati/Northern Kentucky International Airport Covington KY
## 3 Nashville International Airport Nashville TN
## 4 Cincinnati/Northern Kentucky International Airport Covington KY
## 5 Washington Dulles International Airport Chantilly VA
## 6 Cincinnati/Northern Kentucky International Airport Covington KY
## COUNTRY.x LATITUDE.x LONGITUDE.x
## 1 USA 39.04614 -84.66217
## 2 USA 39.04614 -84.66217
## 3 USA 36.12448 -86.67818
## 4 USA 39.04614 -84.66217
## 5 USA 38.94453 -77.45581
## 6 USA 39.04614 -84.66217
## AIRPORT.y CITY.y STATE.y
## 1 Miami International Airport Miami FL
## 2 Dallas/Fort Worth International Airport Dallas-Fort Worth TX
## 3 Cincinnati/Northern Kentucky International Airport Covington KY
## 4 Hartsfield-Jackson Atlanta International Airport Atlanta GA
## 5 Cincinnati/Northern Kentucky International Airport Covington KY
## 6 Tampa International Airport Tampa FL
## COUNTRY.y LATITUDE.y LONGITUDE.y
## 1 USA 25.79325 -80.29056
## 2 USA 32.89595 -97.03720
## 3 USA 39.04614 -84.66217
## 4 USA 33.64044 -84.42694
## 5 USA 39.04614 -84.66217
## 6 USA 27.97547 -82.53325
view(merged_data)