This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.
When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:
summary(cars)
## speed dist
## Min. : 4.0 Min. : 2.00
## 1st Qu.:12.0 1st Qu.: 26.00
## Median :15.0 Median : 36.00
## Mean :15.4 Mean : 42.98
## 3rd Qu.:19.0 3rd Qu.: 56.00
## Max. :25.0 Max. :120.00
You can also embed plots, for example:
Note that the echo = FALSE parameter was added to the
code chunk to prevent printing of the R code that generated the
plot.
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.3.6 ✔ purrr 0.3.4
## ✔ tibble 3.1.8 ✔ dplyr 1.0.10
## ✔ tidyr 1.2.0 ✔ stringr 1.4.1
## ✔ readr 2.1.2 ✔ forcats 0.5.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
# Question 1: -------------------------------------------------------------
flights = read.csv("CVG_Flights.csv", header = TRUE, na.strings = "")
airlines = read.csv("airlines.csv", header = TRUE, na.strings = "")
airports = read.csv("airports.csv", header = TRUE, na.strings = "")
# Question 2: -------------------------------------------------------------
nrow(flights)
## [1] 7763
ncol(flights)
## [1] 27
nrow(airlines)
## [1] 14
ncol(airlines)
## [1] 2
nrow(airports)
## [1] 322
ncol(airports)
## [1] 7
### answer for question 2:
# Flights: 7763 rows, 27 columns
# Each row represents a flight
# Each column represents different information about the flight (airline, flight number, destination, departure time, etc.)
# Airlines: 14 rows, 2 columns
# Each row represents a different airline
# Each column represents information about that airline - IATA code is an airline designator
# Airports: 322 rows, 7 columns
# Each row represents a different airport
# Each column represents the location information of that airport
# Question 3: -------------------------------------------------------------
flights_with_airlines <- left_join(flights,
airlines,
by = c("AIRLINE" = "IATA_CODE"))
flights_with_originalairports <- left_join(flights_with_airlines,
airports,
by = c("ORIGIN_AIRPORT"= "IATA_CODE"))
flights_merged <- left_join(flights_with_originalairports,
airports,
by = c("DESTINATION_AIRPORT" = "IATA_CODE"))
# Question 4: -------------------------------------------------------------
head(flights_merged)
## FLIGHT_DATE AIRLINE FLIGHT_NUMBER ORIGIN_AIRPORT DESTINATION_AIRPORT
## 1 1/1/2015 MQ 3355 CVG MIA
## 2 1/1/2015 MQ 3597 CVG DFW
## 3 1/1/2015 OO 4811 BNA CVG
## 4 1/1/2015 DL 2315 CVG ATL
## 5 1/1/2015 F9 1308 IAD CVG
## 6 1/1/2015 DL 1502 CVG TPA
## SCHEDULED_DEPARTURE DEPARTURE_TIME DEPARTURE_DELAY TAXI_OUT WHEELS_OFF
## 1 6:00:00 AM 5:56:00 AM -4 15 6:11:00 AM
## 2 6:00:00 AM <NA> NA NA <NA>
## 3 6:45:00 AM 6:37:00 AM -8 16 6:53:00 AM
## 4 7:10:00 AM 7:08:00 AM -2 36 7:44:00 AM
## 5 7:15:00 AM 7:05:00 AM -10 11 7:16:00 AM
## 6 7:20:00 AM 7:19:00 AM -1 9 7:28:00 AM
## SCHEDULED_TIME ELAPSED_TIME AIR_TIME DISTANCE WHEELS_ON TAXI_IN
## 1 153 150 123 948 8:14:00 AM 12
## 2 165 NA NA 812 <NA> NA
## 3 72 66 45 230 8:38:00 AM 5
## 4 102 101 62 373 8:46:00 AM 3
## 5 95 91 75 388 8:31:00 AM 5
## 6 141 116 105 773 9:13:00 AM 2
## SCHEDULED_ARRIVAL ARRIVAL_TIME ARRIVAL_DELAY DIVERTED CANCELLED
## 1 8:33:00 AM 8:26:00 AM -7 0 0
## 2 7:45:00 AM <NA> NA 0 1
## 3 8:57:00 AM 8:43:00 AM -14 0 0
## 4 8:52:00 AM 8:49:00 AM -3 0 0
## 5 8:50:00 AM 8:36:00 AM -14 0 0
## 6 9:41:00 AM 9:15:00 AM -26 0 0
## CANCELLATION_REASON AIR_SYSTEM_DELAY SECURITY_DELAY AIRLINE_DELAY
## 1 <NA> NA NA NA
## 2 B NA NA NA
## 3 <NA> NA NA NA
## 4 <NA> NA NA NA
## 5 <NA> NA NA NA
## 6 <NA> NA NA NA
## LATE_AIRCRAFT_DELAY WEATHER_DELAY AIRLINE.y
## 1 NA NA American Eagle Airlines Inc.
## 2 NA NA American Eagle Airlines Inc.
## 3 NA NA Skywest Airlines Inc.
## 4 NA NA Delta Air Lines Inc.
## 5 NA NA Frontier Airlines Inc.
## 6 NA NA Delta Air Lines Inc.
## AIRPORT.x CITY.x STATE.x
## 1 Cincinnati/Northern Kentucky International Airport Covington KY
## 2 Cincinnati/Northern Kentucky International Airport Covington KY
## 3 Nashville International Airport Nashville TN
## 4 Cincinnati/Northern Kentucky International Airport Covington KY
## 5 Washington Dulles International Airport Chantilly VA
## 6 Cincinnati/Northern Kentucky International Airport Covington KY
## COUNTRY.x LATITUDE.x LONGITUDE.x
## 1 USA 39.04614 -84.66217
## 2 USA 39.04614 -84.66217
## 3 USA 36.12448 -86.67818
## 4 USA 39.04614 -84.66217
## 5 USA 38.94453 -77.45581
## 6 USA 39.04614 -84.66217
## AIRPORT.y CITY.y STATE.y
## 1 Miami International Airport Miami FL
## 2 Dallas/Fort Worth International Airport Dallas-Fort Worth TX
## 3 Cincinnati/Northern Kentucky International Airport Covington KY
## 4 Hartsfield-Jackson Atlanta International Airport Atlanta GA
## 5 Cincinnati/Northern Kentucky International Airport Covington KY
## 6 Tampa International Airport Tampa FL
## COUNTRY.y LATITUDE.y LONGITUDE.y
## 1 USA 25.79325 -80.29056
## 2 USA 32.89595 -97.03720
## 3 USA 39.04614 -84.66217
## 4 USA 33.64044 -84.42694
## 5 USA 39.04614 -84.66217
## 6 USA 27.97547 -82.53325
# Question 5: -------------------------------------------------------------
colnames(flights_merged)[colSums(is.na(flights_merged)) > 0]
## [1] "DEPARTURE_TIME" "DEPARTURE_DELAY" "TAXI_OUT"
## [4] "WHEELS_OFF" "ELAPSED_TIME" "AIR_TIME"
## [7] "WHEELS_ON" "TAXI_IN" "ARRIVAL_TIME"
## [10] "ARRIVAL_DELAY" "CANCELLATION_REASON" "AIR_SYSTEM_DELAY"
## [13] "SECURITY_DELAY" "AIRLINE_DELAY" "LATE_AIRCRAFT_DELAY"
## [16] "WEATHER_DELAY"
### answer for question 5:
# missing values:
# departure time (4.7%), departure delay (4.7%), taxi out (4.8%), wheels off (4.8%), elapsed time (5%), air time (5%),
# wheels on (4.9%), taxi in (4.9%), arrival time (4.9%), arrival delay (5%), cancellation reasons (95.1%), air system delay (74.89%),
# security delay (74.89%), airline delay (74.89%), late aircraft delay (74.89%), weather delay (74.89%)
# Question 6: -------------------------------------------------------------
nrow(flights_merged)
## [1] 7763
sum(is.na(flights_merged$DEPARTURE_TIME)) / nrow(flights_merged)
## [1] 0.04766199
### answer to question 6:
# Proportion of canceled flights = 0.04766199; there are 3 different cancellation reasons
# Question 7: -------------------------------------------------------------
# Yes there are missing values in departure time. This is because we also have data on
# canceled flights, so therefore if a flight is canceled then it will not provide us
# with a departure time.
# Question 8: -------------------------------------------------------------
flights_merged$TIME_DIFFERENCE <- flights_merged$SCHEDULED_TIME-flights_merged$ELAPSED_TIME
head(flights_merged)
## FLIGHT_DATE AIRLINE FLIGHT_NUMBER ORIGIN_AIRPORT DESTINATION_AIRPORT
## 1 1/1/2015 MQ 3355 CVG MIA
## 2 1/1/2015 MQ 3597 CVG DFW
## 3 1/1/2015 OO 4811 BNA CVG
## 4 1/1/2015 DL 2315 CVG ATL
## 5 1/1/2015 F9 1308 IAD CVG
## 6 1/1/2015 DL 1502 CVG TPA
## SCHEDULED_DEPARTURE DEPARTURE_TIME DEPARTURE_DELAY TAXI_OUT WHEELS_OFF
## 1 6:00:00 AM 5:56:00 AM -4 15 6:11:00 AM
## 2 6:00:00 AM <NA> NA NA <NA>
## 3 6:45:00 AM 6:37:00 AM -8 16 6:53:00 AM
## 4 7:10:00 AM 7:08:00 AM -2 36 7:44:00 AM
## 5 7:15:00 AM 7:05:00 AM -10 11 7:16:00 AM
## 6 7:20:00 AM 7:19:00 AM -1 9 7:28:00 AM
## SCHEDULED_TIME ELAPSED_TIME AIR_TIME DISTANCE WHEELS_ON TAXI_IN
## 1 153 150 123 948 8:14:00 AM 12
## 2 165 NA NA 812 <NA> NA
## 3 72 66 45 230 8:38:00 AM 5
## 4 102 101 62 373 8:46:00 AM 3
## 5 95 91 75 388 8:31:00 AM 5
## 6 141 116 105 773 9:13:00 AM 2
## SCHEDULED_ARRIVAL ARRIVAL_TIME ARRIVAL_DELAY DIVERTED CANCELLED
## 1 8:33:00 AM 8:26:00 AM -7 0 0
## 2 7:45:00 AM <NA> NA 0 1
## 3 8:57:00 AM 8:43:00 AM -14 0 0
## 4 8:52:00 AM 8:49:00 AM -3 0 0
## 5 8:50:00 AM 8:36:00 AM -14 0 0
## 6 9:41:00 AM 9:15:00 AM -26 0 0
## CANCELLATION_REASON AIR_SYSTEM_DELAY SECURITY_DELAY AIRLINE_DELAY
## 1 <NA> NA NA NA
## 2 B NA NA NA
## 3 <NA> NA NA NA
## 4 <NA> NA NA NA
## 5 <NA> NA NA NA
## 6 <NA> NA NA NA
## LATE_AIRCRAFT_DELAY WEATHER_DELAY AIRLINE.y
## 1 NA NA American Eagle Airlines Inc.
## 2 NA NA American Eagle Airlines Inc.
## 3 NA NA Skywest Airlines Inc.
## 4 NA NA Delta Air Lines Inc.
## 5 NA NA Frontier Airlines Inc.
## 6 NA NA Delta Air Lines Inc.
## AIRPORT.x CITY.x STATE.x
## 1 Cincinnati/Northern Kentucky International Airport Covington KY
## 2 Cincinnati/Northern Kentucky International Airport Covington KY
## 3 Nashville International Airport Nashville TN
## 4 Cincinnati/Northern Kentucky International Airport Covington KY
## 5 Washington Dulles International Airport Chantilly VA
## 6 Cincinnati/Northern Kentucky International Airport Covington KY
## COUNTRY.x LATITUDE.x LONGITUDE.x
## 1 USA 39.04614 -84.66217
## 2 USA 39.04614 -84.66217
## 3 USA 36.12448 -86.67818
## 4 USA 39.04614 -84.66217
## 5 USA 38.94453 -77.45581
## 6 USA 39.04614 -84.66217
## AIRPORT.y CITY.y STATE.y
## 1 Miami International Airport Miami FL
## 2 Dallas/Fort Worth International Airport Dallas-Fort Worth TX
## 3 Cincinnati/Northern Kentucky International Airport Covington KY
## 4 Hartsfield-Jackson Atlanta International Airport Atlanta GA
## 5 Cincinnati/Northern Kentucky International Airport Covington KY
## 6 Tampa International Airport Tampa FL
## COUNTRY.y LATITUDE.y LONGITUDE.y TIME_DIFFERENCE
## 1 USA 25.79325 -80.29056 3
## 2 USA 32.89595 -97.03720 NA
## 3 USA 39.04614 -84.66217 6
## 4 USA 33.64044 -84.42694 1
## 5 USA 39.04614 -84.66217 4
## 6 USA 27.97547 -82.53325 25
# Question 9: -------------------------------------------------------------
Delta_CVG_Delay <- filter(flights_merged, ORIGIN_AIRPORT=="CVG", AIRLINE=="DL", DEPARTURE_DELAY > 30)
head(Delta_CVG_Delay)
## FLIGHT_DATE AIRLINE FLIGHT_NUMBER ORIGIN_AIRPORT DESTINATION_AIRPORT
## 1 1/3/2015 DL 1741 CVG SLC
## 2 1/3/2015 DL 1833 CVG LAX
## 3 1/4/2015 DL 1854 CVG RSW
## 4 1/9/2015 DL 1503 CVG MCO
## 5 1/9/2015 DL 63 CVG BOS
## 6 1/10/2015 DL 1934 CVG ATL
## SCHEDULED_DEPARTURE DEPARTURE_TIME DEPARTURE_DELAY TAXI_OUT WHEELS_OFF
## 1 8:25:00 AM 11:41:00 AM 196 11 11:52:00 AM
## 2 4:46:00 PM 5:43:00 PM 57 7 5:50:00 PM
## 3 9:35:00 AM 10:20:00 AM 45 15 10:35:00 AM
## 4 1:55:00 PM 2:39:00 PM 44 15 2:54:00 PM
## 5 4:15:00 PM 5:32:00 PM 77 9 5:41:00 PM
## 6 2:19:00 PM 3:58:00 PM 99 8 4:06:00 PM
## SCHEDULED_TIME ELAPSED_TIME AIR_TIME DISTANCE WHEELS_ON TAXI_IN
## 1 245 222 208 1450 1:20:00 AM 3
## 2 294 269 250 1900 1:00:00 AM 12
## 3 153 156 139 879 1:54:00 AM 2
## 4 134 131 109 757 1:43:00 AM 7
## 5 121 104 90 752 1:11:00 AM 5
## 6 99 74 60 373 1:06:00 AM 6
## SCHEDULED_ARRIVAL ARRIVAL_TIME ARRIVAL_DELAY DIVERTED CANCELLED
## 1 10:30:00 AM 1:23:00 PM 173 0 0
## 2 6:40:00 PM 7:12:00 PM 32 0 0
## 3 12:08:00 PM 12:56:00 PM 48 0 0
## 4 4:09:00 PM 4:50:00 PM 41 0 0
## 5 6:16:00 PM 7:16:00 PM 60 0 0
## 6 3:58:00 PM 5:12:00 PM 74 0 0
## CANCELLATION_REASON AIR_SYSTEM_DELAY SECURITY_DELAY AIRLINE_DELAY
## 1 <NA> 0 0 173
## 2 <NA> 0 0 9
## 3 <NA> 48 0 0
## 4 <NA> 0 0 41
## 5 <NA> 0 0 0
## 6 <NA> 0 0 74
## LATE_AIRCRAFT_DELAY WEATHER_DELAY AIRLINE.y
## 1 0 0 Delta Air Lines Inc.
## 2 23 0 Delta Air Lines Inc.
## 3 0 0 Delta Air Lines Inc.
## 4 0 0 Delta Air Lines Inc.
## 5 60 0 Delta Air Lines Inc.
## 6 0 0 Delta Air Lines Inc.
## AIRPORT.x CITY.x STATE.x
## 1 Cincinnati/Northern Kentucky International Airport Covington KY
## 2 Cincinnati/Northern Kentucky International Airport Covington KY
## 3 Cincinnati/Northern Kentucky International Airport Covington KY
## 4 Cincinnati/Northern Kentucky International Airport Covington KY
## 5 Cincinnati/Northern Kentucky International Airport Covington KY
## 6 Cincinnati/Northern Kentucky International Airport Covington KY
## COUNTRY.x LATITUDE.x LONGITUDE.x
## 1 USA 39.04614 -84.66217
## 2 USA 39.04614 -84.66217
## 3 USA 39.04614 -84.66217
## 4 USA 39.04614 -84.66217
## 5 USA 39.04614 -84.66217
## 6 USA 39.04614 -84.66217
## AIRPORT.y CITY.y STATE.y
## 1 Salt Lake City International Airport Salt Lake City UT
## 2 Los Angeles International Airport Los Angeles CA
## 3 Southwest Florida International Airport Ft. Myers FL
## 4 Orlando International Airport Orlando FL
## 5 Gen. Edward Lawrence Logan International Airport Boston MA
## 6 Hartsfield-Jackson Atlanta International Airport Atlanta GA
## COUNTRY.y LATITUDE.y LONGITUDE.y TIME_DIFFERENCE
## 1 USA 40.78839 -111.97777 23
## 2 USA 33.94254 -118.40807 25
## 3 USA 26.53617 -81.75517 -3
## 4 USA 28.42889 -81.31603 3
## 5 USA 42.36435 -71.00518 17
## 6 USA 33.64044 -84.42694 25
# Question 10: ------------------------------------------------------------
by_airlines <- group_by(flights_merged, AIRLINE.y)
summarize(by_airlines, mean_delay_time =mean(DEPARTURE_DELAY, na.rm = TRUE))
## # A tibble: 5 × 2
## AIRLINE.y mean_delay_time
## <chr> <dbl>
## 1 American Eagle Airlines Inc. 19.1
## 2 Atlantic Southeast Airlines 10.4
## 3 Delta Air Lines Inc. 6.09
## 4 Frontier Airlines Inc. 23.9
## 5 Skywest Airlines Inc. 11.1
### answer to question 10:
# Frontier has the longest average delay and Delta has the shortest average delay
# Question 11: ------------------------------------------------------------
by_airport <- group_by(flights_merged, ORIGIN_AIRPORT)
summarize(by_airport, mean_delay_time =mean(DEPARTURE_DELAY, na.rm = TRUE))
## # A tibble: 38 × 2
## ORIGIN_AIRPORT mean_delay_time
## <chr> <dbl>
## 1 ATL 7.12
## 2 BDL -4
## 3 BNA 13.3
## 4 BOS 21.1
## 5 CLT 10.0
## 6 CVG 11.6
## 7 DEN 15.1
## 8 DFW 16.9
## 9 DTW 21.3
## 10 EWR 7.29
## # … with 28 more rows
### answer to question 11:
# DTW has the longest average department delay and BDL has the shortest
# average department delay
# Question 12: ------------------------------------------------------------
flights_merged %>%
filter(ORIGIN_AIRPORT == "CVG") %>%
count(AIRLINE)
## AIRLINE n
## 1 DL 1149
## 2 EV 776
## 3 F9 258
## 4 MQ 1041
## 5 OO 662
### answer to question 12:
# DL = 1149
# EV = 776
# F9 = 258
# MQ = 1041
# OO = 662