R Markdown

This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.

When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:

summary(cars)
##      speed           dist       
##  Min.   : 4.0   Min.   :  2.00  
##  1st Qu.:12.0   1st Qu.: 26.00  
##  Median :15.0   Median : 36.00  
##  Mean   :15.4   Mean   : 42.98  
##  3rd Qu.:19.0   3rd Qu.: 56.00  
##  Max.   :25.0   Max.   :120.00

Including Plots

You can also embed plots, for example:

Note that the echo = FALSE parameter was added to the code chunk to prevent printing of the R code that generated the plot.

library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.3.6      ✔ purrr   0.3.4 
## ✔ tibble  3.1.8      ✔ dplyr   1.0.10
## ✔ tidyr   1.2.0      ✔ stringr 1.4.1 
## ✔ readr   2.1.2      ✔ forcats 0.5.2 
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
# Question 1: -------------------------------------------------------------
flights = read.csv("CVG_Flights.csv", header = TRUE, na.strings = "")

airlines = read.csv("airlines.csv", header = TRUE, na.strings = "")

airports = read.csv("airports.csv", header = TRUE, na.strings = "")
# Question 2: -------------------------------------------------------------
nrow(flights)
## [1] 7763
ncol(flights)
## [1] 27
nrow(airlines)
## [1] 14
ncol(airlines)
## [1] 2
nrow(airports)
## [1] 322
ncol(airports)
## [1] 7
### answer for question 2:
# Flights: 7763 rows, 27 columns
# Each row represents a flight 
# Each column represents different information about the flight (airline, flight number, destination, departure time, etc.)
# Airlines: 14 rows, 2 columns
# Each row represents a different airline
# Each column represents information about that airline - IATA code is an airline designator
# Airports: 322 rows, 7 columns
# Each row represents a different airport
# Each column represents the location information of that airport
# Question 3: -------------------------------------------------------------
flights_with_airlines <- left_join(flights,
                                   airlines,
                                   by = c("AIRLINE" = "IATA_CODE"))

flights_with_originalairports <- left_join(flights_with_airlines,
                                           airports,
                                           by = c("ORIGIN_AIRPORT"= "IATA_CODE"))

flights_merged <- left_join(flights_with_originalairports,
                            airports,
                            by = c("DESTINATION_AIRPORT" = "IATA_CODE"))
# Question 4: -------------------------------------------------------------
head(flights_merged)
##   FLIGHT_DATE AIRLINE FLIGHT_NUMBER ORIGIN_AIRPORT DESTINATION_AIRPORT
## 1    1/1/2015      MQ          3355            CVG                 MIA
## 2    1/1/2015      MQ          3597            CVG                 DFW
## 3    1/1/2015      OO          4811            BNA                 CVG
## 4    1/1/2015      DL          2315            CVG                 ATL
## 5    1/1/2015      F9          1308            IAD                 CVG
## 6    1/1/2015      DL          1502            CVG                 TPA
##   SCHEDULED_DEPARTURE DEPARTURE_TIME DEPARTURE_DELAY TAXI_OUT WHEELS_OFF
## 1          6:00:00 AM     5:56:00 AM              -4       15 6:11:00 AM
## 2          6:00:00 AM           <NA>              NA       NA       <NA>
## 3          6:45:00 AM     6:37:00 AM              -8       16 6:53:00 AM
## 4          7:10:00 AM     7:08:00 AM              -2       36 7:44:00 AM
## 5          7:15:00 AM     7:05:00 AM             -10       11 7:16:00 AM
## 6          7:20:00 AM     7:19:00 AM              -1        9 7:28:00 AM
##   SCHEDULED_TIME ELAPSED_TIME AIR_TIME DISTANCE  WHEELS_ON TAXI_IN
## 1            153          150      123      948 8:14:00 AM      12
## 2            165           NA       NA      812       <NA>      NA
## 3             72           66       45      230 8:38:00 AM       5
## 4            102          101       62      373 8:46:00 AM       3
## 5             95           91       75      388 8:31:00 AM       5
## 6            141          116      105      773 9:13:00 AM       2
##   SCHEDULED_ARRIVAL ARRIVAL_TIME ARRIVAL_DELAY DIVERTED CANCELLED
## 1        8:33:00 AM   8:26:00 AM            -7        0         0
## 2        7:45:00 AM         <NA>            NA        0         1
## 3        8:57:00 AM   8:43:00 AM           -14        0         0
## 4        8:52:00 AM   8:49:00 AM            -3        0         0
## 5        8:50:00 AM   8:36:00 AM           -14        0         0
## 6        9:41:00 AM   9:15:00 AM           -26        0         0
##   CANCELLATION_REASON AIR_SYSTEM_DELAY SECURITY_DELAY AIRLINE_DELAY
## 1                <NA>               NA             NA            NA
## 2                   B               NA             NA            NA
## 3                <NA>               NA             NA            NA
## 4                <NA>               NA             NA            NA
## 5                <NA>               NA             NA            NA
## 6                <NA>               NA             NA            NA
##   LATE_AIRCRAFT_DELAY WEATHER_DELAY                    AIRLINE.y
## 1                  NA            NA American Eagle Airlines Inc.
## 2                  NA            NA American Eagle Airlines Inc.
## 3                  NA            NA        Skywest Airlines Inc.
## 4                  NA            NA         Delta Air Lines Inc.
## 5                  NA            NA       Frontier Airlines Inc.
## 6                  NA            NA         Delta Air Lines Inc.
##                                            AIRPORT.x    CITY.x STATE.x
## 1 Cincinnati/Northern Kentucky International Airport Covington      KY
## 2 Cincinnati/Northern Kentucky International Airport Covington      KY
## 3                    Nashville International Airport Nashville      TN
## 4 Cincinnati/Northern Kentucky International Airport Covington      KY
## 5            Washington Dulles International Airport Chantilly      VA
## 6 Cincinnati/Northern Kentucky International Airport Covington      KY
##   COUNTRY.x LATITUDE.x LONGITUDE.x
## 1       USA   39.04614   -84.66217
## 2       USA   39.04614   -84.66217
## 3       USA   36.12448   -86.67818
## 4       USA   39.04614   -84.66217
## 5       USA   38.94453   -77.45581
## 6       USA   39.04614   -84.66217
##                                            AIRPORT.y            CITY.y STATE.y
## 1                        Miami International Airport             Miami      FL
## 2            Dallas/Fort Worth International Airport Dallas-Fort Worth      TX
## 3 Cincinnati/Northern Kentucky International Airport         Covington      KY
## 4   Hartsfield-Jackson Atlanta International Airport           Atlanta      GA
## 5 Cincinnati/Northern Kentucky International Airport         Covington      KY
## 6                        Tampa International Airport             Tampa      FL
##   COUNTRY.y LATITUDE.y LONGITUDE.y
## 1       USA   25.79325   -80.29056
## 2       USA   32.89595   -97.03720
## 3       USA   39.04614   -84.66217
## 4       USA   33.64044   -84.42694
## 5       USA   39.04614   -84.66217
## 6       USA   27.97547   -82.53325
# Question 5: -------------------------------------------------------------
colnames(flights_merged)[colSums(is.na(flights_merged)) > 0]
##  [1] "DEPARTURE_TIME"      "DEPARTURE_DELAY"     "TAXI_OUT"           
##  [4] "WHEELS_OFF"          "ELAPSED_TIME"        "AIR_TIME"           
##  [7] "WHEELS_ON"           "TAXI_IN"             "ARRIVAL_TIME"       
## [10] "ARRIVAL_DELAY"       "CANCELLATION_REASON" "AIR_SYSTEM_DELAY"   
## [13] "SECURITY_DELAY"      "AIRLINE_DELAY"       "LATE_AIRCRAFT_DELAY"
## [16] "WEATHER_DELAY"
### answer for question 5:
# missing values:
  # departure time (4.7%), departure delay (4.7%), taxi out (4.8%), wheels off (4.8%), elapsed time (5%), air time (5%), 
  # wheels on (4.9%), taxi in  (4.9%), arrival time  (4.9%), arrival delay (5%), cancellation reasons (95.1%), air system delay (74.89%),
  # security delay (74.89%), airline delay (74.89%), late aircraft delay (74.89%), weather delay (74.89%)
# Question 6: -------------------------------------------------------------
nrow(flights_merged)
## [1] 7763
sum(is.na(flights_merged$DEPARTURE_TIME)) / nrow(flights_merged)
## [1] 0.04766199
### answer to question 6:
# Proportion of canceled flights = 0.04766199; there are 3 different cancellation reasons
# Question 7: -------------------------------------------------------------
# Yes there are missing values in departure time. This is because we also have data on 
# canceled flights, so therefore if a flight is canceled then it will not provide us 
# with a departure time. 
# Question 8: -------------------------------------------------------------
flights_merged$TIME_DIFFERENCE <- flights_merged$SCHEDULED_TIME-flights_merged$ELAPSED_TIME
head(flights_merged)
##   FLIGHT_DATE AIRLINE FLIGHT_NUMBER ORIGIN_AIRPORT DESTINATION_AIRPORT
## 1    1/1/2015      MQ          3355            CVG                 MIA
## 2    1/1/2015      MQ          3597            CVG                 DFW
## 3    1/1/2015      OO          4811            BNA                 CVG
## 4    1/1/2015      DL          2315            CVG                 ATL
## 5    1/1/2015      F9          1308            IAD                 CVG
## 6    1/1/2015      DL          1502            CVG                 TPA
##   SCHEDULED_DEPARTURE DEPARTURE_TIME DEPARTURE_DELAY TAXI_OUT WHEELS_OFF
## 1          6:00:00 AM     5:56:00 AM              -4       15 6:11:00 AM
## 2          6:00:00 AM           <NA>              NA       NA       <NA>
## 3          6:45:00 AM     6:37:00 AM              -8       16 6:53:00 AM
## 4          7:10:00 AM     7:08:00 AM              -2       36 7:44:00 AM
## 5          7:15:00 AM     7:05:00 AM             -10       11 7:16:00 AM
## 6          7:20:00 AM     7:19:00 AM              -1        9 7:28:00 AM
##   SCHEDULED_TIME ELAPSED_TIME AIR_TIME DISTANCE  WHEELS_ON TAXI_IN
## 1            153          150      123      948 8:14:00 AM      12
## 2            165           NA       NA      812       <NA>      NA
## 3             72           66       45      230 8:38:00 AM       5
## 4            102          101       62      373 8:46:00 AM       3
## 5             95           91       75      388 8:31:00 AM       5
## 6            141          116      105      773 9:13:00 AM       2
##   SCHEDULED_ARRIVAL ARRIVAL_TIME ARRIVAL_DELAY DIVERTED CANCELLED
## 1        8:33:00 AM   8:26:00 AM            -7        0         0
## 2        7:45:00 AM         <NA>            NA        0         1
## 3        8:57:00 AM   8:43:00 AM           -14        0         0
## 4        8:52:00 AM   8:49:00 AM            -3        0         0
## 5        8:50:00 AM   8:36:00 AM           -14        0         0
## 6        9:41:00 AM   9:15:00 AM           -26        0         0
##   CANCELLATION_REASON AIR_SYSTEM_DELAY SECURITY_DELAY AIRLINE_DELAY
## 1                <NA>               NA             NA            NA
## 2                   B               NA             NA            NA
## 3                <NA>               NA             NA            NA
## 4                <NA>               NA             NA            NA
## 5                <NA>               NA             NA            NA
## 6                <NA>               NA             NA            NA
##   LATE_AIRCRAFT_DELAY WEATHER_DELAY                    AIRLINE.y
## 1                  NA            NA American Eagle Airlines Inc.
## 2                  NA            NA American Eagle Airlines Inc.
## 3                  NA            NA        Skywest Airlines Inc.
## 4                  NA            NA         Delta Air Lines Inc.
## 5                  NA            NA       Frontier Airlines Inc.
## 6                  NA            NA         Delta Air Lines Inc.
##                                            AIRPORT.x    CITY.x STATE.x
## 1 Cincinnati/Northern Kentucky International Airport Covington      KY
## 2 Cincinnati/Northern Kentucky International Airport Covington      KY
## 3                    Nashville International Airport Nashville      TN
## 4 Cincinnati/Northern Kentucky International Airport Covington      KY
## 5            Washington Dulles International Airport Chantilly      VA
## 6 Cincinnati/Northern Kentucky International Airport Covington      KY
##   COUNTRY.x LATITUDE.x LONGITUDE.x
## 1       USA   39.04614   -84.66217
## 2       USA   39.04614   -84.66217
## 3       USA   36.12448   -86.67818
## 4       USA   39.04614   -84.66217
## 5       USA   38.94453   -77.45581
## 6       USA   39.04614   -84.66217
##                                            AIRPORT.y            CITY.y STATE.y
## 1                        Miami International Airport             Miami      FL
## 2            Dallas/Fort Worth International Airport Dallas-Fort Worth      TX
## 3 Cincinnati/Northern Kentucky International Airport         Covington      KY
## 4   Hartsfield-Jackson Atlanta International Airport           Atlanta      GA
## 5 Cincinnati/Northern Kentucky International Airport         Covington      KY
## 6                        Tampa International Airport             Tampa      FL
##   COUNTRY.y LATITUDE.y LONGITUDE.y TIME_DIFFERENCE
## 1       USA   25.79325   -80.29056               3
## 2       USA   32.89595   -97.03720              NA
## 3       USA   39.04614   -84.66217               6
## 4       USA   33.64044   -84.42694               1
## 5       USA   39.04614   -84.66217               4
## 6       USA   27.97547   -82.53325              25
# Question 9: -------------------------------------------------------------
Delta_CVG_Delay <- filter(flights_merged, ORIGIN_AIRPORT=="CVG", AIRLINE=="DL", DEPARTURE_DELAY > 30)
head(Delta_CVG_Delay)
##   FLIGHT_DATE AIRLINE FLIGHT_NUMBER ORIGIN_AIRPORT DESTINATION_AIRPORT
## 1    1/3/2015      DL          1741            CVG                 SLC
## 2    1/3/2015      DL          1833            CVG                 LAX
## 3    1/4/2015      DL          1854            CVG                 RSW
## 4    1/9/2015      DL          1503            CVG                 MCO
## 5    1/9/2015      DL            63            CVG                 BOS
## 6   1/10/2015      DL          1934            CVG                 ATL
##   SCHEDULED_DEPARTURE DEPARTURE_TIME DEPARTURE_DELAY TAXI_OUT  WHEELS_OFF
## 1          8:25:00 AM    11:41:00 AM             196       11 11:52:00 AM
## 2          4:46:00 PM     5:43:00 PM              57        7  5:50:00 PM
## 3          9:35:00 AM    10:20:00 AM              45       15 10:35:00 AM
## 4          1:55:00 PM     2:39:00 PM              44       15  2:54:00 PM
## 5          4:15:00 PM     5:32:00 PM              77        9  5:41:00 PM
## 6          2:19:00 PM     3:58:00 PM              99        8  4:06:00 PM
##   SCHEDULED_TIME ELAPSED_TIME AIR_TIME DISTANCE  WHEELS_ON TAXI_IN
## 1            245          222      208     1450 1:20:00 AM       3
## 2            294          269      250     1900 1:00:00 AM      12
## 3            153          156      139      879 1:54:00 AM       2
## 4            134          131      109      757 1:43:00 AM       7
## 5            121          104       90      752 1:11:00 AM       5
## 6             99           74       60      373 1:06:00 AM       6
##   SCHEDULED_ARRIVAL ARRIVAL_TIME ARRIVAL_DELAY DIVERTED CANCELLED
## 1       10:30:00 AM   1:23:00 PM           173        0         0
## 2        6:40:00 PM   7:12:00 PM            32        0         0
## 3       12:08:00 PM  12:56:00 PM            48        0         0
## 4        4:09:00 PM   4:50:00 PM            41        0         0
## 5        6:16:00 PM   7:16:00 PM            60        0         0
## 6        3:58:00 PM   5:12:00 PM            74        0         0
##   CANCELLATION_REASON AIR_SYSTEM_DELAY SECURITY_DELAY AIRLINE_DELAY
## 1                <NA>                0              0           173
## 2                <NA>                0              0             9
## 3                <NA>               48              0             0
## 4                <NA>                0              0            41
## 5                <NA>                0              0             0
## 6                <NA>                0              0            74
##   LATE_AIRCRAFT_DELAY WEATHER_DELAY            AIRLINE.y
## 1                   0             0 Delta Air Lines Inc.
## 2                  23             0 Delta Air Lines Inc.
## 3                   0             0 Delta Air Lines Inc.
## 4                   0             0 Delta Air Lines Inc.
## 5                  60             0 Delta Air Lines Inc.
## 6                   0             0 Delta Air Lines Inc.
##                                            AIRPORT.x    CITY.x STATE.x
## 1 Cincinnati/Northern Kentucky International Airport Covington      KY
## 2 Cincinnati/Northern Kentucky International Airport Covington      KY
## 3 Cincinnati/Northern Kentucky International Airport Covington      KY
## 4 Cincinnati/Northern Kentucky International Airport Covington      KY
## 5 Cincinnati/Northern Kentucky International Airport Covington      KY
## 6 Cincinnati/Northern Kentucky International Airport Covington      KY
##   COUNTRY.x LATITUDE.x LONGITUDE.x
## 1       USA   39.04614   -84.66217
## 2       USA   39.04614   -84.66217
## 3       USA   39.04614   -84.66217
## 4       USA   39.04614   -84.66217
## 5       USA   39.04614   -84.66217
## 6       USA   39.04614   -84.66217
##                                          AIRPORT.y         CITY.y STATE.y
## 1             Salt Lake City International Airport Salt Lake City      UT
## 2                Los Angeles International Airport    Los Angeles      CA
## 3          Southwest Florida International Airport      Ft. Myers      FL
## 4                    Orlando International Airport        Orlando      FL
## 5 Gen. Edward Lawrence Logan International Airport         Boston      MA
## 6 Hartsfield-Jackson Atlanta International Airport        Atlanta      GA
##   COUNTRY.y LATITUDE.y LONGITUDE.y TIME_DIFFERENCE
## 1       USA   40.78839  -111.97777              23
## 2       USA   33.94254  -118.40807              25
## 3       USA   26.53617   -81.75517              -3
## 4       USA   28.42889   -81.31603               3
## 5       USA   42.36435   -71.00518              17
## 6       USA   33.64044   -84.42694              25
# Question 10: ------------------------------------------------------------
by_airlines <- group_by(flights_merged, AIRLINE.y)
summarize(by_airlines, mean_delay_time =mean(DEPARTURE_DELAY, na.rm = TRUE))
## # A tibble: 5 × 2
##   AIRLINE.y                    mean_delay_time
##   <chr>                                  <dbl>
## 1 American Eagle Airlines Inc.           19.1 
## 2 Atlantic Southeast Airlines            10.4 
## 3 Delta Air Lines Inc.                    6.09
## 4 Frontier Airlines Inc.                 23.9 
## 5 Skywest Airlines Inc.                  11.1
### answer to question 10:
# Frontier has the longest average delay and Delta has the shortest average delay
# Question 11: ------------------------------------------------------------
by_airport <- group_by(flights_merged, ORIGIN_AIRPORT)
summarize(by_airport, mean_delay_time =mean(DEPARTURE_DELAY, na.rm = TRUE))
## # A tibble: 38 × 2
##    ORIGIN_AIRPORT mean_delay_time
##    <chr>                    <dbl>
##  1 ATL                       7.12
##  2 BDL                      -4   
##  3 BNA                      13.3 
##  4 BOS                      21.1 
##  5 CLT                      10.0 
##  6 CVG                      11.6 
##  7 DEN                      15.1 
##  8 DFW                      16.9 
##  9 DTW                      21.3 
## 10 EWR                       7.29
## # … with 28 more rows
### answer to question 11:
# DTW has the longest average department delay and BDL has the shortest 
# average department delay
# Question 12: ------------------------------------------------------------
flights_merged %>%
  filter(ORIGIN_AIRPORT == "CVG") %>%
  count(AIRLINE)
##   AIRLINE    n
## 1      DL 1149
## 2      EV  776
## 3      F9  258
## 4      MQ 1041
## 5      OO  662
### answer to question 12:
# DL = 1149
# EV = 776
# F9 = 258
# MQ = 1041
# OO = 662