Assignment 5

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

QUESTION 1:

setwd("~/Desktop/BANA 4137")
Flights = read.csv("~/Desktop/BANA 4137/Assignment_5/CVG_Flights.csv", header = TRUE, stringsAsFactors = FALSE, na.strings = "")
airlines = read.csv("~/Desktop/BANA 4137/Assignment_5/airlines.csv", header = TRUE, stringsAsFactors = FALSE, na.strings = "")
airports = read.csv("~/Desktop/BANA 4137/Assignment_5/airports.csv", header = TRUE, stringsAsFactors = FALSE, na.strings = "")

QUESTION 2:

dim(airlines)
## [1] 14  2
dim(airports)
## [1] 322   7
dim(Flights)
## [1] 7763   27

Rows

Columns

1. CVG_Flights:

2. Airports:

3. Airlines:

Meanning of vairbles

Each row in a dataset corresponds to a single observation, or data point, and each column to a variable. A flight might be represented by a row in a dataset on flights, for instance, as variables like the airline, destination, flight number, departure and arrival times, and so on could be represented by columns.

Accurate data analysis and interpretation depend on an understanding of the variables present in a dataset. It entails being aware of the kind of data that each variable represents as well as comprehending its significance and applicability in relation to the study as a whole.

QUESTION 3:

Flights_airlines <- left_join(Flights,
                                   airlines,
                                   by = c("AIRLINE" = "IATA_CODE"))

Flights_originalairports <- left_join(Flights_airlines,
                                           airports,
                                           by = c("ORIGIN_AIRPORT"= "IATA_CODE"))

Flights_merged <- left_join(Flights_originalairports,
                            airports,
                            by = c("DESTINATION_AIRPORT" = "IATA_CODE"))

nrow(Flights_merged)
## [1] 7763
ncol(Flights_merged)
## [1] 40

QUESTION 4:

head(Flights_merged)
##   FLIGHT_DATE AIRLINE FLIGHT_NUMBER ORIGIN_AIRPORT DESTINATION_AIRPORT
## 1    1/1/2015      MQ          3355            CVG                 MIA
## 2    1/1/2015      MQ          3597            CVG                 DFW
## 3    1/1/2015      OO          4811            BNA                 CVG
## 4    1/1/2015      DL          2315            CVG                 ATL
## 5    1/1/2015      F9          1308            IAD                 CVG
## 6    1/1/2015      DL          1502            CVG                 TPA
##   SCHEDULED_DEPARTURE DEPARTURE_TIME DEPARTURE_DELAY TAXI_OUT WHEELS_OFF
## 1          6:00:00 AM     5:56:00 AM              -4       15 6:11:00 AM
## 2          6:00:00 AM           <NA>              NA       NA       <NA>
## 3          6:45:00 AM     6:37:00 AM              -8       16 6:53:00 AM
## 4          7:10:00 AM     7:08:00 AM              -2       36 7:44:00 AM
## 5          7:15:00 AM     7:05:00 AM             -10       11 7:16:00 AM
## 6          7:20:00 AM     7:19:00 AM              -1        9 7:28:00 AM
##   SCHEDULED_TIME ELAPSED_TIME AIR_TIME DISTANCE  WHEELS_ON TAXI_IN
## 1            153          150      123      948 8:14:00 AM      12
## 2            165           NA       NA      812       <NA>      NA
## 3             72           66       45      230 8:38:00 AM       5
## 4            102          101       62      373 8:46:00 AM       3
## 5             95           91       75      388 8:31:00 AM       5
## 6            141          116      105      773 9:13:00 AM       2
##   SCHEDULED_ARRIVAL ARRIVAL_TIME ARRIVAL_DELAY DIVERTED CANCELLED
## 1        8:33:00 AM   8:26:00 AM            -7        0         0
## 2        7:45:00 AM         <NA>            NA        0         1
## 3        8:57:00 AM   8:43:00 AM           -14        0         0
## 4        8:52:00 AM   8:49:00 AM            -3        0         0
## 5        8:50:00 AM   8:36:00 AM           -14        0         0
## 6        9:41:00 AM   9:15:00 AM           -26        0         0
##   CANCELLATION_REASON AIR_SYSTEM_DELAY SECURITY_DELAY AIRLINE_DELAY
## 1                <NA>               NA             NA            NA
## 2                   B               NA             NA            NA
## 3                <NA>               NA             NA            NA
## 4                <NA>               NA             NA            NA
## 5                <NA>               NA             NA            NA
## 6                <NA>               NA             NA            NA
##   LATE_AIRCRAFT_DELAY WEATHER_DELAY                    AIRLINE.y
## 1                  NA            NA American Eagle Airlines Inc.
## 2                  NA            NA American Eagle Airlines Inc.
## 3                  NA            NA        Skywest Airlines Inc.
## 4                  NA            NA         Delta Air Lines Inc.
## 5                  NA            NA       Frontier Airlines Inc.
## 6                  NA            NA         Delta Air Lines Inc.
##                                            AIRPORT.x    CITY.x STATE.x
## 1 Cincinnati/Northern Kentucky International Airport Covington      KY
## 2 Cincinnati/Northern Kentucky International Airport Covington      KY
## 3                    Nashville International Airport Nashville      TN
## 4 Cincinnati/Northern Kentucky International Airport Covington      KY
## 5            Washington Dulles International Airport Chantilly      VA
## 6 Cincinnati/Northern Kentucky International Airport Covington      KY
##   COUNTRY.x LATITUDE.x LONGITUDE.x
## 1       USA   39.04614   -84.66217
## 2       USA   39.04614   -84.66217
## 3       USA   36.12448   -86.67818
## 4       USA   39.04614   -84.66217
## 5       USA   38.94453   -77.45581
## 6       USA   39.04614   -84.66217
##                                            AIRPORT.y            CITY.y STATE.y
## 1                        Miami International Airport             Miami      FL
## 2            Dallas/Fort Worth International Airport Dallas-Fort Worth      TX
## 3 Cincinnati/Northern Kentucky International Airport         Covington      KY
## 4   Hartsfield-Jackson Atlanta International Airport           Atlanta      GA
## 5 Cincinnati/Northern Kentucky International Airport         Covington      KY
## 6                        Tampa International Airport             Tampa      FL
##   COUNTRY.y LATITUDE.y LONGITUDE.y
## 1       USA   25.79325   -80.29056
## 2       USA   32.89595   -97.03720
## 3       USA   39.04614   -84.66217
## 4       USA   33.64044   -84.42694
## 5       USA   39.04614   -84.66217
## 6       USA   27.97547   -82.53325

QUESTION 5:

missing_values <- sapply(Flights_merged, function(x) sum(is.na(x)))
missing_values
##         FLIGHT_DATE             AIRLINE       FLIGHT_NUMBER      ORIGIN_AIRPORT 
##                   0                   0                   0                   0 
## DESTINATION_AIRPORT SCHEDULED_DEPARTURE      DEPARTURE_TIME     DEPARTURE_DELAY 
##                   0                   0                 370                 370 
##            TAXI_OUT          WHEELS_OFF      SCHEDULED_TIME        ELAPSED_TIME 
##                 378                 378                   0                 392 
##            AIR_TIME            DISTANCE           WHEELS_ON             TAXI_IN 
##                 392                   0                 381                 381 
##   SCHEDULED_ARRIVAL        ARRIVAL_TIME       ARRIVAL_DELAY            DIVERTED 
##                   0                 381                 392                   0 
##           CANCELLED CANCELLATION_REASON    AIR_SYSTEM_DELAY      SECURITY_DELAY 
##                   0                7385                5814                5814 
##       AIRLINE_DELAY LATE_AIRCRAFT_DELAY       WEATHER_DELAY           AIRLINE.y 
##                5814                5814                5814                   0 
##           AIRPORT.x              CITY.x             STATE.x           COUNTRY.x 
##                   0                   0                   0                   0 
##          LATITUDE.x         LONGITUDE.x           AIRPORT.y              CITY.y 
##                   0                   0                   0                   0 
##             STATE.y           COUNTRY.y          LATITUDE.y         LONGITUDE.y 
##                   0                   0                   0                   0
percentage_missing <- missing_values / nrow(Flights_merged) * 100
percentage_missing
##         FLIGHT_DATE             AIRLINE       FLIGHT_NUMBER      ORIGIN_AIRPORT 
##            0.000000            0.000000            0.000000            0.000000 
## DESTINATION_AIRPORT SCHEDULED_DEPARTURE      DEPARTURE_TIME     DEPARTURE_DELAY 
##            0.000000            0.000000            4.766199            4.766199 
##            TAXI_OUT          WHEELS_OFF      SCHEDULED_TIME        ELAPSED_TIME 
##            4.869252            4.869252            0.000000            5.049594 
##            AIR_TIME            DISTANCE           WHEELS_ON             TAXI_IN 
##            5.049594            0.000000            4.907896            4.907896 
##   SCHEDULED_ARRIVAL        ARRIVAL_TIME       ARRIVAL_DELAY            DIVERTED 
##            0.000000            4.907896            5.049594            0.000000 
##           CANCELLED CANCELLATION_REASON    AIR_SYSTEM_DELAY      SECURITY_DELAY 
##            0.000000           95.130748           74.893727           74.893727 
##       AIRLINE_DELAY LATE_AIRCRAFT_DELAY       WEATHER_DELAY           AIRLINE.y 
##           74.893727           74.893727           74.893727            0.000000 
##           AIRPORT.x              CITY.x             STATE.x           COUNTRY.x 
##            0.000000            0.000000            0.000000            0.000000 
##          LATITUDE.x         LONGITUDE.x           AIRPORT.y              CITY.y 
##            0.000000            0.000000            0.000000            0.000000 
##             STATE.y           COUNTRY.y          LATITUDE.y         LONGITUDE.y 
##            0.000000            0.000000            0.000000            0.000000

QUESTION 6:

proportion_cancelled <- sum(is.na(Flights_merged$DEPARTURE_TIME)) / nrow(Flights_merged)
proportion_cancelled
## [1] 0.04766199
num_cancel_reasons <- length(unique(Flights_merged$CANCELLATION_REASON))
num_cancel_reasons
## [1] 4

QUESTION 7:

There are missing values in DEPARTURE_TIME. That means, when the flight it cancelled, we will not have a DEPARTURE_TIME.

cancel_departure_table <- table(is.na(Flights_merged$DEPARTURE_TIME), Flights_merged$CANCELLED)
cancel_departure_table
##        
##            0    1
##   FALSE 7385    8
##   TRUE     0  370

QUESTION 8:

Flights_merged$TIME_DIFFERENCE <- Flights_merged$SCHEDULED_TIME-Flights_merged$ELAPSED_TIME
head(Flights_merged$TIME_DIFFERENCE)
## [1]  3 NA  6  1  4 25

QUESTION 9:

delta_cvg_delayed <- Flights_merged[
  Flights_merged$Airline == "Delta" &
  Flights_merged$Origin == "CVG" &
  Flights_merged$DEPARTURE_DELAY > 30,
]
delta_cvg_delayed
##  [1] FLIGHT_DATE         AIRLINE             FLIGHT_NUMBER      
##  [4] ORIGIN_AIRPORT      DESTINATION_AIRPORT SCHEDULED_DEPARTURE
##  [7] DEPARTURE_TIME      DEPARTURE_DELAY     TAXI_OUT           
## [10] WHEELS_OFF          SCHEDULED_TIME      ELAPSED_TIME       
## [13] AIR_TIME            DISTANCE            WHEELS_ON          
## [16] TAXI_IN             SCHEDULED_ARRIVAL   ARRIVAL_TIME       
## [19] ARRIVAL_DELAY       DIVERTED            CANCELLED          
## [22] CANCELLATION_REASON AIR_SYSTEM_DELAY    SECURITY_DELAY     
## [25] AIRLINE_DELAY       LATE_AIRCRAFT_DELAY WEATHER_DELAY      
## [28] AIRLINE.y           AIRPORT.x           CITY.x             
## [31] STATE.x             COUNTRY.x           LATITUDE.x         
## [34] LONGITUDE.x         AIRPORT.y           CITY.y             
## [37] STATE.y             COUNTRY.y           LATITUDE.y         
## [40] LONGITUDE.y         TIME_DIFFERENCE    
## <0 rows> (or 0-length row.names)
head(delta_cvg_delayed)
##  [1] FLIGHT_DATE         AIRLINE             FLIGHT_NUMBER      
##  [4] ORIGIN_AIRPORT      DESTINATION_AIRPORT SCHEDULED_DEPARTURE
##  [7] DEPARTURE_TIME      DEPARTURE_DELAY     TAXI_OUT           
## [10] WHEELS_OFF          SCHEDULED_TIME      ELAPSED_TIME       
## [13] AIR_TIME            DISTANCE            WHEELS_ON          
## [16] TAXI_IN             SCHEDULED_ARRIVAL   ARRIVAL_TIME       
## [19] ARRIVAL_DELAY       DIVERTED            CANCELLED          
## [22] CANCELLATION_REASON AIR_SYSTEM_DELAY    SECURITY_DELAY     
## [25] AIRLINE_DELAY       LATE_AIRCRAFT_DELAY WEATHER_DELAY      
## [28] AIRLINE.y           AIRPORT.x           CITY.x             
## [31] STATE.x             COUNTRY.x           LATITUDE.x         
## [34] LONGITUDE.x         AIRPORT.y           CITY.y             
## [37] STATE.y             COUNTRY.y           LATITUDE.y         
## [40] LONGITUDE.y         TIME_DIFFERENCE    
## <0 rows> (or 0-length row.names)

QUESTION 10:

Flights_merged %>%
  group_by(AIRLINE) %>%
  summarize(avg_delay = mean(DEPARTURE_DELAY, na.rm = TRUE)) %>%
  arrange(avg_delay)
## # A tibble: 5 × 2
##   AIRLINE avg_delay
##   <chr>       <dbl>
## 1 DL           6.09
## 2 EV          10.4 
## 3 OO          11.1 
## 4 MQ          19.1 
## 5 F9          23.9

QUESTION 11:

avg_deprature <- Flights_merged %>%
  group_by(ORIGIN_AIRPORT) %>%
  summarize(avg_delay = mean(DEPARTURE_DELAY, na.rm = TRUE)) %>%
  arrange(desc(avg_delay)) %>%
  head(6)
avg_deprature
## # A tibble: 6 × 2
##   ORIGIN_AIRPORT avg_delay
##   <chr>              <dbl>
## 1 TTN                 74.3
## 2 RDU                 70.8
## 3 PIT                 33.5
## 4 JFK                 27.4
## 5 LAS                 26.0
## 6 LGA                 25.6

QUESTION 12:

Total_Flights <- Flights_merged %>%
  filter(ORIGIN_AIRPORT == "CVG") %>%
  group_by(AIRLINE) %>%
  summarise(total_flights = n())
Total_Flights
## # A tibble: 5 × 2
##   AIRLINE total_flights
##   <chr>           <int>
## 1 DL               1149
## 2 EV                776
## 3 F9                258
## 4 MQ               1041
## 5 OO                662