library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
QUESTION 1:
setwd("~/Desktop/BANA 4137")
Flights = read.csv("~/Desktop/BANA 4137/Assignment_5/CVG_Flights.csv", header = TRUE, stringsAsFactors = FALSE, na.strings = "")
airlines = read.csv("~/Desktop/BANA 4137/Assignment_5/airlines.csv", header = TRUE, stringsAsFactors = FALSE, na.strings = "")
airports = read.csv("~/Desktop/BANA 4137/Assignment_5/airports.csv", header = TRUE, stringsAsFactors = FALSE, na.strings = "")
QUESTION 2:
dim(airlines)
## [1] 14 2
dim(airports)
## [1] 322 7
dim(Flights)
## [1] 7763 27
Rows
Columns
1. CVG_Flights:
2. Airports:
3. Airlines:
Meanning of vairbles
Each row in a dataset corresponds to a single observation, or data point, and each column to a variable. A flight might be represented by a row in a dataset on flights, for instance, as variables like the airline, destination, flight number, departure and arrival times, and so on could be represented by columns.
Accurate data analysis and interpretation depend on an understanding of the variables present in a dataset. It entails being aware of the kind of data that each variable represents as well as comprehending its significance and applicability in relation to the study as a whole.
QUESTION 3:
Flights_airlines <- left_join(Flights,
airlines,
by = c("AIRLINE" = "IATA_CODE"))
Flights_originalairports <- left_join(Flights_airlines,
airports,
by = c("ORIGIN_AIRPORT"= "IATA_CODE"))
Flights_merged <- left_join(Flights_originalairports,
airports,
by = c("DESTINATION_AIRPORT" = "IATA_CODE"))
nrow(Flights_merged)
## [1] 7763
ncol(Flights_merged)
## [1] 40
QUESTION 4:
head(Flights_merged)
## FLIGHT_DATE AIRLINE FLIGHT_NUMBER ORIGIN_AIRPORT DESTINATION_AIRPORT
## 1 1/1/2015 MQ 3355 CVG MIA
## 2 1/1/2015 MQ 3597 CVG DFW
## 3 1/1/2015 OO 4811 BNA CVG
## 4 1/1/2015 DL 2315 CVG ATL
## 5 1/1/2015 F9 1308 IAD CVG
## 6 1/1/2015 DL 1502 CVG TPA
## SCHEDULED_DEPARTURE DEPARTURE_TIME DEPARTURE_DELAY TAXI_OUT WHEELS_OFF
## 1 6:00:00 AM 5:56:00 AM -4 15 6:11:00 AM
## 2 6:00:00 AM <NA> NA NA <NA>
## 3 6:45:00 AM 6:37:00 AM -8 16 6:53:00 AM
## 4 7:10:00 AM 7:08:00 AM -2 36 7:44:00 AM
## 5 7:15:00 AM 7:05:00 AM -10 11 7:16:00 AM
## 6 7:20:00 AM 7:19:00 AM -1 9 7:28:00 AM
## SCHEDULED_TIME ELAPSED_TIME AIR_TIME DISTANCE WHEELS_ON TAXI_IN
## 1 153 150 123 948 8:14:00 AM 12
## 2 165 NA NA 812 <NA> NA
## 3 72 66 45 230 8:38:00 AM 5
## 4 102 101 62 373 8:46:00 AM 3
## 5 95 91 75 388 8:31:00 AM 5
## 6 141 116 105 773 9:13:00 AM 2
## SCHEDULED_ARRIVAL ARRIVAL_TIME ARRIVAL_DELAY DIVERTED CANCELLED
## 1 8:33:00 AM 8:26:00 AM -7 0 0
## 2 7:45:00 AM <NA> NA 0 1
## 3 8:57:00 AM 8:43:00 AM -14 0 0
## 4 8:52:00 AM 8:49:00 AM -3 0 0
## 5 8:50:00 AM 8:36:00 AM -14 0 0
## 6 9:41:00 AM 9:15:00 AM -26 0 0
## CANCELLATION_REASON AIR_SYSTEM_DELAY SECURITY_DELAY AIRLINE_DELAY
## 1 <NA> NA NA NA
## 2 B NA NA NA
## 3 <NA> NA NA NA
## 4 <NA> NA NA NA
## 5 <NA> NA NA NA
## 6 <NA> NA NA NA
## LATE_AIRCRAFT_DELAY WEATHER_DELAY AIRLINE.y
## 1 NA NA American Eagle Airlines Inc.
## 2 NA NA American Eagle Airlines Inc.
## 3 NA NA Skywest Airlines Inc.
## 4 NA NA Delta Air Lines Inc.
## 5 NA NA Frontier Airlines Inc.
## 6 NA NA Delta Air Lines Inc.
## AIRPORT.x CITY.x STATE.x
## 1 Cincinnati/Northern Kentucky International Airport Covington KY
## 2 Cincinnati/Northern Kentucky International Airport Covington KY
## 3 Nashville International Airport Nashville TN
## 4 Cincinnati/Northern Kentucky International Airport Covington KY
## 5 Washington Dulles International Airport Chantilly VA
## 6 Cincinnati/Northern Kentucky International Airport Covington KY
## COUNTRY.x LATITUDE.x LONGITUDE.x
## 1 USA 39.04614 -84.66217
## 2 USA 39.04614 -84.66217
## 3 USA 36.12448 -86.67818
## 4 USA 39.04614 -84.66217
## 5 USA 38.94453 -77.45581
## 6 USA 39.04614 -84.66217
## AIRPORT.y CITY.y STATE.y
## 1 Miami International Airport Miami FL
## 2 Dallas/Fort Worth International Airport Dallas-Fort Worth TX
## 3 Cincinnati/Northern Kentucky International Airport Covington KY
## 4 Hartsfield-Jackson Atlanta International Airport Atlanta GA
## 5 Cincinnati/Northern Kentucky International Airport Covington KY
## 6 Tampa International Airport Tampa FL
## COUNTRY.y LATITUDE.y LONGITUDE.y
## 1 USA 25.79325 -80.29056
## 2 USA 32.89595 -97.03720
## 3 USA 39.04614 -84.66217
## 4 USA 33.64044 -84.42694
## 5 USA 39.04614 -84.66217
## 6 USA 27.97547 -82.53325
QUESTION 5:
missing_values <- sapply(Flights_merged, function(x) sum(is.na(x)))
missing_values
## FLIGHT_DATE AIRLINE FLIGHT_NUMBER ORIGIN_AIRPORT
## 0 0 0 0
## DESTINATION_AIRPORT SCHEDULED_DEPARTURE DEPARTURE_TIME DEPARTURE_DELAY
## 0 0 370 370
## TAXI_OUT WHEELS_OFF SCHEDULED_TIME ELAPSED_TIME
## 378 378 0 392
## AIR_TIME DISTANCE WHEELS_ON TAXI_IN
## 392 0 381 381
## SCHEDULED_ARRIVAL ARRIVAL_TIME ARRIVAL_DELAY DIVERTED
## 0 381 392 0
## CANCELLED CANCELLATION_REASON AIR_SYSTEM_DELAY SECURITY_DELAY
## 0 7385 5814 5814
## AIRLINE_DELAY LATE_AIRCRAFT_DELAY WEATHER_DELAY AIRLINE.y
## 5814 5814 5814 0
## AIRPORT.x CITY.x STATE.x COUNTRY.x
## 0 0 0 0
## LATITUDE.x LONGITUDE.x AIRPORT.y CITY.y
## 0 0 0 0
## STATE.y COUNTRY.y LATITUDE.y LONGITUDE.y
## 0 0 0 0
percentage_missing <- missing_values / nrow(Flights_merged) * 100
percentage_missing
## FLIGHT_DATE AIRLINE FLIGHT_NUMBER ORIGIN_AIRPORT
## 0.000000 0.000000 0.000000 0.000000
## DESTINATION_AIRPORT SCHEDULED_DEPARTURE DEPARTURE_TIME DEPARTURE_DELAY
## 0.000000 0.000000 4.766199 4.766199
## TAXI_OUT WHEELS_OFF SCHEDULED_TIME ELAPSED_TIME
## 4.869252 4.869252 0.000000 5.049594
## AIR_TIME DISTANCE WHEELS_ON TAXI_IN
## 5.049594 0.000000 4.907896 4.907896
## SCHEDULED_ARRIVAL ARRIVAL_TIME ARRIVAL_DELAY DIVERTED
## 0.000000 4.907896 5.049594 0.000000
## CANCELLED CANCELLATION_REASON AIR_SYSTEM_DELAY SECURITY_DELAY
## 0.000000 95.130748 74.893727 74.893727
## AIRLINE_DELAY LATE_AIRCRAFT_DELAY WEATHER_DELAY AIRLINE.y
## 74.893727 74.893727 74.893727 0.000000
## AIRPORT.x CITY.x STATE.x COUNTRY.x
## 0.000000 0.000000 0.000000 0.000000
## LATITUDE.x LONGITUDE.x AIRPORT.y CITY.y
## 0.000000 0.000000 0.000000 0.000000
## STATE.y COUNTRY.y LATITUDE.y LONGITUDE.y
## 0.000000 0.000000 0.000000 0.000000
QUESTION 6:
proportion_cancelled <- sum(is.na(Flights_merged$DEPARTURE_TIME)) / nrow(Flights_merged)
proportion_cancelled
## [1] 0.04766199
num_cancel_reasons <- length(unique(Flights_merged$CANCELLATION_REASON))
num_cancel_reasons
## [1] 4
QUESTION 7:
There are missing values in DEPARTURE_TIME. That means, when the flight it cancelled, we will not have a DEPARTURE_TIME.
cancel_departure_table <- table(is.na(Flights_merged$DEPARTURE_TIME), Flights_merged$CANCELLED)
cancel_departure_table
##
## 0 1
## FALSE 7385 8
## TRUE 0 370
QUESTION 8:
Flights_merged$TIME_DIFFERENCE <- Flights_merged$SCHEDULED_TIME-Flights_merged$ELAPSED_TIME
head(Flights_merged$TIME_DIFFERENCE)
## [1] 3 NA 6 1 4 25
QUESTION 9:
delta_cvg_delayed <- Flights_merged[
Flights_merged$Airline == "Delta" &
Flights_merged$Origin == "CVG" &
Flights_merged$DEPARTURE_DELAY > 30,
]
delta_cvg_delayed
## [1] FLIGHT_DATE AIRLINE FLIGHT_NUMBER
## [4] ORIGIN_AIRPORT DESTINATION_AIRPORT SCHEDULED_DEPARTURE
## [7] DEPARTURE_TIME DEPARTURE_DELAY TAXI_OUT
## [10] WHEELS_OFF SCHEDULED_TIME ELAPSED_TIME
## [13] AIR_TIME DISTANCE WHEELS_ON
## [16] TAXI_IN SCHEDULED_ARRIVAL ARRIVAL_TIME
## [19] ARRIVAL_DELAY DIVERTED CANCELLED
## [22] CANCELLATION_REASON AIR_SYSTEM_DELAY SECURITY_DELAY
## [25] AIRLINE_DELAY LATE_AIRCRAFT_DELAY WEATHER_DELAY
## [28] AIRLINE.y AIRPORT.x CITY.x
## [31] STATE.x COUNTRY.x LATITUDE.x
## [34] LONGITUDE.x AIRPORT.y CITY.y
## [37] STATE.y COUNTRY.y LATITUDE.y
## [40] LONGITUDE.y TIME_DIFFERENCE
## <0 rows> (or 0-length row.names)
head(delta_cvg_delayed)
## [1] FLIGHT_DATE AIRLINE FLIGHT_NUMBER
## [4] ORIGIN_AIRPORT DESTINATION_AIRPORT SCHEDULED_DEPARTURE
## [7] DEPARTURE_TIME DEPARTURE_DELAY TAXI_OUT
## [10] WHEELS_OFF SCHEDULED_TIME ELAPSED_TIME
## [13] AIR_TIME DISTANCE WHEELS_ON
## [16] TAXI_IN SCHEDULED_ARRIVAL ARRIVAL_TIME
## [19] ARRIVAL_DELAY DIVERTED CANCELLED
## [22] CANCELLATION_REASON AIR_SYSTEM_DELAY SECURITY_DELAY
## [25] AIRLINE_DELAY LATE_AIRCRAFT_DELAY WEATHER_DELAY
## [28] AIRLINE.y AIRPORT.x CITY.x
## [31] STATE.x COUNTRY.x LATITUDE.x
## [34] LONGITUDE.x AIRPORT.y CITY.y
## [37] STATE.y COUNTRY.y LATITUDE.y
## [40] LONGITUDE.y TIME_DIFFERENCE
## <0 rows> (or 0-length row.names)
QUESTION 10:
Flights_merged %>%
group_by(AIRLINE) %>%
summarize(avg_delay = mean(DEPARTURE_DELAY, na.rm = TRUE)) %>%
arrange(avg_delay)
## # A tibble: 5 × 2
## AIRLINE avg_delay
## <chr> <dbl>
## 1 DL 6.09
## 2 EV 10.4
## 3 OO 11.1
## 4 MQ 19.1
## 5 F9 23.9
QUESTION 11:
avg_deprature <- Flights_merged %>%
group_by(ORIGIN_AIRPORT) %>%
summarize(avg_delay = mean(DEPARTURE_DELAY, na.rm = TRUE)) %>%
arrange(desc(avg_delay)) %>%
head(6)
avg_deprature
## # A tibble: 6 × 2
## ORIGIN_AIRPORT avg_delay
## <chr> <dbl>
## 1 TTN 74.3
## 2 RDU 70.8
## 3 PIT 33.5
## 4 JFK 27.4
## 5 LAS 26.0
## 6 LGA 25.6
QUESTION 12:
Total_Flights <- Flights_merged %>%
filter(ORIGIN_AIRPORT == "CVG") %>%
group_by(AIRLINE) %>%
summarise(total_flights = n())
Total_Flights
## # A tibble: 5 × 2
## AIRLINE total_flights
## <chr> <int>
## 1 DL 1149
## 2 EV 776
## 3 F9 258
## 4 MQ 1041
## 5 OO 662