library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.1 ✔ stringr 1.5.2
## ✔ ggplot2 4.0.0 ✔ tibble 3.3.0
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.1.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# Directory that holds the project data. The path is built with file.path()
# rather than setwd(), so the session's working directory is left untouched.
data_dir <- "C:/Users/Thu Nguyen/Downloads/Project2dataset"

# Read the CSV once. `df` is kept as a second name for the raw data because
# later code refers to both `df` and `flight`.
flight <- read.csv(file.path(data_dir, "airline_delay.csv"))
df <- flight
Question: Which airline is worth traveling with?
Airline delays are a major concern for travelers, airlines, and airport operations, as they affect customer satisfaction, scheduling efficiency, and overall transportation reliability. The airline_delay.csv dataset provides a detailed look into flight performance by capturing key factors such as departure times, arrival times, delay durations, and airline information. By analyzing these patterns, we can better understand the causes of delays, identify trends across airlines or routes, and develop insights that support more efficient decision-making within the aviation industry. This dataset serves as a valuable foundation for exploring how flight delays occur and what can be done to minimize their impact.
This dataset includes 3,351 rows and 21 variables, covering flight delays at airports in many different cities.
I will use a single-proportion test in this project because it will help me evaluate which flights have the fewest delays. We will compare multiple variables in the dataset to understand the situation. This dataset would be right-tailed.
The dataset contains 3,351 records and 21 variables, representing monthly airline delay statistics across various U.S. airports and carriers. All core identifying fields, such as year, month, carrier, carrier name, airport, and airport name, are fully populated. Most numerical delay-related fields have 3,343 valid entries, indicating only minimal missing data. I examined the dataset for missing values using colSums(is.na()) and confirmed that the identifying variables in the flight data contained no NA entries, ensuring their suitability for accurate calculations. Additionally, I used functions such as head(), summary(), str(), and table() to gain a comprehensive understanding of the dataset’s structure and key characteristics.
# Count the missing values in each column of `flight`.
flight %>%
  is.na() %>%
  colSums()
## year month carrier carrier_name
## 0 0 0 0
## airport airport_name arr_flights arr_del15
## 0 0 8 8
## carrier_ct weather_ct nas_ct security_ct
## 8 8 8 8
## late_aircraft_ct arr_cancelled arr_diverted arr_delay
## 8 8 8 8
## carrier_delay weather_delay nas_delay security_delay
## 8 8 8 8
## late_aircraft_delay
## 8
# Drop every row that has a missing value. `airport` has no NAs (see the
# colSums() counts), so filtering on it removed nothing; the NAs sit in the
# numeric delay columns such as `arr_flights` and `arr_del15`.
flight <- drop_na(flight)

# Preview the first six rows of the cleaned data.
head(flight)
## year month carrier carrier_name airport
## 1 2020 12 9E Endeavor Air Inc. ABE
## 2 2020 12 9E Endeavor Air Inc. ABY
## 3 2020 12 9E Endeavor Air Inc. AEX
## 4 2020 12 9E Endeavor Air Inc. AGS
## 5 2020 12 9E Endeavor Air Inc. ALB
## 6 2020 12 9E Endeavor Air Inc. ATL
## airport_name arr_flights
## 1 Allentown/Bethlehem/Easton, PA: Lehigh Valley International 44
## 2 Albany, GA: Southwest Georgia Regional 90
## 3 Alexandria, LA: Alexandria International 88
## 4 Augusta, GA: Augusta Regional at Bush Field 184
## 5 Albany, NY: Albany International 76
## 6 Atlanta, GA: Hartsfield-Jackson Atlanta International 5985
## arr_del15 carrier_ct weather_ct nas_ct security_ct late_aircraft_ct
## 1 3 1.63 0.00 0.12 0 1.25
## 2 1 0.96 0.00 0.04 0 0.00
## 3 8 5.75 0.00 1.60 0 0.65
## 4 9 4.17 0.00 1.83 0 3.00
## 5 11 4.78 0.00 5.22 0 1.00
## 6 445 142.89 11.96 161.37 1 127.79
## arr_cancelled arr_diverted arr_delay carrier_delay weather_delay nas_delay
## 1 0 1 89 56 0 3
## 2 0 0 23 22 0 1
## 3 0 1 338 265 0 45
## 4 0 0 508 192 0 92
## 5 1 0 692 398 0 178
## 6 5 0 30756 16390 1509 5060
## security_delay late_aircraft_delay
## 1 0 30
## 2 0 0
## 3 0 28
## 4 0 224
## 5 0 116
## 6 16 7781
# Per-column summary statistics for `flight`.
flight %>% summary()
## year month carrier carrier_name
## Min. :2019 Min. :12 Length:3351 Length:3351
## 1st Qu.:2019 1st Qu.:12 Class :character Class :character
## Median :2019 Median :12 Mode :character Mode :character
## Mean :2019 Mean :12
## 3rd Qu.:2020 3rd Qu.:12
## Max. :2020 Max. :12
##
## airport airport_name arr_flights arr_del15
## Length:3351 Length:3351 Min. : 1.0 Min. : 0
## Class :character Class :character 1st Qu.: 35.0 1st Qu.: 5
## Mode :character Mode :character Median : 83.0 Median : 12
## Mean : 298.3 Mean : 51
## 3rd Qu.: 194.5 3rd Qu.: 33
## Max. :19713.0 Max. :2289
## NA's :8 NA's :8
## carrier_ct weather_ct nas_ct security_ct
## Min. : 0.00 Min. : 0.000 Min. : 0.00 Min. : 0.0000
## 1st Qu.: 1.49 1st Qu.: 0.000 1st Qu.: 0.82 1st Qu.: 0.0000
## Median : 4.75 Median : 0.060 Median : 2.98 Median : 0.0000
## Mean : 16.07 Mean : 1.443 Mean : 16.18 Mean : 0.1373
## 3rd Qu.: 12.26 3rd Qu.: 1.010 3rd Qu.: 8.87 3rd Qu.: 0.0000
## Max. :697.00 Max. :89.420 Max. :1039.54 Max. :17.3100
## NA's :8 NA's :8 NA's :8 NA's :8
## late_aircraft_ct arr_cancelled arr_diverted arr_delay
## Min. : 0.00 Min. : 0.000 Min. : 0.0000 Min. : 0
## 1st Qu.: 0.90 1st Qu.: 0.000 1st Qu.: 0.0000 1st Qu.: 230
## Median : 3.28 Median : 0.000 Median : 0.0000 Median : 746
## Mean : 17.17 Mean : 2.885 Mean : 0.5758 Mean : 3334
## 3rd Qu.: 10.24 3rd Qu.: 2.000 3rd Qu.: 0.0000 3rd Qu.: 2096
## Max. :819.66 Max. :224.000 Max. :42.0000 Max. :160383
## NA's :8 NA's :8 NA's :8 NA's :8
## carrier_delay weather_delay nas_delay security_delay
## Min. : 0.0 Min. : 0.0 Min. : 0.0 Min. : 0.000
## 1st Qu.: 68.5 1st Qu.: 0.0 1st Qu.: 21.5 1st Qu.: 0.000
## Median : 272.0 Median : 3.0 Median : 106.0 Median : 0.000
## Mean : 1144.8 Mean : 177.6 Mean : 749.6 Mean : 5.401
## 3rd Qu.: 830.5 3rd Qu.: 82.0 3rd Qu.: 362.0 3rd Qu.: 0.000
## Max. :55215.0 Max. :14219.0 Max. :82064.0 Max. :553.000
## NA's :8 NA's :8 NA's :8 NA's :8
## late_aircraft_delay
## Min. : 0
## 1st Qu.: 31
## Median : 205
## Mean : 1257
## 3rd Qu.: 724
## Max. :75179
## NA's :8
# Show each column's type together with its first few values.
flight %>% str()
## 'data.frame': 3351 obs. of 21 variables:
## $ year : int 2020 2020 2020 2020 2020 2020 2020 2020 2020 2020 ...
## $ month : int 12 12 12 12 12 12 12 12 12 12 ...
## $ carrier : chr "9E" "9E" "9E" "9E" ...
## $ carrier_name : chr "Endeavor Air Inc." "Endeavor Air Inc." "Endeavor Air Inc." "Endeavor Air Inc." ...
## $ airport : chr "ABE" "ABY" "AEX" "AGS" ...
## $ airport_name : chr "Allentown/Bethlehem/Easton, PA: Lehigh Valley International" "Albany, GA: Southwest Georgia Regional" "Alexandria, LA: Alexandria International" "Augusta, GA: Augusta Regional at Bush Field" ...
## $ arr_flights : int 44 90 88 184 76 5985 142 147 84 150 ...
## $ arr_del15 : int 3 1 8 9 11 445 14 10 14 19 ...
## $ carrier_ct : num 1.63 0.96 5.75 4.17 4.78 ...
## $ weather_ct : num 0 0 0 0 0 ...
## $ nas_ct : num 0.12 0.04 1.6 1.83 5.22 ...
## $ security_ct : num 0 0 0 0 0 1 0 0 0 0 ...
## $ late_aircraft_ct : num 1.25 0 0.65 3 1 ...
## $ arr_cancelled : int 0 0 0 0 1 5 1 0 1 3 ...
## $ arr_diverted : int 1 0 1 0 0 0 0 1 1 0 ...
## $ arr_delay : int 89 23 338 508 692 30756 436 1070 2006 846 ...
## $ carrier_delay : int 56 22 265 192 398 16390 162 838 1164 423 ...
## $ weather_delay : int 0 0 0 0 0 1509 0 141 619 0 ...
## $ nas_delay : int 3 1 45 92 178 5060 182 24 223 389 ...
## $ security_delay : int 0 0 0 0 0 16 0 0 0 0 ...
## $ late_aircraft_delay: int 30 0 28 224 116 7781 92 67 0 34 ...
# Number of rows, then number of columns, in `flight`.
c(nrow(flight), ncol(flight))
## [1] 3351 21
# Tally how many rows each airport code has.
table(flight[["airport"]])
##
## ABE ABI ABQ ABR ABY ACK ACT ACV ACY ADK ADQ AEX AGS ALB ALO ALS AMA ANC APN ASE
## 11 3 23 2 2 1 3 2 2 2 2 6 7 23 2 1 10 6 2 2
## ATL ATW ATY AUS AVL AVP AZA AZO BDL BET BFF BFL BFM BGM BGR BHM BIL BIS BJI BLI
## 28 10 2 28 14 6 2 5 26 2 2 4 1 2 10 18 12 8 2 2
## BLV BMI BNA BOI BOS BPT BQK BQN BRD BRO BRW BTM BTR BTV BUF BUR BWI BZN CAE CAK
## 2 11 31 17 27 2 2 3 2 7 2 2 11 16 24 15 27 20 12 9
## CDB CDC CDV CGI CHA CHO CHS CID CIU CKB CLE CLL CLT CMH CMI CMX CNY COD COS COU
## 1 2 2 2 14 7 28 18 2 4 29 5 26 27 2 2 2 2 14 4
## CPR CRP CRW CSG CVG CWA CYS DAB DAL DAY DBQ DCA DDC DEC DEN DFW DHN DIK DLH DRO
## 2 9 7 2 25 5 2 5 8 14 1 27 1 1 21 27 2 1 4 4
## DRT DSM DTW DVL EAR EAU ECP EGE EKO ELM ELP ERI ESC EUG EVV EWN EWR EYW FAI FAR
## 1 23 29 2 2 2 11 8 2 5 20 3 2 6 8 3 25 11 5 16
## FAT FAY FCA FLG FLL FNT FSD FSM FWA GCC GCK GEG GFK GGG GJT GNV GPT GRB GRI GRK
## 11 6 8 4 21 10 18 5 9 2 2 16 5 2 9 9 12 12 4 5
## GRR GSO GSP GTF GTR GUC GUM HDN HGR HHH HIB HLN HNL HOB HOU HPN HRL HSV HTS HVN
## 23 19 21 4 2 5 2 14 2 2 2 2 12 3 12 7 12 15 2 1
## HYS IAD IAG IAH ICT IDA ILM IMT IND INL ISP ITH ITO JAC JAN JAX JFK JLN JMS JNU
## 2 22 3 26 19 4 11 2 29 2 4 2 4 8 11 28 16 4 2 2
## JST KOA KTN LAN LAR LAS LAW LAX LBB LBE LBF LBL LCH LCK LEX LFT LGA LGB LIH LIT
## 1 12 2 10 2 22 3 22 10 2 2 2 3 2 14 9 24 10 12 22
## LNK LRD LSE LWB LWS LYH MAF MBS MCI MCO MDT MDW MEI MEM MFE MFR MGM MHK MHT MIA
## 6 7 4 2 2 2 11 4 28 20 16 8 2 25 13 6 5 3 10 15
## MKE MKG MLB MLI MLU MMH MOB MOT MQT MRY MSN MSO MSP MSY MTJ MYR OAJ OAK OGD OGG
## 23 2 6 11 7 1 11 6 4 8 18 10 27 30 15 15 4 17 2 12
## OGS OKC OMA OME ONT ORD ORF ORH OTH OTZ OWB PAE PAH PBG PBI PDX PGD PGV PHF PHL
## 3 24 27 2 17 23 25 2 2 2 2 2 2 5 22 20 2 1 4 27
## PHX PIA PIB PIE PIH PIR PIT PLN PNS PPG PRC PSC PSE PSG PSM PSP PUB PVD PVU PWM
## 23 11 2 2 2 2 31 2 19 1 2 4 1 2 2 20 2 21 2 20
## RAP RDD RDM RDU RFD RHI RIC RIW RKS RNO ROA ROC ROW RST RSW SAF SAN SAT SAV SBA
## 12 2 5 30 2 2 26 1 2 19 11 22 3 6 23 3 21 22 23 9
## SBN SBP SCC SCE SCK SDF SEA SFB SFO SGF SGU SHD SHR SHV SIT SJC SJT SJU SLC SLN
## 8 4 2 5 3 22 21 2 18 10 2 2 1 12 2 18 2 15 21 2
## SMF SMX SNA SPI SPN SPS SRQ STC STL STS STT STX SUN SUX SWF SWO SYR TLH TOL TPA
## 22 2 15 4 2 2 23 2 25 6 10 6 2 3 4 2 21 12 5 22
## TRI TTN TUL TUS TVC TWF TXK TYR TYS UIN USA VCT VEL VLD VPS WRG XNA XWA YAK YUM
## 8 2 23 20 10 2 2 3 18 1 2 1 2 2 16 2 18 2 2 4
library(ggplot2)
library(dplyr)
# Mean of `arr_del15` per carrier, ignoring missing values.
# NOTE(review): `arr_del15` appears to be a per-row count of delayed arrivals,
# so this mean grows with carrier size; a rate such as
# sum(arr_del15) / sum(arr_flights) may compare airlines more fairly -- confirm.
df_bar <- summarise(
  group_by(df, carrier_name),
  avg_delay = mean(arr_del15, na.rm = TRUE)
)

# Horizontal bar chart, carriers ordered by their mean value.
ggplot(
  data = df_bar,
  mapping = aes(x = reorder(carrier_name, avg_delay), y = avg_delay)
) +
  geom_col() +
  coord_flip() +
  labs(
    title = "Average Arrival Delays by Airline",
    x = "Airline",
    y = "Average Delays (15+ min)"
  ) +
  theme_minimal()
# Share of arriving flights that were delayed: total of `arr_del15` divided by
# total of `arr_flights`. The previous comparison `airport == "Yes"` never
# matched, because `airport` holds airport codes, so the result was always 0.
# NOTE(review): assumes `arr_del15` counts arrivals delayed 15+ minutes --
# confirm against the data dictionary.
sample_proportion <- sum(flight$arr_del15, na.rm = TRUE) /
  sum(flight$arr_flights, na.rm = TRUE)
sample_proportion
## [1] 0
Test
# One-sample proportion test: 500 successes out of 3034 trials against a null
# proportion of 0.4, with the one-sided alternative that the true proportion
# is lower.
# NOTE(review): the counts are hard-coded rather than derived from `flight` --
# confirm where they come from.
prop.test(x = 500, n = 3034, p = 0.4, alternative = "less")
##
## 1-sample proportions test with continuity correction
##
## data: 500 out of 3034, null probability 0.4
## X-squared = 698.35, df = 1, p-value < 2.2e-16
## alternative hypothesis: true p is less than 0.4
## 95 percent confidence interval:
## 0.0000000 0.1763446
## sample estimates:
## p
## 0.1647989
After analysis, the dataset provided a detailed overview of flight delay patterns across various airlines, airports, and time periods. The data show that the leading causes of delays are late aircraft and NAS-related issues, reflecting broader scheduling and airspace congestion challenges rather than isolated weather or security events. By examining delay counts, durations, and operational trends, the dataset highlights that some carriers and airports face disproportionately higher disruptions. Notably, Allegiant Airlines stands out as the top-performing carrier, consistently experiencing the fewest delays. Overall, this dataset offers valuable insights into the root causes of airline delays and serves as a strong foundation for further understanding of the airline network.