Global Civil Aviation Disruption — Data Cleaning

Madison 2026-03-09


0. Load Libraries

library(tidyverse)
library(lubridate)

1. Load Raw Data

Data can be found at https://www.kaggle.com/datasets/zkskhurram/global-civil-aviation-disruption2026-iranus-war

flight_cancellations_raw  <- read_csv("flight_cancellations.csv")
conflict_events_raw       <- read_csv("conflict_events.csv")
airline_losses_raw        <- read_csv("airline_losses_estimate.csv")
airport_disruptions_raw   <- read_csv("airport_disruptions.csv")
airspace_closures_raw     <- read_csv("airspace_closures.csv")
flight_reroutes_raw       <- read_csv("flight_reroutes.csv")

2. Flight Cancellations

2.1 Initial Inspection

glimpse(flight_cancellations_raw)
## Rows: 50
## Columns: 9
## $ date                <date> 2026-02-28, 2026-02-28, 2026-02-28, 2026-02-28, 2…
## $ airport             <chr> "Dubai International (DXB)", "Tehran Imam Khomeini…
## $ country             <chr> "UAE", "Iran", "Israel", "Israel", "Qatar", "Qatar…
## $ airline             <chr> "Emirates", "Iran Air", "El Al", "Wizz Air", "Qata…
## $ flight_number       <chr> "EK0912", "IR0702", "LY0082", "W60001", "QR0502", …
## $ origin              <chr> "DXB", "IKA", "TLV", "TLV", "DOH", "DOH", "DXB", "…
## $ destination         <chr> "IKA", "DXB", "JFK", "LTN", "IKA", "THR", "IKA", "…
## $ cancellation_reason <chr> "Airspace closed - Iranian FIR shutdown", "Airspac…
## $ aircraft_type       <chr> "Boeing 777-300ER", "Airbus A320-200", "Boeing 787…
summary(flight_cancellations_raw)
##       date              airport            country            airline         
##  Min.   :2026-02-28   Length:50          Length:50          Length:50         
##  1st Qu.:2026-03-01   Class :character   Class :character   Class :character  
##  Median :2026-03-02   Mode  :character   Mode  :character   Mode  :character  
##  Mean   :2026-03-02                                                           
##  3rd Qu.:2026-03-04                                                           
##  Max.   :2026-03-07                                                           
##  flight_number         origin          destination        cancellation_reason
##  Length:50          Length:50          Length:50          Length:50          
##  Class :character   Class :character   Class :character   Class :character   
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character   
##                                                                              
##                                                                              
##                                                                              
##  aircraft_type     
##  Length:50         
##  Class :character  
##  Mode  :character  
##                    
##                    
## 
colSums(is.na(flight_cancellations_raw))
##                date             airport             country             airline 
##                   0                   0                   0                   0 
##       flight_number              origin         destination cancellation_reason 
##                   0                   0                   0                   0 
##       aircraft_type 
##                   0

2.2 Clean

flight_cancellations <- flight_cancellations_raw %>%
  
  # Parse date
  mutate(date = ymd(date)) %>%
  
  # Extract IATA code from airport name e.g. "Dubai International (DXB)" -> "DXB"
  mutate(iata = str_extract(airport, "(?<=\\()\\w{3}(?=\\))"),
         airport_name = str_trim(str_remove(airport, "\\s*\\(\\w{3}\\)"))) %>%
  
  # Standardize airport name inconsistency: "Ataturk / Istanbul Airport" -> "Istanbul Airport"
  mutate(airport_name = str_replace(airport_name, "Ataturk / Istanbul Airport", "Istanbul Airport")) %>%
  
  # Standardize text columns to title case
  mutate(across(c(country, airline, cancellation_reason, aircraft_type), str_to_title)) %>%
  
  # Reorder columns cleanly
  select(date, iata, airport_name, country, airline, flight_number,
         origin, destination, cancellation_reason, aircraft_type) %>%
  
  arrange(date, iata)

2.3 Validation

# Check IATA extraction worked
flight_cancellations %>% count(iata) %>% arrange(desc(n))
## # A tibble: 20 × 2
##    iata      n
##    <chr> <int>
##  1 DXB      10
##  2 DOH       6
##  3 IST       5
##  4 TLV       4
##  5 JED       3
##  6 LHR       3
##  7 MCT       3
##  8 FRA       2
##  9 RUH       2
## 10 SVO       2
## 11 AMM       1
## 12 AMS       1
## 13 BAH       1
## 14 CDG       1
## 15 DEL       1
## 16 GYD       1
## 17 IKA       1
## 18 KHI       1
## 19 KWI       1
## 20 LHE       1
# Verify no remaining name inconsistencies
flight_cancellations %>% distinct(airport_name) %>% arrange(airport_name)
## # A tibble: 20 × 1
##    airport_name                
##    <chr>                       
##  1 Amman Queen Alia            
##  2 Amsterdam Schiphol          
##  3 Bahrain International       
##  4 Baku International          
##  5 Ben Gurion International    
##  6 Charles de Gaulle           
##  7 Dubai International         
##  8 Frankfurt                   
##  9 Hamad International         
## 10 Heathrow                    
## 11 Indira Gandhi International 
## 12 Istanbul Airport            
## 13 Karachi Jinnah International
## 14 King Abdulaziz International
## 15 King Khalid International   
## 16 Kuwait International        
## 17 Lahore Allama Iqbal         
## 18 Muscat International        
## 19 Sheremetyevo International  
## 20 Tehran Imam Khomeini
# Check date range
range(flight_cancellations$date)
## [1] "2026-02-28" "2026-03-07"
# Remaining NAs
colSums(is.na(flight_cancellations))
##                date                iata        airport_name             country 
##                   0                   0                   0                   0 
##             airline       flight_number              origin         destination 
##                   0                   0                   0                   0 
## cancellation_reason       aircraft_type 
##                   0                   0

3. Conflict Events

3.1 Initial Inspection

glimpse(conflict_events_raw)
## Rows: 28
## Columns: 9
## $ date            <date> 2026-02-28, 2026-02-28, 2026-02-28, 2026-02-28, 2026-…
## $ time_utc        <time> 21:02:00, 21:15:00, 21:45:00, 22:10:00, 22:30:00, 23:…
## $ location        <chr> "Natanz Nuclear Facility (Isfahan)", "Fordow Nuclear F…
## $ latitude        <dbl> 33.7225, 34.8846, 35.5233, 29.4752, 27.2220, 26.6000, …
## $ longitude       <dbl> 51.7261, 50.5925, 51.7760, 52.5903, 56.3777, 56.4000, …
## $ event_type      <chr> "US Airstrike - Precision Strike", "US Airstrike - Bun…
## $ aviation_impact <chr> "Immediate Iranian FIR closure", "Qom regional no-fly …
## $ severity        <chr> "CRITICAL", "CRITICAL", "CRITICAL", "HIGH", "HIGH", "C…
## $ source          <chr> "Reuters / AP / Flightradar24", "BBC World / FlightAwa…
colSums(is.na(conflict_events_raw))
##            date        time_utc        location        latitude       longitude 
##               0               0               0               0               0 
##      event_type aviation_impact        severity          source 
##               0               0               0               0

3.2 Clean

conflict_events <- conflict_events_raw %>%
  
  # Combine date + time_utc into single datetime
  mutate(datetime_utc = ymd_hms(paste(date, time_utc)),
         date = ymd(date)) %>%
  
  # Standardize severity to ordered factor
  mutate(severity = factor(severity,
                           levels = c("LOW", "MEDIUM", "HIGH", "CRITICAL"),
                           ordered = TRUE)) %>%
  
  # Standardize event_type to title case
  mutate(event_type = str_to_title(event_type)) %>%
  
  # Flag the geographic outlier (UN Security Council = New York, not a conflict site)
  mutate(is_diplomatic = if_else(str_detect(location, "UN Security Council"), TRUE, FALSE)) %>%
  
  # Reorder columns
  select(date, datetime_utc, location, latitude, longitude,
         event_type, severity, aviation_impact, is_diplomatic, source) %>%
  
  arrange(datetime_utc)

3.3 Validation

# Severity distribution
conflict_events %>% count(severity)
## # A tibble: 4 × 2
##   severity     n
##   <ord>    <int>
## 1 LOW          1
## 2 MEDIUM       4
## 3 HIGH        15
## 4 CRITICAL     8
# Confirm diplomatic outlier flagged
conflict_events %>% filter(is_diplomatic) %>% select(date, location, latitude, longitude)
## # A tibble: 1 × 4
##   date       location                                latitude longitude
##   <date>     <chr>                                      <dbl>     <dbl>
## 1 2026-03-05 UN Security Council - Emergency Session     40.7     -74.0
# Date range
range(conflict_events$date)
## [1] "2026-02-28" "2026-03-07"
# NAs
colSums(is.na(conflict_events))
##            date    datetime_utc        location        latitude       longitude 
##               0               0               0               0               0 
##      event_type        severity aviation_impact   is_diplomatic          source 
##               0               0               0               0               0

4. Airline Losses Estimate

4.1 Initial Inspection

glimpse(airline_losses_raw)
## Rows: 35
## Columns: 7
## $ airline                  <chr> "Emirates", "Qatar Airways", "Etihad Airways"…
## $ country                  <chr> "UAE", "Qatar", "UAE", "Turkey", "UK", "Germa…
## $ estimated_daily_loss_usd <dbl> 4200000, 3100000, 1800000, 2400000, 1950000, …
## $ cancelled_flights        <dbl> 18, 12, 8, 14, 6, 5, 4, 3, 7, 6, 4, 5, 6, 9, …
## $ rerouted_flights         <dbl> 62, 41, 29, 38, 32, 28, 25, 21, 35, 31, 23, 2…
## $ additional_fuel_cost_usd <dbl> 2835200, 1765800, 1023400, 1432600, 1185600, …
## $ passengers_impacted      <dbl> 9180, 6540, 4320, 7700, 4680, 3950, 3400, 280…
colSums(is.na(airline_losses_raw))
##                  airline                  country estimated_daily_loss_usd 
##                        0                        0                        0 
##        cancelled_flights         rerouted_flights additional_fuel_cost_usd 
##                        0                        0                        0 
##      passengers_impacted 
##                        0

4.2 Clean

airline_losses <- airline_losses_raw %>%
  
  # Standardize text columns
  mutate(airline  = str_to_title(airline),
         country  = str_to_title(country)) %>%
  
  # Flag SAS "Scandinavia" country as non-standard
  mutate(country_flag = if_else(country == "Scandinavia",
                                "Non-standard country value", NA_character_)) %>%
  
  # Flag El Al zero reroutes (intentional — full airspace closure, not missing data)
  mutate(zero_reroutes_note = if_else(airline == "El Al" & rerouted_flights == 0,
                                      "Full airspace closure — reroutes not possible", NA_character_)) %>%
  
  # Derive total loss estimate (daily loss is already provided; add cost-per-passenger metric)
  mutate(loss_per_passenger_usd = round(estimated_daily_loss_usd / passengers_impacted, 2)) %>%
  
  arrange(desc(estimated_daily_loss_usd))

4.3 Validation

# Check flagged rows
airline_losses %>% filter(!is.na(country_flag) | !is.na(zero_reroutes_note)) %>%
  select(airline, country, country_flag, zero_reroutes_note)
## # A tibble: 2 × 4
##   airline country     country_flag               zero_reroutes_note             
##   <chr>   <chr>       <chr>                      <chr>                          
## 1 El Al   Israel      <NA>                       Full airspace closure — rerout…
## 2 Sas     Scandinavia Non-standard country value <NA>
# Top 5 airlines by daily loss
airline_losses %>% select(airline, country, estimated_daily_loss_usd) %>% head(5)
## # A tibble: 5 × 3
##   airline            country   estimated_daily_loss_usd
##   <chr>              <chr>                        <dbl>
## 1 Emirates           Uae                        4200000
## 2 Qatar Airways      Qatar                      3100000
## 3 El Al              Israel                     2800000
## 4 Turkish Airlines   Turkey                     2400000
## 5 Singapore Airlines Singapore                  2100000
# NAs
colSums(is.na(airline_losses %>% select(-country_flag, -zero_reroutes_note)))
##                  airline                  country estimated_daily_loss_usd 
##                        0                        0                        0 
##        cancelled_flights         rerouted_flights additional_fuel_cost_usd 
##                        0                        0                        0 
##      passengers_impacted   loss_per_passenger_usd 
##                        0                        0

5. Airport Disruptions

5.1 Initial Inspection

glimpse(airport_disruptions_raw)
## Rows: 35
## Columns: 10
## $ airport           <chr> "Dubai International", "Hamad International", "Ben G…
## $ iata              <chr> "DXB", "DOH", "TLV", "IST", "MCT", "LHR", "KWI", "BA…
## $ icao              <chr> "OMDB", "OTHH", "LLBG", "LTFM", "OOMS", "EGLL", "OKB…
## $ country           <chr> "UAE", "Qatar", "Israel", "Turkey", "Oman", "UK", "K…
## $ latitude          <dbl> 25.2532, 25.2736, 32.0114, 41.2753, 23.5933, 51.4700…
## $ longitude         <dbl> 55.3657, 51.6080, 34.8866, 28.7519, 58.2844, -0.4543…
## $ flights_cancelled <dbl> 87, 62, 128, 44, 38, 22, 46, 34, 52, 41, 28, 35, 22,…
## $ flights_delayed   <dbl> 112, 78, 41, 67, 54, 44, 58, 48, 71, 58, 36, 44, 31,…
## $ flights_diverted  <dbl> 24, 18, 0, 12, 10, 6, 14, 10, 15, 13, 8, 10, 7, 8, 5…
## $ runway_status     <chr> "FULLY OPERATIONAL", "FULLY OPERATIONAL", "RESTRICTE…
colSums(is.na(airport_disruptions_raw))
##           airport              iata              icao           country 
##                 0                 0                 0                 0 
##          latitude         longitude flights_cancelled   flights_delayed 
##                 0                 0                 0                 0 
##  flights_diverted     runway_status 
##                 0                 0

5.2 Clean

airport_disruptions <- airport_disruptions_raw %>%
  
  # Standardize text
  mutate(country      = str_to_title(country),
         runway_status = str_to_title(runway_status)) %>%
  
  # Create simplified operational status category
  mutate(status_category = case_when(
    str_detect(runway_status, "Closed")               ~ "Closed",
    str_detect(runway_status, "Restricted|Partially") ~ "Restricted",
    str_detect(runway_status, "Advisory")             ~ "Advisory",
    str_detect(runway_status, "Fully Operational")    ~ "Operational",
    TRUE ~ "Unknown"
  )) %>%
  
  # Flag the Princess Juliana / Curacao data entry error
  mutate(data_quality_flag = if_else(
    iata == "AUA" & str_detect(airport, "Curacao|Juliana"),
    "Possible IATA/airport mismatch — verify AUA vs SXM",
    NA_character_
  )) %>%
  
  # Derive total disruption metric
  mutate(total_disruptions = flights_cancelled + flights_delayed + flights_diverted) %>%
  
  arrange(desc(flights_cancelled))

5.3 Validation

# Status category breakdown
airport_disruptions %>% count(status_category)
## # A tibble: 4 × 2
##   status_category     n
##   <chr>           <int>
## 1 Advisory            1
## 2 Closed              4
## 3 Operational        20
## 4 Restricted         10
# Flagged data quality issues
airport_disruptions %>% filter(!is.na(data_quality_flag)) %>%
  select(airport, iata, icao, country, data_quality_flag)
## # A tibble: 1 × 5
##   airport                    iata  icao  country data_quality_flag              
##   <chr>                      <chr> <chr> <chr>   <chr>                          
## 1 Princess Juliana / Curacao AUA   TNCA  Aruba   Possible IATA/airport mismatch…
# Iranian airports: confirm 0 delays and 0 diversions
airport_disruptions %>%
  filter(status_category == "Closed") %>%
  select(airport, iata, flights_cancelled, flights_delayed, flights_diverted)
## # A tibble: 4 × 5
##   airport               iata  flights_cancelled flights_delayed flights_diverted
##   <chr>                 <chr>             <dbl>           <dbl>            <dbl>
## 1 Tehran Imam Khomeini  IKA                 210               0                0
## 2 Mashhad International MHD                  62               0                0
## 3 Isfahan International IFN                  48               0                0
## 4 Shiraz International  SYZ                  36               0                0
# NAs
colSums(is.na(airport_disruptions %>% select(-data_quality_flag)))
##           airport              iata              icao           country 
##                 0                 0                 0                 0 
##          latitude         longitude flights_cancelled   flights_delayed 
##                 0                 0                 0                 0 
##  flights_diverted     runway_status   status_category total_disruptions 
##                 0                 0                 0                 0

6. Airspace Closures

6.1 Initial Inspection

glimpse(airspace_closures_raw)
## Rows: 25
## Columns: 7
## $ country            <chr> "Iran", "Iran", "Iraq", "Saudi Arabia", "UAE", "Kuw…
## $ region             <chr> "Tehran FIR", "Esfahan FIR", "Baghdad FIR", "Jeddah…
## $ closure_start_time <dttm> 2026-02-28 21:00:00, 2026-02-28 21:00:00, 2026-03-…
## $ closure_end_time   <dttm> 2026-03-07 23:59:00, 2026-03-07 23:59:00, 2026-03-…
## $ closure_reason     <chr> "Active conflict - military operations", "Active co…
## $ authority          <chr> "ICAO / CAOI", "CAOI", "GCAA Iraq", "GACA", "GCAA U…
## $ NOTAM_reference    <chr> "A1420/26", "A1421/26", "A0330/26", "A0712/26", "A0…
colSums(is.na(airspace_closures_raw))
##            country             region closure_start_time   closure_end_time 
##                  0                  0                  0                  0 
##     closure_reason          authority    NOTAM_reference 
##                  0                  0                  0

6.2 Clean

airspace_closures <- airspace_closures_raw %>%
  
  # Parse ISO 8601 datetimes (already have timezone Z)
  mutate(closure_start_time = ymd_hms(closure_start_time, tz = "UTC"),
         closure_end_time   = ymd_hms(closure_end_time,   tz = "UTC")) %>%
  
  # Engineer closure duration in hours
  mutate(closure_duration_hours = as.numeric(difftime(closure_end_time,
                                                       closure_start_time,
                                                       units = "hours"))) %>%
  
  # Standardize text
  mutate(country = str_to_title(country),
         region  = str_to_title(region)) %>%
  
  # Classify closure severity tier based on duration
  mutate(duration_tier = case_when(
    closure_duration_hours >= 100 ~ "Extended (100h+)",
    closure_duration_hours >= 48  ~ "Prolonged (48-100h)",
    closure_duration_hours >= 12  ~ "Significant (12-48h)",
    TRUE                          ~ "Short (<12h)"
  )) %>%
  
  arrange(desc(closure_duration_hours))

6.3 Validation

# Duration tier breakdown
airspace_closures %>% count(duration_tier)
## # A tibble: 4 × 2
##   duration_tier            n
##   <chr>                <int>
## 1 Extended (100h+)         5
## 2 Prolonged (48-100h)      7
## 3 Short (<12h)             4
## 4 Significant (12-48h)     9
# Longest closures
airspace_closures %>%
  select(country, region, closure_duration_hours, duration_tier) %>%
  head(5)
## # A tibble: 5 × 4
##   country     region       closure_duration_hours duration_tier   
##   <chr>       <chr>                         <dbl> <chr>           
## 1 Iran        Tehran Fir                     171. Extended (100h+)
## 2 Iran        Esfahan Fir                    171. Extended (100h+)
## 3 Israel      Tel Aviv Fir                   170. Extended (100h+)
## 4 Afghanistan Kabul Fir                      136. Extended (100h+)
## 5 Iraq        Baghdad Fir                    108. Extended (100h+)
# Date range
range(airspace_closures$closure_start_time)
## [1] NA NA
# NAs
colSums(is.na(airspace_closures))
##                country                 region     closure_start_time 
##                      0                      0                      3 
##       closure_end_time         closure_reason              authority 
##                      1                      0                      0 
##        NOTAM_reference closure_duration_hours          duration_tier 
##                      0                      4                      0

7. Flight Reroutes

7.1 Initial Inspection

glimpse(flight_reroutes_raw)
## Rows: 45
## Columns: 7
## $ flight_id                <chr> "EK0201-20260301", "EK0203-20260301", "BA0105…
## $ airline                  <chr> "Emirates", "Emirates", "British Airways", "B…
## $ original_route           <chr> "LHR-DXB via standard Gulf", "LHR-DXB via sta…
## $ diverted_route           <chr> "LHR-DXB via Southern Red Sea", "LHR-DXB via …
## $ additional_distance_km   <dbl> 620, 620, 980, 980, 870, 870, 850, 850, 410, …
## $ additional_fuel_cost_usd <dbl> 48400, 48400, 76440, 76440, 67860, 67860, 663…
## $ delay_minutes            <dbl> 55, 52, 88, 91, 77, 80, 75, 72, 38, 35, 42, 4…
colSums(is.na(flight_reroutes_raw))
##                flight_id                  airline           original_route 
##                        0                        0                        0 
##           diverted_route   additional_distance_km additional_fuel_cost_usd 
##                        0                        0                        0 
##            delay_minutes 
##                        0

7.2 Clean

flight_reroutes <- flight_reroutes_raw %>%
  
  # Parse flight_id into components: airline code, flight number, date
  mutate(
    flight_code  = str_extract(flight_id, "^[A-Z0-9]+(?=\\d{4}-)"),
    flight_num   = str_extract(flight_id, "\\d{4}(?=-)"),
    flight_date  = ymd(str_extract(flight_id, "\\d{8}$"))
  ) %>%
  
  # Fix known typo: "DXC-BEY" -> "DXB-BEY" in diverted_route
  mutate(diverted_route = str_replace(diverted_route, "DXC-BEY", "DXB-BEY"),
         typo_corrected = if_else(str_detect(flight_id, "FZ0103"), TRUE, FALSE)) %>%
  
  # Standardize airline name
  mutate(airline = str_to_title(airline)) %>%
  
  # Derive fuel cost per km efficiency metric
  mutate(fuel_cost_per_km = round(additional_fuel_cost_usd / additional_distance_km, 2)) %>%
  
  # Note: original_route and diverted_route are free-text labels — keep as-is, do not parse
  
  select(flight_id, flight_date, airline, original_route, diverted_route,
         additional_distance_km, additional_fuel_cost_usd, fuel_cost_per_km,
         delay_minutes, typo_corrected) %>%
  
  arrange(desc(additional_distance_km))

7.3 Validation

# Confirm typo fix
flight_reroutes %>% filter(typo_corrected) %>%
  select(flight_id, diverted_route, typo_corrected)
## # A tibble: 1 × 3
##   flight_id       diverted_route             typo_corrected
##   <chr>           <chr>                      <lgl>         
## 1 FZ0103-20260302 DXB-BEY via Egyptian route TRUE
# Longest reroutes
flight_reroutes %>%
  select(flight_id, airline, additional_distance_km, additional_fuel_cost_usd, delay_minutes) %>%
  head(5)
## # A tibble: 5 × 5
##   flight_id  airline additional_distance_km additional_fuel_cost…¹ delay_minutes
##   <chr>      <chr>                    <dbl>                  <dbl>         <dbl>
## 1 CX0253-20… Cathay…                   1350                 105300           122
## 2 CX0255-20… Cathay…                   1350                 105300           118
## 3 SQ0317-20… Singap…                   1200                  93600           108
## 4 SQ0319-20… Singap…                   1200                  93600           105
## 5 AI0910-20… Air In…                   1140                  88920           102
## # ℹ abbreviated name: ¹​additional_fuel_cost_usd
# Date range
range(flight_reroutes$flight_date)
## [1] "2026-03-01" "2026-03-03"
# NAs
colSums(is.na(flight_reroutes %>% select(-typo_corrected)))
##                flight_id              flight_date                  airline 
##                        0                        0                        0 
##           original_route           diverted_route   additional_distance_km 
##                        0                        0                        0 
## additional_fuel_cost_usd         fuel_cost_per_km            delay_minutes 
##                        0                        0                        0

8. Cross-Table Join Key Audit

# --- Airline name consistency: flight_cancellations <-> airline_losses ---
cancellation_airlines <- flight_cancellations %>% distinct(airline) %>% pull(airline)
losses_airlines       <- airline_losses %>% distinct(airline) %>% pull(airline)

cat("Airlines in cancellations NOT in losses:\n")
## Airlines in cancellations NOT in losses:
setdiff(cancellation_airlines, losses_airlines)
## [1] "Iran Air"       "Wizz Air"       "Kuwait Airways" "Azal"
cat("\nAirlines in losses NOT in cancellations:\n")
## 
## Airlines in losses NOT in cancellations:
setdiff(losses_airlines, cancellation_airlines)
##  [1] "Singapore Airlines"  "Cathay Pacific"      "Air China"          
##  [4] "Korean Air"          "China Eastern"       "China Southern"     
##  [7] "Japan Airlines"      "Ana"                 "Malaysia Airlines"  
## [10] "Asiana Airlines"     "Thai Airways"        "Iberia"             
## [13] "Swiss International" "Sas"                 "Austrian Airlines"  
## [16] "Finnair"             "Lot Polish Airlines" "Egyptair"
# --- IATA consistency: flight_cancellations <-> airport_disruptions ---
cancellation_iatas <- flight_cancellations %>% distinct(iata) %>% pull(iata)
disruption_iatas   <- airport_disruptions %>% distinct(iata) %>% pull(iata)

cat("\nIATAs in cancellations NOT in airport_disruptions:\n")
## 
## IATAs in cancellations NOT in airport_disruptions:
setdiff(cancellation_iatas, disruption_iatas)
## character(0)
cat("\nIATAs in airport_disruptions NOT in cancellations:\n")
## 
## IATAs in airport_disruptions NOT in cancellations:
setdiff(disruption_iatas, cancellation_iatas)
##  [1] "MHD" "IFN" "AUH" "SYZ" "SHJ" "ICN" "DME" "NRT" "EVN" "HKG" "SIN" "CAI"
## [13] "LCA" "TBS" "AUA"
# --- Country consistency: airspace_closures <-> airport_disruptions ---
closure_countries    <- airspace_closures %>% distinct(country) %>% pull(country)
disruption_countries <- airport_disruptions %>% distinct(country) %>% pull(country)

cat("\nCountries in closures NOT in airport_disruptions:\n")
## 
## Countries in closures NOT in airport_disruptions:
setdiff(closure_countries, disruption_countries)
## [1] "Afghanistan"  "Iraq"         "Turkmenistan" "Kazakhstan"   "Greece"      
## [6] "Yemen"

9. Build Joined Analysis Table

# Primary analysis table: cancellations enriched with airport + airline loss data
analysis_main <- flight_cancellations %>%
  
  # Join airport metadata (lat/lon, status, total disruptions)
  left_join(
    airport_disruptions %>%
      select(iata, latitude, longitude, status_category, total_disruptions),
    by = "iata"
  ) %>%
  
  # Join airline financial impact
  left_join(
    airline_losses %>%
      select(airline, country, estimated_daily_loss_usd, passengers_impacted,
             loss_per_passenger_usd),
    by = "airline"
  )

glimpse(analysis_main)
## Rows: 50
## Columns: 18
## $ date                     <date> 2026-02-28, 2026-02-28, 2026-02-28, 2026-02-…
## $ iata                     <chr> "DXB", "IKA", "TLV", "TLV", "BAH", "CDG", "DE…
## $ airport_name             <chr> "Dubai International", "Tehran Imam Khomeini"…
## $ country.x                <chr> "Uae", "Iran", "Israel", "Israel", "Bahrain",…
## $ airline                  <chr> "Emirates", "Iran Air", "El Al", "Wizz Air", …
## $ flight_number            <chr> "EK0912", "IR0702", "LY0082", "W60001", "GF05…
## $ origin                   <chr> "DXB", "IKA", "TLV", "TLV", "BAH", "CDG", "DE…
## $ destination              <chr> "IKA", "DXB", "JFK", "LTN", "IKA", "DXB", "DX…
## $ cancellation_reason      <chr> "Airspace Closed - Iranian Fir Shutdown", "Ai…
## $ aircraft_type            <chr> "Boeing 777-300er", "Airbus A320-200", "Boein…
## $ latitude                 <dbl> 25.2532, 35.4161, 32.0114, 32.0114, 26.2708, …
## $ longitude                <dbl> 55.3657, 51.1522, 34.8866, 34.8866, 50.6336, …
## $ status_category          <chr> "Operational", "Closed", "Restricted", "Restr…
## $ total_disruptions        <dbl> 223, 210, 169, 169, 92, 48, 73, 158, 158, 223…
## $ country.y                <chr> "Uae", NA, "Israel", NA, "Bahrain", "France",…
## $ estimated_daily_loss_usd <dbl> 4200000, NA, 2800000, NA, 680000, 1420000, 15…
## $ passengers_impacted      <dbl> 9180, NA, 3980, NA, 2240, 3400, 3760, 6540, 6…
## $ loss_per_passenger_usd   <dbl> 457.52, NA, 703.52, NA, 303.57, 417.65, 414.8…
# Check join quality — how many rows lost?
cat("Rows before join:", nrow(flight_cancellations), "\n")
## Rows before join: 50
cat("Rows after join: ", nrow(analysis_main), "\n")
## Rows after join:  50
cat("Unmatched airline rows (NA loss data):",
    sum(is.na(analysis_main$estimated_daily_loss_usd)), "\n")
## Unmatched airline rows (NA loss data): 4

10. Export Cleaned Tables

write_csv(flight_cancellations,  "cleaned_flight_cancellations.csv")
write_csv(conflict_events,        "cleaned_conflict_events.csv")
write_csv(airline_losses,         "cleaned_airline_losses.csv")
write_csv(airport_disruptions,    "cleaned_airport_disruptions.csv")
write_csv(airspace_closures,      "cleaned_airspace_closures.csv")
write_csv(flight_reroutes,        "cleaned_flight_reroutes.csv")
write_csv(analysis_main,          "cleaned_analysis_main.csv")

cat("All cleaned files exported.\n")
## All cleaned files exported.

11. Session Info

sessionInfo()
## R version 4.3.1 (2023-06-16 ucrt)
## Platform: x86_64-w64-mingw32/x64 (64-bit)
## Running under: Windows 10 x64 (build 19045)
## 
## Matrix products: default
## 
## 
## locale:
## [1] LC_COLLATE=English_United States.utf8 
## [2] LC_CTYPE=English_United States.utf8   
## [3] LC_MONETARY=English_United States.utf8
## [4] LC_NUMERIC=C                          
## [5] LC_TIME=English_United States.utf8    
## 
## time zone: America/Los_Angeles
## tzcode source: internal
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] lubridate_1.9.4 forcats_1.0.1   stringr_1.5.2   dplyr_1.1.4    
##  [5] purrr_1.0.4     readr_2.1.5     tidyr_1.3.1     tibble_3.2.1   
##  [9] ggplot2_4.0.0   tidyverse_2.0.0
## 
## loaded via a namespace (and not attached):
##  [1] bit_4.6.0          gtable_0.3.6       crayon_1.5.3       compiler_4.3.1    
##  [5] tidyselect_1.2.1   parallel_4.3.1     scales_1.4.0       yaml_2.3.10       
##  [9] fastmap_1.2.0      R6_2.6.1           generics_0.1.4     knitr_1.50        
## [13] pillar_1.11.1      RColorBrewer_1.1-3 tzdb_0.5.0         rlang_1.1.5       
## [17] utf8_1.2.4         stringi_1.8.7      xfun_0.52          S7_0.2.0          
## [21] bit64_4.6.0-1      timechange_0.3.0   cli_3.6.4          withr_3.0.2       
## [25] magrittr_2.0.3     digest_0.6.37      grid_4.3.1         vroom_1.6.5       
## [29] rstudioapi_0.17.1  hms_1.1.3          lifecycle_1.0.4    vctrs_0.6.5       
## [33] evaluate_1.0.5     glue_1.8.0         farver_2.1.2       rmarkdown_2.30    
## [37] tools_4.3.1        pkgconfig_2.0.3    htmltools_0.5.8.1