# Packages
library(readxl)
library(ggplot2)
library(dplyr)
library(tools)
library(stringr)
library(tidyr)
library(lubridate)
full_path <- paste0(path, "20200101-20240413_combined.xlsx")
data <- read_excel(full_path, sheet = "Sheet1")
full_path2 <- paste0(path2, "1980_combined.csv")
data2 <- read.csv(full_path2)
colnames(data)
## [1] "NtsbNo" "EventType"
## [3] "Mkey" "EventDate"
## [5] "City" "State"
## [7] "Country" "ReportNo"
## [9] "N#" "SerialNumber"
## [11] "HasSafetyRec" "Mode"
## [13] "ReportType" "OriginalPublishedDate"
## [15] "DocketOriginalPublishedDate" "HighestInjuryLevel"
## [17] "FatalInjuryCount" "SeriousInjuryCount"
## [19] "MinorInjuryCount" "ProbableCause"
## [21] "Findings" "EventID"
## [23] "Latitude" "Longitude"
## [25] "Make" "Model"
## [27] "AirCraftCategory" "AirportID"
## [29] "AirportName" "AmateurBuilt"
## [31] "NumberOfEngines" "EngineType"
## [33] "Scheduled" "PurposeOfFlight"
## [35] "FAR" "AirCraftDamage"
## [37] "WeatherCondition" "Operator"
## [39] "BroadPhaseofFlight" "ReportStatus"
## [41] "RepGenFlag" "MostRecentReportType"
## [43] "DocketUrl" "ReportUrl"
This data set has data from the U.S. NTSB and I will use two data sets: the first is only for incidents in the US between 01/01/2020 - 04/13/2024. The second data set will include all international aircraft incidents involving Boeing and Airbus aircraft from 01/01/1980 until 04/13/2024.
# Standardize the 'Make' column to title case:
data <- data %>%
mutate(Make = tolower(Make)) %>% # Convert all to lower case first
mutate(Make = sapply(Make, toTitleCase)) # Then convert to title case
# Standardizing the 'Operator' column as well:
# Remove 'Inc', 'Co', and other corporate designators, and trim whitespace in the 'Operator" column
data <- data %>%
mutate(Operator = tolower(as.character(Operator))) %>%
mutate(Operator = gsub("\\s+inc\\.?$|\\s+co\\.?$|\\s+corp\\.?$|\\s+ltd\\.?$|\\s+llc\\.?$|\\s+corporation\\.?$", "", Operator)) %>%
mutate(Operator = gsub("[.,]", "", Operator)) %>%
mutate(Operator = str_trim(Operator))
# Convert all names to lower case for initial standardization and remove 'Inc', 'Co', etc.
data <- data %>%
mutate(Operator = tolower(as.character(Operator))) %>%
mutate(Operator = gsub("\\s+inc\\.?$", "", Operator)) %>%
mutate(Operator = gsub("\\s+co\\.?$", "", Operator)) %>%
mutate(Operator = gsub("\\s+ltd\\.?$", "", Operator)) %>%
mutate(Operator = gsub("\\s+llc\\.?$", "", Operator)) %>%
mutate(Operator = gsub("\\s+corporation\\.?$", "", Operator)) %>%
mutate(Operator = gsub("\\s+corp\\.?$", "", Operator)) %>%
mutate(Operator = str_trim(Operator)) # Trim whitespace
# Create a mapping of known variations to the standardized names
name_mapping <- c(
"american airlines inc" = "american airlines",
"delta air lines inc" = "delta air lines",
"united airlines inc" = "united airlines")
# Apply the mapping to standardize the names
data <- data %>%
mutate(Operator = ifelse(Operator %in% names(name_mapping), name_mapping[Operator], Operator))
# Convert back to Title Case for presentation
data <- data %>%
mutate(Operator = tools::toTitleCase(Operator))
## Adding Time variable for each event (accidentally lost when formatting the EventDate column as date):
data_temp <- read.csv(full_path2)
# Creating the 'EventTime' column by extracting the time part and removing 'Z'
data2 <- data_temp %>%
mutate(EventTime = sub(".*T(.*)Z$", "\\1", EventDate))
#--- --- --- ---
## Doing the same cleaning for data2
# Standardize the 'Make' column to title case:
data2 <- data2 %>%
mutate(Make = tolower(Make)) %>% # Convert all to lower case first
mutate(Make = sapply(Make, toTitleCase)) # Then convert to title case
# Standardizing the 'Operator' column as well:
# Remove 'Inc', 'Co', and other corporate designators, and trim whitespace in the 'Operator" column
data2 <- data2 %>%
mutate(Operator = tolower(as.character(Operator))) %>%
mutate(Operator = gsub("\\s+inc\\.?$|\\s+co\\.?$|\\s+corp\\.?$|\\s+ltd\\.?$|\\s+llc\\.?$|\\s+corporation\\.?$", "", Operator)) %>%
mutate(Operator = gsub("[.,]", "", Operator)) %>%
mutate(Operator = str_trim(Operator))
# Convert all names to lower case for initial standardization and remove 'Inc', 'Co', etc.
data2 <- data2 %>%
mutate(Operator = tolower(as.character(Operator))) %>%
mutate(Operator = gsub("\\s+inc\\.?$", "", Operator)) %>%
mutate(Operator = gsub("\\s+co\\.?$", "", Operator)) %>%
mutate(Operator = gsub("\\s+ltd\\.?$", "", Operator)) %>%
mutate(Operator = gsub("\\s+llc\\.?$", "", Operator)) %>%
mutate(Operator = gsub("\\s+corporation\\.?$", "", Operator)) %>%
mutate(Operator = gsub("\\s+corp\\.?$", "", Operator)) %>%
mutate(Operator = str_trim(Operator)) # Trim whitespace
# Create a mapping of known variations to the standardized names
name_mapping <- c(
"american airlines inc" = "american airlines",
"delta air lines inc" = "delta air lines",
"united airlines inc" = "united airlines")
# Apply the mapping to standardize the names
data2 <- data2 %>%
mutate(Operator = ifelse(Operator %in% names(name_mapping), name_mapping[Operator], Operator))
# Convert back to Title Case for presentation
data2 <- data2 %>%
mutate(Operator = tools::toTitleCase(Operator))
# Create a new variable 'Year' from 'EventDate'
# data2 <- data2 %>%
# mutate(Year = year(EventDate))
# Create a new variable 'Year' from 'EventDate' using base R
data2 <- data2 %>%
mutate(EventDate = as.Date(EventDate))
data2 <- data2 %>%
mutate(Year = as.numeric(format(EventDate, "%Y")))
## Converting the EventTime variable to a continuous variable
data2 <- data2 %>%
mutate(
TimeFraction = as.numeric(substr(EventTime, 1, 2)) / 24 +
as.numeric(substr(EventTime, 4, 5)) / 1440 +
as.numeric(substr(EventTime, 7, 8)) / 86400)
# Convert to Date format
data$EventDate <- as.Date(data$EventDate)
# a scatter plot with ggplot2
ggplot(data) +
geom_point(aes(x = EventDate, y = FatalInjuryCount), color = 'red', alpha = 0.5) +
geom_point(aes(x = EventDate, y = SeriousInjuryCount), color = 'blue', alpha = 0.5) +
labs(title = "Scatter Plot of Fatalities and Serious Injuries Over Time",
x = "Event Date",
y = "Count") +
theme_minimal() +
scale_x_date(date_breaks = "1 year", date_labels = "%Y")
# Top 10 Airplane Makes in Accidents
top_makes <- data %>%
count(Make) %>%
arrange(desc(n)) %>%
top_n(10)
## Selecting by n
print(top_makes)
## # A tibble: 11 × 2
## Make n
## <chr> <int>
## 1 Cessna 1203
## 2 Piper 786
## 3 Beech 274
## 4 Mooney 71
## 5 Vans 71
## 6 Boeing 68
## 7 Cirrus Design Corp 59
## 8 Air Tractor Inc 55
## 9 Maule 42
## 10 Aeronca 41
## 11 Bellanca 41
ggplot(top_makes, aes(x = reorder(Make, n), y = n, fill = Make)) +
geom_bar(stat = "identity") +
labs(title = "Top 10 Airplane Makes in Accidents", x = "Airplane Make", y = "Number of Accidents") +
theme_minimal() +
theme(legend.position = "none") +
coord_flip()
# Top 10 States Where the Accidents Happened
top_states <- data %>%
count(State) %>%
arrange(desc(n)) %>%
top_n(10)
## Selecting by n
print(top_states)
## # A tibble: 10 × 2
## State n
## <chr> <int>
## 1 Texas 385
## 2 California 355
## 3 Florida 316
## 4 Alaska 313
## 5 Colorado 150
## 6 Arizona 149
## 7 Georgia 136
## 8 Washington 129
## 9 North Carolina 119
## 10 Idaho 108
ggplot(top_states, aes(x = reorder(State, n), y = n, fill = State)) +
geom_bar(stat = "identity") +
labs(title = "Top 10 States with Most Accidents", x = "State", y = "Number of Accidents") +
theme_minimal() +
theme(legend.position = "none") +
coord_flip()
# Number of Accidents by Make (Boeing vs. Airbus)
boeing_accidents <- sum(grepl("Boeing", data$Make, ignore.case = TRUE))
airbus_accidents <- sum(grepl("Airbus", data$Make, ignore.case = TRUE))
print(c(Boeing = boeing_accidents, Airbus = airbus_accidents))
## Boeing Airbus
## 74 26
accidents_by_make <- data.frame(
Make = c("Boeing", "Airbus"),
Accidents = c(boeing_accidents, airbus_accidents)
)
ggplot(accidents_by_make, aes(x = Make, y = Accidents, fill = Make)) +
geom_bar(stat = "identity") +
labs(title = "Number of Accidents: Boeing vs. Airbus", x = "Airplane Make", y = "Number of Accidents") +
theme_minimal() +
theme(legend.position = "none")
# Top 10 Operators in Accidents
top_operators <- data %>%
count(Operator) %>%
arrange(desc(n)) %>%
top_n(10)
## Selecting by n
print(top_operators)
## # A tibble: 10 × 2
## Operator n
## <chr> <int>
## 1 <NA> 2661
## 2 Pilot 31
## 3 United Airlines 20
## 4 American Airlines 12
## 5 Southwest Airlines 9
## 6 Civil Air Patrol 8
## 7 Delta Air Lines 8
## 8 Private Individual 7
## 9 Hyannis Air Service 5
## 10 Embry-Riddle Aeronautical University 4
ggplot(top_operators, aes(x = reorder(Operator, n), y = n, fill = Operator)) +
geom_bar(stat = "identity") +
labs(title = "Top 10 Operators in Accidents", x = "Operator", y = "Number of Accidents") +
theme_minimal() +
theme(legend.position = "none") +
coord_flip()
# Count the occurrences of each operator, excluding NAs, and find the top 10
top_operators2 <- data %>%
filter(!is.na(Operator)) %>% # Exclude rows where Operator is NA
count(Operator) %>%
arrange(desc(n)) %>%
top_n(10, n)
print(top_operators2)
## # A tibble: 33 × 2
## Operator n
## <chr> <int>
## 1 Pilot 31
## 2 United Airlines 20
## 3 American Airlines 12
## 4 Southwest Airlines 9
## 5 Civil Air Patrol 8
## 6 Delta Air Lines 8
## 7 Private Individual 7
## 8 Hyannis Air Service 5
## 9 Embry-Riddle Aeronautical University 4
## 10 Airborne Tactical Advantage Company 3
## # ℹ 23 more rows
ggplot(top_operators2, aes(x = reorder(Operator, n), y = n, fill = Operator)) +
geom_bar(stat = "identity") +
labs(title = "Top Flight Operators in Accidents", x = "Operator", y = "Number of Accidents") +
theme_minimal() +
theme(legend.position = "none") +
coord_flip()
## Doing the same explorations in data2
# Convert to Date format
data2$EventDate <- as.Date(data2$EventDate)
# a scatter plot with ggplot2
ggplot(data2) +
geom_point(aes(x = EventDate, y = FatalInjuryCount, color = "Fatalities"), alpha = 0.5) +
geom_point(aes(x = EventDate, y = SeriousInjuryCount, color = "Serious Injuries"), alpha = 0.5) +
labs(title = "Scatter Plot of Fatalities and Serious Injuries Over Time",
x = "Event Date",
y = "Count",
color = "Injury Type") +
theme_minimal() +
scale_x_date(date_breaks = "1 year", date_labels = "%Y") +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
scale_color_manual(values = c("Fatalities" = "red", "Serious Injuries" = "blue"))
# Scatter plot for the time of the day an incident happened
ggplot(data2) +
geom_point(aes(x = TimeFraction, y = FatalInjuryCount, color = "Fatalities"), alpha = 0.5) +
geom_point(aes(x = TimeFraction, y = SeriousInjuryCount, color = "Serious Injuries"), alpha = 0.5) +
labs(title = "Scatter Plot of Fatalities and Serious Injuries Over The Time of a Day",
x = "Time of Day (fraction)",
y = "Count",
color = "Injury Type") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
scale_color_manual(values = c("Fatalities" = "red", "Serious Injuries" = "blue"))
# Top 10 Airplane Makes in Accidents
top_makes2 <- data2 %>%
filter(!is.na(manufacturer)) %>%
count(manufacturer) %>%
arrange(desc(n)) %>%
top_n(2)
## Selecting by n
print(top_makes2)
## manufacturer n
## 1 Boeing 3096
## 2 Airbus 549
ggplot(top_makes2, aes(x = reorder(manufacturer, n), y = n, fill = manufacturer)) +
geom_bar(stat = "identity") +
labs(title = "Airplane Makes Involved in Accidents", x = "Airplane Make", y = "Number of Accidents") +
theme_minimal() +
theme(legend.position = "none") +
coord_flip()
# Top 10 Countries Where the Accidents Happened
top_Countries <- data2 %>%
count(Country) %>%
arrange(desc(n)) %>%
top_n(10)
## Selecting by n
print(top_Countries)
## Country n
## 1 United States 1945
## 2 United Kingdom 121
## 3 Australia 89
## 4 Japan 74
## 5 Mexico 72
## 6 Indonesia 68
## 7 Germany 62
## 8 India 58
## 9 France 52
## 10 Spain 50
ggplot(top_Countries, aes(x = reorder(Country, n), y = n, fill = Country)) +
geom_bar(stat = "identity") +
labs(title = "Top 10 Country with Most Accidents", x = "Country", y = "Number of Accidents") +
theme_minimal() +
theme(legend.position = "none") +
coord_flip()
# Top 10 Operators in Accidents
top_operators2 <- data2 %>%
count(Operator) %>%
arrange(desc(n)) %>%
top_n(10)
## Selecting by n
print(top_operators2)
## Operator n
## 1 1166
## 2 United Airlines 136
## 3 American Airlines 128
## 4 Delta Air Lines 109
## 5 Southwest Airlines 78
## 6 Continental Airlines 49
## 7 Northwest Airlines 36
## 8 Ryanair 31
## 9 United Air Lines 30
## 10 Delta Airlines 24
ggplot(top_operators2, aes(x = reorder(Operator, n), y = n, fill = Operator)) +
geom_bar(stat = "identity") +
labs(title = "Top 10 Operators in Accidents", x = "Operator", y = "Number of Accidents") +
theme_minimal() +
theme(legend.position = "none") +
coord_flip()
# Count the occurrences of each operator, excluding NAs, and find the top 10
top_operators3 <- data2 %>%
filter(!is.na(Operator)) %>% # Exclude rows where Operator is NA
count(Operator) %>%
arrange(desc(n)) %>%
top_n(10, n)
print(top_operators3)
## Operator n
## 1 1166
## 2 United Airlines 136
## 3 American Airlines 128
## 4 Delta Air Lines 109
## 5 Southwest Airlines 78
## 6 Continental Airlines 49
## 7 Northwest Airlines 36
## 8 Ryanair 31
## 9 United Air Lines 30
## 10 Delta Airlines 24
ggplot(top_operators3, aes(x = reorder(Operator, n), y = n, fill = Operator)) +
geom_bar(stat = "identity") +
labs(title = "Top Flight Operators in Accidents", x = "Operator", y = "Number of Accidents") +
theme_minimal() +
theme(legend.position = "none") +
coord_flip()
fatal_stats <- data %>%
summarise(
Mean = mean(FatalInjuryCount, na.rm = TRUE),
Median = median(FatalInjuryCount, na.rm = TRUE),
SD = sd(FatalInjuryCount, na.rm = TRUE),
IQR = IQR(FatalInjuryCount, na.rm = TRUE),
Min = min(FatalInjuryCount, na.rm = TRUE),
Max = max(FatalInjuryCount, na.rm = TRUE),
Total = sum(FatalInjuryCount, na.rm = TRUE)
)
print(fatal_stats)
## # A tibble: 1 × 7
## Mean Median SD IQR Min Max Total
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0.277 0 0.753 0 0 10 1182
serious_stats <- data %>%
summarise(
Mean = mean(SeriousInjuryCount, na.rm = TRUE),
Median = median(SeriousInjuryCount, na.rm = TRUE),
SD = sd(SeriousInjuryCount, na.rm = TRUE),
IQR = IQR(SeriousInjuryCount, na.rm = TRUE),
Min = min(SeriousInjuryCount, na.rm = TRUE),
Max = max(SeriousInjuryCount, na.rm = TRUE),
Total = sum(SeriousInjuryCount, na.rm = TRUE)
)
print(serious_stats)
## # A tibble: 1 × 7
## Mean Median SD IQR Min Max Total
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0.182 0 0.532 0 0 7 775
# Histogram for FatalInjuryCount
ggplot(data, aes(x = FatalInjuryCount)) +
geom_histogram(binwidth = 1, fill = "red", color = "black") +
labs(title = "Histogram of Fatal Injury Count", x = "Fatal Injury Count", y = "Frequency") +
theme_minimal()
# Histogram for SeriousInjuryCount
ggplot(data, aes(x = SeriousInjuryCount)) +
geom_histogram(binwidth = 1, fill = "blue", color = "black") +
labs(title = "Histogram of Serious Injury Count", x = "Serious Injury Count", y = "Frequency") +
theme_minimal()
# Combined Histogram for FatalInjuryCount and SeriousInjuryCount
ggplot() +
geom_histogram(data = data, aes(x = FatalInjuryCount), binwidth = 1, fill = "red", alpha = 0.5) +
geom_histogram(data = data, aes(x = SeriousInjuryCount), binwidth = 1, fill = "blue", alpha = 0.5) +
labs(title = "Combined Histogram of Fatal and Serious Injury Counts",
x = "Injury Count",
y = "Frequency") +
theme_minimal()
# Scatter plots for the fatalities and serious injuries
# Use pivot_longer to reshape the data for plotting
long_data <- data %>%
select(EventDate, FatalInjuryCount, SeriousInjuryCount) %>%
pivot_longer(cols = c("FatalInjuryCount", "SeriousInjuryCount"), names_to = "InjuryType", values_to = "InjuryCount")
# Create the scatter plot
ggplot(long_data, aes(x = EventDate, y = InjuryCount, color = InjuryType)) +
geom_point(alpha = 0.5) +
scale_color_manual(values = c("FatalInjuryCount" = "red", "SeriousInjuryCount" = "blue")) +
labs(title = "Scatter Plot of Fatal and Serious Injuries Over Time",
x = "Event Date",
y = "Injury Count") +
theme_minimal()
# descriptive stats for data2
fatal_stats2 <- data2 %>%
summarise(
Mean = mean(FatalInjuryCount, na.rm = TRUE),
Median = median(FatalInjuryCount, na.rm = TRUE),
SD = sd(FatalInjuryCount, na.rm = TRUE),
IQR = IQR(FatalInjuryCount, na.rm = TRUE),
Min = min(FatalInjuryCount, na.rm = TRUE),
Max = max(FatalInjuryCount, na.rm = TRUE),
Total = sum(FatalInjuryCount, na.rm = TRUE) )
print(fatal_stats2)
## Mean Median SD IQR Min Max Total
## 1 3.292148 0 22.11358 0 0 349 11404
serious_stats2 <- data2 %>%
summarise(
Mean = mean(SeriousInjuryCount, na.rm = TRUE),
Median = median(SeriousInjuryCount, na.rm = TRUE),
SD = sd(SeriousInjuryCount, na.rm = TRUE),
IQR = IQR(SeriousInjuryCount, na.rm = TRUE),
Min = min(SeriousInjuryCount, na.rm = TRUE),
Max = max(SeriousInjuryCount, na.rm = TRUE),
Total = sum(SeriousInjuryCount, na.rm = TRUE) )
print(serious_stats2)
## Mean Median SD IQR Min Max Total
## 1 0.4659353 0 3.010904 0 0 81 1614
# Histogram for FatalInjuryCount
ggplot(data2, aes(x = FatalInjuryCount)) +
geom_histogram(binwidth = 1, fill = "red", color = "black") +
labs(title = "Histogram of Fatal Injury Count", x = "Fatal Injury Count", y = "Frequency") +
theme_minimal()
# Filter the data to include only rows where FatalInjuryCount is 1 or more
data2_filtered <- data2 %>%
filter(FatalInjuryCount >= 4)
data2_filtered2 <- data2 %>%
filter(SeriousInjuryCount >= 4)
data2_filtered3 <- data2_filtered %>%
filter(SeriousInjuryCount >= 4)
# Now, create the histogram with the filtered data
ggplot(data2_filtered, aes(x = FatalInjuryCount)) +
geom_histogram(binwidth = 1, fill = "red", color = "black") +
labs(title = "Histogram of Fatal Injury Count (4+)", x = "Fatal Injury Count", y = "Frequency") +
theme_minimal()
# Histogram for SeriousInjuryCount
ggplot(data2, aes(x = SeriousInjuryCount)) +
geom_histogram(binwidth = 1, fill = "blue", color = "black") +
labs(title = "Histogram of Serious Injury Count", x = "Serious Injury Count", y = "Frequency") +
theme_minimal()
ggplot(data2_filtered2, aes(x = SeriousInjuryCount)) +
geom_histogram(binwidth = 1, fill = "blue", color = "black") +
labs(title = "Histogram of Serious Injury Count (4+)", x = "Serious Injury Count", y = "Frequency") +
theme_minimal()
# Combined Histogram for FatalInjuryCount and SeriousInjuryCount
ggplot() +
geom_histogram(data = data2, aes(x = FatalInjuryCount), binwidth = 1, fill = "red", alpha = 0.5) +
geom_histogram(data = data2, aes(x = SeriousInjuryCount), binwidth = 1, fill = "blue", alpha = 0.5) +
labs(title = "Combined Histogram of Fatal and Serious Injury Counts",
x = "Injury Count",
y = "Frequency") +
theme_minimal()
ggplot() +
geom_histogram(data = data2_filtered, aes(x = FatalInjuryCount), binwidth = 1, fill = "red", alpha = 0.5) +
geom_histogram(data = data2_filtered2, aes(x = SeriousInjuryCount), binwidth = 1, fill = "blue", alpha = 0.5) +
labs(title = "Combined Histogram of Fatal and Serious Injury Counts (4+)",
x = "Injury Count",
y = "Frequency") +
theme_minimal()
# Scatter plots for the fatalities and serious injuries
# Use pivot_longer to reshape the data for plotting
long_data2 <- data2 %>%
select(EventDate, FatalInjuryCount, SeriousInjuryCount) %>%
pivot_longer(cols = c("FatalInjuryCount", "SeriousInjuryCount"), names_to = "InjuryType", values_to = "InjuryCount")
ggplot(long_data2, aes(x = EventDate, y = InjuryCount, color = InjuryType)) +
geom_point(alpha = 0.5) +
scale_color_manual(values = c("FatalInjuryCount" = "red", "SeriousInjuryCount" = "blue")) +
labs(title = "Scatter Plot of Fatal and Serious Injuries Over Time",
x = "Event Date",
y = "Injury Count") +
theme_minimal()
# Use pivot_longer to reshape the data for plotting
long_data3 <- data2_filtered3 %>%
select(EventDate, FatalInjuryCount, SeriousInjuryCount) %>%
pivot_longer(cols = c("FatalInjuryCount", "SeriousInjuryCount"), names_to = "InjuryType", values_to = "InjuryCount")
ggplot(long_data3, aes(x = EventDate, y = InjuryCount, color = InjuryType)) +
geom_point(alpha = 0.5) +
scale_color_manual(values = c("FatalInjuryCount" = "red", "SeriousInjuryCount" = "blue")) +
labs(title = "Scatter Plot of Fatal and Serious Injuries Over Time (4+)",
x = "Event Date",
y = "Injury Count") +
theme_minimal()
# first, combine fatal and serious injury counts
data2 <- data2 %>%
mutate(TotalInjuries = FatalInjuryCount + SeriousInjuryCount)
# 'Make' = 'manufacturer', 'Operator', and 'Country' are categorical variable - needing processing
data2$manufacturer <- as.factor(data2$manufacturer)
data2$Operator <- as.factor(data2$Operator)
data2$Country <- as.factor(data2$Country)
# convert other categorical variables to factors if they are not already
data2$WeatherCondition <- as.factor(data2$WeatherCondition)
data2$BroadPhaseofFlight <- as.factor(data2$BroadPhaseofFlight)
# Checking the number of levels for each factor
sapply(data2[c("WeatherCondition", "BroadPhaseofFlight", "NumberOfEngines", "Make", "Operator", "Country")], nlevels)
## WeatherCondition BroadPhaseofFlight NumberOfEngines Make
## 7 13 0 0
## Operator Country
## 1110 147
# The linear regression model
model3 <- lm(TotalInjuries ~ Year + TimeFraction + manufacturer, data = data2)
summary(model3)
##
## Call:
## lm(formula = TotalInjuries ~ Year + TimeFraction + manufacturer,
## data = data2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -9.31 -4.76 -3.34 -1.86 343.23
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 137.23619 64.79203 2.118 0.0342 *
## Year -0.06641 0.03231 -2.055 0.0399 *
## TimeFraction -5.92703 1.44424 -4.104 4.16e-05 ***
## manufacturerAirbus 4.85627 3.67183 1.323 0.1861
## manufacturerBoeing 2.31603 3.53023 0.656 0.5118
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 22.9 on 3459 degrees of freedom
## (224 observations deleted due to missingness)
## Multiple R-squared: 0.006737, Adjusted R-squared: 0.005588
## F-statistic: 5.865 on 4 and 3459 DF, p-value: 0.0001059
# Plot diagnostic plots
par(mfrow=c(2,2))
plot(model3)
hist(model3$residuals)
INTERPRETATION & RESIDUAL ANALYSIS.
Coefficients
Continued interpretation of the model output.
DIAGNOSTIC PLOTS.
THE BOTTOMLINE:
All in all, I don’t think a linear regression model is appropriate for this data set and question for the above reasons.