##According to the FAA, 2,781,971 passengers fly every day in the US, as in June 2019. Passengers reckon that flying is very safe, considering strict inspections are conducted and security measures are taken to avoid and/or mitigate any mishappenings. However, there remain a few chances of unfortunate incidents.
library(ISLR)
library(tree)
library(ggplot2)
library(tidyverse)
## Registered S3 method overwritten by 'cli':
## method from
## print.tree tree
## ── Attaching packages ────────────────────────────────────────────────────────────────────── tidyverse 1.2.1 ──
## ✔ tibble 2.1.3 ✔ purrr 0.3.2
## ✔ tidyr 1.0.0 ✔ dplyr 0.8.3
## ✔ readr 1.3.1 ✔ stringr 1.4.0
## ✔ tibble 2.1.3 ✔ forcats 0.4.0
## ── Conflicts ───────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
library(MASS)
##
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
##
## select
require(randomForest)
## Loading required package: randomForest
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:dplyr':
##
## combine
## The following object is masked from 'package:ggplot2':
##
## margin
library(psych)
##
## Attaching package: 'psych'
## The following object is masked from 'package:randomForest':
##
## outlier
## The following objects are masked from 'package:ggplot2':
##
## %+%, alpha
library(corrplot)
## corrplot 0.84 loaded
library(caret)
## Loading required package: lattice
##
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
##
## lift
library(nnet)
##The dataset consists of certain parameters recorded during the incident` such as cabin temperature, turbulence experienced, number of safety complaints prior to the accident, number of days since the last inspection was conducted before the incident, an estimation of the pilot’s control given the various factors at play
df<-read.csv("train.csv")
head(df)
df2<-read.csv("test.csv")
head(df2)
dim(df)
## [1] 10000 12
colSums(is.na(df))
## Severity Safety_Score Days_Since_Inspection
## 0 0 0
## Total_Safety_Complaints Control_Metric Turbulence_In_gforces
## 0 0 0
## Cabin_Temperature Accident_Type_Code Max_Elevation
## 0 0 0
## Violations Adverse_Weather_Metric Accident_ID
## 0 0 0
df[!duplicated(df[1:12]),]
ggplot(df,aes(x=df$Safety_Score))+geom_histogram(aes(y=..density..),color="black",fill="white",binwidth = 2)+stat_function(fun=dnorm,args = list(mean=mean(df$Safety_Score), sd=sd(df$Safety_Score)))
ggplot(df,aes(x=df$Days_Since_Inspection))+geom_histogram(aes(y =..density..), color="black",fill="white",binwidth = 2)+stat_function(fun = dnorm, args = list(mean = mean(df$Days_Since_Inspection), sd = sd(df$Days_Since_Inspection)))
ggplot(df,aes(x=df$Total_Safety_Complaints))+geom_histogram(color="black",fill="white",binwidth = 2)
ggplot(df,aes(x=df$Control_Metric))+geom_histogram(color="black", fill="white",binwidth=1)
ggplot(df,aes(x=df$Turbulence_In_gforces))+geom_histogram(color="black", fill="white")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
ggplot(df,aes(x=df$Cabin_Temperature))+geom_histogram(color="black", fill="white")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
ggplot(df,aes(x=df$Max_Elevation))+geom_histogram(color="black", fill="white")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
ggplot(df,aes(x=df$Violations))+geom_histogram(color="black", fill="white",binwidth=0.5)
ggplot(df,aes(x=df$Adverse_Weather_Metric))+geom_histogram(color="black", fill="white")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
### PLotting and Mapping the correlation of the predictors
pairs.panels(df[c("Safety_Score","Days_Since_Inspection","Total_Safety_Complaints","Control_Metric",
"Turbulence_In_gforces","Cabin_Temperature","Max_Elevation","Violations","Adverse_Weather_Metric")],hist.col="green",gap=0)
###Mapping the correlation
mydf<-df[,c("Safety_Score","Days_Since_Inspection","Total_Safety_Complaints","Control_Metric",
"Turbulence_In_gforces","Cabin_Temperature","Max_Elevation","Violations","Adverse_Weather_Metric")]
res<-cor(mydf)
round(res,2)
## Safety_Score Days_Since_Inspection
## Safety_Score 1.00 -0.69
## Days_Since_Inspection -0.69 1.00
## Total_Safety_Complaints 0.06 -0.03
## Control_Metric 0.00 -0.01
## Turbulence_In_gforces 0.02 0.00
## Cabin_Temperature 0.03 -0.04
## Max_Elevation 0.00 0.00
## Violations 0.04 -0.02
## Adverse_Weather_Metric -0.11 0.04
## Total_Safety_Complaints Control_Metric
## Safety_Score 0.06 0.00
## Days_Since_Inspection -0.03 -0.01
## Total_Safety_Complaints 1.00 -0.02
## Control_Metric -0.02 1.00
## Turbulence_In_gforces 0.07 -0.64
## Cabin_Temperature 0.01 -0.01
## Max_Elevation 0.04 -0.03
## Violations -0.02 0.00
## Adverse_Weather_Metric 0.00 -0.03
## Turbulence_In_gforces Cabin_Temperature Max_Elevation
## Safety_Score 0.02 0.03 0.00
## Days_Since_Inspection 0.00 -0.04 0.00
## Total_Safety_Complaints 0.07 0.01 0.04
## Control_Metric -0.64 -0.01 -0.03
## Turbulence_In_gforces 1.00 0.01 0.05
## Cabin_Temperature 0.01 1.00 -0.01
## Max_Elevation 0.05 -0.01 1.00
## Violations 0.01 0.02 -0.03
## Adverse_Weather_Metric 0.04 -0.03 0.17
## Violations Adverse_Weather_Metric
## Safety_Score 0.04 -0.11
## Days_Since_Inspection -0.02 0.04
## Total_Safety_Complaints -0.02 0.00
## Control_Metric 0.00 -0.03
## Turbulence_In_gforces 0.01 0.04
## Cabin_Temperature 0.02 -0.03
## Max_Elevation -0.03 0.17
## Violations 1.00 -0.02
## Adverse_Weather_Metric -0.02 1.00
corrplot(res, type = "upper", order = "hclust",
tl.col = "black", tl.srt = 45)
df$Severity <- as.factor(df$Severity)
#df$Control_Metric <- as.factor(df$Control_Metric)
#df$Max_Elevation <- as.factor(df$Max_Elevation)
#df$Safety_Score <- as.factor(df$Safety_Score)
#df$Turbulence_In_gforces <- as.factor(df$Turbulence_In_gforces)
#df$Violations <- as.factor(df$Violations)
#df$Days_Since_Inspection <- as.factor(df$Days_Since_Inspection)
#df$Cabin_Temperature <- as.factor(df$Cabin_Temperature)
#df$Adverse_Weather_Metric <- as.factor(df$Adverse_Weather_Metric)
#df$Total_Safety_Complaints <- as.factor(df$Total_Safety_Complaints)
#df$Accident_Type_Code <- as.factor(df$Accident_Type_Code)
#df$Accident_ID <- as.factor(df$Accident_ID)
df<-df[,-12]
sample_size=floor(0.8*nrow(df))
set.seed(123)
df_train= sample(seq_len(nrow(df)),size = sample_size)
train=df[df_train,]
test=df[-df_train,]
model1<-nnet::multinom(Severity~.,data = train)
## # weights: 48 (33 variable)
## initial value 11090.354889
## iter 10 value 9653.471808
## iter 20 value 9363.992850
## iter 30 value 8976.835019
## iter 40 value 8731.176139
## iter 40 value 8731.176102
## iter 40 value 8731.176100
## final value 8731.176100
## converged
summary(model1)
## Call:
## nnet::multinom(formula = Severity ~ ., data = train)
##
## Coefficients:
## (Intercept) Safety_Score
## Minor_Damage_And_Injuries -11.668941 0.1826360
## Significant_Damage_And_Fatalities -9.137193 0.1521960
## Significant_Damage_And_Serious_Injuries -8.873214 0.1122385
## Days_Since_Inspection
## Minor_Damage_And_Injuries 0.6217623
## Significant_Damage_And_Fatalities 0.5115922
## Significant_Damage_And_Serious_Injuries 0.3664565
## Total_Safety_Complaints Control_Metric
## Minor_Damage_And_Injuries 3.415895e-03 -0.02342091
## Significant_Damage_And_Fatalities -8.583185e-03 -0.05866111
## Significant_Damage_And_Serious_Injuries 6.495886e-06 0.01198562
## Turbulence_In_gforces Cabin_Temperature
## Minor_Damage_And_Injuries 0.1146925 -0.02205745
## Significant_Damage_And_Fatalities -0.1372987 -0.02367616
## Significant_Damage_And_Serious_Injuries 1.4926753 -0.02647164
## Accident_Type_Code Max_Elevation
## Minor_Damage_And_Injuries -0.2460281 8.696850e-06
## Significant_Damage_And_Fatalities 0.4128667 -2.470323e-05
## Significant_Damage_And_Serious_Injuries 0.1166352 7.893311e-06
## Violations Adverse_Weather_Metric
## Minor_Damage_And_Injuries 0.03319810 -0.4674305
## Significant_Damage_And_Fatalities 0.20136329 1.7071741
## Significant_Damage_And_Serious_Injuries 0.03647066 -0.6688087
##
## Std. Errors:
## (Intercept) Safety_Score
## Minor_Damage_And_Injuries 2.108019e-05 0.002414889
## Significant_Damage_And_Fatalities 2.177762e-05 0.002544796
## Significant_Damage_And_Serious_Injuries 2.118830e-05 0.002170567
## Days_Since_Inspection
## Minor_Damage_And_Injuries 0.0007418008
## Significant_Damage_And_Fatalities 0.0007380472
## Significant_Damage_And_Serious_Injuries 0.0006968389
## Total_Safety_Complaints Control_Metric
## Minor_Damage_And_Injuries 0.002896688 0.001942259
## Significant_Damage_And_Fatalities 0.002913341 0.002045456
## Significant_Damage_And_Serious_Injuries 0.002939284 0.001933970
## Turbulence_In_gforces Cabin_Temperature
## Minor_Damage_And_Injuries 2.542427e-05 0.001828848
## Significant_Damage_And_Fatalities 2.761131e-05 0.001879044
## Significant_Damage_And_Serious_Injuries 2.674169e-05 0.001855816
## Accident_Type_Code Max_Elevation
## Minor_Damage_And_Injuries 6.101029e-05 3.795995e-06
## Significant_Damage_And_Fatalities 8.109606e-05 4.091934e-06
## Significant_Damage_And_Serious_Injuries 7.821873e-05 3.408058e-06
## Violations Adverse_Weather_Metric
## Minor_Damage_And_Injuries 3.941765e-05 7.408916e-06
## Significant_Damage_And_Fatalities 4.459623e-05 1.287239e-05
## Significant_Damage_And_Serious_Injuries 4.392495e-05 3.397635e-06
##
## Residual Deviance: 17462.35
## AIC: 17528.35
predicted.classes <- model1 %>% predict(test)
head(predicted.classes)
## [1] Minor_Damage_And_Injuries
## [2] Minor_Damage_And_Injuries
## [3] Highly_Fatal_And_Damaging
## [4] Significant_Damage_And_Fatalities
## [5] Significant_Damage_And_Serious_Injuries
## [6] Significant_Damage_And_Serious_Injuries
## 4 Levels: Highly_Fatal_And_Damaging ... Significant_Damage_And_Serious_Injuries
mean(predicted.classes == test$Severity)
## [1] 0.6205
head(test[,1])
## [1] Minor_Damage_And_Injuries
## [2] Significant_Damage_And_Fatalities
## [3] Highly_Fatal_And_Damaging
## [4] Highly_Fatal_And_Damaging
## [5] Significant_Damage_And_Serious_Injuries
## [6] Significant_Damage_And_Serious_Injuries
## 4 Levels: Highly_Fatal_And_Damaging ... Significant_Damage_And_Serious_Injuries
table(test[,1],knn_df,dnn=c("True","Predicted"))
## Predicted
## True Highly_Fatal_And_Damaging
## Highly_Fatal_And_Damaging 351
## Minor_Damage_And_Injuries 71
## Significant_Damage_And_Fatalities 61
## Significant_Damage_And_Serious_Injuries 106
## Predicted
## True Minor_Damage_And_Injuries
## Highly_Fatal_And_Damaging 78
## Minor_Damage_And_Injuries 251
## Significant_Damage_And_Fatalities 104
## Significant_Damage_And_Serious_Injuries 125
## Predicted
## True Significant_Damage_And_Fatalities
## Highly_Fatal_And_Damaging 47
## Minor_Damage_And_Injuries 64
## Significant_Damage_And_Fatalities 153
## Significant_Damage_And_Serious_Injuries 36
## Predicted
## True Significant_Damage_And_Serious_Injuries
## Highly_Fatal_And_Damaging 109
## Minor_Damage_And_Injuries 117
## Significant_Damage_And_Fatalities 44
## Significant_Damage_And_Serious_Injuries 283
mean(knn_df==test[,1])
## [1] 0.519
miserror <- sum(test[,1]!=knn_df)/nrow(test)
miserror
## [1] 0.481
#install.packages("rpart.plot")
library(rpart.plot)
rpart.plot(model3)
### Testing the model
test$pred <- predict(model3,test[,-1],type="class")
table(test$pred, test$Severity)
##
## Highly_Fatal_And_Damaging
## Highly_Fatal_And_Damaging 431
## Minor_Damage_And_Injuries 109
## Significant_Damage_And_Fatalities 25
## Significant_Damage_And_Serious_Injuries 20
##
## Minor_Damage_And_Injuries
## Highly_Fatal_And_Damaging 24
## Minor_Damage_And_Injuries 441
## Significant_Damage_And_Fatalities 25
## Significant_Damage_And_Serious_Injuries 13
##
## Significant_Damage_And_Fatalities
## Highly_Fatal_And_Damaging 20
## Minor_Damage_And_Injuries 123
## Significant_Damage_And_Fatalities 212
## Significant_Damage_And_Serious_Injuries 7
##
## Significant_Damage_And_Serious_Injuries
## Highly_Fatal_And_Damaging 21
## Minor_Damage_And_Injuries 110
## Significant_Damage_And_Fatalities 19
## Significant_Damage_And_Serious_Injuries 400
mean(test$pred == test$Severity)
## [1] 0.742
library(randomForest)
model4<-randomForest(Severity~.,data = train, xtree=400,mtry=6,nodesize=2)
test$pred_rf2 <- predict(model4,test[,-1])
table(test$pred_rf2, test[,1])
##
## Highly_Fatal_And_Damaging
## Highly_Fatal_And_Damaging 546
## Minor_Damage_And_Injuries 9
## Significant_Damage_And_Fatalities 16
## Significant_Damage_And_Serious_Injuries 14
##
## Minor_Damage_And_Injuries
## Highly_Fatal_And_Damaging 10
## Minor_Damage_And_Injuries 485
## Significant_Damage_And_Fatalities 3
## Significant_Damage_And_Serious_Injuries 5
##
## Significant_Damage_And_Fatalities
## Highly_Fatal_And_Damaging 5
## Minor_Damage_And_Injuries 7
## Significant_Damage_And_Fatalities 345
## Significant_Damage_And_Serious_Injuries 5
##
## Significant_Damage_And_Serious_Injuries
## Highly_Fatal_And_Damaging 10
## Minor_Damage_And_Injuries 21
## Significant_Damage_And_Fatalities 3
## Significant_Damage_And_Serious_Injuries 516
mean(test$pred_rf2==test[,1])
## [1] 0.946
print(model4)
##
## Call:
## randomForest(formula = Severity ~ ., data = train, xtree = 400, mtry = 6, nodesize = 2)
## Type of random forest: classification
## Number of trees: 500
## No. of variables tried at each split: 6
##
## OOB estimate of error rate: 4.9%
## Confusion matrix:
## Highly_Fatal_And_Damaging
## Highly_Fatal_And_Damaging 2324
## Minor_Damage_And_Injuries 52
## Significant_Damage_And_Fatalities 12
## Significant_Damage_And_Serious_Injuries 39
## Minor_Damage_And_Injuries
## Highly_Fatal_And_Damaging 53
## Minor_Damage_And_Injuries 1932
## Significant_Damage_And_Fatalities 21
## Significant_Damage_And_Serious_Injuries 56
## Significant_Damage_And_Fatalities
## Highly_Fatal_And_Damaging 54
## Minor_Damage_And_Injuries 22
## Significant_Damage_And_Fatalities 1275
## Significant_Damage_And_Serious_Injuries 7
## Significant_Damage_And_Serious_Injuries
## Highly_Fatal_And_Damaging 33
## Minor_Damage_And_Injuries 18
## Significant_Damage_And_Fatalities 25
## Significant_Damage_And_Serious_Injuries 2077
## class.error
## Highly_Fatal_And_Damaging 0.05681818
## Minor_Damage_And_Injuries 0.04545455
## Significant_Damage_And_Fatalities 0.04351088
## Significant_Damage_And_Serious_Injuries 0.04681046
#install.packages("bagRboostR")
library(bagRboostR)
df2$Severity <- predict(model4,df2)
oob = trainControl(method = "oob")
cv_5 = trainControl(method = "cv", number = 5)
rf_grid = expand.grid(mtry = 1:10)
set.seed(825)
flight_rf_tune = train(Severity ~ ., data = train,
method = "rf",
trControl = oob,
verbose = FALSE,
tuneGrid = rf_grid)
flight_rf_tune
## Random Forest
##
## 8000 samples
## 10 predictor
## 4 classes: 'Highly_Fatal_And_Damaging', 'Minor_Damage_And_Injuries', 'Significant_Damage_And_Fatalities', 'Significant_Damage_And_Serious_Injuries'
##
## No pre-processing
## Resampling results across tuning parameters:
##
## mtry Accuracy Kappa
## 1 0.817125 0.7522339
## 2 0.900750 0.8657478
## 3 0.931250 0.9070590
## 4 0.942250 0.9219407
## 5 0.948125 0.9298797
## 6 0.951750 0.9347763
## 7 0.951250 0.9341052
## 8 0.952750 0.9361257
## 9 0.952250 0.9354564
## 10 0.951750 0.9347667
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 8.
calc_acc = function(actual, predicted) {
mean(actual == predicted)
}
flight_rf_tune$bestTune
df2$Severity <- predict(flight_rf_tune,df2)
set.seed(120)
model5 <- train(Severity~.,data=train,method="svmLinear", trControl = trainControl("cv", number = 10),
preProcess = c("center","scale"))
test$pred_svm <- model5 %>% predict(test[,-1])
head(test$pred_svm)
## [1] Minor_Damage_And_Injuries
## [2] Minor_Damage_And_Injuries
## [3] Highly_Fatal_And_Damaging
## [4] Significant_Damage_And_Fatalities
## [5] Significant_Damage_And_Serious_Injuries
## [6] Significant_Damage_And_Serious_Injuries
## 4 Levels: Highly_Fatal_And_Damaging ... Significant_Damage_And_Serious_Injuries
mean(test$pred_svm == test[,1])
## [1] 0.7045
####Building this Machine Learning models can help us to anticipate and classify the severity of any airplane accident based on past incidents. With this, all airlines, even the entire aviation industry, can predict the severity of airplane accidents caused due to various factors and, correspondingly, have a plan of action to minimize the risk associated with them.
#gbm_grid = expand.grid(interaction.depth = 1:5, n.trees = (1:6) * 500, shrinkage = c(0.001, 0.01, 0.1),n.minobsinnode = 10)
#install.packages("gbm")
#library(gbm)
#flight_gbm_tune = train(Severity ~ ., data = train,method = "gbm", trControl = cv_5, verbose = FALSE, tuneGrid = gbm_grid)