This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.
When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:
summary(nasa_nb)
## Neo.Reference.ID Name Absolute.Magnitude Est.Dia.in.KM.min.
## Min. :2000433 Min. :2000433 Min. :11.16 Min. : 0.001011
## 1st Qu.:3097594 1st Qu.:3097594 1st Qu.:20.10 1st Qu.: 0.033462
## Median :3514799 Median :3514799 Median :21.90 Median : 0.110804
## Mean :3272298 Mean :3272298 Mean :22.27 Mean : 0.204604
## 3rd Qu.:3690060 3rd Qu.:3690060 3rd Qu.:24.50 3rd Qu.: 0.253837
## Max. :3781897 Max. :3781897 Max. :32.10 Max. :15.579552
## Est.Dia.in.KM.max. Est.Dia.in.M.min. Est.Dia.in.M.max.
## Min. : 0.00226 Min. : 1.011 Min. : 2.26
## 1st Qu.: 0.07482 1st Qu.: 33.462 1st Qu.: 74.82
## Median : 0.24777 Median : 110.804 Median : 247.77
## Mean : 0.45751 Mean : 204.604 Mean : 457.51
## 3rd Qu.: 0.56760 3rd Qu.: 253.837 3rd Qu.: 567.60
## Max. :34.83694 Max. :15579.552 Max. :34836.94
## Est.Dia.in.Miles.min. Est.Dia.in.Miles.max. Est.Dia.in.Feet.min.
## Min. :0.000628 Min. : 0.001404 Min. : 3.32
## 1st Qu.:0.020792 1st Qu.: 0.046493 1st Qu.: 109.78
## Median :0.068850 Median : 0.153954 Median : 363.53
## Mean :0.127135 Mean : 0.284283 Mean : 671.27
## 3rd Qu.:0.157727 3rd Qu.: 0.352688 3rd Qu.: 832.80
## Max. :9.680682 Max. :21.646663 Max. :51114.02
## Est.Dia.in.Feet.max. Close.Approach.Date Epoch.Date.Close.Approach
## Min. : 7.41 Length:4687 Min. :7.889e+11
## 1st Qu.: 245.49 Class :character 1st Qu.:1.016e+12
## Median : 812.88 Mode :character Median :1.203e+12
## Mean : 1501.01 Mean :1.180e+12
## 3rd Qu.: 1862.19 3rd Qu.:1.356e+12
## Max. :114294.42 Max. :1.473e+12
## Relative.Velocity.km.per.sec Relative.Velocity.km.per.hr Miles.per.hour
## Min. : 0.3355 Min. : 1208 Min. : 750.5
## 1st Qu.: 8.4329 1st Qu.: 30358 1st Qu.:18863.5
## Median :12.9179 Median : 46504 Median :28896.0
## Mean :13.9708 Mean : 50295 Mean :31251.3
## 3rd Qu.:18.0776 3rd Qu.: 65080 3rd Qu.:40437.9
## Max. :44.6337 Max. :160681 Max. :99841.2
## Miss.Dist..Astronomical. Miss.Dist..lunar. Miss.Dist..kilometers.
## Min. :0.0001779 Min. : 0.06919 Min. : 26610
## 1st Qu.:0.1334196 1st Qu.: 51.90021 1st Qu.:19959283
## Median :0.2650286 Median :103.09612 Median :39647712
## Mean :0.2567782 Mean : 99.88671 Mean :38413467
## 3rd Qu.:0.3841541 3rd Qu.:149.43592 3rd Qu.:57468628
## Max. :0.4998841 Max. :194.45491 Max. :74781600
## Miss.Dist..miles. Orbiting.Body Orbit.ID Orbit.Determination.Date
## Min. : 16535 Length:4687 Min. : 1.0 Length:4687
## 1st Qu.:12402124 Class :character 1st Qu.: 9.0 Class :character
## Median :24635948 Mode :character Median : 16.0 Mode :character
## Mean :23869022 Mean : 28.3
## 3rd Qu.:35709350 3rd Qu.: 31.0
## Max. :46467132 Max. :611.0
## Orbit.Uncertainity Minimum.Orbit.Intersection Jupiter.Tisserand.Invariant
## Min. :0.000 Min. :0.0000021 Min. :2.196
## 1st Qu.:0.000 1st Qu.:0.0145851 1st Qu.:4.050
## Median :3.000 Median :0.0473655 Median :5.071
## Mean :3.517 Mean :0.0823201 Mean :5.056
## 3rd Qu.:6.000 3rd Qu.:0.1235935 3rd Qu.:6.019
## Max. :9.000 Max. :0.4778910 Max. :9.025
## Epoch.Osculation Eccentricity Semi.Major.Axis Inclination
## Min. :2450165 Min. :0.007522 Min. :0.6159 Min. : 0.01451
## 1st Qu.:2458001 1st Qu.:0.240858 1st Qu.:1.0006 1st Qu.: 4.96234
## Median :2458001 Median :0.372450 Median :1.2410 Median :10.31184
## Mean :2457724 Mean :0.382569 Mean :1.4003 Mean :13.37384
## 3rd Qu.:2458001 3rd Qu.:0.512411 3rd Qu.:1.6784 3rd Qu.:19.51168
## Max. :2458020 Max. :0.960261 Max. :5.0720 Max. :75.40667
## Asc.Node.Longitude Orbital.Period Perihelion.Distance Perihelion.Arg
## Min. : 0.0019 Min. : 176.6 Min. :0.08074 Min. : 0.0069
## 1st Qu.: 83.0812 1st Qu.: 365.6 1st Qu.:0.63083 1st Qu.: 95.6259
## Median :172.6254 Median : 504.9 Median :0.83315 Median :189.7616
## Mean :172.1573 Mean : 635.6 Mean :0.81338 Mean :183.9322
## 3rd Qu.:255.0269 3rd Qu.: 794.2 3rd Qu.:0.99723 3rd Qu.:271.7776
## Max. :359.9059 Max. :4172.2 Max. :1.29983 Max. :359.9931
## Aphelion.Dist Perihelion.Time Mean.Anomaly Mean.Motion
## Min. :0.8038 Min. :2450100 Min. : 0.0032 Min. :0.08628
## 1st Qu.:1.2661 1st Qu.:2457815 1st Qu.: 87.0069 1st Qu.:0.45329
## Median :1.6182 Median :2457973 Median :185.7189 Median :0.71295
## Mean :1.9871 Mean :2457728 Mean :181.1679 Mean :0.73824
## 3rd Qu.:2.4512 3rd Qu.:2458108 3rd Qu.:276.5319 3rd Qu.:0.98467
## Max. :8.9839 Max. :2458839 Max. :359.9180 Max. :2.03900
## Equinox Hazardous
## Length:4687 Length:4687
## Class :character Class :character
## Mode :character Mode :character
##
##
##
sum(is.na(nasa_nb))
## [1] 0
nasa_clean <- nasa_nb %>% select(-Neo.Reference.ID, -Name , -Close.Approach.Date, -Orbiting.Body,
-Orbit.ID, -Orbit.Determination.Date, -Equinox,
-Asc.Node.Longitude, -Est.Dia.in.Feet.max., -Est.Dia.in.Feet.min.,
-Est.Dia.in.M.max., -Est.Dia.in.M.min., -Est.Dia.in.Miles.max.,
-Est.Dia.in.Miles.min., -Epoch.Date.Close.Approach, -Relative.Velocity.km.per.sec,
-Miss.Dist..Astronomical., -Miss.Dist..lunar., -Miss.Dist..miles.,
-Miles.per.hour, -Epoch.Osculation, -Perihelion.Time, -Orbital.Period) %>%
mutate(Hazardous=ifelse(Hazardous=="True", 1,0)) %>% mutate(Hazardous=as.factor(Hazardous))
set.seed(123)
nasa_idx = sample(nrow(nasa_clean), nrow(nasa_clean)*0.8)
# Training Set
nasa_trn = nasa_clean[nasa_idx, ]
# Test Set
nasa_tst = nasa_clean[-nasa_idx,]
subtrain_index <- sample(nrow(nasa_trn), nrow(nasa_trn)*0.65)
# Subtraining
sub_training <- nasa_trn[subtrain_index, ]
# Validation
validation <- nasa_trn[-subtrain_index, ]
summary(sub_training)
## Absolute.Magnitude Est.Dia.in.KM.min. Est.Dia.in.KM.max.
## Min. :13.92 Min. :0.001011 Min. :0.00226
## 1st Qu.:20.10 1st Qu.:0.035039 1st Qu.:0.07835
## Median :21.90 Median :0.110804 Median :0.24777
## Mean :22.28 Mean :0.195312 Mean :0.43673
## 3rd Qu.:24.40 3rd Qu.:0.253837 3rd Qu.:0.56760
## Max. :32.10 Max. :4.370740 Max. :9.77327
## Relative.Velocity.km.per.hr Miss.Dist..kilometers. Orbit.Uncertainity
## Min. : 1208 Min. : 72172 Min. :0.000
## 1st Qu.: 30419 1st Qu.:20793231 1st Qu.:0.000
## Median : 45118 Median :39742096 Median :3.000
## Mean : 49690 Mean :38709324 Mean :3.526
## 3rd Qu.: 63670 3rd Qu.:57311427 3rd Qu.:6.000
## Max. :153789 Max. :74781600 Max. :9.000
## Minimum.Orbit.Intersection Jupiter.Tisserand.Invariant Eccentricity
## Min. :0.0000021 Min. :2.240 Min. :0.007522
## 1st Qu.:0.0147658 1st Qu.:4.013 1st Qu.:0.244002
## Median :0.0485300 Median :5.063 Median :0.376130
## Mean :0.0831235 Mean :5.052 Mean :0.381790
## 3rd Qu.:0.1240818 3rd Qu.:6.030 3rd Qu.:0.506546
## Max. :0.4778910 Max. :9.025 Max. :0.960261
## Semi.Major.Axis Inclination Perihelion.Distance Perihelion.Arg
## Min. :0.6159 Min. : 0.02099 Min. :0.08074 Min. : 0.4588
## 1st Qu.:0.9997 1st Qu.: 4.91180 1st Qu.:0.62882 1st Qu.: 97.7800
## Median :1.2436 Median :10.24206 Median :0.83738 Median :193.2657
## Mean :1.4049 Mean :13.29716 Mean :0.81655 Mean :186.8172
## 3rd Qu.:1.7033 3rd Qu.:19.23963 3rd Qu.:1.00094 3rd Qu.:276.5412
## Max. :5.0720 Max. :75.40667 Max. :1.29983 Max. :359.9298
## Aphelion.Dist Mean.Anomaly Mean.Motion Hazardous
## Min. :0.8038 Min. : 0.0032 Min. :0.08628 0:2046
## 1st Qu.:1.2667 1st Qu.: 86.8170 1st Qu.:0.44336 1: 390
## Median :1.6127 Median :184.0743 Median :0.71068
## Mean :1.9932 Mean :180.7745 Mean :0.73691
## 3rd Qu.:2.4738 3rd Qu.:276.6560 3rd Qu.:0.98603
## Max. :8.9839 Max. :359.9180 Max. :2.03900
a <- round(prop.table(table(sub_training$Hazardous)),4)
b <- round(prop.table(table(validation$Hazardous)),4)
c <- round(prop.table(table(nasa_tst$Hazardous)),4)
dataset <- c(rep("Subtraining Set" , 2) , rep("Validation Set" , 2) , rep("Test Set" , 2))
esito <- rep(c("Non-Hazardous" , "Hazardous") , 3)
proporzione <- c(a,b,c)
data <- data.frame(dataset,esito,proporzione)
bar_plot <- ggplot(data, aes(fill=esito, y=proporzione, x=dataset)) +
geom_bar(position="fill", stat="identity") +
scale_fill_manual(values = c("purple","green2")) +
ggtitle("Proporzioni degli esiti")
ggplotly(bar_plot)
set.seed(123)
sub_training <- upSample(sub_training[,-17], sub_training[,17], yname = "Hazardous")
table(sub_training$Hazardous)
##
## 0 1
## 2046 2046
melted_data <- reshape2::melt(sub_training, id.vars = "Hazardous") %>% rename(Variabile=variable)
# summary(melted_data)
ggplot(melted_data, aes( x="", y = value, fill = Variabile)) +
geom_boxplot( outlier.shape="x", outlier.size=3, show.legend = F) +
facet_wrap(~ Variabile, nrow = 5, scales = "free") +
labs(title = "Boxplots per 12 Variabili", x = "") +
theme(axis.title.y=element_blank(),
axis.title.x=element_blank())
apply(sub_training[,-17], 2, skewness)
## Absolute.Magnitude Est.Dia.in.KM.min.
## 0.67329856 4.84574954
## Est.Dia.in.KM.max. Relative.Velocity.km.per.hr
## 4.84574954 0.90667778
## Miss.Dist..kilometers. Orbit.Uncertainity
## -0.12678447 0.68956984
## Minimum.Orbit.Intersection Jupiter.Tisserand.Invariant
## 2.23892229 0.11079086
## Eccentricity Semi.Major.Axis
## 0.22659508 1.02578150
## Inclination Perihelion.Distance
## 1.45343450 -0.10901073
## Perihelion.Arg Aphelion.Dist
## -0.11126068 1.20952428
## Mean.Anomaly Mean.Motion
## -0.07464279 0.43142266
sub_trn_log <- sub_training %>% mutate(Est.Dia.in.KM.min.=log(Est.Dia.in.KM.min.),
Est.Dia.in.KM.max.=log(Est.Dia.in.KM.max.),
Minimum.Orbit.Intersection=log(Minimum.Orbit.Intersection),
Inclination=log(Inclination))
#GRAFICO BOXPLOT POST TRASFORMAZIONI LOG
melted_data <- reshape2::melt(sub_trn_log, id.vars = "Hazardous") %>% rename(Variabile=variable)
# summary(melted_data)
ggplot(melted_data, aes( x="", y = value, fill = Variabile)) +
geom_boxplot( outlier.shape="x", outlier.size=3, show.legend = F) +
facet_wrap(~ Variabile, nrow = 5, scales = "free") +
labs(title = "Boxplots per 12 Variabili", x = "") +
theme(axis.title.y=element_blank(),
axis.title.x=element_blank())
apply(sub_trn_log[,-17], 2, skewness)
## Absolute.Magnitude Est.Dia.in.KM.min.
## 0.67329856 -0.67329856
## Est.Dia.in.KM.max. Relative.Velocity.km.per.hr
## -0.67329856 0.90667778
## Miss.Dist..kilometers. Orbit.Uncertainity
## -0.12678447 0.68956984
## Minimum.Orbit.Intersection Jupiter.Tisserand.Invariant
## -0.88291601 0.11079086
## Eccentricity Semi.Major.Axis
## 0.22659508 1.02578150
## Inclination Perihelion.Distance
## -0.73044257 -0.10901073
## Perihelion.Arg Aphelion.Dist
## -0.11126068 1.20952428
## Mean.Anomaly Mean.Motion
## -0.07464279 0.43142266
corr1 <- round(cor(sub_trn_log[,-c(17)]), 3)
corr_plot <- ggcorrplot(corr1, hc.order = TRUE, lab=T, lab_size = 4, type="lower", show.diag = T)
corr_plot
sub_trn_clean <- sub_trn_log %>% select(-Mean.Motion,-Semi.Major.Axis,-Aphelion.Dist,
-Est.Dia.in.KM.min., -Est.Dia.in.KM.max.)
melted_data2 <- reshape2::melt(sub_trn_clean, id.vars = "Hazardous")
ggplot(melted_data2, aes(x = value)) +
geom_density(aes(fill=factor(Hazardous)), alpha=0.5) +
facet_wrap(~ variable, nrow = 4, scales = "free") +
labs(title = "Distribuzioni di densità condizionatamente a Hazardous e Non-Hazardous", x = "Valore", y = "Densità ") +
scale_fill_manual(values = c("green2","purple"), name = "Esito", labels = c("Non-Hazardous", "Hazardous")) +
theme_minimal()
final_sub_trn <- sub_trn_clean %>% select(Absolute.Magnitude, Orbit.Uncertainity,
Minimum.Orbit.Intersection,Hazardous)
#### Normalizzazione Subtraining (con trasformazioni effettuate) ####
matrix_indicators <- matrix(0, nrow=(dim(final_sub_trn)[2]-1), ncol=2)
colnames(matrix_indicators) <- c("min", "max")
# calcolo min e max.
for (i in 1:(dim(final_sub_trn)[2]-1)){
matrix_indicators[i, "min"] <- min(final_sub_trn[,i])
matrix_indicators[i, "max"] <- max(final_sub_trn[,i])
}
# normalizzazione
for (i in 1:(dim(final_sub_trn)[2]-1)){
final_sub_trn[, i] <- (final_sub_trn[, i] - matrix_indicators[i, "min"])/(matrix_indicators[i, "max"]- matrix_indicators[i, "min"])
}
summary(final_sub_trn)
## Absolute.Magnitude Orbit.Uncertainity Minimum.Orbit.Intersection Hazardous
## Min. :0.0000 Min. :0.0000 Min. :0.0000 0:2046
## 1st Qu.:0.3179 1st Qu.:0.0000 1st Qu.:0.7080 1:2046
## Median :0.3949 Median :0.1111 Median :0.7778
## Mean :0.4134 Mean :0.2878 Mean :0.7638
## 3rd Qu.:0.4829 3rd Qu.:0.5556 3rd Qu.:0.8389
## Max. :1.0000 Max. :1.0000 Max. :1.0000
#### Preparazione (norm e log) Validation ####
validation_log <- validation %>% select(Absolute.Magnitude, Orbit.Uncertainity,
Minimum.Orbit.Intersection,Hazardous) %>%
mutate(Minimum.Orbit.Intersection=log(Minimum.Orbit.Intersection))
table(validation_log$Hazardous)
##
## 0 1
## 1107 206
summary(validation_log)
## Absolute.Magnitude Orbit.Uncertainity Minimum.Orbit.Intersection Hazardous
## Min. :11.16 Min. :0.000 Min. :-11.1140 0:1107
## 1st Qu.:20.00 1st Qu.:0.000 1st Qu.: -4.2013 1: 206
## Median :22.00 Median :4.000 Median : -3.1249
## Mean :22.30 Mean :3.599 Mean : -3.3772
## 3rd Qu.:24.70 3rd Qu.:7.000 3rd Qu.: -2.1242
## Max. :30.80 Max. :9.000 Max. : -0.7864
# normalizzazione
for (i in 1:(dim(validation_log)[2]-1)){
validation_log[, i] <- (validation_log[, i] - matrix_indicators[i, "min"])/(matrix_indicators[i, "max"]- matrix_indicators[i, "min"])
}
summary(validation_log)
## Absolute.Magnitude Orbit.Uncertainity Minimum.Orbit.Intersection Hazardous
## Min. :-0.1518 Min. :0.0000 Min. :0.1601 0:1107
## 1st Qu.: 0.3344 1st Qu.:0.0000 1st Qu.:0.7197 1: 206
## Median : 0.4444 Median :0.4444 Median :0.8068
## Mean : 0.4607 Mean :0.3999 Mean :0.7864
## 3rd Qu.: 0.5930 3rd Qu.:0.7778 3rd Qu.:0.8878
## Max. : 0.9285 Max. :1.0000 Max. :0.9961
LDA QDA
# Funzione per Q-Q plot per singola variabile
qq_plot <- function(data, var) {
ggplot(data, aes(sample = !!sym(var), color = Hazardous)) +
geom_qq_line(color="black", linewidth=0.8) +
geom_qq() +
facet_wrap(~Hazardous) +
labs(title = paste("Q-Q Plot di", var), size = 2) +
labs(x = "", y = "") +
scale_color_manual(values = c("green2","purple"), name = "Esito", labels = c("Non-Hazardous", "Hazardous"))
}
# Q-Q plot per le diverse variabili
plots_list <- lapply(names(final_sub_trn)[1:ncol(final_sub_trn)-1], function(var) qq_plot(final_sub_trn, var))
# Unione dei Q-Q plot
grid.arrange(grobs = plots_list, ncol = 2)
# Variabili di supporto
pvalue_shapiro <- matrix(0, nrow = (dim(final_sub_trn)[2]-1), ncol = 2)
rownames(pvalue_shapiro) = colnames(final_sub_trn)[-4]
colnames(pvalue_shapiro) = c("1", "0")
# Test Shapiro e costruzione di una matrice riassuntiva con i p-value condizionati alla classe
for (i in colnames(final_sub_trn)[-4]){
pvalue_shapiro[i, "1"] <- shapiro.test(final_sub_trn[final_sub_trn$Hazardous == "1", i])$p.value
pvalue_shapiro[i, "0"] <- shapiro.test(final_sub_trn[final_sub_trn$Hazardous == "0", i])$p.value
}
round(pvalue_shapiro, 10)
## 1 0
## Absolute.Magnitude 0 3.78e-08
## Orbit.Uncertainity 0 0.00e+00
## Minimum.Orbit.Intersection 0 0.00e+00
REGRESSIONE LOGISTICA
model_logit<-glm(Hazardous~., data=final_sub_trn, family=binomial)
summary(model_logit)#tutte significative tranne relative velocity per km e eccentricity
##
## Call:
## glm(formula = Hazardous ~ ., family = binomial, data = final_sub_trn)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 21.3387 0.7245 29.454 < 2e-16 ***
## Absolute.Magnitude -16.9947 0.6719 -25.295 < 2e-16 ***
## Orbit.Uncertainity -1.3040 0.1906 -6.842 7.81e-12 ***
## Minimum.Orbit.Intersection -18.1837 0.6601 -27.546 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 5672.7 on 4091 degrees of freedom
## Residual deviance: 2958.0 on 4088 degrees of freedom
## AIC: 2966
##
## Number of Fisher Scoring iterations: 6
influencePlot(model_logit)
## StudRes Hat CookD
## 315 -1.137165 1.002951e-02 0.002303363
## 746 -1.406131 1.998944e-02 0.008548113
## 1586 -4.401998 1.169127e-05 0.043251744
## 1928 -3.954934 4.575701e-05 0.026996968
final_sub_trn2<-final_sub_trn[-c(746,315,1586,1928), ]
model_logit2<-glm(Hazardous~., data=final_sub_trn2, family='binomial')
summary(model_logit2) #AIC MIGLIORATO
##
## Call:
## glm(formula = Hazardous ~ ., family = "binomial", data = final_sub_trn2)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 21.8868 0.7423 29.486 < 2e-16 ***
## Absolute.Magnitude -17.2263 0.6812 -25.289 < 2e-16 ***
## Orbit.Uncertainity -1.3087 0.1921 -6.811 9.68e-12 ***
## Minimum.Orbit.Intersection -18.7563 0.6786 -27.639 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 5667.2 on 4087 degrees of freedom
## Residual deviance: 2919.1 on 4084 degrees of freedom
## AIC: 2927.1
##
## Number of Fisher Scoring iterations: 6
probabilities <- predict(model_logit2, type = "response")
predictors <- c("Absolute.Magnitude", "Orbit.Uncertainity", "Minimum.Orbit.Intersection")
# Costruzione delle log(p1/(1-p1))
supp <- final_sub_trn2[, -c(4)]
supp <- supp %>%
mutate(logit = log(probabilities/(1-probabilities))) %>%
gather(key = "predictors", value = "predictor.value", -logit)
# Costruzione dei grafici
ggplot(supp, aes(logit, predictor.value))+
geom_point(size = 0.5, alpha = 0.5) +
geom_smooth(method = "loess") +
theme_bw() +
facet_wrap(~predictors, scales = "free_y")
## `geom_smooth()` using formula = 'y ~ x'
predAIC<-ifelse(predict(model_logit2, validation_log, type='response')>0.5, 1, 0)
confusionMatrix(as.factor(predAIC), validation_log[, 4], positive='1')
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 890 25
## 1 217 181
##
## Accuracy : 0.8157
## 95% CI : (0.7936, 0.8363)
## No Information Rate : 0.8431
## P-Value [Acc > NIR] : 0.9967
##
## Kappa : 0.4949
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.8786
## Specificity : 0.8040
## Pos Pred Value : 0.4548
## Neg Pred Value : 0.9727
## Prevalence : 0.1569
## Detection Rate : 0.1379
## Detection Prevalence : 0.3031
## Balanced Accuracy : 0.8413
##
## 'Positive' Class : 1
##
#F1 score
confusionMatrix(as.factor(predAIC), validation_log[, 4], positive='1')$byClass[7]
## F1
## 0.5993377
pred_roclogit <- prediction(predAIC, validation$Hazardous)
perf_logit <- performance(pred_roclogit,"tpr","fpr")
auc_logit <- performance(pred_roclogit, measure = "auc")@y.values
auc_logit
## [[1]]
## [1] 0.8413077
KNN
final_sub_trn_KNN <- sub_training %>% select(Absolute.Magnitude, Orbit.Uncertainity,
Minimum.Orbit.Intersection,Hazardous)
#### Normalizzazione Subtraining (per KNN senza trasformazioni effettuate) ####
matrix_indicators_KNN <- matrix(0, nrow=(dim(final_sub_trn_KNN)[2]-1), ncol=2)
colnames(matrix_indicators_KNN) <- c("min", "max")
# calcolo min e max.
for (i in 1:(dim(final_sub_trn_KNN)[2]-1)){
matrix_indicators_KNN[i, "min"] <- min(final_sub_trn_KNN[,i])
matrix_indicators_KNN[i, "max"] <- max(final_sub_trn_KNN[,i])
}
# normalizzazione
for (i in 1:(dim(final_sub_trn_KNN)[2]-1)){
final_sub_trn_KNN[, i] <- (final_sub_trn_KNN[, i] - matrix_indicators_KNN[i, "min"])/(matrix_indicators_KNN[i, "max"]- matrix_indicators_KNN[i, "min"])
}
summary(final_sub_trn_KNN)
## Absolute.Magnitude Orbit.Uncertainity Minimum.Orbit.Intersection Hazardous
## Min. :0.0000 Min. :0.0000 Min. :0.00000 0:2046
## 1st Qu.:0.3179 1st Qu.:0.0000 1st Qu.:0.02711 1:2046
## Median :0.3949 Median :0.1111 Median :0.06422
## Mean :0.4134 Mean :0.2878 Mean :0.12322
## 3rd Qu.:0.4829 3rd Qu.:0.5556 3rd Qu.:0.13670
## Max. :1.0000 Max. :1.0000 Max. :1.00000
#### Preparazione (e stand) Validation KNN ####
validation_KNN <- validation %>% select(Absolute.Magnitude, Orbit.Uncertainity,
Minimum.Orbit.Intersection,Hazardous)
table(validation_KNN$Hazardous)
##
## 0 1
## 1107 206
summary(validation_KNN)
## Absolute.Magnitude Orbit.Uncertainity Minimum.Orbit.Intersection Hazardous
## Min. :11.16 Min. :0.000 Min. :0.0000149 0:1107
## 1st Qu.:20.00 1st Qu.:0.000 1st Qu.:0.0149764 1: 206
## Median :22.00 Median :4.000 Median :0.0439429
## Mean :22.30 Mean :3.599 Mean :0.0799857
## 3rd Qu.:24.70 3rd Qu.:7.000 3rd Qu.:0.1195290
## Max. :30.80 Max. :9.000 Max. :0.4554590
# normalizzazione
for (i in 1:(dim(validation_KNN)[2]-1)){
validation_KNN[, i] <- (validation_KNN[, i] - matrix_indicators_KNN[i, "min"])/(matrix_indicators_KNN[i, "max"]- matrix_indicators_KNN[i, "min"])
}
summary(validation_KNN)
## Absolute.Magnitude Orbit.Uncertainity Minimum.Orbit.Intersection Hazardous
## Min. :-0.1518 Min. :0.0000 Min. :0.0000269 0:1107
## 1st Qu.: 0.3344 1st Qu.:0.0000 1st Qu.:0.0313343 1: 206
## Median : 0.4444 Median :0.4444 Median :0.0919478
## Mean : 0.4607 Mean :0.3999 Mean :0.1673688
## 3rd Qu.: 0.5930 3rd Qu.:0.7778 3rd Qu.:0.2501145
## Max. : 0.9285 Max. :1.0000 Max. :0.9530602
calc_class_err = function(actual, predicted) { mean(actual != predicted) }
## Valutazione del validation error al variare di k
## Con questo approccio posso definire anche il k ottimale da utilizzare poi nel test set
set.seed(123)
k_to_try = 1:100
err_k = rep(x = 0, times = length(k_to_try))
for (i in seq_along(k_to_try)) {
pred = knn(train = final_sub_trn_KNN[,-4],
test = validation_KNN[,-4], cl = final_sub_trn_KNN[,4],
k = k_to_try[i])
err_k[i] = calc_class_err(validation_KNN[,4], pred) }
## test error Vs k
plot(err_k, type = "b", col = "dodgerblue", cex = 1, pch = 20,
xlab = "k, number of neighbors", ylab = "classification error",
main = "(Test) Error Rate vs Neighbors")
## Errore minimo
min(err_k)
## [1] 0.01827875
## Valori di k che danno l'errore minimo
which(err_k == min(err_k))
## [1] 1
err_filtr <- err_k[err_k > 0.02]
posizione <- which.min(err_filtr)
best_k <- which(err_k == err_filtr[posizione])
best_k
## [1] 3
err_k[best_k]
## [1] 0.02665651
PREVISIONE REGR LOG E KNN SU VALIDATION
predAIC<-ifelse(predict(model_logit2, validation_log, type='response')>0.5, 1, 0)
confusionMatrix(as.factor(predAIC), validation_log[, 4], positive='1')
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 890 25
## 1 217 181
##
## Accuracy : 0.8157
## 95% CI : (0.7936, 0.8363)
## No Information Rate : 0.8431
## P-Value [Acc > NIR] : 0.9967
##
## Kappa : 0.4949
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.8786
## Specificity : 0.8040
## Pos Pred Value : 0.4548
## Neg Pred Value : 0.9727
## Prevalence : 0.1569
## Detection Rate : 0.1379
## Detection Prevalence : 0.3031
## Balanced Accuracy : 0.8413
##
## 'Positive' Class : 1
##
pred <- knn(train = final_sub_trn_KNN[,-4], test = validation_KNN[,-4], cl = final_sub_trn_KNN[,4],k = 3, prob = TRUE)
confusionMatrix(as.factor(pred),as.factor(validation_KNN$Hazardous), positive="1")
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 1074 2
## 1 33 204
##
## Accuracy : 0.9733
## 95% CI : (0.9631, 0.9814)
## No Information Rate : 0.8431
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.9051
##
## Mcnemar's Test P-Value : 3.959e-07
##
## Sensitivity : 0.9903
## Specificity : 0.9702
## Pos Pred Value : 0.8608
## Neg Pred Value : 0.9981
## Prevalence : 0.1569
## Detection Rate : 0.1554
## Detection Prevalence : 0.1805
## Balanced Accuracy : 0.9802
##
## 'Positive' Class : 1
##
# F1 score regr logistica
F1_regr_log <-confusionMatrix(as.factor(predAIC), validation_log[, 4], positive='1')$byClass[7]
F1_regr_log
## F1
## 0.5993377
# F1 score KNN
F1_KNN <- confusionMatrix(as.factor(pred),as.factor(validation_KNN$Hazardous), positive="1")$byClass[7]
F1_KNN
## F1
## 0.9209932
pred_roclogit <- prediction(predAIC, validation$Hazardous)
perf_logit <- performance(pred_roclogit,"tpr","fpr")
auc_logit <- performance(pred_roclogit, measure = "auc")@y.values
auc_logit
## [[1]]
## [1] 0.8413077
prob_knn_val <- attr(pred, "prob")
prob_knn_val <- 2*ifelse(pred == "0", 1-prob_knn_val, prob_knn_val) -1
pred_rocknn_val <- prediction(prob_knn_val, validation_KNN[, 4])
pred_knn_val <- performance(pred_rocknn_val, "tpr", "fpr")
auc_knn_val <- performance(pred_rocknn_val, measure = "auc")@y.values
auc_knn_val
## [[1]]
## [1] 0.9891314
par(mfrow=c(1,2))
plot(perf_logit, colorize=T, lwd=3, main="Regressione Logistica su Validation", cex.main = 1) ; abline(a=0, b=1, lty=2, col="gray")
plot(pred_knn_val, colorize=T, lwd=3, main="KNN su Validation", cex.main = 1); abline(a=0, b=1, lty=2, col="gray")
PARTE TRAINING SU TEST
KNN
training_KNN <- nasa_trn %>% select(Absolute.Magnitude, Orbit.Uncertainity,
Minimum.Orbit.Intersection,Hazardous)
# summary(training_KNN)
set.seed(123)
training_KNN <- upSample(training_KNN[,-4], training_KNN[,4], yname="Hazardous")
table(training_KNN$Hazardous)
##
## 0 1
## 3153 3153
# Normalizzazione
matrix_indicators_TRN_KNN <- matrix(0, nrow=(dim(training_KNN)[2]-1), ncol=2)
colnames(matrix_indicators_TRN_KNN) <- c("min", "max")
# calcolo min e max.
for (i in 1:(dim(training_KNN)[2]-1)){
matrix_indicators_TRN_KNN[i, "min"] <- min(training_KNN[,i])
matrix_indicators_TRN_KNN[i, "max"] <- max(training_KNN[,i])
}
# normalizzazione
for (i in 1:(dim(training_KNN)[2]-1)){
training_KNN[, i] <- (training_KNN[, i] - matrix_indicators_TRN_KNN[i, "min"])/(matrix_indicators_TRN_KNN[i, "max"]- matrix_indicators_TRN_KNN[i, "min"])
}
summary(training_KNN)
## Absolute.Magnitude Orbit.Uncertainity Minimum.Orbit.Intersection Hazardous
## Min. :0.0000 Min. :0.0000 Min. :0.00000 0:3153
## 1st Qu.:0.4031 1st Qu.:0.0000 1st Qu.:0.02717 1:3153
## Median :0.4699 Median :0.1111 Median :0.06108
## Mean :0.4884 Mean :0.2939 Mean :0.12154
## 3rd Qu.:0.5511 3rd Qu.:0.6667 3rd Qu.:0.13355
## Max. :1.0000 Max. :1.0000 Max. :1.00000
test_KNN <- nasa_tst %>% select(Absolute.Magnitude, Orbit.Uncertainity,
Minimum.Orbit.Intersection,Hazardous)
# summary(test_KNN)
# normalizzazione
for (i in 1:(dim(test_KNN)[2]-1)){
test_KNN[, i] <- (test_KNN[, i] - matrix_indicators_TRN_KNN[i, "min"])/(matrix_indicators_TRN_KNN[i, "max"]- matrix_indicators_TRN_KNN[i, "min"])
}
summary(test_KNN)
## Absolute.Magnitude Orbit.Uncertainity Minimum.Orbit.Intersection Hazardous
## Min. :0.1595 Min. :0.0000 Min. :0.00000 0:779
## 1st Qu.:0.4317 1st Qu.:0.0000 1st Qu.:0.02904 1:159
## Median :0.5081 Median :0.3333 Median :0.09748
## Mean :0.5272 Mean :0.3753 Mean :0.17472
## 3rd Qu.:0.6323 3rd Qu.:0.6667 3rd Qu.:0.26308
## Max. :0.8997 Max. :1.0000 Max. :0.95181
set.seed(123)
model <- knn(training_KNN[, -4], test_KNN[, -4], cl=training_KNN[, 4], k=3, prob=TRUE)
ALBERI DECISIONALI
set.seed (123)
tree.nasa.trn <- tree(Hazardous ~. , nasa_trn)
plot(tree.nasa.trn) ; text(tree.nasa.trn, pretty = 0) ; title(main = "Albero di classificazione non potato")
tree.pred1 <- predict(tree.nasa.trn , nasa_tst , type = "class")
confusionMatrix(tree.pred1, nasa_tst$Hazardous, positive="1")
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 775 2
## 1 4 157
##
## Accuracy : 0.9936
## 95% CI : (0.9861, 0.9976)
## No Information Rate : 0.8305
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.9774
##
## Mcnemar's Test P-Value : 0.6831
##
## Sensitivity : 0.9874
## Specificity : 0.9949
## Pos Pred Value : 0.9752
## Neg Pred Value : 0.9974
## Prevalence : 0.1695
## Detection Rate : 0.1674
## Detection Prevalence : 0.1716
## Balanced Accuracy : 0.9911
##
## 'Positive' Class : 1
##
#PRUNING
set.seed (123)
cv.nasa <- cv.tree(tree.nasa.trn , FUN = prune.misclass)
#SCELGO LA SIZE MIGLIORE
par(mfrow = c(1, 2))
plot(cv.nasa$size , cv.nasa$dev, type = "b")
plot(cv.nasa$k, cv.nasa$dev, type = "b")
par(mfrow= c(1,1))
prune.nasa <- prune.misclass(tree.nasa.trn , best = 3)
#GRAFICO POTATO
plot(prune.nasa) ; text(prune.nasa , pretty = 0) ; title(main = "Albero di classificazione potato")
RANDOM FOREST
set.seed(123)
rf_trn_1 <- randomForest(Hazardous ~ ., nasa_trn, importance = T, proximity = T, ntree = 500)
rf_trn_1
##
## Call:
## randomForest(formula = Hazardous ~ ., data = nasa_trn, importance = T, proximity = T, ntree = 500)
## Type of random forest: classification
## Number of trees: 500
## No. of variables tried at each split: 4
##
## OOB estimate of error rate: 0.37%
## Confusion matrix:
## 0 1 class.error
## 0 3146 7 0.002220108
## 1 7 589 0.011744966
plot(rf_trn_1)
prova <- rf_trn_1$err.rate
which.min(rf_trn_1$err.rate[,3])
## [1] 74
set.seed(123)
rf_trn_2 <- randomForest(Hazardous ~ ., nasa_trn, importance = T, proximity = T, ntree = 74)
rf_trn_2
##
## Call:
## randomForest(formula = Hazardous ~ ., data = nasa_trn, importance = T, proximity = T, ntree = 74)
## Type of random forest: classification
## Number of trees: 74
## No. of variables tried at each split: 4
##
## OOB estimate of error rate: 0.35%
## Confusion matrix:
## 0 1 class.error
## 0 3145 8 0.002537266
## 1 5 591 0.008389262
varImpPlot(rf_trn_2,
main = "Importanza delle variabili per la distinzione tra Hazardous e Non-Hazardous")
PREVISIONE SU TEST
KNN
confusionMatrix(data=model,reference=factor(test_KNN[, 4]), positive="1")
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 763 0
## 1 16 159
##
## Accuracy : 0.9829
## 95% CI : (0.9724, 0.9902)
## No Information Rate : 0.8305
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.9417
##
## Mcnemar's Test P-Value : 0.0001768
##
## Sensitivity : 1.0000
## Specificity : 0.9795
## Pos Pred Value : 0.9086
## Neg Pred Value : 1.0000
## Prevalence : 0.1695
## Detection Rate : 0.1695
## Detection Prevalence : 0.1866
## Balanced Accuracy : 0.9897
##
## 'Positive' Class : 1
##
ALBERI DECISIONALI
tree.pred2 <- predict(prune.nasa , nasa_tst , type = "class")
confusionMatrix(tree.pred2, nasa_tst$Hazardous, positive="1")
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 775 2
## 1 4 157
##
## Accuracy : 0.9936
## 95% CI : (0.9861, 0.9976)
## No Information Rate : 0.8305
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.9774
##
## Mcnemar's Test P-Value : 0.6831
##
## Sensitivity : 0.9874
## Specificity : 0.9949
## Pos Pred Value : 0.9752
## Neg Pred Value : 0.9974
## Prevalence : 0.1695
## Detection Rate : 0.1674
## Detection Prevalence : 0.1716
## Balanced Accuracy : 0.9911
##
## 'Positive' Class : 1
##
RANDOM FOREST
set.seed(123)
rf_pred <- predict(rf_trn_2, nasa_tst) %>%
as.data.frame() %>%
mutate(Hazardous = as.factor(`.`)) %>%
select(Hazardous)
confusionMatrix(rf_pred$Hazardous, nasa_tst$Hazardous, positive="1")
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 777 0
## 1 2 159
##
## Accuracy : 0.9979
## 95% CI : (0.9923, 0.9997)
## No Information Rate : 0.8305
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.9925
##
## Mcnemar's Test P-Value : 0.4795
##
## Sensitivity : 1.0000
## Specificity : 0.9974
## Pos Pred Value : 0.9876
## Neg Pred Value : 1.0000
## Prevalence : 0.1695
## Detection Rate : 0.1695
## Detection Prevalence : 0.1716
## Balanced Accuracy : 0.9987
##
## 'Positive' Class : 1
##
# VISUALIZATION POINTS
rf_class <- data.frame(actual = nasa_tst$Hazardous,
predicted = rf_pred$Hazardous) %>%
mutate(Status = ifelse(actual == predicted, "CORRETTI" , "SBAGLIATI"))
ggplot(data = rf_class, mapping = aes(x = predicted, y = actual,
color = Status, shape = Status)) +
scale_color_manual(values = c("green2","red2")) +
geom_jitter(size = 2, alpha = 0.8) +
labs(x = "Esito previsto",
y = "Esito reale", title= "Previsioni con modello RandomForest (tree = 40)") +
theme_bw()
# F1 score KNN
F1_KNN <- confusionMatrix(data=model,reference=factor(test_KNN[, 4]), positive="1")$byClass[7]
F1_KNN
## F1
## 0.9520958
F1_DT <- confusionMatrix(tree.pred2, nasa_tst$Hazardous, positive="1")$byClass[7]
F1_DT
## F1
## 0.98125
F1_RF <- confusionMatrix(rf_pred$Hazardous, nasa_tst$Hazardous, positive="1")$byClass[7]
F1_RF
## F1
## 0.99375
prob_knn <- attr(model, "prob")
prob_knn <- 2*ifelse(model == "0", 1-prob_knn, prob_knn) -1
pred_rocknn <- prediction(prob_knn, test_KNN[, 4])
pred_knn <- performance(pred_rocknn, "tpr", "fpr")
auc_knn <- performance(pred_rocknn, measure = "auc")@y.values
auc_knn
## [[1]]
## [1] 0.9965324
tree.preds <- predict(prune.nasa, nasa_tst, type="vector")[,"1"]
pred_roctree <- prediction(tree.preds, nasa_tst$Hazardous)
pred_tree <- performance(pred_roctree, "tpr", "fpr")
auc_tree <- performance(pred_roctree, measure = "auc")@y.values
auc_tree
## [[1]]
## [1] 0.9948168
pred_rocRF <- predict(rf_trn_2, newdata=nasa_tst, type="prob")
pred_rocRF <- pred_rocRF[,"1"]
pred_rocRF <- prediction(pred_rocRF, nasa_tst$Hazardous)
pred_RF <- performance(pred_rocRF, "tpr", "fpr")
auc_RF <- performance(pred_rocRF, measure = "auc")@y.values
auc_RF
## [[1]]
## [1] 0.9999677
par(mfrow=c(1,3))
plot(pred_knn, colorize=T, lwd=3) ; abline(a=0, b=1, lty=2, col="gray") ; title(main = "Curva di ROC per KNN", sub = paste("AUC equal to:", round(auc_knn[[1]],3)), cex.sub = 1.4)
plot(pred_tree, colorize=T, lwd=3, main="Curva di ROC per Alberi Decisionali"); abline(a=0, b=1, lty=2, col="gray"); title(sub = paste("AUC pari a:", round(auc_tree[[1]],3)), cex.sub = 1.4)
plot(pred_RF, colorize=T, lwd=3, main="Curva di ROC per RF"); abline(a=0, b=1, lty=2, col="gray"); title(sub = paste("AUC pari a:", round(auc_RF[[1]],5)), cex.sub = 1.4)