options(warn=-2)
library(pacman)
p_load(dplyr)
source('../lib/import.R')
#
# A library of common functions developed from this assignment (TP).
import('../lib/common-lib.R')
## [1] "-> './reflection.R' script loadded successfuly!"
## [1] "-> './data-frame.R' script loadded successfuly!"
## [1] "-> './scale.R' script loadded successfuly!"
## [1] "-> './csv.R' script loadded successfuly!"
## [1] "-> './plot.R' script loadded successfuly!"
## [1] "-> './data-frame.R' script loadded successfuly!"
## [1] "-> './importance.R' script loadded successfuly!"
## [1] "-> './correlation.R' script loadded successfuly!"
## [1] "-> './pca.R' script loadded successfuly!"
## [1] "-> './set_split.R' script loadded successfuly!"
## [1] "-> './metrics.R' script loadded successfuly!"
## [1] "-> './models.R' script loadded successfuly!"
## ---
## biotools version 4.1
## [1] "-> './data-frame.R' script loadded successfuly!"
## [1] "-> './test.R' script loadded successfuly!"
## [1] "-> './metrics.R' script loadded successfuly!"
## [1] "-> './balance.R' script loadded successfuly!"
## [1] "-> '../lib/common-lib.R' script loadded successfuly!"
#
# Functions specific to this assignment.
import('../scripts/helper_functions.R')
## [1] "-> '../lib/pca.R' script loadded successfuly!"
## [1] "-> './data-frame.R' script loadded successfuly!"
## [1] "-> '../lib/importance.R' script loadded successfuly!"
## [1] "-> '../scripts/helper_functions.R' script loadded successfuly!"
set.seed(1)
dataset <- loadcsv('../datasets/nasa.csv')
str(dataset)
## 'data.frame': 4687 obs. of 40 variables:
## $ Neo.Reference.ID : int 3703080 3723955 2446862 3092506 3514799 3671135 2495323 2153315 2162463 2306383 ...
## $ Name : int 3703080 3723955 2446862 3092506 3514799 3671135 2495323 2153315 2162463 2306383 ...
## $ Absolute.Magnitude : num 21.6 21.3 20.3 27.4 21.6 19.6 19.6 19.2 17.8 21.5 ...
## $ Est.Dia.in.KM.min. : num 0.1272 0.1461 0.2315 0.0088 0.1272 ...
## $ Est.Dia.in.KM.max. : num 0.2845 0.3266 0.5177 0.0197 0.2845 ...
## $ Est.Dia.in.M.min. : num 127.2 146.1 231.5 8.8 127.2 ...
## $ Est.Dia.in.M.max. : num 284.5 326.6 517.7 19.7 284.5 ...
## $ Est.Dia.in.Miles.min. : num 0.07905 0.09076 0.14385 0.00547 0.07905 ...
## $ Est.Dia.in.Miles.max. : num 0.1768 0.203 0.3217 0.0122 0.1768 ...
## $ Est.Dia.in.Feet.min. : num 417.4 479.2 759.5 28.9 417.4 ...
## $ Est.Dia.in.Feet.max. : num 933.3 1071.6 1698.3 64.6 933.3 ...
## $ Close.Approach.Date : chr "1995-01-01" "1995-01-01" "1995-01-08" "1995-01-15" ...
## $ Epoch.Date.Close.Approach : num 7.89e+11 7.89e+11 7.90e+11 7.90e+11 7.90e+11 ...
## $ Relative.Velocity.km.per.sec: num 6.12 18.11 7.59 11.17 9.84 ...
## $ Relative.Velocity.km.per.hr : num 22017 65210 27327 40226 35427 ...
## $ Miles.per.hour : num 13681 40519 16980 24995 22013 ...
## $ Miss.Dist..Astronomical. : num 0.419 0.383 0.051 0.285 0.408 ...
## $ Miss.Dist..lunar. : num 163.2 149 19.8 111 158.6 ...
## $ Miss.Dist..kilometers. : num 62753692 57298148 7622912 42683616 61010824 ...
## $ Miss.Dist..miles. : num 38993336 35603420 4736658 26522368 37910368 ...
## $ Orbiting.Body : chr "Earth" "Earth" "Earth" "Earth" ...
## $ Orbit.ID : int 17 21 22 7 25 40 43 22 100 30 ...
## $ Orbit.Determination.Date : chr "2017-04-06 08:36:37" "2017-04-06 08:32:49" "2017-04-06 09:20:19" "2017-04-06 09:15:49" ...
## $ Orbit.Uncertainity : int 5 3 0 6 1 1 1 0 0 0 ...
## $ Minimum.Orbit.Intersection : num 0.02528 0.18693 0.04306 0.00551 0.0348 ...
## $ Jupiter.Tisserand.Invariant : num 4.63 5.46 4.56 5.09 5.15 ...
## $ Epoch.Osculation : num 2458000 2458000 2458000 2458000 2458000 ...
## $ Eccentricity : num 0.426 0.352 0.348 0.217 0.21 ...
## $ Semi.Major.Axis : num 1.41 1.11 1.46 1.26 1.23 ...
## $ Inclination : num 6.03 28.41 4.24 7.91 16.79 ...
## $ Asc.Node.Longitude : num 314.4 136.7 259.5 57.2 84.6 ...
## $ Orbital.Period : num 610 426 644 514 496 ...
## $ Perihelion.Distance : num 0.808 0.718 0.951 0.984 0.968 ...
## $ Perihelion.Arg : num 57.3 313.1 248.4 18.7 158.3 ...
## $ Aphelion.Dist : num 2.01 1.5 1.97 1.53 1.48 ...
## $ Perihelion.Time : num 2458162 2457795 2458120 2457902 2457814 ...
## $ Mean.Anomaly : num 264.8 173.7 292.9 68.7 135.1 ...
## $ Mean.Motion : num 0.591 0.845 0.559 0.7 0.726 ...
## $ Equinox : chr "J2000" "J2000" "J2000" "J2000" ...
## $ Hazardous : chr "True" "False" "True" "False" ...
excluded_columns <- c(
'Neo.Reference.ID',
'Name',
'Close.Approach.Date',
'Epoch.Date.Close.Approach',
'Orbit.ID',
'Orbit.Determination.Date',
'Orbiting.Body',
'Est.Dia.in.Feet.min.',
'Est.Dia.in.Feet.max.',
'Est.Dia.in.M.min.',
'Est.Dia.in.M.max.',
'Est.Dia.in.KM.min.',
'Est.Dia.in.KM.max.',
'Equinox',
'Orbit.Uncertainity',
'Perihelion.Time'
)
ds_step_1 <- dataset %>% dplyr::select(-excluded_columns) %>% na.omit()
## Note: Using an external vector in selections is ambiguous.
## ℹ Use `all_of(excluded_columns)` instead of `excluded_columns` to silence this message.
## ℹ See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.
## This message is displayed once per session.
rm(dataset)
str(ds_step_1)
## 'data.frame': 4687 obs. of 24 variables:
## $ Absolute.Magnitude : num 21.6 21.3 20.3 27.4 21.6 19.6 19.6 19.2 17.8 21.5 ...
## $ Est.Dia.in.Miles.min. : num 0.07905 0.09076 0.14385 0.00547 0.07905 ...
## $ Est.Dia.in.Miles.max. : num 0.1768 0.203 0.3217 0.0122 0.1768 ...
## $ Relative.Velocity.km.per.sec: num 6.12 18.11 7.59 11.17 9.84 ...
## $ Relative.Velocity.km.per.hr : num 22017 65210 27327 40226 35427 ...
## $ Miles.per.hour : num 13681 40519 16980 24995 22013 ...
## $ Miss.Dist..Astronomical. : num 0.419 0.383 0.051 0.285 0.408 ...
## $ Miss.Dist..lunar. : num 163.2 149 19.8 111 158.6 ...
## $ Miss.Dist..kilometers. : num 62753692 57298148 7622912 42683616 61010824 ...
## $ Miss.Dist..miles. : num 38993336 35603420 4736658 26522368 37910368 ...
## $ Minimum.Orbit.Intersection : num 0.02528 0.18693 0.04306 0.00551 0.0348 ...
## $ Jupiter.Tisserand.Invariant : num 4.63 5.46 4.56 5.09 5.15 ...
## $ Epoch.Osculation : num 2458000 2458000 2458000 2458000 2458000 ...
## $ Eccentricity : num 0.426 0.352 0.348 0.217 0.21 ...
## $ Semi.Major.Axis : num 1.41 1.11 1.46 1.26 1.23 ...
## $ Inclination : num 6.03 28.41 4.24 7.91 16.79 ...
## $ Asc.Node.Longitude : num 314.4 136.7 259.5 57.2 84.6 ...
## $ Orbital.Period : num 610 426 644 514 496 ...
## $ Perihelion.Distance : num 0.808 0.718 0.951 0.984 0.968 ...
## $ Perihelion.Arg : num 57.3 313.1 248.4 18.7 158.3 ...
## $ Aphelion.Dist : num 2.01 1.5 1.97 1.53 1.48 ...
## $ Mean.Anomaly : num 264.8 173.7 292.9 68.7 135.1 ...
## $ Mean.Motion : num 0.591 0.845 0.559 0.7 0.726 ...
## $ Hazardous : chr "True" "False" "True" "False" ...
The idea of this step is to avoid reproducing the same results as other pre-existing analyses on Kaggle. To do so, we randomly reorder the observations with the shuffle parameter and then take a sample of 80% of the observations.
c(ds_step_2, any) %<-% train_test_split(
ds_step_1,
train_size = .8,
shuffle = TRUE
)
## [1] "Train Set size: 3749"
## [1] "Test set size: 938"
rm(any)
rm(ds_step_1)

We exclude the columns with a correlation above 90% (matching the cutoff of 0.9 used below). From each highly correlated group we keep a single variable, since they are all very similar.
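The verbose trace below ("Compare row ... flagging column ...") matches the output of caret's findCorrelation, so presumably the helper wraps it. A minimal sketch of that filtering idea under this assumption (find_high_correlated_columns itself is defined in the common library):
# Sketch only: correlation-based filtering in the style of caret.
# Assumption: find_high_correlated_columns wraps something like this.
library(caret)
drop_high_correlated <- function(features, cutoff = 0.9) {
  to_drop <- findCorrelation(cor(features), cutoff = cutoff, verbose = TRUE)
  if (length(to_drop) == 0) return(features)
  features[, -to_drop, drop = FALSE]  # keep one variable per correlated group
}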
high_correlated_columns <- find_high_correlated_columns(
feat(ds_step_2),
cutoff=.9
)
## Note: Using an external vector in selections is ambiguous.
## ℹ Use `all_of(target_col)` instead of `target_col` to silence this message.
## ℹ See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.
## This message is displayed once per session.
## Compare row 21 and column 15 with corr 0.975
## Means: 0.292 vs 0.218 so flagging column 21
## Compare row 8 and column 9 with corr 1
## Means: 0.291 vs 0.21 so flagging column 8
## Compare row 9 and column 7 with corr 1
## Means: 0.255 vs 0.204 so flagging column 9
## Compare row 7 and column 10 with corr 1
## Means: 0.216 vs 0.201 so flagging column 7
## Compare row 15 and column 12 with corr 0.93
## Means: 0.27 vs 0.195 so flagging column 15
## Compare row 12 and column 23 with corr 0.993
## Means: 0.24 vs 0.189 so flagging column 12
## Compare row 4 and column 5 with corr 1
## Means: 0.299 vs 0.178 so flagging column 4
## Compare row 5 and column 6 with corr 1
## Means: 0.253 vs 0.165 so flagging column 5
## Compare row 3 and column 2 with corr 1
## Means: 0.219 vs 0.154 so flagging column 3
## All correlations <= 0.9
ds_step_3 <- ds_step_2 %>% dplyr::select(-high_correlated_columns[-1])
rm(ds_step_2)
str(ds_step_3)
## 'data.frame': 3749 obs. of 16 variables:
## $ Absolute.Magnitude : num 25.8 17.7 23.3 19.5 28.1 ...
## $ Est.Dia.in.Miles.min. : num 0.01143 0.47633 0.03562 0.20792 0.00396 ...
## $ Miles.per.hour : num 48093 28253 38598 41366 9717 ...
## $ Miss.Dist..miles. : num 42744884 26690324 21643130 44921436 590913 ...
## $ Minimum.Orbit.Intersection: num 0.01548 0.01962 0.00237 0.04906 0.00149 ...
## $ Epoch.Osculation : num 2458000 2458000 2458000 2458000 2458000 ...
## $ Eccentricity : num 0.24 0.578 0.447 0.464 0.147 ...
## $ Inclination : num 0.901 7.789 17.914 32.976 0.855 ...
## $ Asc.Node.Longitude : num 95.2 110.8 123 253.6 321.5 ...
## $ Orbital.Period : num 399 665 356 659 415 ...
## $ Perihelion.Distance : num 0.806 0.629 0.544 0.795 0.929 ...
## $ Perihelion.Arg : num 189.7 97.7 299.8 297.8 242.7 ...
## $ Aphelion.Dist : num 1.31 2.35 1.42 2.17 1.25 ...
## $ Mean.Anomaly : num 333 238 333 289 101 ...
## $ Mean.Motion : num 0.903 0.541 1.012 0.546 0.867 ...
## $ Hazardous : chr "False" "True" "False" "True" ...
ds_step_4 <- target_to_num(ds_step_3)
rm(ds_step_3)
str(ds_step_4)
## 'data.frame': 3749 obs. of 16 variables:
## $ Absolute.Magnitude : num 25.8 17.7 23.3 19.5 28.1 ...
## $ Est.Dia.in.Miles.min. : num 0.01143 0.47633 0.03562 0.20792 0.00396 ...
## $ Miles.per.hour : num 48093 28253 38598 41366 9717 ...
## $ Miss.Dist..miles. : num 42744884 26690324 21643130 44921436 590913 ...
## $ Minimum.Orbit.Intersection: num 0.01548 0.01962 0.00237 0.04906 0.00149 ...
## $ Epoch.Osculation : num 2458000 2458000 2458000 2458000 2458000 ...
## $ Eccentricity : num 0.24 0.578 0.447 0.464 0.147 ...
## $ Inclination : num 0.901 7.789 17.914 32.976 0.855 ...
## $ Asc.Node.Longitude : num 95.2 110.8 123 253.6 321.5 ...
## $ Orbital.Period : num 399 665 356 659 415 ...
## $ Perihelion.Distance : num 0.806 0.629 0.544 0.795 0.929 ...
## $ Perihelion.Arg : num 189.7 97.7 299.8 297.8 242.7 ...
## $ Aphelion.Dist : num 1.31 2.35 1.42 2.17 1.25 ...
## $ Mean.Anomaly : num 333 238 333 289 101 ...
## $ Mean.Motion : num 0.903 0.541 1.012 0.546 0.867 ...
## $ Hazardous : num 0 1 0 1 0 0 0 0 0 0 ...
We keep the columns that best separate the classes. For this step we use the features_importance function, which relies on the Random Forest algorithm. It lets us compare the variables according to how well they separate the classes; we then keep the 5 variables that separate the classes best.
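The "Selecting by MeanDecreaseGini" message below suggests the helper relies on the randomForest package. A minimal sketch of that ranking, under this assumption:
# Sketch only: Gini-importance ranking with randomForest.
# Assumption: features_importance does something along these lines.
library(randomForest)
rank_features <- function(df, target) {
  df[[target]] <- as.factor(df[[target]])
  rf <- randomForest(reformulate('.', response = target), data = df)
  imp <- importance(rf)  # one MeanDecreaseGini value per feature
  imp[order(imp[, 'MeanDecreaseGini'], decreasing = TRUE), , drop = FALSE]
}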
result <- features_importance(target_to_str(ds_step_4), target = 'Hazardous')
## Note: Using an external vector in selections is ambiguous.
## ℹ Use `all_of(target)` instead of `target` to silence this message.
## ℹ See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.
## This message is displayed once per session.
plot_features_importance(result)
n_best_features <- 5
# n_best_features = 15
best_features <- top_acc_features(result, top=n_best_features)
## Selecting by MeanDecreaseGini
best_features
## [1] "Minimum.Orbit.Intersection" "Absolute.Magnitude"
## [3] "Est.Dia.in.Miles.min." "Perihelion.Distance"
## [5] "Inclination"
length(best_features)
## [1] 5
ds_step_5 <- ds_step_4 %>% dplyr::select(c(best_features, c(Hazardous)))
## Note: Using an external vector in selections is ambiguous.
## ℹ Use `all_of(best_features)` instead of `best_features` to silence this message.
## ℹ See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.
## This message is displayed once per session.
In this section we try to group similar variables and then select one from each group. It is another way of removing highly correlated variables.
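A minimal sketch of clustering variables rather than observations: transposing the scaled feature matrix turns each variable into a row, which can then be fed to k-means. We assume kmeans_variable_clusters (from the helper scripts) works roughly like this:
# Sketch only: k-means over variables (rows = variables after t()).
cluster_variables <- function(features, n_clusters) {
  variable_rows <- t(scale(features))  # one row per standardized variable
  km <- kmeans(variable_rows, centers = n_clusters, nstart = 25)
  split(rownames(variable_rows), km$cluster)  # variable names per cluster
}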
clustering_metrics_plot(scale(t(feat(ds_step_4))))
kmeans_variable_clusters(feat(ds_step_4), n_clusters = 2)
kmeans_variable_clusters(feat(ds_step_4), n_clusters = 4)

Here the idea is to select uncorrelated variables. This can be detected through the angle between the original variables in the PCA correlation circle: if the angle is very small, the variables are highly correlated and we can keep just one of them.
## **Results for the Principal Component Analysis (PCA)**
## The analysis was performed on 3749 individuals, described by 15 variables
## *The results are available in the following objects:
##
## name description
## 1 "$eig" "eigenvalues"
## 2 "$var" "results for the variables"
## 3 "$var$coord" "coord. for the variables"
## 4 "$var$cor" "correlations variables - dimensions"
## 5 "$var$cos2" "cos2 for the variables"
## 6 "$var$contrib" "contributions of the variables"
## 7 "$ind" "results for the individuals"
## 8 "$ind$coord" "coord. for the individuals"
## 9 "$ind$cos2" "cos2 for the individuals"
## 10 "$ind$contrib" "contributions of the individuals"
## 11 "$call" "summary statistics"
## 12 "$call$centre" "mean of the variables"
## 13 "$call$ecart.type" "standard error of the variables"
## 14 "$call$row.w" "weights for the individuals"
## 15 "$call$col.w" "weights for the variables"
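The summary printed above has the shape of a FactoMineR::PCA object. Under that assumption, the correlation circle where these angles can be read off would be obtained as follows:
# Sketch only: variable correlation circle; a small angle between two
# arrows indicates highly correlated variables.
# Assumption: the summary above was produced by FactoMineR::PCA.
library(FactoMineR)
pca_result <- PCA(feat(ds_step_4), scale.unit = TRUE, graph = FALSE)
plot(pca_result, choix = 'var')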
rm(ds_step_4)
str(ds_step_5)
## 'data.frame': 3749 obs. of 6 variables:
## $ Minimum.Orbit.Intersection: num 0.01548 0.01962 0.00237 0.04906 0.00149 ...
## $ Absolute.Magnitude : num 25.8 17.7 23.3 19.5 28.1 ...
## $ Est.Dia.in.Miles.min. : num 0.01143 0.47633 0.03562 0.20792 0.00396 ...
## $ Perihelion.Distance : num 0.806 0.629 0.544 0.795 0.929 ...
## $ Inclination : num 0.901 7.789 17.914 32.976 0.855 ...
## $ Hazardous : num 0 1 0 1 0 0 0 0 0 0 ...
ds_step_6 <- filter_outliers_m1(ds_step_5, max_score = 0.55)
## [1] "Outliers: 3 %"
## [1] "Dataset without outliers: 97 %"
# ds_step_6 <- filter_outliers_m2(ds_step_5)
rm(ds_step_5)

We can see that the number of negative cases is much higher than the number of positive cases.
plot_hazardous_proportion(ds_step_6)
ds_step_6 %>%
group_by(Hazardous) %>%
tally() %>%
dplyr::rename(Count = n) %>%
mutate(Percent = (Count / nrow(ds_step_6))*100)
coparative_boxplot(feat(ds_step_6), to_col=n_best_features)
comparative_histplot(feat(ds_step_6), to_col=n_best_features)
## [[1]]
## [1] 3.7e-24
##
## [[2]]
## [1] 3.7e-24
##
## [[3]]
## [1] 3.7e-24
##
## [[4]]
## [1] 3.7e-24
##
## [[5]]
## [1] 3.7e-24
Observations: only Perihelion.Distance appears to be normal, or at least the closest to normal of them all.
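The test reported below is base R's shapiro.test applied column by column; a one-line sketch of that idea (uni_shapiro_test itself is defined in the common library):
# Sketch only: Shapiro-Wilk normality test on every feature column.
lapply(as.data.frame(feat(ds_step_6)), shapiro.test)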
uni_shapiro_test(feat(ds_step_6))
## [1] "=> Variable: Minimum.Orbit.Intersection"
## [1] ""
##
## Shapiro-Wilk normality test
##
## data: features[, col_index]
## W = 0.82248, p-value < 2.2e-16
##
## [1] "=> Variable: Absolute.Magnitude"
## [1] ""
##
## Shapiro-Wilk normality test
##
## data: features[, col_index]
## W = 0.98529, p-value < 2.2e-16
##
## [1] "=> Variable: Est.Dia.in.Miles.min."
## [1] ""
##
## Shapiro-Wilk normality test
##
## data: features[, col_index]
## W = 0.7019, p-value < 2.2e-16
##
## [1] "=> Variable: Perihelion.Distance"
## [1] ""
##
## Shapiro-Wilk normality test
##
## data: features[, col_index]
## W = 0.98546, p-value < 2.2e-16
##
## [1] "=> Variable: Inclination"
## [1] ""
##
## Shapiro-Wilk normality test
##
## data: features[, col_index]
## W = 0.90036, p-value < 2.2e-16
Observations: in every case the p-value < 0.05, so normality is rejected for every variable. This matches the QQ plots, where no variable looks normal except Perihelion.Distance, which seems to tend toward normality.
mult_shapiro_test(feat(ds_step_6))
##
## Shapiro-Wilk normality test
##
## data: Z
## W = 0.62616, p-value < 2.2e-16
Observations: the p-value < 0.05, so multivariate normality is rejected. This is consistent with the univariate Shapiro tests and the QQ plots.
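The output above (a data matrix named Z) suggests mvnormtest::mshapiro.test, which expects variables as rows. A sketch under that assumption:
# Sketch only: multivariate Shapiro-Wilk test.
# Assumption: mult_shapiro_test wraps mvnormtest::mshapiro.test.
library(mvnormtest)
mshapiro.test(t(as.matrix(feat(ds_step_6))))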
multi_boxm_test(feat(ds_step_6), target(ds_step_6))
##
## Box's M-test for Homogeneity of Covariance Matrices
##
## data: features
## Chi-Sq (approx.) = 2635, df = 15, p-value < 2.2e-16
Observations: the p-value < 0.05, so the null hypothesis is rejected and we can say the covariance matrices are not homogeneous (the variables are not homoscedastic across groups).
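Since the common library loads biotools (see the import log at the top), the test above is presumably biotools::boxM. A sketch under that assumption:
# Sketch only: Box's M test for homogeneity of covariance matrices.
# Assumption: multi_boxm_test wraps biotools::boxM.
library(biotools)
boxM(feat(ds_step_6), target(ds_step_6))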
plot_correlations(feat(ds_step_6))
## **Results for the Principal Component Analysis (PCA)**
## The analysis was performed on 3642 individuals, described by 6 variables
## *The results are available in the following objects:
##
## name description
## 1 "$eig" "eigenvalues"
## 2 "$var" "results for the variables"
## 3 "$var$coord" "coord. for the variables"
## 4 "$var$cor" "correlations variables - dimensions"
## 5 "$var$cos2" "cos2 for the variables"
## 6 "$var$contrib" "contributions of the variables"
## 7 "$ind" "results for the individuals"
## 8 "$ind$coord" "coord. for the individuals"
## 9 "$ind$cos2" "cos2 for the individuals"
## 10 "$ind$contrib" "contributions of the individuals"
## 11 "$call" "summary statistics"
## 12 "$call$centre" "mean of the variables"
## 13 "$call$ecart.type" "standard error of the variables"
## 14 "$call$row.w" "weights for the individuals"
## 15 "$call$col.w" "weights for the variables"
## **Results for the Principal Component Analysis (PCA)**
## The analysis was performed on 3642 individuals, described by 5 variables
## *The results are available in the following objects:
##
## name description
## 1 "$eig" "eigenvalues"
## 2 "$var" "results for the variables"
## 3 "$var$coord" "coord. for the variables"
## 4 "$var$cor" "correlations variables - dimensions"
## 5 "$var$cos2" "cos2 for the variables"
## 6 "$var$contrib" "contributions of the variables"
## 7 "$ind" "results for the individuals"
## 8 "$ind$coord" "coord. for the individuals"
## 9 "$ind$cos2" "cos2 for the individuals"
## 10 "$ind$contrib" "contributions of the individuals"
## 11 "$call" "summary statistics"
## 12 "$call$centre" "mean of the variables"
## 13 "$call$ecart.type" "standard error of the variables"
## 14 "$call$row.w" "weights for the individuals"
## 15 "$call$col.w" "weights for the variables"
plot_robust_pca(ds_step_6)
## Standard deviations (1, .., p=5):
## [1] 1.6124640 1.0407353 0.9143094 0.6536660 0.3507857
##
## Rotation (n x k) = (5 x 5):
## PC1 PC2 PC3 PC4
## Minimum.Orbit.Intersection -0.3302822 -0.4627640 -0.07461278 0.8107627
## Absolute.Magnitude 0.5648344 -0.1150499 0.30452139 0.0830088
## Est.Dia.in.Miles.min. -0.5720441 0.2360517 -0.38548112 -0.2275099
## Perihelion.Distance 0.1896775 -0.7404103 -0.51218354 -0.3917361
## Inclination -0.4567974 -0.4107130 0.70055120 -0.3613244
## PC5
## Minimum.Orbit.Intersection 0.117727875
## Absolute.Magnitude 0.753725355
## Est.Dia.in.Miles.min. 0.645514276
## Perihelion.Distance -0.005084275
## Inclination 0.036382780
We split the dataset into training and test sets. Before splitting we also shuffle the observations so that the classification models cannot learn any sequence that might exist in the original order.
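A minimal sketch of what such a shuffle-then-split helper looks like (the real train_test_split is defined in the common library; this is only an illustration):
# Sketch only: shuffled train/test split.
split_train_test <- function(df, train_size = 0.8, shuffle = TRUE) {
  if (shuffle) df <- df[sample(nrow(df)), ]  # randomize the row order
  n_train <- floor(train_size * nrow(df))
  list(train = df[seq_len(n_train), ], test = df[-seq_len(n_train), ])
}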
c(raw_train_set, raw_test_set) %<-% train_test_split(
ds_step_6,
train_size = .8,
shuffle = TRUE
)
## [1] "Train Set size: 2913"
## [1] "Test set size: 729"
# balanced_train_set <- smote_balance(raw_train_set, raw_train_set$Hazardous)
#rm(raw_train_set)
balanced_train_set <- raw_train_set

We subtract the mean and divide by the standard deviation:
train_set <- balanced_train_set %>%
mutate_at(vars(-Hazardous), ~(scale(.) %>% as.vector))
test_set <- raw_test_set %>%
mutate_at(vars(-Hazardous), ~(scale(.) %>% as.vector))
rm(balanced_train_set)
rm(raw_test_set)

Before anything else we must choose the value of beta for the F-beta score metric. In this problem two factors must be weighed when deciding on the best value of beta:
By its nature the dataset is imbalanced: there are few hazardous asteroids and many that are not.
We must also weigh the cost of a false positive against that of a false negative. In this analysis a false positive carries the financial cost of a manual check: if we classify an asteroid as hazardous when it is not, we pay for studying it, but that cost is negligible compared with the cost of a false negative. A false negative is a hazardous asteroid classified as non-hazardous; in that case there is a potential cost in human lives and injuries.
One example is the 2013 Chelyabinsk (Russia) incident, where thousands of people were injured by an undetected asteroid.
For these reasons we need to prioritize recall over precision. Put simply, it is preferable to predict some negatives as positives than to predict the positives perfectly at the cost of false negatives.
Finally, to prioritize recall we choose beta > 1.
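For reference, the F-beta score weighs recall beta^2 times as heavily as precision. A sketch computing it from confusion-matrix counts (the fbeta_score helper used below comes from the common library):
# Sketch only: F-beta from confusion-matrix counts.
fbeta <- function(tp, fp, fn, beta = 2) {
  precision <- tp / (tp + fp)
  recall    <- tp / (tp + fn)
  (1 + beta^2) * precision * recall / (beta^2 * precision + recall)
}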
beta <- 2
lda_model <- lda(formula(Hazardous~.), train_set)
lda_threshold <- search_min_fn_threshold(
lda_model,
feat(test_set),
target(test_set),
xda_predict
)
lda_test_pred <- xda_predict(lda_model, feat(test_set), lda_threshold)
lda_train_pred <- xda_predict(lda_model, feat(train_set), lda_threshold)
plot_cm(lda_test_pred, target(test_set))
plot_roc(lda_test_pred, target(test_set))
fbeta_score(lda_test_pred, target(test_set), beta=2)
## [1] " F2Score: 0.618609406952965"
lda_model$scaling
## LD1
## Minimum.Orbit.Intersection -1.24621804
## Absolute.Magnitude -1.62915262
## Est.Dia.in.Miles.min. -0.41684943
## Perihelion.Distance 0.06871418
## Inclination -0.01479703
qda_model <- qda(formula(Hazardous~.), train_set)
qda_threshold <- search_min_fn_threshold(
qda_model,
feat(test_set),
target(test_set),
xda_predict
)
qda_test_pred <- xda_predict(qda_model, feat(test_set), qda_threshold)
qda_train_pred <- xda_predict(qda_model, feat(train_set), qda_threshold)
plot_cm(qda_test_pred, target(test_set))
plot_roc(qda_test_pred, target(test_set))
fbeta_score(qda_test_pred, target(test_set), beta=2)
## [1] " F2Score: 0.905688622754491"
rda_model <- rda(formula(Hazardous~.), train_set)
rda_threshold <- search_min_fn_threshold(
rda_model,
feat(test_set),
target(test_set),
xda_predict
)
rda_test_pred <- xda_predict(rda_model, feat(test_set), rda_threshold)
rda_train_pred <- xda_predict(rda_model, feat(train_set), rda_threshold)
plot_cm(rda_test_pred, target(test_set))
plot_roc(rda_test_pred, target(test_set))
fbeta_score(rda_test_pred, target(test_set), beta=2)
## [1] " F2Score: 0.905688622754491"
lr_model <- glm(formula(Hazardous~.), train_set, family=binomial)
lr_threshold <- search_min_fn_threshold(
lr_model,
feat(test_set),
target(test_set),
model_predict
)
lr_test_pred <- model_predict(lr_model, test_set, lr_threshold)
lr_train_pred <- model_predict(lr_model, train_set, lr_threshold)
plot_cm(lr_test_pred, target(test_set))
fp(lr_test_pred, target(test_set))
## [1] 11
fn(lr_test_pred, target(test_set))
## [1] 12
plot_roc(lr_test_pred, target(test_set))
fbeta_score(lr_test_pred, target(test_set), beta=2)
## [1] " F2Score: 0.902317880794702"
svm_model <- svm(formula(Hazardous~.), train_set, kernel = 'radial')
svm_threshold <- search_min_fn_threshold(
svm_model,
feat(test_set),
target(test_set),
model_predict
)
svm_test_pred <- model_predict(svm_model, test_set, svm_threshold)
svm_train_pred <- model_predict(svm_model, train_set, svm_threshold)
plot_cm(svm_test_pred, target(test_set))
plot_roc(svm_test_pred, target(test_set))
fbeta_score(svm_test_pred, target(test_set), beta=2)
## [1] " F2Score: 0.682074408117249"
xgboost_model <- xgboost(
as.matrix(feat(train_set)),
target(train_set),
eta = 0.2,
max_depth = 20,
nround = 15000,
eval_metric = 'logloss',
objective = "binary:logistic",
nthread = 24,
verbose = 0
)
xgb_threshold <- search_min_fn_threshold(
xgboost_model,
feat(test_set),
target(test_set),
xgboost_predict
)
xgb_test_pred <- xgboost_predict(xgboost_model, feat(test_set), xgb_threshold)
xgb_train_pred <- xgboost_predict(xgboost_model, feat(train_set), xgb_threshold)
plot_cm(xgb_test_pred, target(test_set))
plot_roc(xgb_test_pred, target(test_set))
fbeta_score(xgb_test_pred, target(test_set), beta=2)
## [1] " F2Score: 0.975409836065574"
data.frame(
Test.F2Score.Percent = c(
fbeta_score(lda_test_pred, target(test_set), beta=2, show=F),
fbeta_score(qda_test_pred, target(test_set), beta=2, show=F),
fbeta_score(rda_test_pred, target(test_set), beta=2, show=F),
fbeta_score(lr_test_pred, target(test_set), beta=2, show=F),
fbeta_score(svm_test_pred, target(test_set), beta=2, show=F),
fbeta_score(xgb_test_pred, target(test_set), beta=2, show=F)
),
Train.F2Score.Percent = c(
fbeta_score(lda_train_pred, target(train_set), beta=2, show=F),
fbeta_score(qda_train_pred, target(train_set), beta=2, show=F),
fbeta_score(rda_train_pred, target(train_set), beta=2, show=F),
fbeta_score(lr_train_pred, target(train_set), beta=2, show=F),
fbeta_score(svm_train_pred, target(train_set), beta=2, show=F),
fbeta_score(xgb_train_pred, target(train_set), beta=2, show=F)
),
False.Negative = c(
fn(lda_test_pred, target(test_set)),
fn(qda_test_pred, target(test_set)),
fn(rda_test_pred, target(test_set)),
fn(lr_test_pred, target(test_set)),
fn(svm_test_pred, target(test_set)),
fn(xgb_test_pred, target(test_set))
),
False.positive = c(
fp(lda_test_pred, target(test_set)),
fp(qda_test_pred, target(test_set)),
fp(rda_test_pred, target(test_set)),
fp(lr_test_pred, target(test_set)),
fp(svm_test_pred, target(test_set)),
fp(xgb_test_pred, target(test_set))
),
Model = c(
'LDA',
'QDA',
'RDA',
'Logistic Regression',
'SVM',
'XGBoost'
),
Umbral = c(
lda_threshold,
qda_threshold,
rda_threshold,
lr_threshold,
svm_threshold,
xgb_threshold
)
) %>%
mutate(
Test.F2Score.Percent = round(Test.F2Score.Percent * 100, 3),
Train.F2Score.Percent = round(Train.F2Score.Percent * 100, 3)
) %>%
dplyr::select(
False.Negative,
False.positive,
Test.F2Score.Percent,
Train.F2Score.Percent,
Model,
Umbral
) %>%
arrange(False.Negative, desc(Test.F2Score.Percent))
lda_threshold <- search_max_aur_threshold(
lda_model,
feat(test_set),
target(test_set),
xda_predict
)
lda_test_pred <- xda_predict(lda_model, feat(test_set), lda_threshold)
lda_train_pred <- xda_predict(lda_model, feat(train_set), lda_threshold)
qda_threshold <- search_max_aur_threshold(
qda_model,
feat(test_set),
target(test_set),
xda_predict
)
qda_test_pred <- xda_predict(qda_model, feat(test_set), qda_threshold)
qda_train_pred <- xda_predict(qda_model, feat(train_set), qda_threshold)
rda_threshold <- search_max_aur_threshold(
rda_model,
feat(test_set),
target(test_set),
xda_predict
)
rda_test_pred <- xda_predict(rda_model, feat(test_set), rda_threshold)
rda_train_pred <- xda_predict(rda_model, feat(train_set), rda_threshold)
lr_threshold <- search_max_aur_threshold(
lr_model,
feat(test_set),
target(test_set),
model_predict
)
lr_test_pred <- model_predict(lr_model, test_set, lr_threshold)
lr_train_pred <- model_predict(lr_model, train_set, lr_threshold)
svm_threshold <- search_max_aur_threshold(
svm_model,
feat(test_set),
target(test_set),
model_predict
)
svm_test_pred <- model_predict(svm_model, test_set, svm_threshold)
svm_train_pred <- model_predict(svm_model, train_set, svm_threshold)
xgb_threshold <- search_max_aur_threshold(
xgboost_model,
feat(test_set),
target(test_set),
xgboost_predict
)
xgb_test_pred <- xgboost_predict(xgboost_model, feat(test_set), xgb_threshold)
xgb_train_pred <- xgboost_predict(xgboost_model, feat(train_set), xgb_threshold)
data.frame(
Test.F2Score.Percent = c(
fbeta_score(lda_test_pred, target(test_set), beta=2, show=F),
fbeta_score(qda_test_pred, target(test_set), beta=2, show=F),
fbeta_score(rda_test_pred, target(test_set), beta=2, show=F),
fbeta_score(lr_test_pred, target(test_set), beta=2, show=F),
fbeta_score(svm_test_pred, target(test_set), beta=2, show=F),
fbeta_score(xgb_test_pred, target(test_set), beta=2, show=F)
),
Train.F2Score.Percent = c(
fbeta_score(lda_train_pred, target(train_set), beta=2, show=F),
fbeta_score(qda_train_pred, target(train_set), beta=2, show=F),
fbeta_score(rda_train_pred, target(train_set), beta=2, show=F),
fbeta_score(lr_train_pred, target(train_set), beta=2, show=F),
fbeta_score(svm_train_pred, target(train_set), beta=2, show=F),
fbeta_score(xgb_train_pred, target(train_set), beta=2, show=F)
),
False.Negative = c(
fn(lda_test_pred, target(test_set)),
fn(qda_test_pred, target(test_set)),
fn(rda_test_pred, target(test_set)),
fn(lr_test_pred, target(test_set)),
fn(svm_test_pred, target(test_set)),
fn(xgb_test_pred, target(test_set))
),
False.positive = c(
fp(lda_test_pred, target(test_set)),
fp(qda_test_pred, target(test_set)),
fp(rda_test_pred, target(test_set)),
fp(lr_test_pred, target(test_set)),
fp(svm_test_pred, target(test_set)),
fp(xgb_test_pred, target(test_set))
),
Model = c(
'LDA',
'QDA',
'RDA',
'Logistic Regression',
'SVM',
'XGBoost'
),
Umbral = c(
lda_threshold,
qda_threshold,
rda_threshold,
lr_threshold,
svm_threshold,
xgb_threshold
)
) %>%
mutate(
Test.F2Score.Percent = round(Test.F2Score.Percent * 100, 3),
Train.F2Score.Percent = round(Train.F2Score.Percent * 100, 3)
) %>%
dplyr::select(
False.Negative,
False.positive,
Test.F2Score.Percent,
Train.F2Score.Percent,
Model,
Umbral
) %>%
arrange(False.Negative, desc(Test.F2Score.Percent))
lda_threshold <- 0.5
lda_test_pred <- xda_predict(lda_model, feat(test_set), lda_threshold)
lda_train_pred <- xda_predict(lda_model, feat(train_set), lda_threshold)
qda_threshold <- 0.5
qda_test_pred <- xda_predict(qda_model, feat(test_set), qda_threshold)
qda_train_pred <- xda_predict(qda_model, feat(train_set), qda_threshold)
rda_threshold <- 0.5
rda_test_pred <- xda_predict(rda_model, feat(test_set), rda_threshold)
rda_train_pred <- xda_predict(rda_model, feat(train_set), rda_threshold)
lr_threshold <- 0.5
lr_test_pred <- model_predict(lr_model, test_set, lr_threshold)
lr_train_pred <- model_predict(lr_model, train_set, lr_threshold)
svm_threshold <- 0.5
svm_test_pred <- model_predict(svm_model, test_set, svm_threshold)
svm_train_pred <- model_predict(svm_model, train_set, svm_threshold)
xgb_threshold <- 0.5
xgb_test_pred <- xgboost_predict(xgboost_model, feat(test_set), xgb_threshold)
xgb_train_pred <- xgboost_predict(xgboost_model, feat(train_set), xgb_threshold)
data.frame(
Test.F2Score.Percent = c(
fbeta_score(lda_test_pred, target(test_set), beta=2, show=F),
fbeta_score(qda_test_pred, target(test_set), beta=2, show=F),
fbeta_score(rda_test_pred, target(test_set), beta=2, show=F),
fbeta_score(lr_test_pred, target(test_set), beta=2, show=F),
fbeta_score(svm_test_pred, target(test_set), beta=2, show=F),
fbeta_score(xgb_test_pred, target(test_set), beta=2, show=F)
),
Train.F2Score.Percent = c(
fbeta_score(lda_train_pred, target(train_set), beta=2, show=F),
fbeta_score(qda_train_pred, target(train_set), beta=2, show=F),
fbeta_score(rda_train_pred, target(train_set), beta=2, show=F),
fbeta_score(lr_train_pred, target(train_set), beta=2, show=F),
fbeta_score(svm_train_pred, target(train_set), beta=2, show=F),
fbeta_score(xgb_train_pred, target(train_set), beta=2, show=F)
),
False.Negative = c(
fn(lda_test_pred, target(test_set)),
fn(qda_test_pred, target(test_set)),
fn(rda_test_pred, target(test_set)),
fn(lr_test_pred, target(test_set)),
fn(svm_test_pred, target(test_set)),
fn(xgb_test_pred, target(test_set))
),
False.positive = c(
fp(lda_test_pred, target(test_set)),
fp(qda_test_pred, target(test_set)),
fp(rda_test_pred, target(test_set)),
fp(lr_test_pred, target(test_set)),
fp(svm_test_pred, target(test_set)),
fp(xgb_test_pred, target(test_set))
),
Model = c(
'LDA',
'QDA',
'RDA',
'Logistic Regression',
'SVM',
'XGBoost'
),
Umbral = c(
lda_threshold,
qda_threshold,
rda_threshold,
lr_threshold,
svm_threshold,
xgb_threshold
)
) %>%
mutate(
Test.F2Score.Percent = round(Test.F2Score.Percent * 100, 3),
Train.F2Score.Percent = round(Train.F2Score.Percent * 100, 3)
) %>%
dplyr::select(
False.Negative,
False.positive,
Test.F2Score.Percent,
Train.F2Score.Percent,
Model,
Umbral
) %>%
arrange(False.Negative, desc(Test.F2Score.Percent))
xgb_threshold <- search_max_aur_threshold(
xgboost_model,
feat(test_set),
target(test_set),
xgboost_predict
)
xgb_test_pred <- xgboost_predict(xgboost_model, feat(test_set), xgb_threshold)
biplot_fn_fp(test_set, xgb_test_pred)
## Standard deviations (1, .., p=5):
## [1] 1.5419684 1.0397571 0.8210701 0.7286113 0.3210216
##
## Rotation (n x k) = (5 x 5):
## PC1 PC2 PC3 PC4
## Minimum.Orbit.Intersection -0.3755154 -0.543082159 0.04118567 0.7464648
## Absolute.Magnitude 0.5910539 -0.009507185 -0.32662054 0.2414866
## Est.Dia.in.Miles.min. -0.5300961 0.086012586 0.34381894 -0.2915685
## Perihelion.Distance 0.2427178 -0.827036974 0.13493056 -0.4885232
## Inclination -0.4119825 -0.116546389 -0.86902534 -0.2465916
## PC5
## Minimum.Orbit.Intersection -0.07172153
## Absolute.Magnitude -0.69682716
## Est.Dia.in.Miles.min. -0.71300513
## Perihelion.Distance -0.01538517
## Inclination -0.02597903
qda_threshold <- search_max_aur_threshold(
qda_model,
feat(test_set),
target(test_set),
xda_predict
)
qda_test_pred <- xda_predict(qda_model, feat(test_set), qda_threshold)
biplot_fn_fp(test_set, qda_test_pred)
## Standard deviations (1, .., p=5):
## [1] 1.5419684 1.0397571 0.8210701 0.7286113 0.3210216
##
## Rotation (n x k) = (5 x 5):
## PC1 PC2 PC3 PC4
## Minimum.Orbit.Intersection -0.3755154 -0.543082159 0.04118567 0.7464648
## Absolute.Magnitude 0.5910539 -0.009507185 -0.32662054 0.2414866
## Est.Dia.in.Miles.min. -0.5300961 0.086012586 0.34381894 -0.2915685
## Perihelion.Distance 0.2427178 -0.827036974 0.13493056 -0.4885232
## Inclination -0.4119825 -0.116546389 -0.86902534 -0.2465916
## PC5
## Minimum.Orbit.Intersection -0.07172153
## Absolute.Magnitude -0.69682716
## Est.Dia.in.Miles.min. -0.71300513
## Perihelion.Distance -0.01538517
## Inclination -0.02597903
svm_threshold <- search_max_aur_threshold(
svm_model,
feat(test_set),
target(test_set),
model_predict
)
svm_test_pred <- model_predict(svm_model, test_set, svm_threshold)
biplot_fn_fp(test_set, svm_test_pred)
## Standard deviations (1, .., p=5):
## [1] 1.5419684 1.0397571 0.8210701 0.7286113 0.3210216
##
## Rotation (n x k) = (5 x 5):
## PC1 PC2 PC3 PC4
## Minimum.Orbit.Intersection -0.3755154 -0.543082159 0.04118567 0.7464648
## Absolute.Magnitude 0.5910539 -0.009507185 -0.32662054 0.2414866
## Est.Dia.in.Miles.min. -0.5300961 0.086012586 0.34381894 -0.2915685
## Perihelion.Distance 0.2427178 -0.827036974 0.13493056 -0.4885232
## Inclination -0.4119825 -0.116546389 -0.86902534 -0.2465916
## PC5
## Minimum.Orbit.Intersection -0.07172153
## Absolute.Magnitude -0.69682716
## Est.Dia.in.Miles.min. -0.71300513
## Perihelion.Distance -0.01538517
## Inclination -0.02597903
Standard (z-score) scaling with estimators of the normal distribution:
ds_normal_scale <- scale(feat(ds_step_6))

A different scaling from the typical normal one (min-max):
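A minimal sketch of what a per-column rescaling to [0, 1] presumably looks like (max_min_scale itself is defined in the common library):
# Sketch only: min-max scaling of each column to [0, 1].
min_max_scale_sketch <- function(features) {
  as.data.frame(lapply(features, function(x) (x - min(x)) / (max(x) - min(x))))
}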
ds_max_min_scale <- max_min_scale(feat(ds_step_6))
clustering_metrics_plot(ds_normal_scale)
clustering_metrics_plot(ds_max_min_scale)

For each variable scaling method, let us see which value of K maximizes the silhouette:
data.frame(
Scale = c('Normal', 'Min.Max'),
Max.Silhouette.K = c(
clustering_max_sil_k(ds_normal_scale),
clustering_max_sil_k(ds_max_min_scale)
)
) %>% arrange(desc(Max.Silhouette.K))
max_silhouette_k <- clustering_max_sil_k(ds_max_min_scale)
km_predictions <- kmeans_predict(
ds_step_6,
scale_fn = max_min_scale,
features_fn = feat,
k = max_silhouette_k
)
clusteging_pca_plot(km_predictions)
## Standard deviations (1, .., p=5):
## [1] 1.6124640 1.0407353 0.9143094 0.6536660 0.3507857
##
## Rotation (n x k) = (5 x 5):
## PC1 PC2 PC3 PC4
## Minimum.Orbit.Intersection -0.3302822 -0.4627640 -0.07461278 0.8107627
## Absolute.Magnitude 0.5648344 -0.1150499 0.30452139 0.0830088
## Est.Dia.in.Miles.min. -0.5720441 0.2360517 -0.38548112 -0.2275099
## Perihelion.Distance 0.1896775 -0.7404103 -0.51218354 -0.3917361
## Inclination -0.4567974 -0.4107130 0.70055120 -0.3613244
## PC5
## Minimum.Orbit.Intersection 0.117727875
## Absolute.Magnitude 0.753725355
## Est.Dia.in.Miles.min. 0.645514276
## Perihelion.Distance -0.005084275
## Inclination 0.036382780
plot_groups_by_hazardous(km_predictions)
km_data <- features_mean_by_group(km_predictions)
plot_minimum_orbit_intersection_by_group(km_data)
plot_absolute_magnitude_by_group(km_data)
plot_est_dia_in_miles_min_by_group(km_data)
plot_Perihelion.Distance_by_group(km_data)
plot_inclination_by_group(km_data)
km_predictions2 <- data.frame(km_predictions)
km_predictions2$cluster = km_predictions$Hazardous
clusteging_pca_plot(km_predictions2, labels = c('No', 'Yes'), colours = c("red", "blue"), alpha = 0.2)
## Standard deviations (1, .., p=5):
## [1] 1.6124640 1.0407353 0.9143094 0.6536660 0.3507857
##
## Rotation (n x k) = (5 x 5):
## PC1 PC2 PC3 PC4
## Minimum.Orbit.Intersection -0.3302822 -0.4627640 -0.07461278 0.8107627
## Absolute.Magnitude 0.5648344 -0.1150499 0.30452139 0.0830088
## Est.Dia.in.Miles.min. -0.5720441 0.2360517 -0.38548112 -0.2275099
## Perihelion.Distance 0.1896775 -0.7404103 -0.51218354 -0.3917361
## Inclination -0.4567974 -0.4107130 0.70055120 -0.3613244
## PC5
## Minimum.Orbit.Intersection 0.117727875
## Absolute.Magnitude 0.753725355
## Est.Dia.in.Miles.min. 0.645514276
## Perihelion.Distance -0.005084275
## Inclination 0.036382780
We take a small sample (1.5% of the observations) so that the dendrogram remains readable:
c(ds_hc, any) %<-% train_test_split(
ds_step_6,
train_size = .015,
shuffle = TRUE
)
## [1] "Train Set size: 54"
## [1] "Test set size: 3588"
rm(any)

Euclidean distance matrix:
distances_matrix <- dist(x = feat(ds_hc), method = "euclidean")

Dendrograms (according to the hierarchical clustering linkage applied):
hc_complete_model <- hclust(d = distances_matrix, method = "complete")
hc_average_model <- hclust(d = distances_matrix, method = "average")
hc_single_model <- hclust(d = distances_matrix, method = "single")
hc_ward_model <- hclust(d = distances_matrix, method = "ward.D2")

Computation of the cophenetic correlation coefficient, which measures how faithfully each dendrogram preserves the original pairwise distances:
data.frame(
Coeficiente = c(
cor(x = distances_matrix, cophenetic(hc_complete_model)),
cor(x = distances_matrix, cophenetic(hc_average_model)),
cor(x = distances_matrix, cophenetic(hc_single_model)),
cor(x = distances_matrix, cophenetic(hc_ward_model))
),
Modelo = c('Complete', 'Average', 'Single', 'Ward')
) %>% arrange(desc(Coeficiente))

Dendrogram plot for the Average model's predictions:
plot_dendrogram(hc_average_model, max_silhouette_k)
hc_predictions <- hc_predict(hc_average_model, ds_hc, max_silhouette_k)
clusteging_pca_plot(hc_predictions, alpha = 1)
## Standard deviations (1, .., p=5):
## [1] 1.3156236 0.9121555 0.6609760 0.5896885 0.1952982
##
## Rotation (n x k) = (5 x 5):
## PC1 PC2 PC3 PC4
## Minimum.Orbit.Intersection -0.4232060 -0.13676126 -0.89177030 -0.07722881
## Absolute.Magnitude 0.6237047 0.10318036 -0.33530575 0.47737960
## Est.Dia.in.Miles.min. -0.3931483 0.07188953 0.22717982 -0.25194735
## Perihelion.Distance 0.3981020 -0.76741491 -0.02496605 -0.48967982
## Inclination -0.3447294 -0.61364194 0.20020443 0.68035332
## PC5
## Minimum.Orbit.Intersection 0.03121703
## Absolute.Magnitude 0.50992647
## Est.Dia.in.Miles.min. 0.85157393
## Perihelion.Distance 0.11036091
## Inclination 0.04053138
hc_data <- features_mean_by_group(hc_predictions)
plot_groups_by_hazardous(hc_predictions)
plot_minimum_orbit_intersection_by_group(hc_data)
plot_absolute_magnitude_by_group(hc_data)
plot_est_dia_in_miles_min_by_group(hc_data)
plot_Perihelion.Distance_by_group(hc_data)
plot_inclination_by_group(hc_data)

By Adrian Marino
adrianmarino@gmail.com