options(warn=-2)
library(pacman)
p_load(dplyr)
source('../lib/import.R')
#
# A library of common functions developed from this assignment (TP).
import('../lib/common-lib.R')
## [1] "-> './reflection.R' script loadded successfuly!"
## [1] "-> './data-frame.R' script loadded successfuly!"
## [1] "-> './scale.R' script loadded successfuly!"
## [1] "-> './csv.R' script loadded successfuly!"
## [1] "-> './plot.R' script loadded successfuly!"
## [1] "-> './data-frame.R' script loadded successfuly!"
## [1] "-> './importance.R' script loadded successfuly!"
## [1] "-> './correlation.R' script loadded successfuly!"
## [1] "-> './pca.R' script loadded successfuly!"
## [1] "-> './set_split.R' script loadded successfuly!"
## [1] "-> './metrics.R' script loadded successfuly!"
## [1] "-> './models.R' script loadded successfuly!"
## ---
## biotools version 4.1
## [1] "-> './data-frame.R' script loadded successfuly!"
## [1] "-> './test.R' script loadded successfuly!"
## [1] "-> './metrics.R' script loadded successfuly!"
## [1] "-> './balance.R' script loadded successfuly!"
## [1] "-> '../lib/common-lib.R' script loadded successfuly!"
#
# Functions specific to this assignment.
import('../scripts/helper_functions.R')
## [1] "-> '../lib/pca.R' script loadded successfuly!"
## [1] "-> './data-frame.R' script loadded successfuly!"
## [1] "-> '../lib/importance.R' script loadded successfuly!"
## [1] "-> '../scripts/helper_functions.R' script loadded successfuly!"
set.seed(1)
dataset <- loadcsv('../datasets/nasa.csv')
str(dataset)
## 'data.frame': 4687 obs. of 40 variables:
## $ Neo.Reference.ID : int 3703080 3723955 2446862 3092506 3514799 3671135 2495323 2153315 2162463 2306383 ...
## $ Name : int 3703080 3723955 2446862 3092506 3514799 3671135 2495323 2153315 2162463 2306383 ...
## $ Absolute.Magnitude : num 21.6 21.3 20.3 27.4 21.6 19.6 19.6 19.2 17.8 21.5 ...
## $ Est.Dia.in.KM.min. : num 0.1272 0.1461 0.2315 0.0088 0.1272 ...
## $ Est.Dia.in.KM.max. : num 0.2845 0.3266 0.5177 0.0197 0.2845 ...
## $ Est.Dia.in.M.min. : num 127.2 146.1 231.5 8.8 127.2 ...
## $ Est.Dia.in.M.max. : num 284.5 326.6 517.7 19.7 284.5 ...
## $ Est.Dia.in.Miles.min. : num 0.07905 0.09076 0.14385 0.00547 0.07905 ...
## $ Est.Dia.in.Miles.max. : num 0.1768 0.203 0.3217 0.0122 0.1768 ...
## $ Est.Dia.in.Feet.min. : num 417.4 479.2 759.5 28.9 417.4 ...
## $ Est.Dia.in.Feet.max. : num 933.3 1071.6 1698.3 64.6 933.3 ...
## $ Close.Approach.Date : chr "1995-01-01" "1995-01-01" "1995-01-08" "1995-01-15" ...
## $ Epoch.Date.Close.Approach : num 7.89e+11 7.89e+11 7.90e+11 7.90e+11 7.90e+11 ...
## $ Relative.Velocity.km.per.sec: num 6.12 18.11 7.59 11.17 9.84 ...
## $ Relative.Velocity.km.per.hr : num 22017 65210 27327 40226 35427 ...
## $ Miles.per.hour : num 13681 40519 16980 24995 22013 ...
## $ Miss.Dist..Astronomical. : num 0.419 0.383 0.051 0.285 0.408 ...
## $ Miss.Dist..lunar. : num 163.2 149 19.8 111 158.6 ...
## $ Miss.Dist..kilometers. : num 62753692 57298148 7622912 42683616 61010824 ...
## $ Miss.Dist..miles. : num 38993336 35603420 4736658 26522368 37910368 ...
## $ Orbiting.Body : chr "Earth" "Earth" "Earth" "Earth" ...
## $ Orbit.ID : int 17 21 22 7 25 40 43 22 100 30 ...
## $ Orbit.Determination.Date : chr "2017-04-06 08:36:37" "2017-04-06 08:32:49" "2017-04-06 09:20:19" "2017-04-06 09:15:49" ...
## $ Orbit.Uncertainity : int 5 3 0 6 1 1 1 0 0 0 ...
## $ Minimum.Orbit.Intersection : num 0.02528 0.18693 0.04306 0.00551 0.0348 ...
## $ Jupiter.Tisserand.Invariant : num 4.63 5.46 4.56 5.09 5.15 ...
## $ Epoch.Osculation : num 2458000 2458000 2458000 2458000 2458000 ...
## $ Eccentricity : num 0.426 0.352 0.348 0.217 0.21 ...
## $ Semi.Major.Axis : num 1.41 1.11 1.46 1.26 1.23 ...
## $ Inclination : num 6.03 28.41 4.24 7.91 16.79 ...
## $ Asc.Node.Longitude : num 314.4 136.7 259.5 57.2 84.6 ...
## $ Orbital.Period : num 610 426 644 514 496 ...
## $ Perihelion.Distance : num 0.808 0.718 0.951 0.984 0.968 ...
## $ Perihelion.Arg : num 57.3 313.1 248.4 18.7 158.3 ...
## $ Aphelion.Dist : num 2.01 1.5 1.97 1.53 1.48 ...
## $ Perihelion.Time : num 2458162 2457795 2458120 2457902 2457814 ...
## $ Mean.Anomaly : num 264.8 173.7 292.9 68.7 135.1 ...
## $ Mean.Motion : num 0.591 0.845 0.559 0.7 0.726 ...
## $ Equinox : chr "J2000" "J2000" "J2000" "J2000" ...
## $ Hazardous : chr "True" "False" "True" "False" ...
excluded_columns <- c(
'Neo.Reference.ID',
'Name',
'Close.Approach.Date',
'Epoch.Date.Close.Approach',
'Orbit.ID',
'Orbit.Determination.Date',
'Orbiting.Body',
'Est.Dia.in.Feet.min.',
'Est.Dia.in.Feet.max.',
'Est.Dia.in.M.min.',
'Est.Dia.in.M.max.',
'Est.Dia.in.KM.min.',
'Est.Dia.in.KM.max.',
'Equinox',
'Orbit.Uncertainity',
'Perihelion.Time'
)
ds_step_1 <- dataset %>% dplyr::select(-excluded_columns) %>% na.omit()
## Note: Using an external vector in selections is ambiguous.
## ℹ Use `all_of(excluded_columns)` instead of `excluded_columns` to silence this message.
## ℹ See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.
## This message is displayed once per session.
rm(dataset)
str(ds_step_1)
## 'data.frame': 4687 obs. of 24 variables:
## $ Absolute.Magnitude : num 21.6 21.3 20.3 27.4 21.6 19.6 19.6 19.2 17.8 21.5 ...
## $ Est.Dia.in.Miles.min. : num 0.07905 0.09076 0.14385 0.00547 0.07905 ...
## $ Est.Dia.in.Miles.max. : num 0.1768 0.203 0.3217 0.0122 0.1768 ...
## $ Relative.Velocity.km.per.sec: num 6.12 18.11 7.59 11.17 9.84 ...
## $ Relative.Velocity.km.per.hr : num 22017 65210 27327 40226 35427 ...
## $ Miles.per.hour : num 13681 40519 16980 24995 22013 ...
## $ Miss.Dist..Astronomical. : num 0.419 0.383 0.051 0.285 0.408 ...
## $ Miss.Dist..lunar. : num 163.2 149 19.8 111 158.6 ...
## $ Miss.Dist..kilometers. : num 62753692 57298148 7622912 42683616 61010824 ...
## $ Miss.Dist..miles. : num 38993336 35603420 4736658 26522368 37910368 ...
## $ Minimum.Orbit.Intersection : num 0.02528 0.18693 0.04306 0.00551 0.0348 ...
## $ Jupiter.Tisserand.Invariant : num 4.63 5.46 4.56 5.09 5.15 ...
## $ Epoch.Osculation : num 2458000 2458000 2458000 2458000 2458000 ...
## $ Eccentricity : num 0.426 0.352 0.348 0.217 0.21 ...
## $ Semi.Major.Axis : num 1.41 1.11 1.46 1.26 1.23 ...
## $ Inclination : num 6.03 28.41 4.24 7.91 16.79 ...
## $ Asc.Node.Longitude : num 314.4 136.7 259.5 57.2 84.6 ...
## $ Orbital.Period : num 610 426 644 514 496 ...
## $ Perihelion.Distance : num 0.808 0.718 0.951 0.984 0.968 ...
## $ Perihelion.Arg : num 57.3 313.1 248.4 18.7 158.3 ...
## $ Aphelion.Dist : num 2.01 1.5 1.97 1.53 1.48 ...
## $ Mean.Anomaly : num 264.8 173.7 292.9 68.7 135.1 ...
## $ Mean.Motion : num 0.591 0.845 0.559 0.7 0.726 ...
## $ Hazardous : chr "True" "False" "True" "False" ...
The idea of this step is to avoid reproducing the same results as other pre-existing analyses on Kaggle. To do so, we randomly reorder the observations with the shuffle parameter and then take a sample of 80% of the observations.
c(ds_step_2, any) %<-% train_test_split(
ds_step_1,
train_size = .8,
shuffle = TRUE
)
## [1] "Train Set size: 3749"
## [1] "Test set size: 938"
rm(any)
rm(ds_step_1)

We exclude the columns with a correlation above 90% (matching the cutoff of 0.9 used below). From each highly correlated group we keep a single variable, since they are all very similar.
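The verbose trace below ("Compare row ... flagging column ...") matches the output of caret's findCorrelation, so presumably the helper wraps it. A minimal sketch of that filtering idea under this assumption (find_high_correlated_columns itself is defined in the common library):
# Sketch only: correlation-based filtering in the style of caret.
# Assumption: find_high_correlated_columns wraps something like this.
library(caret)
drop_high_correlated <- function(features, cutoff = 0.9) {
  to_drop <- findCorrelation(cor(features), cutoff = cutoff, verbose = TRUE)
  if (length(to_drop) == 0) return(features)
  features[, -to_drop, drop = FALSE]  # keep one variable per correlated group
}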
high_correlated_columns <- find_high_correlated_columns(
feat(ds_step_2),
cutoff=.9
)
## Note: Using an external vector in selections is ambiguous.
## ℹ Use `all_of(target_col)` instead of `target_col` to silence this message.
## ℹ See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.
## This message is displayed once per session.
## Compare row 21 and column 15 with corr 0.975
## Means: 0.292 vs 0.218 so flagging column 21
## Compare row 8 and column 9 with corr 1
## Means: 0.291 vs 0.21 so flagging column 8
## Compare row 9 and column 7 with corr 1
## Means: 0.255 vs 0.204 so flagging column 9
## Compare row 7 and column 10 with corr 1
## Means: 0.216 vs 0.201 so flagging column 7
## Compare row 15 and column 12 with corr 0.93
## Means: 0.27 vs 0.195 so flagging column 15
## Compare row 12 and column 23 with corr 0.993
## Means: 0.24 vs 0.189 so flagging column 12
## Compare row 4 and column 5 with corr 1
## Means: 0.299 vs 0.178 so flagging column 4
## Compare row 5 and column 6 with corr 1
## Means: 0.253 vs 0.165 so flagging column 5
## Compare row 3 and column 2 with corr 1
## Means: 0.219 vs 0.154 so flagging column 3
## All correlations <= 0.9
ds_step_3 <- ds_step_2 %>% dplyr::select(-high_correlated_columns[-1])
rm(ds_step_2)
str(ds_step_3)
## 'data.frame': 3749 obs. of 16 variables:
## $ Absolute.Magnitude : num 25.8 17.7 23.3 19.5 28.1 ...
## $ Est.Dia.in.Miles.min. : num 0.01143 0.47633 0.03562 0.20792 0.00396 ...
## $ Miles.per.hour : num 48093 28253 38598 41366 9717 ...
## $ Miss.Dist..miles. : num 42744884 26690324 21643130 44921436 590913 ...
## $ Minimum.Orbit.Intersection: num 0.01548 0.01962 0.00237 0.04906 0.00149 ...
## $ Epoch.Osculation : num 2458000 2458000 2458000 2458000 2458000 ...
## $ Eccentricity : num 0.24 0.578 0.447 0.464 0.147 ...
## $ Inclination : num 0.901 7.789 17.914 32.976 0.855 ...
## $ Asc.Node.Longitude : num 95.2 110.8 123 253.6 321.5 ...
## $ Orbital.Period : num 399 665 356 659 415 ...
## $ Perihelion.Distance : num 0.806 0.629 0.544 0.795 0.929 ...
## $ Perihelion.Arg : num 189.7 97.7 299.8 297.8 242.7 ...
## $ Aphelion.Dist : num 1.31 2.35 1.42 2.17 1.25 ...
## $ Mean.Anomaly : num 333 238 333 289 101 ...
## $ Mean.Motion : num 0.903 0.541 1.012 0.546 0.867 ...
## $ Hazardous : chr "False" "True" "False" "True" ...
ds_step_4 <- target_to_num(ds_step_3)
rm(ds_step_3)
str(ds_step_4)
## 'data.frame': 3749 obs. of 16 variables:
## $ Absolute.Magnitude : num 25.8 17.7 23.3 19.5 28.1 ...
## $ Est.Dia.in.Miles.min. : num 0.01143 0.47633 0.03562 0.20792 0.00396 ...
## $ Miles.per.hour : num 48093 28253 38598 41366 9717 ...
## $ Miss.Dist..miles. : num 42744884 26690324 21643130 44921436 590913 ...
## $ Minimum.Orbit.Intersection: num 0.01548 0.01962 0.00237 0.04906 0.00149 ...
## $ Epoch.Osculation : num 2458000 2458000 2458000 2458000 2458000 ...
## $ Eccentricity : num 0.24 0.578 0.447 0.464 0.147 ...
## $ Inclination : num 0.901 7.789 17.914 32.976 0.855 ...
## $ Asc.Node.Longitude : num 95.2 110.8 123 253.6 321.5 ...
## $ Orbital.Period : num 399 665 356 659 415 ...
## $ Perihelion.Distance : num 0.806 0.629 0.544 0.795 0.929 ...
## $ Perihelion.Arg : num 189.7 97.7 299.8 297.8 242.7 ...
## $ Aphelion.Dist : num 1.31 2.35 1.42 2.17 1.25 ...
## $ Mean.Anomaly : num 333 238 333 289 101 ...
## $ Mean.Motion : num 0.903 0.541 1.012 0.546 0.867 ...
## $ Hazardous : num 0 1 0 1 0 0 0 0 0 0 ...
We keep the columns that best separate the classes. For this step we use the features_importance function, which relies on the Random Forest algorithm. It lets us compare the variables according to how well they separate the classes; we then keep the 5 variables that separate the classes best.
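The "Selecting by MeanDecreaseGini" message below suggests the helper relies on the randomForest package. A minimal sketch of that ranking, under this assumption:
# Sketch only: Gini-importance ranking with randomForest.
# Assumption: features_importance does something along these lines.
library(randomForest)
rank_features <- function(df, target) {
  df[[target]] <- as.factor(df[[target]])
  rf <- randomForest(reformulate('.', response = target), data = df)
  imp <- importance(rf)  # one MeanDecreaseGini value per feature
  imp[order(imp[, 'MeanDecreaseGini'], decreasing = TRUE), , drop = FALSE]
}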
result <- features_importance(target_to_str(ds_step_4), target = 'Hazardous')
## Note: Using an external vector in selections is ambiguous.
## ℹ Use `all_of(target)` instead of `target` to silence this message.
## ℹ See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.
## This message is displayed once per session.
plot_features_importance(result)
n_best_features <- 5
# n_best_features = 15
best_features <- top_acc_features(result, top=n_best_features)
## Selecting by MeanDecreaseGini
best_features
## [1] "Minimum.Orbit.Intersection" "Absolute.Magnitude"
## [3] "Est.Dia.in.Miles.min." "Perihelion.Distance"
## [5] "Inclination"
length(best_features)
## [1] 5
ds_step_5 <- ds_step_4 %>% dplyr::select(c(best_features, c(Hazardous)))
## Note: Using an external vector in selections is ambiguous.
## ℹ Use `all_of(best_features)` instead of `best_features` to silence this message.
## ℹ See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.
## This message is displayed once per session.
In this section we try to group similar variables and then select one from each group. It is another way of removing highly correlated variables.
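A minimal sketch of clustering variables rather than observations: transposing the scaled feature matrix turns each variable into a row, which can then be fed to k-means. We assume kmeans_variable_clusters (from the helper scripts) works roughly like this:
# Sketch only: k-means over variables (rows = variables after t()).
cluster_variables <- function(features, n_clusters) {
  variable_rows <- t(scale(features))  # one row per standardized variable
  km <- kmeans(variable_rows, centers = n_clusters, nstart = 25)
  split(rownames(variable_rows), km$cluster)  # variable names per cluster
}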
clustering_metrics_plot(scale(t(feat(ds_step_4))))
kmeans_variable_clusters(feat(ds_step_4), n_clusters = 2)
kmeans_variable_clusters(feat(ds_step_4), n_clusters = 4)

Here the idea is to select uncorrelated variables. This can be detected through the angle between the original variables in the PCA correlation circle: if the angle is very small, the variables are highly correlated and we can keep just one of them.
## **Results for the Principal Component Analysis (PCA)**
## The analysis was performed on 3749 individuals, described by 15 variables
## *The results are available in the following objects:
##
## name description
## 1 "$eig" "eigenvalues"
## 2 "$var" "results for the variables"
## 3 "$var$coord" "coord. for the variables"
## 4 "$var$cor" "correlations variables - dimensions"
## 5 "$var$cos2" "cos2 for the variables"
## 6 "$var$contrib" "contributions of the variables"
## 7 "$ind" "results for the individuals"
## 8 "$ind$coord" "coord. for the individuals"
## 9 "$ind$cos2" "cos2 for the individuals"
## 10 "$ind$contrib" "contributions of the individuals"
## 11 "$call" "summary statistics"
## 12 "$call$centre" "mean of the variables"
## 13 "$call$ecart.type" "standard error of the variables"
## 14 "$call$row.w" "weights for the individuals"
## 15 "$call$col.w" "weights for the variables"
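The summary printed above has the shape of a FactoMineR::PCA object. Under that assumption, the correlation circle where these angles can be read off would be obtained as follows:
# Sketch only: variable correlation circle; a small angle between two
# arrows indicates highly correlated variables.
# Assumption: the summary above was produced by FactoMineR::PCA.
library(FactoMineR)
pca_result <- PCA(feat(ds_step_4), scale.unit = TRUE, graph = FALSE)
plot(pca_result, choix = 'var')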
rm(ds_step_4)
str(ds_step_5)
## 'data.frame': 3749 obs. of 6 variables:
## $ Minimum.Orbit.Intersection: num 0.01548 0.01962 0.00237 0.04906 0.00149 ...
## $ Absolute.Magnitude : num 25.8 17.7 23.3 19.5 28.1 ...
## $ Est.Dia.in.Miles.min. : num 0.01143 0.47633 0.03562 0.20792 0.00396 ...
## $ Perihelion.Distance : num 0.806 0.629 0.544 0.795 0.929 ...
## $ Inclination : num 0.901 7.789 17.914 32.976 0.855 ...
## $ Hazardous : num 0 1 0 1 0 0 0 0 0 0 ...
ds_step_6 <- filter_outliers_m1(ds_step_5, max_score = 0.55)
## [1] "Outliers: 3 %"
## [1] "Dataset without outliers: 97 %"
# ds_step_6 <- filter_outliers_m2(ds_step_5)
rm(ds_step_5)

We can see that the number of negative cases is much higher than the number of positive cases.
plot_hazardous_proportion(ds_step_6)
ds_step_6 %>%
group_by(Hazardous) %>%
tally() %>%
dplyr::rename(Count = n) %>%
mutate(Percent = (Count / nrow(ds_step_6))*100)
coparative_boxplot(feat(ds_step_6), to_col=n_best_features)
comparative_histplot(feat(ds_step_6), to_col=n_best_features)
## [[1]]
## [1] 3.7e-24
##
## [[2]]
## [1] 3.7e-24
##
## [[3]]
## [1] 3.7e-24
##
## [[4]]
## [1] 3.7e-24
##
## [[5]]
## [1] 3.7e-24
Observations: only Perihelion.Distance appears to be normal, or at least the closest to normal of them all.
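The test reported below is base R's shapiro.test applied column by column; a one-line sketch of that idea (uni_shapiro_test itself is defined in the common library):
# Sketch only: Shapiro-Wilk normality test on every feature column.
lapply(as.data.frame(feat(ds_step_6)), shapiro.test)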
uni_shapiro_test(feat(ds_step_6))
## [1] "=> Variable: Minimum.Orbit.Intersection"
## [1] ""
##
## Shapiro-Wilk normality test
##
## data: features[, col_index]
## W = 0.82248, p-value < 2.2e-16
##
## [1] "=> Variable: Absolute.Magnitude"
## [1] ""
##
## Shapiro-Wilk normality test
##
## data: features[, col_index]
## W = 0.98529, p-value < 2.2e-16
##
## [1] "=> Variable: Est.Dia.in.Miles.min."
## [1] ""
##
## Shapiro-Wilk normality test
##
## data: features[, col_index]
## W = 0.7019, p-value < 2.2e-16
##
## [1] "=> Variable: Perihelion.Distance"
## [1] ""
##
## Shapiro-Wilk normality test
##
## data: features[, col_index]
## W = 0.98546, p-value < 2.2e-16
##
## [1] "=> Variable: Inclination"
## [1] ""
##
## Shapiro-Wilk normality test
##
## data: features[, col_index]
## W = 0.90036, p-value < 2.2e-16
Observations: in every case the p-value < 0.05, so normality is rejected for every variable. This matches the QQ plots, where no variable looks normal except Perihelion.Distance, which seems to tend toward normality.
mult_shapiro_test(feat(ds_step_6))
##
## Shapiro-Wilk normality test
##
## data: Z
## W = 0.62616, p-value < 2.2e-16
Observations: the p-value < 0.05, so multivariate normality is rejected. This is consistent with the univariate Shapiro tests and the QQ plots.
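The output above (a data matrix named Z) suggests mvnormtest::mshapiro.test, which expects variables as rows. A sketch under that assumption:
# Sketch only: multivariate Shapiro-Wilk test.
# Assumption: mult_shapiro_test wraps mvnormtest::mshapiro.test.
library(mvnormtest)
mshapiro.test(t(as.matrix(feat(ds_step_6))))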
multi_boxm_test(feat(ds_step_6), target(ds_step_6))
##
## Box's M-test for Homogeneity of Covariance Matrices
##
## data: features
## Chi-Sq (approx.) = 2635, df = 15, p-value < 2.2e-16
Observations: the p-value < 0.05, so the null hypothesis is rejected and we can say the covariance matrices are not homogeneous (the variables are not homoscedastic across groups).
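Since the common library loads biotools (see the import log at the top), the test above is presumably biotools::boxM. A sketch under that assumption:
# Sketch only: Box's M test for homogeneity of covariance matrices.
# Assumption: multi_boxm_test wraps biotools::boxM.
library(biotools)
boxM(feat(ds_step_6), target(ds_step_6))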
plot_correlations(feat(ds_step_6))
## **Results for the Principal Component Analysis (PCA)**
## The analysis was performed on 3642 individuals, described by 6 variables
## *The results are available in the following objects:
##
## name description
## 1 "$eig" "eigenvalues"
## 2 "$var" "results for the variables"
## 3 "$var$coord" "coord. for the variables"
## 4 "$var$cor" "correlations variables - dimensions"
## 5 "$var$cos2" "cos2 for the variables"
## 6 "$var$contrib" "contributions of the variables"
## 7 "$ind" "results for the individuals"
## 8 "$ind$coord" "coord. for the individuals"
## 9 "$ind$cos2" "cos2 for the individuals"
## 10 "$ind$contrib" "contributions of the individuals"
## 11 "$call" "summary statistics"
## 12 "$call$centre" "mean of the variables"
## 13 "$call$ecart.type" "standard error of the variables"
## 14 "$call$row.w" "weights for the individuals"
## 15 "$call$col.w" "weights for the variables"
## **Results for the Principal Component Analysis (PCA)**
## The analysis was performed on 3642 individuals, described by 5 variables
## *The results are available in the following objects:
##
## name description
## 1 "$eig" "eigenvalues"
## 2 "$var" "results for the variables"
## 3 "$var$coord" "coord. for the variables"
## 4 "$var$cor" "correlations variables - dimensions"
## 5 "$var$cos2" "cos2 for the variables"
## 6 "$var$contrib" "contributions of the variables"
## 7 "$ind" "results for the individuals"
## 8 "$ind$coord" "coord. for the individuals"
## 9 "$ind$cos2" "cos2 for the individuals"
## 10 "$ind$contrib" "contributions of the individuals"
## 11 "$call" "summary statistics"
## 12 "$call$centre" "mean of the variables"
## 13 "$call$ecart.type" "standard error of the variables"
## 14 "$call$row.w" "weights for the individuals"
## 15 "$call$col.w" "weights for the variables"
plot_robust_pca(ds_step_6)
## Standard deviations (1, .., p=5):
## [1] 1.6124640 1.0407353 0.9143094 0.6536660 0.3507857
##
## Rotation (n x k) = (5 x 5):
## PC1 PC2 PC3 PC4
## Minimum.Orbit.Intersection -0.3302822 -0.4627640 -0.07461278 0.8107627
## Absolute.Magnitude 0.5648344 -0.1150499 0.30452139 0.0830088
## Est.Dia.in.Miles.min. -0.5720441 0.2360517 -0.38548112 -0.2275099
## Perihelion.Distance 0.1896775 -0.7404103 -0.51218354 -0.3917361
## Inclination -0.4567974 -0.4107130 0.70055120 -0.3613244
## PC5
## Minimum.Orbit.Intersection 0.117727875
## Absolute.Magnitude 0.753725355
## Est.Dia.in.Miles.min. 0.645514276
## Perihelion.Distance -0.005084275
## Inclination 0.036382780
We split the dataset into training and test sets. Before splitting we also shuffle the observations so that the classification models cannot learn any sequence that might exist in the original order.
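A minimal sketch of what such a shuffle-then-split helper looks like (the real train_test_split is defined in the common library; this is only an illustration):
# Sketch only: shuffled train/test split.
split_train_test <- function(df, train_size = 0.8, shuffle = TRUE) {
  if (shuffle) df <- df[sample(nrow(df)), ]  # randomize the row order
  n_train <- floor(train_size * nrow(df))
  list(train = df[seq_len(n_train), ], test = df[-seq_len(n_train), ])
}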
c(raw_train_set, raw_test_set) %<-% train_test_split(
ds_step_6,
train_size = .8,
shuffle = TRUE
)
## [1] "Train Set size: 2913"
## [1] "Test set size: 729"
# balanced_train_set <- smote_balance(raw_train_set, raw_train_set$Hazardous)
#rm(raw_train_set)
balanced_train_set <- raw_train_set

We subtract the mean and divide by the standard deviation:
train_set <- balanced_train_set %>%
mutate_at(vars(-Hazardous), ~(scale(.) %>% as.vector))
test_set <- raw_test_set %>%
mutate_at(vars(-Hazardous), ~(scale(.) %>% as.vector))
rm(balanced_train_set)
rm(raw_test_set)

Before anything else we must choose the value of beta for the F-beta score metric. In this problem two factors must be weighed when deciding on the best value of beta:
By its nature the dataset is imbalanced: there are few hazardous asteroids and many that are not.
We must also weigh the cost of a false positive against that of a false negative. In this analysis a false positive carries the financial cost of a manual check: if we classify an asteroid as hazardous when it is not, we pay for studying it, but that cost is negligible compared with the cost of a false negative. A false negative is a hazardous asteroid classified as non-hazardous; in that case there is a potential cost in human lives and injuries.
One example is the 2013 Chelyabinsk (Russia) incident, where thousands of people were injured by an undetected asteroid.
For these reasons we need to prioritize recall over precision. Put simply, it is preferable to predict some negatives as positives than to predict the positives perfectly at the cost of false negatives.
Finally, to prioritize recall we choose beta > 1.
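For reference, the F-beta score weighs recall beta^2 times as heavily as precision. A sketch computing it from confusion-matrix counts (the fbeta_score helper used below comes from the common library):
# Sketch only: F-beta from confusion-matrix counts.
fbeta <- function(tp, fp, fn, beta = 2) {
  precision <- tp / (tp + fp)
  recall    <- tp / (tp + fn)
  (1 + beta^2) * precision * recall / (beta^2 * precision + recall)
}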
beta <- 2
lda_model <- lda(formula(Hazardous~.), train_set)
lda_threshold <- search_min_fn_threshold(
lda_model,
feat(test_set),
target(test_set),
xda_predict
)
lda_test_pred <- xda_predict(lda_model, feat(test_set), lda_threshold)
lda_train_pred <- xda_predict(lda_model, feat(train_set), lda_threshold)
plot_cm(lda_test_pred, target(test_set))
plot_roc(lda_test_pred, target(test_set))
fbeta_score(lda_test_pred, target(test_set), beta=2)
## [1] " F2Score: 0.618609406952965"
lda_model$scaling
## LD1
## Minimum.Orbit.Intersection -1.24621804
## Absolute.Magnitude -1.62915262
## Est.Dia.in.Miles.min. -0.41684943
## Perihelion.Distance 0.06871418
## Inclination -0.01479703
qda_model <- qda(formula(Hazardous~.), train_set)
qda_threshold <- search_min_fn_threshold(
qda_model,
feat(test_set),
target(test_set),
xda_predict
)
qda_test_pred <- xda_predict(qda_model, feat(test_set), qda_threshold)
qda_train_pred <- xda_predict(qda_model, feat(train_set), qda_threshold)
plot_cm(qda_test_pred, target(test_set))
plot_roc(qda_test_pred, target(test_set))
fbeta_score(qda_test_pred, target(test_set), beta=2)
## [1] " F2Score: 0.905688622754491"
rda_model <- rda(formula(Hazardous~.), train_set)
rda_threshold <- search_min_fn_threshold(
rda_model,
feat(test_set),
target(test_set),
xda_predict
)
rda_test_pred <- xda_predict(rda_model, feat(test_set), rda_threshold)
rda_train_pred <- xda_predict(rda_model, feat(train_set), rda_threshold)
plot_cm(rda_test_pred, target(test_set))
plot_roc(rda_test_pred, target(test_set))
fbeta_score(rda_test_pred, target(test_set), beta=2)
## [1] " F2Score: 0.905688622754491"
lr_model <- glm(formula(Hazardous~.), train_set, family=binomial)
lr_threshold <- search_min_fn_threshold(
lr_model,
feat(test_set),
target(test_set),
model_predict
)
lr_test_pred <- model_predict(lr_model, test_set, lr_threshold)
lr_train_pred <- model_predict(lr_model, train_set, lr_threshold)
plot_cm(lr_test_pred, target(test_set))
fp(lr_test_pred, target(test_set))
## [1] 11
fn(lr_test_pred, target(test_set))
## [1] 12
plot_roc(lr_test_pred, target(test_set))
fbeta_score(lr_test_pred, target(test_set), beta=2)
## [1] " F2Score: 0.902317880794702"
svm_model <- svm(formula(Hazardous~.), train_set, kernel = 'radial')
svm_threshold <- search_min_fn_threshold(
svm_model,
feat(test_set),
target(test_set),
model_predict
)
svm_test_pred <- model_predict(svm_model, test_set, svm_threshold)
svm_train_pred <- model_predict(svm_model, train_set, svm_threshold)
plot_cm(svm_test_pred, target(test_set))
plot_roc(svm_test_pred, target(test_set))
fbeta_score(svm_test_pred, target(test_set), beta=2)
## [1] " F2Score: 0.682074408117249"
xgboost_model <- xgboost(
as.matrix(feat(train_set)),
target(train_set),
eta = 0.2,
max_depth = 20,
nround = 15000,
eval_metric = 'logloss',
objective = "binary:logistic",
nthread = 24,
verbose = 0
)
xgb_threshold <- search_min_fn_threshold(
xgboost_model,
feat(test_set),
target(test_set),
xgboost_predict
)
xgb_test_pred <- xgboost_predict(xgboost_model, feat(test_set), xgb_threshold)
xgb_train_pred <- xgboost_predict(xgboost_model, feat(train_set), xgb_threshold)
plot_cm(xgb_test_pred, target(test_set))
plot_roc(xgb_test_pred, target(test_set))
fbeta_score(xgb_test_pred, target(test_set), beta=2)
## [1] " F2Score: 0.975409836065574"
data.frame(
Test.F2Score.Percent = c(
fbeta_score(lda_test_pred, target(test_set), beta=2, show=F),
fbeta_score(qda_test_pred, target(test_set), beta=2, show=F),
fbeta_score(rda_test_pred, target(test_set), beta=2, show=F),
fbeta_score(lr_test_pred, target(test_set), beta=2, show=F),
fbeta_score(svm_test_pred, target(test_set), beta=2, show=F),
fbeta_score(xgb_test_pred, target(test_set), beta=2, show=F)
),
Train.F2Score.Percent = c(
fbeta_score(lda_train_pred, target(train_set), beta=2, show=F),
fbeta_score(qda_train_pred, target(train_set), beta=2, show=F),
fbeta_score(rda_train_pred, target(train_set), beta=2, show=F),
fbeta_score(lr_train_pred, target(train_set), beta=2, show=F),
fbeta_score(svm_train_pred, target(train_set), beta=2, show=F),
fbeta_score(xgb_train_pred, target(train_set), beta=2, show=F)
),
False.Negative = c(
fn(lda_test_pred, target(test_set)),
fn(qda_test_pred, target(test_set)),
fn(rda_test_pred, target(test_set)),
fn(lr_test_pred, target(test_set)),
fn(svm_test_pred, target(test_set)),
fn(xgb_test_pred, target(test_set))
),
False.positive = c(
fp(lda_test_pred, target(test_set)),
fp(qda_test_pred, target(test_set)),
fp(rda_test_pred, target(test_set)),
fp(lr_test_pred, target(test_set)),
fp(svm_test_pred, target(test_set)),
fp(xgb_test_pred, target(test_set))
),
Model = c(
'LDA',
'QDA',
'RDA',
'Logistic Regression',
'SVM',
'XGBoost'
),
Umbral = c(
lda_threshold,
qda_threshold,
rda_threshold,
lr_threshold,
svm_threshold,
xgb_threshold
)
) %>%
mutate(
Test.F2Score.Percent = round(Test.F2Score.Percent * 100, 3),
Train.F2Score.Percent = round(Train.F2Score.Percent * 100, 3)
) %>%
dplyr::select(
False.Negative,
False.positive,
Test.F2Score.Percent,
Train.F2Score.Percent,
Model,
Umbral
) %>%
arrange(False.Negative, desc(Test.F2Score.Percent))
lda_threshold <- search_max_aur_threshold(
lda_model,
feat(test_set),
target(test_set),
xda_predict
)
lda_test_pred <- xda_predict(lda_model, feat(test_set), lda_threshold)
lda_train_pred <- xda_predict(lda_model, feat(train_set), lda_threshold)
qda_threshold <- search_max_aur_threshold(
qda_model,
feat(test_set),
target(test_set),
xda_predict
)
qda_test_pred <- xda_predict(qda_model, feat(test_set), qda_threshold)
qda_train_pred <- xda_predict(qda_model, feat(train_set), qda_threshold)
rda_threshold <- search_max_aur_threshold(
rda_model,
feat(test_set),
target(test_set),
xda_predict
)
rda_test_pred <- xda_predict(rda_model, feat(test_set), rda_threshold)
rda_train_pred <- xda_predict(rda_model, feat(train_set), rda_threshold)
lr_threshold <- search_max_aur_threshold(
lr_model,
feat(test_set),
target(test_set),
model_predict
)
lr_test_pred <- model_predict(lr_model, test_set, lr_threshold)
lr_train_pred <- model_predict(lr_model, train_set, lr_threshold)
svm_threshold <- search_max_aur_threshold(
svm_model,
feat(test_set),
target(test_set),
model_predict
)
svm_test_pred <- model_predict(svm_model, test_set, svm_threshold)
svm_train_pred <- model_predict(svm_model, train_set, svm_threshold)
xgb_threshold <- search_max_aur_threshold(
xgboost_model,
feat(test_set),
target(test_set),
xgboost_predict
)
xgb_test_pred <- xgboost_predict(xgboost_model, feat(test_set), xgb_threshold)
xgb_train_pred <- xgboost_predict(xgboost_model, feat(train_set), xgb_threshold)
data.frame(
Test.F2Score.Percent = c(
fbeta_score(lda_test_pred, target(test_set), beta=2, show=F),
fbeta_score(qda_test_pred, target(test_set), beta=2, show=F),
fbeta_score(rda_test_pred, target(test_set), beta=2, show=F),
fbeta_score(lr_test_pred, target(test_set), beta=2, show=F),
fbeta_score(svm_test_pred, target(test_set), beta=2, show=F),
fbeta_score(xgb_test_pred, target(test_set), beta=2, show=F)
),
Train.F2Score.Percent = c(
fbeta_score(lda_train_pred, target(train_set), beta=2, show=F),
fbeta_score(qda_train_pred, target(train_set), beta=2, show=F),
fbeta_score(rda_train_pred, target(train_set), beta=2, show=F),
fbeta_score(lr_train_pred, target(train_set), beta=2, show=F),
fbeta_score(svm_train_pred, target(train_set), beta=2, show=F),
fbeta_score(xgb_train_pred, target(train_set), beta=2, show=F)
),
False.Negative = c(
fn(lda_test_pred, target(test_set)),
fn(qda_test_pred, target(test_set)),
fn(rda_test_pred, target(test_set)),
fn(lr_test_pred, target(test_set)),
fn(svm_test_pred, target(test_set)),
fn(xgb_test_pred, target(test_set))
),
False.positive = c(
fp(lda_test_pred, target(test_set)),
fp(qda_test_pred, target(test_set)),
fp(rda_test_pred, target(test_set)),
fp(lr_test_pred, target(test_set)),
fp(svm_test_pred, target(test_set)),
fp(xgb_test_pred, target(test_set))
),
Model = c(
'LDA',
'QDA',
'RDA',
'Logistic Regression',
'SVM',
'XGBoost'
),
Umbral = c(
lda_threshold,
qda_threshold,
rda_threshold,
lr_threshold,
svm_threshold,
xgb_threshold
)
) %>%
mutate(
Test.F2Score.Percent = round(Test.F2Score.Percent * 100, 3),
Train.F2Score.Percent = round(Train.F2Score.Percent * 100, 3)
) %>%
dplyr::select(
False.Negative,
False.positive,
Test.F2Score.Percent,
Train.F2Score.Percent,
Model,
Umbral
) %>%
arrange(False.Negative, desc(Test.F2Score.Percent))
lda_threshold <- 0.5
lda_test_pred <- xda_predict(lda_model, feat(test_set), lda_threshold)
lda_train_pred <- xda_predict(lda_model, feat(train_set), lda_threshold)
qda_threshold <- 0.5
qda_test_pred <- xda_predict(qda_model, feat(test_set), qda_threshold)
qda_train_pred <- xda_predict(qda_model, feat(train_set), qda_threshold)
rda_threshold <- 0.5
rda_test_pred <- xda_predict(rda_model, feat(test_set), rda_threshold)
rda_train_pred <- xda_predict(rda_model, feat(train_set), rda_threshold)
lr_threshold <- 0.5
lr_test_pred <- model_predict(lr_model, test_set, lr_threshold)
lr_train_pred <- model_predict(lr_model, train_set, lr_threshold)
svm_threshold <- 0.5
svm_test_pred <- model_predict(svm_model, test_set, svm_threshold)
svm_train_pred <- model_predict(svm_model, train_set, svm_threshold)
xgb_threshold <- 0.5
xgb_test_pred <- xgboost_predict(xgboost_model, feat(test_set), xgb_threshold)
xgb_train_pred <- xgboost_predict(xgboost_model, feat(train_set), xgb_threshold)
data.frame(
Test.F2Score.Percent = c(
fbeta_score(lda_test_pred, target(test_set), beta=2, show=F),
fbeta_score(qda_test_pred, target(test_set), beta=2, show=F),
fbeta_score(rda_test_pred, target(test_set), beta=2, show=F),
fbeta_score(lr_test_pred, target(test_set), beta=2, show=F),
fbeta_score(svm_test_pred, target(test_set), beta=2, show=F),
fbeta_score(xgb_test_pred, target(test_set), beta=2, show=F)
),
Train.F2Score.Percent = c(
fbeta_score(lda_train_pred, target(train_set), beta=2, show=F),
fbeta_score(qda_train_pred, target(train_set), beta=2, show=F),
fbeta_score(rda_train_pred, target(train_set), beta=2, show=F),
fbeta_score(lr_train_pred, target(train_set), beta=2, show=F),
fbeta_score(svm_train_pred, target(train_set), beta=2, show=F),
fbeta_score(xgb_train_pred, target(train_set), beta=2, show=F)
),
False.Negative = c(
fn(lda_test_pred, target(test_set)),
fn(qda_test_pred, target(test_set)),
fn(rda_test_pred, target(test_set)),
fn(lr_test_pred, target(test_set)),
fn(svm_test_pred, target(test_set)),
fn(xgb_test_pred, target(test_set))
),
False.positive = c(
fp(lda_test_pred, target(test_set)),
fp(qda_test_pred, target(test_set)),
fp(rda_test_pred, target(test_set)),
fp(lr_test_pred, target(test_set)),
fp(svm_test_pred, target(test_set)),
fp(xgb_test_pred, target(test_set))
),
Model = c(
'LDA',
'QDA',
'RDA',
'Logistic Regression',
'SVM',
'XGBoost'
),
Umbral = c(
lda_threshold,
qda_threshold,
rda_threshold,
lr_threshold,
svm_threshold,
xgb_threshold
)
) %>%
mutate(
Test.F2Score.Percent = round(Test.F2Score.Percent * 100, 3),
Train.F2Score.Percent = round(Train.F2Score.Percent * 100, 3)
) %>%
dplyr::select(
False.Negative,
False.positive,
Test.F2Score.Percent,
Train.F2Score.Percent,
Model,
Umbral
) %>%
arrange(False.Negative, desc(Test.F2Score.Percent))
xgb_threshold <- search_max_aur_threshold(
xgboost_model,
feat(test_set),
target(test_set),
xgboost_predict
)
xgb_test_pred <- xgboost_predict(xgboost_model, feat(test_set), xgb_threshold)
biplot_fn_fp(test_set, xgb_test_pred)
## Standard deviations (1, .., p=5):
## [1] 1.5419684 1.0397571 0.8210701 0.7286113 0.3210216
##
## Rotation (n x k) = (5 x 5):
## PC1 PC2 PC3 PC4
## Minimum.Orbit.Intersection -0.3755154 -0.543082159 0.04118567 0.7464648
## Absolute.Magnitude 0.5910539 -0.009507185 -0.32662054 0.2414866
## Est.Dia.in.Miles.min. -0.5300961 0.086012586 0.34381894 -0.2915685
## Perihelion.Distance 0.2427178 -0.827036974 0.13493056 -0.4885232
## Inclination -0.4119825 -0.116546389 -0.86902534 -0.2465916
## PC5
## Minimum.Orbit.Intersection -0.07172153
## Absolute.Magnitude -0.69682716
## Est.Dia.in.Miles.min. -0.71300513
## Perihelion.Distance -0.01538517
## Inclination -0.02597903
qda_threshold <- search_max_aur_threshold(
qda_model,
feat(test_set),
target(test_set),
xda_predict
)
qda_test_pred <- xda_predict(qda_model, feat(test_set), qda_threshold)
biplot_fn_fp(test_set, qda_test_pred)
## Standard deviations (1, .., p=5):
## [1] 1.5419684 1.0397571 0.8210701 0.7286113 0.3210216
##
## Rotation (n x k) = (5 x 5):
## PC1 PC2 PC3 PC4
## Minimum.Orbit.Intersection -0.3755154 -0.543082159 0.04118567 0.7464648
## Absolute.Magnitude 0.5910539 -0.009507185 -0.32662054 0.2414866
## Est.Dia.in.Miles.min. -0.5300961 0.086012586 0.34381894 -0.2915685
## Perihelion.Distance 0.2427178 -0.827036974 0.13493056 -0.4885232
## Inclination -0.4119825 -0.116546389 -0.86902534 -0.2465916
## PC5
## Minimum.Orbit.Intersection -0.07172153
## Absolute.Magnitude -0.69682716
## Est.Dia.in.Miles.min. -0.71300513
## Perihelion.Distance -0.01538517
## Inclination -0.02597903
svm_threshold <- search_max_aur_threshold(
svm_model,
feat(test_set),
target(test_set),
model_predict
)
svm_test_pred <- model_predict(svm_model, test_set, svm_threshold)
biplot_fn_fp(test_set, svm_test_pred)
## Standard deviations (1, .., p=5):
## [1] 1.5419684 1.0397571 0.8210701 0.7286113 0.3210216
##
## Rotation (n x k) = (5 x 5):
## PC1 PC2 PC3 PC4
## Minimum.Orbit.Intersection -0.3755154 -0.543082159 0.04118567 0.7464648
## Absolute.Magnitude 0.5910539 -0.009507185 -0.32662054 0.2414866
## Est.Dia.in.Miles.min. -0.5300961 0.086012586 0.34381894 -0.2915685
## Perihelion.Distance 0.2427178 -0.827036974 0.13493056 -0.4885232
## Inclination -0.4119825 -0.116546389 -0.86902534 -0.2465916
## PC5
## Minimum.Orbit.Intersection -0.07172153
## Absolute.Magnitude -0.69682716
## Est.Dia.in.Miles.min. -0.71300513
## Perihelion.Distance -0.01538517
## Inclination -0.02597903
Standard (z-score) scaling with estimators of the normal distribution:
ds_normal_scale <- scale(feat(ds_step_6))

A different scaling from the typical normal one (min-max):
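A minimal sketch of what a per-column rescaling to [0, 1] presumably looks like (max_min_scale itself is defined in the common library):
# Sketch only: min-max scaling of each column to [0, 1].
min_max_scale_sketch <- function(features) {
  as.data.frame(lapply(features, function(x) (x - min(x)) / (max(x) - min(x))))
}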
ds_max_min_scale <- max_min_scale(feat(ds_step_6))
clustering_metrics_plot(ds_normal_scale)
clustering_metrics_plot(ds_max_min_scale)

For each variable scaling method, let us see which value of K maximizes the silhouette:
data.frame(
Scale = c('Normal', 'Min.Max'),
Max.Silhouette.K = c(
clustering_max_sil_k(ds_normal_scale),
clustering_max_sil_k(ds_max_min_scale)
)
) %>% arrange(desc(Max.Silhouette.K))
max_silhouette_k <- clustering_max_sil_k(ds_max_min_scale)
km_predictions <- kmeans_predict(
ds_step_6,
scale_fn = max_min_scale,
features_fn = feat,
k = max_silhouette_k
)
clusteging_pca_plot(km_predictions)
## Standard deviations (1, .., p=5):
## [1] 1.6124640 1.0407353 0.9143094 0.6536660 0.3507857
##
## Rotation (n x k) = (5 x 5):
## PC1 PC2 PC3 PC4
## Minimum.Orbit.Intersection -0.3302822 -0.4627640 -0.07461278 0.8107627
## Absolute.Magnitude 0.5648344 -0.1150499 0.30452139 0.0830088
## Est.Dia.in.Miles.min. -0.5720441 0.2360517 -0.38548112 -0.2275099
## Perihelion.Distance 0.1896775 -0.7404103 -0.51218354 -0.3917361
## Inclination -0.4567974 -0.4107130 0.70055120 -0.3613244
## PC5
## Minimum.Orbit.Intersection 0.117727875
## Absolute.Magnitude 0.753725355
## Est.Dia.in.Miles.min. 0.645514276
## Perihelion.Distance -0.005084275
## Inclination 0.036382780
plot_groups_by_hazardous(km_predictions)
km_data <- features_mean_by_group(km_predictions)
plot_minimum_orbit_intersection_by_group(km_data)
plot_absolute_magnitude_by_group(km_data)
plot_est_dia_in_miles_min_by_group(km_data)
plot_Perihelion.Distance_by_group(km_data)
plot_inclination_by_group(km_data)
km_predictions2 <- data.frame(km_predictions)
km_predictions2$cluster = km_predictions$Hazardous
clusteging_pca_plot(km_predictions2, labels = c('No', 'Yes'), colours = c("red", "blue"), alpha = 0.2)
## Standard deviations (1, .., p=5):
## [1] 1.6124640 1.0407353 0.9143094 0.6536660 0.3507857
##
## Rotation (n x k) = (5 x 5):
## PC1 PC2 PC3 PC4
## Minimum.Orbit.Intersection -0.3302822 -0.4627640 -0.07461278 0.8107627
## Absolute.Magnitude 0.5648344 -0.1150499 0.30452139 0.0830088
## Est.Dia.in.Miles.min. -0.5720441 0.2360517 -0.38548112 -0.2275099
## Perihelion.Distance 0.1896775 -0.7404103 -0.51218354 -0.3917361
## Inclination -0.4567974 -0.4107130 0.70055120 -0.3613244
## PC5
## Minimum.Orbit.Intersection 0.117727875
## Absolute.Magnitude 0.753725355
## Est.Dia.in.Miles.min. 0.645514276
## Perihelion.Distance -0.005084275
## Inclination 0.036382780
We take a small sample (1.5% of the observations) so that the dendrogram remains readable:
c(ds_hc, any) %<-% train_test_split(
ds_step_6,
train_size = .015,
shuffle = TRUE
)
## [1] "Train Set size: 54"
## [1] "Test set size: 3588"
rm(any)

Euclidean distance matrix:
distances_matrix <- dist(x = feat(ds_hc), method = "euclidean")

Dendrograms (according to the hierarchical clustering linkage applied):
hc_complete_model <- hclust(d = distances_matrix, method = "complete")
hc_average_model <- hclust(d = distances_matrix, method = "average")
hc_single_model <- hclust(d = distances_matrix, method = "single")
hc_ward_model <- hclust(d = distances_matrix, method = "ward.D2")

Computation of the cophenetic correlation coefficient, which measures how faithfully each dendrogram preserves the original pairwise distances:
data.frame(
Coeficiente = c(
cor(x = distances_matrix, cophenetic(hc_complete_model)),
cor(x = distances_matrix, cophenetic(hc_average_model)),
cor(x = distances_matrix, cophenetic(hc_single_model)),
cor(x = distances_matrix, cophenetic(hc_ward_model))
),
Modelo = c('Complete', 'Average', 'Single', 'Ward')
) %>% arrange(desc(Coeficiente))

Dendrogram plot for the Average model's predictions:
plot_dendrogram(hc_average_model, max_silhouette_k)
hc_predictions <- hc_predict(hc_average_model, ds_hc, max_silhouette_k)
clusteging_pca_plot(hc_predictions, alpha = 1)
## Standard deviations (1, .., p=5):
## [1] 1.3156236 0.9121555 0.6609760 0.5896885 0.1952982
##
## Rotation (n x k) = (5 x 5):
## PC1 PC2 PC3 PC4
## Minimum.Orbit.Intersection -0.4232060 -0.13676126 -0.89177030 -0.07722881
## Absolute.Magnitude 0.6237047 0.10318036 -0.33530575 0.47737960
## Est.Dia.in.Miles.min. -0.3931483 0.07188953 0.22717982 -0.25194735
## Perihelion.Distance 0.3981020 -0.76741491 -0.02496605 -0.48967982
## Inclination -0.3447294 -0.61364194 0.20020443 0.68035332
## PC5
## Minimum.Orbit.Intersection 0.03121703
## Absolute.Magnitude 0.50992647
## Est.Dia.in.Miles.min. 0.85157393
## Perihelion.Distance 0.11036091
## Inclination 0.04053138
hc_data <- features_mean_by_group(hc_predictions)
plot_groups_by_hazardous(hc_predictions)
plot_minimum_orbit_intersection_by_group(hc_data)
plot_absolute_magnitude_by_group(hc_data)
plot_est_dia_in_miles_min_by_group(hc_data)
plot_Perihelion.Distance_by_group(hc_data)
plot_inclination_by_group(hc_data)

By Adrian Marino
adrianmarino@gmail.com