Aplicación de CRISP-DM para los datos de imports-85.data

Para mayores referencias, consulta en:

  1. Fuente de datos: abrir.
  2. Blog de referencia: abrir.
  3. GitHub: abrir.

1. DATA UNDERSTANDING

Lectura de datos:
# Read the raw file. It has no header row, so columns arrive as V1..V26.
# stringsAsFactors = FALSE keeps text columns as character on R < 4.0 as well,
# so the later as.numeric() calls convert the values and not factor codes.
data <- read.csv("imports-85.data", sep = ",", header = FALSE,
                 stringsAsFactors = FALSE)
head(data, 10)
##    V1  V2          V3  V4    V5   V6          V7  V8    V9   V10   V11  V12
## 1   3   ? alfa-romero gas   std  two convertible rwd front  88.6 168.8 64.1
## 2   3   ? alfa-romero gas   std  two convertible rwd front  88.6 168.8 64.1
## 3   1   ? alfa-romero gas   std  two   hatchback rwd front  94.5 171.2 65.5
## 4   2 164        audi gas   std four       sedan fwd front  99.8 176.6 66.2
## 5   2 164        audi gas   std four       sedan 4wd front  99.4 176.6 66.4
## 6   2   ?        audi gas   std  two       sedan fwd front  99.8 177.3 66.3
## 7   1 158        audi gas   std four       sedan fwd front 105.8 192.7 71.4
## 8   1   ?        audi gas   std four       wagon fwd front 105.8 192.7 71.4
## 9   1 158        audi gas turbo four       sedan fwd front 105.8 192.7 71.4
## 10  0   ?        audi gas turbo  two   hatchback 4wd front  99.5 178.2 67.9
##     V13  V14  V15  V16 V17  V18  V19  V20  V21 V22  V23 V24 V25   V26
## 1  48.8 2548 dohc four 130 mpfi 3.47 2.68  9.0 111 5000  21  27 13495
## 2  48.8 2548 dohc four 130 mpfi 3.47 2.68  9.0 111 5000  21  27 16500
## 3  52.4 2823 ohcv  six 152 mpfi 2.68 3.47  9.0 154 5000  19  26 16500
## 4  54.3 2337  ohc four 109 mpfi 3.19 3.40 10.0 102 5500  24  30 13950
## 5  54.3 2824  ohc five 136 mpfi 3.19 3.40  8.0 115 5500  18  22 17450
## 6  53.1 2507  ohc five 136 mpfi 3.19 3.40  8.5 110 5500  19  25 15250
## 7  55.7 2844  ohc five 136 mpfi 3.19 3.40  8.5 110 5500  19  25 17710
## 8  55.7 2954  ohc five 136 mpfi 3.19 3.40  8.5 110 5500  19  25 18920
## 9  55.9 3086  ohc five 131 mpfi 3.13 3.40  8.3 140 5500  17  20 23875
## 10 52.0 3053  ohc five 131 mpfi 3.13 3.40  7.0 160 5500  16  22     ?
Asignación de nombres de columnas de acuerdo al diccionario:
# Attach the attribute names from the data dictionary, in file column order.
names(data) <- c(
  "symboling", "normalized-losses", "make", "fuel-type", "aspiration",
  "num-of-doors", "body-style", "drive-wheels", "engine-location",
  "wheel-base", "length", "width", "height", "curb-weight", "engine-type",
  "num-of-cylinders", "engine-size", "fuel-system", "bore", "stroke",
  "compression-rate", "horsepower", "peak-rpm", "city-mpg", "highway-mpg",
  "price"
)
Declaración de variables numéricas:
# Columns kept for the analysis (the continuous attributes plus the target).
continuous_values_cols <- c(
  "normalized-losses", "wheel-base", "length", "width", "height",
  "curb-weight", "bore", "stroke", "compression-rate", "horsepower",
  "peak-rpm", "city-mpg", "highway-mpg", "price"
)
Filtrado por variables numéricas:
# Keep only the selected columns, in the order listed in the vector.
data <- data[continuous_values_cols]
head(data, n = 10)
##    normalized-losses wheel-base length width height curb-weight bore stroke
## 1                  ?       88.6  168.8  64.1   48.8        2548 3.47   2.68
## 2                  ?       88.6  168.8  64.1   48.8        2548 3.47   2.68
## 3                  ?       94.5  171.2  65.5   52.4        2823 2.68   3.47
## 4                164       99.8  176.6  66.2   54.3        2337 3.19   3.40
## 5                164       99.4  176.6  66.4   54.3        2824 3.19   3.40
## 6                  ?       99.8  177.3  66.3   53.1        2507 3.19   3.40
## 7                158      105.8  192.7  71.4   55.7        2844 3.19   3.40
## 8                  ?      105.8  192.7  71.4   55.7        2954 3.19   3.40
## 9                158      105.8  192.7  71.4   55.9        3086 3.13   3.40
## 10                 ?       99.5  178.2  67.9   52.0        3053 3.13   3.40
##    compression-rate horsepower peak-rpm city-mpg highway-mpg price
## 1               9.0        111     5000       21          27 13495
## 2               9.0        111     5000       21          27 16500
## 3               9.0        154     5000       19          26 16500
## 4              10.0        102     5500       24          30 13950
## 5               8.0        115     5500       18          22 17450
## 6               8.5        110     5500       19          25 15250
## 7               8.5        110     5500       19          25 17710
## 8               8.5        110     5500       19          25 18920
## 9               8.3        140     5500       17          20 23875
## 10              7.0        160     5500       16          22     ?

2. DATA PREPARATION

Cambio de ‘?’ a ‘NA’:
# Flag every cell holding the '?' placeholder as missing.
is.na(data) <- data == "?"
Determinación del número de ‘NA’ por columna:
# Number of missing values in each column.
vapply(data, function(column) sum(is.na(column)), numeric(1))
## normalized-losses        wheel-base            length             width 
##                41                 0                 0                 0 
##            height       curb-weight              bore            stroke 
##                 0                 0                 4                 4 
##  compression-rate        horsepower          peak-rpm          city-mpg 
##                 0                 2                 2                 0 
##       highway-mpg             price 
##                 0                 4
Ubicación de ‘NA’:
# is.na() on a data frame yields a logical matrix, so which() reports linear
# (column-major) cell positions rather than row numbers.
which(is.na(data))
##  [1]    1    2    3    6    8   10   15   16   17   18   44   45   46   47   49
## [16]   50   64   67   72   74   75   76   83   84   85  110  111  114  115  125
## [31]  127  128  129  130  131  132  182  190  192  193  194 1286 1287 1288 1289
## [46] 1491 1492 1493 1494 1976 1977 2181 2182 2675 2710 2711 2795
Conteo de ‘NA’:
# Total number of missing cells: add up the per-column counts.
sum(colSums(is.na(data)))
## [1] 57
Remoción de ‘NA’ presentes en la columna ‘price’:
library(tidyr)
# Discard rows whose target 'price' is missing.
# Equivalent direct call: drop_na(data, price)
data <- data %>% drop_na(price)
Determinación del número de ‘NA’ por columnas:
# Recount the missing values per column after dropping rows without a price.
vapply(data, function(column) sum(is.na(column)), numeric(1))
## normalized-losses        wheel-base            length             width 
##                37                 0                 0                 0 
##            height       curb-weight              bore            stroke 
##                 0                 0                 4                 4 
##  compression-rate        horsepower          peak-rpm          city-mpg 
##                 0                 2                 2                 0 
##       highway-mpg             price 
##                 0                 0
Conversión a variables numéricas:
# Inspect column types: the columns that contained '?' are still character.
summary(data)
##  normalized-losses    wheel-base        length          width      
##  Length:201         Min.   : 86.6   Min.   :141.1   Min.   :60.30  
##  Class :character   1st Qu.: 94.5   1st Qu.:166.8   1st Qu.:64.10  
##  Mode  :character   Median : 97.0   Median :173.2   Median :65.50  
##                     Mean   : 98.8   Mean   :174.2   Mean   :65.89  
##                     3rd Qu.:102.4   3rd Qu.:183.5   3rd Qu.:66.60  
##                     Max.   :120.9   Max.   :208.1   Max.   :72.00  
##      height       curb-weight       bore              stroke         
##  Min.   :47.80   Min.   :1488   Length:201         Length:201        
##  1st Qu.:52.00   1st Qu.:2169   Class :character   Class :character  
##  Median :54.10   Median :2414   Mode  :character   Mode  :character  
##  Mean   :53.77   Mean   :2556                                        
##  3rd Qu.:55.50   3rd Qu.:2926                                        
##  Max.   :59.80   Max.   :4066                                        
##  compression-rate  horsepower          peak-rpm            city-mpg    
##  Min.   : 7.00    Length:201         Length:201         Min.   :13.00  
##  1st Qu.: 8.60    Class :character   Class :character   1st Qu.:19.00  
##  Median : 9.00    Mode  :character   Mode  :character   Median :24.00  
##  Mean   :10.16                                          Mean   :25.18  
##  3rd Qu.: 9.40                                          3rd Qu.:30.00  
##  Max.   :23.00                                          Max.   :49.00  
##   highway-mpg       price          
##  Min.   :16.00   Length:201        
##  1st Qu.:25.00   Class :character  
##  Median :30.00   Mode  :character  
##  Mean   :30.69                     
##  3rd Qu.:34.00                     
##  Max.   :54.00
# Columns that were read as text because of the '?' placeholder.
to_numeric <- c("normalized-losses", "bore", "stroke", "horsepower",
                "peak-rpm", "price")
# Convert them in one pass; existing NA values stay NA.
data[to_numeric] <- lapply(data[to_numeric], as.numeric)
summary(data)
##  normalized-losses   wheel-base        length          width      
##  Min.   : 65       Min.   : 86.6   Min.   :141.1   Min.   :60.30  
##  1st Qu.: 94       1st Qu.: 94.5   1st Qu.:166.8   1st Qu.:64.10  
##  Median :115       Median : 97.0   Median :173.2   Median :65.50  
##  Mean   :122       Mean   : 98.8   Mean   :174.2   Mean   :65.89  
##  3rd Qu.:150       3rd Qu.:102.4   3rd Qu.:183.5   3rd Qu.:66.60  
##  Max.   :256       Max.   :120.9   Max.   :208.1   Max.   :72.00  
##  NA's   :37                                                       
##      height       curb-weight        bore           stroke     
##  Min.   :47.80   Min.   :1488   Min.   :2.540   Min.   :2.070  
##  1st Qu.:52.00   1st Qu.:2169   1st Qu.:3.150   1st Qu.:3.110  
##  Median :54.10   Median :2414   Median :3.310   Median :3.290  
##  Mean   :53.77   Mean   :2556   Mean   :3.331   Mean   :3.257  
##  3rd Qu.:55.50   3rd Qu.:2926   3rd Qu.:3.590   3rd Qu.:3.410  
##  Max.   :59.80   Max.   :4066   Max.   :3.940   Max.   :4.170  
##                                 NA's   :4       NA's   :4      
##  compression-rate   horsepower       peak-rpm       city-mpg    
##  Min.   : 7.00    Min.   : 48.0   Min.   :4150   Min.   :13.00  
##  1st Qu.: 8.60    1st Qu.: 70.0   1st Qu.:4800   1st Qu.:19.00  
##  Median : 9.00    Median : 95.0   Median :5200   Median :24.00  
##  Mean   :10.16    Mean   :103.4   Mean   :5118   Mean   :25.18  
##  3rd Qu.: 9.40    3rd Qu.:116.0   3rd Qu.:5500   3rd Qu.:30.00  
##  Max.   :23.00    Max.   :262.0   Max.   :6600   Max.   :49.00  
##                   NA's   :2       NA's   :2                     
##   highway-mpg        price      
##  Min.   :16.00   Min.   : 5118  
##  1st Qu.:25.00   1st Qu.: 7775  
##  Median :30.00   Median :10295  
##  Mean   :30.69   Mean   :13207  
##  3rd Qu.:34.00   3rd Qu.:16500  
##  Max.   :54.00   Max.   :45400  
## 
Reemplazo de ‘NA’ en columnas por sus medias correspondientes:
library(dplyr)
# Fill each numeric column's missing values with that column's mean.
# Base replace() is used instead of tidyr::replace_na(): recent tidyr casts the
# replacement to the column type, so a fractional mean is rejected for integer
# columns. across(where(...)) replaces the superseded mutate_if().
data <- data %>%
  mutate(across(where(is.numeric),
                ~ replace(.x, is.na(.x), mean(.x, na.rm = TRUE))))
colSums(is.na(data))
## normalized-losses        wheel-base            length             width 
##                 0                 0                 0                 0 
##            height       curb-weight              bore            stroke 
##                 0                 0                 0                 0 
##  compression-rate        horsepower          peak-rpm          city-mpg 
##                 0                 0                 0                 0 
##       highway-mpg             price 
##                 0                 0
Estandarización de variables en [0,1]:
# Keep the raw price: it is restored after rescaling so the target stays in
# its original units.
target <- data$price
# Min-max rescaling of every column to [0, 1]: (x - min) / (max - min).
# Each statistic is computed once per column, without coercing the data frame
# to a matrix.
col_min <- vapply(data, min, numeric(1))
col_range <- vapply(data, max, numeric(1)) - col_min
data1 <- sweep(sweep(data, 2, col_min), 2, col_range, FUN = "/")
data1$price <- target
summary(data1)
##  normalized-losses   wheel-base         length           width       
##  Min.   :0.0000    Min.   :0.0000   Min.   :0.0000   Min.   :0.0000  
##  1st Qu.:0.1885    1st Qu.:0.2303   1st Qu.:0.3836   1st Qu.:0.3248  
##  Median :0.2984    Median :0.3032   Median :0.4791   Median :0.4444  
##  Mean   :0.2984    Mean   :0.3556   Mean   :0.4940   Mean   :0.4777  
##  3rd Qu.:0.3770    3rd Qu.:0.4606   3rd Qu.:0.6328   3rd Qu.:0.5385  
##  Max.   :1.0000    Max.   :1.0000   Max.   :1.0000   Max.   :1.0000  
##      height        curb-weight          bore            stroke      
##  Min.   :0.0000   Min.   :0.0000   Min.   :0.0000   Min.   :0.0000  
##  1st Qu.:0.3500   1st Qu.:0.2642   1st Qu.:0.4357   1st Qu.:0.4952  
##  Median :0.5250   Median :0.3592   Median :0.5500   Median :0.5810  
##  Mean   :0.4972   Mean   :0.4141   Mean   :0.5648   Mean   :0.5652  
##  3rd Qu.:0.6417   3rd Qu.:0.5578   3rd Qu.:0.7429   3rd Qu.:0.6381  
##  Max.   :1.0000   Max.   :1.0000   Max.   :1.0000   Max.   :1.0000  
##  compression-rate   horsepower        peak-rpm         city-mpg     
##  Min.   :0.0000   Min.   :0.0000   Min.   :0.0000   Min.   :0.0000  
##  1st Qu.:0.1000   1st Qu.:0.1028   1st Qu.:0.2653   1st Qu.:0.1667  
##  Median :0.1250   Median :0.2196   Median :0.3949   Median :0.3056  
##  Mean   :0.1978   Mean   :0.2589   Mean   :0.3949   Mean   :0.3383  
##  3rd Qu.:0.1500   3rd Qu.:0.3178   3rd Qu.:0.5510   3rd Qu.:0.4722  
##  Max.   :1.0000   Max.   :1.0000   Max.   :1.0000   Max.   :1.0000  
##   highway-mpg         price      
##  Min.   :0.0000   Min.   : 5118  
##  1st Qu.:0.2368   1st Qu.: 7775  
##  Median :0.3684   Median :10295  
##  Mean   :0.3865   Mean   :13207  
##  3rd Qu.:0.4737   3rd Qu.:16500  
##  Max.   :1.0000   Max.   :45400
Una forma más compacta de estandarización:
data2 <- data
# Despite their names, `med` holds the column minima and `dev` the column
# ranges (max - min), so scale() performs a min-max rescaling to [0, 1].
med <- vapply(data2, min, numeric(1))
dev <- vapply(data2, max, numeric(1)) - med
data2 <- as.data.frame(scale(data2, center = med, scale = dev))
# Put the unscaled price back as the target column.
data2$price <- target
summary(data2)
##  normalized-losses   wheel-base         length           width       
##  Min.   :0.0000    Min.   :0.0000   Min.   :0.0000   Min.   :0.0000  
##  1st Qu.:0.1885    1st Qu.:0.2303   1st Qu.:0.3836   1st Qu.:0.3248  
##  Median :0.2984    Median :0.3032   Median :0.4791   Median :0.4444  
##  Mean   :0.2984    Mean   :0.3556   Mean   :0.4940   Mean   :0.4777  
##  3rd Qu.:0.3770    3rd Qu.:0.4606   3rd Qu.:0.6328   3rd Qu.:0.5385  
##  Max.   :1.0000    Max.   :1.0000   Max.   :1.0000   Max.   :1.0000  
##      height        curb-weight          bore            stroke      
##  Min.   :0.0000   Min.   :0.0000   Min.   :0.0000   Min.   :0.0000  
##  1st Qu.:0.3500   1st Qu.:0.2642   1st Qu.:0.4357   1st Qu.:0.4952  
##  Median :0.5250   Median :0.3592   Median :0.5500   Median :0.5810  
##  Mean   :0.4972   Mean   :0.4141   Mean   :0.5648   Mean   :0.5652  
##  3rd Qu.:0.6417   3rd Qu.:0.5578   3rd Qu.:0.7429   3rd Qu.:0.6381  
##  Max.   :1.0000   Max.   :1.0000   Max.   :1.0000   Max.   :1.0000  
##  compression-rate   horsepower        peak-rpm         city-mpg     
##  Min.   :0.0000   Min.   :0.0000   Min.   :0.0000   Min.   :0.0000  
##  1st Qu.:0.1000   1st Qu.:0.1028   1st Qu.:0.2653   1st Qu.:0.1667  
##  Median :0.1250   Median :0.2196   Median :0.3949   Median :0.3056  
##  Mean   :0.1978   Mean   :0.2589   Mean   :0.3949   Mean   :0.3383  
##  3rd Qu.:0.1500   3rd Qu.:0.3178   3rd Qu.:0.5510   3rd Qu.:0.4722  
##  Max.   :1.0000   Max.   :1.0000   Max.   :1.0000   Max.   :1.0000  
##   highway-mpg         price      
##  Min.   :0.0000   Min.   : 5118  
##  1st Qu.:0.2368   1st Qu.: 7775  
##  Median :0.3684   Median :10295  
##  Mean   :0.3865   Mean   :13207  
##  3rd Qu.:0.4737   3rd Qu.:16500  
##  Max.   :1.0000   Max.   :45400

3. MODELLING

Análisis de correlación:
library(corrplot)
library(Hmisc)
# Pairwise correlations between all columns; rcorr() returns the coefficient
# matrix (r), the sample sizes (n) and the p-values (P).
corr <- rcorr(as.matrix(data2))
corr[["r"]]
##                   normalized-losses  wheel-base      length       width
## normalized-losses        1.00000000 -0.05666124  0.01942356  0.08680206
## wheel-base              -0.05666124  1.00000000  0.87602389  0.81450665
## length                   0.01942356  0.87602389  1.00000000  0.85717032
## width                    0.08680206  0.81450665  0.85717032  1.00000000
## height                  -0.37373695  0.59074167  0.49206255  0.30600216
## curb-weight              0.09940425  0.78209724  0.88066479  0.86620110
## bore                    -0.02979985  0.49320299  0.60894083  0.54487909
## stroke                   0.05512732  0.15796369  0.12391279  0.18881359
## compression-rate        -0.11471325  0.25031309  0.15973311  0.18986712
## horsepower               0.21730000  0.37124988  0.57973062  0.61500603
## peak-rpm                 0.23954380 -0.36023264 -0.28603534 -0.24585165
## city-mpg                -0.22501573 -0.47060641 -0.66519239 -0.63353064
## highway-mpg             -0.18187718 -0.54330447 -0.69814185 -0.68063521
## price                    0.13399873  0.58464182  0.69062838  0.75126534
##                        height curb-weight         bore      stroke
## normalized-losses -0.37373695  0.09940425 -0.029799848  0.05512732
## wheel-base         0.59074167  0.78209724  0.493202986  0.15796369
## length             0.49206255  0.88066479  0.608940834  0.12391279
## width              0.30600216  0.86620110  0.544879092  0.18881359
## height             1.00000000  0.30758082  0.180326923 -0.06082202
## curb-weight        0.30758082  1.00000000  0.644040577  0.16741187
## bore               0.18032692  0.64404058  1.000000000 -0.05539001
## stroke            -0.06082202  0.16741187 -0.055390011  1.00000000
## compression-rate   0.25973714  0.15643261  0.001249645  0.18785352
## horsepower        -0.08694068  0.75799367  0.566837859  0.09781468
## peak-rpm          -0.30991346 -0.27934961 -0.267338383 -0.06371957
## city-mpg          -0.04979997 -0.74954309 -0.582121055 -0.03407894
## highway-mpg       -0.10481184 -0.79488894 -0.591390045 -0.03474142
## price              0.13548631  0.83441453  0.543153766  0.08226710
##                   compression-rate  horsepower    peak-rpm    city-mpg
## normalized-losses     -0.114713246  0.21730000  0.23954380 -0.22501573
## wheel-base             0.250313088  0.37124988 -0.36023264 -0.47060641
## length                 0.159733109  0.57973062 -0.28603534 -0.66519239
## width                  0.189867118  0.61500603 -0.24585165 -0.63353064
## height                 0.259737141 -0.08694068 -0.30991346 -0.04979997
## curb-weight            0.156432613  0.75799367 -0.27934961 -0.74954309
## bore                   0.001249645  0.56683786 -0.26733838 -0.58212106
## stroke                 0.187853516  0.09781468 -0.06371957 -0.03407894
## compression-rate       1.000000000 -0.21443063 -0.43572083  0.33142484
## horsepower            -0.214430629  1.00000000  0.10788157 -0.82213847
## peak-rpm              -0.435720829  0.10788157  1.00000000 -0.11535804
## city-mpg               0.331424839 -0.82213847 -0.11535804  1.00000000
## highway-mpg            0.268464848 -0.80458746 -0.05860516  0.97204371
## price                  0.071107327  0.80968120 -0.10154203 -0.68657101
##                   highway-mpg       price
## normalized-losses -0.18187718  0.13399873
## wheel-base        -0.54330447  0.58464182
## length            -0.69814185  0.69062838
## width             -0.68063521  0.75126534
## height            -0.10481184  0.13548631
## curb-weight       -0.79488894  0.83441453
## bore              -0.59139004  0.54315377
## stroke            -0.03474142  0.08226710
## compression-rate   0.26846485  0.07110733
## horsepower        -0.80458746  0.80968120
## peak-rpm          -0.05860516 -0.10154203
## city-mpg           0.97204371 -0.68657101
## highway-mpg        1.00000000 -0.70469227
## price             -0.70469227  1.00000000
Uso de orden jerárquico para ordenar la correlación (se tachan las correlaciones que no son significativamente diferentes de cero):
# Missing p-values (e.g. on the diagonal) are set to 0 so that corrplot()
# does not cross those cells out as non-significant.
corr$P <- replace(corr$P, is.na(corr$P), 0)
corrplot(corr$r, method = "circle", order = "hclust", p.mat = corr$P)

\(\textbf{Ejercicio 1:}\) Calcula la correlación de normalized-losses y wheel-base usando la definición de correlación entre dos vectores. \(\textbf{Ejercicio 2:}\) Define el volumen con length*width*height y normaliza entre 1000. Grafica el boxplot del volumen de acuerdo al estilo del carro.

Plot de variables altamente correlacionadas:
# x is the rescaled horsepower from data2; y is the raw price from data.
plot(
  x = data2$horsepower, y = data$price,
  main = "Price vs horsepower", xlab = "horsepower", ylab = "price",
  pch = 16, col = "blue", cex = 1.5
)

Para aplicar el método de clasificación supervisada \(k\)-nn, puedes consultar ejemplos en knn. Una vez realizada la lectura, procedemos a aplicar el método en nuestro caso. Primero, veamos el impacto de una variable, por ejemplo height, en la descripción de la variable price.
set.seed(1987)
# Draw half of the row indices (rounded down) for training; the rest are test.
n_filas <- nrow(data2)
dat.d <- sample(seq_len(n_filas), size = floor(n_filas * 0.5), replace = FALSE)
# Drop the target by name instead of by a hard-coded column position.
predictoras <- setdiff(colnames(data2), "price")
entrenamiento <- data2[dat.d, predictoras, drop = FALSE]  # training half
prueba <- data2[-dat.d, predictoras, drop = FALSE]        # remaining rows: test

precio_entrenamiento <- target[dat.d]
precio_prueba <- target[-dat.d]
library(class)
k1 <- floor(sqrt(NROW(entrenamiento)))  # rule-of-thumb number of neighbours
# NOTE(review): class::knn() is a classifier, so every distinct price is
# treated as a class label rather than as a numeric response.
knn.k1 <- knn(train = as.matrix(entrenamiento$height),
              test = as.matrix(prueba$height),
              cl = precio_entrenamiento, k = k1)
Desempeño del algoritmo para \(k = k_1\), medido con el coeficiente de correlación \(r\) entre el precio predicho y el precio real (referido aquí como \(r^2\)-score), para la variable height.
# knn() returns a factor of predicted labels. Go through as.character() so the
# label text, not the factor level code, becomes the number. This keeps x1
# numeric instead of relying on implicit character coercion downstream.
x1 <- as.numeric(as.character(knn.k1))
plot(x1, precio_prueba)

# Despite the variable name, rcorr() reports the correlation coefficient r
# (together with n and the p-value), not r^2.
r2score1 <- rcorr(x = x1, y = precio_prueba)
print(r2score1)
##      x    y
## x 1.00 0.16
## y 0.16 1.00
## 
## n= 101 
## 
## 
## P
##   x      y     
## x        0.1035
## y 0.1035
Ahora veamos el desempeño del algoritmo para \(k = 1,3,5,7,9,11,13,15,20\), medido con el coeficiente de correlación \(r\) entre el precio predicho y el precio real (referido aquí como \(r^2\)-score), para cada una de las 13 variables.
# Neighbour counts to evaluate.
k <- c(1, 3, 5, 7, 9, 11, 13, 15, 20)
n_variables <- ncol(entrenamiento)

# One row per k, one column per predictor. Preallocated instead of growing a
# data frame cell by cell.
r2score <- matrix(NA_real_, nrow = length(k), ncol = n_variables,
                  dimnames = list(NULL, colnames(entrenamiento)))
# The loop order (predictor outer, k inner) is kept so the sequence of knn()
# calls, and therefore any random tie-breaking, is unchanged.
for (j in seq_len(n_variables)) {
  for (i in seq_along(k)) {
    prediccion <- knn(train = as.matrix(entrenamiento[, j]),
                      test = as.matrix(prueba[, j]),
                      cl = precio_entrenamiento,
                      k = k[i])
    # Correlation between the predicted labels (as numbers) and the real price.
    r2score[i, j] <- rcorr(as.numeric(as.character(prediccion)),
                           precio_prueba)$r[1, 2]
  }
}
r2score <- as.data.frame(r2score)
re2score.graficar <- data.frame(k = k, r2score)
library(reshape2)
# Long format: one row per (k, variable) pair for plotting.
graficar <- melt(re2score.graficar, id.vars = "k")
# ggplot() is used below, so attach ggplot2 explicitly rather than relying on
# another package having attached it.
library(ggplot2)
plot.k <- ggplot(graficar, aes(x = k, y = value, color = variable)) +
  geom_line()
plot.k

library(plotly)
# Interactive version of the same plot.
ggplotly(plot.k)