1. Introducción

Realización del Proyecto Fin de Máster BigData 2019/2020. Vamos a realizar un análisis del dataset seleccionado “Breast Cancer Wisconsin” para presentar un modelo de Machine Learning que sea capaz de predecir el diagnóstico de un paciente dado, determinando si el cáncer que padece es maligno o benigno (M = malignant, B = benign). El dataset se encuentra disponible en la dirección https://www.kaggle.com/uciml/breast-cancer-wisconsin-data y la descripción de las variables y del tipo de cáncer (variable objetivo) se encuentra también disponible en https://archive.ics.uci.edu/ml/datasets/Breast+Cancer+Wisconsin+%28Diagnostic%29; descargando el fichero wdbc.names podemos encontrar el número de observaciones y la descripción de las variables.

Es interesante indicar que las características han sido obtenidas mediante un proceso computacional, analizando imágenes de los núcleos de las células tumorales de las pacientes.

2. Carga de datos

Descargamos el archivo y leemos el contenido del fichero con las caracteristicas.

# Download the zipped dataset and load the CSV that holds the features.
# Alternative Kaggle URL, kept commented out:
#fileURL <- "https://www.kaggle.com/uciml/breast-cancer-wisconsin-data/download/breast-cancer-wisconsin-data.zip"
fileURL <- "https://query.data.world/s/d3ikssglnxd5hn3257bokisfg2qvtp"

# Every file lives under one directory; define it once so the location only
# has to be changed in a single place.
dataDir <- "F:/colmenar/master big data/PFM/datos"
zipPath <- file.path(dataDir, "breast-cancer-wisconsin-data.zip")
csvPath <- file.path(dataDir, "data.csv")

# Install 'downloader' only when it is missing. requireNamespace() checks the
# package itself, whereas `%in% installed.packages()` matches any cell of the
# whole matrix (versions, dependency fields, ...).
if (!requireNamespace("downloader", quietly = TRUE)) {
  install.packages("downloader")
}
library(downloader)
## Warning: package 'downloader' was built under R version 3.6.3
download(fileURL, zipPath, mode = "wb")
unzip(zipPath, exdir = dataDir)
# Keep a record of when the data was downloaded.
fechaDescarga <- date()
fechaDescarga
## [1] "Thu Aug 06 18:07:47 2020"
# read.csv() opens and closes the file itself, so no connection is left open
# if parsing fails. stringsAsFactors = TRUE keeps 'diagnosis' a factor on
# R >= 4.0 too, which the rest of the analysis relies on.
breastCan <- read.csv(csvPath, stringsAsFactors = TRUE)
kable(head(breastCan[,1:5]))
id diagnosis radius_mean texture_mean perimeter_mean
842302 M 17.99 10.38 122.80
842517 M 20.57 17.77 132.90
84300903 M 19.69 21.25 130.00
84348301 M 11.42 20.38 77.58
84358402 M 20.29 14.34 135.10
843786 M 12.45 15.70 82.57

3. Analisis descriptivo

Vamos a ir realizando una serie de estudios para ir conociendo el dataset y sus características. Observaciones y variables del dataset:

dim(breastCan) # observaciones y variables del dataset
## [1] 569  33
str(breastCan)
## 'data.frame':    569 obs. of  33 variables:
##  $ id                     : int  842302 842517 84300903 84348301 84358402 843786 844359 84458202 844981 84501001 ...
##  $ diagnosis              : Factor w/ 2 levels "B","M": 2 2 2 2 2 2 2 2 2 2 ...
##  $ radius_mean            : num  18 20.6 19.7 11.4 20.3 ...
##  $ texture_mean           : num  10.4 17.8 21.2 20.4 14.3 ...
##  $ perimeter_mean         : num  122.8 132.9 130 77.6 135.1 ...
##  $ area_mean              : num  1001 1326 1203 386 1297 ...
##  $ smoothness_mean        : num  0.1184 0.0847 0.1096 0.1425 0.1003 ...
##  $ compactness_mean       : num  0.2776 0.0786 0.1599 0.2839 0.1328 ...
##  $ concavity_mean         : num  0.3001 0.0869 0.1974 0.2414 0.198 ...
##  $ concave.points_mean    : num  0.1471 0.0702 0.1279 0.1052 0.1043 ...
##  $ symmetry_mean          : num  0.242 0.181 0.207 0.26 0.181 ...
##  $ fractal_dimension_mean : num  0.0787 0.0567 0.06 0.0974 0.0588 ...
##  $ radius_se              : num  1.095 0.543 0.746 0.496 0.757 ...
##  $ texture_se             : num  0.905 0.734 0.787 1.156 0.781 ...
##  $ perimeter_se           : num  8.59 3.4 4.58 3.44 5.44 ...
##  $ area_se                : num  153.4 74.1 94 27.2 94.4 ...
##  $ smoothness_se          : num  0.0064 0.00522 0.00615 0.00911 0.01149 ...
##  $ compactness_se         : num  0.049 0.0131 0.0401 0.0746 0.0246 ...
##  $ concavity_se           : num  0.0537 0.0186 0.0383 0.0566 0.0569 ...
##  $ concave.points_se      : num  0.0159 0.0134 0.0206 0.0187 0.0188 ...
##  $ symmetry_se            : num  0.03 0.0139 0.0225 0.0596 0.0176 ...
##  $ fractal_dimension_se   : num  0.00619 0.00353 0.00457 0.00921 0.00511 ...
##  $ radius_worst           : num  25.4 25 23.6 14.9 22.5 ...
##  $ texture_worst          : num  17.3 23.4 25.5 26.5 16.7 ...
##  $ perimeter_worst        : num  184.6 158.8 152.5 98.9 152.2 ...
##  $ area_worst             : num  2019 1956 1709 568 1575 ...
##  $ smoothness_worst       : num  0.162 0.124 0.144 0.21 0.137 ...
##  $ compactness_worst      : num  0.666 0.187 0.424 0.866 0.205 ...
##  $ concavity_worst        : num  0.712 0.242 0.45 0.687 0.4 ...
##  $ concave.points_worst   : num  0.265 0.186 0.243 0.258 0.163 ...
##  $ symmetry_worst         : num  0.46 0.275 0.361 0.664 0.236 ...
##  $ fractal_dimension_worst: num  0.1189 0.089 0.0876 0.173 0.0768 ...
##  $ X                      : logi  NA NA NA NA NA NA ...

Vemos que hay: - Un campo de tipo factor con la variable objetivo - Todas las variables predictoras son numéricas - Hay una columna con valores NA

Busqueda de observaciones con valores nulos:

any(is.na.data.frame(breastCan)) # buscamos valores nulos
## [1] TRUE

Paso de todos los nombres de las columnas a minusculas

names(breastCan) <- tolower(names(breastCan)) # nombres de las variables a minusculas
names(breastCan)
##  [1] "id"                      "diagnosis"              
##  [3] "radius_mean"             "texture_mean"           
##  [5] "perimeter_mean"          "area_mean"              
##  [7] "smoothness_mean"         "compactness_mean"       
##  [9] "concavity_mean"          "concave.points_mean"    
## [11] "symmetry_mean"           "fractal_dimension_mean" 
## [13] "radius_se"               "texture_se"             
## [15] "perimeter_se"            "area_se"                
## [17] "smoothness_se"           "compactness_se"         
## [19] "concavity_se"            "concave.points_se"      
## [21] "symmetry_se"             "fractal_dimension_se"   
## [23] "radius_worst"            "texture_worst"          
## [25] "perimeter_worst"         "area_worst"             
## [27] "smoothness_worst"        "compactness_worst"      
## [29] "concavity_worst"         "concave.points_worst"   
## [31] "symmetry_worst"          "fractal_dimension_worst"
## [33] "x"
kable(head(breastCan[,1:8])) # estructura del dataset
id diagnosis radius_mean texture_mean perimeter_mean area_mean smoothness_mean compactness_mean
842302 M 17.99 10.38 122.80 1001.0 0.11840 0.27760
842517 M 20.57 17.77 132.90 1326.0 0.08474 0.07864
84300903 M 19.69 21.25 130.00 1203.0 0.10960 0.15990
84348301 M 11.42 20.38 77.58 386.1 0.14250 0.28390
84358402 M 20.29 14.34 135.10 1297.0 0.10030 0.13280
843786 M 12.45 15.70 82.57 477.1 0.12780 0.17000

3.1. Añadimos nuevas variables

Vamos a transformar la variable objetivo diagnosis en numérica creando una nueva columna “diag”, siendo 1 Maligno y 0 Benigno

# Numeric copy of the target: 1 when diagnosis is "M", 0 otherwise.
breastCan$diag <- as.numeric(breastCan$diagnosis == "M")
# Proportion of observations in each class of the target.
tablaDiag <- table(breastCan$diag)
prop.table(tablaDiag)
## 
##         0         1 
## 0.6274165 0.3725835

Vamos a eliminar la columna X que contiene valores nulos y la columna id que no aporta nada

  # Drop the NA-only column "x" and the identifier "id", then continue with a
  # plain data.frame named brc.
  library(dplyr)
  breastCan2 <- breastCan %>% select(-x, -id)
  brc <- as.data.frame(breastCan2)
  # Check whether any missing value remains.
  any(is.na(brc))
## [1] FALSE
  str(brc)
## 'data.frame':    569 obs. of  32 variables:
##  $ diagnosis              : Factor w/ 2 levels "B","M": 2 2 2 2 2 2 2 2 2 2 ...
##  $ radius_mean            : num  18 20.6 19.7 11.4 20.3 ...
##  $ texture_mean           : num  10.4 17.8 21.2 20.4 14.3 ...
##  $ perimeter_mean         : num  122.8 132.9 130 77.6 135.1 ...
##  $ area_mean              : num  1001 1326 1203 386 1297 ...
##  $ smoothness_mean        : num  0.1184 0.0847 0.1096 0.1425 0.1003 ...
##  $ compactness_mean       : num  0.2776 0.0786 0.1599 0.2839 0.1328 ...
##  $ concavity_mean         : num  0.3001 0.0869 0.1974 0.2414 0.198 ...
##  $ concave.points_mean    : num  0.1471 0.0702 0.1279 0.1052 0.1043 ...
##  $ symmetry_mean          : num  0.242 0.181 0.207 0.26 0.181 ...
##  $ fractal_dimension_mean : num  0.0787 0.0567 0.06 0.0974 0.0588 ...
##  $ radius_se              : num  1.095 0.543 0.746 0.496 0.757 ...
##  $ texture_se             : num  0.905 0.734 0.787 1.156 0.781 ...
##  $ perimeter_se           : num  8.59 3.4 4.58 3.44 5.44 ...
##  $ area_se                : num  153.4 74.1 94 27.2 94.4 ...
##  $ smoothness_se          : num  0.0064 0.00522 0.00615 0.00911 0.01149 ...
##  $ compactness_se         : num  0.049 0.0131 0.0401 0.0746 0.0246 ...
##  $ concavity_se           : num  0.0537 0.0186 0.0383 0.0566 0.0569 ...
##  $ concave.points_se      : num  0.0159 0.0134 0.0206 0.0187 0.0188 ...
##  $ symmetry_se            : num  0.03 0.0139 0.0225 0.0596 0.0176 ...
##  $ fractal_dimension_se   : num  0.00619 0.00353 0.00457 0.00921 0.00511 ...
##  $ radius_worst           : num  25.4 25 23.6 14.9 22.5 ...
##  $ texture_worst          : num  17.3 23.4 25.5 26.5 16.7 ...
##  $ perimeter_worst        : num  184.6 158.8 152.5 98.9 152.2 ...
##  $ area_worst             : num  2019 1956 1709 568 1575 ...
##  $ smoothness_worst       : num  0.162 0.124 0.144 0.21 0.137 ...
##  $ compactness_worst      : num  0.666 0.187 0.424 0.866 0.205 ...
##  $ concavity_worst        : num  0.712 0.242 0.45 0.687 0.4 ...
##  $ concave.points_worst   : num  0.265 0.186 0.243 0.258 0.163 ...
##  $ symmetry_worst         : num  0.46 0.275 0.361 0.664 0.236 ...
##  $ fractal_dimension_worst: num  0.1189 0.089 0.0876 0.173 0.0768 ...
##  $ diag                   : num  1 1 1 1 1 1 1 1 1 1 ...

Visualizamos las variables que han quedado y sus principales valores estadísticos:

  summary(brc)
##  diagnosis  radius_mean      texture_mean   perimeter_mean  
##  B:357     Min.   : 6.981   Min.   : 9.71   Min.   : 43.79  
##  M:212     1st Qu.:11.700   1st Qu.:16.17   1st Qu.: 75.17  
##            Median :13.370   Median :18.84   Median : 86.24  
##            Mean   :14.127   Mean   :19.29   Mean   : 91.97  
##            3rd Qu.:15.780   3rd Qu.:21.80   3rd Qu.:104.10  
##            Max.   :28.110   Max.   :39.28   Max.   :188.50  
##    area_mean      smoothness_mean   compactness_mean  concavity_mean   
##  Min.   : 143.5   Min.   :0.05263   Min.   :0.01938   Min.   :0.00000  
##  1st Qu.: 420.3   1st Qu.:0.08637   1st Qu.:0.06492   1st Qu.:0.02956  
##  Median : 551.1   Median :0.09587   Median :0.09263   Median :0.06154  
##  Mean   : 654.9   Mean   :0.09636   Mean   :0.10434   Mean   :0.08880  
##  3rd Qu.: 782.7   3rd Qu.:0.10530   3rd Qu.:0.13040   3rd Qu.:0.13070  
##  Max.   :2501.0   Max.   :0.16340   Max.   :0.34540   Max.   :0.42680  
##  concave.points_mean symmetry_mean    fractal_dimension_mean
##  Min.   :0.00000     Min.   :0.1060   Min.   :0.04996       
##  1st Qu.:0.02031     1st Qu.:0.1619   1st Qu.:0.05770       
##  Median :0.03350     Median :0.1792   Median :0.06154       
##  Mean   :0.04892     Mean   :0.1812   Mean   :0.06280       
##  3rd Qu.:0.07400     3rd Qu.:0.1957   3rd Qu.:0.06612       
##  Max.   :0.20120     Max.   :0.3040   Max.   :0.09744       
##    radius_se        texture_se      perimeter_se       area_se       
##  Min.   :0.1115   Min.   :0.3602   Min.   : 0.757   Min.   :  6.802  
##  1st Qu.:0.2324   1st Qu.:0.8339   1st Qu.: 1.606   1st Qu.: 17.850  
##  Median :0.3242   Median :1.1080   Median : 2.287   Median : 24.530  
##  Mean   :0.4052   Mean   :1.2169   Mean   : 2.866   Mean   : 40.337  
##  3rd Qu.:0.4789   3rd Qu.:1.4740   3rd Qu.: 3.357   3rd Qu.: 45.190  
##  Max.   :2.8730   Max.   :4.8850   Max.   :21.980   Max.   :542.200  
##  smoothness_se      compactness_se      concavity_se    
##  Min.   :0.001713   Min.   :0.002252   Min.   :0.00000  
##  1st Qu.:0.005169   1st Qu.:0.013080   1st Qu.:0.01509  
##  Median :0.006380   Median :0.020450   Median :0.02589  
##  Mean   :0.007041   Mean   :0.025478   Mean   :0.03189  
##  3rd Qu.:0.008146   3rd Qu.:0.032450   3rd Qu.:0.04205  
##  Max.   :0.031130   Max.   :0.135400   Max.   :0.39600  
##  concave.points_se   symmetry_se       fractal_dimension_se
##  Min.   :0.000000   Min.   :0.007882   Min.   :0.0008948   
##  1st Qu.:0.007638   1st Qu.:0.015160   1st Qu.:0.0022480   
##  Median :0.010930   Median :0.018730   Median :0.0031870   
##  Mean   :0.011796   Mean   :0.020542   Mean   :0.0037949   
##  3rd Qu.:0.014710   3rd Qu.:0.023480   3rd Qu.:0.0045580   
##  Max.   :0.052790   Max.   :0.078950   Max.   :0.0298400   
##   radius_worst   texture_worst   perimeter_worst    area_worst    
##  Min.   : 7.93   Min.   :12.02   Min.   : 50.41   Min.   : 185.2  
##  1st Qu.:13.01   1st Qu.:21.08   1st Qu.: 84.11   1st Qu.: 515.3  
##  Median :14.97   Median :25.41   Median : 97.66   Median : 686.5  
##  Mean   :16.27   Mean   :25.68   Mean   :107.26   Mean   : 880.6  
##  3rd Qu.:18.79   3rd Qu.:29.72   3rd Qu.:125.40   3rd Qu.:1084.0  
##  Max.   :36.04   Max.   :49.54   Max.   :251.20   Max.   :4254.0  
##  smoothness_worst  compactness_worst concavity_worst  concave.points_worst
##  Min.   :0.07117   Min.   :0.02729   Min.   :0.0000   Min.   :0.00000     
##  1st Qu.:0.11660   1st Qu.:0.14720   1st Qu.:0.1145   1st Qu.:0.06493     
##  Median :0.13130   Median :0.21190   Median :0.2267   Median :0.09993     
##  Mean   :0.13237   Mean   :0.25427   Mean   :0.2722   Mean   :0.11461     
##  3rd Qu.:0.14600   3rd Qu.:0.33910   3rd Qu.:0.3829   3rd Qu.:0.16140     
##  Max.   :0.22260   Max.   :1.05800   Max.   :1.2520   Max.   :0.29100     
##  symmetry_worst   fractal_dimension_worst      diag       
##  Min.   :0.1565   Min.   :0.05504         Min.   :0.0000  
##  1st Qu.:0.2504   1st Qu.:0.07146         1st Qu.:0.0000  
##  Median :0.2822   Median :0.08004         Median :0.0000  
##  Mean   :0.2901   Mean   :0.08395         Mean   :0.3726  
##  3rd Qu.:0.3179   3rd Qu.:0.09208         3rd Qu.:1.0000  
##  Max.   :0.6638   Max.   :0.20750         Max.   :1.0000
  prop.table( table(brc$diag))
## 
##         0         1 
## 0.6274165 0.3725835

Número de observaciones y variables:

  dim(brc)
## [1] 569  32

3.2.Normalizar los datos

En el summary vemos que hay mucha diferencia de escala entre los valores de unas variables y otras. Vamos a normalizar los valores para que puedan ser bien representados en las gráficas:

# Centre and scale each of the 30 predictors (columns 2 to 31 of brc), then
# re-attach the numeric target unchanged.
predictorasEscaladas <- lapply(
  brc[, 2:31],
  function(columna) scale(columna, center = TRUE, scale = TRUE)
)
brc_n <- as.data.frame(predictorasEscaladas)
brc_n$diag <- brc$diag

Visualizamos antes

summary(brc[,c("radius_mean", "area_mean", "smoothness_mean")])
##   radius_mean       area_mean      smoothness_mean  
##  Min.   : 6.981   Min.   : 143.5   Min.   :0.05263  
##  1st Qu.:11.700   1st Qu.: 420.3   1st Qu.:0.08637  
##  Median :13.370   Median : 551.1   Median :0.09587  
##  Mean   :14.127   Mean   : 654.9   Mean   :0.09636  
##  3rd Qu.:15.780   3rd Qu.: 782.7   3rd Qu.:0.10530  
##  Max.   :28.110   Max.   :2501.0   Max.   :0.16340

Y despues de normalizar:

summary(brc_n[,c("radius_mean", "area_mean", "smoothness_mean")])
##   radius_mean        area_mean       smoothness_mean   
##  Min.   :-2.0279   Min.   :-1.4532   Min.   :-3.10935  
##  1st Qu.:-0.6888   1st Qu.:-0.6666   1st Qu.:-0.71034  
##  Median :-0.2149   Median :-0.2949   Median :-0.03486  
##  Mean   : 0.0000   Mean   : 0.0000   Mean   : 0.00000  
##  3rd Qu.: 0.4690   3rd Qu.: 0.3632   3rd Qu.: 0.63564  
##  Max.   : 3.9678   Max.   : 5.2459   Max.   : 4.76672

3.3. Matriz de correlacion

Vamos a estudiar el grado de correlación entre las variables. Las predictoras más correladas entre sí no las voy a eliminar manualmente, ya que usaré los métodos de caret para que lo haga automáticamente, mediante findCorrelation:

Exportacion fichero JSON para d3 ( matriz de correlacion):

# Table exported to D3 for the correlation matrix: drop the "diag" column,
# copy the row names into VariableOrigen and drop the "diag" row.
# NOTE(review): `correlacion` is not created anywhere in the visible code --
# presumably a data-frame version of the correlation matrix; confirm.
expD3Cor <-  correlacion %>% select (-diag) 
expD3Cor$VariableOrigen <- rownames(expD3Cor)
expD3Cor <- expD3Cor %>% filter(VariableOrigen != "diag")

Gráfica con la matriz de correlación:

# Red-white-blue palette generator.
# NOTE(review): `col` is never passed to corrplot() below, so it has no effect
# on this plot -- confirm whether a `col = col(...)` argument was intended.
col <- colorRampPalette(c("#BB4444", "#EE9988", "#FFFFFF", "#77AADD", "#4477AA"))
# Single-panel plotting layout.
par(mfrow=c(1,1)) 

# Shaded plot of the upper triangle of matCor, without the diagonal, with the
# coefficients printed in black and the variables reordered (order = "AOE").
# NOTE(review): `matCor` and the package providing corrplot() are not defined
# or loaded in the visible code; is.corr = FALSE on a correlation matrix also
# looks unusual -- confirm.
corrplot(matCor, method = "shade",shade.col = NA, tl.col = "black",
          tl.cex = 0.6, number.cex = 0.3, mar=c(0,0,0,0),type = "upper",addCoef.col="black",is.corr=FALSE,
         diag = FALSE,order="AOE")

3.4.Reducción de la dimensionalidad: findcorrelation

# Fix the RNG seed so the results can be reproduced.
set.seed(7)
# Load the libraries.
library(mlbench)
library(caret)
# Correlation matrix of the 30 predictors only: every column of brc except the
# target in its factor ("diagnosis") and numeric ("diag") forms. setdiff()
# keeps the column order of brc.
predictorNames <- setdiff(names(brc), c("diagnosis", "diag"))
correlationMatrix <- cor(brc[, predictorNames])
# Show the first eight columns of the correlation matrix.
kable(correlationMatrix[, 1:8])
radius_mean texture_mean perimeter_mean area_mean smoothness_mean compactness_mean concavity_mean concave.points_mean
radius_mean 1.0000000 0.3237819 0.9978553 0.9873572 0.1705812 0.5061236 0.6767636 0.8225285
texture_mean 0.3237819 1.0000000 0.3295331 0.3210857 -0.0233885 0.2367022 0.3024178 0.2934641
perimeter_mean 0.9978553 0.3295331 1.0000000 0.9865068 0.2072782 0.5569362 0.7161357 0.8509770
area_mean 0.9873572 0.3210857 0.9865068 1.0000000 0.1770284 0.4985017 0.6859828 0.8232689
smoothness_mean 0.1705812 -0.0233885 0.2072782 0.1770284 1.0000000 0.6591232 0.5219838 0.5536952
compactness_mean 0.5061236 0.2367022 0.5569362 0.4985017 0.6591232 1.0000000 0.8831207 0.8311350
concavity_mean 0.6767636 0.3024178 0.7161357 0.6859828 0.5219838 0.8831207 1.0000000 0.9213910
concave.points_mean 0.8225285 0.2934641 0.8509770 0.8232689 0.5536952 0.8311350 0.9213910 1.0000000
symmetry_mean 0.1477412 0.0714010 0.1830272 0.1512931 0.5577748 0.6026410 0.5006666 0.4624974
fractal_dimension_mean -0.3116308 -0.0764372 -0.2614769 -0.2831098 0.5847920 0.5653687 0.3367834 0.1669174
radius_se 0.6790904 0.2758687 0.6917650 0.7325622 0.3014671 0.4974734 0.6319248 0.6980498
texture_se -0.0973174 0.3863576 -0.0867611 -0.0662802 0.0684064 0.0462048 0.0762183 0.0214796
perimeter_se 0.6741716 0.2816731 0.6931349 0.7266283 0.2960919 0.5489053 0.6603908 0.7106499
area_se 0.7358637 0.2598450 0.7449827 0.8000859 0.2465524 0.4556529 0.6174268 0.6902985
smoothness_se -0.2226001 0.0066138 -0.2026940 -0.1667767 0.3323754 0.1352993 0.0985637 0.0276533
compactness_se 0.2060000 0.1919746 0.2507437 0.2125826 0.3189433 0.7387218 0.6702788 0.4904242
concavity_se 0.1942036 0.1432931 0.2280823 0.2076601 0.2483957 0.5705169 0.6912702 0.4391671
concave.points_se 0.3761690 0.1638510 0.4072169 0.3723203 0.3806757 0.6422619 0.6832599 0.6156341
symmetry_se -0.1043209 0.0091272 -0.0816293 -0.0724966 0.2007744 0.2299766 0.1780092 0.0953508
fractal_dimension_se -0.0426413 0.0544575 -0.0055234 -0.0198870 0.2836067 0.5073181 0.4493007 0.2575837
radius_worst 0.9695390 0.3525729 0.9694764 0.9627461 0.2131201 0.5353154 0.6882364 0.8303176
texture_worst 0.2970076 0.9120446 0.3030384 0.2874886 0.0360718 0.2481328 0.2998789 0.2927517
perimeter_worst 0.9651365 0.3580396 0.9703869 0.9591196 0.2388526 0.5902104 0.7295649 0.8559231
area_worst 0.9410825 0.3435459 0.9415498 0.9592133 0.2067184 0.5096038 0.6759872 0.8096296
smoothness_worst 0.1196161 0.0775034 0.1505494 0.1235229 0.8053242 0.5655412 0.4488220 0.4527531
compactness_worst 0.4134628 0.2778296 0.4557742 0.3904103 0.4724684 0.8658090 0.7549680 0.6674537
concavity_worst 0.5269115 0.3010252 0.5638793 0.5126059 0.4349257 0.8162752 0.8841026 0.7523995
concave.points_worst 0.7442142 0.2953158 0.7712408 0.7220166 0.5030534 0.8155732 0.8613230 0.9101553
symmetry_worst 0.1639533 0.1050079 0.1891150 0.1435699 0.3943095 0.5102234 0.4094641 0.3757441
fractal_dimension_worst 0.0070659 0.1192054 0.0510185 0.0037376 0.4993164 0.6873823 0.5149299 0.3686611
Variables altamente correladas:
# Find attributes that are highly correlated (cutoff 0.75).
highlyCorrelated <- findCorrelation(correlationMatrix, cutoff=0.75)
# Print the indexes of the highly correlated attributes.
print(highlyCorrelated)
##  [1]  7  8  6 28 27 23 21  3 26 24  1 13 18 16 14  5 10  2
# Table with the variables plus a column flagging the ones dropped as highly
# correlated predictors.
# The indexes above refer to the columns of correlationMatrix, not to brc
# (whose first column is "diagnosis"), so the names must come from the matrix;
# indexing brc directly shifts every name by one position.
expD3CorFind <- expD3Cor
expD3CorFind$Eliminada <- expD3CorFind$VariableOrigen %in%
  colnames(correlationMatrix)[highlyCorrelated]
CorCuantas <- expD3CorFind %>% group_by(Eliminada) %>% summarise(n=n())
CorCuantas
## # A tibble: 2 x 2
##   Eliminada     n
##   <lgl>     <int>
## 1 FALSE        12
## 2 TRUE         18

Los nombres de las variables mas correladas son:

# highlyCorrelated indexes the columns of correlationMatrix, so the names are
# read from the matrix; brc starts with "diagnosis", which shifted every name
# by one position when brc was indexed directly.
print(colnames(correlationMatrix)[highlyCorrelated])
##  [1] "concavity_mean"         "concave.points_mean"    "compactness_mean"
##  [4] "concave.points_worst"   "concavity_worst"        "perimeter_worst"
##  [7] "radius_worst"           "perimeter_mean"         "compactness_worst"
## [10] "area_worst"             "radius_mean"            "perimeter_se"
## [13] "concave.points_se"      "compactness_se"         "area_se"
## [16] "smoothness_mean"        "fractal_dimension_mean" "texture_mean"

3.5. Vamos a usar el algoritmo LVQ para determinar la importancia de las variables

# Fix the RNG seed so the resampling can be reproduced.
set.seed(7)
# Resampling scheme: 10-fold cross-validation repeated 3 times.
control <- trainControl(method = "repeatedcv", number = 10, repeats = 3)
# Modelling table: the "diagnosis" column is dropped and the numeric target
# "diag" is converted to a factor.
brc2 <- brc %>%
  select(-diagnosis) %>%
  mutate(diag = factor(diag)) %>%
  as.data.frame()
dim(brc2)
## [1] 569  31
summary(brc2)
##   radius_mean      texture_mean   perimeter_mean     area_mean     
##  Min.   : 6.981   Min.   : 9.71   Min.   : 43.79   Min.   : 143.5  
##  1st Qu.:11.700   1st Qu.:16.17   1st Qu.: 75.17   1st Qu.: 420.3  
##  Median :13.370   Median :18.84   Median : 86.24   Median : 551.1  
##  Mean   :14.127   Mean   :19.29   Mean   : 91.97   Mean   : 654.9  
##  3rd Qu.:15.780   3rd Qu.:21.80   3rd Qu.:104.10   3rd Qu.: 782.7  
##  Max.   :28.110   Max.   :39.28   Max.   :188.50   Max.   :2501.0  
##  smoothness_mean   compactness_mean  concavity_mean    concave.points_mean
##  Min.   :0.05263   Min.   :0.01938   Min.   :0.00000   Min.   :0.00000    
##  1st Qu.:0.08637   1st Qu.:0.06492   1st Qu.:0.02956   1st Qu.:0.02031    
##  Median :0.09587   Median :0.09263   Median :0.06154   Median :0.03350    
##  Mean   :0.09636   Mean   :0.10434   Mean   :0.08880   Mean   :0.04892    
##  3rd Qu.:0.10530   3rd Qu.:0.13040   3rd Qu.:0.13070   3rd Qu.:0.07400    
##  Max.   :0.16340   Max.   :0.34540   Max.   :0.42680   Max.   :0.20120    
##  symmetry_mean    fractal_dimension_mean   radius_se        texture_se    
##  Min.   :0.1060   Min.   :0.04996        Min.   :0.1115   Min.   :0.3602  
##  1st Qu.:0.1619   1st Qu.:0.05770        1st Qu.:0.2324   1st Qu.:0.8339  
##  Median :0.1792   Median :0.06154        Median :0.3242   Median :1.1080  
##  Mean   :0.1812   Mean   :0.06280        Mean   :0.4052   Mean   :1.2169  
##  3rd Qu.:0.1957   3rd Qu.:0.06612        3rd Qu.:0.4789   3rd Qu.:1.4740  
##  Max.   :0.3040   Max.   :0.09744        Max.   :2.8730   Max.   :4.8850  
##   perimeter_se       area_se        smoothness_se      compactness_se    
##  Min.   : 0.757   Min.   :  6.802   Min.   :0.001713   Min.   :0.002252  
##  1st Qu.: 1.606   1st Qu.: 17.850   1st Qu.:0.005169   1st Qu.:0.013080  
##  Median : 2.287   Median : 24.530   Median :0.006380   Median :0.020450  
##  Mean   : 2.866   Mean   : 40.337   Mean   :0.007041   Mean   :0.025478  
##  3rd Qu.: 3.357   3rd Qu.: 45.190   3rd Qu.:0.008146   3rd Qu.:0.032450  
##  Max.   :21.980   Max.   :542.200   Max.   :0.031130   Max.   :0.135400  
##   concavity_se     concave.points_se   symmetry_se      
##  Min.   :0.00000   Min.   :0.000000   Min.   :0.007882  
##  1st Qu.:0.01509   1st Qu.:0.007638   1st Qu.:0.015160  
##  Median :0.02589   Median :0.010930   Median :0.018730  
##  Mean   :0.03189   Mean   :0.011796   Mean   :0.020542  
##  3rd Qu.:0.04205   3rd Qu.:0.014710   3rd Qu.:0.023480  
##  Max.   :0.39600   Max.   :0.052790   Max.   :0.078950  
##  fractal_dimension_se  radius_worst   texture_worst   perimeter_worst 
##  Min.   :0.0008948    Min.   : 7.93   Min.   :12.02   Min.   : 50.41  
##  1st Qu.:0.0022480    1st Qu.:13.01   1st Qu.:21.08   1st Qu.: 84.11  
##  Median :0.0031870    Median :14.97   Median :25.41   Median : 97.66  
##  Mean   :0.0037949    Mean   :16.27   Mean   :25.68   Mean   :107.26  
##  3rd Qu.:0.0045580    3rd Qu.:18.79   3rd Qu.:29.72   3rd Qu.:125.40  
##  Max.   :0.0298400    Max.   :36.04   Max.   :49.54   Max.   :251.20  
##    area_worst     smoothness_worst  compactness_worst concavity_worst 
##  Min.   : 185.2   Min.   :0.07117   Min.   :0.02729   Min.   :0.0000  
##  1st Qu.: 515.3   1st Qu.:0.11660   1st Qu.:0.14720   1st Qu.:0.1145  
##  Median : 686.5   Median :0.13130   Median :0.21190   Median :0.2267  
##  Mean   : 880.6   Mean   :0.13237   Mean   :0.25427   Mean   :0.2722  
##  3rd Qu.:1084.0   3rd Qu.:0.14600   3rd Qu.:0.33910   3rd Qu.:0.3829  
##  Max.   :4254.0   Max.   :0.22260   Max.   :1.05800   Max.   :1.2520  
##  concave.points_worst symmetry_worst   fractal_dimension_worst diag   
##  Min.   :0.00000      Min.   :0.1565   Min.   :0.05504         0:357  
##  1st Qu.:0.06493      1st Qu.:0.2504   1st Qu.:0.07146         1:212  
##  Median :0.09993      Median :0.2822   Median :0.08004                
##  Mean   :0.11461      Mean   :0.2901   Mean   :0.08395                
##  3rd Qu.:0.16140      3rd Qu.:0.3179   3rd Qu.:0.09208                
##  Max.   :0.29100      Max.   :0.6638   Max.   :0.20750
# Train an LVQ model of diag on every other column of brc2, scaling the
# predictors and resampling as defined in `control`.
model <- train(
  diag ~ .,
  data = brc2,
  method = "lvq",
  preProcess = "scale",
  trControl = control
)
# Variable importance of the fitted model, left unscaled.
importance <- varImp(model, scale = FALSE)
# Print the importance summary.
print(importance)
## ROC curve variable importance
## 
##   only 20 most important variables shown (out of 30)
## 
##                      Importance
## perimeter_worst          0.9755
## radius_worst             0.9704
## area_worst               0.9698
## concave.points_worst     0.9667
## concave.points_mean      0.9644
## perimeter_mean           0.9469
## area_mean                0.9383
## concavity_mean           0.9378
## radius_mean              0.9375
## area_se                  0.9264
## concavity_worst          0.9214
## perimeter_se             0.8764
## radius_se                0.8683
## compactness_mean         0.8638
## compactness_worst        0.8623
## concave.points_se        0.7918
## texture_worst            0.7846
## concavity_se             0.7808
## texture_mean             0.7758
## smoothness_worst         0.7541
# Data exported to d3.js (importance with respect to the target variable):
# keep only the X0 column, renamed to "correlacion", and store the variable
# names (the row names) in their own column.
expd3 <- select(importance$importance, correlacion = X0)
expd3$variable <- rownames(expd3)

Representacion de las variables mas importantes respecto a la variable objetivo diag:

# plot importance
plot(importance)

3.6. Selección de variables usando el método RFE

# Fix the RNG seed so the results can be reproduced.
set.seed(7)
# RFE control: random-forest helper functions evaluated with 10-fold
# cross-validation. This reuses (and overwrites) the name `control`.
control <- rfeControl(functions = rfFuncs, method = "cv", number = 10)
# Run RFE with the 30 predictors (columns 1 to 30 of brc2) against the target
# (column 31), trying every subset size from 1 to 30.
results <- rfe(
  x = brc2[, 1:30],
  y = brc2[, 31],
  sizes = 1:30,
  rfeControl = control
)
# Print the resampling summary.
print(results)
## 
## Recursive feature selection
## 
## Outer resampling method: Cross-Validated (10 fold) 
## 
## Resampling performance over subset size:
## 
##  Variables Accuracy  Kappa AccuracySD KappaSD Selected
##          1   0.8633 0.7049    0.05188 0.11050         
##          2   0.9315 0.8530    0.03425 0.07490         
##          3   0.9264 0.8415    0.04934 0.10848         
##          4   0.9402 0.8720    0.02652 0.05534         
##          5   0.9543 0.9033    0.03727 0.07695         
##          6   0.9543 0.9027    0.02654 0.05448         
##          7   0.9578 0.9099    0.02249 0.04713         
##          8   0.9666 0.9284    0.01546 0.03280         
##          9   0.9612 0.9172    0.01862 0.03897         
##         10   0.9648 0.9251    0.02209 0.04646         
##         11   0.9701 0.9360    0.01687 0.03593         
##         12   0.9718 0.9399    0.01907 0.04068        *
##         13   0.9683 0.9325    0.01832 0.03872         
##         14   0.9683 0.9325    0.01832 0.03872         
##         15   0.9666 0.9289    0.02424 0.05114         
##         16   0.9683 0.9325    0.02331 0.04935         
##         17   0.9700 0.9362    0.02228 0.04707         
##         18   0.9665 0.9286    0.01964 0.04134         
##         19   0.9648 0.9251    0.02209 0.04646         
##         20   0.9683 0.9325    0.01832 0.03872         
##         21   0.9683 0.9323    0.02191 0.04635         
##         22   0.9701 0.9360    0.02043 0.04360         
##         23   0.9631 0.9210    0.02107 0.04472         
##         24   0.9630 0.9208    0.01773 0.03711         
##         25   0.9666 0.9287    0.02123 0.04482         
##         26   0.9683 0.9324    0.02026 0.04267         
##         27   0.9684 0.9324    0.02596 0.05528         
##         28   0.9630 0.9209    0.01956 0.04123         
##         29   0.9578 0.9096    0.01704 0.03600         
##         30   0.9631 0.9209    0.01556 0.03234         
## 
## The top 5 variables (out of 12):
##    area_worst, concave.points_worst, perimeter_worst, radius_worst, texture_worst
# list the chosen features
predictors(results)
##  [1] "area_worst"           "concave.points_worst" "perimeter_worst"     
##  [4] "radius_worst"         "texture_worst"        "concave.points_mean" 
##  [7] "area_se"              "texture_mean"         "concavity_worst"     
## [10] "smoothness_worst"     "concavity_mean"       "area_mean"

Representamos los resultados graficamente, precision usando cross validation

# plot the results
plot(results, type=c("g", "o"))

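Del resultado de esta gráfica podemos interpretar que el subconjunto de 12 variables seleccionado por RFE (las más importantes para predecir la variable objetivo) es el punto óptimo para nuestro dataset de trabajo, ya que es el que maximiza la precisión obtenida por validación cruzada.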

3.7.Análisis gráfico de las variables

Eliminamos las variables devueltas por findcorrelation y visualizamos las variables mas correladas:

# Remove the predictors flagged by findCorrelation() and plot the rest.
# highlyCorrelated holds column indexes of correlationMatrix, not of brc
# (brc starts with "diagnosis"), so the columns are dropped by name;
# brc[, -highlyCorrelated] removed "diagnosis" plus the wrong predictors.
droppedVars <- colnames(correlationMatrix)[highlyCorrelated]
# "diagnosis" is left out as well: the numeric "diag" column already carries
# the target, and pairs() needs numeric-convertible columns.
cor.brc <- brc[, setdiff(names(brc), c("diagnosis", droppedVars))]
pairs(cor.brc)

Análisis :perimeter_mean

# Plotting copies of the raw data (brc) and the normalised data (brc_n), each
# with a text label for the tumour type derived from diag.
brc_graph <- brc
brc_graph_n <- brc_n
brc_graph$diagCategorico <- ifelse(brc$diag == 1, "Maligno", "Benigno")
brc_graph_n$diagCategorico <- ifelse(brc_n$diag == 1, "Maligno", "Benigno")
library(ggplot2)
# Histogram of perimeter_mean, one panel per tumour type.
ggplot(brc_graph, aes(x = perimeter_mean)) +
  geom_histogram(binwidth = 1, fill = "green", colour = "black") +
  facet_grid(diagCategorico ~ .) +
  ggtitle("Fig 1.1. Histograma perimeter_mean por tipo de tumor") +
  theme(plot.title = element_text(vjust = 1.5, size = 12))

Análisis: radius_worst

# Histogram of radius_worst, one panel per tumour type.
ggplot(brc_graph, aes(x = radius_worst)) +
  geom_histogram(binwidth = 1, fill = "red", colour = "black") +
  facet_grid(diagCategorico ~ .) +
  ggtitle("Fig 1.2. Histograma radius_worst por tipo de tumor") +
  theme(plot.title = element_text(vjust = 1.5, size = 12))