Completion of the Final Project of the Big Data Master's 2019/2020. We will analyse the selected dataset, "Breast Cancer Wisconsin", to build a Machine Learning model capable of predicting a given patient's diagnosis, that is, whether the tumour is malignant or benign (M = malignant, B = benign). The dataset is available at https://www.kaggle.com/uciml/breast-cancer-wisconsin-data, and the description of the variables and of the target variable (tumour type) is also available at https://archive.ics.uci.edu/ml/datasets/Breast+Cancer+Wisconsin+%28Diagnostic%29; downloading the file wdbc.names there gives the number of observations and the description of each variable.
It is worth noting that the features were obtained through a computational process that analyses images of the cell nuclei of the patients' tumours.
We download the archive and read the file with the features.
# file download
#fileURL <- "https://www.kaggle.com/uciml/breast-cancer-wisconsin-data/download/breast-cancer-wisconsin-data.zip"
fileURL <- "https://query.data.world/s/d3ikssglnxd5hn3257bokisfg2qvtp"
if(! "downloader" %in% installed.packages())
install.packages("downloader")
library(downloader)
## Warning: package 'downloader' was built under R version 3.6.3
download(fileURL,"F:/colmenar/master big data/PFM/datos/breast-cancer-wisconsin-data.zip", mode ="wb")
unzip("F:/colmenar/master big data/PFM/datos/breast-cancer-wisconsin-data.zip", exdir="F:/colmenar/master big data/PFM/datos")
fechaDescarga <- date()
fechaDescarga
## [1] "Thu Aug 06 18:07:47 2020"
con <- file("F:/colmenar/master big data/PFM/datos/data.csv","r")
breastCan <- read.csv(con)
close(con)
kable(head(breastCan[,1:5]))
| id | diagnosis | radius_mean | texture_mean | perimeter_mean |
|---|---|---|---|---|
| 842302 | M | 17.99 | 10.38 | 122.80 |
| 842517 | M | 20.57 | 17.77 | 132.90 |
| 84300903 | M | 19.69 | 21.25 | 130.00 |
| 84348301 | M | 11.42 | 20.38 | 77.58 |
| 84358402 | M | 20.29 | 14.34 | 135.10 |
| 843786 | M | 12.45 | 15.70 | 82.57 |
We will now carry out a series of explorations to get to know the dataset and its characteristics. Observations and variables of the dataset:
dim(breastCan) # observations and variables in the dataset
## [1] 569 33
str(breastCan)
## 'data.frame': 569 obs. of 33 variables:
## $ id : int 842302 842517 84300903 84348301 84358402 843786 844359 84458202 844981 84501001 ...
## $ diagnosis : Factor w/ 2 levels "B","M": 2 2 2 2 2 2 2 2 2 2 ...
## $ radius_mean : num 18 20.6 19.7 11.4 20.3 ...
## $ texture_mean : num 10.4 17.8 21.2 20.4 14.3 ...
## $ perimeter_mean : num 122.8 132.9 130 77.6 135.1 ...
## $ area_mean : num 1001 1326 1203 386 1297 ...
## $ smoothness_mean : num 0.1184 0.0847 0.1096 0.1425 0.1003 ...
## $ compactness_mean : num 0.2776 0.0786 0.1599 0.2839 0.1328 ...
## $ concavity_mean : num 0.3001 0.0869 0.1974 0.2414 0.198 ...
## $ concave.points_mean : num 0.1471 0.0702 0.1279 0.1052 0.1043 ...
## $ symmetry_mean : num 0.242 0.181 0.207 0.26 0.181 ...
## $ fractal_dimension_mean : num 0.0787 0.0567 0.06 0.0974 0.0588 ...
## $ radius_se : num 1.095 0.543 0.746 0.496 0.757 ...
## $ texture_se : num 0.905 0.734 0.787 1.156 0.781 ...
## $ perimeter_se : num 8.59 3.4 4.58 3.44 5.44 ...
## $ area_se : num 153.4 74.1 94 27.2 94.4 ...
## $ smoothness_se : num 0.0064 0.00522 0.00615 0.00911 0.01149 ...
## $ compactness_se : num 0.049 0.0131 0.0401 0.0746 0.0246 ...
## $ concavity_se : num 0.0537 0.0186 0.0383 0.0566 0.0569 ...
## $ concave.points_se : num 0.0159 0.0134 0.0206 0.0187 0.0188 ...
## $ symmetry_se : num 0.03 0.0139 0.0225 0.0596 0.0176 ...
## $ fractal_dimension_se : num 0.00619 0.00353 0.00457 0.00921 0.00511 ...
## $ radius_worst : num 25.4 25 23.6 14.9 22.5 ...
## $ texture_worst : num 17.3 23.4 25.5 26.5 16.7 ...
## $ perimeter_worst : num 184.6 158.8 152.5 98.9 152.2 ...
## $ area_worst : num 2019 1956 1709 568 1575 ...
## $ smoothness_worst : num 0.162 0.124 0.144 0.21 0.137 ...
## $ compactness_worst : num 0.666 0.187 0.424 0.866 0.205 ...
## $ concavity_worst : num 0.712 0.242 0.45 0.687 0.4 ...
## $ concave.points_worst : num 0.265 0.186 0.243 0.258 0.163 ...
## $ symmetry_worst : num 0.46 0.275 0.361 0.664 0.236 ...
## $ fractal_dimension_worst: num 0.1189 0.089 0.0876 0.173 0.0768 ...
## $ X : logi NA NA NA NA NA NA ...
We can see that:

- one factor field holds the target variable
- all the predictor variables are numeric
- one column contains only NA values
Search for observations with missing values:
any(is.na.data.frame(breastCan)) # check for missing values
## [1] TRUE
Convert all column names to lowercase:
names(breastCan) <- tolower(names(breastCan)) # variable names to lowercase
names(breastCan)
## [1] "id" "diagnosis"
## [3] "radius_mean" "texture_mean"
## [5] "perimeter_mean" "area_mean"
## [7] "smoothness_mean" "compactness_mean"
## [9] "concavity_mean" "concave.points_mean"
## [11] "symmetry_mean" "fractal_dimension_mean"
## [13] "radius_se" "texture_se"
## [15] "perimeter_se" "area_se"
## [17] "smoothness_se" "compactness_se"
## [19] "concavity_se" "concave.points_se"
## [21] "symmetry_se" "fractal_dimension_se"
## [23] "radius_worst" "texture_worst"
## [25] "perimeter_worst" "area_worst"
## [27] "smoothness_worst" "compactness_worst"
## [29] "concavity_worst" "concave.points_worst"
## [31] "symmetry_worst" "fractal_dimension_worst"
## [33] "x"
kable(head(breastCan[,1:8])) # first rows of the dataset
| id | diagnosis | radius_mean | texture_mean | perimeter_mean | area_mean | smoothness_mean | compactness_mean |
|---|---|---|---|---|---|---|---|
| 842302 | M | 17.99 | 10.38 | 122.80 | 1001.0 | 0.11840 | 0.27760 |
| 842517 | M | 20.57 | 17.77 | 132.90 | 1326.0 | 0.08474 | 0.07864 |
| 84300903 | M | 19.69 | 21.25 | 130.00 | 1203.0 | 0.10960 | 0.15990 |
| 84348301 | M | 11.42 | 20.38 | 77.58 | 386.1 | 0.14250 | 0.28390 |
| 84358402 | M | 20.29 | 14.34 | 135.10 | 1297.0 | 0.10030 | 0.13280 |
| 843786 | M | 12.45 | 15.70 | 82.57 | 477.1 | 0.12780 | 0.17000 |
We transform the target variable diagnosis into a numeric one by creating a new column "diag", where 1 = malignant and 0 = benign:
breastCan$diag <- ifelse(breastCan$diagnosis =="M", 1, 0)
prop.table( table(breastCan$diag)) # class proportions of the target variable
##
## 0 1
## 0.6274165 0.3725835
We remove the column x, which contains the NA values, and the column id, which contributes nothing:
library(dplyr)
breastCan2 <- select(breastCan, -x,-id)
brc <- breastCan2
brc <- as.data.frame(brc)
any(is.na.data.frame(brc))
## [1] FALSE
str(brc)
## 'data.frame': 569 obs. of 32 variables:
## $ diagnosis : Factor w/ 2 levels "B","M": 2 2 2 2 2 2 2 2 2 2 ...
## $ radius_mean : num 18 20.6 19.7 11.4 20.3 ...
## $ texture_mean : num 10.4 17.8 21.2 20.4 14.3 ...
## $ perimeter_mean : num 122.8 132.9 130 77.6 135.1 ...
## $ area_mean : num 1001 1326 1203 386 1297 ...
## $ smoothness_mean : num 0.1184 0.0847 0.1096 0.1425 0.1003 ...
## $ compactness_mean : num 0.2776 0.0786 0.1599 0.2839 0.1328 ...
## $ concavity_mean : num 0.3001 0.0869 0.1974 0.2414 0.198 ...
## $ concave.points_mean : num 0.1471 0.0702 0.1279 0.1052 0.1043 ...
## $ symmetry_mean : num 0.242 0.181 0.207 0.26 0.181 ...
## $ fractal_dimension_mean : num 0.0787 0.0567 0.06 0.0974 0.0588 ...
## $ radius_se : num 1.095 0.543 0.746 0.496 0.757 ...
## $ texture_se : num 0.905 0.734 0.787 1.156 0.781 ...
## $ perimeter_se : num 8.59 3.4 4.58 3.44 5.44 ...
## $ area_se : num 153.4 74.1 94 27.2 94.4 ...
## $ smoothness_se : num 0.0064 0.00522 0.00615 0.00911 0.01149 ...
## $ compactness_se : num 0.049 0.0131 0.0401 0.0746 0.0246 ...
## $ concavity_se : num 0.0537 0.0186 0.0383 0.0566 0.0569 ...
## $ concave.points_se : num 0.0159 0.0134 0.0206 0.0187 0.0188 ...
## $ symmetry_se : num 0.03 0.0139 0.0225 0.0596 0.0176 ...
## $ fractal_dimension_se : num 0.00619 0.00353 0.00457 0.00921 0.00511 ...
## $ radius_worst : num 25.4 25 23.6 14.9 22.5 ...
## $ texture_worst : num 17.3 23.4 25.5 26.5 16.7 ...
## $ perimeter_worst : num 184.6 158.8 152.5 98.9 152.2 ...
## $ area_worst : num 2019 1956 1709 568 1575 ...
## $ smoothness_worst : num 0.162 0.124 0.144 0.21 0.137 ...
## $ compactness_worst : num 0.666 0.187 0.424 0.866 0.205 ...
## $ concavity_worst : num 0.712 0.242 0.45 0.687 0.4 ...
## $ concave.points_worst : num 0.265 0.186 0.243 0.258 0.163 ...
## $ symmetry_worst : num 0.46 0.275 0.361 0.664 0.236 ...
## $ fractal_dimension_worst: num 0.1189 0.089 0.0876 0.173 0.0768 ...
## $ diag : num 1 1 1 1 1 1 1 1 1 1 ...
We display the remaining variables and their main summary statistics:
summary(brc)
## diagnosis radius_mean texture_mean perimeter_mean
## B:357 Min. : 6.981 Min. : 9.71 Min. : 43.79
## M:212 1st Qu.:11.700 1st Qu.:16.17 1st Qu.: 75.17
## Median :13.370 Median :18.84 Median : 86.24
## Mean :14.127 Mean :19.29 Mean : 91.97
## 3rd Qu.:15.780 3rd Qu.:21.80 3rd Qu.:104.10
## Max. :28.110 Max. :39.28 Max. :188.50
## area_mean smoothness_mean compactness_mean concavity_mean
## Min. : 143.5 Min. :0.05263 Min. :0.01938 Min. :0.00000
## 1st Qu.: 420.3 1st Qu.:0.08637 1st Qu.:0.06492 1st Qu.:0.02956
## Median : 551.1 Median :0.09587 Median :0.09263 Median :0.06154
## Mean : 654.9 Mean :0.09636 Mean :0.10434 Mean :0.08880
## 3rd Qu.: 782.7 3rd Qu.:0.10530 3rd Qu.:0.13040 3rd Qu.:0.13070
## Max. :2501.0 Max. :0.16340 Max. :0.34540 Max. :0.42680
## concave.points_mean symmetry_mean fractal_dimension_mean
## Min. :0.00000 Min. :0.1060 Min. :0.04996
## 1st Qu.:0.02031 1st Qu.:0.1619 1st Qu.:0.05770
## Median :0.03350 Median :0.1792 Median :0.06154
## Mean :0.04892 Mean :0.1812 Mean :0.06280
## 3rd Qu.:0.07400 3rd Qu.:0.1957 3rd Qu.:0.06612
## Max. :0.20120 Max. :0.3040 Max. :0.09744
## radius_se texture_se perimeter_se area_se
## Min. :0.1115 Min. :0.3602 Min. : 0.757 Min. : 6.802
## 1st Qu.:0.2324 1st Qu.:0.8339 1st Qu.: 1.606 1st Qu.: 17.850
## Median :0.3242 Median :1.1080 Median : 2.287 Median : 24.530
## Mean :0.4052 Mean :1.2169 Mean : 2.866 Mean : 40.337
## 3rd Qu.:0.4789 3rd Qu.:1.4740 3rd Qu.: 3.357 3rd Qu.: 45.190
## Max. :2.8730 Max. :4.8850 Max. :21.980 Max. :542.200
## smoothness_se compactness_se concavity_se
## Min. :0.001713 Min. :0.002252 Min. :0.00000
## 1st Qu.:0.005169 1st Qu.:0.013080 1st Qu.:0.01509
## Median :0.006380 Median :0.020450 Median :0.02589
## Mean :0.007041 Mean :0.025478 Mean :0.03189
## 3rd Qu.:0.008146 3rd Qu.:0.032450 3rd Qu.:0.04205
## Max. :0.031130 Max. :0.135400 Max. :0.39600
## concave.points_se symmetry_se fractal_dimension_se
## Min. :0.000000 Min. :0.007882 Min. :0.0008948
## 1st Qu.:0.007638 1st Qu.:0.015160 1st Qu.:0.0022480
## Median :0.010930 Median :0.018730 Median :0.0031870
## Mean :0.011796 Mean :0.020542 Mean :0.0037949
## 3rd Qu.:0.014710 3rd Qu.:0.023480 3rd Qu.:0.0045580
## Max. :0.052790 Max. :0.078950 Max. :0.0298400
## radius_worst texture_worst perimeter_worst area_worst
## Min. : 7.93 Min. :12.02 Min. : 50.41 Min. : 185.2
## 1st Qu.:13.01 1st Qu.:21.08 1st Qu.: 84.11 1st Qu.: 515.3
## Median :14.97 Median :25.41 Median : 97.66 Median : 686.5
## Mean :16.27 Mean :25.68 Mean :107.26 Mean : 880.6
## 3rd Qu.:18.79 3rd Qu.:29.72 3rd Qu.:125.40 3rd Qu.:1084.0
## Max. :36.04 Max. :49.54 Max. :251.20 Max. :4254.0
## smoothness_worst compactness_worst concavity_worst concave.points_worst
## Min. :0.07117 Min. :0.02729 Min. :0.0000 Min. :0.00000
## 1st Qu.:0.11660 1st Qu.:0.14720 1st Qu.:0.1145 1st Qu.:0.06493
## Median :0.13130 Median :0.21190 Median :0.2267 Median :0.09993
## Mean :0.13237 Mean :0.25427 Mean :0.2722 Mean :0.11461
## 3rd Qu.:0.14600 3rd Qu.:0.33910 3rd Qu.:0.3829 3rd Qu.:0.16140
## Max. :0.22260 Max. :1.05800 Max. :1.2520 Max. :0.29100
## symmetry_worst fractal_dimension_worst diag
## Min. :0.1565 Min. :0.05504 Min. :0.0000
## 1st Qu.:0.2504 1st Qu.:0.07146 1st Qu.:0.0000
## Median :0.2822 Median :0.08004 Median :0.0000
## Mean :0.2901 Mean :0.08395 Mean :0.3726
## 3rd Qu.:0.3179 3rd Qu.:0.09208 3rd Qu.:1.0000
## Max. :0.6638 Max. :0.20750 Max. :1.0000
prop.table( table(brc$diag))
##
## 0 1
## 0.6274165 0.3725835
Number of observations and variables:
dim(brc)
## [1] 569 32
The summary shows large differences in scale between variables. We standardize the values (center and scale) so they can be properly represented in the plots:
brc_n <- as.data.frame(lapply(brc[,2:31], scale, center = TRUE, scale = TRUE))
brc_n$diag <- brc$diag
Summary before standardizing:
summary(brc[,c("radius_mean", "area_mean", "smoothness_mean")])
## radius_mean area_mean smoothness_mean
## Min. : 6.981 Min. : 143.5 Min. :0.05263
## 1st Qu.:11.700 1st Qu.: 420.3 1st Qu.:0.08637
## Median :13.370 Median : 551.1 Median :0.09587
## Mean :14.127 Mean : 654.9 Mean :0.09636
## 3rd Qu.:15.780 3rd Qu.: 782.7 3rd Qu.:0.10530
## Max. :28.110 Max. :2501.0 Max. :0.16340
And after standardizing:
summary(brc_n[,c("radius_mean", "area_mean", "smoothness_mean")])
## radius_mean area_mean smoothness_mean
## Min. :-2.0279 Min. :-1.4532 Min. :-3.10935
## 1st Qu.:-0.6888 1st Qu.:-0.6666 1st Qu.:-0.71034
## Median :-0.2149 Median :-0.2949 Median :-0.03486
## Mean : 0.0000 Mean : 0.0000 Mean : 0.00000
## 3rd Qu.: 0.4690 3rd Qu.: 0.3632 3rd Qu.: 0.63564
## Max. : 3.9678 Max. : 5.2459 Max. : 4.76672
We now study the degree of correlation between the variables. The most correlated predictors will not be removed by hand; instead we will let caret do it automatically, using findCorrelation():
Export of the JSON file for d3 (correlation matrix):
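The objects correlacion and matCor used in the next chunks come from code not shown in this report; a minimal sketch of how they could be built (an assumption on our part, using the standardized data frame brc_n, which includes the numeric diag column):
# Assumption: correlation matrix of all numeric columns, including diag
matCor <- cor(brc_n)
correlacion <- as.data.frame(matCor)
# The JSON write itself is not shown either; it could be done with, e.g., jsonlite::write_json()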
expD3Cor <- correlacion %>% select (-diag)
expD3Cor$VariableOrigen <- rownames(expD3Cor)
expD3Cor <- expD3Cor %>% filter(VariableOrigen != "diag")
Plot of the correlation matrix:
col <- colorRampPalette(c("#BB4444", "#EE9988", "#FFFFFF", "#77AADD", "#4477AA"))
par(mfrow=c(1,1))
corrplot(matCor, method = "shade",shade.col = NA, tl.col = "black",
tl.cex = 0.6, number.cex = 0.3, mar=c(0,0,0,0),type = "upper",addCoef.col="black",is.corr=FALSE,
diag = FALSE,order="AOE")
# ensure the results are repeatable
set.seed(7)
# load the library
library(mlbench)
library(caret)
# calculate correlation matrix without Target
correlationMatrix <- cor(brc[,c("radius_mean","texture_mean","perimeter_mean","area_mean","smoothness_mean",
"compactness_mean","concavity_mean","concave.points_mean","symmetry_mean","fractal_dimension_mean",
"radius_se","texture_se","perimeter_se","area_se","smoothness_se","compactness_se","concavity_se",
"concave.points_se","symmetry_se","fractal_dimension_se","radius_worst","texture_worst",
"perimeter_worst","area_worst","smoothness_worst","compactness_worst","concavity_worst",
"concave.points_worst","symmetry_worst","fractal_dimension_worst")])
# summarize the correlation matrix
kable(correlationMatrix[,1:8])
| | radius_mean | texture_mean | perimeter_mean | area_mean | smoothness_mean | compactness_mean | concavity_mean | concave.points_mean |
|---|---|---|---|---|---|---|---|---|
| radius_mean | 1.0000000 | 0.3237819 | 0.9978553 | 0.9873572 | 0.1705812 | 0.5061236 | 0.6767636 | 0.8225285 |
| texture_mean | 0.3237819 | 1.0000000 | 0.3295331 | 0.3210857 | -0.0233885 | 0.2367022 | 0.3024178 | 0.2934641 |
| perimeter_mean | 0.9978553 | 0.3295331 | 1.0000000 | 0.9865068 | 0.2072782 | 0.5569362 | 0.7161357 | 0.8509770 |
| area_mean | 0.9873572 | 0.3210857 | 0.9865068 | 1.0000000 | 0.1770284 | 0.4985017 | 0.6859828 | 0.8232689 |
| smoothness_mean | 0.1705812 | -0.0233885 | 0.2072782 | 0.1770284 | 1.0000000 | 0.6591232 | 0.5219838 | 0.5536952 |
| compactness_mean | 0.5061236 | 0.2367022 | 0.5569362 | 0.4985017 | 0.6591232 | 1.0000000 | 0.8831207 | 0.8311350 |
| concavity_mean | 0.6767636 | 0.3024178 | 0.7161357 | 0.6859828 | 0.5219838 | 0.8831207 | 1.0000000 | 0.9213910 |
| concave.points_mean | 0.8225285 | 0.2934641 | 0.8509770 | 0.8232689 | 0.5536952 | 0.8311350 | 0.9213910 | 1.0000000 |
| symmetry_mean | 0.1477412 | 0.0714010 | 0.1830272 | 0.1512931 | 0.5577748 | 0.6026410 | 0.5006666 | 0.4624974 |
| fractal_dimension_mean | -0.3116308 | -0.0764372 | -0.2614769 | -0.2831098 | 0.5847920 | 0.5653687 | 0.3367834 | 0.1669174 |
| radius_se | 0.6790904 | 0.2758687 | 0.6917650 | 0.7325622 | 0.3014671 | 0.4974734 | 0.6319248 | 0.6980498 |
| texture_se | -0.0973174 | 0.3863576 | -0.0867611 | -0.0662802 | 0.0684064 | 0.0462048 | 0.0762183 | 0.0214796 |
| perimeter_se | 0.6741716 | 0.2816731 | 0.6931349 | 0.7266283 | 0.2960919 | 0.5489053 | 0.6603908 | 0.7106499 |
| area_se | 0.7358637 | 0.2598450 | 0.7449827 | 0.8000859 | 0.2465524 | 0.4556529 | 0.6174268 | 0.6902985 |
| smoothness_se | -0.2226001 | 0.0066138 | -0.2026940 | -0.1667767 | 0.3323754 | 0.1352993 | 0.0985637 | 0.0276533 |
| compactness_se | 0.2060000 | 0.1919746 | 0.2507437 | 0.2125826 | 0.3189433 | 0.7387218 | 0.6702788 | 0.4904242 |
| concavity_se | 0.1942036 | 0.1432931 | 0.2280823 | 0.2076601 | 0.2483957 | 0.5705169 | 0.6912702 | 0.4391671 |
| concave.points_se | 0.3761690 | 0.1638510 | 0.4072169 | 0.3723203 | 0.3806757 | 0.6422619 | 0.6832599 | 0.6156341 |
| symmetry_se | -0.1043209 | 0.0091272 | -0.0816293 | -0.0724966 | 0.2007744 | 0.2299766 | 0.1780092 | 0.0953508 |
| fractal_dimension_se | -0.0426413 | 0.0544575 | -0.0055234 | -0.0198870 | 0.2836067 | 0.5073181 | 0.4493007 | 0.2575837 |
| radius_worst | 0.9695390 | 0.3525729 | 0.9694764 | 0.9627461 | 0.2131201 | 0.5353154 | 0.6882364 | 0.8303176 |
| texture_worst | 0.2970076 | 0.9120446 | 0.3030384 | 0.2874886 | 0.0360718 | 0.2481328 | 0.2998789 | 0.2927517 |
| perimeter_worst | 0.9651365 | 0.3580396 | 0.9703869 | 0.9591196 | 0.2388526 | 0.5902104 | 0.7295649 | 0.8559231 |
| area_worst | 0.9410825 | 0.3435459 | 0.9415498 | 0.9592133 | 0.2067184 | 0.5096038 | 0.6759872 | 0.8096296 |
| smoothness_worst | 0.1196161 | 0.0775034 | 0.1505494 | 0.1235229 | 0.8053242 | 0.5655412 | 0.4488220 | 0.4527531 |
| compactness_worst | 0.4134628 | 0.2778296 | 0.4557742 | 0.3904103 | 0.4724684 | 0.8658090 | 0.7549680 | 0.6674537 |
| concavity_worst | 0.5269115 | 0.3010252 | 0.5638793 | 0.5126059 | 0.4349257 | 0.8162752 | 0.8841026 | 0.7523995 |
| concave.points_worst | 0.7442142 | 0.2953158 | 0.7712408 | 0.7220166 | 0.5030534 | 0.8155732 | 0.8613230 | 0.9101553 |
| symmetry_worst | 0.1639533 | 0.1050079 | 0.1891150 | 0.1435699 | 0.3943095 | 0.5102234 | 0.4094641 | 0.3757441 |
| fractal_dimension_worst | 0.0070659 | 0.1192054 | 0.0510185 | 0.0037376 | 0.4993164 | 0.6873823 | 0.5149299 | 0.3686611 |
Highly correlated variables:
# find attributes that are highly correlated (ideally > 0.75)
highlyCorrelated <- findCorrelation(correlationMatrix, cutoff=0.75)
# print indexes of highly correlated attributes
print(highlyCorrelated)
## [1] 7 8 6 28 27 23 21 3 26 24 1 13 18 16 14 5 10 2
# Data frame with the variables plus a new column flagging removal as a highly correlated predictor
expD3CorFind <-expD3Cor
expD3CorFind$Eliminada <- expD3CorFind$VariableOrigen %in% names(brc[highlyCorrelated])
CorCuantas <- expD3CorFind %>% group_by(Eliminada) %>% summarise(n=n())
CorCuantas
## # A tibble: 2 x 2
## Eliminada n
## <lgl> <int>
## 1 FALSE 13
## 2 TRUE 17
The names of the most correlated variables are (see the note after the output about "diagnosis" appearing here):
print(names(brc[highlyCorrelated]))
## [1] "compactness_mean" "concavity_mean" "smoothness_mean"
## [4] "concavity_worst" "compactness_worst" "texture_worst"
## [7] "fractal_dimension_se" "texture_mean" "smoothness_worst"
## [10] "perimeter_worst" "diagnosis" "texture_se"
## [13] "concavity_se" "smoothness_se" "perimeter_se"
## [16] "area_mean" "symmetry_mean" "radius_mean"
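Note that "diagnosis" appears in this list: highlyCorrelated contains column indices of correlationMatrix, which excludes id and diagnosis, so indexing brc (where diagnosis is column 1) shifts every name one position. A sketch of the aligned lookup:
# The indices refer to correlationMatrix, so take the names from there:
print(colnames(correlationMatrix)[highlyCorrelated])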
# ensure results are repeatable
set.seed(7)
# prepare training scheme
control <- trainControl(method="repeatedcv", number=10, repeats=3)
# train the model
brc2 <- select(brc, -diagnosis)
brc2$diag = factor(brc$diag)
brc2 <- as.data.frame(brc2)
dim(brc2)
## [1] 569 31
summary(brc2)
## radius_mean texture_mean perimeter_mean area_mean
## Min. : 6.981 Min. : 9.71 Min. : 43.79 Min. : 143.5
## 1st Qu.:11.700 1st Qu.:16.17 1st Qu.: 75.17 1st Qu.: 420.3
## Median :13.370 Median :18.84 Median : 86.24 Median : 551.1
## Mean :14.127 Mean :19.29 Mean : 91.97 Mean : 654.9
## 3rd Qu.:15.780 3rd Qu.:21.80 3rd Qu.:104.10 3rd Qu.: 782.7
## Max. :28.110 Max. :39.28 Max. :188.50 Max. :2501.0
## smoothness_mean compactness_mean concavity_mean concave.points_mean
## Min. :0.05263 Min. :0.01938 Min. :0.00000 Min. :0.00000
## 1st Qu.:0.08637 1st Qu.:0.06492 1st Qu.:0.02956 1st Qu.:0.02031
## Median :0.09587 Median :0.09263 Median :0.06154 Median :0.03350
## Mean :0.09636 Mean :0.10434 Mean :0.08880 Mean :0.04892
## 3rd Qu.:0.10530 3rd Qu.:0.13040 3rd Qu.:0.13070 3rd Qu.:0.07400
## Max. :0.16340 Max. :0.34540 Max. :0.42680 Max. :0.20120
## symmetry_mean fractal_dimension_mean radius_se texture_se
## Min. :0.1060 Min. :0.04996 Min. :0.1115 Min. :0.3602
## 1st Qu.:0.1619 1st Qu.:0.05770 1st Qu.:0.2324 1st Qu.:0.8339
## Median :0.1792 Median :0.06154 Median :0.3242 Median :1.1080
## Mean :0.1812 Mean :0.06280 Mean :0.4052 Mean :1.2169
## 3rd Qu.:0.1957 3rd Qu.:0.06612 3rd Qu.:0.4789 3rd Qu.:1.4740
## Max. :0.3040 Max. :0.09744 Max. :2.8730 Max. :4.8850
## perimeter_se area_se smoothness_se compactness_se
## Min. : 0.757 Min. : 6.802 Min. :0.001713 Min. :0.002252
## 1st Qu.: 1.606 1st Qu.: 17.850 1st Qu.:0.005169 1st Qu.:0.013080
## Median : 2.287 Median : 24.530 Median :0.006380 Median :0.020450
## Mean : 2.866 Mean : 40.337 Mean :0.007041 Mean :0.025478
## 3rd Qu.: 3.357 3rd Qu.: 45.190 3rd Qu.:0.008146 3rd Qu.:0.032450
## Max. :21.980 Max. :542.200 Max. :0.031130 Max. :0.135400
## concavity_se concave.points_se symmetry_se
## Min. :0.00000 Min. :0.000000 Min. :0.007882
## 1st Qu.:0.01509 1st Qu.:0.007638 1st Qu.:0.015160
## Median :0.02589 Median :0.010930 Median :0.018730
## Mean :0.03189 Mean :0.011796 Mean :0.020542
## 3rd Qu.:0.04205 3rd Qu.:0.014710 3rd Qu.:0.023480
## Max. :0.39600 Max. :0.052790 Max. :0.078950
## fractal_dimension_se radius_worst texture_worst perimeter_worst
## Min. :0.0008948 Min. : 7.93 Min. :12.02 Min. : 50.41
## 1st Qu.:0.0022480 1st Qu.:13.01 1st Qu.:21.08 1st Qu.: 84.11
## Median :0.0031870 Median :14.97 Median :25.41 Median : 97.66
## Mean :0.0037949 Mean :16.27 Mean :25.68 Mean :107.26
## 3rd Qu.:0.0045580 3rd Qu.:18.79 3rd Qu.:29.72 3rd Qu.:125.40
## Max. :0.0298400 Max. :36.04 Max. :49.54 Max. :251.20
## area_worst smoothness_worst compactness_worst concavity_worst
## Min. : 185.2 Min. :0.07117 Min. :0.02729 Min. :0.0000
## 1st Qu.: 515.3 1st Qu.:0.11660 1st Qu.:0.14720 1st Qu.:0.1145
## Median : 686.5 Median :0.13130 Median :0.21190 Median :0.2267
## Mean : 880.6 Mean :0.13237 Mean :0.25427 Mean :0.2722
## 3rd Qu.:1084.0 3rd Qu.:0.14600 3rd Qu.:0.33910 3rd Qu.:0.3829
## Max. :4254.0 Max. :0.22260 Max. :1.05800 Max. :1.2520
## concave.points_worst symmetry_worst fractal_dimension_worst diag
## Min. :0.00000 Min. :0.1565 Min. :0.05504 0:357
## 1st Qu.:0.06493 1st Qu.:0.2504 1st Qu.:0.07146 1:212
## Median :0.09993 Median :0.2822 Median :0.08004
## Mean :0.11461 Mean :0.2901 Mean :0.08395
## 3rd Qu.:0.16140 3rd Qu.:0.3179 3rd Qu.:0.09208
## Max. :0.29100 Max. :0.6638 Max. :0.20750
model <- train(diag ~ ., data=brc2, method="lvq", preProcess="scale", trControl=control)
# estimate variable importance
importance <- varImp(model, scale=FALSE)
# summarize importance
print(importance)
## ROC curve variable importance
##
## only 20 most important variables shown (out of 30)
##
## Importance
## perimeter_worst 0.9755
## radius_worst 0.9704
## area_worst 0.9698
## concave.points_worst 0.9667
## concave.points_mean 0.9644
## perimeter_mean 0.9469
## area_mean 0.9383
## concavity_mean 0.9378
## radius_mean 0.9375
## area_se 0.9264
## concavity_worst 0.9214
## perimeter_se 0.8764
## radius_se 0.8683
## compactness_mean 0.8638
## compactness_worst 0.8623
## concave.points_se 0.7918
## texture_worst 0.7846
## concavity_se 0.7808
## texture_mean 0.7758
## smoothness_worst 0.7541
# Data to export for d3.js (importance with respect to the target variable)
expd3 <-importance$importance %>% select (correlacion = X0)
expd3$variable <- rownames(expd3)
Plot of the most important variables with respect to the target variable diag:
# plot importance
plot(importance)
## 3.6. Variable selection using the RFE method
# ensure the results are repeatable
set.seed(7)
# define the control using a random forest selection function
control <- rfeControl(functions=rfFuncs, method="cv", number=10)
# run the RFE algorithm
results <- rfe(brc2[,1:30], brc2[,31], sizes=c(1:30), rfeControl=control)
# summarize the results
print(results)
##
## Recursive feature selection
##
## Outer resampling method: Cross-Validated (10 fold)
##
## Resampling performance over subset size:
##
## Variables Accuracy Kappa AccuracySD KappaSD Selected
## 1 0.8633 0.7049 0.05188 0.11050
## 2 0.9315 0.8530 0.03425 0.07490
## 3 0.9264 0.8415 0.04934 0.10848
## 4 0.9402 0.8720 0.02652 0.05534
## 5 0.9543 0.9033 0.03727 0.07695
## 6 0.9543 0.9027 0.02654 0.05448
## 7 0.9578 0.9099 0.02249 0.04713
## 8 0.9666 0.9284 0.01546 0.03280
## 9 0.9612 0.9172 0.01862 0.03897
## 10 0.9648 0.9251 0.02209 0.04646
## 11 0.9701 0.9360 0.01687 0.03593
## 12 0.9718 0.9399 0.01907 0.04068 *
## 13 0.9683 0.9325 0.01832 0.03872
## 14 0.9683 0.9325 0.01832 0.03872
## 15 0.9666 0.9289 0.02424 0.05114
## 16 0.9683 0.9325 0.02331 0.04935
## 17 0.9700 0.9362 0.02228 0.04707
## 18 0.9665 0.9286 0.01964 0.04134
## 19 0.9648 0.9251 0.02209 0.04646
## 20 0.9683 0.9325 0.01832 0.03872
## 21 0.9683 0.9323 0.02191 0.04635
## 22 0.9701 0.9360 0.02043 0.04360
## 23 0.9631 0.9210 0.02107 0.04472
## 24 0.9630 0.9208 0.01773 0.03711
## 25 0.9666 0.9287 0.02123 0.04482
## 26 0.9683 0.9324 0.02026 0.04267
## 27 0.9684 0.9324 0.02596 0.05528
## 28 0.9630 0.9209 0.01956 0.04123
## 29 0.9578 0.9096 0.01704 0.03600
## 30 0.9631 0.9209 0.01556 0.03234
##
## The top 5 variables (out of 12):
## area_worst, concave.points_worst, perimeter_worst, radius_worst, texture_worst
# list the chosen features
predictors(results)
## [1] "area_worst" "concave.points_worst" "perimeter_worst"
## [4] "radius_worst" "texture_worst" "concave.points_mean"
## [7] "area_se" "texture_mean" "concavity_worst"
## [10] "smoothness_worst" "concavity_mean" "area_mean"
We plot the results: accuracy estimated with cross-validation.
# plot the results
plot(results, type=c("g", "o"))
From this plot we can conclude that a 12-variable subset, the variables most related to the target, is the optimal point for our working dataset.
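This can also be read directly from the rfe object, which stores the chosen subset size and variables (a minimal sketch):
results$optsize # optimal number of variables (12 here)
results$optVariables # variables in the optimal subset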
We drop the variables indexed by findCorrelation() (the same one-column offset noted above applies here) and visualize the remaining variables with a scatterplot matrix:
cor.brc <- brc[,-highlyCorrelated]
pairs(cor.brc)
Analysis: perimeter_mean
brc_graph = brc
# standardized distribution
brc_graph_n = brc_n
brc_graph$diagCategorico <- ifelse(brc$diag == 1, "Maligno", "Benigno")
brc_graph_n$diagCategorico <- ifelse(brc_n$diag == 1, "Maligno", "Benigno")
library(ggplot2)
ggplot(brc_graph, aes(x = perimeter_mean)) +
  geom_histogram(binwidth = 1, fill = "green", colour = "black") +
  facet_grid(diagCategorico ~ .) +
  ggtitle("Fig 1.1. Histogram of perimeter_mean by tumour type") +
  theme(plot.title = element_text(vjust = +1.5, size = 12))
Analysis: radius_worst
ggplot(brc_graph, aes(x = radius_worst)) +
  geom_histogram(binwidth = 1, fill = "red", colour = "black") +
  facet_grid(diagCategorico ~ .) +
  ggtitle("Fig 1.2. Histogram of radius_worst by tumour type") +
  theme(plot.title = element_text(vjust = +1.5, size = 12))
Analysis: area_worst
Analysis of the variables using boxplots on the standardized data:
Analysis of the 3 main variables using scatter plots: perimeter_worst, radius_worst and area_worst against the outcome (malignant or benign).
## `geom_smooth()` using formula 'y ~ x'
Analysis of the variables concave.points_worst, concave.points_mean and perimeter_mean:
## `geom_smooth()` using formula 'y ~ x'
Analysis of the variables area_mean, concavity_mean and radius_mean:
## `geom_smooth()` using formula 'y ~ x'
We will now use an unsupervised clustering algorithm, specifically k-means.
Loading the required libraries:
if(! "dplyr" %in% installed.packages()) install.packages("dplyr", depend = TRUE)
if(! "plotrix" %in% installed.packages()) install.packages("plotrix", depend = TRUE)
if(! "knitr" %in% installed.packages()) install.packages("knitr", depend = TRUE)
library(knitr)
library(dplyr)
library("plotrix")
We drop the diag variable, so it does not condition the result, together with its categorical counterpart diagnosis:
brc.mod <- brc %>% select(-diagnosis,-diag)
head(brc.mod)
## radius_mean texture_mean perimeter_mean area_mean smoothness_mean
## 1 17.99 10.38 122.80 1001.0 0.11840
## 2 20.57 17.77 132.90 1326.0 0.08474
## 3 19.69 21.25 130.00 1203.0 0.10960
## 4 11.42 20.38 77.58 386.1 0.14250
## 5 20.29 14.34 135.10 1297.0 0.10030
## 6 12.45 15.70 82.57 477.1 0.12780
## compactness_mean concavity_mean concave.points_mean symmetry_mean
## 1 0.27760 0.3001 0.14710 0.2419
## 2 0.07864 0.0869 0.07017 0.1812
## 3 0.15990 0.1974 0.12790 0.2069
## 4 0.28390 0.2414 0.10520 0.2597
## 5 0.13280 0.1980 0.10430 0.1809
## 6 0.17000 0.1578 0.08089 0.2087
## fractal_dimension_mean radius_se texture_se perimeter_se area_se
## 1 0.07871 1.0950 0.9053 8.589 153.40
## 2 0.05667 0.5435 0.7339 3.398 74.08
## 3 0.05999 0.7456 0.7869 4.585 94.03
## 4 0.09744 0.4956 1.1560 3.445 27.23
## 5 0.05883 0.7572 0.7813 5.438 94.44
## 6 0.07613 0.3345 0.8902 2.217 27.19
## smoothness_se compactness_se concavity_se concave.points_se symmetry_se
## 1 0.006399 0.04904 0.05373 0.01587 0.03003
## 2 0.005225 0.01308 0.01860 0.01340 0.01389
## 3 0.006150 0.04006 0.03832 0.02058 0.02250
## 4 0.009110 0.07458 0.05661 0.01867 0.05963
## 5 0.011490 0.02461 0.05688 0.01885 0.01756
## 6 0.007510 0.03345 0.03672 0.01137 0.02165
## fractal_dimension_se radius_worst texture_worst perimeter_worst
## 1 0.006193 25.38 17.33 184.60
## 2 0.003532 24.99 23.41 158.80
## 3 0.004571 23.57 25.53 152.50
## 4 0.009208 14.91 26.50 98.87
## 5 0.005115 22.54 16.67 152.20
## 6 0.005082 15.47 23.75 103.40
## area_worst smoothness_worst compactness_worst concavity_worst
## 1 2019.0 0.1622 0.6656 0.7119
## 2 1956.0 0.1238 0.1866 0.2416
## 3 1709.0 0.1444 0.4245 0.4504
## 4 567.7 0.2098 0.8663 0.6869
## 5 1575.0 0.1374 0.2050 0.4000
## 6 741.6 0.1791 0.5249 0.5355
## concave.points_worst symmetry_worst fractal_dimension_worst
## 1 0.2654 0.4601 0.11890
## 2 0.1860 0.2750 0.08902
## 3 0.2430 0.3613 0.08758
## 4 0.2575 0.6638 0.17300
## 5 0.1625 0.2364 0.07678
## 6 0.1741 0.3985 0.12440
We apply k-means clustering, which is possible because all the variables are numeric. This method requires choosing the final number of clusters (k) before the analysis starts, but the elbow method can suggest a suitable value.
mydata <- brc.mod
wss <- (nrow(mydata)-1)*sum(apply(mydata,2,var))
for (i in 2:15) wss[i] <- sum(kmeans(mydata, centers=i)$withinss)
plot(1:15, wss, type="b", xlab="Number of clusters",
     ylab="Within-groups sum of squares",
     main="Optimal number of clusters (elbow method)", pch=20, cex=2)
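Since kmeans() starts from random centroids, the elbow curve can vary slightly between runs; a sketch (an addition of ours, not in the original chunk) that fixes a seed and uses several random starts for stability:
set.seed(1234) # fixed seed so the random starts are reproducible
wss <- (nrow(mydata)-1)*sum(apply(mydata,2,var))
for (i in 2:15) wss[i] <- sum(kmeans(mydata, centers=i, nstart=25)$withinss)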
The plot shows the within-cluster variation (sum of squares) as k grows. From the fifth cluster onwards the decrease is very small, so the optimal number is k = 5. We therefore apply kmeans() with k = 5:
set.seed(1234)
kmeans.clust <- kmeans(brc.mod, 5)
kmeans.clust
## K-means clustering with 5 clusters of sizes 13, 71, 179, 237, 69
##
## Cluster means:
## radius_mean texture_mean perimeter_mean area_mean smoothness_mean
## 1 24.23154 22.70000 161.17692 1856.3846 0.10144692
## 2 19.77141 21.77535 131.01972 1219.6845 0.10220620
## 3 13.99190 19.09011 90.92447 605.5073 0.09456106
## 4 11.16257 17.91797 71.58143 386.5038 0.09453257
## 5 16.95029 21.31841 111.48406 897.3058 0.10033188
## compactness_mean concavity_mean concave.points_mean symmetry_mean
## 1 0.16121692 0.22780000 0.13527308 0.1811692
## 2 0.15657789 0.19054817 0.10624239 0.1951239
## 3 0.10319240 0.07835383 0.04323582 0.1777648
## 4 0.07916945 0.04455745 0.02288237 0.1777637
## 5 0.12931261 0.13697159 0.07783913 0.1872783
## fractal_dimension_mean radius_se texture_se perimeter_se area_se
## 1 0.05801385 1.2486615 1.136354 8.910462 209.63385
## 2 0.06099930 0.7615394 1.197383 5.339296 96.18930
## 3 0.06191458 0.3170950 1.054665 2.274385 27.40503
## 4 0.06479494 0.2841013 1.339365 1.993919 19.39548
## 5 0.06097971 0.5238986 1.252006 3.712870 56.44768
## smoothness_se compactness_se concavity_se concave.points_se symmetry_se
## 1 0.006421923 0.02954923 0.03996231 0.01564615 0.01867154
## 2 0.006615197 0.03435310 0.04629620 0.01617515 0.02106549
## 3 0.005907911 0.02450365 0.02916068 0.01113232 0.01844364
## 4 0.008129451 0.02231621 0.02756094 0.01001733 0.02251468
## 5 0.006796478 0.02896748 0.03752580 0.01439672 0.01902604
## fractal_dimension_se radius_worst texture_worst perimeter_worst
## 1 0.003439692 31.01615 30.25692 208.67692
## 2 0.004034986 24.37676 29.00423 163.29014
## 3 0.003476408 15.79274 25.46603 104.26905
## 4 0.003954030 12.28611 23.65616 79.62688
## 5 0.003894464 20.06522 28.88072 133.18116
## area_worst smoothness_worst compactness_worst concavity_worst
## 1 2971.6923 0.1384923 0.3607000 0.4676769
## 2 1819.8451 0.1416254 0.3857817 0.4897634
## 3 766.5888 0.1292507 0.2722225 0.2796540
## 4 465.8270 0.1290738 0.1769841 0.1581117
## 5 1240.4406 0.1410954 0.3177414 0.3839377
## concave.points_worst symmetry_worst fractal_dimension_worst
## 1 0.22912308 0.2826154 0.08088231
## 2 0.20161408 0.3214901 0.08826915
## 3 0.11479972 0.2910056 0.08488542
## 4 0.06789541 0.2750122 0.08163084
## 5 0.16344029 0.3084826 0.08558826
##
## Clustering vector:
## [1] 2 2 2 4 2 3 2 3 3 3 5 5 5 3 3 3 5 5 2 3 3 4 3 1 2 5 3 5 5 5 2 3 5 2 5
## [36] 5 3 4 3 3 3 4 2 3 3 2 4 3 4 3 4 3 4 5 5 4 2 3 3 4 4 4 3 4 3 3 4 4 4 4
## [71] 2 4 2 3 4 5 3 2 2 3 4 3 1 5 4 2 3 2 4 3 3 3 3 3 3 2 4 4 4 3 3 4 4 4 4
## [106] 3 4 4 2 4 4 4 3 4 4 4 4 5 5 5 4 2 2 3 3 3 3 2 3 2 4 5 5 3 2 3 4 4 3 4
## [141] 4 5 4 3 4 4 4 3 3 3 3 4 4 4 3 4 5 5 4 4 4 2 2 4 1 3 4 5 2 3 4 3 5 4 4
## [176] 4 4 5 3 4 1 2 5 4 3 4 5 4 4 4 3 4 4 3 3 4 3 5 2 3 4 5 2 5 3 3 4 5 3 3
## [211] 2 4 1 5 3 3 4 4 2 2 3 3 4 5 3 3 4 3 3 3 5 4 4 2 4 3 1 2 3 5 3 4 4 3 5
## [246] 4 3 3 4 4 2 4 2 5 2 3 2 3 5 5 2 5 5 3 5 1 4 3 4 4 3 4 2 4 5 4 4 5 3 3
## [281] 2 4 2 5 4 4 4 4 4 4 3 3 4 4 4 3 4 4 3 4 2 4 2 4 4 4 3 4 3 3 4 3 3 4 4
## [316] 4 4 5 4 4 4 2 4 2 4 4 3 4 5 5 5 3 4 4 4 5 4 2 4 1 3 4 4 2 4 4 4 3 4 4
## [351] 4 3 1 5 4 4 4 3 4 4 4 3 4 5 3 2 2 4 1 2 5 3 2 2 3 3 4 3 3 4 4 4 4 4 3
## [386] 3 4 3 4 5 4 4 5 2 4 3 3 4 4 4 5 4 3 4 4 4 3 3 5 4 4 4 4 3 3 4 4 2 4 4
## [421] 4 3 4 3 4 4 4 4 4 4 3 4 2 2 3 3 3 3 3 3 4 5 3 4 5 4 5 3 3 2 4 5 4 3 3
## [456] 3 4 3 3 4 2 1 3 4 3 3 3 4 5 4 4 4 3 4 4 4 3 3 4 3 4 3 3 3 3 4 3 2 4 5
## [491] 4 5 5 4 3 3 3 4 2 2 3 3 4 1 4 4 4 4 3 3 4 3 3 3 3 4 5 2 3 3 4 1 4 3 4
## [526] 4 3 4 3 4 4 4 3 2 4 2 3 4 4 4 4 3 3 3 3 3 4 4 4 4 4 4 4 4 4 4 4 4 3 4
## [561] 3 4 3 2 2 2 5 2 4
##
## Within cluster sum of squares by cluster:
## [1] 4886284 6023143 3159520 4037338 2428885
## (between_SS / total_SS = 92.0 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss"
## [5] "tot.withinss" "betweenss" "size" "iter"
## [9] "ifault"
We compare the clustering result against the original dataset to see whether there is any relationship.
table(brc$diagnosis, kmeans.clust$cluster)
##
## 1 2 3 4 5
## B 0 0 122 232 3
## M 13 71 57 5 66
From this result we can draw the following conclusions:

- Clusters 1, 2 and 5 are clearly malignant.
- Cluster 4 is clearly benign.
- Cluster 3 mixes both types, with benign the more likely outcome (roughly two thirds of its cases).

That 5 clusters emerge instead of 2 may indicate that, within the two categories analysed, different tumour subtypes could be distinguished.
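The benign share within each cluster can be computed directly from the contingency table (a minimal sketch):
# Column-wise proportions: share of B and M within each cluster
prop.table(table(brc$diagnosis, kmeans.clust$cluster), margin = 2)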
Graphically, we show the centroids over the two most correlated variables:
plot(brc.mod %>% select(perimeter_worst,radius_worst), col = kmeans.clust$cluster)
points(as.data.frame(kmeans.clust$centers) %>% select(perimeter_worst,radius_worst), col = 1:5, pch = 8, cex = 2) # one colour per centroid, matching the 5 clusters
We can even plot the 8 main variables against one another:
kable(head(brc.mod %>% select(perimeter_worst,radius_worst,area_worst,concave.points_worst,concave.points_mean,perimeter_mean,area_mean,concavity_mean)))
| perimeter_worst | radius_worst | area_worst | concave.points_worst | concave.points_mean | perimeter_mean | area_mean | concavity_mean |
|---|---|---|---|---|---|---|---|
| 184.60 | 25.38 | 2019.0 | 0.2654 | 0.14710 | 122.80 | 1001.0 | 0.3001 |
| 158.80 | 24.99 | 1956.0 | 0.1860 | 0.07017 | 132.90 | 1326.0 | 0.0869 |
| 152.50 | 23.57 | 1709.0 | 0.2430 | 0.12790 | 130.00 | 1203.0 | 0.1974 |
| 98.87 | 14.91 | 567.7 | 0.2575 | 0.10520 | 77.58 | 386.1 | 0.2414 |
| 152.20 | 22.54 | 1575.0 | 0.1625 | 0.10430 | 135.10 | 1297.0 | 0.1980 |
| 103.40 | 15.47 | 741.6 | 0.1741 | 0.08089 | 82.57 | 477.1 | 0.1578 |
plot(brc.mod %>% select(perimeter_worst,radius_worst,area_worst,concave.points_worst,concave.points_mean,perimeter_mean), col=kmeans.clust$cluster)
Finally, we draw a spider (radar) chart to represent the clusters (here, the centroid of cluster 1). Note that the centers are on the raw data scale, so many values exceed radial.lim = c(0, 8); standardizing the centers first would make the chart more legible.
library(plotrix)
radial.plot(kmeans.clust$centers[1,], labels=names(kmeans.clust$centers[1,]), rp.type="s", radial.lim=c(0,8), point.symbols=16, point.col="red", mar = c(0,0,0,0))
We create the final dataset: the original one plus a column with the assigned cluster:
brc_kmeans <- brc %>% mutate(cluster_id = kmeans.clust$cluster)
kable(head(brc_kmeans[,27:32]))
| compactness_worst | concavity_worst | concave.points_worst | symmetry_worst | fractal_dimension_worst | diag |
|---|---|---|---|---|---|
| 0.6656 | 0.7119 | 0.2654 | 0.4601 | 0.11890 | 1 |
| 0.1866 | 0.2416 | 0.1860 | 0.2750 | 0.08902 | 1 |
| 0.4245 | 0.4504 | 0.2430 | 0.3613 | 0.08758 | 1 |
| 0.8663 | 0.6869 | 0.2575 | 0.6638 | 0.17300 | 1 |
| 0.2050 | 0.4000 | 0.1625 | 0.2364 | 0.07678 | 1 |
| 0.5249 | 0.5355 | 0.1741 | 0.3985 | 0.12440 | 1 |
This dataset, brc_kmeans, is the one we will export for a richer graphical analysis in Tableau.
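The export step itself is not shown in this report; a minimal sketch (the file name is an assumption):
# Assumption: write the enriched dataset to CSV for Tableau
write.csv(brc_kmeans, "brc_kmeans.csv", row.names = FALSE)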
We load the libraries needed to run the example with caret:
if(! "mlbench" %in% installed.packages()) install.packages("mlbench", depend = TRUE)
if(! "caret" %in% installed.packages()) install.packages("caret", depend = TRUE)
if(! "ROCR" %in% installed.packages()) install.packages("ROCR", depend = TRUE)
if(! "dplyr" %in% installed.packages()) install.packages("dplyr", depend = TRUE)
if(! "corrplot" %in% installed.packages()) install.packages("corrplot", depend = TRUE)
if(! "e1071" %in% installed.packages()) install.packages("e1071", depend = TRUE)
if(! "gbm" %in% installed.packages()) install.packages("gbm", depend = TRUE)
library(mlbench)
library(caret)
library(dplyr)
library(ROCR)
library(corrplot)
library(e1071)
library(gbm)
We need the target variable to be a factor:
brc <- brc %>% select(-diagnosis)
brc$diag <- ifelse(brc$diag == 1, 'Maligno', 'Benigno')
brc$diag <- factor(brc$diag,levels = c('Maligno', 'Benigno') )
prop.table( table(brc$diag))
##
## Maligno Benigno
## 0.3725835 0.6274165
We create the training and test datasets with the caret function createDataPartition(). These partitions will be used for the 6 proposed models:
### Create the train and test partitions for the data
set.seed(3456)
index.brc <- createDataPartition(brc$diag, p=0.8, list=F)
train.brc <- brc[index.brc,]
test.brc <- brc[ -index.brc, ]
We check that the class proportions in the training and test sets remain similar to those of the original dataset:
prop.table( table(train.brc$diag))
##
## Maligno Benigno
## 0.372807 0.627193
prop.table( table(test.brc$diag))
##
## Maligno Benigno
## 0.3716814 0.6283186
The proportions are indeed preserved.
We begin the data preprocessing. Using nearZeroVar(), we look for variables with near-zero variance (almost constant), since they contribute nothing to the classifier, always excluding the target variable.
zero.var.train.brc <- nearZeroVar( train.brc[,-dim(train.brc)[2]], saveMetrics=F )
if (length(zero.var.train.brc) > 0) {
# Drop the near-zero-variance columns, if there are any
train.brc.nz <- train.brc[,-zero.var.train.brc]
test.brc.nz <- test.brc[,-zero.var.train.brc]
} else {
train.brc.nz <- train.brc
test.brc.nz <- test.brc
}
In this case there is no variable with near-zero variance.
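To inspect the variance diagnostics directly, nearZeroVar() can return the full metrics table (a minimal sketch):
# freqRatio, percentUnique and the zeroVar/nzv flags for each predictor
nearZeroVar(train.brc[,-dim(train.brc)[2]], saveMetrics = TRUE)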
Now we look for strongly correlated variables with cor() and findCorrelation(). Although we already did this in the analysis phase, this time we actually modify the data:
cor.train.brc.matrix <- cor( train.brc.nz[, -dim(train.brc.nz)[2]] )
# Select the indices of the columns with correlation above 0.8
cor.train.brc.index <- findCorrelation( cor.train.brc.matrix, 0.80 )
nombresEliminadas <- colnames(train.brc.nz[,cor.train.brc.index])
cor.train.brc <- train.brc.nz[,-cor.train.brc.index]
print("Train:")
## [1] "Train:"
dim(cor.train.brc)
## [1] 456 15
cor.test.brc <- test.brc.nz[,-cor.train.brc.index]
print("Test:")
## [1] "Test:"
dim(cor.test.brc)
## [1] 113 15
The following variables have been removed:
## [1] "concavity_mean" "concave.points_mean" "compactness_mean"
## [4] "concave.points_worst" "concavity_worst" "perimeter_worst"
## [7] "radius_worst" "perimeter_mean" "area_worst"
## [10] "compactness_worst" "radius_mean" "perimeter_se"
## [13] "area_se" "compactness_se" "smoothness_mean"
## [16] "texture_mean"
We center and scale the variables with preProcess() to reduce the spread:
xTrans.brc <- preProcess(cor.train.brc[, -dim(cor.train.brc)[2]], method= c("center", "scale"))
train.brc.prep <- predict( xTrans.brc, cor.train.brc[,-dim(cor.train.brc)[2]])
train.brc.prep$diag <- cor.train.brc$diag
test.brc.prep <- predict( xTrans.brc, cor.test.brc[,-dim(cor.test.brc)[2]]) # predict() applies the centering/scaling learned on train
test.brc.prep$diag <- cor.test.brc$diag
summary(cor.train.brc)
## area_mean symmetry_mean fractal_dimension_mean radius_se
## Min. : 143.5 Min. :0.1060 Min. :0.04996 Min. :0.1115
## 1st Qu.: 423.4 1st Qu.:0.1619 1st Qu.:0.05753 1st Qu.:0.2343
## Median : 545.6 Median :0.1792 Median :0.06155 Median :0.3258
## Mean : 666.2 Mean :0.1808 Mean :0.06270 Mean :0.4106
## 3rd Qu.: 801.3 3rd Qu.:0.1953 3rd Qu.:0.06612 3rd Qu.:0.4965
## Max. :2501.0 Max. :0.3040 Max. :0.09744 Max. :2.8730
## texture_se smoothness_se concavity_se concave.points_se
## Min. :0.3602 Min. :0.002667 Min. :0.00000 Min. :0.000000
## 1st Qu.:0.8329 1st Qu.:0.005216 1st Qu.:0.01513 1st Qu.:0.007589
## Median :1.1410 Median :0.006363 Median :0.02599 Median :0.010905
## Mean :1.2210 Mean :0.007015 Mean :0.03138 Mean :0.011761
## 3rd Qu.:1.4765 3rd Qu.:0.008151 3rd Qu.:0.04109 3rd Qu.:0.014833
## Max. :3.8960 Max. :0.023330 Max. :0.39600 Max. :0.052790
## symmetry_se fractal_dimension_se texture_worst smoothness_worst
## Min. :0.007882 Min. :0.0009683 Min. :12.02 Min. :0.08125
## 1st Qu.:0.015022 1st Qu.:0.0022265 1st Qu.:21.07 1st Qu.:0.11675
## Median :0.018695 Median :0.0031615 Median :25.50 Median :0.13125
## Mean :0.020473 Mean :0.0037996 Mean :25.74 Mean :0.13221
## 3rd Qu.:0.023733 3rd Qu.:0.0044800 3rd Qu.:29.90 3rd Qu.:0.14600
## Max. :0.061460 Max. :0.0298400 Max. :49.54 Max. :0.20980
## symmetry_worst fractal_dimension_worst diag
## Min. :0.1565 Min. :0.05504 Maligno:170
## 1st Qu.:0.2509 1st Qu.:0.07196 Benigno:286
## Median :0.2827 Median :0.08002
## Mean :0.2899 Mean :0.08364
## 3rd Qu.:0.3175 3rd Qu.:0.09160
## Max. :0.6638 Max. :0.20750
summary(train.brc.prep)
## area_mean symmetry_mean fractal_dimension_mean
## Min. :-1.4228 Min. :-2.76263 Min. :-1.7960
## 1st Qu.:-0.6608 1st Qu.:-0.69676 1st Qu.:-0.7291
## Median :-0.3280 Median :-0.05556 Median :-0.1625
## Mean : 0.0000 Mean : 0.00000 Mean : 0.0000
## 3rd Qu.: 0.3678 3rd Qu.: 0.53760 3rd Qu.: 0.4817
## Max. : 4.9950 Max. : 4.55479 Max. : 4.8961
## radius_se texture_se smoothness_se concavity_se
## Min. :-1.0419 Min. :-1.5917 Min. :-1.5163 Min. :-1.0981
## 1st Qu.:-0.6142 1st Qu.:-0.7175 1st Qu.:-0.6276 1st Qu.:-0.5686
## Median :-0.2953 Median :-0.1479 Median :-0.2277 Median :-0.1887
## Mean : 0.0000 Mean : 0.0000 Mean : 0.0000 Mean : 0.0000
## 3rd Qu.: 0.2995 3rd Qu.: 0.4725 3rd Qu.: 0.3960 3rd Qu.: 0.3399
## Max. : 8.5792 Max. : 4.9466 Max. : 5.6891 Max. :12.7607
## concave.points_se symmetry_se fractal_dimension_se
## Min. :-1.9284 Min. :-1.5626 Min. :-1.0274
## 1st Qu.:-0.6840 1st Qu.:-0.6764 1st Qu.:-0.5708
## Median :-0.1403 Median :-0.2206 Median :-0.2315
## Mean : 0.0000 Mean : 0.0000 Mean : 0.0000
## 3rd Qu.: 0.5037 3rd Qu.: 0.4046 3rd Qu.: 0.2469
## Max. : 6.7274 Max. : 5.0868 Max. : 9.4491
## texture_worst smoothness_worst symmetry_worst
## Min. :-2.21664 Min. :-2.29539 Min. :-2.1744
## 1st Qu.:-0.75338 1st Qu.:-0.69643 1st Qu.:-0.6367
## Median :-0.03751 Median :-0.04333 Median :-0.1184
## Mean : 0.00000 Mean : 0.00000 Mean : 0.0000
## 3rd Qu.: 0.67311 3rd Qu.: 0.62103 3rd Qu.: 0.4500
## Max. : 3.84647 Max. : 3.49466 Max. : 6.0937
## fractal_dimension_worst diag
## Min. :-1.6190 Maligno:170
## 1st Qu.:-0.6611 Benigno:286
## Median :-0.2047
## Mean : 0.0000
## 3rd Qu.: 0.4506
## Max. : 7.0115
We can see that centering and scaling has reduced the differences in scale between the variables.
# Resampling: repeated k-fold cross-validation
set.seed(1234)
fitControl <- trainControl(method="repeatedcv", repeats=5)
# Train the model
knn.brc.model <- train(x=train.brc.prep[,-dim(train.brc.prep)[2]], y=train.brc.prep$diag, method="knn",
tuneLength=10, trControl=fitControl)
knn.brc.model
## k-Nearest Neighbors
##
## 456 samples
## 14 predictor
## 2 classes: 'Maligno', 'Benigno'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 5 times)
## Summary of sample sizes: 410, 411, 410, 410, 410, 411, ...
## Resampling results across tuning parameters:
##
## k Accuracy Kappa
## 5 0.9315266 0.8499399
## 7 0.9345894 0.8564354
## 9 0.9354783 0.8585149
## 11 0.9359420 0.8589761
## 13 0.9364058 0.8595775
## 15 0.9385797 0.8641936
## 17 0.9390435 0.8651708
## 19 0.9368406 0.8601756
## 21 0.9364058 0.8591591
## 23 0.9364058 0.8591581
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was k = 17.
Graphically, we can see that the optimal value is k = 17:
plot1 <- plot(knn.brc.model,metric="Accuracy")
print(plot1)
We inspect the KNN model's predictions on the test set:
knn.brc.test <- predict(knn.brc.model, newdata= test.brc.prep[,-dim(train.brc.prep)[2]])
knn.brc.test
## [1] Maligno Maligno Maligno Benigno Maligno Maligno Maligno Maligno
## [9] Maligno Maligno Benigno Benigno Benigno Benigno Maligno Maligno
## [17] Benigno Maligno Maligno Benigno Benigno Maligno Benigno Benigno
## [25] Benigno Benigno Benigno Maligno Maligno Benigno Benigno Benigno
## [33] Benigno Benigno Maligno Benigno Benigno Benigno Benigno Maligno
## [41] Benigno Benigno Benigno Benigno Maligno Benigno Benigno Benigno
## [49] Maligno Maligno Maligno Benigno Benigno Benigno Benigno Benigno
## [57] Benigno Benigno Benigno Maligno Benigno Benigno Benigno Benigno
## [65] Benigno Benigno Maligno Benigno Maligno Benigno Benigno Maligno
## [73] Benigno Benigno Benigno Benigno Maligno Benigno Benigno Benigno
## [81] Maligno Maligno Benigno Benigno Benigno Benigno Maligno Benigno
## [89] Benigno Benigno Benigno Benigno Benigno Benigno Benigno Benigno
## [97] Benigno Benigno Benigno Benigno Benigno Benigno Benigno Benigno
## [105] Benigno Benigno Benigno Benigno Maligno Maligno Maligno Maligno
## [113] Benigno
## Levels: Maligno Benigno
Comparison of predictions and probabilities on the test set:
# Extract predictions
knn.brc.test.preds <- extractPrediction( list(model1=knn.brc.model), testX=test.brc.prep[,-dim(test.brc.prep)[2]] , testY=test.brc.prep$diag)
conjunto.test.preds <- subset(knn.brc.test.preds, dataType== "Test")
head(conjunto.test.preds)
## obs pred model dataType object
## 457 Maligno Maligno knn Test model1
## 458 Maligno Maligno knn Test model1
## 459 Maligno Maligno knn Test model1
## 460 Benigno Benigno knn Test model1
## 461 Maligno Maligno knn Test model1
## 462 Maligno Maligno knn Test model1
Extraction of class probabilities:
# Extract class probabilities
knn.brc.test.probs <- extractProb( list(model1=knn.brc.model),testX= test.brc.prep[,-dim(test.brc.prep)[2]], testY=test.brc.prep$diag)
conjunto.test.probs <- subset(knn.brc.test.probs, dataType== "Test")
plotClassProbs(conjunto.test.probs )
The plot shows the model is almost equally accurate at predicting benign and malignant cases.
Model evaluation: confusion matrix and ROC curves, using the functions confusionMatrix() and prediction().
confusionMatrix(knn.brc.test, test.brc.prep$diag )
## Confusion Matrix and Statistics
##
## Reference
## Prediction Maligno Benigno
## Maligno 33 1
## Benigno 9 70
##
## Accuracy : 0.9115
## 95% CI : (0.8433, 0.9567)
## No Information Rate : 0.6283
## P-Value [Acc > NIR] : 6.062e-12
##
## Kappa : 0.8029
##
## Mcnemar's Test P-Value : 0.02686
##
## Sensitivity : 0.7857
## Specificity : 0.9859
## Pos Pred Value : 0.9706
## Neg Pred Value : 0.8861
## Prevalence : 0.3717
## Detection Rate : 0.2920
## Detection Prevalence : 0.3009
## Balanced Accuracy : 0.8858
##
## 'Positive' Class : Maligno
##
Obtaining the ROC curve:
# To draw the ROC curve we map the target values to 0 and 1
pr <- prediction(ifelse(knn.brc.test == 'Maligno',1,0), ifelse(test.brc.prep$diag == 'Maligno',1,0))
prf <- performance(pr, measure = "tpr", x.measure = "fpr")
Roc_KNN <- data.frame(model= "KNN", fpr =prf@x.values[[1]], tpr = prf@y.values[[1]] )
plot(prf)
title ("Curva ROC KNN")
auc <- performance(pr, measure = "auc")
auc <- auc@y.values[[1]]
knn.auc <- auc
auc
## [1] 0.8858149
KNN gives Accuracy = 0.9115 and an area under the curve of 0.8858149.
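Note that this curve is built from hard class labels, so it reduces to a single operating point (its AUC equals the balanced accuracy). A sketch, as an addition of ours, of a full ROC curve from predicted class probabilities:
# Probabilities for the positive class ('Maligno') on the test set
knn.probs <- predict(knn.brc.model, newdata = test.brc.prep[,-dim(test.brc.prep)[2]], type = "prob")
pr.prob <- prediction(knn.probs$Maligno, ifelse(test.brc.prep$diag == 'Maligno', 1, 0))
performance(pr.prob, measure = "auc")@y.values[[1]]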
We repeat the same steps as for KNN, using repeated cross-validation as the resampling technique. Training:
set.seed(1234)
svm.brc.model <- train(x=train.brc.prep[,-dim(train.brc.prep)[2]], y=train.brc.prep$diag, method="svmRadial", tuneLength=10, trControl=fitControl)
svm.brc.model
## Support Vector Machines with Radial Basis Function Kernel
##
## 456 samples
## 14 predictor
## 2 classes: 'Maligno', 'Benigno'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 5 times)
## Summary of sample sizes: 410, 411, 410, 410, 410, 411, ...
## Resampling results across tuning parameters:
##
## C Accuracy Kappa
## 0.25 0.9390048 0.8650691
## 0.50 0.9516618 0.8941844
## 1.00 0.9648116 0.9237611
## 2.00 0.9647826 0.9238861
## 4.00 0.9630628 0.9207217
## 8.00 0.9648116 0.9248343
## 16.00 0.9639807 0.9229678
## 32.00 0.9613333 0.9170707
## 64.00 0.9613333 0.9170707
## 128.00 0.9613333 0.9170707
##
## Tuning parameter 'sigma' was held constant at a value of 0.08590669
## Accuracy was used to select the optimal model using the largest value.
## The final values used for the model were sigma = 0.08590669 and C = 1.
Model visualization:
plot1 <- plot(svm.brc.model, metric="Accuracy")
print(plot1)
Prediction on new values:
svm.brc.test <- predict(svm.brc.model, newdata= test.brc.prep[,-dim(test.brc.prep)[2]])
svm.brc.test
## [1] Maligno Maligno Maligno Benigno Maligno Maligno Maligno Maligno
## [9] Maligno Maligno Benigno Benigno Benigno Maligno Maligno Maligno
## [17] Maligno Maligno Maligno Benigno Benigno Maligno Benigno Benigno
## [25] Benigno Benigno Benigno Benigno Maligno Benigno Benigno Benigno
## [33] Benigno Maligno Maligno Maligno Maligno Benigno Maligno Maligno
## [41] Benigno Benigno Benigno Benigno Maligno Benigno Benigno Benigno
## [49] Maligno Maligno Maligno Benigno Benigno Benigno Benigno Benigno
## [57] Benigno Benigno Benigno Maligno Benigno Benigno Benigno Benigno
## [65] Benigno Benigno Maligno Benigno Maligno Benigno Benigno Maligno
## [73] Benigno Benigno Benigno Benigno Maligno Benigno Benigno Benigno
## [81] Maligno Maligno Benigno Benigno Maligno Benigno Maligno Benigno
## [89] Benigno Benigno Benigno Benigno Benigno Benigno Benigno Maligno
## [97] Benigno Benigno Benigno Benigno Benigno Benigno Benigno Benigno
## [105] Benigno Benigno Benigno Benigno Maligno Maligno Maligno Maligno
## [113] Benigno
## Levels: Maligno Benigno
Comparison of predictions and probabilities on the test set:
# Extract predictions
svm.brc.test.preds <- extractPrediction( list(model1=svm.brc.model), testX=test.brc.prep[,-dim(test.brc.prep)[2]] , testY=test.brc.prep$diag)
conjunto.test.preds2 <- subset(svm.brc.test.preds, dataType== "Test")
head(conjunto.test.preds2)
## obs pred model dataType object
## 457 Maligno Maligno svmRadial Test model1
## 458 Maligno Maligno svmRadial Test model1
## 459 Maligno Maligno svmRadial Test model1
## 460 Benigno Benigno svmRadial Test model1
## 461 Maligno Maligno svmRadial Test model1
## 462 Maligno Maligno svmRadial Test model1
Model evaluation: confusion matrix and ROC curve.
confusionMatrix(svm.brc.test, test.brc.prep$diag )
## Confusion Matrix and Statistics
##
## Reference
## Prediction Maligno Benigno
## Maligno 40 1
## Benigno 2 70
##
## Accuracy : 0.9735
## 95% CI : (0.9244, 0.9945)
## No Information Rate : 0.6283
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.9429
##
## Mcnemar's Test P-Value : 1
##
## Sensitivity : 0.9524
## Specificity : 0.9859
## Pos Pred Value : 0.9756
## Neg Pred Value : 0.9722
## Prevalence : 0.3717
## Detection Rate : 0.3540
## Detection Prevalence : 0.3628
## Balanced Accuracy : 0.9691
##
## 'Positive' Class : Maligno
##
pr <- prediction(ifelse(svm.brc.test == 'Maligno',1,0), ifelse(test.brc.prep$diag == 'Maligno',1,0))
prf <- performance(pr, measure = "tpr", x.measure = "fpr")
Roc_SVM <- data.frame(model= "SVM", fpr =prf@x.values[[1]], tpr = prf@y.values[[1]] )
plot(prf)
title ("Curva ROC SVM")
auc <- performance(pr, measure = "auc")
auc <- auc@y.values[[1]]
svm.auc <- auc
auc
## [1] 0.9691482
With SVM we obtain Accuracy = 0.9735 and AUC = 0.9691482.
Comparison of both models (Model1 = KNN, Model2 = SVM) with caret:
models <- list( knn.brc.model, svm.brc.model )
compar.models <- resamples( models )
summary( compar.models )
##
## Call:
## summary.resamples(object = compar.models)
##
## Models: Model1, Model2
## Number of resamples: 50
##
## Accuracy
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## Model1 0.8478261 0.9130435 0.9347826 0.9390435 0.9565217 1 0
## Model2 0.9111111 0.9555556 0.9671498 0.9648116 0.9782609 1 0
##
## Kappa
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## Model1 0.6522678 0.8087318 0.8556362 0.8651708 0.9051881 1 0
## Model2 0.8064516 0.9032258 0.9288204 0.9237611 0.9539078 1 0
Visualization of each model's performance (Model1 = KNN, Model2 = SVM):
dotplot( compar.models)
It is clear that, for this particular case, Model2 (SVM) achieves higher accuracy.
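The Roc_KNN and Roc_SVM data frames stored earlier can also be overlaid on a single plot to compare both models (a minimal sketch):
library(ggplot2)
roc_all <- rbind(Roc_KNN, Roc_SVM)
ggplot(roc_all, aes(x = fpr, y = tpr, colour = model)) +
  geom_line() +
  geom_abline(linetype = "dashed") + # chance diagonal
  labs(title = "ROC comparison: KNN vs SVM", x = "False positive rate", y = "True positive rate")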
We now fit a generalized linear model and check its goodness of fit. Since the target is binary and recoded as a two-level factor, caret's method="glm" fits a logistic regression.
# Create a copy with diag recoded as a 0/1 factor
lm.train.brc.prep <- train.brc.prep
lm.train.brc.prep$diag <- ifelse(lm.train.brc.prep$diag == 'Maligno', 1, 0)
lm.train.brc.prep$diag = factor(lm.train.brc.prep$diag)
lm.test.brc.prep <- test.brc.prep
lm.test.brc.prep$diag <- ifelse(lm.test.brc.prep$diag == 'Maligno', 1, 0)
lm.test.brc.prep$diag = factor(lm.test.brc.prep$diag )
# prepare resampling method
#control <- trainControl(method="cv", number=5)
set.seed(7)
glm.brc.model <- train(diag~., data=lm.train.brc.prep, method="glm", metric="Accuracy", trControl=fitControl)
# display results
print(glm.brc.model)
## Generalized Linear Model
##
## 456 samples
## 14 predictor
## 2 classes: '0', '1'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 5 times)
## Summary of sample sizes: 410, 410, 410, 410, 410, 411, ...
## Resampling results:
##
## Accuracy Kappa
## 0.9605411 0.9152777
Model output:
glm.brc.test <- predict(glm.brc.model, newdata= lm.test.brc.prep[,-dim(lm.test.brc.prep)[2]])
Confusion matrix:
confusionMatrix(glm.brc.test, lm.test.brc.prep$diag )
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 69 2
## 1 2 40
##
## Accuracy : 0.9646
## 95% CI : (0.9118, 0.9903)
## No Information Rate : 0.6283
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.9242
##
## Mcnemar's Test P-Value : 1
##
## Sensitivity : 0.9718
## Specificity : 0.9524
## Pos Pred Value : 0.9718
## Neg Pred Value : 0.9524
## Prevalence : 0.6283
## Detection Rate : 0.6106
## Detection Prevalence : 0.6283
## Balanced Accuracy : 0.9621
##
## 'Positive' Class : 0
##
ROC curve:
pr <- prediction(ifelse(glm.brc.test == 1,1,0), ifelse(lm.test.brc.prep$diag == 1,1,0))
prf <- performance(pr, measure = "tpr", x.measure = "fpr")
Roc_GLM <- data.frame(model= "GLM", fpr =prf@x.values[[1]], tpr = prf@y.values[[1]] )
plot(prf)
title ("Curva ROC GLM")
auc <- performance(pr, measure = "auc")
auc <- auc@y.values[[1]]
glm.auc <- auc
auc
## [1] 0.962106
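To see which predictors drive the logistic model, the fitted glm object inside the caret wrapper can be summarized (a minimal sketch):
# Coefficients, standard errors and p-values of the underlying glm fit
summary(glm.brc.model$finalModel)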
We load the libraries needed to run decision trees:
if(! "partykit" %in% installed.packages()) install.packages("partykit", depend = TRUE)
if(! "rpart" %in% installed.packages()) install.packages("rpart", depend = TRUE)
if(! "randomForest" %in% installed.packages()) install.packages("randomForest", depend = TRUE)
library(partykit)
library(rpart)
library(randomForest)
We build the model with ctree() from the partykit package and show the resulting rules with print(). Later we will refit this model through caret's train() so the results can be compared more easily.
# Make working copies of the train/test sets for the tree models
tree.train.brc.prep <- train.brc.prep
tree.test.brc.prep <- test.brc.prep
ctree.brc.model <- ctree(diag ~ ., data=tree.train.brc.prep)
# Show the rules
print(ctree.brc.model)
##
## Model formula:
## diag ~ area_mean + symmetry_mean + fractal_dimension_mean + radius_se +
## texture_se + smoothness_se + concavity_se + concave.points_se +
## symmetry_se + fractal_dimension_se + texture_worst + smoothness_worst +
## symmetry_worst + fractal_dimension_worst
##
## Fitted party:
## [1] root
## | [2] area_mean <= 0.07499
## | | [3] symmetry_worst <= 1.0722
## | | | [4] smoothness_worst <= 1.17954
## | | | | [5] area_mean <= -0.28326
## | | | | | [6] radius_se <= 0.40291: Benigno (n = 204, err = 0.0%)
## | | | | | [7] radius_se > 0.40291: Benigno (n = 7, err = 14.3%)
## | | | | [8] area_mean > -0.28326
## | | | | | [9] smoothness_worst <= -0.11314: Benigno (n = 42, err = 2.4%)
## | | | | | [10] smoothness_worst > -0.11314: Maligno (n = 17, err = 47.1%)
## | | | [11] smoothness_worst > 1.17954
## | | | | [12] area_mean <= -0.69051: Benigno (n = 11, err = 9.1%)
## | | | | [13] area_mean > -0.69051: Maligno (n = 10, err = 10.0%)
## | | [14] symmetry_worst > 1.0722
## | | | [15] texture_worst <= 0.10389: Benigno (n = 7, err = 28.6%)
## | | | [16] texture_worst > 0.10389: Maligno (n = 16, err = 6.2%)
## | [17] area_mean > 0.07499
## | | [18] texture_worst <= -0.99497: Benigno (n = 15, err = 46.7%)
## | | [19] texture_worst > -0.99497: Maligno (n = 127, err = 1.6%)
##
## Number of inner nodes: 9
## Number of terminal nodes: 10
We can see the variables' cut-off values and the classification error at each node. Graphically:
# Full tree
plot(ctree.brc.model)
# Simplified tree
plot(ctree.brc.model, type="simple")
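It can also be useful to know which terminal node of the printed rules each test observation lands in; a sketch using partykit's predict method (type = "node" returns node ids instead of classes):
# Terminal node id for every test observation, then counts per node
test.nodes <- predict(ctree.brc.model, newdata = tree.test.brc.prep, type = "node")
table(test.nodes)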
Prediction on the test set:
tree.test.brc.pred <- predict(ctree.brc.model, newdata = tree.test.brc.prep)
table(tree.test.brc.pred, tree.test.brc.prep$diag)
##
## tree.test.brc.pred Maligno Benigno
## Maligno 39 3
## Benigno 3 68
Confusion matrix:
confusionMatrix(tree.test.brc.pred, tree.test.brc.prep$diag )
## Confusion Matrix and Statistics
##
## Reference
## Prediction Maligno Benigno
## Maligno 39 3
## Benigno 3 68
##
## Accuracy : 0.9469
## 95% CI : (0.888, 0.9803)
## No Information Rate : 0.6283
## P-Value [Acc > NIR] : 1.866e-15
##
## Kappa : 0.8863
##
## Mcnemar's Test P-Value : 1
##
## Sensitivity : 0.9286
## Specificity : 0.9577
## Pos Pred Value : 0.9286
## Neg Pred Value : 0.9577
## Prevalence : 0.3717
## Detection Rate : 0.3451
## Detection Prevalence : 0.3717
## Balanced Accuracy : 0.9432
##
## 'Positive' Class : Maligno
##
ROC curve:
pr <- prediction(ifelse(tree.test.brc.pred == 'Maligno',1,0), ifelse(tree.test.brc.prep$diag == 'Maligno',1,0))
prf <- performance(pr, measure = "tpr", x.measure = "fpr")
plot(prf)
title("ROC Curve CTREE")
auc <- performance(pr, measure = "auc")
auc <- auc@y.values[[1]]
tree.auc <- auc
auc
## [1] 0.943159
Let us try to improve on this model using Random Forest:
We first implement it with the randomForest library to look at its diagnostic plots, and afterwards refit it with train() so the results can be compared.
if(! "randomForest" %in% installed.packages()) install.packages("randomForest", depend = TRUE)
library(randomForest)
rf.brc.model <- randomForest(diag ~ ., data=tree.train.brc.prep, ntree=100, proximity=TRUE)
# Show the OOB error and confusion matrix for the training set
print(rf.brc.model)
##
## Call:
## randomForest(formula = diag ~ ., data = tree.train.brc.prep, ntree = 100, proximity = TRUE)
## Type of random forest: classification
## Number of trees: 100
## No. of variables tried at each split: 3
##
## OOB estimate of error rate: 3.73%
## Confusion matrix:
## Maligno Benigno class.error
## Maligno 156 14 0.08235294
## Benigno 3 283 0.01048951
Model attributes:
# List the attributes of the fitted model
attributes(rf.brc.model)
## $names
## [1] "call" "type" "predicted"
## [4] "err.rate" "confusion" "votes"
## [7] "oob.times" "classes" "importance"
## [10] "importanceSD" "localImportance" "proximity"
## [13] "ntree" "mtry" "forest"
## [16] "y" "test" "inbag"
## [19] "terms"
##
## $class
## [1] "randomForest.formula" "randomForest"
Graphical representation:
# Plot OOB and per-class error rates against the number of trees
plot(rf.brc.model)
legend("top", colnames(rf.brc.model$err.rate), col = 1:3, cex = 0.8, fill = 1:3)
The model improves markedly over the single-tree model. We can see that beyond roughly 50 trees the error no longer decreases, which can be confirmed from the stored error rates (see the sketch below).
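A quick check of the plateau (a sketch; err.rate is the per-tree error matrix that randomForest stores, with one row per forest size):
# Cumulative OOB error at selected forest sizes; values should level off near 50 trees
rf.brc.model$err.rate[c(10, 25, 50, 75, 100), "OOB"]
Variable importance: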
# importance of variables
varImpPlot(rf.brc.model)
We can observe that area_mean and radius_se are the two variables that contribute most to the model. Prediction and confusion matrix:
rf.brc.pred <- predict(rf.brc.model, newdata=tree.test.brc.prep)
table(rf.brc.pred, tree.test.brc.prep$diag)
##
## rf.brc.pred Maligno Benigno
## Maligno 37 0
## Benigno 5 71
Graphically:
plot(margin(rf.brc.model, tree.test.brc.prep$diag))
legend("right", colnames(rf.brc.model$err.rate),col=1:3,cex=0.8,fill=1:4)
Confusion matrix:
confusionMatrix(rf.brc.pred, tree.test.brc.prep$diag )
## Confusion Matrix and Statistics
##
## Reference
## Prediction Maligno Benigno
## Maligno 37 0
## Benigno 5 71
##
## Accuracy : 0.9558
## 95% CI : (0.8998, 0.9855)
## No Information Rate : 0.6283
## P-Value [Acc > NIR] : < 2e-16
##
## Kappa : 0.9029
##
## Mcnemar's Test P-Value : 0.07364
##
## Sensitivity : 0.8810
## Specificity : 1.0000
## Pos Pred Value : 1.0000
## Neg Pred Value : 0.9342
## Prevalence : 0.3717
## Detection Rate : 0.3274
## Detection Prevalence : 0.3274
## Balanced Accuracy : 0.9405
##
## 'Positive' Class : Maligno
##
ROC curve:
pr <- prediction(ifelse(rf.brc.pred == 'Maligno',1,0), ifelse(tree.test.brc.prep$diag == 'Maligno',1,0))
prf <- performance(pr, measure = "tpr", x.measure = "fpr")
plot(prf)
title("ROC Curve RANDOM FOREST")
auc <- performance(pr, measure = "auc")
auc <- auc@y.values[[1]]
rf.auc <- auc
auc
## [1] 0.9404762
We install the required libraries:
library(caret)
library(parallel)
if(! "doParallel" %in% installed.packages())
install.packages("doParallel")
if(! "fastAdaboost" %in% installed.packages())
install.packages("fastAdaboost")
library(doParallel)
#cluster <- makeCluster(detectCores() - 1)
#registerDoParallel(cluster) # parallel processing
#fitControl <- trainControl(method = "cv",
# number = 10,
# search = "grid",
# allowParallel = TRUE)
# Hyperparameters to evaluate (note: this grid is NOT passed to train() below,
# since tuneGrid is commented out, so caret falls back to its default grid)
grid <- expand.grid(nIter = c(100, 400, 600), # number of iterations (weak classifiers)
                    method = "adaboost")
set.seed(7)
adaboost.brc.model <- train(diag ~ ., data = train.brc.prep,
method = "adaboost",
#tuneGrid = grid,
trControl = fitControl,
#metric = "Accuracy",
verbose = FALSE)
adaboost.brc.model
## AdaBoost Classification Trees
##
## 456 samples
## 14 predictor
## 2 classes: 'Maligno', 'Benigno'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 5 times)
## Summary of sample sizes: 410, 410, 410, 410, 410, 411, ...
## Resampling results across tuning parameters:
##
## nIter method Accuracy Kappa
## 50 Adaboost.M1 0.9556522 0.9040419
## 50 Real adaboost 0.9451594 0.8795943
## 100 Adaboost.M1 0.9534783 0.8993891
## 100 Real adaboost 0.9486473 0.8873988
## 150 Adaboost.M1 0.9543478 0.9011431
## 150 Real adaboost 0.9486473 0.8876488
##
## Accuracy was used to select the optimal model using the largest value.
## The final values used for the model were nIter = 50 and method
## = Adaboost.M1.
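Because the grid defined above was never passed to train(), caret used its default grid of nIter in {50, 100, 150} crossed with both boosting variants, as the output shows. To tune explicitly, a sketch along these lines should work (the method values are the ones caret reports for method = "adaboost"):
# Hypothetical explicit grid; nIter and method are the two tuning parameters
ada.grid <- expand.grid(nIter = c(50, 100, 150),
                        method = c("Adaboost.M1", "Real adaboost"))
set.seed(7)
adaboost.tuned <- train(diag ~ ., data = train.brc.prep, method = "adaboost",
                        tuneGrid = ada.grid, trControl = fitControl, verbose = FALSE)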
Confusion matrix:
confusionMatrix(predict(adaboost.brc.model, test.brc.prep), test.brc.prep$diag)
## Confusion Matrix and Statistics
##
## Reference
## Prediction Maligno Benigno
## Maligno 38 0
## Benigno 4 71
##
## Accuracy : 0.9646
## 95% CI : (0.9118, 0.9903)
## No Information Rate : 0.6283
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.9227
##
## Mcnemar's Test P-Value : 0.1336
##
## Sensitivity : 0.9048
## Specificity : 1.0000
## Pos Pred Value : 1.0000
## Neg Pred Value : 0.9467
## Prevalence : 0.3717
## Detection Rate : 0.3363
## Detection Prevalence : 0.3363
## Balanced Accuracy : 0.9524
##
## 'Positive' Class : Maligno
##
#stopCluster(cluster)
#registerDoSEQ()
ROC curve:
adaboost.brc.pred <- predict(adaboost.brc.model, newdata= test.brc.prep)
pr <- prediction(ifelse(adaboost.brc.pred == 'Maligno',1,0), ifelse(test.brc.prep$diag == 'Maligno',1,0))
prf <- performance(pr, measure = "tpr", x.measure = "fpr")
Roc_ADABOOST <- data.frame(model= "ADABOOST", fpr =prf@x.values[[1]], tpr = prf@y.values[[1]] )
plot(prf)
title("ROC Curve ADABOOST")
auc <- performance(pr, measure = "auc")
auc <- auc@y.values[[1]]
adaboost.auc <- auc
auc
## [1] 0.952381
We now refit the conditional inference tree through caret's train() so its results can be compared with the other models:
# Working copies of the train/test sets
tree.train.brc.prep <- train.brc.prep
tree.test.brc.prep <- test.brc.prep
#ctree.brc.model <- ctree(diag ~ ., data=tree.train.brc.prep)
# prepare resampling method
fitControl <- trainControl(method="repeatedcv", repeats=5)
#control <- trainControl(method="cv", number=5)
set.seed(7)
tree.brc.model <- train(diag~., data=tree.train.brc.prep, method="ctree", metric="Accuracy", trControl=fitControl)
# display results
print(tree.brc.model)
## Conditional Inference Tree
##
## 456 samples
## 14 predictor
## 2 classes: 'Maligno', 'Benigno'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 5 times)
## Summary of sample sizes: 410, 410, 410, 410, 410, 411, ...
## Resampling results across tuning parameters:
##
## mincriterion Accuracy Kappa
## 0.01 0.8903092 0.7661170
## 0.50 0.8903092 0.7661170
## 0.99 0.8885894 0.7621125
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was mincriterion = 0.5.
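The accuracies for mincriterion 0.01 and 0.50 are identical, so a wider grid could be worth exploring; a sketch (mincriterion is the only tuning parameter caret exposes for method = "ctree"):
# Hypothetical wider grid over mincriterion (1 - alpha for the split tests)
ctree.grid <- expand.grid(mincriterion = seq(0.05, 0.95, by = 0.15))
set.seed(7)
tree.brc.tuned <- train(diag ~ ., data = tree.train.brc.prep, method = "ctree",
                        metric = "Accuracy", tuneGrid = ctree.grid, trControl = fitControl)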
Model visualization:
plot1 <- plot(tree.brc.model, metric="Accuracy")
print(plot1)
Prediction on new values (note this uses the partykit ctree model fitted earlier; the caret version is evaluated just below):
ctree.brc.test <- predict(ctree.brc.model, newdata= test.brc.prep[,-dim(test.brc.prep)[2]])
ctree.brc.test
## 9 17 18 20 23 27 28 31 35
## Maligno Maligno Maligno Maligno Benigno Maligno Maligno Maligno Maligno
## 43 52 59 68 69 76 79 101 106
## Maligno Benigno Benigno Benigno Benigno Maligno Maligno Benigno Maligno
## 118 126 131 132 136 138 140 146 151
## Maligno Benigno Benigno Maligno Benigno Benigno Benigno Benigno Benigno
## 152 157 175 189 192 193 198 204 206
## Benigno Maligno Benigno Benigno Benigno Benigno Maligno Maligno Maligno
## 208 212 214 216 222 225 227 236 238
## Maligno Benigno Maligno Maligno Benigno Benigno Benigno Benigno Maligno
## 249 252 256 258 259 281 286 290 297
## Benigno Benigno Maligno Maligno Maligno Maligno Benigno Benigno Benigno
## 305 310 311 312 317 318 339 346 349
## Benigno Benigno Benigno Benigno Benigno Maligno Benigno Benigno Benigno
## 351 359 363 373 375 380 384 387 394
## Benigno Benigno Benigno Maligno Benigno Maligno Benigno Benigno Maligno
## 411 412 413 417 418 422 424 429 431
## Benigno Benigno Benigno Benigno Maligno Maligno Benigno Benigno Maligno
## 434 438 440 445 446 447 454 456 465
## Maligno Benigno Benigno Maligno Benigno Maligno Maligno Benigno Benigno
## 466 467 470 475 478 480 483 484 487
## Benigno Benigno Benigno Benigno Benigno Maligno Benigno Benigno Benigno
## 489 496 514 516 526 533 540 554 559
## Benigno Benigno Benigno Benigno Benigno Benigno Benigno Benigno Benigno
## 563 564 566 567 569
## Maligno Maligno Maligno Maligno Benigno
## Levels: Maligno Benigno
Model evaluation: confusion matrix and ROC curves
confusionMatrix(predict(tree.brc.model, test.brc.prep), test.brc.prep$diag)
## Confusion Matrix and Statistics
##
## Reference
## Prediction Maligno Benigno
## Maligno 39 3
## Benigno 3 68
##
## Accuracy : 0.9469
## 95% CI : (0.888, 0.9803)
## No Information Rate : 0.6283
## P-Value [Acc > NIR] : 1.866e-15
##
## Kappa : 0.8863
##
## Mcnemar's Test P-Value : 1
##
## Sensitivity : 0.9286
## Specificity : 0.9577
## Pos Pred Value : 0.9286
## Neg Pred Value : 0.9577
## Prevalence : 0.3717
## Detection Rate : 0.3451
## Detection Prevalence : 0.3717
## Balanced Accuracy : 0.9432
##
## 'Positive' Class : Maligno
##
ROC curve:
tree.brc.pred <- predict(tree.brc.model, newdata= test.brc.prep)
pr <- prediction(ifelse(tree.brc.pred == 'Maligno',1,0), ifelse(test.brc.prep$diag == 'Maligno',1,0))
prf <- performance(pr, measure = "tpr", x.measure = "fpr")
Roc_CTREE <- data.frame(model= "CTREE", fpr =prf@x.values[[1]], tpr = prf@y.values[[1]] )
plot(prf)
title("ROC Curve CTREE")
auc <- performance(pr, measure = "auc")
auc <- auc@y.values[[1]]
ctree.auc <- auc
auc
## [1] 0.943159
# variable names used for the train and test sets
#train.brc.prep
#test.brc.prep
# prepare resampling method
#fitControl <- trainControl(method="repeatedcv", repeats=5)
#control <- trainControl(method="cv", number=5)
set.seed(7)
rf.brc.model <- train(diag~., data=train.brc.prep, method="rf", metric="Accuracy", trControl=fitControl)
# display results
print(rf.brc.model)
## Random Forest
##
## 456 samples
## 14 predictor
## 2 classes: 'Maligno', 'Benigno'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 5 times)
## Summary of sample sizes: 410, 410, 410, 410, 410, 411, ...
## Resampling results across tuning parameters:
##
## mtry Accuracy Kappa
## 2 0.9605024 0.9132472
## 8 0.9494783 0.8900671
## 14 0.9481353 0.8871325
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 2.
Model visualization:
plot1 <- plot(rf.brc.model, metric="Accuracy")
print(plot1)
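caret can also report scaled variable importance for the final forest, which should be consistent with the varImpPlot() seen earlier; a quick sketch:
# Importance of each predictor in the final forest, scaled to 0-100
print(varImp(rf.brc.model))
plot(varImp(rf.brc.model), top = 10)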
Prediction on new values:
rftree.brc.test <- predict(rf.brc.model, newdata= test.brc.prep[,-dim(test.brc.prep)[2]])
rftree.brc.test
## [1] Maligno Maligno Maligno Benigno Maligno Maligno Maligno Maligno
## [9] Maligno Maligno Benigno Benigno Benigno Benigno Maligno Maligno
## [17] Benigno Benigno Maligno Benigno Benigno Maligno Benigno Benigno
## [25] Benigno Benigno Benigno Benigno Maligno Benigno Benigno Benigno
## [33] Benigno Maligno Maligno Benigno Maligno Benigno Maligno Maligno
## [41] Benigno Benigno Benigno Benigno Maligno Benigno Benigno Benigno
## [49] Maligno Maligno Maligno Benigno Benigno Benigno Benigno Benigno
## [57] Benigno Benigno Benigno Maligno Benigno Benigno Benigno Benigno
## [65] Benigno Benigno Maligno Benigno Maligno Benigno Benigno Maligno
## [73] Benigno Benigno Benigno Benigno Maligno Benigno Benigno Benigno
## [81] Maligno Maligno Benigno Benigno Benigno Benigno Maligno Benigno
## [89] Benigno Benigno Benigno Benigno Benigno Benigno Benigno Maligno
## [97] Benigno Benigno Benigno Benigno Benigno Benigno Benigno Benigno
## [105] Benigno Benigno Benigno Benigno Maligno Maligno Maligno Maligno
## [113] Benigno
## Levels: Maligno Benigno
Model evaluation: confusion matrix and ROC curves
confusionMatrix(predict(rf.brc.model, test.brc.prep), test.brc.prep$diag)
## Confusion Matrix and Statistics
##
## Reference
## Prediction Maligno Benigno
## Maligno 36 0
## Benigno 6 71
##
## Accuracy : 0.9469
## 95% CI : (0.888, 0.9803)
## No Information Rate : 0.6283
## P-Value [Acc > NIR] : 1.866e-15
##
## Kappa : 0.8829
##
## Mcnemar's Test P-Value : 0.04123
##
## Sensitivity : 0.8571
## Specificity : 1.0000
## Pos Pred Value : 1.0000
## Neg Pred Value : 0.9221
## Prevalence : 0.3717
## Detection Rate : 0.3186
## Detection Prevalence : 0.3186
## Balanced Accuracy : 0.9286
##
## 'Positive' Class : Maligno
##
ROC curve:
rf.brc.pred <- predict(rf.brc.model, newdata= test.brc.prep)
pr <- prediction(ifelse(rf.brc.pred == 'Maligno',1,0), ifelse(test.brc.prep$diag == 'Maligno',1,0))
prf <- performance(pr, measure = "tpr", x.measure = "fpr")
Roc_RFOREST <- data.frame(model= "RFOREST", fpr =prf@x.values[[1]], tpr = prf@y.values[[1]] )
plot(prf)
title("ROC Curve RANDOM FOREST")
auc <- performance(pr, measure = "auc")
auc <- auc@y.values[[1]]
rftree.auc <- auc
auc
## [1] 0.9285714
models <- list(rf.brc.model , tree.brc.model, knn.brc.model, glm.brc.model, svm.brc.model, adaboost.brc.model )
compar.models <- resamples( models )
summary( compar.models )
##
## Call:
## summary.resamples(object = compar.models)
##
## Models: Model1, Model2, Model3, Model4, Model5, Model6
## Number of resamples: 50
##
## Accuracy
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## Model1 0.8666667 0.9347826 0.9565217 0.9605024 0.9782609 1.0000000 0
## Model2 0.8043478 0.8673913 0.8900966 0.8903092 0.9130435 0.9777778 0
## Model3 0.8478261 0.9130435 0.9347826 0.9390435 0.9565217 1.0000000 0
## Model4 0.8913043 0.9399758 0.9565217 0.9605411 0.9782609 1.0000000 0
## Model5 0.9111111 0.9555556 0.9671498 0.9648116 0.9782609 1.0000000 0
## Model6 0.8478261 0.9347826 0.9565217 0.9556522 0.9782609 1.0000000 0
##
## Kappa
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## Model1 0.6952596 0.8586904 0.9066937 0.9132472 0.9527721 1.000000 0
## Model2 0.5529158 0.7139199 0.7651078 0.7661170 0.8178218 0.953271 0
## Model3 0.6522678 0.8087318 0.8556362 0.8651708 0.9051881 1.000000 0
## Model4 0.7749511 0.8745344 0.9071456 0.9152777 0.9527721 1.000000 0
## Model5 0.8064516 0.9032258 0.9288204 0.9237611 0.9539078 1.000000 0
## Model6 0.6430155 0.8583162 0.9066937 0.9040419 0.9527721 1.000000 0
str(compar.models)
## List of 6
## $ call : language resamples.default(x = models)
## $ values :'data.frame': 50 obs. of 13 variables:
## ..$ Resample : chr [1:50] "Fold01.Rep1" "Fold01.Rep2" "Fold01.Rep3" "Fold01.Rep4" ...
## ..$ Model1~Accuracy: num [1:50] 0.978 0.957 0.935 1 0.956 ...
## ..$ Model1~Kappa : num [1:50] 0.953 0.904 0.855 1 0.903 ...
## ..$ Model2~Accuracy: num [1:50] 0.891 0.87 0.913 0.867 0.822 ...
## ..$ Model2~Kappa : num [1:50] 0.758 0.72 0.813 0.723 0.622 ...
## ..$ Model3~Accuracy: num [1:50] 0.957 0.891 0.957 0.935 0.911 ...
## ..$ Model3~Kappa : num [1:50] 0.904 0.752 0.907 0.862 0.802 ...
## ..$ Model4~Accuracy: num [1:50] 1 0.978 0.957 0.956 0.956 ...
## ..$ Model4~Kappa : num [1:50] 1 0.953 0.904 0.908 0.903 ...
## ..$ Model5~Accuracy: num [1:50] 0.978 0.935 0.978 0.978 0.978 ...
## ..$ Model5~Kappa : num [1:50] 0.953 0.855 0.954 0.954 0.952 ...
## ..$ Model6~Accuracy: num [1:50] 0.978 0.978 0.957 0.978 0.933 ...
## ..$ Model6~Kappa : num [1:50] 0.953 0.953 0.907 0.952 0.857 ...
## $ models : chr [1:6] "Model1" "Model2" "Model3" "Model4" ...
## $ metrics: chr [1:2] "Accuracy" "Kappa"
## $ timings:'data.frame': 6 obs. of 3 variables:
## ..$ Everything: num [1:6] 37.52 6 4.31 1.56 14.5 ...
## ..$ FinalModel: num [1:6] 0.21 0.01 0 0.01 0.01 ...
## ..$ Prediction: num [1:6] NA NA NA NA NA NA
## $ methods: chr [1:6] "rf" "ctree" "knn" "glm" ...
## - attr(*, "class")= chr "resamples"
kable(head(compar.models$values))
| Resample | Model1~Accuracy | Model1~Kappa | Model2~Accuracy | Model2~Kappa | Model3~Accuracy | Model3~Kappa | Model4~Accuracy | Model4~Kappa | Model5~Accuracy | Model5~Kappa | Model6~Accuracy | Model6~Kappa |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Fold01.Rep1 | 0.9782609 | 0.9527721 | 0.8913043 | 0.7578947 | 0.9565217 | 0.9043659 | 1.0000000 | 1.0000000 | 0.9782609 | 0.9527721 | 0.9782609 | 0.9527721 |
| Fold01.Rep2 | 0.9565217 | 0.9043659 | 0.8695652 | 0.7200811 | 0.8913043 | 0.7516199 | 0.9782609 | 0.9527721 | 0.9347826 | 0.8547368 | 0.9782609 | 0.9527721 |
| Fold01.Rep3 | 0.9347826 | 0.8547368 | 0.9130435 | 0.8133874 | 0.9565217 | 0.9066937 | 0.9565217 | 0.9043659 | 0.9782609 | 0.9539078 | 0.9565217 | 0.9066937 |
| Fold01.Rep4 | 1.0000000 | 1.0000000 | 0.8666667 | 0.7227926 | 0.9347826 | 0.8617234 | 0.9555556 | 0.9075975 | 0.9782609 | 0.9539078 | 0.9777778 | 0.9521785 |
| Fold01.Rep5 | 0.9555556 | 0.9032258 | 0.8222222 | 0.6218487 | 0.9111111 | 0.8017621 | 0.9555556 | 0.9032258 | 0.9777778 | 0.9521785 | 0.9333333 | 0.8565356 |
| Fold02.Rep1 | 1.0000000 | 1.0000000 | 0.8260870 | 0.6521739 | 0.9555556 | 0.9032258 | 0.8913043 | 0.7749511 | 0.9555556 | 0.9032258 | 0.8913043 | 0.7749511 |
Graphically:
dotplot(compar.models, main = "1.RForest 2.Tree 3.KNN 4.GLM 5.SVM 6.AdaBoost")
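For a view of the full resampling distributions rather than just means and confidence intervals, the box-and-whisker method for resamples objects can be used; a one-line sketch:
# Box plots of Accuracy and Kappa across the 50 resamples, per model
bwplot(compar.models)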
Comparison of the Area Under the ROC Curve (AUC):
vModelos <- c("knn","svm","glm","Tree","RandomForest","AdaBoost")
vAuc <- c(knn.auc,svm.auc,glm.auc,ctree.auc,rftree.auc,adaboost.auc)
ComparAuc <- data.frame( modelos = vModelos, auc = vAuc)
ComparAuc2 <-ComparAuc %>% arrange(desc(auc))
ComparAuc2
## modelos auc
## 1 svm 0.9691482
## 2 glm 0.9621060
## 3 AdaBoost 0.9523810
## 4 Tree 0.9431590
## 5 RandomForest 0.9285714
## 6 knn 0.8858149
Graphically:
plt = ggplot(ComparAuc , aes(x = reorder(modelos,auc), y= auc, fill = (auc > 0.95)))
plt2 = geom_text(aes(label = round(auc,4) ), size = 3, hjust = 0.5, vjust = 3, position = "stack")
plt + geom_bar(stat = "identity", width = .5) + plt2 + ggtitle("Area under the ROC curve (AUC)")
In conclusion, the best model is the SVM, with a slightly better result than Random Forest and GLM, which are fairly close to each other. The AdaBoost algorithm gave us worse results and also has a higher computational cost, at least without parallelization. As expected, the single classification tree is less effective than the Random Forest ensemble.
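One caveat on the AUC comparison: all the ROC curves above were built from hard class labels, so each curve has only three points and its AUC coincides with the balanced accuracy. Predicted class probabilities give a proper curve; a sketch for the random forest (type = "prob" asks caret for probabilities; for the SVM this would additionally require classProbs = TRUE in trainControl):
# Probability of the 'Maligno' class for each test observation
rf.prob <- predict(rf.brc.model, newdata = test.brc.prep, type = "prob")[, "Maligno"]
pr.prob <- prediction(rf.prob, ifelse(test.brc.prep$diag == 'Maligno', 1, 0))
prf.prob <- performance(pr.prob, measure = "tpr", x.measure = "fpr")
plot(prf.prob)
title("ROC curve from probabilities (Random Forest)")
performance(pr.prob, measure = "auc")@y.values[[1]]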
#if(! "RJSONIO" %in% installed.packages()) install.packages("RJSONIO", depend = TRUE)
if(! "rio" %in% installed.packages()) install.packages("rio", depend = TRUE)
#library(RJSONIO)
library(rio)
if (!file.exists("./datos/output")) {
dir.create("./datos/output")
}
We will export the data as CSV for Tableau and as JSON for D3.js.
brc_kmeans$diagnosis <- ifelse(brc_kmeans$diag == 1, "Maligno", "Benigno")
# accuracy and kappa data
modeloscsv <- compar.models$values %>% select (RfAccuracy = "Model1~Accuracy",RfKappa = "Model1~Kappa",
TreeAccuracy = "Model2~Accuracy",TreeKappa = "Model2~Kappa",
KNNAccuracy = "Model3~Accuracy",KNNKappa = "Model3~Kappa",
GLMAccuracy = "Model4~Accuracy",GLMKappa = "Model4~Kappa",
SVMAccuracy = "Model5~Accuracy",SVMKappa = "Model5~Kappa",
ADAAccuracy = "Model6~Accuracy",ADAKappa = "Model6~Kappa")
# data for plotting the ROC curves
Roc_Exp <- Roc_RFOREST
Roc_Exp <- rbind(Roc_Exp, Roc_KNN)
Roc_Exp <- rbind(Roc_Exp, Roc_SVM)
Roc_Exp <- rbind(Roc_Exp, Roc_GLM)
Roc_Exp <- rbind(Roc_Exp, Roc_ADABOOST)
Roc_Exp <- rbind(Roc_Exp, Roc_CTREE)
kable(Roc_Exp)
| model | fpr | tpr |
|---|---|---|
| RFOREST | 0.0000000 | 0.0000000 |
| RFOREST | 0.0000000 | 0.8571429 |
| RFOREST | 1.0000000 | 1.0000000 |
| KNN | 0.0000000 | 0.0000000 |
| KNN | 0.0140845 | 0.7857143 |
| KNN | 1.0000000 | 1.0000000 |
| SVM | 0.0000000 | 0.0000000 |
| SVM | 0.0140845 | 0.9523810 |
| SVM | 1.0000000 | 1.0000000 |
| GLM | 0.0000000 | 0.0000000 |
| GLM | 0.0281690 | 0.9523810 |
| GLM | 1.0000000 | 1.0000000 |
| ADABOOST | 0.0000000 | 0.0000000 |
| ADABOOST | 0.0000000 | 0.9047619 |
| ADABOOST | 1.0000000 | 1.0000000 |
| CTREE | 0.0000000 | 0.0000000 |
| CTREE | 0.0422535 | 0.9285714 |
| CTREE | 1.0000000 | 1.0000000 |
#write.table(brc_kmeans,file="./datos/output/brc_kmeans.csv", sep=";",row.names = FALSE, col.names = TRUE)
write.table(modeloscsv, file="./datos/output/brc_modelos.csv", sep=";",row.names = FALSE, col.names = TRUE)
write.table(ComparAuc2,file="./datos/output/brc_comparauc.csv", sep=";",row.names = FALSE, col.names = TRUE)
write.table(Roc_Exp,file="./datos/output/Roc_Exp.csv", sep=";",row.names = FALSE, col.names = TRUE)
#kmeansJson <- toJSON(brc_kmeans, pretty=TRUE)
#cat(kmeansJson)
#demo <- fromJSON(kmeansJson)
#data.frame(demo)
#export(brc_kmeans, "./datos/output/kmeans.json") # JSON
head(brc_kmeans)
## diagnosis radius_mean texture_mean perimeter_mean area_mean
## 1 Maligno 17.99 10.38 122.80 1001.0
## 2 Maligno 20.57 17.77 132.90 1326.0
## 3 Maligno 19.69 21.25 130.00 1203.0
## 4 Maligno 11.42 20.38 77.58 386.1
## 5 Maligno 20.29 14.34 135.10 1297.0
## 6 Maligno 12.45 15.70 82.57 477.1
## smoothness_mean compactness_mean concavity_mean concave.points_mean
## 1 0.11840 0.27760 0.3001 0.14710
## 2 0.08474 0.07864 0.0869 0.07017
## 3 0.10960 0.15990 0.1974 0.12790
## 4 0.14250 0.28390 0.2414 0.10520
## 5 0.10030 0.13280 0.1980 0.10430
## 6 0.12780 0.17000 0.1578 0.08089
## symmetry_mean fractal_dimension_mean radius_se texture_se perimeter_se
## 1 0.2419 0.07871 1.0950 0.9053 8.589
## 2 0.1812 0.05667 0.5435 0.7339 3.398
## 3 0.2069 0.05999 0.7456 0.7869 4.585
## 4 0.2597 0.09744 0.4956 1.1560 3.445
## 5 0.1809 0.05883 0.7572 0.7813 5.438
## 6 0.2087 0.07613 0.3345 0.8902 2.217
## area_se smoothness_se compactness_se concavity_se concave.points_se
## 1 153.40 0.006399 0.04904 0.05373 0.01587
## 2 74.08 0.005225 0.01308 0.01860 0.01340
## 3 94.03 0.006150 0.04006 0.03832 0.02058
## 4 27.23 0.009110 0.07458 0.05661 0.01867
## 5 94.44 0.011490 0.02461 0.05688 0.01885
## 6 27.19 0.007510 0.03345 0.03672 0.01137
## symmetry_se fractal_dimension_se radius_worst texture_worst
## 1 0.03003 0.006193 25.38 17.33
## 2 0.01389 0.003532 24.99 23.41
## 3 0.02250 0.004571 23.57 25.53
## 4 0.05963 0.009208 14.91 26.50
## 5 0.01756 0.005115 22.54 16.67
## 6 0.02165 0.005082 15.47 23.75
## perimeter_worst area_worst smoothness_worst compactness_worst
## 1 184.60 2019.0 0.1622 0.6656
## 2 158.80 1956.0 0.1238 0.1866
## 3 152.50 1709.0 0.1444 0.4245
## 4 98.87 567.7 0.2098 0.8663
## 5 152.20 1575.0 0.1374 0.2050
## 6 103.40 741.6 0.1791 0.5249
## concavity_worst concave.points_worst symmetry_worst
## 1 0.7119 0.2654 0.4601
## 2 0.2416 0.1860 0.2750
## 3 0.4504 0.2430 0.3613
## 4 0.6869 0.2575 0.6638
## 5 0.4000 0.1625 0.2364
## 6 0.5355 0.1741 0.3985
## fractal_dimension_worst diag cluster_id
## 1 0.11890 1 2
## 2 0.08902 1 2
## 3 0.08758 1 2
## 4 0.17300 1 4
## 5 0.07678 1 2
## 6 0.12440 1 3
str(brc_kmeans)
## 'data.frame': 569 obs. of 33 variables:
## $ diagnosis : chr "Maligno" "Maligno" "Maligno" "Maligno" ...
## $ radius_mean : num 18 20.6 19.7 11.4 20.3 ...
## $ texture_mean : num 10.4 17.8 21.2 20.4 14.3 ...
## $ perimeter_mean : num 122.8 132.9 130 77.6 135.1 ...
## $ area_mean : num 1001 1326 1203 386 1297 ...
## $ smoothness_mean : num 0.1184 0.0847 0.1096 0.1425 0.1003 ...
## $ compactness_mean : num 0.2776 0.0786 0.1599 0.2839 0.1328 ...
## $ concavity_mean : num 0.3001 0.0869 0.1974 0.2414 0.198 ...
## $ concave.points_mean : num 0.1471 0.0702 0.1279 0.1052 0.1043 ...
## $ symmetry_mean : num 0.242 0.181 0.207 0.26 0.181 ...
## $ fractal_dimension_mean : num 0.0787 0.0567 0.06 0.0974 0.0588 ...
## $ radius_se : num 1.095 0.543 0.746 0.496 0.757 ...
## $ texture_se : num 0.905 0.734 0.787 1.156 0.781 ...
## $ perimeter_se : num 8.59 3.4 4.58 3.44 5.44 ...
## $ area_se : num 153.4 74.1 94 27.2 94.4 ...
## $ smoothness_se : num 0.0064 0.00522 0.00615 0.00911 0.01149 ...
## $ compactness_se : num 0.049 0.0131 0.0401 0.0746 0.0246 ...
## $ concavity_se : num 0.0537 0.0186 0.0383 0.0566 0.0569 ...
## $ concave.points_se : num 0.0159 0.0134 0.0206 0.0187 0.0188 ...
## $ symmetry_se : num 0.03 0.0139 0.0225 0.0596 0.0176 ...
## $ fractal_dimension_se : num 0.00619 0.00353 0.00457 0.00921 0.00511 ...
## $ radius_worst : num 25.4 25 23.6 14.9 22.5 ...
## $ texture_worst : num 17.3 23.4 25.5 26.5 16.7 ...
## $ perimeter_worst : num 184.6 158.8 152.5 98.9 152.2 ...
## $ area_worst : num 2019 1956 1709 568 1575 ...
## $ smoothness_worst : num 0.162 0.124 0.144 0.21 0.137 ...
## $ compactness_worst : num 0.666 0.187 0.424 0.866 0.205 ...
## $ concavity_worst : num 0.712 0.242 0.45 0.687 0.4 ...
## $ concave.points_worst : num 0.265 0.186 0.243 0.258 0.163 ...
## $ symmetry_worst : num 0.46 0.275 0.361 0.664 0.236 ...
## $ fractal_dimension_worst: num 0.1189 0.089 0.0876 0.173 0.0768 ...
## $ diag : num 1 1 1 1 1 1 1 1 1 1 ...
## $ cluster_id : int 2 2 2 4 2 3 2 3 3 3 ...
brc_kmeans %>%
  group_by(cluster_id, diagnosis,
           round(perimeter_worst / 50, 0),
           round(radius_worst / 15, 0),
           round(area_worst / 680, 0)) %>%
  summarise(n = n(), media1 = mean(perimeter_worst)) #%>% filter(cluster_id == 1)
## # A tibble: 25 x 7
## # Groups: cluster_id, diagnosis, round(perimeter_worst/50, 0),
## # round(radius_worst/15, 0) [16]
## cluster_id diagnosis `round(perimete~ `round(radius_w~ `round(area_wor~
## <int> <chr> <dbl> <dbl> <dbl>
## 1 1 Maligno 4 2 4
## 2 1 Maligno 4 2 5
## 3 1 Maligno 5 2 5
## 4 1 Maligno 5 2 6
## 5 2 Maligno 3 1 2
## 6 2 Maligno 3 2 2
## 7 2 Maligno 3 2 3
## 8 2 Maligno 4 2 2
## 9 2 Maligno 4 2 3
## 10 2 Maligno 4 2 4
## # ... with 15 more rows, and 2 more variables: n <int>, media1 <dbl>
paso1 = brc_kmeans %>% group_by(cluster_id, diagnosis,
factor1 = round(perimeter_worst /mean(perimeter_worst),0) ,
factor2 = round(radius_worst / mean(radius_worst),0) ,
factor3 = round(area_worst / mean(area_worst),0) ) %>% summarise(n=n(), media1 = mean(perimeter_worst),media2 = mean(radius_worst),media3 = mean(area_worst))
# Note: diagnosis already holds the labels "Maligno"/"Benigno" (set above), so this
# comparison with 1 never matches and relabels every row as "Benigno", which is
# visible in the table below; comparing against brc_kmeans$diag would keep the true labels
paso1$diagnosis = ifelse(paso1$diagnosis == 1, "Maligno", "Benigno")
paso1$perimeter_worst = paso1$factor1 * paso1$media1
paso1$radius_worst = paso1$factor2 * paso1$media2
paso1$area_worst = paso1$factor3 * paso1$media3
paso2 <- data.frame(paso1 )
paso3 <- paso2 %>% select(cluster_id,diagnosis,perimeter_worst,radius_worst,area_worst, n) %>% arrange(cluster_id, diagnosis,perimeter_worst,radius_worst,area_worst)
paso3
## cluster_id diagnosis perimeter_worst radius_worst area_worst n
## 1 1 Benigno 399.50000 59.42000 8007.3750 8
## 2 1 Benigno 431.80000 64.74500 13025.0000 4
## 3 1 Benigno 502.40000 72.08000 21270.0000 1
## 4 2 Benigno 152.92424 23.10030 3269.5758 33
## 5 2 Benigno 157.16667 49.88667 3674.6667 3
## 6 2 Benigno 334.00000 23.99778 3495.1111 9
## 7 2 Benigno 342.73684 51.04947 3971.3684 19
## 8 2 Benigno 376.17143 55.04571 6982.7143 7
## 9 3 Benigno 100.65434 15.38279 726.3131 122
## 10 3 Benigno 112.00579 16.67018 852.7930 57
## 11 4 Benigno 0.00000 0.00000 0.0000 1
## 12 4 Benigno 69.50400 10.78387 0.0000 85
## 13 4 Benigno 85.33178 13.14493 529.0452 146
## 14 4 Benigno 90.97600 13.61800 547.0200 5
## 15 5 Benigno 121.53333 18.72333 1083.6667 3
## 16 5 Benigno 130.52500 19.63000 1188.3000 48
## 17 5 Benigno 142.20556 21.44944 2811.2222 18
export(paso3, "./datos/output/kmeans2.json") # JSON
## Loading required namespace: jsonlite
export(expd3, "./datos/output/correl.json") # JSON
export(expD3Cor, "./datos/output/correl2.json") # JSON
export(expD3CorFind, "./datos/output/correlFind.json") # JSON
#summary(brc_kmeans %>% select (perimeter_worst,radius_worst,area_worst))
#str(knn.brc.model)
#class(knn.brc.model)