TRABAJO 1

EJERCICIO 1:

Comenzamos creando la semilla:

# Fix the random seed so that the random train/test split drawn later with
# sample() is reproducible from run to run.
set.seed(13579)

Instalamos el paquete mlbench y lo cargamos:

# Install mlbench only when it is not already available, so that re-running
# the script does not reinstall the package every time; then attach it.
if (!requireNamespace("mlbench", quietly = TRUE)) {
  install.packages("mlbench")
}
library("mlbench")

Eliminamos los valores perdidos del data.frame BreastCancer:

# Load the BreastCancer data set and keep only the complete rows
# (na.omit drops every row that contains at least one NA).
data("BreastCancer")
BreastCancer <- data.frame(na.omit(BreastCancer))

Eliminamos la primera columna del data.frame correspondiente a la columna de ID:

# Drop the identifier column: keep every column whose name is not "Id".
BreastCancer <- BreastCancer[, colnames(BreastCancer) != "Id"]

# Show the dimensions of the cleaned data, store the number of observations
# in n, and print a per-column summary.
dim(BreastCancer)
## [1] 683  10
n <- nrow(BreastCancer)
summary(BreastCancer)
##   Cl.thickness   Cell.size     Cell.shape  Marg.adhesion  Epith.c.size
##  1      :139   1      :373   1      :346   1      :393   2      :376  
##  5      :128   10     : 67   2      : 58   2      : 58   3      : 71  
##  3      :104   3      : 52   10     : 58   3      : 58   4      : 48  
##  4      : 79   2      : 45   3      : 53   10     : 55   1      : 44  
##  10     : 69   4      : 38   4      : 43   4      : 33   6      : 40  
##  2      : 50   5      : 30   5      : 32   8      : 25   5      : 39  
##  (Other):114   (Other): 78   (Other): 93   (Other): 61   (Other): 65  
##   Bare.nuclei   Bl.cromatin  Normal.nucleoli    Mitoses          Class    
##  1      :402   3      :161   1      :432     1      :563   benign   :444  
##  10     :132   2      :160   10     : 60     2      : 35   malignant:239  
##  2      : 30   1      :150   3      : 42     3      : 33                  
##  5      : 30   7      : 71   2      : 36     10     : 14                  
##  3      : 28   4      : 39   8      : 23     4      : 12                  
##  8      : 21   5      : 34   6      : 22     7      :  9                  
##  (Other): 40   (Other): 68   (Other): 68     (Other): 17
# Row indices 1, ..., n of the data set.
indices <- seq_len(n)

Realizamos la partición 70% entrenamiento - 30% test

# Draw 30% of the row indices (rounded down) at random for the test set;
# all remaining rows form the training set.
inditest <- sample(seq_len(n), size = trunc(3 * n / 10))
BreastCancer.test <- BreastCancer[inditest, ]
BreastCancer.entr <- BreastCancer[-inditest, ]

Construcción de Naive-Bayes en el conjunto de entrenamiento

  1. Instalamos y cargamos el paquete e1071:
# Install e1071 (provides naiveBayes) only when it is missing, so re-running
# the script does not reinstall it every time; then attach it.
if (!requireNamespace("e1071", quietly = TRUE)) {
  install.packages("e1071")
}
library(e1071)
  2. Construimos el modelo Naive-Bayes:
# Fit a Naive Bayes classifier for Class on the training set, using every
# other column as a predictor, and print the fitted model.
# NOTE(review): the fit uses the default laplace setting; consider whether
# smoothing is wanted, since some class/level combinations may be unobserved.
modeloNB <- naiveBayes(Class ~ ., data = BreastCancer.entr)
print(modeloNB)
## 
## Naive Bayes Classifier for Discrete Predictors
## 
## Call:
## naiveBayes.default(x = X, y = Y, laplace = laplace)
## 
## A-priori probabilities:
## Y
##    benign malignant 
## 0.6430063 0.3569937 
## 
## Conditional probabilities:
##            Cl.thickness
## Y                     1           2           3           4           5
##   benign    0.314935065 0.090909091 0.214285714 0.146103896 0.198051948
##   malignant 0.017543860 0.017543860 0.052631579 0.052631579 0.192982456
##            Cl.thickness
## Y                     6           7           8           9          10
##   benign    0.032467532 0.000000000 0.003246753 0.000000000 0.000000000
##   malignant 0.070175439 0.076023392 0.169590643 0.046783626 0.304093567
## 
##            Cell.size
## Y                     1           2           3           4           5
##   benign    0.818181818 0.081168831 0.074675325 0.016233766 0.000000000
##   malignant 0.011695906 0.029239766 0.116959064 0.099415205 0.140350877
##            Cell.size
## Y                     6           7           8           9          10
##   benign    0.000000000 0.003246753 0.003246753 0.003246753 0.000000000
##   malignant 0.087719298 0.081871345 0.116959064 0.017543860 0.298245614
## 
##            Cell.shape
## Y                     1           2           3           4           5
##   benign    0.762987013 0.120129870 0.071428571 0.029220779 0.003246753
##   malignant 0.005847953 0.017543860 0.105263158 0.128654971 0.128654971
##            Cell.shape
## Y                     6           7           8           9          10
##   benign    0.003246753 0.006493506 0.003246753 0.000000000 0.000000000
##   malignant 0.093567251 0.140350877 0.099415205 0.035087719 0.245614035
## 
##            Marg.adhesion
## Y                     1           2           3           4           5
##   benign    0.811688312 0.100649351 0.058441558 0.012987013 0.006493506
##   malignant 0.134502924 0.070175439 0.122807018 0.122807018 0.081871345
##            Marg.adhesion
## Y                     6           7           8           9          10
##   benign    0.006493506 0.000000000 0.000000000 0.000000000 0.003246753
##   malignant 0.087719298 0.046783626 0.087719298 0.017543860 0.228070175
## 
##            Epith.c.size
## Y                     1           2           3           4           5
##   benign    0.103896104 0.795454545 0.064935065 0.016233766 0.006493506
##   malignant 0.005847953 0.099415205 0.175438596 0.169590643 0.146198830
##            Epith.c.size
## Y                     6           7           8           9          10
##   benign    0.003246753 0.006493506 0.003246753 0.000000000 0.000000000
##   malignant 0.163742690 0.035087719 0.070175439 0.005847953 0.128654971
## 
##            Bare.nuclei
## Y                     1           2           3           4           5
##   benign    0.863636364 0.048701299 0.035714286 0.012987013 0.025974026
##   malignant 0.058479532 0.029239766 0.064327485 0.052631579 0.076023392
##            Bare.nuclei
## Y                     6           7           8           9          10
##   benign    0.000000000 0.000000000 0.006493506 0.000000000 0.006493506
##   malignant 0.017543860 0.035087719 0.076023392 0.029239766 0.561403509
## 
##            Bl.cromatin
## Y                     1           2           3           4           5
##   benign    0.340909091 0.331168831 0.279220779 0.016233766 0.012987013
##   malignant 0.011695906 0.029239766 0.175438596 0.116959064 0.140350877
##            Bl.cromatin
## Y                     6           7           8           9          10
##   benign    0.003246753 0.016233766 0.000000000 0.000000000 0.000000000
##   malignant 0.023391813 0.263157895 0.116959064 0.035087719 0.087719298
## 
##            Normal.nucleoli
## Y                     1           2           3           4           5
##   benign    0.883116883 0.061688312 0.025974026 0.003246753 0.003246753
##   malignant 0.169590643 0.035087719 0.134502924 0.058479532 0.064327485
##            Normal.nucleoli
## Y                     6           7           8           9          10
##   benign    0.009740260 0.006493506 0.006493506 0.000000000 0.000000000
##   malignant 0.076023392 0.064327485 0.081871345 0.058479532 0.257309942
## 
##            Mitoses
## Y                     1           2           3           4           5
##   benign    0.967532468 0.022727273 0.003246753 0.000000000 0.000000000
##   malignant 0.526315789 0.122807018 0.140350877 0.052631579 0.029239766
##            Mitoses
## Y                     6           7           8          10
##   benign    0.000000000 0.003246753 0.003246753 0.000000000
##   malignant 0.005847953 0.029239766 0.035087719 0.058479532
  3. Lo aplicamos al conjunto test:
# Predict the class of every test row from the nine predictor columns, then
# cross-tabulate the true class (rows) against the predicted class (columns).
preditest <- predict(modeloNB, newdata = BreastCancer.test[, 1:9])
confutest <- table(BreastCancer.test$Class, preditest)
print(confutest)
##            preditest
##             benign malignant
##   benign       132         4
##   malignant      1        67
  4. Obtenemos las medidas de rendimiento del modeloNB:
# Test-set performance of the Naive Bayes model. Rows of confutest hold the
# true class and columns the predicted class; "malignant" is treated as the
# positive class, so sensitivity is the share of true malignant cases that
# were predicted malignant and specificity the share of true benign cases
# that were predicted benign.
cat(" Tasa de acierto test= \t",
    100 * sum(diag(confutest)) / sum(confutest), "\n",
    "Sensitividad test= \t",
    100 * confutest["malignant", "malignant"] / sum(confutest["malignant", ]), "\n",
    "Especificidad test=  \t",
    100 * confutest["benign", "benign"] / sum(confutest["benign", ]), "\n")
##  Tasa de acierto test=    97.54902 
##  Sensitividad test=   98.52941 
##  Especificidad test=      97.05882
# Confusion matrix plus a column with the percentage of correctly classified
# cases within each true class (row-wise proportions on the diagonal).
cbind(confutest,
      Acierto = round(100 * diag(prop.table(confutest, margin = 1)), 2))
##           benign malignant Acierto
## benign       132         4   97.06
## malignant      1        67   98.53
# A naiveBayes fit has no `prob` component, so `modeloNB$prob[,2]` silently
# evaluated to NULL. Take the posterior probability of the second class
# ("malignant") for each test row from predict(..., type = "raw") instead.
probabitest_g <- predict(modeloNB, BreastCancer.test[, 1:9], type = "raw")[, 2]
  5. Instalamos los paquetes ROCR y gplots para calcular el área bajo la curva COR en el conjunto test:
# Install ROCR and gplots only when they are missing, so re-running the
# script does not reinstall them every time; then attach ROCR.
for (paquete in c("ROCR", "gplots")) {
  if (!requireNamespace(paquete, quietly = TRUE)) {
    install.packages(paquete)
  }
}
library(ROCR)
## Loading required package: gplots
## 
## Attaching package: 'gplots'
## The following object is masked from 'package:stats':
## 
##     lowess
# ROC curve and AUC of the Naive Bayes model on the test set.
# Column 2 of the "raw" prediction is the posterior probability of the
# second class ("malignant").
probabi <- predict(modeloNB, BreastCancer.test[, 1:9], type = "raw")[, 2]
prediobj <- prediction(probabi, BreastCancer.test[, 10])
plot(performance(prediobj, "tpr", "fpr"),
     main = "COR TEST. Naive Bayes, BreastCancer",
     xlab = "Tasa de falsos positivos",
     ylab = "Tasa de verdaderos positivos")
# Dashed blue diagonal y = x drawn as a visual reference.
abline(a = 0, b = 1, col = "blue", lty = 2)
# The "auc" performance object stores its single value in a one-element list.
auc <- as.numeric(performance(prediobj, "auc")@y.values[[1]])
legend("bottomright", legend = paste("AUC=", round(auc, 3)))

cat("Área bajo la curva COR Test= ", auc, "\n")
## Área bajo la curva COR Test=  0.9940528

Construcción de KKNN en el conjunto de entrenamiento

  1. Instalamos y cargamos el paquete kknn:
# Install kknn only when it is missing, so re-running the script does not
# reinstall it every time; then attach it.
if (!requireNamespace("kknn", quietly = TRUE)) {
  install.packages("kknn")
}
library(kknn)
  2. Construimos el modelo KKNN (configurando los parámetros mediante validación cruzada):
# Tune the KKNN classifier on the training set with train.kknn: up to
# kmax = 15 neighbours and four candidate kernels are compared, with
# distance = 2. The tuning result is then printed.
fit.train1 <- train.kknn(
  formula = Class ~ .,
  data = BreastCancer.entr,
  kmax = 15,
  distance = 2,
  kernel = c("triangular", "rectangular", "epanechnikov", "optimal")
)
print(fit.train1)
## 
## Call:
## train.kknn(formula = Class ~ ., data = BreastCancer.entr, kmax = 15,     distance = 2, kernel = c("triangular", "rectangular", "epanechnikov",         "optimal"))
## 
## Type of response variable: nominal
## Minimal misclassification: 0.03549061
## Best kernel: triangular
## Best k: 3
  • Seleccionamos el “mejor” k y el “mejor” núcleo:
# Extract and print the best number of neighbours and the best kernel
# reported by train.kknn.
k <- fit.train1$best.parameters$k
print(k)
## [1] 3
nucleo <- fit.train1$best.parameters$kernel
print(nucleo)
## [1] "triangular"
## [1] "triangular"
  • Calculamos las predicciones test con la función kknn:
# Fit KKNN on the training set with the selected k and kernel, predicting the
# test rows, then cross-tabulate true class (rows) against fitted class
# (columns) and print the confusion matrix.
modeloKKNN <- kknn(
  formula = Class ~ .,
  train = BreastCancer.entr,
  test = BreastCancer.test,
  k = k,
  distance = 2,
  kernel = nucleo
)
confutest <- table(BreastCancer.test$Class, modeloKKNN$fitted.values)
print(confutest)
##            
##             benign malignant
##   benign       133         3
##   malignant      2        66
  3. Obtenemos las medidas de rendimiento para el modeloKKNN:
# Test-set performance of the KKNN model. Rows of confutest hold the true
# class and columns the fitted class; "malignant" is treated as the positive
# class for sensitivity, "benign" as the negative class for specificity.
cat(" Tasa de acierto test= \t",
    100 * sum(diag(confutest)) / sum(confutest), "\n",
    "Sensitividad test= \t",
    100 * confutest["malignant", "malignant"] / sum(confutest["malignant", ]), "\n",
    "Especificidad test=  \t",
    100 * confutest["benign", "benign"] / sum(confutest["benign", ]), "\n")
##  Tasa de acierto test=    97.54902 
##  Sensitividad test=   97.05882 
##  Especificidad test=      97.79412
# Confusion matrix plus the percentage of correctly classified cases within
# each true class.
cbind(confutest,
      Acierto = round(100 * diag(prop.table(confutest, margin = 1)), 2))
##           benign malignant Acierto
## benign       133         3   97.79
## malignant      2        66   97.06
# Second column of the KKNN class-probability matrix
# (presumably the "malignant" class, the second factor level; confirm).
probabitest_g <- modeloKKNN$prob[, 2]
  4. Instalamos los paquetes ROCR y gplots para calcular el área bajo la curva COR en el conjunto test:
# Install gplots and ROCR only when they are missing (they may already have
# been installed earlier in this script); then attach ROCR.
for (paquete in c("gplots", "ROCR")) {
  if (!requireNamespace(paquete, quietly = TRUE)) {
    install.packages(paquete)
  }
}
library(ROCR)
# ROC curve and AUC of the KKNN model on the test set, built from the class
# probabilities stored in probabitest_g and the true test labels.
pred <- prediction(probabitest_g, BreastCancer.test$Class)
perf <- performance(pred, "tpr", "fpr")
plot(perf, main = "COR Test. KKNN, BreastCancer")
# Dashed blue diagonal y = x drawn as a visual reference, plus a grid.
abline(a = 0, b = 1, col = "blue", lty = 2)
grid()
# The "auc" performance object stores its single value in a one-element list.
auc <- as.numeric(performance(pred, "auc")@y.values[[1]])
legend("bottomright", legend = paste("AUC=", round(auc, 3)))

cat("Área bajo la curva COR Test= ", auc, "\n")
## Área bajo la curva COR Test=  0.9799957

EJERCICIO 2

Instalamos el paquete mlbench y lo cargamos:

# The typographic quotes in the original are not valid R string delimiters.
# Install mlbench only when it is missing, then attach it.
if (!requireNamespace("mlbench", quietly = TRUE)) {
  install.packages("mlbench")
}

library("mlbench")

# Load the Glass data set and make sure it is a plain data.frame.
# (The original had several statements fused on one line, which does not parse.)
data(Glass)
Glass <- data.frame(Glass)

# Dimensions, number of observations, per-column summary and row indices.
dim(Glass)
n <- nrow(Glass)
summary(Glass)
indices <- seq_len(n)

Partición 70% entrenamiento - 30% test

# Draw 30% of the row indices (rounded down) at random for the test set;
# all remaining rows form the training set.
# (The original had the three statements fused on one line, which does not parse.)
inditest <- sample(seq_len(n), size = trunc(3 * n / 10))
Glass.entr <- Glass[-inditest, ]
Glass.test <- Glass[inditest, ]

Construcción de KKNN en el conjunto de entrenamiento

1) Instalamos y cargamos el paquete kknn

# The typographic quotes in the original are not valid R string delimiters.
# Install kknn only when it is missing, then attach it.
if (!requireNamespace("kknn", quietly = TRUE)) {
  install.packages("kknn")
}

library(kknn)

2) Construimos el modelo KKNN (configurando los parámetros mediante validación cruzada)

# Tune the KKNN classifier for Type on the Glass training set with
# train.kknn: up to kmax = 15 neighbours and four candidate kernels are
# compared, with distance = 2. The outer parentheses print the result.
# (Kernel names are now delimited with plain ASCII quotes so the call parses.)
(fit.train1 <- train.kknn(
  Type ~ ., Glass.entr, kmax = 15,
  kernel = c("triangular", "rectangular",
             "epanechnikov", "optimal"),
  distance = 2))

Seleccionamos el “mejor” k y el “mejor” núcleo

# Extract (and print) the best number of neighbours and the best kernel, and
# show the distance parameter stored in the tuning result.
# (Restores the `$` accessors that were garbled into `\(...\)` and splits the
# statements that had been fused on one line.)
(k <- fit.train1$best.parameters$k)
(nucleo <- fit.train1$best.parameters$kernel)
fit.train1$distance

Calculamos las predicciones test con la función kknn

# Fit KKNN on the Glass training set with the selected k and kernel,
# predicting the test rows.
modeloKKNN <- kknn(Type ~ ., Glass.entr, k = k,
                   test = Glass.test,
                   kernel = nucleo, distance = 2)

# Confusion matrix: true Type in rows, fitted Type in columns.
# (Restores the `$` accessors that were garbled into `\(...\)` and separates
# the two statements that had been fused on one line.)
confutest <- table(Glass.test$Type, modeloKKNN$fitted.values)
confutest

3) Obtenemos las medidas de rendimiento para el modeloKKNN

# Test-set performance of the KKNN model on Glass.
# Type has more than two classes, so the two-class formulas copied from the
# BreastCancer exercise (cells [1,1] and [2,2] only) do not apply here:
#   * overall accuracy is the sum of the whole diagonal over the total;
#   * sensitivity and specificity are reported per class, one-vs-rest.
# The original line had also lost its `*` operators and string quotes.
total_test <- sum(confutest)
aciertos_clase <- diag(confutest)
reales_clase <- rowSums(confutest)      # true cases of each class
predichos_clase <- colSums(confutest)   # cases predicted as each class
cat(" Tasa de acierto test= \t",
    100 * sum(aciertos_clase) / total_test, "\n")
# For class i: true negatives = total - reales_i - predichos_i + aciertos_i,
# and the negatives are all cases whose true class is not i.
verdaderos_neg <- total_test - reales_clase - predichos_clase + aciertos_clase
cbind(confutest,
      Acierto = round(100 * aciertos_clase / reales_clase, 2),
      Especificidad = round(100 * verdaderos_neg / (total_test - reales_clase), 2))
# Second column of the KKNN class-probability matrix.
probabitest_g <- modeloKKNN$prob[, 2]

4) Instalamos los paquetes ROCR y gplots para calcular el área bajo la curva COR en el conjunto test

# Install gplots and ROCR only when they are missing, then attach ROCR.
# (The original used typographic quotes and fused statements, which do not parse.)
for (paquete in c("gplots", "ROCR")) {
  if (!requireNamespace(paquete, quietly = TRUE)) {
    install.packages(paquete)
  }
}

library(ROCR)
# ROCR's prediction() only supports two-class labels, while Type has more
# than two classes. Build a one-vs-rest problem for the class whose
# probability is taken from column 2 of the KKNN probability matrix.
# NOTE(review): assumes the columns of modeloKKNN$prob follow
# levels(Glass.test$Type); confirm.
clase_pos <- levels(Glass.test$Type)[2]
probabitest_g <- modeloKKNN$prob[, 2]
etiquetas <- as.integer(Glass.test$Type == clase_pos)  # 1 = clase_pos, 0 = rest
pred <- prediction(probabitest_g, etiquetas)
perf <- performance(pred, "tpr", "fpr")
plot(perf, main = paste("KKNN, Glass. Clase", clase_pos, "vs. resto"))
# Dashed blue diagonal y = x drawn as a visual reference, plus a grid.
abline(a = 0, b = 1, col = "blue", lty = 2)
grid()
auc <- as.numeric(performance(pred, "auc")@y.values)
legend("bottomright", legend = paste("AUC=", round(auc, 3)))
cat("Área bajo la curva COR Test= ", auc, "\n")