Conjunto treinamento e teste

set.seed(123)
ind<-sample(2,nrow(iris),replace = TRUE, prob = c(0.7,0.3))
treino<-iris[ind==1,]
teste<-iris[ind==2,]

dim(treino)
## [1] 106   5
dim(teste)
## [1] 44  5
table(treino$Species)
## 
##     setosa versicolor  virginica 
##         35         36         35
table(teste$Species)
## 
##     setosa versicolor  virginica 
##         15         14         15
# outra maneira de fazer

library(caTools)

set.seed(123)
divisao <- sample.split(iris$Species, SplitRatio = 0.7)
divisao
##   [1]  TRUE FALSE  TRUE FALSE FALSE  TRUE  TRUE FALSE  TRUE  TRUE FALSE
##  [12]  TRUE  TRUE  TRUE  TRUE FALSE  TRUE  TRUE  TRUE FALSE FALSE  TRUE
##  [23]  TRUE FALSE  TRUE FALSE  TRUE  TRUE  TRUE  TRUE FALSE FALSE  TRUE
##  [34] FALSE  TRUE  TRUE FALSE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
##  [45]  TRUE  TRUE  TRUE  TRUE  TRUE FALSE  TRUE  TRUE FALSE  TRUE  TRUE
##  [56]  TRUE  TRUE FALSE FALSE  TRUE  TRUE  TRUE  TRUE  TRUE FALSE  TRUE
##  [67] FALSE FALSE FALSE  TRUE FALSE  TRUE FALSE  TRUE  TRUE  TRUE  TRUE
##  [78]  TRUE  TRUE  TRUE  TRUE FALSE  TRUE FALSE  TRUE  TRUE FALSE FALSE
##  [89] FALSE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE FALSE  TRUE  TRUE
## [100]  TRUE  TRUE  TRUE  TRUE FALSE  TRUE FALSE FALSE  TRUE  TRUE  TRUE
## [111] FALSE  TRUE  TRUE FALSE FALSE  TRUE  TRUE FALSE  TRUE  TRUE  TRUE
## [122]  TRUE  TRUE  TRUE  TRUE FALSE  TRUE  TRUE  TRUE  TRUE  TRUE FALSE
## [133]  TRUE FALSE  TRUE  TRUE FALSE FALSE FALSE  TRUE  TRUE  TRUE  TRUE
## [144]  TRUE FALSE  TRUE  TRUE  TRUE  TRUE FALSE
treinamento <- subset(iris, divisao == TRUE)
treinamento
teste <- subset(iris, divisao == FALSE)
teste

Árvores

library(rpart)
classificador1 <- rpart(formula =  Species ~ ., data = treino)
print(classificador1)
## n= 106 
## 
## node), split, n, loss, yval, (yprob)
##       * denotes terminal node
## 
## 1) root 106 70 versicolor (0.33018868 0.33962264 0.33018868)  
##   2) Petal.Length< 2.45 35  0 setosa (1.00000000 0.00000000 0.00000000) *
##   3) Petal.Length>=2.45 71 35 versicolor (0.00000000 0.50704225 0.49295775)  
##     6) Petal.Length< 4.75 34  0 versicolor (0.00000000 1.00000000 0.00000000) *
##     7) Petal.Length>=4.75 37  2 virginica (0.00000000 0.05405405 0.94594595) *
plot(classificador1)
text(classificador1)

# Melhorando o gráfico

library(rpart.plot)
rpart.plot(classificador1)

Matriz de Confusão

A Matriz de confusão (tradução livre) é uma matriz de valores reais e valores preditos pelo seu classificador.

É utilizada para modelos de classificação e tem como objetivo calcular a quantidade de falso positivo e falso negativo; e de verdadeiro positivo e verdadeiro negativo, além de te fornecer a acurácia e sensibilidade.

Matriz de confusão

Matriz de confusão

#### Matriz de confusão

previsoes = predict(classificador1, newdata = teste[-5], type = 'class')
matriz_confusao = table(teste[, 5], previsoes)
print(matriz_confusao)
##             previsoes
##              setosa versicolor virginica
##   setosa         15          0         0
##   versicolor      0         11         4
##   virginica       0          1        14
library(caret)
confusionMatrix(matriz_confusao)
## Confusion Matrix and Statistics
## 
##             previsoes
##              setosa versicolor virginica
##   setosa         15          0         0
##   versicolor      0         11         4
##   virginica       0          1        14
## 
## Overall Statistics
##                                           
##                Accuracy : 0.8889          
##                  95% CI : (0.7595, 0.9629)
##     No Information Rate : 0.4             
##     P-Value [Acc > NIR] : 1.248e-11       
##                                           
##                   Kappa : 0.8333          
##                                           
##  Mcnemar's Test P-Value : NA              
## 
## Statistics by Class:
## 
##                      Class: setosa Class: versicolor Class: virginica
## Sensitivity                 1.0000            0.9167           0.7778
## Specificity                 1.0000            0.8788           0.9630
## Pos Pred Value              1.0000            0.7333           0.9333
## Neg Pred Value              1.0000            0.9667           0.8667
## Prevalence                  0.3333            0.2667           0.4000
## Detection Rate              0.3333            0.2444           0.3111
## Detection Prevalence        0.3333            0.3333           0.3333
## Balanced Accuracy           1.0000            0.8977           0.8704

Métricas de erro

No caso de modelos de regressão as medidas de erro baseiam-se no cálculo da diferença numérica entre os valores previstos e os valores reais.

Random Forest

Consiste, basicamente, de uma coleção de árvores de decisão na qual podemos pesquisar (consultar), sendo que a decisão final é computada pelo voto da maioria.

library(randomForest)

classificador2 <- randomForest(Species ~ .,treino,ntree=500)
summary(classificador2)
##                 Length Class  Mode     
## call               4   -none- call     
## type               1   -none- character
## predicted        106   factor numeric  
## err.rate        2000   -none- numeric  
## confusion         12   -none- numeric  
## votes            318   matrix numeric  
## oob.times        106   -none- numeric  
## classes            3   -none- character
## importance         4   -none- numeric  
## importanceSD       0   -none- NULL     
## localImportance    0   -none- NULL     
## proximity          0   -none- NULL     
## ntree              1   -none- numeric  
## mtry               1   -none- numeric  
## forest            14   -none- list     
## y                106   factor numeric  
## test               0   -none- NULL     
## inbag              0   -none- NULL     
## terms              3   terms  call
predicted = predict(classificador2,teste)
predicted
##          2          4          5          8         11         16 
##     setosa     setosa     setosa     setosa     setosa     setosa 
##         20         21         24         26         31         32 
##     setosa     setosa     setosa     setosa     setosa     setosa 
##         34         37         50         53         58         59 
##     setosa     setosa     setosa versicolor versicolor versicolor 
##         65         67         68         69         71         73 
## versicolor versicolor versicolor versicolor  virginica  virginica 
##         82         84         87         88         89         97 
## versicolor  virginica versicolor versicolor versicolor versicolor 
##        104        106        107        111        114        115 
##  virginica  virginica versicolor  virginica  virginica  virginica 
##        118        126        132        134        137        138 
##  virginica  virginica  virginica  virginica  virginica  virginica 
##        139        145        150 
##  virginica  virginica  virginica 
## Levels: setosa versicolor virginica
plot(classificador2)

previsoes2 = predict(classificador2, newdata = teste[-5], type = 'class')
matriz_confusao2 = table(teste[, 5], previsoes2)
print(matriz_confusao2)
##             previsoes2
##              setosa versicolor virginica
##   setosa         15          0         0
##   versicolor      0         12         3
##   virginica       0          1        14
confusionMatrix(matriz_confusao2)
## Confusion Matrix and Statistics
## 
##             previsoes2
##              setosa versicolor virginica
##   setosa         15          0         0
##   versicolor      0         12         3
##   virginica       0          1        14
## 
## Overall Statistics
##                                           
##                Accuracy : 0.9111          
##                  95% CI : (0.7878, 0.9752)
##     No Information Rate : 0.3778          
##     P-Value [Acc > NIR] : 1.099e-13       
##                                           
##                   Kappa : 0.8667          
##                                           
##  Mcnemar's Test P-Value : NA              
## 
## Statistics by Class:
## 
##                      Class: setosa Class: versicolor Class: virginica
## Sensitivity                 1.0000            0.9231           0.8235
## Specificity                 1.0000            0.9062           0.9643
## Pos Pred Value              1.0000            0.8000           0.9333
## Neg Pred Value              1.0000            0.9667           0.9000
## Prevalence                  0.3333            0.2889           0.3778
## Detection Rate              0.3333            0.2667           0.3111
## Detection Prevalence        0.3333            0.3333           0.3333
## Balanced Accuracy           1.0000            0.9147           0.8939
# Ordem de importância das variáveis 

library(caret)
import1 <- varImp(classificador1, scale = FALSE)
import1
import2 <- varImp(classificador2, scale = FALSE)
import2

K-Vizinhos mais próximos (k-NN)

Este método realiza a previsão de novos exemplos sem criar explicitamente um modelo a partir dos dados disponíveis para o treino.

library(caTools)

set.seed(123)
divisao = sample.split(iris$Species, SplitRatio = 0.70)
treino = subset(iris, divisao == TRUE)
teste = subset(iris, divisao == FALSE)

library(class)

classificador3<- knn(treino[,-5],teste[,-5],cl = treino[,5],k=5)
print(classificador3)
##  [1] setosa     setosa     setosa     setosa     setosa     setosa    
##  [7] setosa     setosa     setosa     setosa     setosa     setosa    
## [13] setosa     setosa     setosa     versicolor versicolor versicolor
## [19] versicolor versicolor versicolor virginica  versicolor virginica 
## [25] versicolor virginica  versicolor versicolor versicolor versicolor
## [31] virginica  virginica  versicolor virginica  virginica  virginica 
## [37] virginica  virginica  virginica  virginica  virginica  virginica 
## [43] virginica  virginica  virginica 
## Levels: setosa versicolor virginica
matriz_confusao3 = table(teste[,5], classificador3)
print(matriz_confusao3)
##             classificador3
##              setosa versicolor virginica
##   setosa         15          0         0
##   versicolor      0         12         3
##   virginica       0          1        14
confusionMatrix(matriz_confusao3)
## Confusion Matrix and Statistics
## 
##             classificador3
##              setosa versicolor virginica
##   setosa         15          0         0
##   versicolor      0         12         3
##   virginica       0          1        14
## 
## Overall Statistics
##                                           
##                Accuracy : 0.9111          
##                  95% CI : (0.7878, 0.9752)
##     No Information Rate : 0.3778          
##     P-Value [Acc > NIR] : 1.099e-13       
##                                           
##                   Kappa : 0.8667          
##                                           
##  Mcnemar's Test P-Value : NA              
## 
## Statistics by Class:
## 
##                      Class: setosa Class: versicolor Class: virginica
## Sensitivity                 1.0000            0.9231           0.8235
## Specificity                 1.0000            0.9062           0.9643
## Pos Pred Value              1.0000            0.8000           0.9333
## Neg Pred Value              1.0000            0.9667           0.9000
## Prevalence                  0.3333            0.2889           0.3778
## Detection Rate              0.3333            0.2667           0.3111
## Detection Prevalence        0.3333            0.3333           0.3333
## Balanced Accuracy           1.0000            0.9147           0.8939
## Escalonamento

rm(list=ls())

iris[,1:4]=scale(iris[,1:4])

set.seed(123)
divisao = sample.split(iris$Species, SplitRatio = 0.70)
treino = subset(iris, divisao == TRUE)
teste = subset(iris, divisao == FALSE)

classificador4<- knn(treino[,-5],teste[,-5],cl = treino[,5],k=5)
print(classificador4)
##  [1] setosa     setosa     setosa     setosa     setosa     setosa    
##  [7] setosa     setosa     setosa     setosa     setosa     setosa    
## [13] setosa     setosa     setosa     versicolor versicolor versicolor
## [19] versicolor versicolor versicolor virginica  versicolor virginica 
## [25] versicolor virginica  versicolor versicolor versicolor versicolor
## [31] virginica  virginica  versicolor virginica  virginica  virginica 
## [37] virginica  virginica  virginica  versicolor virginica  virginica 
## [43] versicolor virginica  versicolor
## Levels: setosa versicolor virginica
matriz_confusao4 = table(teste[,5], classificador4)
print(matriz_confusao4)
##             classificador4
##              setosa versicolor virginica
##   setosa         15          0         0
##   versicolor      0         12         3
##   virginica       0          4        11
confusionMatrix(matriz_confusao4)
## Confusion Matrix and Statistics
## 
##             classificador4
##              setosa versicolor virginica
##   setosa         15          0         0
##   versicolor      0         12         3
##   virginica       0          4        11
## 
## Overall Statistics
##                                           
##                Accuracy : 0.8444          
##                  95% CI : (0.7054, 0.9351)
##     No Information Rate : 0.3556          
##     P-Value [Acc > NIR] : 1.996e-11       
##                                           
##                   Kappa : 0.7667          
##                                           
##  Mcnemar's Test P-Value : NA              
## 
## Statistics by Class:
## 
##                      Class: setosa Class: versicolor Class: virginica
## Sensitivity                 1.0000            0.7500           0.7857
## Specificity                 1.0000            0.8966           0.8710
## Pos Pred Value              1.0000            0.8000           0.7333
## Neg Pred Value              1.0000            0.8667           0.9000
## Prevalence                  0.3333            0.3556           0.3111
## Detection Rate              0.3333            0.2667           0.2444
## Detection Prevalence        0.3333            0.3333           0.3333
## Balanced Accuracy           1.0000            0.8233           0.8283
### Normalização

rm(list=ls())
normalize <- function(x){
  return ((x-min(x))/(max(x)-min(x)))
}

iris[,1:4]=normalize(iris[,1:4])

set.seed(123)
divisao = sample.split(iris$Species, SplitRatio = 0.70)
treino = subset(iris, divisao == TRUE)
teste = subset(iris, divisao == FALSE)

classificador5<- knn(treino[,-5],teste[,-5],cl = treino[,5],k=5)
print(classificador5)
##  [1] setosa     setosa     setosa     setosa     setosa     setosa    
##  [7] setosa     setosa     setosa     setosa     setosa     setosa    
## [13] setosa     setosa     setosa     versicolor versicolor versicolor
## [19] versicolor versicolor versicolor virginica  versicolor virginica 
## [25] versicolor virginica  versicolor versicolor versicolor versicolor
## [31] virginica  virginica  versicolor virginica  virginica  virginica 
## [37] virginica  virginica  virginica  virginica  virginica  virginica 
## [43] virginica  virginica  virginica 
## Levels: setosa versicolor virginica
matriz_confusao5 = table(teste[,5], classificador5)
print(matriz_confusao5)
##             classificador5
##              setosa versicolor virginica
##   setosa         15          0         0
##   versicolor      0         12         3
##   virginica       0          1        14
confusionMatrix(matriz_confusao5)
## Confusion Matrix and Statistics
## 
##             classificador5
##              setosa versicolor virginica
##   setosa         15          0         0
##   versicolor      0         12         3
##   virginica       0          1        14
## 
## Overall Statistics
##                                           
##                Accuracy : 0.9111          
##                  95% CI : (0.7878, 0.9752)
##     No Information Rate : 0.3778          
##     P-Value [Acc > NIR] : 1.099e-13       
##                                           
##                   Kappa : 0.8667          
##                                           
##  Mcnemar's Test P-Value : NA              
## 
## Statistics by Class:
## 
##                      Class: setosa Class: versicolor Class: virginica
## Sensitivity                 1.0000            0.9231           0.8235
## Specificity                 1.0000            0.9062           0.9643
## Pos Pred Value              1.0000            0.8000           0.9333
## Neg Pred Value              1.0000            0.9667           0.9000
## Prevalence                  0.3333            0.2889           0.3778
## Detection Rate              0.3333            0.2667           0.3111
## Detection Prevalence        0.3333            0.3333           0.3333
## Balanced Accuracy           1.0000            0.9147           0.8939

Os valores na diagonal mostram o número de instâncias classificadas corretamente. Os valores fora da diagonal indicam que eles foram incorretamente instâncias.

Naive Bayes

O método Naive Bayes utiliza o conceito de probabilidade condicional por meio do teorema de Bayes.

rm(list=ls())

set.seed(123)
divisao = sample.split(iris$Species, SplitRatio = 0.70)
treino = subset(iris, divisao == TRUE)
teste = subset(iris, divisao == FALSE)

library(e1071)
classificador6<- naiveBayes(Species ~., treino)
print(classificador6)
## 
## Naive Bayes Classifier for Discrete Predictors
## 
## Call:
## naiveBayes.default(x = X, y = Y, laplace = laplace)
## 
## A-priori probabilities:
## Y
##     setosa versicolor  virginica 
##  0.3333333  0.3333333  0.3333333 
## 
## Conditional probabilities:
##             Sepal.Length
## Y                [,1]      [,2]
##   setosa     4.940000 0.3541352
##   versicolor 5.920000 0.5166635
##   virginica  6.634286 0.5422952
## 
##             Sepal.Width
## Y                [,1]      [,2]
##   setosa     3.405714 0.3685766
##   versicolor 2.777143 0.3144423
##   virginica  2.925714 0.2831990
## 
##             Petal.Length
## Y                [,1]      [,2]
##   setosa     1.445714 0.1930298
##   versicolor 4.217143 0.4462166
##   virginica  5.565714 0.5075563
## 
##             Petal.Width
## Y                 [,1]      [,2]
##   setosa     0.2428571 0.1092372
##   versicolor 1.3114286 0.1827429
##   virginica  2.0428571 0.2714728
previsoes3 = predict(classificador6, teste)
matriz_confusao6 = table(teste[,5],previsoes3)
confusionMatrix(matriz_confusao6)
## Confusion Matrix and Statistics
## 
##             previsoes3
##              setosa versicolor virginica
##   setosa         15          0         0
##   versicolor      0         12         3
##   virginica       0          2        13
## 
## Overall Statistics
##                                           
##                Accuracy : 0.8889          
##                  95% CI : (0.7595, 0.9629)
##     No Information Rate : 0.3556          
##     P-Value [Acc > NIR] : 1.581e-13       
##                                           
##                   Kappa : 0.8333          
##                                           
##  Mcnemar's Test P-Value : NA              
## 
## Statistics by Class:
## 
##                      Class: setosa Class: versicolor Class: virginica
## Sensitivity                 1.0000            0.8571           0.8125
## Specificity                 1.0000            0.9032           0.9310
## Pos Pred Value              1.0000            0.8000           0.8667
## Neg Pred Value              1.0000            0.9333           0.9000
## Prevalence                  0.3333            0.3111           0.3556
## Detection Rate              0.3333            0.2667           0.2889
## Detection Prevalence        0.3333            0.3333           0.3333
## Balanced Accuracy           1.0000            0.8802           0.8718

Support Vector Machine - SVM

library(ggplot2)

set.ver<-subset(iris)
ggplot(set.ver, aes(x=Petal.Length, y = Petal.Width, color =Species)) +
  geom_point(size=4) + 
  scale_color_manual(values = c("black","white","red")) +
  geom_point(shape=1,size =4, color = "black") + theme(panel.grid.major = element_blank(),panel.grid.minor=element_blank())

rm(list=ls())

set.seed(123)
divisao = sample.split(iris$Species, SplitRatio = 0.70)
treino = subset(iris, divisao == TRUE)
teste = subset(iris, divisao == FALSE)

library(e1071)
classificador7 = svm(formula = treino$Species ~ ., data = treino, type = 'C-classification', kernel = 'radial')

previsoes4 = predict(classificador7, newdata = teste[-5])
matriz_confusao7 = table(teste[,5], previsoes4)
confusionMatrix(matriz_confusao7)
## Confusion Matrix and Statistics
## 
##             previsoes4
##              setosa versicolor virginica
##   setosa         15          0         0
##   versicolor      0         12         3
##   virginica       0          1        14
## 
## Overall Statistics
##                                           
##                Accuracy : 0.9111          
##                  95% CI : (0.7878, 0.9752)
##     No Information Rate : 0.3778          
##     P-Value [Acc > NIR] : 1.099e-13       
##                                           
##                   Kappa : 0.8667          
##                                           
##  Mcnemar's Test P-Value : NA              
## 
## Statistics by Class:
## 
##                      Class: setosa Class: versicolor Class: virginica
## Sensitivity                 1.0000            0.9231           0.8235
## Specificity                 1.0000            0.9062           0.9643
## Pos Pred Value              1.0000            0.8000           0.9333
## Neg Pred Value              1.0000            0.9667           0.9000
## Prevalence                  0.3333            0.2889           0.3778
## Detection Rate              0.3333            0.2667           0.3111
## Detection Prevalence        0.3333            0.3333           0.3333
## Balanced Accuracy           1.0000            0.9147           0.8939
# Pacote kernlab

library(kernlab)
## 
## Attaching package: 'kernlab'
## The following object is masked from 'package:ggplot2':
## 
##     alpha
classificador8 = ksvm(treino$Species ~ ., data = treino,kernel = "vanilladot")
##  Setting default kernel parameters
previsoes5 = predict(classificador8, newdata = teste[-5])
matriz_confusao9 = table(teste[,5], previsoes5)
confusionMatrix(matriz_confusao9)
## Confusion Matrix and Statistics
## 
##             previsoes5
##              setosa versicolor virginica
##   setosa         15          0         0
##   versicolor      0         12         3
##   virginica       0          0        15
## 
## Overall Statistics
##                                          
##                Accuracy : 0.9333         
##                  95% CI : (0.8173, 0.986)
##     No Information Rate : 0.4            
##     P-Value [Acc > NIR] : 6.213e-14      
##                                          
##                   Kappa : 0.9            
##                                          
##  Mcnemar's Test P-Value : NA             
## 
## Statistics by Class:
## 
##                      Class: setosa Class: versicolor Class: virginica
## Sensitivity                 1.0000            1.0000           0.8333
## Specificity                 1.0000            0.9091           1.0000
## Pos Pred Value              1.0000            0.8000           1.0000
## Neg Pred Value              1.0000            1.0000           0.9000
## Prevalence                  0.3333            0.2667           0.4000
## Detection Rate              0.3333            0.2667           0.3333
## Detection Prevalence        0.3333            0.3333           0.3333
## Balanced Accuracy           1.0000            0.9545           0.9167

Redes Neurais Artificiais

rm(list=ls())

set.seed(123)
divisao = sample.split(iris$Species, SplitRatio = 0.70)
treino = subset(iris, divisao == TRUE)
teste = subset(iris, divisao == FALSE)

library(nnet)
classificador9 = nnet(treino$Species ~ ., data = treino,size = 5)
## # weights:  43
## initial  value 127.247024 
## iter  10 value 47.801205
## iter  20 value 9.285821
## iter  30 value 0.063104
## iter  40 value 0.004759
## iter  50 value 0.000197
## final  value 0.000073 
## converged
previsoes6 = predict(classificador9,teste[-5],type = 'class')
matriz_confusao9 = table(teste[,5], previsoes6)
confusionMatrix(matriz_confusao9)
## Confusion Matrix and Statistics
## 
##             previsoes6
##              setosa versicolor virginica
##   setosa         15          0         0
##   versicolor      0         12         3
##   virginica       0          1        14
## 
## Overall Statistics
##                                           
##                Accuracy : 0.9111          
##                  95% CI : (0.7878, 0.9752)
##     No Information Rate : 0.3778          
##     P-Value [Acc > NIR] : 1.099e-13       
##                                           
##                   Kappa : 0.8667          
##                                           
##  Mcnemar's Test P-Value : NA              
## 
## Statistics by Class:
## 
##                      Class: setosa Class: versicolor Class: virginica
## Sensitivity                 1.0000            0.9231           0.8235
## Specificity                 1.0000            0.9062           0.9643
## Pos Pred Value              1.0000            0.8000           0.9333
## Neg Pred Value              1.0000            0.9667           0.9000
## Prevalence                  0.3333            0.2889           0.3778
## Detection Rate              0.3333            0.2667           0.3111
## Detection Prevalence        0.3333            0.3333           0.3333
## Balanced Accuracy           1.0000            0.9147           0.8939