Algoritmo KNN

(Variavel dependente(factor) e 2 variaveis explicativas numéricas) O objetivo é conseguir prever se y é 2 ou 7 consoante os valores de x_1 e x_2

Exportação de Dados

if(!exists("mnist")) mnist <- read_mnist()
head(mnist_27$train$x_1)
## [1] 0.03947368 0.16071429 0.02127660 0.13580247 0.39024390 0.04854369
data.class(mnist_27$train$x_1)
## [1] "numeric"
head(mnist_27$train$x_2)#variaveis independentes
## [1] 0.18421053 0.08928571 0.27659574 0.22222222 0.36585366 0.28155340
data.class(mnist_27$train$x_2)
## [1] "numeric"
head(mnist_27$train$y) #variaveis dependentes
## [1] 2 7 2 2 7 2
## Levels: 2 7
data.class(mnist_27$train$y)
## [1] "factor"
dim(mnist_27$train)
## [1] 800   3

Analise e compreensão dos dados

p = data.frame(mnist_27$train$x_1,mnist_27$train$x_2)
panel.hist <- function(x, ...)
{
  usr <- par("usr"); on.exit(par(usr))
  par(usr = c(usr[1:2], 0, 1.5) )
  h <- hist(x, plot = FALSE)
  breaks <- h$breaks; nB <- length(breaks)
  y <- h$counts; y <- y/max(y)
  rect(breaks[-nB], 0, breaks[-1], y, col = "cyan", ...)
}
panel.cor <- function(x, y, digits = 2, prefix = "", cex.cor, ...)
{
  usr <- par("usr"); on.exit(par(usr))
  par(usr = c(0, 1, 0, 1))
  r <- abs(cor(x, y))
  txt <- format(c(r, 0.123456789), digits = digits)[1]
  txt <- paste0(prefix, txt)
  if(missing(cex.cor)) cex.cor <- 0.8/strwidth(txt)
  text(0.5, 0.5, txt, cex = cex.cor * r)
}
pairs(p, diag.panel = panel.hist, upper.panel = panel.cor,
      lower.panel = panel.smooth)

ggpairs(mnist_27$train, columns = 1:3, ggplot2::aes(colour=y))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

mnist_27$train %>% mutate(y = factor(y)) %>% ggplot(aes(x_1, x_2, fill = y, color=y)) + geom_point(show.legend = FALSE) + stat_ellipse(type="norm")

KNN (Criacao do Modelo e Avaliacao de resultados)

set.seed(2) 
#pick the k in knn
ks <- seq(1, 50, 1)
accuracy <- map_df(ks, function(k){
  fit <- knn3(y ~ ., data = mnist_27$train, k = k)
  y_hat <- predict(fit, mnist_27$train, type = "class")
  cm_train <- confusionMatrix(data = y_hat, reference = mnist_27$train$y)
  train_error <- cm_train$overall["Accuracy"]
  y_hat <- predict(fit, mnist_27$test, type = "class")
  cm_test <- confusionMatrix(data = y_hat, reference = mnist_27$test$y)
  test_error <- cm_test$overall["Accuracy"]
  y_hat <- predict(fit, mnist_27$test, type = "class")
  F_1<-F_meas(data = y_hat, reference = factor(mnist_27$test$y))
  tibble(train = train_error, test = test_error,F1=F_1)
}) ## Algoritmo KNN com Accuracy e F-meas

predicted.type <- NULL
error.rate <- NULL
for (i in 1:50) {
  predicted.type <- knn(mnist_27$train[2:3],mnist_27$test[2:3],mnist_27$train$y,k=i)
  error.rate[i] <- mean(predicted.type!=mnist_27$test$y)
}
knn.error <- as.data.frame(cbind(k=1:50,error.type =error.rate))
 ggplot(knn.error,aes(k,error.type))+ 
      geom_point()+ 
      geom_line() + 
      scale_x_continuous(breaks=1:50)+ 
      theme_bw() +
      xlab("Value of K") +
      ylab('Error')

 plot(ks, accuracy$F1)

print("F_meas")
## [1] "F_meas"
kss=ks[which.max(accuracy$F1)]
kss
## [1] 41
max(accuracy$F1)
## [1] 0.8703704
knn_fit_ks <- knn3(y ~ ., data = mnist_27$train, k = kss)
y_hat_knn_ks <- predict(knn_fit_ks, mnist_27$train, type = "class")
confusionMatrix(data=y_hat_knn_ks, reference=mnist_27$train$y)$overall["Accuracy"]
## Accuracy 
##   0.8475
y_hat_knn_ks <- predict(knn_fit_ks, mnist_27$test, type = "class")
confusionMatrix(data=y_hat_knn_ks, reference=mnist_27$test$y)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  2  7
##          2 94 16
##          7 12 78
##                                           
##                Accuracy : 0.86            
##                  95% CI : (0.8041, 0.9049)
##     No Information Rate : 0.53            
##     P-Value [Acc > NIR] : <2e-16          
##                                           
##                   Kappa : 0.7183          
##                                           
##  Mcnemar's Test P-Value : 0.5708          
##                                           
##             Sensitivity : 0.8868          
##             Specificity : 0.8298          
##          Pos Pred Value : 0.8545          
##          Neg Pred Value : 0.8667          
##              Prevalence : 0.5300          
##          Detection Rate : 0.4700          
##    Detection Prevalence : 0.5500          
##       Balanced Accuracy : 0.8583          
##                                           
##        'Positive' Class : 2               
## 
head(y_hat_knn_ks) #Resultados do F-MEAS
## [1] 2 7 7 7 7 2
## Levels: 2 7
print("Accuracy")
## [1] "Accuracy"
#pick the k that maximizes accuracy using the estimates built on the test data
kss=ks[which.max(accuracy$test)]
kss
## [1] 41
max(accuracy$test)
## [1] 0.86
#Fit Model
knn_fit_ks <- knn3(y ~ ., data = mnist_27$train, k = kss)
y_hat_knn_ks <- predict(knn_fit_ks, mnist_27$train, type = "class")
confusionMatrix(data=y_hat_knn_ks, reference=mnist_27$train$y)$overall["Accuracy"]
## Accuracy 
##   0.8475
y_hat_knn_ks <- predict(knn_fit_ks, mnist_27$test, type = "class")
confusionMatrix(data=y_hat_knn_ks, reference=mnist_27$test$y)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  2  7
##          2 94 16
##          7 12 78
##                                           
##                Accuracy : 0.86            
##                  95% CI : (0.8041, 0.9049)
##     No Information Rate : 0.53            
##     P-Value [Acc > NIR] : <2e-16          
##                                           
##                   Kappa : 0.7183          
##                                           
##  Mcnemar's Test P-Value : 0.5708          
##                                           
##             Sensitivity : 0.8868          
##             Specificity : 0.8298          
##          Pos Pred Value : 0.8545          
##          Neg Pred Value : 0.8667          
##              Prevalence : 0.5300          
##          Detection Rate : 0.4700          
##    Detection Prevalence : 0.5500          
##       Balanced Accuracy : 0.8583          
##                                           
##        'Positive' Class : 2               
## 
head(y_hat_knn_ks) #Resultados Accuracy
## [1] 2 7 7 7 7 2
## Levels: 2 7
train.control <- trainControl(method = "cv", number = 10, p = .9)
train_knn <- train(y ~ ., method = "knn", tuneGrid = data.frame(k = seq(1, 50, 1)),
                   data = mnist_27$train,trControl = train.control)
y_hat_knn_ks <- predict(train_knn, mnist_27$test)
confusionMatrix(data=y_hat_knn_ks, reference=mnist_27$test$y)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  2  7
##          2 92 18
##          7 14 76
##                                           
##                Accuracy : 0.84            
##                  95% CI : (0.7817, 0.8879)
##     No Information Rate : 0.53            
##     P-Value [Acc > NIR] : <2e-16          
##                                           
##                   Kappa : 0.6781          
##                                           
##  Mcnemar's Test P-Value : 0.5959          
##                                           
##             Sensitivity : 0.8679          
##             Specificity : 0.8085          
##          Pos Pred Value : 0.8364          
##          Neg Pred Value : 0.8444          
##              Prevalence : 0.5300          
##          Detection Rate : 0.4600          
##    Detection Prevalence : 0.5500          
##       Balanced Accuracy : 0.8382          
##                                           
##        'Positive' Class : 2               
## 
head(y_hat_knn_ks) #Resultados com Cross-Validation
## [1] 2 7 7 7 7 2
## Levels: 2 7
train_knn$bestTune
##     k
## 29 29
train_knn$finalModel
## 29-nearest neighbor model
## Training set outcome distribution:
## 
##   2   7 
## 379 421
train_knn$results %>% 
  ggplot(aes(x = k, y = Accuracy)) +
  geom_line() +
  geom_point() +
  geom_errorbar(aes(x = k, 
                    ymin = Accuracy - AccuracySD,
                    ymax = Accuracy + AccuracySD))

ggplot(train_knn, highlight = TRUE)