(Variavel dependente(factor) e 2 variaveis explicativas numéricas) O objetivo é conseguir prever se y é 2 ou 7 consoante os valores de x_1 e x_2
if(!exists("mnist")) mnist <- read_mnist()
head(mnist_27$train$x_1)
## [1] 0.03947368 0.16071429 0.02127660 0.13580247 0.39024390 0.04854369
data.class(mnist_27$train$x_1)
## [1] "numeric"
head(mnist_27$train$x_2)#variaveis independentes
## [1] 0.18421053 0.08928571 0.27659574 0.22222222 0.36585366 0.28155340
data.class(mnist_27$train$x_2)
## [1] "numeric"
head(mnist_27$train$y) #variaveis dependentes
## [1] 2 7 2 2 7 2
## Levels: 2 7
data.class(mnist_27$train$y)
## [1] "factor"
dim(mnist_27$train)
## [1] 800 3
p = data.frame(mnist_27$train$x_1,mnist_27$train$x_2)
panel.hist <- function(x, ...)
{
usr <- par("usr"); on.exit(par(usr))
par(usr = c(usr[1:2], 0, 1.5) )
h <- hist(x, plot = FALSE)
breaks <- h$breaks; nB <- length(breaks)
y <- h$counts; y <- y/max(y)
rect(breaks[-nB], 0, breaks[-1], y, col = "cyan", ...)
}
panel.cor <- function(x, y, digits = 2, prefix = "", cex.cor, ...)
{
usr <- par("usr"); on.exit(par(usr))
par(usr = c(0, 1, 0, 1))
r <- abs(cor(x, y))
txt <- format(c(r, 0.123456789), digits = digits)[1]
txt <- paste0(prefix, txt)
if(missing(cex.cor)) cex.cor <- 0.8/strwidth(txt)
text(0.5, 0.5, txt, cex = cex.cor * r)
}
pairs(p, diag.panel = panel.hist, upper.panel = panel.cor,
lower.panel = panel.smooth)
ggpairs(mnist_27$train, columns = 1:3, ggplot2::aes(colour=y))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
mnist_27$train %>% mutate(y = factor(y)) %>% ggplot(aes(x_1, x_2, fill = y, color=y)) + geom_point(show.legend = FALSE) + stat_ellipse(type="norm")
set.seed(2)
#pick the k in knn
ks <- seq(1, 50, 1)
accuracy <- map_df(ks, function(k){
fit <- knn3(y ~ ., data = mnist_27$train, k = k)
y_hat <- predict(fit, mnist_27$train, type = "class")
cm_train <- confusionMatrix(data = y_hat, reference = mnist_27$train$y)
train_error <- cm_train$overall["Accuracy"]
y_hat <- predict(fit, mnist_27$test, type = "class")
cm_test <- confusionMatrix(data = y_hat, reference = mnist_27$test$y)
test_error <- cm_test$overall["Accuracy"]
y_hat <- predict(fit, mnist_27$test, type = "class")
F_1<-F_meas(data = y_hat, reference = factor(mnist_27$test$y))
tibble(train = train_error, test = test_error,F1=F_1)
}) ## Algoritmo KNN com Accuracy e F-meas
predicted.type <- NULL
error.rate <- NULL
for (i in 1:50) {
predicted.type <- knn(mnist_27$train[2:3],mnist_27$test[2:3],mnist_27$train$y,k=i)
error.rate[i] <- mean(predicted.type!=mnist_27$test$y)
}
knn.error <- as.data.frame(cbind(k=1:50,error.type =error.rate))
ggplot(knn.error,aes(k,error.type))+
geom_point()+
geom_line() +
scale_x_continuous(breaks=1:50)+
theme_bw() +
xlab("Value of K") +
ylab('Error')
plot(ks, accuracy$F1)
print("F_meas")
## [1] "F_meas"
kss=ks[which.max(accuracy$F1)]
kss
## [1] 41
max(accuracy$F1)
## [1] 0.8703704
knn_fit_ks <- knn3(y ~ ., data = mnist_27$train, k = kss)
y_hat_knn_ks <- predict(knn_fit_ks, mnist_27$train, type = "class")
confusionMatrix(data=y_hat_knn_ks, reference=mnist_27$train$y)$overall["Accuracy"]
## Accuracy
## 0.8475
y_hat_knn_ks <- predict(knn_fit_ks, mnist_27$test, type = "class")
confusionMatrix(data=y_hat_knn_ks, reference=mnist_27$test$y)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 2 7
## 2 94 16
## 7 12 78
##
## Accuracy : 0.86
## 95% CI : (0.8041, 0.9049)
## No Information Rate : 0.53
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.7183
##
## Mcnemar's Test P-Value : 0.5708
##
## Sensitivity : 0.8868
## Specificity : 0.8298
## Pos Pred Value : 0.8545
## Neg Pred Value : 0.8667
## Prevalence : 0.5300
## Detection Rate : 0.4700
## Detection Prevalence : 0.5500
## Balanced Accuracy : 0.8583
##
## 'Positive' Class : 2
##
head(y_hat_knn_ks) #Resultados do F-MEAS
## [1] 2 7 7 7 7 2
## Levels: 2 7
print("Accuracy")
## [1] "Accuracy"
#pick the k that maximizes accuracy using the estimates built on the test data
kss=ks[which.max(accuracy$test)]
kss
## [1] 41
max(accuracy$test)
## [1] 0.86
#Fit Model
knn_fit_ks <- knn3(y ~ ., data = mnist_27$train, k = kss)
y_hat_knn_ks <- predict(knn_fit_ks, mnist_27$train, type = "class")
confusionMatrix(data=y_hat_knn_ks, reference=mnist_27$train$y)$overall["Accuracy"]
## Accuracy
## 0.8475
y_hat_knn_ks <- predict(knn_fit_ks, mnist_27$test, type = "class")
confusionMatrix(data=y_hat_knn_ks, reference=mnist_27$test$y)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 2 7
## 2 94 16
## 7 12 78
##
## Accuracy : 0.86
## 95% CI : (0.8041, 0.9049)
## No Information Rate : 0.53
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.7183
##
## Mcnemar's Test P-Value : 0.5708
##
## Sensitivity : 0.8868
## Specificity : 0.8298
## Pos Pred Value : 0.8545
## Neg Pred Value : 0.8667
## Prevalence : 0.5300
## Detection Rate : 0.4700
## Detection Prevalence : 0.5500
## Balanced Accuracy : 0.8583
##
## 'Positive' Class : 2
##
head(y_hat_knn_ks) #Resultados Accuracy
## [1] 2 7 7 7 7 2
## Levels: 2 7
train.control <- trainControl(method = "cv", number = 10, p = .9)
train_knn <- train(y ~ ., method = "knn", tuneGrid = data.frame(k = seq(1, 50, 1)),
data = mnist_27$train,trControl = train.control)
y_hat_knn_ks <- predict(train_knn, mnist_27$test)
confusionMatrix(data=y_hat_knn_ks, reference=mnist_27$test$y)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 2 7
## 2 92 18
## 7 14 76
##
## Accuracy : 0.84
## 95% CI : (0.7817, 0.8879)
## No Information Rate : 0.53
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.6781
##
## Mcnemar's Test P-Value : 0.5959
##
## Sensitivity : 0.8679
## Specificity : 0.8085
## Pos Pred Value : 0.8364
## Neg Pred Value : 0.8444
## Prevalence : 0.5300
## Detection Rate : 0.4600
## Detection Prevalence : 0.5500
## Balanced Accuracy : 0.8382
##
## 'Positive' Class : 2
##
head(y_hat_knn_ks) #Resultados com Cross-Validation
## [1] 2 7 7 7 7 2
## Levels: 2 7
train_knn$bestTune
## k
## 29 29
train_knn$finalModel
## 29-nearest neighbor model
## Training set outcome distribution:
##
## 2 7
## 379 421
train_knn$results %>%
ggplot(aes(x = k, y = Accuracy)) +
geom_line() +
geom_point() +
geom_errorbar(aes(x = k,
ymin = Accuracy - AccuracySD,
ymax = Accuracy + AccuracySD))
ggplot(train_knn, highlight = TRUE)