#1. Cutoff Algoritm
(Variavéis resposta: Categorica variavel explicativa: Numerica (1) )
Algoritmo que vai descobrir o melhor ponto de corte para definir uma resposta
knitr::opts_chunk$set(echo = TRUE)
###1. Importar as Extenções
library(e1071)
library(tidyverse)
## -- Attaching packages ---------------------------------------------------------------------------------------------------------------- tidyverse 1.3.0 --
## v ggplot2 3.2.1 v purrr 0.3.3
## v tibble 2.1.3 v dplyr 0.8.3
## v tidyr 1.0.0 v stringr 1.4.0
## v readr 1.3.1 v forcats 0.4.0
## -- Conflicts ------------------------------------------------------------------------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(caret)
## Loading required package: lattice
##
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
##
## lift
library(dslabs)
library(ggrepel)
library(GGally)
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
##
## Attaching package: 'GGally'
## The following object is masked from 'package:dplyr':
##
## nasa
##1.Exportação de Dados
# define the outcome and predictors
data(heights)
y <- heights$sex
x <- heights$height
head(y)
## [1] Male Male Male Male Male Female
## Levels: Female Male
head(x)
## [1] 75 70 68 74 61 65
#Divisão do Treino e do Teste
set.seed(2007)
test_index <- createDataPartition(y, times = 1, p = 0.5, list = FALSE)
test_set <- heights[test_index, ]
train_set <- heights[-test_index, ]
##2.Analise de dados
ggpairs(heights, columns = 1:2, ggplot2::aes(colour=sex))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
##2.1 Treino (Decisão do Valor pela Accuracy) - Média
heights %>% group_by(sex) %>% summarize(mean(height), sd(height)) ### Criar matriz para ver média de alturas e desvio padrão
## # A tibble: 2 x 3
## sex `mean(height)` `sd(height)`
## <fct> <dbl> <dbl>
## 1 Female 64.9 3.76
## 2 Male 69.3 3.61
###Escolha do Melhor Cutoff
#MEAN
cutoff <- seq(min(heights$height), max(heights$height))
accuracy <- map_dbl(cutoff, function(x){
y_hat <- ifelse(train_set$height > x, "Male", "Female") %>%
factor(levels = levels(test_set$sex))
mean(y_hat == train_set$sex)
})
max(accuracy)
## [1] 0.8495238
best_cutoff <- cutoff[which.max(accuracy)]
best_cutoff
## [1] 64
# Algoritmo com o melhor Cutoff
y_hat <- ifelse(test_set$height > best_cutoff, "Male", "Female") %>%
factor(levels = levels(test_set$sex))
# Accuracy do Melhor Algoritmo
mean(y_hat == test_set$sex)
## [1] 0.8038095
confusionMatrix(data = y_hat,reference = test_set$sex)
## Confusion Matrix and Statistics
##
## Reference
## Prediction Female Male
## Female 48 32
## Male 71 374
##
## Accuracy : 0.8038
## 95% CI : (0.7672, 0.8369)
## No Information Rate : 0.7733
## P-Value [Acc > NIR] : 0.0513022
##
## Kappa : 0.3671
##
## Mcnemar's Test P-Value : 0.0001809
##
## Sensitivity : 0.40336
## Specificity : 0.92118
## Pos Pred Value : 0.60000
## Neg Pred Value : 0.84045
## Prevalence : 0.22667
## Detection Rate : 0.09143
## Detection Prevalence : 0.15238
## Balanced Accuracy : 0.66227
##
## 'Positive' Class : Female
##
#F-MEAS
cutoff <- seq(min(heights$height), max(heights$height))
F_1 <- map_dbl(cutoff, function(x){
y_hat <- ifelse(train_set$height > x, "Male", "Female") %>%
factor(levels = levels(test_set$sex))
F_meas(data = y_hat, reference = factor(train_set$sex))
})
max(F_1)
## [1] NA
best_cutoff <- cutoff[which.max(F_1)]
' Ponto de Corte do Melhor Resultado:'
## [1] " Ponto de Corte do Melhor Resultado:"
best_cutoff
## [1] 66
# Criar o Algoritmo com o melhor Ponto de Corte
y_hat <- ifelse(test_set$height > best_cutoff, "Male", "Female") %>%
factor(levels = levels(test_set$sex))
## Accuracy do Melhor Algoritmo
mean(y_hat == test_set$sex)
## [1] 0.7866667
confusionMatrix(data = y_hat,reference = test_set$sex)
## Confusion Matrix and Statistics
##
## Reference
## Prediction Female Male
## Female 75 68
## Male 44 338
##
## Accuracy : 0.7867
## 95% CI : (0.7491, 0.821)
## No Information Rate : 0.7733
## P-Value [Acc > NIR] : 0.25067
##
## Kappa : 0.432
##
## Mcnemar's Test P-Value : 0.02976
##
## Sensitivity : 0.6303
## Specificity : 0.8325
## Pos Pred Value : 0.5245
## Neg Pred Value : 0.8848
## Prevalence : 0.2267
## Detection Rate : 0.1429
## Detection Prevalence : 0.2724
## Balanced Accuracy : 0.7314
##
## 'Positive' Class : Female
##
| ##2. Cutoff (Variavéis resposta: Categorica variavel explicativa: Categorica (1) ) |
r library(dslabs) library(dplyr) library(lubridate) |
## ## Attaching package: 'lubridate' |
## The following object is masked from 'package:base': ## ## date |
r data(reported_heights) #install.packages("e1071") library(e1071) #install.packages("tidyverse") library(tidyverse) #install.packages("caret") library(caret) #install.packages("dslabs") library(dslabs) #install.packages("ggrepel") library(ggrepel) |
| ##1.Exportação de Dados |
| ```r dat <- mutate(reported_heights, date_time = ymd_hms(time_stamp)) %>% filter(date_time >= make_date(2016, 01, 25) & date_time < make_date(2016, 02, 1)) %>% mutate(type = ifelse(day(date_time) == 25 & hour(date_time) == 8 & between(minute(date_time), 15, 30), “inclass”,“online”)) %>% select(sex, type) |
| y <- factor(dat\(sex, c("Female", "Male")) x <- dat\)type ``` |
| ##2. Analise exploratoria |
r dat %>% group_by(type) %>% summarize(prop_female = mean(sex == "Female")) |
## # A tibble: 2 x 2 ## type prop_female ## <chr> <dbl> ## 1 inclass 0.667 ## 2 online 0.378 |
| ##3. Testar o Algoritmo sem Treino |
| ```r # Algoritmo com o melhor Cutoff y_hat <- ifelse(dat\(type == 'online', "Male", "Female")# %>% # factor(levels = levels(dat\)sex)) y_hat <- factor(y_hat) |
| # Accuracy do Melhor Algoritmo mean(y_hat == dat$sex) ``` |
## [1] 0.6333333 |
##3. Cutoff (Variavéis resposta: Categorica variavel explicativa: Numerica (4) )
##1. Exportação de Dados
data(iris)
iris <- iris[-which(iris$Species=='setosa'),]
y <- iris$Species
set.seed(2) # if using R 3.6 or later, use set.seed(2, sample.kind="Rounding")
test_index <- createDataPartition(y,times=1,p=0.5,list=FALSE)
## Warning in createDataPartition(y, times = 1, p = 0.5, list = FALSE): Some
## classes have no records ( setosa ) and these will be ignored
test <- iris[test_index,]
train <- iris[-test_index,]
##2. Analise da variavel com melhor resultado (Treino)
"Sepal.Length"
## [1] "Sepal.Length"
cutoff <- seq(min(iris$Sepal.Length), max(iris$Sepal.Length),0.1)
accuracy <- map_dbl(cutoff, function(x){
y_hat <- ifelse(train$Sepal.Length > x, "virginica", "versicolor") %>%
factor(levels = levels(test$Species))
mean(y_hat == train$Species)
})
max(accuracy)
## [1] 0.74
best_cutoff <- cutoff[which.max(accuracy)]
best_cutoff
## [1] 6.1
"Sepal.Width"
## [1] "Sepal.Width"
cutoff <- seq(min(iris$Sepal.Width), max(iris$Sepal.Width),0.1)
accuracy <- map_dbl(cutoff, function(x){
y_hat <- ifelse(train$Sepal.Width > x, "virginica", "versicolor") %>%
factor(levels = levels(test$Species))
mean(y_hat == train$Species)
})
max(accuracy)
## [1] 0.64
best_cutoff <- cutoff[which.max(accuracy)]
best_cutoff
## [1] 2.8
"Petal.Length"
## [1] "Petal.Length"
cutoff <- seq(min(iris$Petal.Length), max(iris$Petal.Length),0.1)
accuracy <- map_dbl(cutoff, function(x){
y_hat <- ifelse(train$Petal.Length > x, "virginica", "versicolor") %>%
factor(levels = levels(test$Species))
mean(y_hat == train$Species)
})
max(accuracy)
## [1] 0.96
best_cutoff <- cutoff[which.max(accuracy)]
best_cutoff
## [1] 4.7
"Petal.Width"
## [1] "Petal.Width"
cutoff <- seq(min(iris$Petal.Width), max(iris$Petal.Width),0.1)
accuracy <- map_dbl(cutoff, function(x){
y_hat <- ifelse(train$Petal.Width > x, "virginica", "versicolor") %>%
factor(levels = levels(test$Species))
mean(y_hat == train$Species)
})
max(accuracy)
## [1] 0.98
best_cutoff <- cutoff[which.max(accuracy)]
best_cutoff
## [1] 1.7
##3. Teste (Nota: Escolhemos o Petal.Length porque se retirarmos o espaçamento de 0.1 é o que apresenta melhor resultado )
"Petal.Length"
## [1] "Petal.Length"
cutoff <- seq(min(iris$Petal.Length), max(iris$Petal.Length),0.1)
accuracy <- map_dbl(cutoff, function(x){
y_hat <- ifelse(train$Petal.Length > x, "virginica", "versicolor") %>%
factor(levels = levels(test$Species))
mean(y_hat == train$Species)
})
max(accuracy)
## [1] 0.96
best_cutoff <- cutoff[which.max(accuracy)]
best_cutoff
## [1] 4.7
# Algoritmo com o melhor Cutoff
y_hat <- ifelse(test$Petal.Length > best_cutoff, "virginica", "versicolor") %>%
factor(levels = levels(test$Species))
#y_hat <- factor(y_hat)
# Accuracy do Melhor Algoritmo
mean(y_hat == test$Species)
## [1] 0.9
confusionMatrix(data = y_hat,reference = test$Species)
## Confusion Matrix and Statistics
##
## Reference
## Prediction setosa versicolor virginica
## setosa 0 0 0
## versicolor 0 21 1
## virginica 0 4 24
##
## Overall Statistics
##
## Accuracy : 0.9
## 95% CI : (0.7819, 0.9667)
## No Information Rate : 0.5
## P-Value [Acc > NIR] : 2.105e-09
##
## Kappa : 0.8
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: setosa Class: versicolor Class: virginica
## Sensitivity NA 0.8400 0.9600
## Specificity 1 0.9600 0.8400
## Pos Pred Value NA 0.9545 0.8571
## Neg Pred Value NA 0.8571 0.9545
## Prevalence 0 0.5000 0.5000
## Detection Rate 0 0.4200 0.4800
## Detection Prevalence 0 0.4400 0.5600
## Balanced Accuracy NA 0.9000 0.9000
##4. Melhoria do Algoritmo com correlação entre variaveis
plot(iris,pch=21,bg=iris$Species)
k=data.frame(iris$Sepal.Length,iris$Sepal.Width,iris$Petal.Length,iris$Petal.Width)
cor(k)
## iris.Sepal.Length iris.Sepal.Width iris.Petal.Length
## iris.Sepal.Length 1.0000000 0.5538548 0.8284787
## iris.Sepal.Width 0.5538548 1.0000000 0.5198023
## iris.Petal.Length 0.8284787 0.5198023 1.0000000
## iris.Petal.Width 0.5937094 0.5662025 0.8233476
## iris.Petal.Width
## iris.Sepal.Length 0.5937094
## iris.Sepal.Width 0.5662025
## iris.Petal.Length 0.8233476
## iris.Petal.Width 1.0000000
"Petal.Length"
## [1] "Petal.Length"
cutoff <- seq(min(iris$Petal.Length), max(iris$Petal.Length),0.1)
accuracy <- map_dbl(cutoff, function(x){
y_hat <- ifelse(train$Petal.Length > x, "virginica", "versicolor") %>%
factor(levels = levels(test$Species))
mean(y_hat == train$Species)
})
max(accuracy)
## [1] 0.96
best_cutoff <- cutoff[which.max(accuracy)]
best_cutoff
## [1] 4.7
"Petal.Width"
## [1] "Petal.Width"
cutoff <- seq(min(iris$Petal.Width), max(iris$Petal.Width),0.1)
accuracy <- map_dbl(cutoff, function(x){
y_hat <- ifelse(train$Petal.Width > x, "virginica", "versicolor") %>%
factor(levels = levels(test$Species))
mean(y_hat == train$Species)
})
max(accuracy)
## [1] 0.98
best_cutoff2 <- cutoff[which.max(accuracy)]
best_cutoff2
## [1] 1.7
# Algoritmo com o melhor Cutoff
y_hat <- ifelse(test$Petal.Length > best_cutoff | test$Petal.Width > best_cutoff2, "virginica", "versicolor") %>%
factor(levels = levels(test$Species))
#y_hat <- factor(y_hat)
# Accuracy do Melhor Algoritmo
"Accuracy"
## [1] "Accuracy"
mean(y_hat == test$Species)
## [1] 0.9
confusionMatrix(data = y_hat,reference = test$Species)
## Confusion Matrix and Statistics
##
## Reference
## Prediction setosa versicolor virginica
## setosa 0 0 0
## versicolor 0 21 1
## virginica 0 4 24
##
## Overall Statistics
##
## Accuracy : 0.9
## 95% CI : (0.7819, 0.9667)
## No Information Rate : 0.5
## P-Value [Acc > NIR] : 2.105e-09
##
## Kappa : 0.8
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: setosa Class: versicolor Class: virginica
## Sensitivity NA 0.8400 0.9600
## Specificity 1 0.9600 0.8400
## Pos Pred Value NA 0.9545 0.8571
## Neg Pred Value NA 0.8571 0.9545
## Prevalence 0 0.5000 0.5000
## Detection Rate 0 0.4200 0.4800
## Detection Prevalence 0 0.4400 0.5600
## Balanced Accuracy NA 0.9000 0.9000