Cutoff Algoritm

#1. Cutoff Algoritm

(Variavéis resposta: Categorica variavel explicativa: Numerica (1) )

Algoritmo que vai descobrir o melhor ponto de corte para definir uma resposta

knitr::opts_chunk$set(echo = TRUE)
###1. Importar as Extenções
library(e1071)
library(tidyverse)

## -- Attaching packages ---------------------------------------------------------------------------------------------------------------- tidyverse 1.3.0 --

## v ggplot2 3.2.1     v purrr   0.3.3
## v tibble  2.1.3     v dplyr   0.8.3
## v tidyr   1.0.0     v stringr 1.4.0
## v readr   1.3.1     v forcats 0.4.0

## -- Conflicts ------------------------------------------------------------------------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()

library(caret)

## Loading required package: lattice

## 
## Attaching package: 'caret'

## The following object is masked from 'package:purrr':
## 
##     lift

library(dslabs)
library(ggrepel)
library(GGally)

## Registered S3 method overwritten by 'GGally':
##   method from   
##   +.gg   ggplot2

## 
## Attaching package: 'GGally'

## The following object is masked from 'package:dplyr':
## 
##     nasa

##1.Exportação de Dados

# define the outcome and predictors
data(heights)
y <- heights$sex
x <- heights$height
head(y)

## [1] Male   Male   Male   Male   Male   Female
## Levels: Female Male

head(x)

## [1] 75 70 68 74 61 65

#Divisão do Treino e do Teste
set.seed(2007)
test_index <- createDataPartition(y, times = 1, p = 0.5, list = FALSE)
test_set <- heights[test_index, ]
train_set <- heights[-test_index, ]

##2.Analise de dados

ggpairs(heights, columns = 1:2, ggplot2::aes(colour=sex))

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

##2.1 Treino (Decisão do Valor pela Accuracy) - Média

heights %>% group_by(sex) %>% summarize(mean(height), sd(height)) ### Criar matriz para ver média de alturas e desvio padrão

## # A tibble: 2 x 3
##   sex    `mean(height)` `sd(height)`
##   <fct>           <dbl>        <dbl>
## 1 Female           64.9         3.76
## 2 Male             69.3         3.61

###Escolha do Melhor Cutoff
#MEAN
cutoff <- seq(min(heights$height), max(heights$height)) 
accuracy <- map_dbl(cutoff, function(x){ 
            y_hat <- ifelse(train_set$height > x, "Male", "Female") %>% 
            factor(levels = levels(test_set$sex))
            mean(y_hat == train_set$sex) 
}) 
max(accuracy)

## [1] 0.8495238

best_cutoff <- cutoff[which.max(accuracy)] 
best_cutoff

## [1] 64

#  Algoritmo com o melhor Cutoff
y_hat <- ifelse(test_set$height > best_cutoff, "Male", "Female") %>% 
  factor(levels = levels(test_set$sex))
# Accuracy do Melhor Algoritmo
mean(y_hat == test_set$sex)

## [1] 0.8038095

confusionMatrix(data = y_hat,reference = test_set$sex)

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction Female Male
##     Female     48   32
##     Male       71  374
##                                           
##                Accuracy : 0.8038          
##                  95% CI : (0.7672, 0.8369)
##     No Information Rate : 0.7733          
##     P-Value [Acc > NIR] : 0.0513022       
##                                           
##                   Kappa : 0.3671          
##                                           
##  Mcnemar's Test P-Value : 0.0001809       
##                                           
##             Sensitivity : 0.40336         
##             Specificity : 0.92118         
##          Pos Pred Value : 0.60000         
##          Neg Pred Value : 0.84045         
##              Prevalence : 0.22667         
##          Detection Rate : 0.09143         
##    Detection Prevalence : 0.15238         
##       Balanced Accuracy : 0.66227         
##                                           
##        'Positive' Class : Female          
##

#F-MEAS
cutoff <- seq(min(heights$height), max(heights$height))
F_1 <- map_dbl(cutoff, function(x){
  y_hat <- ifelse(train_set$height > x, "Male", "Female") %>% 
    factor(levels = levels(test_set$sex))
  F_meas(data = y_hat, reference = factor(train_set$sex))
})
max(F_1)

## [1] NA

best_cutoff <- cutoff[which.max(F_1)] 
' Ponto de Corte do Melhor Resultado:'

## [1] " Ponto de Corte do Melhor Resultado:"

best_cutoff

## [1] 66

# Criar o Algoritmo com o melhor Ponto de Corte
y_hat <- ifelse(test_set$height > best_cutoff, "Male", "Female") %>% 
  factor(levels = levels(test_set$sex))
## Accuracy do Melhor Algoritmo
mean(y_hat == test_set$sex)

## [1] 0.7866667

confusionMatrix(data = y_hat,reference = test_set$sex)

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction Female Male
##     Female     75   68
##     Male       44  338
##                                          
##                Accuracy : 0.7867         
##                  95% CI : (0.7491, 0.821)
##     No Information Rate : 0.7733         
##     P-Value [Acc > NIR] : 0.25067        
##                                          
##                   Kappa : 0.432          
##                                          
##  Mcnemar's Test P-Value : 0.02976        
##                                          
##             Sensitivity : 0.6303         
##             Specificity : 0.8325         
##          Pos Pred Value : 0.5245         
##          Neg Pred Value : 0.8848         
##              Prevalence : 0.2267         
##          Detection Rate : 0.1429         
##    Detection Prevalence : 0.2724         
##       Balanced Accuracy : 0.7314         
##                                          
##        'Positive' Class : Female         
##

##2. Cutoff (Variavéis resposta: Categorica variavel explicativa: Categorica (1) )

r library(dslabs) library(dplyr) library(lubridate)

## ## Attaching package: 'lubridate'

## The following object is masked from 'package:base': ## ## date

r data(reported_heights) #install.packages("e1071") library(e1071) #install.packages("tidyverse") library(tidyverse) #install.packages("caret") library(caret) #install.packages("dslabs") library(dslabs) #install.packages("ggrepel") library(ggrepel)

##1.Exportação de Dados

```r dat <- mutate(reported_heights, date_time = ymd_hms(time_stamp)) %>% filter(date_time >= make_date(2016, 01, 25) & date_time < make_date(2016, 02, 1)) %>% mutate(type = ifelse(day(date_time) == 25 & hour(date_time) == 8 & between(minute(date_time), 15, 30), “inclass”,“online”)) %>% select(sex, type)

y <- factor(dat$sex, c("Female", "Male")) x <- dat$type ```

##2. Analise exploratoria

r dat %>% group_by(type) %>% summarize(prop_female = mean(sex == "Female"))

## # A tibble: 2 x 2 ## type prop_female ## <chr> <dbl> ## 1 inclass 0.667 ## 2 online 0.378

##3. Testar o Algoritmo sem Treino

```r # Algoritmo com o melhor Cutoff y_hat <- ifelse(dat$type == 'online', "Male", "Female")# %>% # factor(levels = levels(dat$sex)) y_hat <- factor(y_hat)

# Accuracy do Melhor Algoritmo mean(y_hat == dat$sex) ```

## [1] 0.6333333

##3. Cutoff (Variavéis resposta: Categorica variavel explicativa: Numerica (4) )

##1. Exportação de Dados

data(iris)
iris <- iris[-which(iris$Species=='setosa'),]
y <- iris$Species

set.seed(2)    # if using R 3.6 or later, use set.seed(2, sample.kind="Rounding")
test_index <- createDataPartition(y,times=1,p=0.5,list=FALSE)

## Warning in createDataPartition(y, times = 1, p = 0.5, list = FALSE): Some
## classes have no records ( setosa ) and these will be ignored

test <- iris[test_index,]
train <- iris[-test_index,]

##2. Analise da variavel com melhor resultado (Treino)

"Sepal.Length"

## [1] "Sepal.Length"

cutoff <- seq(min(iris$Sepal.Length), max(iris$Sepal.Length),0.1) 
accuracy <- map_dbl(cutoff, function(x){ 
  y_hat <- ifelse(train$Sepal.Length > x, "virginica", "versicolor") %>% 
    factor(levels = levels(test$Species))
  mean(y_hat == train$Species) 
}) 
max(accuracy)

## [1] 0.74

best_cutoff <- cutoff[which.max(accuracy)] 
best_cutoff

## [1] 6.1

"Sepal.Width"

## [1] "Sepal.Width"

cutoff <- seq(min(iris$Sepal.Width), max(iris$Sepal.Width),0.1) 
accuracy <- map_dbl(cutoff, function(x){ 
  y_hat <- ifelse(train$Sepal.Width > x, "virginica", "versicolor") %>% 
    factor(levels = levels(test$Species))
  mean(y_hat == train$Species) 
}) 
max(accuracy)

## [1] 0.64

best_cutoff <- cutoff[which.max(accuracy)] 
best_cutoff

## [1] 2.8

"Petal.Length"

## [1] "Petal.Length"

cutoff <- seq(min(iris$Petal.Length), max(iris$Petal.Length),0.1) 
accuracy <- map_dbl(cutoff, function(x){ 
  y_hat <- ifelse(train$Petal.Length > x, "virginica", "versicolor") %>% 
    factor(levels = levels(test$Species))
  mean(y_hat == train$Species) 
}) 
max(accuracy)

## [1] 0.96

best_cutoff <- cutoff[which.max(accuracy)] 
best_cutoff

## [1] 4.7

"Petal.Width"

## [1] "Petal.Width"

cutoff <- seq(min(iris$Petal.Width), max(iris$Petal.Width),0.1) 
accuracy <- map_dbl(cutoff, function(x){ 
  y_hat <- ifelse(train$Petal.Width > x, "virginica", "versicolor") %>% 
    factor(levels = levels(test$Species))
  mean(y_hat == train$Species) 
}) 
max(accuracy)

## [1] 0.98

best_cutoff <- cutoff[which.max(accuracy)] 
best_cutoff

## [1] 1.7

##3. Teste (Nota: Escolhemos o Petal.Length porque se retirarmos o espaçamento de 0.1 é o que apresenta melhor resultado )

"Petal.Length"

## [1] "Petal.Length"

cutoff <- seq(min(iris$Petal.Length), max(iris$Petal.Length),0.1) 
accuracy <- map_dbl(cutoff, function(x){ 
  y_hat <- ifelse(train$Petal.Length > x, "virginica", "versicolor") %>% 
    factor(levels = levels(test$Species))
  mean(y_hat == train$Species) 
}) 

max(accuracy)

## [1] 0.96

best_cutoff <- cutoff[which.max(accuracy)] 
best_cutoff

## [1] 4.7

#  Algoritmo com o melhor Cutoff
y_hat <- ifelse(test$Petal.Length > best_cutoff, "virginica", "versicolor") %>% 
  factor(levels = levels(test$Species))
#y_hat <- factor(y_hat)

# Accuracy do Melhor Algoritmo
mean(y_hat == test$Species)

## [1] 0.9

confusionMatrix(data = y_hat,reference = test$Species)

## Confusion Matrix and Statistics
## 
##             Reference
## Prediction   setosa versicolor virginica
##   setosa          0          0         0
##   versicolor      0         21         1
##   virginica       0          4        24
## 
## Overall Statistics
##                                           
##                Accuracy : 0.9             
##                  95% CI : (0.7819, 0.9667)
##     No Information Rate : 0.5             
##     P-Value [Acc > NIR] : 2.105e-09       
##                                           
##                   Kappa : 0.8             
##                                           
##  Mcnemar's Test P-Value : NA              
## 
## Statistics by Class:
## 
##                      Class: setosa Class: versicolor Class: virginica
## Sensitivity                     NA            0.8400           0.9600
## Specificity                      1            0.9600           0.8400
## Pos Pred Value                  NA            0.9545           0.8571
## Neg Pred Value                  NA            0.8571           0.9545
## Prevalence                       0            0.5000           0.5000
## Detection Rate                   0            0.4200           0.4800
## Detection Prevalence             0            0.4400           0.5600
## Balanced Accuracy               NA            0.9000           0.9000

##4. Melhoria do Algoritmo com correlação entre variaveis

plot(iris,pch=21,bg=iris$Species)

k=data.frame(iris$Sepal.Length,iris$Sepal.Width,iris$Petal.Length,iris$Petal.Width)
cor(k)

##                   iris.Sepal.Length iris.Sepal.Width iris.Petal.Length
## iris.Sepal.Length         1.0000000        0.5538548         0.8284787
## iris.Sepal.Width          0.5538548        1.0000000         0.5198023
## iris.Petal.Length         0.8284787        0.5198023         1.0000000
## iris.Petal.Width          0.5937094        0.5662025         0.8233476
##                   iris.Petal.Width
## iris.Sepal.Length        0.5937094
## iris.Sepal.Width         0.5662025
## iris.Petal.Length        0.8233476
## iris.Petal.Width         1.0000000

"Petal.Length"

## [1] "Petal.Length"

cutoff <- seq(min(iris$Petal.Length), max(iris$Petal.Length),0.1) 
accuracy <- map_dbl(cutoff, function(x){ 
  y_hat <- ifelse(train$Petal.Length > x, "virginica", "versicolor") %>% 
    factor(levels = levels(test$Species))
  mean(y_hat == train$Species) 
}) 

max(accuracy)

## [1] 0.96

best_cutoff <- cutoff[which.max(accuracy)] 
best_cutoff

## [1] 4.7

"Petal.Width"

## [1] "Petal.Width"

cutoff <- seq(min(iris$Petal.Width), max(iris$Petal.Width),0.1) 
accuracy <- map_dbl(cutoff, function(x){ 
  y_hat <- ifelse(train$Petal.Width > x, "virginica", "versicolor") %>% 
    factor(levels = levels(test$Species))
  mean(y_hat == train$Species) 
}) 

max(accuracy)

## [1] 0.98

best_cutoff2 <- cutoff[which.max(accuracy)] 
best_cutoff2

## [1] 1.7

#  Algoritmo com o melhor Cutoff
y_hat <- ifelse(test$Petal.Length > best_cutoff | test$Petal.Width > best_cutoff2, "virginica", "versicolor") %>% 
  factor(levels = levels(test$Species))
#y_hat <- factor(y_hat)

# Accuracy do Melhor Algoritmo
"Accuracy"

## [1] "Accuracy"

mean(y_hat == test$Species)

## [1] 0.9

confusionMatrix(data = y_hat,reference = test$Species)

## Confusion Matrix and Statistics
## 
##             Reference
## Prediction   setosa versicolor virginica
##   setosa          0          0         0
##   versicolor      0         21         1
##   virginica       0          4        24
## 
## Overall Statistics
##                                           
##                Accuracy : 0.9             
##                  95% CI : (0.7819, 0.9667)
##     No Information Rate : 0.5             
##     P-Value [Acc > NIR] : 2.105e-09       
##                                           
##                   Kappa : 0.8             
##                                           
##  Mcnemar's Test P-Value : NA              
## 
## Statistics by Class:
## 
##                      Class: setosa Class: versicolor Class: virginica
## Sensitivity                     NA            0.8400           0.9600
## Specificity                      1            0.9600           0.8400
## Pos Pred Value                  NA            0.9545           0.8571
## Neg Pred Value                  NA            0.8571           0.9545
## Prevalence                       0            0.5000           0.5000
## Detection Rate                   0            0.4200           0.4800
## Detection Prevalence             0            0.4400           0.5600
## Balanced Accuracy               NA            0.9000           0.9000

Cutoff Algoritm

Luís Freitas

29/03/2020