Modelos Generativos NB, LDA, QDA

(Variável dependente: fator; variável independente: numérica)

Importação de Dados

# Load the `heights` data set (assumes the package that ships it, presumably
# dslabs, is already attached -- TODO confirm) and preview the outcome column.
data(heights)
head(heights[["sex"]])

## [1] Male   Male   Male   Male   Male   Female
## Levels: Female Male

# Preview the single numeric predictor.
head(heights[["height"]])

## [1] 75 70 68 74 61 65

# Split the data in half: the rows selected by createDataPartition() become
# the test set and every other row becomes the training set.
# Note that the partition is driven by height (`y`), not by the outcome `sex`.
y <- heights$height
set.seed(2) # fixed seed so the partition is reproducible
test_index <- createDataPartition(y, times = 1, p = 0.5, list = FALSE)
test_set <- slice(heights, test_index)
train_set <- slice(heights, -test_index)
head(train_set)

##      sex height
## 1   Male     75
## 2   Male     70
## 3   Male     68
## 4   Male     74
## 5 Female     62
## 6 Female     66

## Criação e análise dos modelos

# Per-class Gaussian parameters: mean and standard deviation of height within
# each sex, estimated on the training set only.
params <- summarize(
  group_by(train_set, sex),
  avg = mean(height),
  sd = sd(height)
)
params

## # A tibble: 2 x 3
##   sex      avg    sd
##   <fct>  <dbl> <dbl>
## 1 Female  65.3  3.70
## 2 Male    69.3  3.60

# Prevalence of the "Female" class: the share of training rows labelled Female.
# NOTE: this assignment masks base R's built-in constant `pi` for the rest of
# the session; the name is kept because the decision rule below reads it.
pi <- mean(train_set$sex == "Female")
pi

## [1] 0.2232824

# Naive Bayes decision rule applied to the test set.
#
# f0 / f1 are the Gaussian densities of every test height under the Male /
# Female class, using the per-class mean and sd stored in `params`. Bayes'
# rule combines them with the estimated prevalence of Female (`pi`) to give
# the posterior probability that each observation is Female.
x <- test_set$height
# Pick each class's parameters by label rather than by row position, so the
# rule does not silently depend on the row order of `params`.
f0 <- dnorm(
  x,
  mean = params$avg[params$sex == "Male"],
  sd = params$sd[params$sex == "Male"]
)
f1 <- dnorm(
  x,
  mean = params$avg[params$sex == "Female"],
  sd = params$sd[params$sex == "Female"]
)
p_hat_bayes <- f1 * pi / (f1 * pi + f0 * (1 - pi))

# Classify as Female when the posterior exceeds 0.5.
y_hat_bayes <- ifelse(p_hat_bayes > 0.5, "Female", "Male")
# Give the predictions the same level set as the reference: a bare factor()
# would derive its levels from whichever labels happen to be predicted, and a
# one-class prediction would then no longer match the reference levels.
confusionMatrix(
  data = factor(y_hat_bayes, levels = levels(test_set$sex)),
  reference = test_set$sex
)

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction Female Male
##     Female     46   21
##     Male       75  384
##                                           
##                Accuracy : 0.8175          
##                  95% CI : (0.7818, 0.8496)
##     No Information Rate : 0.77            
##     P-Value [Acc > NIR] : 0.004726        
##                                           
##                   Kappa : 0.3892          
##                                           
##  Mcnemar's Test P-Value : 6.328e-08       
##                                           
##             Sensitivity : 0.38017         
##             Specificity : 0.94815         
##          Pos Pred Value : 0.68657         
##          Neg Pred Value : 0.83660         
##              Prevalence : 0.23004         
##          Detection Rate : 0.08745         
##    Detection Prevalence : 0.12738         
##       Balanced Accuracy : 0.66416         
##                                           
##        'Positive' Class : Female          
##

# Rebalanced decision rule: reuse the class densities f0 / f1 but replace the
# estimated prevalence with a flat prior of 0.5 for both classes, so the low
# share of Female rows in the training data no longer pulls predictions
# toward Male. The 0.5 classification threshold itself is unchanged.
p_hat_bayes_unbiased <- f1 * 0.5 / (f1 * 0.5 + f0 * (1 - 0.5))
y_hat_bayes_unbiased <- ifelse(p_hat_bayes_unbiased > 0.5, "Female", "Male")
# Give the predictions the same level set as the reference: a bare factor()
# would derive its levels from whichever labels happen to be predicted.
confusionMatrix(
  data = factor(y_hat_bayes_unbiased, levels = levels(test_set$sex)),
  reference = test_set$sex
)

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction Female Male
##     Female     96  112
##     Male       25  293
##                                           
##                Accuracy : 0.7395          
##                  95% CI : (0.6998, 0.7766)
##     No Information Rate : 0.77            
##     P-Value [Acc > NIR] : 0.9548          
##                                           
##                   Kappa : 0.4128          
##                                           
##  Mcnemar's Test P-Value : 2.02e-13        
##                                           
##             Sensitivity : 0.7934          
##             Specificity : 0.7235          
##          Pos Pred Value : 0.4615          
##          Neg Pred Value : 0.9214          
##              Prevalence : 0.2300          
##          Detection Rate : 0.1825          
##    Detection Prevalence : 0.3954          
##       Balanced Accuracy : 0.7584          
##                                           
##        'Positive' Class : Female          
##

# Quadratic discriminant analysis fitted through caret on the training set,
# then scored against the held-out test labels.
train_qda <- train(sex ~ ., data = train_set, method = "qda")
y_hat <- predict(train_qda, newdata = test_set)
confusionMatrix(data = y_hat, reference = test_set$sex)

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction Female Male
##     Female     46   21
##     Male       75  384
##                                           
##                Accuracy : 0.8175          
##                  95% CI : (0.7818, 0.8496)
##     No Information Rate : 0.77            
##     P-Value [Acc > NIR] : 0.004726        
##                                           
##                   Kappa : 0.3892          
##                                           
##  Mcnemar's Test P-Value : 6.328e-08       
##                                           
##             Sensitivity : 0.38017         
##             Specificity : 0.94815         
##          Pos Pred Value : 0.68657         
##          Neg Pred Value : 0.83660         
##              Prevalence : 0.23004         
##          Detection Rate : 0.08745         
##    Detection Prevalence : 0.12738         
##       Balanced Accuracy : 0.66416         
##                                           
##        'Positive' Class : Female          
##

# Linear discriminant analysis fitted through caret on the training set,
# then scored against the held-out test labels.
train_lda <- train(sex ~ ., data = train_set, method = "lda")
y_hat <- predict(train_lda, newdata = test_set)
confusionMatrix(data = y_hat, reference = test_set$sex)

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction Female Male
##     Female     46   21
##     Male       75  384
##                                           
##                Accuracy : 0.8175          
##                  95% CI : (0.7818, 0.8496)
##     No Information Rate : 0.77            
##     P-Value [Acc > NIR] : 0.004726        
##                                           
##                   Kappa : 0.3892          
##                                           
##  Mcnemar's Test P-Value : 6.328e-08       
##                                           
##             Sensitivity : 0.38017         
##             Specificity : 0.94815         
##          Pos Pred Value : 0.68657         
##          Neg Pred Value : 0.83660         
##              Prevalence : 0.23004         
##          Detection Rate : 0.08745         
##    Detection Prevalence : 0.12738         
##       Balanced Accuracy : 0.66416         
##                                           
##        'Positive' Class : Female          
##

Naive Bayes, QDA, LDA

Luís

03/05/2020

Modelos Generativos NB, LDA, QDA

Importação de Dados