# (Dependent variable: factor; independent variable: numeric)
# Load the `heights` data set and peek at its two columns: the outcome
# (`sex`, a factor) and the single predictor (`height`, numeric).
data("heights")
head(heights$sex)
## [1] Male Male Male Male Male Female
## Levels: Female Male
head(heights$height)
## [1] 75 70 68 74 61 65

# Draw one partition with p = 0.5; the rows indexed by `test_index` form the
# test set and all remaining rows form the training set.
# NOTE(review): the partition is drawn on `height`, while the outcome being
# modelled is `sex` -- confirm this is intentional.
y <- heights$height
set.seed(2) # fixed seed so the split is reproducible
test_index <- createDataPartition(y, times = 1, p = 0.5, list = FALSE)
test_set <- slice(heights, test_index)
train_set <- slice(heights, -test_index)
head(train_set)
## sex height
## 1 Male 75
## 2 Male 70
## 3 Male 68
## 4 Male 74
## 5 Female 62
## 6 Female 66
## Model building and analysis
# Per-sex mean (`avg`) and standard deviation (`sd`) of height, estimated on
# the training set only.
params <- train_set %>%
  group_by(sex) %>%
  summarize(avg = mean(height), sd = sd(height))
params
## # A tibble: 2 x 3
## sex avg sd
## <fct> <dbl> <dbl>
## 1 Female 65.3 3.70
## 2 Male 69.3 3.60
# Prevalence: the proportion of training rows whose sex is "Female".
# NOTE(review): this assignment masks base R's constant `pi` for the rest of
# the session; the name is kept because later code reads it.
pi <- mean(train_set$sex == "Female")
pi
## [1] 0.2232824
# Getting an actual rule: posterior probability of "Female" given height,
# combining a normal density per class with the estimated prevalence.
x <- test_set$height
# Class parameters are selected by label instead of by row position, so the
# rule stays correct even if the row order of `params` ever changes.
# f0: density of the observed heights under the "Male" distribution.
f0 <- dnorm(
  x,
  mean = params$avg[params$sex == "Male"],
  sd = params$sd[params$sex == "Male"]
)
# f1: density of the observed heights under the "Female" distribution.
f1 <- dnorm(
  x,
  mean = params$avg[params$sex == "Female"],
  sd = params$sd[params$sex == "Female"]
)
# `pi` is the prevalence of "Female" assigned earlier in this script (it masks
# base R's constant of the same name).
p_hat_bayes <- f1 * pi / (f1 * pi + f0 * (1 - pi))
# Computing: predict "Female" when the posterior exceeds 0.5.
y_hat_bayes <- ifelse(p_hat_bayes > 0.5, "Female", "Male")
# Give the predictions the reference's levels explicitly, so both factors
# share the same level set even if one class is never predicted; the reference
# is already a factor and is passed as-is.
confusionMatrix(
  data = factor(y_hat_bayes, levels = levels(test_set$sex)),
  reference = test_set$sex
)
## Confusion Matrix and Statistics
##
## Reference
## Prediction Female Male
## Female 46 21
## Male 75 384
##
## Accuracy : 0.8175
## 95% CI : (0.7818, 0.8496)
## No Information Rate : 0.77
## P-Value [Acc > NIR] : 0.004726
##
## Kappa : 0.3892
##
## Mcnemar's Test P-Value : 6.328e-08
##
## Sensitivity : 0.38017
## Specificity : 0.94815
## Pos Pred Value : 0.68657
## Neg Pred Value : 0.83660
## Prevalence : 0.23004
## Detection Rate : 0.08745
## Detection Prevalence : 0.12738
## Balanced Accuracy : 0.66416
##
## 'Positive' Class : Female
##
# Changing the prior of the decision rule: the estimated prevalence is
# replaced by 0.5 for both classes. The 0.5 cutoff applied to the posterior
# below is unchanged.
p_hat_bayes_unbiased <- f1 * 0.5 / (f1 * 0.5 + f0 * (1 - 0.5))
y_hat_bayes_unbiased <- ifelse(p_hat_bayes_unbiased > 0.5, "Female", "Male")
# Give the predictions the reference's levels explicitly, so both factors
# share the same level set even if one class is never predicted; the reference
# is already a factor and is passed as-is.
confusionMatrix(
  data = factor(y_hat_bayes_unbiased, levels = levels(test_set$sex)),
  reference = test_set$sex
)
## Confusion Matrix and Statistics
##
## Reference
## Prediction Female Male
## Female 96 112
## Male 25 293
##
## Accuracy : 0.7395
## 95% CI : (0.6998, 0.7766)
## No Information Rate : 0.77
## P-Value [Acc > NIR] : 0.9548
##
## Kappa : 0.4128
##
## Mcnemar's Test P-Value : 2.02e-13
##
## Sensitivity : 0.7934
## Specificity : 0.7235
## Pos Pred Value : 0.4615
## Neg Pred Value : 0.9214
## Prevalence : 0.2300
## Detection Rate : 0.1825
## Detection Prevalence : 0.3954
## Balanced Accuracy : 0.7584
##
## 'Positive' Class : Female
##
# Fit caret's "qda" method on the training set, with every column other than
# `sex` as a predictor, then predict the test set and tabulate the
# predictions against the true labels.
train_qda <- train(sex ~ ., data = train_set, method = "qda")
y_hat <- predict(train_qda, newdata = test_set)
confusionMatrix(
  data = y_hat,
  reference = test_set$sex
)
## Confusion Matrix and Statistics
##
## Reference
## Prediction Female Male
## Female 46 21
## Male 75 384
##
## Accuracy : 0.8175
## 95% CI : (0.7818, 0.8496)
## No Information Rate : 0.77
## P-Value [Acc > NIR] : 0.004726
##
## Kappa : 0.3892
##
## Mcnemar's Test P-Value : 6.328e-08
##
## Sensitivity : 0.38017
## Specificity : 0.94815
## Pos Pred Value : 0.68657
## Neg Pred Value : 0.83660
## Prevalence : 0.23004
## Detection Rate : 0.08745
## Detection Prevalence : 0.12738
## Balanced Accuracy : 0.66416
##
## 'Positive' Class : Female
##
# Fit caret's "lda" method on the training set, with every column other than
# `sex` as a predictor, then predict the test set and tabulate the
# predictions against the true labels.
train_lda <- train(sex ~ ., data = train_set, method = "lda")
y_hat <- predict(train_lda, newdata = test_set)
confusionMatrix(
  data = y_hat,
  reference = test_set$sex
)
## Confusion Matrix and Statistics
##
## Reference
## Prediction Female Male
## Female 46 21
## Male 75 384
##
## Accuracy : 0.8175
## 95% CI : (0.7818, 0.8496)
## No Information Rate : 0.77
## P-Value [Acc > NIR] : 0.004726
##
## Kappa : 0.3892
##
## Mcnemar's Test P-Value : 6.328e-08
##
## Sensitivity : 0.38017
## Specificity : 0.94815
## Pos Pred Value : 0.68657
## Neg Pred Value : 0.83660
## Prevalence : 0.23004
## Detection Rate : 0.08745
## Detection Prevalence : 0.12738
## Balanced Accuracy : 0.66416
##
## 'Positive' Class : Female
##