Comparison between LDA and GLM using data(PimaIndiansDiabetes) from library(mlbench)
library(mlbench)
require(MASS)   # lda() lives in MASS
## Loading required package: MASS
data(PimaIndiansDiabetes)
dt <- PimaIndiansDiabetes
# recode the outcome as a 0/1 factor
dt$diabetes <- ifelse(dt$diabetes=="pos",1,0)
dt$diabetes <- as.factor(dt$diabetes)
set.seed(123)
spl <- sample(nrow(dt),nrow(dt)*.7)
train <- dt[spl,]
test <- dt[-spl,]
lda_anl <- lda(diabetes ~ ., data = train)
lda_anl
## Call:
## lda(diabetes ~ ., data = train)
##
## Prior probabilities of groups:
## 0 1
## 0.6499069 0.3500931
##
## Group means:
## pregnant glucose pressure triceps insulin mass pedigree age
## 0 3.315186 108.7507 67.63324 20.29226 67.75645 30.27765 0.4320917 31.14040
## 1 4.675532 140.6436 70.42021 22.94149 97.61170 35.44947 0.5529681 36.57979
##
## Coefficients of linear discriminants:
## LD1
## pregnant 0.095262106
## glucose 0.029220687
## pressure -0.010385290
## triceps -0.001534027
## insulin -0.001186982
## mass 0.060652890
## pedigree 0.632364644
## age 0.006395161
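The LD1 coefficients above define a single discriminant score per observation. As a quick sketch (not part of the original output), the training-set scores can be inspected directly from predict(); with two classes, classification amounts to thresholding this score.
# LD1 scores for the training observations
head(predict(lda_anl)$x)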
pred_lda <- predict(lda_anl,newdata=test)
tbl <- table(test$diabetes,pred_lda$class)
accuracy <- sum(diag(tbl))/sum(tbl)
accuracy
## [1] 0.7748918
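Accuracy alone can hide how the two classes are handled (the priors above show roughly 65% negatives). A short sketch, using the layout of tbl (actual class in rows, predicted class in columns), also reports the class-wise rates:
# class-wise rates from the confusion table above (rows = actual, columns = predicted)
sensitivity <- tbl["1", "1"] / sum(tbl["1", ])   # true positive rate
specificity <- tbl["0", "0"] / sum(tbl["0", ])   # true negative rate
c(sensitivity = sensitivity, specificity = specificity)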
mean(pred_lda$posterior)
## [1] 0.5
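The grand mean of 0.5 is expected by construction: each row of pred_lda$posterior contains the two class probabilities, which sum to 1, so averaging over both columns always gives 0.5. A one-line check:
# every row of the posterior matrix sums to 1
range(rowSums(pred_lda$posterior))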
mean(pred_lda$posterior[,1]>=.5)
## [1] 0.7142857
mean(pred_lda$posterior[,1]<.5)
## [1] 0.2857143
pred_lda$posterior[1:10,1]
## 2 3 6 9 12 13 17
## 0.96052978 0.12788062 0.83908699 0.28129954 0.07026885 0.20192169 0.69431949
## 18 21 23
## 0.79471564 0.64475348 0.03757760
library(dplyr)
##
## Attaching package: 'dplyr'
## The following object is masked from 'package:MASS':
##
## select
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ROCR)
## Loading required package: gplots
##
## Attaching package: 'gplots'
## The following object is masked from 'package:stats':
##
## lowess
par(mfrow=c(1, 2))
prediction(pred_lda$posterior[,2], test$diabetes) %>%
performance(measure = "tpr", x.measure = "fpr") %>%
plot()
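For a direct comparison with the logistic models below, the same ROCR chain can also report the LDA model's AUC on this hold-out set (a sketch reusing the objects above; note that the logistic models are evaluated on a different random split):
# AUC of the LDA posterior for the positive class on the hold-out set
prediction(pred_lda$posterior[, 2], test$diabetes) %>%
  performance(measure = "auc") %>%
  .@y.values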
set.seed(123)
sample <- sample(c(TRUE, FALSE), nrow(dt), replace = T, prob = c(0.7,0.3))
train <- dt[sample, ]
test <- dt[!sample, ]
log0 <- glm(diabetes~.,data=train, family = "binomial")   # full model: all predictors
log1 <- glm(diabetes~glucose+pressure+triceps+insulin+mass+pedigree+age,data=train, family = "binomial")
log2 <- glm(diabetes~glucose+pressure+triceps+insulin+mass+pedigree,data=train, family = "binomial")
log3 <- glm(diabetes~glucose+pressure+triceps+insulin+mass,data=train, family = "binomial")
log4 <- glm(diabetes~glucose+pressure+triceps+insulin,data=train, family = "binomial")
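Before turning to hold-out performance, the nested models can be compared in-sample, for example by AIC (a quick sketch, not part of the original output):
# in-sample comparison of the five nested logistic models (lower AIC is better)
AIC(log0, log1, log2, log3, log4)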
test.pred.m0 <- predict(log0, newdata = test, type = "response")
test.pred.m1 <- predict(log1, newdata = test, type = "response")
test.pred.m2 <- predict(log2, newdata = test, type = "response")
test.pred.m3 <- predict(log3, newdata = test, type = "response")
test.pred.m4 <- predict(log4, newdata = test, type = "response")
library(dplyr)
list(
log0 = table(test$diabetes, test.pred.m0 > 0.5) %>% prop.table() %>% round(3),
log1 = table(test$diabetes, test.pred.m1 > 0.5) %>% prop.table() %>% round(3),
log2 = table(test$diabetes, test.pred.m2 > 0.5) %>% prop.table() %>% round(3),
log3 = table(test$diabetes, test.pred.m3 > 0.5) %>% prop.table() %>% round(3),
log4 = table(test$diabetes, test.pred.m4 > 0.5) %>% prop.table() %>% round(3)
)
## $log0
##
## FALSE TRUE
## 0 0.541 0.092
## 1 0.148 0.218
##
## $log1
##
## FALSE TRUE
## 0 0.541 0.092
## 1 0.148 0.218
##
## $log2
##
## FALSE TRUE
## 0 0.550 0.083
## 1 0.175 0.192
##
## $log3
##
## FALSE TRUE
## 0 0.546 0.087
## 1 0.175 0.192
##
## $log4
##
## FALSE TRUE
## 0 0.546 0.087
## 1 0.175 0.192
test %>%
  mutate(m0.pred = ifelse(test.pred.m0 > 0.5, "1", "0"),   # predicted labels must match the 0/1 coding of diabetes
         m1.pred = ifelse(test.pred.m1 > 0.5, "1", "0"),
         m2.pred = ifelse(test.pred.m2 > 0.5, "1", "0"),
         m3.pred = ifelse(test.pred.m3 > 0.5, "1", "0")) %>%
  summarise(m0.error = mean(diabetes != m0.pred),
            m1.error = mean(diabetes != m1.pred),
            m2.error = mean(diabetes != m2.pred),
            m3.error = mean(diabetes != m3.pred))
library(ROCR)
par(mfrow=c(1, 3))   # three ROC curves side by side
prediction(test.pred.m0, test$diabetes) %>%
performance(measure = "tpr", x.measure = "fpr") %>%
plot()
prediction(test.pred.m1, test$diabetes) %>%
performance(measure = "tpr", x.measure = "fpr") %>%
plot()
prediction(test.pred.m2, test$diabetes) %>%
performance(measure = "tpr", x.measure = "fpr") %>%
plot()
prediction(test.pred.m0, test$diabetes) %>%
performance(measure = "auc") %>%
.@y.values
## [[1]]
## [1] 0.8286535
prediction(test.pred.m1, test$diabetes) %>%
performance(measure = "auc") %>%
.@y.values
## [[1]]
## [1] 0.8140394
prediction(test.pred.m2, test$diabetes) %>%
performance(measure = "auc") %>%
.@y.values
## [[1]]
## [1] 0.8043514
prediction(test.pred.m3, test$diabetes) %>%
performance(measure = "auc") %>%
.@y.values
## [[1]]
## [1] 0.7900657
prediction(test.pred.m4, test$diabetes) %>%
performance(measure = "auc") %>%
.@y.values
## [[1]]
## [1] 0.7600985
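To close the comparison, the five test-set AUCs can be collected into a single named vector (a sketch; auc_of is a small helper introduced here for illustration, not part of the original analysis):
# helper (for illustration only): AUC for one vector of predicted probabilities
auc_of <- function(prob) {
  prediction(prob, test$diabetes) %>%
    performance(measure = "auc") %>%
    .@y.values %>%
    unlist()
}
sapply(list(log0 = test.pred.m0, log1 = test.pred.m1, log2 = test.pred.m2,
            log3 = test.pred.m3, log4 = test.pred.m4), auc_of)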