Comparison between LDA and GLM use data(PimaIndianDiabetes) from library(mlbench)

## Loading required package: MASS

Convert varb response to factor

dt$diabetes <- ifelse(dt$diabetes=="pos",1,0)
dt$diabetes <- as.factor(dt$diabetes)

Split data

set.seed(123)
spl <- sample(nrow(dt),nrow(dt)*.7)
train <- dt[spl,]
test <- dt[-spl,]

Model LDA(Linier Discriminant Analisys)

lda_anl <- lda(diabetes ~ ., data = train)
lda_anl
## Call:
## lda(diabetes ~ ., data = train)
## 
## Prior probabilities of groups:
##         0         1 
## 0.6499069 0.3500931 
## 
## Group means:
##   pregnant  glucose pressure  triceps  insulin     mass  pedigree      age
## 0 3.315186 108.7507 67.63324 20.29226 67.75645 30.27765 0.4320917 31.14040
## 1 4.675532 140.6436 70.42021 22.94149 97.61170 35.44947 0.5529681 36.57979
## 
## Coefficients of linear discriminants:
##                   LD1
## pregnant  0.095262106
## glucose   0.029220687
## pressure -0.010385290
## triceps  -0.001534027
## insulin  -0.001186982
## mass      0.060652890
## pedigree  0.632364644
## age       0.006395161

Prediction

pred_lda <- predict(lda_anl,newdata=test)

Accuracy

tbl <- table(test$diabetes,pred_lda$class)
accuracy <- sum(diag(tbl))/sum(tbl)
accuracy
## [1] 0.7748918
mean(pred_lda$posterior)
## [1] 0.5
mean(pred_lda$posterior[,1]>=.5)
## [1] 0.7142857
mean(pred_lda$posterior[,1]<.5)
## [1] 0.2857143
pred_lda$posterior[1:10,1]
##          2          3          6          9         12         13         17 
## 0.96052978 0.12788062 0.83908699 0.28129954 0.07026885 0.20192169 0.69431949 
##         18         21         23 
## 0.79471564 0.64475348 0.03757760

Performance LDA

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:data.table':
## 
##     between, first, last
## The following object is masked from 'package:MASS':
## 
##     select
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ROCR)
## Loading required package: gplots
## 
## Attaching package: 'gplots'
## The following object is masked from 'package:stats':
## 
##     lowess
par(mfrow=c(1, 2))
prediction(pred_lda$posterior[,2], test$diabetes) %>%
  performance(measure = "tpr", x.measure = "fpr") %>%
  plot()

Model GLM(logistic regression)

Split data

set.seed(123)
sample <- sample(c(TRUE, FALSE), nrow(dt), replace = T, prob = c(0.7,0.3))
train <- dt[sample, ]
test <- dt[!sample, ]

Model logistic

log0 <- glm(diabetes~.,,data=train, family = "binomial")
log1 <- glm(diabetes~glucose+pressure+triceps+insulin+mass+pedigree+age,data=train, family = "binomial")
log2 <- glm(diabetes~glucose+pressure+triceps+insulin+mass+pedigree,data=train, family = "binomial")
log3 <- glm(diabetes~glucose+pressure+triceps+insulin+mass,data=train, family = "binomial")
log4 <- glm(diabetes~glucose+pressure+triceps+insulin,data=train, family = "binomial")

Prediction

test.pred.m0 <- predict(log0, newdata = test, type = "response")
test.pred.m1 <- predict(log1, newdata = test, type = "response")
test.pred.m2 <- predict(log2, newdata = test, type = "response")
test.pred.m3 <- predict(log3, newdata = test, type = "response")
test.pred.m4 <- predict(log4, newdata = test, type = "response")
library(dplyr)
list(
  log0 = table(test$diabetes, test.pred.m1 > 0.5) %>% prop.table() %>% round(3),
  log1 = table(test$diabetes, test.pred.m1 > 0.5) %>% prop.table() %>% round(3),
  log2 = table(test$diabetes, test.pred.m2 > 0.5) %>% prop.table() %>% round(3),
  log3 = table(test$diabetes, test.pred.m3 > 0.5) %>% prop.table() %>% round(3),
  log4 = table(test$diabetes, test.pred.m3 > 0.5) %>% prop.table() %>% round(3)
)
## $log0
##    
##     FALSE  TRUE
##   0 0.541 0.092
##   1 0.148 0.218
## 
## $log1
##    
##     FALSE  TRUE
##   0 0.541 0.092
##   1 0.148 0.218
## 
## $log2
##    
##     FALSE  TRUE
##   0 0.550 0.083
##   1 0.175 0.192
## 
## $log3
##    
##     FALSE  TRUE
##   0 0.546 0.087
##   1 0.175 0.192
## 
## $log4
##    
##     FALSE  TRUE
##   0 0.546 0.087
##   1 0.175 0.192

TEST

test %>%
  mutate(m0.pred = ifelse(test.pred.m0 > 0.5, "pos", "neg"),
         m1.pred = ifelse(test.pred.m1 > 0.5, "pos", "neg"),
         m2.pred = ifelse(test.pred.m2 > 0.5, "pos", "neg"),
         m3.pred = ifelse(test.pred.m3 > 0.5, "pos", "neg")) %>%
  summarise(m0.error = mean(diabetes != m0.pred),
            m1.error = mean(diabetes != m1.pred),
            m2.error = mean(diabetes != m2.pred),
            m3.error = mean(diabetes != m3.pred))
##   m0.error m1.error m2.error m3.error
## 1        1        1        1        1

Plot

library(ROCR)
par(mfrow=c(1, 2))

prediction(test.pred.m0, test$diabetes) %>%
  performance(measure = "tpr", x.measure = "fpr") %>%
  plot()


prediction(test.pred.m1, test$diabetes) %>%
  performance(measure = "tpr", x.measure = "fpr") %>%
  plot()

prediction(test.pred.m2, test$diabetes) %>%
  performance(measure = "tpr", x.measure = "fpr") %>%
  plot()

Model performance

Model 0

prediction(test.pred.m0, test$diabetes) %>%
  performance(measure = "auc") %>%
  .@y.values
## [[1]]
## [1] 0.8286535
prediction(test.pred.m1, test$diabetes) %>%
  performance(measure = "auc") %>%
  .@y.values
## [[1]]
## [1] 0.8140394

Model 2

prediction(test.pred.m2, test$diabetes) %>%
  performance(measure = "auc") %>%
  .@y.values
## [[1]]
## [1] 0.8043514

Model 3

prediction(test.pred.m3, test$diabetes) %>%
  performance(measure = "auc") %>%
  .@y.values
## [[1]]
## [1] 0.7900657

Model 4

prediction(test.pred.m4, test$diabetes) %>%
  performance(measure = "auc") %>%
  .@y.values
## [[1]]
## [1] 0.7600985