Week 10

Logistic Regression Essentials in R

library(tidyverse)
── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.4     ✔ readr     2.1.5
✔ forcats   1.0.0     ✔ stringr   1.5.1
✔ ggplot2   3.5.1     ✔ tibble    3.2.1
✔ lubridate 1.9.3     ✔ tidyr     1.3.1
✔ purrr     1.0.2     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(caret)
Warning: package 'caret' was built under R version 4.4.2
Loading required package: lattice

Attaching package: 'caret'

The following object is masked from 'package:purrr':

    lift
theme_set(theme_bw())


# Load the data and remove NAs
data("PimaIndiansDiabetes2", package = "mlbench")
PimaIndiansDiabetes2 <- na.omit(PimaIndiansDiabetes2)

# Inspect the data
sample_n(PimaIndiansDiabetes2, 3)
    pregnant glucose pressure triceps insulin mass pedigree age diabetes
450        0     120       74      18      63 30.5    0.285  26      neg
366        5      99       54      28      83 34.0    0.499  30      neg
449        0     104       64      37      64 33.6    0.510  22      pos
# Split the data into training and test set
set.seed(123)
training.samples <- PimaIndiansDiabetes2$diabetes %>% 
  createDataPartition(p = 0.8, list = FALSE)
train.data  <- PimaIndiansDiabetes2[training.samples, ]
test.data <- PimaIndiansDiabetes2[-training.samples, ]
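
Because createDataPartition() samples within each outcome class, the split should preserve the original neg/pos balance. A minimal sanity check, using only base R:

# Compare the class proportions of the training and test sets;
# they should both be close to the proportions in the full data
prop.table(table(train.data$diabetes))
prop.table(table(test.data$diabetes))
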
# Fit the model
model <- glm(diabetes ~ ., data = train.data, family = binomial)

# Summarize the model

summary(model)

Call:
glm(formula = diabetes ~ ., family = binomial, data = train.data)

Coefficients:
              Estimate Std. Error z value Pr(>|z|)    
(Intercept) -1.053e+01  1.440e+00  -7.317 2.54e-13 ***
pregnant     1.005e-01  6.127e-02   1.640  0.10092    
glucose      3.710e-02  6.486e-03   5.719 1.07e-08 ***
pressure    -3.876e-04  1.383e-02  -0.028  0.97764    
triceps      1.418e-02  1.998e-02   0.710  0.47800    
insulin      5.940e-04  1.508e-03   0.394  0.69371    
mass         7.997e-02  3.180e-02   2.515  0.01190 *  
pedigree     1.329e+00  4.823e-01   2.756  0.00585 ** 
age          2.718e-02  2.020e-02   1.346  0.17840    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 398.80  on 313  degrees of freedom
Residual deviance: 267.18  on 305  degrees of freedom
AIC: 285.18

Number of Fisher Scoring iterations: 5
# Make predictions
probabilities <- model %>% predict(test.data, type = "response")
predicted.classes <- ifelse(probabilities > 0.5, "pos", "neg")

# Model accuracy
mean(predicted.classes == test.data$diabetes)
[1] 0.7564103
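
Raw accuracy hides how the errors split across the two classes. Since caret is already loaded, its confusionMatrix() gives a per-class breakdown along with sensitivity and specificity; a minimal sketch:

# Cross-tabulate predicted vs. observed classes, treating "pos" as the event
confusionMatrix(
  data      = factor(predicted.classes, levels = levels(test.data$diabetes)),
  reference = test.data$diabetes,
  positive  = "pos"
)
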
model <- glm(diabetes ~ glucose, data = train.data, family = binomial)
summary(model)$coef
               Estimate  Std. Error   z value     Pr(>|z|)
(Intercept) -6.15882009 0.700096646 -8.797100 1.403974e-18
glucose      0.04327234 0.005341133  8.101716 5.418949e-16
newdata <- data.frame(glucose = c(20,  180))
probabilities <- model %>% predict(newdata, type = "response")
predicted.classes <- ifelse(probabilities > 0.5, "pos", "neg")
predicted.classes
    1     2 
"neg" "pos" 
train.data %>%
  mutate(prob = ifelse(diabetes == "pos", 1, 0)) %>%
  ggplot(aes(glucose, prob)) +
  geom_point(alpha = 0.2) +
  geom_smooth(method = "glm", method.args = list(family = "binomial")) +
  labs(
    title = "Logistic Regression Model", 
    x = "Plasma Glucose Concentration",
    y = "Probability of being diabete-pos"
    )
`geom_smooth()` using formula = 'y ~ x'

model <- glm(diabetes ~ glucose + mass + pregnant,
                data = train.data, family = binomial)
summary(model)$coef
               Estimate  Std. Error   z value     Pr(>|z|)
(Intercept) -9.32369818 1.125997285 -8.280391 1.227711e-16
glucose      0.03886154 0.005404219  7.190962 6.433636e-13
mass         0.09458458 0.023529905  4.019760 5.825738e-05
pregnant     0.14466661 0.045125729  3.205857 1.346611e-03
model <- glm(diabetes ~ ., data = train.data, family = binomial)
summary(model)$coef
                 Estimate  Std. Error     z value     Pr(>|z|)
(Intercept) -1.053400e+01 1.439679266 -7.31690975 2.537464e-13
pregnant     1.005031e-01 0.061266974  1.64041157 1.009196e-01
glucose      3.709621e-02 0.006486093  5.71934633 1.069346e-08
pressure    -3.875933e-04 0.013826185 -0.02803328 9.776356e-01
triceps      1.417771e-02 0.019981885  0.70952823 4.779967e-01
insulin      5.939876e-04 0.001508231  0.39383055 6.937061e-01
mass         7.997447e-02 0.031798907  2.51500698 1.190300e-02
pedigree     1.329149e+00 0.482291020  2.75590704 5.852963e-03
age          2.718224e-02 0.020199295  1.34570257 1.783985e-01
coef(model)
  (Intercept)      pregnant       glucose      pressure       triceps 
-1.053400e+01  1.005031e-01  3.709621e-02 -3.875933e-04  1.417771e-02 
      insulin          mass      pedigree           age 
 5.939876e-04  7.997447e-02  1.329149e+00  2.718224e-02 
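
Because the model is linear on the log-odds scale, exponentiating a coefficient gives an odds ratio. For example, each additional unit of glucose multiplies the odds of diabetes by exp(0.0371), roughly 1.04:

# Convert log-odds coefficients to odds ratios
exp(coef(model))
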
model <- glm(diabetes ~ pregnant + glucose + pressure + mass + pedigree,
                data = train.data, family = binomial)
probabilities <- model %>% predict(test.data, type = "response")
head(probabilities)
       19        21        32        55        64        71 
0.1352603 0.5127526 0.6795376 0.7517408 0.2734867 0.1648174 
contrasts(test.data$diabetes)  # "pos" is coded 1, so the fitted probabilities are P(diabetes = "pos")
    pos
neg   0
pos   1
predicted.classes <- ifelse(probabilities > 0.5, "pos", "neg")
head(predicted.classes)
   19    21    32    55    64    71 
"neg" "pos" "pos" "pos" "neg" "neg" 
mean(predicted.classes == test.data$diabetes)
[1] 0.7564103
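
To check whether dropping triceps, insulin, and age costs any real fit, the reduced model can be compared against the full model with a likelihood-ratio test; a minimal sketch:

# Refit both models and compare; a large p-value suggests the dropped
# predictors contribute little beyond the retained ones
full.model    <- glm(diabetes ~ ., data = train.data, family = binomial)
reduced.model <- glm(diabetes ~ pregnant + glucose + pressure + mass + pedigree,
                     data = train.data, family = binomial)
anova(reduced.model, full.model, test = "Chisq")
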

For the mtcars Data Set

library(tidyverse)
library(caret)
theme_set(theme_bw())


# Load the data and remove NAs

data("mtcars")
mtcars <- na.omit(mtcars)  # mtcars has no missing values; kept for parallelism with the Pima example

# Inspect the data
sample_n(mtcars, 3)
             mpg cyl  disp  hp drat    wt  qsec vs am gear carb
Merc 280C   17.8   6 167.6 123 3.92 3.440 18.90  1  0    4    4
Mazda RX4   21.0   6 160.0 110 3.90 2.620 16.46  0  1    4    4
Honda Civic 30.4   4  75.7  52 4.93 1.615 18.52  1  1    4    2
# Split the data into training and test set
set.seed(123)
training.samples <- mtcars$am %>% 
  createDataPartition(p = 0.8, list = FALSE)
train.data  <- mtcars[training.samples, ]
test.data <- mtcars[-training.samples, ]
train.data
                    mpg cyl  disp  hp drat    wt  qsec vs am gear carb
Mazda RX4          21.0   6 160.0 110 3.90 2.620 16.46  0  1    4    4
Datsun 710         22.8   4 108.0  93 3.85 2.320 18.61  1  1    4    1
Hornet 4 Drive     21.4   6 258.0 110 3.08 3.215 19.44  1  0    3    1
Hornet Sportabout  18.7   8 360.0 175 3.15 3.440 17.02  0  0    3    2
Duster 360         14.3   8 360.0 245 3.21 3.570 15.84  0  0    3    4
Merc 240D          24.4   4 146.7  62 3.69 3.190 20.00  1  0    4    2
Merc 230           22.8   4 140.8  95 3.92 3.150 22.90  1  0    4    2
Merc 280           19.2   6 167.6 123 3.92 3.440 18.30  1  0    4    4
Merc 280C          17.8   6 167.6 123 3.92 3.440 18.90  1  0    4    4
Merc 450SLC        15.2   8 275.8 180 3.07 3.780 18.00  0  0    3    3
Cadillac Fleetwood 10.4   8 472.0 205 2.93 5.250 17.98  0  0    3    4
Chrysler Imperial  14.7   8 440.0 230 3.23 5.345 17.42  0  0    3    4
Fiat 128           32.4   4  78.7  66 4.08 2.200 19.47  1  1    4    1
Honda Civic        30.4   4  75.7  52 4.93 1.615 18.52  1  1    4    2
Toyota Corolla     33.9   4  71.1  65 4.22 1.835 19.90  1  1    4    1
Toyota Corona      21.5   4 120.1  97 3.70 2.465 20.01  1  0    3    1
Dodge Challenger   15.5   8 318.0 150 2.76 3.520 16.87  0  0    3    2
AMC Javelin        15.2   8 304.0 150 3.15 3.435 17.30  0  0    3    2
Camaro Z28         13.3   8 350.0 245 3.73 3.840 15.41  0  0    3    4
Fiat X1-9          27.3   4  79.0  66 4.08 1.935 18.90  1  1    4    1
Porsche 914-2      26.0   4 120.3  91 4.43 2.140 16.70  0  1    5    2
Lotus Europa       30.4   4  95.1 113 3.77 1.513 16.90  1  1    5    2
Ford Pantera L     15.8   8 351.0 264 4.22 3.170 14.50  0  1    5    4
Ferrari Dino       19.7   6 145.0 175 3.62 2.770 15.50  0  1    5    6
Maserati Bora      15.0   8 301.0 335 3.54 3.570 14.60  0  1    5    8
Volvo 142E         21.4   4 121.0 109 4.11 2.780 18.60  1  1    4    2
# Fit the model
model <- glm(am ~ ., data = train.data, family = binomial)
Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
# Summarize the model

summary(model)

Call:
glm(formula = am ~ ., family = binomial, data = train.data)

Coefficients:
              Estimate Std. Error z value Pr(>|z|)
(Intercept) -1.860e+01  1.993e+06       0        1
mpg         -1.114e-01  7.690e+04       0        1
cyl         -1.301e-01  3.127e+05       0        1
disp        -4.812e-01  7.367e+03       0        1
hp           3.994e-01  4.925e+03       0        1
drat         2.703e+01  4.056e+05       0        1
wt           1.624e+01  9.006e+05       0        1
qsec        -8.030e+00  7.423e+04       0        1
vs          -5.034e+01  5.938e+05       0        1
gear         3.554e+01  7.552e+05       0        1
carb        -1.957e+01  2.526e+05       0        1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 3.5890e+01  on 25  degrees of freedom
Residual deviance: 5.0366e-10  on 15  degrees of freedom
AIC: 22

Number of Fisher Scoring iterations: 25
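
The warning above, the essentially zero residual deviance (5.04e-10 on 15 degrees of freedom), the enormous standard errors, and the 25 Fisher scoring iterations all point to complete separation: with only 26 training rows and 10 predictors, the model classifies the training data perfectly, so these coefficients and p-values are not trustworthy. A quick way to confirm:

# Under complete separation the fitted training probabilities collapse
# to (numerically) 0 and 1
range(fitted(model))
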
# Make predictions
probabilities <- model %>% predict(test.data, type = "response")
# `am` is coded 0/1 in mtcars, so the predicted classes must be 0/1 as well;
# comparing "pos"/"neg" labels against 0/1 values would always yield 0 accuracy
predicted.classes <- ifelse(probabilities > 0.5, 1, 0)

# Model accuracy
mean(predicted.classes == test.data$am)
model <- glm(am ~ mpg, data = train.data, family = binomial)
summary(model)$coef
              Estimate Std. Error   z value   Pr(>|z|)
(Intercept) -5.6787708  2.2621512 -2.510341 0.01206147
mpg          0.2674735  0.1086672  2.461402 0.01383954
newdata <- data.frame(mpg = c(20, 180))  # note: mpg = 180 is far outside the observed range
probabilities <- model %>% predict(newdata, type = "response")
predicted.classes <- ifelse(probabilities > 0.5, 1, 0)  # 0 = automatic, 1 = manual
predicted.classes
1 2 
0 1 
train.data %>%
  mutate(prob = am) %>%  # am is already coded 0/1, so it can be used directly
  ggplot(aes(mpg, prob)) +
  geom_point(alpha = 0.2) +
  geom_smooth(method = "glm", method.args = list(family = "binomial")) +
  labs(
    title = "Logistic Regression Model", 
    x = "am",
    y = "mpg"
    )
`geom_smooth()` using formula = 'y ~ x'

model <- glm(am ~ ., data = train.data, family = binomial)
Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
summary(model)$coef
               Estimate  Std. Error       z value  Pr(>|z|)
(Intercept) -18.5964175 1993190.080 -9.329977e-06 0.9999926
mpg          -0.1113959   76896.770 -1.448642e-06 0.9999988
cyl          -0.1300507  312749.981 -4.158296e-07 0.9999997
disp         -0.4811709    7366.707 -6.531696e-05 0.9999479
hp            0.3994277    4924.666  8.110758e-05 0.9999353
drat         27.0289227  405617.578  6.663647e-05 0.9999468
wt           16.2447628  900644.022  1.803683e-05 0.9999856
qsec         -8.0299606   74230.936 -1.081754e-04 0.9999137
vs          -50.3366805  593806.706 -8.476947e-05 0.9999324
gear         35.5368001  755228.427  4.705437e-05 0.9999625
carb        -19.5741150  252594.110 -7.749237e-05 0.9999382
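
The same separation problem recurs here: every standard error is in the hundreds of thousands and every p-value is essentially 1, so no coefficient is interpretable. Sensible remedies are to fit far fewer predictors (as in the am ~ mpg model above) or to penalize the coefficients. A hedged sketch of the latter, assuming the glmnet package is installed:

# Ridge-penalized logistic regression keeps coefficients finite even
# under separation; lambda is chosen by cross-validation
library(glmnet)
x <- as.matrix(train.data[, setdiff(names(train.data), "am")])
y <- train.data$am
set.seed(123)
cv.fit <- cv.glmnet(x, y, family = "binomial", alpha = 0, nfolds = 5)
coef(cv.fit, s = "lambda.min")
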