── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.4     ✔ readr     2.1.5
✔ forcats   1.0.0     ✔ stringr   1.5.1
✔ ggplot2   3.5.1     ✔ tibble    3.2.1
✔ lubridate 1.9.3     ✔ tidyr     1.3.1
✔ purrr     1.0.2
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag() masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(caret)
Warning: package 'caret' was built under R version 4.4.2
Loading required package: lattice
Attaching package: 'caret'
The following object is masked from 'package:purrr':
lift
theme_set(theme_bw())

# Load the data and remove NAs
data("PimaIndiansDiabetes2", package = "mlbench")
PimaIndiansDiabetes2 <- na.omit(PimaIndiansDiabetes2)

# Inspect the data
sample_n(PimaIndiansDiabetes2, 3)
# Split the data into training and test set
set.seed(123)
training.samples <- PimaIndiansDiabetes2$diabetes %>%
  createDataPartition(p = 0.8, list = FALSE)
train.data <- PimaIndiansDiabetes2[training.samples, ]
test.data <- PimaIndiansDiabetes2[-training.samples, ]
# Fit the model
model <- glm(diabetes ~ ., data = train.data, family = binomial)

# Summarize the model
summary(model)
Call:
glm(formula = diabetes ~ ., family = binomial, data = train.data)
Coefficients:
              Estimate Std. Error z value Pr(>|z|)
(Intercept) -1.053e+01  1.440e+00  -7.317 2.54e-13 ***
pregnant     1.005e-01  6.127e-02   1.640  0.10092
glucose      3.710e-02  6.486e-03   5.719 1.07e-08 ***
pressure    -3.876e-04  1.383e-02  -0.028  0.97764
triceps      1.418e-02  1.998e-02   0.710  0.47800
insulin      5.940e-04  1.508e-03   0.394  0.69371
mass         7.997e-02  3.180e-02   2.515  0.01190 *
pedigree     1.329e+00  4.823e-01   2.756  0.00585 **
age          2.718e-02  2.020e-02   1.346  0.17840
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 398.80 on 313 degrees of freedom
Residual deviance: 267.18 on 305 degrees of freedom
AIC: 285.18
Number of Fisher Scoring iterations: 5
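In this summary only glucose, mass and pedigree are significant at the 5% level. One common way to read the estimates is on the odds scale; a minimal sketch, not part of the original session:

# exponentiate the log-odds coefficients to obtain odds ratios
exp(coef(model))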
# Make predictions
probabilities <- model %>% predict(test.data, type = "response")
predicted.classes <- ifelse(probabilities > 0.5, "pos", "neg")

# Model accuracy
mean(predicted.classes == test.data$diabetes)
[1] 0.7564103
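The full model classifies about 76% of the test set correctly. For a more detailed breakdown (sensitivity, specificity, and related statistics), caret's confusionMatrix() can be applied to the same predictions; a minimal sketch, assuming the objects created above:

# cross-tabulate predicted vs. observed classes on the test set
confusionMatrix(
  data      = factor(predicted.classes, levels = levels(test.data$diabetes)),
  reference = test.data$diabetes,
  positive  = "pos"
)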
model <- glm(diabetes ~ glucose, data = train.data, family = binomial)
summary(model)$coef
               Estimate  Std. Error   z value     Pr(>|z|)
(Intercept) -6.15882009 0.700096646 -8.797100 1.403974e-18
glucose      0.04327234 0.005341133  8.101716 5.418949e-16
newdata <- data.frame(glucose = c(20, 180))
probabilities <- model %>% predict(newdata, type = "response")
predicted.classes <- ifelse(probabilities > 0.5, "pos", "neg")
predicted.classes
1 2
"neg" "pos"
train.data %>%
  mutate(prob = ifelse(diabetes == "pos", 1, 0)) %>%
  ggplot(aes(glucose, prob)) +
  geom_point(alpha = 0.2) +
  geom_smooth(method = "glm", method.args = list(family = "binomial")) +
  labs(
    title = "Logistic Regression Model",
    x = "Plasma Glucose Concentration",
    y = "Probability of being diabetes-positive"
  )
`geom_smooth()` using formula = 'y ~ x'
model <- glm(diabetes ~ glucose + mass + pregnant,
             data = train.data, family = binomial)
summary(model)$coef
               Estimate  Std. Error   z value     Pr(>|z|)
(Intercept) -9.32369818 1.125997285 -8.280391 1.227711e-16
glucose      0.03886154 0.005404219  7.190962 6.433636e-13
mass         0.09458458 0.023529905  4.019760 5.825738e-05
pregnant     0.14466661 0.045125729  3.205857 1.346611e-03
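The prediction workflow for this three-predictor model is the same as for the glucose-only model; the predictor values below are hypothetical, chosen only to illustrate the call:

# hypothetical new observations (illustrative values, not from the data)
newdata <- data.frame(glucose = c(110, 180), mass = c(25, 35), pregnant = c(1, 4))
probabilities <- model %>% predict(newdata, type = "response")
ifelse(probabilities > 0.5, "pos", "neg")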
model <- glm(diabetes ~ ., data = train.data, family = binomial)
summary(model)$coef
library(tidyverse)
library(caret)
theme_set(theme_bw())

# Load the data and remove NAs
data("mtcars")
mtcars <- na.omit(mtcars)

# Inspect the data
sample_n(mtcars, 3)
# Split the data into training and test set
set.seed(123)
training.samples <- mtcars$am %>%
  createDataPartition(p = 0.8, list = FALSE)
train.data <- mtcars[training.samples, ]
test.data <- mtcars[-training.samples, ]
train.data
# Fit the model
model <- glm(am ~ ., data = train.data, family = binomial)
Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
# Summarize the model
summary(model)
Call:
glm(formula = am ~ ., family = binomial, data = train.data)
Coefficients:
              Estimate Std. Error z value Pr(>|z|)
(Intercept) -1.860e+01  1.993e+06       0        1
mpg         -1.114e-01  7.690e+04       0        1
cyl         -1.301e-01  3.127e+05       0        1
disp        -4.812e-01  7.367e+03       0        1
hp           3.994e-01  4.925e+03       0        1
drat         2.703e+01  4.056e+05       0        1
wt           1.624e+01  9.006e+05       0        1
qsec        -8.030e+00  7.423e+04       0        1
vs          -5.034e+01  5.938e+05       0        1
gear         3.554e+01  7.552e+05       0        1
carb        -1.957e+01  2.526e+05       0        1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 3.5890e+01 on 25 degrees of freedom
Residual deviance: 5.0366e-10 on 15 degrees of freedom
AIC: 22
Number of Fisher Scoring iterations: 25
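The warning above, the enormous standard errors, the residual deviance of essentially zero, and the 25 Fisher scoring iterations all point to complete separation: with ten predictors and only 26 training rows, the model fits the training data perfectly and the coefficients are not reliably estimated. A quick hedged check, not part of the original session:

# under separation the fitted probabilities collapse to (nearly) 0 and 1
range(fitted(model))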
# Make predictions
probabilities <- model %>% predict(test.data, type = "response")
predicted.classes <- ifelse(probabilities > 0.5, "pos", "neg")

# Model accuracy
mean(predicted.classes == test.data$am)
[1] 0
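The accuracy of 0 is an artifact of the label coding rather than of the model: mtcars$am is numeric 0/1, so the character labels "pos" and "neg" can never match it. A corrected sketch using the same 0/1 coding (its accuracy value is not shown in the original document):

# recode the predicted classes to 0/1 so they are comparable with am
predicted.classes <- ifelse(probabilities > 0.5, 1, 0)
mean(predicted.classes == test.data$am)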
model <- glm(am ~ mpg, data = train.data, family = binomial)
summary(model)$coef
              Estimate Std. Error   z value   Pr(>|z|)
(Intercept) -5.6787708  2.2621512 -2.510341 0.01206147
mpg          0.2674735  0.1086672  2.461402 0.01383954
newdata <- data.frame(mpg = c(20, 180))
probabilities <- model %>% predict(newdata, type = "response")
predicted.classes <- ifelse(probabilities > 0.5, "pos", "neg")
predicted.classes