This presentation walks through the steps for deploying a GLM (logistic regression) machine learning model built in R as a Plumber API.
setwd("~/Documents/Deploying a simple ML model with Plumber 101")
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.4.4 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.0
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(mlbench)
library(caret)
## Loading required package: lattice
##
## Attaching package: 'caret'
##
## The following object is masked from 'package:purrr':
##
## lift
data(PimaIndiansDiabetes)
str(PimaIndiansDiabetes)
## 'data.frame': 768 obs. of 9 variables:
## $ pregnant: num 6 1 8 1 0 5 3 10 2 8 ...
## $ glucose : num 148 85 183 89 137 116 78 115 197 125 ...
## $ pressure: num 72 66 64 66 40 74 50 0 70 96 ...
## $ triceps : num 35 29 0 23 35 0 32 0 45 0 ...
## $ insulin : num 0 0 0 94 168 0 88 0 543 0 ...
## $ mass : num 33.6 26.6 23.3 28.1 43.1 25.6 31 35.3 30.5 0 ...
## $ pedigree: num 0.627 0.351 0.672 0.167 2.288 ...
## $ age : num 50 31 32 21 33 30 26 29 53 54 ...
## $ diabetes: Factor w/ 2 levels "neg","pos": 2 1 2 1 2 1 2 1 2 2 ...
# Change the reference level of the outcome to 'pos' using the relevel() function.
PimaIndiansDiabetes$diabetes <- relevel(PimaIndiansDiabetes$diabetes, ref = "pos")
levels(PimaIndiansDiabetes$diabetes)
## [1] "pos" "neg"
str(PimaIndiansDiabetes)
## 'data.frame': 768 obs. of 9 variables:
## $ pregnant: num 6 1 8 1 0 5 3 10 2 8 ...
## $ glucose : num 148 85 183 89 137 116 78 115 197 125 ...
## $ pressure: num 72 66 64 66 40 74 50 0 70 96 ...
## $ triceps : num 35 29 0 23 35 0 32 0 45 0 ...
## $ insulin : num 0 0 0 94 168 0 88 0 543 0 ...
## $ mass : num 33.6 26.6 23.3 28.1 43.1 25.6 31 35.3 30.5 0 ...
## $ pedigree: num 0.627 0.351 0.672 0.167 2.288 ...
## $ age : num 50 31 32 21 33 30 26 29 53 54 ...
## $ diabetes: Factor w/ 2 levels "pos","neg": 1 2 1 2 1 2 1 2 1 1 ...
set.seed(3456)
trainIndex <- createDataPartition(PimaIndiansDiabetes$diabetes, p = .8, list = FALSE, times = 1)
df_Train <- PimaIndiansDiabetes[ trainIndex,]
df_Test <- PimaIndiansDiabetes[-trainIndex,]
# Prepare training scheme
control <- trainControl(method="repeatedcv", number=10, repeats=3)
# Train the GLM model
set.seed(7)
modelGlm <- train(diabetes~., data=df_Train, method="glm", metric="Accuracy", trControl=control)
summary(modelGlm)
##
## Call:
## NULL
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 8.5724432 0.8119720 10.558 < 2e-16 ***
## pregnant -0.1376126 0.0368160 -3.738 0.000186 ***
## glucose -0.0408528 0.0043887 -9.309 < 2e-16 ***
## pressure 0.0128471 0.0059049 2.176 0.029580 *
## triceps -0.0033086 0.0077105 -0.429 0.667845
## insulin 0.0014884 0.0009913 1.501 0.133240
## mass -0.0786215 0.0168542 -4.665 3.09e-06 ***
## pedigree -0.9839826 0.3379274 -2.912 0.003593 **
## age -0.0055268 0.0108558 -0.509 0.610677
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 796.05 on 614 degrees of freedom
## Residual deviance: 562.35 on 606 degrees of freedom
## AIC: 580.35
##
## Number of Fisher Scoring iterations: 5
# Estimate variable importance
importanceGlm <- varImp(modelGlm, scale=FALSE)
# Summarize importance
print(importanceGlm)
## glm variable importance
##
## Overall
## glucose 9.3086
## mass 4.6648
## pregnant 3.7379
## pedigree 2.9118
## pressure 2.1757
## insulin 1.5014
## age 0.5091
## triceps 0.4291
# Plot importance
plot(importanceGlm, main = "Variable Importance GLM Model")
# Use the model to predict diabetes status for the test set
Glm_pred <- predict(modelGlm, df_Test)
# Create confusion matrix
confusionMatrix(df_Test$diabetes, Glm_pred, positive='pos') # positive='pos' sets the positive class explicitly
## Confusion Matrix and Statistics
##
## Reference
## Prediction pos neg
## pos 26 27
## neg 15 85
##
## Accuracy : 0.7255
## 95% CI : (0.6476, 0.7945)
## No Information Rate : 0.732
## P-Value [Acc > NIR] : 0.61284
##
## Kappa : 0.3597
##
## Mcnemar's Test P-Value : 0.08963
##
## Sensitivity : 0.6341
## Specificity : 0.7589
## Pos Pred Value : 0.4906
## Neg Pred Value : 0.8500
## Prevalence : 0.2680
## Detection Rate : 0.1699
## Detection Prevalence : 0.3464
## Balanced Accuracy : 0.6965
##
## 'Positive' Class : pos
##
control <- trainControl(method="repeatedcv", number=10, repeats=3)
set.seed(7)
model_diabetes <- train(diabetes ~ glucose + mass + pregnant + pedigree,
                        data = df_Train, method = "glm", metric = "Accuracy", trControl = control)
summary(model_diabetes)
##
## Call:
## NULL
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 8.742910 0.749635 11.663 < 2e-16 ***
## glucose -0.038620 0.003969 -9.731 < 2e-16 ***
## mass -0.069377 0.015490 -4.479 7.51e-06 ***
## pregnant -0.141247 0.030403 -4.646 3.39e-06 ***
## pedigree -0.918530 0.329290 -2.789 0.00528 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 796.05 on 614 degrees of freedom
## Residual deviance: 569.76 on 610 degrees of freedom
## AIC: 579.76
##
## Number of Fisher Scoring iterations: 5
# Use the reduced model to predict diabetes status for the test set
Glm_pred_diabetes <- predict(model_diabetes, df_Test)
# Create confusion matrix
confusionMatrix(df_Test$diabetes, Glm_pred_diabetes) # the positive class defaults to the first factor level ('pos'); add positive='neg' to change it
## Confusion Matrix and Statistics
##
## Reference
## Prediction pos neg
## pos 25 28
## neg 14 86
##
## Accuracy : 0.7255
## 95% CI : (0.6476, 0.7945)
## No Information Rate : 0.7451
## P-Value [Acc > NIR] : 0.74470
##
## Kappa : 0.3537
##
## Mcnemar's Test P-Value : 0.04486
##
## Sensitivity : 0.6410
## Specificity : 0.7544
## Pos Pred Value : 0.4717
## Neg Pred Value : 0.8600
## Prevalence : 0.2549
## Detection Rate : 0.1634
## Detection Prevalence : 0.3464
## Balanced Accuracy : 0.6977
##
## 'Positive' Class : pos
##
# Wrap the model in a helper that takes the four predictors and returns class probabilities
diabetesDiagnosis <- function(glucose, mass, pregnant, pedigree){
  newdata <- data.frame(glucose = glucose, mass = mass, pregnant = pregnant, pedigree = pedigree)
  predict(model_diabetes, newdata, type = "prob")
}
print(diabetesDiagnosis(148,33.6,6,0.627))
## pos neg
## 1 0.6742193 0.3257807
# Save the fitted model to disk, then load it back the same way the API will
saveRDS(model_diabetes, "model-diabetes.rds")
model <- readRDS("model-diabetes.rds")
#* Returns the probability of a patient being positive for diabetes
#* @param glucose Plasma glucose concentration
#* @param mass Body mass index
#* @param pregnant Number of times pregnant
#* @param pedigree Diabetes pedigree function
#* @post /diabetes
function(glucose, mass, pregnant, pedigree){
  # Plumber passes parameters in as strings, so coerce them to numeric before predicting
  newdata <- data.frame(glucose = as.numeric(glucose),
                        mass = as.numeric(mass),
                        pregnant = as.numeric(pregnant),
                        pedigree = as.numeric(pedigree))
  predict(model, newdata, type = "prob")
}
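To serve the endpoint, the annotated function above, together with the readRDS() call that loads the model, should be saved in its own file and launched from a separate R session. A minimal sketch, assuming the file is named plumber.R and port 8000 is free (both are assumptions; adjust to your setup):

library(plumber)
# Parse the #* annotations in plumber.R and start the API on localhost:8000
pr <- plumb("plumber.R")  # file name assumed; use your own path
pr$run(port = 8000)

Once running, Plumber (version 1.0 and later) serves an interactive Swagger page at http://localhost:8000/__docs__/ where the endpoint can be tested, as shown in the screenshot below.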
knitr::include_graphics("Debug_pic.png")
Here is an example of the Plumber API window:
To begin using the model, click the [POST] button, then click the [Try it out] button.
Now enter the following values for the parameters: glucose=148, mass=33.6, pregnant=6, and pedigree=0.627. Then click the [Execute] button to view the result.
Here is the final predictive output in the response body: “pos” = 0.6742 and “neg” = 0.3258. This result indicates that the person most likely has diabetes.
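The same request can also be sent programmatically instead of through the Swagger window. A minimal sketch using the httr package, assuming the API is running locally on port 8000 (adjust the host and port to match your session):

library(httr)
# POST the four predictor values as query parameters to the /diabetes endpoint
res <- POST("http://localhost:8000/diabetes",
            query = list(glucose = 148, mass = 33.6, pregnant = 6, pedigree = 0.627))
content(res)  # parsed response: a list with the 'pos' and 'neg' probabilities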
A.M.D.G.