Application of Generalized additive model (GAM) in detecting diabetes

Load the necessary packages

library(mgcv)   # library for GAM

## Loading required package: nlme

## This is mgcv 1.8-31. For overview type 'help("mgcv-package")'.

library(ggplot2)  # for beautiful plots

## Warning: package 'ggplot2' was built under R version 4.0.5

library(cdata)    # data wrangling

## Warning: package 'cdata' was built under R version 4.0.5

## Loading required package: wrapr

## Warning: package 'wrapr' was built under R version 4.0.5

## 
## Attaching package: 'wrapr'

## The following object is masked from 'package:mgcv':
## 
##     %.%

library(sigr)     # AUC calculation

## Warning: package 'sigr' was built under R version 4.0.5

Set up working directory where the data file is located. Split the data into both training (80%) and test data (20%), similar to the previous post

data=read.csv('diabetes.csv')   # load the data into global environment

set.seed(123)                   # for reproducibility
randn=runif(nrow(data))
train_idx=randn<=0.8
train=data[train_idx,]
test=data[!train_idx,]

Use training dataset for model training. We can model predictor variable as non-linear by using notation s() in formula - input argument of gam() in mgcv package.

# formula: "Outcome==1" is to binarize the response variable
form_gam=as.formula("Outcome==1~s(Pregnancies)+s(Glucose)+s(BloodPressure)+
                    s(SkinThickness)+s(Insulin)+s(BMI)+s(DiabetesPedigreeFunction)+
                    s(Age)")

gam_model=gam(form_gam,data=train,family = binomial(link="logit"))
gam_model$converged       # check if the algorithm converges

## [1] TRUE

summary(gam_model)

## 
## Family: binomial 
## Link function: logit 
## 
## Formula:
## Outcome == 1 ~ s(Pregnancies) + s(Glucose) + s(BloodPressure) + 
##     s(SkinThickness) + s(Insulin) + s(BMI) + s(DiabetesPedigreeFunction) + 
##     s(Age)
## 
## Parametric coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  -1.1111     0.1328  -8.369   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                               edf Ref.df Chi.sq  p-value    
## s(Pregnancies)              1.098  1.188  0.316   0.7062    
## s(Glucose)                  4.612  5.571 78.896 6.65e-15 ***
## s(BloodPressure)            1.000  1.000  5.512   0.0189 *  
## s(SkinThickness)            1.000  1.000  0.337   0.5613    
## s(Insulin)                  1.000  1.000  1.379   0.2403    
## s(BMI)                      5.174  6.207 36.599 2.95e-06 ***
## s(DiabetesPedigreeFunction) 2.013  2.544  9.750   0.0135 *  
## s(Age)                      2.936  3.698 15.614   0.0034 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## R-sq.(adj) =    0.4   Deviance explained = 36.6%
## UBRE = -0.1246  Scale est. = 1         n = 613

We can always use plot(model) to visualize the basis function mapping on each predictor. But this syntax only allow plot of one smooth function on scale of linear predictor in one figure. Instead, data wrangling and ggplot() is applied to visualize all the basis function mapping.

# Visualize s() output 
terms=predict(gam_model,type="terms")
terms=cbind(Outcome=train$Outcome,terms)

# convert terms into data frame
frame1=as.data.frame(terms)
# remove brackets
colnames(frame1)=gsub('[()]','',colnames(frame1))
vars=setdiff(colnames(train),"Outcome")
# remove first character 's'
colnames(frame1)[-1]=sub(".","",colnames(frame1)[-1])
# Convert the wide to long form
library(cdata)
frame1_long=unpivot_to_blocks(frame1,nameForNewKeyColumn = "basis_function",
                              nameForNewValueColumn = "basis_value",
                              columnsToTakeFrom = vars)
# get the predictor values from training data
data_long=unpivot_to_blocks(train,nameForNewKeyColumn = "predictor",
                            nameForNewValueColumn = "value",
                            columnsToTakeFrom = vars)
frame1_long=cbind(frame1_long,value=data_long$value)

ggplot(data=frame1_long,aes(x=value,y=basis_value)) +
  geom_smooth() +
  facet_wrap(~basis_function,ncol = 3,scales = "free")

## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

# function to calculate the performance of model in terms of accuracy, precision, recall and
# area under the curve (AUC). The curve refers to receiver operating characteristics curve.
performance=function(y,pred){
  confmat_test=table(truth=y,predict=pred>0.5)
  acc=sum(diag(confmat_test))/sum(confmat_test)
  precision=confmat_test[2,2]/sum(confmat_test[,2])
  recall=confmat_test[2,2]/sum(confmat_test[2,])
  auc=calcAUC(pred,y)
  c(acc,precision,recall,auc)
}

# Posterior probability
train$pred=predict(gam_model,newdata = train,type = "response")
test$pred=predict(gam_model,newdata=test,type="response")

# model performance evaluated using training data
perf_train=performance(train$Outcome,train$pred)
perf_test=performance(test$Outcome,test$pred)
perf_mat=rbind(perf_train,perf_test)
colnames(perf_mat)=c("accuracy","precision","recall","AUC")
round(perf_mat,4)

##            accuracy precision recall    AUC
## perf_train   0.8091    0.7446 0.6618 0.8825
## perf_test    0.7419    0.7333 0.5410 0.8186

Slight improvement compared to logistic regression.