Load library

library(this.path)
library(readstata13)
library(lspline)
library(mfp)
## Loading required package: survival
library(ggplot2)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

Section 1 – Nonlinear Models (18 marks in total)

The first part of this assessment will focus on the dataset called “PPMI_pred_training_2026.dta”, which contains participants from the original PPMI cohort. As described above, the binary dependent variable is called “updrs2_bin_y2” and indicates whether the participants had either moderate or severe motor problems at the second yearly follow-up. This will be the dependent variable for your whole assignment.

setwd(this.dir())

ppmi_train_data  <- read.dta13("PPMI_pred_training_2026.dta")

str(ppmi_train_data)
## 'data.frame':    390 obs. of  15 variables:
##  $ patno        : int  3001 3002 3003 3006 3007 3010 3012 3014 3018 3020 ...
##  $ age          : num  65 67 56 57 64 46 58 67 60 73 ...
##  $ sex          : int  1 0 0 0 1 1 1 1 0 0 ...
##  $ scopa_gi     : int  1 7 2 0 1 2 3 1 1 6 ...
##  $ scopa_cv     : int  0 0 2 0 0 1 1 1 0 0 ...
##  $ scopa_therm  : int  0 3 0 5 0 5 1 0 0 4 ...
##  $ ess          : int  6 14 8 0 16 2 7 7 14 7 ...
##  $ lns          : int  16 12 12 15 10 14 10 8 8 9 ...
##  $ rem          : int  4 7 3 4 6 9 8 2 3 6 ...
##  $ mseadlg      : int  1 3 3 1 1 1 1 1 1 1 ...
##  $ sdmtotal     : int  10 10 9 10 8 13 11 8 9 9 ...
##  $ stai         : int  12 17 12 12 15 18 17 13 21 14 ...
##  $ updrs1_bl    : int  8 8 12 3 8 11 11 8 7 15 ...
##  $ gds          : int  1 3 1 0 0 3 4 0 3 3 ...
##  $ updrs2_bin_y2: int  0 1 0 1 1 1 1 0 1 0 ...
##  - attr(*, "datalabel")= chr ""
##  - attr(*, "time.stamp")= chr "29 Jan 2026 13:54"
##  - attr(*, "formats")= chr [1:15] "%10.0g" "%10.0g" "%10.0g" "%10.0g" ...
##  - attr(*, "types")= int [1:15] 65528 65526 65530 65530 65530 65530 65530 65530 65530 65530 ...
##  - attr(*, "val.labels")= Named chr [1:15] "" "" "sexlab" "" ...
##   ..- attr(*, "names")= chr [1:15] "" "" "sexlab" "" ...
##  - attr(*, "var.labels")= chr [1:15] "Patient ID" "Age at enrolment" "Patient sex" "SCOPA-AUT Gastrointestinal (GI) Score" ...
##  - attr(*, "version")= int 117
##  - attr(*, "label.table")= list()
##  - attr(*, "expansion.fields")=List of 12
##   ..$ : chr [1:3] "_dta" "ReS_i" "patno"
##   ..$ : chr [1:3] "_dta" "ReS_ver" "v.2"
##   ..$ : chr [1:3] "_dta" "ReS_j" "year"
##   ..$ : chr [1:3] "_dta" "ReS_str" "1"
##   ..$ : chr [1:3] "_dta" "ReS_Xij" "updrs1_bin updrs2_bin"
##   ..$ : chr [1:3] "_dta" "ReS_Xij_wide1" "updrs1_bin_y0 updrs1_bin_y1 updrs1_bin_y2 updrs1_bin_y3 updrs1_bin_y4 updrs1_bin_y5"
##   ..$ : chr [1:3] "_dta" "ReS_Xij_long1" "updrs1_bin"
##   ..$ : chr [1:3] "_dta" "ReS_Xij_wide2" "updrs2_bin_y0 updrs2_bin_y1 updrs2_bin_y2 updrs2_bin_y3 updrs2_bin_y4 updrs2_bin_y5"
##   ..$ : chr [1:3] "_dta" "ReS_Xij_long2" "updrs2_bin"
##   ..$ : chr [1:3] "_dta" "ReS_Xij_n" "2"
##   ..$ : chr [1:3] "patno" "destring" "Characters removed were:"
##   ..$ : chr [1:3] "patno" "destring_cmd" "destring patno, replace"
##  - attr(*, "byteorder")= chr "LSF"
##  - attr(*, "orig.dim")= int [1:2] 390 15
##  - attr(*, "data.label")= chr(0)
names (ppmi_train_data)
##  [1] "patno"         "age"           "sex"           "scopa_gi"     
##  [5] "scopa_cv"      "scopa_therm"   "ess"           "lns"          
##  [9] "rem"           "mseadlg"       "sdmtotal"      "stai"         
## [13] "updrs1_bl"     "gds"           "updrs2_bin_y2"

Q1. Age and sex and the odds of motor problems.

Fit a model for your binary dependent variable, containing age at enrolment, participant sex, plus their interaction. For this model, centre age at 60 years and assume that it is linearly related to the outcome. (i) Interpret the four parameters from this model, on the odds scale

# Recode sex

ppmi_train_data$sex <- factor(ppmi_train_data$sex, levels = c(0,1), labels = c("Female","Male"))

ppmi_train_data$updrs2_bin_y2 <- factor(
  ppmi_train_data$updrs2_bin_y2,
  levels = c(0,1),
  labels = c("No/Mild","Moderate/Severe")
)


# explore the data

hist(ppmi_train_data$age)

prop.table(table(ppmi_train_data$sex))*100
## 
##   Female     Male 
## 34.10256 65.89744
prop.table(table(ppmi_train_data$updrs2_bin_y2))*100
## 
##         No/Mild Moderate/Severe 
##        79.23077        20.76923
# Centre age at 60

ppmi_train_data$age_c <- ppmi_train_data$age - 60
hist(ppmi_train_data$age_c)

# model with interaction terms 

model_Ai <- glm(updrs2_bin_y2 ~ age_c*sex,
             data = ppmi_train_data,
             family = binomial(link = "logit"))

summary (model_Ai)
## 
## Call:
## glm(formula = updrs2_bin_y2 ~ age_c * sex, family = binomial(link = "logit"), 
##     data = ppmi_train_data)
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept)   -1.91821    0.26050  -7.363 1.79e-13 ***
## age_c         -0.01880    0.02782  -0.676   0.4993    
## sexMale        0.70868    0.30421   2.330   0.0198 *  
## age_c:sexMale  0.05870    0.03226   1.819   0.0689 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 398.49  on 389  degrees of freedom
## Residual deviance: 383.29  on 386  degrees of freedom
## AIC: 391.29
## 
## Number of Fisher Scoring iterations: 4
exp(cbind(OR = coef(model_Ai), confint(model_Ai)))
## Waiting for profiling to be done...
##                      OR      2.5 %    97.5 %
## (Intercept)   0.1468697 0.08501712 0.2376474
## age_c         0.9813782 0.92859198 1.0366733
## sexMale       2.0313078 1.13892879 3.7794592
## age_c:sexMale 1.0604534 0.99554637 1.1305672
# model without interaction terms 

model_n <- glm(updrs2_bin_y2 ~ age_c + sex,
               data = ppmi_train_data,
               family = binomial)
summary(model_n)
## 
## Call:
## glm(formula = updrs2_bin_y2 ~ age_c + sex, family = binomial, 
##     data = ppmi_train_data)
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -1.95947    0.26247  -7.466 8.29e-14 ***
## age_c        0.02547    0.01375   1.852  0.06398 .  
## sexMale      0.79657    0.29824   2.671  0.00756 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 398.49  on 389  degrees of freedom
## Residual deviance: 386.61  on 387  degrees of freedom
## AIC: 392.61
## 
## Number of Fisher Scoring iterations: 4
exp(cbind(OR = coef(model_n), confint(model_n)))
## Waiting for profiling to be done...
##                    OR     2.5 %    97.5 %
## (Intercept) 0.1409334 0.0813086 0.2289387
## age_c       1.0257950 0.9989889 1.0544585
## sexMale     2.2179301 1.2608576 4.0846687
# A likelihood ratio test

library(lmtest)
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
lrtest(model_Ai, model_n)
## Likelihood ratio test
## 
## Model 1: updrs2_bin_y2 ~ age_c * sex
## Model 2: updrs2_bin_y2 ~ age_c + sex
##   #Df  LogLik Df  Chisq Pr(>Chisq)  
## 1   4 -191.65                       
## 2   3 -193.31 -1 3.3195    0.06846 .
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
  1. Plot the fitted values from your regression model, to illustrate the sex-specific association between age and the outcome. What type of interaction is this? (2 marks).
library(ggplot2)

# Create predicted probabilities from your model
ppmi_train_data$pred <- predict(model_Ai, type = "response")

# Plot
ggplot(ppmi_train_data, aes(x = age, y = pred, colour = sex)) +
  geom_line() +
  labs(
    x = "Age (years)",
    y = "Predicted probability of moderate/severe motor problems",
    colour = "Sex"
  ) +
  theme_minimal()

A linear spline model for non-motor skills at enrolment.

Next, we will focus on updrs1_bl which quantifies problems with non-motor skills at enrolment, such as sleep, negative mood, and cognition.

  1. Fit a model for your binary dependent variable, using a pair of splines for updrs1_bl and taking the median value of this predictor as the single knot-point. Interpret all three parameters from this model, on the odds scale (3 marks).
# spline variables 

# Find median knot
knot <- median(ppmi_train_data$updrs1_bl, na.rm = TRUE)

# Create spline term
ppmi_train_data$updrs1_spline <- pmax(0, ppmi_train_data$updrs1_bl - knot)

# Fit logistic regression
model_spline <- glm(updrs2_bin_y2 ~ updrs1_bl + updrs1_spline,
                    family = binomial,
                    data = ppmi_train_data)

summary(model_spline)
## 
## Call:
## glm(formula = updrs2_bin_y2 ~ updrs1_bl + updrs1_spline, family = binomial, 
##     data = ppmi_train_data)
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept)    -3.0647     0.5103  -6.005 1.91e-09 ***
## updrs1_bl       0.3744     0.1190   3.146  0.00166 ** 
## updrs1_spline  -0.2935     0.1388  -2.114  0.03448 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 398.49  on 389  degrees of freedom
## Residual deviance: 371.19  on 387  degrees of freedom
## AIC: 377.19
## 
## Number of Fisher Scoring iterations: 5
exp(cbind(OR = coef(model_spline), confint(model_spline)))
## Waiting for profiling to be done...
##                       OR      2.5 %    97.5 %
## (Intercept)   0.04666793 0.01549201 0.1165727
## updrs1_bl     1.45404682 1.16759173 1.8683273
## updrs1_spline 0.74566896 0.56019303 0.9686740

(ii) Produce a suitable plot to illustrate the relationship between updrs1_bl and your outcome and add the fitted values from your logistic regression model

# Create numeric binary outcome for plotting
ppmi_train_data$updrs2_bin_y2_num <- ifelse(
  ppmi_train_data$updrs2_bin_y2 == "Moderate/Severe", 1, 0
)

# Create sequence of values for smooth curve
newdata <- data.frame(
  updrs1_bl = seq(min(ppmi_train_data$updrs1_bl, na.rm = TRUE),
                  max(ppmi_train_data$updrs1_bl, na.rm = TRUE),
                  length.out = 200)
)

# Add spline term using same knot
knot <- median(ppmi_train_data$updrs1_bl, na.rm = TRUE)
newdata$updrs1_spline <- pmax(0, newdata$updrs1_bl - knot)

# Predicted probabilities
newdata$pred <- predict(model_spline, newdata = newdata, type = "response")

# Plot raw data (jittered)
plot(ppmi_train_data$updrs1_bl,
     jitter(ppmi_train_data$updrs2_bin_y2_num, amount = 0.05),
     xlab = "UPDRS1 (baseline)",
     ylab = "Probability of Moderate/Severe outcome",
     pch = 16,
     col = "grey70",
     ylim = c(0, 1.05))

# Add fitted spline curve
lines(newdata$updrs1_bl, newdata$pred, col = "blue", lwd = 3)

# Add vertical line at knot (optional but good)
abline(v = knot, lty = 2, col = "red")

median(ppmi_train_data$updrs1_bl, na.rm = TRUE)
## [1] 5

Fractional Polynomials for gastrointestinal dysautonomia.

The SCOPA-GI measure assesses gastrointestinal dysautonomia in patients with PD. It is part of the Scale for Outcomes in PD for Autonomic Symptoms (SCOPA-AUT). The specific measure used here assesses the impact of gastrointestinal dysautonomia on patients’ health and quality of life, particularly in relation to constipation and other gastrointestinal issues.

Determine the best-fitting fractional polynomial (FP) model for the relationship between SCOPA-GI and your binary dependent variable.

  1. Justify your choice of model
  2. Report the degree of this model, the power (or powers) chosen, and what these powers represent (i.e., which functions of the explanatory variable)
  3. Write out an expression for this regression model which links the log-odds of your outcome with this explanatory variable
# Load the package 

install.packages("mfp")
## Warning: package 'mfp' is in use and will not be installed
library(mfp)
library(survival)
model_fp2 <- mfp(updrs2_bin_y2 ~ fp(scopa_gi, df = 4, select = 0.05),
                family = binomial,
                data = ppmi_train_data,
                verbose = TRUE)
## 
##  Variable    Deviance    Power(s)
## ------------------------------------------------
## Cycle 1
##  scopa_gi             
##              398.489      
##              370.067     1
##              361.778     -0.5
##              357.332     1 2
## 
## 
## Tansformation
##          shift scale
## scopa_gi     1    10
## 
## Fractional polynomials
##          df.initial select alpha df.final power1 power2
## scopa_gi          4   0.05  0.05        2   -0.5      .
## 
## 
## Transformations of covariates:
##                            formula
## scopa_gi I(((scopa_gi+1)/10)^-0.5)
## 
## 
## Deviance table:
##           Resid. Dev
## Null model    398.4888
## Linear model  370.0673
## Final model   361.7777
summary(model_fp2)
## 
## Call:
## glm(formula = updrs2_bin_y2 ~ I(((scopa_gi + 1)/10)^-0.5), family = binomial, 
##     data = ppmi_train_data)
## 
## Coefficients:
##                             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                   1.2025     0.4657   2.582  0.00981 ** 
## I(((scopa_gi + 1)/10)^-0.5)  -1.3425     0.2554  -5.256 1.47e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 398.49  on 389  degrees of freedom
## Residual deviance: 361.78  on 388  degrees of freedom
## AIC: 365.78
## 
## Number of Fisher Scoring iterations: 5
model_fp2$pvalues
##                p.null       p.lin      p.FP power2 power4.1 power4.2
## scopa_gi 2.493678e-08 0.005244407 0.1082733   -0.5        1        2

Iv. Produce a suitable plot which displays the fitted values from your chosen FP model alongside the raw data. Comment on how well this curve appears to fit the data (2 marks).

# Create numeric binary outcome for plotting
ppmi_train_data$updrs2_bin_y2_num <- ifelse(
  ppmi_train_data$updrs2_bin_y2 == "Moderate/Severe", 1, 0
)

# Create values for smooth prediction
newdata <- data.frame(
  scopa_gi = seq(min(ppmi_train_data$scopa_gi, na.rm = TRUE),
                 max(ppmi_train_data$scopa_gi, na.rm = TRUE),
                 length.out = 200)
)

# Predicted probabilities from the fitted FP model
newdata$pred <- predict(model_fp2, newdata = newdata, type = "response")

# Plot raw data with jitter
plot(ppmi_train_data$scopa_gi,
     jitter(ppmi_train_data$updrs2_bin_y2_num, amount = 0.05),
     xlab = "SCOPA-GI",
     ylab = "Predicted probability of UPDRS2 outcome",
     main = "Fractional polynomial fit for SCOPA-GI",
     pch = 16,
     col = "grey70",
     ylim = c(0, 1.05))

# Add fitted FP curve
lines(newdata$scopa_gi, newdata$pred, lwd = 3, col = "blue")

Schwab and England ADL (Activities of Daily Living) scale.

The Schwab and England ADL (Activities of Daily Living) scale is a method of assessing the capabilities of people with impaired mobility. A patient with a score of 100 (indicative of 100%) would be completely independent – able to do chores without slowness, difficulty or impairment. Lower scores indicate some lack of independence, with a score of zero corresponding to patient who is bedridden and helpless.

  1. Determine whether it is adequate to assume that this variable - mseadlg - is linearly related to the log-odds of your binary outcome and justify your conclusions (1 mark).
  2. Interpret all parameters from your chosen model, on the odds scale (1 mark).
model_lin <- glm(updrs2_bin_y2 ~ mseadlg,
                 data = ppmi_train_data,
                 family = binomial (link = "logit"))
summary(model_lin)
## 
## Call:
## glm(formula = updrs2_bin_y2 ~ mseadlg, family = binomial(link = "logit"), 
##     data = ppmi_train_data)
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  -2.3510     0.2707  -8.685  < 2e-16 ***
## mseadlg       0.6803     0.1422   4.785 1.71e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 398.49  on 389  degrees of freedom
## Residual deviance: 372.22  on 388  degrees of freedom
## AIC: 376.22
## 
## Number of Fisher Scoring iterations: 4
exp(cbind(OR = coef(model_lin), confint(model_lin)))
## Waiting for profiling to be done...
##                     OR      2.5 %    97.5 %
## (Intercept) 0.09527644 0.05420649 0.1573279
## mseadlg     1.97442350 1.50724133 2.6373904
model_cat <- glm(updrs2_bin_y2 ~ as.factor(mseadlg),
                 data = ppmi_train_data,
                 family = binomial (link = "logit"))

summary(model_cat)
## 
## Call:
## glm(formula = updrs2_bin_y2 ~ as.factor(mseadlg), family = binomial(link = "logit"), 
##     data = ppmi_train_data)
## 
## Coefficients:
##                     Estimate Std. Error z value Pr(>|z|)    
## (Intercept)          -2.5735     0.3458  -7.442 9.93e-14 ***
## as.factor(mseadlg)1   1.2742     0.4750   2.682  0.00731 ** 
## as.factor(mseadlg)2   1.5794     0.3857   4.095 4.22e-05 ***
## as.factor(mseadlg)3   2.1427     0.4965   4.316 1.59e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 398.49  on 389  degrees of freedom
## Residual deviance: 370.45  on 386  degrees of freedom
## AIC: 378.45
## 
## Number of Fisher Scoring iterations: 5
exp(cbind(OR = coef(model_cat), confint(model_cat)))
## Waiting for profiling to be done...
##                             OR      2.5 %     97.5 %
## (Intercept)         0.07627119 0.03594315  0.1416949
## as.factor(mseadlg)1 3.57575758 1.41766850  9.3296586
## as.factor(mseadlg)2 4.85214348 2.38042553 10.9747574
## as.factor(mseadlg)3 8.52222222 3.26648383 23.2909180
AIC(model_lin, model_cat)
##           df      AIC
## model_lin  2 376.2167
## model_cat  4 378.4502
lrtest(model_lin, model_cat)
## Likelihood ratio test
## 
## Model 1: updrs2_bin_y2 ~ mseadlg
## Model 2: updrs2_bin_y2 ~ as.factor(mseadlg)
##   #Df  LogLik Df  Chisq Pr(>Chisq)
## 1   2 -186.11                     
## 2   4 -185.22  2 1.7664     0.4134

Section II – Prediction modelling (12 marks in total)

Q1. Model development.

For the model development stage, please continue to use the dataset PPMI_pred_training_2026.dta. (i) Fit a multivariable logistic regression model for motor-problems incorporating eight predictors in the format described in Table 2 below. Report the summary output for this model (i.e., regression parameters on the log-odds scale, SEs, and p-values) (1 mark). (ii) Report and interpret on the AUROC for this model (1 mark).

model_final <- glm(
  updrs2_bin_y2 ~ age_c * sex +
    updrs1_bl + updrs1_spline +
    I(((scopa_gi + 1)/10)^-0.5) +
    mseadlg +
    gds +
    ess +
    scopa_cv +
    sdmtotal,
  data = ppmi_train_data,
  family = binomial
)

summary(model_final)
## 
## Call:
## glm(formula = updrs2_bin_y2 ~ age_c * sex + updrs1_bl + updrs1_spline + 
##     I(((scopa_gi + 1)/10)^-0.5) + mseadlg + gds + ess + scopa_cv + 
##     sdmtotal, family = binomial, data = ppmi_train_data)
## 
## Coefficients:
##                             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                 -0.98396    1.24653  -0.789 0.429901    
## age_c                       -0.01913    0.03126  -0.612 0.540470    
## sexMale                      0.76375    0.35342   2.161 0.030694 *  
## updrs1_bl                    0.16521    0.13365   1.236 0.216406    
## updrs1_spline               -0.11475    0.15646  -0.733 0.463306    
## I(((scopa_gi + 1)/10)^-0.5) -0.81824    0.30252  -2.705 0.006836 ** 
## mseadlg                      0.47574    0.15963   2.980 0.002880 ** 
## gds                         -0.07135    0.06964  -1.025 0.305598    
## ess                          0.15663    0.04192   3.737 0.000187 ***
## scopa_cv                     0.10393    0.17382   0.598 0.549881    
## sdmtotal                    -0.18305    0.06742  -2.715 0.006625 ** 
## age_c:sexMale                0.01722    0.03587   0.480 0.631144    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 398.49  on 389  degrees of freedom
## Residual deviance: 303.82  on 378  degrees of freedom
## AIC: 327.82
## 
## Number of Fisher Scoring iterations: 5
library(pROC)
## Type 'citation("pROC")' for a citation.
## 
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
## 
##     cov, smooth, var
prob <- predict(model_final, type = "response")
roc_obj <- roc(ppmi_train_data$updrs2_bin_y2, prob)
## Setting levels: control = No/Mild, case = Moderate/Severe
## Setting direction: controls < cases
auc(roc_obj)
## Area under the curve: 0.8249
plot(roc_obj, col = "blue", lwd = 2)

Model validation.

For the validation step you should use the second datafile called PPMI_pred_testing_2026.dta containing a sample of 534 more recent recruits to the PPMI study. (i) Produce, and interpret, a calibration plot for the multivariable logistic model you derived in Q1, using this second dataset (1 mark).

  1. Report the Area Under the Curve (AUROC), the calibration slope and the “calibration in the large” (CITL) and using these three values comment on the performance of your model in this validation dataset (3 marks).
ppmi_test <- read.dta13("PPMI_pred_testing_2026.dta")

str(ppmi_test)
## 'data.frame':    534 obs. of  15 variables:
##  $ patno        : int  100005 100012 100018 100268 100738 100842 100878 100889 100891 100898 ...
##  $ age          : num  52 66 69 72 71 63 65 74 62 75 ...
##  $ sex          : int  1 0 0 0 0 1 1 0 1 1 ...
##  $ scopa_gi     : int  1 3 2 6 0 2 1 0 3 3 ...
##  $ scopa_cv     : int  1 1 0 0 0 0 2 0 1 0 ...
##  $ scopa_therm  : int  0 2 0 1 1 3 0 0 1 1 ...
##  $ ess          : int  2 2 1 4 4 9 2 3 9 5 ...
##  $ lns          : int  9 9 9 11 10 13 12 11 15 12 ...
##  $ rem          : int  2 5 2 4 5 3 2 1 11 4 ...
##  $ mseadlg      : int  0 1 1 2 0 2 0 0 1 0 ...
##  $ sdmtotal     : int  10 12 9 10 11 10 12 12 8 7 ...
##  $ stai         : int  12 12 22 11 15 24 16 10 15 10 ...
##  $ updrs1_bl    : int  6 7 12 4 9 8 3 0 8 4 ...
##  $ gds          : int  0 4 5 0 2 5 0 0 1 0 ...
##  $ updrs2_bin_y2: int  0 0 1 1 0 0 0 0 0 0 ...
##  - attr(*, "datalabel")= chr ""
##  - attr(*, "time.stamp")= chr "29 Jan 2026 13:54"
##  - attr(*, "formats")= chr [1:15] "%10.0g" "%10.0g" "%10.0g" "%10.0g" ...
##  - attr(*, "types")= int [1:15] 65528 65526 65530 65530 65530 65530 65530 65530 65530 65530 ...
##  - attr(*, "val.labels")= Named chr [1:15] "" "" "sexlab" "" ...
##   ..- attr(*, "names")= chr [1:15] "" "" "sexlab" "" ...
##  - attr(*, "var.labels")= chr [1:15] "Patient ID" "Age at enrolment" "Patient sex" "SCOPA-AUT Gastrointestinal (GI) Score" ...
##  - attr(*, "version")= int 117
##  - attr(*, "label.table")= list()
##  - attr(*, "expansion.fields")=List of 12
##   ..$ : chr [1:3] "_dta" "ReS_i" "patno"
##   ..$ : chr [1:3] "_dta" "ReS_ver" "v.2"
##   ..$ : chr [1:3] "_dta" "ReS_j" "year"
##   ..$ : chr [1:3] "_dta" "ReS_str" "1"
##   ..$ : chr [1:3] "_dta" "ReS_Xij" "updrs1_bin updrs2_bin"
##   ..$ : chr [1:3] "_dta" "ReS_Xij_wide1" "updrs1_bin_y0 updrs1_bin_y1 updrs1_bin_y2 updrs1_bin_y3 updrs1_bin_y4 updrs1_bin_y5"
##   ..$ : chr [1:3] "_dta" "ReS_Xij_long1" "updrs1_bin"
##   ..$ : chr [1:3] "_dta" "ReS_Xij_wide2" "updrs2_bin_y0 updrs2_bin_y1 updrs2_bin_y2 updrs2_bin_y3 updrs2_bin_y4 updrs2_bin_y5"
##   ..$ : chr [1:3] "_dta" "ReS_Xij_long2" "updrs2_bin"
##   ..$ : chr [1:3] "_dta" "ReS_Xij_n" "2"
##   ..$ : chr [1:3] "patno" "destring" "Characters removed were:"
##   ..$ : chr [1:3] "patno" "destring_cmd" "destring patno, replace"
##  - attr(*, "byteorder")= chr "LSF"
##  - attr(*, "orig.dim")= int [1:2] 534 15
##  - attr(*, "data.label")= chr(0)
# recode

ppmi_test$sex <- factor(ppmi_test$sex,
                        levels = c(0,1),
                        labels = c("Female","Male"))

ppmi_test$updrs2_bin_y2 <- factor(
  ppmi_test$updrs2_bin_y2,
  levels = c(0,1),
  labels = c("No/Mild","Moderate/Severe")
)

# summary

hist(ppmi_test$age)

prop.table(table(ppmi_test$sex))*100
## 
##   Female     Male 
## 33.70787 66.29213
prop.table(table(ppmi_test$updrs2_bin_y2))*100
## 
##         No/Mild Moderate/Severe 
##        78.83895        21.16105
# Centre age at 60

ppmi_test$age_c <- ppmi_test$age - 60
hist(ppmi_test$age_c)

ppmi_test$updrs1_spline <- pmax(0, ppmi_test$updrs1_bl - 5)

ppmi_test$prob <- predict(model_final, newdata = ppmi_test, type = "response")

ppmi_test$y_num <- ifelse(ppmi_test$updrs2_bin_y2 == "Moderate/Severe", 1, 0)

ppmi_test <- ppmi_test %>%
  mutate(decile = ntile(prob, 10))

calib <- ppmi_test %>%
  group_by(decile) %>%
  summarise(
    mean_pred = mean(prob, na.rm = TRUE),
    obs = mean(y_num, na.rm = TRUE)
  )

ggplot(calib, aes(x = mean_pred, y = obs)) +
  geom_point(size = 2) +
  geom_line() +
  geom_abline(intercept = 0, slope = 1, linetype = "dashed") +
  labs(
    x = "Mean predicted probability",
    y = "Observed proportion of moderate/severe motor problems"
  ) +
  theme_minimal()

roc_obj <- roc(ppmi_test$y_num, ppmi_test$prob)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
auc(roc_obj)
## Area under the curve: 0.7828
plot(roc_obj)

ppmi_test$lp <- log(ppmi_test$prob / (1 - ppmi_test$prob))

model_slope <- glm(y_num ~ lp, data = ppmi_test, family = binomial)
summary(model_slope)
## 
## Call:
## glm(formula = y_num ~ lp, family = binomial, data = ppmi_test)
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -0.13345    0.16159  -0.826    0.409    
## lp           0.78613    0.09239   8.509   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 551.18  on 533  degrees of freedom
## Residual deviance: 449.63  on 532  degrees of freedom
## AIC: 453.63
## 
## Number of Fisher Scoring iterations: 5
coef(model_slope)["lp"]
##        lp 
## 0.7861271
model_citl <- glm(y_num ~ 1,
                  offset = lp,
                  data = ppmi_test,
                  family = binomial)
summary(model_citl)
## 
## Call:
## glm(formula = y_num ~ 1, family = binomial, data = ppmi_test, 
##     offset = lp)
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)
## (Intercept)   0.1110     0.1241   0.894    0.371
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 454.5  on 533  degrees of freedom
## Residual deviance: 454.5  on 533  degrees of freedom
## AIC: 456.5
## 
## Number of Fisher Scoring iterations: 4
coef(model_citl)
## (Intercept) 
##   0.1109521
ppmi_test$pred_class <- ifelse(ppmi_test$prob > 0.5, 1, 0)

table(Predicted = ppmi_test$pred_class,
      Actual = ppmi_test$y_num)
##          Actual
## Predicted   0   1
##         0 405  76
##         1  16  37
# Sensitivity = TP / (TP + FN)
sensitivity <- sum(ppmi_test$pred_class == 1 & ppmi_test$y_num == 1) /
               sum(ppmi_test$y_num == 1)

# Specificity = TN / (TN + FP)
specificity <- sum(ppmi_test$pred_class == 0 & ppmi_test$y_num == 0) /
               sum(ppmi_test$y_num == 0)

sensitivity
## [1] 0.3274336
specificity
## [1] 0.9619952