library(this.path)
library(readstata13)
library(lspline)
library(mfp)
## Loading required package: survival
library(ggplot2)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
The first part of this assessment will focus on the dataset called “PPMI_pred_training_2026.dta”, which contains participants from the original PPMI cohort. As described above, the binary dependent variable is called “updrs2_bin_y2” and indicates whether the participants had either moderate or severe motor problems at the second yearly follow-up. This will be the dependent variable for your whole assignment.
setwd(this.dir())
ppmi_train_data <- read.dta13("PPMI_pred_training_2026.dta")
str(ppmi_train_data)
## 'data.frame': 390 obs. of 15 variables:
## $ patno : int 3001 3002 3003 3006 3007 3010 3012 3014 3018 3020 ...
## $ age : num 65 67 56 57 64 46 58 67 60 73 ...
## $ sex : int 1 0 0 0 1 1 1 1 0 0 ...
## $ scopa_gi : int 1 7 2 0 1 2 3 1 1 6 ...
## $ scopa_cv : int 0 0 2 0 0 1 1 1 0 0 ...
## $ scopa_therm : int 0 3 0 5 0 5 1 0 0 4 ...
## $ ess : int 6 14 8 0 16 2 7 7 14 7 ...
## $ lns : int 16 12 12 15 10 14 10 8 8 9 ...
## $ rem : int 4 7 3 4 6 9 8 2 3 6 ...
## $ mseadlg : int 1 3 3 1 1 1 1 1 1 1 ...
## $ sdmtotal : int 10 10 9 10 8 13 11 8 9 9 ...
## $ stai : int 12 17 12 12 15 18 17 13 21 14 ...
## $ updrs1_bl : int 8 8 12 3 8 11 11 8 7 15 ...
## $ gds : int 1 3 1 0 0 3 4 0 3 3 ...
## $ updrs2_bin_y2: int 0 1 0 1 1 1 1 0 1 0 ...
## - attr(*, "datalabel")= chr ""
## - attr(*, "time.stamp")= chr "29 Jan 2026 13:54"
## - attr(*, "formats")= chr [1:15] "%10.0g" "%10.0g" "%10.0g" "%10.0g" ...
## - attr(*, "types")= int [1:15] 65528 65526 65530 65530 65530 65530 65530 65530 65530 65530 ...
## - attr(*, "val.labels")= Named chr [1:15] "" "" "sexlab" "" ...
## ..- attr(*, "names")= chr [1:15] "" "" "sexlab" "" ...
## - attr(*, "var.labels")= chr [1:15] "Patient ID" "Age at enrolment" "Patient sex" "SCOPA-AUT Gastrointestinal (GI) Score" ...
## - attr(*, "version")= int 117
## - attr(*, "label.table")= list()
## - attr(*, "expansion.fields")=List of 12
## ..$ : chr [1:3] "_dta" "ReS_i" "patno"
## ..$ : chr [1:3] "_dta" "ReS_ver" "v.2"
## ..$ : chr [1:3] "_dta" "ReS_j" "year"
## ..$ : chr [1:3] "_dta" "ReS_str" "1"
## ..$ : chr [1:3] "_dta" "ReS_Xij" "updrs1_bin updrs2_bin"
## ..$ : chr [1:3] "_dta" "ReS_Xij_wide1" "updrs1_bin_y0 updrs1_bin_y1 updrs1_bin_y2 updrs1_bin_y3 updrs1_bin_y4 updrs1_bin_y5"
## ..$ : chr [1:3] "_dta" "ReS_Xij_long1" "updrs1_bin"
## ..$ : chr [1:3] "_dta" "ReS_Xij_wide2" "updrs2_bin_y0 updrs2_bin_y1 updrs2_bin_y2 updrs2_bin_y3 updrs2_bin_y4 updrs2_bin_y5"
## ..$ : chr [1:3] "_dta" "ReS_Xij_long2" "updrs2_bin"
## ..$ : chr [1:3] "_dta" "ReS_Xij_n" "2"
## ..$ : chr [1:3] "patno" "destring" "Characters removed were:"
## ..$ : chr [1:3] "patno" "destring_cmd" "destring patno, replace"
## - attr(*, "byteorder")= chr "LSF"
## - attr(*, "orig.dim")= int [1:2] 390 15
## - attr(*, "data.label")= chr(0)
names (ppmi_train_data)
## [1] "patno" "age" "sex" "scopa_gi"
## [5] "scopa_cv" "scopa_therm" "ess" "lns"
## [9] "rem" "mseadlg" "sdmtotal" "stai"
## [13] "updrs1_bl" "gds" "updrs2_bin_y2"
Fit a model for your binary dependent variable, containing age at enrolment, participant sex, plus their interaction. For this model, centre age at 60 years and assume that it is linearly related to the outcome. (i) Interpret the four parameters from this model, on the odds scale
# Recode sex
ppmi_train_data$sex <- factor(ppmi_train_data$sex, levels = c(0,1), labels = c("Female","Male"))
ppmi_train_data$updrs2_bin_y2 <- factor(
ppmi_train_data$updrs2_bin_y2,
levels = c(0,1),
labels = c("No/Mild","Moderate/Severe")
)
# explore the data
hist(ppmi_train_data$age)
prop.table(table(ppmi_train_data$sex))*100
##
## Female Male
## 34.10256 65.89744
prop.table(table(ppmi_train_data$updrs2_bin_y2))*100
##
## No/Mild Moderate/Severe
## 79.23077 20.76923
# Centre age at 60
ppmi_train_data$age_c <- ppmi_train_data$age - 60
hist(ppmi_train_data$age_c)
# model with interaction terms
model_Ai <- glm(updrs2_bin_y2 ~ age_c*sex,
data = ppmi_train_data,
family = binomial(link = "logit"))
summary (model_Ai)
##
## Call:
## glm(formula = updrs2_bin_y2 ~ age_c * sex, family = binomial(link = "logit"),
## data = ppmi_train_data)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.91821 0.26050 -7.363 1.79e-13 ***
## age_c -0.01880 0.02782 -0.676 0.4993
## sexMale 0.70868 0.30421 2.330 0.0198 *
## age_c:sexMale 0.05870 0.03226 1.819 0.0689 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 398.49 on 389 degrees of freedom
## Residual deviance: 383.29 on 386 degrees of freedom
## AIC: 391.29
##
## Number of Fisher Scoring iterations: 4
exp(cbind(OR = coef(model_Ai), confint(model_Ai)))
## Waiting for profiling to be done...
## OR 2.5 % 97.5 %
## (Intercept) 0.1468697 0.08501712 0.2376474
## age_c 0.9813782 0.92859198 1.0366733
## sexMale 2.0313078 1.13892879 3.7794592
## age_c:sexMale 1.0604534 0.99554637 1.1305672
# model without interaction terms
model_n <- glm(updrs2_bin_y2 ~ age_c + sex,
data = ppmi_train_data,
family = binomial)
summary(model_n)
##
## Call:
## glm(formula = updrs2_bin_y2 ~ age_c + sex, family = binomial,
## data = ppmi_train_data)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.95947 0.26247 -7.466 8.29e-14 ***
## age_c 0.02547 0.01375 1.852 0.06398 .
## sexMale 0.79657 0.29824 2.671 0.00756 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 398.49 on 389 degrees of freedom
## Residual deviance: 386.61 on 387 degrees of freedom
## AIC: 392.61
##
## Number of Fisher Scoring iterations: 4
exp(cbind(OR = coef(model_n), confint(model_n)))
## Waiting for profiling to be done...
## OR 2.5 % 97.5 %
## (Intercept) 0.1409334 0.0813086 0.2289387
## age_c 1.0257950 0.9989889 1.0544585
## sexMale 2.2179301 1.2608576 4.0846687
# A likelihood ratio test
library(lmtest)
## Loading required package: zoo
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
lrtest(model_Ai, model_n)
## Likelihood ratio test
##
## Model 1: updrs2_bin_y2 ~ age_c * sex
## Model 2: updrs2_bin_y2 ~ age_c + sex
## #Df LogLik Df Chisq Pr(>Chisq)
## 1 4 -191.65
## 2 3 -193.31 -1 3.3195 0.06846 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
library(ggplot2)
# Create predicted probabilities from your model
ppmi_train_data$pred <- predict(model_Ai, type = "response")
# Plot
ggplot(ppmi_train_data, aes(x = age, y = pred, colour = sex)) +
geom_line() +
labs(
x = "Age (years)",
y = "Predicted probability of moderate/severe motor problems",
colour = "Sex"
) +
theme_minimal()
Next, we will focus on updrs1_bl which quantifies problems with non-motor skills at enrolment, such as sleep, negative mood, and cognition.
# spline variables
# Find median knot
knot <- median(ppmi_train_data$updrs1_bl, na.rm = TRUE)
# Create spline term
ppmi_train_data$updrs1_spline <- pmax(0, ppmi_train_data$updrs1_bl - knot)
# Fit logistic regression
model_spline <- glm(updrs2_bin_y2 ~ updrs1_bl + updrs1_spline,
family = binomial,
data = ppmi_train_data)
summary(model_spline)
##
## Call:
## glm(formula = updrs2_bin_y2 ~ updrs1_bl + updrs1_spline, family = binomial,
## data = ppmi_train_data)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -3.0647 0.5103 -6.005 1.91e-09 ***
## updrs1_bl 0.3744 0.1190 3.146 0.00166 **
## updrs1_spline -0.2935 0.1388 -2.114 0.03448 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 398.49 on 389 degrees of freedom
## Residual deviance: 371.19 on 387 degrees of freedom
## AIC: 377.19
##
## Number of Fisher Scoring iterations: 5
exp(cbind(OR = coef(model_spline), confint(model_spline)))
## Waiting for profiling to be done...
## OR 2.5 % 97.5 %
## (Intercept) 0.04666793 0.01549201 0.1165727
## updrs1_bl 1.45404682 1.16759173 1.8683273
## updrs1_spline 0.74566896 0.56019303 0.9686740
# Create numeric binary outcome for plotting
ppmi_train_data$updrs2_bin_y2_num <- ifelse(
ppmi_train_data$updrs2_bin_y2 == "Moderate/Severe", 1, 0
)
# Create sequence of values for smooth curve
newdata <- data.frame(
updrs1_bl = seq(min(ppmi_train_data$updrs1_bl, na.rm = TRUE),
max(ppmi_train_data$updrs1_bl, na.rm = TRUE),
length.out = 200)
)
# Add spline term using same knot
knot <- median(ppmi_train_data$updrs1_bl, na.rm = TRUE)
newdata$updrs1_spline <- pmax(0, newdata$updrs1_bl - knot)
# Predicted probabilities
newdata$pred <- predict(model_spline, newdata = newdata, type = "response")
# Plot raw data (jittered)
plot(ppmi_train_data$updrs1_bl,
jitter(ppmi_train_data$updrs2_bin_y2_num, amount = 0.05),
xlab = "UPDRS1 (baseline)",
ylab = "Probability of Moderate/Severe outcome",
pch = 16,
col = "grey70",
ylim = c(0, 1.05))
# Add fitted spline curve
lines(newdata$updrs1_bl, newdata$pred, col = "blue", lwd = 3)
# Add vertical line at knot (optional but good)
abline(v = knot, lty = 2, col = "red")
median(ppmi_train_data$updrs1_bl, na.rm = TRUE)
## [1] 5
The SCOPA-GI measure assesses gastrointestinal dysautonomia in patients with PD. It is part of the Scale for Outcomes in PD for Autonomic Symptoms (SCOPA-AUT). The specific measure used here assesses the impact of gastrointestinal dysautonomia on patients’ health and quality of life, particularly in relation to constipation and other gastrointestinal issues.
Determine the best-fitting fractional polynomial (FP) model for the relationship between SCOPA-GI and your binary dependent variable.
# Load the package
install.packages("mfp")
## Warning: package 'mfp' is in use and will not be installed
library(mfp)
library(survival)
model_fp2 <- mfp(updrs2_bin_y2 ~ fp(scopa_gi, df = 4, select = 0.05),
family = binomial,
data = ppmi_train_data,
verbose = TRUE)
##
## Variable Deviance Power(s)
## ------------------------------------------------
## Cycle 1
## scopa_gi
## 398.489
## 370.067 1
## 361.778 -0.5
## 357.332 1 2
##
##
## Tansformation
## shift scale
## scopa_gi 1 10
##
## Fractional polynomials
## df.initial select alpha df.final power1 power2
## scopa_gi 4 0.05 0.05 2 -0.5 .
##
##
## Transformations of covariates:
## formula
## scopa_gi I(((scopa_gi+1)/10)^-0.5)
##
##
## Deviance table:
## Resid. Dev
## Null model 398.4888
## Linear model 370.0673
## Final model 361.7777
summary(model_fp2)
##
## Call:
## glm(formula = updrs2_bin_y2 ~ I(((scopa_gi + 1)/10)^-0.5), family = binomial,
## data = ppmi_train_data)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.2025 0.4657 2.582 0.00981 **
## I(((scopa_gi + 1)/10)^-0.5) -1.3425 0.2554 -5.256 1.47e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 398.49 on 389 degrees of freedom
## Residual deviance: 361.78 on 388 degrees of freedom
## AIC: 365.78
##
## Number of Fisher Scoring iterations: 5
model_fp2$pvalues
## p.null p.lin p.FP power2 power4.1 power4.2
## scopa_gi 2.493678e-08 0.005244407 0.1082733 -0.5 1 2
# Create numeric binary outcome for plotting
ppmi_train_data$updrs2_bin_y2_num <- ifelse(
ppmi_train_data$updrs2_bin_y2 == "Moderate/Severe", 1, 0
)
# Create values for smooth prediction
newdata <- data.frame(
scopa_gi = seq(min(ppmi_train_data$scopa_gi, na.rm = TRUE),
max(ppmi_train_data$scopa_gi, na.rm = TRUE),
length.out = 200)
)
# Predicted probabilities from the fitted FP model
newdata$pred <- predict(model_fp2, newdata = newdata, type = "response")
# Plot raw data with jitter
plot(ppmi_train_data$scopa_gi,
jitter(ppmi_train_data$updrs2_bin_y2_num, amount = 0.05),
xlab = "SCOPA-GI",
ylab = "Predicted probability of UPDRS2 outcome",
main = "Fractional polynomial fit for SCOPA-GI",
pch = 16,
col = "grey70",
ylim = c(0, 1.05))
# Add fitted FP curve
lines(newdata$scopa_gi, newdata$pred, lwd = 3, col = "blue")
The Schwab and England ADL (Activities of Daily Living) scale is a method of assessing the capabilities of people with impaired mobility. A patient with a score of 100 (indicative of 100%) would be completely independent – able to do chores without slowness, difficulty or impairment. Lower scores indicate some lack of independence, with a score of zero corresponding to patient who is bedridden and helpless.
model_lin <- glm(updrs2_bin_y2 ~ mseadlg,
data = ppmi_train_data,
family = binomial (link = "logit"))
summary(model_lin)
##
## Call:
## glm(formula = updrs2_bin_y2 ~ mseadlg, family = binomial(link = "logit"),
## data = ppmi_train_data)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.3510 0.2707 -8.685 < 2e-16 ***
## mseadlg 0.6803 0.1422 4.785 1.71e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 398.49 on 389 degrees of freedom
## Residual deviance: 372.22 on 388 degrees of freedom
## AIC: 376.22
##
## Number of Fisher Scoring iterations: 4
exp(cbind(OR = coef(model_lin), confint(model_lin)))
## Waiting for profiling to be done...
## OR 2.5 % 97.5 %
## (Intercept) 0.09527644 0.05420649 0.1573279
## mseadlg 1.97442350 1.50724133 2.6373904
model_cat <- glm(updrs2_bin_y2 ~ as.factor(mseadlg),
data = ppmi_train_data,
family = binomial (link = "logit"))
summary(model_cat)
##
## Call:
## glm(formula = updrs2_bin_y2 ~ as.factor(mseadlg), family = binomial(link = "logit"),
## data = ppmi_train_data)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.5735 0.3458 -7.442 9.93e-14 ***
## as.factor(mseadlg)1 1.2742 0.4750 2.682 0.00731 **
## as.factor(mseadlg)2 1.5794 0.3857 4.095 4.22e-05 ***
## as.factor(mseadlg)3 2.1427 0.4965 4.316 1.59e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 398.49 on 389 degrees of freedom
## Residual deviance: 370.45 on 386 degrees of freedom
## AIC: 378.45
##
## Number of Fisher Scoring iterations: 5
exp(cbind(OR = coef(model_cat), confint(model_cat)))
## Waiting for profiling to be done...
## OR 2.5 % 97.5 %
## (Intercept) 0.07627119 0.03594315 0.1416949
## as.factor(mseadlg)1 3.57575758 1.41766850 9.3296586
## as.factor(mseadlg)2 4.85214348 2.38042553 10.9747574
## as.factor(mseadlg)3 8.52222222 3.26648383 23.2909180
AIC(model_lin, model_cat)
## df AIC
## model_lin 2 376.2167
## model_cat 4 378.4502
lrtest(model_lin, model_cat)
## Likelihood ratio test
##
## Model 1: updrs2_bin_y2 ~ mseadlg
## Model 2: updrs2_bin_y2 ~ as.factor(mseadlg)
## #Df LogLik Df Chisq Pr(>Chisq)
## 1 2 -186.11
## 2 4 -185.22 2 1.7664 0.4134
For the model development stage, please continue to use the dataset PPMI_pred_training_2026.dta. (i) Fit a multivariable logistic regression model for motor-problems incorporating eight predictors in the format described in Table 2 below. Report the summary output for this model (i.e., regression parameters on the log-odds scale, SEs, and p-values) (1 mark). (ii) Report and interpret on the AUROC for this model (1 mark).
model_final <- glm(
updrs2_bin_y2 ~ age_c * sex +
updrs1_bl + updrs1_spline +
I(((scopa_gi + 1)/10)^-0.5) +
mseadlg +
gds +
ess +
scopa_cv +
sdmtotal,
data = ppmi_train_data,
family = binomial
)
summary(model_final)
##
## Call:
## glm(formula = updrs2_bin_y2 ~ age_c * sex + updrs1_bl + updrs1_spline +
## I(((scopa_gi + 1)/10)^-0.5) + mseadlg + gds + ess + scopa_cv +
## sdmtotal, family = binomial, data = ppmi_train_data)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.98396 1.24653 -0.789 0.429901
## age_c -0.01913 0.03126 -0.612 0.540470
## sexMale 0.76375 0.35342 2.161 0.030694 *
## updrs1_bl 0.16521 0.13365 1.236 0.216406
## updrs1_spline -0.11475 0.15646 -0.733 0.463306
## I(((scopa_gi + 1)/10)^-0.5) -0.81824 0.30252 -2.705 0.006836 **
## mseadlg 0.47574 0.15963 2.980 0.002880 **
## gds -0.07135 0.06964 -1.025 0.305598
## ess 0.15663 0.04192 3.737 0.000187 ***
## scopa_cv 0.10393 0.17382 0.598 0.549881
## sdmtotal -0.18305 0.06742 -2.715 0.006625 **
## age_c:sexMale 0.01722 0.03587 0.480 0.631144
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 398.49 on 389 degrees of freedom
## Residual deviance: 303.82 on 378 degrees of freedom
## AIC: 327.82
##
## Number of Fisher Scoring iterations: 5
library(pROC)
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
prob <- predict(model_final, type = "response")
roc_obj <- roc(ppmi_train_data$updrs2_bin_y2, prob)
## Setting levels: control = No/Mild, case = Moderate/Severe
## Setting direction: controls < cases
auc(roc_obj)
## Area under the curve: 0.8249
plot(roc_obj, col = "blue", lwd = 2)
For the validation step you should use the second datafile called PPMI_pred_testing_2026.dta containing a sample of 534 more recent recruits to the PPMI study. (i) Produce, and interpret, a calibration plot for the multivariable logistic model you derived in Q1, using this second dataset (1 mark).
ppmi_test <- read.dta13("PPMI_pred_testing_2026.dta")
str(ppmi_test)
## 'data.frame': 534 obs. of 15 variables:
## $ patno : int 100005 100012 100018 100268 100738 100842 100878 100889 100891 100898 ...
## $ age : num 52 66 69 72 71 63 65 74 62 75 ...
## $ sex : int 1 0 0 0 0 1 1 0 1 1 ...
## $ scopa_gi : int 1 3 2 6 0 2 1 0 3 3 ...
## $ scopa_cv : int 1 1 0 0 0 0 2 0 1 0 ...
## $ scopa_therm : int 0 2 0 1 1 3 0 0 1 1 ...
## $ ess : int 2 2 1 4 4 9 2 3 9 5 ...
## $ lns : int 9 9 9 11 10 13 12 11 15 12 ...
## $ rem : int 2 5 2 4 5 3 2 1 11 4 ...
## $ mseadlg : int 0 1 1 2 0 2 0 0 1 0 ...
## $ sdmtotal : int 10 12 9 10 11 10 12 12 8 7 ...
## $ stai : int 12 12 22 11 15 24 16 10 15 10 ...
## $ updrs1_bl : int 6 7 12 4 9 8 3 0 8 4 ...
## $ gds : int 0 4 5 0 2 5 0 0 1 0 ...
## $ updrs2_bin_y2: int 0 0 1 1 0 0 0 0 0 0 ...
## - attr(*, "datalabel")= chr ""
## - attr(*, "time.stamp")= chr "29 Jan 2026 13:54"
## - attr(*, "formats")= chr [1:15] "%10.0g" "%10.0g" "%10.0g" "%10.0g" ...
## - attr(*, "types")= int [1:15] 65528 65526 65530 65530 65530 65530 65530 65530 65530 65530 ...
## - attr(*, "val.labels")= Named chr [1:15] "" "" "sexlab" "" ...
## ..- attr(*, "names")= chr [1:15] "" "" "sexlab" "" ...
## - attr(*, "var.labels")= chr [1:15] "Patient ID" "Age at enrolment" "Patient sex" "SCOPA-AUT Gastrointestinal (GI) Score" ...
## - attr(*, "version")= int 117
## - attr(*, "label.table")= list()
## - attr(*, "expansion.fields")=List of 12
## ..$ : chr [1:3] "_dta" "ReS_i" "patno"
## ..$ : chr [1:3] "_dta" "ReS_ver" "v.2"
## ..$ : chr [1:3] "_dta" "ReS_j" "year"
## ..$ : chr [1:3] "_dta" "ReS_str" "1"
## ..$ : chr [1:3] "_dta" "ReS_Xij" "updrs1_bin updrs2_bin"
## ..$ : chr [1:3] "_dta" "ReS_Xij_wide1" "updrs1_bin_y0 updrs1_bin_y1 updrs1_bin_y2 updrs1_bin_y3 updrs1_bin_y4 updrs1_bin_y5"
## ..$ : chr [1:3] "_dta" "ReS_Xij_long1" "updrs1_bin"
## ..$ : chr [1:3] "_dta" "ReS_Xij_wide2" "updrs2_bin_y0 updrs2_bin_y1 updrs2_bin_y2 updrs2_bin_y3 updrs2_bin_y4 updrs2_bin_y5"
## ..$ : chr [1:3] "_dta" "ReS_Xij_long2" "updrs2_bin"
## ..$ : chr [1:3] "_dta" "ReS_Xij_n" "2"
## ..$ : chr [1:3] "patno" "destring" "Characters removed were:"
## ..$ : chr [1:3] "patno" "destring_cmd" "destring patno, replace"
## - attr(*, "byteorder")= chr "LSF"
## - attr(*, "orig.dim")= int [1:2] 534 15
## - attr(*, "data.label")= chr(0)
# recode
ppmi_test$sex <- factor(ppmi_test$sex,
levels = c(0,1),
labels = c("Female","Male"))
ppmi_test$updrs2_bin_y2 <- factor(
ppmi_test$updrs2_bin_y2,
levels = c(0,1),
labels = c("No/Mild","Moderate/Severe")
)
# summary
hist(ppmi_test$age)
prop.table(table(ppmi_test$sex))*100
##
## Female Male
## 33.70787 66.29213
prop.table(table(ppmi_test$updrs2_bin_y2))*100
##
## No/Mild Moderate/Severe
## 78.83895 21.16105
# Centre age at 60
ppmi_test$age_c <- ppmi_test$age - 60
hist(ppmi_test$age_c)
ppmi_test$updrs1_spline <- pmax(0, ppmi_test$updrs1_bl - 5)
ppmi_test$prob <- predict(model_final, newdata = ppmi_test, type = "response")
ppmi_test$y_num <- ifelse(ppmi_test$updrs2_bin_y2 == "Moderate/Severe", 1, 0)
ppmi_test <- ppmi_test %>%
mutate(decile = ntile(prob, 10))
calib <- ppmi_test %>%
group_by(decile) %>%
summarise(
mean_pred = mean(prob, na.rm = TRUE),
obs = mean(y_num, na.rm = TRUE)
)
ggplot(calib, aes(x = mean_pred, y = obs)) +
geom_point(size = 2) +
geom_line() +
geom_abline(intercept = 0, slope = 1, linetype = "dashed") +
labs(
x = "Mean predicted probability",
y = "Observed proportion of moderate/severe motor problems"
) +
theme_minimal()
roc_obj <- roc(ppmi_test$y_num, ppmi_test$prob)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
auc(roc_obj)
## Area under the curve: 0.7828
plot(roc_obj)
ppmi_test$lp <- log(ppmi_test$prob / (1 - ppmi_test$prob))
model_slope <- glm(y_num ~ lp, data = ppmi_test, family = binomial)
summary(model_slope)
##
## Call:
## glm(formula = y_num ~ lp, family = binomial, data = ppmi_test)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.13345 0.16159 -0.826 0.409
## lp 0.78613 0.09239 8.509 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 551.18 on 533 degrees of freedom
## Residual deviance: 449.63 on 532 degrees of freedom
## AIC: 453.63
##
## Number of Fisher Scoring iterations: 5
coef(model_slope)["lp"]
## lp
## 0.7861271
model_citl <- glm(y_num ~ 1,
offset = lp,
data = ppmi_test,
family = binomial)
summary(model_citl)
##
## Call:
## glm(formula = y_num ~ 1, family = binomial, data = ppmi_test,
## offset = lp)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.1110 0.1241 0.894 0.371
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 454.5 on 533 degrees of freedom
## Residual deviance: 454.5 on 533 degrees of freedom
## AIC: 456.5
##
## Number of Fisher Scoring iterations: 4
coef(model_citl)
## (Intercept)
## 0.1109521
ppmi_test$pred_class <- ifelse(ppmi_test$prob > 0.5, 1, 0)
table(Predicted = ppmi_test$pred_class,
Actual = ppmi_test$y_num)
## Actual
## Predicted 0 1
## 0 405 76
## 1 16 37
# Sensitivity = TP / (TP + FN)
sensitivity <- sum(ppmi_test$pred_class == 1 & ppmi_test$y_num == 1) /
sum(ppmi_test$y_num == 1)
# Specificity = TN / (TN + FP)
specificity <- sum(ppmi_test$pred_class == 0 & ppmi_test$y_num == 0) /
sum(ppmi_test$y_num == 0)
sensitivity
## [1] 0.3274336
specificity
## [1] 0.9619952