library(readr)
## Warning: package 'readr' was built under R version 4.5.2
library(dplyr)
## Warning: package 'dplyr' was built under R version 4.5.2
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.5.2
library(lubridate)
## Warning: package 'lubridate' was built under R version 4.5.2
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
demo <- read_csv("Demographics_08Apr2026.csv", show_col_types = FALSE)
updrs <- read_csv("MDS-UPDRS_Part_III_08Apr2026.csv", show_col_types = FALSE)
cohort <- read_csv("Subject_Cohort_History_08Apr2026.csv", show_col_types = FALSE)
colnames(updrs)
## [1] "REC_ID" "PATNO" "EVENT_ID" "PAG_NAME" "INFODT"
## [6] "PDTRTMNT" "PDSTATE" "HRPOSTMED" "HRDBSON" "HRDBSOFF"
## [11] "PDMEDYN" "DBSYN" "ONOFFORDER" "OFFEXAM" "OFFNORSN"
## [16] "DBSOFFYN" "DBSOFFTM" "ONEXAM" "ONNORSN" "HIFUYN"
## [21] "DBSONYN" "DBSONTM" "PDMEDDT" "PDMEDTM" "EXAMDT"
## [26] "EXAMTM" "NP3SPCH" "NP3FACXP" "NP3RIGN" "NP3RIGRU"
## [31] "NP3RIGLU" "NP3RIGRL" "NP3RIGLL" "NP3FTAPR" "NP3FTAPL"
## [36] "NP3HMOVR" "NP3HMOVL" "NP3PRSPR" "NP3PRSPL" "NP3TTAPR"
## [41] "NP3TTAPL" "NP3LGAGR" "NP3LGAGL" "NP3RISNG" "NP3GAIT"
## [46] "NP3FRZGT" "NP3PSTBL" "NP3POSTR" "NP3BRADY" "NP3PTRMR"
## [51] "NP3PTRML" "NP3KTRMR" "NP3KTRML" "NP3RTARU" "NP3RTALU"
## [56] "NP3RTARL" "NP3RTALL" "NP3RTALJ" "NP3RTCON" "NP3TOT"
## [61] "DYSKPRES" "DYSKIRAT" "NHY" "ORIG_ENTRY" "LAST_UPDATE"
colnames(cohort)
## [1] "PATNO" "APPRDX" "COHORT"
colnames(demo)
## [1] "REC_ID" "PATNO" "EVENT_ID" "PAG_NAME" "INFODT"
## [6] "AFICBERB" "ASHKJEW" "BASQUE" "BIRTHDT" "SEX"
## [11] "CHLDBEAR" "HOWLIVE" "GAYLES" "HETERO" "BISEXUAL"
## [16] "PANSEXUAL" "ASEXUAL" "OTHSEXUALITY" "HANDED" "HISPLAT"
## [21] "RAASIAN" "RABLACK" "RAHAWOPI" "RAINDALS" "RANOS"
## [26] "RAWHITE" "RAUNKNOWN" "ORIG_ENTRY" "LAST_UPDATE"
demo2 <- demo %>%
select(PATNO, BIRTHDT, INFODT, SEX) %>%
mutate(
BIRTHDT_date = dmy(paste0("01/", BIRTHDT)),
INFODT_date = dmy(paste0("01/", INFODT)),
AGE = as.numeric(interval(BIRTHDT_date, INFODT_date) / years(1)),
sex_label = case_when(
SEX == 1 ~ "Male",
SEX == 0 ~ "Female",
TRUE ~ NA_character_
)
) %>%
group_by(PATNO) %>%
slice(1) %>%
ungroup() %>%
select(PATNO, AGE, sex_label)
## Warning: There were 2 warnings in `mutate()`.
## The first warning was:
## ℹ In argument: `BIRTHDT_date = dmy(paste0("01/", BIRTHDT))`.
## Caused by warning:
## ! 8 failed to parse.
## ℹ Run `dplyr::last_dplyr_warnings()` to see the 1 remaining warning.
summary(demo2$AGE)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 20.41 61.58 66.41 65.87 71.25 122.67 9
table(demo2$sex_label, useNA = "ifany")
##
## Female Male <NA>
## 4308 4027 2
table(cohort$COHORT, cohort$APPRDX)
##
## 1 2 3 4 5 6
## 1 246 0 0 0 199 0
## 2 0 111 0 0 0 0
## 3 0 0 3 0 0 0
## 4 0 0 0 36 0 353
updrs_cohort <- updrs %>%
filter(EVENT_ID == "BL") %>%
select(PATNO, NP3TOT) %>%
filter(!is.na(NP3TOT)) %>%
group_by(PATNO) %>%
slice(1) %>%
ungroup() %>%
inner_join(cohort, by = "PATNO")
updrs_cohort %>%
group_by(COHORT) %>%
summarise(
n = n(),
mean_NP3TOT = mean(NP3TOT, na.rm = TRUE),
median_NP3TOT = median(NP3TOT, na.rm = TRUE)
)
## # A tibble: 4 Ă— 4
## COHORT n mean_NP3TOT median_NP3TOT
## <dbl> <int> <dbl> <dbl>
## 1 1 444 20.7 19
## 2 2 111 1.15 0
## 3 3 3 15.7 17
## 4 4 382 2.60 1
A total of 556 subjects were included in the analysis, comprising approximately 445 PD patients and 111 healthy controls.
# 3. Cohort: keep only PD and Control
cohort2 <- cohort %>%
mutate(
group = case_when(
COHORT == 1 ~ "PD",
COHORT == 2 ~ "Control",
TRUE ~ NA_character_
)
) %>%
filter(!is.na(group)) %>%
select(PATNO, group)
table(cohort2$group)
##
## Control PD
## 111 445
# 4. Baseline UPDRS Part III
updrs_bl <- updrs %>%
filter(EVENT_ID == "BL") %>%
select(PATNO, EVENT_ID, NP3TOT, NHY, PDSTATE, PDMEDYN) %>%
filter(!is.na(NP3TOT)) %>%
arrange(PATNO) %>%
group_by(PATNO) %>%
slice(1) %>%
ungroup()
summary(updrs_bl$NP3TOT)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00 2.00 6.00 10.53 17.00 78.00
table(updrs_bl$PDSTATE, useNA = "ifany")
##
## OFF ON <NA>
## 199 195 4348
unique(cohort$COHORT)
## [1] 2 1 3 4
unique(cohort$APPRDX)
## [1] 2 1 3 4 6 5
table(cohort$COHORT, useNA = "ifany")
##
## 1 2 3 4
## 445 111 3 389
table(cohort$APPRDX, useNA = "ifany")
##
## 1 2 3 4 5 6
## 246 111 3 36 199 353
head(demo[, c("BIRTHDT", "INFODT")], 10)
## # A tibble: 10 Ă— 2
## BIRTHDT INFODT
## <chr> <chr>
## 1 12/1941 01/2011
## 2 01/1946 02/2011
## 3 08/1943 03/2011
## 4 07/1954 03/2011
## 5 11/1951 03/2011
## 6 11/1947 03/2011
## 7 10/1953 03/2011
## 8 11/1946 04/2011
## 9 07/1929 05/2011
## 10 10/1927 05/2011
# 5. Merge
data_final <- cohort2 %>%
inner_join(demo2, by = "PATNO") %>%
inner_join(updrs_bl, by = "PATNO")
dim(data_final)
## [1] 555 9
table(data_final$group)
##
## Control PD
## 111 444
summary(data_final$AGE)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 29.75 54.29 61.25 60.53 67.75 84.75
summary(data_final$NP3TOT)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00 7.00 16.00 16.82 24.00 62.00
head(data_final)
## # A tibble: 6 Ă— 9
## PATNO group AGE sex_label EVENT_ID NP3TOT NHY PDSTATE PDMEDYN
## <dbl> <chr> <dbl> <chr> <chr> <dbl> <dbl> <chr> <dbl>
## 1 3000 Control 69.1 Female BL 4 0 <NA> NA
## 2 3001 PD 65.1 Male BL 12 1 <NA> 0
## 3 3002 PD 67.6 Female BL 17 2 <NA> 0
## 4 3003 PD 56.7 Female BL 29 2 <NA> 0
## 5 3004 Control 59.3 Male BL 2 0 <NA> NA
## 6 3008 Control 81.8 Female BL 10 0 <NA> NA
The baseline MDS-UPDRS Part III score (NP3TOT) shows a clear separation between PD and control groups. The PD group exhibits substantially higher motor impairment scores, with a median around 19, whereas the control group remains close to zero. This indicates that motor symptoms strongly distinguish PD patients from healthy controls.
# 6. Boxplot
ggplot(data_final, aes(x = group, y = NP3TOT, fill = group)) +
geom_boxplot(alpha = 0.8) +
theme_minimal() +
labs(
title = "Baseline MDS-UPDRS Part III Score by Group",
x = "Group",
y = "NP3TOT"
)
PCA based on age and motor score (NP3TOT) shows partial separation between PD and control subjects. While there is some clustering tendency, the two groups still overlap, indicating that these features alone do not fully separate the populations in a linear space.
# 7. PCA
pca_input <- data_final %>%
select(AGE, NP3TOT) %>%
na.omit()
pca <- prcomp(pca_input, scale. = TRUE)
pca_df <- as.data.frame(pca$x)
pca_df$group <- data_final %>%
select(AGE, NP3TOT, group) %>%
na.omit() %>%
pull(group)
ggplot(pca_df, aes(x = PC1, y = PC2, color = group)) +
geom_point(size = 2.5, alpha = 0.8) +
theme_minimal() +
labs(title = "PCA: PD vs Control")
Logistic regression analysis shows that NP3TOT is a highly significant predictor of PD status (p < 0.001). Age is also statistically significant (p < 0.05), while sex does not show a significant effect. This suggests that motor symptom severity is the primary driver of classification between PD and control groups.
# 8. Logistic regression
data_model <- data_final %>%
mutate(group_binary = ifelse(group == "PD", 1, 0)) %>%
select(group_binary, AGE, NP3TOT, sex_label) %>%
na.omit()
model <- glm(group_binary ~ AGE + NP3TOT + sex_label,
data = data_model,
family = "binomial")
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
summary(model)
##
## Call:
## glm(formula = group_binary ~ AGE + NP3TOT + sex_label, family = "binomial",
## data = data_model)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.46087 1.94445 -0.237 0.8126
## AGE -0.07726 0.03475 -2.224 0.0262 *
## NP3TOT 1.06803 0.16363 6.527 6.71e-11 ***
## sex_labelMale 0.21360 0.70100 0.305 0.7606
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 555.447 on 554 degrees of freedom
## Residual deviance: 60.983 on 551 degrees of freedom
## AIC: 68.983
##
## Number of Fisher Scoring iterations: 10
The classification model achieved an accuracy of approximately 98.4%, indicating excellent predictive performance. The confusion matrix shows very few misclassifications between PD and control subjects.
# 9. Predicted probabilities
data_model$pred_prob <- predict(model, type = "response")
data_model$pred_class <- ifelse(data_model$pred_prob >= 0.5, 1, 0)
table(Predicted = data_model$pred_class, Actual = data_model$group_binary)
## Actual
## Predicted 0 1
## 0 107 5
## 1 4 439
mean(data_model$pred_class == data_model$group_binary)
## [1] 0.9837838
The predicted probability distribution further confirms strong model separation, with PD subjects concentrated near probability 1 and controls near probability 0.
# 10. Probability plot
ggplot(data_model, aes(x = pred_prob, fill = factor(group_binary))) +
geom_histogram(bins = 25, alpha = 0.7, position = "identity") +
theme_minimal() +
labs(
title = "Predicted Probability of PD",
x = "Predicted probability",
fill = "True group"
)
In this analysis, I integrated demographic and clinical data from the PPMI dataset to distinguish Parkinson’s disease (PD) patients from healthy controls.
The results demonstrate that the MDS-UPDRS Part III motor score (NP3TOT) is a highly effective discriminator, showing substantial differences between the two groups.
While PCA reveals partial separation, logistic regression confirms that NP3TOT is the most significant predictor of PD status.
The model achieved high classification accuracy (~98%), primarily driven by the strong signal in motor symptoms.
Compared with transcriptomic datasets, clinical data provides clearer and more robust separation between PD and control subjects, making it more suitable for classification tasks in this context.
The comparison between blood transcriptomic data and clinical data highlights a key insight: not all data modalities carry equally strong disease signals.
While blood gene expression showed limited separation between PD and control subjects, clinical motor scores (NP3TOT) provided a clear and robust distinction.
This suggests that clinical features directly related to disease symptoms are more informative for classification than peripheral molecular measurements in this context.