EDA_PPMI

library(readr)

## Warning: package 'readr' was built under R version 4.5.2

library(dplyr)

## Warning: package 'dplyr' was built under R version 4.5.2

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(ggplot2)

## Warning: package 'ggplot2' was built under R version 4.5.2

library(lubridate)

## Warning: package 'lubridate' was built under R version 4.5.2

## 
## Attaching package: 'lubridate'

## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union

demo <- read_csv("Demographics_08Apr2026.csv", show_col_types = FALSE)
updrs <- read_csv("MDS-UPDRS_Part_III_08Apr2026.csv", show_col_types = FALSE)
cohort <- read_csv("Subject_Cohort_History_08Apr2026.csv", show_col_types = FALSE)
colnames(updrs)

##  [1] "REC_ID"      "PATNO"       "EVENT_ID"    "PAG_NAME"    "INFODT"     
##  [6] "PDTRTMNT"    "PDSTATE"     "HRPOSTMED"   "HRDBSON"     "HRDBSOFF"   
## [11] "PDMEDYN"     "DBSYN"       "ONOFFORDER"  "OFFEXAM"     "OFFNORSN"   
## [16] "DBSOFFYN"    "DBSOFFTM"    "ONEXAM"      "ONNORSN"     "HIFUYN"     
## [21] "DBSONYN"     "DBSONTM"     "PDMEDDT"     "PDMEDTM"     "EXAMDT"     
## [26] "EXAMTM"      "NP3SPCH"     "NP3FACXP"    "NP3RIGN"     "NP3RIGRU"   
## [31] "NP3RIGLU"    "NP3RIGRL"    "NP3RIGLL"    "NP3FTAPR"    "NP3FTAPL"   
## [36] "NP3HMOVR"    "NP3HMOVL"    "NP3PRSPR"    "NP3PRSPL"    "NP3TTAPR"   
## [41] "NP3TTAPL"    "NP3LGAGR"    "NP3LGAGL"    "NP3RISNG"    "NP3GAIT"    
## [46] "NP3FRZGT"    "NP3PSTBL"    "NP3POSTR"    "NP3BRADY"    "NP3PTRMR"   
## [51] "NP3PTRML"    "NP3KTRMR"    "NP3KTRML"    "NP3RTARU"    "NP3RTALU"   
## [56] "NP3RTARL"    "NP3RTALL"    "NP3RTALJ"    "NP3RTCON"    "NP3TOT"     
## [61] "DYSKPRES"    "DYSKIRAT"    "NHY"         "ORIG_ENTRY"  "LAST_UPDATE"

colnames(cohort)

## [1] "PATNO"  "APPRDX" "COHORT"

colnames(demo)

##  [1] "REC_ID"       "PATNO"        "EVENT_ID"     "PAG_NAME"     "INFODT"      
##  [6] "AFICBERB"     "ASHKJEW"      "BASQUE"       "BIRTHDT"      "SEX"         
## [11] "CHLDBEAR"     "HOWLIVE"      "GAYLES"       "HETERO"       "BISEXUAL"    
## [16] "PANSEXUAL"    "ASEXUAL"      "OTHSEXUALITY" "HANDED"       "HISPLAT"     
## [21] "RAASIAN"      "RABLACK"      "RAHAWOPI"     "RAINDALS"     "RANOS"       
## [26] "RAWHITE"      "RAUNKNOWN"    "ORIG_ENTRY"   "LAST_UPDATE"

demo2 <- demo %>%
  select(PATNO, BIRTHDT, INFODT, SEX) %>%
  mutate(
    BIRTHDT_date = dmy(paste0("01/", BIRTHDT)),
    INFODT_date  = dmy(paste0("01/", INFODT)),
    AGE = as.numeric(interval(BIRTHDT_date, INFODT_date) / years(1)),
    sex_label = case_when(
      SEX == 1 ~ "Male",
      SEX == 0 ~ "Female",
      TRUE ~ NA_character_
    )
  ) %>%
  group_by(PATNO) %>%
  slice(1) %>%
  ungroup() %>%
  select(PATNO, AGE, sex_label)

## Warning: There were 2 warnings in `mutate()`.
## The first warning was:
## ℹ In argument: `BIRTHDT_date = dmy(paste0("01/", BIRTHDT))`.
## Caused by warning:
## !  8 failed to parse.
## ℹ Run `dplyr::last_dplyr_warnings()` to see the 1 remaining warning.

summary(demo2$AGE)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   20.41   61.58   66.41   65.87   71.25  122.67       9

table(demo2$sex_label, useNA = "ifany")

## 
## Female   Male   <NA> 
##   4308   4027      2

table(cohort$COHORT, cohort$APPRDX)

##    
##       1   2   3   4   5   6
##   1 246   0   0   0 199   0
##   2   0 111   0   0   0   0
##   3   0   0   3   0   0   0
##   4   0   0   0  36   0 353

updrs_cohort <- updrs %>%
  filter(EVENT_ID == "BL") %>%
  select(PATNO, NP3TOT) %>%
  filter(!is.na(NP3TOT)) %>%
  group_by(PATNO) %>%
  slice(1) %>%
  ungroup() %>%
  inner_join(cohort, by = "PATNO")

updrs_cohort %>%
  group_by(COHORT) %>%
  summarise(
    n = n(),
    mean_NP3TOT = mean(NP3TOT, na.rm = TRUE),
    median_NP3TOT = median(NP3TOT, na.rm = TRUE)
  )

## # A tibble: 4 × 4
##   COHORT     n mean_NP3TOT median_NP3TOT
##    <dbl> <int>       <dbl>         <dbl>
## 1      1   444       20.7             19
## 2      2   111        1.15             0
## 3      3     3       15.7             17
## 4      4   382        2.60             1

A total of 556 subjects were included in the analysis, comprising approximately 445 PD patients and 111 healthy controls.

# 3. Cohort: keep only PD and Control
cohort2 <- cohort %>%
  mutate(
    group = case_when(
      COHORT == 1 ~ "PD",
      COHORT == 2 ~ "Control",
      TRUE ~ NA_character_
    )
  ) %>%
  filter(!is.na(group)) %>%
  select(PATNO, group)

table(cohort2$group)

## 
## Control      PD 
##     111     445

# 4. Baseline UPDRS Part III
updrs_bl <- updrs %>%
  filter(EVENT_ID == "BL") %>%
  select(PATNO, EVENT_ID, NP3TOT, NHY, PDSTATE, PDMEDYN) %>%
  filter(!is.na(NP3TOT)) %>%
  arrange(PATNO) %>%
  group_by(PATNO) %>%
  slice(1) %>%
  ungroup()
summary(updrs_bl$NP3TOT)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    0.00    2.00    6.00   10.53   17.00   78.00

table(updrs_bl$PDSTATE, useNA = "ifany")

## 
##  OFF   ON <NA> 
##  199  195 4348

unique(cohort$COHORT)

## [1] 2 1 3 4

unique(cohort$APPRDX)

## [1] 2 1 3 4 6 5

table(cohort$COHORT, useNA = "ifany")

## 
##   1   2   3   4 
## 445 111   3 389

table(cohort$APPRDX, useNA = "ifany")

## 
##   1   2   3   4   5   6 
## 246 111   3  36 199 353

head(demo[, c("BIRTHDT", "INFODT")], 10)

## # A tibble: 10 × 2
##    BIRTHDT INFODT 
##    <chr>   <chr>  
##  1 12/1941 01/2011
##  2 01/1946 02/2011
##  3 08/1943 03/2011
##  4 07/1954 03/2011
##  5 11/1951 03/2011
##  6 11/1947 03/2011
##  7 10/1953 03/2011
##  8 11/1946 04/2011
##  9 07/1929 05/2011
## 10 10/1927 05/2011

# 5. Merge
data_final <- cohort2 %>%
  inner_join(demo2, by = "PATNO") %>%
  inner_join(updrs_bl, by = "PATNO")

dim(data_final)

## [1] 555   9

table(data_final$group)

## 
## Control      PD 
##     111     444

summary(data_final$AGE)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   29.75   54.29   61.25   60.53   67.75   84.75

summary(data_final$NP3TOT)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    0.00    7.00   16.00   16.82   24.00   62.00

head(data_final)

## # A tibble: 6 × 9
##   PATNO group     AGE sex_label EVENT_ID NP3TOT   NHY PDSTATE PDMEDYN
##   <dbl> <chr>   <dbl> <chr>     <chr>     <dbl> <dbl> <chr>     <dbl>
## 1  3000 Control  69.1 Female    BL            4     0 <NA>         NA
## 2  3001 PD       65.1 Male      BL           12     1 <NA>          0
## 3  3002 PD       67.6 Female    BL           17     2 <NA>          0
## 4  3003 PD       56.7 Female    BL           29     2 <NA>          0
## 5  3004 Control  59.3 Male      BL            2     0 <NA>         NA
## 6  3008 Control  81.8 Female    BL           10     0 <NA>         NA

The baseline MDS-UPDRS Part III score (NP3TOT) shows a clear separation between PD and control groups. The PD group exhibits substantially higher motor impairment scores, with a median around 19, whereas the control group remains close to zero. This indicates that motor symptoms strongly distinguish PD patients from healthy controls.

# 6. Boxplot
ggplot(data_final, aes(x = group, y = NP3TOT, fill = group)) +
  geom_boxplot(alpha = 0.8) +
  theme_minimal() +
  labs(
    title = "Baseline MDS-UPDRS Part III Score by Group",
    x = "Group",
    y = "NP3TOT"
  )

PCA based on age and motor score (NP3TOT) shows partial separation between PD and control subjects. While there is some clustering tendency, the two groups still overlap, indicating that these features alone do not fully separate the populations in a linear space.

# 7. PCA
pca_input <- data_final %>%
  select(AGE, NP3TOT) %>%
  na.omit()

pca <- prcomp(pca_input, scale. = TRUE)

pca_df <- as.data.frame(pca$x)
pca_df$group <- data_final %>%
  select(AGE, NP3TOT, group) %>%
  na.omit() %>%
  pull(group)
ggplot(pca_df, aes(x = PC1, y = PC2, color = group)) +
  geom_point(size = 2.5, alpha = 0.8) +
  theme_minimal() +
  labs(title = "PCA: PD vs Control")

Logistic regression analysis shows that NP3TOT is a highly significant predictor of PD status (p < 0.001). Age is also statistically significant (p < 0.05), while sex does not show a significant effect. This suggests that motor symptom severity is the primary driver of classification between PD and control groups.

# 8. Logistic regression
data_model <- data_final %>%
  mutate(group_binary = ifelse(group == "PD", 1, 0)) %>%
  select(group_binary, AGE, NP3TOT, sex_label) %>%
  na.omit()

model <- glm(group_binary ~ AGE + NP3TOT + sex_label,
             data = data_model,
             family = "binomial")

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

summary(model)

## 
## Call:
## glm(formula = group_binary ~ AGE + NP3TOT + sex_label, family = "binomial", 
##     data = data_model)
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept)   -0.46087    1.94445  -0.237   0.8126    
## AGE           -0.07726    0.03475  -2.224   0.0262 *  
## NP3TOT         1.06803    0.16363   6.527 6.71e-11 ***
## sex_labelMale  0.21360    0.70100   0.305   0.7606    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 555.447  on 554  degrees of freedom
## Residual deviance:  60.983  on 551  degrees of freedom
## AIC: 68.983
## 
## Number of Fisher Scoring iterations: 10

The classification model achieved an accuracy of approximately 98.4%, indicating excellent predictive performance. The confusion matrix shows very few misclassifications between PD and control subjects.

# 9. Predicted probabilities
data_model$pred_prob <- predict(model, type = "response")
data_model$pred_class <- ifelse(data_model$pred_prob >= 0.5, 1, 0)

table(Predicted = data_model$pred_class, Actual = data_model$group_binary)

##          Actual
## Predicted   0   1
##         0 107   5
##         1   4 439

mean(data_model$pred_class == data_model$group_binary)

## [1] 0.9837838

The predicted probability distribution further confirms strong model separation, with PD subjects concentrated near probability 1 and controls near probability 0.

# 10. Probability plot
ggplot(data_model, aes(x = pred_prob, fill = factor(group_binary))) +
  geom_histogram(bins = 25, alpha = 0.7, position = "identity") +
  theme_minimal() +
  labs(
    title = "Predicted Probability of PD",
    x = "Predicted probability",
    fill = "True group"
  )

In this analysis, I integrated demographic and clinical data from the PPMI dataset to distinguish Parkinson’s disease (PD) patients from healthy controls.

The results demonstrate that the MDS-UPDRS Part III motor score (NP3TOT) is a highly effective discriminator, showing substantial differences between the two groups.

While PCA reveals partial separation, logistic regression confirms that NP3TOT is the most significant predictor of PD status.

The model achieved high classification accuracy (~98%), primarily driven by the strong signal in motor symptoms.

Compared with transcriptomic datasets, clinical data provides clearer and more robust separation between PD and control subjects, making it more suitable for classification tasks in this context.

The comparison between blood transcriptomic data and clinical data highlights a key insight: not all data modalities carry equally strong disease signals.

While blood gene expression showed limited separation between PD and control subjects, clinical motor scores (NP3TOT) provided a clear and robust distinction.

This suggests that clinical features directly related to disease symptoms are more informative for classification than peripheral molecular measurements in this context.

EDA_PPMI

2026-04-08