# 0. Load packages
library(data.table)

## Warning: package 'data.table' was built under R version 4.5.2

library(limma)

## Warning: package 'limma' was built under R version 4.5.2

library(ggplot2)

## Warning: package 'ggplot2' was built under R version 4.5.2

library(pheatmap)

Data preprocessing

Data preprocessing: load expression matrix and sample metadata, then filter to include only Parkinson’s disease and healthy control samples.

# 1. Function to read GEO series matrix
read_geo_series_matrix <- function(file) {
  lines <- readLines(gzfile(file), warn = FALSE)

  # expression matrix location
  begin_idx <- grep("^!series_matrix_table_begin", lines)
  end_idx   <- grep("^!series_matrix_table_end", lines)

  # sample metadata lines
  meta_lines <- lines[grepl("^!Sample_", lines)]

  parse_meta_line <- function(x) {
    parts <- strsplit(x, "\t")[[1]]
    key <- sub("^!Sample_", "", parts[1])
    vals <- gsub('^"|"$', "", parts[-1])
    data.frame(field = key, t(vals), check.names = FALSE)
  }

  meta_list <- lapply(meta_lines, parse_meta_line)
  meta_df <- data.table::rbindlist(meta_list, fill = TRUE)

  # metadata wide format
  if (nrow(meta_df) > 0) {
    sample_ids <- unlist(
      meta_df[meta_df$field == "geo_accession", -1, with = FALSE],
      use.names = FALSE
    )
    meta_wide <- as.data.frame(
      t(meta_df[, -1, with = FALSE]),
      stringsAsFactors = FALSE
    )
    colnames(meta_wide) <- meta_df$field
    rownames(meta_wide) <- sample_ids
  } else {
    meta_wide <- NULL
  }

  # expression matrix
  expr <- NULL
  if (length(begin_idx) > 0 && length(end_idx) > 0 && (end_idx - begin_idx) > 1) {
    table_lines <- lines[(begin_idx + 1):(end_idx - 1)]
    tf <- tempfile(fileext = ".txt")
    writeLines(table_lines, tf)

    expr <- read.delim(tf, check.names = FALSE, stringsAsFactors = FALSE)
    unlink(tf)

    rownames(expr) <- expr[, 1]
    expr <- expr[, -1, drop = FALSE]
    expr <- as.matrix(expr)
    mode(expr) <- "numeric"
  }

  list(expr = expr, meta = meta_wide)
}

# 2. Read GSE6613
gse6613 <- read_geo_series_matrix("GSE6613_series_matrix.txt.gz")

expr <- gse6613$expr
meta <- gse6613$meta

# check dimensions
dim(expr)

## [1] 22283   105

dim(meta)

## [1] 105  29

# check groups
table(meta$characteristics_ch1)

## 
##              healthy control neurological disease control 
##                           22                           33 
##          Parkinson's disease 
##                           50

dim(gse6613$expr)

## [1] 22283   105

dim(gse6613$meta)

## [1] 105  29

colnames(gse6613$meta)

##  [1] "title"                   "geo_accession"          
##  [3] "status"                  "submission_date"        
##  [5] "last_update_date"        "type"                   
##  [7] "channel_count"           "source_name_ch1"        
##  [9] "organism_ch1"            "characteristics_ch1"    
## [11] "molecule_ch1"            "extract_protocol_ch1"   
## [13] "label_ch1"               "label_protocol_ch1"     
## [15] "taxid_ch1"               "hyb_protocol"           
## [17] "scan_protocol"           "description"            
## [19] "data_processing"         "platform_id"            
## [21] "contact_name"            "contact_department"     
## [23] "contact_institute"       "contact_address"        
## [25] "contact_city"            "contact_zip/postal_code"
## [27] "contact_country"         "supplementary_file"     
## [29] "data_row_count"

# 3. Keep only PD and healthy control
keep <- meta$characteristics_ch1 %in% c("Parkinson's disease", "healthy control")

meta_sub <- meta[keep, , drop = FALSE]
expr_sub <- expr[, keep]

# define group
group <- ifelse(meta_sub$characteristics_ch1 == "Parkinson's disease", "PD", "Control")
group <- factor(group, levels = c("Control", "PD"))

table(group)

## group
## Control      PD 
##      22      50

Log transformation and differential expression analysis

Log2 transformation is applied to stabilize variance and normalize expression values. Differential expression analysis is performed using the limma package, comparing PD patients with healthy controls.

# 4. Log transform
expr_sub <- log2(expr_sub + 1)

# optional sanity check
summary(as.vector(expr_sub))

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.1375  4.0704  5.6865  5.5989  6.9105 17.2761

# 5. Differential expression with limma
design <- model.matrix(~ group)
colnames(design) <- c("Intercept", "PD_vs_Control")

fit <- lmFit(expr_sub, design)
fit <- eBayes(fit)

res <- topTable(
  fit,
  coef = "PD_vs_Control",
  number = Inf,
  adjust.method = "BH"
)

res$probe_id <- rownames(res)

# save full results
write.csv(res, "GSE6613_DE_results.csv", row.names = FALSE)

# preview
head(res, 20)

##                  logFC  AveExpr         t      P.Value adj.P.Val          B
## 217094_s_at  0.6365489 6.102724  4.373186 3.962334e-05  0.680897  0.2160273
## 208666_s_at -0.4630350 6.321870 -4.194298 7.539137e-05  0.680897 -0.1527346
## 219437_s_at  0.8715065 6.390325  4.042142 1.288496e-04  0.680897 -0.4607988
## 205398_s_at -0.8355378 5.281489 -3.953186 1.753878e-04  0.680897 -0.6383221
## 211971_s_at -0.3574042 6.943788 -3.904231 2.074887e-04  0.680897 -0.7351599
## 212183_at   -1.0139486 5.300035 -3.836020 2.617295e-04  0.680897 -0.8690346
## 204255_s_at  0.6347101 6.210331  3.835486 2.622035e-04  0.680897 -0.8700780
## 202514_at   -1.1144834 3.485566 -3.784630 3.112968e-04  0.680897 -0.9690606
## 215231_at    0.9731268 3.570726  3.749731 3.499420e-04  0.680897 -1.0365695
## 212502_at   -0.3397220 6.407716 -3.689913 4.270452e-04  0.680897 -1.1514712
## 211249_at    0.9216533 4.715712  3.678883 4.429281e-04  0.680897 -1.1725464
## 208715_at    0.4903336 5.714713  3.648011 4.904241e-04  0.680897 -1.2313384
## 207205_at    0.3975092 7.392708  3.626939 5.255840e-04  0.680897 -1.2713038
## 216883_x_at -0.9267381 4.408227 -3.622146 5.339098e-04  0.680897 -1.2803760
## 220681_at    0.7268451 5.003339  3.603446 5.676080e-04  0.680897 -1.3157051
## 208576_s_at  0.8060151 2.635568  3.600735 5.726564e-04  0.680897 -1.3208164
## 211980_at    0.7459364 4.844631  3.589317 5.943977e-04  0.680897 -1.3423261
## 204274_at   -0.5111516 6.205530 -3.588566 5.958546e-04  0.680897 -1.3437393
## 206541_at    0.8570628 4.312389  3.575594 6.215645e-04  0.680897 -1.3681242
## 208398_s_at -0.3168321 7.083500 -3.549363 6.768059e-04  0.680897 -1.4172744
##                probe_id
## 217094_s_at 217094_s_at
## 208666_s_at 208666_s_at
## 219437_s_at 219437_s_at
## 205398_s_at 205398_s_at
## 211971_s_at 211971_s_at
## 212183_at     212183_at
## 204255_s_at 204255_s_at
## 202514_at     202514_at
## 215231_at     215231_at
## 212502_at     212502_at
## 211249_at     211249_at
## 208715_at     208715_at
## 207205_at     207205_at
## 216883_x_at 216883_x_at
## 220681_at     220681_at
## 208576_s_at 208576_s_at
## 211980_at     211980_at
## 204274_at     204274_at
## 206541_at     206541_at
## 208398_s_at 208398_s_at

GSE6613 volcano plot

The volcano plot shows the relationship between fold change and statistical significance for all genes. No genes meet the strict threshold (adjusted p-value < 0.05 and |logFC| > 0.5), indicating that there are no strongly differentially expressed genes between Parkinson’s disease patients and healthy controls in this dataset.

Most genes are clustered around logFC = 0 with low statistical significance, suggesting that the overall transcriptional differences in whole blood are subtle.

# 6. Volcano plot
# strict threshold
res$significant <- res$adj.P.Val < 0.05 & abs(res$logFC) > 0.5

p1 <- ggplot(res, aes(x = logFC, y = -log10(adj.P.Val), color = significant)) +
  geom_point(alpha = 0.7, size = 1.5) +
  theme_minimal() +
  labs(
    title = "GSE6613: PD vs Control",
    x = "logFC",
    y = "-log10(adjusted P value)"
  )

print(p1)

# save volcano plot
ggsave("GSE6613_volcano_plot.png", p1, width = 8, height = 6, dpi = 300)

GSE6613 Heatmap

The heatmap of the top 30 genes (ranked by absolute logFC) shows the expression patterns across all samples.

Although some variation can be observed, the clustering does not clearly separate Parkinson’s disease samples from healthy controls. This indicates that these genes are not sufficient to distinguish between the two groups, further supporting the observation that gene expression differences in this dataset are relatively weak.

# 7. Select top genes by effect size
res2 <- res[order(abs(res$logFC), decreasing = TRUE), ]
top_genes <- rownames(res2)[1:30]

top_table <- res2[1:30, ]
write.csv(top_table, "GSE6613_top30_by_logFC.csv", row.names = FALSE)

# 8. Heatmap
mat <- expr_sub[top_genes, , drop = FALSE]

annotation_col <- data.frame(Group = group)
rownames(annotation_col) <- colnames(mat)

pheatmap(
  mat,
  scale = "row",
  annotation_col = annotation_col,
  show_colnames = FALSE,
  main = "Top 30 Genes (PD vs Control)"
)

We analyzed gene expression differences between Parkinson’s disease patients and healthy controls using the GSE6613 dataset.

After preprocessing and log transformation, differential expression analysis was performed using the limma package. No genes met the strict significance threshold (adjusted p-value < 0.05 and |logFC| > 0.5), indicating weak differential expression signals.

The volcano plot showed that most genes had low fold changes and low statistical significance. The heatmap of the top 30 genes also failed to clearly separate PD and control samples.

Overall, these results suggest that gene expression differences in whole blood between PD patients and healthy individuals are subtle. Additional datasets or tissues may be required to identify more robust biomarkers.

gse62469 <- read_geo_series_matrix("GSE62469_series_matrix.txt.gz")

expr <- gse62469$expr
meta <- gse62469$meta
unique(meta$characteristics_ch1)

## [1] "disease status: healthy" "disease status: PD"

keep <- meta$characteristics_ch1 %in% c(
  "disease status: healthy",
  "disease status: PD"
)

meta_sub <- meta[keep, , drop = FALSE]
expr_sub <- expr[, keep]

group <- ifelse(
  meta_sub$characteristics_ch1 == "disease status: PD",
  "PD",
  "Control"
)

group <- factor(group, levels = c("Control", "PD"))

table(group)

## group
## Control      PD 
##      31      35

expr_sub <- log2(expr_sub + 1)
library(limma)

design <- model.matrix(~ group)
colnames(design) <- c("Intercept", "PD_vs_Control")

fit <- lmFit(expr_sub, design)
fit <- eBayes(fit)

res <- topTable(
  fit,
  coef = "PD_vs_Control",
  number = Inf,
  adjust.method = "BH"
)

res$gene <- rownames(res)

GSE62469 Volcano plot

The volcano plot for GSE62469 shows a limited range of fold changes and low statistical significance across most genes. No genes pass the strict threshold (adjusted p-value < 0.05 and |logFC| > 0.5), indicating weak differential expression signals between PD patients and healthy controls.

library(ggplot2)

res$significant <- res$adj.P.Val < 0.05 & abs(res$logFC) > 0.5

ggplot(res, aes(x = logFC, y = -log10(adj.P.Val), color = significant)) +
  geom_point(alpha = 0.7) +
  theme_minimal() +
  labs(title = "GSE62469: PD vs Control")

GSE62469 Heatmap

The heatmap of the top 30 genes does not show a clear separation between PD and control samples. The clustering pattern indicates that these genes are not sufficient to distinguish between the two groups.

library(pheatmap)

res2 <- res[order(abs(res$logFC), decreasing = TRUE), ]
top_genes <- rownames(res2)[1:30]

mat <- expr_sub[top_genes, ]

annotation_col <- data.frame(Group = group)
rownames(annotation_col) <- colnames(mat)

pheatmap(
  mat,
  scale = "row",
  annotation_col = annotation_col,
  show_colnames = FALSE,
  main = "Top 30 Genes (PD vs Control)"
)

PCA

Principal Component Analysis (PCA) was performed to explore the overall variance in the dataset. The PCA plot shows that PD and control samples are largely overlapping, with no clear separation between the two groups. This further supports the observation that gene expression differences in this dataset are subtle.

pca <- prcomp(t(expr_sub), scale. = TRUE)

pca_df <- data.frame(
  PC1 = pca$x[,1],
  PC2 = pca$x[,2],
  Group = group
)

ggplot(pca_df, aes(x = PC1, y = PC2, color = Group)) +
  geom_point(size = 3) +
  theme_minimal() +
  labs(title = "PCA: PD vs Control")

In this study, we analyzed gene expression differences between Parkinson’s disease (PD) patients and healthy controls using two independent datasets: GSE6613 and GSE62469.

Differential expression analysis using the limma package revealed that neither dataset showed strong evidence of differentially expressed genes under a strict threshold (adjusted p-value < 0.05 and |logFC| > 0.5). Volcano plots indicated that most genes exhibited small fold changes and low statistical significance.

Heatmap analysis of the top 30 genes showed no clear separation between PD and control samples in either dataset. Similarly, Principal Component Analysis (PCA) demonstrated substantial overlap between the two groups, suggesting that the overall transcriptional profiles are highly similar.

Together, these results indicate that gene expression differences in whole blood between PD patients and healthy individuals are subtle. This suggests that blood-based transcriptomic signals may not be sufficient for robust biomarker discovery, and alternative tissues or additional datasets may be required.

EDA

2026-03-29