#Load require packages
library(ggplot2)
library(FactoMineR)
library(factoextra)
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
library(kableExtra)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following object is masked from 'package:kableExtra':
## 
##     group_rows
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
#Load data set
GSE55763_100 <- read.csv("GSE55763_merged.csv", header = TRUE, stringsAsFactors = FALSE)
#Verify dimensions and first rows
dim(GSE55763_100)
## [1] 2664  103
head(GSE55763_100,3)

Convert variables in the correct type:

GSE55763_100$age <- as.numeric(GSE55763_100$age)
GSE55763_100$gender <- factor(GSE55763_100$gender, levels = c("M", "F"))

Visualizations

Age

ggplot(GSE55763_100, aes(x = age)) +
  geom_histogram(binwidth = 5, fill = "violet", color = "black") +
  theme_minimal() +
  labs(title = "Age Distribution", x = "Age", y = "Samples")

Gender

ggplot(GSE55763_100, aes(x = gender, fill = gender)) +
  geom_bar() +
  scale_fill_manual(values = c("steelblue", "pink")) +
  theme_minimal() +
  labs(title = "Gender Distribution", x = "Gender", y = "Samples")

Verify missing values

n_samples <- nrow(GSE55763_100)
na_per_var <- colSums(is.na(GSE55763_100))
na_per_var <- na_per_var[na_per_var > 0]

na_df <- data.frame(
  Variable = names(na_per_var),
  Missing  = as.numeric(na_per_var),
  row.names = NULL   
) %>%
  filter(Missing > 0) %>%   
  arrange(desc(Missing))  
na_df %>%
  kable("html", caption = "Variables with missing values") %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"),
                full_width = FALSE)
Variables with missing values
Variable Missing
cg03429968 59
cg04146137 45
cg02455868 3
cg00120358 3
cg03598412 2
cg00413097 1
cg01794405 1
cg03142837 1
cg02481741 1
cg02636585 1
cg01444413 1
cg03962463 1
# Total number of values in the dataset
total_values <- nrow(GSE55763_100) * ncol(GSE55763_100)

# Total number of missing values
total_missing <- sum(is.na(GSE55763_100))

# Percentage of missing values
missing_pct <- (total_missing / total_values) * 100

# Print summary
cat(sprintf("The dataset contains %d missing values (%.4f%%).\n",
            total_missing, missing_pct))
## The dataset contains 119 missing values (0.0434%).

In this case, missing values are assumed to be Missing Completely at Random (MCAR) and will be imputed using the median of each CpG.

X_cpg <- GSE55763_100[, 4:103]  #select cpg columns
# Impute missing values using the median 
X_cpg_imputed <- as.data.frame(lapply(X_cpg, function(x) {
  x[is.na(x)] <- median(x, na.rm = TRUE)
  return(x)
}))
GSE55763_100_imputed <- cbind(GSE55763_100[, 1:3], X_cpg_imputed) #combine with other variables
sum(is.na(GSE55763_100_imputed)) #verify NAs
## [1] 0
# Save imputed daat set
write.csv(GSE55763_100_imputed, file = "GSE55763_100_imputed.csv", row.names = FALSE)

PCA

X_cpg_scaled <- scale(X_cpg_top30)
pca_res <- PCA(X_cpg_scaled, graph = FALSE)
fviz_pca_ind(pca_res,
             geom.ind = "point",
             pointsize = 1.5,
             alpha.ind = 0.6,  # transparency
             col.ind = GSE55763_100_imputed$gender,
             palette = c("blue", "red"),
             addEllipses = TRUE)

# Show only top 10 contributing CpGs
fviz_pca_var(pca_res, col.var = "contrib", 
             select.var = list(contribution = 10)) +
  theme_minimal()

Top 10 CpGs contributing to PC1 and PC2
CpG Contribution
cg03052760 19.678355
cg02481741 14.334416
cg00414013 10.944840
cg00176657 10.345763
cg03962463 10.024569
cg02890642 10.023807
cg04111436 9.459641
cg02065717 8.926222
cg02636585 8.462062
cg02100365 8.291989

The top 10 CpGs contributing most to PC1 and PC2 capture the main sources of variation in the dataset.