#Load require packages
library(ggplot2)
library(FactoMineR)
library(factoextra)
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
library(kableExtra)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following object is masked from 'package:kableExtra':
##
## group_rows
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
#Load data set
GSE55763_100 <- read.csv("GSE55763_merged.csv", header = TRUE, stringsAsFactors = FALSE)
#Verify dimensions and first rows
dim(GSE55763_100)
## [1] 2664 103
head(GSE55763_100,3)
Convert variables in the correct type:
GSE55763_100$age <- as.numeric(GSE55763_100$age)
GSE55763_100$gender <- factor(GSE55763_100$gender, levels = c("M", "F"))
Visualizations
Age
ggplot(GSE55763_100, aes(x = age)) +
geom_histogram(binwidth = 5, fill = "violet", color = "black") +
theme_minimal() +
labs(title = "Age Distribution", x = "Age", y = "Samples")
Gender
ggplot(GSE55763_100, aes(x = gender, fill = gender)) +
geom_bar() +
scale_fill_manual(values = c("steelblue", "pink")) +
theme_minimal() +
labs(title = "Gender Distribution", x = "Gender", y = "Samples")
Verify missing values
n_samples <- nrow(GSE55763_100)
na_per_var <- colSums(is.na(GSE55763_100))
na_per_var <- na_per_var[na_per_var > 0]
na_df <- data.frame(
Variable = names(na_per_var),
Missing = as.numeric(na_per_var),
row.names = NULL
) %>%
filter(Missing > 0) %>%
arrange(desc(Missing))
na_df %>%
kable("html", caption = "Variables with missing values") %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed"),
full_width = FALSE)
Variable | Missing |
---|---|
cg03429968 | 59 |
cg04146137 | 45 |
cg02455868 | 3 |
cg00120358 | 3 |
cg03598412 | 2 |
cg00413097 | 1 |
cg01794405 | 1 |
cg03142837 | 1 |
cg02481741 | 1 |
cg02636585 | 1 |
cg01444413 | 1 |
cg03962463 | 1 |
# Total number of values in the dataset
total_values <- nrow(GSE55763_100) * ncol(GSE55763_100)
# Total number of missing values
total_missing <- sum(is.na(GSE55763_100))
# Percentage of missing values
missing_pct <- (total_missing / total_values) * 100
# Print summary
cat(sprintf("The dataset contains %d missing values (%.4f%%).\n",
total_missing, missing_pct))
## The dataset contains 119 missing values (0.0434%).
In this case, missing values are assumed to be Missing Completely at Random (MCAR) and will be imputed using the median of each CpG.
X_cpg <- GSE55763_100[, 4:103] #select cpg columns
# Impute missing values using the median
X_cpg_imputed <- as.data.frame(lapply(X_cpg, function(x) {
x[is.na(x)] <- median(x, na.rm = TRUE)
return(x)
}))
GSE55763_100_imputed <- cbind(GSE55763_100[, 1:3], X_cpg_imputed) #combine with other variables
sum(is.na(GSE55763_100_imputed)) #verify NAs
## [1] 0
# Save imputed daat set
write.csv(GSE55763_100_imputed, file = "GSE55763_100_imputed.csv", row.names = FALSE)
PCA
X_cpg_scaled <- scale(X_cpg_top30)
pca_res <- PCA(X_cpg_scaled, graph = FALSE)
fviz_pca_ind(pca_res,
geom.ind = "point",
pointsize = 1.5,
alpha.ind = 0.6, # transparency
col.ind = GSE55763_100_imputed$gender,
palette = c("blue", "red"),
addEllipses = TRUE)
# Show only top 10 contributing CpGs
fviz_pca_var(pca_res, col.var = "contrib",
select.var = list(contribution = 10)) +
theme_minimal()
CpG | Contribution |
---|---|
cg03052760 | 19.678355 |
cg02481741 | 14.334416 |
cg00414013 | 10.944840 |
cg00176657 | 10.345763 |
cg03962463 | 10.024569 |
cg02890642 | 10.023807 |
cg04111436 | 9.459641 |
cg02065717 | 8.926222 |
cg02636585 | 8.462062 |
cg02100365 | 8.291989 |
The top 10 CpGs contributing most to PC1 and PC2 capture the main sources of variation in the dataset.