The following analysis aims to reproduce some of the results presented in the study “Expression and Prognosis of CDC45 in Cervical Cancer Based on the GEO Database” by Liu et al. (2021), which evaluated the expression and prognostic value of the CDC45 gene in cervical cancer. For this purpose, the gene expression dataset available in the Gene Expression Omnibus (GEO) repository, under accession number GSE63514, was used. It is worth noting that GPL570 refers to the Affymetrix Human Genome U133 Plus 2.0 Array: it was the microarray platform used for the generation of gene expression data for the data set used: GSE63514.
In this section the code shared on the following site was used: https://www.ncbi.nlm.nih.gov/geo/geo2r/?acc=GSE63514
# Loading dataset GSE63514
#(GSEMatrix = TRUE, indicates that we are downloading the gene expression matrix)
gset <- getGEO("GSE63514", GSEMatrix =TRUE, getGPL=FALSE)
if (length(gset) > 1) idx <- grep("GPL570", attr(gset, "names")) else idx <- 1
gset <- gset[[idx]]
#express: Extracts the gene expression matrix from the gset object, creates a new object called ex
ex <- exprs(gset)
dim(ex)
## [1] 54675 128
#rows 54675 columns:128
# see if we have NAs:
sum(is.na(ex))
## [1] 0
#0 na
#A logarithmic transformation (log2) is performed
qx <- as.numeric(quantile(ex, c(0., 0.25, 0.5, 0.75, 0.99, 1.0), na.rm=T))
LogC <- (qx[5] > 100) ||
(qx[6]-qx[1] > 50 && qx[2] > 0)
if (LogC) { ex[which(ex <= 0)] <- NaN
ex <- log2(ex) }
# box-and-whisker plot
dev.new(width=3+ncol(gset)/6, height=5)
par(mar=c(7,4,2,1))
title <- paste ("GSE63514", "/", annotation(gset), sep ="")
boxplot(ex, boxwex=0.7, notch=T, main=title, outline=FALSE, las=2)
dev.off()
## png
## 2
# expression value distribution plot
par(mar=c(4,4,2,1))
title <- paste ("GSE63514", "/", annotation(gset), " value distribution", sep ="")
plotDensities(ex, main=title, legend=F)
# mean-variance trend
ex <- na.omit(ex) # eliminate rows with NAs
plotSA(lmFit(ex), main="Mean variance trend, GSE63514")
Exploratory data analysis and data visualization: The characteristics of the dataset samples are explored to identify and classify each one as “Normal”, “Cancer” or “Precancer” according to its description.
# Verify first rows:
head(pData(gset)$characteristics_ch1)
## [1] "tissue type: normal cervical epithelium"
## [2] "tissue type: normal cervical epithelium"
## [3] "tissue type: normal cervical epithelium"
## [4] "tissue type: normal cervical epithelium"
## [5] "tissue type: normal cervical epithelium"
## [6] "tissue type: normal cervical epithelium"
# Filter
cancer_samples <- grep("cancer", pData(gset)$characteristics_ch1, ignore.case = TRUE)
cancer_samples
## [1] 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119
## [20] 120 121 122 123 124 125 126 127 128
#Displays all unique descriptions that appear in the characteristics_ch1 column of the gset object
unique(pData(gset)$characteristics_ch1)
## [1] "tissue type: normal cervical epithelium"
## [2] "tissue type: cervical intraepithelial neoplasm, low grade lesion"
## [3] "tissue type: cervical intraepithelial neoplasm, moderate grade lesion"
## [4] "tissue type: cervical intraepithelial neoplasm, high grade lesion"
## [5] "tissue type: cervical squamous epithelial cancer"
# Assign condition labels based on different characteristics
condition_labels <- ifelse(grepl("normal", pData(gset)$characteristics_ch1, ignore.case = TRUE), "Normal",
ifelse(grepl("cancer", pData(gset)$characteristics_ch1, ignore.case = TRUE), "Cancer", "Precancer"))
# Assign the above labels to the gset object
pData(gset)$condition <- condition_labels
# Print the results in a table
table(pData(gset)$condition)
##
## Cancer Normal Precancer
## 28 24 76
1- Barplot/ Lollipop chart 2- Boxplot/ Violin plot
# Count the number of samples in each condition
condition_counts <- table(pData(gset)$condition)
# Convert the table to a data frame and rename the columns
condition_counts_df <- as.data.frame(condition_counts)
colnames(condition_counts_df) <- c("Condition", "Frequency")
# Inspect the first few rows to verify column structure
head(condition_counts_df)
## Condition Frequency
## 1 Cancer 28
## 2 Normal 24
## 3 Precancer 76
# Generate a bar plot showing the number of samples per condition
library(viridis) #'viridis' palette for colorblind-friendly
g2<- ggplot(condition_counts_df, aes(x = Condition, y = Frequency, fill = Condition)) +
geom_bar(stat = "identity") +
labs(title = "Sample Distribution by Condition", x = "Condition", y = "Number of Samples") +
scale_fill_viridis(discrete = TRUE) +
theme_minimal()
# Alternative visualization: lollipop chart
ggplot(condition_counts_df, aes(x = Condition, y = Frequency, color = Condition)) +
geom_segment(aes(x = Condition, xend = Condition, y = 0, yend = Frequency), linewidth = 1) +
geom_point(size = 3) +
labs(title = "Sample Distribution by Condition (Lollipop Chart)",
x = "Condition", y = "Sample Count") +
theme_minimal() +
scale_color_manual(values = c("Normal" = "blue", "Precancer" = "orange", "Cancer" = "red"))
# Prepare expression data in long format for plotting
expression_data_long <- data.frame(
Gene = rep(rownames(ex), each = ncol(ex)),
Expression = as.vector(t(ex)),
Condition = rep(pData(gset)$condition, times = nrow(ex))
)
# Boxplot: visualize gene expression distribution across conditions
ggplot(expression_data_long, aes(x = Condition, y = Expression, fill = Condition)) +
geom_boxplot() +
labs(title = "Gene Expression Distribution by Condition (Boxplot)",
x = "Condition", y = "Gene Expression") +
theme_minimal()
# Violin plot: alternative visualization for gene expression distribution
g3<- ggplot(expression_data_long, aes(x = Condition, y = Expression, fill = Condition)) +
geom_violin() +
labs(title = "Gene Expression Distribution by Condition (Violin Plot)",
x = "Condition", y = "Gene Expression") +
theme_minimal() +
scale_fill_brewer(palette = "Set3")
PCA Analysis: Normal vs Cancer Principal components (PCs) are used to identify variable dependencies, explore relationships among individuals, stabilize estimates, assess multivariate normality, and detect outliers. This technique is widely used in Transcriptomics and results very helpful for exploring complex gene expression data and revealing underlying patterns.
# Filter samples for "Cancer" and "Normal"
# Filtrar muestras para "Cancer" y "Normal"
selected_samples <- pData(gset)$condition %in% c("Cancer", "Normal")
# Extraer y transponer la matriz de expresión solo con las muestras seleccionadas
exprs_filtered <- exprs(gset)[, selected_samples]
# Realizar PCA
pca <- prcomp(t(exprs_filtered), scale. = TRUE)
# Crear un dataframe con las puntuaciones de PCA
pca_scores <- data.frame(pca$x)
# Añadir las condiciones (solo Cancer y Normal)
pca_scores$condition <- pData(gset)$condition[selected_samples]
# Calcular la varianza explicada por cada componente (PC1 y PC2)
explained_var <- summary(pca)$importance[2, 1:2] * 100
explained_var
## PC1 PC2
## 31.680 11.837
# PCA Plot
ggplot(pca_scores, aes(x = PC1, y = PC2, color = condition)) +
geom_point(size = 4, alpha = 0.7) + # Añadir transparencia para mejorar la visibilidad
scale_color_brewer(palette = "Set1") +
labs(
title = "PCA Plot",
x = paste0("PC1 (", round(explained_var[1], 2), "%)"),
y = paste0("PC2 (", round(explained_var[2], 2), "%)")
) +
theme_minimal() +
theme(
plot.title = element_text(hjust = 0.5), # Centrar título
legend.title = element_blank() # Eliminar título en la leyenda
) +
guides(color = guide_legend(override.aes = list(size = 4))) # Mejorar visibilidad de la leyenda
Save several charts into one, using the gridExtra library
library(gridExtra)
final_plot <- grid.arrange(g2,g3, ncol = 1)
#Save in png format
ggsave("final_combined_plots.png", final_plot, width = 8, height = 16)