FINAL PROJECT: EVALUATING EXPRESSION PROFILES OF KEY INFLAMMATORY GENES IN THYROID CANCER
SETUP:
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.2.0 ✔ readr 2.2.0
## ✔ forcats 1.0.1 ✔ stringr 1.6.0
## ✔ ggplot2 4.0.2 ✔ tibble 3.3.1
## ✔ lubridate 1.9.5 ✔ tidyr 1.3.2
## ✔ purrr 1.2.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(ggplot2)
library(dplyr)
library(patchwork)
library(ggpubr)
## Warning: package 'ggpubr' was built under R version 4.5.3
Human_thyroid_arrays_public_trans_9_27_17 <- readRDS("R_project_IRF7_IL18_ISG15_04282026.rds")
public_annotation <- readRDS("public_annotation.rds")
Data Cleaning and Joining:
thyroid_df_filtered <- Human_thyroid_arrays_public_trans_9_27_17 %>%
filter(Gene.Symbol %in% c("IRF7", "IL18", "ISG15"))
thyroid_long <- thyroid_df_filtered %>%
pivot_longer(
cols = starts_with("GSM"),
names_to = "Sample_ID",
values_to = "Expression"
)
thyroid_long_clean <- thyroid_long %>%
mutate(Sample_ID = str_extract(Sample_ID, "GSM[0-9]+"))
thyroid_long_unique <- thyroid_long_clean %>%
group_by(Sample_ID, Gene.Symbol) %>%
summarize(Expression = mean(Expression, na.rm = TRUE), .groups = 'drop')
colnames(public_annotation) <- make.names(colnames(public_annotation),
unique = TRUE)
colnames(public_annotation)[12:24] <- c("GSE_Number", "Sample_ID", "Sample_Name",
"Histology", "Sample_type", "Driver_summary",
"Tumor_purity", "X12_Index", "Unknown_Status",
"BRAF", "RAS", "RET_PTC", "Other_Mutation")
public_clean <- public_annotation %>%
filter(if_any(everything(), ~ str_detect(., "GSM")))
thyroid_master <- thyroid_long_unique %>%
left_join(public_clean, by = "Sample_ID")
summary(thyroid_master)
## Sample_ID Gene.Symbol.x Expression Gene.Symbol.y
## Length:648 Length:648 Min. : 3.391 Length:648
## Class :character Class :character 1st Qu.: 6.131 Class :character
## Mode :character Mode :character Median : 7.305 Mode :character
## Mean : 7.299
## 3rd Qu.: 8.422
## Max. :13.253
## PECAM1 PECAM1.1 PECAM1.2 PECAM1.3
## Length:648 Length:648 Length:648 Length:648
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
## PECAM1.4 X X.1 X.2 X.3
## Length:648 Mode:logical Mode:logical Mode:logical Mode:logical
## Class :character NA's:648 NA's:648 NA's:648 NA's:648
## Mode :character
##
##
##
## X.4 GSE_Number Sample_Name Histology
## Mode:logical Length:648 Length:648 Length:648
## NA's:648 Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
## Sample_type Driver_summary Tumor_purity X12_Index
## Length:648 Length:648 Length:648 Min. :1
## Class :character Class :character Class :character 1st Qu.:1
## Mode :character Mode :character Mode :character Median :1
## Mean :1
## 3rd Qu.:1
## Max. :1
## Unknown_Status BRAF RAS RET_PTC
## Length:648 Length:648 Length:648 Length:648
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
## Other_Mutation
## Length:648
## Class :character
## Mode :character
##
##
##
INITIAL DATA VISUALIZATION: BOX PLOT FIGURES FOR INITIAL ANALYSIS
library(ggplot2)
plot_data <- thyroid_master %>%
filter(Histology %in% c("Normal", "ATC", "PDTC", "PTC")) %>%
rename(Gene.Symbol = Gene.Symbol.x) %>%
mutate(Histology = factor(Histology, levels = c("Normal", "PTC", "PDTC", "ATC")))
isg15_data <- plot_data %>%
filter(str_detect(Gene.Symbol, "ISG15"))
isg15_plot <- ggplot(isg15_data, aes(x = Histology, y = Expression, fill = Histology)) +
geom_boxplot(outlier.shape = NA, alpha = 0.7) +
geom_jitter(width = 0.2, alpha = 0.3, size = 1.5) +
theme_minimal() +
scale_fill_manual(values = c("Normal" = "#4DAF4A", "PTC" = "#377EB8", "PDTC" = "#984EA3", "ATC" = "#E41A1C")) +
labs(title = "ISG15 Expression", x = NULL, y = "Log2 Intensity") +
theme(plot.title = element_text(hjust = 0.5, face = "bold"), legend.position = "none")
print(isg15_plot)
irf7_data <- plot_data %>%
filter(Gene.Symbol == "IRF7")
irf7_plot <- ggplot(irf7_data, aes(x = Histology, y = Expression, fill = Histology)) +
geom_boxplot(outlier.shape = NA, alpha = 0.7) +
geom_jitter(width = 0.2, alpha = 0.3, size = 1.5) +
theme_minimal() +
scale_fill_manual(values = c("Normal" = "#4DAF4A", "PTC" = "#377EB8", "PDTC" = "#984EA3", "ATC" = "#E41A1C")) +
labs(title = "IRF7 Expression", x = NULL, y = "Log2 Intensity") +
theme(plot.title = element_text(hjust = 0.5, face = "bold"), legend.position = "none")
print(irf7_plot)
il18_data <- plot_data %>%
filter(Gene.Symbol == "IL18")
il18_plot <- ggplot(il18_data, aes(x = Histology, y = Expression, fill = Histology)) +
geom_boxplot(outlier.shape = NA, alpha = 0.7) +
geom_jitter(width = 0.2, alpha = 0.3, size = 1.5) +
theme_minimal() +
scale_fill_manual(values = c("Normal" = "#4DAF4A", "PTC" = "#377EB8", "PDTC" = "#984EA3", "ATC" = "#E41A1C")) +
labs(title = "IL18 Expression", x = "Histological Classification", y = "Log2 Intensity") +
theme(plot.title = element_text(hjust = 0.5, face = "bold"), legend.position = "none")
print(il18_plot)
P VALUE ANALYSIS:
irf7_anova <- oneway.test(Expression ~ Histology,
data = filter(plot_data, Gene.Symbol == "IRF7"),
var.equal = FALSE)
isg15_anova <- oneway.test(Expression ~ Histology,
data = filter(plot_data, str_detect(Gene.Symbol, "ISG15")),
var.equal = FALSE)
il18_anova <- oneway.test(Expression ~ Histology,
data = filter(plot_data, Gene.Symbol == "IL18"),
var.equal = FALSE)
cat(
paste("IRF7 p-value: ", irf7_anova$p.value),
paste("ISG15 p-value:", isg15_anova$p.value),
paste("IL18 p-value: ", il18_anova$p.value),
sep = "\n"
)
## IRF7 p-value: 4.78435014859038e-06
## ISG15 p-value: 9.53536079278678e-13
## IL18 p-value: 3.93989157719563e-08
Data readjustment - driver mutation BRAF V600E only:
braf_only_data <- plot_data %>%
dplyr::filter(Driver_summary == "BRAF_V600E")
braf_plot <- ggplot(braf_only_data, aes(x = Histology, y = Expression, fill = Histology)) +
geom_boxplot(outlier.shape = NA, alpha = 0.7) +
geom_jitter(width = 0.2, alpha = 0.4, size = 1.2) +
facet_wrap(~Gene.Symbol, scales = "free_y") +
theme_minimal() +
# Custom colors: Blue for PTC, Purple for PDTC, Red for ATC
scale_fill_manual(values = c("PTC" = "#3498db", "PDTC" = "#9b59b6", "ATC" = "#e74c3c")) +
labs(
title = "Expression Analysis: BRAF_V600E Mutants Only",
subtitle = "Standardizing by driver mutation to isolate histological impact",
x = "Histology",
y = "Log2 Expression (Normalized)"
) +
theme(
legend.position = "none",
plot.title = element_text(face = "bold", hjust = 0.5),
plot.subtitle = element_text(hjust = 0.5)
)
print(braf_plot)
Readjusted Welch’s ANOVA test:
irf7_braf_anova <- oneway.test(Expression ~ Histology,
data = filter(braf_only_data, Gene.Symbol == "IRF7"),
var.equal = FALSE)
isg15_braf_anova <- oneway.test(Expression ~ Histology,
data = filter(braf_only_data, str_detect(Gene.Symbol, "ISG15")),
var.equal = FALSE)
il18_braf_anova <- oneway.test(Expression ~ Histology,
data = filter(braf_only_data, Gene.Symbol == "IL18"),
var.equal = FALSE)
cat(
paste("IRF7_BRAF p-value: ", irf7_braf_anova$p.value),
paste("ISG15_BRAF p-value:", isg15_braf_anova$p.value),
paste("IL18_BRAF p-value: ", il18_braf_anova$p.value),
sep = "\n"
)
## IRF7_BRAF p-value: 0.0266210630837767
## ISG15_BRAF p-value: 0.218002868726139
## IL18_BRAF p-value: 0.108941338525097
Further verification of data based on number of samples missing a driver mutation category:
driver_stats <- plot_data %>%
summarise(
count_missing = sum(Driver_summary == "" | Driver_summary == "ND" | is.na(Driver_summary)),
total_rows = n(),
proportion_missing = count_missing / total_rows,
percentage_missing = proportion_missing * 100
)
print(driver_stats)
## # A tibble: 1 × 4
## count_missing total_rows proportion_missing percentage_missing
## <int> <int> <dbl> <dbl>
## 1 357 648 0.551 55.1
Based on plot data verification above, ATC driver mutation method was discarded.
FINER VISUALIZATION OF DATA: VIOLIN PLOTS FOR MORE APPROPRIATE STATISTICAL ANALYSIS:
isg15_violin_plot <- ggplot(isg15_data, aes(x = Histology, y = Expression, fill = Histology)) +
geom_violin(trim = FALSE, alpha = 0.7, color = "black") +
geom_jitter(width = 0.2, alpha = 0.3, size = 1.5) +
theme_minimal() +
scale_fill_manual(values = c(
"Normal" = "#4DAF4A",
"PTC" = "#377EB8",
"PDTC" = "#984EA3",
"ATC" = "#E41A1C"
)) +
labs(
title = "ISG15 Expression",
x = NULL,
y = "Log2 Intensity"
) +
theme(
plot.title = element_text(hjust = 0.5, face = "bold"),
legend.position = "none"
)
print(isg15_violin_plot)
library(ggplot2)
irf7_violin_plot <- ggplot(irf7_data, aes(x = Histology, y = Expression, fill = Histology)) +
geom_violin(trim = FALSE, alpha = 0.7, color = "black") +
geom_jitter(width = 0.2, alpha = 0.3, size = 1.5) +
theme_minimal() +
scale_fill_manual(values = c(
"Normal" = "#4DAF4A",
"PTC" = "#377EB8",
"PDTC" = "#984EA3",
"ATC" = "#E41A1C"
)) +
labs(
title = "IRF7 Expression",
x = NULL,
y = "Log2 Intensity"
) +
theme(
plot.title = element_text(hjust = 0.5, face = "bold"),
legend.position = "none"
)
print(irf7_violin_plot)
library(ggplot2)
il18_violin_plot <- ggplot(il18_data, aes(x = Histology, y = Expression, fill = Histology)) +
geom_violin(trim = FALSE, alpha = 0.7, color = "black") +
geom_jitter(width = 0.2, alpha = 0.3, size = 1.5) +
theme_minimal() +
scale_fill_manual(values = c(
"Normal" = "#4DAF4A",
"PTC" = "#377EB8",
"PDTC" = "#984EA3",
"ATC" = "#E41A1C"
)) +
labs(
title = "IL18 Expression",
x = "Histological Classification",
y = "Log2 Intensity"
) +
theme(
plot.title = element_text(hjust = 0.5, face = "bold"),
legend.position = "none"
)
print(il18_violin_plot)
library(ggplot2)
braf_violin_plot <- ggplot(braf_only_data, aes(x = Histology, y = Expression, fill = Histology)) +
geom_violin(trim = FALSE, alpha = 0.7, color = "black") +
geom_jitter(width = 0.2, alpha = 0.4, size = 1.2) +
facet_wrap(~Gene.Symbol, scales = "free_y") +
theme_minimal() +
scale_fill_manual(values = c(
"PTC" = "#3498db",
"PDTC" = "#9b59b6",
"ATC" = "#e74c3c"
)) +
labs(
title = "Expression Analysis: BRAF_V600E Mutants Only",
subtitle = "Standardizing by driver mutation to isolate histological impact",
x = "Histology",
y = "Log2 Expression (Normalized)"
) +
theme(
legend.position = "none",
plot.title = element_text(face = "bold", hjust = 0.5),
plot.subtitle = element_text(hjust = 0.5),
strip.text = element_text(face = "bold")
)
print(braf_violin_plot)
plot_data <- thyroid_master %>%
filter(Histology %in% c("Normal", "PTC", "PDTC", "ATC")) %>%
mutate(Histology = factor(Histology, levels = c("Normal", "PTC", "PDTC", "ATC")))
my_comparisons <- list( c("Normal", "PTC"), c("Normal", "PDTC"), c("Normal", "ATC") )
r_sig_fig <- ggplot(plot_data, aes(x = Histology, y = Expression, fill = Histology)) +
geom_violin(trim = FALSE, alpha = 0.5) +
geom_jitter(width = 0.2, alpha = 0.3, size = 1.2) +
facet_wrap(~ Gene.Symbol.x, scales = "free_y") +
stat_compare_means(comparisons = my_comparisons,
label = "p.signif",
method = "wilcox.test") +
stat_compare_means(method = "anova", label.y.npc = "top", label.x.npc = 0.5) +
scale_fill_manual(values = c("Normal" = "#4DAF4A", "PTC" = "#377EB8",
"PDTC" = "#984EA3", "ATC" = "#E41A1C")) +
theme_minimal() +
labs(
title = "Inflammatory Gene Expression Across Thyroid Cancer Histologies",
subtitle = "Significance markers indicate comparison vs. Normal tissue",
x = "Histological Classification",
y = "Log2 Intensity (Microarray Units)"
) +
theme(
legend.position = "none",
strip.text = element_text(face = "bold", size = 12),
plot.title = element_text(hjust = 0.5, face = "bold")
)
print(r_sig_fig)
Modified version:
plot_data <- thyroid_master %>%
filter(Histology %in% c("Normal", "PTC", "ATC")) %>%
mutate(Histology = factor(Histology, levels = c("Normal", "PTC", "ATC")))
my_comparisons <- list(
c("Normal", "PTC"),
c("Normal", "ATC"),
c("PTC", "ATC")
)
ggplot(plot_data, aes(x = Histology, y = Expression, fill = Histology)) +
geom_violin(trim = FALSE, alpha = 0.5) +
geom_jitter(width = 0.2, alpha = 0.3, size = 1.2) +
facet_wrap(~ Gene.Symbol.x, scales = "free_y") +
stat_compare_means(
comparisons = my_comparisons,
label = "p.signif",
method = "wilcox.test",
step.increase = 0.12
) +
scale_fill_manual(values = c(
"Normal" = "#4DAF4A",
"PTC" = "#377EB8",
"ATC" = "#E41A1C"
)) +
theme_minimal() +
labs(
title = "Comparison of Inflammatory Gene Expression",
subtitle = "Excluding PDTC; Significance via Wilcoxon Test",
x = "Histological Classification",
y = "Log2 Intensity"
) +
theme(
legend.position = "none",
strip.text = element_text(face = "bold", size = 12),
plot.title = element_text(hjust = 0.5, face = "bold")
)
atc_corr_data <- thyroid_master %>%
filter(Gene.Symbol.x %in% c("IRF7", "IL18", "ISG15")) %>%
filter(Histology == "ATC") %>% # Filter specifically for ATC
select(Sample_ID, Gene.Symbol.x, Expression) %>%
pivot_wider(names_from = Gene.Symbol.x, values_from = Expression)
ggplot(atc_corr_data, aes(x = ISG15, y = IRF7)) +
geom_point(color = "#E41A1C", alpha = 0.6, size = 2.5) + # Using your ATC Red color
geom_smooth(method = "lm", color = "black", linetype = "solid", se = TRUE) +
stat_cor(method = "spearman", label.x.npc = "left", label.y.npc = "top") +
theme_minimal() +
labs(
title = "ATC Histology: ISG15 vs IRF7 Correlation",
subtitle = "Spearman Rank Correlation (Anaplastic Thyroid Carcinoma Only)",
x = "ISG15 (Log2 Intensity)",
y = "IRF7 (Log2 Intensity)"
) +
theme(plot.title = element_text(face = "bold"))
## `geom_smooth()` using formula = 'y ~ x'
ggplot(atc_corr_data, aes(x = ISG15, y = IL18)) +
geom_point(color = "#E41A1C", alpha = 0.6, size = 2.5) +
geom_smooth(method = "lm", color = "black", linetype = "solid", se = TRUE) +
stat_cor(method = "spearman") +
theme_minimal() +
labs(
title = "ATC Histology: ISG15 vs IL18 Correlation",
x = "ISG15 (Log2 Intensity)",
y = "IL18 (Log2 Intensity)"
) +
theme(plot.title = element_text(face = "bold"))
## `geom_smooth()` using formula = 'y ~ x'
ggplot(atc_corr_data, aes(x = IRF7, y = IL18)) +
geom_point(color = "#E41A1C", alpha = 0.6, size = 2.5) +
geom_smooth(method = "lm", color = "black", linetype = "solid", se = TRUE) +
stat_cor(method = "spearman") +
theme_minimal() +
labs(
title = "ATC Histology: IRF7 vs IL18 Correlation",
x = "IRF7 (Log2 Intensity)",
y = "IL18 (Log2 Intensity)"
) +
theme(plot.title = element_text(face = "bold"))
## `geom_smooth()` using formula = 'y ~ x'