FINAL PROJECT: EVALUATING EXPRESSION PROFILES OF KEY INFLAMMATORY GENES IN THYROID CANCER

SETUP:

library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.2.0     ✔ readr     2.2.0
## ✔ forcats   1.0.1     ✔ stringr   1.6.0
## ✔ ggplot2   4.0.2     ✔ tibble    3.3.1
## ✔ lubridate 1.9.5     ✔ tidyr     1.3.2
## ✔ purrr     1.2.1     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(ggplot2)
library(dplyr)
library(patchwork)
library(ggpubr)
## Warning: package 'ggpubr' was built under R version 4.5.3
Human_thyroid_arrays_public_trans_9_27_17 <- readRDS("R_project_IRF7_IL18_ISG15_04282026.rds")

public_annotation <- readRDS("public_annotation.rds")

Data Cleaning and Joining:

thyroid_df_filtered <- Human_thyroid_arrays_public_trans_9_27_17 %>%
      filter(Gene.Symbol %in% c("IRF7", "IL18", "ISG15"))

thyroid_long <- thyroid_df_filtered %>% 
      pivot_longer(
        cols = starts_with("GSM"),
        names_to = "Sample_ID",
        values_to = "Expression"
      )

thyroid_long_clean <- thyroid_long %>% 
  mutate(Sample_ID = str_extract(Sample_ID, "GSM[0-9]+"))


thyroid_long_unique <- thyroid_long_clean %>%
  group_by(Sample_ID, Gene.Symbol) %>%
  summarize(Expression = mean(Expression, na.rm = TRUE), .groups = 'drop')


colnames(public_annotation) <- make.names(colnames(public_annotation), 
                                          unique = TRUE)

colnames(public_annotation)[12:24] <- c("GSE_Number", "Sample_ID", "Sample_Name", 
                                        "Histology", "Sample_type", "Driver_summary", 
                                        "Tumor_purity", "X12_Index", "Unknown_Status", 
                                        "BRAF", "RAS", "RET_PTC", "Other_Mutation")

public_clean <- public_annotation %>% 
  filter(if_any(everything(), ~ str_detect(., "GSM")))


thyroid_master <- thyroid_long_unique %>% 
  left_join(public_clean, by = "Sample_ID")

summary(thyroid_master)
##   Sample_ID         Gene.Symbol.x        Expression     Gene.Symbol.y     
##  Length:648         Length:648         Min.   : 3.391   Length:648        
##  Class :character   Class :character   1st Qu.: 6.131   Class :character  
##  Mode  :character   Mode  :character   Median : 7.305   Mode  :character  
##                                        Mean   : 7.299                     
##                                        3rd Qu.: 8.422                     
##                                        Max.   :13.253                     
##     PECAM1            PECAM1.1           PECAM1.2           PECAM1.3        
##  Length:648         Length:648         Length:648         Length:648        
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##    PECAM1.4            X             X.1            X.2            X.3         
##  Length:648         Mode:logical   Mode:logical   Mode:logical   Mode:logical  
##  Class :character   NA's:648       NA's:648       NA's:648       NA's:648      
##  Mode  :character                                                              
##                                                                                
##                                                                                
##                                                                                
##    X.4           GSE_Number        Sample_Name         Histology        
##  Mode:logical   Length:648         Length:648         Length:648        
##  NA's:648       Class :character   Class :character   Class :character  
##                 Mode  :character   Mode  :character   Mode  :character  
##                                                                         
##                                                                         
##                                                                         
##  Sample_type        Driver_summary     Tumor_purity         X12_Index
##  Length:648         Length:648         Length:648         Min.   :1  
##  Class :character   Class :character   Class :character   1st Qu.:1  
##  Mode  :character   Mode  :character   Mode  :character   Median :1  
##                                                           Mean   :1  
##                                                           3rd Qu.:1  
##                                                           Max.   :1  
##  Unknown_Status         BRAF               RAS              RET_PTC         
##  Length:648         Length:648         Length:648         Length:648        
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##  Other_Mutation    
##  Length:648        
##  Class :character  
##  Mode  :character  
##                    
##                    
## 

INITIAL DATA VISUALIZATION: BOX PLOT FIGURES FOR INITIAL ANALYSIS

library(ggplot2)

plot_data <- thyroid_master %>%
  filter(Histology %in% c("Normal", "ATC", "PDTC", "PTC")) %>%
  rename(Gene.Symbol = Gene.Symbol.x) %>% 
  mutate(Histology = factor(Histology, levels = c("Normal", "PTC", "PDTC", "ATC")))


isg15_data <- plot_data %>% 
  filter(str_detect(Gene.Symbol, "ISG15"))

isg15_plot <- ggplot(isg15_data, aes(x = Histology, y = Expression, fill = Histology)) +
  geom_boxplot(outlier.shape = NA, alpha = 0.7) +
  geom_jitter(width = 0.2, alpha = 0.3, size = 1.5) +
  theme_minimal() +
  scale_fill_manual(values = c("Normal" = "#4DAF4A", "PTC" = "#377EB8", "PDTC" = "#984EA3", "ATC" = "#E41A1C")) +
  labs(title = "ISG15 Expression", x = NULL, y = "Log2 Intensity") +
  theme(plot.title = element_text(hjust = 0.5, face = "bold"), legend.position = "none")

print(isg15_plot)

irf7_data <- plot_data %>% 
  filter(Gene.Symbol == "IRF7")

irf7_plot <- ggplot(irf7_data, aes(x = Histology, y = Expression, fill = Histology)) +
  geom_boxplot(outlier.shape = NA, alpha = 0.7) +
  geom_jitter(width = 0.2, alpha = 0.3, size = 1.5) +
  theme_minimal() +
  scale_fill_manual(values = c("Normal" = "#4DAF4A", "PTC" = "#377EB8", "PDTC" = "#984EA3", "ATC" = "#E41A1C")) +
  labs(title = "IRF7 Expression", x = NULL, y = "Log2 Intensity") +
  theme(plot.title = element_text(hjust = 0.5, face = "bold"), legend.position = "none")

print(irf7_plot)

il18_data <- plot_data %>% 
  filter(Gene.Symbol == "IL18")

il18_plot <- ggplot(il18_data, aes(x = Histology, y = Expression, fill = Histology)) +
  geom_boxplot(outlier.shape = NA, alpha = 0.7) +
  geom_jitter(width = 0.2, alpha = 0.3, size = 1.5) +
  theme_minimal() +
  scale_fill_manual(values = c("Normal" = "#4DAF4A", "PTC" = "#377EB8", "PDTC" = "#984EA3", "ATC" = "#E41A1C")) +
  labs(title = "IL18 Expression", x = "Histological Classification", y = "Log2 Intensity") +
  theme(plot.title = element_text(hjust = 0.5, face = "bold"), legend.position = "none")

print(il18_plot)

P VALUE ANALYSIS:

irf7_anova <- oneway.test(Expression ~ Histology, 
                          data = filter(plot_data, Gene.Symbol == "IRF7"), 
                          var.equal = FALSE)


isg15_anova <- oneway.test(Expression ~ Histology, 
                           data = filter(plot_data, str_detect(Gene.Symbol, "ISG15")), 
                           var.equal = FALSE)


il18_anova <- oneway.test(Expression ~ Histology, 
                          data = filter(plot_data, Gene.Symbol == "IL18"), 
                          var.equal = FALSE)

cat(
  paste("IRF7 p-value: ", irf7_anova$p.value),
  paste("ISG15 p-value:", isg15_anova$p.value),
  paste("IL18 p-value: ", il18_anova$p.value),
  sep = "\n"
)
## IRF7 p-value:  4.78435014859038e-06
## ISG15 p-value: 9.53536079278678e-13
## IL18 p-value:  3.93989157719563e-08

Data readjustment - driver mutation BRAF V600E only:

braf_only_data <- plot_data %>%
  dplyr::filter(Driver_summary == "BRAF_V600E")

braf_plot <- ggplot(braf_only_data, aes(x = Histology, y = Expression, fill = Histology)) +
  geom_boxplot(outlier.shape = NA, alpha = 0.7) +
  geom_jitter(width = 0.2, alpha = 0.4, size = 1.2) +
  facet_wrap(~Gene.Symbol, scales = "free_y") +
  theme_minimal() +
  # Custom colors: Blue for PTC, Purple for PDTC, Red for ATC
  scale_fill_manual(values = c("PTC" = "#3498db", "PDTC" = "#9b59b6", "ATC" = "#e74c3c")) +
  labs(
    title = "Expression Analysis: BRAF_V600E Mutants Only",
    subtitle = "Standardizing by driver mutation to isolate histological impact",
    x = "Histology",
    y = "Log2 Expression (Normalized)"
  ) +
  theme(
    legend.position = "none",
    plot.title = element_text(face = "bold", hjust = 0.5),
    plot.subtitle = element_text(hjust = 0.5)
  )

print(braf_plot)

Readjusted Welch’s ANOVA test:

irf7_braf_anova <- oneway.test(Expression ~ Histology, 
                          data = filter(braf_only_data, Gene.Symbol == "IRF7"), 
                          var.equal = FALSE)


isg15_braf_anova <- oneway.test(Expression ~ Histology, 
                           data = filter(braf_only_data, str_detect(Gene.Symbol, "ISG15")), 
                           var.equal = FALSE)


il18_braf_anova <- oneway.test(Expression ~ Histology, 
                          data = filter(braf_only_data, Gene.Symbol == "IL18"), 
                          var.equal = FALSE)

cat(
  paste("IRF7_BRAF p-value: ", irf7_braf_anova$p.value),
  paste("ISG15_BRAF p-value:", isg15_braf_anova$p.value),
  paste("IL18_BRAF p-value: ", il18_braf_anova$p.value),
  sep = "\n"
)
## IRF7_BRAF p-value:  0.0266210630837767
## ISG15_BRAF p-value: 0.218002868726139
## IL18_BRAF p-value:  0.108941338525097

Further verification of data based on number of samples missing a driver mutation category:

driver_stats <- plot_data %>%
  summarise(
    count_missing = sum(Driver_summary == "" | Driver_summary == "ND" | is.na(Driver_summary)),
    total_rows = n(),
    proportion_missing = count_missing / total_rows,
    percentage_missing = proportion_missing * 100
  )

print(driver_stats)
## # A tibble: 1 × 4
##   count_missing total_rows proportion_missing percentage_missing
##           <int>      <int>              <dbl>              <dbl>
## 1           357        648              0.551               55.1

Based on plot data verification above, ATC driver mutation method was discarded.

FINER VISUALIZATION OF DATA: VIOLIN PLOTS FOR MORE APPROPRIATE STATISTICAL ANALYSIS:

isg15_violin_plot <- ggplot(isg15_data, aes(x = Histology, y = Expression, fill = Histology)) +
  geom_violin(trim = FALSE, alpha = 0.7, color = "black") + 
  geom_jitter(width = 0.2, alpha = 0.3, size = 1.5) +
  theme_minimal() +
  scale_fill_manual(values = c(
    "Normal" = "#4DAF4A", 
    "PTC"    = "#377EB8", 
    "PDTC"   = "#984EA3", 
    "ATC"    = "#E41A1C"
  )) +
  labs(
    title = "ISG15 Expression", 
    x = NULL, 
    y = "Log2 Intensity"
  ) +
  theme(
    plot.title = element_text(hjust = 0.5, face = "bold"), 
    legend.position = "none"
  )


print(isg15_violin_plot)

library(ggplot2)

irf7_violin_plot <- ggplot(irf7_data, aes(x = Histology, y = Expression, fill = Histology)) +
  geom_violin(trim = FALSE, alpha = 0.7, color = "black") + 
  
  geom_jitter(width = 0.2, alpha = 0.3, size = 1.5) +
  theme_minimal() +
  scale_fill_manual(values = c(
    "Normal" = "#4DAF4A", 
    "PTC"    = "#377EB8", 
    "PDTC"   = "#984EA3", 
    "ATC"    = "#E41A1C"
  )) +
  labs(
    title = "IRF7 Expression", 
    x = NULL, 
    y = "Log2 Intensity"
  ) +
  theme(
    plot.title = element_text(hjust = 0.5, face = "bold"), 
    legend.position = "none"
  )

print(irf7_violin_plot)

library(ggplot2)

il18_violin_plot <- ggplot(il18_data, aes(x = Histology, y = Expression, fill = Histology)) +
  geom_violin(trim = FALSE, alpha = 0.7, color = "black") + 
  
  geom_jitter(width = 0.2, alpha = 0.3, size = 1.5) +
  theme_minimal() +
  scale_fill_manual(values = c(
    "Normal" = "#4DAF4A", 
    "PTC"    = "#377EB8", 
    "PDTC"   = "#984EA3", 
    "ATC"    = "#E41A1C"
  )) +
  labs(
    title = "IL18 Expression", 
    x = "Histological Classification", 
    y = "Log2 Intensity"
  ) +
  theme(
    plot.title = element_text(hjust = 0.5, face = "bold"), 
    legend.position = "none"
  )

print(il18_violin_plot)

library(ggplot2)

braf_violin_plot <- ggplot(braf_only_data, aes(x = Histology, y = Expression, fill = Histology)) +
  geom_violin(trim = FALSE, alpha = 0.7, color = "black") + 
  geom_jitter(width = 0.2, alpha = 0.4, size = 1.2) +
  facet_wrap(~Gene.Symbol, scales = "free_y") +
  theme_minimal() +
  scale_fill_manual(values = c(
    "PTC"  = "#3498db", 
    "PDTC" = "#9b59b6", 
    "ATC"  = "#e74c3c"
  )) +
  
  labs(
    title = "Expression Analysis: BRAF_V600E Mutants Only",
    subtitle = "Standardizing by driver mutation to isolate histological impact",
    x = "Histology",
    y = "Log2 Expression (Normalized)"
  ) +
  
  theme(
    legend.position = "none",
    plot.title = element_text(face = "bold", hjust = 0.5),
    plot.subtitle = element_text(hjust = 0.5),
    strip.text = element_text(face = "bold") 
  )

print(braf_violin_plot)

plot_data <- thyroid_master %>%
  filter(Histology %in% c("Normal", "PTC", "PDTC", "ATC")) %>%
  mutate(Histology = factor(Histology, levels = c("Normal", "PTC", "PDTC", "ATC")))

my_comparisons <- list( c("Normal", "PTC"), c("Normal", "PDTC"), c("Normal", "ATC") )

r_sig_fig <- ggplot(plot_data, aes(x = Histology, y = Expression, fill = Histology)) +
  geom_violin(trim = FALSE, alpha = 0.5) +
  geom_jitter(width = 0.2, alpha = 0.3, size = 1.2) +
  facet_wrap(~ Gene.Symbol.x, scales = "free_y") +
  stat_compare_means(comparisons = my_comparisons, 
                     label = "p.signif", 
                     method = "wilcox.test") + 
  stat_compare_means(method = "anova", label.y.npc = "top", label.x.npc = 0.5) +
  scale_fill_manual(values = c("Normal" = "#4DAF4A", "PTC" = "#377EB8", 
                               "PDTC" = "#984EA3", "ATC" = "#E41A1C")) +
  theme_minimal() +
  labs(
    title = "Inflammatory Gene Expression Across Thyroid Cancer Histologies",
    subtitle = "Significance markers indicate comparison vs. Normal tissue",
    x = "Histological Classification",
    y = "Log2 Intensity (Microarray Units)"
  ) +
  theme(
    legend.position = "none",
    strip.text = element_text(face = "bold", size = 12),
    plot.title = element_text(hjust = 0.5, face = "bold")
  )

print(r_sig_fig)

Modified version:

plot_data <- thyroid_master %>%
  filter(Histology %in% c("Normal", "PTC", "ATC")) %>%
  mutate(Histology = factor(Histology, levels = c("Normal", "PTC", "ATC")))

my_comparisons <- list( 
  c("Normal", "PTC"), 
  c("Normal", "ATC"), 
  c("PTC", "ATC") 
)

ggplot(plot_data, aes(x = Histology, y = Expression, fill = Histology)) +
  geom_violin(trim = FALSE, alpha = 0.5) +
  geom_jitter(width = 0.2, alpha = 0.3, size = 1.2) +
  facet_wrap(~ Gene.Symbol.x, scales = "free_y") +
  
  stat_compare_means(
    comparisons = my_comparisons, 
    label = "p.signif", 
    method = "wilcox.test",
    step.increase = 0.12 
  ) + 
  
  
  scale_fill_manual(values = c(
    "Normal" = "#4DAF4A", 
    "PTC" = "#377EB8", 
    "ATC" = "#E41A1C"
  )) +
  
  theme_minimal() +
  labs(
    title = "Comparison of Inflammatory Gene Expression",
    subtitle = "Excluding PDTC; Significance via Wilcoxon Test",
    x = "Histological Classification",
    y = "Log2 Intensity"
  ) +
  theme(
    legend.position = "none",
    strip.text = element_text(face = "bold", size = 12),
    plot.title = element_text(hjust = 0.5, face = "bold")
  )

atc_corr_data <- thyroid_master %>%
  filter(Gene.Symbol.x %in% c("IRF7", "IL18", "ISG15")) %>%
  filter(Histology == "ATC") %>% # Filter specifically for ATC
  select(Sample_ID, Gene.Symbol.x, Expression) %>%
  pivot_wider(names_from = Gene.Symbol.x, values_from = Expression)


ggplot(atc_corr_data, aes(x = ISG15, y = IRF7)) +
  geom_point(color = "#E41A1C", alpha = 0.6, size = 2.5) + # Using your ATC Red color
  geom_smooth(method = "lm", color = "black", linetype = "solid", se = TRUE) +
  stat_cor(method = "spearman", label.x.npc = "left", label.y.npc = "top") + 
  theme_minimal() +
  labs(
    title = "ATC Histology: ISG15 vs IRF7 Correlation",
    subtitle = "Spearman Rank Correlation (Anaplastic Thyroid Carcinoma Only)",
    x = "ISG15 (Log2 Intensity)",
    y = "IRF7 (Log2 Intensity)"
  ) +
  theme(plot.title = element_text(face = "bold"))
## `geom_smooth()` using formula = 'y ~ x'

ggplot(atc_corr_data, aes(x = ISG15, y = IL18)) +
  geom_point(color = "#E41A1C", alpha = 0.6, size = 2.5) +
  geom_smooth(method = "lm", color = "black", linetype = "solid", se = TRUE) +
  stat_cor(method = "spearman") + 
  theme_minimal() +
  labs(
    title = "ATC Histology: ISG15 vs IL18 Correlation",
    x = "ISG15 (Log2 Intensity)",
    y = "IL18 (Log2 Intensity)"
  ) +
  theme(plot.title = element_text(face = "bold"))
## `geom_smooth()` using formula = 'y ~ x'

ggplot(atc_corr_data, aes(x = IRF7, y = IL18)) +
  geom_point(color = "#E41A1C", alpha = 0.6, size = 2.5) +
  geom_smooth(method = "lm", color = "black", linetype = "solid", se = TRUE) +
  stat_cor(method = "spearman") + 
  theme_minimal() +
  labs(
    title = "ATC Histology: IRF7 vs IL18 Correlation",
    x = "IRF7 (Log2 Intensity)",
    y = "IL18 (Log2 Intensity)"
  ) +
  theme(plot.title = element_text(face = "bold"))
## `geom_smooth()` using formula = 'y ~ x'