---
title: "CCDC134 Group Analysis"
author: "Maoyan"
date: "2025-07-02"
output: html_document
---
## Load Libraries and Set Working Directory
``` r
library(readxl)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
# Make the relative "./..." data paths read later in this script resolve
# against the RNAseq folder of the ccdc134 checkout.
# NOTE(review): this path is hard-coded under the author's home directory, so
# the script only runs on a machine with the same layout -- consider a
# project-relative path instead.
setwd("~/Documents/GitHub/ccdc134/RNAseq")
```

## Read and Process Data

### Load Group Data

``` r
# Read the CCDC134 grouping table, use its seqn column as row names, and keep
# only the rows where columns 3 and 4 are both non-missing.
group <- read.csv("./ccdc134-median2group.csv")
rownames(group) <- group$seqn
has_group_values <- complete.cases(group[, 3:4])
group <- group[has_group_values, ]
```

### Load Clinics Data

``` r
# Read the clinical spreadsheet as a plain data.frame (not a tibble) so that
# row names can be assigned below.
clinics <- as.data.frame(read_excel("./103clincs_data.xlsx"))
## New names:
## • `CD34` -> `CD34...21`
## • `CD34` -> `CD34...23`
# Use the seqn column as row names, then rename the last column to "cd34".
rownames(clinics) <- clinics$seqn
names(clinics)[length(clinics)] <- "cd34"
```

### Merge Data

``` r
# Join columns 3-4 of `group` with clinical columns 5-15 plus the last column
# ("cd34") of `clinics`, matching rows on their row names.
group_cols <- group[, 3:4, drop = FALSE]
clinic_cols <- clinics[, c(5:15, ncol(clinics)), drop = FALSE]
merged_data <- merge(group_cols, clinic_cols, by = "row.names")

# merge() returns the matched row names in a "Row.names" column; move them
# back into the row names and drop that column.
rownames(merged_data) <- merged_data$Row.names
merged_data$Row.names <- NULL

# Literal "NA" strings become real missing values.
merged_data[merged_data == "NA"] <- NA

# Every column except the first is categorical; NA is not kept as a level.
merged_data <- merged_data %>%
  mutate(across(-1, function(column) factor(column, exclude = NA)))

# Column 2 is the CCDC134 group: code 1 is labelled first, code 0 second.
merged_data[[2]] <- factor(
  merged_data[[2]],
  levels = c(1, 0),
  labels = c("高水平组", "低水平组")
)
```

## Contingency Tables

``` r
# One two-way count table per clinical factor (columns 3 onward): rows are the
# CCDC134 group in column 2, columns are the levels of the clinical factor.
# The list is named after the clinical factor columns.
distribution_tables <- setNames(
  lapply(3:ncol(merged_data), function(col_idx) {
    table(merged_data[[2]], merged_data[[col_idx]])
  }),
  names(merged_data)[-(1:2)]
)
distribution_tables
## $sex
##
## 1 2
## 高水平组 47 4
## 低水平组 43 9
##
## $ethnic
##
## 1 2
## 高水平组 34 17
## 低水平组 36 16
##
## $CK19
##
## 0 1
## 高水平组 35 7
## 低水平组 39 7
##
## $lymph
##
## 0 1
## 高水平组 34 17
## 低水平组 34 14
##
## $blood
##
## 0 1
## 高水平组 36 15
## 低水平组 37 13
##
## $necro
##
## 0 1
## 高水平组 10 41
## 低水平组 22 28
##
## $capsule
##
## 0 1
## 高水平组 15 36
## 低水平组 10 42
##
## $TumorCAPSULE
##
## 0 1
## 高水平组 34 17
## 低水平组 36 16
##
## $MVI
##
## 0 1
## 高水平组 27 24
## 低水平组 32 20
##
## $MDmax
##
## 1 2
## 高水平组 19 32
## 低水平组 18 34
##
## $TumorN
##
## 1 2
## 高水平组 43 8
## 低水平组 45 7
##
## $cd34
##
## 0 1
## 高水平组 18 32
## 低水平组 21 30
```

## Chi-Squared Tests and Bar Plots

### Display Bar Plots

``` r
# Chi-squared test of the CCDC134 group (column 2) against every clinical
# factor (columns 3 onward), with one dodged bar plot per factor.
# `p_values` collects the test p-value of each factor, keyed by column name.
p_values <- list()
# seq_len()[-(1:2)] is empty when there are fewer than 3 columns, whereas
# 3:ncol() would count backwards (3, 2).
for (i in seq_len(ncol(merged_data))[-(1:2)]) {
  var_name <- colnames(merged_data)[i]

  # Drop rows where this factor is missing so the test and the plot are based
  # on the same observations.
  filtered_data <- merged_data[!is.na(merged_data[[i]]), ]
  counts <- table(filtered_data[[2]], filtered_data[[i]])

  test_result <- chisq.test(counts)
  p_values[[var_name]] <- test_result$p.value

  # margin = 2: percentages add up to 100 within each level of the clinical
  # factor (i.e. within each x-axis position).
  plot_data <- as.data.frame(prop.table(counts, margin = 2) * 100)
  colnames(plot_data) <- c("CCDC134", "GroupOther", "Percentage")

  plot <- ggplot(plot_data, aes(x = GroupOther, y = Percentage, fill = CCDC134)) +
    geom_bar(stat = "identity", position = "dodge") +
    geom_text(aes(label = paste0(round(Percentage, 1), "%")),
              position = position_dodge(width = 0.9), vjust = -0.5, size = 3) +
    labs(title = paste("Barplot for", var_name),
         subtitle = paste("p-value:", signif(test_result$p.value, 3)),
         x = "Group", y = "Percentage (%)") +
    # Labels are applied in factor-level order. Column 2 is built with the
    # high-level group first and the low-level group second, so the labels
    # must read c("high", "low"); the reverse order swaps the legend.
    scale_fill_discrete(name = "CCDC134", labels = c("high", "low")) +
    theme_minimal() +
    theme(axis.text.x = element_text(angle = 45, hjust = 1))

  # ggplot objects are not auto-printed inside a loop.
  print(plot)
}












```

### Grouped Bar Plots

``` r
# Stacked bar plot per clinical factor (columns 3 onward): one bar per CCDC134
# group (column 2), split by the levels of the clinical factor.
# seq_len()[-(1:2)] is empty when there are fewer than 3 columns, whereas
# 3:ncol() would count backwards (3, 2).
for (i in seq_len(ncol(merged_data))[-(1:2)]) {
  var_name <- colnames(merged_data)[i]

  # Drop rows where this factor is missing so the test and the plot are based
  # on the same observations.
  filtered_data <- merged_data[!is.na(merged_data[[i]]), ]
  counts <- table(filtered_data[[2]], filtered_data[[i]])

  test_result <- chisq.test(counts)
  p_values[[var_name]] <- test_result$p.value

  # margin = 1: percentages add up to 100 within each CCDC134 group, so every
  # stacked bar reaches 100%.
  plot_data <- as.data.frame(prop.table(counts, margin = 1) * 100)
  colnames(plot_data) <- c("CCDC134", "GroupOther", "Percentage")

  stacked_plot <- ggplot(plot_data, aes(x = CCDC134, y = Percentage, fill = GroupOther)) +
    geom_bar(stat = "identity", position = "stack") +
    geom_text(aes(label = paste0(round(Percentage, 1), "%")),
              position = position_stack(vjust = 0.5), size = 3) +
    labs(title = paste("Barplot for", var_name),
         subtitle = paste("p-value:", signif(test_result$p.value, 3)),
         x = "CCDC134", y = "Percentage (%)") +
    # The fill encodes the clinical factor, not the CCDC134 group, so the
    # legend is titled with the factor name and keeps the factor's own level
    # codes instead of being relabelled "Low"/"High".
    scale_fill_discrete(name = var_name) +
    theme_minimal() +
    theme(axis.text.x = element_text(angle = 45, hjust = 1))

  # ggplot objects are not auto-printed inside a loop; without print() no
  # plot is rendered.
  print(stacked_plot)
}
```

## Group Statistics

``` r
# Median and mean of the first column of merged_data within each CCDC134
# group (column 2).
#
# The columns are referenced through the data mask (.data / across) rather
# than as merged_data[[1]]: indexing the outer data frame inside summarise()
# ignores the grouping and returns the whole-column statistic for every group,
# which is why both groups previously showed identical values. The rendered
# output below must be regenerated. The grouping column now carries its real
# name instead of `merged_data[[2]]`.
value_col <- names(merged_data)[1]
group_col <- names(merged_data)[2]
group_stats <- merged_data %>%
  group_by(across(all_of(group_col))) %>%
  summarise(
    Median = median(.data[[value_col]], na.rm = TRUE),
    Mean = mean(.data[[value_col]], na.rm = TRUE)
  )
group_stats
## # A tibble: 2 × 3
## `merged_data[[2]]` Median Mean
## <fct> <dbl> <dbl>
## 1 高水平组 4.11 5.08
## 2 低水平组 4.11 5.08
```

## Create New Data Frame

``` r
# Copy of merged_data without its first column (the column summarised by
# median/mean above), leaving the CCDC134 group factor and the clinical
# factors.
new_data <- merged_data[, -1]