Fill NA values using KNN
# Install `impute` only when it is missing. The package is distributed
# through Bioconductor rather than CRAN, so it is installed with BiocManager
# instead of a plain install.packages("impute").
if (!requireNamespace("impute", quietly = TRUE)) {
  if (!requireNamespace("BiocManager", quietly = TRUE)) {
    install.packages("BiocManager")
  }
  BiocManager::install("impute")
}
library(impute)

# Toy input: a 10 x 10 matrix of standard-normal draws with 10 cells
# replaced by NA at random positions.
data <- matrix(rnorm(100), ncol = 10)
data[sample(length(data), 10)] <- NA

# Fill the missing cells using k = 5 nearest neighbours.
# impute.knn() returns a list; the completed matrix is its `data` element,
# so only that element is printed.
imputed_data <- impute.knn(data, k = 5)
print(imputed_data$data)
IQR normalization
# Robust scaling example: centre on the median and divide by the
# interquartile range (Q3 - Q1).
data <- data.frame(value = c(10, 15, 20, 22, 25, 30, 35, 40, 45, 100))

# Both quartiles come from one quantile() call; each keeps its "25%"/"75%" name.
quartiles <- quantile(data[["value"]], probs = c(0.25, 0.75))
Q1 <- quartiles[1]
Q3 <- quartiles[2]
IQR_value <- Q3 - Q1

centered <- data[["value"]] - median(data[["value"]])
data[["IQR_normalized"]] <- centered / IQR_value
print(data)
Read data
# Load the expression table from a comma-separated file with a header row.
# `TRUE` is spelled out because `T` is an ordinary variable that can be
# reassigned.
full_d_sub <- read.table("Expression.csv", header = TRUE, sep = ",")
dim(full_d_sub)
## [1] 242 99
# Interactive preview of the first 10 columns. datatable() is called through
# the DT namespace so this line does not depend on DT being attached.
DT::datatable(full_d_sub[, 1:10])
Differential expression analysis
# Differential expression: one two-sample t-test per feature column, with
# `Group` as the two-level grouping factor.
library(genefilter)

# The first four columns are excluded from the test matrix; every remaining
# column is tested as a feature.
fc <- colttests(
  as.matrix(full_d_sub[, -c(1:4)]),
  as.factor(full_d_sub$Group)
) ### Cancer vs Non Cancer or 1 vs 0
# NOTE(review): `dm` is the difference of the group means, presumably
# mean(first factor level) - mean(second factor level). Confirm that this
# orientation matches the intended "Cancer vs Non Cancer" direction.

# Adjust p.value by BH
fc$FDR <- p.adjust(fc$p.value, method = "BH", n = length(fc$p.value))

# Rename the mean-difference column by name instead of by position, so the
# rename cannot silently hit the wrong column.
# NOTE(review): a difference of means is a log2 fold change only if the
# expression values are already on a log2 scale - TODO confirm.
names(fc)[names(fc) == "dm"] <- "logFC"
fc$Feature <- rownames(fc)
fc$fc_direct <- ifelse(fc$logFC > 0, "up", "down")
# head() shows at most 10 rows and never pads with NA rows.
DT::datatable(head(fc, 10))
# Original genes N
nrow(fc)
## [1] 95

# Keep features with |logFC| above the cutoff and FDR below the cutoff.
# which() drops comparisons that evaluate to NA; plain logical indexing
# would turn each of them into an all-NA row.
lfc_cutoff <- log2(2)
fdr_cutoff <- 0.05
fc_filter <- fc[which(abs(fc$logFC) > lfc_cutoff & fc$FDR < fdr_cutoff), ]
DT::datatable(head(fc_filter, 10))
# N after filter
nrow(fc_filter)
## [1] 25
Volcano plot
# The plotting functions below come from ggplot2 and ggrepel; attach them
# here so this section runs on its own.
library(ggplot2)
library(ggrepel)

# Draw a volcano plot (logFC on x, -log10(FDR) on y).
#   points   data frame with columns logFC, FDR and fc_direct; every row is
#            drawn as a point filled by fc_direct.
#   labelled data frame with columns logFC, FDR, fc_direct and Feature; every
#            row gets a repelled text label.
# Returns the ggplot object without printing it.
volcano_plot <- function(points, labelled) {
  ggplot(points, aes(logFC, -log10(FDR), fill = fc_direct)) +
    geom_point(size = 1, shape = 21, color = "black") +
    # Named values keep "down" blue and "up" red even when only one of the
    # two directions is present in the data.
    scale_fill_manual(values = c(down = "#0072B5FF", up = "#BC3C29FF")) +
    xlab(expression("log"[2] * "FC")) +
    ylab(expression("-log"[10] * "FDR")) +
    geom_label_repel(
      data = labelled,
      # x, y and fill are inherited from the plot-level mapping.
      mapping = aes(label = Feature),
      size = 2, max.overlaps = 50, force = 2
    ) +
    theme_bw()
}

# All genes: every point is labelled.
p <- volcano_plot(fc, fc)
print(p)

# Filtered genes: all points are drawn, only the filtered features are labelled.
p1 <- volcano_plot(fc, fc_filter)
print(p1)
