# Load a dataset from UCI
url <- "https://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.data"
df <- read.csv(url, header = FALSE)
# Identify the most frequent class
tab <- table(df$V1)
print(tab)
##
## 1 2 3
## 59 71 48
# Partition into group1 (largest class) and group2 (others)
main_class <- as.numeric(names(tab)[which.max(tab)])
group1 <- df[df$V1 == main_class, ]
group2 <- df[df$V1 != main_class, ]
set.seed(101)
ci1 <- function(col) {
x <- sample(group1[[col]], 30)
y <- sample(group2[[col]], 30)
t <- t.test(x, y, conf.level = 0.95)
t$conf.int
}
# Example CI for column 3
ci1(3)
## [1] -1.01321417 0.04188083
## attr(,"conf.level")
## [1] 0.95
# Perform 1000 replicates for column 3
cis <- replicate(1000, ci1(3)) # 2 × 1000 matrix
lower <- cis[1, ]
upper <- cis[2, ]
# (a) Average interval length
avg_len <- mean(upper - lower)
# (b) True difference of means
true_diff <- mean(group1[[3]]) - mean(group2[[3]])
# (c) Coverage proportion
coverage <- mean(lower <= true_diff & upper >= true_diff)
avg_len; true_diff; coverage
## [1] 1.100187
## [1] -0.6715296
## [1] 0.985
# Compute for each feature
results <- lapply(2:ncol(df), function(col) {
cis <- replicate(1000, ci1(col))
low <- cis[1, ]; up <- cis[2, ]
tdiff <- mean(group1[[col]]) - mean(group2[[col]])
list(
avg_length = mean(up - low),
coverage = mean(low <= tdiff & up >= tdiff),
true_diff = tdiff
)
})
names(results) <- paste0("V", 2:ncol(df))
results
## $V2
## $V2$avg_length
## [1] 0.5755087
##
## $V2$coverage
## [1] 0.983
##
## $V2$true_diff
## [1] -1.200894
##
##
## $V3
## $V3$avg_length
## [1] 1.08926
##
## $V3$coverage
## [1] 0.984
##
## $V3$true_diff
## [1] -0.6715296
##
##
## $V4
## $V4$avg_length
## [1] 0.2772047
##
## $V4$coverage
## [1] 0.99
##
## $V4$true_diff
## [1] -0.202501
##
##
## $V5
## $V5$avg_length
## [1] 3.40669
##
## $V5$coverage
## [1] 0.985
##
## $V5$true_diff
## [1] 1.236159
##
##
## $V6
## $V6$avg_length
## [1] 14.68549
##
## $V6$coverage
## [1] 0.975
##
## $V6$true_diff
## [1] -8.63762
##
##
## $V7
## $V7$avg_length
## [1] 0.6314293
##
## $V7$coverage
## [1] 0.987
##
## $V7$true_diff
## [1] -0.06028564
##
##
## $V8
## $V8$avg_length
## [1] 0.9929503
##
## $V8$coverage
## [1] 0.981
##
## $V8$true_diff
## [1] 0.08579834
##
##
## $V9
## $V9$avg_length
## [1] 0.1284952
##
## $V9$coverage
## [1] 0.987
##
## $V9$true_diff
## [1] 0.003007766
##
##
## $V10
## $V10$avg_length
## [1] 0.5929412
##
## $V10$coverage
## [1] 0.988
##
## $V10$true_diff
## [1] 0.06551534
##
##
## $V11
## $V11$avg_length
## [1] 1.626316
##
## $V11$coverage
## [1] 0.969
##
## $V11$true_diff
## [1] -3.279642
##
##
## $V12
## $V12$avg_length
## [1] 0.2192998
##
## $V12$coverage
## [1] 0.98
##
## $V12$true_diff
## [1] 0.1644125
##
##
## $V13
## $V13$avg_length
## [1] 0.6914787
##
## $V13$coverage
## [1] 0.977
##
## $V13$true_diff
## [1] 0.2889035
##
##
## $V14
## $V14$avg_length
## [1] 251.726
##
## $V14$coverage
## [1] 0.98
##
## $V14$true_diff
## [1] -378.2687
# Full-data CI for column 3
full_ci <- t.test(group1[[3]], group2[[3]])$conf.int
# Plot sampled CIs vs full-data CI
plot(1:1000, lower, pch = NA,
ylim = range(c(lower, upper, full_ci)),
xlab = "Replication", ylab = "CI bounds")
segments(1:1000, lower, 1:1000, upper, col = "grey")
abline(h = full_ci, col = "red", lwd = 2)
# Load data
metadata <- read.csv("/Users/mdorval/metadata.csv", row.names = 1)
expr_mat <- read.csv("/Users/mdorval/expr_mat.csv", row.names = 1)
# Check dimensions and alignment
dim(expr_mat)
## [1] 57773 82
head(expr_mat[, 1:5])
## SKIN_AGE22_M_GR1_Gt073 SKIN_AGE23_M_GR1_Gt022
## ENSG00000000003 7407 11212
## ENSG00000000005 18 0
## ENSG00000000419 4198 6380
## ENSG00000000457 4895 5024
## ENSG00000000460 4677 4262
## ENSG00000000938 19 0
## SKIN_AGE25_F_GR1_Gt085 SKIN_AGE25_M_GR1_Gt046
## ENSG00000000003 4919 3920
## ENSG00000000005 18 0
## ENSG00000000419 4450 12458
## ENSG00000000457 4817 3210
## ENSG00000000460 4658 3200
## ENSG00000000938 38 10
## SKIN_AGE26_F_GR1_Gt045
## ENSG00000000003 3823
## ENSG00000000005 2
## ENSG00000000419 10722
## ENSG00000000457 3434
## ENSG00000000460 3590
## ENSG00000000938 2
dim(metadata)
## [1] 82 2
head(metadata)
## sample age
## SKIN_AGE22_M_GR1_Gt073 SKIN_AGE22_M_GR1_Gt073 22
## SKIN_AGE23_M_GR1_Gt022 SKIN_AGE23_M_GR1_Gt022 23
## SKIN_AGE25_F_GR1_Gt085 SKIN_AGE25_F_GR1_Gt085 25
## SKIN_AGE25_M_GR1_Gt046 SKIN_AGE25_M_GR1_Gt046 25
## SKIN_AGE26_F_GR1_Gt045 SKIN_AGE26_F_GR1_Gt045 26
## SKIN_AGE27_F_GR1_Gt055 SKIN_AGE27_F_GR1_Gt055 27
# Verify alignment between expression matrix and metadata
stopifnot(all(colnames(expr_mat) == rownames(metadata)))
metadata <- read.csv(“/Users/mdorval/metadata.csv”, row.names = 1) expr_mat <- read.csv(“/Users/mdorval/expr_mat.csv”, row.names = 1)
dim(expr_mat) head(expr_mat[, 1:5]) # Preview first few genes for the first 5 samples
dim(metadata) head(metadata) # Preview metadata
stopifnot(all(colnames(expr_mat) == rownames(metadata)))
---
## Fit Linear Models
``` r
# Fit one linear model per gene to predict age based on expression levels
skipped <- 0
res <- t(apply(expr_mat, 1, function(x) {
if (var(x) == 0) {
skipped <<- skipped + 1
return(c(lower = NA, upper = NA, pvalue = NA))
}
tryCatch({
fit <- lm(metadata$age ~ x)
ci <- confint(fit)[2, ]
p <- summary(fit)$coefficients[2, 4]
c(lower = ci[1], upper = ci[2], pvalue = p)
}, error = function(e) {
c(lower = NA, upper = NA, pvalue = NA)
})
}))
print(paste("Skipped rows due to zero variance:", skipped))
## [1] "Skipped rows due to zero variance: 2147"
# Convert results to a data frame
res_df <- as.data.frame(res, stringsAsFactors = FALSE)
colnames(res_df) <- c("lower", "upper", "pvalue")
res_df$gene <- rownames(expr_mat)
# Filter out rows with NA p-values
res_df <- res_df %>% filter(!is.na(pvalue))
# Sort by p-value
res_df <- res_df %>% arrange(pvalue)
# Best predictor (lowest p-value)
best <- res_df[1, ]
print("Best Predictor:")
## [1] "Best Predictor:"
print(best)
## lower upper pvalue gene
## ENSG00000106069 0.02619654 0.0636573 8.045929e-06 ENSG00000106069
# Worst predictor (highest p-value)
worst <- res_df[nrow(res_df), ]
print("Worst Predictor:")
## [1] "Worst Predictor:"
print(worst)
## lower upper pvalue gene
## ENSG00000226508 -1.901455 1.901515 0.9999749 ENSG00000226508
# Plot the 95% confidence intervals for the slope
plot(res_df$lower, type = "l",
ylim = range(res_df$lower, res_df$upper),
xlab = "Genes (sorted by increasing p-value)",
ylab = "95% CI for slope")
lines(res_df$upper, lty = 2)
abline(h = 0, col = "grey50")
legend("topright",
legend = c("Lower bound", "Upper bound"),
lty = c(1, 2),
bty = "n")