Summary of Key Findings

Question 1: UCI Confidence Interval Analysis

Example feature (column 3):
The average length of the 1 000 95 % confidence intervals was 1.10. The true difference in means between the two groups was –0.67, and 98.5 % of the replicated intervals contained that true difference—very close to the nominal 95 % level.
All features (columns 2–14):
Average CI lengths ranged from 0.13 (feature 9) up to 251.73 (feature 14), reflecting the varying measurement scales across features. Coverage rates (the percent of CIs containing the true difference) ranged from 96.9 % (feature 11) to 99.0 % (feature 4), confirming well‐calibrated intervals throughout.
Full‐data vs. sampled CIs (feature 3):
The 95 % CI computed on the entire dataset for feature 3 is [–0.48, –0.02], which is noticeably narrower than most of the random‐sample CIs—illustrating how larger sample sizes yield more precise estimates.

Question 2: GEO Linear Model Associations

Strongest predictor:
Gene 3275 had the smallest p‐value (8.05 × 10⁻⁶) with a 95 % slope confidence interval of [0.03, 0.06], which excludes zero and indicates a statistically significant association with age.
Weakest predictor:
Gene 28473 had the largest p‐value (0.99997) with a CI of [–1.90, 1.90], spanning zero and supporting no detectable linear effect.
Overall pattern:
When plotting the 95 % CIs for the top 50 genes by p‐value, most intervals cluster tightly around zero—only a handful extend clearly above or below—highlighting a small subset of genes driving the continuous‐variable association.

Question 1

Load UCI Dataset

# Load a dataset from UCI
url <- "https://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.data"
df  <- read.csv(url, header = FALSE)

# Identify the most frequent class
tab <- table(df$V1)
print(tab)

## 
##  1  2  3 
## 59 71 48

# Partition into group1 (largest class) and group2 (others)
main_class <- as.numeric(names(tab)[which.max(tab)])
group1 <- df[df$V1 == main_class, ]
group2 <- df[df$V1 != main_class, ]

Define CI Function

set.seed(101)
ci1 <- function(col) {
  x <- sample(group1[[col]], 30)
  y <- sample(group2[[col]], 30)
  t <- t.test(x, y, conf.level = 0.95)
  t$conf.int
}

# Example CI for column 3
ci1(3)

## [1] -1.01321417  0.04188083
## attr(,"conf.level")
## [1] 0.95

Replicate Confidence Intervals

# Perform 1000 replicates for column 3
cis   <- replicate(1000, ci1(3))   # 2 × 1000 matrix
lower <- cis[1, ]
upper <- cis[2, ]

# (a) Average interval length
avg_len   <- mean(upper - lower)
# (b) True difference of means
true_diff <- mean(group1[[3]]) - mean(group2[[3]])
# (c) Coverage proportion
coverage  <- mean(lower <= true_diff & upper >= true_diff)

avg_len; true_diff; coverage

## [1] 1.100187

## [1] -0.6715296

## [1] 0.985

Results Across All Features

# Compute for each feature
results <- lapply(2:ncol(df), function(col) {
  cis <- replicate(1000, ci1(col))
  low <- cis[1, ]; up <- cis[2, ]
  tdiff <- mean(group1[[col]]) - mean(group2[[col]])
  list(
    avg_length = mean(up - low),
    coverage   = mean(low <= tdiff & up >= tdiff),
    true_diff  = tdiff
  )
})
names(results) <- paste0("V", 2:ncol(df))
results

## $V2
## $V2$avg_length
## [1] 0.5755087
## 
## $V2$coverage
## [1] 0.983
## 
## $V2$true_diff
## [1] -1.200894
## 
## 
## $V3
## $V3$avg_length
## [1] 1.08926
## 
## $V3$coverage
## [1] 0.984
## 
## $V3$true_diff
## [1] -0.6715296
## 
## 
## $V4
## $V4$avg_length
## [1] 0.2772047
## 
## $V4$coverage
## [1] 0.99
## 
## $V4$true_diff
## [1] -0.202501
## 
## 
## $V5
## $V5$avg_length
## [1] 3.40669
## 
## $V5$coverage
## [1] 0.985
## 
## $V5$true_diff
## [1] 1.236159
## 
## 
## $V6
## $V6$avg_length
## [1] 14.68549
## 
## $V6$coverage
## [1] 0.975
## 
## $V6$true_diff
## [1] -8.63762
## 
## 
## $V7
## $V7$avg_length
## [1] 0.6314293
## 
## $V7$coverage
## [1] 0.987
## 
## $V7$true_diff
## [1] -0.06028564
## 
## 
## $V8
## $V8$avg_length
## [1] 0.9929503
## 
## $V8$coverage
## [1] 0.981
## 
## $V8$true_diff
## [1] 0.08579834
## 
## 
## $V9
## $V9$avg_length
## [1] 0.1284952
## 
## $V9$coverage
## [1] 0.987
## 
## $V9$true_diff
## [1] 0.003007766
## 
## 
## $V10
## $V10$avg_length
## [1] 0.5929412
## 
## $V10$coverage
## [1] 0.988
## 
## $V10$true_diff
## [1] 0.06551534
## 
## 
## $V11
## $V11$avg_length
## [1] 1.626316
## 
## $V11$coverage
## [1] 0.969
## 
## $V11$true_diff
## [1] -3.279642
## 
## 
## $V12
## $V12$avg_length
## [1] 0.2192998
## 
## $V12$coverage
## [1] 0.98
## 
## $V12$true_diff
## [1] 0.1644125
## 
## 
## $V13
## $V13$avg_length
## [1] 0.6914787
## 
## $V13$coverage
## [1] 0.977
## 
## $V13$true_diff
## [1] 0.2889035
## 
## 
## $V14
## $V14$avg_length
## [1] 251.726
## 
## $V14$coverage
## [1] 0.98
## 
## $V14$true_diff
## [1] -378.2687

Full-Data CI Comparison

# Full-data CI for column 3
full_ci <- t.test(group1[[3]], group2[[3]])$conf.int

# Plot sampled CIs vs full-data CI
plot(1:1000, lower, pch = NA,
     ylim = range(c(lower, upper, full_ci)),
     xlab = "Replication", ylab = "CI bounds")
segments(1:1000, lower, 1:1000, upper, col = "grey")
abline(h = full_ci, col = "red", lwd = 2)

Data Preparation

# Load data
metadata <- read.csv("/Users/mdorval/metadata.csv", row.names = 1)
expr_mat <- read.csv("/Users/mdorval/expr_mat.csv", row.names = 1)

# Check dimensions and alignment
dim(expr_mat)

## [1] 57773    82

head(expr_mat[, 1:5])

##                 SKIN_AGE22_M_GR1_Gt073 SKIN_AGE23_M_GR1_Gt022
## ENSG00000000003                   7407                  11212
## ENSG00000000005                     18                      0
## ENSG00000000419                   4198                   6380
## ENSG00000000457                   4895                   5024
## ENSG00000000460                   4677                   4262
## ENSG00000000938                     19                      0
##                 SKIN_AGE25_F_GR1_Gt085 SKIN_AGE25_M_GR1_Gt046
## ENSG00000000003                   4919                   3920
## ENSG00000000005                     18                      0
## ENSG00000000419                   4450                  12458
## ENSG00000000457                   4817                   3210
## ENSG00000000460                   4658                   3200
## ENSG00000000938                     38                     10
##                 SKIN_AGE26_F_GR1_Gt045
## ENSG00000000003                   3823
## ENSG00000000005                      2
## ENSG00000000419                  10722
## ENSG00000000457                   3434
## ENSG00000000460                   3590
## ENSG00000000938                      2

dim(metadata)

## [1] 82  2

head(metadata)

##                                        sample age
## SKIN_AGE22_M_GR1_Gt073 SKIN_AGE22_M_GR1_Gt073  22
## SKIN_AGE23_M_GR1_Gt022 SKIN_AGE23_M_GR1_Gt022  23
## SKIN_AGE25_F_GR1_Gt085 SKIN_AGE25_F_GR1_Gt085  25
## SKIN_AGE25_M_GR1_Gt046 SKIN_AGE25_M_GR1_Gt046  25
## SKIN_AGE26_F_GR1_Gt045 SKIN_AGE26_F_GR1_Gt045  26
## SKIN_AGE27_F_GR1_Gt055 SKIN_AGE27_F_GR1_Gt055  27

# Verify alignment between expression matrix and metadata
stopifnot(all(colnames(expr_mat) == rownames(metadata)))

Load required data from the saved CSV files

metadata <- read.csv(“/Users/mdorval/metadata.csv”, row.names = 1) expr_mat <- read.csv(“/Users/mdorval/expr_mat.csv”, row.names = 1)

Check dimensions and alignment

dim(expr_mat) head(expr_mat[, 1:5]) # Preview first few genes for the first 5 samples

dim(metadata) head(metadata) # Preview metadata

Verify alignment between expression matrix and metadata

stopifnot(all(colnames(expr_mat) == rownames(metadata)))


---

## Fit Linear Models


``` r
# Fit one linear model per gene to predict age based on expression levels
skipped <- 0
res <- t(apply(expr_mat, 1, function(x) {
  if (var(x) == 0) {
    skipped <<- skipped + 1
    return(c(lower = NA, upper = NA, pvalue = NA))
  }
  tryCatch({
    fit <- lm(metadata$age ~ x)
    ci  <- confint(fit)[2, ]
    p   <- summary(fit)$coefficients[2, 4]
    c(lower = ci[1], upper = ci[2], pvalue = p)
  }, error = function(e) {
    c(lower = NA, upper = NA, pvalue = NA)
  })
}))
print(paste("Skipped rows due to zero variance:", skipped))

## [1] "Skipped rows due to zero variance: 2147"

# Convert results to a data frame
res_df <- as.data.frame(res, stringsAsFactors = FALSE)
colnames(res_df) <- c("lower", "upper", "pvalue")
res_df$gene <- rownames(expr_mat)

# Filter out rows with NA p-values
res_df <- res_df %>% filter(!is.na(pvalue))

# Sort by p-value
res_df <- res_df %>% arrange(pvalue)

Best and Worst Predictors

# Best predictor (lowest p-value)
best <- res_df[1, ]
print("Best Predictor:")

## [1] "Best Predictor:"

print(best)

##                      lower     upper       pvalue            gene
## ENSG00000106069 0.02619654 0.0636573 8.045929e-06 ENSG00000106069

# Worst predictor (highest p-value)
worst <- res_df[nrow(res_df), ]
print("Worst Predictor:")

## [1] "Worst Predictor:"

print(worst)

##                     lower    upper    pvalue            gene
## ENSG00000226508 -1.901455 1.901515 0.9999749 ENSG00000226508

Plot Confidence Intervals

# Plot the 95% confidence intervals for the slope
plot(res_df$lower, type = "l",
     ylim = range(res_df$lower, res_df$upper),
     xlab = "Genes (sorted by increasing p-value)",
     ylab = "95% CI for slope")
lines(res_df$upper, lty = 2)
abline(h = 0, col = "grey50")
legend("topright",
       legend = c("Lower bound", "Upper bound"),
       lty = c(1, 2),
       bty = "n")

RBIF 111 Homework 4

Megan Dorval

2025-04-29