Question 1

# Setup
set.seed(42)  # ensure reproducibility of random sampling
suppressPackageStartupMessages({
  library(tidyverse)   # tidy data manipulation (dplyr, tibble)
})

# Download Pima Indians Diabetes dataset from GitHub
pima <- read.csv(
  "https://raw.githubusercontent.com/plotly/datasets/master/diabetes.csv"
)

# Convert outcome columnmto a factor with labels
pima$Outcome <- factor(pima$Outcome, levels = c(0,1), labels = c("neg","pos"))

# Store names of numeric feature columns (exclude 'Outcome')
feat_cols <- names(pima)[names(pima) != "Outcome"]

# Function to run 1000 random-sample t.tests for one feature
resample_ci_feature <- function(df, feature, n = 30, reps = 1000) {
  # Subset numeric values for each group
  g0 <- df %>% filter(Outcome == "neg") %>% pull(all_of(feature))
  g1 <- df %>% filter(Outcome == "pos") %>% pull(all_of(feature))
  
  # Calculate population-level difference (mean of g1 - mean of g0)
  pop_diff <- mean(g1, na.rm = TRUE) - mean(g0, na.rm = TRUE)
  
  # Compute CI 
  full_ci <- t.test(g1, g0, var.equal = FALSE)$conf.int
  
  # Function to run a single replicate
  one_rep <- function() {
    # Draw random samples of size n from each group with replacement
    x0 <- sample(g0, n, replace = TRUE)
    x1 <- sample(g1, n, replace = TRUE)
    # Run Welch’s t-test for difference in means
    tt <- t.test(x1, x0, var.equal = FALSE)
    # Return lower and upper CI bounds
    c(lwr = tt$conf.int[1], upr = tt$conf.int[2])
  }
  
  # Run 1000 replicates. Collect CIs into a matrix
  ci_mat <- replicate(reps, one_rep())
  
  # Convert matrix to tibble for easier use
  cis <- tibble(
    rep = 1:reps,             # replicate ID
    lwr = ci_mat["lwr",],     # lower bound
    upr = ci_mat["upr",],     # upper bound
    len = upr - lwr,          # CI length
    covers = (lwr <= pop_diff & upr >= pop_diff), # whether CI covers true diff
    feature = feature         # record which feature this is for
  )
  
  # Return both results and summary stats
  list(
    cis = cis,                # all replicate CI results
    pop_diff = pop_diff,      # true population mean difference
    full_ci = full_ci         # full dataset CI
  )
}

# Run analysis for all features 
all_results <- lapply(feat_cols, function(f) resample_ci_feature(pima, f, n = 30, reps = 1000))

# Combine replicate CIs for all features into one tibble
cis_df <- bind_rows(lapply(all_results, `[[`, "cis"))

# Collect population-level diffs and full-dataset CIs into one tibble
pop_df <- tibble(
  feature = feat_cols,
  pop_diff = sapply(all_results, `[[`, "pop_diff"),
  full_lwr = sapply(all_results, function(x) x$full_ci[1]),
  full_upr = sapply(all_results, function(x) x$full_ci[2])
)

Summaries:

# a) Average CI length per feature
avg_len <- cis_df %>% group_by(feature) %>% summarise(avg_length = mean(len))
print(avg_len)
## # A tibble: 8 × 2
##   feature                  avg_length
##   <chr>                         <dbl>
## 1 Age                          11.6  
## 2 BMI                           7.66 
## 3 BloodPressure                20.3  
## 4 DiabetesPedigreeFunction      0.344
## 5 Glucose                      30.1  
## 6 Insulin                     123.   
## 7 Pregnancies                   3.51 
## 8 SkinThickness                16.8
# b) Population-level difference per feature
print(pop_df %>% select(feature, pop_diff))
## # A tibble: 8 × 2
##   feature                  pop_diff
##   <chr>                       <dbl>
## 1 Pregnancies                 1.57 
## 2 Glucose                    31.3  
## 3 BloodPressure               2.64 
## 4 SkinThickness               2.50 
## 5 Insulin                    31.5  
## 6 BMI                         4.84 
## 7 DiabetesPedigreeFunction    0.121
## 8 Age                         5.88
# c) Empirical coverage proportion per feature (fraction of replicates covering true diff)
coverage <- cis_df %>% group_by(feature) %>% summarise(coverage = mean(covers))
print(coverage)
## # A tibble: 8 × 2
##   feature                  coverage
##   <chr>                       <dbl>
## 1 Age                         0.937
## 2 BMI                         0.942
## 3 BloodPressure               0.953
## 4 DiabetesPedigreeFunction    0.943
## 5 Glucose                     0.958
## 6 Insulin                     0.961
## 7 Pregnancies                 0.955
## 8 SkinThickness               0.942

Plots:

# Show 1000 CIs with population diff line
p1 <- cis_df %>%
  left_join(pop_df, by = "feature") %>%
  ggplot(aes(y = rep, x = lwr, xend = upr)) +
  geom_segment(aes(yend = rep), alpha = 0.3) +   # each CI drawn as a horizontal line
  geom_vline(aes(xintercept = pop_diff), linetype = 2) +  # dashed vertical line at true diff
  facet_wrap(~feature, scales = "free_x") +       # one panel per feature, scales independent
  labs(title = "1000 resampled 95% CIs per feature (Pima)",
       x = "Difference in means (pos - neg)", y = "Replication") +
  theme_bw()
print(p1)

# Show overlay average resampled CI and full-dataset CI 
p2 <- cis_df %>%
  group_by(feature) %>%
  summarise(mean_lwr = mean(lwr), mean_upr = mean(upr)) %>% # average CI bounds
  left_join(pop_df, by = "feature") %>%
  ggplot(aes(y = feature)) +
  geom_linerange(aes(xmin = mean_lwr, xmax = mean_upr)) +  # average resampled CI
  geom_linerange(aes(xmin = full_lwr, xmax = full_upr)) +  # CI from full data
  geom_point(aes(x = pop_diff), shape = 4, stroke = 1.1) + # "true" population diff marked with X
  labs(title = "Full vs. average resampled CI per feature",
       x = "Difference in means (pos - neg)", y = NULL) +
  theme_bw()
print(p2)

Question 2

# Setup
suppressPackageStartupMessages({
  library(GEOquery)             # for downloading GEO datasets
  library(tidyverse)            # data wrangling + ggplot
  library(readr)                # for raw count 
})

# Download GEO dataset
gse <- getGEO("GSE152418", GSEMatrix = TRUE)
## Found 1 file(s)
## GSE152418_series_matrix.txt.gz
# If multiple ExpressionSets, just take the first
if (length(gse) > 1) {
  eset <- gse[[1]]
} else {
  eset <- gse[[1]]
}

# Extract phenotype 
pheno <- pData(eset) # sample annotations
pheno <- pheno[!is.na( pheno$`days_post_symptom_onset:ch1`), ] # continuous variable

# Extract Raw Count

sra <- read_csv("SraRunTable.csv")   
## Rows: 68 Columns: 32
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (25): Run, Assay Type, BioProject, BioSample, cell_type, Center Name, C...
## dbl   (5): AvgSpotLen, Bases, Bytes, days_post_symptom_onset, version
## dttm  (2): ReleaseDate, create_date
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Look at the columns and map
colnames(sra)
##  [1] "Run"                     "Assay Type"             
##  [3] "AvgSpotLen"              "Bases"                  
##  [5] "BioProject"              "BioSample"              
##  [7] "Bytes"                   "cell_type"              
##  [9] "Center Name"             "Consent"                
## [11] "DATASTORE filetype"      "DATASTORE provider"     
## [13] "DATASTORE region"        "days_post_symptom_onset"
## [15] "disease_state"           "Experiment"             
## [17] "gender"                  "GEO_Accession (exp)"    
## [19] "geographical_location"   "Instrument"             
## [21] "LibraryLayout"           "LibrarySelection"       
## [23] "LibrarySource"           "Organism"               
## [25] "Platform"                "ReleaseDate"            
## [27] "create_date"             "version"                
## [29] "Sample Name"             "Severity"               
## [31] "source_name"             "SRA Study"
map <- sra %>% select("Sample Name", "GEO_Accession (exp)")

# Keep only the overlap
common <- intersect(colnames(expr), map$`GEO_Accession (exp)`)

# Reorder pheno to match
pheno <- pheno[map$`GEO_Accession (exp)`[match(common, map$`geo_accession`)], ]
## Warning: Unknown or uninitialised column: `geo_accession`.
pheno <- pheno[common, ]



# Align expression to GSMs
expr  <- Biobase::exprs(eset)
expr <- expr[, common]

Define gene model:

Y <- pheno
fit_one <- function(y, x) {
  df <- data.frame(Y = y, expr = x)
  if (length(unique(df$expr)) < 2) {
    return(c(beta = NA, lwr = NA, upr = NA, pval = NA))
  }
  m <- lm(y ~ expr, data = df)
  beta <- coef(m)["expr"]
  ci   <- confint(m, "expr", level = 0.95)
  pval <- summary(m)$coefficients["expr", "Pr(>|t|)"]
  c(beta = beta, lwr = ci[1], upr = ci[2], pval = pval)
}

# Run models for all genes
out <- t(apply(expr, 1, function(x) fit_one(Y, x)))

res <- as.data.frame(out)
res$gene <- rownames(res)
rownames(res) <- NULL

Visualize:

# Top 30 
top_genes <- res %>% slice_head(n = 30)

# Plot 
p_ci <- top_genes %>%
  ggplot(aes(y = gene, x = beta)) +
  geom_point() +
  geom_errorbarh(aes(xmin = lwr, xmax = upr), height = 0) +
  geom_vline(xintercept = 0, linetype = 2) +
  labs(title = "Top genes: slope of age ~ expression (95% CI)",
       x = "Slope (years per 1-unit expression)", y = NULL) +
  theme_bw()
## Warning: `geom_errobarh()` was deprecated in ggplot2 4.0.0.
## ℹ Please use the `orientation` argument of `geom_errorbar()` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.