# Setup
set.seed(42) # ensure reproducibility of random sampling
suppressPackageStartupMessages({
library(tidyverse) # tidy data manipulation (dplyr, tibble)
})
# Download Pima Indians Diabetes dataset from GitHub
pima <- read.csv(
"https://raw.githubusercontent.com/plotly/datasets/master/diabetes.csv"
)
# Convert outcome columnmto a factor with labels
pima$Outcome <- factor(pima$Outcome, levels = c(0,1), labels = c("neg","pos"))
# Store names of numeric feature columns (exclude 'Outcome')
feat_cols <- names(pima)[names(pima) != "Outcome"]
# Function to run 1000 random-sample t.tests for one feature
resample_ci_feature <- function(df, feature, n = 30, reps = 1000) {
# Subset numeric values for each group
g0 <- df %>% filter(Outcome == "neg") %>% pull(all_of(feature))
g1 <- df %>% filter(Outcome == "pos") %>% pull(all_of(feature))
# Calculate population-level difference (mean of g1 - mean of g0)
pop_diff <- mean(g1, na.rm = TRUE) - mean(g0, na.rm = TRUE)
# Compute CI
full_ci <- t.test(g1, g0, var.equal = FALSE)$conf.int
# Function to run a single replicate
one_rep <- function() {
# Draw random samples of size n from each group with replacement
x0 <- sample(g0, n, replace = TRUE)
x1 <- sample(g1, n, replace = TRUE)
# Run Welch’s t-test for difference in means
tt <- t.test(x1, x0, var.equal = FALSE)
# Return lower and upper CI bounds
c(lwr = tt$conf.int[1], upr = tt$conf.int[2])
}
# Run 1000 replicates. Collect CIs into a matrix
ci_mat <- replicate(reps, one_rep())
# Convert matrix to tibble for easier use
cis <- tibble(
rep = 1:reps, # replicate ID
lwr = ci_mat["lwr",], # lower bound
upr = ci_mat["upr",], # upper bound
len = upr - lwr, # CI length
covers = (lwr <= pop_diff & upr >= pop_diff), # whether CI covers true diff
feature = feature # record which feature this is for
)
# Return both results and summary stats
list(
cis = cis, # all replicate CI results
pop_diff = pop_diff, # true population mean difference
full_ci = full_ci # full dataset CI
)
}
# Run analysis for all features
all_results <- lapply(feat_cols, function(f) resample_ci_feature(pima, f, n = 30, reps = 1000))
# Combine replicate CIs for all features into one tibble
cis_df <- bind_rows(lapply(all_results, `[[`, "cis"))
# Collect population-level diffs and full-dataset CIs into one tibble
pop_df <- tibble(
feature = feat_cols,
pop_diff = sapply(all_results, `[[`, "pop_diff"),
full_lwr = sapply(all_results, function(x) x$full_ci[1]),
full_upr = sapply(all_results, function(x) x$full_ci[2])
)
Summaries:
# a) Average CI length per feature
avg_len <- cis_df %>% group_by(feature) %>% summarise(avg_length = mean(len))
print(avg_len)
## # A tibble: 8 × 2
## feature avg_length
## <chr> <dbl>
## 1 Age 11.6
## 2 BMI 7.66
## 3 BloodPressure 20.3
## 4 DiabetesPedigreeFunction 0.344
## 5 Glucose 30.1
## 6 Insulin 123.
## 7 Pregnancies 3.51
## 8 SkinThickness 16.8
# b) Population-level difference per feature
print(pop_df %>% select(feature, pop_diff))
## # A tibble: 8 × 2
## feature pop_diff
## <chr> <dbl>
## 1 Pregnancies 1.57
## 2 Glucose 31.3
## 3 BloodPressure 2.64
## 4 SkinThickness 2.50
## 5 Insulin 31.5
## 6 BMI 4.84
## 7 DiabetesPedigreeFunction 0.121
## 8 Age 5.88
# c) Empirical coverage proportion per feature (fraction of replicates covering true diff)
coverage <- cis_df %>% group_by(feature) %>% summarise(coverage = mean(covers))
print(coverage)
## # A tibble: 8 × 2
## feature coverage
## <chr> <dbl>
## 1 Age 0.937
## 2 BMI 0.942
## 3 BloodPressure 0.953
## 4 DiabetesPedigreeFunction 0.943
## 5 Glucose 0.958
## 6 Insulin 0.961
## 7 Pregnancies 0.955
## 8 SkinThickness 0.942
Plots:
# Show 1000 CIs with population diff line
p1 <- cis_df %>%
left_join(pop_df, by = "feature") %>%
ggplot(aes(y = rep, x = lwr, xend = upr)) +
geom_segment(aes(yend = rep), alpha = 0.3) + # each CI drawn as a horizontal line
geom_vline(aes(xintercept = pop_diff), linetype = 2) + # dashed vertical line at true diff
facet_wrap(~feature, scales = "free_x") + # one panel per feature, scales independent
labs(title = "1000 resampled 95% CIs per feature (Pima)",
x = "Difference in means (pos - neg)", y = "Replication") +
theme_bw()
print(p1)
# Show overlay average resampled CI and full-dataset CI
p2 <- cis_df %>%
group_by(feature) %>%
summarise(mean_lwr = mean(lwr), mean_upr = mean(upr)) %>% # average CI bounds
left_join(pop_df, by = "feature") %>%
ggplot(aes(y = feature)) +
geom_linerange(aes(xmin = mean_lwr, xmax = mean_upr)) + # average resampled CI
geom_linerange(aes(xmin = full_lwr, xmax = full_upr)) + # CI from full data
geom_point(aes(x = pop_diff), shape = 4, stroke = 1.1) + # "true" population diff marked with X
labs(title = "Full vs. average resampled CI per feature",
x = "Difference in means (pos - neg)", y = NULL) +
theme_bw()
print(p2)
# Setup
suppressPackageStartupMessages({
library(GEOquery) # for downloading GEO datasets
library(tidyverse) # data wrangling + ggplot
library(readr) # for raw count
})
# Download GEO dataset
gse <- getGEO("GSE152418", GSEMatrix = TRUE)
## Found 1 file(s)
## GSE152418_series_matrix.txt.gz
# If multiple ExpressionSets, just take the first
if (length(gse) > 1) {
eset <- gse[[1]]
} else {
eset <- gse[[1]]
}
# Extract phenotype
pheno <- pData(eset) # sample annotations
pheno <- pheno[!is.na( pheno$`days_post_symptom_onset:ch1`), ] # continuous variable
# Extract Raw Count
sra <- read_csv("SraRunTable.csv")
## Rows: 68 Columns: 32
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (25): Run, Assay Type, BioProject, BioSample, cell_type, Center Name, C...
## dbl (5): AvgSpotLen, Bases, Bytes, days_post_symptom_onset, version
## dttm (2): ReleaseDate, create_date
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Look at the columns and map
colnames(sra)
## [1] "Run" "Assay Type"
## [3] "AvgSpotLen" "Bases"
## [5] "BioProject" "BioSample"
## [7] "Bytes" "cell_type"
## [9] "Center Name" "Consent"
## [11] "DATASTORE filetype" "DATASTORE provider"
## [13] "DATASTORE region" "days_post_symptom_onset"
## [15] "disease_state" "Experiment"
## [17] "gender" "GEO_Accession (exp)"
## [19] "geographical_location" "Instrument"
## [21] "LibraryLayout" "LibrarySelection"
## [23] "LibrarySource" "Organism"
## [25] "Platform" "ReleaseDate"
## [27] "create_date" "version"
## [29] "Sample Name" "Severity"
## [31] "source_name" "SRA Study"
map <- sra %>% select("Sample Name", "GEO_Accession (exp)")
# Keep only the overlap
common <- intersect(colnames(expr), map$`GEO_Accession (exp)`)
# Reorder pheno to match
pheno <- pheno[map$`GEO_Accession (exp)`[match(common, map$`geo_accession`)], ]
## Warning: Unknown or uninitialised column: `geo_accession`.
pheno <- pheno[common, ]
# Align expression to GSMs
expr <- Biobase::exprs(eset)
expr <- expr[, common]
Define gene model:
Y <- pheno
fit_one <- function(y, x) {
df <- data.frame(Y = y, expr = x)
if (length(unique(df$expr)) < 2) {
return(c(beta = NA, lwr = NA, upr = NA, pval = NA))
}
m <- lm(y ~ expr, data = df)
beta <- coef(m)["expr"]
ci <- confint(m, "expr", level = 0.95)
pval <- summary(m)$coefficients["expr", "Pr(>|t|)"]
c(beta = beta, lwr = ci[1], upr = ci[2], pval = pval)
}
# Run models for all genes
out <- t(apply(expr, 1, function(x) fit_one(Y, x)))
res <- as.data.frame(out)
res$gene <- rownames(res)
rownames(res) <- NULL
Visualize:
# Top 30
top_genes <- res %>% slice_head(n = 30)
# Plot
p_ci <- top_genes %>%
ggplot(aes(y = gene, x = beta)) +
geom_point() +
geom_errorbarh(aes(xmin = lwr, xmax = upr), height = 0) +
geom_vline(xintercept = 0, linetype = 2) +
labs(title = "Top genes: slope of age ~ expression (95% CI)",
x = "Slope (years per 1-unit expression)", y = NULL) +
theme_bw()
## Warning: `geom_errobarh()` was deprecated in ggplot2 4.0.0.
## ℹ Please use the `orientation` argument of `geom_errorbar()` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.