df <- read.csv("synthetic_microdata_01.csv")
library(dplyr)
cs_econ <- df %>%
filter(major %in% c("CS", "Econ"))
cs_econ %>%
group_by(major) %>%
summarise(
sample_size = n(),
mean_gpa = mean(gpa, na.rm = TRUE),
sd_gpa = sd(gpa, na.rm = TRUE)
)
## # A tibble: 2 × 4
## major sample_size mean_gpa sd_gpa
## <chr> <int> <dbl> <dbl>
## 1 CS 166416 3.36 0.262
## 2 Econ 169098 3.42 0.260
cs_stats <- cs_econ %>% filter(major == "CS")
econ_stats <- cs_econ %>% filter(major == "Econ")
mean_cs <- mean(cs_stats$gpa, na.rm = TRUE)
mean_econ <- mean(econ_stats$gpa, na.rm = TRUE)
sd_cs <- sd(cs_stats$gpa, na.rm = TRUE)
sd_econ <- sd(econ_stats$gpa, na.rm = TRUE)
n_cs <- nrow(cs_stats)
n_econ <- nrow(econ_stats)
# Difference in means
delta_hat <- mean_cs - mean_econ
# Welch SE
se <- sqrt((sd_cs^2 / n_cs) + (sd_econ^2 / n_econ))
# 95% CI
lower <- delta_hat - 1.96 * se
upper <- delta_hat + 1.96 * se
part_b <- tibble(
statistic = c("Difference in means (CS - Econ)", "Welch SE", "95% CI lower", "95% CI upper"),
value = c(delta_hat, se, lower, upper)
)
part_b
## # A tibble: 4 × 2
## statistic value
## <chr> <dbl>
## 1 Difference in means (CS - Econ) -0.0642
## 2 Welch SE 0.000900
## 3 95% CI lower -0.0660
## 4 95% CI upper -0.0625
# Define success = internship_offer == "Yes"
cs_econ <- cs_econ %>%
mutate(success = internship_offer == "Yes")
# Proportions
prop_table <- cs_econ %>%
group_by(major) %>%
summarise(
n = n(),
success_rate = mean(success, na.rm = TRUE)
)
prop_table
## # A tibble: 2 × 3
## major n success_rate
## <chr> <int> <dbl>
## 1 CS 166416 0.348
## 2 Econ 169098 0.266
# Extract values
p_cs <- prop_table$success_rate[prop_table$major == "CS"]
p_econ <- prop_table$success_rate[prop_table$major == "Econ"]
# Difference and lift
diff <- p_cs - p_econ
lift <- diff / p_econ
tibble(
statistic = c("p_CS", "p_Econ", "Difference (CS - Econ)", "Lift (CS relative to Econ)"),
value = c(p_cs, p_econ, diff, lift)
)
## # A tibble: 4 × 2
## statistic value
## <chr> <dbl>
## 1 p_CS 0.348
## 2 p_Econ 0.266
## 3 Difference (CS - Econ) 0.0816
## 4 Lift (CS relative to Econ) 0.306
# Sample sizes
n_cs <- sum(cs_econ$major == "CS")
n_econ <- sum(cs_econ$major == "Econ")
# Proportions (already computed but reuse cleanly)
p_cs <- mean(cs_econ$success[cs_econ$major == "CS"])
p_econ <- mean(cs_econ$success[cs_econ$major == "Econ"])
# Difference
diff <- p_cs - p_econ
# --- UNPOOLED SE (for CI)
se_unpooled <- sqrt(
(p_cs * (1 - p_cs) / n_cs) +
(p_econ * (1 - p_econ) / n_econ)
)
# 95% CI
lower <- diff - 1.96 * se_unpooled
upper <- diff + 1.96 * se_unpooled
# --- POOLED SE (for hypothesis test)
p_pool <- (
sum(cs_econ$success[cs_econ$major == "CS"]) +
sum(cs_econ$success[cs_econ$major == "Econ"])
) / (n_cs + n_econ)
se_pooled <- sqrt(
p_pool * (1 - p_pool) * (1/n_cs + 1/n_econ)
)
# z-stat
z <- diff / se_pooled
# Clean output
tibble(
statistic = c(
"Difference (CS - Econ)",
"Unpooled SE",
"95% CI lower",
"95% CI upper",
"Pooled proportion",
"Pooled SE",
"z-stat"
),
value = c(diff, se_unpooled, lower, upper, p_pool, se_pooled, z)
)
## # A tibble: 7 × 2
## statistic value
## <chr> <dbl>
## 1 Difference (CS - Econ) 0.0816
## 2 Unpooled SE 0.00159
## 3 95% CI lower 0.0785
## 4 95% CI upper 0.0847
## 5 Pooled proportion 0.307
## 6 Pooled SE 0.00159
## 7 z-stat 51.2
Looking at strictly CS and Econ majors, CS majors have a slightly lower GPA on average. CS majors average about 0.064 less GPA points than Econ majors across the data set. This difference is statistically significant in the 95% confidence interval [-0.066, -0.0625], as 0 is not included, the difference does appear statistically significant, however, this is not very practically significant because 0.064 is a very small difference in each individual GPA. Looking at the internship rates, a large difference is presented. CS majors have a higher internship-offer rate than Econ majors by about 8.16 percentage points, corresponding to about a 30.6% relative lift. This difference is statistically significant as the 95% confidence interval is [0.0785, 0.0847], and isn’t even close to including 0, so there is definitely a statistically significant difference. This difference is also practically significant, a 30.6% lift / 8.16 percentage points difference is a large noticeable difference in real life, there should be an observably higher rate of CS majors with internships compares to Econ majors. We cannot say anything causal about this relationship.
nyc_rural <- df %>%
filter(city_type %in% c("NYC", "Rural"))
summary_stats <- nyc_rural %>%
group_by(city_type) %>%
summarise(
sample_size = n(),
mean_commute = mean(commute_minutes, na.rm = TRUE),
median_commute = median(commute_minutes, na.rm = TRUE)
)
summary_stats
## # A tibble: 2 × 4
## city_type sample_size mean_commute median_commute
## <chr> <int> <dbl> <dbl>
## 1 NYC 209701 21.3 20.1
## 2 Rural 119799 38.7 32.4
tibble(
statistic = c("Mean (Rural - NYC)", "Median (Rural - NYC)"),
value = c(
summary_stats$mean_commute[summary_stats$city_type == "Rural"] -
summary_stats$mean_commute[summary_stats$city_type == "NYC"],
summary_stats$median_commute[summary_stats$city_type == "Rural"] -
summary_stats$median_commute[summary_stats$city_type == "NYC"]
)
)
## # A tibble: 2 × 2
## statistic value
## <chr> <dbl>
## 1 Mean (Rural - NYC) 17.4
## 2 Median (Rural - NYC) 12.3
set.seed(123)
B <- 1000
boot_mean_diff <- numeric(B)
boot_median_diff <- numeric(B)
nyc <- nyc_rural$commute_minutes[nyc_rural$city_type == "NYC"]
rural <- nyc_rural$commute_minutes[nyc_rural$city_type == "Rural"]
for (b in 1:B) {
nyc_sample <- sample(nyc, length(nyc), replace = TRUE)
rural_sample <- sample(rural, length(rural), replace = TRUE)
boot_mean_diff[b] <- mean(rural_sample) - mean(nyc_sample)
boot_median_diff[b] <- median(rural_sample) - median(nyc_sample)
}
Report Results
boot_se_mean <- sd(boot_mean_diff)
boot_se_median <- sd(boot_median_diff)
tibble(
statistic = c("Bootstrap SE (mean difference)", "Bootstrap SE (median difference)"),
value = c(boot_se_mean, boot_se_median)
)
## # A tibble: 2 × 2
## statistic value
## <chr> <dbl>
## 1 Bootstrap SE (mean difference) 0.0859
## 2 Bootstrap SE (median difference) 0.102
ci_mean <- quantile(boot_mean_diff, c(0.025, 0.975))
ci_median <- quantile(boot_median_diff, c(0.025, 0.975))
tibble(
statistic = c(
"CI lower (mean difference)",
"CI upper (mean difference)",
"CI lower (median difference)",
"CI upper (median difference)"
),
value = c(ci_mean[1], ci_mean[2], ci_median[1], ci_median[2])
)
## # A tibble: 4 × 2
## statistic value
## <chr> <dbl>
## 1 CI lower (mean difference) 17.2
## 2 CI upper (mean difference) 17.6
## 3 CI lower (median difference) 12.2
## 4 CI upper (median difference) 12.6
nyc <- nyc_rural$commute_minutes[nyc_rural$city_type == "NYC"]
rural <- nyc_rural$commute_minutes[nyc_rural$city_type == "Rural"]
mean_nyc <- mean(nyc, na.rm = TRUE)
mean_rural <- mean(rural, na.rm = TRUE)
diff_mean <- mean_rural - mean_nyc
sd_nyc <- sd(nyc, na.rm = TRUE)
sd_rural <- sd(rural, na.rm = TRUE)
n_nyc <- length(nyc)
n_rural <- length(rural)
welch_se <- sqrt((sd_rural^2 / n_rural) + (sd_nyc^2 / n_nyc))
lower_formula <- diff_mean - 1.96 * welch_se
upper_formula <- diff_mean + 1.96 * welch_se
tibble(
statistic = c(
"Bootstrap SE (mean)",
"Formula SE (Welch)",
"Bootstrap CI lower",
"Bootstrap CI upper",
"Formula CI lower",
"Formula CI upper"
),
value = c(
boot_se_mean,
welch_se,
ci_mean[1],
ci_mean[2],
lower_formula,
upper_formula
)
)
## # A tibble: 6 × 2
## statistic value
## <chr> <dbl>
## 1 Bootstrap SE (mean) 0.0859
## 2 Formula SE (Welch) 0.0846
## 3 Bootstrap CI lower 17.2
## 4 Bootstrap CI upper 17.6
## 5 Formula CI lower 17.2
## 6 Formula CI upper 17.5
Comparing the results of the formula-based method to the bootstrapping method we see very similar results. Looking at the standard errors, the bootstrap standard error for the mean difference (0.0859) lies very close to the Welch formula standard error (0.0846). Then when we compare the confidence intervals we also see they are very close to one another, the Welch formula produces a 95% confidence interval of [17.217, 17.548] while the bootstrap method gives a confidence interval of [17.220, 17.558], showing these two methods produce very similar results. The bootstrap and formula-based methods produce almost identical results, indicating that the normal approximation works very well for this problem.
library(ggplot2)
boot_df <- tibble(
boot_mean_diff = boot_mean_diff,
boot_median_diff = boot_median_diff
)
ggplot(boot_df, aes(x = boot_mean_diff)) +
geom_histogram(bins = 40) +
labs(
title = "Bootstrap Distribution of Mean Difference",
x = "Mean Difference (Rural - NYC)",
y = "Frequency"
)
ggplot(boot_df, aes(x = boot_median_diff)) +
geom_histogram(bins = 15) +
labs(
title = "Bootstrap Distribution of Median Difference",
x = "Median Difference (Rural - NYC)",
y = "Frequency"
)
When bootstrapping the distribution of mean and median difference, the
resampling must be done within groups rather than from the pooled data
because we are comparing two distinct groups, Rural and NYC. If we
pooled the observations before resampling, we lose the ability to draw a
mean and median difference from the two separate groups. The bootstrap
sample if we resampled from the pooled data, would simply reflect a
single combined population, rather than looking at both individually,
resulting in the estimated variation not corresponding with the Rural -
NYC difference we are trying to measure. By resampling within each
group, we uphold the original group structure and size, ensuring the
bootstrap distribution correctly approximates sampling variability of
the difference in commute times.
debt <- df$credit_card_debt_usd
debt <- debt[!is.na(debt)]
mean_debt <- mean(debt)
median_debt <- median(debt)
trimmed_mean <- mean(debt, trim = 0.10)
lower_cutoff <- quantile(debt, 0.10)
upper_cutoff <- quantile(debt, 0.90)
winsorized_debt <- pmin(pmax(debt, lower_cutoff), upper_cutoff)
winsorized_mean <- mean(winsorized_debt)
tibble(
statistic = c(
"Mean",
"Median",
"10% trimmed mean",
"10% winsorized mean"
),
value = c(
mean_debt,
median_debt,
trimmed_mean,
winsorized_mean
)
)
## # A tibble: 4 × 2
## statistic value
## <chr> <dbl>
## 1 Mean 2914.
## 2 Median 1833
## 3 10% trimmed mean 2228.
## 4 10% winsorized mean 2463.
q1 <- quantile(debt, 0.25)
q3 <- quantile(debt, 0.75)
iqr <- q3 - q1
lower_fence <- q1 - 1.5 * iqr
upper_fence <- q3 + 1.5 * iqr
flagged <- debt < lower_fence | debt > upper_fence
fraction_flagged <- mean(flagged)
tibble(
statistic = c(
"Q1",
"Q3",
"IQR",
"Lower fence",
"Upper fence",
"Fraction flagged"
),
value = format(c(
q1,
q3,
iqr,
lower_fence,
upper_fence,
fraction_flagged
), scientific = FALSE)
)
## # A tibble: 6 × 2
## statistic value
## <chr> <chr>
## 1 Q1 " 970.00000000"
## 2 Q3 " 3489.00000000"
## 3 IQR " 2519.00000000"
## 4 Lower fence "-2808.50000000"
## 5 Upper fence " 7267.50000000"
## 6 Fraction flagged " 0.07533914"
Approximately 7.534% of observations are flagged as outside the IQR fence, but since the lower fence is negative, these all must be above the upper fence.
options(scipen = 999)
ggplot(data.frame(debt), aes(x = debt)) +
geom_histogram(bins = 40) +
coord_cartesian(xlim = c(0, 10000)) +
labs(
title = "Distribution of Credit Card Debt",
x = "Credit Card Debt (USD)",
y = "Frequency"
)
tibble(
statistic = c("90th percentile", "95th percentile", "99th percentile"),
value = format(quantile(debt, c(0.90, 0.95, 0.99), na.rm = TRUE), scientific = FALSE)
)
## # A tibble: 3 × 2
## statistic value
## <chr> <chr>
## 1 90th percentile " 6257.0"
## 2 95th percentile " 8885.5"
## 3 99th percentile "17356.0"
log_debt <- log(debt)
ggplot(data.frame(log_debt), aes(x = log_debt)) +
geom_histogram(bins = 40) +
scale_y_continuous(labels = scales::comma) +
labs(
title = "Distribution of Log Credit Card Debt",
x = "log(Credit Card Debt)",
y = "Frequency"
)
Comparing the raw and log histograms, the immediate difference is that
the log transformation shifts the distribution into being approximately
symmetric and bell-shaped. The original (raw) histogram showed a
distribution that is strongly right-skewed with a long tail of large
values, where this log transformation removes that strong tail. This
shows that the log transformation reduced the impact of those extremely
high (right-skewed) values and compresses the right tail. This makes the
data on the log scale much more closely resemble a normal distribution,
making it more suitable for methods that assume normal
distributions.
tibble(
statistic = c(
"Mean",
"Median",
"10% trimmed mean",
"10% winsorized mean",
"Q1",
"Q3",
"IQR",
"Upper fence",
"Fraction flagged",
"90th percentile",
"95th percentile",
"99th percentile"
),
value = format(c(
mean_debt,
median_debt,
trimmed_mean,
winsorized_mean,
q1,
q3,
iqr,
upper_fence,
fraction_flagged,
quantile(debt, 0.90),
quantile(debt, 0.95),
quantile(debt, 0.99)
), scientific = FALSE)
) %>% print(n = 12)
## # A tibble: 12 × 2
## statistic value
## <chr> <chr>
## 1 Mean " 2914.42724260"
## 2 Median " 1833.00000000"
## 3 10% trimmed mean " 2227.72448257"
## 4 10% winsorized mean " 2463.07938382"
## 5 Q1 " 970.00000000"
## 6 Q3 " 3489.00000000"
## 7 IQR " 2519.00000000"
## 8 Upper fence " 7267.50000000"
## 9 Fraction flagged " 0.07533914"
## 10 90th percentile " 6257.00000000"
## 11 95th percentile " 8885.50000000"
## 12 99th percentile "17356.00000000"
Looking at which summary best represents typical credit card debt, the median (1833) value gives us the best representation of the “typical” credit card debt as the distribution is strongly right-skewed so the mean is inflated by the extremely large values. This heavy right tail also explains why the mean (2914.43) is larger than the median (1833), the small number of very large debt values pulls this mean up drastically. This is consistent with the large gap between the 90th, 95th, and 99th percentiles, showing there is a presence of extremely high observations. A log scale is more informative when the data is highly skewed, as it compresses that long right tail, making the distribution more symmetric and reducing the dominance of the large values. Because this log distribution normalizes the data, we are able to make more meaningful comparisons, compared to the raw distribution, that aren’t poisoned by the rare extreme values that distort the raw distribution’s scale.