Problem Set 3

df <- read.csv("synthetic_microdata_01.csv")

Problem 8

8 (A)

library(dplyr)

cs_econ <- df %>%
  filter(major %in% c("CS", "Econ"))

cs_econ %>%
  group_by(major) %>%
  summarise(
    sample_size = n(),
    mean_gpa = mean(gpa, na.rm = TRUE),
    sd_gpa = sd(gpa, na.rm = TRUE)
  )

## # A tibble: 2 × 4
##   major sample_size mean_gpa sd_gpa
##   <chr>       <int>    <dbl>  <dbl>
## 1 CS         166416     3.36  0.262
## 2 Econ       169098     3.42  0.260

8 (B)

cs_stats <- cs_econ %>% filter(major == "CS")
econ_stats <- cs_econ %>% filter(major == "Econ")

mean_cs <- mean(cs_stats$gpa, na.rm = TRUE)
mean_econ <- mean(econ_stats$gpa, na.rm = TRUE)

sd_cs <- sd(cs_stats$gpa, na.rm = TRUE)
sd_econ <- sd(econ_stats$gpa, na.rm = TRUE)

n_cs <- nrow(cs_stats)
n_econ <- nrow(econ_stats)

# Difference in means
delta_hat <- mean_cs - mean_econ

# Welch SE
se <- sqrt((sd_cs^2 / n_cs) + (sd_econ^2 / n_econ))

# 95% CI
lower <- delta_hat - 1.96 * se
upper <- delta_hat + 1.96 * se

part_b <- tibble(
  statistic = c("Difference in means (CS - Econ)", "Welch SE", "95% CI lower", "95% CI upper"),
  value = c(delta_hat, se, lower, upper)
)

part_b

## # A tibble: 4 × 2
##   statistic                           value
##   <chr>                               <dbl>
## 1 Difference in means (CS - Econ) -0.0642  
## 2 Welch SE                         0.000900
## 3 95% CI lower                    -0.0660  
## 4 95% CI upper                    -0.0625

8 (C)

# Define success = internship_offer == "Yes"

cs_econ <- cs_econ %>%
  mutate(success = internship_offer == "Yes")

# Proportions
prop_table <- cs_econ %>%
  group_by(major) %>%
  summarise(
    n = n(),
    success_rate = mean(success, na.rm = TRUE)
  )

prop_table

## # A tibble: 2 × 3
##   major      n success_rate
##   <chr>  <int>        <dbl>
## 1 CS    166416        0.348
## 2 Econ  169098        0.266

# Extract values
p_cs <- prop_table$success_rate[prop_table$major == "CS"]
p_econ <- prop_table$success_rate[prop_table$major == "Econ"]

# Difference and lift
diff <- p_cs - p_econ
lift <- diff / p_econ

tibble(
  statistic = c("p_CS", "p_Econ", "Difference (CS - Econ)", "Lift (CS relative to Econ)"),
  value = c(p_cs, p_econ, diff, lift)
)

## # A tibble: 4 × 2
##   statistic                   value
##   <chr>                       <dbl>
## 1 p_CS                       0.348 
## 2 p_Econ                     0.266 
## 3 Difference (CS - Econ)     0.0816
## 4 Lift (CS relative to Econ) 0.306

8 (D)

# Sample sizes
n_cs <- sum(cs_econ$major == "CS")
n_econ <- sum(cs_econ$major == "Econ")

# Proportions (already computed but reuse cleanly)
p_cs <- mean(cs_econ$success[cs_econ$major == "CS"])
p_econ <- mean(cs_econ$success[cs_econ$major == "Econ"])

# Difference
diff <- p_cs - p_econ

# --- UNPOOLED SE (for CI)
se_unpooled <- sqrt(
  (p_cs * (1 - p_cs) / n_cs) +
  (p_econ * (1 - p_econ) / n_econ)
)

# 95% CI
lower <- diff - 1.96 * se_unpooled
upper <- diff + 1.96 * se_unpooled

# --- POOLED SE (for hypothesis test)
p_pool <- (
  sum(cs_econ$success[cs_econ$major == "CS"]) +
  sum(cs_econ$success[cs_econ$major == "Econ"])
) / (n_cs + n_econ)

se_pooled <- sqrt(
  p_pool * (1 - p_pool) * (1/n_cs + 1/n_econ)
)

# z-stat
z <- diff / se_pooled

# Clean output
tibble(
  statistic = c(
    "Difference (CS - Econ)",
    "Unpooled SE",
    "95% CI lower",
    "95% CI upper",
    "Pooled proportion",
    "Pooled SE",
    "z-stat"
  ),
  value = c(diff, se_unpooled, lower, upper, p_pool, se_pooled, z)
)

## # A tibble: 7 × 2
##   statistic                 value
##   <chr>                     <dbl>
## 1 Difference (CS - Econ)  0.0816 
## 2 Unpooled SE             0.00159
## 3 95% CI lower            0.0785 
## 4 95% CI upper            0.0847 
## 5 Pooled proportion       0.307  
## 6 Pooled SE               0.00159
## 7 z-stat                 51.2

8 (E) Analysis

Looking at strictly CS and Econ majors, CS majors have a slightly lower GPA on average. CS majors average about 0.064 less GPA points than Econ majors across the data set. This difference is statistically significant in the 95% confidence interval [-0.066, -0.0625], as 0 is not included, the difference does appear statistically significant, however, this is not very practically significant because 0.064 is a very small difference in each individual GPA. Looking at the internship rates, a large difference is presented. CS majors have a higher internship-offer rate than Econ majors by about 8.16 percentage points, corresponding to about a 30.6% relative lift. This difference is statistically significant as the 95% confidence interval is [0.0785, 0.0847], and isn’t even close to including 0, so there is definitely a statistically significant difference. This difference is also practically significant, a 30.6% lift / 8.16 percentage points difference is a large noticeable difference in real life, there should be an observably higher rate of CS majors with internships compares to Econ majors. We cannot say anything causal about this relationship.

Problem 9

9 (A)

nyc_rural <- df %>%
  filter(city_type %in% c("NYC", "Rural"))

summary_stats <- nyc_rural %>%
  group_by(city_type) %>%
  summarise(
    sample_size = n(),
    mean_commute = mean(commute_minutes, na.rm = TRUE),
    median_commute = median(commute_minutes, na.rm = TRUE)
  )

summary_stats

## # A tibble: 2 × 4
##   city_type sample_size mean_commute median_commute
##   <chr>           <int>        <dbl>          <dbl>
## 1 NYC            209701         21.3           20.1
## 2 Rural          119799         38.7           32.4

tibble(
  statistic = c("Mean (Rural - NYC)", "Median (Rural - NYC)"),
  value = c(
    summary_stats$mean_commute[summary_stats$city_type == "Rural"] -
      summary_stats$mean_commute[summary_stats$city_type == "NYC"],
    summary_stats$median_commute[summary_stats$city_type == "Rural"] -
      summary_stats$median_commute[summary_stats$city_type == "NYC"]
  )
)

## # A tibble: 2 × 2
##   statistic            value
##   <chr>                <dbl>
## 1 Mean (Rural - NYC)    17.4
## 2 Median (Rural - NYC)  12.3

9 (B)

set.seed(123)

B <- 1000

boot_mean_diff <- numeric(B)
boot_median_diff <- numeric(B)

nyc <- nyc_rural$commute_minutes[nyc_rural$city_type == "NYC"]
rural <- nyc_rural$commute_minutes[nyc_rural$city_type == "Rural"]

for (b in 1:B) {
  nyc_sample <- sample(nyc, length(nyc), replace = TRUE)
  rural_sample <- sample(rural, length(rural), replace = TRUE)
  
  boot_mean_diff[b] <- mean(rural_sample) - mean(nyc_sample)
  boot_median_diff[b] <- median(rural_sample) - median(nyc_sample)
}

Report Results

boot_se_mean <- sd(boot_mean_diff)
boot_se_median <- sd(boot_median_diff)

tibble(
  statistic = c("Bootstrap SE (mean difference)", "Bootstrap SE (median difference)"),
  value = c(boot_se_mean, boot_se_median)
)

## # A tibble: 2 × 2
##   statistic                         value
##   <chr>                             <dbl>
## 1 Bootstrap SE (mean difference)   0.0859
## 2 Bootstrap SE (median difference) 0.102

9 (C)

ci_mean <- quantile(boot_mean_diff, c(0.025, 0.975))
ci_median <- quantile(boot_median_diff, c(0.025, 0.975))

tibble(
  statistic = c(
    "CI lower (mean difference)",
    "CI upper (mean difference)",
    "CI lower (median difference)",
    "CI upper (median difference)"
  ),
  value = c(ci_mean[1], ci_mean[2], ci_median[1], ci_median[2])
)

## # A tibble: 4 × 2
##   statistic                    value
##   <chr>                        <dbl>
## 1 CI lower (mean difference)    17.2
## 2 CI upper (mean difference)    17.6
## 3 CI lower (median difference)  12.2
## 4 CI upper (median difference)  12.6

9 (D)

nyc <- nyc_rural$commute_minutes[nyc_rural$city_type == "NYC"]
rural <- nyc_rural$commute_minutes[nyc_rural$city_type == "Rural"]

mean_nyc <- mean(nyc, na.rm = TRUE)
mean_rural <- mean(rural, na.rm = TRUE)

diff_mean <- mean_rural - mean_nyc

sd_nyc <- sd(nyc, na.rm = TRUE)
sd_rural <- sd(rural, na.rm = TRUE)

n_nyc <- length(nyc)
n_rural <- length(rural)

welch_se <- sqrt((sd_rural^2 / n_rural) + (sd_nyc^2 / n_nyc))

lower_formula <- diff_mean - 1.96 * welch_se
upper_formula <- diff_mean + 1.96 * welch_se

tibble(
  statistic = c(
    "Bootstrap SE (mean)",
    "Formula SE (Welch)",
    "Bootstrap CI lower",
    "Bootstrap CI upper",
    "Formula CI lower",
    "Formula CI upper"
  ),
  value = c(
    boot_se_mean,
    welch_se,
    ci_mean[1],
    ci_mean[2],
    lower_formula,
    upper_formula
  )
)

## # A tibble: 6 × 2
##   statistic             value
##   <chr>                 <dbl>
## 1 Bootstrap SE (mean)  0.0859
## 2 Formula SE (Welch)   0.0846
## 3 Bootstrap CI lower  17.2   
## 4 Bootstrap CI upper  17.6   
## 5 Formula CI lower    17.2   
## 6 Formula CI upper    17.5

Comparing the results of the formula-based method to the bootstrapping method we see very similar results. Looking at the standard errors, the bootstrap standard error for the mean difference (0.0859) lies very close to the Welch formula standard error (0.0846). Then when we compare the confidence intervals we also see they are very close to one another, the Welch formula produces a 95% confidence interval of [17.217, 17.548] while the bootstrap method gives a confidence interval of [17.220, 17.558], showing these two methods produce very similar results. The bootstrap and formula-based methods produce almost identical results, indicating that the normal approximation works very well for this problem.

9 (E)

library(ggplot2)

boot_df <- tibble(
  boot_mean_diff = boot_mean_diff,
  boot_median_diff = boot_median_diff
)

ggplot(boot_df, aes(x = boot_mean_diff)) +
  geom_histogram(bins = 40) +
  labs(
    title = "Bootstrap Distribution of Mean Difference",
    x = "Mean Difference (Rural - NYC)",
    y = "Frequency"
  )

ggplot(boot_df, aes(x = boot_median_diff)) +
  geom_histogram(bins = 15) +
  labs(
    title = "Bootstrap Distribution of Median Difference",
    x = "Median Difference (Rural - NYC)",
    y = "Frequency"
  )

When bootstrapping the distribution of mean and median difference, the resampling must be done within groups rather than from the pooled data because we are comparing two distinct groups, Rural and NYC. If we pooled the observations before resampling, we lose the ability to draw a mean and median difference from the two separate groups. The bootstrap sample if we resampled from the pooled data, would simply reflect a single combined population, rather than looking at both individually, resulting in the estimated variation not corresponding with the Rural - NYC difference we are trying to measure. By resampling within each group, we uphold the original group structure and size, ensuring the bootstrap distribution correctly approximates sampling variability of the difference in commute times.

Problem 10

10 (A)

debt <- df$credit_card_debt_usd
debt <- debt[!is.na(debt)]

mean_debt <- mean(debt)
median_debt <- median(debt)

trimmed_mean <- mean(debt, trim = 0.10)

lower_cutoff <- quantile(debt, 0.10)
upper_cutoff <- quantile(debt, 0.90)

winsorized_debt <- pmin(pmax(debt, lower_cutoff), upper_cutoff)
winsorized_mean <- mean(winsorized_debt)

tibble(
  statistic = c(
    "Mean",
    "Median",
    "10% trimmed mean",
    "10% winsorized mean"
  ),
  value = c(
    mean_debt,
    median_debt,
    trimmed_mean,
    winsorized_mean
  )
)

## # A tibble: 4 × 2
##   statistic           value
##   <chr>               <dbl>
## 1 Mean                2914.
## 2 Median              1833 
## 3 10% trimmed mean    2228.
## 4 10% winsorized mean 2463.

10 (B)

q1 <- quantile(debt, 0.25)
q3 <- quantile(debt, 0.75)

iqr <- q3 - q1

lower_fence <- q1 - 1.5 * iqr
upper_fence <- q3 + 1.5 * iqr

flagged <- debt < lower_fence | debt > upper_fence
fraction_flagged <- mean(flagged)

tibble(
  statistic = c(
    "Q1",
    "Q3",
    "IQR",
    "Lower fence",
    "Upper fence",
    "Fraction flagged"
  ),
  value = format(c(
    q1,
    q3,
    iqr,
    lower_fence,
    upper_fence,
    fraction_flagged
  ), scientific = FALSE)
)

## # A tibble: 6 × 2
##   statistic        value           
##   <chr>            <chr>           
## 1 Q1               "  970.00000000"
## 2 Q3               " 3489.00000000"
## 3 IQR              " 2519.00000000"
## 4 Lower fence      "-2808.50000000"
## 5 Upper fence      " 7267.50000000"
## 6 Fraction flagged "    0.07533914"

Approximately 7.534% of observations are flagged as outside the IQR fence, but since the lower fence is negative, these all must be above the upper fence.

10 (C)

options(scipen = 999)
ggplot(data.frame(debt), aes(x = debt)) +
  geom_histogram(bins = 40) +
  coord_cartesian(xlim = c(0, 10000)) +
  labs(
    title = "Distribution of Credit Card Debt",
    x = "Credit Card Debt (USD)",
    y = "Frequency"
  )

tibble(
  statistic = c("90th percentile", "95th percentile", "99th percentile"),
  value = format(quantile(debt, c(0.90, 0.95, 0.99), na.rm = TRUE), scientific = FALSE)
)

## # A tibble: 3 × 2
##   statistic       value    
##   <chr>           <chr>    
## 1 90th percentile " 6257.0"
## 2 95th percentile " 8885.5"
## 3 99th percentile "17356.0"

10 (D)

log_debt <- log(debt)

ggplot(data.frame(log_debt), aes(x = log_debt)) +
  geom_histogram(bins = 40) +
  scale_y_continuous(labels = scales::comma) +
  labs(
    title = "Distribution of Log Credit Card Debt",
    x = "log(Credit Card Debt)",
    y = "Frequency"
  )

Comparing the raw and log histograms, the immediate difference is that the log transformation shifts the distribution into being approximately symmetric and bell-shaped. The original (raw) histogram showed a distribution that is strongly right-skewed with a long tail of large values, where this log transformation removes that strong tail. This shows that the log transformation reduced the impact of those extremely high (right-skewed) values and compresses the right tail. This makes the data on the log scale much more closely resemble a normal distribution, making it more suitable for methods that assume normal distributions.

10 (E)

tibble(
  statistic = c(
    "Mean",
    "Median",
    "10% trimmed mean",
    "10% winsorized mean",
    "Q1",
    "Q3",
    "IQR",
    "Upper fence",
    "Fraction flagged",
    "90th percentile",
    "95th percentile",
    "99th percentile"
  ),
  value = format(c(
    mean_debt,
    median_debt,
    trimmed_mean,
    winsorized_mean,
    q1,
    q3,
    iqr,
    upper_fence,
    fraction_flagged,
    quantile(debt, 0.90),
    quantile(debt, 0.95),
    quantile(debt, 0.99)
  ), scientific = FALSE)
) %>% print(n = 12)

## # A tibble: 12 × 2
##    statistic           value           
##    <chr>               <chr>           
##  1 Mean                " 2914.42724260"
##  2 Median              " 1833.00000000"
##  3 10% trimmed mean    " 2227.72448257"
##  4 10% winsorized mean " 2463.07938382"
##  5 Q1                  "  970.00000000"
##  6 Q3                  " 3489.00000000"
##  7 IQR                 " 2519.00000000"
##  8 Upper fence         " 7267.50000000"
##  9 Fraction flagged    "    0.07533914"
## 10 90th percentile     " 6257.00000000"
## 11 95th percentile     " 8885.50000000"
## 12 99th percentile     "17356.00000000"

Looking at which summary best represents typical credit card debt, the median (1833) value gives us the best representation of the “typical” credit card debt as the distribution is strongly right-skewed so the mean is inflated by the extremely large values. This heavy right tail also explains why the mean (2914.43) is larger than the median (1833), the small number of very large debt values pulls this mean up drastically. This is consistent with the large gap between the 90th, 95th, and 99th percentiles, showing there is a presence of extremely high observations. A log scale is more informative when the data is highly skewed, as it compresses that long right tail, making the distribution more symmetric and reducing the dominance of the large values. Because this log distribution normalizes the data, we are able to make more meaningful comparisons, compared to the raw distribution, that aren’t poisoned by the rare extreme values that distort the raw distribution’s scale.