## Step 2: Load the data

# Read the synthetic microdata; keep strings as character, not factors
df <- read.csv("synthetic_microdata_01.csv", stringsAsFactors = FALSE)
head(df)
##   person_id    sex age age_group nationality        region usa_state
## 1         1   Male  74       65+         USA North America        NJ
## 2         2 Female  26     25-34         USA North America        TX
## 3         3 Female  18     18-24         USA North America        MA
## 4         4   Male  24     18-24         USA North America        NC
## 5         5 Female  26     25-34         USA North America     Other
## 6         6   Male  35     35-44         USA North America        NY
##          city_type   education student_status undergrad_year    major
## 1 Other large city        Grad           Grad             NA Politics
## 2              NYC        Grad           Grad             NA       CS
## 3            Rural High school    Not student             NA     <NA>
## 4 Other large city   Undergrad      Undergrad              2     Math
## 5 Other large city        Grad           Grad             NA     Econ
## 6            Rural        Grad           Grad             NA       CS
##   lives_on_campus commute_minutes transport_mode height_cm weight_kg  bmi
## 1           False            23.6         Subway     190.6      81.4 22.4
## 2           False            16.3           Walk     154.7      56.7 23.7
## 3           False            30.0            Car     160.8      61.0 23.6
## 4           False            20.3      RideShare     171.2      71.2 24.3
## 5           False            17.4           Walk     161.6      54.3 20.8
## 6           False            51.4            Car     175.7      75.6 24.5
##   exercise_days_per_week sleep_hours screen_time_hours coffee_cups_per_day
## 1                      1         7.9               5.8                   1
## 2                      2         6.7               4.6                   2
## 3                      0         6.2               3.5                   1
## 4                      3         7.0               7.6                   1
## 5                      3         7.8               4.0                   0
## 6                      2         6.9               5.8                   0
##   smoking_status handedness has_job hours_worked_per_week annual_income_usd
## 1     Non-smoker      Right   False                   0.0             10645
## 2     Non-smoker       Left   False                   0.0             15590
## 3     Non-smoker      Right    True                  44.2             46707
## 4     Non-smoker      Right   False                   0.0             10911
## 5     Non-smoker      Right   False                   0.0              9031
## 6     Non-smoker      Right   False                   0.0             16511
##   credit_card_debt_usd  gpa internship_offer clicked_suspicious_link
## 1                  283 3.79               No                      No
## 2                  601 2.96              Yes                      No
## 3                 1404   NA             <NA>                      No
## 4                 2007 3.37               No                      No
## 5                 1364 4.00              Yes                      No
## 6                  902 3.65              Yes                      No
##   relationship_status
## 1              Single
## 2              Single
## 3              Single
## 4              Single
## 5              Single
## 6             Married
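
Before the counting parts, a quick look at the categorical columns can catch typos or unexpected level names (an optional sanity check; the exact levels depend on the file):

# Tabulate the categories we are about to count, including any NAs
table(df$handedness, useNA = "ifany")
table(df$sex, useNA = "ifany")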

### Part (A)

# Count left- and right-handed people, ignoring missing values
left_count  <- sum(df$handedness == "Left", na.rm = TRUE)
right_count <- sum(df$handedness == "Right", na.rm = TRUE)

left_count
## [1] 120312
right_count
## [1] 1079688

What this does (quick):

- `df$handedness == "Left"` builds a logical (TRUE/FALSE) vector, one entry per row.
- `sum(...)` counts the TRUEs, i.e. the "Left" entries.
- `na.rm = TRUE` ignores missing values (a safe habit).
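
A tiny self-contained illustration of that idea (a toy vector, not the survey data):

# sum() treats TRUE as 1 and FALSE as 0, so it counts matches;
# without na.rm = TRUE, the NA would make the whole sum NA
x <- c("Left", "Right", "Right", NA, "Left")
x == "Left"                      # TRUE FALSE FALSE NA TRUE
sum(x == "Left", na.rm = TRUE)   # 2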

### Part (B)

check_independence <- function(sub_df, tol = 0.01) {
  # Estimate P(A) = P(Left), P(B) = P(Male), and P(A and B) from the subsample
  A_hat  <- mean(sub_df$handedness == "Left", na.rm = TRUE)
  B_hat  <- mean(sub_df$sex == "Male", na.rm = TRUE)
  AB_hat <- mean((sub_df$handedness == "Left") &
                 (sub_df$sex == "Male"), na.rm = TRUE)

  # Under independence, P(A and B) = P(A) * P(B), so this gap should be small
  diff <- abs(AB_hat - A_hat * B_hat)

  if (diff <= tol) {
    return("independent")
  } else {
    return("not independent")
  }
}

false_alarm_count <- function(n, num_trials = 100, tol = 0.01) {
  # Count how often the rule wrongly says "not independent" on random
  # subsamples of size n (uses df from the global environment)
  wrong <- 0

  for (i in seq_len(num_trials)) {
    idx <- sample(seq_len(nrow(df)), size = n, replace = FALSE)
    sub_df <- df[idx, ]

    decision <- check_independence(sub_df, tol)

    if (decision == "not independent") {
      wrong <- wrong + 1
    }
  }

  return(wrong)
}
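
To see the pieces in action before the full experiment (a usage sketch; the printed decision and count depend on the data and the random draw):

check_independence(df)       # apply the rule to the full data frame
set.seed(1)                  # any seed works; set here for reproducibility
false_alarm_count(n = 200)   # wrong conclusions across 100 subsamples of size 200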

### Part (C)

set.seed(123)  # for reproducibility

n_values <- c(200, 500, 800, 1000, 1500, 2000)

# Run 100 trials at each subsample size and record the false alarms
false_alarms <- sapply(n_values, false_alarm_count)

results <- data.frame(
  n = n_values,
  wrong_conclusions_out_of_100 = false_alarms
)

results
##      n wrong_conclusions_out_of_100
## 1  200                           33
## 2  500                            9
## 3  800                            1
## 4 1000                            1
## 5 1500                            1
## 6 2000                            0
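
A quick base-R plot makes the drop-off easy to see (optional; purely illustrative):

# False alarms shrink rapidly as the subsample size grows
plot(results$n, results$wrong_conclusions_out_of_100,
     type = "b", pch = 19,
     xlab = "subsample size n",
     ylab = "wrong conclusions out of 100")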

### Part (D)

As the subsample size n increases, the number of wrong conclusions drops sharply. When n is small, sampling variability is large, so the estimates P-hat(A), P-hat(B), and P-hat(A and B) can deviate substantially from their true population values, which causes the decision rule to incorrectly conclude that handedness and sex are not independent. As n grows, these estimates become more stable and closer to the true probabilities, so the gap |P-hat(A and B) - P-hat(A) * P-hat(B)| shrinks below the tolerance. The decision rule is therefore far less likely to produce a wrong conclusion as the sample size n approaches the full data frame size.
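
The shrinking variability can be checked directly: the standard deviation of a sample proportion scales like 1/sqrt(n). A minimal sketch with simulated coin flips (not the survey data; p = 0.1 is an assumed true proportion):

# Spread of P-hat across repeated samples, for a small and a large n
p <- 0.1
sd_phat <- function(n, reps = 1000) {
  sd(replicate(reps, mean(rbinom(n, size = 1, prob = p))))
}
sd_phat(200)    # wider spread at small n
sd_phat(2000)   # about sqrt(10) times narrower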