df <- read.csv("synthetic_microdata_01.csv", stringsAsFactors = FALSE)
head(df)
## person_id sex age age_group nationality region usa_state
## 1 1 Male 74 65+ USA North America NJ
## 2 2 Female 26 25-34 USA North America TX
## 3 3 Female 18 18-24 USA North America MA
## 4 4 Male 24 18-24 USA North America NC
## 5 5 Female 26 25-34 USA North America Other
## 6 6 Male 35 35-44 USA North America NY
## city_type education student_status undergrad_year major
## 1 Other large city Grad Grad NA Politics
## 2 NYC Grad Grad NA CS
## 3 Rural High school Not student NA <NA>
## 4 Other large city Undergrad Undergrad 2 Math
## 5 Other large city Grad Grad NA Econ
## 6 Rural Grad Grad NA CS
## lives_on_campus commute_minutes transport_mode height_cm weight_kg bmi
## 1 False 23.6 Subway 190.6 81.4 22.4
## 2 False 16.3 Walk 154.7 56.7 23.7
## 3 False 30.0 Car 160.8 61.0 23.6
## 4 False 20.3 RideShare 171.2 71.2 24.3
## 5 False 17.4 Walk 161.6 54.3 20.8
## 6 False 51.4 Car 175.7 75.6 24.5
## exercise_days_per_week sleep_hours screen_time_hours coffee_cups_per_day
## 1 1 7.9 5.8 1
## 2 2 6.7 4.6 2
## 3 0 6.2 3.5 1
## 4 3 7.0 7.6 1
## 5 3 7.8 4.0 0
## 6 2 6.9 5.8 0
## smoking_status handedness has_job hours_worked_per_week annual_income_usd
## 1 Non-smoker Right False 0.0 10645
## 2 Non-smoker Left False 0.0 15590
## 3 Non-smoker Right True 44.2 46707
## 4 Non-smoker Right False 0.0 10911
## 5 Non-smoker Right False 0.0 9031
## 6 Non-smoker Right False 0.0 16511
## credit_card_debt_usd gpa internship_offer clicked_suspicious_link
## 1 283 3.79 No No
## 2 601 2.96 Yes No
## 3 1404 NA <NA> No
## 4 2007 3.37 No No
## 5 1364 4.00 Yes No
## 6 902 3.65 Yes No
## relationship_status
## 1 Single
## 2 Single
## 3 Single
## 4 Single
## 5 Single
## 6 Married
###Part (A)
left_count <- sum(df$handedness == "Left", na.rm = TRUE)
right_count <- sum(df$handedness == "Right", na.rm = TRUE)
left_count
## [1] 120312
right_count
## [1] 1079688
df$handedness == "Left" creates a TRUE/FALSE list
sum(...) counts TRUEs (Lefts) na.rm = TRUE
ignores missing values (safe habit)
###Part (B)
check_independence <- function(sub_df, tol = 0.01) {
A_hat <- mean(sub_df$handedness == "Left", na.rm = TRUE)
B_hat <- mean(sub_df$sex == "Male", na.rm = TRUE)
AB_hat <- mean((sub_df$handedness == "Left") &
(sub_df$sex == "Male"), na.rm = TRUE)
diff <- abs(AB_hat - A_hat * B_hat)
if (diff <= tol) {
return("independent")
} else {
return("not independent")
}
}
false_alarm_count <- function(n, num_trials = 100, tol = 0.01) {
wrong <- 0
for (i in 1:num_trials) {
idx <- sample(1:nrow(df), size = n, replace = FALSE)
sub_df <- df[idx, ]
decision <- check_independence(sub_df, tol)
if (decision == "not independent") {
wrong <- wrong + 1
}
}
return(wrong)
}
Part C
set.seed(123)
n_values <- c(200, 500, 800, 1000, 1500, 2000)
false_alarms <- sapply(n_values, false_alarm_count)
results <- data.frame(
n = n_values,
wrong_conclusions_out_of_100 = false_alarms
)
results
## n wrong_conclusions_out_of_100
## 1 200 33
## 2 500 9
## 3 800 1
## 4 1000 1
## 5 1500 1
## 6 2000 0
Part D As the subsample n increases, the number of “wrong conclusions” decreases drastically. This is because, when n is small, the variability in sampling is large, so the sample’s Phat(A), Phat(B), and Phat(A and B) estimates can deviate greatly from their true population values, causing the decision rule to incorrectly conclude that handedness and sex are not independent. As the subsample n grows, these samples become more stable and closer to the true probabilities, making the quantity [Phat(A and B) - Phat(A)*Phat(B)] smaller. Therefore, the decision rule is much less likely to produce the wrong conclusion as the samplesize n grows larger, closer to the full dataframe size.