# Simulate a 500-row data set mixing numeric, categorical, and ordered-factor
# columns. The seed is fixed so the draws are reproducible; columns are
# generated in the order listed because every draw advances the RNG stream.
set.seed(123)
df <- local({
  n_obs <- 500

  # Normal draws rounded to `digits` decimal places.
  rounded_normal <- function(mean, sd, digits = 0) {
    round(rnorm(n_obs, mean = mean, sd = sd), digits)
  }

  # Categorical draws with replacement; equal probabilities unless `prob` is
  # supplied.
  pick <- function(choices, prob = NULL) {
    sample(choices, n_obs, replace = TRUE, prob = prob)
  }

  # Ordered factor whose level order follows `ranked_levels` (lowest first).
  pick_ordered <- function(ranked_levels) {
    factor(pick(ranked_levels), levels = ranked_levels, ordered = TRUE)
  }

  data.frame(
    id = seq_len(n_obs),
    age = rounded_normal(35, 10),
    income = rounded_normal(50000, 15000),
    height = rounded_normal(170, 10, digits = 1),
    weight = rounded_normal(70, 15, digits = 1),
    exam_score = rounded_normal(75, 12),
    study_hours = rounded_normal(20, 8),
    gender = pick(c("Male", "Female")),
    region = pick(c("North", "South", "East", "West")),
    education = pick_ordered(c("High School", "Bachelor", "Master", "PhD")),
    purchased = pick(c("Yes", "No"), prob = c(0.4, 0.6)),
    blood_type = pick(c("A", "B", "AB", "O")),
    satisfaction = pick_ordered(c("Low", "Medium", "High"))
  )
})
# Derive two columns used in later examples:
#   age_group  - "Young" (age < 30), "Middle" (30 <= age < 50), else "Older"
#   experience - age minus 22, floored at zero
# mutate() and case_when() are dplyr functions; dplyr is expected to be
# attached earlier in the file.
df <- mutate(
  df,
  age_group = case_when(
    # Branches are tested top to bottom, so the second one only sees age >= 30.
    age < 30 ~ "Young",
    age < 50 ~ "Middle",
    TRUE ~ "Older"
  ),
  experience = pmax(age - 22, 0)
)

# Preview the first rows of the augmented data.
head(df, n = 6)