Data 604 Week 5 Assignment

Instructions

1. Select a dataset from an open data portal like https://data.gov, and download the dataset in CSV format. Try to use a dataset no larger than 1,000 records. If larger, filter to use only the first 1,000 records.

2. Create a “profile” for the real dataset using descriptive statistics, and charts to analyze trends and behavior. Save the file in Excel format.

3. Use Excel, Python or R to create a test dataset based on the fields of the real dataset selected. Replicate attributes of the fields and define a range of values that resembles the real dataset.

4. Download the fake dataset in CSV format. The program will allow you to create a dummy file up to 1,000 records.

5. Replicate the profile on the fake dataset, and compare if it resembles the real dataset.

6. Analyze how you can improve the accuracy of a fake dataset to simulate real values in the scenario selected. Try to apply the recommended measures or techniques in a new version of your fake dataset and compare again with the real dataset.

Deliverable: zip file with real & fake datasets, and document with analysis.

Select a dataset

dir.create("output", showWarnings = FALSE)
dir.create("output/charts", showWarnings = FALSE)

I selected an insurance dataset (https://www.kaggle.com/datasets/yasserh/insurance-claim-dataset/data.). It contains 8 variables: - age: age of the insured person - sex: gender - bmi: body mass index - children: number of children covered - smoker: smoking status - region: residential region - charges: insurance charges (response variable) - insuranceclaim: claim outcome

This dataset was chosen because it combines categorical, numeric, and binary variables, which makes it a good test case for profiling and simulation.

insurance_data <- read.csv("https://raw.githubusercontent.com/uplotnik/DATA-622/refs/heads/main/Insurance%202.csv")
head(insurance_data)

##   age sex    bmi children smoker region   charges insuranceclaim
## 1  19   0 27.900        0      1      3 16884.924              1
## 2  18   1 33.770        1      0      2  1725.552              1
## 3  28   1 33.000        3      0      2  4449.462              0
## 4  33   1 22.705        0      0      1 21984.471              0
## 5  32   1 28.880        0      0      1  3866.855              1
## 6  31   0 25.740        0      0      2  3756.622              0

Real dataset profile

real <- insurance_data |>
janitor::clean_names() |>
mutate(
sex = factor(sex, levels = sort(unique(sex)), labels = paste0("sex_", sort(unique(sex)))),
smoker = factor(smoker, levels = sort(unique(smoker)), labels = c("no","yes")[seq_along(sort(unique(smoker)))]),
region = factor(region, levels = sort(unique(region)), labels = paste0("region_", sort(unique(region)))),
insuranceclaim = factor(insuranceclaim, levels = sort(unique(insuranceclaim)), labels = c("no","yes")[seq_along(sort(unique(insuranceclaim)))])
 )

real <- real |> slice_head(n = 1000)

profile_numeric <- real |>
select(where(is.numeric)) |>
pivot_longer(everything(), names_to="variable", values_to="value") |>
group_by(variable) |>
summarise(
n=sum(!is.na(value)), mean=mean(value,na.rm=TRUE), sd=sd(value,na.rm=TRUE),
min=min(value,na.rm=TRUE), q25=quantile(value,.25,na.rm=TRUE),
median=median(value,na.rm=TRUE), q75=quantile(value,.75,na.rm=TRUE),
max=max(value,na.rm=TRUE), skew=moments::skewness(value,na.rm=TRUE),
.groups="drop"
)

profile_categorical <- real |>
summarise(across(where(is.factor), \(x) tibble(n_levels=n_distinct(x), missing=sum(is.na(x))))) |>
pivot_longer(everything(), names_to="variable", values_to="stats") |>
unnest_wider(stats)

writexl::write_xlsx(
list(
skim = skimr::skim(real) |> as_tibble(),
numeric_summary = profile_numeric,
categorical_overview = profile_categorical
),
"output/real_profile.xlsx"
)

ggsave("output/charts/hist_age.png", ggplot(real, aes(age))+geom_histogram(bins=25)+ggtitle("Age"), width=6,height=4,dpi=120)
ggsave("output/charts/hist_bmi.png", ggplot(real, aes(bmi))+geom_histogram(bins=25)+ggtitle("BMI"), width=6,height=4,dpi=120)
ggsave("output/charts/hist_charges.png", ggplot(real, aes(charges))+geom_histogram(bins=30)+ggtitle("Charges"), width=6,height=4,dpi=120)
ggsave("output/charts/hist_log_charges.png", ggplot(real, aes(log1p(charges)))+geom_histogram(bins=30)+ggtitle("log(1+Charges)"), width=6,height=4,dpi=120)
ggsave("output/charts/box_charges_by_smoker.png", ggplot(real, aes(smoker, charges))+geom_boxplot()+ggtitle("Charges by Smoker"), width=6,height=4,dpi=120)
ggsave("output/charts/bar_region.png",
real |> count(region) |> ggplot(aes(reorder(region,n), n))+geom_col()+coord_flip()+xlab("region")+ggtitle("Region counts"),
width=6,height=4,dpi=120)

getwd()

## [1] "/Users/ulianaplotnikova/Desktop/Data 604/Week 5 Assignment "

The dataset was profiled to understand its structure, distributions, and variability.

For numeric variables (age, bmi, children, charges), I calculated mean, standard deviation, quartiles, min, and max.
For categorical variables (sex, smoker, region, insuranceclaim), I summarized the number of unique categories and their frequencies.

I also created charts:

Histograms for age, BMI, and charges (both raw and log-transformed).
Boxplots of charges by smoker status.
Bar plots for region counts.

These profiles give insight into skewness (e.g., charges are right-skewed) and imbalances (smoking has fewer “yes” than “no”).

Create a test dataset

set.seed(42)
n <- nrow(real)

sample_factor_like <- function(x, n){
probs <- prop.table(table(x, useNA="ifany"))
out <- sample(names(probs), n, TRUE, as.numeric(probs))
factor(out, levels=levels(x))
}

# numeric generators tuned to this file
gen_children <- function(x,n){ # count-like
probs <- prop.table(table(x))
sample(as.integer(names(probs)), n, TRUE, as.numeric(probs))
}
gen_age <- function(x,n){
r <- range(x, na.rm=TRUE)
out <- round(rnorm(n, mean(x,na.rm=TRUE), sd(x,na.rm=TRUE)))
pmin(pmax(out, r[1]), r[2])
}
gen_bmi <- function(x,n){
r <- range(x, na.rm=TRUE)
out <- rnorm(n, mean(x,na.rm=TRUE), sd(x,na.rm=TRUE))
pmin(pmax(out, r[1]), r[2])
}
gen_charges_lognormal <- function(x,n){
val <- x[x>0 & !is.na(x)]
mu <- mean(log(val)); sigma <- sd(log(val))
out <- rlnorm(n, mu, sigma)
pmin(pmax(out, min(val)), max(val))
}

fake <- tibble(
age = gen_age(real$age, n),
sex = sample_factor_like(real$sex, n),
bmi = gen_bmi(real$bmi, n),
children = gen_children(real$children, n),
smoker = sample_factor_like(real$smoker, n),
region = sample_factor_like(real$region, n),
insuranceclaim = sample_factor_like(real$insuranceclaim, n)
)
fake$charges <- gen_charges_lognormal(real$charges, n)

readr::write_csv(real, "output/real_insurance.csv")
readr::write_csv(fake, "output/fake_insurance.csv")

In this step, I generated a synthetic dataset that replicates the structure of the real dataset:

Categorical variables (sex, smoker, region, insuranceclaim) were generated by sampling from the observed distributions.
Numeric variables were generated with different strategies:
age and bmi ~ normal distribution using mean and sd from real data
children ~ sampling integer counts
charges ~ log-normal distribution to better capture skewness

This ensures the fake dataset looks statistically similar to the real dataset, while not containing any actual individuals.

Compare real vs fake

compare_num <- function(df1, df2, cols){
bind_rows(
df1 |> select(all_of(cols)) |> pivot_longer(everything(), names_to="var", values_to="val") |> mutate(src="real"),
df2 |> select(all_of(cols)) |> pivot_longer(everything(), names_to="var", values_to="val") |> mutate(src="fake")
) |>
group_by(src, var) |>
summarise(mean=mean(val,na.rm=TRUE), sd=sd(val,na.rm=TRUE),
median=median(val,na.rm=TRUE), min=min(val,na.rm=TRUE), max=max(val,na.rm=TRUE),
.groups="drop") |>
pivot_wider(names_from=src, values_from=c(mean,sd,median,min,max)) |>
mutate(
mean_abs_pct_diff = abs(mean_real-mean_fake)/ifelse(mean_real==0,1,abs(mean_real)),
sd_abs_pct_diff = abs(sd_real-sd_fake)/ifelse(sd_real==0,1,abs(sd_real))
)
}

num_cols <- c("age","bmi","children","charges")
cmp_num <- compare_num(real, fake, num_cols)

overlap_top5 <- function(a,b){
ra <- names(sort(table(a), decreasing=TRUE))[1:min(5, n_distinct(a, na.rm=TRUE))]
rb <- names(sort(table(b), decreasing=TRUE))[1:min(5, n_distinct(b, na.rm=TRUE))]
length(intersect(ra,rb))/max(length(unique(na.omit(a))),1)
}

cat_cols <- c("sex","smoker","region","insuranceclaim")
cmp_cat <- tibble(variable = cat_cols,
top5_overlap_ratio = map_dbl(cat_cols, ~overlap_top5(real[[.x]], fake[[.x]])))

writexl::write_xlsx(
list(numeric_comparison = cmp_num, categorical_overlap = cmp_cat),
"output/comparison_real_vs_fake.xlsx"
)

To evaluate the similarity:

For numeric variables, I compared mean, standard deviation, min, max, and medians.
For categorical variables, I calculated the overlap ratio between the top 5 categories in real vs. fake.

The results show that the fake dataset broadly matches the real one, though charges required special handling due to heavy skew.

Improvement

improve_by_smoker <- function(real_df, fake_df){
out <- fake_df
rng <- range(real_df$charges, na.rm=TRUE)
for(gr in levels(real_df$smoker)){
r_idx <- which(real_df$smoker==gr & !is.na(real_df$charges) & real_df$charges>0)
f_idx <- which(out$smoker==gr)
if(length(r_idx)>20 && length(f_idx)>0){
val <- real_df$charges[r_idx]
mu <- mean(log(val)); sigma <- sd(log(val))
out$charges[f_idx] <- rlnorm(length(f_idx), mu, sigma)
}
}
out$charges <- pmin(pmax(out$charges, rng[1]), rng[2])
out
}
fake_improved <- improve_by_smoker(real, fake)
readr::write_csv(fake_improved, "output/fake_insurance_improved.csv")

cmp_num_improved <- compare_num(real, fake_improved, num_cols)
writexl::write_xlsx(
list(
numeric_comparison_baseline = cmp_num,
numeric_comparison_improved = cmp_num_improved,
categorical_overlap = cmp_cat
),
"output/comparison_real_vs_fake_improved.xlsx"
)

I improved the synthetic dataset by modeling charges conditional on smoking status.

Since smokers tend to have significantly higher charges, this conditional modeling reduces error in the fake dataset.

The improved synthetic dataset was closer to the real one in terms of numeric summaries for charges.