Create simulation dataset for ICAS project
1. Numerical
1.1 Normal distribution (BMI, SBP, DBP, eGFR, Tchol, HDLc, LDLc,
LDLp, LargeLDLp, smallLDLp)
# Seed the generator so the normal draws below (and the uniform draws that
# follow them in the script) are reproducible between runs. The seed is
# deliberately different from the one set later for TG: re-using the same
# seed would make these draws and the TG draws start from the same random
# stream.
set.seed(20191)

# Each variable is 711 draws from Normal(mean, sd), rounded to 2 decimals.
# abs() reflects any negative draw onto the positive side, so every value is
# non-negative (this matters most for the wide LargeLDLp / smallLDLp draws).
BMI <- abs(round(rnorm(711, mean = 23.6, sd = 2.87), 2))
SBP <- abs(round(rnorm(711, mean = 135.57, sd = 18.02), 2))
DBP <- abs(round(rnorm(711, mean = 80.06, sd = 10.79), 2))
eGFR <- abs(round(rnorm(711, mean = 74.34, sd = 10.16), 2))
Tchol <- abs(round(rnorm(711, mean = 210.69, sd = 32.57), 2))
HDLc <- abs(round(rnorm(711, mean = 59.38, sd = 17.05), 2))
LDLc <- abs(round(rnorm(711, mean = 126.61, sd = 30.8), 2))
LDLp <- abs(round(rnorm(711, mean = 1299.71, sd = 392.13), 2))
LargeLDLp <- abs(round(rnorm(711, mean = 628.4, sd = 298.54), 2))
smallLDLp <- abs(round(rnorm(711, mean = 549.31, sd = 419.21), 2))
1.2 Skewness (age, HbA1C, Glucose, CRP, TG, IDLp)
# NOTE(review): the four variables below are drawn with runif(), i.e. from a
# flat distribution over [min, max]; all are rounded to 2 decimals.

# Age: 711 values between 40 and 79.
Age <- round(x = runif(n = 711, min = 40, max = 79), digits = 2)
hist(x = Age)

# HbA1c: 711 values between 3.9 and 10.1.
HbA1c <- round(x = runif(n = 711, min = 3.9, max = 10.1), digits = 2)
hist(x = HbA1c)

# Glucose: 711 values between 61 and 263.
Glucose <- round(x = runif(n = 711, min = 61, max = 263), digits = 2)
# CRP: 711 values between 0 and 65.3.
CRP <- round(x = runif(n = 711, min = 0, max = 65.3), digits = 2)
###
# Fix the random seed up front so the TG draw is reproducible.
set.seed(2019)
# Target quartiles for the simulated TG values.
q1 <- 76
med <- 104
q3 <- 150
# Lognormal parameters on the log scale: the median fixes meanlog, and the
# log-scale interquartile range divided by 2 * qnorm(0.75) gives sdlog.
mu <- log(med)
sigma <- diff(log(c(q1, q3))) / (2 * qnorm(p = 0.75))
# 711 right-skewed draws.
TG <- rlnorm(n = 711, meanlog = mu, sdlog = sigma)
# Print the five-number summary and mean of the draw.
summary(TG)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 26.68 71.15 97.07 112.51 139.73 538.55
# In this example, we first set the desired median and quartiles using the variables
# med, q1, and q3. To compute the parameters for a lognormal distribution
# that has this median and quartiles, we use the formulas
# mu = log(med) and sigma = (log(q3) - log(q1)) / (2 * qnorm(0.75)).
# The qnorm(0.75) function computes the value of
# the normal distribution's inverse cumulative distribution function at the 75th percentile,
# which is used to scale the value of sigma. Finally, we generate a sample of 711 observations
# from the lognormal distribution using the rlnorm() function,
# with the meanlog parameter set to mu and the sdlog parameter set to sigma.
# We then check the summary statistics of the generated distribution using the summary() function.
# The resulting distribution will be right-skewed.
# IDLp: 711 uniform draws between 0 and 654 (left unrounded), then plotted.
IDLp <- runif(n = 711, min = 0, max = 654)
hist(x = IDLp)

2. Categorical variables (Smsts, Drinksts, Hypertension, DM,
Dyslip)
2.1 For smoking status
# Cohort size.
n <- 711
# Target share of each smoking-status group.
# NOTE(review): never_pct is not used below (the "never" count is taken as
# the remainder), and the three shares add up to 1.0005 rather than 1.
never_pct <- 0.1965
ex_pct <- 0.509
current_pct <- 0.295
# Head counts: round the first two groups, give the rest to "never" so the
# three counts always add up to n.
current_count <- round(n * current_pct)
ex_count <- round(n * ex_pct)
never_count <- n - (current_count + ex_count)
# Labels laid out group by group: all "current", then "ex", then "never".
smsts <- c(
  rep("current", times = current_count),
  rep("ex", times = ex_count),
  rep("never", times = never_count)
)
# Frequency table of the resulting labels.
table(smsts)
## smsts
## current ex never
## 210 362 139
2.2 For Drinksts
# Drinking status: 711 labels sampled with replacement, with probabilities
# 79.18% (current), 5.2% (ex) and 15.62% (never).
groups <- c("current", "ex", "never")
drinksts <- sample(
  x = groups,
  size = 711,
  replace = TRUE, # spelled out: `T` is an ordinary variable and can be reassigned
  prob = c(79.18 / 100, 5.2 / 100, 15.62 / 100)
)
# Frequency table of the sampled labels.
table(drinksts)
## drinksts
## current ex never
## 567 33 111
2.3 Hypertension
# Hypertension: 711 "Yes"/"No" labels sampled with replacement.
# The weights are given as percentages (52.6 Yes / 47.4 No); sample() treats
# prob as weights, so they do not have to sum to one.
hypgroup <- c("Yes", "No")
Hyper <- sample(
  x = hypgroup,
  size = 711,
  replace = TRUE, # spelled out: `T` is an ordinary variable and can be reassigned
  prob = c(52.6, 47.4)
)
# Frequency table of the sampled labels.
table(Hyper)
## Hyper
## No Yes
## 347 364
2.4 Diabetes mellitus
# Diabetes mellitus: 711 "Yes"/"No" labels sampled with replacement.
# The weights are given as percentages (21.8 Yes / 78.2 No); sample() treats
# prob as weights, so they do not have to sum to one.
dmgroup <- c("Yes", "No")
DM <- sample(
  x = dmgroup,
  size = 711,
  replace = TRUE, # spelled out: `T` is an ordinary variable and can be reassigned
  prob = c(21.8, 78.2)
)
# Frequency table of the sampled labels.
table(DM)
## DM
## No Yes
## 557 154
2.5 Dyslip
# Dyslipidaemia flag: 711 "Yes"/"No" labels sampled with replacement.
# The weights are given as percentages (55.8 Yes / 44.2 No); sample() treats
# prob as weights, so they do not have to sum to one.
dysgroup <- c("Yes", "No")
Dyslip <- sample(
  x = dysgroup, # sample from this section's own label vector
  size = 711,
  replace = TRUE, # spelled out: `T` is an ordinary variable and can be reassigned
  prob = c(55.8, 44.2)
)
# Frequency table of the sampled labels.
table(Dyslip)
## Dyslip
## No Yes
## 332 379
2.6 ICAS
# ICAS status: 711 "Yes"/"No" labels sampled with replacement.
# The weights are given as percentages (28.8 Yes / 71.2 No); sample() treats
# prob as weights, so they do not have to sum to one.
ICASgroup <- c("Yes", "No")
ICAS <- sample(
  x = ICASgroup, # sample from this section's own label vector
  size = 711,
  replace = TRUE, # spelled out: `T` is an ordinary variable and can be reassigned
  prob = c(28.8, 71.2)
)
# Frequency table of the sampled labels.
table(ICAS)
## ICAS
## No Yes
## 505 206
3. Create data set
# Combine every simulated variable into one data frame, one column per
# variable and one row per simulated participant. ICAS is appended as the
# last column so the simulated ICAS status is part of the dataset; the
# positions of the other columns are unchanged.
data <- data.frame(
  BMI, SBP, DBP, eGFR, Tchol, HDLc, LDLc, LDLp, LargeLDLp, smallLDLp,
  Age, HbA1c, Glucose, CRP, TG, IDLp,
  smsts, drinksts, Hyper, DM, Dyslip,
  ICAS
)