Create simulation dataset for ICAS project

1. Numerical

1.1 Normal distribution (BMI, SBP, DBP, eGFR, Tchol, HDLc, LDLc, LDLp, LargeLDLp, smallLDLp)

# Each variable: 711 draws from a normal distribution with the given mean and
# standard deviation, rounded to two decimals. abs() folds any negative draw
# back to its positive magnitude so every simulated measurement is >= 0.
BMI <- abs(round(rnorm(n = 711, mean = 23.6, sd = 2.87), digits = 2))
SBP <- abs(round(rnorm(n = 711, mean = 135.57, sd = 18.02), digits = 2))
DBP <- abs(round(rnorm(n = 711, mean = 80.06, sd = 10.79), digits = 2))
eGFR <- abs(round(rnorm(n = 711, mean = 74.34, sd = 10.16), digits = 2))
Tchol <- abs(round(rnorm(n = 711, mean = 210.69, sd = 32.57), digits = 2))
HDLc <- abs(round(rnorm(n = 711, mean = 59.38, sd = 17.05), digits = 2))
LDLc <- abs(round(rnorm(n = 711, mean = 126.61, sd = 30.8), digits = 2))
LDLp <- abs(round(rnorm(n = 711, mean = 1299.71, sd = 392.13), digits = 2))
LargeLDLp <- abs(round(rnorm(n = 711, mean = 628.4, sd = 298.54), digits = 2))
smallLDLp <- abs(round(rnorm(n = 711, mean = 549.31, sd = 419.21), digits = 2))

1.2 Skewed distributions (Age, HbA1c, Glucose, CRP, TG, IDLp)

# Age, HbA1c, Glucose and CRP are each drawn from a uniform distribution over
# a fixed [min, max] range and rounded to two decimals. Histograms of Age and
# HbA1c are plotted as a quick visual check.
Age <- round(runif(n = 711, min = 40, max = 79), digits = 2)
hist(Age)

HbA1c <- round(runif(n = 711, min = 3.9, max = 10.1), digits = 2)
hist(HbA1c)

Glucose <- round(runif(n = 711, min = 61, max = 263), digits = 2)
CRP <- round(runif(n = 711, min = 0, max = 65.3), digits = 2)

###
# Target summary statistics for TG: median and first/third quartiles
med <- 104
q1 <- 76
q3 <- 150

# Lognormal parameters on the log scale:
#   mu    = log(med), since the median of a lognormal is exp(meanlog);
#   sigma = log-scale distance between the quartiles divided by
#           2 * qnorm(0.75), the interquartile range of a standard normal.
mu <- log(med)
sigma <- diff(log(c(q1, q3))) / (2 * qnorm(p = 0.75))

# Draw the right-skewed sample; the seed is fixed for reproducibility
set.seed(2019)
TG <- rlnorm(n = 711, meanlog = mu, sdlog = sigma)

# Check summary statistics
summary(TG)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   26.68   71.15   97.07  112.51  139.73  538.55
# How this works: med, q1 and q3 hold the desired median and quartiles.
# qnorm(0.75) is the 75th percentile of the standard normal distribution
# (its inverse cumulative distribution function evaluated at 0.75); it
# rescales the log-scale quartile spread into a standard deviation.
# rlnorm() then generates 711 observations with meanlog = mu and
# sdlog = sigma, and summary() reports the resulting sample statistics.
# The resulting distribution is right-skewed.

# IDLp: 711 unrounded draws from a uniform distribution on [0, 654],
# followed by a histogram as a visual check.
IDLp <- runif(n = 711, min = 0, max = 654)
hist(IDLp)

2. Categorical variables (Smsts, Drinksts, Hypertension, DM, Dyslip)

2.1 For smoking status

# Total number of simulated participants
n <- 711

# Target share of each smoking category
current_pct <- 0.295
ex_pct <- 0.509
never_pct <- 0.1965

# Turn shares into whole-number counts; "never" takes whatever remains so the
# three counts always add up to n (never_pct itself is not used in the counts)
current_count <- round(n * current_pct)
ex_count <- round(n * ex_pct)
never_count <- n - (current_count + ex_count)

# Build the label vector: all "current" first, then "ex", then "never"
smsts <- c(
  rep("current", times = current_count),
  rep("ex", times = ex_count),
  rep("never", times = never_count)
)

# Check frequency table
table(smsts)
## smsts
## current      ex   never 
##     210     362     139

2.2 For Drinksts

# Simulate drinking status: 711 independent draws (with replacement) from the
# three categories, weighted by the target proportions 79.18% / 5.2% / 15.62%.
# `TRUE` is spelled out because `T` is an ordinary variable that can be
# reassigned, which would silently change the sampling mode.
groups <- c("current", "ex", "never")
drinksts <- sample(x = groups, size = 711, replace = TRUE, prob = c(79.18/100, 5.2/100, 15.62/100))
table(drinksts)
## drinksts
## current      ex   never 
##     567      33     111

2.3 Hypertension

# Simulate hypertension status: 711 draws with replacement from Yes/No.
# The weights are given as percentages (52.6 / 47.4); sample() treats `prob`
# as relative weights, so they do not need to sum to one.
# `TRUE` is used instead of the reassignable alias `T`.
hypgroup <- c("Yes", "No")
Hyper <- sample(x = hypgroup, size = 711, replace = TRUE, prob = c(52.6, 47.4))
table(Hyper)
## Hyper
##  No Yes 
## 347 364

2.4 Diabetes mellitus

# Simulate diabetes mellitus status: 711 draws with replacement from Yes/No,
# weighted 21.8 / 78.2 (relative weights; sample() does not require them to
# sum to one). `TRUE` is used instead of the reassignable alias `T`.
dmgroup <- c("Yes", "No")
DM <- sample(x = dmgroup, size = 711, replace = TRUE, prob = c(21.8, 78.2))
table(DM)
## DM
##  No Yes 
## 557 154

2.5 Dyslip

# Simulate dyslipidemia status: 711 draws with replacement from Yes/No,
# weighted 55.8 / 44.2 (relative weights; they need not sum to one).
# The draw uses this section's own `dysgroup` labels rather than the
# diabetes section's `dmgroup`, so the block no longer depends on another
# section having run first. `TRUE` replaces the reassignable alias `T`.
dysgroup <- c("Yes", "No")
Dyslip <- sample(x = dysgroup, size = 711, replace = TRUE, prob = c(55.8, 44.2))
table(Dyslip)
## Dyslip
##  No Yes 
## 332 379

2.6 ICAS

# Simulate ICAS status: 711 draws with replacement from Yes/No, weighted
# 28.8 / 71.2 (relative weights; they need not sum to one).
# The draw uses this section's own `ICASgroup` labels rather than the
# diabetes section's `dmgroup`, so the block no longer depends on another
# section having run first. `TRUE` replaces the reassignable alias `T`.
ICASgroup <- c("Yes", "No")
ICAS <- sample(x = ICASgroup, size = 711, replace = TRUE, prob = c(28.8, 71.2))
table(ICAS)
## ICAS
##  No Yes 
## 505 206

3. Create data set

# Combine every simulated variable into one data frame, one row per
# simulated participant. The ICAS indicator was previously generated but left
# out of the data set; it is appended as the last column so existing column
# positions are unchanged.
data <- data.frame(BMI, SBP, DBP, eGFR, Tchol, HDLc, LDLc, LDLp, LargeLDLp, smallLDLp,
                   Age, HbA1c, Glucose, CRP, TG, IDLp, smsts, drinksts, Hyper, DM, Dyslip,
                   ICAS)