Create simulation dataset for ICAS project

1. Numerical

1.1 Normal distribution (BMI, SBP, DBP, eGFR, Tchol, HDLc, LDLc, LDLp, LargeLDLp, smallLDLp)

# Simulate the approximately normal covariates, one N(mean, sd) draw each.
# The section is seeded so the draws are reproducible. The seed is deliberately
# different from the one set before the TG draw further down: re-using that
# seed would restart the same normal stream and tie these values to TG.
set.seed(20230218)
n_obs <- 711

# Draw n_obs values from N(avg, sdev), round them to 2 decimals and fold any
# negative draw onto its absolute value so every measurement is non-negative.
sim_normal <- function(avg, sdev) {
  abs(round(rnorm(n_obs, mean = avg, sd = sdev), 2))
}

# The order of the calls fixes which part of the random stream each variable
# receives, so keep it stable.
BMI <- sim_normal(23.6, 2.87)
SBP <- sim_normal(135.57, 18.02)
DBP <- sim_normal(80.06, 10.79)
eGFR <- sim_normal(74.34, 10.16)
Tchol <- sim_normal(210.69, 32.57)
HDLc <- sim_normal(59.38, 17.05)
LDLc <- sim_normal(126.61, 30.8)
LDLp <- sim_normal(1299.71, 392.13)
LargeLDLp <- sim_normal(628.4, 298.54)
smallLDLp <- sim_normal(549.31, 419.21)

1.2 Skewness (age, HbA1C, Glucose, CRP, TG, IDLp)

# Age, HbA1c, Glucose and CRP: 711 uniform draws between fixed lower and upper
# bounds, rounded to two decimals.
# NOTE(review): the section heading calls these skewed, but runif() yields a
# flat distribution -- confirm this is intended.
Age <- round(runif(n = 711, min = 40, max = 79), digits = 2)
hist(Age)

HbA1c <- round(runif(n = 711, min = 3.9, max = 10.1), digits = 2)
hist(HbA1c)

Glucose <- round(runif(n = 711, min = 61, max = 263), digits = 2)
CRP <- round(runif(n = 711, min = 0, max = 65.3), digits = 2)

###
# Target median and quartiles for TG
med <- 104
q1 <- 76
q3 <- 150

# Lognormal parameters: meanlog is the log of the median; sdlog is the
# log-scale interquartile range divided by twice the normal 75th percentile.
mu <- log(med)
sigma <- local({
  log_iqr <- log(q3) - log(q1)
  log_iqr / (2 * qnorm(p = 0.75))
})

# Draw the right-skewed sample; the seed is fixed right before the draw so the
# values are reproducible.
set.seed(seed = 2019)
TG <- rlnorm(n = 711, meanlog = mu, sdlog = sigma)

# Inspect the five-number summary and mean of the sample
summary(TG)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   26.68   71.15   97.07  112.51  139.73  538.55

# In this example, we first set the desired median and quartiles using the variables
# med, q1, and q3. To compute the parameters of a lognormal distribution
# from them, we use the formulas
# mu = log(med) and sigma = (log(q3) - log(q1)) / (2 * qnorm(0.75)).
# qnorm(0.75) is the 75th percentile of the standard normal distribution (about 0.674).
# On the log scale the interquartile range of a normal distribution equals 2 * qnorm(0.75) * sigma,
# so dividing the log-scale interquartile range by 2 * qnorm(0.75) gives sigma.
# Note that this matches the median and the log-scale interquartile range exactly, but not both
# quartiles: the implied quartiles are about 74 and 146 rather than 76 and 150,
# because log(med) is not midway between log(q1) and log(q3).
# Finally, we generate a sample of 711 observations
# from the lognormal distribution using the rlnorm() function,
# with the meanlog parameter set to mu and the sdlog parameter set to sigma.
# We then check the summary statistics of the generated distribution using the summary() function.
# The resulting distribution is right-skewed.

# IDLp: 711 unrounded uniform draws between 0 and 654, then a histogram of them.
IDLp <- runif(n = 711, min = 0, max = 654)
hist(IDLp)

2. Categorical variables (Smsts, Drinksts, Hypertension, DM, Dyslip)

2.1 For smoking status

# Smoking status: fixed group sizes derived from target proportions.

# Number of participants
n <- 711

# Target proportions for current and ex-smokers. The never-smoker share is
# derived as the remainder so the three proportions always sum to 1 (the
# previously hard-coded 0.1965 made them sum to 1.0005).
current_pct <- 0.295
ex_pct <- 0.509
never_pct <- 1 - current_pct - ex_pct

# Group sizes: round the first two and give the remainder to "never" so the
# counts add up to exactly n.
current_count <- round(current_pct * n)
ex_count <- round(ex_pct * n)
never_count <- n - current_count - ex_count

# Labels are laid out in consecutive runs (all "current", then "ex", then
# "never"); the vector is not shuffled.
smsts <- rep(
  c("current", "ex", "never"),
  times = c(current_count, ex_count, never_count)
)

# Check frequency table
table(smsts)

## smsts
## current      ex   never 
##     210     362     139

2.2 For Drinksts

# Drinking status: 711 independent draws with target shares of
# 79.18% current, 5.2% ex and 15.62% never.
groups <- c("current", "ex", "never")
drinksts <- sample(
  x = groups, size = 711, replace = TRUE,  # TRUE, not the reassignable T
  prob = c(79.18, 5.2, 15.62) / 100
)
table(drinksts)

## drinksts
## current      ex   never 
##     567      33     111

2.3 Hypertension

# Hypertension indicator: 711 independent draws, "Yes" with weight 52.6 and
# "No" with weight 47.4 (sample() treats prob as weights, so percentages work).
hypgroup <- c("Yes", "No")
Hyper <- sample(
  x = hypgroup, size = 711, replace = TRUE,  # TRUE, not the reassignable T
  prob = c(52.6, 47.4)
)
table(Hyper)

## Hyper
##  No Yes 
## 347 364

2.4 Diabetes mellitus

# Diabetes mellitus indicator: 711 independent draws, "Yes" with weight 21.8
# and "No" with weight 78.2 (sample() treats prob as weights).
dmgroup <- c("Yes", "No")
DM <- sample(
  x = dmgroup, size = 711, replace = TRUE,  # TRUE, not the reassignable T
  prob = c(21.8, 78.2)
)
table(DM)

## DM
##  No Yes 
## 557 154

2.5 Dyslipidemia

# Dyslipidemia indicator: 711 independent draws, "Yes" with weight 55.8 and
# "No" with weight 44.2 (sample() treats prob as weights).
# Samples from this section's own label vector (dysgroup) instead of borrowing
# dmgroup from the diabetes section.
dysgroup <- c("Yes", "No")
Dyslip <- sample(
  x = dysgroup, size = 711, replace = TRUE,  # TRUE, not the reassignable T
  prob = c(55.8, 44.2)
)
table(Dyslip)

## Dyslip
##  No Yes 
## 332 379

2.6 ICAS

# ICAS indicator: 711 independent draws, "Yes" with weight 28.8 and "No" with
# weight 71.2 (sample() treats prob as weights).
# Samples from this section's own label vector (ICASgroup) instead of
# borrowing dmgroup from the diabetes section.
ICASgroup <- c("Yes", "No")
ICAS <- sample(
  x = ICASgroup, size = 711, replace = TRUE,  # TRUE, not the reassignable T
  prob = c(28.8, 71.2)
)
table(ICAS)

## ICAS
##  No Yes 
## 505 206

3. create data set

# Assemble the simulated variables into one data frame, one column per
# variable (column names are taken from the variable names).
# ICAS, simulated in section 2.6, is included as the last column; it was
# previously generated but left out of the data set.
data <- data.frame(
  BMI, SBP, DBP, eGFR, Tchol, HDLc, LDLc, LDLp, LargeLDLp, smallLDLp,
  Age, HbA1c, Glucose, CRP, TG, IDLp,
  smsts, drinksts, Hyper, DM, Dyslip, ICAS
)

Simulation data set

Thien

2023-02-18

Create simulation dataset for ICAS project

1. Numerical

1.1 Normal distribution (BMI, SBP, DBP, eGFR, Tchol, HDLc, LDLc, LDLp, LargeLDLp, smallLDLp)

1.2 Skewness (age, HbA1C, Glucose, CRP, TG, IDLp)

2. Categorical variables (Smsts, Drinksts, Hypertension, DM, Dyslip)

2.1 For smoking status

2.2 For Drinksts

2.3 Hypertension

2.4 Diabetes mellitus

2.5 Dyslipidemia

2.6 ICAS

3. create data set