Learning from CGPT

#Creating a dataset

set.seed(123)

n <- 200

data <- data.frame(
  id = 1:n,
  age = round(rnorm(n, 55, 10)),
  gender = factor(sample(c("Male", "Female"), n, replace = TRUE)),
  bmi = round(rnorm(n, 27, 4), 1),
  treatment = factor(sample(c("Drug", "Placebo"), n, replace = TRUE)),
  diabetes = factor(sample(c("Yes", "No"), n, replace = TRUE, prob = c(0.3, 0.7))),
  smoker = factor(sample(c("Yes", "No"), n, replace = TRUE, prob = c(0.25, 0.75)))
)

# Baseline SBP
data$sbp_baseline <- round(rnorm(n, 150, 15))

# Treatment effect (Drug reduces BP more)
data$sbp_3m <- data$sbp_baseline -
  ifelse(data$treatment == "Drug",
         rnorm(n, 15, 5),
         rnorm(n, 5, 5))

# Survival data
data$time_to_event <- round(rexp(n, 0.05), 1)
data$event <- rbinom(n, 1, 0.3)

head(data)

##   id age gender  bmi treatment diabetes smoker sbp_baseline   sbp_3m
## 1  1  49   Male 24.1      Drug      Yes     No          155 144.9790
## 2  2  53   Male 24.0   Placebo      Yes     No          140 138.7876
## 3  3  71   Male 23.2   Placebo       No     No          163 153.7424
## 4  4  56   Male 22.8      Drug      Yes     No          167 152.6609
## 5  5  56   Male 25.3   Placebo       No     No          154 145.8488
## 6  6  72   Male 28.3      Drug      Yes    Yes          152 131.7971
##   time_to_event event
## 1          29.9     0
## 2           2.3     0
## 3          14.1     0
## 4          45.1     0
## 5          10.5     0
## 6           4.3     0

str(data)

## 'data.frame':    200 obs. of  11 variables:
##  $ id           : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ age          : num  49 53 71 56 56 72 60 42 48 51 ...
##  $ gender       : Factor w/ 2 levels "Female","Male": 2 2 2 2 2 2 1 2 1 2 ...
##  $ bmi          : num  24.1 24 23.2 22.8 25.3 28.3 18.9 27.8 31.9 35.2 ...
##  $ treatment    : Factor w/ 2 levels "Drug","Placebo": 1 2 2 1 2 1 1 2 2 1 ...
##  $ diabetes     : Factor w/ 2 levels "No","Yes": 2 2 1 2 1 2 1 1 2 1 ...
##  $ smoker       : Factor w/ 2 levels "No","Yes": 1 1 1 1 1 2 1 2 1 2 ...
##  $ sbp_baseline : num  155 140 163 167 154 152 149 182 154 148 ...
##  $ sbp_3m       : num  145 139 154 153 146 ...
##  $ time_to_event: num  29.9 2.3 14.1 45.1 10.5 4.3 20.5 8.2 5.4 13.1 ...
##  $ event        : int  0 0 0 0 0 0 0 0 0 0 ...

summary(data)

##        id              age           gender         bmi          treatment  
##  Min.   :  1.00   Min.   :32.00   Female:107   Min.   :16.40   Drug   :101  
##  1st Qu.: 50.75   1st Qu.:49.00   Male  : 93   1st Qu.:24.70   Placebo: 99  
##  Median :100.50   Median :54.00                Median :27.45                
##  Mean   :100.50   Mean   :54.91                Mean   :27.14                
##  3rd Qu.:150.25   3rd Qu.:61.00                3rd Qu.:29.90                
##  Max.   :200.00   Max.   :87.00                Max.   :37.30                
##  diabetes  smoker     sbp_baseline       sbp_3m       time_to_event  
##  No :146   No :149   Min.   :112.0   Min.   : 83.01   Min.   : 0.00  
##  Yes: 54   Yes: 51   1st Qu.:140.0   1st Qu.:129.44   1st Qu.: 6.05  
##                      Median :150.5   Median :141.09   Median :12.80  
##                      Mean   :150.6   Mean   :140.46   Mean   :18.39  
##                      3rd Qu.:160.0   3rd Qu.:151.94   3rd Qu.:27.50  
##                      Max.   :190.0   Max.   :178.45   Max.   :85.20  
##      event     
##  Min.   :0.00  
##  1st Qu.:0.00  
##  Median :0.00  
##  Mean   :0.29  
##  3rd Qu.:1.00  
##  Max.   :1.00

#Event is an integer. Convert to factor

data$event <- factor(data$event, levels = c(0,1))

#Descriptive statistics #Get mean and SD of age in two different ways

mean(data$age)

## [1] 54.915

sd(data$age)

## [1] 9.443712

summary(data$age)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   32.00   49.00   54.00   54.91   61.00   87.00

#Categorical data #Information about gender, treatment and event

table(data$gender)

## 
## Female   Male 
##    107     93

#or for proportion:
prop.table(table(data$gender))

## 
## Female   Male 
##  0.535  0.465

#For proportion of participants who got treatment
prop.table(table(data$treatment))

## 
##    Drug Placebo 
##   0.505   0.495

#For proportion of participants who had an event
prop.table(table(data$event))

## 
##    0    1 
## 0.71 0.29

#To calculate BP reduction

data$bp_reduction <- data$sbp_baseline - data$sbp_3m
summary(data$bp_reduction)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  -5.823   5.337  10.429  10.099  15.276  28.987

#Now to compare with treatment group

aggregate(bp_reduction ~ treatment, data, mean)

##   treatment bp_reduction
## 1      Drug    15.244082
## 2   Placebo     4.850657

#Now to find if this difference is significant

t.test(bp_reduction ~ treatment, data = data)

## 
##  Welch Two Sample t-test
## 
## data:  bp_reduction by treatment
## t = 15.485, df = 196.07, p-value < 2.2e-16
## alternative hypothesis: true difference in means between group Drug and group Placebo is not equal to 0
## 95 percent confidence interval:
##   9.069703 11.717146
## sample estimates:
##    mean in group Drug mean in group Placebo 
##             15.244082              4.850657

#Is diabetes associated with cardiovascular events?

table(data$diabetes, data$event)

##      
##         0   1
##   No  103  43
##   Yes  39  15

chisq.test(table(data$diabetes, data$event))

## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  table(data$diabetes, data$event)
## X-squared = 0.003154, df = 1, p-value = 0.9552

#What predicts BP reduction?

model1 <- lm(bp_reduction ~ age + gender + treatment + bmi, data = data)
summary(model1)

## 
## Call:
## lm(formula = bp_reduction ~ age + gender + treatment + bmi, data = data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -10.9061  -3.4451   0.1655   2.9688  13.2712 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       14.48804    3.27940   4.418 1.65e-05 ***
## age                0.04618    0.03598   1.284    0.201    
## genderMale        -0.40752    0.67574  -0.603    0.547    
## treatmentPlacebo -10.42895    0.67764 -15.390  < 2e-16 ***
## bmi               -0.05797    0.08406  -0.690    0.491    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.744 on 195 degrees of freedom
## Multiple R-squared:  0.5546, Adjusted R-squared:  0.5455 
## F-statistic: 60.71 on 4 and 195 DF,  p-value: < 2.2e-16

#What predicts cardiovascular event?

model2 <- glm(event ~ age + diabetes + smoker + treatment, data = data, family = binomial)
summary(model2)

## 
## Call:
## glm(formula = event ~ age + diabetes + smoker + treatment, family = binomial, 
##     data = data)
## 
## Coefficients:
##                   Estimate Std. Error z value Pr(>|z|)
## (Intercept)      -0.512135   0.944657  -0.542    0.588
## age              -0.003171   0.016663  -0.190    0.849
## diabetesYes      -0.083863   0.355609  -0.236    0.814
## smokerYes        -0.231735   0.368716  -0.628    0.530
## treatmentPlacebo -0.270373   0.313689  -0.862    0.389
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 240.86  on 199  degrees of freedom
## Residual deviance: 239.63  on 195  degrees of freedom
## AIC: 249.63
## 
## Number of Fisher Scoring iterations: 4

#Convert coefficients to odds ratio

exp(coef(model2))

##      (Intercept)              age      diabetesYes        smokerYes 
##        0.5992148        0.9968343        0.9195571        0.7931565 
## treatmentPlacebo 
##        0.7630949

#not sure how to interpret this.

#Now to make a Kaplam-Meier curve

library(survival)
fit <- survfit(Surv(time_to_event, event) ~ treatment, data = data)
plot(fit, col = c("red", "blue"))

# Codes for Log-rank test and Cox regression did not work.

Learning from CGPT_1

2026-04-03