#Creating a dataset
set.seed(123)
n <- 200
data <- data.frame(
id = 1:n,
age = round(rnorm(n, 55, 10)),
gender = factor(sample(c("Male", "Female"), n, replace = TRUE)),
bmi = round(rnorm(n, 27, 4), 1),
treatment = factor(sample(c("Drug", "Placebo"), n, replace = TRUE)),
diabetes = factor(sample(c("Yes", "No"), n, replace = TRUE, prob = c(0.3, 0.7))),
smoker = factor(sample(c("Yes", "No"), n, replace = TRUE, prob = c(0.25, 0.75)))
)
# Baseline SBP
data$sbp_baseline <- round(rnorm(n, 150, 15))
# Treatment effect (Drug reduces BP more)
data$sbp_3m <- data$sbp_baseline -
ifelse(data$treatment == "Drug",
rnorm(n, 15, 5),
rnorm(n, 5, 5))
# Survival data
data$time_to_event <- round(rexp(n, 0.05), 1)
data$event <- rbinom(n, 1, 0.3)
head(data)
## id age gender bmi treatment diabetes smoker sbp_baseline sbp_3m
## 1 1 49 Male 24.1 Drug Yes No 155 144.9790
## 2 2 53 Male 24.0 Placebo Yes No 140 138.7876
## 3 3 71 Male 23.2 Placebo No No 163 153.7424
## 4 4 56 Male 22.8 Drug Yes No 167 152.6609
## 5 5 56 Male 25.3 Placebo No No 154 145.8488
## 6 6 72 Male 28.3 Drug Yes Yes 152 131.7971
## time_to_event event
## 1 29.9 0
## 2 2.3 0
## 3 14.1 0
## 4 45.1 0
## 5 10.5 0
## 6 4.3 0
str(data)
## 'data.frame': 200 obs. of 11 variables:
## $ id : int 1 2 3 4 5 6 7 8 9 10 ...
## $ age : num 49 53 71 56 56 72 60 42 48 51 ...
## $ gender : Factor w/ 2 levels "Female","Male": 2 2 2 2 2 2 1 2 1 2 ...
## $ bmi : num 24.1 24 23.2 22.8 25.3 28.3 18.9 27.8 31.9 35.2 ...
## $ treatment : Factor w/ 2 levels "Drug","Placebo": 1 2 2 1 2 1 1 2 2 1 ...
## $ diabetes : Factor w/ 2 levels "No","Yes": 2 2 1 2 1 2 1 1 2 1 ...
## $ smoker : Factor w/ 2 levels "No","Yes": 1 1 1 1 1 2 1 2 1 2 ...
## $ sbp_baseline : num 155 140 163 167 154 152 149 182 154 148 ...
## $ sbp_3m : num 145 139 154 153 146 ...
## $ time_to_event: num 29.9 2.3 14.1 45.1 10.5 4.3 20.5 8.2 5.4 13.1 ...
## $ event : int 0 0 0 0 0 0 0 0 0 0 ...
summary(data)
## id age gender bmi treatment
## Min. : 1.00 Min. :32.00 Female:107 Min. :16.40 Drug :101
## 1st Qu.: 50.75 1st Qu.:49.00 Male : 93 1st Qu.:24.70 Placebo: 99
## Median :100.50 Median :54.00 Median :27.45
## Mean :100.50 Mean :54.91 Mean :27.14
## 3rd Qu.:150.25 3rd Qu.:61.00 3rd Qu.:29.90
## Max. :200.00 Max. :87.00 Max. :37.30
## diabetes smoker sbp_baseline sbp_3m time_to_event
## No :146 No :149 Min. :112.0 Min. : 83.01 Min. : 0.00
## Yes: 54 Yes: 51 1st Qu.:140.0 1st Qu.:129.44 1st Qu.: 6.05
## Median :150.5 Median :141.09 Median :12.80
## Mean :150.6 Mean :140.46 Mean :18.39
## 3rd Qu.:160.0 3rd Qu.:151.94 3rd Qu.:27.50
## Max. :190.0 Max. :178.45 Max. :85.20
## event
## Min. :0.00
## 1st Qu.:0.00
## Median :0.00
## Mean :0.29
## 3rd Qu.:1.00
## Max. :1.00
#Event is an integer. Convert to factor
data$event <- factor(data$event, levels = c(0,1))
#Descriptive statistics #Get mean and SD of age in two different ways
mean(data$age)
## [1] 54.915
sd(data$age)
## [1] 9.443712
summary(data$age)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 32.00 49.00 54.00 54.91 61.00 87.00
#Categorical data #Information about gender, treatment and event
table(data$gender)
##
## Female Male
## 107 93
#or for proportion:
prop.table(table(data$gender))
##
## Female Male
## 0.535 0.465
#For proportion of participants who got treatment
prop.table(table(data$treatment))
##
## Drug Placebo
## 0.505 0.495
#For proportion of participants who had an event
prop.table(table(data$event))
##
## 0 1
## 0.71 0.29
#To calculate BP reduction
data$bp_reduction <- data$sbp_baseline - data$sbp_3m
summary(data$bp_reduction)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -5.823 5.337 10.429 10.099 15.276 28.987
#Now to compare with treatment group
aggregate(bp_reduction ~ treatment, data, mean)
## treatment bp_reduction
## 1 Drug 15.244082
## 2 Placebo 4.850657
#Now to find if this difference is significant
t.test(bp_reduction ~ treatment, data = data)
##
## Welch Two Sample t-test
##
## data: bp_reduction by treatment
## t = 15.485, df = 196.07, p-value < 2.2e-16
## alternative hypothesis: true difference in means between group Drug and group Placebo is not equal to 0
## 95 percent confidence interval:
## 9.069703 11.717146
## sample estimates:
## mean in group Drug mean in group Placebo
## 15.244082 4.850657
#Is diabetes associated with cardiovascular events?
table(data$diabetes, data$event)
##
## 0 1
## No 103 43
## Yes 39 15
chisq.test(table(data$diabetes, data$event))
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: table(data$diabetes, data$event)
## X-squared = 0.003154, df = 1, p-value = 0.9552
#What predicts BP reduction?
model1 <- lm(bp_reduction ~ age + gender + treatment + bmi, data = data)
summary(model1)
##
## Call:
## lm(formula = bp_reduction ~ age + gender + treatment + bmi, data = data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -10.9061 -3.4451 0.1655 2.9688 13.2712
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 14.48804 3.27940 4.418 1.65e-05 ***
## age 0.04618 0.03598 1.284 0.201
## genderMale -0.40752 0.67574 -0.603 0.547
## treatmentPlacebo -10.42895 0.67764 -15.390 < 2e-16 ***
## bmi -0.05797 0.08406 -0.690 0.491
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.744 on 195 degrees of freedom
## Multiple R-squared: 0.5546, Adjusted R-squared: 0.5455
## F-statistic: 60.71 on 4 and 195 DF, p-value: < 2.2e-16
#What predicts cardiovascular event?
model2 <- glm(event ~ age + diabetes + smoker + treatment, data = data, family = binomial)
summary(model2)
##
## Call:
## glm(formula = event ~ age + diabetes + smoker + treatment, family = binomial,
## data = data)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.512135 0.944657 -0.542 0.588
## age -0.003171 0.016663 -0.190 0.849
## diabetesYes -0.083863 0.355609 -0.236 0.814
## smokerYes -0.231735 0.368716 -0.628 0.530
## treatmentPlacebo -0.270373 0.313689 -0.862 0.389
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 240.86 on 199 degrees of freedom
## Residual deviance: 239.63 on 195 degrees of freedom
## AIC: 249.63
##
## Number of Fisher Scoring iterations: 4
#Convert coefficients to odds ratio
exp(coef(model2))
## (Intercept) age diabetesYes smokerYes
## 0.5992148 0.9968343 0.9195571 0.7931565
## treatmentPlacebo
## 0.7630949
#not sure how to interpret this.
#Now to make a Kaplam-Meier curve
library(survival)
fit <- survfit(Surv(time_to_event, event) ~ treatment, data = data)
plot(fit, col = c("red", "blue"))
# Codes for Log-rank test and Cox regression did not work.