##Checking working directory
getwd()
## [1] "/Users/aruvipoomali/rsconnect/Kaggle_brainstim_dataset"
##Loading the CSV file
data <- read.csv("Allergic_Rhinitis.csv")
##Three questions: ##Does tDCS improve symptoms vs sham? ##Do biomarkers change with treatment? ##Can you predict responders? ##Load libraries ##install janitor
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.2.1 ✔ readr 2.2.0
## ✔ forcats 1.0.1 ✔ stringr 1.6.0
## ✔ ggplot2 4.0.2 ✔ tibble 3.3.1
## ✔ lubridate 1.9.5 ✔ tidyr 1.3.2
## ✔ purrr 1.2.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(data.table)
##
## Attaching package: 'data.table'
##
## The following objects are masked from 'package:lubridate':
##
## hour, isoweek, isoyear, mday, minute, month, quarter, second, wday,
## week, yday, year
##
## The following objects are masked from 'package:dplyr':
##
## between, first, last
##
## The following object is masked from 'package:purrr':
##
## transpose
library(janitor)
##
## Attaching package: 'janitor'
##
## The following objects are masked from 'package:stats':
##
## chisq.test, fisher.test
##Clean names
df <- fread("Allergic_Rhinitis.csv") |>
clean_names()
##Create change variables
df <- df |>
mutate(
il6_change = post_il6 - pre_il6,
tnf_change = post_tnf_alpha - pre_tnf_alpha,
ige_change = post_ig_e - pre_ig_e,
symptom_change = post_t_dcs_symptom_score - pre_symptom_score
)
##Check randomization balance ##We do this by making a table 1 and comparing both arms. ##Here mean age on both arms and percentage of men in both arms is taken.
df |>
group_by(group) |>
summarise(
age_mean = mean(age),
male_pct = mean(gender =="M")
)
## # A tibble: 2 × 3
## group age_mean male_pct
## <chr> <dbl> <dbl>
## 1 Sham 39.3 0.536
## 2 tDCS 39.4 0.548
##gender == “M” → returns TRUE/FALSE ##TRUE = 1, FALSE = 0 ##Mean of that = proportion of males. So this gives percentage of males (as a proportion)
##Primary outcome ##Now response is given as a binary outcome. 0 and 1. ##So the way to compare two arms is logistic regression model. #We are modelling response as a function of group, gender, and age.
model <- glm(response ~ group + age + gender,
data = df,
family = binomial)
summary(model)
##
## Call:
## glm(formula = response ~ group + age + gender, family = binomial,
## data = df)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.188974 0.380886 -3.122 0.0018 **
## grouptDCS 1.306645 0.217430 6.009 1.86e-09 ***
## age -0.012908 0.008507 -1.517 0.1292
## genderM -0.001272 0.208678 -0.006 0.9951
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 592.95 on 499 degrees of freedom
## Residual deviance: 551.48 on 496 degrees of freedom
## AIC: 559.48
##
## Number of Fisher Scoring iterations: 4
##Interpretation of results: ##Estimate = 1.3066 ##p < 0.000000001 ## Convert to odds ratio:
exp(1.306645)
## [1] 3.69376
##Interpretation of results: ##tDCS group had 3.7 times higher odds of response ##P values for age and gender are not significant. ##Odds ratio for age can also be calculated if you are interested. ##Here age and gender do not confound the effect of treatment. ##Odds ratio of intercept will give odds of response for reference group. ##can calculate odds ratio confidence intervals.
exp(cbind(OR = coef(model), confint(model)))
## Waiting for profiling to be done...
## OR 2.5 % 97.5 %
## (Intercept) 0.3045335 0.1424772 0.6357274
## grouptDCS 3.6937589 2.4295365 5.7069036
## age 0.9871746 0.9707821 1.0037534
## genderM 0.9987285 0.6636545 1.5055426
##The above command converts the model into a table of odds ratios with confidence intervals.
##Now to test improvement in symptoms. While pre and post scores are available, you’d think paired t test is better. But what we are comparing is change in scores between each arm. The scores have to be continuous and normal. T-test works if the assumptions are true. ##Another test to do is ANCOVA. ANCOVA is better cos it adjusts for baseline differences. But it doesn’t analyse for change in score. It analyses post scores adjusted for various covariates.
# Fit ANCOVA model
model_ancova <- lm(post_t_dcs_symptom_score ~ group + pre_symptom_score + age + gender, data = df)
# View results
summary(model_ancova)
##
## Call:
## lm(formula = post_t_dcs_symptom_score ~ group + pre_symptom_score +
## age + gender, data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.8520 -1.0977 0.0125 1.2373 3.3600
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.836882 0.432823 15.796 <2e-16 ***
## grouptDCS -3.304576 0.136363 -24.234 <2e-16 ***
## pre_symptom_score 0.039361 0.048185 0.817 0.414
## age -0.001152 0.005607 -0.205 0.837
## genderM -0.041903 0.136934 -0.306 0.760
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.524 on 495 degrees of freedom
## Multiple R-squared: 0.5437, Adjusted R-squared: 0.54
## F-statistic: 147.4 on 4 and 495 DF, p-value: < 2.2e-16
##Checking key ANCOVA assumption
model_check <- lm(post_t_dcs_symptom_score ~ group * pre_symptom_score, data = df)
summary(model_check)
##
## Call:
## lm(formula = post_t_dcs_symptom_score ~ group * pre_symptom_score,
## data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.892 -1.053 -0.044 1.185 3.414
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 7.078975 0.512104 13.823 < 2e-16 ***
## grouptDCS -3.874588 0.692552 -5.595 3.66e-08 ***
## pre_symptom_score -0.004372 0.071002 -0.062 0.951
## grouptDCS:pre_symptom_score 0.080758 0.096302 0.839 0.402
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.522 on 496 degrees of freedom
## Multiple R-squared: 0.5442, Adjusted R-squared: 0.5414
## F-statistic: 197.4 on 3 and 496 DF, p-value: < 2.2e-16
##Group:prescore slope should not be significantly different.
##Now to visualize:
ggplot(df, aes(x = group, y = symptom_change)) +
geom_boxplot() +
labs(title = "Symptom Improvement by Group")
#Comparing biomarkers
t.test(il6_change ~ group, data = df)
##
## Welch Two Sample t-test
##
## data: il6_change by group
## t = 10.551, df = 496.83, p-value < 2.2e-16
## alternative hypothesis: true difference in means between group Sham and group tDCS is not equal to 0
## 95 percent confidence interval:
## 0.9752236 1.4215590
## sample estimates:
## mean in group Sham mean in group tDCS
## 0.1172989 -1.0810924
t.test(tnf_change ~ group, data = df)
##
## Welch Two Sample t-test
##
## data: tnf_change by group
## t = 9.8951, df = 497.88, p-value < 2.2e-16
## alternative hypothesis: true difference in means between group Sham and group tDCS is not equal to 0
## 95 percent confidence interval:
## 0.902679 1.349953
## sample estimates:
## mean in group Sham mean in group tDCS
## 0.1608728 -0.9654431
t.test(ige_change ~ group, data = df)
##
## Welch Two Sample t-test
##
## data: ige_change by group
## t = 3.6641, df = 497.54, p-value = 0.0002749
## alternative hypothesis: true difference in means between group Sham and group tDCS is not equal to 0
## 95 percent confidence interval:
## 9.015895 29.863991
## sample estimates:
## mean in group Sham mean in group tDCS
## 2.191804 -17.248139
#Visualising biomarker change
ggplot(df, aes(x = group, y = il6_change)) +
geom_boxplot()
##Finding correlation between biomarker change and symptom
cor(df$il6_change, df$symptom_change)
## [1] 0.3022947
cor(df$tnf_change, df$symptom_change)
## [1] 0.2958352
cor(df$ige_change, df$symptom_change)
## [1] 0.1453829
##GPT also gave code for creating a model that would predict responders. Not using it for now.