##Checking working directory

getwd()
## [1] "/Users/aruvipoomali/rsconnect/Kaggle_brainstim_dataset"

##Loading the CSV file

data <- read.csv("Allergic_Rhinitis.csv")

##Three questions: ##Does tDCS improve symptoms vs sham? ##Do biomarkers change with treatment? ##Can you predict responders? ##Load libraries ##install janitor

library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.2.1     ✔ readr     2.2.0
## ✔ forcats   1.0.1     ✔ stringr   1.6.0
## ✔ ggplot2   4.0.2     ✔ tibble    3.3.1
## ✔ lubridate 1.9.5     ✔ tidyr     1.3.2
## ✔ purrr     1.2.1     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(data.table)
## 
## Attaching package: 'data.table'
## 
## The following objects are masked from 'package:lubridate':
## 
##     hour, isoweek, isoyear, mday, minute, month, quarter, second, wday,
##     week, yday, year
## 
## The following objects are masked from 'package:dplyr':
## 
##     between, first, last
## 
## The following object is masked from 'package:purrr':
## 
##     transpose
library(janitor)
## 
## Attaching package: 'janitor'
## 
## The following objects are masked from 'package:stats':
## 
##     chisq.test, fisher.test

##Clean names

df <- fread("Allergic_Rhinitis.csv") |> 
  clean_names()

##Create change variables

df <- df |> 
  mutate(
    il6_change = post_il6 - pre_il6,
    tnf_change = post_tnf_alpha - pre_tnf_alpha,
    ige_change = post_ig_e - pre_ig_e,
    symptom_change = post_t_dcs_symptom_score - pre_symptom_score
  )

##Check randomization balance ##We do this by making a table 1 and comparing both arms. ##Here mean age on both arms and percentage of men in both arms is taken.

df |> 
  group_by(group) |> 
  summarise(
    age_mean = mean(age),
    male_pct = mean(gender =="M")
  )
## # A tibble: 2 × 3
##   group age_mean male_pct
##   <chr>    <dbl>    <dbl>
## 1 Sham      39.3    0.536
## 2 tDCS      39.4    0.548

##gender == “M” → returns TRUE/FALSE ##TRUE = 1, FALSE = 0 ##Mean of that = proportion of males. So this gives percentage of males (as a proportion)

##Primary outcome ##Now response is given as a binary outcome. 0 and 1. ##So the way to compare two arms is logistic regression model. #We are modelling response as a function of group, gender, and age.

model <- glm(response ~ group + age + gender,
             data = df,
             family = binomial)

summary(model)
## 
## Call:
## glm(formula = response ~ group + age + gender, family = binomial, 
##     data = df)
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -1.188974   0.380886  -3.122   0.0018 ** 
## grouptDCS    1.306645   0.217430   6.009 1.86e-09 ***
## age         -0.012908   0.008507  -1.517   0.1292    
## genderM     -0.001272   0.208678  -0.006   0.9951    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 592.95  on 499  degrees of freedom
## Residual deviance: 551.48  on 496  degrees of freedom
## AIC: 559.48
## 
## Number of Fisher Scoring iterations: 4

##Interpretation of results: ##Estimate = 1.3066 ##p < 0.000000001 ## Convert to odds ratio:

exp(1.306645)
## [1] 3.69376

##Interpretation of results: ##tDCS group had 3.7 times higher odds of response ##P values for age and gender are not significant. ##Odds ratio for age can also be calculated if you are interested. ##Here age and gender do not confound the effect of treatment. ##Odds ratio of intercept will give odds of response for reference group. ##can calculate odds ratio confidence intervals.

exp(cbind(OR = coef(model), confint(model)))
## Waiting for profiling to be done...
##                    OR     2.5 %    97.5 %
## (Intercept) 0.3045335 0.1424772 0.6357274
## grouptDCS   3.6937589 2.4295365 5.7069036
## age         0.9871746 0.9707821 1.0037534
## genderM     0.9987285 0.6636545 1.5055426

##The above command converts the model into a table of odds ratios with confidence intervals.

##Now to test improvement in symptoms. While pre and post scores are available, you’d think paired t test is better. But what we are comparing is change in scores between each arm. The scores have to be continuous and normal. T-test works if the assumptions are true. ##Another test to do is ANCOVA. ANCOVA is better cos it adjusts for baseline differences. But it doesn’t analyse for change in score. It analyses post scores adjusted for various covariates.

# Fit ANCOVA model
model_ancova <- lm(post_t_dcs_symptom_score ~ group + pre_symptom_score + age + gender, data = df)

# View results
summary(model_ancova)
## 
## Call:
## lm(formula = post_t_dcs_symptom_score ~ group + pre_symptom_score + 
##     age + gender, data = df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.8520 -1.0977  0.0125  1.2373  3.3600 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)        6.836882   0.432823  15.796   <2e-16 ***
## grouptDCS         -3.304576   0.136363 -24.234   <2e-16 ***
## pre_symptom_score  0.039361   0.048185   0.817    0.414    
## age               -0.001152   0.005607  -0.205    0.837    
## genderM           -0.041903   0.136934  -0.306    0.760    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.524 on 495 degrees of freedom
## Multiple R-squared:  0.5437, Adjusted R-squared:   0.54 
## F-statistic: 147.4 on 4 and 495 DF,  p-value: < 2.2e-16

##Checking key ANCOVA assumption

model_check <- lm(post_t_dcs_symptom_score ~ group * pre_symptom_score, data = df)
summary(model_check)
## 
## Call:
## lm(formula = post_t_dcs_symptom_score ~ group * pre_symptom_score, 
##     data = df)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -2.892 -1.053 -0.044  1.185  3.414 
## 
## Coefficients:
##                              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                  7.078975   0.512104  13.823  < 2e-16 ***
## grouptDCS                   -3.874588   0.692552  -5.595 3.66e-08 ***
## pre_symptom_score           -0.004372   0.071002  -0.062    0.951    
## grouptDCS:pre_symptom_score  0.080758   0.096302   0.839    0.402    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.522 on 496 degrees of freedom
## Multiple R-squared:  0.5442, Adjusted R-squared:  0.5414 
## F-statistic: 197.4 on 3 and 496 DF,  p-value: < 2.2e-16

##Group:prescore slope should not be significantly different.

##Now to visualize:

ggplot(df, aes(x = group, y = symptom_change)) +
  geom_boxplot() +
  labs(title = "Symptom Improvement by Group")

#Comparing biomarkers

t.test(il6_change ~ group, data = df)
## 
##  Welch Two Sample t-test
## 
## data:  il6_change by group
## t = 10.551, df = 496.83, p-value < 2.2e-16
## alternative hypothesis: true difference in means between group Sham and group tDCS is not equal to 0
## 95 percent confidence interval:
##  0.9752236 1.4215590
## sample estimates:
## mean in group Sham mean in group tDCS 
##          0.1172989         -1.0810924
t.test(tnf_change ~ group, data = df)
## 
##  Welch Two Sample t-test
## 
## data:  tnf_change by group
## t = 9.8951, df = 497.88, p-value < 2.2e-16
## alternative hypothesis: true difference in means between group Sham and group tDCS is not equal to 0
## 95 percent confidence interval:
##  0.902679 1.349953
## sample estimates:
## mean in group Sham mean in group tDCS 
##          0.1608728         -0.9654431
t.test(ige_change ~ group, data = df)
## 
##  Welch Two Sample t-test
## 
## data:  ige_change by group
## t = 3.6641, df = 497.54, p-value = 0.0002749
## alternative hypothesis: true difference in means between group Sham and group tDCS is not equal to 0
## 95 percent confidence interval:
##   9.015895 29.863991
## sample estimates:
## mean in group Sham mean in group tDCS 
##           2.191804         -17.248139

#Visualising biomarker change

ggplot(df, aes(x = group, y = il6_change)) +
  geom_boxplot()

##Finding correlation between biomarker change and symptom

cor(df$il6_change, df$symptom_change)
## [1] 0.3022947
cor(df$tnf_change, df$symptom_change)
## [1] 0.2958352
cor(df$ige_change, df$symptom_change)
## [1] 0.1453829

##GPT also gave code for creating a model that would predict responders. Not using it for now.