Call in the SAH segmentation dataset.

# Read the "Complete dataset" sheet of the SAH imaging workbook.
library(readxl)
SAH_imaging_database <- read_excel(
  path = "~/Documents/Job/Research /Bulters research/SAH imaging /Data/SAH imaging database.xlsx",
  sheet = "Complete dataset"
)
Warning: NA inserted for an unsupported date prior to 1900Warning: NA inserted for an unsupported date prior to 1900Warning: NA inserted for an unsupported date prior to 1900Warning: NA inserted for an unsupported date prior to 1900Warning: NA inserted for an unsupported date prior to 1900Warning: NA inserted for an unsupported date prior to 1900Warning: NA inserted for an unsupported date prior to 1900Warning: NA inserted for an unsupported date prior to 1900Warning: NA inserted for an unsupported date prior to 1900Warning: NA inserted for an unsupported date prior to 1900Warning: NA inserted for an unsupported date prior to 1900Warning: NA inserted for an unsupported date prior to 1900Warning: NA inserted for an unsupported date prior to 1900Warning: NA inserted for an unsupported date prior to 1900Warning: NA inserted for an unsupported date prior to 1900Warning: NA inserted for an unsupported date prior to 1900
# Open the data viewer only in an interactive session: View() is an
# interactive tool and can fail when the document is rendered in batch mode.
if (interactive()) {
  View(SAH_imaging_database)
}

# Working copy of the imported sheet used by the analyses that follow.
data_imaging <- SAH_imaging_database
Error in exists(cacheKey, where = .rs.WorkingDataEnv, inherits = FALSE) : 
  invalid first argument
Error in assign(cacheKey, frame, .rs.CachedDataEnv) : 
  attempt to use zero-length variable name

Cohort characteristic calculations.

Firstly, calculate the median and IQR from symptom onset to scan.

# Print the non-missing values of the time-to-scan column (result not stored).
na.omit(data_imaging$Time_until_scan_min)
 [1]  157  128  207  210  189 1165  113  163  769  340  147   13 1042  124  126  222  308   92   96  266 1200  117  148  347  934   96  652  513  144
[30]   90  637  208  276  294  533  129  155  998  330  337  149  125  244 1014  103  341  871  541  549  127  118   78  779  293  224  800  337  123
[59] 1256  225  482 1256  491  923  349  288  232 1351 1187  393  136  119 1320  261  385  728  113  175  371  117  119  671
attr(,"na.action")
 [1] 39 42 52 55 59 60 62 68 70 71 74 75 76 77 80
attr(,"class")
[1] "omit"
# Min, quartiles, median, mean, max and NA count of the time-to-scan column.
summary(data_imaging$Time_until_scan_min)
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
   13.0   130.8   271.0   411.9   547.0  1351.0      15 
271/60 # Median divided by 60 (minutes -> hours, assuming the column is in minutes as its name suggests)
[1] 4.516667
130/60 # 1st quartile divided by 60; NOTE(review): summary() above reports 130.8, so 130 is truncated
[1] 2.166667
547/60 # 3rd quartile divided by 60
[1] 9.116667

Baseline characteristics table. "Unknown" represents any missing data; 2 patients in the cohort did not have blood volumes collected.

library(gtsummary)
# Switch gtsummary to its compact theme for the tables built afterwards.
theme_gtsummary_compact()
Setting theme `Compact`
# Baseline characteristics table: keep the cohort columns of interest and
# summarise them with tbl_summary()'s default statistics.
data_imaging %>%
  select(
    "AGE", "SEX", "HTN", "ANEULOC", "SURGPROC", "Time_until_scan_min",
    "FISHGRDN", "SEBES", "Total blood", "SSV blood", "Non-SSV blood",
    "Total manual volume", "SSV Volume", "Ventricular volume",
    "Below SSV Volume", "WFNSGRDN"
  ) %>%
  tbl_summary()
Characteristic N = 97¹
AGE 55 (50, 62)
SEX
    Female 75 (77%)
    Male 22 (23%)
HTN
    N 70 (72%)
    Y 27 (28%)
ANEULOC
    Anterior Circulation 80 (84%)
    Posterior Circulation 15 (16%)
    Unknown 2
SURGPROC
    Clipping 22 (23%)
    Coiling 73 (75%)
    Not Done 2 (2.1%)
Time_until_scan_min 271 (131, 547)
    Unknown 15
FISHGRDN
    3 37 (38%)
    4 60 (62%)
SEBES
    0 63 (65%)
    1 15 (15%)
    2 9 (9.3%)
    3 7 (7.2%)
    4 3 (3.1%)
Total blood 20 (9, 35)
SSV blood 0.00 (0.00, 1.50)
    Unknown 2
Non-SSV blood 19 (9, 33)
    Unknown 2
Total manual volume 158 (90, 283)
SSV Volume 37 (20, 75)
Ventricular volume 39 (23, 62)
Below SSV Volume 110 (59, 223)
WFNSGRDN
    1 44 (45%)
    2 16 (16%)
    3 5 (5.2%)
    4 27 (28%)
    5 5 (5.2%)
¹ Median (IQR); n (%)

Blood volume distributions, to check for normality and see whether transformations are required.

# Blood variables: Total Blood, SSV Blood and Non-SSV Blood.
# Each column is pulled out into its own vector and drawn as a histogram.

Total_blood <- SAH_imaging_database[["Total blood"]]
hist(Total_blood)

SSV_Blood <- SAH_imaging_database[["SSV blood"]]
hist(SSV_Blood)

Non_SSV_Blood <- SAH_imaging_database[["Non-SSV blood"]]
hist(Non_SSV_Blood)


library(moments)
# Kernel density estimate of total blood volume. na.rm = TRUE matches the
# other density() calls in this analysis and stops the call failing should
# the column ever contain a missing value.
d_total_blood <- density(Total_blood, na.rm = TRUE)
# Sample skewness, ignoring missing values.
skewness(Total_blood, na.rm = TRUE)
[1] 0.7766232
# Draw the density curve; the subtitle is the hand-typed skewness value.
plot(
  d_total_blood,
  main = "Density plot Total Blood Volume (mls)",
  sub = "Skewness = 0.777"
)


# Kernel density estimate and sample skewness of SSV blood volume, both
# computed with missing values dropped.
d_SSV_blood <- density(x = SSV_Blood, na.rm = TRUE)
skewness(x = SSV_Blood, na.rm = TRUE)
[1] 3.526801
# Draw the density curve; the subtitle is the hand-typed skewness value.
plot(
  d_SSV_blood,
  main = "Density plot SSV Blood Volume (mls)",
  sub = "Skewness = 3.53"
)


# Kernel density estimate and sample skewness of non-SSV blood volume, both
# computed with missing values dropped.
d_Non_SSV_blood <- density(x = Non_SSV_Blood, na.rm = TRUE)
skewness(x = Non_SSV_Blood, na.rm = TRUE)
[1] 0.7286446
# Draw the density curve; the subtitle is the hand-typed skewness value.
plot(
  d_Non_SSV_blood,
  main = "Density plot Non-SSV Blood Volume (mls)",
  sub = "Skewness = 0.729"
)

NA
NA

CSF distributions to check for skewness


# CSF variables: SEBES, Ventricular volume, SSV volume, Below SSV Volume
# and total manual volume.
# Each column is pulled out into its own vector and drawn as a histogram.

SEBES <- SAH_imaging_database[["SEBES"]]
hist(SEBES)

Ventricle_CSF <- SAH_imaging_database[["Ventricular volume"]]
hist(Ventricle_CSF)

SSV_CSF <- SAH_imaging_database[["SSV Volume"]]
hist(SSV_CSF)

Below_SSV_CSF <- SAH_imaging_database[["Below SSV Volume"]]
hist(Below_SSV_CSF)

Total_CSF <- SAH_imaging_database[["Total manual volume"]]
hist(Total_CSF)


library(moments)
# Kernel density estimate of SEBES. na.rm = TRUE matches the other density()
# calls in this analysis and stops the call failing should the column ever
# contain a missing value.
d_SEBES <- density(SEBES, na.rm = TRUE)
# Sample skewness, ignoring missing values.
skewness(SEBES, na.rm = TRUE)
[1] 1.542407
# Draw the density curve; the subtitle is the hand-typed skewness value.
plot(
  d_SEBES,
  main = "Density plot SEBES",
  sub = "Skewness = 1.542"
)


# Kernel density estimate and sample skewness of ventricular CSF volume,
# both computed with missing values dropped.
d_Ventricle_CSF <- density(x = Ventricle_CSF, na.rm = TRUE)
skewness(x = Ventricle_CSF, na.rm = TRUE)
[1] 1.590469
# Draw the density curve; the subtitle is the hand-typed skewness value.
plot(
  d_Ventricle_CSF,
  main = "Density plot Ventricle CSF Volume (mls)",
  sub = "Skewness = 1.59"
)


# Kernel density estimate and sample skewness of SSV CSF volume, both
# computed with missing values dropped.
d_SSV_CSF <- density(x = SSV_CSF, na.rm = TRUE)
skewness(x = SSV_CSF, na.rm = TRUE)
[1] 1.431467
# Draw the density curve; the subtitle is the hand-typed skewness value.
plot(
  d_SSV_CSF,
  main = "Density plot SSV CSF Volume (mls)",
  sub = "Skewness = 1.43"
)


# Kernel density estimate and sample skewness of below-SSV CSF volume, both
# computed with missing values dropped.
d_Below_SSV_CSF <- density(x = Below_SSV_CSF, na.rm = TRUE)
skewness(x = Below_SSV_CSF, na.rm = TRUE)
[1] 0.8290886
# Draw the density curve; the subtitle is the hand-typed skewness value.
plot(
  d_Below_SSV_CSF,
  main = "Density plot Below SSV CSF Volume (mls)",
  sub = "Skewness = 0.829"
)


# Kernel density estimate and sample skewness of total CSF volume, both
# computed with missing values dropped.
d_Total_CSF <- density(x = Total_CSF, na.rm = TRUE)
skewness(x = Total_CSF, na.rm = TRUE)
[1] 0.899855
# Draw the density curve; the subtitle is the hand-typed skewness value.
plot(
  d_Total_CSF,
  main = "Density plot Total CSF Volume (mls)",
  sub = "Skewness = 0.9"
)

Decision made not to transform the variables because they are generally normally distributed, with the exception of SSV blood which is not a clinically important variable anyway.

The next step is to identify, through correlation matrices, how the imaging variables are related.


# For visualisation, add copies of the relevant columns under plot-friendly
# names.
SAH_imaging_database[["Age"]] <- SAH_imaging_database[["AGE"]]
SAH_imaging_database[["WFNS"]] <- SAH_imaging_database[["WFNSGRDN"]]
SAH_imaging_database[["Total_CSF"]] <- SAH_imaging_database[["Total manual volume"]]
SAH_imaging_database[["Ventricular_CSF"]] <- SAH_imaging_database[["Ventricular volume"]]
SAH_imaging_database[["SSV_CSF"]] <- SAH_imaging_database[["SSV Volume"]]

# Keep the six variables entering the correlation analysis and drop every
# row that has a missing value in any of them.
data_imaging_correlation <- na.omit(
  SAH_imaging_database[, c(
    "Age", "WFNS",
    "Total blood", "SSV_CSF", "Ventricular_CSF",
    "Total_CSF"
  )]
)

# Uninstall Hmisc from the default library.
# NOTE(review): this runs on every execution of the script; it looks like a
# one-off repair step — confirm whether it should stay.
remove.packages("Hmisc")
Removing package from ‘/Library/Frameworks/R.framework/Versions/4.3-arm64/Resources/library’
(as ‘lib’ is unspecified)
# Reinstall Hmisc together with its dependencies. The repository is addressed
# over HTTPS rather than plain HTTP so the download is protected in transit.
install.packages("Hmisc", repos = "https://cran.rstudio.com/", dependencies = TRUE)
trying URL 'http://cran.rstudio.com/bin/macosx/big-sur-arm64/contrib/4.3/Hmisc_5.1-0.tgz'
Content type 'application/x-gzip' length 3515239 bytes (3.4 MB)
==================================================
downloaded 3.4 MB

The downloaded binary packages are in
    /var/folders/5q/zxkgc5yn685ffrcr6m2_f_4h0000gn/T//RtmpVnEbxd/downloaded_packages
# Install checkmate from the default repository.
# NOTE(review): presumably needed by Hmisc — confirm; installing on every run
# is usually unnecessary.
install.packages("checkmate")
trying URL 'https://cran.rstudio.com/bin/macosx/big-sur-arm64/contrib/4.3/checkmate_2.2.0.tgz'
Content type 'application/x-gzip' length 772034 bytes (753 KB)
==================================================
downloaded 753 KB

The downloaded binary packages are in
    /var/folders/5q/zxkgc5yn685ffrcr6m2_f_4h0000gn/T//RtmpVnEbxd/downloaded_packages
library(Hmisc)
Registered S3 method overwritten by 'htmlwidgets':
  method           from         
  print.htmlwidget tools:rstudio
Registered S3 method overwritten by 'data.table':
  method           from
  print.data.table     

Attaching package: ‘Hmisc’

The following objects are masked from ‘package:base’:

    format.pval, units
# Spearman rank correlations, with p-values, between every pair of columns;
# the result is printed straight away.
res2 <- rcorr(
  as.matrix(data_imaging_correlation),
  type = "spearman"
)
res2
                 Age  WFNS Total blood SSV_CSF Ventricular_CSF Total_CSF
Age             1.00  0.06        0.01    0.24            0.39      0.34
WFNS            0.06  1.00        0.46   -0.37           -0.03     -0.14
Total blood     0.01  0.46        1.00   -0.26           -0.28     -0.19
SSV_CSF         0.24 -0.37       -0.26    1.00            0.45      0.78
Ventricular_CSF 0.39 -0.03       -0.28    0.45            1.00      0.78
Total_CSF       0.34 -0.14       -0.19    0.78            0.78      1.00

n= 97 


P
                Age    WFNS   Total blood SSV_CSF Ventricular_CSF Total_CSF
Age                    0.5781 0.9249      0.0196  0.0000          0.0007   
WFNS            0.5781        0.0000      0.0002  0.7396          0.1653   
Total blood     0.9249 0.0000             0.0108  0.0051          0.0589   
SSV_CSF         0.0196 0.0002 0.0108              0.0000          0.0000   
Ventricular_CSF 0.0000 0.7396 0.0051      0.0000                  0.0000   
Total_CSF       0.0007 0.1653 0.0589      0.0000  0.0000                   
# Correlation coefficients at full printed precision.
res2$r
                        Age        WFNS  Total blood    SSV_CSF Ventricular_CSF  Total_CSF
Age             1.000000000  0.05716434  0.009694334  0.2367181      0.38567391  0.3391306
WFNS            0.057164345  1.00000000  0.460972143 -0.3728866     -0.03418735 -0.1420047
Total blood     0.009694334  0.46097214  1.000000000 -0.2577188     -0.28217705 -0.1925100
SSV_CSF         0.236718130 -0.37288664 -0.257718809  1.0000000      0.44937408  0.7821244
Ventricular_CSF 0.385673908 -0.03418735 -0.282177046  0.4493741      1.00000000  0.7780086
Total_CSF       0.339130575 -0.14200467 -0.192509994  0.7821244      0.77800863  1.0000000
# Matching p-values (the diagonal is NA).
res2$P
                         Age         WFNS  Total blood      SSV_CSF Ventricular_CSF    Total_CSF
Age                       NA 5.781015e-01 9.249165e-01 1.957150e-02    9.569289e-05 0.0006787165
WFNS            5.781015e-01           NA 2.018435e-06 1.688910e-04    7.395581e-01 0.1652907303
Total blood     9.249165e-01 2.018435e-06           NA 1.081759e-02    5.106494e-03 0.0588736570
SSV_CSF         1.957150e-02 1.688910e-04 1.081759e-02           NA    3.889443e-06 0.0000000000
Ventricular_CSF 9.569289e-05 7.395581e-01 5.106494e-03 3.889443e-06              NA 0.0000000000
Total_CSF       6.787165e-04 1.652907e-01 5.887366e-02 0.000000e+00    0.000000e+00           NA
# Flatten a correlation matrix and its matching p-value matrix into a long
# data frame with one row per variable pair.
#
# Only the strict upper triangle is used, so each pair appears once and the
# diagonal is left out.
#
# cormat: matrix of correlation coefficients carrying row names.
# pmat:   matrix of p-values with the same dimensions as `cormat`.
#
# Returns a data.frame with columns `row`, `column`, `cor` and `p`.
# Stops with an error when the two inputs do not have identical dimensions.
flattenCorrMatrix <- function(cormat, pmat) {
  stopifnot(identical(dim(cormat), dim(pmat)))
  # Label the `column` field from the column names; fall back to the row
  # names when the matrix has no column names.
  column_labels <- colnames(cormat)
  if (is.null(column_labels)) {
    column_labels <- rownames(cormat)
  }
  ut <- upper.tri(cormat)
  data.frame(
    row = rownames(cormat)[row(cormat)[ut]],
    column = column_labels[col(cormat)[ut]],
    cor = cormat[ut],
    p = pmat[ut]
  )
}

# Flatten the Spearman coefficients and p-values into one row per pair.
correlation_matrix <- flattenCorrMatrix(res2$r, res2$P)
# Export the table without row names. FALSE is spelled out because `F` is an
# ordinary variable that can be reassigned.
write.csv(correlation_matrix, "Correlation_matrix.csv", row.names = FALSE)


library(corrplot) #visualise the correlation matrix
corrplot 0.92 loaded
# Correlation plot of the upper triangle, with variables ordered by
# hierarchical clustering.
# NOTE(review): cor() defaults to Pearson whereas res2 was computed with
# Spearman — confirm the difference is intentional.
res <- cor(data_imaging_correlation)
corrplot(
  res,
  type = "upper",
  order = "hclust",
  tl.col = "black",
  tl.srt = 45
)



# Plot the Spearman coefficients, blanking cells whose p-value exceeds 0.01.
# rcorr() leaves NA on the diagonal of its P matrix. corrplot() appears to
# drop those NA cells from the p-values but not from the coefficients, which
# produces "arguments imply differing number of rows: 21, 15" (upper triangle
# with vs. without the diagonal). Filling the diagonal with 0 keeps both the
# same length; the diagonal is then always drawn.
res2_p <- res2$P
diag(res2_p) <- 0
corrplot(res2$r, type = "upper", order = "hclust",
         p.mat = res2_p, sig.level = 0.01, insig = "blank")
Error in data.frame(..., check.names = FALSE) : 
  arguments imply differing number of rows: 21, 15

Univariate analysis of variables

# Print the summary of the fitted model `Univariate`.
# NOTE(review): the call that fits `Univariate` is not shown in this document.
summary(Univariate)

Call:
glm(formula = data_imaging$SAHOT180_dich ~ data_imaging$Total_CSF, 
    family = binomial, data = data_imaging)

Coefficients:
                        Estimate Std. Error z value Pr(>|z|)
(Intercept)            -0.381248   0.379002  -1.006    0.314
data_imaging$Total_CSF -0.001276   0.001626  -0.785    0.433

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 122.70  on 94  degrees of freedom
Residual deviance: 122.07  on 93  degrees of freedom
AIC: 126.07

Number of Fisher Scoring iterations: 4

Dominance plots

To prevent overfitting of further predictive models, a dominance analysis was done by assessing each variable's R² when predicting mRS at day 28 and day 180. The mRS at day 28 and day 180 were used because mRS is used most widely in clinical practice to assess functional outcome. Day 28 was selected to reflect short-term and day 180 long-term outcome. In addition, to demonstrate the difference between a general scale of functional outcome for stroke and a patient-reported, SAH-specific scale of symptoms, we undertook a dominance analysis of SAHOT on day 180.

Multivariate analysis

In order to investigate the additional benefit of blood and CSF volume measurements in models over conventional clinical features alone, multivariate models were built using the previously reported SAHIT core variables to predict mRS at day 28 and 180, and then repeated adding total blood volume, SSV CSF and ventricular CSF.

Core model = Age, Hypertension and Admission WFNS. Neuroimaging model = Core model + Total Blood volume, SSV CSF and Ventricular CSF.

For unknown reasons the code below did not work; I therefore created the AUC graphs by adapting code from my previous analysis.

# Add the legend for the four model curves.
# legend() draws onto the current plot, so it has to run after the plotting
# call (and, in a notebook, within the same code chunk); otherwise R stops
# with "plot.new has not been called yet".
# The stray positional "Legend" argument has been removed: it was silently
# matched to legend()'s `y` parameter.
legend(
  "bottomright",
  title = "modified Rankin Scale",
  legend = c("Neuroimaging Day 28", "Neuroimaging Day 180",
             "Clinical Day 28", "Clinical Day 180"),
  col = c("red", "black", "orange", "blue"),
  lwd = 2,
  cex = 0.8
)
Error in (function (s, units = "user", cex = NULL, font = NULL, vfont = NULL,  : 
  plot.new has not been called yet

SSV dichotomisation from the literature

Choi et al.: SSV was not normally distributed, so subjects were separated into equal quartiles for measuring odds ratios when examining associations with GCE as well as clinical outcomes. FINDINGS: compared to patients in the highest quartile for SSV, the OR for poor outcome at discharge increased in a dose-dependent manner in the lower quartiles after adjusting for age, sex, smoking, hypertension and HH score.

Yuan et al.: SSV was taken as a continuous variable and, in a sensitivity analysis, dichotomised at <5.2 ml. FINDINGS: early SSV (defined as the lowest SSV volume in the first 72 hrs), but not admission SSV, was predictive of outcome.

Create two new variables, SSV_quantiles and SSV_dich

# NOTE(review): levels() returns NULL for an object without a levels
# attribute (e.g. a character or numeric vector). The NULL printed here
# suggests SSV_dich was not a factor at this point — confirm.
levels(SSV_dich)
NULL

We currently have several CSF volumes: total CSF, SSV CSF, ventricular CSF and below-SSV CSF. However, one volume is missing: non-SSV, non-ventricular CSF.

Create non_SSV_non_vent_CSF variable

# Print the summary of the fitted model `Univariate`.
# NOTE(review): the call that fits `Univariate` is not shown in this document.
summary(Univariate)

Call:
glm(formula = SAHOT180_dich ~ non_SSV_non_vent_CSF, family = binomial, 
    data = data_imaging)

Coefficients:
                      Estimate Std. Error z value Pr(>|z|)  
(Intercept)          -0.658236   0.338537  -1.944   0.0519 .
non_SSV_non_vent_CSF  0.000288   0.002716   0.106   0.9155  
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 122.70  on 94  degrees of freedom
Residual deviance: 122.69  on 93  degrees of freedom
  (2 observations deleted due to missingness)
AIC: 126.69

Number of Fisher Scoring iterations: 4

Before multivariate analysis, let's see if the new SSV cut-offs make it a significant predictor of day 30 and 180 outcome in a univariate analysis.

# Print the summary of the fitted model `Univariate`.
# NOTE(review): the call that fits `Univariate` is not shown in this document.
summary(Univariate)

Call:
glm(formula = SAHOT180_dich ~ SSV_dich, family = binomial, data = data_imaging)

Coefficients:
            Estimate Std. Error z value Pr(>|z|)   
(Intercept)  -0.7357     0.2536  -2.901  0.00372 **
SSV_dichlow   0.3992     0.4855   0.822  0.41093   
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 122.70  on 94  degrees of freedom
Residual deviance: 122.03  on 93  degrees of freedom
  (2 observations deleted due to missingness)
AIC: 126.03

Number of Fisher Scoring iterations: 4

Repeat regressions:

Before we make that major restructure we wanted to see the outcomes of the regressions for

  1. WFNS, TBV and SSV to predict outcome (MRS and SAHOT at day 30 and 180)
  2. WFNS, TBV and ventricular volume to predict outcome

If the latter does not give a clear result then we wanted to try it again but:

  1. add presence of an EVD as a covariate and an interaction between ventricular volume and an EVD
  2. add an interaction between TBV and ventricular volume

# Regression 1: WFNS, TBV and SSV regression

# Logistic regression of mRS28 on WFNS, total blood volume and SSV CSF.
# NOTE(review): with family = binomial a factor response is modelled as
# "first level" vs. "any other level" — confirm mRS28 is already binary.
Multivariate <- glm(
  as.factor(mRS28) ~ WFNS + Total_blood_volume + SSV_CSF,
  family = binomial,
  data = data_imaging
)
summary(Multivariate)

Call:
glm(formula = as.factor(mRS28) ~ WFNS + Total_blood_volume + 
    SSV_CSF, family = binomial, data = data_imaging)

Coefficients:
                    Estimate Std. Error z value Pr(>|z|)    
(Intercept)        -3.010329   0.784551  -3.837 0.000125 ***
WFNS                0.856655   0.209192   4.095 4.22e-05 ***
Total_blood_volume  0.014721   0.015629   0.942 0.346263    
SSV_CSF            -0.004514   0.006311  -0.715 0.474383    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 124.387  on 96  degrees of freedom
Residual deviance:  89.964  on 93  degrees of freedom
AIC: 97.964

Number of Fisher Scoring iterations: 4
# Logistic regression of mRS180 on WFNS, total blood volume and SSV CSF.
Multivariate <- glm(
  as.factor(mRS180) ~ WFNS + Total_blood_volume + SSV_CSF,
  family = binomial,
  data = data_imaging
)
summary(Multivariate)

Call:
glm(formula = as.factor(mRS180) ~ WFNS + Total_blood_volume + 
    SSV_CSF, family = binomial, data = data_imaging)

Coefficients:
                    Estimate Std. Error z value Pr(>|z|)    
(Intercept)        -3.415511   0.984158  -3.470  0.00052 ***
WFNS                0.525937   0.238247   2.208  0.02728 *  
Total_blood_volume  0.018970   0.016313   1.163  0.24489    
SSV_CSF            -0.004060   0.007936  -0.512  0.60894    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 86.870  on 96  degrees of freedom
Residual deviance: 74.307  on 93  degrees of freedom
AIC: 82.307

Number of Fisher Scoring iterations: 5
# Logistic regression of SAHOT28_dich on WFNS, total blood volume and SSV CSF.
Multivariate <- glm(
  as.factor(SAHOT28_dich) ~ WFNS + Total_blood_volume + SSV_CSF,
  family = binomial,
  data = data_imaging
)
summary(Multivariate)

Call:
glm(formula = as.factor(SAHOT28_dich) ~ WFNS + Total_blood_volume + 
    SSV_CSF, family = binomial, data = data_imaging)

Coefficients:
                    Estimate Std. Error z value Pr(>|z|)  
(Intercept)        -0.469845   0.589470  -0.797   0.4254  
WFNS                0.213558   0.180917   1.180   0.2378  
Total_blood_volume  0.024086   0.014551   1.655   0.0979 .
SSV_CSF            -0.008032   0.005332  -1.506   0.1320  
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 131.18  on 94  degrees of freedom
Residual deviance: 117.88  on 91  degrees of freedom
  (2 observations deleted due to missingness)
AIC: 125.88

Number of Fisher Scoring iterations: 4
# Logistic regression of SAHOT180_dich on WFNS, total blood volume and SSV CSF.
Multivariate <- glm(
  as.factor(SAHOT180_dich) ~ WFNS + Total_blood_volume + SSV_CSF,
  family = binomial,
  data = data_imaging
)
summary(Multivariate)

Call:
glm(formula = as.factor(SAHOT180_dich) ~ WFNS + Total_blood_volume + 
    SSV_CSF, family = binomial, data = data_imaging)

Coefficients:
                    Estimate Std. Error z value Pr(>|z|)  
(Intercept)        -0.936754   0.645685  -1.451   0.1468  
WFNS               -0.014991   0.187351  -0.080   0.9362  
Total_blood_volume  0.030589   0.014152   2.161   0.0307 *
SSV_CSF            -0.009022   0.006394  -1.411   0.1582  
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 122.70  on 94  degrees of freedom
Residual deviance: 112.25  on 91  degrees of freedom
  (2 observations deleted due to missingness)
AIC: 120.25

Number of Fisher Scoring iterations: 4
# For mRS, WFNS is a significant predictor; for SAHOT, TBV is significant. Substitute in the new SSV variables to see if this makes a difference.

#SSV_quantile first

# Logistic regression of mRS28 on WFNS, total blood volume and SSV quantile.
Multivariate <- glm(
  as.factor(mRS28) ~ WFNS + Total_blood_volume + SSV_quantiles,
  family = binomial,
  data = data_imaging
)
summary(Multivariate)

Call:
glm(formula = as.factor(mRS28) ~ WFNS + Total_blood_volume + 
    SSV_quantiles, family = binomial, data = data_imaging)

Coefficients:
                   Estimate Std. Error z value Pr(>|z|)    
(Intercept)        -3.44386    0.95249  -3.616 0.000300 ***
WFNS                0.91927    0.23636   3.889 0.000101 ***
Total_blood_volume  0.01880    0.01655   1.136 0.255768    
SSV_quantiles2     -0.36984    0.78129  -0.473 0.635952    
SSV_quantiles3      0.62358    0.78633   0.793 0.427768    
SSV_quantiles4     -0.72274    0.81935  -0.882 0.377732    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 124.387  on 96  degrees of freedom
Residual deviance:  87.279  on 91  degrees of freedom
AIC: 99.279

Number of Fisher Scoring iterations: 5
# Logistic regression of mRS180 on WFNS, total blood volume and SSV quantile.
Multivariate <- glm(
  as.factor(mRS180) ~ WFNS + Total_blood_volume + SSV_quantiles,
  family = binomial,
  data = data_imaging
)
summary(Multivariate)

Call:
glm(formula = as.factor(mRS180) ~ WFNS + Total_blood_volume + 
    SSV_quantiles, family = binomial, data = data_imaging)

Coefficients:
                   Estimate Std. Error z value Pr(>|z|)    
(Intercept)        -4.62012    1.16642  -3.961 7.47e-05 ***
WFNS                0.70936    0.26454   2.681  0.00733 ** 
Total_blood_volume  0.01309    0.01801   0.727  0.46708    
SSV_quantiles2      1.10257    0.85530   1.289  0.19736    
SSV_quantiles3      1.23702    0.91902   1.346  0.17829    
SSV_quantiles4      0.23181    1.00954   0.230  0.81838    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 86.870  on 96  degrees of freedom
Residual deviance: 71.709  on 91  degrees of freedom
AIC: 83.709

Number of Fisher Scoring iterations: 5
# Logistic regression of SAHOT28_dich on WFNS, total blood volume and SSV
# quantile.
Multivariate <- glm(
  as.factor(SAHOT28_dich) ~ WFNS + Total_blood_volume + SSV_quantiles,
  family = binomial,
  data = data_imaging
)
summary(Multivariate)

Call:
glm(formula = as.factor(SAHOT28_dich) ~ WFNS + Total_blood_volume + 
    SSV_quantiles, family = binomial, data = data_imaging)

Coefficients:
                   Estimate Std. Error z value Pr(>|z|)
(Intercept)        -0.56706    0.74665  -0.759    0.448
WFNS                0.22010    0.20052   1.098    0.272
Total_blood_volume  0.02143    0.01498   1.431    0.152
SSV_quantiles2      0.35846    0.70441   0.509    0.611
SSV_quantiles3     -0.59282    0.66883  -0.886    0.375
SSV_quantiles4     -0.84778    0.68353  -1.240    0.215

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 131.18  on 94  degrees of freedom
Residual deviance: 116.03  on 89  degrees of freedom
  (2 observations deleted due to missingness)
AIC: 128.03

Number of Fisher Scoring iterations: 4
# Logistic regression of SAHOT180_dich on WFNS, total blood volume and SSV
# quantile.
Multivariate <- glm(
  as.factor(SAHOT180_dich) ~ WFNS + Total_blood_volume + SSV_quantiles,
  family = binomial,
  data = data_imaging
)
summary(Multivariate)

Call:
glm(formula = as.factor(SAHOT180_dich) ~ WFNS + Total_blood_volume + 
    SSV_quantiles, family = binomial, data = data_imaging)

Coefficients:
                     Estimate Std. Error z value Pr(>|z|)  
(Intercept)        -1.1602876  0.7493937  -1.548   0.1215  
WFNS               -0.0006619  0.2018584  -0.003   0.9974  
Total_blood_volume  0.0305582  0.0147357   2.074   0.0381 *
SSV_quantiles2      0.0063719  0.6535923   0.010   0.9922  
SSV_quantiles3     -0.1591029  0.6770875  -0.235   0.8142  
SSV_quantiles4     -1.0107826  0.7590549  -1.332   0.1830  
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 122.70  on 94  degrees of freedom
Residual deviance: 111.88  on 89  degrees of freedom
  (2 observations deleted due to missingness)
AIC: 123.88

Number of Fisher Scoring iterations: 4
#Makes no difference. How about SSV_dich

# Logistic regression of mRS28 on WFNS, total blood volume and dichotomised
# SSV.
Multivariate <- glm(
  as.factor(mRS28) ~ WFNS + Total_blood_volume + SSV_dich,
  family = binomial,
  data = data_imaging
)
summary(Multivariate)

Call:
glm(formula = as.factor(mRS28) ~ WFNS + Total_blood_volume + 
    SSV_dich, family = binomial, data = data_imaging)

Coefficients:
                   Estimate Std. Error z value Pr(>|z|)    
(Intercept)        -3.35505    0.64610  -5.193 2.07e-07 ***
WFNS                0.86463    0.22327   3.873 0.000108 ***
Total_blood_volume  0.01661    0.01556   1.067 0.285778    
SSV_dichlow         0.16214    0.61914   0.262 0.793417    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 124.387  on 96  degrees of freedom
Residual deviance:  90.424  on 93  degrees of freedom
AIC: 98.424

Number of Fisher Scoring iterations: 4
# Logistic regression of mRS180 on WFNS, total blood volume and dichotomised
# SSV.
Multivariate <- glm(
  as.factor(mRS180) ~ WFNS + Total_blood_volume + SSV_dich,
  family = binomial,
  data = data_imaging
)
summary(Multivariate)

Call:
glm(formula = as.factor(mRS180) ~ WFNS + Total_blood_volume + 
    SSV_dich, family = binomial, data = data_imaging)

Coefficients:
                   Estimate Std. Error z value Pr(>|z|)    
(Intercept)        -3.74949    0.81003  -4.629 3.68e-06 ***
WFNS                0.69899    0.26084   2.680  0.00737 ** 
Total_blood_volume  0.01566    0.01677   0.934  0.35020    
SSV_dichlow        -0.90681    0.74110  -1.224  0.22111    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 86.870  on 96  degrees of freedom
Residual deviance: 72.985  on 93  degrees of freedom
AIC: 80.985

Number of Fisher Scoring iterations: 5
# Logistic regression of SAHOT28_dich on WFNS, total blood volume and
# dichotomised SSV.
Multivariate <- glm(
  as.factor(SAHOT28_dich) ~ WFNS + Total_blood_volume + SSV_dich,
  family = binomial,
  data = data_imaging
)
summary(Multivariate)

Call:
glm(formula = as.factor(SAHOT28_dich) ~ WFNS + Total_blood_volume + 
    SSV_dich, family = binomial, data = data_imaging)

Coefficients:
                   Estimate Std. Error z value Pr(>|z|)  
(Intercept)        -1.06198    0.44665  -2.378   0.0174 *
WFNS                0.20878    0.19555   1.068   0.2857  
Total_blood_volume  0.02790    0.01457   1.915   0.0555 .
SSV_dichlow         0.37694    0.57273   0.658   0.5104  
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 131.18  on 94  degrees of freedom
Residual deviance: 119.86  on 91  degrees of freedom
  (2 observations deleted due to missingness)
AIC: 127.86

Number of Fisher Scoring iterations: 4
# Logistic regression of SAHOT180_dich on WFNS, total blood volume and
# dichotomised SSV.
Multivariate <- glm(
  as.factor(SAHOT180_dich) ~ WFNS + Total_blood_volume + SSV_dich,
  family = binomial,
  data = data_imaging
)
summary(Multivariate)

Call:
glm(formula = as.factor(SAHOT180_dich) ~ WFNS + Total_blood_volume + 
    SSV_dich, family = binomial, data = data_imaging)

Coefficients:
                    Estimate Std. Error z value Pr(>|z|)   
(Intercept)        -1.583066   0.483300  -3.276  0.00105 **
WFNS                0.008426   0.200301   0.042  0.96645   
Total_blood_volume  0.033597   0.014236   2.360  0.01827 * 
SSV_dichlow         0.308950   0.565281   0.547  0.58469   
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 122.70  on 94  degrees of freedom
Residual deviance: 114.23  on 91  degrees of freedom
  (2 observations deleted due to missingness)
AIC: 122.23

Number of Fisher Scoring iterations: 4
#Again no difference made. 

# Regression 2: WFNS, TBV and ventricular volume to predict outcome

# Logistic regression of mRS28 on WFNS, total blood volume and ventricular
# CSF volume.
# NOTE(review): with a factor response, binomial glm() treats the first level
# as failure and every other level as success - confirm mRS28 is already
# dichotomised in data_imaging.
Multivariate <- glm(
  formula = as.factor(mRS28) ~ WFNS + Total_blood_volume + Ventricular_CSF,
  family = binomial,
  data = data_imaging
)
summary(Multivariate)

Call:
glm(formula = as.factor(mRS28) ~ WFNS + Total_blood_volume + 
    Ventricular_CSF, family = binomial, data = data_imaging)

Coefficients:
                    Estimate Std. Error z value Pr(>|z|)    
(Intercept)        -4.396700   0.877685  -5.009 5.46e-07 ***
WFNS                0.882108   0.213980   4.122 3.75e-05 ***
Total_blood_volume  0.026610   0.016492   1.614   0.1066    
Ventricular_CSF     0.014472   0.006771   2.137   0.0326 *  
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 124.39  on 96  degrees of freedom
Residual deviance:  85.55  on 93  degrees of freedom
AIC: 93.55

Number of Fisher Scoring iterations: 5
# Logistic regression of mRS180 on WFNS, total blood volume and ventricular
# CSF volume.
# NOTE(review): with a factor response, binomial glm() treats the first level
# as failure and every other level as success - confirm mRS180 is already
# dichotomised in data_imaging.
Multivariate <- glm(
  formula = as.factor(mRS180) ~ WFNS + Total_blood_volume + Ventricular_CSF,
  family = binomial,
  data = data_imaging
)
summary(Multivariate)

Call:
glm(formula = as.factor(mRS180) ~ WFNS + Total_blood_volume + 
    Ventricular_CSF, family = binomial, data = data_imaging)

Coefficients:
                    Estimate Std. Error z value Pr(>|z|)    
(Intercept)        -4.688721   1.031167  -4.547 5.44e-06 ***
WFNS                0.535981   0.241497   2.219   0.0265 *  
Total_blood_volume  0.030422   0.017492   1.739   0.0820 .  
Ventricular_CSF     0.012876   0.006733   1.912   0.0558 .  
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 86.870  on 96  degrees of freedom
Residual deviance: 70.991  on 93  degrees of freedom
AIC: 78.991

Number of Fisher Scoring iterations: 5
# Logistic regression of the dichotomised day-28 SAHOT outcome on WFNS,
# total blood volume and ventricular CSF volume.
Multivariate <- glm(
  formula = as.factor(SAHOT28_dich) ~ WFNS + Total_blood_volume + Ventricular_CSF,
  family = binomial,
  data = data_imaging
)
summary(Multivariate)

Call:
glm(formula = as.factor(SAHOT28_dich) ~ WFNS + Total_blood_volume + 
    Ventricular_CSF, family = binomial, data = data_imaging)

Coefficients:
                    Estimate Std. Error z value Pr(>|z|)  
(Intercept)        -1.309795   0.552306  -2.372   0.0177 *
WFNS                0.242631   0.177250   1.369   0.1710  
Total_blood_volume  0.029947   0.015179   1.973   0.0485 *
Ventricular_CSF     0.004223   0.005557   0.760   0.4473  
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 131.18  on 94  degrees of freedom
Residual deviance: 119.71  on 91  degrees of freedom
  (2 observations deleted due to missingness)
AIC: 127.71

Number of Fisher Scoring iterations: 4
# Logistic regression of the dichotomised day-180 SAHOT outcome on WFNS,
# total blood volume and ventricular CSF volume.
Multivariate <- glm(
  formula = as.factor(SAHOT180_dich) ~ WFNS + Total_blood_volume + Ventricular_CSF,
  family = binomial,
  data = data_imaging
)
summary(Multivariate)

Call:
glm(formula = as.factor(SAHOT180_dich) ~ WFNS + Total_blood_volume + 
    Ventricular_CSF, family = binomial, data = data_imaging)

Coefficients:
                     Estimate Std. Error z value Pr(>|z|)   
(Intercept)        -1.586e+00  5.821e-01  -2.725  0.00644 **
WFNS                5.660e-02  1.822e-01   0.311  0.75604   
Total_blood_volume  3.239e-02  1.463e-02   2.213  0.02688 * 
Ventricular_CSF     5.209e-05  5.813e-03   0.009  0.99285   
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 122.70  on 94  degrees of freedom
Residual deviance: 114.53  on 91  degrees of freedom
  (2 observations deleted due to missingness)
AIC: 122.53

Number of Fisher Scoring iterations: 4
# Ventricular CSF significant at 30 day mRS and almost significant for mRS 180. However, not significant for SAHOT. 

The above regressions demonstrate several observations with the dataset:

  1. WFNS is a highly significant predictor of functional outcome at 30 days and 180 days
  2. When using a more specific outcome tool for SAH (SAHOT), TBV is shown to be a highly significant predictor
  3. Univariate analysis of SSV using cut-offs that isolate cases with ‘low’ SSV makes it a significant variable at early timepoints
  4. Multivariate analysis shows that SSV is not an independent predictor of outcome when TBV and WFNS are included. However, ventricular volume is an independent predictor of outcome using mRS

I think the above gives a clear result. Firstly, it shows that we have been able to replicate previous findings in the literature, that at early timepoints SSV predicts outcome. However, we have shown that this effect is not maintained at later timepoints or when combined with other significant predictors of outcome. Instead, the imaging predictors TBV and, to a lesser extent, ventricular CSF are more important.

The narrative of the paper is clearer to me now and I think should be presented something like this:

  1. Basic demographics
  2. Regression 1 and Regression 2 from above
  3. Interaction plots
  4. Inclusion of ‘other’ imaging variables to show they are less relevant

For completeness, below I will include:

  1. add presence of an EVD as a covariate and an interaction between ventricular volume and an EVD
  2. add an interaction between TBV and ventricular volume
# Coefficient table for the EVD model. The Call echoed below shows the formula
# mRS180 ~ WFNS + Total_blood_volume + Ventricular_CSF * EVD; the glm() fit
# itself does not appear in this rendering.
summary(Multivariate)

Call:
glm(formula = as.factor(mRS180) ~ WFNS + Total_blood_volume + 
    Ventricular_CSF * EVD, family = binomial, data = data_imaging)

Coefficients:
                     Estimate Std. Error z value Pr(>|z|)    
(Intercept)          -5.17723    1.23504  -4.192 2.77e-05 ***
WFNS                  0.35852    0.29083   1.233   0.2177    
Total_blood_volume    0.02756    0.01767   1.560   0.1188    
Ventricular_CSF       0.02031    0.01024   1.983   0.0474 *  
EVDY                  2.12353    1.11352   1.907   0.0565 .  
Ventricular_CSF:EVDY -0.01595    0.01299  -1.227   0.2197    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 86.870  on 96  degrees of freedom
Residual deviance: 66.859  on 91  degrees of freedom
AIC: 78.859

Number of Fisher Scoring iterations: 6

Another thing we should check is whether the model fit is better if we use just: (a) WFNS and blood volume, (b) WFNS and CSF volume, or (c) blood and CSF volume.

# Coefficient table for the model that uses total CSF volume. The Call echoed
# below shows mRS180 ~ WFNS + Total_blood_volume + Total_CSF; the glm() fit
# itself does not appear in this rendering.
summary(Multivariate)

Call:
glm(formula = as.factor(mRS180) ~ WFNS + Total_blood_volume + 
    Total_CSF, family = binomial, data = data_imaging)

Coefficients:
                    Estimate Std. Error z value Pr(>|z|)    
(Intercept)        -4.399267   1.094137  -4.021  5.8e-05 ***
WFNS                0.609560   0.242768   2.511    0.012 *  
Total_blood_volume  0.022004   0.016296   1.350    0.177    
Total_CSF           0.002316   0.002224   1.041    0.298    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 86.870  on 96  degrees of freedom
Residual deviance: 73.526  on 93  degrees of freedom
AIC: 81.526

Number of Fisher Scoring iterations: 5

Treat SAHOT as linear to see if this changes the significance of variables. SAHOT is a 9-point scale from 1 (best outcome) to 9 (death).

Repeat regressions Univariate SSV analyses:

Using SAHOT as a linear 9-point scale,

Repeat regressions SAHOT at D28 and D180: 1) WFNS, TBV and SSV to predict outcome (MRS and SAHOT at day 30 and 180) 2) WFNS, TBV and ventricular volume to predict outcome

# Summary of the linear model for SAHOT on its original scale. The Call echoed
# below shows lm(sahot180 ~ WFNS + Total_blood_volume + Ventricular_CSF); the
# lm() fit itself does not appear in this rendering.
summary(Multivariate)

Call:
lm(formula = sahot180 ~ WFNS + Total_blood_volume + Ventricular_CSF, 
    data = data_imaging)

Residuals:
    Min      1Q  Median      3Q     Max 
-3.4973 -1.7656 -0.4125  1.5223  5.1628 

Coefficients:
                    Estimate Std. Error t value Pr(>|t|)    
(Intercept)        2.2918313  0.5470922   4.189 6.47e-05 ***
WFNS               0.2250422  0.1820884   1.236  0.21968    
Total_blood_volume 0.0416388  0.0143432   2.903  0.00464 ** 
Ventricular_CSF    0.0008862  0.0056270   0.157  0.87520    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 2.149 on 91 degrees of freedom
  (2 observations deleted due to missingness)
Multiple R-squared:  0.1658,    Adjusted R-squared:  0.1383 
F-statistic: 6.028 on 3 and 91 DF,  p-value: 0.0008604

Reasoning why a variable in a univariate analysis is not significant but becomes significant in a multivariate analysis:

The case of two predictors that are truly orthogonal: there is absolutely no collinearity among them. A remarkable change in significance can still happen.

Designate the predictor variables $X_1$ and $X_2$ and let $Y$ name the response (the predictand). The regression of $Y$ against $X_1$ will fail to be significant when the variation in $Y$ around its mean is not appreciably reduced when $X_1$ is used as the independent variable. When that variation is strongly associated with a second variable $X_2$, however, the situation changes. Recall that multiple regression of $Y$ against $X_1$ and $X_2$ is equivalent to two steps: first, separately regress $Y$ and $X_1$ against $X_2$.

Second, regress the $Y$ residuals against the $X_1$ residuals.

The residuals from the first step have removed the effect of $X_2$. When $X_2$ is closely correlated with $Y$, this can expose a relatively small amount of variation that had previously been masked. If this variation is associated with $X_1$, we obtain a significant result.

Further info from: https://stats.stackexchange.com/questions/28474/how-can-adding-a-2nd-iv-make-the-1st-iv-significant

In the analyses done, WFNS and ventricular volume are not collinear, but blood volume and ventricular volume are collinear in a negative direction. WFNS and ventricular volume therefore illustrate the above finding:

#Ventricular volume only

# Univariate logistic regression of mRS180 on ventricular CSF volume alone.
# The fit and the summary must be separate statements: R needs a newline or
# ';' between them, otherwise the line does not parse.
Univariate <- glm(as.factor(mRS180) ~ Ventricular_CSF, family = binomial, data = data_imaging)
summary(Univariate)

#Ventricular volume is not significant on its own

#WFNS and ventricular volume

# Logistic regression of mRS180 on WFNS and ventricular CSF volume.
# The fit and the summary must be separate statements: R needs a newline or
# ';' between them, otherwise the line does not parse.
Multivariate <- glm(as.factor(mRS180) ~ WFNS + Ventricular_CSF, family = binomial, data = data_imaging)
summary(Multivariate)

##AIC: 94.269 #WFNS significant, ventricular volume is almost significant

#Blood and ventricular volume

# Logistic regression of mRS180 on total blood volume and ventricular CSF
# volume.
# The fit and the summary must be separate statements: R needs a newline or
# ';' between them, otherwise the line does not parse.
Multivariate <- glm(as.factor(mRS180) ~ Total_blood_volume + Ventricular_CSF, family = binomial, data = data_imaging)
summary(Multivariate)

##AIC: 112.16 #Total blood very significant and ventricular volume significant - probably due to collinearity

Shapley plots

Complex predictive models are not easy to interpret. By complex I mean: random forest, xgboost, deep learning, etc.

Shapley values calculate the importance of a feature by comparing what a model predicts with and without the feature. However, since the order in which a model sees features can affect its predictions, this is done in every possible order, so that the features are fairly compared.

How to interpret the shap summary plot?

The y-axis lists the variables in order of importance from top to bottom; the value next to each name is its mean SHAP value. The x-axis shows the SHAP value, which indicates the change in log-odds; from this number we can extract the probability of success. The gradient color indicates the original value of that variable: a boolean variable takes two colors, whereas a numeric variable can span the whole spectrum. Each point represents a row from the original dataset.

# Train the boosted model on dtrain, evaluating dvalid through the watchlist.
# nrounds is only an upper bound; early_stopping_rounds is relied on to end
# training before it is reached.
# NOTE(review): the run recorded below failed with "Invalid shape of labels"
# (0 vs. 97), which suggests dtrain was built without a usable label vector -
# check the label passed when dtrain was constructed.
fit_xgb <- xgb.train(
  params = params,
  data = dtrain,
  nrounds = 10000,
  watchlist = list(valid = dvalid),
  early_stopping_rounds = 20,
  print_every_n = 100
)
Error in xgb.iter.update(bst$handle, dtrain, iteration - 1, obj) : 
  [15:53:06] src/objective/regression_obj.cu:43: Check failed: info.labels.Shape(0) == info.num_row_ (0 vs. 97) : Invalid shape of labels.
Stack trace:
  [bt] (0) 1   xgboost.so                          0x000000013580bd3c dmlc::LogMessageFatal::~LogMessageFatal() + 124
  [bt] (1) 2   xgboost.so                          0x0000000135836b10 xgboost::obj::(anonymous namespace)::CheckInitInputs(xgboost::MetaInfo const&) + 208
  [bt] (2) 3   xgboost.so                          0x00000001358368dc xgboost::obj::(anonymous namespace)::CheckRegInputs(xgboost::MetaInfo const&, xgboost::HostDeviceVector<float> const&) + 28
  [bt] (3) 4   xgboost.so                          0x0000000135835c3c xgboost::obj::RegLossObj<xgboost::obj::LinearSquareLoss>::GetGradient(xgboost::HostDeviceVector<float> const&, xgboost::MetaInfo const&, int, xgboost::HostDeviceVector<xgboost::detail::GradientPairInternal<float> >*) + 60
  [bt] (4) 5   xgboost.so                          0x00000001359610fc xgboost::Lea

Create a comparative AUC graph for the ‘final’ model that includes WFNS, total blood volume and ventricular CSF volume in the prediction of mRS 180

# Key for the comparative AUC graph, one entry per model in plotting colour.
# legend() draws onto the current plot, so the curves must already have been
# plotted in the same chunk; otherwise it fails with "plot.new has not been
# called yet".
# The stray positional "Legend" argument was removed: it was matched to `y`,
# which legend() ignores when `x` is a keyword such as "bottomright".
legend(
  "bottomright",
  title = "modified Rankin Scale",
  legend = c("WFNS+Blood", "WFNS+VentricularCSF", "Final model", "WFNS+SSV_CSF", "WFNS+TotalCSF"),
  col = c("red", "black", "orange", "blue", "forestgreen"),
  lwd = 2,
  cex = 0.8
)
Error in (function (s, units = "user", cex = NULL, font = NULL, vfont = NULL,  : 
  plot.new has not been called yet

Stepwise regression analysis for the development of the multivariate model.

Variables: age, WFNS, TBV, SSV, Ventricular CSF, total CSF

# Coefficient table for the model retained by the stepwise selection. The Call
# echoed below shows mRS180 ~ WFNS + Total_blood + Ventricular_CSF; the
# selection code that produced step.model does not appear in this rendering.
summary(step.model)

Call:
glm(formula = mRS180 ~ WFNS + Total_blood + Ventricular_CSF, 
    family = binomial, data = data_imaging)

Coefficients:
                 Estimate Std. Error z value Pr(>|z|)    
(Intercept)     -4.688721   1.031167  -4.547 5.44e-06 ***
WFNS             0.535981   0.241497   2.219   0.0265 *  
Total_blood      0.030422   0.017492   1.739   0.0820 .  
Ventricular_CSF  0.012876   0.006733   1.912   0.0558 .  
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 86.870  on 96  degrees of freedom
Residual deviance: 70.991  on 93  degrees of freedom
AIC: 78.991

Number of Fisher Scoring iterations: 5
