0. Installing the libraries

library(dplyr)
library(invgamma)
library(truncnorm)
library(matlib)
library(MASS)
library(wordspace)

1. Getting the data

We obtain the data and divide it into test and training set. The training set is further divided for 5-fold cross validation. The data looks as follows: the first column is the unique identifier for each observation (patient), the second column is the binary response (1 stands for dead and 0 stands for not dead) followed by four feature variables suitably scaled.

dataset = read.delim("traumadata.txt")

dataset <- cbind(dataset[,1:2], scale(dataset[,-(1:2)], center = TRUE, scale = TRUE))
#dataset <- cbind(dataset[,1:2], normalize.cols(scale(df[,-(1:2)], center = TRUE, scale = FALSE), method = "euclidean", p = 2))
head(dataset)
pos <- dataset[dataset$death==1,]
neg <- dataset[dataset$death==0,]

set.seed(123)
testpos <- sample(pos$ID, 2)

pos %>% filter(!(ID %in% testpos)) -> train_pos
pos %>% filter((ID %in% testpos)) -> test_pos

testneg <- sample(neg$ID, 48)

neg %>% filter(!(ID %in% testneg)) -> train_neg
neg %>% filter(ID %in% testneg) -> test_neg

test_set <- rbind(test_pos, test_neg)

set.seed(123)
temp <- sample(train_pos$ID)

foldpos1 <- temp[1:4]
foldpos2 <- temp[5:8]
foldpos3 <- temp[9:12]
foldpos4 <- temp[13:16]
foldpos5 <- temp[17:20]

fold_pos_1 <- train_pos %>% filter(ID %in% foldpos1)
fold_pos_2 <- train_pos %>% filter(ID %in% foldpos2)
fold_pos_3 <- train_pos %>% filter(ID %in% foldpos3)
fold_pos_4 <- train_pos %>% filter(ID %in% foldpos4)
fold_pos_5 <- train_pos %>% filter(ID %in% foldpos5)

set.seed(123)
temp <- sample(train_neg$ID)

fold_neg_1 <- train_neg %>% filter(ID %in% temp[1:46])
fold_neg_2 <- train_neg %>% filter(ID %in% temp[(1+46):(46*2)])
fold_neg_3 <- train_neg %>% filter(ID %in% temp[(1+46*2):(46*3)])
fold_neg_4 <- train_neg %>% filter(ID %in% temp[(1+46*3):(46*4)])
fold_neg_5 <- train_neg %>% filter(ID %in% temp[(1+46*4):(46*5)])

fold_1 <- rbind(fold_pos_1, fold_neg_1)
fold_2 <- rbind(fold_pos_2, fold_neg_2)
fold_3 <- rbind(fold_pos_3, fold_neg_3)
fold_4 <- rbind(fold_pos_4, fold_neg_4)
fold_5 <- rbind(fold_pos_5, fold_neg_5)

2. Model selection

For model selection we are selecting from the four variables and the intercept i.e a total of five variables. The model selection Gibbs Sampler selects the intercept and just one of the features when applied on the five folds i.e is suggests us to do a Simple Linear Regression. The best test set precision is obtained in the second fold which suggests us to select the intercept and the third variable i.e. “RTS”.

dataset <- rbind(fold_1, fold_2, fold_3, fold_5)

b <- rep(0, 5)
B=diag(1000,5,5)
X=as.matrix(dataset[,-c(1,2)])
X=cbind(1,X)
#head(X)
#dim(X)
#length(y)


p=rep(0.5, 5)
b_0=1
b_1=1
b_2=1
b_3=1
b_4=1

gamma=c(1,0,1,0,1)
sigmasq=1
b_new=data.frame(b_0,b_1,b_2,b_3,b_4)
#b_new
gamma_new=t(data.frame(gamma))
#gamma_new


set.seed(123)

z=data.frame()

for(i in 1:10000){
  X_star = matrix(NA,nrow=200,ncol=5)
  for(k in 1:5){
    X_star[,k]=gamma_new[i,k]*X[,k]
  }
  for (j in 1:200){
    if (dataset[j,2]==1){
      z[i,j]= rtruncnorm(n=1,a=0,b=Inf,mean=sum(X_star[j,]*b_new[i,]),sd=sigmasq)
    }
    else{
      z[i,j]=rtruncnorm(n=1,a=-Inf,b=0,mean=sum(X_star[j,]*b_new[i,]),sd=sigmasq)
    }
    
  }
  intermediate_value<-inv(inv(B)+(t(X_star)%*%X_star))
  
  temp=mvrnorm(n=1,mu=intermediate_value%*%(inv(B)%*%as.matrix(b,byrow=TRUE)+(t(X_star)%*%t(as.matrix(z[i,])))),Sigma=intermediate_value)
  b_new <- rbind(b_new, temp)
  
  
  u=c(0,0,0,0,0)
  for (q in 1:5){
    
    v_q_star<-gamma_new[i,]*b_new[i,]
    v_q_star[q]<-b_new[i,q]
    v_q_star<-t(v_q_star)
    v_q_ss<-gamma_new[i,]*b_new[i,]
    v_q_ss[q]<-0
    v_q_ss<-t(v_q_ss)
    log=-0.5*((t(t(as.matrix(z[i,]))-X%*%v_q_ss)%*%(t(as.matrix(z[i,]))-X%*%v_q_ss)-(t(t(as.matrix(z[i,]))-X%*%v_q_star)%*%(t(as.matrix(z[i,]))-X%*%v_q_star))))
    ratio<- exp(log)+1
    u[q]<-rbinom(1,1,1/ratio)
    
  }
  gamma_new<-rbind(gamma_new,u)
}
th <- gamma_new

myfun <- function(a) { paste(a, collapse="")}

ou <- apply(th[3002:10001,], 1, FUN = myfun)

table(ou)
ou
10000 10001 10010 10011 10110 10111 11000 11001 11010 11011 11100 11101 11110 11111 
  143    62  2091   698   177    84  1967   945    37    31    99   621    42     3 

Above we tabulate the frequencies of the various models that the Gibbs Sampler selects and see that the Simple Linear Regression model containing intercept and RTS is most frequent. Below we obtain estimates for the coefficients using Gibbs Sampler and check its precision on the test set.

3. Estimation and Prediction

Here is the design matrix X for the estimation part.

dataset <- rbind(fold_1, fold_2, fold_3, fold_4, fold_5)


b=c(0,0)
B=diag(1000,2,2)
X=as.matrix(dataset[,5])
X=cbind(1,X)
head(X)
     [,1]       [,2]
[1,]    1  0.4415385
[2,]    1 -1.0519002
[3,]    1 -2.2622105
[4,]    1 -0.9581618
[5,]    1  0.4415385
[6,]    1  0.4415385
intermediate_value=solve(solve(B)+t(X)%*%X)
burn=2500

b_0=1
b_3=5

b_new=data.frame(b_0, b_3)
#b_new
# In the next step, we generate z_i
z=data.frame()
set.seed(123)
for(i in 1:10000){
  for (j in 1:250){
    if (dataset[j,2]==1){
      z[i,j]= rtruncnorm(n=1,a=0,b=Inf,mean=sum(X[j,]*b_new[i,]),sd=1)
    }
    else{
      z[i,j]=rtruncnorm(n=1,a=-Inf,b=0,mean=sum(X[j,]*b_new[i,]),sd=1)
    }
    
  }
  temp=mvrnorm(n=1,mu=intermediate_value%*%(solve(B)%*%as.matrix(b)+t(X)%*%t(as.matrix(z[i,]))),Sigma=intermediate_value)
  b_new<- rbind(b_new, temp)
}
th1 <- b_new

cf <- apply(th1[3002:10001,] , 2, mean)

z123 <- cbind(1, as.matrix(test_set[,5]))%*%(as.matrix(cf))

y123pred <- ifelse(z123 >= 0, 1, 0)

y123true <- test_set[,2]

The estimated coefficients for the intercept and the third feature i.e. RTS are:

cf
       b_0        b_3 
-1.5836733 -0.4891487 

The predicted values are:

y123pred %>% as.vector %>% cat
0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0

The true values are:

y123true %>% cat
1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0

We can see that our model predicts one out of the two deaths correctly and does not incorrectly predict any death.

The End

---
title: "Project on trauma data"
output: html_notebook
---

### 0. Installing the libraries

```{r  message=FALSE, warning=FALSE}
library(dplyr)
library(invgamma)
library(truncnorm)
library(matlib)
library(MASS)
library(wordspace)
```

### 1. Getting the data

We obtain the data and divide it into test and training set. The training set is further divided for 5-fold cross validation. The data looks as follows: the first column is the unique identifier for each observation (patient), the second column is the binary response (1 stands for dead and 0 stands for not dead) followed by four feature variables suitably scaled.


```{r}
dataset = read.delim("traumadata.txt")

dataset <- cbind(dataset[,1:2], scale(dataset[,-(1:2)], center = TRUE, scale = TRUE))
#dataset <- cbind(dataset[,1:2], normalize.cols(scale(df[,-(1:2)], center = TRUE, scale = FALSE), method = "euclidean", p = 2))
head(dataset)
pos <- dataset[dataset$death==1,]
neg <- dataset[dataset$death==0,]

set.seed(123)
testpos <- sample(pos$ID, 2)

pos %>% filter(!(ID %in% testpos)) -> train_pos
pos %>% filter((ID %in% testpos)) -> test_pos

testneg <- sample(neg$ID, 48)

neg %>% filter(!(ID %in% testneg)) -> train_neg
neg %>% filter(ID %in% testneg) -> test_neg

test_set <- rbind(test_pos, test_neg)

set.seed(123)
temp <- sample(train_pos$ID)

foldpos1 <- temp[1:4]
foldpos2 <- temp[5:8]
foldpos3 <- temp[9:12]
foldpos4 <- temp[13:16]
foldpos5 <- temp[17:20]

fold_pos_1 <- train_pos %>% filter(ID %in% foldpos1)
fold_pos_2 <- train_pos %>% filter(ID %in% foldpos2)
fold_pos_3 <- train_pos %>% filter(ID %in% foldpos3)
fold_pos_4 <- train_pos %>% filter(ID %in% foldpos4)
fold_pos_5 <- train_pos %>% filter(ID %in% foldpos5)

set.seed(123)
temp <- sample(train_neg$ID)

fold_neg_1 <- train_neg %>% filter(ID %in% temp[1:46])
fold_neg_2 <- train_neg %>% filter(ID %in% temp[(1+46):(46*2)])
fold_neg_3 <- train_neg %>% filter(ID %in% temp[(1+46*2):(46*3)])
fold_neg_4 <- train_neg %>% filter(ID %in% temp[(1+46*3):(46*4)])
fold_neg_5 <- train_neg %>% filter(ID %in% temp[(1+46*4):(46*5)])

fold_1 <- rbind(fold_pos_1, fold_neg_1)
fold_2 <- rbind(fold_pos_2, fold_neg_2)
fold_3 <- rbind(fold_pos_3, fold_neg_3)
fold_4 <- rbind(fold_pos_4, fold_neg_4)
fold_5 <- rbind(fold_pos_5, fold_neg_5)

```

### 2. Model selection

For model selection we are selecting from the four variables and the intercept i.e a total of five variables. The model selection Gibbs Sampler selects the intercept and just one of the features when applied on the five folds i.e is suggests us to do a Simple Linear Regression. The best test set precision is obtained in the second fold which suggests us to select the intercept and the third variable i.e. "RTS".


```{r}
dataset <- rbind(fold_1, fold_2, fold_3, fold_5)

b <- rep(0, 5)
B=diag(1000,5,5)
X=as.matrix(dataset[,-c(1,2)])
X=cbind(1,X)
#head(X)
#dim(X)
#length(y)


p=rep(0.5, 5)
b_0=1
b_1=1
b_2=1
b_3=1
b_4=1

gamma=c(1,0,1,0,1)
sigmasq=1
b_new=data.frame(b_0,b_1,b_2,b_3,b_4)
#b_new
gamma_new=t(data.frame(gamma))
#gamma_new


set.seed(123)

z=data.frame()

for(i in 1:10000){
  X_star = matrix(NA,nrow=200,ncol=5)
  for(k in 1:5){
    X_star[,k]=gamma_new[i,k]*X[,k]
  }
  for (j in 1:200){
    if (dataset[j,2]==1){
      z[i,j]= rtruncnorm(n=1,a=0,b=Inf,mean=sum(X_star[j,]*b_new[i,]),sd=sigmasq)
    }
    else{
      z[i,j]=rtruncnorm(n=1,a=-Inf,b=0,mean=sum(X_star[j,]*b_new[i,]),sd=sigmasq)
    }
    
  }
  intermediate_value<-inv(inv(B)+(t(X_star)%*%X_star))
  
  temp=mvrnorm(n=1,mu=intermediate_value%*%(inv(B)%*%as.matrix(b,byrow=TRUE)+(t(X_star)%*%t(as.matrix(z[i,])))),Sigma=intermediate_value)
  b_new <- rbind(b_new, temp)
  
  
  u=c(0,0,0,0,0)
  for (q in 1:5){
    
    v_q_star<-gamma_new[i,]*b_new[i,]
    v_q_star[q]<-b_new[i,q]
    v_q_star<-t(v_q_star)
    v_q_ss<-gamma_new[i,]*b_new[i,]
    v_q_ss[q]<-0
    v_q_ss<-t(v_q_ss)
    log=-0.5*((t(t(as.matrix(z[i,]))-X%*%v_q_ss)%*%(t(as.matrix(z[i,]))-X%*%v_q_ss)-(t(t(as.matrix(z[i,]))-X%*%v_q_star)%*%(t(as.matrix(z[i,]))-X%*%v_q_star))))
    ratio<- exp(log)+1
    u[q]<-rbinom(1,1,1/ratio)
    
  }
  gamma_new<-rbind(gamma_new,u)
}

```

```{r}
th <- gamma_new

myfun <- function(a) { paste(a, collapse="")}

ou <- apply(th[3002:10001,], 1, FUN = myfun)

table(ou)
```

Above we tabulate the frequencies of the various models that the Gibbs Sampler selects and see that the Simple Linear Regression model containing intercept and RTS is most frequent. Below we obtain estimates for  the coefficients using Gibbs Sampler and check its precision on the test set.


### 3. Estimation and Prediction

Here is the design matrix X for the estimation part.

```{r}
dataset <- rbind(fold_1, fold_2, fold_3, fold_4, fold_5)


b=c(0,0)
B=diag(1000,2,2)
X=as.matrix(dataset[,5])
X=cbind(1,X)
#head(X)
intermediate_value=solve(solve(B)+t(X)%*%X)
burn=2500

b_0=1
b_3=5

b_new=data.frame(b_0, b_3)
#b_new
# In the next step, we generate z_i
z=data.frame()
set.seed(123)
for(i in 1:10000){
  for (j in 1:250){
    if (dataset[j,2]==1){
      z[i,j]= rtruncnorm(n=1,a=0,b=Inf,mean=sum(X[j,]*b_new[i,]),sd=1)
    }
    else{
      z[i,j]=rtruncnorm(n=1,a=-Inf,b=0,mean=sum(X[j,]*b_new[i,]),sd=1)
    }
    
  }
  temp=mvrnorm(n=1,mu=intermediate_value%*%(solve(B)%*%as.matrix(b)+t(X)%*%t(as.matrix(z[i,]))),Sigma=intermediate_value)
  b_new<- rbind(b_new, temp)
}

```

```{r}
th1 <- b_new

cf <- apply(th1[3002:10001,] , 2, mean)

z123 <- cbind(1, as.matrix(test_set[,5]))%*%(as.matrix(cf))

y123pred <- ifelse(z123 >= 0, 1, 0)

y123true <- test_set[,2]
```

The estimated coefficients for the intercept and the third feature i.e. RTS are:

```{r}
cf
```


The predicted values are:

```{r}
y123pred %>% as.vector %>% cat
```

The true values are:

```{r}
y123true %>% cat
```

We can see that our model predicts one out of the two deaths correctly and does not incorrectly predict any death.


### The End
