Load the sepsis dataset
## Shock Malnutrition Alcoholism Age Infarction Death
## 1 0 0 0 56 0 0
## 2 0 0 0 80 0 0
## 3 0 0 0 61 0 0
## 4 0 0 0 26 0 0
## 5 0 0 0 53 0 0
## 6 0 1 0 87 0 1
We build a 2-way contingency table
sepdeath <- table(sepsis$Death, sepsis$Shock)
rownames(sepdeath) <- c("Alive", "Death")
colnames(sepdeath) <- c("No Shock", "Shock Present")
sepdeath
##
## No Shock Shock Present
## Alive 82 3
## Death 14 7
So,
Pr(Death/Shock Present) = 7/10 = 70%
Pr(Death/No Shock) = 14/96 = 14.6%
Is the difference significant?
We apply the two sample proportion test.
prop.test(sepdeath, correct = FALSE)
##
## 2-sample test for equality of proportions without continuity
## correction
##
## data: sepdeath
## X-squared = 17.507, df = 1, p-value = 2.862e-05
## alternative hypothesis: two.sided
## 95 percent confidence interval:
## 0.09263929 0.50343914
## sample estimates:
## prop 1 prop 2
## 0.9647059 0.6666667
Marginal probabilites are to be calculated. These are probablities that are there on the sides of a 2 x 2 contingency table.
If the conditional probablities are same then there is no association between the variables in a contingency table.
Odds ratio is important as it tells us the degree of association.
The odds ratio is 82 x 7 / 14 x 3 (cross product of the table values)
(82*7)/(14*3)
## [1] 13.66667
Calculating odds ratio using vcd package
library(vcd)
oddsratio(sepdeath,log = FALSE)
## odds ratios for and
##
## [1] 13.66667
Building a 95% confidence intervals for the odds ratio.
confint(oddsratio(sepdeath, log=FALSE), level = 0.99)
## 0.5 % 99.5 %
## Alive:Death/No Shock:Shock Present 1.989146 93.89847
As the value 1 of the odds ratio is outside the null hypothesis we can reject the null hypothesis. In this case the level chosen is 99%
chisq.test(sepdeath)
## Warning in chisq.test(sepdeath): Chi-squared approximation may be incorrect
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: sepdeath
## X-squared = 14.193, df = 1, p-value = 0.000165
Why is odds ratio a preferred statistic? As we get a number. If we have a 95% confidence interval, then we can also test for the significance of the model as if the lower bound of the 95% confidence interval excludes 1 (an odds ratio of 1 is the null hypothesis or H0) then we can reject the H0. Hence odds ratio is a better statistic to report when we are looking for association between binary variables.
Calculate odds ratio for malnutrition
or2 <- oddsratio(table(sepsis$Death, sepsis$Malnutrition), log = FALSE)
confint(or2)
## 2.5 % 97.5 %
## 0:1/0:1 1.24805 9.004814
Calculate odds ratio for alcoholism
or3 <- oddsratio(table(sepsis$Death, sepsis$Alcoholism), log = FALSE)
confint(or3)
## 2.5 % 97.5 %
## 0:1/0:1 1.931937 15.83087
Thus both alcoholism and malnutrtion are significantly associated with death when we test using the Odds ratio.
Calculating the association of age with death
sepsis_death <- subset(sepsis, sepsis$Death==1) # Subsetting a data to extract only the data of the patients who died.
sepsis_alive <- subset(sepsis, sepsis$Death!=1) # Now we get the subset of patients who are alive.
summary(sepsis_death$Age)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 47.00 60.00 68.00 66.48 70.00 87.00
summary(sepsis_alive$Age)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 17.00 28.00 46.00 47.53 60.00 94.00
The average age of those who die and those who survive are same.
H0 = (Mean age of surviving = Mean age of dead)
t.test(sepsis$Age ~ sepsis$Death)
##
## Welch Two Sample t-test
##
## data: sepsis$Age by sepsis$Death
## t = -5.8687, df = 64.455, p-value = 1.665e-07
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -25.39551 -12.49805
## sample estimates:
## mean in group 0 mean in group 1
## 47.52941 66.47619
t.test(sepsis_alive$Age, sepsis_death$Age) # Alternate formulation of the t test. Comparing two variables in two datasets we dont need to put tilde.
##
## Welch Two Sample t-test
##
## data: sepsis_alive$Age and sepsis_death$Age
## t = -5.8687, df = 64.455, p-value = 1.665e-07
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -25.39551 -12.49805
## sample estimates:
## mean of x mean of y
## 47.52941 66.47619
A variable with only two possible values. Building a model to predict this as an independant variable requires us to use logistic regression.
head(sepsis)
## Shock Malnutrition Alcoholism Age Infarction Death
## 1 0 0 0 56 0 0
## 2 0 0 0 80 0 0
## 3 0 0 0 61 0 0
## 4 0 0 0 26 0 0
## 5 0 0 0 53 0 0
## 6 0 1 0 87 0 1
Pr(Y = 1) = Pr (Y = Death) = eB0+B1xShock+B2xMalnutrition+..+B5xInfarction/(1+numerator)
Why?
Because the the independant n can have only a 0 and 1 state and we are intersted in the probablity of the variable being 0 and 1. The formula generates the probablity estimates ranging between 0 - 1 from the equation.
summary(sepsis)
## Shock Malnutrition Alcoholism Age
## Min. :0.00000 Min. :0.0000 Min. :0.0000 Min. :17.00
## 1st Qu.:0.00000 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:33.00
## Median :0.00000 Median :0.0000 Median :0.0000 Median :52.50
## Mean :0.09434 Mean :0.3019 Mean :0.2075 Mean :51.28
## 3rd Qu.:0.00000 3rd Qu.:1.0000 3rd Qu.:0.0000 3rd Qu.:68.75
## Max. :1.00000 Max. :1.0000 Max. :1.0000 Max. :94.00
## Infarction Death
## Min. :0.0000 Min. :0.0000
## 1st Qu.:0.0000 1st Qu.:0.0000
## Median :0.0000 Median :0.0000
## Mean :0.1321 Mean :0.1981
## 3rd Qu.:0.0000 3rd Qu.:0.0000
## Max. :1.0000 Max. :1.0000
sepsis1 <- glm(Death~ ., data = sepsis, family = binomial) #. notation inputs all other factors in the glm model except for death.
summary(sepsis1)
##
## Call:
## glm(formula = Death ~ ., family = binomial, data = sepsis)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.3277 -0.4204 -0.0781 -0.0274 3.2946
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -9.75391 2.54170 -3.838 0.000124 ***
## Shock 3.67387 1.16481 3.154 0.001610 **
## Malnutrition 1.21658 0.72822 1.671 0.094798 .
## Alcoholism 3.35488 0.98210 3.416 0.000635 ***
## Age 0.09215 0.03032 3.039 0.002374 **
## Infarction 2.79759 1.16397 2.403 0.016240 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 105.528 on 105 degrees of freedom
## Residual deviance: 53.122 on 100 degrees of freedom
## AIC: 65.122
##
## Number of Fisher Scoring iterations: 7
The model is thus:
P(Death) = e-9.75+3.67(Shock)+1.21(Malnutrition)+3.35(Alcoholism)+0.09(Age)+2.79(Infarction)/(1+numerator)
The most important is Alcoholism as it has the lowest P value. A factor with a low p value has a lower importance in the model.
In any model the H0 is that the beta coefficient for each dependant variable is considered to be 0. The H1 is that B1 is not equal to 0. The p value is the probablity of getting a B1 value of 3.67 for shock in this model if H0 is true. This probablity is 0.0016 (0.16%). Hence the null hypothesis is rejected.
The intercept detects the burden of the average. It is the probablity of death in this model which would occur if all the other factors were 0 in this model. It tells that the overall effect is significant but this has no interpretation.
The z value is the ratio of the “B estimate” divided by the standard error (SE).
The sign of the coefficient inidicates the risk for the response indicator. Thus in this data paitents with shock is more likely to die. If the B value was negative then it would have been interpreted as less likely to die.
The AIC value is the Akaike Information Criteria. It is used for model selection. A smaller value indicates a better model. However it is used for model selection and does not inform us about the quality of the model.
simdata <- data.frame(Shock=c(1,1), Malnutrition=c(0,0), Alcoholism=c(0,0), Age=c(90,36), Infarction=c(1,1)) #Simulatetd data is generated.
predict(sepsis1, newdata=simdata, type="response") #Using the model to predict the model.
## 1 2
## 0.9933818 0.5087599
sepsis_pred <- predict(sepsis1, newdata=sepsis, type="response")
dim(sepsis_pred)
## NULL
length(sepsis_pred)
## [1] 106
head(sepsis_pred)
## 1 2 3 4 5
## 0.0100174835 0.0845795486 0.0157879878 0.0006371069 0.0076163433
## 6
## 0.3728426160
sepsis_pred <- ifelse(sepsis_pred<0.5,"Alive", "Death")
head(sepsis_pred)
## 1 2 3 4 5 6
## "Alive" "Alive" "Alive" "Alive" "Alive" "Alive"
confusionmatrix <- table(sepsis$Death, sepsis_pred) # Confusion matrix cross tabulation.
confusionmatrix
## sepsis_pred
## Alive Death
## 0 84 1
## 1 8 13
misclassify <- 9/106 # This is the misclassification test.
misclassify
## [1] 0.08490566
sepsis3 <- glm(Death ~ Shock+Alcoholism+Age+Infarction, data=sepsis, family = binomial)
summary(sepsis3)
##
## Call:
## glm(formula = Death ~ Shock + Alcoholism + Age + Infarction,
## family = binomial, data = sepsis)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.26192 -0.50391 -0.10690 -0.04112 3.06000
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -8.89459 2.31689 -3.839 0.000124 ***
## Shock 3.70119 1.10353 3.354 0.000797 ***
## Alcoholism 3.18590 0.91725 3.473 0.000514 ***
## Age 0.08983 0.02922 3.075 0.002106 **
## Infarction 2.38647 1.07227 2.226 0.026039 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 105.528 on 105 degrees of freedom
## Residual deviance: 56.073 on 101 degrees of freedom
## AIC: 66.073
##
## Number of Fisher Scoring iterations: 7
sepsis_pred2 <- predict(sepsis3, newdata = sepsis, type="response")
head(sepsis_pred2)
## 1 2 3 4 5 6
## 0.020552383 0.153416834 0.031834480 0.001415347 0.015773774 0.253652221
sepsis_pred2 <- ifelse(sepsis_pred2<0.5,"Alive", "Death")
confusionmatrix <- table(sepsis$Death, sepsis_pred2)
confusionmatrix
## sepsis_pred2
## Alive Death
## 0 83 2
## 1 7 14
misclassify2 <- 9/106
misclassify
## [1] 0.08490566
In this model the misclassification rate is not changed but the AIC value has changed. The coefficients have changed.
This is also known as model based odds ratio.
The odds ratio that was calculated earlier was an example an unadjusted odds ratio. We can get the adjusted odds ratio from the logistic regression model.
The idea is what is the probablity of death if the patient is in shock. This simplifiies to eB1 for shock where B1 is the B value for Shock in the regression model.
This is one of the best things about a logistic regression model. That we can easily get the odds ratio.
sepsis1
##
## Call: glm(formula = Death ~ ., family = binomial, data = sepsis)
##
## Coefficients:
## (Intercept) Shock Malnutrition Alcoholism Age
## -9.75391 3.67387 1.21658 3.35488 0.09215
## Infarction
## 2.79759
##
## Degrees of Freedom: 105 Total (i.e. Null); 100 Residual
## Null Deviance: 105.5
## Residual Deviance: 53.12 AIC: 65.12
or <- exp(sepsis1$coefficients)
round(or,2)
## (Intercept) Shock Malnutrition Alcoholism Age
## 0.00 39.40 3.38 28.64 1.10
## Infarction
## 16.41
round(cbind(or, exp(confint(sepsis1))),2) # This will give you the adjusted odds ratio as well as the 95% confidence intervals of the odds ratio. cbind command allows you to bind two outputs together.
## Waiting for profiling to be done...
## or 2.5 % 97.5 %
## (Intercept) 0.00 0.00 0.00
## Shock 39.40 5.12 551.51
## Malnutrition 3.38 0.84 15.50
## Alcoholism 28.64 5.21 267.55
## Age 1.10 1.04 1.17
## Infarction 16.41 2.03 218.48
Use the curve command to plot the logistic regression equation. This probablity curve represents the probablity of death if the patient has shock, h/o infarction and is alcoholic.
sepsis3$coefficients
## (Intercept) Shock Alcoholism Age Infarction
## -8.89458992 3.70119321 3.18590397 0.08983175 2.38646847
# Plot 1 with all factors
curve(exp(-8.89+3.7+3.18+0.09*x+2.39)/(1+exp(-8.89+3.7+3.18+0.09*x+2.39)),from = 15, to = 95, xlab = "Age in years", ylab="Probablity of Death", ylim =c(0,1), col= 1, lwd=3, main="Probablity of Mortality")
# Plot 2 for shock and alcoholism and add existing graph
curve(exp(-8.89+3.7+3.18+0.09*x)/(1+exp(-8.89+3.7+3.18+0.09*x)), col= 2, lwd=3, add=TRUE)
# Plot 3 for shock and infarction and add existing graph
curve(exp(-8.89+3.7+0.09*x+2.39)/(1+exp(-8.89+3.7+0.09*x+2.39)), col= 3, lwd=3, add=TRUE)
# Plot 4 for alcoholism and infarction and add existing graph
curve(exp(-8.89+3.18+0.09*x+2.39)/(1+exp(-8.89+3.18+0.09*x+2.39)), col= 4, lwd=3, add=TRUE)
# Plot 5 for alcoholism and add existing graph
curve(exp(-8.89+3.18+0.09*x)/(1+exp(-8.89+3.18+0.09*x)), col=5, lwd=3, add=TRUE)
# Plot 6 for shock and add existing graph
curve(exp(-8.89+3.7+0.09*x)/(1+exp(-8.89+3.7+0.09*x)), col= 6, lwd=3, add=TRUE)
# Plot 7 for infarction and add existing graph
curve(exp(-8.89+0.09*x+2.39)/(1+exp(-8.89+0.09*x+2.39)), col= 7, lwd=3, add=TRUE)
# Plot 8 for none and add existing graph
curve(exp(-8.89+0.09*x)/(1+exp(-8.89+0.09*x)), col= 8, lwd=3, add=TRUE)
# Add a legend
legend("right", pch = rep(16,8), col= c(1:10), legend = c("S+A+I","S+A","S+I","A+I","A","S","I","None"))