#problem a
#The categorical variables were gender, chestpain, angina, vessel, and electrocardio. These variables have no inherent numerical meaning and can only take on a limited, fixed set of values.
#The continuous variables were age, blood pressure, serum cholesterol, heart rate, blood sugar, ST depression, and thal, since these can take on any value between a minimum and a maximum. The classification of STslope seemed unclear.
#There were 160 cases with heart disease absent and 137 cases with heart disease present in this data set, obtained with table(hd) after recoding the response Y as hd.
# First read without sep: the comma-separated fields land in a single column (see head() below)
data <- read.table("Data-HW3-CHeartDisease.dat")
head(data)
## V1
## 1 63.0,1.0,1.0,145.0,233.0,1.0,2.0,150.0,0.0,2.3,3.0,0.0,6.0,0
## 2 67.0,1.0,4.0,160.0,286.0,0.0,2.0,108.0,1.0,1.5,2.0,3.0,3.0,2
## 3 67.0,1.0,4.0,120.0,229.0,0.0,2.0,129.0,1.0,2.6,2.0,2.0,7.0,1
## 4 37.0,1.0,3.0,130.0,250.0,0.0,0.0,187.0,0.0,3.5,3.0,0.0,3.0,0
## 5 41.0,0.0,2.0,130.0,204.0,0.0,2.0,172.0,0.0,1.4,1.0,0.0,3.0,0
## 6 56.0,1.0,2.0,120.0,236.0,0.0,0.0,178.0,0.0,0.8,1.0,0.0,3.0,0
# Re-read with sep = "," so each of the 14 comma-separated fields becomes its own column
data <- read.table(file = "Data-HW3-CHeartDisease.dat", header = FALSE, quote = "", sep = ",")
n.all <- nrow(data)
# Identify rows with a '?' (missing value) in column 12 (vessel) or column 13 (thal)
id.ms <- sort(c(seq(1, n.all)[data[, 12] == '?'], seq(1, n.all)[data[, 13] == '?']))
data2 <- data[-id.ms, ]                      # drop rows with missing values
# Convert the factor level codes of vessel and thal to numeric scores
# (this assumes the two columns were read as factors because of the '?' entries)
data2[, 12] <- as.numeric(data2[, 12]) - 2
data2[, 13] <- as.numeric(data2[, 13]) - 1
X <- data.matrix(data2[, 1:13])              # predictor matrix
hd <- data2[, 14]; hd[hd > 0] <- 1           # binary response: 1 = heart disease present, 0 = absent
colnames(X) <- c("age", "gender", "chestpain", "bldpressure", "chol", "bldsugar", "electrocardio", "heartrate", "angina", "STdepression", "STslope", "vessel", "thal")
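# Quick check (added here as an illustration, not part of the original script): tabulate the
# recoded response to confirm the class balance described in problem a (160 absent, 137 present).
table(hd)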
# Logistic regression of hd on all 13 predictors, entered through the numeric matrix X
model = glm(hd ~ X, family = "binomial", data = data2)
summary(model)
##
## Call:
## glm(formula = hd ~ X, family = "binomial", data = data2)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.8042 -0.5263 -0.1860 0.4161 2.3676
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -6.993701 2.893938 -2.417 0.01566 *
## Xage -0.014057 0.024036 -0.585 0.55866
## Xgender 1.319688 0.486718 2.711 0.00670 **
## Xchestpain 0.578582 0.191335 3.024 0.00250 **
## Xbldpressure 0.024182 0.010727 2.254 0.02418 *
## Xchol 0.004816 0.003775 1.276 0.20202
## Xbldsugar -0.991868 0.554947 -1.787 0.07389 .
## Xelectrocardio 0.246117 0.185238 1.329 0.18396
## Xheartrate -0.021183 0.010275 -2.062 0.03923 *
## Xangina 0.915651 0.414003 2.212 0.02699 *
## XSTdepression 0.249909 0.212418 1.176 0.23940
## XSTslope 0.582699 0.362317 1.608 0.10778
## Xvessel 1.267008 0.265723 4.768 1.86e-06 ***
## Xthal 0.714003 0.202068 3.533 0.00041 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 409.95 on 296 degrees of freedom
## Residual deviance: 203.86 on 283 degrees of freedom
## AIC: 231.86
##
## Number of Fisher Scoring iterations: 6
#The model suggests that there are 7 statistically significant predictors at the 0.05 level: gender, chestpain, blood pressure, heart rate, angina, vessel, and thal.
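# One way to pull these terms out programmatically (added here as an illustration, not part of
# the original script): filter the coefficient table of model by its Wald p-values.
coefs <- coef(summary(model))                 # columns: Estimate, Std. Error, z value, Pr(>|z|)
rownames(coefs)[coefs[, "Pr(>|z|)"] < 0.05]   # terms with p-value below 0.05 (includes the intercept)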
library(dummies)
## dummies-1.5.6 provided by Decision Patterns
#problem c
chestpaindummy <- dummy("V3", data2)
## Warning in model.matrix.default(~x - 1, model.frame(~x - 1), contrasts =
## FALSE): non-list contrasts argument ignored
thaldummy <- dummy("V13", data2)
## Warning in model.matrix.default(~x - 1, model.frame(~x - 1), contrasts =
## FALSE): non-list contrasts argument ignored
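# Editorial sketch (not part of the original analysis): if the dummies package is unavailable,
# base R's model.matrix() builds equivalent 0/1 indicator columns; cp.ind and thal.ind are
# hypothetical names used here only for comparison, the model below still uses dummy()'s output.
cp.ind <- model.matrix(~ factor(V3) - 1, data2)     # one indicator column per chest pain type
thal.ind <- model.matrix(~ factor(V13) - 1, data2)  # one indicator column per thal level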
# Refit with chestpain and thal entered as dummy variables (reference levels: chest pain type 1 and thal level 1)
model2 = glm(hd ~ V1+V2+chestpaindummy[ ,2:4]+V4+V5+V6+V7+V8+V9+V10+V11+V12+thaldummy[ , 2:3], family = "binomial", data = data2)
summary(model2)
##
## Call:
## glm(formula = hd ~ V1 + V2 + chestpaindummy[, 2:4] + V4 + V5 +
## V6 + V7 + V8 + V9 + V10 + V11 + V12 + thaldummy[, 2:3], family = "binomial",
## data = data2)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.7145 -0.5436 -0.1444 0.3264 2.7316
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -5.986093 2.938058 -2.037 0.041607 *
## V1 -0.012296 0.024664 -0.499 0.618120
## V2 1.431422 0.513185 2.789 0.005282 **
## chestpaindummy[, 2:4]V32 1.071153 0.753902 1.421 0.155371
## chestpaindummy[, 2:4]V33 0.202175 0.648718 0.312 0.755304
## chestpaindummy[, 2:4]V34 2.006802 0.652608 3.075 0.002105 **
## V4 0.023981 0.011110 2.159 0.030889 *
## V5 0.004930 0.003944 1.250 0.211306
## V6 -0.610758 0.599184 -1.019 0.308052
## V7 0.255433 0.189565 1.347 0.177829
## V8 -0.021281 0.010821 -1.967 0.049224 *
## V9 0.739431 0.434687 1.701 0.088931 .
## V10 0.353095 0.230102 1.535 0.124903
## V11 0.670508 0.371616 1.804 0.071184 .
## V12 1.269290 0.271304 4.678 2.89e-06 ***
## thaldummy[, 2:3]V132 0.011430 0.795090 0.014 0.988530
## thaldummy[, 2:3]V133 1.441377 0.418558 3.444 0.000574 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 409.95 on 296 degrees of freedom
## Residual deviance: 194.83 on 280 degrees of freedom
## AIC: 228.83
##
## Number of Fisher Scoring iterations: 6
#problem d
#For every one-unit increase in serum cholesterol, the log odds of heart disease (versus no heart disease) increase by 0.004930, holding all other predictors fixed.
#To test the null hypothesis that the serum cholesterol coefficient equals zero, check the p-value for this coefficient, which is 0.211306. At a significance level of 0.05, we cannot reject the null hypothesis, so there is no evidence that serum cholesterol is a significant predictor.
#Compared with the first model (problem b), model2 has only 6 statistically significant predictors instead of 7. Because chestpain and thal were entered as dummy variables, only chest pain level 4 and thal level 3 were statistically significant.
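# Added illustration (not part of the original script): the cholesterol coefficient is easier to
# read as an odds ratio, and the two fits can be compared by the AIC values reported above.
exp(coef(model2)["V5"])   # about 1.005: each extra unit of cholesterol multiplies the odds by ~1.005
AIC(model, model2)        # 231.86 vs 228.83, per the two summaries above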
#problem e
#For those with chest pain type 4, the log odds of having heart disease (versus not having it) are 2.006802 higher than for those with chest pain type 1, holding the other predictors fixed. To test the null hypothesis that the chest pain level 4 coefficient equals zero, check the p-value for this coefficient, which is 0.002105. At a significance level of 0.05, we reject the null hypothesis since the p-value is well below alpha.
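# Added illustration (not part of the original output): on the odds scale this coefficient is
# exp(2.006802), roughly 7.4, i.e. the estimated odds of heart disease for chest pain type 4
# are about 7.4 times those for type 1, other predictors held fixed.
exp(2.006802)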
#problem f
n <- nrow(data2)
yb.hat <- rep(0, n)                    # predicted labels, default 0 (no heart disease)
yb.hat[fitted(model2) > 0.5] <- 1      # classify as heart disease when the fitted probability exceeds 0.5
sum(hd != yb.hat) / length(hd)         # misclassification rate
## [1] 0.1380471
#The misclassification rate for this logistic regression model is about 14% (0.1380471).
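# A confusion matrix (added here, not in the original output) splits the ~13.8% error rate
# into false positives and false negatives:
table(observed = hd, predicted = yb.hat)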