# Logistic Regression example
# URL for data http://www.umass.edu/statdata/statdata/stat-rmult.html
# Description data
# ID Identification Code
# AGE Age
# CHD Coronary Heart Disease (0=absent, 1=present)
rm(list=ls())
fdata<-read.csv("chdage.csv")
str(fdata)
## 'data.frame': 100 obs. of 3 variables:
## $ ID : int 1 2 3 4 5 6 7 8 9 10 ...
## $ AGE: int 20 23 24 25 25 26 26 28 28 29 ...
## $ CHD: int 0 0 0 0 1 0 0 0 0 0 ...
# Plot for binary results demo
with(fdata, plot(AGE,CHD, pch =19))

# Ralation of age and CHD frequency
summary(fdata)
## ID AGE CHD
## Min. : 1.0 Min. :20.0 Min. :0.00
## 1st Qu.: 25.8 1st Qu.:34.8 1st Qu.:0.00
## Median : 50.5 Median :44.0 Median :0.00
## Mean : 50.5 Mean :44.4 Mean :0.43
## 3rd Qu.: 75.2 3rd Qu.:55.0 3rd Qu.:1.00
## Max. :100.0 Max. :69.0 Max. :1.00
fdata<-transform(fdata, agegr = cut(AGE, breaks=c(19,25,30,35,40,45,50,55,60,65,70)))
table(fdata$agegr)
##
## (19,25] (25,30] (30,35] (35,40] (40,45] (45,50] (50,55] (55,60] (60,65]
## 5 11 11 12 15 13 9 16 7
## (65,70]
## 1
mytable<- table(fdata$agegr, fdata$CHD)
mytable
##
## 0 1
## (19,25] 4 1
## (25,30] 10 1
## (30,35] 10 1
## (35,40] 8 4
## (40,45] 10 5
## (45,50] 7 6
## (50,55] 3 6
## (55,60] 4 12
## (60,65] 1 6
## (65,70] 0 1
df0<-as.data.frame(mytable)
df0
## Var1 Var2 Freq
## 1 (19,25] 0 4
## 2 (25,30] 0 10
## 3 (30,35] 0 10
## 4 (35,40] 0 8
## 5 (40,45] 0 10
## 6 (45,50] 0 7
## 7 (50,55] 0 3
## 8 (55,60] 0 4
## 9 (60,65] 0 1
## 10 (65,70] 0 0
## 11 (19,25] 1 1
## 12 (25,30] 1 1
## 13 (30,35] 1 1
## 14 (35,40] 1 4
## 15 (40,45] 1 5
## 16 (45,50] 1 6
## 17 (50,55] 1 6
## 18 (55,60] 1 12
## 19 (60,65] 1 6
## 20 (65,70] 1 1
df <- data.frame(agegr=df0[1:10,1], chdno=df0[,3][df0[,2] ==0],
chdyes=df0[,3][df0[,2] ==1],
chdOR = round(df0[,3][df0[,2] ==1]/df0[,3][df0[,2] ==0],2))
df
## agegr chdno chdyes chdOR
## 1 (19,25] 4 1 0.25
## 2 (25,30] 10 1 0.10
## 3 (30,35] 10 1 0.10
## 4 (35,40] 8 4 0.50
## 5 (40,45] 10 5 0.50
## 6 (45,50] 7 6 0.86
## 7 (50,55] 3 6 2.00
## 8 (55,60] 4 12 3.00
## 9 (60,65] 1 6 6.00
## 10 (65,70] 0 1 Inf
plot(df[1:9,1],df[1:9,4], main = "Odds Ratio")

# Logistic regression model
fit <- glm(CHD~AGE, data=fdata, family=binomial(link="logit"))
summary(fit)
##
## Call:
## glm(formula = CHD ~ AGE, family = binomial(link = "logit"), data = fdata)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.972 -0.846 -0.458 0.825 2.286
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -5.3095 1.1337 -4.68 2.8e-06 ***
## AGE 0.1109 0.0241 4.61 4.0e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 136.66 on 99 degrees of freedom
## Residual deviance: 107.35 on 98 degrees of freedom
## AIC: 111.4
##
## Number of Fisher Scoring iterations: 4
coef(fit)
## (Intercept) AGE
## -5.3095 0.1109
confint(fit)
## Waiting for profiling to be done...
## 2.5 % 97.5 %
## (Intercept) -7.72587 -3.246
## AGE 0.06693 0.162
exp(coef(fit))
## (Intercept) AGE
## 0.004945 1.117307
exp(confint(fit))
## Waiting for profiling to be done...
## 2.5 % 97.5 %
## (Intercept) 0.0004413 0.03892
## AGE 1.0692223 1.17587
# Frequency of coronary heart didease increases for 11,7% (CI 0.95, 6,9%, 17,6%)