# Logistic Regression example
# URL for data http://www.umass.edu/statdata/statdata/stat-rmult.html
# Description data
# ID   Identification Code  
# AGE    Age
# CHD  Coronary Heart Disease  (0=absent, 1=present)



rm(list=ls())

fdata<-read.csv("chdage.csv")
str(fdata)
## 'data.frame':    100 obs. of  3 variables:
##  $ ID : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ AGE: int  20 23 24 25 25 26 26 28 28 29 ...
##  $ CHD: int  0 0 0 0 1 0 0 0 0 0 ...
# Plot for binary results demo

with(fdata, plot(AGE,CHD, pch =19))

plot of chunk unnamed-chunk-1

# Ralation of age and CHD frequency

summary(fdata)
##        ID             AGE            CHD      
##  Min.   :  1.0   Min.   :20.0   Min.   :0.00  
##  1st Qu.: 25.8   1st Qu.:34.8   1st Qu.:0.00  
##  Median : 50.5   Median :44.0   Median :0.00  
##  Mean   : 50.5   Mean   :44.4   Mean   :0.43  
##  3rd Qu.: 75.2   3rd Qu.:55.0   3rd Qu.:1.00  
##  Max.   :100.0   Max.   :69.0   Max.   :1.00
fdata<-transform(fdata, agegr = cut(AGE, breaks=c(19,25,30,35,40,45,50,55,60,65,70)))

table(fdata$agegr)
## 
## (19,25] (25,30] (30,35] (35,40] (40,45] (45,50] (50,55] (55,60] (60,65] 
##       5      11      11      12      15      13       9      16       7 
## (65,70] 
##       1
mytable<- table(fdata$agegr, fdata$CHD)
mytable
##          
##            0  1
##   (19,25]  4  1
##   (25,30] 10  1
##   (30,35] 10  1
##   (35,40]  8  4
##   (40,45] 10  5
##   (45,50]  7  6
##   (50,55]  3  6
##   (55,60]  4 12
##   (60,65]  1  6
##   (65,70]  0  1
df0<-as.data.frame(mytable)
df0
##       Var1 Var2 Freq
## 1  (19,25]    0    4
## 2  (25,30]    0   10
## 3  (30,35]    0   10
## 4  (35,40]    0    8
## 5  (40,45]    0   10
## 6  (45,50]    0    7
## 7  (50,55]    0    3
## 8  (55,60]    0    4
## 9  (60,65]    0    1
## 10 (65,70]    0    0
## 11 (19,25]    1    1
## 12 (25,30]    1    1
## 13 (30,35]    1    1
## 14 (35,40]    1    4
## 15 (40,45]    1    5
## 16 (45,50]    1    6
## 17 (50,55]    1    6
## 18 (55,60]    1   12
## 19 (60,65]    1    6
## 20 (65,70]    1    1
df <- data.frame(agegr=df0[1:10,1], chdno=df0[,3][df0[,2] ==0], 
                 chdyes=df0[,3][df0[,2] ==1], 
                 chdOR = round(df0[,3][df0[,2] ==1]/df0[,3][df0[,2] ==0],2))
df
##      agegr chdno chdyes chdOR
## 1  (19,25]     4      1  0.25
## 2  (25,30]    10      1  0.10
## 3  (30,35]    10      1  0.10
## 4  (35,40]     8      4  0.50
## 5  (40,45]    10      5  0.50
## 6  (45,50]     7      6  0.86
## 7  (50,55]     3      6  2.00
## 8  (55,60]     4     12  3.00
## 9  (60,65]     1      6  6.00
## 10 (65,70]     0      1   Inf
plot(df[1:9,1],df[1:9,4], main = "Odds Ratio")

plot of chunk unnamed-chunk-1

# Logistic regression model

fit <- glm(CHD~AGE, data=fdata, family=binomial(link="logit"))
summary(fit)
## 
## Call:
## glm(formula = CHD ~ AGE, family = binomial(link = "logit"), data = fdata)
## 
## Deviance Residuals: 
##    Min      1Q  Median      3Q     Max  
## -1.972  -0.846  -0.458   0.825   2.286  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  -5.3095     1.1337   -4.68  2.8e-06 ***
## AGE           0.1109     0.0241    4.61  4.0e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 136.66  on 99  degrees of freedom
## Residual deviance: 107.35  on 98  degrees of freedom
## AIC: 111.4
## 
## Number of Fisher Scoring iterations: 4
coef(fit)
## (Intercept)         AGE 
##     -5.3095      0.1109
confint(fit)
## Waiting for profiling to be done...
##                2.5 % 97.5 %
## (Intercept) -7.72587 -3.246
## AGE          0.06693  0.162
exp(coef(fit))
## (Intercept)         AGE 
##    0.004945    1.117307
exp(confint(fit))
## Waiting for profiling to be done...
##                 2.5 %  97.5 %
## (Intercept) 0.0004413 0.03892
## AGE         1.0692223 1.17587
# Frequency of coronary heart didease increases for 11,7% (CI 0.95, 6,9%, 17,6%)