Chapter 6 Class Example (Week 12)

#Multiple logistic regression # Predictors / Response # # Pregnancies - Number of times pregnant. # Glucose - Plasma glucose concentration after a 2-hour oral glucose tolerance test. # BloodPressure – Diastolic blood pressure (mm Hg). # SkinThickness – Triceps skinfold thickness (mm). # Insulin – 2-Hour serum insulin (mu U/ml). # BMI – Body Mass Index (weight in kg / (height in m)^2). # DiabetesPedigreeFunction - A function that scores likelihood of diabetes based on family history. # Age - Age of the patient (years). # Outcome (Response Variable) - Class label indicating diabetes status.

#importing data
diabetes <- read.csv("~/chapter 6 stats 244/diabetes.csv")
d  = diabetes
# fit logistic regression model

##glm - generalised linear model vs lm- linear model

####g(E[Y|X])=g(p(x))=log[p(x)/1-P(x)]
model_fit = glm(Outcome~., data=d, family = 'binomial')

summary(model_fit)
## 
## Call:
## glm(formula = Outcome ~ ., family = "binomial", data = d)
## 
## Coefficients:
##                            Estimate Std. Error z value Pr(>|z|)    
## (Intercept)              -8.4046964  0.7166359 -11.728  < 2e-16 ***
## Pregnancies               0.1231823  0.0320776   3.840 0.000123 ***
## Glucose                   0.0351637  0.0037087   9.481  < 2e-16 ***
## BloodPressure            -0.0132955  0.0052336  -2.540 0.011072 *  
## SkinThickness             0.0006190  0.0068994   0.090 0.928515    
## Insulin                  -0.0011917  0.0009012  -1.322 0.186065    
## BMI                       0.0897010  0.0150876   5.945 2.76e-09 ***
## DiabetesPedigreeFunction  0.9451797  0.2991475   3.160 0.001580 ** 
## Age                       0.0148690  0.0093348   1.593 0.111192    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 993.48  on 767  degrees of freedom
## Residual deviance: 723.45  on 759  degrees of freedom
## AIC: 741.45
## 
## Number of Fisher Scoring iterations: 5
model_fit = glm(Outcome~., data=d, family = binomial())

summary(model_fit)
## 
## Call:
## glm(formula = Outcome ~ ., family = binomial(), data = d)
## 
## Coefficients:
##                            Estimate Std. Error z value Pr(>|z|)    
## (Intercept)              -8.4046964  0.7166359 -11.728  < 2e-16 ***
## Pregnancies               0.1231823  0.0320776   3.840 0.000123 ***
## Glucose                   0.0351637  0.0037087   9.481  < 2e-16 ***
## BloodPressure            -0.0132955  0.0052336  -2.540 0.011072 *  
## SkinThickness             0.0006190  0.0068994   0.090 0.928515    
## Insulin                  -0.0011917  0.0009012  -1.322 0.186065    
## BMI                       0.0897010  0.0150876   5.945 2.76e-09 ***
## DiabetesPedigreeFunction  0.9451797  0.2991475   3.160 0.001580 ** 
## Age                       0.0148690  0.0093348   1.593 0.111192    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 993.48  on 767  degrees of freedom
## Residual deviance: 723.45  on 759  degrees of freedom
## AIC: 741.45
## 
## Number of Fisher Scoring iterations: 5
# Coefficients and Interpretations

#log[p(x)/1-p(x)] =n(x) =B0+B1X1+...+BpXp - (-infinity,+infini)
coef(model_fit)       #log-odds
##              (Intercept)              Pregnancies                  Glucose 
##            -8.4046963669             0.1231822984             0.0351637146 
##            BloodPressure            SkinThickness                  Insulin 
##            -0.0132955469             0.0006189644            -0.0011916990 
##                      BMI DiabetesPedigreeFunction                      Age 
##             0.0897009700             0.9451797406             0.0148690047
#p(x)/1-p(x) - (0,inifity)
exp(coef(model_fit))  #odds
##              (Intercept)              Pregnancies                  Glucose 
##             0.0002238137             1.1310905981             1.0357892688 
##            BloodPressure            SkinThickness                  Insulin 
##             0.9867924485             1.0006191560             0.9988090108 
##                      BMI DiabetesPedigreeFunction                      Age 
##             1.0938471417             2.5732758592             1.0149800983
#  Pregnancies -
#  For every additional pregnancy, the odds of having diabetes increase by 13.1%.
#
#  DiabetesPedigreeFunction - 
#  For each unit increase in the pedigree score, the odds of diabetes are 157% higher (about 2.6 times greater).
#
#  Blood pressure - 
#  Each mm Hg increase in diastolic blood pressure is associated with a 1.3% decrease in the odds of diabetes


# increase with k=5 units for X1

5*coef(model_fit)[2]       #log-odds
## Pregnancies 
##   0.6159115
exp(5*coef(model_fit)[2])  #odds
## Pregnancies 
##    1.851343
##################################
#predictions

df_pred = data.frame("Pregnancies"=3, "Glucose"=147, "BloodPressure"=74,
                     "SkinThickness"=32, "Insulin"=0, "BMI"=29.6,
                     "DiabetesPedigreeFunction"=0.631, "Age"=49)
#probability for this prediction
##Gives Probability of the person having diabetes 
prob = predict(model_fit, newdata = df_pred, type = 'response')
prob
##         1 
## 0.5374277
# Get the prediction (use prob to classify to 1 or 0)
##Gives Prediction between 1 or 0
pred = as.factor(ifelse(prob>=0.5, "1", "0"))
c('Prediction' = as.character(pred))
## Prediction 
##        "1"
#########################################
# Variable selection (model calibration)
##Which varibale best predicts the model
full_model = glm(Outcome~., data = d, family='binomial')
##model without variables yet-base model
null_model = glm(Outcome~1, data = d, family='binomial') #also called the base model
##step()-stepwise selection
###output is process of finding best model
model_selection_results = step(null_model,
                               scope=list(lower=formula(null_model),
                                          upper=formula(full_model)),
                               direction='forward',
                               k = 2) # AIC Make k = log(n) for BIC(more harsh)
## Start:  AIC=995.48
## Outcome ~ 1
## 
##                            Df Deviance    AIC
## + Glucose                   1   808.72 812.72
## + BMI                       1   920.71 924.71
## + Age                       1   950.72 954.72
## + Pregnancies               1   956.21 960.21
## + DiabetesPedigreeFunction  1   970.86 974.86
## + Insulin                   1   980.81 984.81
## + SkinThickness             1   989.19 993.19
## + BloodPressure             1   990.13 994.13
## <none>                          993.48 995.48
## 
## Step:  AIC=812.72
## Outcome ~ Glucose
## 
##                            Df Deviance    AIC
## + BMI                       1   771.40 777.40
## + Pregnancies               1   784.95 790.95
## + DiabetesPedigreeFunction  1   796.99 802.99
## + Age                       1   797.36 803.36
## <none>                          808.72 812.72
## + SkinThickness             1   807.07 813.07
## + Insulin                   1   807.77 813.77
## + BloodPressure             1   808.59 814.59
## 
## Step:  AIC=777.4
## Outcome ~ Glucose + BMI
## 
##                            Df Deviance    AIC
## + Pregnancies               1   744.12 752.12
## + Age                       1   755.68 763.68
## + DiabetesPedigreeFunction  1   762.87 770.87
## + Insulin                   1   767.79 775.79
## + BloodPressure             1   769.07 777.07
## <none>                          771.40 777.40
## + SkinThickness             1   770.20 778.20
## 
## Step:  AIC=752.12
## Outcome ~ Glucose + BMI + Pregnancies
## 
##                            Df Deviance    AIC
## + DiabetesPedigreeFunction  1   734.31 744.31
## + BloodPressure             1   738.43 748.43
## + Age                       1   742.10 752.10
## <none>                          744.12 752.12
## + Insulin                   1   742.43 752.43
## + SkinThickness             1   743.60 753.60
## 
## Step:  AIC=744.31
## Outcome ~ Glucose + BMI + Pregnancies + DiabetesPedigreeFunction
## 
##                 Df Deviance    AIC
## + BloodPressure  1   728.56 740.56
## + Insulin        1   731.51 743.51
## <none>               734.31 744.31
## + Age            1   732.51 744.51
## + SkinThickness  1   733.06 745.06
## 
## Step:  AIC=740.56
## Outcome ~ Glucose + BMI + Pregnancies + DiabetesPedigreeFunction + 
##     BloodPressure
## 
##                 Df Deviance    AIC
## + Age            1   725.46 739.46
## + Insulin        1   725.97 739.97
## <none>               728.56 740.56
## + SkinThickness  1   728.00 742.00
## 
## Step:  AIC=739.46
## Outcome ~ Glucose + BMI + Pregnancies + DiabetesPedigreeFunction + 
##     BloodPressure + Age
## 
##                 Df Deviance    AIC
## + Insulin        1   723.45 739.45
## <none>               725.46 739.46
## + SkinThickness  1   725.19 741.19
## 
## Step:  AIC=739.45
## Outcome ~ Glucose + BMI + Pregnancies + DiabetesPedigreeFunction + 
##     BloodPressure + Age + Insulin
## 
##                 Df Deviance    AIC
## <none>               723.45 739.45
## + SkinThickness  1   723.45 741.45
###output will be the best model
formula(model_selection_results)
## Outcome ~ Glucose + BMI + Pregnancies + DiabetesPedigreeFunction + 
##     BloodPressure + Age + Insulin
##setting it as glm
model_final = glm(formula(model_selection_results), d, family='binomial')
#########################################
# Model assessment


probs = predict(model_final, type = 'response')
preds = ifelse(probs>=0.5, "1", "0")
##Accuracy
c('Accuracy' = mean(d$Outcome == preds)) # % of correct prediction
##  Accuracy 
## 0.7838542
c('Err' = mean(d$Outcome != preds)) # % of incorrect predictions (!-not)
##       Err 
## 0.2161458
#Confusion matrix
##Output gives you tp,tn, fp,fn
table('Label' = d$Outcome, 'Prediction' = preds)
##      Prediction
## Label   0   1
##     0 445  55
##     1 111 157
# categorise according to type
##manually calc tp,tn,fp,fn
tp = sum(preds[d$Outcome == "1"] == "1") # Preds == Y and Outcome == Y
tn = sum(preds[d$Outcome == "0"] == "0") # Preds == N and Outcome == N
fp = sum(preds[d$Outcome == "0"] == "1") # Preds == Y and Outcome == N
fn = sum(preds[d$Outcome == "1"] == "0") # Preds == N and Outcome == Y
accuracy = (tp+tn)/(tp+tn+fp+fn)
accuracy
## [1] 0.7838542
#precision
##precision of you calculating 1 for 1
precision = tp/(tp+fp)
precision
## [1] 0.740566
#recall (or sensitivity, true positive rate = 1-FNR = power)
##how well you found 1
recall = tp/(tp+fn)
recall
## [1] 0.5858209
#specificity (or true negative rate = 1-FPR)
##how well you find 0
specificity = tn/(tn+fp)
specificity
## [1] 0.89