#Multiple logistic regression # Predictors / Response # # Pregnancies - Number of times pregnant. # Glucose - Plasma glucose concentration after a 2-hour oral glucose tolerance test. # BloodPressure – Diastolic blood pressure (mm Hg). # SkinThickness – Triceps skinfold thickness (mm). # Insulin – 2-Hour serum insulin (mu U/ml). # BMI – Body Mass Index (weight in kg / (height in m)^2). # DiabetesPedigreeFunction - A function that scores likelihood of diabetes based on family history. # Age - Age of the patient (years). # Outcome (Response Variable) - Class label indicating diabetes status.
#importing data
diabetes <- read.csv("~/chapter 6 stats 244/diabetes.csv")
d = diabetes
# fit logistic regression model
##glm - generalised linear model vs lm- linear model
####g(E[Y|X])=g(p(x))=log[p(x)/1-P(x)]
model_fit = glm(Outcome~., data=d, family = 'binomial')
summary(model_fit)
##
## Call:
## glm(formula = Outcome ~ ., family = "binomial", data = d)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -8.4046964 0.7166359 -11.728 < 2e-16 ***
## Pregnancies 0.1231823 0.0320776 3.840 0.000123 ***
## Glucose 0.0351637 0.0037087 9.481 < 2e-16 ***
## BloodPressure -0.0132955 0.0052336 -2.540 0.011072 *
## SkinThickness 0.0006190 0.0068994 0.090 0.928515
## Insulin -0.0011917 0.0009012 -1.322 0.186065
## BMI 0.0897010 0.0150876 5.945 2.76e-09 ***
## DiabetesPedigreeFunction 0.9451797 0.2991475 3.160 0.001580 **
## Age 0.0148690 0.0093348 1.593 0.111192
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 993.48 on 767 degrees of freedom
## Residual deviance: 723.45 on 759 degrees of freedom
## AIC: 741.45
##
## Number of Fisher Scoring iterations: 5
model_fit = glm(Outcome~., data=d, family = binomial())
summary(model_fit)
##
## Call:
## glm(formula = Outcome ~ ., family = binomial(), data = d)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -8.4046964 0.7166359 -11.728 < 2e-16 ***
## Pregnancies 0.1231823 0.0320776 3.840 0.000123 ***
## Glucose 0.0351637 0.0037087 9.481 < 2e-16 ***
## BloodPressure -0.0132955 0.0052336 -2.540 0.011072 *
## SkinThickness 0.0006190 0.0068994 0.090 0.928515
## Insulin -0.0011917 0.0009012 -1.322 0.186065
## BMI 0.0897010 0.0150876 5.945 2.76e-09 ***
## DiabetesPedigreeFunction 0.9451797 0.2991475 3.160 0.001580 **
## Age 0.0148690 0.0093348 1.593 0.111192
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 993.48 on 767 degrees of freedom
## Residual deviance: 723.45 on 759 degrees of freedom
## AIC: 741.45
##
## Number of Fisher Scoring iterations: 5
# Coefficients and Interpretations
#log[p(x)/1-p(x)] =n(x) =B0+B1X1+...+BpXp - (-infinity,+infini)
coef(model_fit) #log-odds
## (Intercept) Pregnancies Glucose
## -8.4046963669 0.1231822984 0.0351637146
## BloodPressure SkinThickness Insulin
## -0.0132955469 0.0006189644 -0.0011916990
## BMI DiabetesPedigreeFunction Age
## 0.0897009700 0.9451797406 0.0148690047
#p(x)/1-p(x) - (0,inifity)
exp(coef(model_fit)) #odds
## (Intercept) Pregnancies Glucose
## 0.0002238137 1.1310905981 1.0357892688
## BloodPressure SkinThickness Insulin
## 0.9867924485 1.0006191560 0.9988090108
## BMI DiabetesPedigreeFunction Age
## 1.0938471417 2.5732758592 1.0149800983
# Pregnancies -
# For every additional pregnancy, the odds of having diabetes increase by 13.1%.
#
# DiabetesPedigreeFunction -
# For each unit increase in the pedigree score, the odds of diabetes are 157% higher (about 2.6 times greater).
#
# Blood pressure -
# Each mm Hg increase in diastolic blood pressure is associated with a 1.3% decrease in the odds of diabetes
# increase with k=5 units for X1
5*coef(model_fit)[2] #log-odds
## Pregnancies
## 0.6159115
exp(5*coef(model_fit)[2]) #odds
## Pregnancies
## 1.851343
##################################
#predictions
df_pred = data.frame("Pregnancies"=3, "Glucose"=147, "BloodPressure"=74,
"SkinThickness"=32, "Insulin"=0, "BMI"=29.6,
"DiabetesPedigreeFunction"=0.631, "Age"=49)
#probability for this prediction
##Gives Probability of the person having diabetes
prob = predict(model_fit, newdata = df_pred, type = 'response')
prob
## 1
## 0.5374277
# Get the prediction (use prob to classify to 1 or 0)
##Gives Prediction between 1 or 0
pred = as.factor(ifelse(prob>=0.5, "1", "0"))
c('Prediction' = as.character(pred))
## Prediction
## "1"
#########################################
# Variable selection (model calibration)
##Which varibale best predicts the model
full_model = glm(Outcome~., data = d, family='binomial')
##model without variables yet-base model
null_model = glm(Outcome~1, data = d, family='binomial') #also called the base model
##step()-stepwise selection
###output is process of finding best model
model_selection_results = step(null_model,
scope=list(lower=formula(null_model),
upper=formula(full_model)),
direction='forward',
k = 2) # AIC Make k = log(n) for BIC(more harsh)
## Start: AIC=995.48
## Outcome ~ 1
##
## Df Deviance AIC
## + Glucose 1 808.72 812.72
## + BMI 1 920.71 924.71
## + Age 1 950.72 954.72
## + Pregnancies 1 956.21 960.21
## + DiabetesPedigreeFunction 1 970.86 974.86
## + Insulin 1 980.81 984.81
## + SkinThickness 1 989.19 993.19
## + BloodPressure 1 990.13 994.13
## <none> 993.48 995.48
##
## Step: AIC=812.72
## Outcome ~ Glucose
##
## Df Deviance AIC
## + BMI 1 771.40 777.40
## + Pregnancies 1 784.95 790.95
## + DiabetesPedigreeFunction 1 796.99 802.99
## + Age 1 797.36 803.36
## <none> 808.72 812.72
## + SkinThickness 1 807.07 813.07
## + Insulin 1 807.77 813.77
## + BloodPressure 1 808.59 814.59
##
## Step: AIC=777.4
## Outcome ~ Glucose + BMI
##
## Df Deviance AIC
## + Pregnancies 1 744.12 752.12
## + Age 1 755.68 763.68
## + DiabetesPedigreeFunction 1 762.87 770.87
## + Insulin 1 767.79 775.79
## + BloodPressure 1 769.07 777.07
## <none> 771.40 777.40
## + SkinThickness 1 770.20 778.20
##
## Step: AIC=752.12
## Outcome ~ Glucose + BMI + Pregnancies
##
## Df Deviance AIC
## + DiabetesPedigreeFunction 1 734.31 744.31
## + BloodPressure 1 738.43 748.43
## + Age 1 742.10 752.10
## <none> 744.12 752.12
## + Insulin 1 742.43 752.43
## + SkinThickness 1 743.60 753.60
##
## Step: AIC=744.31
## Outcome ~ Glucose + BMI + Pregnancies + DiabetesPedigreeFunction
##
## Df Deviance AIC
## + BloodPressure 1 728.56 740.56
## + Insulin 1 731.51 743.51
## <none> 734.31 744.31
## + Age 1 732.51 744.51
## + SkinThickness 1 733.06 745.06
##
## Step: AIC=740.56
## Outcome ~ Glucose + BMI + Pregnancies + DiabetesPedigreeFunction +
## BloodPressure
##
## Df Deviance AIC
## + Age 1 725.46 739.46
## + Insulin 1 725.97 739.97
## <none> 728.56 740.56
## + SkinThickness 1 728.00 742.00
##
## Step: AIC=739.46
## Outcome ~ Glucose + BMI + Pregnancies + DiabetesPedigreeFunction +
## BloodPressure + Age
##
## Df Deviance AIC
## + Insulin 1 723.45 739.45
## <none> 725.46 739.46
## + SkinThickness 1 725.19 741.19
##
## Step: AIC=739.45
## Outcome ~ Glucose + BMI + Pregnancies + DiabetesPedigreeFunction +
## BloodPressure + Age + Insulin
##
## Df Deviance AIC
## <none> 723.45 739.45
## + SkinThickness 1 723.45 741.45
###output will be the best model
formula(model_selection_results)
## Outcome ~ Glucose + BMI + Pregnancies + DiabetesPedigreeFunction +
## BloodPressure + Age + Insulin
##setting it as glm
model_final = glm(formula(model_selection_results), d, family='binomial')
#########################################
# Model assessment
probs = predict(model_final, type = 'response')
preds = ifelse(probs>=0.5, "1", "0")
##Accuracy
c('Accuracy' = mean(d$Outcome == preds)) # % of correct prediction
## Accuracy
## 0.7838542
c('Err' = mean(d$Outcome != preds)) # % of incorrect predictions (!-not)
## Err
## 0.2161458
#Confusion matrix
##Output gives you tp,tn, fp,fn
table('Label' = d$Outcome, 'Prediction' = preds)
## Prediction
## Label 0 1
## 0 445 55
## 1 111 157
# categorise according to type
##manually calc tp,tn,fp,fn
tp = sum(preds[d$Outcome == "1"] == "1") # Preds == Y and Outcome == Y
tn = sum(preds[d$Outcome == "0"] == "0") # Preds == N and Outcome == N
fp = sum(preds[d$Outcome == "0"] == "1") # Preds == Y and Outcome == N
fn = sum(preds[d$Outcome == "1"] == "0") # Preds == N and Outcome == Y
accuracy = (tp+tn)/(tp+tn+fp+fn)
accuracy
## [1] 0.7838542
#precision
##precision of you calculating 1 for 1
precision = tp/(tp+fp)
precision
## [1] 0.740566
#recall (or sensitivity, true positive rate = 1-FNR = power)
##how well you found 1
recall = tp/(tp+fn)
recall
## [1] 0.5858209
#specificity (or true negative rate = 1-FPR)
##how well you find 0
specificity = tn/(tn+fp)
specificity
## [1] 0.89