setwd('/home/daria/Courses/R/Edx/Logistic Regression/')
# Unit 3, Modeling the Expert
# Video 4
# Read in dataset
quality = read.csv("quality.csv")
# Look at structure
str(quality)
## 'data.frame': 131 obs. of 14 variables:
## $ MemberID : int 1 2 3 4 5 6 7 8 9 10 ...
## $ InpatientDays : int 0 1 0 0 8 2 16 2 2 4 ...
## $ ERVisits : int 0 1 0 1 2 0 1 0 1 2 ...
## $ OfficeVisits : int 18 6 5 19 19 9 8 8 4 0 ...
## $ Narcotics : int 1 1 3 0 3 2 1 0 3 2 ...
## $ DaysSinceLastERVisit: num 731 411 731 158 449 ...
## $ Pain : int 10 0 10 34 10 6 4 5 5 2 ...
## $ TotalVisits : int 18 8 5 20 29 11 25 10 7 6 ...
## $ ProviderCount : int 21 27 16 14 24 40 19 11 28 21 ...
## $ MedicalClaims : int 93 19 27 59 51 53 40 28 20 17 ...
## $ ClaimLines : int 222 115 148 242 204 156 261 87 98 66 ...
## $ StartedOnCombination: logi FALSE FALSE FALSE FALSE FALSE FALSE ...
## $ AcuteDrugGapSmall : int 0 1 5 0 0 4 0 0 0 0 ...
## $ PoorCare : int 0 0 0 0 0 1 0 0 1 0 ...
# Table outcome
table(quality$PoorCare)
##
## 0 1
## 98 33
# Baseline accuracy
98/131
## [1] 0.7480916
# Install and load caTools package
#install.packages("caTools")
library(caTools)
# Randomly split data
set.seed(88)
split = sample.split(quality$PoorCare, SplitRatio = 0.75)
split
## [1] TRUE TRUE TRUE TRUE FALSE TRUE FALSE TRUE FALSE FALSE TRUE
## [12] FALSE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [23] TRUE TRUE FALSE TRUE TRUE TRUE TRUE FALSE FALSE FALSE FALSE
## [34] TRUE TRUE TRUE FALSE TRUE TRUE TRUE FALSE FALSE TRUE TRUE
## [45] FALSE TRUE FALSE TRUE FALSE TRUE TRUE FALSE FALSE TRUE TRUE
## [56] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE FALSE
## [67] TRUE TRUE TRUE TRUE FALSE TRUE TRUE TRUE TRUE TRUE TRUE
## [78] TRUE TRUE TRUE TRUE TRUE TRUE FALSE TRUE TRUE TRUE TRUE
## [89] TRUE TRUE TRUE TRUE TRUE FALSE TRUE TRUE TRUE TRUE TRUE
## [100] TRUE TRUE TRUE TRUE FALSE TRUE TRUE TRUE FALSE TRUE FALSE
## [111] FALSE TRUE TRUE FALSE TRUE TRUE TRUE FALSE TRUE TRUE FALSE
## [122] TRUE TRUE FALSE TRUE TRUE FALSE TRUE TRUE TRUE FALSE
# Create training and testing sets
qualityTrain = subset(quality, split == TRUE)
qualityTest = subset(quality, split == FALSE)
# Logistic Regression Model
QualityLog = glm(PoorCare ~ OfficeVisits + Narcotics, data=qualityTrain, family=binomial)
summary(QualityLog)
##
## Call:
## glm(formula = PoorCare ~ OfficeVisits + Narcotics, family = binomial,
## data = qualityTrain)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.06303 -0.63155 -0.50503 -0.09689 2.16686
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.64613 0.52357 -5.054 4.33e-07 ***
## OfficeVisits 0.08212 0.03055 2.688 0.00718 **
## Narcotics 0.07630 0.03205 2.381 0.01728 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 111.888 on 98 degrees of freedom
## Residual deviance: 89.127 on 96 degrees of freedom
## AIC: 95.127
##
## Number of Fisher Scoring iterations: 4
# Make predictions on training set
predictTrain = predict(QualityLog, type="response")
# Analyze predictions
summary(predictTrain)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.06623 0.11910 0.15970 0.25250 0.26760 0.98460
tapply(predictTrain, qualityTrain$PoorCare, mean)
## 0 1
## 0.1894512 0.4392246
# Logistic Regression Model 2
QualityLog2 = glm(PoorCare ~ StartedOnCombination + ProviderCount, data=qualityTrain, family=binomial)
summary(QualityLog2)
##
## Call:
## glm(formula = PoorCare ~ StartedOnCombination + ProviderCount,
## family = binomial, data = qualityTrain)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.61826 -0.72782 -0.64555 -0.08407 1.94662
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.00097 0.55097 -3.632 0.000282 ***
## StartedOnCombinationTRUE 1.95230 1.22342 1.596 0.110541
## ProviderCount 0.03366 0.01983 1.697 0.089706 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 111.89 on 98 degrees of freedom
## Residual deviance: 104.37 on 96 degrees of freedom
## AIC: 110.37
##
## Number of Fisher Scoring iterations: 4
# Make predictions on training set
predictTrain = predict(QualityLog2, type="response")
# Analyze predictions
summary(predictTrain)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.1379 0.1830 0.2151 0.2525 0.2842 0.9315
# Video 5
# Confusion matrix for threshold of 0.5
table(qualityTrain$PoorCare, predictTrain > 0.5)
##
## FALSE TRUE
## 0 73 1
## 1 22 3
# Sensitivity and specificity
10/25
## [1] 0.4
70/74
## [1] 0.9459459
# Confusion matrix for threshold of 0.7
table(qualityTrain$PoorCare, predictTrain > 0.7)
##
## FALSE TRUE
## 0 73 1
## 1 23 2
# Sensitivity and specificity
8/25
## [1] 0.32
73/74
## [1] 0.9864865
# Confusion matrix for threshold of 0.2
table(qualityTrain$PoorCare, predictTrain > 0.2)
##
## FALSE TRUE
## 0 31 43
## 1 9 16
# Sensitivity and specificity
16/25
## [1] 0.64
54/74
## [1] 0.7297297
# Video 6
# Install and load ROCR package
#install.packages("ROCR")
library(ROCR)
## Loading required package: gplots
##
## Attaching package: 'gplots'
##
## The following object is masked from 'package:stats':
##
## lowess
# Prediction function
ROCRpred = prediction(predictTrain, qualityTrain$PoorCare)
# Performance function
ROCRperf = performance(ROCRpred, "tpr", "fpr")
# Plot ROC curve
plot(ROCRperf)

# Add colors
plot(ROCRperf, colorize=TRUE)
# Add threshold labels
plot(ROCRperf, colorize=TRUE, print.cutoffs.at=seq(0,1,by=0.1), text.adj=c(-0.2,1.7))
