Can you build a supervised learning model to accurately predict whether or not the patients in the dataset have diabetes?
chooseCRANmirror(graphics=FALSE, ind=1)
knitr::opts_chunk$set(echo = TRUE)
df = read.csv("/Users/zhengdongnanzi/Desktop/Columbia Spring 2018/GR 5058/HW4/diabetes.csv")
library(leaps)
library(class)
library(caret)
## Loading required package: lattice
## Loading required package: ggplot2
library(psych)
##
## Attaching package: 'psych'
## The following objects are masked from 'package:ggplot2':
##
## %+%, alpha
# (1)
# The variable "Outcome" in the dataset
# signifies whether or not an individual had diabetes.
head(df)
## Pregnancies Glucose BloodPressure SkinThickness Insulin BMI
## 1 6 148 72 35 0 33.6
## 2 1 85 66 29 0 26.6
## 3 8 183 64 0 0 23.3
## 4 1 89 66 23 94 28.1
## 5 0 137 40 35 168 43.1
## 6 5 116 74 0 0 25.6
## DiabetesPedigreeFunction Age Outcome
## 1 0.627 50 1
## 2 0.351 31 0
## 3 0.672 32 1
## 4 0.167 21 0
## 5 2.288 33 1
## 6 0.201 30 0
# (2)
# Except the SkinThickness variable,
# all the other variables I think may be explanatory variables for diabetes.
df1<- df[,-4]
df2<-df1[,-8]
head(df2)
## Pregnancies Glucose BloodPressure Insulin BMI DiabetesPedigreeFunction
## 1 6 148 72 0 33.6 0.627
## 2 1 85 66 0 26.6 0.351
## 3 8 183 64 0 23.3 0.672
## 4 1 89 66 94 28.1 0.167
## 5 0 137 40 168 43.1 2.288
## 6 5 116 74 0 25.6 0.201
## Age
## 1 50
## 2 31
## 3 32
## 4 21
## 5 33
## 6 30
# Descriptive statistics
summary(df2)
## Pregnancies Glucose BloodPressure Insulin
## Min. : 0.000 Min. : 0.0 Min. : 0.00 Min. : 0.0
## 1st Qu.: 1.000 1st Qu.: 99.0 1st Qu.: 62.00 1st Qu.: 0.0
## Median : 3.000 Median :117.0 Median : 72.00 Median : 30.5
## Mean : 3.845 Mean :120.9 Mean : 69.11 Mean : 79.8
## 3rd Qu.: 6.000 3rd Qu.:140.2 3rd Qu.: 80.00 3rd Qu.:127.2
## Max. :17.000 Max. :199.0 Max. :122.00 Max. :846.0
## BMI DiabetesPedigreeFunction Age
## Min. : 0.00 Min. :0.0780 Min. :21.00
## 1st Qu.:27.30 1st Qu.:0.2437 1st Qu.:24.00
## Median :32.00 Median :0.3725 Median :29.00
## Mean :31.99 Mean :0.4719 Mean :33.24
## 3rd Qu.:36.60 3rd Qu.:0.6262 3rd Qu.:41.00
## Max. :67.10 Max. :2.4200 Max. :81.00
# (1)
df1 = as.data.frame(df1)
df1$Outcome<- factor(df1$Outcome)
num.vars <- sapply(df1, is.numeric) #identify numeric variables
df1[num.vars] <- lapply(df1[num.vars], scale) #scale numeric variables
set.seed(400)
# put around 20% of data into test data
test.index <- 1:150
train <- df1[-test.index, ]
test <- df1[test.index, ]
ctrl <- trainControl(method="cv",number=5)# k value selected automatically by model
knnFit <- train(
Outcome ~Pregnancies+Glucose+BloodPressure+Insulin+BMI+DiabetesPedigreeFunction+Age,
data = train, method = "knn", trControl = ctrl, tuneLength = 20)
#Output of kNN fit
knnFit
## k-Nearest Neighbors
##
## 618 samples
## 7 predictor
## 2 classes: '0', '1'
##
## No pre-processing
## Resampling: Cross-Validated (5 fold)
## Summary of sample sizes: 494, 495, 494, 494, 495
## Resampling results across tuning parameters:
##
## k Accuracy Kappa
## 5 0.7410176 0.4036342
## 7 0.7571466 0.4303450
## 9 0.7506688 0.4209520
## 11 0.7571728 0.4367641
## 13 0.7604380 0.4342451
## 15 0.7523735 0.4124368
## 17 0.7571597 0.4226755
## 19 0.7653029 0.4433934
## 21 0.7621164 0.4310535
## 23 0.7621033 0.4306570
## 25 0.7604511 0.4274582
## 27 0.7475085 0.3908128
## 29 0.7507212 0.4005238
## 31 0.7523472 0.4006592
## 33 0.7555862 0.4066476
## 35 0.7507606 0.3957661
## 37 0.7555993 0.4081150
## 39 0.7555730 0.4068308
## 41 0.7523472 0.3975716
## 43 0.7539864 0.4013251
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was k = 19.
#KnnFit model selects k = 19 value with highest accuracy
plot(knnFit)
# (2) by K fold cross validation, accuracy is around 0.7653
#Predict test data using training data
newdata = test[,-8] #test data object without target variable
knnPredict <- predict(knnFit,newdata)
#Get the confusion matrix to see accuracy value and other parameter values
confusionMatrix(knnPredict, test$Outcome )
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 86 27
## 1 11 26
##
## Accuracy : 0.7467
## 95% CI : (0.6693, 0.8141)
## No Information Rate : 0.6467
## P-Value [Acc > NIR] : 0.005693
##
## Kappa : 0.4049
## Mcnemar's Test P-Value : 0.014961
##
## Sensitivity : 0.8866
## Specificity : 0.4906
## Pos Pred Value : 0.7611
## Neg Pred Value : 0.7027
## Prevalence : 0.6467
## Detection Rate : 0.5733
## Detection Prevalence : 0.7533
## Balanced Accuracy : 0.6886
##
## 'Positive' Class : 0
##
# (2) by confusion matrix, accuracy is around 0.747
# (1) For logistic regression
# we can use AIC to do best subeset selection to reduce the unimportant variables
# Best model will minimize AIC
library(bestglm)
bs <- bestglm(Xy = df,
family = binomial,
IC = "AIC")
## Morgan-Tatar search since family is non-gaussian.
summary(bs$BestModel)
##
## Call:
## glm(formula = y ~ ., family = family, data = Xi, weights = weights)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.5617 -0.7286 -0.4156 0.7271 2.9297
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -8.4051362 0.7167033 -11.727 < 2e-16 ***
## Pregnancies 0.1231724 0.0320688 3.841 0.000123 ***
## Glucose 0.0351123 0.0036625 9.587 < 2e-16 ***
## BloodPressure -0.0132136 0.0051537 -2.564 0.010350 *
## Insulin -0.0011570 0.0008142 -1.421 0.155275
## BMI 0.0900886 0.0144619 6.229 4.68e-10 ***
## DiabetesPedigreeFunction 0.9475954 0.2980063 3.180 0.001474 **
## Age 0.0147888 0.0092897 1.592 0.111393
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 993.48 on 767 degrees of freedom
## Residual deviance: 723.45 on 760 degrees of freedom
## AIC: 739.45
##
## Number of Fisher Scoring iterations: 5
# here we chose
# Pregnancies, Glucose, BloodPressure, Insulin, BMI, DiabetesPedigreeFunction and Age
# these 7 variables for my final model
# (2)
# 1) By K fold cross validation
set.seed(400)
df$Outcome<- factor(df$Outcome)
test.index <- 1:150
train <- df[-test.index, ]
test <- df[test.index, ]
data_ctrl <- trainControl(method = "cv", number = 5)
model_caret <- train(Outcome ~Pregnancies+Glucose
+BMI+DiabetesPedigreeFunction
+BloodPressure+Age+Insulin,
data = train,
method="glm",
family="binomial",
trControl = data_ctrl,
na.action = na.pass)
# Examine average prediction error of
# each fold's model built from training data applied to test data
model_caret
## Generalized Linear Model
##
## 618 samples
## 7 predictor
## 2 classes: '0', '1'
##
## No pre-processing
## Resampling: Cross-Validated (5 fold)
## Summary of sample sizes: 494, 495, 494, 494, 495
## Resampling results:
##
## Accuracy Kappa
## 0.771807 0.4718348
summary(model_caret$resample$Accuracy)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.7561 0.7642 0.7742 0.7718 0.7823 0.7823
# By K fold cross validation, the accuracy is 0.7718
# 2) by a confusion matrix, the accuracy is 0.7733
#Predict test data using training data
newdata = test[,-9]
logitPredict <- predict(model_caret,newdata)
#Get the confusion matrix to see accuracy value and other parameter values
confusionMatrix(logitPredict, test$Outcome )
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 86 23
## 1 11 30
##
## Accuracy : 0.7733
## 95% CI : (0.6979, 0.8376)
## No Information Rate : 0.6467
## P-Value [Acc > NIR] : 0.0005563
##
## Kappa : 0.4771
## Mcnemar's Test P-Value : 0.0592297
##
## Sensitivity : 0.8866
## Specificity : 0.5660
## Pos Pred Value : 0.7890
## Neg Pred Value : 0.7317
## Prevalence : 0.6467
## Detection Rate : 0.5733
## Detection Prevalence : 0.7267
## Balanced Accuracy : 0.7263
##
## 'Positive' Class : 0
##
# by a confusion matrix, the accuracy is 0.7733
Can you build a supervised learning model to accurately predict voter turnout at the state level in the 2008 election (variable name: “vep08_turnout”)?
library(haven)
States <- read_sav("~/Desktop/Columbia Spring 2018/GR 5058/3.22Lec8/States.sav")
# (1)
head(sapply(States, class))
## abort_rate abortlaw abortlaw3 attend_pct battle04 blkleg
## "numeric" "numeric" "labelled" "numeric" "labelled" "numeric"
# clean the data: only select variables which are
# numeric, not ID, independent, somehow related to the turnout
States1 <- States[, names(States) %in% c("vep04_turnout","urban",
"union07","unemploy",
"prcapinc","blkpct08",
"density",
"hispanic08","earmarks_pcap","college")]
States1 <- cbind(States1, vep08_turnout = States$vep08_turnout)
# For linear regression model,
# Since there are many variables in the dataset
# We can use forward stepwise selection to select important variables
regfit.fwd=regsubsets(vep08_turnout~.,data=States1,nvmax=8, method="forward")
regsummary<-summary(regfit.fwd)
# Model with maximum value of adj Rsq
# Optimal numbers of variables
which.max(regsummary$adjr2)
## [1] 3
plot(regsummary$adjr2,xlab="Number of Variables",ylab="Adjusted RSq",type="l")
plot(regsummary$rss,xlab="Number of Variables",ylab="RSS",type="l")
regsummary
## Subset selection object
## Call: regsubsets.formula(vep08_turnout ~ ., data = States1, nvmax = 8,
## method = "forward")
## 10 Variables (and intercept)
## Forced in Forced out
## blkpct08 FALSE FALSE
## college FALSE FALSE
## density FALSE FALSE
## earmarks_pcap FALSE FALSE
## hispanic08 FALSE FALSE
## prcapinc FALSE FALSE
## unemploy FALSE FALSE
## union07 FALSE FALSE
## urban FALSE FALSE
## vep04_turnout FALSE FALSE
## 1 subsets of each size up to 8
## Selection Algorithm: forward
## blkpct08 college density earmarks_pcap hispanic08 prcapinc
## 1 ( 1 ) " " " " " " " " " " " "
## 2 ( 1 ) "*" " " " " " " " " " "
## 3 ( 1 ) "*" "*" " " " " " " " "
## 4 ( 1 ) "*" "*" " " " " " " " "
## 5 ( 1 ) "*" "*" " " " " "*" " "
## 6 ( 1 ) "*" "*" " " " " "*" " "
## 7 ( 1 ) "*" "*" " " " " "*" "*"
## 8 ( 1 ) "*" "*" " " " " "*" "*"
## unemploy union07 urban vep04_turnout
## 1 ( 1 ) " " " " " " "*"
## 2 ( 1 ) " " " " " " "*"
## 3 ( 1 ) " " " " " " "*"
## 4 ( 1 ) "*" " " " " "*"
## 5 ( 1 ) "*" " " " " "*"
## 6 ( 1 ) "*" " " "*" "*"
## 7 ( 1 ) "*" " " "*" "*"
## 8 ( 1 ) "*" "*" "*" "*"
# We know that the optimal subest variables are 3. They are:
# 1: vep04_turnout: percent turnout of voting eligible population 2004
# 2: college: percent pop with college education or higher
# 3: blkpct08: black pop percent in 2008
States1 <- as.data.frame(States1)
# Cross validation
data_ctrl <- trainControl(method = "cv", number = 3)
model_caret <- train(vep08_turnout~ vep04_turnout + college + blkpct08,
data = States1,
trControl = data_ctrl,
method = "lm",
na.action = na.pass)
model_caret
## Linear Regression
##
## 50 samples
## 3 predictor
##
## No pre-processing
## Resampling: Cross-Validated (3 fold)
## Summary of sample sizes: 34, 33, 33
## Resampling results:
##
## RMSE Rsquared MAE
## 2.032164 0.8732699 1.568387
##
## Tuning parameter 'intercept' was held constant at a value of TRUE
#How much variation was there across all predictions?
sd(model_caret$resample$Rsquared)
## [1] 0.05031458
summary(model_caret$resample$Rsquared)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.8233 0.8480 0.8727 0.8733 0.8983 0.9239
# The accurcey is 0.87.