diabetes<-read.csv("diabetes.csv")
head(diabetes, 10)
## Pregnancies Glucose BloodPressure SkinThickness Insulin BMI
## 1 6 148 72 35 0 33.6
## 2 1 85 66 29 0 26.6
## 3 8 183 64 0 0 23.3
## 4 1 89 66 23 94 28.1
## 5 0 137 40 35 168 43.1
## 6 5 116 74 0 0 25.6
## 7 3 78 50 32 88 31.0
## 8 10 115 0 0 0 35.3
## 9 2 197 70 45 543 30.5
## 10 8 125 96 0 0 0.0
## DiabetesPedigreeFunction Age Outcome
## 1 0.627 50 1
## 2 0.351 31 0
## 3 0.672 32 1
## 4 0.167 21 0
## 5 2.288 33 1
## 6 0.201 30 0
## 7 0.248 26 1
## 8 0.134 29 0
## 9 0.158 53 1
## 10 0.232 54 1
summary(diabetes$Pregnancies)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000 1.000 3.000 3.845 6.000 17.000
summary(diabetes$Glucose)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0 99.0 117.0 120.9 140.2 199.0
summary(diabetes$BloodPressure)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00 62.00 72.00 69.11 80.00 122.00
summary(diabetes$SkinThickness)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00 0.00 23.00 20.54 32.00 99.00
summary(diabetes$Insulin)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0 0.0 30.5 79.8 127.2 846.0
summary(diabetes$BMI)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00 27.30 32.00 31.99 36.60 67.10
summary(diabetes$DiabetesPedigreeFunction)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0780 0.2437 0.3725 0.4719 0.6262 2.4200
summary(diabetes$Age)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 21.00 24.00 29.00 33.24 41.00 81.00
The variable that signifies whether or not an individual had diabetes is the categorical variable Outcome. All remaining 8 variables could be explanatory variables for diabetes.
# K nearest neighbors model
diabetes$Outcome<-factor(diabetes$Outcome)
num.vars<-sapply(diabetes, is.numeric)
diabetes[num.vars] <- lapply(diabetes[num.vars], scale)
set.seed(400)
test.index<-1:30
train<-diabetes[-test.index, ]
test<-diabetes[test.index, ]
ctrl<-trainControl(method="repeatedcv",number=5, repeats=10)
knnFit<-train(Outcome~Pregnancies+Glucose+BloodPressure+SkinThickness+Insulin+
BMI+DiabetesPedigreeFunction+Age,
data=train, method="knn",
trControl=ctrl, tuneLength=20)
knnFit
## k-Nearest Neighbors
##
## 738 samples
## 8 predictor
## 2 classes: '0', '1'
##
## No pre-processing
## Resampling: Cross-Validated (5 fold, repeated 10 times)
## Summary of sample sizes: 590, 591, 590, 590, 591, 590, ...
## Resampling results across tuning parameters:
##
## k Accuracy Kappa
## 5 0.7370013 0.3915874
## 7 0.7391625 0.3912318
## 9 0.7338647 0.3764734
## 11 0.7387479 0.3850828
## 13 0.7429518 0.3907383
## 15 0.7521658 0.4083931
## 17 0.7579932 0.4187784
## 19 0.7570518 0.4126364
## 21 0.7529822 0.3998359
## 23 0.7502776 0.3913184
## 25 0.7510820 0.3909229
## 27 0.7529812 0.3927745
## 29 0.7528507 0.3903700
## 31 0.7565049 0.3983867
## 33 0.7547435 0.3918182
## 35 0.7555580 0.3919990
## 37 0.7552969 0.3883912
## 39 0.7562484 0.3898901
## 41 0.7574646 0.3924810
## 43 0.7594907 0.3968341
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was k = 43.
plot(knnFit)
The final model has k=43 because it gives the highest accuracy of 0.7594907.
# 1) Confusion matrix
newdata<-test[,-9]
knnPredict<-predict(knnFit,newdata)
confusionMatrix(knnPredict, test$Outcome)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 11 8
## 1 1 10
##
## Accuracy : 0.7
## 95% CI : (0.506, 0.8527)
## No Information Rate : 0.6
## P-Value [Acc > NIR] : 0.1763
##
## Kappa : 0.4304
## Mcnemar's Test P-Value : 0.0455
##
## Sensitivity : 0.9167
## Specificity : 0.5556
## Pos Pred Value : 0.5789
## Neg Pred Value : 0.9091
## Prevalence : 0.4000
## Detection Rate : 0.3667
## Detection Prevalence : 0.6333
## Balanced Accuracy : 0.7361
##
## 'Positive' Class : 0
##
The model has an accuracy of 0.7; and a balanced accuracy of 0.7361.
# 2) K fold cross validation
diabetes_ctrl<-trainControl(method = "cv", number = 5)
kfoldmodel<-train(Outcome~Pregnancies+Glucose+BloodPressure+SkinThickness+Insulin+
BMI+DiabetesPedigreeFunction+Age,
data = diabetes,
trControl = diabetes_ctrl,
method = "knn",
na.action = na.pass)
kfoldmodel
## k-Nearest Neighbors
##
## 768 samples
## 8 predictor
## 2 classes: '0', '1'
##
## No pre-processing
## Resampling: Cross-Validated (5 fold)
## Summary of sample sizes: 614, 615, 614, 614, 615
## Resampling results across tuning parameters:
##
## k Accuracy Kappa
## 5 0.7369748 0.3956709
## 7 0.7382990 0.3977379
## 9 0.7382310 0.3939435
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was k = 7.
summary(kfoldmodel$resample$Accuracy)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.7078 0.7320 0.7338 0.7383 0.7582 0.7597
The model has an average accuracy of 0.7448.
# Logistic regression model
diabetes3<-read.csv("diabetes.csv")
logit1 = glm(Outcome~Pregnancies+Glucose+BloodPressure+SkinThickness+Insulin+
BMI+DiabetesPedigreeFunction+Age,
data = diabetes3,
family = binomial)
summary(logit1)
##
## Call:
## glm(formula = Outcome ~ Pregnancies + Glucose + BloodPressure +
## SkinThickness + Insulin + BMI + DiabetesPedigreeFunction +
## Age, family = binomial, data = diabetes3)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.5566 -0.7274 -0.4159 0.7267 2.9297
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -8.4046964 0.7166359 -11.728 < 2e-16 ***
## Pregnancies 0.1231823 0.0320776 3.840 0.000123 ***
## Glucose 0.0351637 0.0037087 9.481 < 2e-16 ***
## BloodPressure -0.0132955 0.0052336 -2.540 0.011072 *
## SkinThickness 0.0006190 0.0068994 0.090 0.928515
## Insulin -0.0011917 0.0009012 -1.322 0.186065
## BMI 0.0897010 0.0150876 5.945 2.76e-09 ***
## DiabetesPedigreeFunction 0.9451797 0.2991475 3.160 0.001580 **
## Age 0.0148690 0.0093348 1.593 0.111192
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 993.48 on 767 degrees of freedom
## Residual deviance: 723.45 on 759 degrees of freedom
## AIC: 741.45
##
## Number of Fisher Scoring iterations: 5
logit2 = glm(Outcome~Pregnancies+Glucose+BloodPressure+BMI+DiabetesPedigreeFunction+Age,
data = diabetes3,
family = binomial)
summary(logit2)
##
## Call:
## glm(formula = Outcome ~ Pregnancies + Glucose + BloodPressure +
## BMI + DiabetesPedigreeFunction + Age, family = binomial,
## data = diabetes3)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.7380 -0.7313 -0.4123 0.7276 2.8984
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -8.239812 0.701970 -11.738 < 2e-16 ***
## Pregnancies 0.124919 0.031972 3.907 9.34e-05 ***
## Glucose 0.033492 0.003440 9.736 < 2e-16 ***
## BloodPressure -0.013487 0.005114 -2.637 0.00836 **
## BMI 0.087676 0.014268 6.145 7.99e-10 ***
## DiabetesPedigreeFunction 0.896150 0.294862 3.039 0.00237 **
## Age 0.016325 0.009237 1.767 0.07719 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 993.48 on 767 degrees of freedom
## Residual deviance: 725.46 on 761 degrees of freedom
## AIC: 739.46
##
## Number of Fisher Scoring iterations: 5
Model logit1 has an AIC of 741.45, while logit2 has an AIC of 739.46. Model logit2 has the lowest AIC and is therefore the best model to use, with explanatory variables Pregnancies, Glucose, BloodPressure, BMI, DiabetesPedigreeFunction, and Age.
set.seed(400)
diabetes3$Outcome<-factor(diabetes3$Outcome)
test.index3<-1:30
train3<-diabetes3[-test.index3, ]
test3<-diabetes3[test.index3, ]
ctrl3<-trainControl(method="repeatedcv",number=5, repeats=10)
logitFit<-train(Outcome~Pregnancies+Glucose+BloodPressure+BMI+DiabetesPedigreeFunction+Age,
data=train3, method="glm", family="binomial",
trControl=ctrl3, tuneLength=20)
# 1) Confusion matrix
newdata3<-test3[,-9]
knnPredict3<-predict(logitFit,newdata3)
confusionMatrix(knnPredict3, test3$Outcome)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 9 8
## 1 3 10
##
## Accuracy : 0.6333
## 95% CI : (0.4386, 0.8007)
## No Information Rate : 0.6
## P-Value [Acc > NIR] : 0.4311
##
## Kappa : 0.2857
## Mcnemar's Test P-Value : 0.2278
##
## Sensitivity : 0.7500
## Specificity : 0.5556
## Pos Pred Value : 0.5294
## Neg Pred Value : 0.7692
## Prevalence : 0.4000
## Detection Rate : 0.3000
## Detection Prevalence : 0.5667
## Balanced Accuracy : 0.6528
##
## 'Positive' Class : 0
##
The model has an accuracy of 0.6333, and a balanced accuracy of 0.6528.
# 2) K fold cross validation
diabetes3_ctrl<-trainControl(method="repeatedcv", number=5, repeats=10)
kfoldmodel3<-train(Outcome~Pregnancies+Glucose+BloodPressure+BMI+DiabetesPedigreeFunction+Age,
data = diabetes3,
method="glm",
family="binomial",
trControl = diabetes3_ctrl,
na.action = na.pass)
kfoldmodel3
## Generalized Linear Model
##
## 768 samples
## 6 predictor
## 2 classes: '0', '1'
##
## No pre-processing
## Resampling: Cross-Validated (5 fold, repeated 10 times)
## Summary of sample sizes: 614, 615, 615, 614, 614, 615, ...
## Resampling results:
##
## Accuracy Kappa
## 0.7673092 0.4632061
summary(kfoldmodel3$resample$Accuracy)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.6863 0.7484 0.7720 0.7673 0.7908 0.8235
The model has an average accuracy of 0.7673.
From the results of the k fold cross validation, the logistic regression model seems to have a higher accuracy (0.7673) than the k nearest neighbors model (0.7448), therefore it is able to predict diabetes better with new data.
Due to the large number of variables, we used the Forward Stepwise Selection method to find the model with the highest Adjusted R-squared.
States<-read_sav("States.sav")
# Forward Stepwise Selection method
regfit.fwd<-regsubsets(vep08_turnout~.,data=States,method="forward")
## Warning in leaps.setup(x, y, wt = wt, nbest = nbest, nvmax = nvmax,
## force.in = force.in, : 80 linear dependencies found
reg.summary<-summary(regfit.fwd)
names(reg.summary)
## [1] "which" "rsq" "rss" "adjr2" "cp" "bic" "outmat" "obj"
which.max(reg.summary$adjr2)
## [1] 4
plot(reg.summary$adjr2,xlab="Number of Variables",ylab="Adjusted RSq",type="l")
reg.summary$which[2,]
## (Intercept) abort_rate abortlaw
## TRUE FALSE FALSE
## abortlaw3 attend_pct battle04
## FALSE FALSE FALSE
## blkleg blkpct04 blkpct08
## FALSE FALSE FALSE
## bush00 bush04 carfatal
## FALSE FALSE FALSE
## cig_tax cig_tax_3 cigarettes
## FALSE FALSE FALSE
## college conpct_m cons_hr06
## FALSE FALSE FALSE
## cons_hr09 cook_index cook_index3
## FALSE FALSE FALSE
## defexpen dem_hr09 demnat06
## FALSE FALSE FALSE
## dempct_m demstate06 demstate09
## FALSE FALSE FALSE
## density division earmarks_pcap
## FALSE FALSE FALSE
## evm evo gay_policy
## FALSE FALSE FALSE
## gay_policy2 gay_policy_con gay_support
## FALSE FALSE FALSE
## gay_support3 gb_win00 gb_win04
## FALSE FALSE FALSE
## gore00 gunlaw_rank gun_rank_rev
## FALSE FALSE FALSE
## gunlaw_rank3_rev gunlaw_scale hispanic04
## FALSE FALSE FALSE
## hispanic08 hs_or_more indpct_m
## FALSE FALSE FALSE
## kerry04 libpct_m mccain08
## FALSE FALSE FALSE
## modpct_m nader00 obama08
## FALSE FALSE FALSE
## obama_win08 over64 permit
## FALSE FALSE FALSE
## pop_18_24 pot_policy prcapinc
## FALSE FALSE FALSE
## region relig_import religiosity
## FALSE FALSE FALSE
## reppct_m secularism secularism3
## FALSE FALSE FALSE
## seniority_sen2 south stateArizona
## FALSE FALSE FALSE
## stateArkansas stateCalifornia stateColorado
## FALSE FALSE FALSE
## stateConnecticut stateFlorida stateGeorgia
## FALSE FALSE FALSE
## stateIllinois stateIndiana stateIowa
## FALSE FALSE FALSE
## stateKansas stateKentucky stateLouisiana
## FALSE FALSE FALSE
## stateMaryland stateMassachusetts stateMichigan
## FALSE FALSE FALSE
## stateMinnesota stateMissouri stateNew Hampshire
## FALSE FALSE FALSE
## stateNew Jersey stateNew Mexico stateNew York
## FALSE FALSE FALSE
## stateOhio stateOregon statePennsylvania
## FALSE FALSE FALSE
## stateSouth Carolina stateTennessee stateTexas
## FALSE FALSE FALSE
## stateUtah stateVirginia stateWashington
## FALSE FALSE FALSE
## stateWest Virginia stateWisconsin stateWyoming
## FALSE FALSE FALSE
## stateid to_0004 to_0408
## FALSE FALSE TRUE
## trnout00 trnout04 unemploy
## FALSE FALSE FALSE
## union04 union07 urban
## FALSE FALSE FALSE
## vep00_turnout vep04_turnout womleg_2007
## FALSE TRUE FALSE
## womleg_2010
## FALSE
It was found that the 4-variable model had the highest Adjusted R-squared; however the plot of Adjusted R-squared against number of variables showed that the Adjusted R-squared already peaked at 2 variables. Thus, we chose the model with 2 explanatory variables to avoid overspecifying the model. The chosen variables were to_0408 and vep04_turnout.
model5<-lm(vep08_turnout~to_0408+vep04_turnout,
data=States)
summary(model5)
##
## Call:
## lm(formula = vep08_turnout ~ to_0408 + vep04_turnout, data = States)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.223e-07 -1.082e-08 5.408e-09 3.296e-08 1.601e-07
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.075e-08 1.149e-07 4.420e-01 0.661
## to_0408 1.000e+00 4.037e-09 2.477e+08 <2e-16 ***
## vep04_turnout 1.000e+00 1.806e-09 5.536e+08 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 7.325e-08 on 47 degrees of freedom
## Multiple R-squared: 1, Adjusted R-squared: 1
## F-statistic: 1.538e+17 on 2 and 47 DF, p-value: < 2.2e-16
# K fold cross validation
States_ctrl<-trainControl(method = "cv", number = 5)
model_caret<-train(vep08_turnout~to_0408+vep04_turnout,
data = States,
trControl = States_ctrl,
method = "lm",
na.action = na.pass)
summary(model_caret$resample$Rsquared)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1 1 1 1 1 1
Using K fold cross validation, we see that the prediction errors for all folds were close to zero. There was almost no variation across all predictions. We can evaluate the model to be accurate.