This analysis is presenting 3 prediction models for a dataset with multiple covariates and identify the most important covariates. The dataset is the wage and other data for a group of 3000 workers in the Mid-Atlantic region (http://www.inside-r.org/packages/cran/ISLR/docs/Wage).
library(ISLR); library(ggplot2); library(caret);
## Warning: package 'ggplot2' was built under R version 3.1.3
## Loading required package: lattice
data(Wage); Wage <- subset(Wage,select=-c(logwage))
summary(Wage)
## year age sex maritl
## Min. :2003 Min. :18.00 1. Male :3000 1. Never Married: 648
## 1st Qu.:2004 1st Qu.:33.75 2. Female: 0 2. Married :2074
## Median :2006 Median :42.00 3. Widowed : 19
## Mean :2006 Mean :42.41 4. Divorced : 204
## 3rd Qu.:2008 3rd Qu.:51.00 5. Separated : 55
## Max. :2009 Max. :80.00
##
## race education region
## 1. White:2480 1. < HS Grad :268 2. Middle Atlantic :3000
## 2. Black: 293 2. HS Grad :971 1. New England : 0
## 3. Asian: 190 3. Some College :650 3. East North Central: 0
## 4. Other: 37 4. College Grad :685 4. West North Central: 0
## 5. Advanced Degree:426 5. South Atlantic : 0
## 6. East South Central: 0
## (Other) : 0
## jobclass health health_ins
## 1. Industrial :1544 1. <=Good : 858 1. Yes:2083
## 2. Information:1456 2. >=Very Good:2142 2. No : 917
##
##
##
##
##
## wage
## Min. : 20.09
## 1st Qu.: 85.38
## Median :104.92
## Mean :111.70
## 3rd Qu.:128.68
## Max. :318.34
##
inTrain <- createDataPartition(y=Wage$wage, p=0.7, list=FALSE)
training <- Wage[inTrain,]; testing <- Wage[-inTrain,]
# Plot age versus wage, color by jobclass
qplot(age,wage,colour=jobclass, data=training)
# Plot age versus wage, color by jobclass
qplot(age,wage,colour=jobclass,data=training)
# Plot age versus wage, color by education
qplot(age,wage,colour=education,data=training)
10 predictors can be used in the dataset, although only 3 are used in this model.
set.seed(934)
modFit<- train(wage ~ age + jobclass + education, method = "lm",data=training)
finMod <- modFit$finalModel
print(modFit)
## Linear Regression
##
## 2102 samples
## 10 predictor
##
## No pre-processing
## Resampling: Bootstrapped (25 reps)
##
## Summary of sample sizes: 2102, 2102, 2102, 2102, 2102, 2102, ...
##
## Resampling results
##
## RMSE Rsquared RMSE SD Rsquared SD
## 35.5938 0.25293 1.206433 0.02253018
##
##
## Diagnostics
# Plot prediction versus residuals. You would like to see a line at 0 ideally, as this is a difference between fitted values and the real values.
plot(finMod,1,pch=19,cex=0.5,col="#00000010")
# Ideally a line at 0 is desired, as this is a difference between fitted values and the real values.
qplot(finMod$fitted,finMod$residuals,colour=race,data=training)
# Plot by index
# i.e. which row of the data set you are looking at versus the residual
plot(finMod$residuals,pch=19)
# Whenever you see a trend based on the row number, this means that a variable is missing, i.e. there is a relationship with time in the data. Again You would like to see a line at 0.
# Predicted versus actual test set
pred <- predict(modFit, testing)
qplot(wage,pred,colour=year,data=testing)
# Here you expect to see a perfect 45 deg line, indicating a perfect relationship between training and test prediction.
#Importance of each variable in the prediction model
varImp(modFit)
## lm variable importance
##
## Overall
## `education5. Advanced Degree` 100.000
## `education4. College Grad` 62.356
## age 34.337
## `education3. Some College` 28.348
## `education2. HS Grad` 1.495
## `jobclass2. Information` 0.000
# If you want to use all covariates
# modFitAll<- train(wage ~ .,data=training,method="lm")
# pred <- predict(modFitAll, testing)
# qplot(wage,pred,data=testing)
library(randomForest)
## randomForest 4.6-10
## Type rfNews() to see new features/changes/bug fixes.
modFit2 <- randomForest(jobclass ~. , data=training, method="class")
varImp(modFit2)
## Overall
## year 100.13721
## age 190.38386
## sex 0.00000
## maritl 47.32408
## race 42.64432
## education 111.61723
## region 0.00000
## health 24.47244
## health_ins 25.51185
## wage 222.97321
# Predicting using all covariates
prediction2 <- predict(modFit2, testing, type = "class")
qplot(age, wage,prediction2,colour=jobclass,data=testing)
confusionMatrix(prediction2, testing$jobclass)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 1. Industrial 2. Information
## 1. Industrial 320 163
## 2. Information 153 262
##
## Accuracy : 0.6481
## 95% CI : (0.6159, 0.6794)
## No Information Rate : 0.5267
## P-Value [Acc > NIR] : 1.247e-13
##
## Kappa : 0.2934
## Mcnemar's Test P-Value : 0.6127
##
## Sensitivity : 0.6765
## Specificity : 0.6165
## Pos Pred Value : 0.6625
## Neg Pred Value : 0.6313
## Prevalence : 0.5267
## Detection Rate : 0.3563
## Detection Prevalence : 0.5379
## Balanced Accuracy : 0.6465
##
## 'Positive' Class : 1. Industrial
##
# Predicting using some covariates (wage + age + education)
modFit3 <- randomForest(jobclass ~ wage + age + education , data=training, method="class")
prediction3 <- predict(modFit3, testing, type = "class")
qplot(age, wage,prediction3,colour=jobclass,data=testing)
confusionMatrix(prediction3, testing$jobclass)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 1. Industrial 2. Information
## 1. Industrial 304 164
## 2. Information 169 261
##
## Accuracy : 0.6292
## 95% CI : (0.5966, 0.6609)
## No Information Rate : 0.5267
## P-Value [Acc > NIR] : 3.663e-10
##
## Kappa : 0.2567
## Mcnemar's Test P-Value : 0.8265
##
## Sensitivity : 0.6427
## Specificity : 0.6141
## Pos Pred Value : 0.6496
## Neg Pred Value : 0.6070
## Prevalence : 0.5267
## Detection Rate : 0.3385
## Detection Prevalence : 0.5212
## Balanced Accuracy : 0.6284
##
## 'Positive' Class : 1. Industrial
##
training$jobclass <- factor(training$jobclass)
fit <- glm(jobclass ~ health + maritl + race + age + education + health_ins + wage,data=training,family=binomial())
#summary(fit)
testing$jobclass <- as.factor(testing$jobclass)
testing$jobclass_num <- as.numeric(testing$jobclass) - 1
testing$jobclass_num <- as.factor(testing$jobclass_num)
testing$predictionLogit <- round(predict.glm(fit, testing, type = "response"))
summary(testing$predictionLogit)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 0.0000 0.0000 0.4499 1.0000 1.0000
testing$predictionLogit <- as.factor(testing$predictionLogit)
#levels(testing$predictionLogit); levels(testing$jobclass_num)
confusionMatrix(testing$predictionLogit, testing$jobclass_num)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 329 165
## 1 144 260
##
## Accuracy : 0.6559
## 95% CI : (0.6238, 0.687)
## No Information Rate : 0.5267
## P-Value [Acc > NIR] : 3.115e-15
##
## Kappa : 0.3081
## Mcnemar's Test P-Value : 0.2552
##
## Sensitivity : 0.6956
## Specificity : 0.6118
## Pos Pred Value : 0.6660
## Neg Pred Value : 0.6436
## Prevalence : 0.5267
## Detection Rate : 0.3664
## Detection Prevalence : 0.5501
## Balanced Accuracy : 0.6537
##
## 'Positive' Class : 0
##
varImp <- varImp(fit)
varImp
## Overall
## health2. >=Very Good 0.9947044
## maritl2. Married 0.5878644
## maritl3. Widowed 1.5972388
## maritl4. Divorced 0.8338205
## maritl5. Separated 1.3092777
## race2. Black 5.1407401
## race3. Asian 0.6141653
## race4. Other 0.2681622
## age 2.9895228
## education2. HS Grad 1.9844540
## education3. Some College 4.0688007
## education4. College Grad 6.5759884
## education5. Advanced Degree 7.9344778
## health_ins2. No 2.8785913
## wage 2.6220545