Predicting With Regression Models

Prepared by: Bernard Kiyanda

This analysis is presenting 3 prediction models for a dataset with multiple covariates and identify the most important covariates. The dataset is the wage and other data for a group of 3000 workers in the Mid-Atlantic region (http://www.inside-r.org/packages/cran/ISLR/docs/Wage).

Exploring the Dataset

library(ISLR); library(ggplot2); library(caret);

## Warning: package 'ggplot2' was built under R version 3.1.3

## Loading required package: lattice

data(Wage); Wage <- subset(Wage,select=-c(logwage))
summary(Wage)

##       year           age               sex                    maritl    
##  Min.   :2003   Min.   :18.00   1. Male  :3000   1. Never Married: 648  
##  1st Qu.:2004   1st Qu.:33.75   2. Female:   0   2. Married      :2074  
##  Median :2006   Median :42.00                    3. Widowed      :  19  
##  Mean   :2006   Mean   :42.41                    4. Divorced     : 204  
##  3rd Qu.:2008   3rd Qu.:51.00                    5. Separated    :  55  
##  Max.   :2009   Max.   :80.00                                           
##                                                                         
##        race                   education                     region    
##  1. White:2480   1. < HS Grad      :268   2. Middle Atlantic   :3000  
##  2. Black: 293   2. HS Grad        :971   1. New England       :   0  
##  3. Asian: 190   3. Some College   :650   3. East North Central:   0  
##  4. Other:  37   4. College Grad   :685   4. West North Central:   0  
##                  5. Advanced Degree:426   5. South Atlantic    :   0  
##                                           6. East South Central:   0  
##                                           (Other)              :   0  
##            jobclass               health      health_ins  
##  1. Industrial :1544   1. <=Good     : 858   1. Yes:2083  
##  2. Information:1456   2. >=Very Good:2142   2. No : 917  
##                                                           
##                                                           
##                                                           
##                                                           
##                                                           
##       wage       
##  Min.   : 20.09  
##  1st Qu.: 85.38  
##  Median :104.92  
##  Mean   :111.70  
##  3rd Qu.:128.68  
##  Max.   :318.34  
##

inTrain <- createDataPartition(y=Wage$wage, p=0.7, list=FALSE)
training <- Wage[inTrain,]; testing <- Wage[-inTrain,]

# Plot age versus wage, color by jobclass
qplot(age,wage,colour=jobclass, data=training)

# Plot age versus wage, color by jobclass
qplot(age,wage,colour=jobclass,data=training)

# Plot age versus wage, color by education
qplot(age,wage,colour=education,data=training)

Model 1 - Prediction With a Linear Regression Model

10 predictors can be used in the dataset, although only 3 are used in this model.

set.seed(934)
modFit<- train(wage ~ age + jobclass + education, method = "lm",data=training)
finMod <- modFit$finalModel
print(modFit)

## Linear Regression 
## 
## 2102 samples
##   10 predictor
## 
## No pre-processing
## Resampling: Bootstrapped (25 reps) 
## 
## Summary of sample sizes: 2102, 2102, 2102, 2102, 2102, 2102, ... 
## 
## Resampling results
## 
##   RMSE     Rsquared  RMSE SD   Rsquared SD
##   35.5938  0.25293   1.206433  0.02253018 
## 
##

## Diagnostics

# Plot prediction versus residuals. You would like to see a line at 0 ideally, as this is a difference between fitted values and the real values.
plot(finMod,1,pch=19,cex=0.5,col="#00000010")

# Ideally a line at 0 is desired, as this is a difference between fitted values and the real values.
qplot(finMod$fitted,finMod$residuals,colour=race,data=training)

# Plot by index
# i.e. which row of the data set you are looking at versus the residual
plot(finMod$residuals,pch=19)

# Whenever you see a trend based on the row number, this means that a variable is missing, i.e. there is a relationship with time in the data. Again You would like to see a line at 0.

# Predicted versus actual test set
pred <- predict(modFit, testing)
qplot(wage,pred,colour=year,data=testing)

# Here you expect to see a perfect 45 deg line, indicating a perfect relationship between training and test prediction.

#Importance of each variable in the prediction model
varImp(modFit)

## lm variable importance
## 
##                               Overall
## `education5. Advanced Degree` 100.000
## `education4. College Grad`     62.356
## age                            34.337
## `education3. Some College`     28.348
## `education2. HS Grad`           1.495
## `jobclass2. Information`        0.000

# If you want to use all covariates
# modFitAll<- train(wage ~ .,data=training,method="lm")
# pred <- predict(modFitAll, testing)
# qplot(wage,pred,data=testing)

Model 2 - Prediction With a Random Forest Model

library(randomForest)

## randomForest 4.6-10
## Type rfNews() to see new features/changes/bug fixes.

modFit2 <- randomForest(jobclass ~. , data=training, method="class")
varImp(modFit2)

##              Overall
## year       100.13721
## age        190.38386
## sex          0.00000
## maritl      47.32408
## race        42.64432
## education  111.61723
## region       0.00000
## health      24.47244
## health_ins  25.51185
## wage       222.97321

# Predicting using all covariates
prediction2 <- predict(modFit2, testing, type = "class")
qplot(age, wage,prediction2,colour=jobclass,data=testing)

confusionMatrix(prediction2, testing$jobclass)

## Confusion Matrix and Statistics
## 
##                 Reference
## Prediction       1. Industrial 2. Information
##   1. Industrial            320            163
##   2. Information           153            262
##                                           
##                Accuracy : 0.6481          
##                  95% CI : (0.6159, 0.6794)
##     No Information Rate : 0.5267          
##     P-Value [Acc > NIR] : 1.247e-13       
##                                           
##                   Kappa : 0.2934          
##  Mcnemar's Test P-Value : 0.6127          
##                                           
##             Sensitivity : 0.6765          
##             Specificity : 0.6165          
##          Pos Pred Value : 0.6625          
##          Neg Pred Value : 0.6313          
##              Prevalence : 0.5267          
##          Detection Rate : 0.3563          
##    Detection Prevalence : 0.5379          
##       Balanced Accuracy : 0.6465          
##                                           
##        'Positive' Class : 1. Industrial   
##

# Predicting using some covariates (wage + age + education)
modFit3 <- randomForest(jobclass ~ wage + age + education , data=training, method="class")
prediction3 <- predict(modFit3, testing, type = "class")
qplot(age, wage,prediction3,colour=jobclass,data=testing)

confusionMatrix(prediction3, testing$jobclass)

## Confusion Matrix and Statistics
## 
##                 Reference
## Prediction       1. Industrial 2. Information
##   1. Industrial            304            164
##   2. Information           169            261
##                                           
##                Accuracy : 0.6292          
##                  95% CI : (0.5966, 0.6609)
##     No Information Rate : 0.5267          
##     P-Value [Acc > NIR] : 3.663e-10       
##                                           
##                   Kappa : 0.2567          
##  Mcnemar's Test P-Value : 0.8265          
##                                           
##             Sensitivity : 0.6427          
##             Specificity : 0.6141          
##          Pos Pred Value : 0.6496          
##          Neg Pred Value : 0.6070          
##              Prevalence : 0.5267          
##          Detection Rate : 0.3385          
##    Detection Prevalence : 0.5212          
##       Balanced Accuracy : 0.6284          
##                                           
##        'Positive' Class : 1. Industrial   
##

Model 3 - Prediction With a Logistic Regression Model

training$jobclass <- factor(training$jobclass)
fit <- glm(jobclass ~ health + maritl + race + age + education + health_ins + wage,data=training,family=binomial())
#summary(fit)

testing$jobclass <- as.factor(testing$jobclass)
testing$jobclass_num <- as.numeric(testing$jobclass) - 1
testing$jobclass_num <- as.factor(testing$jobclass_num)

testing$predictionLogit <- round(predict.glm(fit, testing, type = "response"))
summary(testing$predictionLogit)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.0000  0.0000  0.0000  0.4499  1.0000  1.0000

testing$predictionLogit <- as.factor(testing$predictionLogit)
#levels(testing$predictionLogit); levels(testing$jobclass_num)

confusionMatrix(testing$predictionLogit, testing$jobclass_num)

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 329 165
##          1 144 260
##                                          
##                Accuracy : 0.6559         
##                  95% CI : (0.6238, 0.687)
##     No Information Rate : 0.5267         
##     P-Value [Acc > NIR] : 3.115e-15      
##                                          
##                   Kappa : 0.3081         
##  Mcnemar's Test P-Value : 0.2552         
##                                          
##             Sensitivity : 0.6956         
##             Specificity : 0.6118         
##          Pos Pred Value : 0.6660         
##          Neg Pred Value : 0.6436         
##              Prevalence : 0.5267         
##          Detection Rate : 0.3664         
##    Detection Prevalence : 0.5501         
##       Balanced Accuracy : 0.6537         
##                                          
##        'Positive' Class : 0              
##

varImp <- varImp(fit)
varImp

##                               Overall
## health2. >=Very Good        0.9947044
## maritl2. Married            0.5878644
## maritl3. Widowed            1.5972388
## maritl4. Divorced           0.8338205
## maritl5. Separated          1.3092777
## race2. Black                5.1407401
## race3. Asian                0.6141653
## race4. Other                0.2681622
## age                         2.9895228
## education2. HS Grad         1.9844540
## education3. Some College    4.0688007
## education4. College Grad    6.5759884
## education5. Advanced Degree 7.9344778
## health_ins2. No             2.8785913
## wage                        2.6220545