Example: Wage data

library(ISLR)
library(ggplot2)
library(caret)

## Loading required package: lattice

data(Wage)
# Drop logwage before modelling: the outcome predicted below is wage, and
# logwage (the log of wage according to the ISLR documentation) would leak
# the response into the predictors.
Wage <- Wage[, setdiff(names(Wage), "logwage")]
summary(Wage)

##       year           age               sex                    maritl    
##  Min.   :2003   Min.   :18.00   1. Male  :3000   1. Never Married: 648  
##  1st Qu.:2004   1st Qu.:33.75   2. Female:   0   2. Married      :2074  
##  Median :2006   Median :42.00                    3. Widowed      :  19  
##  Mean   :2006   Mean   :42.41                    4. Divorced     : 204  
##  3rd Qu.:2008   3rd Qu.:51.00                    5. Separated    :  55  
##  Max.   :2009   Max.   :80.00                                           
##                                                                         
##        race                   education                     region    
##  1. White:2480   1. < HS Grad      :268   2. Middle Atlantic   :3000  
##  2. Black: 293   2. HS Grad        :971   1. New England       :   0  
##  3. Asian: 190   3. Some College   :650   3. East North Central:   0  
##  4. Other:  37   4. College Grad   :685   4. West North Central:   0  
##                  5. Advanced Degree:426   5. South Atlantic    :   0  
##                                           6. East South Central:   0  
##                                           (Other)              :   0  
##            jobclass               health      health_ins  
##  1. Industrial :1544   1. <=Good     : 858   1. Yes:2083  
##  2. Information:1456   2. >=Very Good:2142   2. No : 917  
##                                                           
##                                                           
##                                                           
##                                                           
##                                                           
##       wage       
##  Min.   : 20.09  
##  1st Qu.: 85.38  
##  Median :104.92  
##  Mean   :111.70  
##  3rd Qu.:128.68  
##  Max.   :318.34  
##

Split the data

# Fix the RNG seed so the random 70/30 partition (and every result that
# depends on it) can be reproduced on a re-run.
set.seed(32343)
# Row indices of the training set: 70% of the rows, sampled on wage.
inTrain <- createDataPartition(y = Wage$wage, p = 0.7, list = FALSE)
training <- Wage[inTrain, ]
testing <- Wage[-inTrain, ]
dim(training)

## [1] 2102   11

dim(testing)

## [1] 898  11

Feature Plot

# Pairs plot of wage against three candidate predictors.
featurePlot(
  x = training[, c("age", "education", "jobclass")],
  y = training$wage,
  plot = "pairs"
)

# Plot age versus wage, coloured by jobclass.
# ggplot() + geom_point() replaces qplot(), which is deprecated in
# ggplot2 >= 3.4.0; the resulting scatter plot is the same.
ggplot(training, aes(x = age, y = wage, colour = jobclass)) +
  geom_point()

# Same scatter, coloured by education level instead.
ggplot(training, aes(x = age, y = wage, colour = education)) +
  geom_point()

#Fit a linear model

# Regress wage on age, job class and education using caret's "lm" method.
modFit <- train(
  wage ~ age + jobclass + education,
  data = training,
  method = "lm"
)
# Keep the fitted model object itself for the diagnostic plots below.
finMod <- modFit[["finalModel"]]
print(modFit)

## Linear Regression 
## 
## 2102 samples
##   10 predictor
## 
## No pre-processing
## Resampling: Bootstrapped (25 reps) 
## 
## Summary of sample sizes: 2102, 2102, 2102, 2102, 2102, 2102, ... 
## 
## Resampling results
## 
##   RMSE      Rsquared   RMSE SD   Rsquared SD
##   35.83034  0.2598788  1.611693  0.02021685 
## 
##

Diagnostics

# First diagnostic panel of the fitted model, drawn with small,
# nearly transparent points.
plot(finMod, which = 1, pch = 19, cex = 0.5, col = "#00000010")

# The line is mostly close to zero, so the residuals are small. The outliers
# at the top, however, might be explained by some other predictors.

Color by variables not used in the model

# Residuals versus fitted values, coloured by race (a variable that is not in
# the model). The full component name fitted.values is spelled out instead of
# relying on `$` partial matching, and ggplot() replaces the deprecated qplot().
ggplot(
  training,
  aes(x = finMod$fitted.values, y = finMod$residuals, colour = race)
) +
  geom_point() +
  labs(x = "Fitted values", y = "Residuals")

# Plot by index: a trend here would suggest the row order carries information
# the model is missing.

plot(finMod$residuals, pch = 19)

#Predicted versus truth in test set

pred <- predict(modFit, testing)
# Ideally the points fall on a 45-degree straight line.
# ggplot() replaces the deprecated qplot().
ggplot(testing, aes(x = wage, y = pred, colour = year)) +
  geom_point()

# Another evaluation method: root-mean-squared error (RMSE) on the test set.
# The comparison must use testing$wage: logwage was dropped when the data was
# loaded, so testing$logwage is NULL and the previous expression always
# evaluated to 0. The squared errors are averaged (not summed) so the value is
# a true RMSE, and the predictions computed above are reused.
sqrt(mean((pred - testing$wage)^2))

## (Output not recorded: re-run to obtain the test-set RMSE.)

If you want to use all covariates

# Regress wage on every remaining column of the training set.
modFitAll <- train(
  wage ~ .,
  method = "lm",
  data = training
)

## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient
## fit may be misleading

## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient
## fit may be misleading

## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient
## fit may be misleading

## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient
## fit may be misleading

## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient
## fit may be misleading

## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient
## fit may be misleading

## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient
## fit may be misleading

## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient
## fit may be misleading

## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient
## fit may be misleading

## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient
## fit may be misleading

## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient
## fit may be misleading

## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient
## fit may be misleading

## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient
## fit may be misleading

## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient
## fit may be misleading

## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient
## fit may be misleading

## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient
## fit may be misleading

## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient
## fit may be misleading

## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient
## fit may be misleading

## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient
## fit may be misleading

## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient
## fit may be misleading

## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient
## fit may be misleading

## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient
## fit may be misleading

## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient
## fit may be misleading

## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient
## fit may be misleading

## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient
## fit may be misleading

# Test-set predictions from the all-covariate model (overwrites pred).
pred <- predict(modFitAll, testing)

## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient
## fit may be misleading

# Predicted versus observed wage; ggplot() replaces the deprecated qplot().
ggplot(testing, aes(x = wage, y = pred)) +
  geom_point()