Machine Learning - Continuous Data example. Several machine learning algorithms are applied to the ISLR Wage data set to predict wage from multiple features.

Load Required Packages

library(caret)
## Warning: package 'caret' was built under R version 3.2.3
## Loading required package: lattice
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 3.2.3
library(ISLR)
library(ggplot2)
# libraries for recursive partitioning trees and tree plotting
library(rpart)
library(rpart.plot)
library(rattle)
## Warning: package 'rattle' was built under R version 3.2.3
## Rattle: A free graphical interface for data mining with R.
## Version 4.1.0 Copyright (c) 2006-2015 Togaware Pty Ltd.
## Type 'rattle()' to shake, rattle, and roll your data.

Split Data

data(Wage)
dim(Wage)
## [1] 3000   12
colnames(Wage)
##  [1] "year"       "age"        "sex"        "maritl"     "race"      
##  [6] "education"  "region"     "jobclass"   "health"     "health_ins"
## [11] "logwage"    "wage"
set.seed(1)
training_indices <- createDataPartition(y=Wage$wage,p=0.8,list=F)
# drop the logwage column (column 11): it is derived directly from the outcome, so keeping it as a feature would leak the answer
training_set <- Wage[training_indices,-c(11)]
test_set <- Wage[-training_indices,-c(11)]
# dim(training_set); dim(test_set)
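A quick sanity check (a sketch, output not shown): createDataPartition stratifies on the outcome, so wage should be distributed similarly in both subsets.

# compare the outcome distribution across the two subsets
summary(training_set$wage)
summary(test_set$wage)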

Exploratory Data Analysis

summary(Wage)
##       year           age               sex                    maritl    
##  Min.   :2003   Min.   :18.00   1. Male  :3000   1. Never Married: 648  
##  1st Qu.:2004   1st Qu.:33.75   2. Female:   0   2. Married      :2074  
##  Median :2006   Median :42.00                    3. Widowed      :  19  
##  Mean   :2006   Mean   :42.41                    4. Divorced     : 204  
##  3rd Qu.:2008   3rd Qu.:51.00                    5. Separated    :  55  
##  Max.   :2009   Max.   :80.00                                           
##                                                                         
##        race                   education                     region    
##  1. White:2480   1. < HS Grad      :268   2. Middle Atlantic   :3000  
##  2. Black: 293   2. HS Grad        :971   1. New England       :   0  
##  3. Asian: 190   3. Some College   :650   3. East North Central:   0  
##  4. Other:  37   4. College Grad   :685   4. West North Central:   0  
##                  5. Advanced Degree:426   5. South Atlantic    :   0  
##                                           6. East South Central:   0  
##                                           (Other)              :   0  
##            jobclass               health      health_ins      logwage     
##  1. Industrial :1544   1. <=Good     : 858   1. Yes:2083   Min.   :3.000  
##  2. Information:1456   2. >=Very Good:2142   2. No : 917   1st Qu.:4.447  
##                                                            Median :4.653  
##                                                            Mean   :4.654  
##                                                            3rd Qu.:4.857  
##                                                            Max.   :5.763  
##                                                                           
##       wage       
##  Min.   : 20.09  
##  1st Qu.: 85.38  
##  Median :104.92  
##  Mean   :111.70  
##  3rd Qu.:128.68  
##  Max.   :318.34  
## 
# remove near-zero-variance covariates (features with little or no variability)
nearZeroVar(training_set,saveMetrics = T)
##            freqRatio percentUnique zeroVar   nzv
## year        1.071979    0.29142381   FALSE FALSE
## age         1.000000    2.53955037   FALSE FALSE
## sex         0.000000    0.04163197    TRUE  TRUE
## maritl      3.165399    0.20815987   FALSE FALSE
## race        8.186722    0.16652789   FALSE FALSE
## education   1.451673    0.20815987   FALSE FALSE
## region      0.000000    0.04163197    TRUE  TRUE
## jobclass    1.061803    0.08326395   FALSE FALSE
## health      2.511696    0.08326395   FALSE FALSE
## health_ins  2.276944    0.08326395   FALSE FALSE
## wage        1.074468   18.40133222   FALSE FALSE
near_zero_covariates <- nearZeroVar(training_set)
head(near_zero_covariates)
## [1] 3 7
if(length(near_zero_covariates)>0)
{
  training_set2 <- training_set[,-near_zero_covariates]
  test_set2 <- test_set[,-near_zero_covariates]
} else {
  training_set2 <- training_set
  test_set2 <- test_set
}
dim(training_set2); dim(test_set2)
## [1] 2402    9
## [1] 598   9
# exploratory plots
colnames(training_set2)
## [1] "year"       "age"        "maritl"     "race"       "education" 
## [6] "jobclass"   "health"     "health_ins" "wage"
featurePlot(x=training_set2[1:8], y=training_set2$wage, plot="pairs")

qplot(x=training_set2$year, y=training_set2$wage ,ylab="wage")

qplot(x=training_set2$age, y=training_set2$wage ,ylab="wage")

qplot(x=training_set2$age, y=training_set2$wage, color=training_set2$education,pch=training_set2$jobclass ,ylab="wage")

# add regression smoother
qq <- qplot(x=training_set2$age, y=training_set2$wage, color=training_set2$education,pch=training_set2$jobclass ,ylab="wage")
qq + geom_smooth(method="lm",formula=y~x)

qplot(x=training_set2$maritl, y=training_set2$wage ,ylab="wage", color=training_set2$age)

qplot(x=training_set2$age, y=training_set2$wage, color=training_set2$maritl ,ylab="wage")

qplot(x=training_set2$race,y=training_set2$wage,color=training_set2$education ,pch=training_set2$jobclass,cex=1,ylab="wage")

qplot(x=training_set2$health,y=training_set2$wage,color=training_set2$health_ins ,pch=training_set2$jobclass,cex=1,ylab="wage")

# correlated features
# correlation can only be computed on numeric features, so factor columns are excluded
sapply(training_set2[1,],class)
##       year        age     maritl       race  education   jobclass 
##  "integer"  "integer"   "factor"   "factor"   "factor"   "factor" 
##     health health_ins       wage 
##   "factor"   "factor"  "numeric"
numeric_set <- which(sapply(training_set2[1,],class)=="integer")

feature_correlation <- cor(training_set2[,numeric_set])
# findCorrelation searches the correlation matrix and returns the columns that should be removed to reduce pair-wise correlations
findCorrelation(feature_correlation,0.75,verbose=T,names=T)
## All correlations <= 0.75
## character(0)
high_correlation <- findCorrelation(feature_correlation,0.75,verbose=T,names=T)
## All correlations <= 0.75
high_correlation
## character(0)
# grep(c(high_correlation[1]),colnames(training_set2))
# grep(c(high_correlation[2]),colnames(training_set2))
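For illustration only, a minimal sketch on made-up data (the toy data frame is hypothetical) showing what findCorrelation returns when a highly correlated pair is present:

# two nearly identical columns; findCorrelation should flag one of them
toy <- data.frame(a = 1:100, b = 1:100 + rnorm(100, sd = 0.1), c = rnorm(100))
findCorrelation(cor(toy), cutoff = 0.75, names = TRUE)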
if(length(high_correlation)>0)
{
  # drop the flagged columns by name (findCorrelation was called with names=T)
  training_set3 <- training_set2[,!(colnames(training_set2) %in% high_correlation)]
  test_set3 <- test_set2[,!(colnames(test_set2) %in% high_correlation)]
}else{
  training_set3 <- training_set2
  test_set3 <- test_set2
}
dim(training_set3); dim(test_set3)
## [1] 2402    9
## [1] 598   9
# PCA
pc <- prcomp(training_set3[,numeric_set],center=T,scale=T)
plot(pc,type="l")

pc$rotation[order(-abs(pc$rotation[,"PC1"])),]
##             PC1        PC2
## year -0.7071068  0.7071068
## age  -0.7071068 -0.7071068
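With only two numeric features the rotation is trivial, but the proportion of variance explained by each component can be checked directly (a sketch, output not shown):

# variance explained per principal component
summary(pc)
pc$sdev^2 / sum(pc$sdev^2)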

Model Building

# neither the correlation check nor the PCA suggests dropping further features, so all of them enter the models
# k-fold cross validation
train_control <- trainControl(method="cv", number=10, savePredictions = T)
# leftover tuning grid for a naive Bayes model; not used by any of the models below
grid <- expand.grid(.fL=c(0), .usekernel=c(FALSE))

# linear regression
set.seed(1)
lm_model <- train(wage~., data=training_set3, trControl=train_control, method="lm" ,preProcess=c("scale","center","pca")) 

#lm_model
summary(lm_model)
## 
## Call:
## lm(formula = .outcome ~ ., data = dat)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -99.154 -20.465  -3.968  14.195 215.755 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 111.8594     0.7274 153.789  < 2e-16 ***
## PC1          15.8721     0.5386  29.469  < 2e-16 ***
## PC2           0.5658     0.5952   0.951   0.3419    
## PC3          -1.3486     0.6333  -2.130   0.0333 *  
## PC4          -3.3924     0.6366  -5.329 1.08e-07 ***
## PC5          -1.0708     0.6649  -1.610   0.1074    
## PC6           0.1313     0.7027   0.187   0.8518    
## PC7           2.8522     0.7066   4.036 5.60e-05 ***
## PC8           0.5709     0.7132   0.800   0.4235    
## PC9          -0.1484     0.7304  -0.203   0.8390    
## PC10         -1.0694     0.7374  -1.450   0.1471    
## PC11          5.8313     0.7583   7.690 2.13e-14 ***
## PC12         -1.4946     0.7931  -1.885   0.0596 .  
## PC13          0.3721     0.8260   0.451   0.6524    
## PC14         -4.5201     0.8498  -5.319 1.14e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 35.65 on 2387 degrees of freedom
## Multiple R-squared:  0.2984, Adjusted R-squared:  0.2943 
## F-statistic: 72.51 on 14 and 2387 DF,  p-value: < 2.2e-16
# lm_model$finalModel
varImp(lm_model)
## lm variable importance
## 
##        Overall
## PC1  100.00000
## PC11  25.62496
## PC4   17.56131
## PC14  17.52644
## PC7   13.14698
## PC3    6.63467
## PC12   5.79812
## PC5    4.86199
## PC10   4.31496
## PC2    2.60858
## PC8    2.09580
## PC13   0.90073
## PC9    0.05592
## PC6    0.00000
plot(varImp(lm_model))
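Because trainControl was called with savePredictions = T, the held-out predictions from each fold are stored on the fitted object; a sketch of how to inspect them:

# out-of-fold predictions saved during cross-validation
head(lm_model$pred)   # columns include pred, obs, rowIndex and Resample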

# re-train model using features that look important
varImp(train(wage~., data=training_set3, trControl=train_control, method="lm" ,preProcess=c("scale","center"))) # pca is left out of preProcess here because component importances are hard to map back to the original features
## lm variable importance
## 
##                               Overall
## `education5. Advanced Degree` 100.000
## `health_ins2. No`              64.662
## `education4. College Grad`     64.144
## `maritl2. Married`             52.227
## `education3. Some College`     36.691
## `health2. >=Very Good`         27.513
## year                           20.537
## age                            18.173
## `education2. HS Grad`          16.232
## `jobclass2. Information`        9.767
## `maritl5. Separated`            8.207
## `race2. Black`                  7.951
## `race4. Other`                  5.461
## `race3. Asian`                  5.131
## `maritl4. Divorced`             1.245
## `maritl3. Widowed`              0.000
set.seed(1)
lm_model2 <- train(wage ~ education + health_ins + jobclass + maritl, data=training_set3, trControl=train_control, method="lm",preProcess=c("scale","center","pca"))

# tree / recursive partitioning
set.seed(1)
rpart_model <- train(wage~., data=training_set3, trControl=train_control, method="rpart" ,preProcess=c("scale","center","pca"))
# plot the fitted regression tree
fancyRpartPlot(rpart_model$finalModel)
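A sketch of two optional checks: the complexity parameter chosen by cross-validation, and an alternative rendering of the same tree with rpart.plot (already loaded).

# cp value selected by cross-validation and a simpler tree plot
rpart_model$bestTune
prp(rpart_model$finalModel)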

# boosting with trees
set.seed(1)
gbm_model <- train(wage~., data=training_set3, trControl=train_control, method = "gbm", verbose=F ,preProcess=c("scale","center","pca"))
## Loading required package: gbm
## Loading required package: survival
## 
## Attaching package: 'survival'
## The following object is masked from 'package:caret':
## 
##     cluster
## Loading required package: splines
## Loading required package: parallel
## Loaded gbm 2.1.1
## Loading required package: plyr
summary(train(wage~., data=training_set3, trControl=train_control, method = "gbm", verbose=F ,preProcess=c("scale","center")))

##                                                     var     rel.inf
## education5. Advanced Degree education5. Advanced Degree 31.26189167
## health_ins2. No                         health_ins2. No 17.03002997
## age                                                 age 14.28736109
## education4. College Grad       education4. College Grad 12.06047761
## maritl2. Married                       maritl2. Married  9.57699526
## year                                               year  4.10832632
## health2. >=Very Good               health2. >=Very Good  3.76809844
## education2. HS Grad                 education2. HS Grad  3.13680596
## education3. Some College       education3. Some College  2.00228073
## jobclass2. Information           jobclass2. Information  1.43221319
## race3. Asian                               race3. Asian  0.65102836
## race2. Black                               race2. Black  0.35992125
## race4. Other                               race4. Other  0.23479440
## maritl4. Divorced                     maritl4. Divorced  0.08977574
## maritl3. Widowed                       maritl3. Widowed  0.00000000
## maritl5. Separated                   maritl5. Separated  0.00000000
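caret tunes the boosting parameters (n.trees, interaction.depth, etc.) over its default grid; a sketch of how to see what was selected:

# tuning parameters chosen for the boosted model and the RMSE profile
gbm_model$bestTune
plot(gbm_model)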
# random forest; training is slow when the number of CV folds is large
set.seed(1)
rf_model <- train(wage~., data=training_set3, trControl=train_control, method="rf" ,preProcess=c("scale","center","pca"))
## Loading required package: randomForest
## randomForest 4.6-12
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## The following object is masked from 'package:ggplot2':
## 
##     margin
## Warning in randomForest.default(x, y, mtry = param$mtry, ...): invalid
## mtry: reset to within valid range
# lasso - regularized regression
set.seed(1)
lasso_model <- train(wage~., data=training_set3, trControl=train_control, method="lasso", metric="RMSE" ,preProcess=c("scale","center","pca"))
## Loading required package: elasticnet
## Loading required package: lars
## Loaded lars 1.2
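The lasso is tuned over the fraction of the full solution path; a sketch of how to check the selected fraction:

# fraction of the lasso path selected by cross-validation, plus the RMSE profile
lasso_model$bestTune
plot(lasso_model)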
# collect resamples
train_results <- resamples(list(LM=lm_model,LM2=lm_model2,RPART=rpart_model,GBM=gbm_model,RF=rf_model,LS=lasso_model))
# summarize the distributions
summary(train_results)
## 
## Call:
## summary.resamples(object = train_results)
## 
## Models: LM, LM2, RPART, GBM, RF, LS 
## Number of resamples: 10 
## 
## RMSE 
##        Min. 1st Qu. Median  Mean 3rd Qu.  Max. NA's
## LM    32.58   34.09  35.27 35.67   37.35 39.51    0
## LM2   33.07   34.34  35.50 35.93   37.61 39.92    0
## RPART 34.52   35.64  36.07 37.14   38.65 42.68    0
## GBM   32.35   33.55  34.65 35.45   37.20 40.35    0
## RF    33.66   35.00  35.64 36.46   38.14 40.17    0
## LS    32.73   34.08  35.25 35.67   37.33 39.58    0
## 
## Rsquared 
##          Min. 1st Qu. Median   Mean 3rd Qu.   Max. NA's
## LM    0.18620  0.2642 0.3002 0.2949  0.3274 0.3747    0
## LM2   0.17000  0.2638 0.2937 0.2848  0.3080 0.3675    0
## RPART 0.08589  0.2189 0.2365 0.2383  0.2909 0.3217    0
## GBM   0.17550  0.2646 0.3162 0.3059  0.3490 0.4123    0
## RF    0.19990  0.2517 0.2667 0.2747  0.3088 0.3275    0
## LS    0.18190  0.2655 0.3064 0.2951  0.3262 0.3775    0
# boxplots of results
bwplot(train_results)

# the above results suggest that the GBM model performs best on the resampled training data; a paired check of the differences follows below
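Because the resamples are paired across models, the differences can also be tested directly before committing to GBM (a sketch; model_differences is just an illustrative name):

# paired comparison of resampled RMSE / R-squared differences between models
model_differences <- diff(train_results)
summary(model_differences)
bwplot(model_differences)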

Evaluate Model Accuracy on Test Set

# Ideally, only the model that performs best on the training resamples would be carried forward to the test set; all models are evaluated here purely for illustration
par(mfrow=c(2,2))
test_pred_lm <- predict(lm_model, newdata=test_set3)
R2(test_set3$wage,test_pred_lm)
## [1] 0.3581127
RMSE(test_set3$wage,test_pred_lm)
## [1] 31.05722
plot(test_pred_lm,test_set3$wage)

test_pred_lm2 <- predict(lm_model2, newdata=test_set3)
R2(test_set3$wage,test_pred_lm2)
## [1] 0.3594469
RMSE(test_set3$wage,test_pred_lm2)
## [1] 31.03226
plot(test_pred_lm2,test_set3$wage)

test_pred_rpart <- predict(rpart_model, newdata=test_set3)
R2(test_set3$wage,test_pred_rpart)
## [1] 0.2684005
RMSE(test_set3$wage,test_pred_rpart)
## [1] 33.16159
plot(test_pred_rpart,test_set3$wage)

test_pred_gbm <- predict(gbm_model, newdata=test_set3)
R2(test_set3$wage,test_pred_gbm)
## [1] 0.354009
RMSE(test_set3$wage,test_pred_gbm)
## [1] 31.18928
plot(test_pred_gbm,test_set3$wage)

test_pred_rf <- predict(rf_model, newdata=test_set3)
R2(test_set3$wage,test_pred_rf)
## [1] 0.3035356
RMSE(test_set3$wage,test_pred_rf)
## [1] 32.65215
plot(test_pred_rf,test_set3$wage)

test_pred_lasso <- predict(lasso_model, newdata=test_set3)
R2(test_set3$wage,test_pred_lasso)
## [1] 0.3562957
RMSE(test_set3$wage,test_pred_lasso)
## [1] 31.11336
plot(test_pred_lasso,test_set3$wage)
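To summarize the test-set results in one place, the same metrics can be collected into a small table (a sketch reusing the prediction objects above; test_predictions is just an illustrative name):

# gather test-set R-squared and RMSE for all models into one data frame
test_predictions <- list(LM=test_pred_lm, LM2=test_pred_lm2, RPART=test_pred_rpart,
                         GBM=test_pred_gbm, RF=test_pred_rf, LS=test_pred_lasso)
data.frame(R2   = sapply(test_predictions, function(p) R2(test_set3$wage, p)),
           RMSE = sapply(test_predictions, function(p) RMSE(test_set3$wage, p)))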