Machine Learning - Continuous Data example. Uses different machine learning algorithms to predict wage from multiple features.
Load Required Packages
library(caret)
## Warning: package 'caret' was built under R version 3.2.3
## Loading required package: lattice
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 3.2.3
library(ISLR)
library(ggplot2)
# libraries for partition trees
library(rpart)
library(rpart.plot)
library(rattle)
## Warning: package 'rattle' was built under R version 3.2.3
## Rattle: A free graphical interface for data mining with R.
## Version 4.1.0 Copyright (c) 2006-2015 Togaware Pty Ltd.
## Type 'rattle()' to shake, rattle, and roll your data.
Split Data
data(Wage)
dim(Wage)
## [1] 3000 12
colnames(Wage)
## [1] "year" "age" "sex" "maritl" "race"
## [6] "education" "region" "jobclass" "health" "health_ins"
## [11] "logwage" "wage"
set.seed(1)
training_indices <- createDataPartition(y=Wage$wage,p=0.8,list=F)
# remove log-wage column as feature
training_set <- Wage[training_indices,-c(11)]
test_set <- Wage[-training_indices,-c(11)]
# dim(training_set); dim(test_set)
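# A minimal sketch (optional check): createDataPartition samples within quantile
# groups of the outcome, so the wage distribution should look similar in both splits.
summary(training_set$wage); summary(test_set$wage)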
Exploratory Data Analysis
summary(Wage)
## year age sex maritl
## Min. :2003 Min. :18.00 1. Male :3000 1. Never Married: 648
## 1st Qu.:2004 1st Qu.:33.75 2. Female: 0 2. Married :2074
## Median :2006 Median :42.00 3. Widowed : 19
## Mean :2006 Mean :42.41 4. Divorced : 204
## 3rd Qu.:2008 3rd Qu.:51.00 5. Separated : 55
## Max. :2009 Max. :80.00
##
## race education region
## 1. White:2480 1. < HS Grad :268 2. Middle Atlantic :3000
## 2. Black: 293 2. HS Grad :971 1. New England : 0
## 3. Asian: 190 3. Some College :650 3. East North Central: 0
## 4. Other: 37 4. College Grad :685 4. West North Central: 0
## 5. Advanced Degree:426 5. South Atlantic : 0
## 6. East South Central: 0
## (Other) : 0
## jobclass health health_ins logwage
## 1. Industrial :1544 1. <=Good : 858 1. Yes:2083 Min. :3.000
## 2. Information:1456 2. >=Very Good:2142 2. No : 917 1st Qu.:4.447
## Median :4.653
## Mean :4.654
## 3rd Qu.:4.857
## Max. :5.763
##
## wage
## Min. : 20.09
## 1st Qu.: 85.38
## Median :104.92
## Mean :111.70
## 3rd Qu.:128.68
## Max. :318.34
##
# remove near-zero-variance covariates (features with little or no variability)
nearZeroVar(training_set,saveMetrics = T)
## freqRatio percentUnique zeroVar nzv
## year 1.071979 0.29142381 FALSE FALSE
## age 1.000000 2.53955037 FALSE FALSE
## sex 0.000000 0.04163197 TRUE TRUE
## maritl 3.165399 0.20815987 FALSE FALSE
## race 8.186722 0.16652789 FALSE FALSE
## education 1.451673 0.20815987 FALSE FALSE
## region 0.000000 0.04163197 TRUE TRUE
## jobclass 1.061803 0.08326395 FALSE FALSE
## health 2.511696 0.08326395 FALSE FALSE
## health_ins 2.276944 0.08326395 FALSE FALSE
## wage 1.074468 18.40133222 FALSE FALSE
near_zero_covariates <- nearZeroVar(training_set)
head(near_zero_covariates)
## [1] 3 7
if(length(near_zero_covariates) > 0)
{
  training_set2 <- training_set[,-near_zero_covariates]
  test_set2 <- test_set[,-near_zero_covariates]
} else {
  training_set2 <- training_set
  test_set2 <- test_set
}
dim(training_set2); dim(test_set2)
## [1] 2402 9
## [1] 598 9
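# A minimal sketch (same call as above, with caret's defaults written out): a feature
# is flagged as near-zero variance when its most common value is more than freqCut
# times as frequent as the second most common AND fewer than uniqueCut percent of
# its values are unique.
nearZeroVar(training_set, freqCut = 95/5, uniqueCut = 10)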
# exploratory plots
colnames(training_set2)
## [1] "year" "age" "maritl" "race" "education"
## [6] "jobclass" "health" "health_ins" "wage"
featurePlot(x=training_set2[1:8], y=training_set2$wage, plot="pairs")
qplot(x=training_set2$year, y=training_set2$wage ,ylab="wage")
qplot(x=training_set2$age, y=training_set2$wage ,ylab="wage")
qplot(x=training_set2$age, y=training_set2$wage, color=training_set2$education,pch=training_set2$jobclass ,ylab="wage")
# add regression smoother
qq <- qplot(x=training_set2$age, y=training_set2$wage, color=training_set2$education,pch=training_set2$jobclass ,ylab="wage")
qq + geom_smooth(method="lm",formula=y~x)
qplot(x=training_set2$maritl, y=training_set2$wage ,ylab="wage", color=training_set2$age)
qplot(x=training_set2$age, y=training_set2$wage, color=training_set2$maritl ,ylab="wage")
qplot(x=training_set2$race,y=training_set2$wage,color=training_set2$education ,pch=training_set2$jobclass,cex=1,ylab="wage")
qplot(x=training_set2$health,y=training_set2$wage,color=training_set2$health_ins ,pch=training_set2$jobclass,cex=1,ylab="wage")
# correlated features
# all features should be numeric to calculate correlation
sapply(training_set2[1,],class)
## year age maritl race education jobclass
## "integer" "integer" "factor" "factor" "factor" "factor"
## health health_ins wage
## "factor" "factor" "numeric"
numeric_set <- which(sapply(training_set2[1,],class)=="integer")
feature_correlation <- cor(training_set2[,numeric_set])
# findCorrelation searches the correlation matrix and returns the columns that should be removed to reduce pair-wise correlations above the cutoff (names=T returns column names instead of indices).
findCorrelation(feature_correlation,0.75,verbose=T,names=T)
## All correlations <= 0.75
## character(0)
high_correlation <- findCorrelation(feature_correlation,0.75,verbose=T,names=T)
## All correlations <= 0.75
high_correlation
## character(0)
# grep(c(high_correlation[1]),colnames(training_set2))
# grep(c(high_correlation[2]),colnames(training_set2))
if(length(high_correlation) > 0)
{
  # drop the flagged columns by name (findCorrelation was called with names=T)
  training_set3 <- training_set2[, !(colnames(training_set2) %in% high_correlation)]
  test_set3 <- test_set2[, !(colnames(test_set2) %in% high_correlation)]
} else {
  training_set3 <- training_set2
  test_set3 <- test_set2
}
dim(training_set3); dim(test_set3)
## [1] 2402 9
## [1] 598 9
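# Illustration (a sketch on made-up data, not part of the Wage analysis):
# findCorrelation flags one column from each highly correlated pair.
set.seed(2)
demo <- data.frame(x1 = rnorm(100), x3 = rnorm(100))
demo$x2 <- demo$x1 + rnorm(100, sd = 0.01)   # x2 is nearly a copy of x1
findCorrelation(cor(demo), cutoff = 0.75, names = TRUE)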
# PCA
pc <- prcomp(training_set3[,numeric_set],center=T,scale=T)
plot(pc,type="l")
pc$rotation[order(-abs(pc$rotation[,"PC1"])),]
## PC1 PC2
## year -0.7071068 0.7071068
## age -0.7071068 -0.7071068
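# A minimal sketch: the proportion of variance explained by each component can be
# read off the prcomp object (or via summary(pc)).
round(pc$sdev^2 / sum(pc$sdev^2), 3)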
Model Building
# since the correlation check and PCA show no redundant features, all of them are used in the models
# k-fold cross validation
train_control <- trainControl(method="cv", number=10, savePredictions = T)
# example of fixing an algorithm's tuning parameters via a grid (these are naive Bayes parameters and are not used by any model below)
grid <- expand.grid(.fL=c(0), .usekernel=c(FALSE))
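# An alternative resampling scheme (a sketch, not used below): repeated 10-fold
# cross-validation trades extra compute for more stable estimates.
train_control_repeated <- trainControl(method="repeatedcv", number=10, repeats=3, savePredictions=TRUE)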
# linear regression
set.seed(1)
lm_model <- train(wage~., data=training_set3, trControl=train_control, method="lm" ,preProcess=c("scale","center","pca"))
#lm_model
summary(lm_model)
##
## Call:
## lm(formula = .outcome ~ ., data = dat)
##
## Residuals:
## Min 1Q Median 3Q Max
## -99.154 -20.465 -3.968 14.195 215.755
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 111.8594 0.7274 153.789 < 2e-16 ***
## PC1 15.8721 0.5386 29.469 < 2e-16 ***
## PC2 0.5658 0.5952 0.951 0.3419
## PC3 -1.3486 0.6333 -2.130 0.0333 *
## PC4 -3.3924 0.6366 -5.329 1.08e-07 ***
## PC5 -1.0708 0.6649 -1.610 0.1074
## PC6 0.1313 0.7027 0.187 0.8518
## PC7 2.8522 0.7066 4.036 5.60e-05 ***
## PC8 0.5709 0.7132 0.800 0.4235
## PC9 -0.1484 0.7304 -0.203 0.8390
## PC10 -1.0694 0.7374 -1.450 0.1471
## PC11 5.8313 0.7583 7.690 2.13e-14 ***
## PC12 -1.4946 0.7931 -1.885 0.0596 .
## PC13 0.3721 0.8260 0.451 0.6524
## PC14 -4.5201 0.8498 -5.319 1.14e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 35.65 on 2387 degrees of freedom
## Multiple R-squared: 0.2984, Adjusted R-squared: 0.2943
## F-statistic: 72.51 on 14 and 2387 DF, p-value: < 2.2e-16
# lm_model$finalModel
varImp(lm_model)
## lm variable importance
##
## Overall
## PC1 100.00000
## PC11 25.62496
## PC4 17.56131
## PC14 17.52644
## PC7 13.14698
## PC3 6.63467
## PC12 5.79812
## PC5 4.86199
## PC10 4.31496
## PC2 2.60858
## PC8 2.09580
## PC13 0.90073
## PC9 0.05592
## PC6 0.00000
plot(varImp(lm_model))
# re-train model using features that look important
varImp(train(wage~., data=training_set3, trControl=train_control, method="lm" ,preProcess=c("scale","center"))) # pca is excluded from preProcess here because the importance of principal components is hard to interpret
## lm variable importance
##
## Overall
## `education5. Advanced Degree` 100.000
## `health_ins2. No` 64.662
## `education4. College Grad` 64.144
## `maritl2. Married` 52.227
## `education3. Some College` 36.691
## `health2. >=Very Good` 27.513
## year 20.537
## age 18.173
## `education2. HS Grad` 16.232
## `jobclass2. Information` 9.767
## `maritl5. Separated` 8.207
## `race2. Black` 7.951
## `race4. Other` 5.461
## `race3. Asian` 5.131
## `maritl4. Divorced` 1.245
## `maritl3. Widowed` 0.000
set.seed(1)
lm_model2 <- train(wage ~ education + health_ins + jobclass + maritl, data=training_set3, trControl=train_control, method="lm",preProcess=c("scale","center","pca"))
# tree / recursive partitioning
set.seed(1)
rpart_model <- train(wage~., data=training_set3, trControl=train_control, method="rpart" ,preProcess=c("scale","center","pca"))
# plot the fitted regression tree
fancyRpartPlot(rpart_model$finalModel)
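# A minimal sketch: caret tunes rpart over the complexity parameter cp; the selected
# value and the resampling profile can be inspected directly.
rpart_model$bestTune
plot(rpart_model)   # RMSE vs. cp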
# boosting with trees
set.seed(1)
gbm_model <- train(wage~., data=training_set3, trControl=train_control, method = "gbm", verbose=F ,preProcess=c("scale","center","pca"))
## Loading required package: gbm
## Loading required package: survival
##
## Attaching package: 'survival'
## The following object is masked from 'package:caret':
##
## cluster
## Loading required package: splines
## Loading required package: parallel
## Loaded gbm 2.1.1
## Loading required package: plyr
summary(train(wage~., data=training_set3, trControl=train_control, method = "gbm", verbose=F ,preProcess=c("scale","center")))
## var rel.inf
## education5. Advanced Degree education5. Advanced Degree 31.26189167
## health_ins2. No health_ins2. No 17.03002997
## age age 14.28736109
## education4. College Grad education4. College Grad 12.06047761
## maritl2. Married maritl2. Married 9.57699526
## year year 4.10832632
## health2. >=Very Good health2. >=Very Good 3.76809844
## education2. HS Grad education2. HS Grad 3.13680596
## education3. Some College education3. Some College 2.00228073
## jobclass2. Information jobclass2. Information 1.43221319
## race3. Asian race3. Asian 0.65102836
## race2. Black race2. Black 0.35992125
## race4. Other race4. Other 0.23479440
## maritl4. Divorced maritl4. Divorced 0.08977574
## maritl3. Widowed maritl3. Widowed 0.00000000
## maritl5. Separated maritl5. Separated 0.00000000
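# A sketch (assumed tuning values, not run above): the default gbm search can be
# replaced with an explicit grid via tuneGrid.
gbm_grid <- expand.grid(n.trees = c(50, 100, 150),
                        interaction.depth = c(1, 2, 3),
                        shrinkage = 0.1,
                        n.minobsinnode = 10)
# gbm_model_tuned <- train(wage~., data=training_set3, trControl=train_control,
#                          method="gbm", tuneGrid=gbm_grid, verbose=F,
#                          preProcess=c("scale","center","pca"))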
# random forest; this can take a while when the number of CV folds is large
set.seed(1)
rf_model <- train(wage~., data=training_set3, trControl=train_control, method="rf" ,preProcess=c("scale","center","pca"))
## Loading required package: randomForest
## randomForest 4.6-12
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:ggplot2':
##
## margin
## Warning in randomForest.default(x, y, mtry = param$mtry, ...): invalid
## mtry: reset to within valid range
## (the same warning is repeated for each resample/tuning combination)
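# The mtry warnings above most likely arise because caret builds its default mtry
# grid from the number of predictors before preprocessing, while PCA leaves fewer
# columns; randomForest then resets the out-of-range values. A sketch of an explicit
# grid that stays within range (the values are assumptions):
# rf_grid <- expand.grid(mtry = c(2, 3, 5))
# rf_model2 <- train(wage~., data=training_set3, trControl=train_control,
#                    method="rf", tuneGrid=rf_grid, preProcess=c("scale","center","pca"))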
# lasso - regularized regression
set.seed(1)
lasso_model <- train(wage~., data=training_set3, trControl=train_control, method="lasso", metric="RMSE" ,preProcess=c("scale","center","pca"))
## Loading required package: elasticnet
## Loading required package: lars
## Loaded lars 1.2
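# A minimal sketch: method="lasso" is tuned over the fraction of the full coefficient
# path; the chosen value and profile can be inspected as with any train object.
lasso_model$bestTune
plot(lasso_model)   # RMSE vs. fraction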
# collect resamples
train_results <- resamples(list(LM=lm_model,LM2=lm_model2,RPART=rpart_model,GBM=gbm_model,RF=rf_model,LS=lasso_model))
# summarize the distributions
summary(train_results)
##
## Call:
## summary.resamples(object = train_results)
##
## Models: LM, LM2, RPART, GBM, RF, LS
## Number of resamples: 10
##
## RMSE
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## LM 32.58 34.09 35.27 35.67 37.35 39.51 0
## LM2 33.07 34.34 35.50 35.93 37.61 39.92 0
## RPART 34.52 35.64 36.07 37.14 38.65 42.68 0
## GBM 32.35 33.55 34.65 35.45 37.20 40.35 0
## RF 33.66 35.00 35.64 36.46 38.14 40.17 0
## LS 32.73 34.08 35.25 35.67 37.33 39.58 0
##
## Rsquared
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## LM 0.18620 0.2642 0.3002 0.2949 0.3274 0.3747 0
## LM2 0.17000 0.2638 0.2937 0.2848 0.3080 0.3675 0
## RPART 0.08589 0.2189 0.2365 0.2383 0.2909 0.3217 0
## GBM 0.17550 0.2646 0.3162 0.3059 0.3490 0.4123 0
## RF 0.19990 0.2517 0.2667 0.2747 0.3088 0.3275 0
## LS 0.18190 0.2655 0.3064 0.2951 0.3262 0.3775 0
# boxplots of results
bwplot(train_results)
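# A sketch: resamples objects also support paired comparisons; diff() tests the
# fold-wise differences in RMSE and R-squared between models.
summary(diff(train_results))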
# the above results suggest that the GBM model performs best on the training data.
Evaluate Model Accuracy on Test Set
# Ideally, you select the model that performs best on the training data and evaluate only that one on the test set; all models are evaluated here just for illustration.
par(mfrow=c(2,2))
test_pred_lm <- predict(lm_model, newdata=test_set3)
R2(test_set3$wage,test_pred_lm)
## [1] 0.3581127
RMSE(test_set3$wage,test_pred_lm)
## [1] 31.05722
plot(test_pred_lm,test_set3$wage)
test_pred_lm2 <- predict(lm_model2, newdata=test_set3)
R2(test_set3$wage,test_pred_lm2)
## [1] 0.3594469
RMSE(test_set3$wage,test_pred_lm2)
## [1] 31.03226
plot(test_pred_lm2,test_set3$wage)
test_pred_rpart <- predict(rpart_model, newdata=test_set3)
R2(test_set3$wage,test_pred_rpart)
## [1] 0.2684005
RMSE(test_set3$wage,test_pred_rpart)
## [1] 33.16159
plot(test_pred_rpart,test_set3$wage)
test_pred_gbm <- predict(gbm_model, newdata=test_set3)
R2(test_set3$wage,test_pred_gbm)
## [1] 0.354009
RMSE(test_set3$wage,test_pred_gbm)
## [1] 31.18928
plot(test_pred_gbm,test_set3$wage)
test_pred_rf <- predict(rf_model, newdata=test_set3)
R2(test_set3$wage,test_pred_rf)
## [1] 0.3035356
RMSE(test_set3$wage,test_pred_rf)
## [1] 32.65215
plot(test_pred_rf,test_set3$wage)
test_pred_lasso <- predict(lasso_model, newdata=test_set3)
R2(test_set3$wage,test_pred_lasso)
## [1] 0.3562957
RMSE(test_set3$wage,test_pred_lasso)
## [1] 31.11336
plot(test_pred_lasso,test_set3$wage)
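# A minimal sketch: collect the test-set metrics above into one table for a
# side-by-side comparison (the model names are just labels).
test_results <- data.frame(
  model = c("LM","LM2","RPART","GBM","RF","LASSO"),
  RMSE  = c(RMSE(test_set3$wage,test_pred_lm),   RMSE(test_set3$wage,test_pred_lm2),
            RMSE(test_set3$wage,test_pred_rpart),RMSE(test_set3$wage,test_pred_gbm),
            RMSE(test_set3$wage,test_pred_rf),   RMSE(test_set3$wage,test_pred_lasso)),
  R2    = c(R2(test_set3$wage,test_pred_lm),   R2(test_set3$wage,test_pred_lm2),
            R2(test_set3$wage,test_pred_rpart),R2(test_set3$wage,test_pred_gbm),
            R2(test_set3$wage,test_pred_rf),   R2(test_set3$wage,test_pred_lasso)))
test_results[order(test_results$RMSE),]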