Download data, load packages, load data into memory, add column names

library(caret)
## Loading required package: lattice
## Loading required package: ggplot2
# Download the Boston housing data
url.train <- "http://archive.ics.uci.edu/ml/machine-learning-databases/housing/housing.data"
url.names <- "http://archive.ics.uci.edu/ml/machine-learning-databases/housing/housing.names"
download.file(url.train, destfile = "HousingValues.csv")
download.file(url.names, destfile = "HousingFeatureNames.txt")

# Read the housing data into memory
train <- read.table("HousingValues.csv")
# Feature names can be found by referring to the downloaded names .txt file above
names(train) <- c("CRIM",      #per capita crime rate by town
                  "ZN",        #proportion of residential land zoned for lots over 25,000 sq.ft.
                  "INDUS",     #proportion of non-retail business acres per town
                  "CHAS",      #Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)
                  "NOX",       #nitric oxides concentration (parts per 10 million)
                  "RM",        #average number of rooms per dwelling
                  "AGE",       #proportion of owner-occupied units built prior to 1940
                  "DIS",       #weighted distances to five Boston employment centres
                  "RAD",       #index of accessibility to radial highways
                  "TAX",       #full-value property-tax rate per $10,000
                  "PTRATIO",   #pupil-teacher ratio by town
                  "AA",        #1000(AA - 0.63)^2 where AA is the proportion of African Americans by town
                  "LSTAT",     #% lower status of the population
                  "MEDV")      #Median value of owner-occupied homes in $1000's

How to select features

Let’s start nice and easy with a no-frills OLS linear model:

model.lm <- train(MEDV ~ .,
                  data = train,
                  method = "lm")
print(model.lm$finalModel)
## 
## Call:
## lm(formula = .outcome ~ ., data = dat)
## 
## Coefficients:
## (Intercept)         CRIM           ZN        INDUS         CHAS  
##   3.646e+01   -1.080e-01    4.642e-02    2.056e-02    2.687e+00  
##         NOX           RM          AGE          DIS          RAD  
##  -1.777e+01    3.810e+00    6.922e-04   -1.476e+00    3.060e-01  
##         TAX      PTRATIO           AA        LSTAT  
##  -1.233e-02   -9.527e-01    9.312e-03   -5.248e-01

Compare results to what you get from using lm():

model <- lm(MEDV ~ .,
            data = train)
print(model)
## 
## Call:
## lm(formula = MEDV ~ ., data = train)
## 
## Coefficients:
## (Intercept)         CRIM           ZN        INDUS         CHAS  
##   3.646e+01   -1.080e-01    4.642e-02    2.056e-02    2.687e+00  
##         NOX           RM          AGE          DIS          RAD  
##  -1.777e+01    3.810e+00    6.922e-04   -1.476e+00    3.060e-01  
##         TAX      PTRATIO           AA        LSTAT  
##  -1.233e-02   -9.527e-01    9.312e-03   -5.248e-01

One thing worth considering is correlated predictors. Including multiple highly correlated features adds little explanatory power beyond including just one of them, but it inflates the variance of the coefficient estimates, so let's find highly correlated predictors as our first pass:

correlations <- cor(train[,1:13])
print(correlations)
##                CRIM          ZN       INDUS         CHAS         NOX
## CRIM     1.00000000 -0.20046922  0.40658341 -0.055891582  0.42097171
## ZN      -0.20046922  1.00000000 -0.53382819 -0.042696719 -0.51660371
## INDUS    0.40658341 -0.53382819  1.00000000  0.062938027  0.76365145
## CHAS    -0.05589158 -0.04269672  0.06293803  1.000000000  0.09120281
## NOX      0.42097171 -0.51660371  0.76365145  0.091202807  1.00000000
## RM      -0.21924670  0.31199059 -0.39167585  0.091251225 -0.30218819
## AGE      0.35273425 -0.56953734  0.64477851  0.086517774  0.73147010
## DIS     -0.37967009  0.66440822 -0.70802699 -0.099175780 -0.76923011
## RAD      0.62550515 -0.31194783  0.59512927 -0.007368241  0.61144056
## TAX      0.58276431 -0.31456332  0.72076018 -0.035586518  0.66802320
## PTRATIO  0.28994558 -0.39167855  0.38324756 -0.121515174  0.18893268
## AA      -0.38506394  0.17552032 -0.35697654  0.048788485 -0.38005064
## LSTAT    0.45562148 -0.41299457  0.60379972 -0.053929298  0.59087892
##                  RM         AGE         DIS          RAD         TAX
## CRIM    -0.21924670  0.35273425 -0.37967009  0.625505145  0.58276431
## ZN       0.31199059 -0.56953734  0.66440822 -0.311947826 -0.31456332
## INDUS   -0.39167585  0.64477851 -0.70802699  0.595129275  0.72076018
## CHAS     0.09125123  0.08651777 -0.09917578 -0.007368241 -0.03558652
## NOX     -0.30218819  0.73147010 -0.76923011  0.611440563  0.66802320
## RM       1.00000000 -0.24026493  0.20524621 -0.209846668 -0.29204783
## AGE     -0.24026493  1.00000000 -0.74788054  0.456022452  0.50645559
## DIS      0.20524621 -0.74788054  1.00000000 -0.494587930 -0.53443158
## RAD     -0.20984667  0.45602245 -0.49458793  1.000000000  0.91022819
## TAX     -0.29204783  0.50645559 -0.53443158  0.910228189  1.00000000
## PTRATIO -0.35550149  0.26151501 -0.23247054  0.464741179  0.46085304
## AA       0.12806864 -0.27353398  0.29151167 -0.444412816 -0.44180801
## LSTAT   -0.61380827  0.60233853 -0.49699583  0.488676335  0.54399341
##            PTRATIO          AA      LSTAT
## CRIM     0.2899456 -0.38506394  0.4556215
## ZN      -0.3916785  0.17552032 -0.4129946
## INDUS    0.3832476 -0.35697654  0.6037997
## CHAS    -0.1215152  0.04878848 -0.0539293
## NOX      0.1889327 -0.38005064  0.5908789
## RM      -0.3555015  0.12806864 -0.6138083
## AGE      0.2615150 -0.27353398  0.6023385
## DIS     -0.2324705  0.29151167 -0.4969958
## RAD      0.4647412 -0.44441282  0.4886763
## TAX      0.4608530 -0.44180801  0.5439934
## PTRATIO  1.0000000 -0.17738330  0.3740443
## AA      -0.1773833  1.00000000 -0.3660869
## LSTAT    0.3740443 -0.36608690  1.0000000
highCorrelations <- findCorrelation(correlations, cutoff = .75, verbose = TRUE)
## Compare row 3  and column  5 with corr  0.764 
##   Means:  0.514 vs 0.38 so flagging column 3 
## Compare row 5  and column  8 with corr  0.769 
##   Means:  0.479 vs 0.359 so flagging column 5 
## Compare row 10  and column  9 with corr  0.91 
##   Means:  0.462 vs 0.334 so flagging column 10 
## All correlations <= 0.75
print(highCorrelations)
## [1]  3  5 10

Carefully read the output from the findCorrelation() function call: it wants to eliminate columns 3, 5, and 10. But it flags 3 because it is correlated with 5, and 5 because it is correlated with 8. If we remove column 5, column 3 won't be strongly correlated with anything anymore! It is important to watch what your code is doing; in this case it doesn't look like column 3 needs to be removed.

highCorrelations <- highCorrelations[-1]
print(highCorrelations)
## [1]  5 10
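
As a quick check on that reasoning, look at the largest remaining correlation for column 3 (INDUS) once column 5 (NOX) is excluded; from the matrix printed above it is about 0.72 (with TAX), safely below the 0.75 cutoff:

max(abs(correlations[3, -c(3, 5)]))   # largest |correlation| for INDUS, ignoring itself and NOX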

Another feature selection aid is available in the caret package: varImp(), which scores the importance of each feature in a fitted model (for classification models caret uses ROC curve analysis; for a linear model like this one it uses the absolute value of each coefficient's t-statistic). It is called on a caret model:

plot(varImp(model.lm))
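
If you prefer numbers to a plot, you can also print the varImp() object directly to see the scaled importance scores (output omitted here):

varImp(model.lm)   # prints the importance scores, largest first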

What happens when we remove the highly correlated variables (5 and 10)?

train <- train[,-highCorrelations]
model.lm <- train(MEDV ~ .,
                  data = train,
                  method = "lm")
plot(varImp(model.lm))
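
To see whether dropping those two predictors hurt predictive performance, you can also inspect the resampled error estimates that caret stored for the refit (output omitted here):

print(model.lm$results)   # resampled RMSE and R-squared for the reduced model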

The caret package has another nice prepackaged methodology for feature selection: rfe(), which performs Recursive Feature Elimination (its resampling behaviour is set with rfeControl()). The code works as follows:

set.seed(134)
control <- rfeControl(functions=rfFuncs, method="cv", number=10)
results <- rfe(train[,1:11], train[,12], sizes=c(1:11), rfeControl=control)
## Loading required package: randomForest
## randomForest 4.6-12
## Type rfNews() to see new features/changes/bug fixes.
print(results)
## 
## Recursive feature selection
## 
## Outer resampling method: Cross-Validated (10 fold) 
## 
## Resampling performance over subset size:
## 
##  Variables  RMSE Rsquared RMSESD RsquaredSD Selected
##          1 6.793   0.4874 0.9143    0.11753         
##          2 4.565   0.7523 0.7908    0.08345         
##          3 4.005   0.8137 0.4626    0.06017         
##          4 3.857   0.8280 0.5449    0.06210         
##          5 3.707   0.8451 0.5510    0.05308         
##          6 3.388   0.8682 0.4232    0.03973         
##          7 3.399   0.8689 0.4274    0.04281         
##          8 3.418   0.8695 0.4624    0.04352         
##          9 3.293   0.8772 0.4246    0.03896        *
##         10 3.340   0.8746 0.4419    0.04130         
##         11 3.423   0.8682 0.5038    0.04653         
## 
## The top 5 variables (out of 9):
##    RM, LSTAT, DIS, CRIM, PTRATIO
predictors(results)
## [1] "RM"      "LSTAT"   "DIS"     "CRIM"    "PTRATIO" "AGE"     "INDUS"  
## [8] "AA"      "RAD"
plot(results, type=c("g", "o"))

The lowest RMSE occurs at 9 predictors, but it is only slightly better than what was attained with 6, so it might still be better to include only 6 in the final model. The 9 suggested predictors are listed by predictors(results).
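
If you decide that 6 predictors is the better trade-off, one possible follow-up (a sketch, not part of the original analysis; top6 and model.lm.top6 are names introduced here for illustration) is to refit a linear model on the 6 top-ranked RFE variables, since predictors(results) lists them in importance order:

top6 <- predictors(results)[1:6]      # RM, LSTAT, DIS, CRIM, PTRATIO, AGE
model.lm.top6 <- train(x = train[, top6],
                       y = train$MEDV,
                       method = "lm")
print(model.lm.top6)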

Note that the top 5 features from the RFE methodology aren't quite the same as those from varImp()! The most appropriate selection of features is ultimately a design decision you have to make based on which set seems best for prediction. These methodologies are simply ways to make that decision in an informed way, but at the end of the day it's up to YOU to make sure that your selection is the best it can be.

Lasso for feature selection

The regularized regression technique called the “lasso” is essentially an OLS regression with a constraint applied to the betas, namely that the sum of the absolute values of all of the betas is less than some parameter. This will be discussed in greater detail in another example.
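
In symbols, that constrained problem is usually written as (a standard formulation, not tied to any particular R implementation):

$$\hat{\beta} = \underset{\beta}{\arg\min} \sum_{i=1}^{n}\Big(y_i - \beta_0 - \sum_{j=1}^{p} x_{ij}\beta_j\Big)^2 \quad \text{subject to} \quad \sum_{j=1}^{p} \lvert \beta_j \rvert \le t$$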

train <- read.table("HousingValues.csv")

names(train) <- c("CRIM",      #per capita crime rate by town
                  "ZN",        #proportion of residential land zoned for lots over 25,000 sq.ft.
                  "INDUS",     #proportion of non-retail business acres per town
                  "CHAS",      #Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)
                  "NOX",       #nitric oxides concentration (parts per 10 million)
                  "RM",        #average number of rooms per dwelling
                  "AGE",       #proportion of owner-occupied units built prior to 1940
                  "DIS",       #weighted distances to five Boston employment centres
                  "RAD",       #index of accessibility to radial highways
                  "TAX",       #full-value property-tax rate per $10,000
                  "PTRATIO",   #pupil-teacher ratio by town
                  "AA",        #1000(AA - 0.63)^2 where AA is the proportion of African Americans by town
                  "LSTAT",     #% lower status of the population
                  "MEDV")      #Median value of owner-occupied homes in $1000's
train <- train[,-9]      # drop RAD (column 9); the correlation matrix above showed RAD and TAX correlated at 0.91
model.lasso <- train(MEDV ~ .,
                     data = train,
                     method = "lasso")
## Loading required package: elasticnet
## Loading required package: lars
## Loaded lars 1.2
print(model.lasso$finalModel)
## 
## Call:
## enet(x = as.matrix(x), y = y, lambda = 0)
## Cp statistics of the Lasso fit 
## Cp: 1318.049 1047.460  410.051  140.108  118.319   88.952   81.683   69.501   41.919   23.733   10.226   11.712   13.000 
## DF:  1  2  3  4  5  6  7  8  9 10 11 12 13 
## Sequence of  moves:
##      LSTAT RM PTRATIO AA CHAS CRIM DIS NOX ZN INDUS AGE TAX   
## Var     12  6      10 11    4    1   8   5  2     3   7   9 13
## Step     1  2       3  4    5    6   7   8  9    10  11  12 13
plot(model.lasso$finalModel, xvar = "penalty")

This plot shows which variables are driven to zero by the lasso as lambda (the penalty parameter) increases (forgive the missing labels! The important ones are still visible). You can also see the order in which variables enter the model, a rough ranking of importance, by looking at the “Step” row of the final-model printout above the plot.
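
If the plot is hard to read, you can also pull coefficient values directly from the fitted enet object with predict(); here is a sketch, where s = 0.5 (halfway along the penalty path, with mode = "fraction") is just an illustrative choice:

# Coefficients at a chosen point along the lasso path
coefs <- predict(model.lasso$finalModel,
                 type = "coefficients",
                 mode = "fraction",
                 s = 0.5)
print(coefs$coefficients)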

Which variables get set to zero last using the lasso approach (i.e. are the most important)? How do these compare to what we found using the first two methods?