library(caret)
## Warning: package 'caret' was built under R version 3.4.2
## Loading required package: lattice
## Loading required package: ggplot2
# Remote locations of the housing data set and of its description file.
url.train <- "http://archive.ics.uci.edu/ml/machine-learning-databases/housing/housing.data"
url.names <- "http://archive.ics.uci.edu/ml/machine-learning-databases/housing/housing.names"

# Fetch both files into the current working directory.
download.file(url = url.train, destfile = "HousingValues.csv")
download.file(url = url.names, destfile = "HousingFeatureNames.txt")

# Read the downloaded data file (whitespace-separated, no header row) and label
# its 14 columns; MEDV, the last column, is the outcome modelled below.
# Note: this data frame shares its name with caret's train() function. Calls
# such as train(...) still resolve to the function, because R skips
# non-function objects when looking up a name in call position.
train <- setNames(
  read.table("HousingValues.csv"),
  c("CRIM", "ZN", "INDUS", "CHAS", "NOX", "RM", "AGE",
    "DIS", "RAD", "TAX", "PTRATIO", "AA", "LSTAT", "MEDV")
)

# Method 1: linear relationships.
# Fit a linear regression of MEDV on every other column through caret, then
# show the coefficients of the underlying fitted model.
model.lm <- train(MEDV ~ ., data = train, method = "lm")
print(model.lm[["finalModel"]])
## 
## Call:
## lm(formula = .outcome ~ ., data = dat)
## 
## Coefficients:
## (Intercept)         CRIM           ZN        INDUS         CHAS  
##   3.646e+01   -1.080e-01    4.642e-02    2.056e-02    2.687e+00  
##         NOX           RM          AGE          DIS          RAD  
##  -1.777e+01    3.810e+00    6.922e-04   -1.476e+00    3.060e-01  
##         TAX      PTRATIO           AA        LSTAT  
##  -1.233e-02   -9.527e-01    9.312e-03   -5.248e-01
# Fit the same regression directly with lm() to compare its coefficients with
# the caret fit above.
model <- lm(MEDV ~ ., data = train)
print(model)
## 
## Call:
## lm(formula = MEDV ~ ., data = train)
## 
## Coefficients:
## (Intercept)         CRIM           ZN        INDUS         CHAS  
##   3.646e+01   -1.080e-01    4.642e-02    2.056e-02    2.687e+00  
##         NOX           RM          AGE          DIS          RAD  
##  -1.777e+01    3.810e+00    6.922e-04   -1.476e+00    3.060e-01  
##         TAX      PTRATIO           AA        LSTAT  
##  -1.233e-02   -9.527e-01    9.312e-03   -5.248e-01
# Pairwise correlations between the 13 predictor columns (the 14th column,
# MEDV, is the outcome and is left out).
correlations <- cor(x = train[seq_len(13)])
print(correlations)
##                CRIM          ZN       INDUS         CHAS         NOX
## CRIM     1.00000000 -0.20046922  0.40658341 -0.055891582  0.42097171
## ZN      -0.20046922  1.00000000 -0.53382819 -0.042696719 -0.51660371
## INDUS    0.40658341 -0.53382819  1.00000000  0.062938027  0.76365145
## CHAS    -0.05589158 -0.04269672  0.06293803  1.000000000  0.09120281
## NOX      0.42097171 -0.51660371  0.76365145  0.091202807  1.00000000
## RM      -0.21924670  0.31199059 -0.39167585  0.091251225 -0.30218819
## AGE      0.35273425 -0.56953734  0.64477851  0.086517774  0.73147010
## DIS     -0.37967009  0.66440822 -0.70802699 -0.099175780 -0.76923011
## RAD      0.62550515 -0.31194783  0.59512927 -0.007368241  0.61144056
## TAX      0.58276431 -0.31456332  0.72076018 -0.035586518  0.66802320
## PTRATIO  0.28994558 -0.39167855  0.38324756 -0.121515174  0.18893268
## AA      -0.38506394  0.17552032 -0.35697654  0.048788485 -0.38005064
## LSTAT    0.45562148 -0.41299457  0.60379972 -0.053929298  0.59087892
##                  RM         AGE         DIS          RAD         TAX
## CRIM    -0.21924670  0.35273425 -0.37967009  0.625505145  0.58276431
## ZN       0.31199059 -0.56953734  0.66440822 -0.311947826 -0.31456332
## INDUS   -0.39167585  0.64477851 -0.70802699  0.595129275  0.72076018
## CHAS     0.09125123  0.08651777 -0.09917578 -0.007368241 -0.03558652
## NOX     -0.30218819  0.73147010 -0.76923011  0.611440563  0.66802320
## RM       1.00000000 -0.24026493  0.20524621 -0.209846668 -0.29204783
## AGE     -0.24026493  1.00000000 -0.74788054  0.456022452  0.50645559
## DIS      0.20524621 -0.74788054  1.00000000 -0.494587930 -0.53443158
## RAD     -0.20984667  0.45602245 -0.49458793  1.000000000  0.91022819
## TAX     -0.29204783  0.50645559 -0.53443158  0.910228189  1.00000000
## PTRATIO -0.35550149  0.26151501 -0.23247054  0.464741179  0.46085304
## AA       0.12806864 -0.27353398  0.29151167 -0.444412816 -0.44180801
## LSTAT   -0.61380827  0.60233853 -0.49699583  0.488676335  0.54399341
##            PTRATIO          AA      LSTAT
## CRIM     0.2899456 -0.38506394  0.4556215
## ZN      -0.3916785  0.17552032 -0.4129946
## INDUS    0.3832476 -0.35697654  0.6037997
## CHAS    -0.1215152  0.04878848 -0.0539293
## NOX      0.1889327 -0.38005064  0.5908789
## RM      -0.3555015  0.12806864 -0.6138083
## AGE      0.2615150 -0.27353398  0.6023385
## DIS     -0.2324705  0.29151167 -0.4969958
## RAD      0.4647412 -0.44441282  0.4886763
## TAX      0.4608530 -0.44180801  0.5439934
## PTRATIO  1.0000000 -0.17738330  0.3740443
## AA      -0.1773833  1.00000000 -0.3660869
## LSTAT    0.3740443 -0.36608690  1.0000000
# Ask caret which predictor columns to drop so that no remaining pair has an
# absolute correlation above 0.75; verbose = TRUE logs each comparison.
highCorrelations <- findCorrelation(x = correlations,
                                    cutoff = 0.75,
                                    verbose = TRUE)
## Compare row 3  and column  5 with corr  0.764 
##   Means:  0.514 vs 0.38 so flagging column 3 
## Compare row 5  and column  8 with corr  0.769 
##   Means:  0.479 vs 0.359 so flagging column 5 
## Compare row 10  and column  9 with corr  0.91 
##   Means:  0.462 vs 0.334 so flagging column 10 
## All correlations <= 0.75
print(highCorrelations)
## [1]  3  5 10
# Discard the first flagged index (3, i.e. INDUS) so that only the remaining
# flagged columns (5 = NOX, 10 = TAX) are removed later on.
highCorrelations <- tail(highCorrelations, -1)
print(highCorrelations)
## [1]  5 10
# Method 2: variable importance of the fitted linear model.
# For an "lm" fit, caret's varImp() ranks predictors by the absolute value of
# their t-statistic; ROC-based importance only applies to classification
# problems, so no ROC curve is involved here.
library(caret)
plot(varImp(model.lm))

# Remove the predictors flagged as highly correlated, then refit the model and
# plot the importances again for comparison.
# The length guard matters: with an empty index vector, train[, -integer(0)]
# would select zero columns and silently discard the whole data set.
# drop = FALSE keeps the result a data frame whatever the number of columns.
if (length(highCorrelations) > 0) {
  train <- train[, -highCorrelations, drop = FALSE]
}
model.lm <- train(MEDV ~ .,
                  data = train,
                  method = "lm")
plot(varImp(model.lm))

# Method 3: recursive feature elimination (RFE).
# Seed the RNG so the cross-validation folds, and therefore the selected
# subset, are reproducible.
set.seed(123)
# rfFuncs supplies caret's random-forest helper functions for RFE; candidate
# subsets are scored with 10-fold cross-validation.
control <- rfeControl(functions = rfFuncs, method = "cv", number = 10)
# Select predictors and outcome by name instead of by position: fixed indices
# (columns 1:11 and 12) are only right when exactly two columns were removed
# earlier, and would otherwise silently use the wrong outcome or leak MEDV
# into the predictors. Every subset size from 1 to the number of predictors
# is evaluated.
results <- rfe(train[, names(train) != "MEDV", drop = FALSE],
               train[["MEDV"]],
               sizes = seq_len(ncol(train) - 1),
               rfeControl = control)
print(results)
## 
## Recursive feature selection
## 
## Outer resampling method: Cross-Validated (10 fold) 
## 
## Resampling performance over subset size:
## 
##  Variables  RMSE Rsquared   MAE RMSESD RsquaredSD  MAESD Selected
##          1 6.784   0.4736 4.870 1.4602    0.17933 0.9163         
##          2 4.562   0.7502 3.136 0.9881    0.10494 0.3747         
##          3 4.170   0.7885 2.822 0.9418    0.10313 0.3850         
##          4 3.767   0.8307 2.571 0.9983    0.09109 0.4135         
##          5 3.596   0.8530 2.474 0.7956    0.06318 0.3241         
##          6 3.320   0.8726 2.315 0.6614    0.04701 0.2773         
##          7 3.357   0.8710 2.309 0.7188    0.05233 0.3031         
##          8 3.366   0.8721 2.334 0.8076    0.05942 0.3558         
##          9 3.248   0.8798 2.259 0.6785    0.04741 0.2903        *
##         10 3.275   0.8784 2.273 0.7365    0.05176 0.3251         
##         11 3.336   0.8745 2.299 0.7964    0.05724 0.3482         
## 
## The top 5 variables (out of 9):
##    RM, LSTAT, DIS, CRIM, PTRATIO
# Names of the predictors kept in the selected subset.
predictors(results)
## [1] "RM"      "LSTAT"   "DIS"     "CRIM"    "PTRATIO" "AGE"     "INDUS"  
## [8] "AA"      "RAD"
# Resampled performance as a function of subset size (grid plus points/lines).
plot(results, type = c("g", "o"))

# Method 4: feature selection with lasso regression.
# Reload the complete data set, because `train` had columns removed in place
# by the earlier steps.
train <- read.table("HousingValues.csv")
names(train) <- c("CRIM", "ZN", "INDUS", "CHAS", "NOX", "RM", "AGE",
                  "DIS", "RAD", "TAX", "PTRATIO", "AA", "LSTAT", "MEDV")
# Drop RAD by name rather than by position (it is column 9), so the step keeps
# working if the column order ever changes.
# NOTE(review): RAD is presumably removed because of its 0.91 correlation
# with TAX -- confirm the intent.
train <- train[, names(train) != "RAD", drop = FALSE]
# train() chooses the lasso tuning parameter by resampling; seed the RNG (same
# seed as the RFE step) so that the selection is reproducible between runs.
set.seed(123)
model.lasso <- train(MEDV ~ .,
                     data = train,
                     method = "lasso")
## Loading required package: lars
## Loaded lars 1.2
print(model.lasso$finalModel)
## 
## Call:
## elasticnet::enet(x = as.matrix(x), y = y, lambda = 0)
## Cp statistics of the Lasso fit 
## Cp: 1318.049 1047.460  410.051  140.108  118.319   88.952   81.683   69.501   41.919   23.733   10.226   11.712   13.000 
## DF:  1  2  3  4  5  6  7  8  9 10 11 12 13 
## Sequence of  moves:
##      LSTAT RM PTRATIO AA CHAS CRIM DIS NOX ZN INDUS AGE TAX   
## Var     12  6      10 11    4    1   8   5  2     3   7   9 13
## Step     1  2       3  4    5    6   7   8  9    10  11  12 13
# Coefficient paths of the final model plotted against the penalty.
plot(model.lasso$finalModel, xvar = "penalty")