library(caret)
## Warning: package 'caret' was built under R version 3.4.2
## Loading required package: lattice
## Loading required package: ggplot2
# Source locations of the UCI housing data file and its feature description.
url.train <- "http://archive.ics.uci.edu/ml/machine-learning-databases/housing/housing.data"
url.names <- "http://archive.ics.uci.edu/ml/machine-learning-databases/housing/housing.names"

# Download each file only when there is no non-empty local copy, so re-running
# the script does not hit the network again. file.size() is NA for a missing
# file, which isTRUE() turns into "needs download".
if (!isTRUE(file.size("HousingValues.csv") > 0)) {
  download.file(url.train, destfile = "HousingValues.csv")
}
if (!isTRUE(file.size("HousingFeatureNames.txt") > 0)) {
  download.file(url.names, destfile = "HousingFeatureNames.txt")
}

# Read with read.table's defaults (whitespace-delimited, no header row).
# Supplying the names at read time makes read.table raise an error when the
# file's column count does not match, instead of silently mislabelling columns.
train <- read.table(
  "HousingValues.csv",
  col.names = c(
    "CRIM", "ZN", "INDUS", "CHAS", "NOX", "RM", "AGE",
    "DIS", "RAD", "TAX", "PTRATIO", "AA", "LSTAT", "MEDV"
  )
)
# method 1: linear correlation
# Begin with a linear model of MEDV on every other column, fitted through
# caret, and show the underlying fitted model object.
model.lm <- train(MEDV ~ ., data = train, method = "lm")
print(model.lm$finalModel)
##
## Call:
## lm(formula = .outcome ~ ., data = dat)
##
## Coefficients:
## (Intercept) CRIM ZN INDUS CHAS
## 3.646e+01 -1.080e-01 4.642e-02 2.056e-02 2.687e+00
## NOX RM AGE DIS RAD
## -1.777e+01 3.810e+00 6.922e-04 -1.476e+00 3.060e-01
## TAX PTRATIO AA LSTAT
## -1.233e-02 -9.527e-01 9.312e-03 -5.248e-01
# Same regression fitted directly with lm(), for comparison with the
# coefficients of the caret fit.
model <- lm(formula = MEDV ~ ., data = train)
print(model)
##
## Call:
## lm(formula = MEDV ~ ., data = train)
##
## Coefficients:
## (Intercept) CRIM ZN INDUS CHAS
## 3.646e+01 -1.080e-01 4.642e-02 2.056e-02 2.687e+00
## NOX RM AGE DIS RAD
## -1.777e+01 3.810e+00 6.922e-04 -1.476e+00 3.060e-01
## TAX PTRATIO AA LSTAT
## -1.233e-02 -9.527e-01 9.312e-03 -5.248e-01
# Pairwise correlations between the predictors only. The response MEDV is
# excluded by name rather than by assuming it sits in column 14, so the
# matrix stays correct if the column layout of `train` changes. setdiff()
# keeps the original column order, so matrix positions still line up with
# the leading columns of `train`.
correlations <- cor(train[, setdiff(names(train), "MEDV"), drop = FALSE])
print(correlations)
## CRIM ZN INDUS CHAS NOX
## CRIM 1.00000000 -0.20046922 0.40658341 -0.055891582 0.42097171
## ZN -0.20046922 1.00000000 -0.53382819 -0.042696719 -0.51660371
## INDUS 0.40658341 -0.53382819 1.00000000 0.062938027 0.76365145
## CHAS -0.05589158 -0.04269672 0.06293803 1.000000000 0.09120281
## NOX 0.42097171 -0.51660371 0.76365145 0.091202807 1.00000000
## RM -0.21924670 0.31199059 -0.39167585 0.091251225 -0.30218819
## AGE 0.35273425 -0.56953734 0.64477851 0.086517774 0.73147010
## DIS -0.37967009 0.66440822 -0.70802699 -0.099175780 -0.76923011
## RAD 0.62550515 -0.31194783 0.59512927 -0.007368241 0.61144056
## TAX 0.58276431 -0.31456332 0.72076018 -0.035586518 0.66802320
## PTRATIO 0.28994558 -0.39167855 0.38324756 -0.121515174 0.18893268
## AA -0.38506394 0.17552032 -0.35697654 0.048788485 -0.38005064
## LSTAT 0.45562148 -0.41299457 0.60379972 -0.053929298 0.59087892
## RM AGE DIS RAD TAX
## CRIM -0.21924670 0.35273425 -0.37967009 0.625505145 0.58276431
## ZN 0.31199059 -0.56953734 0.66440822 -0.311947826 -0.31456332
## INDUS -0.39167585 0.64477851 -0.70802699 0.595129275 0.72076018
## CHAS 0.09125123 0.08651777 -0.09917578 -0.007368241 -0.03558652
## NOX -0.30218819 0.73147010 -0.76923011 0.611440563 0.66802320
## RM 1.00000000 -0.24026493 0.20524621 -0.209846668 -0.29204783
## AGE -0.24026493 1.00000000 -0.74788054 0.456022452 0.50645559
## DIS 0.20524621 -0.74788054 1.00000000 -0.494587930 -0.53443158
## RAD -0.20984667 0.45602245 -0.49458793 1.000000000 0.91022819
## TAX -0.29204783 0.50645559 -0.53443158 0.910228189 1.00000000
## PTRATIO -0.35550149 0.26151501 -0.23247054 0.464741179 0.46085304
## AA 0.12806864 -0.27353398 0.29151167 -0.444412816 -0.44180801
## LSTAT -0.61380827 0.60233853 -0.49699583 0.488676335 0.54399341
## PTRATIO AA LSTAT
## CRIM 0.2899456 -0.38506394 0.4556215
## ZN -0.3916785 0.17552032 -0.4129946
## INDUS 0.3832476 -0.35697654 0.6037997
## CHAS -0.1215152 0.04878848 -0.0539293
## NOX 0.1889327 -0.38005064 0.5908789
## RM -0.3555015 0.12806864 -0.6138083
## AGE 0.2615150 -0.27353398 0.6023385
## DIS -0.2324705 0.29151167 -0.4969958
## RAD 0.4647412 -0.44441282 0.4886763
## TAX 0.4608530 -0.44180801 0.5439934
## PTRATIO 1.0000000 -0.17738330 0.3740443
## AA -0.1773833 1.00000000 -0.3660869
## LSTAT 0.3740443 -0.36608690 1.0000000
# Flag predictors involved in pairs whose absolute correlation exceeds 0.75.
# verbose = TRUE logs every comparison that leads to a flag (recorded below).
highCorrelations <- findCorrelation(
  correlations,
  cutoff = 0.75,
  verbose = TRUE
)
## Compare row 3 and column 5 with corr 0.764
## Means: 0.514 vs 0.38 so flagging column 3
## Compare row 5 and column 8 with corr 0.769
## Means: 0.479 vs 0.359 so flagging column 5
## Compare row 10 and column 9 with corr 0.91
## Means: 0.462 vs 0.334 so flagging column 10
## All correlations <= 0.75
print(highCorrelations)
## [1] 3 5 10
# Discard the first flagged index and keep the rest, so that column is NOT
# removed later on.
# NOTE(review): in the recorded run this spares column 3 (INDUS); the reason
# is not stated anywhere in the script - confirm it is intentional.
highCorrelations <- tail(highCorrelations, -1)
print(highCorrelations)
## [1] 5 10
# method 2: importance of different features
# For an lm fit, caret's varImp() ranks predictors by the absolute value of
# each coefficient's t-statistic (no ROC curve is involved for this
# regression model).
library(caret)
plot(varImp(model.lm))

# removing highly correlated variables
# A zero-length negative index selects *no* columns, so only subset when at
# least one column was flagged; drop = FALSE keeps `train` a data frame.
# The indices come from the correlation matrix and are applied to `train`
# directly, which relies on the predictors being its leading columns in the
# same order.
if (length(highCorrelations) > 0) {
  train <- train[, -highCorrelations, drop = FALSE]
}

# Refit on the reduced data and compare the importance ranking.
model.lm <- train(MEDV ~ ., data = train, method = "lm")
plot(varImp(model.lm))

# method 3: recursive feature elimination
# Seed the RNG so the cross-validation folds are reproducible.
set.seed(123)
# Random-forest based helper functions, evaluated with 10-fold CV.
control <- rfeControl(functions = rfFuncs, method = "cv", number = 10)
# Pick predictors and response by name instead of fixed positions (1:11 / 12),
# so this step does not silently break when a different number of columns was
# removed earlier. Every subset size from 1 to all predictors is evaluated.
rfe.predictors <- setdiff(names(train), "MEDV")
results <- rfe(
  train[, rfe.predictors, drop = FALSE],
  train[["MEDV"]],
  sizes = seq_along(rfe.predictors),
  rfeControl = control
)
print(results)
##
## Recursive feature selection
##
## Outer resampling method: Cross-Validated (10 fold)
##
## Resampling performance over subset size:
##
## Variables RMSE Rsquared MAE RMSESD RsquaredSD MAESD Selected
## 1 6.784 0.4736 4.870 1.4602 0.17933 0.9163
## 2 4.562 0.7502 3.136 0.9881 0.10494 0.3747
## 3 4.170 0.7885 2.822 0.9418 0.10313 0.3850
## 4 3.767 0.8307 2.571 0.9983 0.09109 0.4135
## 5 3.596 0.8530 2.474 0.7956 0.06318 0.3241
## 6 3.320 0.8726 2.315 0.6614 0.04701 0.2773
## 7 3.357 0.8710 2.309 0.7188 0.05233 0.3031
## 8 3.366 0.8721 2.334 0.8076 0.05942 0.3558
## 9 3.248 0.8798 2.259 0.6785 0.04741 0.2903 *
## 10 3.275 0.8784 2.273 0.7365 0.05176 0.3251
## 11 3.336 0.8745 2.299 0.7964 0.05724 0.3482
##
## The top 5 variables (out of 9):
## RM, LSTAT, DIS, CRIM, PTRATIO
# List the predictors kept in the subset that rfe selected (the subset size
# marked with * in the resampling table printed above).
predictors(results)
## [1] "RM" "LSTAT" "DIS" "CRIM" "PTRATIO" "AGE" "INDUS"
## [8] "AA" "RAD"
# Plot the resampled performance against subset size; presumably "g" adds a
# background grid and "o" draws points joined by lines - confirm against the
# plotting method caret provides for rfe objects.
plot(results, type=c("g", "o"))

# method 4: feature selection with lasso regression
# Reload the full data set, because `train` had correlated columns removed
# for the previous methods. Naming the columns at read time makes read.table
# raise an error if the file's column count does not match.
train <- read.table(
  "HousingValues.csv",
  col.names = c(
    "CRIM", "ZN", "INDUS", "CHAS", "NOX", "RM", "AGE",
    "DIS", "RAD", "TAX", "PTRATIO", "AA", "LSTAT", "MEDV"
  )
)

# Drop RAD (column 9) by name instead of by position, so the intent is
# visible and a change in column order cannot remove the wrong variable.
# NOTE(review): for the RAD/TAX pair the earlier findCorrelation run flagged
# column 10 (TAX), not column 9 (RAD); dropping RAD preserves the existing
# behaviour - confirm which of the two was meant to go.
train <- train[, names(train) != "RAD", drop = FALSE]

# Seed the RNG, as is done before rfe, so the resampling caret performs
# while tuning the lasso is reproducible.
set.seed(123)
model.lasso <- train(MEDV ~ ., data = train, method = "lasso")
## Loading required package: lars
## Loaded lars 1.2
# Show the underlying fitted object; per the recorded output this is an
# elasticnet::enet fit with lambda = 0, reporting Cp statistics and the order
# in which variables entered the path.
print(model.lasso$finalModel)
##
## Call:
## elasticnet::enet(x = as.matrix(x), y = y, lambda = 0)
## Cp statistics of the Lasso fit
## Cp: 1318.049 1047.460 410.051 140.108 118.319 88.952 81.683 69.501 41.919 23.733 10.226 11.712 13.000
## DF: 1 2 3 4 5 6 7 8 9 10 11 12 13
## Sequence of moves:
## LSTAT RM PTRATIO AA CHAS CRIM DIS NOX ZN INDUS AGE TAX
## Var 12 6 10 11 4 1 8 5 2 3 7 9 13
## Step 1 2 3 4 5 6 7 8 9 10 11 12 13
# Plot the fitted path with the penalty on the x-axis (presumably the
# coefficient paths - confirm against the elasticnet plot method).
plot(model.lasso$finalModel, xvar = "penalty")
