# Demo Data: Boston Housing Price
library(caret)
## Warning: package 'caret' was built under R version 3.6.1
## Loading required package: lattice
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 3.6.1
library(mlbench)
## Warning: package 'mlbench' was built under R version 3.6.1
# Load the BostonHousing data set into the workspace and show the type and
# first values of every column.
data(list = "BostonHousing")
str(object = BostonHousing)
## 'data.frame': 506 obs. of 14 variables:
## $ crim : num 0.00632 0.02731 0.02729 0.03237 0.06905 ...
## $ zn : num 18 0 0 0 0 0 12.5 12.5 12.5 12.5 ...
## $ indus : num 2.31 7.07 7.07 2.18 2.18 2.18 7.87 7.87 7.87 7.87 ...
## $ chas : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ nox : num 0.538 0.469 0.469 0.458 0.458 0.458 0.524 0.524 0.524 0.524 ...
## $ rm : num 6.58 6.42 7.18 7 7.15 ...
## $ age : num 65.2 78.9 61.1 45.8 54.2 58.7 66.6 96.1 100 85.9 ...
## $ dis : num 4.09 4.97 4.97 6.06 6.06 ...
## $ rad : num 1 2 2 3 3 3 5 5 5 5 ...
## $ tax : num 296 242 242 222 222 222 311 311 311 311 ...
## $ ptratio: num 15.3 17.8 17.8 18.7 18.7 18.7 15.2 15.2 15.2 15.2 ...
## $ b : num 397 397 393 395 397 ...
## $ lstat : num 4.98 9.14 4.03 2.94 5.33 ...
## $ medv : num 24 21.6 34.7 33.4 36.2 28.7 22.9 27.1 16.5 18.9 ...
# Train Linear Model
# Fit an ordinary least-squares model of medv on all other columns.
# train() evaluates the fit with bootstrap resampling, which draws random
# samples; fixing the seed first makes the reported RMSE / R-squared / MAE
# repeatable from run to run.
set.seed(123)
model_lm <- train(medv ~ ., data = BostonHousing, method = "lm")
print(model_lm)
## Linear Regression
##
## 506 samples
## 13 predictor
##
## No pre-processing
## Resampling: Bootstrapped (25 reps)
## Summary of sample sizes: 506, 506, 506, 506, 506, 506, ...
## Resampling results:
##
## RMSE Rsquared MAE
## 4.912646 0.7157955 3.456434
##
## Tuning parameter 'intercept' was held constant at a value of TRUE
# Model Specification
# Show the fitted object that train() stored for the chosen model: its call
# and the estimated coefficients.
model_lm[["finalModel"]]
##
## Call:
## lm(formula = .outcome ~ ., data = dat)
##
## Coefficients:
## (Intercept) crim zn indus chas1
## 3.646e+01 -1.080e-01 4.642e-02 2.056e-02 2.687e+00
## nox rm age dis rad
## -1.777e+01 3.810e+00 6.922e-04 -1.476e+00 3.060e-01
## tax ptratio b lstat
## -1.233e-02 -9.527e-01 9.312e-03 -5.248e-01
# Model Details
# Residual quantiles, coefficient table (estimate, std. error, t, p) and
# overall fit statistics of the final linear model.
summary(model_lm$finalModel)
##
## Call:
## lm(formula = .outcome ~ ., data = dat)
##
## Residuals:
## Min 1Q Median 3Q Max
## -15.595 -2.730 -0.518 1.777 26.199
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.646e+01 5.103e+00 7.144 3.28e-12 ***
## crim -1.080e-01 3.286e-02 -3.287 0.001087 **
## zn 4.642e-02 1.373e-02 3.382 0.000778 ***
## indus 2.056e-02 6.150e-02 0.334 0.738288
## chas1 2.687e+00 8.616e-01 3.118 0.001925 **
## nox -1.777e+01 3.820e+00 -4.651 4.25e-06 ***
## rm 3.810e+00 4.179e-01 9.116 < 2e-16 ***
## age 6.922e-04 1.321e-02 0.052 0.958229
## dis -1.476e+00 1.995e-01 -7.398 6.01e-13 ***
## rad 3.060e-01 6.635e-02 4.613 5.07e-06 ***
## tax -1.233e-02 3.760e-03 -3.280 0.001112 **
## ptratio -9.527e-01 1.308e-01 -7.283 1.31e-12 ***
## b 9.312e-03 2.686e-03 3.467 0.000573 ***
## lstat -5.248e-01 5.072e-02 -10.347 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.745 on 492 degrees of freedom
## Multiple R-squared: 0.7406, Adjusted R-squared: 0.7338
## F-statistic: 108.1 on 13 and 492 DF, p-value: < 2.2e-16
# Model Residuals' Plot
# Draw the diagnostic plots of the final fitted linear model.
plot(x = model_lm[["finalModel"]])




# Run LM Model with Standardized Data
# Same linear model, but every predictor is centered and scaled before
# fitting. The seed is fixed so the bootstrap resamples, and therefore the
# printed metrics, are reproducible.
set.seed(123)
model_lm2 <- train(
  medv ~ .,
  data = BostonHousing,
  method = "lm",
  preProcess = c("center", "scale")
)
print(model_lm2)
## Linear Regression
##
## 506 samples
## 13 predictor
##
## Pre-processing: centered (13), scaled (13)
## Resampling: Bootstrapped (25 reps)
## Summary of sample sizes: 506, 506, 506, 506, 506, 506, ...
## Resampling results:
##
## RMSE Rsquared MAE
## 5.054569 0.7166256 3.538826
##
## Tuning parameter 'intercept' was held constant at a value of TRUE
# Calculate R Square and Compare with Output(1)
# Predict medv for the training rows and keep the observed value, the
# prediction and the squared error side by side.
predict_lm <- predict(model_lm, newdata = BostonHousing, type = "raw")
real_predict <- data.frame(
  real = BostonHousing$medv,
  predict = predict_lm,
  square_diff = (BostonHousing$medv - predict_lm)^2
)
# In-sample R^2 = 1 - SSE / SST. Because it is computed on the same rows the
# model was fitted on, it should agree with the "Multiple R-squared" reported
# by summary(model_lm) rather than with the resampled Rsquared from
# print(model_lm).
r_squared <- 1 - sum(real_predict$square_diff) /
  sum((real_predict$real - mean(real_predict$real))^2)
print(r_squared)
str(real_predict)
## 'data.frame': 506 obs. of 3 variables:
## $ real : num 24 21.6 34.7 33.4 36.2 28.7 22.9 27.1 16.5 18.9 ...
## $ predict : num 30 25 30.6 28.6 27.9 ...
## $ square_diff: num 36 11.7 17.1 23 68.2 ...
# Logistic Regression
library(arules)
## Warning: package 'arules' was built under R version 3.6.1
## Loading required package: Matrix
##
## Attaching package: 'arules'
## The following objects are masked from 'package:base':
##
## abbreviate, write
# Work on a copy so the original numeric response stays available, then
# replace medv with a two-level factor ("low" / "high") obtained by
# equal-frequency binning, and count the rows in each class.
BostonHousing2 <- BostonHousing
BostonHousing2[["medv"]] <- discretize(
  BostonHousing2[["medv"]],
  method = "frequency",
  breaks = 2,
  labels = c("low", "high")
)
table(BostonHousing2[["medv"]])
##
## low high
## 251 255
# Fit a logistic regression of the low/high medv class on all other columns.
# The seed is fixed so the bootstrap accuracy / kappa estimates printed below
# are reproducible.
set.seed(123)
model_glm <- train(
  medv ~ .,
  data = BostonHousing2,
  method = "glm",
  family = "binomial"
)
print(model_glm)
## Generalized Linear Model
##
## 506 samples
## 13 predictor
## 2 classes: 'low', 'high'
##
## No pre-processing
## Resampling: Bootstrapped (25 reps)
## Summary of sample sizes: 506, 506, 506, 506, 506, 506, ...
## Resampling results:
##
## Accuracy Kappa
## 0.854822 0.709358
# Logistic Regression Model Output
# Deviance residuals, coefficient table (estimate, std. error, z, p) and
# deviance / AIC of the final logistic model.
summary(model_glm$finalModel)
##
## Call:
## NULL
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.0015 -0.3574 0.0085 0.2981 3.3286
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 12.065836 4.007916 3.011 0.00261 **
## crim -0.082159 0.074950 -1.096 0.27300
## zn 0.012275 0.013525 0.908 0.36411
## indus 0.029454 0.043259 0.681 0.49594
## chas1 1.659713 0.664634 2.497 0.01252 *
## nox -7.283784 2.733645 -2.664 0.00771 **
## rm 1.617226 0.440282 3.673 0.00024 ***
## age -0.028603 0.010451 -2.737 0.00620 **
## dis -0.711035 0.168061 -4.231 2.33e-05 ***
## rad 0.237470 0.060920 3.898 9.70e-05 ***
## tax -0.008461 0.002919 -2.899 0.00374 **
## ptratio -0.552611 0.107529 -5.139 2.76e-07 ***
## b 0.004615 0.002891 1.596 0.11041
## lstat -0.318739 0.054587 -5.839 5.25e-09 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 701.43 on 505 degrees of freedom
## Residual deviance: 273.85 on 492 degrees of freedom
## AIC: 301.85
##
## Number of Fisher Scoring iterations: 7
# Relative Importance of Variables by Logistic Regression
# Rank the predictors by importance, rescaled so that the strongest term
# scores 100 and the weakest 0. For this model the ranking follows the
# absolute z value of each coefficient in the summary table.
varImp(model_glm, scale = TRUE)
## glm variable importance
##
## Overall
## lstat 100.000
## ptratio 86.431
## dis 68.821
## rad 62.370
## rm 58.010
## tax 43.000
## age 39.859
## nox 38.455
## chas1 35.212
## b 17.748
## crim 8.051
## zn 4.394
## indus 0.000