Week-10-R-Notes.utf8.md

# Demo Data: Boston Housing Price
library(caret)

## Warning: package 'caret' was built under R version 3.6.1

## Loading required package: lattice

## Loading required package: ggplot2

## Warning: package 'ggplot2' was built under R version 3.6.1

library(mlbench)

## Warning: package 'mlbench' was built under R version 3.6.1

# Load the BostonHousing data frame (made available by library(mlbench)) and
# inspect its structure: 506 rows, 13 predictors plus the outcome `medv`.
data("BostonHousing")
str(BostonHousing)

## 'data.frame':    506 obs. of  14 variables:
##  $ crim   : num  0.00632 0.02731 0.02729 0.03237 0.06905 ...
##  $ zn     : num  18 0 0 0 0 0 12.5 12.5 12.5 12.5 ...
##  $ indus  : num  2.31 7.07 7.07 2.18 2.18 2.18 7.87 7.87 7.87 7.87 ...
##  $ chas   : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
##  $ nox    : num  0.538 0.469 0.469 0.458 0.458 0.458 0.524 0.524 0.524 0.524 ...
##  $ rm     : num  6.58 6.42 7.18 7 7.15 ...
##  $ age    : num  65.2 78.9 61.1 45.8 54.2 58.7 66.6 96.1 100 85.9 ...
##  $ dis    : num  4.09 4.97 4.97 6.06 6.06 ...
##  $ rad    : num  1 2 2 3 3 3 5 5 5 5 ...
##  $ tax    : num  296 242 242 222 222 222 311 311 311 311 ...
##  $ ptratio: num  15.3 17.8 17.8 18.7 18.7 18.7 15.2 15.2 15.2 15.2 ...
##  $ b      : num  397 397 393 395 397 ...
##  $ lstat  : num  4.98 9.14 4.03 2.94 5.33 ...
##  $ medv   : num  24 21.6 34.7 33.4 36.2 28.7 22.9 27.1 16.5 18.9 ...

# Train Linear Model
# Regress medv on every other column using caret's "lm" method. No trControl
# is supplied, so train() falls back to its default resampling scheme (shown
# in the printout). No seed is set in this script, so the resampled RMSE /
# Rsquared / MAE will differ slightly from run to run.
model_lm <- train(
  medv ~ .,
  data = BostonHousing,
  method = "lm"
)
print(model_lm)

## Linear Regression 
## 
## 506 samples
##  13 predictor
## 
## No pre-processing
## Resampling: Bootstrapped (25 reps) 
## Summary of sample sizes: 506, 506, 506, 506, 506, 506, ... 
## Resampling results:
## 
##   RMSE      Rsquared   MAE     
##   4.912646  0.7157955  3.456434
## 
## Tuning parameter 'intercept' was held constant at a value of TRUE

# Model Specification
# finalModel is the underlying lm object stored in the train result; printing
# it shows the lm call and the fitted coefficients.
model_lm$finalModel

## 
## Call:
## lm(formula = .outcome ~ ., data = dat)
## 
## Coefficients:
## (Intercept)         crim           zn        indus        chas1  
##   3.646e+01   -1.080e-01    4.642e-02    2.056e-02    2.687e+00  
##         nox           rm          age          dis          rad  
##  -1.777e+01    3.810e+00    6.922e-04   -1.476e+00    3.060e-01  
##         tax      ptratio            b        lstat  
##  -1.233e-02   -9.527e-01    9.312e-03   -5.248e-01

# Model Details
# summary() on the train object reports the final lm fit: residual quantiles,
# the coefficient table with t-tests, residual standard error and R-squared.
summary(model_lm)

## 
## Call:
## lm(formula = .outcome ~ ., data = dat)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -15.595  -2.730  -0.518   1.777  26.199 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  3.646e+01  5.103e+00   7.144 3.28e-12 ***
## crim        -1.080e-01  3.286e-02  -3.287 0.001087 ** 
## zn           4.642e-02  1.373e-02   3.382 0.000778 ***
## indus        2.056e-02  6.150e-02   0.334 0.738288    
## chas1        2.687e+00  8.616e-01   3.118 0.001925 ** 
## nox         -1.777e+01  3.820e+00  -4.651 4.25e-06 ***
## rm           3.810e+00  4.179e-01   9.116  < 2e-16 ***
## age          6.922e-04  1.321e-02   0.052 0.958229    
## dis         -1.476e+00  1.995e-01  -7.398 6.01e-13 ***
## rad          3.060e-01  6.635e-02   4.613 5.07e-06 ***
## tax         -1.233e-02  3.760e-03  -3.280 0.001112 ** 
## ptratio     -9.527e-01  1.308e-01  -7.283 1.31e-12 ***
## b            9.312e-03  2.686e-03   3.467 0.000573 ***
## lstat       -5.248e-01  5.072e-02 -10.347  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.745 on 492 degrees of freedom
## Multiple R-squared:  0.7406, Adjusted R-squared:  0.7338 
## F-statistic: 108.1 on 13 and 492 DF,  p-value: < 2.2e-16

# Model Residuals' Plot
# plot() on the fitted lm object draws the standard lm diagnostic plots
# (residuals vs fitted, normal Q-Q, scale-location, residuals vs leverage).
plot(model_lm$finalModel)

# Run LM Model with Standardized Data
# Same formula and method as model_lm, but the predictors are centered and
# scaled before fitting.
# NOTE(review): centering/scaling does not change an OLS fit's predictions,
# and no seed is set, so the small RMSE/MAE gap versus model_lm presumably
# comes from different bootstrap resamples, not from the pre-processing.
model_lm2 <- train(
  medv ~ .,
  data = BostonHousing,
  method = "lm",
  preProcess = c("center", "scale")
)
print(model_lm2)

## Linear Regression 
## 
## 506 samples
##  13 predictor
## 
## Pre-processing: centered (13), scaled (13) 
## Resampling: Bootstrapped (25 reps) 
## Summary of sample sizes: 506, 506, 506, 506, 506, 506, ... 
## Resampling results:
## 
##   RMSE      Rsquared   MAE     
##   5.054569  0.7166256  3.538826
## 
## Tuning parameter 'intercept' was held constant at a value of TRUE

# Calculate R Square and Compare with Output(1)
# Predictions are made on the same rows the model was trained on, so every
# quantity computed here is an in-sample measure of fit.
predict_lm <- predict(model_lm, newdata = BostonHousing, type = "raw")

# Build the data frame directly from the vectors; wrapping cbind() in
# data.frame() would first squeeze everything through a single-type matrix.
real_predict <- data.frame(
  real = BostonHousing$medv,
  predict = predict_lm,
  square_diff = (BostonHousing$medv - predict_lm)^2
)

# R^2 = 1 - SSE / SST, where SSE is the sum of squared prediction errors and
# SST is the total sum of squares of the outcome around its mean.
# For a least-squares fit with an intercept this should reproduce the
# "Multiple R-squared" shown by summary(model_lm). It is expected to be higher
# than the resampled Rsquared from print(model_lm), because that one is
# estimated by resampling rather than on the training rows themselves.
# NOTE(review): "Output(1)" presumably refers to print(model_lm) - confirm.
# The value is stored but not printed; inspect it with print(r_squared).
sse <- sum(real_predict$square_diff)
sst <- sum((real_predict$real - mean(real_predict$real))^2)
r_squared <- 1 - sse / sst

str(real_predict)

## 'data.frame':    506 obs. of  3 variables:
##  $ real       : num  24 21.6 34.7 33.4 36.2 28.7 22.9 27.1 16.5 18.9 ...
##  $ predict    : num  30 25 30.6 28.6 27.9 ...
##  $ square_diff: num  36 11.7 17.1 23 68.2 ...

# Logistic Regression
library(arules)

## Warning: package 'arules' was built under R version 3.6.1

## Loading required package: Matrix

## 
## Attaching package: 'arules'

## The following objects are masked from 'package:base':
## 
##     abbreviate, write

# Work on a copy of the data and replace the numeric medv with a two-level
# factor ("low" / "high") built by equal-frequency binning, so it can be used
# as a binary classification outcome. table() then shows the class counts.
BostonHousing2 <- BostonHousing
BostonHousing2[["medv"]] <- discretize(
  BostonHousing2[["medv"]],
  method = "frequency",
  breaks = 2,
  labels = c("low", "high")
)
table(BostonHousing2[["medv"]])

## 
##  low high 
##  251  255

# Fit a logistic regression through caret: method = "glm" with
# family = "binomial", predicting the low/high medv class from all other
# columns. As with the linear models, no seed is set in this script, so the
# resampled Accuracy / Kappa will vary slightly between runs.
# NOTE(review): the coefficient signs in the model summary (rm positive, lstat
# negative) suggest the modelled event is the second level, "high" - confirm.
model_glm <- train(
  medv ~ .,
  data = BostonHousing2,
  method = "glm",
  family = "binomial"
)

print(model_glm)

## Generalized Linear Model 
## 
## 506 samples
##  13 predictor
##   2 classes: 'low', 'high' 
## 
## No pre-processing
## Resampling: Bootstrapped (25 reps) 
## Summary of sample sizes: 506, 506, 506, 506, 506, 506, ... 
## Resampling results:
## 
##   Accuracy  Kappa   
##   0.854822  0.709358

# Logistic Regression Model Output
# summary() on the train object reports the final glm fit: deviance
# residuals, the coefficient table with z-tests, null/residual deviance, AIC.
summary(model_glm)

## 
## Call:
## NULL
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.0015  -0.3574   0.0085   0.2981   3.3286  
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept) 12.065836   4.007916   3.011  0.00261 ** 
## crim        -0.082159   0.074950  -1.096  0.27300    
## zn           0.012275   0.013525   0.908  0.36411    
## indus        0.029454   0.043259   0.681  0.49594    
## chas1        1.659713   0.664634   2.497  0.01252 *  
## nox         -7.283784   2.733645  -2.664  0.00771 ** 
## rm           1.617226   0.440282   3.673  0.00024 ***
## age         -0.028603   0.010451  -2.737  0.00620 ** 
## dis         -0.711035   0.168061  -4.231 2.33e-05 ***
## rad          0.237470   0.060920   3.898 9.70e-05 ***
## tax         -0.008461   0.002919  -2.899  0.00374 ** 
## ptratio     -0.552611   0.107529  -5.139 2.76e-07 ***
## b            0.004615   0.002891   1.596  0.11041    
## lstat       -0.318739   0.054587  -5.839 5.25e-09 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 701.43  on 505  degrees of freedom
## Residual deviance: 273.85  on 492  degrees of freedom
## AIC: 301.85
## 
## Number of Fisher Scoring iterations: 7

# Relative Importance of Variables by Logistic Regression
# The printed scores are consistent with ranking predictors by |z value| from
# the model summary, rescaled so the largest is 100 and the smallest is 0.
varImp(model_glm)

## glm variable importance
## 
##         Overall
## lstat   100.000
## ptratio  86.431
## dis      68.821
## rad      62.370
## rm       58.010
## tax      43.000
## age      39.859
## nox      38.455
## chas1    35.212
## b        17.748
## crim      8.051
## zn        4.394
## indus     0.000

Week-10-R-Notes.R

Admin

2019-11-05