Linear Regression in ML using Boston Housing Data

Intro To Linear Regression In R

Setup

Setup1

Setup2

Load Libs

library(MASS)
library(plyr)
library(tidyr)
library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:plyr':
## 
##     arrange, count, desc, failwith, id, mutate, rename, summarise,
##     summarize

## The following object is masked from 'package:MASS':
## 
##     select

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(ggplot2)
#install.packages("caret")
library(caret)

## Loading required package: lattice

#install.packages("randomForest")
library(randomForest)

## randomForest 4.6-12

## Type rfNews() to see new features/changes/bug fixes.

## 
## Attaching package: 'randomForest'

## The following object is masked from 'package:ggplot2':
## 
##     margin

## The following object is masked from 'package:dplyr':
## 
##     combine

#install.packages("gbm")
library(gbm)

## Loading required package: survival

## 
## Attaching package: 'survival'

## The following object is masked from 'package:caret':
## 
##     cluster

## Loading required package: splines

## Loading required package: parallel

## Loaded gbm 2.1.3

#install.packages("corrgram")
library(corrgram)

## 
## Attaching package: 'corrgram'

## The following object is masked from 'package:plyr':
## 
##     baseball

#install.packages("corrplot")
library(corrplot)

Load IRIS
Boston is ready made; just use it

# If .txt tab file, use this
#my_housing_data <- read.delim(file.choose())
# Or, if .csv file, use this
#my_housing_data <- read.csv(file.choose())

# Store the data in the variable my_housing_data
data("Boston")
my_housing_data <- Boston
View(my_housing_data)

# Print the first 6 rows
head(my_housing_data, 6)

##      crim zn indus chas   nox    rm  age    dis rad tax ptratio  black
## 1 0.00632 18  2.31    0 0.538 6.575 65.2 4.0900   1 296    15.3 396.90
## 2 0.02731  0  7.07    0 0.469 6.421 78.9 4.9671   2 242    17.8 396.90
## 3 0.02729  0  7.07    0 0.469 7.185 61.1 4.9671   2 242    17.8 392.83
## 4 0.03237  0  2.18    0 0.458 6.998 45.8 6.0622   3 222    18.7 394.63
## 5 0.06905  0  2.18    0 0.458 7.147 54.2 6.0622   3 222    18.7 396.90
## 6 0.02985  0  2.18    0 0.458 6.430 58.7 6.0622   3 222    18.7 394.12
##   lstat medv
## 1  4.98 24.0
## 2  9.14 21.6
## 3  4.03 34.7
## 4  2.94 33.4
## 5  5.33 36.2
## 6  5.21 28.7

# Print the last 6 rows
tail(my_housing_data, 6)

##        crim zn indus chas   nox    rm  age    dis rad tax ptratio  black
## 501 0.22438  0  9.69    0 0.585 6.027 79.7 2.4982   6 391    19.2 396.90
## 502 0.06263  0 11.93    0 0.573 6.593 69.1 2.4786   1 273    21.0 391.99
## 503 0.04527  0 11.93    0 0.573 6.120 76.7 2.2875   1 273    21.0 396.90
## 504 0.06076  0 11.93    0 0.573 6.976 91.0 2.1675   1 273    21.0 396.90
## 505 0.10959  0 11.93    0 0.573 6.794 89.3 2.3889   1 273    21.0 393.45
## 506 0.04741  0 11.93    0 0.573 6.030 80.8 2.5050   1 273    21.0 396.90
##     lstat medv
## 501 14.33 16.8
## 502  9.67 22.4
## 503  9.08 20.6
## 504  5.64 23.9
## 505  6.48 22.0
## 506  7.88 11.9

Summary Iris

summary(my_housing_data, digits = 1)

##       crim             zn          indus           chas           nox     
##  Min.   :6e-03   Min.   :  0   Min.   : 0.5   Min.   :0.00   Min.   :0.4  
##  1st Qu.:8e-02   1st Qu.:  0   1st Qu.: 5.2   1st Qu.:0.00   1st Qu.:0.4  
##  Median :3e-01   Median :  0   Median : 9.7   Median :0.00   Median :0.5  
##  Mean   :4e+00   Mean   : 11   Mean   :11.1   Mean   :0.07   Mean   :0.6  
##  3rd Qu.:4e+00   3rd Qu.: 12   3rd Qu.:18.1   3rd Qu.:0.00   3rd Qu.:0.6  
##  Max.   :9e+01   Max.   :100   Max.   :27.7   Max.   :1.00   Max.   :0.9  
##        rm         age           dis          rad          tax     
##  Min.   :4   Min.   :  3   Min.   : 1   Min.   : 1   Min.   :187  
##  1st Qu.:6   1st Qu.: 45   1st Qu.: 2   1st Qu.: 4   1st Qu.:279  
##  Median :6   Median : 78   Median : 3   Median : 5   Median :330  
##  Mean   :6   Mean   : 69   Mean   : 4   Mean   :10   Mean   :408  
##  3rd Qu.:7   3rd Qu.: 94   3rd Qu.: 5   3rd Qu.:24   3rd Qu.:666  
##  Max.   :9   Max.   :100   Max.   :12   Max.   :24   Max.   :711  
##     ptratio       black           lstat         medv   
##  Min.   :13   Min.   :  0.3   Min.   : 2   Min.   : 5  
##  1st Qu.:17   1st Qu.:375.4   1st Qu.: 7   1st Qu.:17  
##  Median :19   Median :391.4   Median :11   Median :21  
##  Mean   :18   Mean   :356.7   Mean   :13   Mean   :23  
##  3rd Qu.:20   3rd Qu.:396.2   3rd Qu.:17   3rd Qu.:25  
##  Max.   :22   Max.   :396.9   Max.   :38   Max.   :50

Stat.desc() function on Iris

#install.packages("pastecs")

# Compute descriptive statistics
library(pastecs)

## Loading required package: boot

## 
## Attaching package: 'boot'

## The following object is masked from 'package:survival':
## 
##     aml

## The following object is masked from 'package:lattice':
## 
##     melanoma

## 
## Attaching package: 'pastecs'

## The following objects are masked from 'package:dplyr':
## 
##     first, last

## The following object is masked from 'package:tidyr':
## 
##     extract

res <- stat.desc(my_housing_data[, -5])
round(res, 2)

##                 crim      zn   indus   chas      rm      age     dis
## nbr.val       506.00  506.00  506.00 506.00  506.00   506.00  506.00
## nbr.null        0.00  372.00    0.00 471.00    0.00     0.00    0.00
## nbr.na          0.00    0.00    0.00   0.00    0.00     0.00    0.00
## min             0.01    0.00    0.46   0.00    3.56     2.90    1.13
## max            88.98  100.00   27.74   1.00    8.78   100.00   12.13
## range          88.97  100.00   27.28   1.00    5.22    97.10   11.00
## sum          1828.44 5750.00 5635.21  35.00 3180.03 34698.90 1920.29
## median          0.26    0.00    9.69   0.00    6.21    77.50    3.21
## mean            3.61   11.36   11.14   0.07    6.28    68.57    3.80
## SE.mean         0.38    1.04    0.30   0.01    0.03     1.25    0.09
## CI.mean.0.95    0.75    2.04    0.60   0.02    0.06     2.46    0.18
## var            73.99  543.94   47.06   0.06    0.49   792.36    4.43
## std.dev         8.60   23.32    6.86   0.25    0.70    28.15    2.11
## coef.var        2.38    2.05    0.62   3.67    0.11     0.41    0.55
##                  rad       tax ptratio     black   lstat     medv
## nbr.val       506.00    506.00  506.00    506.00  506.00   506.00
## nbr.null        0.00      0.00    0.00      0.00    0.00     0.00
## nbr.na          0.00      0.00    0.00      0.00    0.00     0.00
## min             1.00    187.00   12.60      0.32    1.73     5.00
## max            24.00    711.00   22.00    396.90   37.97    50.00
## range          23.00    524.00    9.40    396.58   36.24    45.00
## sum          4832.00 206568.00 9338.50 180477.06 6402.45 11401.60
## median          5.00    330.00   19.05    391.44   11.36    21.20
## mean            9.55    408.24   18.46    356.67   12.65    22.53
## SE.mean         0.39      7.49    0.10      4.06    0.32     0.41
## CI.mean.0.95    0.76     14.72    0.19      7.97    0.62     0.80
## var            75.82  28404.76    4.69   8334.75   50.99    84.59
## std.dev         8.71    168.54    2.16     91.29    7.14     9.20
## coef.var        0.91      0.41    0.12      0.26    0.56     0.41

Dataset Split - method 1

#vctTrnRecs <- createDataPartition(y=my_housing_data$medv, p=0.7, list=FALSE)
#my_housing_data_TrnData <- my_housing_data[vctTrnRecs,]
#my_housing_data_TstData <- my_housing_data[-vctTrnRecs,]

Find Corelations - method 1

#functions
#detectNA <- function(inp) {
 # sum(is.na(inp))
#}
#detectCor <- function(x) {
 # cor(as.numeric(my_housing_data[, x]), 
  #  as.numeric(my_housing_data$medv), 
   # method="spearman")
#}
## find correlations
#vcnCorsData <- abs(sapply(colnames(my_housing_data), detectCor)) #absolute value
#summary(vcnCorsData)
#Show Corelations   
#vcnCorsData
#Plot Corelations  
#corrgram(my_housing_data)

Dataset Split - method 2

set.seed(707)
#install.packages("caTools")
library(caTools)
split <- sample.split(my_housing_data$medv, SplitRatio = 0.7)
#split

training_data <- subset(my_housing_data, split=="TRUE")
testing_data <- subset(my_housing_data, split=="FALSE")

Find Corelations - method 2

plot(my_housing_data$crim, my_housing_data$medv, cex=0.5, xlab="Crime Rate", ylab = "medv")

cr <- cor(my_housing_data)

Plot scatter plot for Corelations - method 2

attach(my_housing_data)
library(lattice)
splom(~my_housing_data[c(1:6, 14)], groups=NULL, data=my_housing_data, axis.line.tck = 0, axis.text.alpha = 0)

splom(~my_housing_data[c(7:14)], groups=NULL, data=my_housing_data, axis.line.tck = 0, axis.text.alpha = 0)

Find Corelations - method 2

plot(rm, medv)
#regression fit line
abline(lm(medv~rm), col="red")

Visualization - method 2

library(corrplot)
corrplot(cr, type = "lower")

corrplot(cr, method = "number")

finding multicollinearity - method 2

library(caret)
#to exclude medv(output)
housing_a <- subset(my_housing_data, select = -c(medv))
numericData <- housing_a[sapply(housing_a, is.numeric)]
descrCor <- cor(numericData)

vif - method 2

library(car)

## 
## Attaching package: 'car'

## The following object is masked from 'package:boot':
## 
##     logit

## The following object is masked from 'package:dplyr':
## 
##     recode

model <- lm(medv~., data = training_data)
vif(model)

##     crim       zn    indus     chas      nox       rm      age      dis 
## 2.186063 2.426594 4.044576 1.098107 4.466141 1.999619 3.161921 4.088476 
##      rad      tax  ptratio    black    lstat 
## 7.111726 8.137088 1.931770 1.404318 3.138782

Create model using all variables - method 2

model <- lm(medv~., data = training_data)
#or
model <- lm(medv~ crim + zn + indus + chas + nox +  rm+ age + dis + rad + tax+ ptratio + black + lstat, data = training_data)

description of model - method 2

summary(model)

## 
## Call:
## lm(formula = medv ~ crim + zn + indus + chas + nox + rm + age + 
##     dis + rad + tax + ptratio + black + lstat, data = training_data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -17.1846  -2.9393  -0.6079   2.1119  24.8407 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  31.893461   6.085615   5.241 2.76e-07 ***
## crim         -0.092407   0.054133  -1.707 0.088694 .  
## zn            0.047614   0.016067   2.963 0.003249 ** 
## indus         0.030606   0.071300   0.429 0.667998    
## chas          2.832769   1.003739   2.822 0.005039 ** 
## nox         -19.220333   4.537990  -4.235 2.91e-05 ***
## rm            4.387744   0.490338   8.948  < 2e-16 ***
## age          -0.001238   0.016121  -0.077 0.938835    
## dis          -1.573771   0.241048  -6.529 2.31e-10 ***
## rad           0.284439   0.076137   3.736 0.000218 ***
## tax          -0.011483   0.004189  -2.741 0.006429 ** 
## ptratio      -0.873512   0.157825  -5.535 6.09e-08 ***
## black         0.010619   0.003261   3.256 0.001238 ** 
## lstat        -0.528799   0.063089  -8.382 1.26e-15 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.791 on 353 degrees of freedom
## Multiple R-squared:  0.7537, Adjusted R-squared:  0.7446 
## F-statistic: 83.08 on 13 and 353 DF,  p-value: < 2.2e-16

Re-Create model using lesser variables - method 2

model <- lm(medv~., data = training_data)
#or
model <- lm(medv~ crim + zn + indus + chas + nox +  rm+ age + dis + rad + ptratio + black + lstat, data = training_data)
summary(model)

## 
## Call:
## lm(formula = medv ~ crim + zn + indus + chas + nox + rm + age + 
##     dis + rad + ptratio + black + lstat, data = training_data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -17.7097  -3.1897  -0.5404   1.9789  24.7599 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  30.32910    6.11430   4.960 1.09e-06 ***
## crim         -0.08939    0.05462  -1.637  0.10258    
## zn            0.03932    0.01592   2.469  0.01402 *  
## indus        -0.05105    0.06537  -0.781  0.43535    
## chas          3.13277    1.00690   3.111  0.00201 ** 
## nox         -20.16860    4.56624  -4.417 1.33e-05 ***
## rm            4.48134    0.49363   9.078  < 2e-16 ***
## age          -0.00211    0.01627  -0.130  0.89686    
## dis          -1.59456    0.24314  -6.558 1.93e-10 ***
## rad           0.12666    0.05030   2.518  0.01224 *  
## ptratio      -0.90836    0.15875  -5.722 2.25e-08 ***
## black         0.01084    0.00329   3.296  0.00108 ** 
## lstat        -0.53049    0.06366  -8.333 1.76e-15 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.834 on 354 degrees of freedom
## Multiple R-squared:  0.7484, Adjusted R-squared:  0.7399 
## F-statistic: 87.77 on 12 and 354 DF,  p-value: < 2.2e-16

Re-Create model using lesser variables - method 2

model <- lm(medv~., data = training_data)
#or
model <- lm(medv~ crim + zn + indus + chas + nox +  rm + dis + rad + ptratio + black + lstat, data = training_data)
summary(model)

## 
## Call:
## lm(formula = medv ~ crim + zn + indus + chas + nox + rm + dis + 
##     rad + ptratio + black + lstat, data = training_data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -17.6773  -3.1893  -0.5365   1.9819  24.7189 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  30.396435   6.083787   4.996 9.19e-07 ***
## crim         -0.089652   0.054505  -1.645  0.10089    
## zn            0.039592   0.015758   2.513  0.01243 *  
## indus        -0.051073   0.065282  -0.782  0.43453    
## chas          3.124282   1.003377   3.114  0.00200 ** 
## nox         -20.327722   4.392309  -4.628 5.18e-06 ***
## rm            4.467649   0.481540   9.278  < 2e-16 ***
## dis          -1.585673   0.232963  -6.807 4.27e-11 ***
## rad           0.127339   0.049963   2.549  0.01123 *  
## ptratio      -0.910298   0.157828  -5.768 1.75e-08 ***
## black         0.010816   0.003279   3.299  0.00107 ** 
## lstat        -0.533274   0.059856  -8.909  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.828 on 355 degrees of freedom
## Multiple R-squared:  0.7484, Adjusted R-squared:  0.7406 
## F-statistic: 96.01 on 11 and 355 DF,  p-value: < 2.2e-16

Predict on test data - method 2

predic <- predict(model, testing_data)
#predic

compare predicted and actual data - method 2

plot(testing_data$medv, type="l", lty = 1.8, col = "green")
lines(predic, type="l",col = "blue")

Predict on sample test data - method 2

#predic <- predict(model, #sample_data)
#predic

Linear Regression in ML using Boston Housing Data

Shubhendu Awasthi

Jul 16, 2017