Multiple Linear Regression

This tutorial uses the West Roxbury House Prices dataset to illustrate the procedure for conducting multiple linear regression. Visualization of this dataset is available at link

## 
## Attaching package: 'gplots'
## The following object is masked from 'package:stats':
## 
##     lowess
## dummies-1.5.6 provided by Decision Patterns
## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo
The dataset consists of 5802 rows and 14 columns as shown below.
# Read the data
df <- read.csv("../data/WestRoxbury.csv")
t(t(names(df))) ##display column names in a user friendly manner
##       [,1]         
##  [1,] "TOTAL.VALUE"
##  [2,] "TAX"        
##  [3,] "LOT.SQFT"   
##  [4,] "YR.BUILT"   
##  [5,] "GROSS.AREA" 
##  [6,] "LIVING.AREA"
##  [7,] "FLOORS"     
##  [8,] "ROOMS"      
##  [9,] "BEDROOMS"   
## [10,] "FULL.BATH"  
## [11,] "HALF.BATH"  
## [12,] "KITCHEN"    
## [13,] "FIREPLACE"  
## [14,] "REMODEL"
names(df)[3]<- "LOT.SIZE"  #change the column name if so desired
names(df) <- tolower(names(df)) #change to lowercase if so desired
dim(df) #Show the number of rows and columns
## [1] 5802   14
head(df, n = 3)
##   total.value  tax lot.size yr.built gross.area living.area floors rooms
## 1       344.2 4330     9965     1880       2436        1352      2     6
## 2       412.6 5190     6590     1945       3108        1976      2    10
## 3       330.1 4152     7500     1890       2294        1371      2     8
##   bedrooms full.bath half.bath kitchen fireplace remodel
## 1        3         1         1       1         0    None
## 2        4         2         1       1         0  Recent
## 3        4         1         1       1         0    None
The column remodel is a categorial variable with three options {None, Old, Recent}. It needs to be converted into dummy variables. library(dummies) is used for this purpose. As shown below, we will select specific columns as predictors.
df <- dummy.data.frame(df, sep = ".")
names(df)
##  [1] "total.value"    "tax"            "lot.size"       "yr.built"      
##  [5] "gross.area"     "living.area"    "floors"         "rooms"         
##  [9] "bedrooms"       "full.bath"      "half.bath"      "kitchen"       
## [13] "fireplace"      "remodel.None"   "remodel.Old"    "remodel.Recent"
cols <- c("total.value", "lot.size", "gross.area", "living.area", "floors")
cols <- c(cols, "rooms", "bedrooms","remodel.Old", "remodel.Recent")

df<- df[, cols]

##remove categorical columns (remodel.Old & remodel.Recent for heat map
df1 <- df[ , -(8:9) ] 
par(cex.main=1)
heatmap.2(cor(df1), Rowv = FALSE, Colv = FALSE, dendrogram = "none", 
          cellnote = round(cor(df1),2),main = "Correlation values",
          cexRow = 1, cexCol = 1, notecol = "black", key = FALSE, 
          trace = 'none', margins = c(10,10))

##regression with all predictors included
reg1 <- lm(total.value ~ ., data = df)
#  use options() to ensure numbers are not displayed in scientific notation.
options(scipen = 999)
summary(reg1)
## 
## Call:
## lm(formula = total.value ~ ., data = df)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -295.376  -27.668   -0.416   26.157  300.560 
## 
## Coefficients:
##                  Estimate Std. Error t value             Pr(>|t|)    
## (Intercept)    45.8630929  3.5945519  12.759 < 0.0000000000000002 ***
## lot.size        0.0089850  0.0002551  35.218 < 0.0000000000000002 ***
## gross.area      0.0327789  0.0016753  19.566 < 0.0000000000000002 ***
## living.area     0.0628466  0.0031160  20.169 < 0.0000000000000002 ***
## floors         42.5133205  1.6831957  25.258 < 0.0000000000000002 ***
## rooms           2.3119471  0.6872242   3.364             0.000773 ***
## bedrooms       -0.4945594  1.0599964  -0.467             0.640827    
## remodel.Old     2.2498146  2.0382432   1.104             0.269725    
## remodel.Recent 26.6510108  1.7315189  15.392 < 0.0000000000000002 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 45.93 on 5793 degrees of freedom
## Multiple R-squared:  0.7858, Adjusted R-squared:  0.7855 
## F-statistic:  2657 on 8 and 5793 DF,  p-value: < 0.00000000000000022
###########################################
###Partitioning the dataset
set.seed(1)  # set seed for reproducing the partition

train.rows <- sample(1:nrow(df), 3500)

train.df <- df[train.rows, ]
valid.df <- df[-train.rows, ]
reg2 <- lm(total.value ~ ., data = train.df)
summary(reg2)
## 
## Call:
## lm(formula = total.value ~ ., data = train.df)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -232.123  -27.938   -0.602   25.863  195.057 
## 
## Coefficients:
##                  Estimate Std. Error t value             Pr(>|t|)    
## (Intercept)    39.0323360  4.5350984   8.607 < 0.0000000000000002 ***
## lot.size        0.0093371  0.0003184  29.325 < 0.0000000000000002 ***
## gross.area      0.0323227  0.0021283  15.187 < 0.0000000000000002 ***
## living.area     0.0596082  0.0039330  15.156 < 0.0000000000000002 ***
## floors         43.5623795  2.1186645  20.561 < 0.0000000000000002 ***
## rooms           3.6762897  0.8799811   4.178            0.0000302 ***
## bedrooms       -0.4024796  1.3831603  -0.291                0.771    
## remodel.Old     2.8335024  2.6166912   1.083                0.279    
## remodel.Recent 27.1272282  2.1854218  12.413 < 0.0000000000000002 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 45.47 on 3491 degrees of freedom
## Multiple R-squared:  0.797,  Adjusted R-squared:  0.7965 
## F-statistic:  1713 on 8 and 3491 DF,  p-value: < 0.00000000000000022
lm.pred <- predict(reg2, valid.df)
options(scipen=999, digits = 0)
some.residuals <- valid.df$total.value[1:20] - lm.pred[1:20]
data.frame("Actual" = valid.df$total.value[1:20],"Predicted" = lm.pred[1:20], "Residual" = some.residuals)
##    Actual Predicted Residual
## 2     413       468      -56
## 5     332       360      -28
## 6     337       286       51
## 8     320       340      -19
## 9     334       339       -6
## 10    409       536     -126
## 11    313       345      -32
## 12    344       367      -23
## 13    316       342      -26
## 16    298       274       24
## 20    348       424      -76
## 21    318       311        7
## 23    358       401      -43
## 24    415       491      -77
## 25    319       358      -39
## 31    329       348      -19
## 33    280       444     -164
## 35    337       419      -81
## 42    350       386      -36
## 43    344       424      -80
options(scipen=999, digits = 3)
# use accuracy() to compute common accuracy measures.
t(accuracy(lm.pred, valid.df$total.value))
##      Test set
## ME      -1.05
## RMSE    46.71
## MAE     34.85
## MPE     -1.53
## MAPE     9.07
all.residuals <- valid.df$total.value - lm.pred

df2 <- data.frame(all.residuals, lm.pred)

h1 <- ggplot(df2, aes(x= all.residuals)) + 
  geom_histogram(fill = "blue", col = "red", binwidth = 20)
h1 <- h1 + labs(x = "residuals", y = "count", title = "Histogram of residuals")
h1

p1 <- ggplot(df2, aes(x = lm.pred, y = all.residuals)) + geom_point()
p1 <- p1 + labs(x = "predicted values", y = "residuals", title = "Residuals vs. predicted values")
p1 <- p1+ geom_smooth()
p1

search <- regsubsets(total.value ~ ., data = train.df, nbest = 1, nvmax = dim(train.df)[2],
                     method = "exhaustive")
sum <- summary(search)

# show models
t(sum$which)
##                    1     2     3     4     5     6     7    8
## (Intercept)     TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE TRUE
## lot.size       FALSE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE TRUE
## gross.area     FALSE FALSE FALSE  TRUE  TRUE  TRUE  TRUE TRUE
## living.area     TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE TRUE
## floors         FALSE FALSE  TRUE  TRUE  TRUE  TRUE  TRUE TRUE
## rooms          FALSE FALSE FALSE FALSE FALSE  TRUE  TRUE TRUE
## bedrooms       FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE
## remodel.Old    FALSE FALSE FALSE FALSE FALSE FALSE  TRUE TRUE
## remodel.Recent FALSE FALSE FALSE FALSE  TRUE  TRUE  TRUE TRUE
# show metrics
sum$rsq
## [1] 0.705 0.753 0.772 0.787 0.796 0.797 0.797 0.797
sum$adjr2
## [1] 0.705 0.753 0.772 0.786 0.795 0.797 0.797 0.797
##Find the model with the highest adjusted R2.
x <- which(sum$adjr2== max(sum$adjr2))
t(t(sum$which[x,]))
##                 [,1]
## (Intercept)     TRUE
## lot.size        TRUE
## gross.area      TRUE
## living.area     TRUE
## floors          TRUE
## rooms           TRUE
## bedrooms       FALSE
## remodel.Old     TRUE
## remodel.Recent  TRUE
# use step() to run stepwise regression.
reg.step <- step(reg1, direction = "backward")
## Start:  AIC=44419
## total.value ~ lot.size + gross.area + living.area + floors + 
##     rooms + bedrooms + remodel.Old + remodel.Recent
## 
##                  Df Sum of Sq      RSS   AIC
## - bedrooms        1       459 12222339 44418
## - remodel.Old     1      2570 12224451 44419
## <none>                        12221880 44419
## - rooms           1     23878 12245758 44429
## - remodel.Recent  1    499813 12721693 44650
## - gross.area      1    807646 13029526 44789
## - living.area     1    858223 13080103 44811
## - floors          1   1345908 13567788 45024
## - lot.size        1   2616726 14838607 45543
## 
## Step:  AIC=44418
## total.value ~ lot.size + gross.area + living.area + floors + 
##     rooms + remodel.Old + remodel.Recent
## 
##                  Df Sum of Sq      RSS   AIC
## - remodel.Old     1      2567 12224906 44417
## <none>                        12222339 44418
## - rooms           1     26338 12248678 44428
## - remodel.Recent  1    499692 12722031 44648
## - gross.area      1    807192 13029531 44787
## - living.area     1    863450 13085790 44812
## - floors          1   1363363 13585703 45029
## - lot.size        1   2618080 14840419 45542
## 
## Step:  AIC=44417
## total.value ~ lot.size + gross.area + living.area + floors + 
##     rooms + remodel.Recent
## 
##                  Df Sum of Sq      RSS   AIC
## <none>                        12224906 44417
## - rooms           1     26709 12251615 44428
## - remodel.Recent  1    500432 12725338 44648
## - gross.area      1    813485 13038391 44789
## - living.area     1    861448 13086354 44810
## - floors          1   1370979 13595885 45032
## - lot.size        1   2617603 14842509 45541
summary(reg.step)  # Which variables were dropped?
## 
## Call:
## lm(formula = total.value ~ lot.size + gross.area + living.area + 
##     floors + rooms + remodel.Recent, data = df)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -295.6  -27.6   -0.3   26.1  300.9 
## 
## Coefficients:
##                 Estimate Std. Error t value             Pr(>|t|)    
## (Intercept)    45.695689   3.586374   12.74 < 0.0000000000000002 ***
## lot.size        0.008986   0.000255   35.23 < 0.0000000000000002 ***
## gross.area      0.032848   0.001673   19.64 < 0.0000000000000002 ***
## living.area     0.062596   0.003098   20.21 < 0.0000000000000002 ***
## floors         42.488048   1.666662   25.49 < 0.0000000000000002 ***
## rooms           2.182100   0.613252    3.56              0.00038 ***
## remodel.Recent 26.357106   1.711281   15.40 < 0.0000000000000002 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 45.9 on 5795 degrees of freedom
## Multiple R-squared:  0.786,  Adjusted R-squared:  0.786 
## F-statistic: 3.54e+03 on 6 and 5795 DF,  p-value: <0.0000000000000002
reg.step.pred <- predict(reg.step, valid.df)
accuracy(reg.step.pred, valid.df$total.value)
##              ME RMSE  MAE   MPE MAPE
## Test set -0.611 46.6 34.8 -1.43 9.06
reg.step <- step(reg1, direction = "both")
## Start:  AIC=44419
## total.value ~ lot.size + gross.area + living.area + floors + 
##     rooms + bedrooms + remodel.Old + remodel.Recent
## 
##                  Df Sum of Sq      RSS   AIC
## - bedrooms        1       459 12222339 44418
## - remodel.Old     1      2570 12224451 44419
## <none>                        12221880 44419
## - rooms           1     23878 12245758 44429
## - remodel.Recent  1    499813 12721693 44650
## - gross.area      1    807646 13029526 44789
## - living.area     1    858223 13080103 44811
## - floors          1   1345908 13567788 45024
## - lot.size        1   2616726 14838607 45543
## 
## Step:  AIC=44418
## total.value ~ lot.size + gross.area + living.area + floors + 
##     rooms + remodel.Old + remodel.Recent
## 
##                  Df Sum of Sq      RSS   AIC
## - remodel.Old     1      2567 12224906 44417
## <none>                        12222339 44418
## + bedrooms        1       459 12221880 44419
## - rooms           1     26338 12248678 44428
## - remodel.Recent  1    499692 12722031 44648
## - gross.area      1    807192 13029531 44787
## - living.area     1    863450 13085790 44812
## - floors          1   1363363 13585703 45029
## - lot.size        1   2618080 14840419 45542
## 
## Step:  AIC=44417
## total.value ~ lot.size + gross.area + living.area + floors + 
##     rooms + remodel.Recent
## 
##                  Df Sum of Sq      RSS   AIC
## <none>                        12224906 44417
## + remodel.Old     1      2567 12222339 44418
## + bedrooms        1       455 12224451 44419
## - rooms           1     26709 12251615 44428
## - remodel.Recent  1    500432 12725338 44648
## - gross.area      1    813485 13038391 44789
## - living.area     1    861448 13086354 44810
## - floors          1   1370979 13595885 45032
## - lot.size        1   2617603 14842509 45541
summary(reg.step)  # Which variables were dropped?
## 
## Call:
## lm(formula = total.value ~ lot.size + gross.area + living.area + 
##     floors + rooms + remodel.Recent, data = df)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -295.6  -27.6   -0.3   26.1  300.9 
## 
## Coefficients:
##                 Estimate Std. Error t value             Pr(>|t|)    
## (Intercept)    45.695689   3.586374   12.74 < 0.0000000000000002 ***
## lot.size        0.008986   0.000255   35.23 < 0.0000000000000002 ***
## gross.area      0.032848   0.001673   19.64 < 0.0000000000000002 ***
## living.area     0.062596   0.003098   20.21 < 0.0000000000000002 ***
## floors         42.488048   1.666662   25.49 < 0.0000000000000002 ***
## rooms           2.182100   0.613252    3.56              0.00038 ***
## remodel.Recent 26.357106   1.711281   15.40 < 0.0000000000000002 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 45.9 on 5795 degrees of freedom
## Multiple R-squared:  0.786,  Adjusted R-squared:  0.786 
## F-statistic: 3.54e+03 on 6 and 5795 DF,  p-value: <0.0000000000000002
reg.step.pred <- predict(reg.step, valid.df)
accuracy(reg.step.pred, valid.df$total.value)
##              ME RMSE  MAE   MPE MAPE
## Test set -0.611 46.6 34.8 -1.43 9.06