LM 1 - Exercise 1.2

Author

Dr. Vytlacil

GETTING STARTED

Complete the following tasks:
- Save this .qmd file in the folder you have designated on your computer for LM1
- Download and save the WestRoxbury.csv file into the same location as this file

NOTE

1 - The code for each TABLE is written to stand alone: the data are read in fresh from the dataset at the start of each table, so you can run just that chunk without first running anything before it. (The exception is Tables 2.12-2.14, which must be run together.) As a result, the code may differ somewhat from the textbook.

2 - Also note that we will always read in the .csv file rather than use the `mlba` package from the textbook authors.

TABLE 2.3

Working with Files in R

Open the file and view metadata

# Load the West Roxbury housing data; the CSV must sit in the same folder as
# this file.
housing.df <- read.csv("WestRoxbury.csv")

# Report the dimensions as (rows, columns).
dim(housing.df)
[1] 5802   14
head(housing.df)
  TOTAL.VALUE  TAX LOT.SQFT YR.BUILT GROSS.AREA LIVING.AREA FLOORS ROOMS
1       344.2 4330     9965     1880       2436        1352      2     6
2       412.6 5190     6590     1945       3108        1976      2    10
3       330.1 4152     7500     1890       2294        1371      2     8
4       498.6 6272    13773     1957       5032        2608      1     9
5       331.5 4170     5000     1910       2370        1438      2     7
6       337.4 4244     5142     1950       2124        1060      1     6
  BEDROOMS FULL.BATH HALF.BATH KITCHEN FIREPLACE REMODEL
1        3         1         1       1         0    None
2        4         2         1       1         0  Recent
3        4         1         1       1         0    None
4        5         1         1       1         1    None
5        3         2         0       1         0    None
6        3         1         0       1         1     Old
#View(housing.df)

Practice showing different subsets of the data and some summary statistics.
(This is partial code displayed in text. You may want to add in all of the other code for practice.)

housing.df[1:10,1]
 [1] 344.2 412.6 330.1 498.6 331.5 337.4 359.4 320.4 333.5 409.4
housing.df[1:10, ]
   TOTAL.VALUE  TAX LOT.SQFT YR.BUILT GROSS.AREA LIVING.AREA FLOORS ROOMS
1        344.2 4330     9965     1880       2436        1352      2     6
2        412.6 5190     6590     1945       3108        1976      2    10
3        330.1 4152     7500     1890       2294        1371      2     8
4        498.6 6272    13773     1957       5032        2608      1     9
5        331.5 4170     5000     1910       2370        1438      2     7
6        337.4 4244     5142     1950       2124        1060      1     6
7        359.4 4521     5000     1954       3220        1916      2     7
8        320.4 4030    10000     1950       2208        1200      1     6
9        333.5 4195     6835     1958       2582        1092      1     5
10       409.4 5150     5093     1900       4818        2992      2     8
   BEDROOMS FULL.BATH HALF.BATH KITCHEN FIREPLACE REMODEL
1         3         1         1       1         0    None
2         4         2         1       1         0  Recent
3         4         1         1       1         0    None
4         5         1         1       1         1    None
5         3         2         0       1         0    None
6         3         1         0       1         1     Old
7         3         1         1       1         0    None
8         3         1         0       1         0    None
9         3         1         0       1         1  Recent
10        4         2         0       1         0    None
head(housing.df, 10)  #this is an alternative way to write code for the prior line -- you get same output
   TOTAL.VALUE  TAX LOT.SQFT YR.BUILT GROSS.AREA LIVING.AREA FLOORS ROOMS
1        344.2 4330     9965     1880       2436        1352      2     6
2        412.6 5190     6590     1945       3108        1976      2    10
3        330.1 4152     7500     1890       2294        1371      2     8
4        498.6 6272    13773     1957       5032        2608      1     9
5        331.5 4170     5000     1910       2370        1438      2     7
6        337.4 4244     5142     1950       2124        1060      1     6
7        359.4 4521     5000     1954       3220        1916      2     7
8        320.4 4030    10000     1950       2208        1200      1     6
9        333.5 4195     6835     1958       2582        1092      1     5
10       409.4 5150     5093     1900       4818        2992      2     8
   BEDROOMS FULL.BATH HALF.BATH KITCHEN FIREPLACE REMODEL
1         3         1         1       1         0    None
2         4         2         1       1         0  Recent
3         4         1         1       1         0    None
4         5         1         1       1         1    None
5         3         2         0       1         0    None
6         3         1         0       1         1     Old
7         3         1         1       1         0    None
8         3         1         0       1         0    None
9         3         1         0       1         1  Recent
10        4         2         0       1         0    None
housing.df[5, c(1:2, 4, 8:10)]
  TOTAL.VALUE  TAX YR.BUILT ROOMS BEDROOMS FULL.BATH
5       331.5 4170     1910     7        3         2
length(housing.df$TOTAL.VALUE)
[1] 5802
mean(housing.df$TOTAL.VALUE)
[1] 392.6857
round(mean(housing.df$TOTAL.VALUE),2)
[1] 392.69
round(mean(housing.df$TOTAL.VALUE),0)
[1] 393
summary(housing.df)
  TOTAL.VALUE          TAX           LOT.SQFT        YR.BUILT      GROSS.AREA  
 Min.   : 105.0   Min.   : 1320   Min.   :  997   Min.   :   0   Min.   : 821  
 1st Qu.: 325.1   1st Qu.: 4090   1st Qu.: 4772   1st Qu.:1920   1st Qu.:2347  
 Median : 375.9   Median : 4728   Median : 5683   Median :1935   Median :2700  
 Mean   : 392.7   Mean   : 4939   Mean   : 6278   Mean   :1937   Mean   :2925  
 3rd Qu.: 438.8   3rd Qu.: 5520   3rd Qu.: 7022   3rd Qu.:1955   3rd Qu.:3239  
 Max.   :1217.8   Max.   :15319   Max.   :46411   Max.   :2011   Max.   :8154  
  LIVING.AREA       FLOORS          ROOMS           BEDROOMS      FULL.BATH    
 Min.   : 504   Min.   :1.000   Min.   : 3.000   Min.   :1.00   Min.   :1.000  
 1st Qu.:1308   1st Qu.:1.000   1st Qu.: 6.000   1st Qu.:3.00   1st Qu.:1.000  
 Median :1548   Median :2.000   Median : 7.000   Median :3.00   Median :1.000  
 Mean   :1657   Mean   :1.684   Mean   : 6.995   Mean   :3.23   Mean   :1.297  
 3rd Qu.:1874   3rd Qu.:2.000   3rd Qu.: 8.000   3rd Qu.:4.00   3rd Qu.:2.000  
 Max.   :5289   Max.   :3.000   Max.   :14.000   Max.   :9.00   Max.   :5.000  
   HALF.BATH         KITCHEN        FIREPLACE        REMODEL         
 Min.   :0.0000   Min.   :1.000   Min.   :0.0000   Length:5802       
 1st Qu.:0.0000   1st Qu.:1.000   1st Qu.:0.0000   Class :character  
 Median :1.0000   Median :1.000   Median :1.0000   Mode  :character  
 Mean   :0.6139   Mean   :1.015   Mean   :0.7399                     
 3rd Qu.:1.0000   3rd Qu.:1.000   3rd Qu.:1.0000                     
 Max.   :3.0000   Max.   :2.000   Max.   :4.0000                     

TABLE 2.4

Sampling in R

Random sample of 5 observations

# Read the data fresh so this table's chunk runs on its own, without relying
# on a housing.df object left behind by an earlier chunk.
housing.df <- read.csv('WestRoxbury.csv')

# Draw 5 row names at random (no seed is set, so the rows differ on every
# run), then use them to pull the matching rows.
s <- sample(row.names(housing.df), 5)
housing.df[s, ]
     TOTAL.VALUE  TAX LOT.SQFT YR.BUILT GROSS.AREA LIVING.AREA FLOORS ROOMS
4447       612.6 7706     6150     2001       3667        2726    2.0    10
4814       351.3 4419     7255     1950       2448        1075    1.0     6
4488       268.5 3377     4000     1931       2134         912    1.0     5
2933       416.3 5237     6375     1895       4478        2588    2.0    11
3368       441.7 5556     6500     1927       3737        1934    1.5     5
     BEDROOMS FULL.BATH HALF.BATH KITCHEN FIREPLACE REMODEL
4447        4         2         2       1         1    None
4814        3         1         0       1         1    None
4488        2         1         0       1         0    None
2933        5         1         0       1         0    None
3368        3         2         0       1         1  Recent

Oversample houses with over 10 rooms

# Sampling weights: houses with more than 10 rooms get 0.9, all other houses
# get 0.01, so large houses are heavily over-represented in the draw.
s <- sample(
  row.names(housing.df),
  size = 5,
  prob = ifelse(housing.df$ROOMS > 10, 0.9, 0.01)
)
housing.df[s, ]
     TOTAL.VALUE  TAX LOT.SQFT YR.BUILT GROSS.AREA LIVING.AREA FLOORS ROOMS
5380       326.7 4109     8277     1950       2361        1123      1     8
4501       490.1 6165     9000     1939       4123        2285      2    11
1067       318.0 4000     6533     1960       2656        1109      1     6
394        326.0 4101     6194     1950       2816        1311      1     5
2874       645.7 8122    10569     1901       4760        2933      2    11
     BEDROOMS FULL.BATH HALF.BATH KITCHEN FIREPLACE REMODEL
5380        3         1         1       1         2    None
4501        4         1         1       1         1     Old
1067        2         1         1       1         1     Old
394         2         1         0       1         1    None
2874        5         3         0       1         1    None

Rebalance
(Note: you may need to install the `caret` package first.)

library(caret)
# Store REMODEL as a factor so its classes can be tabulated and rebalanced.
housing.df$REMODEL <- as.factor(housing.df$REMODEL)

# Class counts before rebalancing.
table(housing.df$REMODEL)

  None    Old Recent 
  4346    581    875 
# upSample() adds resampled rows of the smaller classes so that all classes
# end up the same size. With list = TRUE it returns a list; the `x` element
# holds the resampled data (REMODEL included, since it is a column of
# housing.df).
upsampled.df <- caret::upSample(
  x = housing.df,
  y = housing.df$REMODEL,
  list = TRUE
)[["x"]]

# Class counts after rebalancing.
table(upsampled.df$REMODEL)

  None    Old Recent 
  4346   4346   4346 
rm(housing.df, upsampled.df) #remove (clear out) datasets from the environment. we will recreate in the next step

TABLE 2.5

Reviewing Variables in R

Using the pipe operator to combine steps – read in the .csv and transform REMODEL to a factor all in one step, rather than in separate steps.
Read the pipe operator as “AND THEN”. For example, in this code, we would read this as create the housing.df dataframe by reading in the WestRoxbury.csv file AND THEN convert the REMODEL variable into a factor.

(Ctrl-Shift-m is shortcut to create pipe operator)

Note that `tidyverse` is a collection of R packages. The mutate() function is part of the `dplyr` package. We will use `dplyr` extensively in LM 2 for data manipulation and munging.

library(tidyverse)

# Read the CSV AND THEN (%>%) convert REMODEL from character to factor.
housing.df <- read.csv("WestRoxbury.csv") %>%
  mutate(REMODEL = as.factor(REMODEL))

# Confirm the factor levels and their counts.
table(housing.df$REMODEL)

  None    Old Recent 
  4346    581    875 
rm(housing.df)

TABLE 2.6

Creating Dummy Variable in R

In this section, we transform a categorical variable into dummy variables.
The first part replicates the textbook by creating only 2 dummy variables for a 3-level categorical variable AND deleting the original text variable. This is all you need to use the variables in a model – you just need to remember what level you excluded, because that is your reference level and important for interpretation.

I prefer keeping all three levels and the original variable in my dataset so that I have the option of which to use as my reference category. This is doable if there are few categoricals in the dataset.

We use the dummy_cols() function from the `fastDummies` package to do this. It’s fast and efficient.
I also include the traditional, longer ifelse() method of creating dummy variables as a contrast. See why dummy_cols() is more efficient?

library(fastDummies)

# Build dummy variables for the text column(s) of the freshly read data.
#   remove_first_dummy = TRUE       drops the first level's dummy (the
#                                   reference level)
#   remove_selected_columns = TRUE  drops the original text column
housing.df <- dummy_cols(
  read.csv("WestRoxbury.csv"),
  remove_first_dummy = TRUE,
  remove_selected_columns = TRUE
)

# Inspect the resulting column names and types.
str(housing.df)
'data.frame':   5802 obs. of  15 variables:
 $ TOTAL.VALUE   : num  344 413 330 499 332 ...
 $ TAX           : int  4330 5190 4152 6272 4170 4244 4521 4030 4195 5150 ...
 $ LOT.SQFT      : int  9965 6590 7500 13773 5000 5142 5000 10000 6835 5093 ...
 $ YR.BUILT      : int  1880 1945 1890 1957 1910 1950 1954 1950 1958 1900 ...
 $ GROSS.AREA    : int  2436 3108 2294 5032 2370 2124 3220 2208 2582 4818 ...
 $ LIVING.AREA   : int  1352 1976 1371 2608 1438 1060 1916 1200 1092 2992 ...
 $ FLOORS        : num  2 2 2 1 2 1 2 1 1 2 ...
 $ ROOMS         : int  6 10 8 9 7 6 7 6 5 8 ...
 $ BEDROOMS      : int  3 4 4 5 3 3 3 3 3 4 ...
 $ FULL.BATH     : int  1 2 1 1 2 1 1 1 1 2 ...
 $ HALF.BATH     : int  1 1 1 1 0 0 1 0 0 0 ...
 $ KITCHEN       : int  1 1 1 1 1 1 1 1 1 1 ...
 $ FIREPLACE     : int  0 0 0 1 0 1 0 0 1 0 ...
 $ REMODEL_Old   : int  0 0 0 0 0 1 0 0 0 0 ...
 $ REMODEL_Recent: int  0 1 0 0 0 0 0 0 1 0 ...
rm(housing.df)

# Repeat, but keep every dummy level and the original REMODEL column.
# Both remove_* options default to FALSE, so they are simply left out.
housing.df <- dummy_cols(.data = read.csv("WestRoxbury.csv"))
str(housing.df)
'data.frame':   5802 obs. of  17 variables:
 $ TOTAL.VALUE   : num  344 413 330 499 332 ...
 $ TAX           : int  4330 5190 4152 6272 4170 4244 4521 4030 4195 5150 ...
 $ LOT.SQFT      : int  9965 6590 7500 13773 5000 5142 5000 10000 6835 5093 ...
 $ YR.BUILT      : int  1880 1945 1890 1957 1910 1950 1954 1950 1958 1900 ...
 $ GROSS.AREA    : int  2436 3108 2294 5032 2370 2124 3220 2208 2582 4818 ...
 $ LIVING.AREA   : int  1352 1976 1371 2608 1438 1060 1916 1200 1092 2992 ...
 $ FLOORS        : num  2 2 2 1 2 1 2 1 1 2 ...
 $ ROOMS         : int  6 10 8 9 7 6 7 6 5 8 ...
 $ BEDROOMS      : int  3 4 4 5 3 3 3 3 3 4 ...
 $ FULL.BATH     : int  1 2 1 1 2 1 1 1 1 2 ...
 $ HALF.BATH     : int  1 1 1 1 0 0 1 0 0 0 ...
 $ KITCHEN       : int  1 1 1 1 1 1 1 1 1 1 ...
 $ FIREPLACE     : int  0 0 0 1 0 1 0 0 1 0 ...
 $ REMODEL       : chr  "None" "Recent" "None" "None" ...
 $ REMODEL_None  : int  1 0 1 1 1 0 1 1 0 1 ...
 $ REMODEL_Old   : int  0 0 0 0 0 1 0 0 0 0 ...
 $ REMODEL_Recent: int  0 1 0 0 0 0 0 0 1 0 ...
rm(housing.df)

# Create the dummy variables by hand with ifelse() (the long, tedious method :))
housing.df <- read.csv("WestRoxbury.csv")

# Level counts of REMODEL; the dummy column sums should match these.
table(housing.df$REMODEL)

  None    Old Recent 
  4346    581    875 
# One 0/1 indicator per REMODEL level. as.numeric() turns the TRUE/FALSE
# comparison into 1/0 stored as doubles.
housing.df$REMODEL_None   <- as.numeric(housing.df$REMODEL == "None")
housing.df$REMODEL_Old    <- as.numeric(housing.df$REMODEL == "Old")
housing.df$REMODEL_Recent <- as.numeric(housing.df$REMODEL == "Recent")

# Inspect the three new columns alongside the original variables.
str(housing.df)
'data.frame':   5802 obs. of  17 variables:
 $ TOTAL.VALUE   : num  344 413 330 499 332 ...
 $ TAX           : int  4330 5190 4152 6272 4170 4244 4521 4030 4195 5150 ...
 $ LOT.SQFT      : int  9965 6590 7500 13773 5000 5142 5000 10000 6835 5093 ...
 $ YR.BUILT      : int  1880 1945 1890 1957 1910 1950 1954 1950 1958 1900 ...
 $ GROSS.AREA    : int  2436 3108 2294 5032 2370 2124 3220 2208 2582 4818 ...
 $ LIVING.AREA   : int  1352 1976 1371 2608 1438 1060 1916 1200 1092 2992 ...
 $ FLOORS        : num  2 2 2 1 2 1 2 1 1 2 ...
 $ ROOMS         : int  6 10 8 9 7 6 7 6 5 8 ...
 $ BEDROOMS      : int  3 4 4 5 3 3 3 3 3 4 ...
 $ FULL.BATH     : int  1 2 1 1 2 1 1 1 1 2 ...
 $ HALF.BATH     : int  1 1 1 1 0 0 1 0 0 0 ...
 $ KITCHEN       : int  1 1 1 1 1 1 1 1 1 1 ...
 $ FIREPLACE     : int  0 0 0 1 0 1 0 0 1 0 ...
 $ REMODEL       : chr  "None" "Recent" "None" "None" ...
 $ REMODEL_None  : num  1 0 1 1 1 0 1 1 0 1 ...
 $ REMODEL_Old   : num  0 0 0 0 0 1 0 0 0 0 ...
 $ REMODEL_Recent: num  0 1 0 0 0 0 0 0 1 0 ...
# check your work
sum(housing.df$REMODEL_None)
[1] 4346
sum(housing.df$REMODEL_Old)
[1] 581
sum(housing.df$REMODEL_Recent)
[1] 875
rm(housing.df)

TABLE 2.7

Imputing Missing Data in R

In this section we want to deal with missing values.
It turns out that our original dataset has no missing values. So, we have to run some code to randomly create some missing values so that we can practice dealing with them.

# Load the packages this table relies on so the chunk can run on its own:
# fastDummies provides dummy_cols(); tidyverse provides %>% and replace_na().
library(tidyverse)
library(fastDummies)

# Read the data fresh and add the REMODEL dummy columns.
housing.df <- dummy_cols(read.csv('WestRoxbury.csv'))

# check for missing values in our dataset
nbrna <- sum(is.na(housing.df$BEDROOMS))
cat("There are",nbrna,"observations with missing values for BEDROOMS. \n")
There are 0 observations with missing values for BEDROOMS. 
# Pick 10 rows at random and blank out their BEDROOMS value so there is
# something to impute.
rows.to.missing <- sample(row.names(housing.df), size = 10)
housing.df[rows.to.missing, "BEDROOMS"] <- NA

# Recount the missing values.
nbrna <- sum(is.na(housing.df[["BEDROOMS"]]))
cat("There are",nbrna,"observations with missing values for BEDROOMS. \n\n")
There are 10 observations with missing values for BEDROOMS. 
summary(housing.df$BEDROOMS)
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
  1.000   3.000   3.000   3.231   4.000   9.000      10 
# Impute: fill each missing BEDROOMS value with the median of the observed
# (non-missing) values.
housing.df <- replace_na(
  housing.df,
  list(BEDROOMS = median(housing.df$BEDROOMS, na.rm = TRUE))
)

# Recount the missing values after imputation.
nbrna <- sum(is.na(housing.df[["BEDROOMS"]]))
cat("There are",nbrna,"observations with missing values for BEDROOMS. \n\n")
There are 0 observations with missing values for BEDROOMS. 
summary(housing.df$BEDROOMS)
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
   1.00    3.00    3.00    3.23    4.00    9.00 
# Clean up the environment by name: remove the objects this table created.
# (bedroom_median was listed here but is never created, which made rm() warn
# that the object was not found; nbrna is the counter that is created.)
rm(housing.df, rows.to.missing, nbrna)

TABLE 2.9

Data Partitioning in R

The text walks you through manual creation of partitions using random sampling.
This is useful to understand.
However, we will typically use the easier method from the `caret` package as shown in the last section of the code.
In this example, we partition into 60% training and 40% holdout sets.

library(caret)
library(tidyverse)  # provides %>% and mutate(), so this chunk runs on its own

# Read the data fresh and store REMODEL as a factor.
housing.df <- read.csv('WestRoxbury.csv') %>% 
    mutate(REMODEL=factor(REMODEL))

# Fix the seed so the random split is reproducible.
set.seed(1)
# Row indices for the 60% training partition; list=FALSE returns them as a
# matrix rather than a list, so they can index rows directly.
index <- caret::createDataPartition(housing.df$TOTAL.VALUE, p=0.6, list=FALSE)
train.df <- housing.df[index, ]
holdout.df <- housing.df[-index, ]  # every row not selected for training

rows <- nrow(housing.df)  # nrow() counts number of observations
cat("The original dataset has",rows,"observations. \n\n")
The original dataset has 5802 observations. 
trows <- nrow(train.df) 
cat("The 60% training partition has",trows,"observations. \n\n")
The 60% training partition has 3483 observations. 
hrows <- nrow(holdout.df) 
cat("The 40% holdout partition has",hrows,"observations. \n\n")
The 40% holdout partition has 2319 observations. 
rm(list=ls())  #clear out environment globally -- everything :)

TABLE 2.11

Cleaning and Preprocessing Data

This is just a small taste of preprocessing. We will do much more in Week 2.
In this textbook example, the instructions are to keep only observations without missing values, remove the TAX variable, convert the REMODEL variable from a character to a factor variable and create categorical dummies for the second and third levels.
Use `dplyr` from the `tidyverse` to do this all in one step!

library(tidyverse)
library(fastDummies)

# One pipeline: drop rows with missing values, remove TAX, make REMODEL a
# factor, then replace REMODEL with dummies for every level except the first.
housing.df <- read.csv("WestRoxbury.csv") %>%
  drop_na() %>%
  select(-TAX) %>%
  mutate(REMODEL = as.factor(REMODEL)) %>%
  dummy_cols(
    select_columns = "REMODEL",
    remove_first_dummy = TRUE,
    remove_selected_columns = TRUE
  )

rm(list=ls()) 

TABLE 2.12

Training a Regression Model and Generating Predictions (Fitted Values) for the Training Data

We will do more with regression and scoring in Week 7. 

library(tidyverse)
library(fastDummies)

# Preprocess: drop rows with missing values, remove TAX, and replace REMODEL
# with dummies for its second and third levels.
housing.df <- read.csv("WestRoxbury.csv") %>%
  drop_na() %>%
  select(-TAX) %>%
  mutate(REMODEL = as.factor(REMODEL)) %>%
  dummy_cols(
    select_columns = "REMODEL",
    remove_first_dummy = TRUE,
    remove_selected_columns = TRUE
  )

# Partition the dataset: 60% training, 40% holdout. The seed makes the split
# reproducible.
set.seed(1)
index <- caret::createDataPartition(
  y = housing.df$TOTAL.VALUE,
  p = 0.6,
  list = FALSE
)
train.df <- housing.df[index, ]
holdout.df <- housing.df[-index, ]

# Train a regression of TOTAL.VALUE on all remaining columns, using the
# training rows only.
reg <- lm(TOTAL.VALUE ~ ., data = train.df)
summary(reg)  # show the regression results

Call:
lm(formula = TOTAL.VALUE ~ ., data = train.df)

Residuals:
    Min      1Q  Median      3Q     Max 
-262.55  -26.18   -0.15   24.70  230.40 

Coefficients:
                 Estimate Std. Error t value Pr(>|t|)    
(Intercept)    -4.314e+01  3.694e+01  -1.168  0.24287    
LOT.SQFT        8.322e-03  2.959e-04  28.125  < 2e-16 ***
YR.BUILT        5.068e-02  1.835e-02   2.762  0.00577 ** 
GROSS.AREA      3.260e-02  2.057e-03  15.853  < 2e-16 ***
LIVING.AREA     5.221e-02  3.762e-03  13.880  < 2e-16 ***
FLOORS          3.951e+01  2.102e+00  18.795  < 2e-16 ***
ROOMS           6.715e-01  8.222e-01   0.817  0.41420    
BEDROOMS       -2.728e+00  1.244e+00  -2.194  0.02831 *  
FULL.BATH       1.959e+01  1.688e+00  11.610  < 2e-16 ***
HALF.BATH       1.931e+01  1.560e+00  12.382  < 2e-16 ***
KITCHEN        -1.441e+01  5.876e+00  -2.453  0.01422 *  
FIREPLACE       1.919e+01  1.352e+00  14.200  < 2e-16 ***
REMODEL_Old     7.241e+00  2.443e+00   2.964  0.00306 ** 
REMODEL_Recent  2.661e+01  2.132e+00  12.481  < 2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 42.23 on 3469 degrees of freedom
Multiple R-squared:  0.8221,    Adjusted R-squared:  0.8214 
F-statistic:  1233 on 13 and 3469 DF,  p-value: < 2.2e-16
# Collect the actual values, fitted values and residuals for the training rows
# in one data frame, using the standard accessor functions for the model.
train.res <- data.frame(
  actual    = train.df$TOTAL.VALUE,
  predicted = fitted(reg),
  residuals = residuals(reg)
)
head(train.res, n = 10)  # look at the first 10 observations
   actual predicted  residuals
1   344.2  384.4206 -40.220638
4   498.6  546.4628 -47.862759
5   331.5  347.9170 -16.417031
12  344.5  380.4297 -35.929727
13  315.5  313.1879   2.312083
15  326.2  345.3751 -19.175064
17  313.1  357.7081 -44.608120
18  344.9  362.4810 -17.581043
20  348.0  385.2961 -37.296121
21  317.5  281.8739  35.626109
# not in textbook -- scatterplot of actuals x predicted
plot(train.res$actual, train.res$predicted)

TABLE 2.13

Generating Predictions for the Holdout Data

Make sure to run code for TABLE 2.12 first.

# Score the holdout rows with the model fitted on the training partition.
pred <- predict(reg, newdata = holdout.df)

# Residual = actual - predicted.
holdout.res <- data.frame(
  actual    = holdout.df$TOTAL.VALUE,
  predicted = pred,
  residuals = holdout.df$TOTAL.VALUE - pred
)
head(holdout.res, n = 10)  # look at the first 10 observations
   actual predicted  residuals
2   412.6  460.2777 -47.677744
3   330.1  359.3920 -29.291958
6   337.4  290.0277  47.372303
7   359.4  402.5332 -43.133242
8   320.4  314.0683   6.331652
9   333.5  339.8206  -6.320582
10  409.4  507.0732 -97.673151
11  313.0  359.9514 -46.951416
14  575.0  572.7488   2.251179
16  298.2  272.1006  26.099410
plot(holdout.res$actual, holdout.res$predicted)

TABLE 2.14

Prediction Error Metrics for Training and Holdout Data ($000)

Make sure to run code for TABLE 2.12 and TABLE 2.13 first.

library(caret)

# Training partition: mean error (rounded to 5 decimals), root mean squared
# error and mean absolute error, shown as a one-row data frame.
data.frame(
  ME   = round(mean(train.res$residuals), digits = 5),
  RMSE = RMSE(obs = train.res$actual, pred = train.res$predicted),
  MAE  = MAE(obs = train.res$actual, pred = train.res$predicted)
)
  ME     RMSE      MAE
1  0 42.14665 31.98717
# Holdout partition: mean error (rounded to 5 decimals), root mean squared
# error and mean absolute error, shown as a one-row data frame.
data.frame(
  ME   = round(mean(holdout.res$residuals), digits = 5),
  RMSE = RMSE(obs = holdout.res$actual, pred = holdout.res$predicted),
  MAE  = MAE(obs = holdout.res$actual, pred = holdout.res$predicted)
)
        ME     RMSE      MAE
1 -1.04237 43.90381 33.05476

Final Clean Up

rm(list=ls())