# This is a data set about houses in Boston.
# Our purpose here is to predict a new house's value using existing houses
# in the data set.
# Prediction task: The target variable is TOTAL.VALUE and it is numerical.
# Read the data set:
housing.df <- read.csv("WestRoxbury.csv") # load the data into memory
View(housing.df)
str(housing.df)
## 'data.frame': 5802 obs. of 14 variables:
## $ TOTAL.VALUE: num 344 413 330 499 332 ...
## $ TAX : int 4330 5190 4152 6272 4170 4244 4521 4030 4195 5150 ...
## $ LOT.SQFT : int 9965 6590 7500 13773 5000 5142 5000 10000 6835 5093 ...
## $ YR.BUILT : int 1880 1945 1890 1957 1910 1950 1954 1950 1958 1900 ...
## $ GROSS.AREA : int 2436 3108 2294 5032 2370 2124 3220 2208 2582 4818 ...
## $ LIVING.AREA: int 1352 1976 1371 2608 1438 1060 1916 1200 1092 2992 ...
## $ FLOORS : num 2 2 2 1 2 1 2 1 1 2 ...
## $ ROOMS : int 6 10 8 9 7 6 7 6 5 8 ...
## $ BEDROOMS : int 3 4 4 5 3 3 3 3 3 4 ...
## $ FULL.BATH : int 1 2 1 1 2 1 1 1 1 2 ...
## $ HALF.BATH : int 1 1 1 1 0 0 1 0 0 0 ...
## $ KITCHEN : int 1 1 1 1 1 1 1 1 1 1 ...
## $ FIREPLACE : int 0 0 0 1 0 1 0 0 1 0 ...
## $ REMODEL : chr "None" "Recent" "None" "None" ...
#----------------------------------------------------------------------------
# 1) Data understanding
#----------------------------------------------------------------------------
# Understand the data: Review each column and make changes if needed
# If the text data is read as "chr" by read.csv, it means it is a text type.
# Character is good for longer text (not categorical) such as first, last names
# and feedback of customers.
# For categorical data, the proper text type is "factor".
# read.csv() function may read all text data as "chr" or "factor" depending on default settings.
# If a categorical data is read as "chr" then you should convert it to factor type
# using function as.factor()
str(housing.df) # show the type of all columns
## 'data.frame': 5802 obs. of 14 variables:
## $ TOTAL.VALUE: num 344 413 330 499 332 ...
## $ TAX : int 4330 5190 4152 6272 4170 4244 4521 4030 4195 5150 ...
## $ LOT.SQFT : int 9965 6590 7500 13773 5000 5142 5000 10000 6835 5093 ...
## $ YR.BUILT : int 1880 1945 1890 1957 1910 1950 1954 1950 1958 1900 ...
## $ GROSS.AREA : int 2436 3108 2294 5032 2370 2124 3220 2208 2582 4818 ...
## $ LIVING.AREA: int 1352 1976 1371 2608 1438 1060 1916 1200 1092 2992 ...
## $ FLOORS : num 2 2 2 1 2 1 2 1 1 2 ...
## $ ROOMS : int 6 10 8 9 7 6 7 6 5 8 ...
## $ BEDROOMS : int 3 4 4 5 3 3 3 3 3 4 ...
## $ FULL.BATH : int 1 2 1 1 2 1 1 1 1 2 ...
## $ HALF.BATH : int 1 1 1 1 0 0 1 0 0 0 ...
## $ KITCHEN : int 1 1 1 1 1 1 1 1 1 1 ...
## $ FIREPLACE : int 0 0 0 1 0 1 0 0 1 0 ...
## $ REMODEL : chr "None" "Recent" "None" "None" ...
# remodel variable is read as character type when it is actually categorical
table(housing.df$REMODEL)
##
## None Old Recent
## 4346 581 875
housing.df$REMODEL = as.factor(housing.df$REMODEL)
# housing.df$REMODEL = as.factor(housing.df$REMODEL, levels=c("None", "Old", "Recent")). # if it was ordinal
# 2) Identify outliers, missing values and other irregularities
# use summary() function
summary(housing.df)
## TOTAL.VALUE TAX LOT.SQFT YR.BUILT GROSS.AREA
## Min. : 105.0 Min. : 1320 Min. : 997 Min. : 0 Min. : 821
## 1st Qu.: 325.1 1st Qu.: 4090 1st Qu.: 4772 1st Qu.:1920 1st Qu.:2347
## Median : 375.9 Median : 4728 Median : 5683 Median :1935 Median :2700
## Mean : 392.7 Mean : 4939 Mean : 6278 Mean :1937 Mean :2925
## 3rd Qu.: 438.8 3rd Qu.: 5520 3rd Qu.: 7022 3rd Qu.:1955 3rd Qu.:3239
## Max. :1217.8 Max. :15319 Max. :46411 Max. :2011 Max. :8154
## LIVING.AREA FLOORS ROOMS BEDROOMS FULL.BATH
## Min. : 504 Min. :1.000 Min. : 3.000 Min. :1.00 Min. :1.000
## 1st Qu.:1308 1st Qu.:1.000 1st Qu.: 6.000 1st Qu.:3.00 1st Qu.:1.000
## Median :1548 Median :2.000 Median : 7.000 Median :3.00 Median :1.000
## Mean :1657 Mean :1.684 Mean : 6.995 Mean :3.23 Mean :1.297
## 3rd Qu.:1874 3rd Qu.:2.000 3rd Qu.: 8.000 3rd Qu.:4.00 3rd Qu.:2.000
## Max. :5289 Max. :3.000 Max. :14.000 Max. :9.00 Max. :5.000
## HALF.BATH KITCHEN FIREPLACE REMODEL
## Min. :0.0000 Min. :1.000 Min. :0.0000 None :4346
## 1st Qu.:0.0000 1st Qu.:1.000 1st Qu.:0.0000 Old : 581
## Median :1.0000 Median :1.000 Median :1.0000 Recent: 875
## Mean :0.6139 Mean :1.015 Mean :0.7399
## 3rd Qu.:1.0000 3rd Qu.:1.000 3rd Qu.:1.0000
## Max. :3.0000 Max. :2.000 Max. :4.0000
# If you want a visualization, use plot() or boxplot() function.
plot(housing.df$LOT.SQFT)

# Delete decision depends on the possible impact of these outliers in
# the prediction model.
boxplot(housing.df$LOT.SQFT)

#-----------------
# We are concerned if tax is a mathematical function of total value.
# We need to check this either visually, or numerically
# 99, 100 percent correlation will indicate such relationship
plot(housing.df$TAX, housing.df$TOTAL.VALUE)

cor(housing.df$TAX, housing.df$TOTAL.VALUE)
## [1] 1
# We identified that tax is a derived variable of total value.
# We cannot use it for predicting total value. This column is out.
housing.df$TAX <- NULL # delete TAX column
#-----------------
# We spotted a zero for the YR.BUILT variable. What is the extent of this typo?
sum(housing.df$YR.BUILT == 0)
## [1] 1
housing.df[housing.df$YR.BUILT == 0, ] # exact row number: 1493
## TOTAL.VALUE LOT.SQFT YR.BUILT GROSS.AREA LIVING.AREA FLOORS ROOMS BEDROOMS
## 1493 569.2 7000 0 4732 2641 2 8 4
## FULL.BATH HALF.BATH KITCHEN FIREPLACE REMODEL
## 1493 2 1 1 1 Recent
# Delete or impute? Since we have a large dataset,
# we decide to delete the entire row.
housing.df = housing.df[housing.df$YR.BUILT != 0, ]
#-----------------
# Check for missing values in the dataset
sum(is.na(housing.df))
## [1] 0
#----------------------------------------------------------------------------
# 2) Selecting the appropriate method
#----------------------------------------------------------------------------
# Our target (response) variable is Total Value
# The remaining variables are possible predictors
# Type of the outcome variable: Numeric
# Type of the predictors : Numeric except REMODEL variable
# Functions lm() or glm() will be used in our regression analysis and they handle
# categorical variables by creating dummies automatically.
# Why do we do regression analysis?
# 1) Prediction purposes: Business analytics professionals use regression as a prediction method.
# 2) Understanding relationship between predictors and the outcome variable. This is
# mostly used by scientists/economists doing research.
# The impact of the regressions coefficients (ie. average impact of each variable)
# on the target variable is the focus rather than predictions using the model.
# For example, the following conclusions can be made after the regression
# model is run:
# - Total value increases by 0.01977 thousand dollars on average
# for every square foot increase in lot sqrft.
# - Total value decreases by 0.294 dollars on average
# in each year.
# Typical multiple predictor linear regression model looks like this:
# Y = B0 + B1X1 + B2X2 + B3X3 +... BnXn + e
# Y: Actual value of the numerical target variable
# Bi: Regression coefficients that need to be calculated/estimated using Ordinary Least Squares method
# Xi: Predictors
# e : error term (actual - prediction)
# Each row in the dataset is one point of Y and Xs
# When all Bi are calculated, we have a prediction model.
# To make a prediction, we plug in a new dataset of X values in the model
# and calculate Y.
#----------------------------------------------------------------------------
# 3) Selecting the predictors
#----------------------------------------------------------------------------
# The next challenge is selecting the best predictors for the prediction job.
# We tend to add ALL possible columns as predictors. However, there are
# some problems with this approach, such as:
# a) When we have too many predictors, we are required to collect future data
# for each of these predictors which may be expensive or unavailable or
# may contain missing data.
# b) Predictors may highly correlate with each other which leads to "multicolinearity"
# problem. If this is the case, coefficient (Bi) estimations will unreliable.
# c) The model becomes too complex to understand and explain.
# Predictor selection methods:
# 1) We can get assistance from an expert in the domain of the dataset
# to tell us which predictors are most important to predict the outcome.
# 2) Look at the correlations between the outcome variable and the predictors
# Include the ones that are highly correlated.(caution: correlation does not imply causation)
# Watch for extreme predictor to predictor correlations (>85%). This indicates a multicolinearity problem.
# 3) Also, review the scatter plot matrix to see relationships.
# 4) You can use automated search procedures trying each predictor in and out of the model,
# such as "Stepwise Regression".
# 5) You can use advanced regression methods such as "Ridge Regression" to overcome the multicolinearity,
# and Principle Component Analysis (PCA) to reduce the number of predictors.
# Examples:
# Visualizations to see the pairwise relationships between Y and X
# Let's consider the first 9 variables
plot( housing.df[ , 1:9] )

# When you examine these plots, consider two things:
# 1) Outcome variable vs. Predictors: Look for an upward or
# downward trend. A plot that looks like flat indicates
# no or week relationship/association.
plot(housing.df$TOTAL.VALUE ~ housing.df$YR.BUILT )

# 2) Predictor vs Predictor: An upward or downward trend may indicate
# an undesired phenomenon called "Multicollinearity", which is
# the occurrence of high intercorrelations among two or more independent variables
# in a multiple regression model. It will make your Bi coefficients and the model
# in general less reliable. In other words, you have two variables carrying
# almost the same information, one needs to go, or advanced techniques needed
# to overcome the problem.
plot(housing.df$GROSS.AREA ~ housing.df$LIVING.AREA )

# How about correlation coefficients to see the extents of association
# between two variables?
cor(housing.df$GROSS.AREA, housing.df$LIVING.AREA) # 0.899775
## [1] 0.8997145
# Almost 90% corelation indicates
# possible multicolinearity problem
# if we use GROSS.AREA and LIVING.AREA
# in the same regression model.
# We should use only one of them
# to avoid multicolinearity.
cor(housing.df[ , -13])
## TOTAL.VALUE LOT.SQFT YR.BUILT GROSS.AREA LIVING.AREA
## TOTAL.VALUE 1.00000000 0.54619300 -0.11931299 0.80039842 0.83702952
## LOT.SQFT 0.54619300 1.00000000 -0.09383607 0.44894881 0.42608464
## YR.BUILT -0.11931299 -0.09383607 1.00000000 -0.21058904 -0.16169317
## GROSS.AREA 0.80039842 0.44894881 -0.21058904 1.00000000 0.89971454
## LIVING.AREA 0.83702952 0.42608464 -0.16169317 0.89971454 1.00000000
## FLOORS 0.48145729 0.07363251 -0.25984039 0.30053682 0.47575801
## ROOMS 0.63852581 0.30837700 -0.19531338 0.65151674 0.72068797
## BEDROOMS 0.56178578 0.25408289 -0.17238687 0.57171745 0.64098415
## FULL.BATH 0.43257938 0.20128716 0.12184425 0.41947621 0.43775776
## HALF.BATH 0.34805610 0.13496880 0.09524851 0.22652023 0.30097079
## KITCHEN 0.01830795 0.04453121 0.07197970 0.03055579 0.08288777
## FIREPLACE 0.35853027 0.18186166 0.12931788 0.27002039 0.26209405
## FLOORS ROOMS BEDROOMS FULL.BATH HALF.BATH
## TOTAL.VALUE 0.48145729 0.6385258 0.5617858 0.4325794 0.34805610
## LOT.SQFT 0.07363251 0.3083770 0.2540829 0.2012872 0.13496880
## YR.BUILT -0.25984039 -0.1953134 -0.1723869 0.1218443 0.09524851
## GROSS.AREA 0.30053682 0.6515167 0.5717175 0.4194762 0.22652023
## LIVING.AREA 0.47575801 0.7206880 0.6409842 0.4377578 0.30097079
## FLOORS 1.00000000 0.4328077 0.4311803 0.1120233 0.31608155
## ROOMS 0.43280773 1.0000000 0.7106644 0.3781867 0.28259207
## BEDROOMS 0.43118035 0.7106644 1.0000000 0.3324848 0.25676877
## FULL.BATH 0.11202329 0.3781867 0.3324848 1.0000000 -0.13082278
## HALF.BATH 0.31608155 0.2825921 0.2567688 -0.1308228 1.00000000
## KITCHEN -0.11459160 0.1292438 0.0853788 0.1467023 -0.02005600
## FIREPLACE 0.12045725 0.2051795 0.1643225 0.1400776 0.17618782
## KITCHEN FIREPLACE
## TOTAL.VALUE 0.018307952 0.358530267
## LOT.SQFT 0.044531210 0.181861664
## YR.BUILT 0.071979698 0.129317877
## GROSS.AREA 0.030555787 0.270020387
## LIVING.AREA 0.082887769 0.262094046
## FLOORS -0.114591595 0.120457249
## ROOMS 0.129243816 0.205179502
## BEDROOMS 0.085378798 0.164322507
## FULL.BATH 0.146702302 0.140077572
## HALF.BATH -0.020055997 0.176187819
## KITCHEN 1.000000000 -0.009552017
## FIREPLACE -0.009552017 1.000000000
# Let's start with the following initial regression model:
# TOTAL VALUE = B0 + B1 * LOT SQRFT + B2 * YR BUILT + B3 * LIVING AREA + B4 * ROOMS
#----------------------------------------------------------------------------
# 4) Partition the data set into "Training" and "Validation"
#----------------------------------------------------------------------------
# We need to decide the size of the these partitions. There is no fixed rule for this
# but it is a general practice to use 70% or 60% for training and remaining for the validation.
# The data in these partitions must be randomly selected. However, we use a seed number
# for the random numbers. Same seed results in the same set of random numbers. This method ensures that
# others can do the same analysis and get the same exact model as ours.
# We decide that training partition will be 70%, validation partition will be 30% and
# seed number is arbitrarily decided as 2023.
set.seed(2023)
total.rows = dim(housing.df)[1]
train.rows <- sample( c(1:total.rows), total.rows*0.7 )
train.data <- housing.df[train.rows, ]
valid.data <- housing.df[-train.rows, ]
#----------------------------------------------------------------------------
# 5) Create the model and run
#----------------------------------------------------------------------------
# We will use linear Model lm() function to create a linear regression model
# modelname = lm(response variable ~ predictor variables separated by "+" sign, data=training partition data)
reg1 = lm( TOTAL.VALUE ~ LOT.SQFT + YR.BUILT + LIVING.AREA + ROOMS, data = train.data)
summary(reg1)
##
## Call:
## lm(formula = TOTAL.VALUE ~ LOT.SQFT + YR.BUILT + LIVING.AREA +
## ROOMS, data = train.data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -274.179 -28.130 1.121 28.659 234.558
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.503e+02 6.237e+01 -2.410 0.016 *
## LOT.SQFT 8.084e-03 3.172e-04 25.490 < 2e-16 ***
## YR.BUILT 1.271e-01 3.171e-02 4.008 6.23e-05 ***
## LIVING.AREA 1.266e-01 2.197e-03 57.644 < 2e-16 ***
## ROOMS 5.099e+00 8.027e-01 6.352 2.35e-10 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 50.35 on 4055 degrees of freedom
## Multiple R-squared: 0.7464, Adjusted R-squared: 0.7462
## F-statistic: 2984 on 4 and 4055 DF, p-value: < 2.2e-16
# Check the normality condition of the residuals (ie. prediction errors):
# residuals = errors = e = actual - predicted
# Visually check if the distribution of the residuals are normally distributed
# 1) scatter plot or 2) histogram
plot(reg1$residuals)

hist(reg1$residuals)
# It seems the residuals are nearly normal. Normality assumption is confirmed.
# This rule is not very restrictive when it comes to predictions. It is important
# when doing scientific research investigating impact of predictors on the target.
# The report generated by the regression model shows us the significance of
# each predictor. We are looking for at least 10% significance (P-values)
# In reg1, all predictors are significant!
#----------------------------------------------------------------------------
# 6) Validate the model
#----------------------------------------------------------------------------
# Since we have the prediction model, now it is time to
# validate it on the validation partition
# Procedure:
# 1) use the model and predict TOTAL.VALUE in the
# validation partition.
# 2) Compare actual values in validation
# to the predicted values. How different they are overall?
# There are measures for this based on error = actual - predicted
# ME = mean(actual - fitted) Mean Error
# MSE = mean((actual - fitted)^2) Mean Squared Error
# RMSE = SQRT(mean((actual - fitted)^2)) Root Mean Squared Error
# MAE = mean(ABS(actual - fitted)) Mean Absolute Error or Mean Absolute Deviation (MAD)
# MPE = mean((actual - fitted) / actual)) x 100 Mean Percentage Error
# MAPE = mean(ABS(actual - fitted) / actual)) x 100 Mean Absolute Percentage Error
# ME:
# Errors are sometimes positive and sometimes are negative. When you add them up
# and get the mean, they cancel each other off. Normally, you should get a very small
# ME value. If not -> It means you are systematically under or over predicting on average.
# RMSE: This is one of the most important performance measures. It should be a small number.
# In other words, choose the technique with the lowest RMSE.
# Let's make predictions on the validation data set.
# Use predict()
pred1 = predict(reg1, newdata = valid.data) # predicting the TOTAL.VALUE using validation
# At this point, we need another package to calculate accuracy measures
# It is called "forecast"
# install.packages("forecast")
library(forecast)
## Warning: package 'forecast' was built under R version 4.3.3
## Registered S3 method overwritten by 'quantmod':
## method from
## as.zoo.data.frame zoo

# Now let's use function accuracy() from the "forecast" library
# accuracy(predicted target values, actual target values)
accuracy(pred1, valid.data$TOTAL.VALUE)
## ME RMSE MAE MPE MAPE
## Test set 1.804857 48.61909 37.16499 -1.155124 9.712792
# Accuracy of reg1 model on the validation dataset:
# ME RMSE MAE MPE MAPE
# Test set -0.855534 117.1603 83.86987 -4.82824 21.40145
# Accuracy of reg1 model on the training dataset:
pred2 = predict(reg1, newdata = train.data) # predicting the TOTAL.VALUE using validation
accuracy(pred2, train.data$TOTAL.VALUE)
## ME RMSE MAE MPE MAPE
## Test set 2.26866e-12 50.31835 37.74072 -1.603303 9.803901
# ME RMSE MAE MPE MAPE
# Test set 5.00883e-14 50.31835 37.74072 -1.603303 9.803901
# Compare the accuracy measures in the training and validation
#----------------------------------------------------------------------------
# 6) Scoring new data: Use the model to make prediction on a new data
#----------------------------------------------------------------------------
# A new property has arrived, with the following information:
# LOT.SQFT = 10000,
# YR.BUILT = 1920,
# GROSS.AREA = 2500,
# LIVING.AREA = 1500,
# FLOORS = 2,
# ROOMS = 8,
# BEDROOMS = 4,
# FULL.BATH = 2,
# HALF.BATH = 1,
# KITCHEN = 1,
# FIREPLACE = 1,
# REMODEL = "Old"
# reg1 = lm(TOTAL.VALUE ~ LOT.SQFT + YR.BUILT + LIVING.AREA + ROOMS, data = train.data )
new.property = data.frame(
LOT.SQFT = 10000,
YR.BUILT = 1920,
GROSS.AREA = 2500,
LIVING.AREA = 1500,
FLOORS = 2,
ROOMS = 8,
BEDROOMS = 4,
FULL.BATH = 2,
HALF.BATH = 1,
KITCHEN = 1,
FIREPLACE = 1,
REMODEL = "Old"
)
predict(reg1, newdata = new.property )
## 1
## 405.2919
# The predicted TOTAL.VALUE is 408.4399
#----------------------------------------------------------------------------
# 7) Summary
#----------------------------------------------------------------------------
# Linear regression models are very popular tools, not only for explanatory modeling,
# but also for prediction. A good predictive model has high predictive accuracy
# (to a useful practical level). Predictive models are fit to training data, and
# predictive accuracy is evaluated on a separate validation data set.
# Removing redundant predictors is key to achieving predictive accuracy and robustness.
# Trying different predictor subsets can help find “good” predictive models.
# Each alternative model should be run and assessed. After selecting the best model,
# it is used for predicting new data.