setwd("C:/Users/12267/Desktop/UWindsor/Winter 2021/MSCI 3230 Data Science Tools & Methods/RSTUDIO Work")
######### List all Libraries used
library(ggplot2)
library(gplots)
##
## Attaching package: 'gplots'
## The following object is masked from 'package:stats':
##
## lowess
library(dummies)
## dummies-1.5.6 provided by Decision Patterns
library(forecast)
## Registered S3 method overwritten by 'quantmod':
## method from
## as.zoo.data.frame zoo
library(leaps)
## Warning: package 'leaps' was built under R version 4.0.4
#########
## Read the data
df <- read.csv("data/WestRoxbury (2).csv")
#############
t(t(names(df))) ## display column names in a user-friendly manner
## [,1]
## [1,] "TOTAL.VALUE"
## [2,] "TAX"
## [3,] "LOT.SQFT"
## [4,] "YR.BUILT"
## [5,] "GROSS.AREA"
## [6,] "LIVING.AREA"
## [7,] "FLOORS"
## [8,] "ROOMS"
## [9,] "BEDROOMS"
## [10,] "FULL.BATH"
## [11,] "HALF.BATH"
## [12,] "KITCHEN"
## [13,] "FIREPLACE"
## [14,] "REMODEL"
names(df)[3] <- "LOT.SIZE" # change the column name if so desired
names(df) <- tolower(names(df)) #change to lowercase if so desired
dim(df) #Show the number of rows and columns
## [1] 5802 14
head(df, n = 3)
## total.value tax lot.size yr.built gross.area living.area floors rooms
## 1 344.2 4330 9965 1880 2436 1352 2 6
## 2 412.6 5190 6590 1945 3108 1976 2 10
## 3 330.1 4152 7500 1890 2294 1371 2 8
## bedrooms full.bath half.bath kitchen fireplace remodel
## 1 3 1 1 1 0 None
## 2 4 2 1 1 0 Recent
## 3 4 1 1 1 0 None
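# Optionally, summary() gives a fuller numeric overview (quartiles and means)
# of every column; output omitted here.
summary(df)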
df <- dummy.data.frame(df, sep = ".")
## Warning in model.matrix.default(~x - 1, model.frame(~x - 1), contrasts = FALSE):
## non-list contrasts argument ignored
names(df)
## [1] "total.value" "tax" "lot.size" "yr.built"
## [5] "gross.area" "living.area" "floors" "rooms"
## [9] "bedrooms" "full.bath" "half.bath" "kitchen"
## [13] "fireplace" "remodel.None" "remodel.Old" "remodel.Recent"
cols <- c("total.value", "lot.size", "gross.area", "living.area")
cols <- c(cols, "rooms", "bedrooms","remodel.Old", "remodel.Recent")
df<- df[, cols]
## keep only the numeric value/size columns for the heat map
## (drop rooms, bedrooms, and the remodel dummies in columns 5-8)
df1 <- df[ , -(5:8) ]
par(cex.main=1)
heatmap.2(cor(df1), Rowv = FALSE, Colv = FALSE, dendrogram = "none",
cellnote = round(cor(df1),2),main = "Correlation Values",
cexRow = 1, cexCol = 1, notecol = "black", key = FALSE,
trace = 'none', margins = c(10,10))
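# The numbers shown in the heat map cells can also be printed directly as a
# plain matrix (output omitted here):
round(cor(df1), 2)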
###########################################
###Partitioning the dataset
set.seed(1) # set seed for reproducing the partition
train.rows <- sample(1:nrow(df), 3500) # randomly sample 3500 of the 5802 rows (about 60%) for training
train.df <- df[train.rows, ]
valid.df <- df[-train.rows, ]
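# Sanity-check the partition: 3500 training rows and the remaining
# 5802 - 3500 = 2302 validation rows (output omitted here).
dim(train.df)
dim(valid.df)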
reg2 <- lm(total.value ~ lot.size + gross.area + living.area, data = train.df)
summary(reg2)
##
## Call:
## lm(formula = total.value ~ lot.size + gross.area + living.area,
## data = train.df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -303.773 -29.589 0.744 29.375 206.502
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 105.6093086 2.9596489 35.683 <0.0000000000000002 ***
## lot.size 0.0083653 0.0003442 24.302 <0.0000000000000002 ***
## gross.area 0.0192353 0.0022076 8.713 <0.0000000000000002 ***
## living.area 0.1077604 0.0035683 30.199 <0.0000000000000002 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 49.57 on 3496 degrees of freedom
## Multiple R-squared: 0.7584, Adjusted R-squared: 0.7582
## F-statistic: 3658 on 3 and 3496 DF, p-value: < 0.00000000000000022
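# Optional follow-up (not in the original output): confint() reports 95%
# confidence intervals for the fitted coefficients.
confint(reg2)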
# use predict() to make predictions on a new set.
lm.pred <- predict(reg2, valid.df)
options(scipen = 999, digits = 3) # suppress scientific notation ('digits' must be between 1 and 22; 0 is invalid)
some.residuals <- valid.df$total.value[1:20] - lm.pred[1:20]
data.frame("Actual" = valid.df$total.value[1:20],"Predicted" = lm.pred[1:20],
"Residual" = some.residuals)
## Actual Predicted Residual
## 2 413 433 -21
## 5 332 348 -16
## 6 337 304 34
## 8 320 361 -41
## 9 334 330 3
## 10 409 563 -154
## 11 313 358 -45
## 12 344 374 -30
## 13 316 329 -13
## 16 298 281 17
## 20 348 411 -63
## 21 318 303 15
## 23 358 396 -38
## 24 415 508 -94
## 25 319 341 -22
## 31 329 358 -29
## 33 280 428 -148
## 35 337 413 -75
## 42 350 372 -22
## 43 344 415 -71
options(scipen=999, digits = 3)
# use accuracy() to compute common accuracy measures.
t(accuracy(lm.pred, valid.df$total.value))
## Test set
## ME -0.675
## RMSE 49.615
## MAE 37.929
## MPE -1.732
## MAPE 9.934
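# These measures can be verified by hand from the validation residuals (the
# same residuals are stored as all.residuals just below); the results should
# match the accuracy() output above.
sqrt(mean((valid.df$total.value - lm.pred)^2)) # RMSE
mean(abs(valid.df$total.value - lm.pred))      # MAE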
all.residuals <- valid.df$total.value - lm.pred
df2 <- data.frame(all.residuals, lm.pred)
h1 <- ggplot(df2, aes(x= all.residuals)) +
geom_histogram(fill = "blue", col = "red", binwidth = 20)
h1 <- h1 + labs(x = "residuals", y = "count", title = "Histogram of residuals")
h1
p1 <- ggplot(df2, aes(x = lm.pred, y = all.residuals)) + geom_point()
p1 <- p1 + labs(x = "predicted values", y = "residuals", title = "Residuals vs. predicted values")
p1 <- p1+ geom_smooth()
p1
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
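# An additional diagnostic, not part of the original output: a normal Q-Q plot
# of the residuals in the same ggplot2 style as the charts above.
q1 <- ggplot(df2, aes(sample = all.residuals)) + stat_qq() + stat_qq_line()
q1 + labs(title = "Normal Q-Q plot of residuals")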
search <- regsubsets(total.value ~ lot.size + gross.area + living.area, data = train.df, nbest = 1, nvmax = dim(train.df)[2],
method = "exhaustive")
sum <- summary(search) # note: this name masks base::sum for the rest of the session
# show models
t(sum$which)
## 1 2 3
## (Intercept) TRUE TRUE TRUE
## lot.size FALSE TRUE TRUE
## gross.area FALSE FALSE TRUE
## living.area TRUE TRUE TRUE
# show metrics
sum$rsq
## [1] 0.705 0.753 0.758
sum$adjr2
## [1] 0.705 0.753 0.758
# The third model, with all three predictors included, has the highest adjusted R-squared, so we choose model 3.
## Find the model with the highest adjusted R-squared.
x <- which(sum$adjr2 == max(sum$adjr2))
t(t(sum$which[x,]))
## [,1]
## (Intercept) TRUE
## lot.size TRUE
## gross.area TRUE
## living.area TRUE
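# summary(regsubsets) also exposes Mallows' Cp and BIC, which can be compared
# across the three models in the same way (values not shown here):
sum$cp
sum$bic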
# Alternative approach: backward elimination starts with the full set of predictors and removes one at a time.
# use step() to run stepwise regression.
reg1 <- lm(total.value ~ lot.size + gross.area + living.area, data = df) # starting model, fit on the full data set (as the output below shows)
reg.step <- step(reg1, direction = "backward")
## Start: AIC=45301
## total.value ~ lot.size + gross.area + living.area
##
## Df Sum of Sq RSS AIC
## <none> 14253102 45301
## - gross.area 1 318093 14571195 45428
## - lot.size 1 2150370 16403472 46115
## - living.area 1 3747845 18000947 46654
summary(reg.step) # which variables were dropped? (none; step kept all three predictors)
##
## Call:
## lm(formula = total.value ~ lot.size + gross.area + living.area,
## data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -304.30 -29.80 0.73 29.46 232.28
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 106.108944 2.338947 45.4 <0.0000000000000002 ***
## lot.size 0.008084 0.000273 29.6 <0.0000000000000002 ***
## gross.area 0.019468 0.001711 11.4 <0.0000000000000002 ***
## living.area 0.107955 0.002765 39.0 <0.0000000000000002 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 49.6 on 5798 degrees of freedom
## Multiple R-squared: 0.75, Adjusted R-squared: 0.75
## F-statistic: 5.8e+03 on 3 and 5798 DF, p-value: <0.0000000000000002
reg.step.pred <- predict(reg.step, valid.df)
accuracy(reg.step.pred, valid.df$total.value)
## ME RMSE MAE MPE MAPE
## Test set -0.414 49.6 37.9 -1.67 9.94
### Backward elimination ("backward") arrives at the same model as the exhaustive search: no predictor is dropped.
reg.step <- step(reg1, direction = "both")
## Start: AIC=45301
## total.value ~ lot.size + gross.area + living.area
##
## Df Sum of Sq RSS AIC
## <none> 14253102 45301
## - gross.area 1 318093 14571195 45428
## - lot.size 1 2150370 16403472 46115
## - living.area 1 3747845 18000947 46654
summary(reg.step) # which variables were dropped? (none; step kept all three predictors)
##
## Call:
## lm(formula = total.value ~ lot.size + gross.area + living.area,
## data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -304.30 -29.80 0.73 29.46 232.28
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 106.108944 2.338947 45.4 <0.0000000000000002 ***
## lot.size 0.008084 0.000273 29.6 <0.0000000000000002 ***
## gross.area 0.019468 0.001711 11.4 <0.0000000000000002 ***
## living.area 0.107955 0.002765 39.0 <0.0000000000000002 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 49.6 on 5798 degrees of freedom
## Multiple R-squared: 0.75, Adjusted R-squared: 0.75
## F-statistic: 5.8e+03 on 3 and 5798 DF, p-value: <0.0000000000000002
reg.step.pred <- predict(reg.step, valid.df)
accuracy(reg.step.pred, valid.df$total.value)
## ME RMSE MAE MPE MAPE
## Test set -0.414 49.6 37.9 -1.67 9.94
### Stepwise selection in both directions ("both") likewise matches the exhaustive search result.
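# For completeness, forward selection can be run the same way (a sketch, not
# part of the original analysis): start from an intercept-only model and let
# step() add predictors up to the full formula.
reg.null <- lm(total.value ~ 1, data = df)
reg.fwd <- step(reg.null, direction = "forward",
                scope = list(lower = ~ 1,
                             upper = ~ lot.size + gross.area + living.area))
summary(reg.fwd)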