if (!require(mlba)) {
library(devtools)
install_github("gedeck/mlba/mlba", force=TRUE)
}
options(scipen=999)
housing.df <- read.csv('WestRoxbury.csv') # load data from file
housing.df <- mlba::WestRoxbury # load data from mlba package
dim(housing.df) # find the dimension of data frame
head(housing.df) # show the first six rows
View(housing.df) # show all the data in a new tab
# Practice showing different subsets of the data
housing.df[1:10, 1] # show the first 10 rows of the first column only
housing.df[1:10, ] # show the first 10 rows of each of the columns
housing.df[5, 1:10] # show the fifth row of the first 10 columns
housing.df[5, c(1:2, 4, 8:10)] # show the fifth row of some columns
housing.df[, 1] # show the whole first column
housing.df$TOTAL.VALUE # a different way to show the whole first column
housing.df$TOTAL.VALUE[1:10] # show the first 10 rows of the first column
length(housing.df$TOTAL.VALUE) # find the length of the first column
mean(housing.df$TOTAL.VALUE) # find the mean of the first column
summary(housing.df) # find summary statistics for each column
housing.df <- mlba::WestRoxbury
# random sample of 5 observations
s <- sample(row.names(housing.df), 5)
housing.df[s,]
#> TOTAL.VALUE TAX LOT.SQFT YR.BUILT GROSS.AREA LIVING.AREA FLOORS ROOMS
#> 599 412.0 5182 8919 1991 3190 2509 1.5 7
#> 4763 735.4 9251 20467 1965 4800 3261 2.0 9
#> 4913 394.9 4967 7000 1930 2379 1188 2.0 6
#> 4646 435.9 5483 6647 1960 3302 1880 2.0 9
#> 3866 346.0 4352 4000 1940 2613 1584 2.0 6
#> BEDROOMS FULL.BATH HALF.BATH KITCHEN FIREPLACE REMODEL
#> 599 4 1 1 1 1 None
#> 4763 4 2 1 1 1 Old
#> 4913 3 1 1 1 0 Recent
#> 4646 4 1 1 1 0 None
#> 3866 3 1 0 1 1 None
# oversample houses with over 10 rooms
s <- sample(row.names(housing.df), 5, prob=ifelse(housing.df$ROOMS>10, 0.9, 0.01))
housing.df[s,]
#> TOTAL.VALUE TAX LOT.SQFT YR.BUILT GROSS.AREA LIVING.AREA FLOORS ROOMS
#> 3292 605.5 7617 10233 1910 6549 3845 2.5 12
#> 4740 666.4 8383 12137 1915 5600 3462 2.5 11
#> 2609 410.9 5169 4700 1968 2822 1586 2.0 12
#> 5393 413.0 5195 4876 1930 3440 1942 2.0 7
#> 179 516.7 6500 6000 2005 4623 2597 2.0 11
#> BEDROOMS FULL.BATH HALF.BATH KITCHEN FIREPLACE REMODEL
#> 3292 4 2 1 1 1 None
#> 4740 6 2 0 1 1 Recent
#> 2609 3 2 1 1 1 None
#> 5393 3 1 1 1 1 Recent
#> 179 4 2 1 1 1 None
# rebalance
housing.df$REMODEL <- factor(housing.df$REMODEL)
table(housing.df$REMODEL)
#>
#> None Old Recent
#> 4346 581 875
upsampled.df <- caret::upSample(housing.df, housing.df$REMODEL, list=TRUE)$x
table(upsampled.df$REMODEL)
#>
#> None Old Recent
#> 4346 4346 4346
library(tidyverse)
# get overview
str(housing.df)
#> 'data.frame': 5802 obs. of 14 variables:
#> $ TOTAL.VALUE: num 344 413 330 499 332 ...
#> $ TAX : int 4330 5190 4152 6272 4170 4244 4521 4030 4195 5150 ...
#> $ LOT.SQFT : int 9965 6590 7500 13773 5000 5142 5000 10000 6835 5093 ...
#> $ YR.BUILT : int 1880 1945 1890 1957 1910 1950 1954 1950 1958 1900 ...
#> $ GROSS.AREA : int 2436 3108 2294 5032 2370 2124 3220 2208 2582 4818 ...
#> $ LIVING.AREA: int 1352 1976 1371 2608 1438 1060 1916 1200 1092 2992 ...
#> $ FLOORS : num 2 2 2 1 2 1 2 1 1 2 ...
#> $ ROOMS : int 6 10 8 9 7 6 7 6 5 8 ...
#> $ BEDROOMS : int 3 4 4 5 3 3 3 3 3 4 ...
#> $ FULL.BATH : int 1 2 1 1 2 1 1 1 1 2 ...
#> $ HALF.BATH : int 1 1 1 1 0 0 1 0 0 0 ...
#> $ KITCHEN : int 1 1 1 1 1 1 1 1 1 1 ...
#> $ FIREPLACE : int 0 0 0 1 0 1 0 0 1 0 ...
#> $ REMODEL : Factor w/ 3 levels "None","Old","Recent": 1 3 1 1 1 2 1 1 3 1 ...
# make REMODEL a factor variable
housing.df$REMODEL <- factor(housing.df$REMODEL)
str(housing.df$REMODEL)
#> Factor w/ 3 levels "None","Old","Recent": 1 3 1 1 1 2 1 1 3 1 ...
levels(housing.df$REMODEL) # show factor's categories (levels)
#> [1] "None" "Old" "Recent"
# use tidyverse to load and preprocess data in one statement
# the %>% operator inserts the result of the expression on the left
# as the first argument into the function on the right
housing.df <- mlba::WestRoxbury %>%
mutate(REMODEL=factor(REMODEL))
library(fastDummies)
library(tidyverse)
housing.df <- dummy_cols(mlba::WestRoxbury,
remove_selected_columns=TRUE, # remove the original column
remove_first_dummy=TRUE) # removes the first created dummy variable
housing.df %>% head(2)
#> TOTAL.VALUE TAX LOT.SQFT YR.BUILT GROSS.AREA LIVING.AREA FLOORS ROOMS
#> 1 344.2 4330 9965 1880 2436 1352 2 6
#> 2 412.6 5190 6590 1945 3108 1976 2 10
#> BEDROOMS FULL.BATH HALF.BATH KITCHEN FIREPLACE REMODEL_Old REMODEL_Recent
#> 1 3 1 1 1 0 0 0
#> 2 4 2 1 1 0 0 1
# To illustrate missing data procedures, we first convert a few entries for
# BEDROOMS to NA's. Then we impute these missing values using the median of the
# remaining values.
rows.to.missing <- sample(row.names(housing.df), 10)
housing.df[rows.to.missing,]$BEDROOMS <- NA
summary(housing.df$BEDROOMS)
#> Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
#> 1.00 3.00 3.00 3.23 4.00 9.00 10
# Now we have 10 NA's and the median of the remaining values is 3.
# replace the missing values using the median of the remaining values
# use median() with na.rm=TRUE to ignore missing values when computing the median.
housing.df <- housing.df %>%
replace_na(list(BEDROOMS=median(housing.df$BEDROOMS, na.rm=TRUE)))
summary(housing.df$BEDROOMS)
#> Min. 1st Qu. Median Mean 3rd Qu. Max.
#> 1.00 3.00 3.00 3.23 4.00 9.00
housing.df <- mlba::WestRoxbury %>%
mutate(REMODEL=factor(REMODEL))
# use set.seed() to get the same partitions when re-running the R code.
set.seed(1)
## partitioning into training (60%) and holdout (40%)
# randomly sample 60% of the row IDs for training; the remaining 40% serve
# as holdout
train.rows <- sample(rownames(housing.df), nrow(housing.df)*0.6)
# collect all the columns with training row ID into training set:
train.df <- housing.df[train.rows, ]
# assign row IDs that are not already in the training set, into holdout
holdout.rows <- setdiff(rownames(housing.df), train.rows)
holdout.df <- housing.df[holdout.rows, ]
## partitioning into training (50%), validation (30%), holdout (20%)
# randomly sample 50% of the row IDs for training
train.rows <- sample(rownames(housing.df), nrow(housing.df)*0.5)
# sample 30% of the row IDs into the validation set, drawing only from records
# not already in the training set
# use setdiff() to find records not already in the training set
valid.rows <- sample(setdiff(rownames(housing.df), train.rows),
nrow(housing.df)*0.3)
# assign the remaining 20% row IDs serve as holdout
holdout.rows <- setdiff(rownames(housing.df), union(train.rows, valid.rows))
# create the 3 data frames by collecting all columns from the appropriate rows
train.df <- housing.df[train.rows, ]
valid.df <- housing.df[valid.rows, ]
holdout.df <- housing.df[holdout.rows, ]
## partitioning into training (60%) and holdout (40%) using caret
set.seed(1)
idx <- caret::createDataPartition(housing.df$TOTAL.VALUE, p=0.6, list=FALSE)
train.df <- housing.df[idx, ]
holdout.df <- housing.df[-idx, ]
library(tidyverse)
library(mlba)
library(fastDummies)
housing.df <- mlba::WestRoxbury %>%
# remove rows with missing values
drop_na() %>%
# remove column TAX
select(-TAX) %>%
# make REMODEL a factor and convert to dummy variables
mutate(REMODEL=factor(REMODEL)) %>%
dummy_cols(select_columns=c('REMODEL'),
remove_selected_columns=TRUE, remove_first_dummy=TRUE)
set.seed(1)
idx <- caret::createDataPartition(housing.df$TOTAL.VALUE, p=0.6, list=FALSE)
train.df <- housing.df[idx, ]
holdout.df <- housing.df[-idx, ]
reg <- lm(TOTAL.VALUE ~ ., data=train.df)
train.res <- data.frame(actual=train.df$TOTAL.VALUE, predicted=reg$fitted.values,
residuals=reg$residuals)
head(train.res)
#> actual predicted residuals
#> 1 344.2 384.4206 -40.220638
#> 4 498.6 546.4628 -47.862759
#> 5 331.5 347.9170 -16.417031
#> 12 344.5 380.4297 -35.929727
#> 13 315.5 313.1879 2.312083
#> 15 326.2 345.3751 -19.175064
pred <- predict(reg, newdata=holdout.df)
holdout.res <- data.frame(actual=holdout.df$TOTAL.VALUE, predicted=pred,
residuals=holdout.df$TOTAL.VALUE - pred)
head(holdout.res)
#> actual predicted residuals
#> 2 412.6 460.2777 -47.677744
#> 3 330.1 359.3920 -29.291958
#> 6 337.4 290.0277 47.372303
#> 7 359.4 402.5332 -43.133242
#> 8 320.4 314.0683 6.331652
#> 9 333.5 339.8206 -6.320582
library(caret)
# compute metrics on training set
data.frame(
ME = round(mean(train.res$residuals), 5),
RMSE = RMSE(pred=train.res$predicted, obs=train.res$actual),
MAE = MAE(pred=train.res$predicted, obs=train.res$actual)
)
#> ME RMSE MAE
#> 1 0 42.14665 31.98717
# compute metrics on holdout set
data.frame(
ME = round(mean(holdout.res$residuals), 5),
RMSE = RMSE(pred=holdout.res$predicted, obs=holdout.res$actual),
MAE = MAE(pred=holdout.res$predicted, obs=holdout.res$actual)
)
#> ME RMSE MAE
#> 1 -1.04237 43.90381 33.05476
# For demonstration purposes, we construct the new.data from the original dataset
housing.df <- mlba::WestRoxbury
new.data <- housing.df[100:102, -1] %>%
mutate(REMODEL=factor(REMODEL, levels=c("None", "Old", "Recent"))) %>%
dummy_cols(select_columns=c('REMODEL'),
remove_selected_columns=TRUE, remove_first_dummy=TRUE)
new.data
#> TAX LOT.SQFT YR.BUILT GROSS.AREA LIVING.AREA FLOORS ROOMS BEDROOMS FULL.BATH
#> 1 3818 4200 1960 2670 1710 2.0 10 4 1
#> 2 3791 6444 1940 2886 1474 1.5 6 3 1
#> 3 4275 5035 1925 3264 1523 1.0 6 2 1
#> HALF.BATH KITCHEN FIREPLACE REMODEL_Old REMODEL_Recent
#> 1 1 1 1 0 0
#> 2 1 1 1 0 0
#> 3 0 1 0 0 1
pred <- predict(reg, newdata = new.data)
pred
#> 1 2 3
#> 385.9718 378.6392 352.1145