# install packages
# install.packages("gains")
# install.packages("conflicted")
library(conflicted)
conflict_prefer("filter", "dplyr")
## [conflicted] Will prefer dplyr::filter over any other package.
conflict_prefer("lag", "dplyr")
## [conflicted] Will prefer dplyr::lag over any other package.
conflict_prefer("lift", "purrr")
## [conflicted] Will prefer purrr::lift over any other package.
library(ggplot2)
library(lattice)
library(caret)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.2 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ lubridate 1.9.2 ✔ tibble 3.2.1
## ✔ purrr 1.0.1 ✔ tidyr 1.3.0
library(gains)
options(warn = -1)
# Data loading and pre-processing methods 1:
west.df1 <- mlba::WestRoxbury %>%
select(-c(TAX))
str(west.df1)
## 'data.frame': 5802 obs. of 13 variables:
## $ TOTAL.VALUE: num 344 413 330 499 332 ...
## $ LOT.SQFT : int 9965 6590 7500 13773 5000 5142 5000 10000 6835 5093 ...
## $ YR.BUILT : int 1880 1945 1890 1957 1910 1950 1954 1950 1958 1900 ...
## $ GROSS.AREA : int 2436 3108 2294 5032 2370 2124 3220 2208 2582 4818 ...
## $ LIVING.AREA: int 1352 1976 1371 2608 1438 1060 1916 1200 1092 2992 ...
## $ FLOORS : num 2 2 2 1 2 1 2 1 1 2 ...
## $ ROOMS : int 6 10 8 9 7 6 7 6 5 8 ...
## $ BEDROOMS : int 3 4 4 5 3 3 3 3 3 4 ...
## $ FULL.BATH : int 1 2 1 1 2 1 1 1 1 2 ...
## $ HALF.BATH : int 1 1 1 1 0 0 1 0 0 0 ...
## $ KITCHEN : int 1 1 1 1 1 1 1 1 1 1 ...
## $ FIREPLACE : int 0 0 0 1 0 1 0 0 1 0 ...
## $ REMODEL : chr "None" "Recent" "None" "None" ...
summary(west.df1)
## TOTAL.VALUE LOT.SQFT YR.BUILT GROSS.AREA LIVING.AREA
## Min. : 105.0 Min. : 997 Min. : 0 Min. : 821 Min. : 504
## 1st Qu.: 325.1 1st Qu.: 4772 1st Qu.:1920 1st Qu.:2347 1st Qu.:1308
## Median : 375.9 Median : 5683 Median :1935 Median :2700 Median :1548
## Mean : 392.7 Mean : 6278 Mean :1937 Mean :2925 Mean :1657
## 3rd Qu.: 438.8 3rd Qu.: 7022 3rd Qu.:1955 3rd Qu.:3239 3rd Qu.:1874
## Max. :1217.8 Max. :46411 Max. :2011 Max. :8154 Max. :5289
## FLOORS ROOMS BEDROOMS FULL.BATH
## Min. :1.000 Min. : 3.000 Min. :1.00 Min. :1.000
## 1st Qu.:1.000 1st Qu.: 6.000 1st Qu.:3.00 1st Qu.:1.000
## Median :2.000 Median : 7.000 Median :3.00 Median :1.000
## Mean :1.684 Mean : 6.995 Mean :3.23 Mean :1.297
## 3rd Qu.:2.000 3rd Qu.: 8.000 3rd Qu.:4.00 3rd Qu.:2.000
## Max. :3.000 Max. :14.000 Max. :9.00 Max. :5.000
## HALF.BATH KITCHEN FIREPLACE REMODEL
## Min. :0.0000 Min. :1.000 Min. :0.0000 Length:5802
## 1st Qu.:0.0000 1st Qu.:1.000 1st Qu.:0.0000 Class :character
## Median :1.0000 Median :1.000 Median :1.0000 Mode :character
## Mean :0.6139 Mean :1.015 Mean :0.7399
## 3rd Qu.:1.0000 3rd Qu.:1.000 3rd Qu.:1.0000
## Max. :3.0000 Max. :2.000 Max. :4.0000
median.total.value <- median(west.df1$TOTAL.VALUE)
west.df1$EasyToSell <- ifelse(west.df1$TOTAL.VALUE >= median.total.value, "Yes", "No")
west.df1 <- west.df1[, -which(names(west.df1) == "TOTAL.VALUE")]
# Data loading and pre-processing methods 2:
west.df1 <- mlba::WestRoxbury %>%
select(-c(TAX)) %>%
mutate(EasyToSell = factor(ifelse(TOTAL.VALUE >= median(TOTAL.VALUE), "Yes", "No"))) %>%
select(-c(TOTAL.VALUE))
# set seed for reproducing the partition
set.seed(2)
# partition data
idx <- caret::createDataPartition(west.df1$EasyToSell, p=0.6, list=FALSE)
train.df <- west.df1[idx, ]
holdout.df <- west.df1[-idx, ]
length(holdout.df)
## [1] 13
# build model
trControl <- caret::trainControl(method="cv", number=5, allowParallel=TRUE)
logit.reg <- caret::train(EasyToSell ~ ., data=train.df, trControl=trControl,
# fit logistic regression with a generalized linear model
method="glm", family="binomial")
logit.reg
## Generalized Linear Model
##
## 3482 samples
## 12 predictor
## 2 classes: 'No', 'Yes'
##
## No pre-processing
## Resampling: Cross-Validated (5 fold)
## Summary of sample sizes: 2786, 2785, 2785, 2786, 2786
## Resampling results:
##
## Accuracy Kappa
## 0.8521005 0.704203
summary(logit.reg$finalModel)
##
## Call:
## NULL
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.511e+01 5.368e+00 -2.814 0.00489 **
## LOT.SQFT 3.989e-04 3.225e-05 12.368 < 2e-16 ***
## YR.BUILT -5.576e-04 2.706e-03 -0.206 0.83674
## GROSS.AREA 1.562e-03 1.801e-04 8.674 < 2e-16 ***
## LIVING.AREA 2.151e-03 3.244e-04 6.629 3.37e-11 ***
## FLOORS 2.307e+00 1.760e-01 13.111 < 2e-16 ***
## ROOMS 2.022e-01 6.582e-02 3.072 0.00213 **
## BEDROOMS -1.902e-01 1.057e-01 -1.799 0.07196 .
## FULL.BATH 3.080e-01 1.434e-01 2.148 0.03174 *
## HALF.BATH 7.857e-01 1.210e-01 6.495 8.31e-11 ***
## KITCHEN -5.249e-01 5.158e-01 -1.018 0.30888
## FIREPLACE 1.112e+00 1.120e-01 9.926 < 2e-16 ***
## REMODELOld 9.089e-02 1.794e-01 0.507 0.61248
## REMODELRecent 1.404e+00 1.648e-01 8.521 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 4827.1 on 3481 degrees of freedom
## Residual deviance: 2285.5 on 3468 degrees of freedom
## AIC: 2313.5
##
## Number of Fisher Scoring iterations: 7
round(data.frame(summary(logit.reg$finalModel)$coefficients,
odds = exp(coef(logit.reg$finalModel))), 5)
## Estimate Std..Error z.value Pr...z.. odds
## (Intercept) -15.10606 5.36767 -2.81427 0.00489 0.00000
## LOT.SQFT 0.00040 0.00003 12.36750 0.00000 1.00040
## YR.BUILT -0.00056 0.00271 -0.20607 0.83674 0.99944
## GROSS.AREA 0.00156 0.00018 8.67402 0.00000 1.00156
## LIVING.AREA 0.00215 0.00032 6.62925 0.00000 1.00215
## FLOORS 2.30729 0.17598 13.11088 0.00000 10.04720
## ROOMS 0.20218 0.06582 3.07187 0.00213 1.22407
## BEDROOMS -0.19023 0.10572 -1.79934 0.07196 0.82677
## FULL.BATH 0.30800 0.14341 2.14761 0.03174 1.36070
## HALF.BATH 0.78574 0.12098 6.49480 0.00000 2.19403
## KITCHEN -0.52489 0.51582 -1.01758 0.30888 0.59162
## FIREPLACE 1.11167 0.11200 9.92560 0.00000 3.03942
## REMODELOld 0.09089 0.17943 0.50654 0.61248 1.09515
## REMODELRecent 1.40401 0.16476 8.52129 0.00000 4.07148
# Model Performance
actual <- ifelse(holdout.df$EasyToSell == "Yes", 1, 0)
pred <- predict(logit.reg, holdout.df, type="prob")[, 2]
gain <- gains(actual, pred, groups=length(pred))
nactual <-sum(actual)
ggplot() +
geom_line(aes(x=gain$cume.obs, y=gain$cume.pct.of.total * nactual), color="green") +
geom_line(aes(x=c(0, max(gain$cume.obs)), y=c(0, nactual)), color="blue") +
labs(x="# Cases", y="Cumulative")
confusionMatrix(predict(logit.reg, holdout.df), holdout.df$EasyToSell)
## Confusion Matrix and Statistics
##
## Reference
## Prediction No Yes
## No 1024 178
## Yes 135 983
##
## Accuracy : 0.8651
## 95% CI : (0.8505, 0.8787)
## No Information Rate : 0.5004
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.7302
##
## Mcnemar's Test P-Value : 0.0176
##
## Sensitivity : 0.8835
## Specificity : 0.8467
## Pos Pred Value : 0.8519
## Neg Pred Value : 0.8792
## Prevalence : 0.4996
## Detection Rate : 0.4414
## Detection Prevalence : 0.5181
## Balanced Accuracy : 0.8651
##
## 'Positive' Class : No
##
ggsave(file=file.path(getwd(), "LRperformance_Flights.pdf"),
last_plot() + theme_bw(),
width=3, height=3)
# Data loading and pre-processing methods 2:
west.df2 <- mlba::WestRoxbury %>%
select(-c(TAX)) %>%
mutate(EasyToSell = factor(ifelse(TOTAL.VALUE >= quantile(TOTAL.VALUE, 0.3), "Yes", "No"))) %>%
select(-c(TOTAL.VALUE))
# set seed for reproducing the partition
set.seed(2)
# partition data
idx <- caret::createDataPartition(west.df2$EasyToSell, p=0.6, list=FALSE)
train.df <- west.df2[idx, ]
holdout.df <- west.df2[-idx, ]
length(holdout.df)
## [1] 13
# build model
trControl <- caret::trainControl(method="cv", number=5, allowParallel=TRUE)
logit.reg <- caret::train(EasyToSell ~ ., data=train.df, trControl=trControl,
# fit logistic regression with a generalized linear model
method="glm", family="binomial")
logit.reg
## Generalized Linear Model
##
## 3482 samples
## 12 predictor
## 2 classes: 'No', 'Yes'
##
## No pre-processing
## Resampling: Cross-Validated (5 fold)
## Summary of sample sizes: 2785, 2786, 2786, 2786, 2785
## Resampling results:
##
## Accuracy Kappa
## 0.8650217 0.6710532
summary(logit.reg$finalModel)
##
## Call:
## NULL
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.184e+01 5.934e+00 -1.995 0.0460 *
## LOT.SQFT 5.168e-04 3.946e-05 13.096 < 2e-16 ***
## YR.BUILT -2.089e-03 2.998e-03 -0.697 0.4858
## GROSS.AREA 1.952e-03 1.905e-04 10.248 < 2e-16 ***
## LIVING.AREA 7.622e-04 3.236e-04 2.356 0.0185 *
## FLOORS 2.572e+00 1.715e-01 14.997 < 2e-16 ***
## ROOMS 3.903e-01 7.404e-02 5.271 1.35e-07 ***
## BEDROOMS -4.163e-02 1.137e-01 -0.366 0.7144
## FULL.BATH 2.817e-01 1.613e-01 1.746 0.0808 .
## HALF.BATH 2.695e-01 1.238e-01 2.176 0.0296 *
## KITCHEN -3.040e-01 4.376e-01 -0.695 0.4872
## FIREPLACE 1.133e+00 1.104e-01 10.262 < 2e-16 ***
## REMODELOld -6.193e-03 1.865e-01 -0.033 0.9735
## REMODELRecent 1.717e+00 2.184e-01 7.862 3.78e-15 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 4254.7 on 3481 degrees of freedom
## Residual deviance: 2110.7 on 3468 degrees of freedom
## AIC: 2138.7
##
## Number of Fisher Scoring iterations: 7
round(data.frame(summary(logit.reg$finalModel)$coefficients,
odds = exp(coef(logit.reg$finalModel))), 5)
## Estimate Std..Error z.value Pr...z.. odds
## (Intercept) -11.83966 5.93380 -1.99529 0.04601 0.00001
## LOT.SQFT 0.00052 0.00004 13.09613 0.00000 1.00052
## YR.BUILT -0.00209 0.00300 -0.69696 0.48583 0.99791
## GROSS.AREA 0.00195 0.00019 10.24824 0.00000 1.00195
## LIVING.AREA 0.00076 0.00032 2.35561 0.01849 1.00076
## FLOORS 2.57166 0.17148 14.99716 0.00000 13.08752
## ROOMS 0.39027 0.07404 5.27134 0.00000 1.47738
## BEDROOMS -0.04163 0.11374 -0.36600 0.71437 0.95922
## FULL.BATH 0.28167 0.16134 1.74581 0.08084 1.32534
## HALF.BATH 0.26945 0.12385 2.17569 0.02958 1.30925
## KITCHEN -0.30400 0.43760 -0.69471 0.48724 0.73786
## FIREPLACE 1.13315 0.11042 10.26220 0.00000 3.10541
## REMODELOld -0.00619 0.18646 -0.03322 0.97350 0.99383
## REMODELRecent 1.71729 0.21843 7.86208 0.00000 5.56942
# Model Performance
actual <- ifelse(holdout.df$EasyToSell == "Yes", 1, 0)
pred <- predict(logit.reg, holdout.df, type="prob")[, 2]
gain <- gains(actual, pred, groups=length(pred))
nactual <-sum(actual)
ggplot() +
geom_line(aes(x=gain$cume.obs, y=gain$cume.pct.of.total * nactual), color="red") +
geom_line(aes(x=c(0, max(gain$cume.obs)), y=c(0, nactual)), color="blue") +
labs(x="# Cases", y="Cumulative")
confusionMatrix(predict(logit.reg, holdout.df), holdout.df$EasyToSell)
## Confusion Matrix and Statistics
##
## Reference
## Prediction No Yes
## No 481 129
## Yes 215 1495
##
## Accuracy : 0.8517
## 95% CI : (0.8366, 0.8659)
## No Information Rate : 0.7
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.634
##
## Mcnemar's Test P-Value : 4.586e-06
##
## Sensitivity : 0.6911
## Specificity : 0.9206
## Pos Pred Value : 0.7885
## Neg Pred Value : 0.8743
## Prevalence : 0.3000
## Detection Rate : 0.2073
## Detection Prevalence : 0.2629
## Balanced Accuracy : 0.8058
##
## 'Positive' Class : No
##
ggsave(file=file.path(getwd(), "LRperformance_Flights.pdf"),
last_plot() + theme_bw(),
width=3, height=3)
# Data loading and pre-processing methods 2:
west.df3 <- mlba::WestRoxbury %>%
select(-c(TAX)) %>%
mutate(EasyToSell = factor(ifelse(TOTAL.VALUE >= quantile(TOTAL.VALUE, 0.7), "Yes", "No"))) %>%
select(-c(TOTAL.VALUE))
# set seed for reproducing the partition
set.seed(2)
# partition data
idx <- caret::createDataPartition(west.df3$EasyToSell, p=0.6, list=FALSE)
train.df <- west.df3[idx, ]
holdout.df <- west.df3[-idx, ]
length(holdout.df)
## [1] 13
# build model
trControl <- caret::trainControl(method="cv", number=5, allowParallel=TRUE)
logit.reg <- caret::train(EasyToSell ~ ., data=train.df, trControl=trControl,
# fit logistic regression with a generalized linear model
method="glm", family="binomial")
logit.reg
## Generalized Linear Model
##
## 3482 samples
## 12 predictor
## 2 classes: 'No', 'Yes'
##
## No pre-processing
## Resampling: Cross-Validated (5 fold)
## Summary of sample sizes: 2786, 2786, 2786, 2784, 2786
## Resampling results:
##
## Accuracy Kappa
## 0.8897169 0.7303891
summary(logit.reg$finalModel)
##
## Call:
## NULL
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.285e+01 2.998e+00 -7.624 2.46e-14 ***
## LOT.SQFT 4.452e-04 3.316e-05 13.423 < 2e-16 ***
## YR.BUILT 2.208e-03 1.416e-03 1.560 0.1188
## GROSS.AREA 1.193e-03 2.007e-04 5.943 2.81e-09 ***
## LIVING.AREA 2.774e-03 3.684e-04 7.530 5.08e-14 ***
## FLOORS 2.378e+00 2.337e-01 10.177 < 2e-16 ***
## ROOMS 3.111e-02 6.720e-02 0.463 0.6434
## BEDROOMS -7.276e-02 1.121e-01 -0.649 0.5165
## FULL.BATH 8.801e-01 1.423e-01 6.183 6.27e-10 ***
## HALF.BATH 1.183e+00 1.439e-01 8.217 < 2e-16 ***
## KITCHEN -7.622e-01 5.873e-01 -1.298 0.1943
## FIREPLACE 1.058e+00 1.332e-01 7.944 1.96e-15 ***
## REMODELOld 4.446e-01 1.961e-01 2.267 0.0234 *
## REMODELRecent 8.411e-01 1.666e-01 5.049 4.44e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 4256.4 on 3481 degrees of freedom
## Residual deviance: 1724.2 on 3468 degrees of freedom
## AIC: 1752.2
##
## Number of Fisher Scoring iterations: 7
round(data.frame(summary(logit.reg$finalModel)$coefficients,
odds = exp(coef(logit.reg$finalModel))), 5)
## Estimate Std..Error z.value Pr...z.. odds
## (Intercept) -22.85443 2.99768 -7.62405 0.00000 0.00000
## LOT.SQFT 0.00045 0.00003 13.42287 0.00000 1.00045
## YR.BUILT 0.00221 0.00142 1.55995 0.11877 1.00221
## GROSS.AREA 0.00119 0.00020 5.94251 0.00000 1.00119
## LIVING.AREA 0.00277 0.00037 7.52973 0.00000 1.00278
## FLOORS 2.37810 0.23368 10.17693 0.00000 10.78434
## ROOMS 0.03111 0.06720 0.46297 0.64339 1.03160
## BEDROOMS -0.07276 0.11214 -0.64877 0.51648 0.92983
## FULL.BATH 0.88007 0.14233 6.18345 0.00000 2.41108
## HALF.BATH 1.18256 0.14392 8.21695 0.00000 3.26271
## KITCHEN -0.76223 0.58731 -1.29784 0.19434 0.46663
## FIREPLACE 1.05832 0.13322 7.94400 0.00000 2.88154
## REMODELOld 0.44457 0.19611 2.26695 0.02339 1.55982
## REMODELRecent 0.84110 0.16659 5.04904 0.00000 2.31892
# Model Performance
actual <- ifelse(holdout.df$EasyToSell == "Yes", 1, 0)
pred <- predict(logit.reg, holdout.df, type="prob")[, 2]
gain <- gains(actual, pred, groups=length(pred))
nactual <-sum(actual)
ggplot() +
geom_line(aes(x=gain$cume.obs, y=gain$cume.pct.of.total * nactual), color="purple") +
geom_line(aes(x=c(0, max(gain$cume.obs)), y=c(0, nactual)), color="pink") +
labs(x="# Cases", y="Cumulative")
confusionMatrix(predict(logit.reg, holdout.df), holdout.df$EasyToSell)
## Confusion Matrix and Statistics
##
## Reference
## Prediction No Yes
## No 1521 155
## Yes 102 542
##
## Accuracy : 0.8892
## 95% CI : (0.8757, 0.9017)
## No Information Rate : 0.6996
## P-Value [Acc > NIR] : < 2e-16
##
## Kappa : 0.7306
##
## Mcnemar's Test P-Value : 0.00118
##
## Sensitivity : 0.9372
## Specificity : 0.7776
## Pos Pred Value : 0.9075
## Neg Pred Value : 0.8416
## Prevalence : 0.6996
## Detection Rate : 0.6556
## Detection Prevalence : 0.7224
## Balanced Accuracy : 0.8574
##
## 'Positive' Class : No
##
ggsave(file=file.path(getwd(), "LRperformance_Flights.pdf"),
last_plot() + theme_bw(),
width=3, height=3)