Haiyuan_Gui’s Document

# install packages
# install.packages("gains")
# install.packages("conflicted")
library(conflicted)
conflict_prefer("filter", "dplyr")

## [conflicted] Will prefer dplyr::filter over any other package.

conflict_prefer("lag", "dplyr")

## [conflicted] Will prefer dplyr::lag over any other package.

conflict_prefer("lift", "purrr")

## [conflicted] Will prefer purrr::lift over any other package.

library(ggplot2)
library(lattice)
library(caret)
library(tidyverse)

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.2     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ lubridate 1.9.2     ✔ tibble    3.2.1
## ✔ purrr     1.0.1     ✔ tidyr     1.3.0

library(gains)
options(warn = -1)

Median of TOTAL.VALUE

# Data loading and pre-processing methods 1:
west.df1 <- mlba::WestRoxbury %>% 
  select(-c(TAX))
str(west.df1)

## 'data.frame':    5802 obs. of  13 variables:
##  $ TOTAL.VALUE: num  344 413 330 499 332 ...
##  $ LOT.SQFT   : int  9965 6590 7500 13773 5000 5142 5000 10000 6835 5093 ...
##  $ YR.BUILT   : int  1880 1945 1890 1957 1910 1950 1954 1950 1958 1900 ...
##  $ GROSS.AREA : int  2436 3108 2294 5032 2370 2124 3220 2208 2582 4818 ...
##  $ LIVING.AREA: int  1352 1976 1371 2608 1438 1060 1916 1200 1092 2992 ...
##  $ FLOORS     : num  2 2 2 1 2 1 2 1 1 2 ...
##  $ ROOMS      : int  6 10 8 9 7 6 7 6 5 8 ...
##  $ BEDROOMS   : int  3 4 4 5 3 3 3 3 3 4 ...
##  $ FULL.BATH  : int  1 2 1 1 2 1 1 1 1 2 ...
##  $ HALF.BATH  : int  1 1 1 1 0 0 1 0 0 0 ...
##  $ KITCHEN    : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ FIREPLACE  : int  0 0 0 1 0 1 0 0 1 0 ...
##  $ REMODEL    : chr  "None" "Recent" "None" "None" ...

summary(west.df1)

##   TOTAL.VALUE        LOT.SQFT        YR.BUILT      GROSS.AREA    LIVING.AREA  
##  Min.   : 105.0   Min.   :  997   Min.   :   0   Min.   : 821   Min.   : 504  
##  1st Qu.: 325.1   1st Qu.: 4772   1st Qu.:1920   1st Qu.:2347   1st Qu.:1308  
##  Median : 375.9   Median : 5683   Median :1935   Median :2700   Median :1548  
##  Mean   : 392.7   Mean   : 6278   Mean   :1937   Mean   :2925   Mean   :1657  
##  3rd Qu.: 438.8   3rd Qu.: 7022   3rd Qu.:1955   3rd Qu.:3239   3rd Qu.:1874  
##  Max.   :1217.8   Max.   :46411   Max.   :2011   Max.   :8154   Max.   :5289  
##      FLOORS          ROOMS           BEDROOMS      FULL.BATH    
##  Min.   :1.000   Min.   : 3.000   Min.   :1.00   Min.   :1.000  
##  1st Qu.:1.000   1st Qu.: 6.000   1st Qu.:3.00   1st Qu.:1.000  
##  Median :2.000   Median : 7.000   Median :3.00   Median :1.000  
##  Mean   :1.684   Mean   : 6.995   Mean   :3.23   Mean   :1.297  
##  3rd Qu.:2.000   3rd Qu.: 8.000   3rd Qu.:4.00   3rd Qu.:2.000  
##  Max.   :3.000   Max.   :14.000   Max.   :9.00   Max.   :5.000  
##    HALF.BATH         KITCHEN        FIREPLACE        REMODEL         
##  Min.   :0.0000   Min.   :1.000   Min.   :0.0000   Length:5802       
##  1st Qu.:0.0000   1st Qu.:1.000   1st Qu.:0.0000   Class :character  
##  Median :1.0000   Median :1.000   Median :1.0000   Mode  :character  
##  Mean   :0.6139   Mean   :1.015   Mean   :0.7399                     
##  3rd Qu.:1.0000   3rd Qu.:1.000   3rd Qu.:1.0000                     
##  Max.   :3.0000   Max.   :2.000   Max.   :4.0000

median.total.value <- median(west.df1$TOTAL.VALUE)

west.df1$EasyToSell <- ifelse(west.df1$TOTAL.VALUE >= median.total.value, "Yes", "No")

west.df1 <- west.df1[, -which(names(west.df1) == "TOTAL.VALUE")]



# Data loading and pre-processing methods 2:
west.df1 <- mlba::WestRoxbury %>% 
  select(-c(TAX)) %>%
  mutate(EasyToSell = factor(ifelse(TOTAL.VALUE >= median(TOTAL.VALUE), "Yes", "No"))) %>%
  select(-c(TOTAL.VALUE))

# set seed for reproducing the partition
set.seed(2)
# partition data
idx <- caret::createDataPartition(west.df1$EasyToSell, p=0.6, list=FALSE)
train.df <- west.df1[idx, ]
holdout.df <- west.df1[-idx, ]
length(holdout.df)

## [1] 13

# build model
trControl <- caret::trainControl(method="cv", number=5, allowParallel=TRUE)
logit.reg <- caret::train(EasyToSell ~ ., data=train.df, trControl=trControl,
                          # fit logistic regression with a generalized linear model
                          method="glm", family="binomial")
logit.reg

## Generalized Linear Model 
## 
## 3482 samples
##   12 predictor
##    2 classes: 'No', 'Yes' 
## 
## No pre-processing
## Resampling: Cross-Validated (5 fold) 
## Summary of sample sizes: 2786, 2785, 2785, 2786, 2786 
## Resampling results:
## 
##   Accuracy   Kappa   
##   0.8521005  0.704203

summary(logit.reg$finalModel)

## 
## Call:
## NULL
## 
## Coefficients:
##                 Estimate Std. Error z value Pr(>|z|)    
## (Intercept)   -1.511e+01  5.368e+00  -2.814  0.00489 ** 
## LOT.SQFT       3.989e-04  3.225e-05  12.368  < 2e-16 ***
## YR.BUILT      -5.576e-04  2.706e-03  -0.206  0.83674    
## GROSS.AREA     1.562e-03  1.801e-04   8.674  < 2e-16 ***
## LIVING.AREA    2.151e-03  3.244e-04   6.629 3.37e-11 ***
## FLOORS         2.307e+00  1.760e-01  13.111  < 2e-16 ***
## ROOMS          2.022e-01  6.582e-02   3.072  0.00213 ** 
## BEDROOMS      -1.902e-01  1.057e-01  -1.799  0.07196 .  
## FULL.BATH      3.080e-01  1.434e-01   2.148  0.03174 *  
## HALF.BATH      7.857e-01  1.210e-01   6.495 8.31e-11 ***
## KITCHEN       -5.249e-01  5.158e-01  -1.018  0.30888    
## FIREPLACE      1.112e+00  1.120e-01   9.926  < 2e-16 ***
## REMODELOld     9.089e-02  1.794e-01   0.507  0.61248    
## REMODELRecent  1.404e+00  1.648e-01   8.521  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 4827.1  on 3481  degrees of freedom
## Residual deviance: 2285.5  on 3468  degrees of freedom
## AIC: 2313.5
## 
## Number of Fisher Scoring iterations: 7

round(data.frame(summary(logit.reg$finalModel)$coefficients,
                 odds = exp(coef(logit.reg$finalModel))), 5)

##                Estimate Std..Error  z.value Pr...z..     odds
## (Intercept)   -15.10606    5.36767 -2.81427  0.00489  0.00000
## LOT.SQFT        0.00040    0.00003 12.36750  0.00000  1.00040
## YR.BUILT       -0.00056    0.00271 -0.20607  0.83674  0.99944
## GROSS.AREA      0.00156    0.00018  8.67402  0.00000  1.00156
## LIVING.AREA     0.00215    0.00032  6.62925  0.00000  1.00215
## FLOORS          2.30729    0.17598 13.11088  0.00000 10.04720
## ROOMS           0.20218    0.06582  3.07187  0.00213  1.22407
## BEDROOMS       -0.19023    0.10572 -1.79934  0.07196  0.82677
## FULL.BATH       0.30800    0.14341  2.14761  0.03174  1.36070
## HALF.BATH       0.78574    0.12098  6.49480  0.00000  2.19403
## KITCHEN        -0.52489    0.51582 -1.01758  0.30888  0.59162
## FIREPLACE       1.11167    0.11200  9.92560  0.00000  3.03942
## REMODELOld      0.09089    0.17943  0.50654  0.61248  1.09515
## REMODELRecent   1.40401    0.16476  8.52129  0.00000  4.07148

# Model Performance
actual <- ifelse(holdout.df$EasyToSell == "Yes", 1, 0)
pred <- predict(logit.reg, holdout.df, type="prob")[, 2]
gain <- gains(actual, pred, groups=length(pred))

nactual <-sum(actual)
ggplot() +
  geom_line(aes(x=gain$cume.obs, y=gain$cume.pct.of.total * nactual), color="green") +
  geom_line(aes(x=c(0, max(gain$cume.obs)), y=c(0, nactual)), color="blue") +
  labs(x="# Cases", y="Cumulative")

confusionMatrix(predict(logit.reg, holdout.df), holdout.df$EasyToSell)

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   No  Yes
##        No  1024  178
##        Yes  135  983
##                                           
##                Accuracy : 0.8651          
##                  95% CI : (0.8505, 0.8787)
##     No Information Rate : 0.5004          
##     P-Value [Acc > NIR] : <2e-16          
##                                           
##                   Kappa : 0.7302          
##                                           
##  Mcnemar's Test P-Value : 0.0176          
##                                           
##             Sensitivity : 0.8835          
##             Specificity : 0.8467          
##          Pos Pred Value : 0.8519          
##          Neg Pred Value : 0.8792          
##              Prevalence : 0.4996          
##          Detection Rate : 0.4414          
##    Detection Prevalence : 0.5181          
##       Balanced Accuracy : 0.8651          
##                                           
##        'Positive' Class : No              
##

ggsave(file=file.path(getwd(), "LRperformance_Flights.pdf"),
       last_plot() + theme_bw(),
       width=3, height=3)

30% quantile of TOTAL.VALUE

# Data loading and pre-processing methods 2:
west.df2 <- mlba::WestRoxbury %>% 
  select(-c(TAX)) %>%
  mutate(EasyToSell = factor(ifelse(TOTAL.VALUE >= quantile(TOTAL.VALUE, 0.3), "Yes", "No"))) %>%
  select(-c(TOTAL.VALUE))

# set seed for reproducing the partition
set.seed(2)
# partition data
idx <- caret::createDataPartition(west.df2$EasyToSell, p=0.6, list=FALSE)
train.df <- west.df2[idx, ]
holdout.df <- west.df2[-idx, ]
length(holdout.df)

## [1] 13

# build model
trControl <- caret::trainControl(method="cv", number=5, allowParallel=TRUE)
logit.reg <- caret::train(EasyToSell ~ ., data=train.df, trControl=trControl,
                          # fit logistic regression with a generalized linear model
                          method="glm", family="binomial")
logit.reg

## Generalized Linear Model 
## 
## 3482 samples
##   12 predictor
##    2 classes: 'No', 'Yes' 
## 
## No pre-processing
## Resampling: Cross-Validated (5 fold) 
## Summary of sample sizes: 2785, 2786, 2786, 2786, 2785 
## Resampling results:
## 
##   Accuracy   Kappa    
##   0.8650217  0.6710532

summary(logit.reg$finalModel)

## 
## Call:
## NULL
## 
## Coefficients:
##                 Estimate Std. Error z value Pr(>|z|)    
## (Intercept)   -1.184e+01  5.934e+00  -1.995   0.0460 *  
## LOT.SQFT       5.168e-04  3.946e-05  13.096  < 2e-16 ***
## YR.BUILT      -2.089e-03  2.998e-03  -0.697   0.4858    
## GROSS.AREA     1.952e-03  1.905e-04  10.248  < 2e-16 ***
## LIVING.AREA    7.622e-04  3.236e-04   2.356   0.0185 *  
## FLOORS         2.572e+00  1.715e-01  14.997  < 2e-16 ***
## ROOMS          3.903e-01  7.404e-02   5.271 1.35e-07 ***
## BEDROOMS      -4.163e-02  1.137e-01  -0.366   0.7144    
## FULL.BATH      2.817e-01  1.613e-01   1.746   0.0808 .  
## HALF.BATH      2.695e-01  1.238e-01   2.176   0.0296 *  
## KITCHEN       -3.040e-01  4.376e-01  -0.695   0.4872    
## FIREPLACE      1.133e+00  1.104e-01  10.262  < 2e-16 ***
## REMODELOld    -6.193e-03  1.865e-01  -0.033   0.9735    
## REMODELRecent  1.717e+00  2.184e-01   7.862 3.78e-15 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 4254.7  on 3481  degrees of freedom
## Residual deviance: 2110.7  on 3468  degrees of freedom
## AIC: 2138.7
## 
## Number of Fisher Scoring iterations: 7

round(data.frame(summary(logit.reg$finalModel)$coefficients,
                 odds = exp(coef(logit.reg$finalModel))), 5)

##                Estimate Std..Error  z.value Pr...z..     odds
## (Intercept)   -11.83966    5.93380 -1.99529  0.04601  0.00001
## LOT.SQFT        0.00052    0.00004 13.09613  0.00000  1.00052
## YR.BUILT       -0.00209    0.00300 -0.69696  0.48583  0.99791
## GROSS.AREA      0.00195    0.00019 10.24824  0.00000  1.00195
## LIVING.AREA     0.00076    0.00032  2.35561  0.01849  1.00076
## FLOORS          2.57166    0.17148 14.99716  0.00000 13.08752
## ROOMS           0.39027    0.07404  5.27134  0.00000  1.47738
## BEDROOMS       -0.04163    0.11374 -0.36600  0.71437  0.95922
## FULL.BATH       0.28167    0.16134  1.74581  0.08084  1.32534
## HALF.BATH       0.26945    0.12385  2.17569  0.02958  1.30925
## KITCHEN        -0.30400    0.43760 -0.69471  0.48724  0.73786
## FIREPLACE       1.13315    0.11042 10.26220  0.00000  3.10541
## REMODELOld     -0.00619    0.18646 -0.03322  0.97350  0.99383
## REMODELRecent   1.71729    0.21843  7.86208  0.00000  5.56942

# Model Performance
actual <- ifelse(holdout.df$EasyToSell == "Yes", 1, 0)
pred <- predict(logit.reg, holdout.df, type="prob")[, 2]
gain <- gains(actual, pred, groups=length(pred))

nactual <-sum(actual)
ggplot() +
  geom_line(aes(x=gain$cume.obs, y=gain$cume.pct.of.total * nactual), color="red") +
  geom_line(aes(x=c(0, max(gain$cume.obs)), y=c(0, nactual)), color="blue") +
  labs(x="# Cases", y="Cumulative")

confusionMatrix(predict(logit.reg, holdout.df), holdout.df$EasyToSell)

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   No  Yes
##        No   481  129
##        Yes  215 1495
##                                           
##                Accuracy : 0.8517          
##                  95% CI : (0.8366, 0.8659)
##     No Information Rate : 0.7             
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.634           
##                                           
##  Mcnemar's Test P-Value : 4.586e-06       
##                                           
##             Sensitivity : 0.6911          
##             Specificity : 0.9206          
##          Pos Pred Value : 0.7885          
##          Neg Pred Value : 0.8743          
##              Prevalence : 0.3000          
##          Detection Rate : 0.2073          
##    Detection Prevalence : 0.2629          
##       Balanced Accuracy : 0.8058          
##                                           
##        'Positive' Class : No              
##

ggsave(file=file.path(getwd(), "LRperformance_Flights.pdf"),
       last_plot() + theme_bw(),
       width=3, height=3)

70% quantile of TOTAL.VALUE

# Data loading and pre-processing methods 2:
west.df3 <- mlba::WestRoxbury %>% 
  select(-c(TAX)) %>%
  mutate(EasyToSell = factor(ifelse(TOTAL.VALUE >= quantile(TOTAL.VALUE, 0.7), "Yes", "No"))) %>%
  select(-c(TOTAL.VALUE))

# set seed for reproducing the partition
set.seed(2)
# partition data
idx <- caret::createDataPartition(west.df3$EasyToSell, p=0.6, list=FALSE)
train.df <- west.df3[idx, ]
holdout.df <- west.df3[-idx, ]
length(holdout.df)

## [1] 13

# build model
trControl <- caret::trainControl(method="cv", number=5, allowParallel=TRUE)
logit.reg <- caret::train(EasyToSell ~ ., data=train.df, trControl=trControl,
                          # fit logistic regression with a generalized linear model
                          method="glm", family="binomial")
logit.reg

## Generalized Linear Model 
## 
## 3482 samples
##   12 predictor
##    2 classes: 'No', 'Yes' 
## 
## No pre-processing
## Resampling: Cross-Validated (5 fold) 
## Summary of sample sizes: 2786, 2786, 2786, 2784, 2786 
## Resampling results:
## 
##   Accuracy   Kappa    
##   0.8897169  0.7303891

summary(logit.reg$finalModel)

## 
## Call:
## NULL
## 
## Coefficients:
##                 Estimate Std. Error z value Pr(>|z|)    
## (Intercept)   -2.285e+01  2.998e+00  -7.624 2.46e-14 ***
## LOT.SQFT       4.452e-04  3.316e-05  13.423  < 2e-16 ***
## YR.BUILT       2.208e-03  1.416e-03   1.560   0.1188    
## GROSS.AREA     1.193e-03  2.007e-04   5.943 2.81e-09 ***
## LIVING.AREA    2.774e-03  3.684e-04   7.530 5.08e-14 ***
## FLOORS         2.378e+00  2.337e-01  10.177  < 2e-16 ***
## ROOMS          3.111e-02  6.720e-02   0.463   0.6434    
## BEDROOMS      -7.276e-02  1.121e-01  -0.649   0.5165    
## FULL.BATH      8.801e-01  1.423e-01   6.183 6.27e-10 ***
## HALF.BATH      1.183e+00  1.439e-01   8.217  < 2e-16 ***
## KITCHEN       -7.622e-01  5.873e-01  -1.298   0.1943    
## FIREPLACE      1.058e+00  1.332e-01   7.944 1.96e-15 ***
## REMODELOld     4.446e-01  1.961e-01   2.267   0.0234 *  
## REMODELRecent  8.411e-01  1.666e-01   5.049 4.44e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 4256.4  on 3481  degrees of freedom
## Residual deviance: 1724.2  on 3468  degrees of freedom
## AIC: 1752.2
## 
## Number of Fisher Scoring iterations: 7

round(data.frame(summary(logit.reg$finalModel)$coefficients,
                 odds = exp(coef(logit.reg$finalModel))), 5)

##                Estimate Std..Error  z.value Pr...z..     odds
## (Intercept)   -22.85443    2.99768 -7.62405  0.00000  0.00000
## LOT.SQFT        0.00045    0.00003 13.42287  0.00000  1.00045
## YR.BUILT        0.00221    0.00142  1.55995  0.11877  1.00221
## GROSS.AREA      0.00119    0.00020  5.94251  0.00000  1.00119
## LIVING.AREA     0.00277    0.00037  7.52973  0.00000  1.00278
## FLOORS          2.37810    0.23368 10.17693  0.00000 10.78434
## ROOMS           0.03111    0.06720  0.46297  0.64339  1.03160
## BEDROOMS       -0.07276    0.11214 -0.64877  0.51648  0.92983
## FULL.BATH       0.88007    0.14233  6.18345  0.00000  2.41108
## HALF.BATH       1.18256    0.14392  8.21695  0.00000  3.26271
## KITCHEN        -0.76223    0.58731 -1.29784  0.19434  0.46663
## FIREPLACE       1.05832    0.13322  7.94400  0.00000  2.88154
## REMODELOld      0.44457    0.19611  2.26695  0.02339  1.55982
## REMODELRecent   0.84110    0.16659  5.04904  0.00000  2.31892

# Model Performance
actual <- ifelse(holdout.df$EasyToSell == "Yes", 1, 0)
pred <- predict(logit.reg, holdout.df, type="prob")[, 2]
gain <- gains(actual, pred, groups=length(pred))

nactual <-sum(actual)
ggplot() +
  geom_line(aes(x=gain$cume.obs, y=gain$cume.pct.of.total * nactual), color="purple") +
  geom_line(aes(x=c(0, max(gain$cume.obs)), y=c(0, nactual)), color="pink") +
  labs(x="# Cases", y="Cumulative")

confusionMatrix(predict(logit.reg, holdout.df), holdout.df$EasyToSell)

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   No  Yes
##        No  1521  155
##        Yes  102  542
##                                           
##                Accuracy : 0.8892          
##                  95% CI : (0.8757, 0.9017)
##     No Information Rate : 0.6996          
##     P-Value [Acc > NIR] : < 2e-16         
##                                           
##                   Kappa : 0.7306          
##                                           
##  Mcnemar's Test P-Value : 0.00118         
##                                           
##             Sensitivity : 0.9372          
##             Specificity : 0.7776          
##          Pos Pred Value : 0.9075          
##          Neg Pred Value : 0.8416          
##              Prevalence : 0.6996          
##          Detection Rate : 0.6556          
##    Detection Prevalence : 0.7224          
##       Balanced Accuracy : 0.8574          
##                                           
##        'Positive' Class : No              
##

ggsave(file=file.path(getwd(), "LRperformance_Flights.pdf"),
       last_plot() + theme_bw(),
       width=3, height=3)

Haiyuan_Gui’s Document

ghy

2023-07-10

Median of TOTAL.VALUE

30% quantile of TOTAL.VALUE

70% quantile of TOTAL.VALUE