1.Load and preprocess data
#Load and preprocess data
housing.df <- mlba::WestRoxbury
names(housing.df)[3]<- "LOT.SIZE" #change the column name if so desired
names(housing.df) <- tolower(names(housing.df)) #change to lowercase if so desired
#Discretize total.value
above_median <- subset(housing.df,total.value>=median(total.value))
below_median <- subset(housing.df,total.value<median(total.value))
above_median$total.value <- 1 # 1 indicates that the total value of the home is more than its median
below_median$total.value <- 0 # 0 indicates that the total value of the home is less than its median
house.df <- rbind(above_median,below_median)
house.df$total.value <- factor(house.df$total.value,levels=c(0,1),labels=c("No","Yes"),order=TRUE)
colnames(house.df)[1] <- "is_high_value"
str(house.df)
## 'data.frame': 5802 obs. of 14 variables:
## $ is_high_value: Ord.factor w/ 2 levels "No"<"Yes": 2 2 2 2 2 2 2 2 2 2 ...
## $ tax : int 5190 6272 5150 7233 5216 5428 6173 5440 5101 7124 ...
## $ lot.size : int 6590 13773 5093 12288 12972 6733 5683 7139 3867 8249 ...
## $ yr.built : int 1945 1957 1900 2004 1892 1990 1995 1996 1996 2007 ...
## $ gross.area : int 3108 5032 4818 4616 3796 2880 4100 2340 2431 4390 ...
## $ living.area : int 1976 2608 2992 2378 2054 1792 2640 1500 1560 2708 ...
## $ floors : num 2 1 2 2 1.5 2 2 2 2 2 ...
## $ rooms : int 10 9 8 9 6 7 6 7 7 8 ...
## $ bedrooms : int 4 5 4 4 3 3 3 3 3 4 ...
## $ full.bath : int 2 1 2 2 3 2 1 2 2 2 ...
## $ half.bath : int 1 1 0 1 0 1 1 1 1 1 ...
## $ kitchen : int 1 1 1 1 1 1 1 1 1 1 ...
## $ fireplace : int 0 1 0 1 0 1 1 0 1 1 ...
## $ remodel : chr "Recent" "None" "None" "None" ...
2.Partition data
#Partition data
library(caret)
## 载入需要的程辑包:ggplot2
## 载入需要的程辑包:lattice
set.seed(1011)
idx <- caret::createDataPartition(house.df$is_high_value,p=0.6, list=FALSE)
train.df <- house.df[idx,]
holdout.df <- house.df[-idx,]
3.Build Logit Model
#build model
trControl <- caret::trainControl(method = "cv",
number=8,
allowParallel = TRUE)
options (warn = -1)
logit.reg <- caret::train(is_high_value~lot.size+yr.built+gross.area+
living.area+floors+bedrooms+full.bath+half.bath+
kitchen+fireplace+remodel,data=train.df,trControl=trControl,
method="glm", family="binomial")
summary(logit.reg$finalModel)
##
## Call:
## NULL
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.692e+01 5.437e+00 -3.112 0.00186 **
## lot.size 4.384e-04 3.164e-05 13.859 < 2e-16 ***
## yr.built 1.766e-04 2.725e-03 0.065 0.94833
## gross.area 1.865e-03 1.922e-04 9.703 < 2e-16 ***
## living.area 1.980e-03 3.327e-04 5.952 2.64e-09 ***
## floors 2.570e+00 1.838e-01 13.978 < 2e-16 ***
## bedrooms -1.685e-02 9.906e-02 -0.170 0.86496
## full.bath 1.938e-01 1.448e-01 1.339 0.18069
## half.bath 6.020e-01 1.222e-01 4.927 8.34e-07 ***
## kitchen -3.105e-01 5.260e-01 -0.590 0.55493
## fireplace 1.137e+00 1.128e-01 10.074 < 2e-16 ***
## remodelOld 1.092e-01 1.784e-01 0.612 0.54040
## remodelRecent 1.320e+00 1.643e-01 8.034 9.40e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 4827.1 on 3481 degrees of freedom
## Residual deviance: 2232.4 on 3469 degrees of freedom
## AIC: 2258.4
##
## Number of Fisher Scoring iterations: 7
4.Confusion Matrix
logit.reg.pred <- predict(logit.reg, holdout.df, type = "prob")
# Confusion Matrix
pred_0.3 <- ifelse(logit.reg.pred[,2] > 0.3 ,"Yes","No") # cut-off point =0.3
pred_0.5 <- ifelse(logit.reg.pred[,2] > 0.5 ,"Yes","No") # cut-off point =0.5
pred_0.7 <- ifelse(logit.reg.pred[,2] > 0.7 ,"Yes","No") # cut-off point =0.7
pred <- data.frame(pred_0.3=pred_0.3,
pred_0.5=pred_0.5,
pred_0.7=pred_0.7)
# cut-off point =0.3
ConfusionMatrix1 <- table(Predicted=pred$pred_0.3,Actual=holdout.df$is_high_value);
ConfusionMatrix1
## Actual
## Predicted No Yes
## No 881 86
## Yes 278 1075
# cut-off point =0.5
ConfusionMatrix2 <- table(Predicted=pred$pred_0.5,Actual=holdout.df$is_high_value);
ConfusionMatrix2
## Actual
## Predicted No Yes
## No 1017 188
## Yes 142 973
# cut-off point =0.7
ConfusionMatrix3 <- table(Predicted=pred$pred_0.7,Actual=holdout.df$is_high_value);
ConfusionMatrix3
## Actual
## Predicted No Yes
## No 1089 324
## Yes 70 837
5. ROC Curve and Best Cut-off point
## Select the best cut-off point
library(pROC)
## Type 'citation("pROC")' for a citation.
##
## 载入程辑包:'pROC'
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
rocobj <- roc(holdout.df$is_high_value,logit.reg.pred[,2],levels=c("No","Yes"))
## Setting direction: controls < cases
plot(rocobj,
legacy.axes = TRUE,
main="Best Cut-Off Point of ROC Curve",
thresholds="best",
print.thres="best") # The best cut-off point is displayed on the ROC curve

# We find the best cut-off point of ROC Curve is 0.41
roc_result <- coords(rocobj, "best")
# Calculate TP, FP, TN and FN
TP <- dim(holdout.df[as.numeric(holdout.df$is_high_value)==2 & logit.reg.pred[,2] > roc_result$threshold, ])[1]
FP <- dim(holdout.df[as.numeric(holdout.df$is_high_value)==1 & logit.reg.pred[,2] > roc_result$threshold, ])[1]
TN <- dim(holdout.df[as.numeric(holdout.df$is_high_value)==1 & logit.reg.pred[,2] <= roc_result$threshold, ])[1]
FN <- dim(holdout.df[as.numeric(holdout.df$is_high_value)==2 & logit.reg.pred[,2] <= roc_result$threshold, ])[1]
data.frame(
Sensitivity_best = TP/(TP+FN),
Specificity_best = TN/(TN+FP),
Precision_best = TP/(TP+FP),
F1_Score_best = 2*TP/(2*TP+FN+FP)
)
## Sensitivity_best Specificity_best Precision_best F1_Score_best
## 1 0.8871662 0.8412425 0.8484349 0.8673684