Homework4

1.Load and preprocess data

#Load and preprocess data
housing.df <- mlba::WestRoxbury
names(housing.df)[3]<- "LOT.SIZE"  #change the column name if so desired
names(housing.df) <- tolower(names(housing.df)) #change to lowercase if so desired

#Discretize total.value
above_median <- subset(housing.df,total.value>=median(total.value))
below_median <- subset(housing.df,total.value<median(total.value))
above_median$total.value <- 1  # 1 indicates that the total value of the home is more than its median
below_median$total.value <- 0  # 0 indicates that the total value of the home is less than its median
house.df <- rbind(above_median,below_median)  
house.df$total.value <- factor(house.df$total.value,levels=c(0,1),labels=c("No","Yes"),order=TRUE)
colnames(house.df)[1] <- "is_high_value"
str(house.df)

## 'data.frame':    5802 obs. of  14 variables:
##  $ is_high_value: Ord.factor w/ 2 levels "No"<"Yes": 2 2 2 2 2 2 2 2 2 2 ...
##  $ tax          : int  5190 6272 5150 7233 5216 5428 6173 5440 5101 7124 ...
##  $ lot.size     : int  6590 13773 5093 12288 12972 6733 5683 7139 3867 8249 ...
##  $ yr.built     : int  1945 1957 1900 2004 1892 1990 1995 1996 1996 2007 ...
##  $ gross.area   : int  3108 5032 4818 4616 3796 2880 4100 2340 2431 4390 ...
##  $ living.area  : int  1976 2608 2992 2378 2054 1792 2640 1500 1560 2708 ...
##  $ floors       : num  2 1 2 2 1.5 2 2 2 2 2 ...
##  $ rooms        : int  10 9 8 9 6 7 6 7 7 8 ...
##  $ bedrooms     : int  4 5 4 4 3 3 3 3 3 4 ...
##  $ full.bath    : int  2 1 2 2 3 2 1 2 2 2 ...
##  $ half.bath    : int  1 1 0 1 0 1 1 1 1 1 ...
##  $ kitchen      : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ fireplace    : int  0 1 0 1 0 1 1 0 1 1 ...
##  $ remodel      : chr  "Recent" "None" "None" "None" ...

2.Partition data

#Partition data
library(caret)

## 载入需要的程辑包：ggplot2

## 载入需要的程辑包：lattice

set.seed(1011)
idx <- caret::createDataPartition(house.df$is_high_value,p=0.6, list=FALSE)
train.df <- house.df[idx,]
holdout.df <- house.df[-idx,]

3.Build Logit Model

#build model
trControl <- caret::trainControl(method = "cv",
                                 number=8,
                                 allowParallel = TRUE)
options (warn = -1)
logit.reg <- caret::train(is_high_value~lot.size+yr.built+gross.area+
                            living.area+floors+bedrooms+full.bath+half.bath+
                            kitchen+fireplace+remodel,data=train.df,trControl=trControl,
                          method="glm", family="binomial")
summary(logit.reg$finalModel)

## 
## Call:
## NULL
## 
## Coefficients:
##                 Estimate Std. Error z value Pr(>|z|)    
## (Intercept)   -1.692e+01  5.437e+00  -3.112  0.00186 ** 
## lot.size       4.384e-04  3.164e-05  13.859  < 2e-16 ***
## yr.built       1.766e-04  2.725e-03   0.065  0.94833    
## gross.area     1.865e-03  1.922e-04   9.703  < 2e-16 ***
## living.area    1.980e-03  3.327e-04   5.952 2.64e-09 ***
## floors         2.570e+00  1.838e-01  13.978  < 2e-16 ***
## bedrooms      -1.685e-02  9.906e-02  -0.170  0.86496    
## full.bath      1.938e-01  1.448e-01   1.339  0.18069    
## half.bath      6.020e-01  1.222e-01   4.927 8.34e-07 ***
## kitchen       -3.105e-01  5.260e-01  -0.590  0.55493    
## fireplace      1.137e+00  1.128e-01  10.074  < 2e-16 ***
## remodelOld     1.092e-01  1.784e-01   0.612  0.54040    
## remodelRecent  1.320e+00  1.643e-01   8.034 9.40e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 4827.1  on 3481  degrees of freedom
## Residual deviance: 2232.4  on 3469  degrees of freedom
## AIC: 2258.4
## 
## Number of Fisher Scoring iterations: 7

4.Confusion Matrix

logit.reg.pred <- predict(logit.reg, holdout.df, type = "prob")

# Confusion Matrix
pred_0.3 <- ifelse(logit.reg.pred[,2] > 0.3 ,"Yes","No")  # cut-off point =0.3
pred_0.5 <- ifelse(logit.reg.pred[,2] > 0.5 ,"Yes","No")  # cut-off point =0.5
pred_0.7 <- ifelse(logit.reg.pred[,2] > 0.7 ,"Yes","No")  # cut-off point =0.7
pred <- data.frame(pred_0.3=pred_0.3,
                    pred_0.5=pred_0.5,
                    pred_0.7=pred_0.7)

# cut-off point =0.3
ConfusionMatrix1 <- table(Predicted=pred$pred_0.3,Actual=holdout.df$is_high_value);
ConfusionMatrix1

##          Actual
## Predicted   No  Yes
##       No   881   86
##       Yes  278 1075

# cut-off point =0.5
ConfusionMatrix2 <- table(Predicted=pred$pred_0.5,Actual=holdout.df$is_high_value);
ConfusionMatrix2

##          Actual
## Predicted   No  Yes
##       No  1017  188
##       Yes  142  973

# cut-off point =0.7
ConfusionMatrix3 <- table(Predicted=pred$pred_0.7,Actual=holdout.df$is_high_value);
ConfusionMatrix3

##          Actual
## Predicted   No  Yes
##       No  1089  324
##       Yes   70  837

5. ROC Curve and Best Cut-off point

## Select the best cut-off point
library(pROC)

## Type 'citation("pROC")' for a citation.

## 
## 载入程辑包：'pROC'

## The following objects are masked from 'package:stats':
## 
##     cov, smooth, var

rocobj <- roc(holdout.df$is_high_value,logit.reg.pred[,2],levels=c("No","Yes"))

## Setting direction: controls < cases

plot(rocobj,
     legacy.axes = TRUE,
     main="Best Cut-Off Point of ROC Curve",
     thresholds="best", 
     print.thres="best") # The best cut-off point is displayed on the ROC curve

# We find the best cut-off point of ROC Curve is 0.41
roc_result <- coords(rocobj, "best")
# Calculate TP, FP, TN and FN
TP <- dim(holdout.df[as.numeric(holdout.df$is_high_value)==2 & logit.reg.pred[,2] > roc_result$threshold, ])[1]
FP <- dim(holdout.df[as.numeric(holdout.df$is_high_value)==1 & logit.reg.pred[,2] > roc_result$threshold, ])[1]
TN <- dim(holdout.df[as.numeric(holdout.df$is_high_value)==1 & logit.reg.pred[,2] <= roc_result$threshold, ])[1]
FN <- dim(holdout.df[as.numeric(holdout.df$is_high_value)==2 & logit.reg.pred[,2] <= roc_result$threshold, ])[1]

data.frame(
  Sensitivity_best = TP/(TP+FN),
  Specificity_best = TN/(TN+FP),
  Precision_best = TP/(TP+FP),
  F1_Score_best = 2*TP/(2*TP+FN+FP)
)

##   Sensitivity_best Specificity_best Precision_best F1_Score_best
## 1        0.8871662        0.8412425      0.8484349     0.8673684

Homework4

Guang Cao

2023-07-11

1.Load and preprocess data

2.Partition data

3.Build Logit Model

4.Confusion Matrix

5. ROC Curve and Best Cut-off point