homework04

library(ggplot2)
library(gridExtra)

p <- seq(0.005, 0.995, 0.01)
df <- data.frame(
  p = p,
  odds = p / (1 - p),
  logit = log(p / (1 - p))
)

g1 <- ggplot(df, aes(x=p, y=odds)) +
  geom_line() + coord_cartesian(xlim=c(0, 1), ylim=c(0, 100)) +
  labs(x='Probability of success', y='Odds', title='(a)') +
  geom_hline(yintercept = 0) +
  theme_bw() +
  theme(axis.line = element_line(colour = "black"),
        axis.line.x = element_blank(),
        panel.border = element_blank())
g2 <- ggplot(df, aes(x=p, y=logit)) +
  geom_line() + coord_cartesian(xlim=c(0, 1), ylim=c(-4, 4)) +
  labs(x='Probability of success', y='Logit', title='(b)' ) +
  geom_hline(yintercept = 0) +
  theme_bw() +
  theme(axis.line = element_line(colour = "black"),
        axis.line.x = element_blank(),
        panel.border = element_blank())

grid.arrange(g1, g2, ncol=2)

library(caret)

## 载入需要的程辑包：lattice

library(tidyverse)

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.2     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ lubridate 1.9.2     ✔ tibble    3.2.1
## ✔ purrr     1.0.1     ✔ tidyr     1.3.0

## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::combine() masks gridExtra::combine()
## ✖ dplyr::filter()  masks stats::filter()
## ✖ dplyr::lag()     masks stats::lag()
## ✖ purrr::lift()    masks caret::lift()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(dplyr)

# load and preprocess data
housing.df <- mlba::WestRoxbury 
housing.df <- housing.df[, -2] %>%
mutate(
  REMODEL = recode(REMODEL, "None" = 1, "Recent" = 2, "Old" = 3),
  TOTAL.VALUE = ifelse(TOTAL.VALUE <= mean(housing.df$TOTAL.VALUE), 0,1)
  )

# partition data
set.seed(2)
idx <- caret::createDataPartition(housing.df$TOTAL.VALUE, p=0.6, list=FALSE)
train.df <- housing.df[idx, ]
holdout.df <- housing.df[-idx, ]

# build model
train.df$TOTAL.VALUE <- factor(train.df$TOTAL.VALUE, levels = c(0, 1))

trControl <- caret::trainControl(method="cv", number=5, allowParallel=TRUE)
logit.reg <- caret::train(TOTAL.VALUE ~ ., data=train.df, trControl=trControl,
                          method="glm", family="binomial")
logit.reg

## Generalized Linear Model 
## 
## 3482 samples
##   12 predictor
##    2 classes: '0', '1' 
## 
## No pre-processing
## Resampling: Cross-Validated (5 fold) 
## Summary of sample sizes: 2785, 2786, 2786, 2786, 2785 
## Resampling results:
## 
##   Accuracy   Kappa   
##   0.8627204  0.714422

round(data.frame(summary(logit.reg$finalModel)$coefficients,
                 odds = exp(coef(logit.reg$finalModel))), 5)

##              Estimate Std..Error  z.value Pr...z..    odds
## (Intercept) -22.16656    5.54979 -3.99412  0.00006 0.00000
## LOT.SQFT      0.00040    0.00003 13.22724  0.00000 1.00040
## YR.BUILT      0.00254    0.00277  0.91731  0.35898 1.00254
## GROSS.AREA    0.00121    0.00019  6.44051  0.00000 1.00121
## LIVING.AREA   0.00262    0.00035  7.47168  0.00000 1.00262
## FLOORS        2.17120    0.19219 11.29712  0.00000 8.76877
## ROOMS         0.20738    0.06520  3.18057  0.00147 1.23045
## BEDROOMS     -0.25377    0.10775 -2.35519  0.01851 0.77587
## FULL.BATH     0.65827    0.14485  4.54444  0.00001 1.93146
## HALF.BATH     0.87676    0.12810  6.84415  0.00000 2.40309
## KITCHEN      -0.49085    0.48105 -1.02036  0.30756 0.61211
## FIREPLACE     1.03044    0.11938  8.63136  0.00000 2.80229
## REMODEL       0.43856    0.08420  5.20871  0.00000 1.55047

summary(logit.reg$finalModel)

## 
## Call:
## NULL
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -2.217e+01  5.550e+00  -3.994 6.49e-05 ***
## LOT.SQFT     4.004e-04  3.027e-05  13.227  < 2e-16 ***
## YR.BUILT     2.541e-03  2.770e-03   0.917  0.35898    
## GROSS.AREA   1.208e-03  1.876e-04   6.441 1.19e-10 ***
## LIVING.AREA  2.615e-03  3.500e-04   7.472 7.92e-14 ***
## FLOORS       2.171e+00  1.922e-01  11.297  < 2e-16 ***
## ROOMS        2.074e-01  6.520e-02   3.181  0.00147 ** 
## BEDROOMS    -2.538e-01  1.077e-01  -2.355  0.01851 *  
## FULL.BATH    6.583e-01  1.449e-01   4.544 5.51e-06 ***
## HALF.BATH    8.768e-01  1.281e-01   6.844 7.69e-12 ***
## KITCHEN     -4.908e-01  4.811e-01  -1.020  0.30756    
## FIREPLACE    1.030e+00  1.194e-01   8.631  < 2e-16 ***
## REMODEL      4.386e-01  8.420e-02   5.209 1.90e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 4718.2  on 3481  degrees of freedom
## Residual deviance: 2151.1  on 3469  degrees of freedom
## AIC: 2177.1
## 
## Number of Fisher Scoring iterations: 7

logit.reg.pred <- predict(logit.reg,holdout.df,type = "prob")

# display four different cases
interestingCases = c(1, 12, 32, 1333)
data.frame(
  actual = holdout.df$TOTAL.VALUE[interestingCases],
  p0 = logit.reg.pred[interestingCases, 1],
  p1 = logit.reg.pred[interestingCases, 2],
  predicted = ifelse(logit.reg.pred[interestingCases, 2] > 0.5, 1, 0)
)

##   actual         p0        p1 predicted
## 1      0 0.64834697 0.3516530         0
## 2      0 0.45205297 0.5479470         1
## 3      0 0.61739817 0.3826018         0
## 4      1 0.02088205 0.9791179         1

library(gains)
actual <- ifelse(holdout.df$TOTAL.VALUE < mean(housing.df$TOTAL.VALUE), 0,1)
gain <- gains(actual, logit.reg.pred[,2], groups=length(actual) - 2)

# plot gains chart
nactual <-sum(actual)
g1 <- ggplot() +
  geom_line(aes(x=gain$cume.obs, y=gain$cume.pct.of.total * nactual)) +
  geom_line(aes(x=c(0, max(gain$cume.obs)), y=c(0, nactual)), color="darkgrey") +
  labs(x="# Cases", y="Cumulative") 

# plot decile-wise lift chart
gain10 <- gains(actual, logit.reg.pred[,2], groups=10)
g2 <- ggplot(mapping=aes(x=gain10$depth, y=gain10$lift / 100)) +
  geom_col(fill="steelblue") +
  geom_text(aes(label=round(gain10$lift / 100, 1)), vjust=-0.2, size=3) +
  ylim(0, 3) + labs(x="Percentile", y="Lift")
grid.arrange(g1, g2, ncol=2)

# 阈值为0.3的预测准确性
pred_0.3 <- ifelse(logit.reg.pred[, 2] > 0.3, 1, 0)
confusion_0.3 <- table(pred_0.3, holdout.df$TOTAL.VALUE)
accuracy_0.3 <- sum(diag(confusion_0.3)) / sum(confusion_0.3)
precision_0.3 <- confusion_0.3[2, 2] / sum(confusion_0.3[, 2])
recall_0.3 <- confusion_0.3[2, 2] / sum(confusion_0.3[2, ])
f1score_0.3 <- 2 * precision_0.3 * recall_0.3 / (precision_0.3 + recall_0.3)

# 阈值为0.5的预测准确性
pred_0.5 <- ifelse(logit.reg.pred[, 2] > 0.5, 1, 0)
confusion_0.5 <- table(pred_0.5, holdout.df$TOTAL.VALUE)
accuracy_0.5 <- sum(diag(confusion_0.5)) / sum(confusion_0.5)
precision_0.5 <- confusion_0.5[2, 2] / sum(confusion_0.5[, 2])
recall_0.5 <- confusion_0.5[2, 2] / sum(confusion_0.5[2, ])
f1score_0.5 <- 2 * precision_0.5 * recall_0.5 / (precision_0.5 + recall_0.5)

# 阈值为0.7的预测准确性
pred_0.7 <- ifelse(logit.reg.pred[, 2] > 0.7, 1, 0)
confusion_0.7 <- table(pred_0.7, holdout.df$TOTAL.VALUE)
accuracy_0.7 <- sum(diag(confusion_0.7)) / sum(confusion_0.7)
precision_0.7 <- confusion_0.7[2, 2] / sum(confusion_0.7[, 2])
recall_0.7 <- confusion_0.7[2, 2] / sum(confusion_0.7[2, ])
f1score_0.7 <- 2 * precision_0.7 * recall_0.7 / (precision_0.7 + recall_0.7)


accuracy_0.3

## [1] 0.8543103

precision_0.3

## [1] 0.9233954

recall_0.3

## [1] 0.7716263

f1score_0.3

## [1] 0.8407163

accuracy_0.5

## [1] 0.8724138

precision_0.5

## [1] 0.8343685

recall_0.5

## [1] 0.8556263

f1score_0.5

## [1] 0.8448637

accuracy_0.7

## [1] 0.850431

precision_0.7

## [1] 0.6904762

recall_0.7

## [1] 0.9328671

f1score_0.7

## [1] 0.7935753

Accuracy: Accuracy measures the overall correctness of the model’s predictions across all samples. A higher accuracy indicates better overall performance of the model. However, it is important to consider the class distribution in the dataset when interpreting accuracy.

Precision: Precision measures the proportion of true positives among the samples predicted as positive by the model. A higher precision indicates that the model generates fewer false positives, reducing the risk of false alarms.

Recall: Recall measures the proportion of true positives among all actual positives. A higher recall indicates that the model captures more of the positive instances, reducing the risk of false negatives.

F1-score: F1-score is the harmonic mean of precision and recall, providing a balanced measure of the model’s accuracy and recall capability. A higher F1-score indicates that the model achieves a good balance between precision and recall.

Based on the given results, it can be observed that the accuracy and F1-score are relatively high when the threshold is set to 0.5. The precision and recall values are also at a good level. This indicates that at a threshold of 0.5, the model performs well overall and achieves a balanced trade-off between precision and recall.

However, at a threshold of 0.3, the accuracy and F1-score are slightly lower, indicating a higher rate of false positives (Type I errors). However, the recall is relatively high, indicating better capture of positive instances and a lower rate of false negatives (Type II errors). At a threshold of 0.7, the precision is lower, which may lead to more false negatives, but the recall is higher.

homework04

Zhang Heng

2023-07-14