library(ggplot2)
library(gridExtra)
p <- seq(0.005, 0.995, 0.01)
df <- data.frame(
p = p,
odds = p / (1 - p),
logit = log(p / (1 - p))
)
g1 <- ggplot(df, aes(x=p, y=odds)) +
geom_line() + coord_cartesian(xlim=c(0, 1), ylim=c(0, 100)) +
labs(x='Probability of success', y='Odds', title='(a)') +
geom_hline(yintercept = 0) +
theme_bw() +
theme(axis.line = element_line(colour = "black"),
axis.line.x = element_blank(),
panel.border = element_blank())
g2 <- ggplot(df, aes(x=p, y=logit)) +
geom_line() + coord_cartesian(xlim=c(0, 1), ylim=c(-4, 4)) +
labs(x='Probability of success', y='Logit', title='(b)' ) +
geom_hline(yintercept = 0) +
theme_bw() +
theme(axis.line = element_line(colour = "black"),
axis.line.x = element_blank(),
panel.border = element_blank())
grid.arrange(g1, g2, ncol=2)
library(caret)
## 载入需要的程辑包:lattice
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.2 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ lubridate 1.9.2 ✔ tibble 3.2.1
## ✔ purrr 1.0.1 ✔ tidyr 1.3.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::combine() masks gridExtra::combine()
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ✖ purrr::lift() masks caret::lift()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(dplyr)
# load and preprocess data
housing.df <- mlba::WestRoxbury
housing.df <- housing.df[, -2] %>%
mutate(
REMODEL = recode(REMODEL, "None" = 1, "Recent" = 2, "Old" = 3),
TOTAL.VALUE = ifelse(TOTAL.VALUE <= mean(housing.df$TOTAL.VALUE), 0,1)
)
# partition data
set.seed(2)
idx <- caret::createDataPartition(housing.df$TOTAL.VALUE, p=0.6, list=FALSE)
train.df <- housing.df[idx, ]
holdout.df <- housing.df[-idx, ]
# build model
train.df$TOTAL.VALUE <- factor(train.df$TOTAL.VALUE, levels = c(0, 1))
trControl <- caret::trainControl(method="cv", number=5, allowParallel=TRUE)
logit.reg <- caret::train(TOTAL.VALUE ~ ., data=train.df, trControl=trControl,
method="glm", family="binomial")
logit.reg
## Generalized Linear Model
##
## 3482 samples
## 12 predictor
## 2 classes: '0', '1'
##
## No pre-processing
## Resampling: Cross-Validated (5 fold)
## Summary of sample sizes: 2785, 2786, 2786, 2786, 2785
## Resampling results:
##
## Accuracy Kappa
## 0.8627204 0.714422
round(data.frame(summary(logit.reg$finalModel)$coefficients,
odds = exp(coef(logit.reg$finalModel))), 5)
## Estimate Std..Error z.value Pr...z.. odds
## (Intercept) -22.16656 5.54979 -3.99412 0.00006 0.00000
## LOT.SQFT 0.00040 0.00003 13.22724 0.00000 1.00040
## YR.BUILT 0.00254 0.00277 0.91731 0.35898 1.00254
## GROSS.AREA 0.00121 0.00019 6.44051 0.00000 1.00121
## LIVING.AREA 0.00262 0.00035 7.47168 0.00000 1.00262
## FLOORS 2.17120 0.19219 11.29712 0.00000 8.76877
## ROOMS 0.20738 0.06520 3.18057 0.00147 1.23045
## BEDROOMS -0.25377 0.10775 -2.35519 0.01851 0.77587
## FULL.BATH 0.65827 0.14485 4.54444 0.00001 1.93146
## HALF.BATH 0.87676 0.12810 6.84415 0.00000 2.40309
## KITCHEN -0.49085 0.48105 -1.02036 0.30756 0.61211
## FIREPLACE 1.03044 0.11938 8.63136 0.00000 2.80229
## REMODEL 0.43856 0.08420 5.20871 0.00000 1.55047
summary(logit.reg$finalModel)
##
## Call:
## NULL
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.217e+01 5.550e+00 -3.994 6.49e-05 ***
## LOT.SQFT 4.004e-04 3.027e-05 13.227 < 2e-16 ***
## YR.BUILT 2.541e-03 2.770e-03 0.917 0.35898
## GROSS.AREA 1.208e-03 1.876e-04 6.441 1.19e-10 ***
## LIVING.AREA 2.615e-03 3.500e-04 7.472 7.92e-14 ***
## FLOORS 2.171e+00 1.922e-01 11.297 < 2e-16 ***
## ROOMS 2.074e-01 6.520e-02 3.181 0.00147 **
## BEDROOMS -2.538e-01 1.077e-01 -2.355 0.01851 *
## FULL.BATH 6.583e-01 1.449e-01 4.544 5.51e-06 ***
## HALF.BATH 8.768e-01 1.281e-01 6.844 7.69e-12 ***
## KITCHEN -4.908e-01 4.811e-01 -1.020 0.30756
## FIREPLACE 1.030e+00 1.194e-01 8.631 < 2e-16 ***
## REMODEL 4.386e-01 8.420e-02 5.209 1.90e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 4718.2 on 3481 degrees of freedom
## Residual deviance: 2151.1 on 3469 degrees of freedom
## AIC: 2177.1
##
## Number of Fisher Scoring iterations: 7
logit.reg.pred <- predict(logit.reg,holdout.df,type = "prob")
# display four different cases
interestingCases = c(1, 12, 32, 1333)
data.frame(
actual = holdout.df$TOTAL.VALUE[interestingCases],
p0 = logit.reg.pred[interestingCases, 1],
p1 = logit.reg.pred[interestingCases, 2],
predicted = ifelse(logit.reg.pred[interestingCases, 2] > 0.5, 1, 0)
)
## actual p0 p1 predicted
## 1 0 0.64834697 0.3516530 0
## 2 0 0.45205297 0.5479470 1
## 3 0 0.61739817 0.3826018 0
## 4 1 0.02088205 0.9791179 1
library(gains)
actual <- ifelse(holdout.df$TOTAL.VALUE < mean(housing.df$TOTAL.VALUE), 0,1)
gain <- gains(actual, logit.reg.pred[,2], groups=length(actual) - 2)
# plot gains chart
nactual <-sum(actual)
g1 <- ggplot() +
geom_line(aes(x=gain$cume.obs, y=gain$cume.pct.of.total * nactual)) +
geom_line(aes(x=c(0, max(gain$cume.obs)), y=c(0, nactual)), color="darkgrey") +
labs(x="# Cases", y="Cumulative")
# plot decile-wise lift chart
gain10 <- gains(actual, logit.reg.pred[,2], groups=10)
g2 <- ggplot(mapping=aes(x=gain10$depth, y=gain10$lift / 100)) +
geom_col(fill="steelblue") +
geom_text(aes(label=round(gain10$lift / 100, 1)), vjust=-0.2, size=3) +
ylim(0, 3) + labs(x="Percentile", y="Lift")
grid.arrange(g1, g2, ncol=2)
# 阈值为0.3的预测准确性
pred_0.3 <- ifelse(logit.reg.pred[, 2] > 0.3, 1, 0)
confusion_0.3 <- table(pred_0.3, holdout.df$TOTAL.VALUE)
accuracy_0.3 <- sum(diag(confusion_0.3)) / sum(confusion_0.3)
precision_0.3 <- confusion_0.3[2, 2] / sum(confusion_0.3[, 2])
recall_0.3 <- confusion_0.3[2, 2] / sum(confusion_0.3[2, ])
f1score_0.3 <- 2 * precision_0.3 * recall_0.3 / (precision_0.3 + recall_0.3)
# 阈值为0.5的预测准确性
pred_0.5 <- ifelse(logit.reg.pred[, 2] > 0.5, 1, 0)
confusion_0.5 <- table(pred_0.5, holdout.df$TOTAL.VALUE)
accuracy_0.5 <- sum(diag(confusion_0.5)) / sum(confusion_0.5)
precision_0.5 <- confusion_0.5[2, 2] / sum(confusion_0.5[, 2])
recall_0.5 <- confusion_0.5[2, 2] / sum(confusion_0.5[2, ])
f1score_0.5 <- 2 * precision_0.5 * recall_0.5 / (precision_0.5 + recall_0.5)
# 阈值为0.7的预测准确性
pred_0.7 <- ifelse(logit.reg.pred[, 2] > 0.7, 1, 0)
confusion_0.7 <- table(pred_0.7, holdout.df$TOTAL.VALUE)
accuracy_0.7 <- sum(diag(confusion_0.7)) / sum(confusion_0.7)
precision_0.7 <- confusion_0.7[2, 2] / sum(confusion_0.7[, 2])
recall_0.7 <- confusion_0.7[2, 2] / sum(confusion_0.7[2, ])
f1score_0.7 <- 2 * precision_0.7 * recall_0.7 / (precision_0.7 + recall_0.7)
accuracy_0.3
## [1] 0.8543103
precision_0.3
## [1] 0.9233954
recall_0.3
## [1] 0.7716263
f1score_0.3
## [1] 0.8407163
accuracy_0.5
## [1] 0.8724138
precision_0.5
## [1] 0.8343685
recall_0.5
## [1] 0.8556263
f1score_0.5
## [1] 0.8448637
accuracy_0.7
## [1] 0.850431
precision_0.7
## [1] 0.6904762
recall_0.7
## [1] 0.9328671
f1score_0.7
## [1] 0.7935753
Accuracy: Accuracy measures the overall correctness of the model’s predictions across all samples. A higher accuracy indicates better overall performance of the model. However, it is important to consider the class distribution in the dataset when interpreting accuracy.
Precision: Precision measures the proportion of true positives among the samples predicted as positive by the model. A higher precision indicates that the model generates fewer false positives, reducing the risk of false alarms.
Recall: Recall measures the proportion of true positives among all actual positives. A higher recall indicates that the model captures more of the positive instances, reducing the risk of false negatives.
F1-score: F1-score is the harmonic mean of precision and recall, providing a balanced measure of the model’s accuracy and recall capability. A higher F1-score indicates that the model achieves a good balance between precision and recall.
Based on the given results, it can be observed that the accuracy and F1-score are relatively high when the threshold is set to 0.5. The precision and recall values are also at a good level. This indicates that at a threshold of 0.5, the model performs well overall and achieves a balanced trade-off between precision and recall.
However, at a threshold of 0.3, the accuracy and F1-score are slightly lower, indicating a higher rate of false positives (Type I errors). However, the recall is relatively high, indicating better capture of positive instances and a lower rate of false negatives (Type II errors). At a threshold of 0.7, the precision is lower, which may lead to more false negatives, but the recall is higher.