代码
suppressPackageStartupMessages({
library(tidyverse)
library(tidymodels)
library(knitr)
library(naniar)
library(patchwork)
library(corrplot)
library(Hmisc)
library(janitor)
})手动提交后,四个模型的 Kaggle 评分如 Kaggle 评分 1
加载数据分析与建模所需包
suppressPackageStartupMessages({
library(tidyverse)
library(tidymodels)
library(knitr)
library(naniar)
library(patchwork)
library(corrplot)
library(Hmisc)
library(janitor)
})加载训练集和测试集数据
# 加载原始数据
train_full <- read_csv(
'D:/RDirectory/house-price/house-prices-advanced-regression-techniques/train.csv',
show_col_types = FALSE
)
test_full <- read_csv(
'D:/RDirectory/house-price/house-prices-advanced-regression-techniques/test.csv',
show_col_types = FALSE
)
cat("数据加载完成!train.csv:", dim(train_full), "test.csv:", dim(test_full))数据加载完成!train.csv: 1460 81 test.csv: 1459 80
基准模型-数据预处理配方构建与应用
# 创建基准数据副本
train_baseline <- train_full |> mutate(type = "train")
test_baseline <- test_full |> mutate(type = "test", SalePrice = NA)
# 构建基准预处理配方
baseline_recipe <- recipe(SalePrice ~ ., data = train_baseline) |>
update_role(Id, type, new_role = "ID") |>
step_novel(all_nominal_predictors()) |>
step_impute_mode(all_nominal_predictors()) |>
step_impute_mean(all_numeric_predictors()) |>
step_dummy(all_nominal_predictors())
# 训练并应用配方
baseline_recipe_trained <- prep(baseline_recipe, training = train_baseline)
train_processed_baseline <- bake(baseline_recipe_trained, train_baseline) |>
select(-type, -Id)
test_processed_baseline <- bake(baseline_recipe_trained, test_baseline) |>
select(-type, -Id, -SalePrice)基准模型-线性回归训练与性能评估
# 数据分割
set.seed(42)
split_baseline <- initial_split(train_processed_baseline, prop = 0.7)
train_data_baseline <- training(split_baseline)
test_data_baseline <- testing(split_baseline)
# 模型训练与评估
lm_baseline <- linear_reg() |>
set_engine("lm") |>
fit(SalePrice ~ ., data = train_data_baseline)
# 生成预测
pred_baseline <- predict(lm_baseline, test_data_baseline) |>
bind_cols(test_data_baseline |> select(SalePrice)) |>
mutate(residual = SalePrice - .pred)
# 评估指标
metrics_baseline <- pred_baseline |>
metrics(truth = SalePrice, estimate = .pred)
cat(
"基准模型性能 (",
length(coef(lm_baseline$fit)) - 1,
" 个特征):\n",
sep = ""
)基准模型性能 (288 个特征):
metrics_baseline |>
mutate(.estimate = round(.estimate, digits = 3)) |>
kable()| .metric | .estimator | .estimate |
|---|---|---|
| rmse | standard | 50278.07 |
| rsq | standard | 0.69 |
| mae | standard | 20503.94 |
基准模型-预测效果与残差分布可视化
# 预测效果可视化
p1 <- ggplot(pred_baseline, aes(x = SalePrice, y = .pred)) +
geom_point(alpha = 0.3, color = model_colors["基准模型"]) +
geom_abline(slope = 1, intercept = 0, color = "red", linewidth = 0.5) +
labs(title = "基准模型: 真实值 vs 预测值", x = "真实价格", y = "预测价格") +
theme_bw() +
theme(
plot.title = element_text(color = model_colors["基准模型"], face = "bold")
)
p2 <- ggplot(pred_baseline, aes(residual)) +
geom_density(fill = model_colors["基准模型"], alpha = 0.7) +
geom_vline(xintercept = 0, color = "red", linewidth = 0.5) +
labs(title = "基准模型: 残差分布", x = "残差") +
theme_bw() +
theme(
plot.title = element_text(color = model_colors["基准模型"], face = "bold")
)
p1 + p2由于应用了对数变换,所有性能指标在对数尺度上评估,与基准模型不可直接比较。
完整预处理模型-缺失值模式分析
# 数据探索:缺失值分析
for_check_data <- bind_rows(train_full, test_full) |>
select(-SalePrice)
# 可视化缺失值模式
for_check_data |> gg_miss_upset()for_check_data |> vis_miss()# 变量缺失情况摘要
missing_summary <- for_check_data |>
miss_var_summary() |>
arrange(desc(pct_miss))
missing_summary |>
head(10) |>
kable()| variable | n_miss | pct_miss |
|---|---|---|
| PoolQC | 2909 | 99.7 |
| MiscFeature | 2814 | 96.4 |
| Alley | 2721 | 93.2 |
| Fence | 2348 | 80.4 |
| FireplaceQu | 1420 | 48.6 |
| LotFrontage | 486 | 16.6 |
| GarageYrBlt | 159 | 5.45 |
| GarageFinish | 159 | 5.45 |
| GarageQual | 159 | 5.45 |
| GarageCond | 159 | 5.45 |
完整预处理模型-特征保留策略调整
# =======================================================
# [逻辑变更] 根据新的填充策略,保留原先因缺失率高而被剔除的变量
# PoolQC, Alley, Fence 等属于结构性缺失,含有重要信息,不再丢弃
# =======================================================
# 不再执行批量剔除,保留所有列以便在 Recipe 中进行精细填充
train_clean <- train_full
test_clean <- test_full
cat(
"策略调整:保留 PoolQC, Alley, Fence 等高缺失变量,将在后续步骤中进行结构性填充。\n"
)策略调整:保留 PoolQC, Alley, Fence 等高缺失变量,将在后续步骤中进行结构性填充。
cat("当前保留特征数:", ncol(train_clean))当前保留特征数: 81
完整预处理模型-目标变量分布与对数变换
# 标准化变量名并添加类型标记
train_raw <- train_clean |>
mutate(type = "train") |>
clean_names()
test_raw <- test_clean |>
mutate(type = "test", sale_price = NA) |>
clean_names()
# 目标变量分布检查
p_dist_original <- ggplot(train_raw, aes(sale_price)) +
geom_density(fill = model_colors["基准模型"], alpha = 0.7) +
labs(title = "原始销售价格分布", x = "SalePrice") +
theme_bw() +
theme(
plot.title = element_text(color = model_colors["基准模型"], face = "bold")
)
p_dist_log <- ggplot(train_raw, aes(log(sale_price))) +
geom_density(fill = model_colors["完整预处理"], alpha = 0.7) +
labs(title = "对数变换后分布", x = "log(SalePrice)") +
theme_bw() +
theme(
plot.title = element_text(color = model_colors["完整预处理"], face = "bold")
)
p_dist_original +
p_dist_log +
plot_annotation(caption = "对数变换显著改善分布形态,使其更接近正态分布")完整预处理模型-变量类型错配识别与校正
# 识别类型错配变量 (唯一值<30视为分类变量)
var_unique_count <- train_raw |>
mutate(across(everything(), as.character)) |>
pivot_longer(everything(), names_to = "var", values_to = "value") |>
group_by(var) |>
summarise(
unique_count = n_distinct(value),
.groups = "drop"
) |>
arrange(unique_count)
var_unique_count |> kable()| var | unique_count |
|---|---|
| type | 1 |
| central_air | 2 |
| street | 2 |
| utilities | 2 |
| alley | 3 |
| bsmt_half_bath | 3 |
| half_bath | 3 |
| land_slope | 3 |
| paved_drive | 3 |
| bsmt_full_bath | 4 |
| exter_qual | 4 |
| fireplaces | 4 |
| full_bath | 4 |
| garage_finish | 4 |
| kitchen_abv_gr | 4 |
| kitchen_qual | 4 |
| land_contour | 4 |
| lot_shape | 4 |
| pool_qc | 4 |
| bldg_type | 5 |
| bsmt_cond | 5 |
| bsmt_exposure | 5 |
| bsmt_qual | 5 |
| exter_cond | 5 |
| fence | 5 |
| garage_cars | 5 |
| heating_qc | 5 |
| lot_config | 5 |
| mas_vnr_type | 5 |
| misc_feature | 5 |
| ms_zoning | 5 |
| yr_sold | 5 |
| electrical | 6 |
| fireplace_qu | 6 |
| foundation | 6 |
| garage_cond | 6 |
| garage_qual | 6 |
| heating | 6 |
| roof_style | 6 |
| sale_condition | 6 |
| bsmt_fin_type1 | 7 |
| bsmt_fin_type2 | 7 |
| functional | 7 |
| garage_type | 7 |
| bedroom_abv_gr | 8 |
| condition2 | 8 |
| house_style | 8 |
| pool_area | 8 |
| roof_matl | 8 |
| condition1 | 9 |
| overall_cond | 9 |
| sale_type | 9 |
| overall_qual | 10 |
| mo_sold | 12 |
| tot_rms_abv_grd | 12 |
| exterior1st | 15 |
| ms_sub_class | 15 |
| exterior2nd | 16 |
| x3ssn_porch | 20 |
| misc_val | 21 |
| low_qual_fin_sf | 24 |
| neighborhood | 25 |
| year_remod_add | 61 |
| screen_porch | 76 |
| garage_yr_blt | 98 |
| lot_frontage | 111 |
| year_built | 112 |
| enclosed_porch | 120 |
| bsmt_fin_sf2 | 144 |
| open_porch_sf | 202 |
| wood_deck_sf | 274 |
| mas_vnr_area | 328 |
| x2nd_flr_sf | 417 |
| garage_area | 441 |
| bsmt_fin_sf1 | 637 |
| sale_price | 663 |
| total_bsmt_sf | 721 |
| x1st_flr_sf | 753 |
| bsmt_unf_sf | 780 |
| gr_liv_area | 861 |
| lot_area | 1073 |
| id | 1460 |
# 提取分类变量
var_factor <- var_unique_count |>
filter(unique_count < 30) |>
pull(var)
cat("识别出", length(var_factor), "个分类变量,包括:\n")识别出 62 个分类变量,包括:
var_factor [1] "type" "central_air" "street" "utilities"
[5] "alley" "bsmt_half_bath" "half_bath" "land_slope"
[9] "paved_drive" "bsmt_full_bath" "exter_qual" "fireplaces"
[13] "full_bath" "garage_finish" "kitchen_abv_gr" "kitchen_qual"
[17] "land_contour" "lot_shape" "pool_qc" "bldg_type"
[21] "bsmt_cond" "bsmt_exposure" "bsmt_qual" "exter_cond"
[25] "fence" "garage_cars" "heating_qc" "lot_config"
[29] "mas_vnr_type" "misc_feature" "ms_zoning" "yr_sold"
[33] "electrical" "fireplace_qu" "foundation" "garage_cond"
[37] "garage_qual" "heating" "roof_style" "sale_condition"
[41] "bsmt_fin_type1" "bsmt_fin_type2" "functional" "garage_type"
[45] "bedroom_abv_gr" "condition2" "house_style" "pool_area"
[49] "roof_matl" "condition1" "overall_cond" "sale_type"
[53] "overall_qual" "mo_sold" "tot_rms_abv_grd" "exterior1st"
[57] "ms_sub_class" "exterior2nd" "x3ssn_porch" "misc_val"
[61] "low_qual_fin_sf" "neighborhood"
完整预处理模型-数据清洗配方构建与应用
# 1. 构建数据清洗配方
full_recipe <- train_raw |>
recipe(sale_price ~ .) |>
update_role(id, type, new_role = "ID") |>
# 结构性NA(无该设施→填None)
step_mutate(
across(
c(
pool_qc,
misc_feature,
alley,
fence,
fireplace_qu,
garage_type,
garage_finish,
garage_qual,
garage_cond,
bsmt_qual,
bsmt_cond,
bsmt_exposure,
bsmt_fin_type1,
bsmt_fin_type2,
mas_vnr_type
),
~ replace_na(as.character(.), "None")
),
# 结构性NA(无设施→填0)
across(
c(
mas_vnr_area,
garage_yr_blt,
garage_area,
garage_cars,
total_bsmt_sf,
bsmt_fin_sf1,
bsmt_fin_sf2,
bsmt_unf_sf
),
~ replace_na(., 0)
)
) |>
# 真实缺失填充+标准化
step_impute_median(lot_frontage) |>
step_mutate(across(all_of(var_factor), as.factor)) |>
step_impute_mode(all_nominal_predictors()) |>
step_impute_mean(all_numeric_predictors()) |>
step_novel(all_nominal_predictors()) |>
step_log(sale_price, skip = any(is.na(train_raw$sale_price))) |>
step_dummy(all_nominal_predictors())
# 2. 训练配方并应用
full_recipe_trained <- full_recipe |> prep(training = train_raw)
train_processed_full <- bake(full_recipe_trained, new_data = train_raw) |>
select(-type, -id)
test_processed_full <- bake(full_recipe_trained, new_data = test_raw) |>
select(-type, -id, -sale_price)
# 3. 通用变量名匹配函数(修复.未找到错误)
match_var_name <- function(var_name, ref_vars) {
# 统一清洗规则:转小写+移除所有非字母数字字符(修复gsub的.使用问题)
clean_var <- function(x) {
x_clean <- tolower(x)
x_clean <- gsub("[^a-z0-9]", "", x_clean)
return(x_clean)
}
# 匹配变量名
ref_clean <- sapply(ref_vars, clean_var)
var_clean <- clean_var(var_name)
match_idx <- which(ref_clean == var_clean)
if (length(match_idx) > 0) {
return(ref_vars[match_idx[1]])
} else {
return(NA)
}
}
# 4. 构建缺失值策略表(自动匹配NA率)
strategy_long <- tribble(
~变量名 , ~NA本质 , ~变量类型 , ~处理策略 ,
# 结构缺失-分类
"pool_qc" , "无该设施(非缺失)" , "分类变量" , "填'None'(标注无设施)" ,
"misc_feature" , "无该设施(非缺失)" , "分类变量" , "填'None'(标注无设施)" ,
"alley" , "无该设施(非缺失)" , "分类变量" , "填'None'(标注无设施)" ,
"fence" , "无该设施(非缺失)" , "分类变量" , "填'None'(标注无设施)" ,
"fireplace_qu" , "无该设施(非缺失)" , "分类变量" , "填'None'(标注无设施)" ,
"garage_type" , "无该设施(非缺失)" , "分类变量" , "填'None'(标注无设施)" ,
"garage_finish" , "无该设施(非缺失)" , "分类变量" , "填'None'(标注无设施)" ,
"garage_qual" , "无该设施(非缺失)" , "分类变量" , "填'None'(标注无设施)" ,
"garage_cond" , "无该设施(非缺失)" , "分类变量" , "填'None'(标注无设施)" ,
"bsmt_qual" , "无该设施(非缺失)" , "分类变量" , "填'None'(标注无设施)" ,
"bsmt_cond" , "无该设施(非缺失)" , "分类变量" , "填'None'(标注无设施)" ,
"bsmt_exposure" , "无该设施(非缺失)" , "分类变量" , "填'None'(标注无设施)" ,
"bsmt_fin_type1" , "无该设施(非缺失)" , "分类变量" , "填'None'(标注无设施)" ,
"bsmt_fin_type2" , "无该设施(非缺失)" , "分类变量" , "填'None'(标注无设施)" ,
"mas_vnr_type" , "无该设施(非缺失)" , "分类变量" , "填'None'(标注无设施)" ,
# 结构缺失-数值
"mas_vnr_area" , "无该设施(非缺失)" , "数值变量" , "填0(无设施则数值为0)" ,
"garage_yr_blt" , "无该设施(非缺失)" , "数值变量" , "填0(无设施则数值为0)" ,
"garage_area" , "无该设施(非缺失)" , "数值变量" , "填0(无设施则数值为0)" ,
"garage_cars" , "无该设施(非缺失)" , "数值变量" , "填0(无设施则数值为0)" ,
"total_bsmt_sf" , "无该设施(非缺失)" , "数值变量" , "填0(无设施则数值为0)" ,
"bsmt_fin_sf1" , "无该设施(非缺失)" , "数值变量" , "填0(无设施则数值为0)" ,
"bsmt_fin_sf2" , "无该设施(非缺失)" , "数值变量" , "填0(无设施则数值为0)" ,
"bsmt_unf_sf" , "无该设施(非缺失)" , "数值变量" , "填0(无设施则数值为0)" ,
# 真实缺失-数值
"lot_frontage" , "数据遗漏(真实缺失)" , "数值变量" , "填中位数(禁止填0)" ,
# 真实缺失-分类
"electrical" , "数据遗漏(真实缺失)" , "分类变量" , "填众数(仅补充数据遗漏)" ,
"mszoning" , "数据遗漏(真实缺失)" , "分类变量" , "填众数(仅补充数据遗漏)" ,
"utilities" , "数据遗漏(真实缺失)" , "分类变量" , "填众数(仅补充数据遗漏)" ,
"exterior1st" , "数据遗漏(真实缺失)" , "分类变量" , "填众数(仅补充数据遗漏)" ,
"exterior2nd" , "数据遗漏(真实缺失)" , "分类变量" , "填众数(仅补充数据遗漏)" ,
"kitchen_qual" , "数据遗漏(真实缺失)" , "分类变量" , "填众数(仅补充数据遗漏)" ,
"functional" , "数据遗漏(真实缺失)" , "分类变量" , "填众数(仅补充数据遗漏)" ,
"sale_type" , "数据遗漏(真实缺失)" , "分类变量" , "填众数(仅补充数据遗漏)"
) |>
rowwise() |>
mutate(
# 匹配原始变量名并提取NA率
match_var = match_var_name(变量名, missing_summary$variable),
原始NA率 = ifelse(
!is.na(match_var),
sprintf(
"%.3f%%",
missing_summary$pct_miss[missing_summary$variable == match_var]
),
"未匹配"
)
) |>
ungroup() |>
select(-match_var) # 移除中间匹配列
# 5. 渲染表格
strategy_long |> kable()| 变量名 | NA本质 | 变量类型 | 处理策略 | 原始NA率 |
|---|---|---|---|---|
| pool_qc | 无该设施(非缺失) | 分类变量 | 填’None’(标注无设施) | 99.657% |
| misc_feature | 无该设施(非缺失) | 分类变量 | 填’None’(标注无设施) | 96.403% |
| alley | 无该设施(非缺失) | 分类变量 | 填’None’(标注无设施) | 93.217% |
| fence | 无该设施(非缺失) | 分类变量 | 填’None’(标注无设施) | 80.439% |
| fireplace_qu | 无该设施(非缺失) | 分类变量 | 填’None’(标注无设施) | 48.647% |
| garage_type | 无该设施(非缺失) | 分类变量 | 填’None’(标注无设施) | 5.379% |
| garage_finish | 无该设施(非缺失) | 分类变量 | 填’None’(标注无设施) | 5.447% |
| garage_qual | 无该设施(非缺失) | 分类变量 | 填’None’(标注无设施) | 5.447% |
| garage_cond | 无该设施(非缺失) | 分类变量 | 填’None’(标注无设施) | 5.447% |
| bsmt_qual | 无该设施(非缺失) | 分类变量 | 填’None’(标注无设施) | 2.775% |
| bsmt_cond | 无该设施(非缺失) | 分类变量 | 填’None’(标注无设施) | 2.809% |
| bsmt_exposure | 无该设施(非缺失) | 分类变量 | 填’None’(标注无设施) | 2.809% |
| bsmt_fin_type1 | 无该设施(非缺失) | 分类变量 | 填’None’(标注无设施) | 2.706% |
| bsmt_fin_type2 | 无该设施(非缺失) | 分类变量 | 填’None’(标注无设施) | 2.741% |
| mas_vnr_type | 无该设施(非缺失) | 分类变量 | 填’None’(标注无设施) | 0.822% |
| mas_vnr_area | 无该设施(非缺失) | 数值变量 | 填0(无设施则数值为0) | 0.788% |
| garage_yr_blt | 无该设施(非缺失) | 数值变量 | 填0(无设施则数值为0) | 5.447% |
| garage_area | 无该设施(非缺失) | 数值变量 | 填0(无设施则数值为0) | 0.034% |
| garage_cars | 无该设施(非缺失) | 数值变量 | 填0(无设施则数值为0) | 0.034% |
| total_bsmt_sf | 无该设施(非缺失) | 数值变量 | 填0(无设施则数值为0) | 0.034% |
| bsmt_fin_sf1 | 无该设施(非缺失) | 数值变量 | 填0(无设施则数值为0) | 0.034% |
| bsmt_fin_sf2 | 无该设施(非缺失) | 数值变量 | 填0(无设施则数值为0) | 0.034% |
| bsmt_unf_sf | 无该设施(非缺失) | 数值变量 | 填0(无设施则数值为0) | 0.034% |
| lot_frontage | 数据遗漏(真实缺失) | 数值变量 | 填中位数(禁止填0) | 16.650% |
| electrical | 数据遗漏(真实缺失) | 分类变量 | 填众数(仅补充数据遗漏) | 0.034% |
| mszoning | 数据遗漏(真实缺失) | 分类变量 | 填众数(仅补充数据遗漏) | 0.137% |
| utilities | 数据遗漏(真实缺失) | 分类变量 | 填众数(仅补充数据遗漏) | 0.069% |
| exterior1st | 数据遗漏(真实缺失) | 分类变量 | 填众数(仅补充数据遗漏) | 0.034% |
| exterior2nd | 数据遗漏(真实缺失) | 分类变量 | 填众数(仅补充数据遗漏) | 0.034% |
| kitchen_qual | 数据遗漏(真实缺失) | 分类变量 | 填众数(仅补充数据遗漏) | 0.034% |
| functional | 数据遗漏(真实缺失) | 分类变量 | 填众数(仅补充数据遗漏) | 0.069% |
| sale_type | 数据遗漏(真实缺失) | 分类变量 | 填众数(仅补充数据遗漏) | 0.034% |
完整预处理模型-对数尺度线性回归训练与评估
# 数据分割 (70/30)
set.seed(42)
data_split_full <- initial_split(train_processed_full, prop = 0.7)
train_data_full <- training(data_split_full)
test_data_full <- testing(data_split_full)
cat(
"训练集样本量:",
nrow(train_data_full),
"|测试集样本量:",
nrow(test_data_full)
)训练集样本量: 1021 |测试集样本量: 439
# 模型训练
lm_full <- linear_reg() |>
set_engine("lm") |>
set_mode("regression") |>
fit(sale_price ~ ., data = train_data_full)
# 模型评估
pred_full <- predict(lm_full, new_data = test_data_full) |>
bind_cols(test_data_full |> select(sale_price)) |>
mutate(residual = sale_price - .pred)
metrics_full <- pred_full |> metrics(truth = sale_price, estimate = .pred)
cat(
"完整预处理模型性能 (",
length(coef(lm_full$fit)) - 1,
" 个特征):\n",
sep = ""
)完整预处理模型性能 (455 个特征):
metrics_full |>
mutate(.estimate = round(.estimate, digits = 3)) |>
kable()| .metric | .estimator | .estimate |
|---|---|---|
| rmse | standard | 0.217 |
| rsq | standard | 0.731 |
| mae | standard | 0.109 |
完整预处理模型-对数尺度预测效果可视化
# 预测效果可视化
lm_pred_plot1 <- ggplot(pred_full, aes(x = sale_price, y = .pred)) +
geom_point(alpha = 0.3, color = model_colors["完整预处理"]) +
geom_abline(slope = 1, intercept = 0, color = "red", linewidth = 0.5) +
labs(
x = "真实值 (对数尺度)",
y = "预测值 (对数尺度)",
title = "完整预处理模型: 真实值 vs 预测值"
) +
theme_bw() +
theme(
plot.title = element_text(color = model_colors["完整预处理"], face = "bold")
)
lm_pred_plot2 <- ggplot(pred_full, aes(residual)) +
geom_density(fill = model_colors["完整预处理"], alpha = 0.7) +
geom_vline(xintercept = 0, color = "red", linewidth = 0.5) +
labs(title = "完整预处理模型: 残差分布", x = "残差") +
theme_bw() +
theme(
plot.title = element_text(color = model_colors["完整预处理"], face = "bold")
)
lm_pred_plot1 + lm_pred_plot2特征选择后,模型性能显著提升。特别是引入
total_sf后,它很可能取代单一楼层面积成为强力预测因子。
特征优化模型-特征构建与相关性分析准备
# =======================================================
# [新增步骤] 构建 total_sf 特征 (领域知识融合)
# =======================================================
# 1. 定义计算总面积的辅助函数 (处理潜在的NA,例如无地下室情况)
calc_total_sf <- function(d) {
d |>
mutate(
# 使用 rowSums 配合 na.rm=TRUE,确保 NA 被视为 0 进行加和
total_sf = rowSums(
across(c(total_bsmt_sf, x1st_flr_sf, x2nd_flr_sf)),
na.rm = TRUE
)
)
}
# 2. 同步更新所有相关数据集
# 更新 train_raw:用于生成下面的相关性热力图
train_raw <- train_raw |> calc_total_sf()
# 更新建模数据 (train_data_full / test_data_full):
# 确保后续 lm() 训练时能找到 total_sf 这个变量
# 注意:这两份数据在模型2中已经过 clean_names(),变量名已统一为下划线格式
train_data_full <- train_data_full |> calc_total_sf()
test_data_full <- test_data_full |> calc_total_sf()
cat(
"特征工程完成:已创建 'total_sf' (total_bsmt_sf + x1st_flr_sf + x2nd_flr_sf)\n"
)特征工程完成:已创建 'total_sf' (total_bsmt_sf + x1st_flr_sf + x2nd_flr_sf)
# =======================================================
# 原有流程:相关性分析数据集预处理
# =======================================================
# 创建用于相关性分析的数据集 (更新配方以包含新特征)
corr_recipe <- recipe(sale_price ~ ., data = train_raw) |>
update_role(id, new_role = "ID") |>
step_mutate(across(all_of(var_factor), as.factor)) |>
step_novel(all_nominal_predictors()) |>
step_log(sale_price) |>
step_impute_mode(all_nominal_predictors()) |>
step_impute_mean(all_numeric_predictors())
corr_recipe_trained <- prep(corr_recipe, training = train_raw)
data_juiced <- juice(corr_recipe_trained) # 仅训练集
# 均值编码:将分类变量替换为sale_price的组均值
for_corr_data <- data_juiced |>
mutate(across(
where(is.factor),
~ tapply(sale_price, ., mean, na.rm = TRUE)[as.character(.)]
))特征优化模型-特征相关性热力图绘制
# 计算相关系数矩阵
cor_matrix <- rcorr(as.matrix(for_corr_data))
r_matrix <- cor_matrix$r
p_matrix <- cor_matrix$P
# 创建热力图
corrplot(
r_matrix,
method = "color",
type = "upper",
order = "hclust",
col = colorRampPalette(c(
"#2c7bb6",
"#abd9e9",
"#ffffbf",
"#fdae61",
"#d7191c"
))(200),
addCoef.col = NULL,
number.cex = .4,
tl.col = "black",
tl.srt = 45,
diag = FALSE,
title = "特征相关系数热力图 (含 TotalSF)",
mar = c(0, 0, 1.5, 0)
)特征优化模型-与房价强相关特征筛选
# 提取与sale_price相关的特征 (|r| > 0.35) 并计算p值
sale_corr <- tibble(
variable = rownames(r_matrix),
corr = r_matrix[, "sale_price"],
p_value = p_matrix[, "sale_price"]
) |>
filter(variable != "sale_price") |>
arrange(desc(abs(corr)))
# 显示Top 20相关特征
sale_corr |> head(20) |> mutate(corr = round(corr, digits = 3)) |> kable()| variable | corr | p_value |
|---|---|---|
| overall_qual | 0.821 | 0 |
| total_sf | 0.777 | 0 |
| neighborhood | 0.756 | 0 |
| gr_liv_area | 0.701 | 0 |
| garage_cars | 0.700 | 0 |
| exter_qual | 0.679 | 0 |
| kitchen_qual | 0.669 | 0 |
| bsmt_qual | 0.664 | 0 |
| garage_area | 0.651 | 0 |
| total_bsmt_sf | 0.612 | 0 |
| full_bath | 0.602 | 0 |
| x1st_flr_sf | 0.597 | 0 |
| garage_finish | 0.590 | 0 |
| year_built | 0.587 | 0 |
| ms_sub_class | 0.575 | 0 |
| year_remod_add | 0.566 | 0 |
| foundation | 0.551 | 0 |
| tot_rms_abv_grd | 0.550 | 0 |
| fireplaces | 0.515 | 0 |
| garage_yr_blt | 0.500 | 0 |
# 筛选强相关特征
strong_corr_vars <- sale_corr |>
filter(abs(corr) > 0.35) |>
pull(variable)
cat(
"识别出",
length(strong_corr_vars),
"个与销售价格强相关的特征 (|r| > 0.35):\n"
)识别出 32 个与销售价格强相关的特征 (|r| > 0.35):
strong_corr_vars [1] "overall_qual" "total_sf" "neighborhood" "gr_liv_area"
[5] "garage_cars" "exter_qual" "kitchen_qual" "bsmt_qual"
[9] "garage_area" "total_bsmt_sf" "full_bath" "x1st_flr_sf"
[13] "garage_finish" "year_built" "ms_sub_class" "year_remod_add"
[17] "foundation" "tot_rms_abv_grd" "fireplaces" "garage_yr_blt"
[21] "heating_qc" "bsmt_fin_type1" "garage_type" "mas_vnr_type"
[25] "mas_vnr_area" "exterior1st" "ms_zoning" "exterior2nd"
[29] "overall_cond" "bsmt_fin_sf1" "sale_condition" "central_air"
特征优化模型-共线性特征消除与最终特征集确定
# 识别高共线性特征对 (|r| > 0.9)
var_pairs <- for_corr_data |>
select(all_of(strong_corr_vars)) |>
cor() |>
as.data.frame() |>
rownames_to_column("var1") |>
pivot_longer(-var1, names_to = "var2", values_to = "corr") |>
filter(var1 < var2, abs(corr) > 0.9)
# 选择保留特征:在共线对中保留与目标相关性更高的特征
# 注意:这里 total_sf 很可能会因为与 gr_liv_area 或 total_bsmt_sf 高度共线而被评估
drop_vars <- var_pairs |>
mutate(
corr1 = sale_corr$corr[match(var1, sale_corr$variable)],
corr2 = sale_corr$corr[match(var2, sale_corr$variable)],
drop = ifelse(abs(corr1) < abs(corr2), var1, var2)
) |>
pull(drop) |>
unique()
# 最终特征集
selected_vars <- setdiff(strong_corr_vars, drop_vars)
# 匹配独热编码后的特征名 (确保 total_sf 若被选中能正确匹配)
dummy_cols <- selected_vars |>
map(
~ str_subset(
colnames(train_data_full),
regex(paste0("^", .x, "_"), ignore_case = TRUE)
)
) |>
unlist() |>
unique()
original_cols <- selected_vars[selected_vars %in% colnames(train_data_full)]
matched_vars <- c(dummy_cols, original_cols) |> unique()
cat("特征选择结果:\n")特征选择结果:
cat("- 初始强相关特征:", length(strong_corr_vars), "\n")- 初始强相关特征: 32
cat("- 消除共线性后:", length(selected_vars), "\n")- 消除共线性后: 31
cat("- 最终用于建模的特征:", length(matched_vars), "\n")- 最终用于建模的特征: 168
if ("total_sf" %in% matched_vars) {
cat("★ 成功保留: total_sf (复合面积特征)\n")
}★ 成功保留: total_sf (复合面积特征)
特征优化模型-精选特征线性回归训练与评估
# 训练特征优化模型 (此时 train_data_full 已包含 total_sf)
lm_reduced <- linear_reg() |>
set_engine("lm") |>
fit(
sale_price ~ .,
data = train_data_full |> select(sale_price, any_of(matched_vars))
)
# 生成预测
pred_reduced <- predict(lm_reduced, test_data_full) |>
bind_cols(test_data_full |> select(sale_price)) |>
mutate(residual = sale_price - .pred)
# 评估指标
metrics_reduced <- pred_reduced |> metrics(truth = sale_price, estimate = .pred)
cat(
"特征优化模型性能 (",
length(coef(lm_reduced$fit)) - 1,
" 个特征):\n",
sep = ""
)特征优化模型性能 (168 个特征):
metrics_reduced |>
mutate(.estimate = round(.estimate, digits = 3)) |>
kable()| .metric | .estimator | .estimate |
|---|---|---|
| rmse | standard | 0.128 |
| rsq | standard | 0.891 |
| mae | standard | 0.090 |
特征优化模型-对数尺度预测效果与残差可视化
# 预测效果可视化
p5 <- ggplot(pred_reduced, aes(x = sale_price, y = .pred)) +
geom_point(alpha = 0.3, color = model_colors["特征优化"]) +
geom_abline(slope = 1, intercept = 0, color = "red", linewidth = 0.5) +
labs(
title = "特征优化模型: 真实值 vs 预测值",
x = "真实价格 (对数尺度)",
y = "预测价格 (对数尺度)"
) +
theme_bw() +
theme(
plot.title = element_text(color = model_colors["特征优化"], face = "bold")
)
p6 <- ggplot(pred_reduced, aes(residual)) +
geom_density(fill = model_colors["特征优化"], alpha = 0.7) +
geom_vline(xintercept = 0, color = "red", linewidth = 0.5) +
labs(title = "特征优化模型: 残差分布", x = "残差") +
theme_bw() +
theme(
plot.title = element_text(color = model_colors["特征优化"], face = "bold")
)
p5 + p6离群值处理模型-离群值可视化与识别
# 绘制 GrLivArea 与 SalePrice 的散点图 (使用原始尺度直观展示)
p_outlier <- ggplot(train_raw, aes(x = gr_liv_area, y = sale_price)) +
geom_point(alpha = 0.6, color = "steelblue") +
geom_smooth(method = "lm", color = "red", se = FALSE) +
geom_vline(xintercept = 4000, linetype = "dashed", color = "orange") +
annotate(
"text",
x = 4600,
y = 250000,
label = "Outliers",
color = "orange",
fontface = "bold",
size = 5
) +
labs(
title = "居住面积与房价的关系:寻找离群值",
subtitle = "右下角的两个点:面积极大(>4000 sqft)但价格极低",
x = "居住面积 (sqft)",
y = "销售价格 ($)"
) +
theme_bw()
p_outlier图表洞察:散点图右下角清晰地显示了两个极端异常点:它们的居住面积超过4600平方英尺,但售价却低于20万美元。这在经济学上是不合理的,具有极高的杠杆值(Leverage),会像杠杆一样强行将回归线拉向右下方,导致回归系数(斜率)被严重低估。
离群值处理模型-剔除离群点并重新训练
我们在模型3(特征优化)的基础上,仅仅剔除训练集中的这两个离群点,其余设置(特征子集)保持完全一致,以观察离群值处理的纯粹贡献。
# 剔除 GrLivArea > 4000 的样本 (注意:仅在训练集中剔除)
# 在处理后的数据中,gr_liv_area 仍然是数值型且未被缩放,可以直接比较
train_data_outlier_removed <- train_data_full |>
filter(gr_liv_area < 4000)
cat("剔除前训练集样本量:", nrow(train_data_full), "\n")剔除前训练集样本量: 1021
cat("剔除后训练集样本量:", nrow(train_data_outlier_removed), "\n")剔除后训练集样本量: 1018
cat(
"剔除样本数:",
nrow(train_data_full) - nrow(train_data_outlier_removed),
"\n"
)剔除样本数: 3
# 使用与模型3相同的特征集 (matched_vars) 重新训练
lm_outlier <- linear_reg() |>
set_engine("lm") |>
fit(
sale_price ~ .,
data = train_data_outlier_removed |>
select(sale_price, any_of(matched_vars))
)
# 生成预测 (测试集保持不变,以公平比较)
pred_outlier <- predict(lm_outlier, test_data_full) |>
bind_cols(test_data_full |> select(sale_price)) |>
mutate(residual = sale_price - .pred)
# 评估指标
metrics_outlier <- pred_outlier |> metrics(truth = sale_price, estimate = .pred)
cat(
"离群值处理模型性能 (",
length(coef(lm_outlier$fit)) - 1,
" 个特征):\n",
sep = ""
)离群值处理模型性能 (168 个特征):
metrics_outlier |>
mutate(.estimate = round(.estimate, digits = 3)) |>
kable()| .metric | .estimator | .estimate |
|---|---|---|
| rmse | standard | 0.114 |
| rsq | standard | 0.913 |
| mae | standard | 0.082 |
离群值处理模型-预测效果与残差可视化
# 预测效果可视化
p7 <- ggplot(pred_outlier, aes(x = sale_price, y = .pred)) +
geom_point(alpha = 0.3, color = model_colors["离群值处理"]) +
geom_abline(slope = 1, intercept = 0, color = "red", linewidth = 0.5) +
labs(
title = "离群值处理模型: 真实值 vs 预测值",
x = "真实价格 (对数尺度)",
y = "预测价格 (对数尺度)"
) +
theme_bw() +
theme(
plot.title = element_text(color = model_colors["离群值处理"], face = "bold")
)
p8 <- ggplot(pred_outlier, aes(residual)) +
geom_density(fill = model_colors["离群值处理"], alpha = 0.7) +
geom_vline(xintercept = 0, color = "red", linewidth = 0.5) +
labs(title = "离群值处理模型: 残差分布", x = "残差") +
theme_bw() +
theme(
plot.title = element_text(color = model_colors["离群值处理"], face = "bold")
)
p7 + p8关键性能洞察:
四模型-关键性能指标提取与对比
# 提取关键指标(包含性能+复杂度)
comparison <- bind_rows(
# 基准模型
metrics_baseline |>
filter(.metric %in% c("rmse", "rsq")) |>
pivot_wider(names_from = .metric, values_from = .estimate) |>
mutate(model = "基准模型", features = length(coef(lm_baseline$fit)) - 1),
# 完整预处理模型
metrics_full |>
filter(.metric %in% c("rmse", "rsq")) |>
pivot_wider(names_from = .metric, values_from = .estimate) |>
mutate(model = "完整预处理", features = length(coef(lm_full$fit)) - 1),
# 特征优化模型
metrics_reduced |>
filter(.metric %in% c("rmse", "rsq")) |>
pivot_wider(names_from = .metric, values_from = .estimate) |>
mutate(model = "特征优化", features = length(coef(lm_reduced$fit)) - 1),
# 离群值处理模型
metrics_outlier |>
filter(.metric %in% c("rmse", "rsq")) |>
pivot_wider(names_from = .metric, values_from = .estimate) |>
mutate(model = "离群值处理", features = length(coef(lm_outlier$fit)) - 1)
) |>
select(model, rsq, rmse, features) |>
mutate(complexity = features / max(features))# 输出结果
comparison %>%
mutate(across(c(rsq, rmse, complexity), ~ round(.x, digits = 3))) %>%
kable()| model | rsq | rmse | features | complexity |
|---|---|---|---|---|
| 基准模型 | 0.690 | 50278.065 | 288 | 0.633 |
| 完整预处理 | 0.731 | 0.217 | 455 | 1.000 |
| 特征优化 | 0.891 | 0.128 | 168 | 0.369 |
| 离群值处理 | 0.913 | 0.114 | 168 | 0.369 |
四模型-预测效果与残差分布可视化对比
# 定义模型顺序
model_order <- c("基准模型", "完整预处理", "特征优化", "离群值处理")
# 准备比较数据
pred_all <- bind_rows(
pred_baseline |>
mutate(model = factor("基准模型", levels = model_order)) |>
rename(sale_price = SalePrice),
pred_full |> mutate(model = factor("完整预处理", levels = model_order)),
pred_reduced |> mutate(model = factor("特征优化", levels = model_order)),
pred_outlier |> mutate(model = factor("离群值处理", levels = model_order))
)
# 预测效果比较
p_compare1 <- ggplot(pred_all, aes(x = sale_price, y = .pred, color = model)) +
geom_point(alpha = 0.3) +
geom_abline(
slope = 1,
intercept = 0,
color = "red",
linewidth = 0.5
) +
facet_wrap(~model, nrow = 1, scales = "free") +
labs(title = "四模型预测效果对比", x = "真实值", y = "预测值") +
scale_color_manual(values = model_colors) +
theme_bw() +
theme(
strip.text = element_text(face = "bold", size = 11),
plot.title = element_text(face = "bold", size = 14),
legend.position = "none"
)
# 残差分布比较
p_compare2 <- ggplot(pred_all, aes(x = residual, fill = model)) +
geom_density(alpha = 0.7) +
facet_wrap(~model, nrow = 1, scales = "free") +
labs(title = "四模型残差分布对比", x = "残差", y = "密度") +
scale_fill_manual(values = model_colors) +
scale_color_manual(values = model_colors) +
theme_bw() +
theme(
strip.text = element_text(face = "bold", size = 11),
plot.title = element_text(face = "bold", size = 14),
legend.position = "none"
)
p_compare1 /
p_compare2 +
plot_annotation(
caption = "注:基准模型在原始价格尺度评估,其他三个模型在对数尺度评估"
)最终提交-对数预测值还原与提交文件生成
此处的结果是自评模型指标。最终如果要提交给 Kaggle 官网评分实际应该使用 test.csv(无真实房价)去预测:我在 小节 9 中实现了。
# =======================================================
# 关键修复:同步特征工程
# 模型 lm_outlier 依赖 total_sf,因此必须在预测前添加到测试集
# calc_total_sf 函数已在第3节定义,此处直接调用
# =======================================================
test_for_submission <- test_processed_full |>
calc_total_sf()
# 生成最终预测 (使用模型4:lm_outlier)
final_predictions <- predict(lm_outlier, test_for_submission) |>
mutate(sale_price = exp(.pred)) # 指数变换还原
# 创建提交文件
submission <- tibble(
Id = test_full$Id,
SalePrice = final_predictions$sale_price
)
cat(
"提交文件生成完毕!包含特征:",
ncol(test_for_submission)
)提交文件生成完毕!包含特征: 456
submission |> head() |> kable()| Id | SalePrice |
|---|---|
| 1461 | 124864.1 |
| 1462 | 146914.3 |
| 1463 | 192139.9 |
| 1464 | 197982.5 |
| 1465 | 200248.6 |
| 1466 | 170782.0 |
# 保存提交文件 (取消注释以实际保存)
# write_csv(submission, "D:/RDirectory/house-price/submission.csv")结论-离群值处理模型的最终形式呈现与重要性可视化
# 最终模型数学形式
model_coef <- tidy(lm_outlier)
# 提取截距
intercept <- model_coef$estimate[model_coef$term == "(Intercept)"]
coef_terms <- model_coef %>%
filter(term != "(Intercept)") %>%
mutate(
term_escaped = str_replace_all(term, "_", "\\\\_"),
# 处理 NA
est_fmt = ifelse(
is.na(estimate),
"(完全共线性)NA",
format(round(estimate, 6), nsmall = 6, scientific = FALSE)
),
# 符号处理
term_str = ifelse(
is.na(estimate),
paste0(" & + ", est_fmt, " \\cdot \\text{", term_escaped, "} \\\\"),
ifelse(
estimate >= 0,
paste0(" & + ", est_fmt, " \\cdot \\text{", term_escaped, "} \\\\"),
paste0(
" & - ",
substr(est_fmt, 2, nchar(est_fmt)),
" \\cdot \\text{",
term_escaped,
"} \\\\"
)
)
)
) %>%
pull(term_str)
formula_text <- paste0(
"\\ln(\\text{y}) &= ",
format(round(intercept, 6), nsmall = 6, scientific = FALSE),
" \\\\",
paste(coef_terms, collapse = "")
)
asis_output(paste0(
"$$\n\\begin{aligned}\n",
formula_text,
"\n\\end{aligned}\n$$"
))\[ \begin{aligned} \ln(\text{y}) &= 4.642633 \\ & + 0.219175 \cdot \text{overall\_qual\_X2} \\ & + 0.265257 \cdot \text{overall\_qual\_X3} \\ & + 0.371312 \cdot \text{overall\_qual\_X4} \\ & + 0.398419 \cdot \text{overall\_qual\_X5} \\ & + 0.414520 \cdot \text{overall\_qual\_X6} \\ & + 0.458526 \cdot \text{overall\_qual\_X7} \\ & + 0.507754 \cdot \text{overall\_qual\_X8} \\ & + 0.622270 \cdot \text{overall\_qual\_X9} \\ & + 0.609085 \cdot \text{overall\_qual\_X10} \\ & + (完全共线性)NA \cdot \text{overall\_qual\_new} \\ & - 0.016982 \cdot \text{neighborhood\_Blueste} \\ & + 0.065792 \cdot \text{neighborhood\_BrDale} \\ & + 0.079093 \cdot \text{neighborhood\_BrkSide} \\ & + 0.126313 \cdot \text{neighborhood\_ClearCr} \\ & + 0.029681 \cdot \text{neighborhood\_CollgCr} \\ & + 0.166229 \cdot \text{neighborhood\_Crawfor} \\ & + 0.008196 \cdot \text{neighborhood\_Edwards} \\ & + 0.033612 \cdot \text{neighborhood\_Gilbert} \\ & + 0.064077 \cdot \text{neighborhood\_IDOTRR} \\ & - 0.113763 \cdot \text{neighborhood\_MeadowV} \\ & + 0.029567 \cdot \text{neighborhood\_Mitchel} \\ & + 0.021654 \cdot \text{neighborhood\_NAmes} \\ & + 0.068337 \cdot \text{neighborhood\_NoRidge} \\ & - 0.002691 \cdot \text{neighborhood\_NPkVill} \\ & + 0.052362 \cdot \text{neighborhood\_NridgHt} \\ & - 0.015907 \cdot \text{neighborhood\_NWAmes} \\ & + 0.011451 \cdot \text{neighborhood\_OldTown} \\ & + 0.008533 \cdot \text{neighborhood\_Sawyer} \\ & + 0.026259 \cdot \text{neighborhood\_SawyerW} \\ & + 0.029659 \cdot \text{neighborhood\_Somerst} \\ & + 0.157630 \cdot \text{neighborhood\_StoneBr} \\ & + 0.057946 \cdot \text{neighborhood\_SWISU} \\ & + 0.068783 \cdot \text{neighborhood\_Timber} \\ & + 0.101966 \cdot \text{neighborhood\_Veenker} \\ & + (完全共线性)NA \cdot \text{neighborhood\_new} \\ & + 0.529595 \cdot \text{garage\_cars\_X1} \\ & + 0.561213 \cdot \text{garage\_cars\_X2} \\ & + 0.588968 \cdot \text{garage\_cars\_X3} \\ & + 0.610228 \cdot \text{garage\_cars\_X4} \\ & + (完全共线性)NA \cdot \text{garage\_cars\_new} \\ & - 0.052532 \cdot \text{exter\_qual\_Fa} \\ & - 0.052559 \cdot \text{exter\_qual\_Gd} \\ & - 0.054141 \cdot \text{exter\_qual\_TA} \\ & + (完全共线性)NA \cdot \text{exter\_qual\_new} \\ & - 0.056338 \cdot \text{kitchen\_qual\_Fa} \\ & - 0.028566 \cdot \text{kitchen\_qual\_Gd} \\ & - 0.037702 \cdot \text{kitchen\_qual\_TA} \\ & + (完全共线性)NA \cdot \text{kitchen\_qual\_new} \\ & - 0.038531 \cdot \text{bsmt\_qual\_Fa} \\ & - 0.015493 \cdot \text{bsmt\_qual\_Gd} \\ & - 0.108546 \cdot \text{bsmt\_qual\_None} \\ & - 0.045573 \cdot \text{bsmt\_qual\_TA} \\ & + (完全共线性)NA \cdot \text{bsmt\_qual\_new} \\ & - 0.000785 \cdot \text{full\_bath\_X1} \\ & - 0.005853 \cdot \text{full\_bath\_X2} \\ & + 0.087156 \cdot \text{full\_bath\_X3} \\ & + (完全共线性)NA \cdot \text{full\_bath\_new} \\ & + (完全共线性)NA \cdot \text{garage\_finish\_None} \\ & + 0.004561 \cdot \text{garage\_finish\_RFn} \\ & - 0.016006 \cdot \text{garage\_finish\_Unf} \\ & + (完全共线性)NA \cdot \text{garage\_finish\_new} \\ & - 0.062775 \cdot \text{ms\_sub\_class\_X30} \\ & + 0.005872 \cdot \text{ms\_sub\_class\_X40} \\ & + 0.021507 \cdot \text{ms\_sub\_class\_X45} \\ & + 0.012881 \cdot \text{ms\_sub\_class\_X50} \\ & + 0.028079 \cdot \text{ms\_sub\_class\_X60} \\ & + 0.058853 \cdot \text{ms\_sub\_class\_X70} \\ & + 0.046292 \cdot \text{ms\_sub\_class\_X75} \\ & + 0.025969 \cdot \text{ms\_sub\_class\_X80} \\ & + 0.007732 \cdot \text{ms\_sub\_class\_X85} \\ & - 0.045402 \cdot \text{ms\_sub\_class\_X90} \\ & - 0.024256 \cdot \text{ms\_sub\_class\_X120} \\ & - 0.104291 \cdot \text{ms\_sub\_class\_X160} \\ & + 0.015416 \cdot \text{ms\_sub\_class\_X180} \\ & - 0.006409 \cdot \text{ms\_sub\_class\_X190} \\ & + (完全共线性)NA \cdot \text{ms\_sub\_class\_new} \\ & + 0.003921 \cdot \text{foundation\_CBlock} \\ & + 0.031825 \cdot \text{foundation\_PConc} \\ & + 0.029270 \cdot \text{foundation\_Slab} \\ & + 0.057477 \cdot \text{foundation\_Stone} \\ & - 0.117711 \cdot \text{foundation\_Wood} \\ & + (完全共线性)NA \cdot \text{foundation\_new} \\ & - 0.008519 \cdot \text{tot\_rms\_abv\_grd\_X3} \\ & + 0.027807 \cdot \text{tot\_rms\_abv\_grd\_X4} \\ & + 0.021120 \cdot \text{tot\_rms\_abv\_grd\_X5} \\ & + 0.033355 \cdot \text{tot\_rms\_abv\_grd\_X6} \\ & + 0.021633 \cdot \text{tot\_rms\_abv\_grd\_X7} \\ & + 0.037966 \cdot \text{tot\_rms\_abv\_grd\_X8} \\ & + 0.010221 \cdot \text{tot\_rms\_abv\_grd\_X9} \\ & - 0.023217 \cdot \text{tot\_rms\_abv\_grd\_X10} \\ & - 0.070608 \cdot \text{tot\_rms\_abv\_grd\_X11} \\ & + 0.000620 \cdot \text{tot\_rms\_abv\_grd\_X12} \\ & + 0.097719 \cdot \text{tot\_rms\_abv\_grd\_X14} \\ & + (完全共线性)NA \cdot \text{tot\_rms\_abv\_grd\_new} \\ & + 0.040485 \cdot \text{fireplaces\_X1} \\ & + 0.071690 \cdot \text{fireplaces\_X2} \\ & + 0.081850 \cdot \text{fireplaces\_X3} \\ & + (完全共线性)NA \cdot \text{fireplaces\_new} \\ & - 0.031094 \cdot \text{heating\_qc\_Fa} \\ & - 0.007484 \cdot \text{heating\_qc\_Gd} \\ & + (完全共线性)NA \cdot \text{heating\_qc\_Po} \\ & - 0.018161 \cdot \text{heating\_qc\_TA} \\ & + (完全共线性)NA \cdot \text{heating\_qc\_new} \\ & - 0.004813 \cdot \text{bsmt\_fin\_type1\_BLQ} \\ & + 0.006263 \cdot \text{bsmt\_fin\_type1\_GLQ} \\ & - 0.028530 \cdot \text{bsmt\_fin\_type1\_LwQ} \\ & + (完全共线性)NA \cdot \text{bsmt\_fin\_type1\_None} \\ & + 0.002005 \cdot \text{bsmt\_fin\_type1\_Rec} \\ & - 0.032631 \cdot \text{bsmt\_fin\_type1\_Unf} \\ & + (完全共线性)NA \cdot \text{bsmt\_fin\_type1\_new} \\ & + 0.142935 \cdot \text{garage\_type\_Attchd} \\ & + 0.086754 \cdot \text{garage\_type\_Basment} \\ & + 0.140628 \cdot \text{garage\_type\_BuiltIn} \\ & + 0.095308 \cdot \text{garage\_type\_CarPort} \\ & + 0.133089 \cdot \text{garage\_type\_Detchd} \\ & + (完全共线性)NA \cdot \text{garage\_type\_None} \\ & + (完全共线性)NA \cdot \text{garage\_type\_new} \\ & + 0.062277 \cdot \text{mas\_vnr\_type\_BrkFace} \\ & + 0.050474 \cdot \text{mas\_vnr\_type\_None} \\ & + 0.091951 \cdot \text{mas\_vnr\_type\_Stone} \\ & + (完全共线性)NA \cdot \text{mas\_vnr\_type\_new} \\ & + 0.033251 \cdot \text{exterior1st\_AsphShn} \\ & - 0.105971 \cdot \text{exterior1st\_BrkComm} \\ & + 0.135246 \cdot \text{exterior1st\_BrkFace} \\ & - 0.146946 \cdot \text{exterior1st\_CBlock} \\ & + 0.021818 \cdot \text{exterior1st\_CemntBd} \\ & + 0.004856 \cdot \text{exterior1st\_HdBoard} \\ & + 0.023323 \cdot \text{exterior1st\_ImStucc} \\ & + 0.032309 \cdot \text{exterior1st\_MetalSd} \\ & + 0.021687 \cdot \text{exterior1st\_Plywood} \\ & + 0.055941 \cdot \text{exterior1st\_Stone} \\ & + 0.074018 \cdot \text{exterior1st\_Stucco} \\ & + 0.011356 \cdot \text{exterior1st\_VinylSd} \\ & + 0.011662 \cdot \text{exterior1st\_Wd.Sdng} \\ & + 0.053229 \cdot \text{exterior1st\_WdShing} \\ & + (完全共线性)NA \cdot \text{exterior1st\_new} \\ & + 0.432982 \cdot \text{ms\_zoning\_FV} \\ & + 0.375894 \cdot \text{ms\_zoning\_RH} \\ & + 0.367677 \cdot \text{ms\_zoning\_RL} \\ & + 0.304843 \cdot \text{ms\_zoning\_RM} \\ & + (完全共线性)NA \cdot \text{ms\_zoning\_new} \\ & - 0.245044 \cdot \text{overall\_cond\_X2} \\ & - 0.298694 \cdot \text{overall\_cond\_X3} \\ & - 0.156982 \cdot \text{overall\_cond\_X4} \\ & - 0.120703 \cdot \text{overall\_cond\_X5} \\ & - 0.075548 \cdot \text{overall\_cond\_X6} \\ & - 0.015817 \cdot \text{overall\_cond\_X7} \\ & - 0.033317 \cdot \text{overall\_cond\_X8} \\ & + (完全共线性)NA \cdot \text{overall\_cond\_X9} \\ & + (完全共线性)NA \cdot \text{overall\_cond\_new} \\ & + 0.098231 \cdot \text{sale\_condition\_AdjLand} \\ & + 0.022075 \cdot \text{sale\_condition\_Alloca} \\ & + 0.007488 \cdot \text{sale\_condition\_Family} \\ & + 0.069884 \cdot \text{sale\_condition\_Normal} \\ & + 0.109965 \cdot \text{sale\_condition\_Partial} \\ & + (完全共线性)NA \cdot \text{sale\_condition\_new} \\ & + 0.047269 \cdot \text{central\_air\_Y} \\ & + (完全共线性)NA \cdot \text{central\_air\_new} \\ & + 0.000150 \cdot \text{total\_sf} \\ & + 0.000108 \cdot \text{gr\_liv\_area} \\ & + 0.000139 \cdot \text{garage\_area} \\ & - 0.000058 \cdot \text{total\_bsmt\_sf} \\ & + 0.000013 \cdot \text{x1st\_flr\_sf} \\ & + 0.002219 \cdot \text{year\_built} \\ & + 0.000753 \cdot \text{year\_remod\_add} \\ & - 0.000314 \cdot \text{garage\_yr\_blt} \\ & + 0.000019 \cdot \text{mas\_vnr\_area} \\ & + 0.000063 \cdot \text{bsmt\_fin\_sf1} \\ \end{aligned} \]
# 提取标准化系数
model_coef |>
filter(term != "(Intercept)") |>
mutate(term = str_replace(term, "sale_price_", "")) |>
arrange(desc(abs(estimate))) |>
slice_max(order_by = abs(estimate), n = 20) |>
ggplot(aes(
x = fct_reorder(term, abs(estimate)),
y = estimate
)) +
geom_col(fill = model_colors["离群值处理"], color = "black", alpha = 0.7) +
coord_flip() +
labs(
title = "Top 20 特征重要性 (离群值处理模型)",
x = "特征(独热)",
y = "标准化系数"
) +
theme_bw() +
theme(
plot.title = element_text(color = model_colors["离群值处理"], face = "bold")
)total_sf特征成为最强预测因子之一(r = 0.777),证明领域知识融合的价值GrLivArea>4000且售价<20万美元),\(R^2\)从0.89提升至0.91PoolQC、Alley等高缺失率变量中的信息价值,而非简单剔除OverallQual):9分(Excellent)溢价最高,8分(Very Good)、7分(Good)次之。评分每提升1级,对数价格增加0.22-0.62(对应实际价格增长25-86%)OverallCond):2-3分(Poor/Fair)系数为-0.25至-0.30,折价剧烈;优质状态维护的边际价值极高StoneBr (石溪, 系数0.158), ClearCr (清溪, 0.126), Crawfor (0.166)MeadowV (草原村, -0.114)NoRidge (北岭, 0.068), Veenker (0.102), CollgCr (0.030)GarageCars):4车位(0.610) > 3车位(0.589) > 2车位(0.561) > 1车位(0.530),每增加1车位增值约5-8%FullBath):3个地上完整浴室系数0.087,是高端市场核心阈值Fireplaces):3个(0.082) > 2个(0.072) > 1个(0.040),边际效应递减ExterQual):虽然所有级别系数为负(基准为Ex),但Gd/TA级别仅轻微折价(-0.053),Fa级别折价更明显(-0.053)Foundation):木质地基严重折价(-0.118);混凝土/石质地基为优质标准(PConc 0.032, Stone 0.057)Exterior1st):砖面(BrkFace 0.135)/石材(Stone 0.056)溢价显著;沥青瓦/普通砖墙折价total_sf):系数0.00015,每增加1000平方英尺,对数价格增加0.15(对应实际价格增长约16%)GrLivArea):系数0.000108,与total_sf存在共线性,但总面积的边际价值更高TotalBsmtSF):系数-0.000058,负值表明在控制总面积后,单独增加地下室面积可能折价YearBuilt):系数0.002219,每提前10年建造,对数价格减少0.022(约折价2.2%)YearRemodAdd):系数0.000753,近10年翻新可部分抵消房龄折价KitchenQual):Gd级别仅轻微折价(-0.029),Fa/TA级别折价更明显(-0.056/-0.038)BsmtQual):无地下室(“None”)严重折价(-0.109),优质地下室(Gd)仅轻微折价(-0.015)CentralAir):有空调溢价0.047,是现代住宅的基础配置定义统一的输出路径,并加载所需的数据包。
knitr::opts_chunk$set(echo = TRUE, warning = FALSE, message = FALSE)
options(scipen = 999)
suppressPackageStartupMessages({
library(tidyverse)
library(tidymodels)
library(janitor)
library(Hmisc)
})
# 定义输出路径
OUTPUT_PATH <- "D:/RDirectory/house-price/"
if (!dir.exists(OUTPUT_PATH)) {
dir.create(OUTPUT_PATH, recursive = TRUE)
}
# 加载原始数据
train_full <- read_csv(
'D:/RDirectory/house-price/house-prices-advanced-regression-techniques/train.csv',
show_col_types = FALSE
)
test_full <- read_csv(
'D:/RDirectory/house-price/house-prices-advanced-regression-techniques/test.csv',
show_col_types = FALSE
)
cat("数据加载完成。输出路径为:", OUTPUT_PATH)数据加载完成。输出路径为: D:/RDirectory/house-price/
使用全量训练集训练,不做复杂清洗,直接生成预测结果。
# 1.1 数据准备
train_baseline <- train_full |> mutate(type = "train")
test_baseline <- test_full |> mutate(type = "test", SalePrice = NA)
# 1.2 配方构建
baseline_recipe <- recipe(SalePrice ~ ., data = train_baseline) |>
update_role(Id, type, new_role = "ID") |>
step_novel(all_nominal_predictors()) |>
step_impute_mode(all_nominal_predictors()) |>
step_impute_mean(all_numeric_predictors()) |>
step_dummy(all_nominal_predictors())
# 1.3 训练配方 (使用全量数据)
baseline_prep <- prep(baseline_recipe, training = train_baseline)
train_processed_base <- bake(baseline_prep, train_baseline)
test_processed_base <- bake(baseline_prep, test_baseline)
# 1.4 模型训练 (使用全量数据)
lm_baseline <- linear_reg() |>
set_engine("lm") |>
fit(SalePrice ~ ., data = train_processed_base |> select(-Id, -type))
# 1.5 预测与导出
pred_baseline <- predict(lm_baseline, test_processed_base)
submission_1 <- tibble(
Id = test_full$Id,
SalePrice = pred_baseline$.pred
)
write_csv(submission_1, paste0(OUTPUT_PATH, "submission_1_baseline.csv"))
cat(">>> 模型 1 完成。已导出: submission_1_baseline.csv\n")>>> 模型 1 完成。已导出: submission_1_baseline.csv
引入精细化缺失值填充、变量类型校正和对数变换。
# 2.1 数据清洗准备
train_clean <- train_full |> clean_names()
test_clean <- test_full |> mutate(sale_price = NA) |> clean_names()
# 2.2 识别分类变量 (唯一值 < 30)
var_unique_count <- train_clean |>
select(-sale_price) |>
summarise(across(everything(), n_distinct)) |>
pivot_longer(everything(), names_to = "var", values_to = "count")
var_factor <- var_unique_count |> filter(count < 30) |> pull(var)
# 2.3 构建配方
full_recipe <- train_clean |>
recipe(sale_price ~ .) |>
update_role(id, new_role = "ID") |>
# 结构性填充
step_mutate(
across(
c(
pool_qc,
misc_feature,
alley,
fence,
fireplace_qu,
garage_type,
garage_finish,
garage_qual,
garage_cond,
bsmt_qual,
bsmt_cond,
bsmt_exposure,
bsmt_fin_type1,
bsmt_fin_type2,
mas_vnr_type
),
~ replace_na(as.character(.), "None")
),
across(
c(
mas_vnr_area,
garage_yr_blt,
garage_area,
garage_cars,
total_bsmt_sf,
bsmt_fin_sf1,
bsmt_fin_sf2,
bsmt_unf_sf
),
~ replace_na(., 0)
)
) |>
step_impute_median(lot_frontage) |>
step_mutate(across(all_of(var_factor), as.factor)) |>
step_impute_mode(all_nominal_predictors()) |>
step_impute_mean(all_numeric_predictors()) |>
step_novel(all_nominal_predictors()) |>
step_log(sale_price, skip = TRUE) |>
step_dummy(all_nominal_predictors())
# 2.4 处理全量数据
full_prep <- prep(full_recipe, training = train_clean)
# 注意:recipe中的log变换通常在bake测试集时不生效,所以手动确保训练集log化
train_processed_full <- bake(full_prep, new_data = train_clean) |>
mutate(sale_price = log(sale_price))
test_processed_full <- bake(full_prep, new_data = test_clean)
# 2.5 训练与预测
lm_full <- linear_reg() |>
set_engine("lm") |>
fit(sale_price ~ ., data = train_processed_full |> select(-id))
submission_2 <- tibble(
Id = test_full$Id,
SalePrice = exp(predict(lm_full, test_processed_full)$.pred) # 指数还原
)
write_csv(submission_2, paste0(OUTPUT_PATH, "submission_2_full_preproc.csv"))
cat(">>> 模型 2 完成。已导出: submission_2_full_preproc.csv\n")>>> 模型 2 完成。已导出: submission_2_full_preproc.csv
构建 total_sf,并基于全量训练集计算相关性,筛选特征。
# 3.1 特征工程函数
calc_total_sf <- function(d) {
d |>
mutate(
total_sf = rowSums(
across(c(total_bsmt_sf, x1st_flr_sf, x2nd_flr_sf)),
na.rm = TRUE
)
)
}
train_eng <- train_processed_full |> calc_total_sf()
test_eng <- test_processed_full |> calc_total_sf()
# 3.2 准备相关性分析数据 (基于原始数据+新特征)
corr_data_prep <- train_clean |> calc_total_sf()
# 简单的数值化处理用于计算相关性
corr_recipe <- recipe(sale_price ~ ., data = corr_data_prep) |>
update_role(id, new_role = "ID") |>
step_mutate(across(all_of(var_factor), as.factor)) |>
step_impute_mode(all_nominal_predictors()) |>
step_impute_mean(all_numeric_predictors()) |>
prep()
data_for_corr <- juice(corr_recipe) |> select(-id)
# 均值编码 (用于处理因子变量的相关性计算)
data_for_corr_num <- data_for_corr |>
mutate(across(
where(is.factor),
~ tapply(sale_price, ., mean, na.rm = TRUE)[as.character(.)]
)) |>
mutate(sale_price = log(sale_price))
# 3.3 计算相关性矩阵
r_matrix <- rcorr(as.matrix(data_for_corr_num))$r
# 3.4 筛选强相关特征 (|r| > 0.35)
sale_corr <- tibble(
variable = rownames(r_matrix),
corr = r_matrix[, "sale_price"]
) |>
filter(variable != "sale_price") |>
filter(!is.na(corr)) # 剔除计算失败的NA
strong_vars <- sale_corr |>
filter(abs(corr) > 0.35) |>
pull(variable)
# 3.5 消除共线性 (|r| > 0.9)
var_pairs <- data_for_corr_num |>
select(all_of(strong_vars)) |>
cor() |>
as.data.frame() |>
rownames_to_column("var1") |>
pivot_longer(-var1, names_to = "var2", values_to = "corr") |>
filter(var1 < var2, abs(corr) > 0.9)
drop_vars <- var_pairs |>
mutate(
c1 = abs(sale_corr$corr[match(var1, sale_corr$variable)]),
c2 = abs(sale_corr$corr[match(var2, sale_corr$variable)]),
drop = ifelse(c1 < c2, var1, var2)
) |>
pull(drop) |>
unique()
selected_vars <- setdiff(strong_vars, drop_vars)
# 3.6 关键修复:匹配真实存在的列名
# 确保筛选出的变量名在经过独热编码后的 train_eng 中能找到对应列
# 正则匹配:匹配 "变量名" 或 "变量名_xxx" (独热编码格式)
all_cols <- colnames(train_eng)
matched_vars <- character(0)
for (v in selected_vars) {
# 精确匹配数值变量 或 前缀匹配独热变量
matches <- all_cols[all_cols == v | str_detect(all_cols, paste0("^", v, "_"))]
matched_vars <- c(matched_vars, matches)
}
matched_vars <- unique(matched_vars)
matched_vars <- matched_vars[!is.na(matched_vars)] # 再次确保无NA
cat("特征筛选完毕,入选特征数:", length(matched_vars), "\n")特征筛选完毕,入选特征数: 168
# 3.7 训练与预测
lm_reduced <- linear_reg() |>
set_engine("lm") |>
fit(
sale_price ~ .,
data = train_eng |> select(sale_price, any_of(matched_vars))
)
submission_3 <- tibble(
Id = test_full$Id,
SalePrice = exp(predict(lm_reduced, test_eng)$.pred)
)
write_csv(submission_3, paste0(OUTPUT_PATH, "submission_3_feature_opt.csv"))
cat(">>> 模型 3 完成。已导出: submission_3_feature_opt.csv\n")>>> 模型 3 完成。已导出: submission_3_feature_opt.csv
基于特征优化的结果,进一步剔除全量训练集中的离群点 (GrLivArea > 4000)。
# 4.1 剔除离群值
train_no_outliers <- train_eng |>
filter(gr_liv_area < 4000)
cat("剔除离群值后样本量:", nrow(train_no_outliers), "\n")剔除离群值后样本量: 1456
# 4.2 训练 (使用 matched_vars)
lm_outlier <- linear_reg() |>
set_engine("lm") |>
fit(
sale_price ~ .,
data = train_no_outliers |> select(sale_price, any_of(matched_vars))
)
# 4.3 预测 (Test集保持完整,不去掉任何行)
submission_4 <- tibble(
Id = test_full$Id,
SalePrice = exp(predict(lm_outlier, test_eng)$.pred)
)
write_csv(submission_4, paste0(OUTPUT_PATH, "submission_4_outlier_removed.csv"))
cat(">>> 模型 4 完成。已导出: submission_4_outlier_removed.csv\n")>>> 模型 4 完成。已导出: submission_4_outlier_removed.csv
cat("所有可提交至 Kaggle 的 csv 文件导出完毕。请检查文件夹:\n", OUTPUT_PATH)所有可提交至 Kaggle 的 csv 文件导出完毕。请检查文件夹:
D:/RDirectory/house-price/
手动提交后,四个模型的 Kaggle 评分如下: