This is our Markdown document.
1.环境准备与包加载 在本阶段,我们检查并安装所需的 R 包,并加载分析所需的核心库与数据。
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.5.2
## Warning: package 'ggplot2' was built under R version 4.5.2
## Warning: package 'readr' was built under R version 4.5.2
## Warning: package 'forcats' was built under R version 4.5.2
## Warning: package 'lubridate' was built under R version 4.5.2
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.6
## ✔ forcats 1.0.1 ✔ stringr 1.5.2
## ✔ ggplot2 4.0.1 ✔ tibble 3.3.0
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.1.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(car)
## Warning: package 'car' was built under R version 4.5.2
## Loading required package: carData
##
## Attaching package: 'car'
##
## The following object is masked from 'package:dplyr':
##
## recode
##
## The following object is masked from 'package:purrr':
##
## some
library(randomForest)
## Warning: package 'randomForest' was built under R version 4.5.2
## randomForest 4.7-1.2
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
##
## The following object is masked from 'package:dplyr':
##
## combine
##
## The following object is masked from 'package:ggplot2':
##
## margin
library(nortest)
## Warning: package 'nortest' was built under R version 4.5.2
library(PMCMRplus)
## Warning: package 'PMCMRplus' was built under R version 4.5.2
library(ggpubr)
## Warning: package 'ggpubr' was built under R version 4.5.2
library(readxl)
## Warning: package 'readxl' was built under R version 4.5.2
library(dplyr)
library(tidyr)
library(corrplot)
## Warning: package 'corrplot' was built under R version 4.5.2
## corrplot 0.95 loaded
library(caret)
## Warning: package 'caret' was built under R version 4.5.2
## Loading required package: lattice
## Warning: package 'lattice' was built under R version 4.5.2
##
## Attaching package: 'caret'
##
## The following object is masked from 'package:purrr':
##
## lift
library(glmnet)
## Warning: package 'glmnet' was built under R version 4.5.2
## Loading required package: Matrix
##
## Attaching package: 'Matrix'
##
## The following objects are masked from 'package:tidyr':
##
## expand, pack, unpack
##
## Loaded glmnet 4.1-10
library(MASS)
##
## Attaching package: 'MASS'
##
## The following object is masked from 'package:dplyr':
##
## select
library(np)
## Warning: package 'np' was built under R version 4.5.2
## Nonparametric Kernel Methods for Mixed Datatypes (version 0.60-18)
## [vignette("np_faq",package="np") provides answers to frequently asked questions]
## [vignette("np",package="np") an overview]
## [vignette("entropy_np",package="np") an overview of entropy-based methods]
library(ggplot2)
library(scales)
## Warning: package 'scales' was built under R version 4.5.2
##
## Attaching package: 'scales'
##
## The following object is masked from 'package:purrr':
##
## discard
##
## The following object is masked from 'package:readr':
##
## col_factor
library(gridExtra)
## Warning: package 'gridExtra' was built under R version 4.5.2
##
## Attaching package: 'gridExtra'
##
## The following object is masked from 'package:randomForest':
##
## combine
##
## The following object is masked from 'package:dplyr':
##
## combine
2.数据读取与探索 读取 Ames Housing 数据集,并对目标变量 Sale_Price 进行初步的描述性统计。
# 2.1 读取数据
ames<-read_excel("C:/Users/1/Desktop/ames.xlsx")
# 2.2 目标变量初步描述统计
summary(ames$Sale_Price)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 12789 129500 160000 180796 213500 755000
# 2.2.1 价格直方图)
p1 <- ggplot(ames, aes(x = Sale_Price)) +
geom_histogram(fill = "steelblue", alpha = 0.7, bins = 30) +
labs(title = "房屋销售价格分布(原始)", x = "销售价格(美元)", y = "频数") +
theme_minimal()
p1
p2 <- ggplot(ames, aes(x = log(Sale_Price))) + # 对数转换(log=自然对数,进行线性回归需要符合正态性假设)
geom_histogram(fill = "coral", alpha = 0.7, bins = 30) +
labs(title = "房屋销售价格分布(对数转换)", x = "ln(销售价格)", y = "频数") +
theme_minimal()
p2
grid.arrange(p1, p2, ncol = 2) # 并排显示两图
# 2.2.2 价格箱线图
ggplot(ames, aes(y = Sale_Price)) +
geom_boxplot(fill = "lightgreen", alpha = 0.7) +
labs(title = "房屋销售价格箱线图(异常值识别)", y = "销售价格(美元)") +
theme_minimal()
3.单变量分析
#3.1 核心数值型特征单变量分析
# 3.1.1 地上居住面积(Gr_Liv_Area)分布
p_area <- ggplot(ames, aes(x = Gr_Liv_Area)) +
geom_density(fill = "purple", alpha = 0.5) +
labs(title = "地上居住面积分布", x = "面积(平方英尺)", y = "密度") +
theme_minimal()
# 3.1.2 建造年份(Year_Built)分布
p_year <- ggplot(ames, aes(x = Year_Built)) +
geom_bar(fill = "orange", alpha = 0.7) +
labs(title = "房屋建造年份分布", x = "建造年份", y = "房屋数量") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) # 旋转x轴标签
grid.arrange(p_area, p_year, ncol = 2)
# 3.2 核心分类型特征单变量分析
# 3.2.1 整体质量(1=极差→10=极优)分布
p_qual <- ames %>%
count(Overall_Qual) %>%
ggplot(aes(x = factor(Overall_Qual), y = n)) +
geom_bar(stat = "identity", fill = "#008080", alpha = 0.7) +
labs(title = "房屋整体质量分布", x = "整体质量等级", y = "房屋数量") +
theme_minimal()+
theme(axis.text.x = element_text(angle = 45, hjust = 1))
# 3.2.2 社区分布
p_neighbor <- ames %>%
count(Neighborhood) %>%
arrange(desc(n)) %>%
slice(1:10) %>% # 取数量前10的社区
ggplot(aes(x = reorder(Neighborhood, -n), y = n)) +
geom_bar(stat = "identity", fill = "pink", alpha = 0.7) +
labs(title = "前10大社区房屋数量分布", x = "社区", y = "房屋数量") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
grid.arrange(p_qual, p_neighbor, ncol = 2)
4.双变量分析
#4. 双变量分析
#4.1 数值型特征 、销售价格
# 4.1.1 地上居住面积 、销售价格
p1 <- ggplot(ames, aes(x = Gr_Liv_Area, y = Sale_Price)) +
geom_point(alpha = 0.5, color = "blue") +
geom_smooth(method = "lm", color = "red", se = FALSE) +
labs(title = "地上居住面积 vs 销售价格", x = "面积", y = "销售价格") +
theme_minimal()
# 4.1.2 建造年份、销售价格
p2 <- ggplot(ames, aes(x = Year_Built, y = log(Sale_Price))) +
geom_point(alpha = 0.5, color = "green") +
geom_smooth(method = "lm", color = "red", se = FALSE) +
labs(title = "建造年份 vs ln(销售价格)", x = "建造年份", y = "ln(销售价格)") +
theme_minimal()
grid.arrange(p1, p2, ncol = 2)
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
#4.2 分类型特征、销售价格
# 4.2.1 整体质量(Overall_Qual)、销售价格
p_qual_price <- ggplot(ames, aes(x = factor(Overall_Qual), y = Sale_Price)) +
geom_boxplot(fill = "lightblue", alpha = 0.7) +
labs(title = "整体质量等级、销售价格", x = "整体质量等级", y = "销售价格") +
theme_minimal()+
theme(axis.text.x = element_text(angle = 45, hjust = 1))
#社区、销售价格
top5_neighbor <- ames %>%
count(Neighborhood) %>%
arrange(desc(n)) %>%
slice(1:5) %>%
pull(Neighborhood)
p_neighbor_price <- ames %>%
filter(Neighborhood %in% top5_neighbor) %>%
ggplot(aes(x = Neighborhood, y = Sale_Price)) +
geom_boxplot(fill = "lightyellow", alpha = 0.7) +
labs(title = "前5大社区、销售价格", x = "社区", y = "销售价格") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
grid.arrange(p_qual_price, p_neighbor_price, ncol = 2)
# 4.2.3 ANOVA检验
anova_qual <- aov(Sale_Price ~ factor(Overall_Qual), data = ames)
summary(anova_qual)
## Df Sum Sq Mean Sq F value Pr(>F)
## factor(Overall_Qual) 9 1.313e+13 1.459e+12 765.2 <2e-16 ***
## Residuals 2920 5.566e+12 1.906e+09
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
5.有无中央空调对房屋售价的关系 5.1 分组描述性统计分析
# 按中央空调分组计算售价统计量
price_stats <- ames %>%
group_by(Central_Air) %>%
summarise(
样本数量 = n(),
平均值 = mean(Sale_Price, na.rm = TRUE),
中位数 = median(Sale_Price, na.rm = TRUE),
标准差 = sd(Sale_Price, na.rm = TRUE),
最小值 = min(Sale_Price, na.rm = TRUE),
最大值 = max(Sale_Price, na.rm = TRUE),
第一四分位数 = quantile(Sale_Price, 0.25, na.rm = TRUE),
第三四分位数 = quantile(Sale_Price, 0.75, na.rm = TRUE)
) %>%
mutate(
平均值 = round(平均值, 2),
中位数 = round(中位数, 2),
标准差 = round(标准差, 2),
最小值 = round(最小值, 2),
最大值 = round(最大值, 2),
第一四分位数 = round(第一四分位数, 2),
第三四分位数 = round(第三四分位数, 2)
)
cat("\n=== 按中央空调分组的售价统计 ===\n")
##
## === 按中央空调分组的售价统计 ===
print(price_stats)
## # A tibble: 2 × 9
## Central_Air 样本数量 平均值 中位数 标准差 最小值 最大值 第一四分位数
## <chr> <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 N 196 101890. 98250 37597. 12789 265979 80000
## 2 Y 2734 186453. 166250 79121. 50000 755000 134500
## # ℹ 1 more variable: 第三四分位数 <dbl>
price_diff <- ames %>%
group_by(Central_Air) %>%
summarise(mean_price = mean(Sale_Price, na.rm = TRUE)) %>%
spread(Central_Air, mean_price) %>%
mutate(
绝对差异 = Y - N,
相对差异百分比 = (Y - N) / N * 100
) %>%
mutate(
绝对差异 = round(绝对差异, 2),
相对差异百分比 = round(相对差异百分比, 2)
)
cat("\n=== 售价差异分析 ===\n")
##
## === 售价差异分析 ===
print(price_diff)
## # A tibble: 1 × 4
## N Y 绝对差异 相对差异百分比
## <dbl> <dbl> <dbl> <dbl>
## 1 101890. 186453. 84562. 83.0
5.2 统计前提检验
# 5.2.1 正态性检验
cat("\n=== 正态性检验 ===\n")
##
## === 正态性检验 ===
# 有中央空调组
set.seed(42) # 设置随机种子保证结果可重复
y_sample <- sample(ames$Sale_Price[ames$Central_Air == "Y"], 1000)
shapiro_y <- shapiro.test(y_sample)
ks_y <- lillie.test(y_sample)
# 无中央空调组
n_prices <- ames$Sale_Price[ames$Central_Air == "N"]
shapiro_n <- shapiro.test(n_prices)
ks_n <- lillie.test(n_prices)
cat("有中央空调组:\n")
## 有中央空调组:
cat("Shapiro-Wilk检验 p值:", round(shapiro_y$p.value, 6), "\n")
## Shapiro-Wilk检验 p值: 0
cat("Lilliefors检验 p值:", round(ks_y$p.value, 6), "\n")
## Lilliefors检验 p值: 0
cat("\n无中央空调组:\n")
##
## 无中央空调组:
cat("Shapiro-Wilk检验 p值:", round(shapiro_n$p.value, 6), "\n")
## Shapiro-Wilk检验 p值: 5e-06
cat("Lilliefors检验 p值:", round(ks_n$p.value, 6), "\n")
## Lilliefors检验 p值: 0.030081
cat("\n结论:p值 < 0.05,拒绝正态性假设,数据不符合正态分布\n")
##
## 结论:p值 < 0.05,拒绝正态性假设,数据不符合正态分布
# 5.2.2 方差齐性检验
cat("\n=== 方差齐性检验 ===\n")
##
## === 方差齐性检验 ===
levene_test <- leveneTest(Sale_Price ~ Central_Air, data = ames)
## Warning in leveneTest.default(y = y, group = group, ...): group coerced to
## factor.
print(levene_test)
## Levene's Test for Homogeneity of Variance (center = median)
## Df F value Pr(>F)
## group 1 40.678 2.078e-10 ***
## 2928
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
cat("结论:p值 < 0.05,拒绝方差齐性假设,两组方差不齐\n")
## 结论:p值 < 0.05,拒绝方差齐性假设,两组方差不齐
5.3 分组对比检验(非参数检验)
cat("\n=== Mann-Whitney U检验 ===\n")
##
## === Mann-Whitney U检验 ===
mw_test <- wilcox.test(Sale_Price ~ Central_Air, data = ames,
alternative = "two.sided",
conf.int = TRUE)
print(mw_test)
##
## Wilcoxon rank sum test with continuity correction
##
## data: Sale_Price by Central_Air
## W = 63164, p-value < 2.2e-16
## alternative hypothesis: true location shift is not equal to 0
## 95 percent confidence interval:
## -77400 -63000
## sample estimates:
## difference in location
## -70000
cat("\n=== 检验结果解释 ===\n")
##
## === 检验结果解释 ===
if (mw_test$p.value < 0.001) {
sig_level <- "p < 0.001"
} else if (mw_test$p.value < 0.01) {
sig_level <- "p < 0.01"
} else if (mw_test$p.value < 0.05) {
sig_level <- "p < 0.05"
} else {
sig_level <- "p ≥ 0.05"
}
cat("Mann-Whitney U检验结果:", sig_level, "\n")
## Mann-Whitney U检验结果: p < 0.001
if (mw_test$p.value < 0.05) {
cat("结论:在0.05显著性水平下,拒绝原假设,有无中央空调的房屋售价存在显著差异\n")
} else {
cat("结论:在0.05显著性水平下,不能拒绝原假设,有无中央空调的房屋售价无显著差异\n")
}
## 结论:在0.05显著性水平下,拒绝原假设,有无中央空调的房屋售价存在显著差异
5.4 数据可视化
# 5.4.1 箱线图
boxplot_plot <- ggplot(ames, aes(x = Central_Air, y = Sale_Price, fill = Central_Air)) +
geom_boxplot(alpha = 0.7, width = 0.6) +
stat_summary(fun = mean, geom = "point", shape = 21, size = 4,
color = "white", fill = "black") +
stat_summary(fun = mean, geom = "text", aes(label = paste0("$", round(..y.., 0))),
vjust = -1, size = 3.5) +
scale_fill_manual(values = c("#e74c3c", "#3498db"),
labels = c("无中央空调", "有中央空调")) +
labs(
x = "中央空调状态",
y = "房屋售价",
title = "有无中央空调对房屋售价的影响",
subtitle = paste0("Mann-Whitney U检验: p = ", round(mw_test$p.value, 6)),
fill = "中央空调状态"
) +
theme_minimal() +
theme(
plot.title = element_text(hjust = 0.5, size = 14, face = "bold"),
plot.subtitle = element_text(hjust = 0.5, size = 12),
axis.title = element_text(size = 12),
axis.text = element_text(size = 10),
legend.title = element_text(size = 12),
legend.text = element_text(size = 10)
) +
scale_y_continuous(labels = scales::dollar_format()) +
scale_x_discrete(labels = c("无中央空调", "有中央空调"))
# 5.4.2 直方图
histogram_plot <- ggplot(ames, aes(x = Sale_Price, fill = Central_Air)) +
geom_histogram(alpha = 0.6, position = "identity", bins = 30) +
facet_wrap(~Central_Air, labeller = labeller(Central_Air = c("N" = "无中央空调", "Y" = "有中央空调"))) +
scale_fill_manual(values = c("#e74c3c", "#3498db"), guide = "none") +
labs(
x = "房屋售价",
y = "频数",
title = "有无中央空调房屋售价分布直方图"
) +
theme_minimal() +
theme(
plot.title = element_text(hjust = 0.5, size = 14, face = "bold"),
axis.title = element_text(size = 12),
axis.text = element_text(size = 10),
strip.text = element_text(size = 12)
) +
scale_x_continuous(labels = scales::dollar_format())
combined_plot <- grid.arrange(boxplot_plot, histogram_plot, ncol = 1, heights = c(1, 0.8))
## Warning: The dot-dot notation (`..y..`) was deprecated in ggplot2 3.4.0.
## ℹ Please use `after_stat(y)` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
6.缺失值处理
# 设置随机种子
set.seed(42)
# 6. 缺失值处理
# 6.1 查看缺失值比例
missing_ratio <- ames %>%
summarise_all(~sum(is.na(.))/n() * 100) %>%
pivot_longer(everything(), names_to = "变量", values_to = "缺失比例(%)") %>%
filter(`缺失比例(%)` > 0) %>%
arrange(desc(`缺失比例(%)`))
cat("\n=== 缺失值统计(仅显示有缺失的变量) ===\n")
##
## === 缺失值统计(仅显示有缺失的变量) ===
print(head(missing_ratio, 10))
## # A tibble: 0 × 2
## # ℹ 2 variables: 变量 <chr>, 缺失比例(%) <dbl>
# 6.2 处理缺失值
# - 删除缺失比例>50%的变量
library(dplyr)
high_missing_vars <- missing_ratio %>%
filter(`缺失比例(%)` > 50) %>%
pull(`变量`)
ames <- ames %>% dplyr::select(-all_of(high_missing_vars))
# 6.3 数值变量:用中位数填充
# 6.3.1 筛选所有数值型变量
num_vars <- ames %>% select_if(is.numeric) %>% names()
ames[num_vars] <- ames[num_vars] %>%
mutate(
across(
everything(),
~ifelse(
. == 0,
median(.[. != 0], na.rm = TRUE),
.
)
)
)
# 6.4 分类变量
cat_vars <- ames %>% select_if(is.character) %>% names()
ames[cat_vars] <- ames[cat_vars] %>%
mutate(across(everything(), ~replace_na(., names(sort(table(.), decreasing = TRUE))[1])))
# 6.5 异常值处理
iqr <- IQR(ames$Sale_Price, na.rm = TRUE)
q1 <- quantile(ames$Sale_Price, 0.25, na.rm = TRUE)
q3 <- quantile(ames$Sale_Price, 0.75, na.rm = TRUE)
lower_bound <- q1 - 1.5 * iqr
upper_bound <- q3 + 1.5 * iqr
# 6.6删除售价异常值
ames_clean <- ames %>% filter(Sale_Price >= lower_bound & Sale_Price <= upper_bound)
cat("\n=== 异常值处理结果 ===\n")
##
## === 异常值处理结果 ===
cat("原始数据量:", nrow(ames), "\n")
## 原始数据量: 2930
cat("清洗后数据量:", nrow(ames_clean), "\n")
## 清洗后数据量: 2793
cat("删除异常值数量:", nrow(ames) - nrow(ames_clean), "\n")
## 删除异常值数量: 137
# 6.7 变量类型转换
cat_vars <- c(
# 一、字符型分类变量(35个)
"MS_SubClass", "MS_Zoning", "Street", "Alley", "Lot_Shape",
"Land_Contour", "Utilities", "Lot_Config", "Land_Slope", "Neighborhood",
"Condition_1", "Condition_2", "Bldg_Type", "House_Style", "Roof_Style",
"Roof_Matl", "Exterior_1st", "Exterior_2nd", "Mas_Vnr_Type", "Exter_Qual",
"Exter_Cond", "Foundation", "Bsmt_Qual", "Bsmt_Cond", "Bsmt_Exposure",
"BsmtFin_Type_1", "BsmtFin_Type_2", "Heating", "Heating_QC", "Central_Air",
"Electrical", "Kitchen_Qual", "Functional", "Fireplace_Qu", "Garage_Type",
"Garage_Finish", "Garage_Qual", "Garage_Cond", "Paved_Drive", "Pool_QC",
"Fence", "Misc_Feature", "Sale_Type", "Sale_Condition",
# 二、数值编码分类变量
"Overall_Qual", "Overall_Cond", "Mo_Sold"
)
ames_clean[cat_vars] <- ames_clean[cat_vars] %>% mutate(across(everything(), as.factor))
# 6.8 数据划分
train_index <- createDataPartition(ames_clean$Sale_Price, p = 0.7, list = FALSE)
train_data <- ames_clean[train_index, ]
test_data <- ames_clean[-train_index, ]
cat("\n=== 数据划分结果 ===\n")
##
## === 数据划分结果 ===
cat("训练集数量:", nrow(train_data), "\n")
## 训练集数量: 1957
cat("测试集数量:", nrow(test_data), "\n")
## 测试集数量: 836
7.1 计算数值变量相关系数(Pearson)
num_vars_clean <- train_data %>%
select_if(is.numeric) %>%
names()
cor_matrix <- cor(train_data[num_vars_clean], use = "complete.obs")
library(dplyr)
sale_corr <- as.data.frame(cor_matrix[, "Sale_Price"]) %>%
rename(correlation = 1) %>%
mutate(variable = rownames(.)) %>%
filter(variable != "sale_Price") %>%
arrange(desc(abs(correlation))) %>%
head(15)
plot_vars <- c(sale_corr$variable, "Sale_Price")
cor_plot_matrix <- cor_matrix[plot_vars, plot_vars]
cat("\n=== 与房屋售价相关性Top15变量热力图 ===\n")
##
## === 与房屋售价相关性Top15变量热力图 ===
corrplot(
cor_plot_matrix,
method = "color",
type = "upper",
addCoef.col = "black",
number.cex = 0.6,
tl.cex = 0.7,
main = "与房屋售价相关性Top15变量热力图",
mar = c(1, 1, 2, 1)
)
# 7.1.5 输出相关性结果
cat("\n=== 与房屋售价相关性Top10变量 ===\n")
##
## === 与房屋售价相关性Top10变量 ===
print(head(sale_corr, 10))
## correlation variable
## Sale_Price 1.0000000 Sale_Price
## Gr_Liv_Area 0.6558209 Gr_Liv_Area
## Year_Built 0.5908400 Year_Built
## Full_Bath 0.5797702 Full_Bath
## Garage_Cars 0.5688825 Garage_Cars
## Year_Remod_Add 0.5612162 Year_Remod_Add
## Garage_Area 0.5532755 Garage_Area
## First_Flr_SF 0.5216260 First_Flr_SF
## Total_Bsmt_SF 0.5069790 Total_Bsmt_SF
## TotRms_AbvGrd 0.4546589 TotRms_AbvGrd
7.2变量筛选
#7.2.1方法1:单变量筛选
# 选择相关系数绝对值>0.3的变量
selected_vars_corr <- sale_corr %>%
filter(abs(correlation) > 0.3) %>%
pull(variable)
cat("\n=== 单变量筛选结果 ===\n")
##
## === 单变量筛选结果 ===
print(selected_vars_corr)
## [1] "Sale_Price" "Gr_Liv_Area" "Year_Built" "Full_Bath"
## [5] "Garage_Cars" "Year_Remod_Add" "Garage_Area" "First_Flr_SF"
## [9] "Total_Bsmt_SF" "TotRms_AbvGrd" "Second_Flr_SF" "Longitude"
# 7.2.2 方法2:逐步回归
# 构建初始线性模型
initial_model <- lm(Sale_Price ~ ., data = train_data[num_vars_clean])
# 逐步回归
step_model <- stepAIC(initial_model, direction = "both", trace = FALSE)
selected_vars_step <- names(coef(step_model))[-1]
cat("\n=== 逐步回归筛选结果 ===\n")
##
## === 逐步回归筛选结果 ===
print(selected_vars_step)
## [1] "Lot_Area" "Year_Built" "Year_Remod_Add" "BsmtFin_SF_1"
## [5] "BsmtFin_SF_2" "Total_Bsmt_SF" "First_Flr_SF" "Second_Flr_SF"
## [9] "Gr_Liv_Area" "Bsmt_Full_Bath" "Bsmt_Half_Bath" "Full_Bath"
## [13] "Bedroom_AbvGr" "Kitchen_AbvGr" "TotRms_AbvGrd" "Fireplaces"
## [17] "Garage_Cars" "Garage_Area" "Wood_Deck_SF" "Enclosed_Porch"
## [21] "Screen_Porch" "Misc_Val" "Year_Sold" "Latitude"
# 7.2.3 方法3:LASSO回归
x <- model.matrix(Sale_Price ~ ., data = train_data[num_vars_clean])[, -1]
y <- train_data$Sale_Price
lasso_cv <- cv.glmnet(x, y, alpha = 1, nfolds = 10)
lasso_coef <- coef(lasso_cv, s = "lambda.min")
selected_vars_lasso <- rownames(lasso_coef)[lasso_coef[, 1] != 0][-1]
cat("\n=== LASSO回归筛选结果 ===\n")
##
## === LASSO回归筛选结果 ===
print(selected_vars_lasso)
## [1] "Lot_Area" "Year_Built" "Year_Remod_Add" "BsmtFin_SF_1"
## [5] "BsmtFin_SF_2" "Total_Bsmt_SF" "First_Flr_SF" "Second_Flr_SF"
## [9] "Gr_Liv_Area" "Full_Bath" "Kitchen_AbvGr" "Fireplaces"
## [13] "Garage_Cars" "Garage_Area" "Wood_Deck_SF" "Misc_Val"
## [17] "Latitude"
7.3确定最终建模变量
final_vars <- intersect(intersect(selected_vars_corr, selected_vars_step), selected_vars_lasso)
if (length(final_vars) == 0) final_vars <- unique(c(selected_vars_corr, selected_vars_step, selected_vars_lasso))
cat("\n=== 最终建模变量===\n")
##
## === 最终建模变量===
print(final_vars)
## [1] "Gr_Liv_Area" "Year_Built" "Full_Bath" "Garage_Cars"
## [5] "Year_Remod_Add" "Garage_Area" "First_Flr_SF" "Total_Bsmt_SF"
## [9] "Second_Flr_SF"
7.4准备最终建模数据
library(dplyr)
final_vars <- as.character(final_vars)
train_model_data <- train_data %>%
dplyr::select(one_of(c(final_vars, "Sale_Price")))
test_model_data <- test_data %>%
dplyr::select(one_of(c(final_vars, "Sale_Price")))
# 7.4.1构建线性模型
lm_model <- lm(Sale_Price ~ ., data = train_model_data)
cat("\n=== 线性回归模型Summary ===\n")
##
## === 线性回归模型Summary ===
print(summary(lm_model))
##
## Call:
## lm(formula = Sale_Price ~ ., data = train_model_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -376995 -17233 -1869 16230 133152
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -2.248e+06 8.275e+04 -27.163 < 2e-16 ***
## Gr_Liv_Area 4.906e+01 2.556e+00 19.194 < 2e-16 ***
## Year_Built 5.779e+02 3.466e+01 16.675 < 2e-16 ***
## Full_Bath -7.910e+02 1.984e+03 -0.399 0.690176
## Garage_Cars 6.132e+03 2.364e+03 2.594 0.009551 **
## Year_Remod_Add 5.769e+02 4.503e+01 12.812 < 2e-16 ***
## Garage_Area 2.233e+01 8.060e+00 2.770 0.005660 **
## First_Flr_SF 1.782e+01 4.272e+00 4.173 3.14e-05 ***
## Total_Bsmt_SF 7.448e+00 4.037e+00 1.845 0.065210 .
## Second_Flr_SF 1.870e+01 5.326e+00 3.511 0.000456 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 32200 on 1947 degrees of freedom
## Multiple R-squared: 0.7092, Adjusted R-squared: 0.7078
## F-statistic: 527.5 on 9 and 1947 DF, p-value: < 2.2e-16
lm_pred <- predict(lm_model, newdata = test_model_data)
lm_metrics <- data.frame(
模型 = "线性回归",
RMSE = sqrt(mean((lm_pred - test_model_data$Sale_Price)^2)),
MAE = mean(abs(lm_pred - test_model_data$Sale_Price)),
R2 = cor(lm_pred, test_model_data$Sale_Price)^2
)
cat("\n=== 线性回归模型测试集评估 ===\n")
##
## === 线性回归模型测试集评估 ===
print(lm_metrics)
## 模型 RMSE MAE R2
## 1 线性回归 30260.47 22198.23 0.7244256
# 7.4.2构建多项式回归模型
poly_var <- if ("Gr_LIV_Area" %in% final_vars) "Gr_LIV_Area" else final_vars[1]
train_poly_data <- train_model_data %>%
mutate(
poly_2 = .data[[poly_var]] ^ 2,
poly_3 = .data[[poly_var]] ^ 3
)
test_poly_data <- test_model_data %>%
mutate(
poly_2 = .data[[poly_var]] ^ 2,
poly_3 = .data[[poly_var]] ^ 3
)
# 构建多项式回归模型
train_poly_data_filtered <- train_poly_data[, !names(train_poly_data) %in% poly_var, drop = FALSE]
poly_model <- lm(Sale_Price ~ ., data = train_poly_data_filtered)
test_poly_data_filtered <- test_poly_data[, names(test_poly_data) != poly_var, drop = FALSE]
poly_pred <- predict(poly_model, newdata = test_poly_data_filtered)
poly_metrics <- data.frame(
模型 = "多项式回归",
RMSE = sqrt(mean((poly_pred - test_poly_data$Sale_Price)^2)),
MAE = mean(abs(poly_pred - test_poly_data$Sale_Price)),
R2 = cor(poly_pred, test_poly_data$Sale_Price)^2
)
cat("\n=== 多项式回归模型测试集评估 ===\n")
##
## === 多项式回归模型测试集评估 ===
print(poly_metrics)
## 模型 RMSE MAE R2
## 1 多项式回归 28898.29 21293.65 0.7497456
y_test <- test_model_data$Sale_Price
if (!require(kknn)) install.packages("kknn")
## Loading required package: kknn
## Warning: package 'kknn' was built under R version 4.5.2
##
## Attaching package: 'kknn'
## The following object is masked from 'package:caret':
##
## contr.dummy
library(kknn)
# 核回归
set.seed(42)
kknn_model <- train.kknn(
Sale_Price ~ .,
data = train_model_data,
kmax = 200,
kernel = "epanechnikov",
scale = TRUE
)
np_pred <- predict(kknn_model, newdata = test_model_data)
np_metrics <- data.frame(
模型 = "核回归",
RMSE = sqrt(mean((np_pred - y_test)^2)),
MAE = mean(abs(np_pred - y_test)),
R2 = cor(np_pred, y_test)^2
)
print(np_metrics)
## 模型 RMSE MAE R2
## 1 核回归 27145.1 19264.5 0.7782424
8.三种模型效果对比
cat("\n=== 三种模型效果对比 ===\n")
##
## === 三种模型效果对比 ===
if (!require(grid)) install.packages("grid")
## Loading required package: grid
if (!require(gridExtra)) install.packages("gridExtra")
if (!require(dplyr)) install.packages("dplyr")
if (!require(tidyr)) install.packages("tidyr")
if (!require(ggplot2)) install.packages("ggplot2")
library(grid)
library(gridExtra)
library(dplyr)
library(tidyr)
library(ggplot2)
model_comparison <- bind_rows(
lm_metrics,
poly_metrics,
np_metrics
) %>%
mutate(
RMSE = round(RMSE, 2),
MAE = round(MAE, 2),
R2 = round(R2, 4)
)
cat("\n=== 模型评估指标对比表 ===\n")
##
## === 模型评估指标对比表 ===
print(model_comparison)
## 模型 RMSE MAE R2
## 1 线性回归 30260.47 22198.23 0.7244
## 2 多项式回归 28898.29 21293.65 0.7497
## 3 核回归 27145.10 19264.50 0.7782
colnames(model_comparison) <- c("Model", "RMSE", "MAE", "R2")
error_metrics <- model_comparison %>%
dplyr::select(Model, RMSE, MAE) %>%
tidyr::pivot_longer(cols = c(RMSE, MAE),
names_to = "Metric",
values_to = "Value")
model_labels <- c(
"线性回归" = "Linear Regression",
"多项式回归" = "Polynomial Regression",
"核回归" = "Kernel Regression"
)
error_metrics <- error_metrics %>%
mutate(Model_CN = names(model_labels)[match(Model, model_labels)]) %>%
mutate(Model_CN = ifelse(is.na(Model_CN), Model, Model_CN))
cat("\n=== 误差指标对比图===\n")
##
## === 误差指标对比图===
p_error <- ggplot(error_metrics, aes(x = Model_CN, y = Value, fill = Metric)) +
geom_bar(stat = "identity", position = position_dodge(0.8), alpha = 0.8) +
geom_text(aes(label = Value),
position = position_dodge(0.8),
vjust = -0.5,
size = 3.5
) +
labs(
title = "模型误差指标对比",
subtitle = "数值越小表示模型预测越准确",
x = "模型",
y = "误差值",
fill = "指标"
) +
scale_fill_manual(values = c("#FF6B6B", "#4ECDC4"),
labels = c("RMSE", "MAE")) +
theme_minimal() +
theme(
plot.title = element_text(hjust = 0.5, size = 14, face = "bold"),
plot.subtitle = element_text(hjust = 0.5, size = 11),
axis.text.x = element_text(angle = 15, hjust = 1)
)
print(p_error)
r2_data <- model_comparison %>%
mutate(Model_CN = names(model_labels)[match(Model, model_labels)]) %>%
mutate(Model_CN = ifelse(is.na(Model_CN), Model, Model_CN))
cat("\n=== 拟合优度对比图 ===\n")
##
## === 拟合优度对比图 ===
p_r2 <- ggplot(r2_data, aes(x = Model_CN, y = R2, fill = Model_CN)) +
geom_bar(stat = "identity", alpha = 0.8, width = 0.6) +
geom_text(aes(label = paste0(round(R2 * 100, 1), "%")),
vjust = -0.5,
size = 4,
fontface = "bold"
) +
geom_hline(yintercept = 0.8, linetype = "dashed", color = "red", alpha = 0.5) +
annotate("text", x = 2.5, y = 0.82,
label = "优秀阈值 (R² > 0.8)",
color = "red", size = 3
) +
labs(
title = "模型拟合优度对比",
subtitle = "R²越接近1表示模型解释力越强",
x = "模型",
y = "决定系数 "
) +
scale_fill_manual(values = c("#3498DB", "#2ECC71", "#9B59B6")) +
theme_minimal() +
theme(
plot.title = element_text(hjust = 0.5, size = 14, face = "bold"),
plot.subtitle = element_text(hjust = 0.5, size = 11),
axis.text.x = element_text(angle = 15, hjust = 1),
legend.position = "none"
) +
ylim(0, 1)
print(p_r2)
cat("\n=== 实际值 vs 预测值对比图 ===\n")
##
## === 实际值 vs 预测值对比图 ===
comparison_data <- data.frame(
Actual = rep(test_model_data$Sale_Price, 3),
Predicted = c(lm_pred, poly_pred, np_pred),
Model = rep(c("Linear Regression", "Polynomial Regression", "Kernel Regression"),
each = length(test_model_data$Sale_Price)
)
)
comparison_data <- comparison_data %>%
mutate(Model_CN = case_when(
Model == "Linear Regression" ~ "线性回归",
Model == "Polynomial Regression" ~ "多项式回归",
Model == "Kernel Regression" ~ "核回归",
TRUE ~ Model
))
r2_labels <- model_comparison %>%
mutate(
label = paste0("R² = ", round(R2, 3)),
Model_CN = names(model_labels)[match(Model, model_labels)]
) %>%
mutate(Model_CN = ifelse(is.na(Model_CN), Model, Model_CN))
p_scatter <- ggplot(comparison_data, aes(x = Actual, y = Predicted, color = Model_CN)) +
geom_point(alpha = 0.4, size = 1.5) +
geom_abline(slope = 1, intercept = 0,
color = "red", linetype = "dashed", size = 0.8
) +
facet_wrap(~Model_CN, ncol = 3) +
geom_text(data = r2_labels,
aes(x = Inf, y = -Inf, label = label),
hjust = 1.1, vjust = -0.5,
size = 4, color = "black", fontface = "bold"
) +
labs(
title = "实际值 vs 预测值对比",
subtitle = "红色虚线完美预测线",
x = "实际售价",
y = "预测售价"
) +
scale_color_manual(values = c("#3498DB", "#2ECC71", "#9B59B6")) +
scale_x_continuous(labels = scales::dollar_format()) +
scale_y_continuous(labels = scales::dollar_format()) +
theme_minimal() +
theme(
plot.title = element_text(hjust = 0.5, size = 14, face = "bold"),
plot.subtitle = element_text(hjust = 0.5, size = 11),
strip.text = element_text(size = 11, face = "bold"),
legend.position = "none"
)
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
print(p_scatter)
if (!require(patchwork)) install.packages("patchwork")
## Loading required package: patchwork
## Warning: package 'patchwork' was built under R version 4.5.2
##
## Attaching package: 'patchwork'
## The following object is masked from 'package:MASS':
##
## area
library(patchwork)
cat("\n=== 模型对比综合图 ===\n")
##
## === 模型对比综合图 ===
combined_plot <- p_error / p_r2 / p_scatter +
plot_layout(heights = c(1, 1, 2)) + # 调整三个子图的高度比例
plot_annotation(
title = "三种回归模型效果对比",
theme = theme(plot.title = element_text(size = 16, face = "bold", hjust = 0.5))
)
print(combined_plot)
cat("\n=== 模型选择建议 ===\n")
##
## === 模型选择建议 ===
best_rmse_model <- r2_data$Model_CN[which.min(r2_data$RMSE)]
best_mae_model <- r2_data$Model_CN[which.min(r2_data$MAE)]
best_r2_model <- r2_data$Model_CN[which.max(r2_data$R2)]
cat("1. 基于RMSE最优模型:", best_rmse_model, "\n")
## 1. 基于RMSE最优模型: 核回归
cat("2. 基于MAE最优模型:", best_mae_model, "\n")
## 2. 基于MAE最优模型: 核回归
cat("3. 基于R²最优模型:", best_r2_model, "\n\n")
## 3. 基于R²最优模型: 核回归
# 综合评估
if (best_rmse_model == best_mae_model && best_mae_model == best_r2_model) {
cat("综合结论:", best_rmse_model, "在所有指标上表现最优,推荐使用。\n")
} else {
cat("综合结论:各模型在不同指标上各有优势:\n")
cat(" - 若关注整体预测精度,推荐", best_rmse_model, "\n")
cat(" - 若关注预测偏差的稳定性,推荐", best_mae_model, "\n")
cat(" - 若关注模型解释力,推荐", best_r2_model, "\n")
}
## 综合结论: 核回归 在所有指标上表现最优,推荐使用。