This is our Markdown document.

1.环境准备与包加载 在本阶段,我们检查并安装所需的 R 包,并加载分析所需的核心库与数据。

library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.5.2
## Warning: package 'ggplot2' was built under R version 4.5.2
## Warning: package 'readr' was built under R version 4.5.2
## Warning: package 'forcats' was built under R version 4.5.2
## Warning: package 'lubridate' was built under R version 4.5.2
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.6
## ✔ forcats   1.0.1     ✔ stringr   1.5.2
## ✔ ggplot2   4.0.1     ✔ tibble    3.3.0
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.1.0     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(car)  
## Warning: package 'car' was built under R version 4.5.2
## Loading required package: carData
## 
## Attaching package: 'car'
## 
## The following object is masked from 'package:dplyr':
## 
##     recode
## 
## The following object is masked from 'package:purrr':
## 
##     some
library(randomForest)
## Warning: package 'randomForest' was built under R version 4.5.2
## randomForest 4.7-1.2
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## 
## The following object is masked from 'package:dplyr':
## 
##     combine
## 
## The following object is masked from 'package:ggplot2':
## 
##     margin
library(nortest)
## Warning: package 'nortest' was built under R version 4.5.2
library(PMCMRplus) 
## Warning: package 'PMCMRplus' was built under R version 4.5.2
library(ggpubr) 
## Warning: package 'ggpubr' was built under R version 4.5.2
library(readxl)
## Warning: package 'readxl' was built under R version 4.5.2
library(dplyr)
library(tidyr)
library(corrplot)
## Warning: package 'corrplot' was built under R version 4.5.2
## corrplot 0.95 loaded
library(caret)
## Warning: package 'caret' was built under R version 4.5.2
## Loading required package: lattice
## Warning: package 'lattice' was built under R version 4.5.2
## 
## Attaching package: 'caret'
## 
## The following object is masked from 'package:purrr':
## 
##     lift
library(glmnet)
## Warning: package 'glmnet' was built under R version 4.5.2
## Loading required package: Matrix
## 
## Attaching package: 'Matrix'
## 
## The following objects are masked from 'package:tidyr':
## 
##     expand, pack, unpack
## 
## Loaded glmnet 4.1-10
library(MASS)
## 
## Attaching package: 'MASS'
## 
## The following object is masked from 'package:dplyr':
## 
##     select
library(np)
## Warning: package 'np' was built under R version 4.5.2
## Nonparametric Kernel Methods for Mixed Datatypes (version 0.60-18)
## [vignette("np_faq",package="np") provides answers to frequently asked questions]
## [vignette("np",package="np") an overview]
## [vignette("entropy_np",package="np") an overview of entropy-based methods]
library(ggplot2)
library(scales)
## Warning: package 'scales' was built under R version 4.5.2
## 
## Attaching package: 'scales'
## 
## The following object is masked from 'package:purrr':
## 
##     discard
## 
## The following object is masked from 'package:readr':
## 
##     col_factor
library(gridExtra)
## Warning: package 'gridExtra' was built under R version 4.5.2
## 
## Attaching package: 'gridExtra'
## 
## The following object is masked from 'package:randomForest':
## 
##     combine
## 
## The following object is masked from 'package:dplyr':
## 
##     combine

2.数据读取与探索 读取 Ames Housing 数据集,并对目标变量 Sale_Price 进行初步的描述性统计。

# 2.1 读取数据
ames<-read_excel("C:/Users/1/Desktop/ames.xlsx")

# 2.2 目标变量初步描述统计
summary(ames$Sale_Price)  
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   12789  129500  160000  180796  213500  755000
# 2.2.1 价格直方图)
p1 <- ggplot(ames, aes(x = Sale_Price)) +
  geom_histogram(fill = "steelblue", alpha = 0.7, bins = 30) +
  labs(title = "房屋销售价格分布(原始)", x = "销售价格(美元)", y = "频数") +
  theme_minimal()
p1

p2 <- ggplot(ames, aes(x = log(Sale_Price))) +  # 对数转换(log=自然对数,进行线性回归需要符合正态性假设)
  geom_histogram(fill = "coral", alpha = 0.7, bins = 30) +
  labs(title = "房屋销售价格分布(对数转换)", x = "ln(销售价格)", y = "频数") +
  theme_minimal()
p2

grid.arrange(p1, p2, ncol = 2)  # 并排显示两图

# 2.2.2 价格箱线图
ggplot(ames, aes(y = Sale_Price)) +
  geom_boxplot(fill = "lightgreen", alpha = 0.7) +
  labs(title = "房屋销售价格箱线图(异常值识别)", y = "销售价格(美元)") +
  theme_minimal()

3.单变量分析

#3.1 核心数值型特征单变量分析
# 3.1.1 地上居住面积(Gr_Liv_Area)分布
p_area <- ggplot(ames, aes(x = Gr_Liv_Area)) +
  geom_density(fill = "purple", alpha = 0.5) +
  labs(title = "地上居住面积分布", x = "面积(平方英尺)", y = "密度") +
  theme_minimal()
# 3.1.2 建造年份(Year_Built)分布
p_year <- ggplot(ames, aes(x = Year_Built)) +
  geom_bar(fill = "orange", alpha = 0.7) +
  labs(title = "房屋建造年份分布", x = "建造年份", y = "房屋数量") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))  # 旋转x轴标签
grid.arrange(p_area, p_year, ncol = 2)

# 3.2 核心分类型特征单变量分析
# 3.2.1 整体质量(1=极差→10=极优)分布
p_qual <- ames %>%
  count(Overall_Qual) %>% 
  ggplot(aes(x = factor(Overall_Qual), y = n)) +
  geom_bar(stat = "identity", fill = "#008080", alpha = 0.7) +
  labs(title = "房屋整体质量分布", x = "整体质量等级", y = "房屋数量") +
  theme_minimal()+
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
# 3.2.2 社区分布
p_neighbor <- ames %>%
  count(Neighborhood) %>%
  arrange(desc(n)) %>%
  slice(1:10) %>%  # 取数量前10的社区
  ggplot(aes(x = reorder(Neighborhood, -n), y = n)) + 
  geom_bar(stat = "identity", fill = "pink", alpha = 0.7) +
  labs(title = "前10大社区房屋数量分布", x = "社区", y = "房屋数量") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
grid.arrange(p_qual, p_neighbor, ncol = 2)

4.双变量分析

#4. 双变量分析
#4.1 数值型特征 、销售价格
# 4.1.1 地上居住面积 、销售价格
p1 <- ggplot(ames, aes(x = Gr_Liv_Area, y = Sale_Price)) +
  geom_point(alpha = 0.5, color = "blue") +
  geom_smooth(method = "lm", color = "red", se = FALSE) +  
  labs(title = "地上居住面积 vs 销售价格", x = "面积", y = "销售价格") +
  theme_minimal()

# 4.1.2 建造年份、销售价格
p2 <- ggplot(ames, aes(x = Year_Built, y = log(Sale_Price))) +
  geom_point(alpha = 0.5, color = "green") +
  geom_smooth(method = "lm", color = "red", se = FALSE) +
  labs(title = "建造年份 vs ln(销售价格)", x = "建造年份", y = "ln(销售价格)") +
  theme_minimal()
grid.arrange(p1, p2, ncol = 2)
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'

#4.2 分类型特征、销售价格
# 4.2.1 整体质量(Overall_Qual)、销售价格
p_qual_price <- ggplot(ames, aes(x = factor(Overall_Qual), y = Sale_Price)) +
  geom_boxplot(fill = "lightblue", alpha = 0.7) +
  labs(title = "整体质量等级、销售价格", x = "整体质量等级", y = "销售价格") +
  theme_minimal()+
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
#社区、销售价格
top5_neighbor <- ames %>%
  count(Neighborhood) %>%
  arrange(desc(n)) %>%
  slice(1:5) %>%
  pull(Neighborhood)  

p_neighbor_price <- ames %>%
  filter(Neighborhood %in% top5_neighbor) %>%
  ggplot(aes(x = Neighborhood, y = Sale_Price)) +
  geom_boxplot(fill = "lightyellow", alpha = 0.7) +
  labs(title = "前5大社区、销售价格", x = "社区", y = "销售价格") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
grid.arrange(p_qual_price, p_neighbor_price, ncol = 2)

# 4.2.3 ANOVA检验
anova_qual <- aov(Sale_Price ~ factor(Overall_Qual), data = ames)
summary(anova_qual)  
##                        Df    Sum Sq   Mean Sq F value Pr(>F)    
## factor(Overall_Qual)    9 1.313e+13 1.459e+12   765.2 <2e-16 ***
## Residuals            2920 5.566e+12 1.906e+09                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

5.有无中央空调对房屋售价的关系 5.1 分组描述性统计分析

# 按中央空调分组计算售价统计量
price_stats <- ames %>%
  group_by(Central_Air) %>%
  summarise(
    样本数量 = n(),
    平均值 = mean(Sale_Price, na.rm = TRUE),
    中位数 = median(Sale_Price, na.rm = TRUE),
    标准差 = sd(Sale_Price, na.rm = TRUE),
    最小值 = min(Sale_Price, na.rm = TRUE),
    最大值 = max(Sale_Price, na.rm = TRUE),
    第一四分位数 = quantile(Sale_Price, 0.25, na.rm = TRUE),
    第三四分位数 = quantile(Sale_Price, 0.75, na.rm = TRUE)
  ) %>%
  mutate(
    平均值 = round(平均值, 2),
    中位数 = round(中位数, 2),
    标准差 = round(标准差, 2),
    最小值 = round(最小值, 2),
    最大值 = round(最大值, 2),
    第一四分位数 = round(第一四分位数, 2),
    第三四分位数 = round(第三四分位数, 2)
  )

cat("\n=== 按中央空调分组的售价统计 ===\n")
## 
## === 按中央空调分组的售价统计 ===
print(price_stats)
## # A tibble: 2 × 9
##   Central_Air 样本数量  平均值 中位数 标准差 最小值 最大值 第一四分位数
##   <chr>          <int>   <dbl>  <dbl>  <dbl>  <dbl>  <dbl>        <dbl>
## 1 N                196 101890.  98250 37597.  12789 265979        80000
## 2 Y               2734 186453. 166250 79121.  50000 755000       134500
## # ℹ 1 more variable: 第三四分位数 <dbl>
price_diff <- ames %>%
  group_by(Central_Air) %>%
  summarise(mean_price = mean(Sale_Price, na.rm = TRUE)) %>%
  spread(Central_Air, mean_price) %>%
  mutate(
    绝对差异 = Y - N,
    相对差异百分比 = (Y - N) / N * 100
  ) %>%
  mutate(
    绝对差异 = round(绝对差异, 2),
    相对差异百分比 = round(相对差异百分比, 2)
  )
cat("\n=== 售价差异分析 ===\n")
## 
## === 售价差异分析 ===
print(price_diff)
## # A tibble: 1 × 4
##         N       Y 绝对差异 相对差异百分比
##     <dbl>   <dbl>    <dbl>          <dbl>
## 1 101890. 186453.   84562.           83.0

5.2 统计前提检验

# 5.2.1 正态性检验
cat("\n=== 正态性检验 ===\n")
## 
## === 正态性检验 ===
# 有中央空调组
set.seed(42)  # 设置随机种子保证结果可重复
y_sample <- sample(ames$Sale_Price[ames$Central_Air == "Y"], 1000)
shapiro_y <- shapiro.test(y_sample)
ks_y <- lillie.test(y_sample) 

# 无中央空调组
n_prices <- ames$Sale_Price[ames$Central_Air == "N"]
shapiro_n <- shapiro.test(n_prices)
ks_n <- lillie.test(n_prices)

cat("有中央空调组:\n")
## 有中央空调组:
cat("Shapiro-Wilk检验 p值:", round(shapiro_y$p.value, 6), "\n")
## Shapiro-Wilk检验 p值: 0
cat("Lilliefors检验 p值:", round(ks_y$p.value, 6), "\n")
## Lilliefors检验 p值: 0
cat("\n无中央空调组:\n")
## 
## 无中央空调组:
cat("Shapiro-Wilk检验 p值:", round(shapiro_n$p.value, 6), "\n")
## Shapiro-Wilk检验 p值: 5e-06
cat("Lilliefors检验 p值:", round(ks_n$p.value, 6), "\n")
## Lilliefors检验 p值: 0.030081
cat("\n结论:p值 < 0.05,拒绝正态性假设,数据不符合正态分布\n")
## 
## 结论:p值 < 0.05,拒绝正态性假设,数据不符合正态分布
# 5.2.2 方差齐性检验
cat("\n=== 方差齐性检验 ===\n")
## 
## === 方差齐性检验 ===
levene_test <- leveneTest(Sale_Price ~ Central_Air, data = ames)
## Warning in leveneTest.default(y = y, group = group, ...): group coerced to
## factor.
print(levene_test)
## Levene's Test for Homogeneity of Variance (center = median)
##         Df F value    Pr(>F)    
## group    1  40.678 2.078e-10 ***
##       2928                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
cat("结论:p值 < 0.05,拒绝方差齐性假设,两组方差不齐\n")
## 结论:p值 < 0.05,拒绝方差齐性假设,两组方差不齐

5.3 分组对比检验(非参数检验)

cat("\n=== Mann-Whitney U检验 ===\n")
## 
## === Mann-Whitney U检验 ===
mw_test <- wilcox.test(Sale_Price ~ Central_Air, data = ames, 
                       alternative = "two.sided",  
                       conf.int = TRUE)             

print(mw_test)
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  Sale_Price by Central_Air
## W = 63164, p-value < 2.2e-16
## alternative hypothesis: true location shift is not equal to 0
## 95 percent confidence interval:
##  -77400 -63000
## sample estimates:
## difference in location 
##                 -70000
cat("\n=== 检验结果解释 ===\n")
## 
## === 检验结果解释 ===
if (mw_test$p.value < 0.001) {
  sig_level <- "p < 0.001"
} else if (mw_test$p.value < 0.01) {
  sig_level <- "p < 0.01"
} else if (mw_test$p.value < 0.05) {
  sig_level <- "p < 0.05"
} else {
  sig_level <- "p ≥ 0.05"
}

cat("Mann-Whitney U检验结果:", sig_level, "\n")
## Mann-Whitney U检验结果: p < 0.001
if (mw_test$p.value < 0.05) {
  cat("结论:在0.05显著性水平下,拒绝原假设,有无中央空调的房屋售价存在显著差异\n")
} else {
  cat("结论:在0.05显著性水平下,不能拒绝原假设,有无中央空调的房屋售价无显著差异\n")
}
## 结论:在0.05显著性水平下,拒绝原假设,有无中央空调的房屋售价存在显著差异

5.4 数据可视化

# 5.4.1 箱线图
boxplot_plot <- ggplot(ames, aes(x = Central_Air, y = Sale_Price, fill = Central_Air)) +
  geom_boxplot(alpha = 0.7, width = 0.6) +
  stat_summary(fun = mean, geom = "point", shape = 21, size = 4, 
               color = "white", fill = "black") +
  stat_summary(fun = mean, geom = "text", aes(label = paste0("$", round(..y.., 0))),
               vjust = -1, size = 3.5) +
  scale_fill_manual(values = c("#e74c3c", "#3498db"), 
                    labels = c("无中央空调", "有中央空调")) +
  labs(
    x = "中央空调状态",
    y = "房屋售价",
    title = "有无中央空调对房屋售价的影响",
    subtitle = paste0("Mann-Whitney U检验: p = ", round(mw_test$p.value, 6)),
    fill = "中央空调状态"
  ) +
  theme_minimal() +
  theme(
    plot.title = element_text(hjust = 0.5, size = 14, face = "bold"),
    plot.subtitle = element_text(hjust = 0.5, size = 12),
    axis.title = element_text(size = 12),
    axis.text = element_text(size = 10),
    legend.title = element_text(size = 12),
    legend.text = element_text(size = 10)
  ) +
  scale_y_continuous(labels = scales::dollar_format()) +
  scale_x_discrete(labels = c("无中央空调", "有中央空调"))

# 5.4.2 直方图
histogram_plot <- ggplot(ames, aes(x = Sale_Price, fill = Central_Air)) +
  geom_histogram(alpha = 0.6, position = "identity", bins = 30) +
  facet_wrap(~Central_Air, labeller = labeller(Central_Air = c("N" = "无中央空调", "Y" = "有中央空调"))) +
  scale_fill_manual(values = c("#e74c3c", "#3498db"), guide = "none") +
  labs(
    x = "房屋售价",
    y = "频数",
    title = "有无中央空调房屋售价分布直方图"
  ) +
  theme_minimal() +
  theme(
    plot.title = element_text(hjust = 0.5, size = 14, face = "bold"),
    axis.title = element_text(size = 12),
    axis.text = element_text(size = 10),
    strip.text = element_text(size = 12)
  ) +
  scale_x_continuous(labels = scales::dollar_format())
combined_plot <- grid.arrange(boxplot_plot, histogram_plot, ncol = 1, heights = c(1, 0.8))
## Warning: The dot-dot notation (`..y..`) was deprecated in ggplot2 3.4.0.
## ℹ Please use `after_stat(y)` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

6.缺失值处理

# 设置随机种子
set.seed(42)
# 6. 缺失值处理
# 6.1 查看缺失值比例
missing_ratio <- ames %>%
  summarise_all(~sum(is.na(.))/n() * 100) %>%
  pivot_longer(everything(), names_to = "变量", values_to = "缺失比例(%)") %>%
  filter(`缺失比例(%)` > 0) %>%
  arrange(desc(`缺失比例(%)`))

cat("\n=== 缺失值统计(仅显示有缺失的变量) ===\n")
## 
## === 缺失值统计(仅显示有缺失的变量) ===
print(head(missing_ratio, 10)) 
## # A tibble: 0 × 2
## # ℹ 2 variables: 变量 <chr>, 缺失比例(%) <dbl>
# 6.2 处理缺失值
# - 删除缺失比例>50%的变量

library(dplyr)

high_missing_vars <- missing_ratio %>%
  filter(`缺失比例(%)` > 50) %>%
  pull(`变量`)

ames <- ames %>% dplyr::select(-all_of(high_missing_vars))

# 6.3 数值变量:用中位数填充
# 6.3.1 筛选所有数值型变量
num_vars <- ames %>% select_if(is.numeric) %>% names()
ames[num_vars] <- ames[num_vars] %>%
  mutate(
    across(
      everything(),  
      ~ifelse(
        . == 0,  
        median(.[. != 0], na.rm = TRUE),  
        .  
      )
    )
  )

# 6.4 分类变量
cat_vars <- ames %>% select_if(is.character) %>% names()
ames[cat_vars] <- ames[cat_vars] %>%
  mutate(across(everything(), ~replace_na(., names(sort(table(.), decreasing = TRUE))[1])))

# 6.5 异常值处理
iqr <- IQR(ames$Sale_Price, na.rm = TRUE)
q1 <- quantile(ames$Sale_Price, 0.25, na.rm = TRUE)
q3 <- quantile(ames$Sale_Price, 0.75, na.rm = TRUE)
lower_bound <- q1 - 1.5 * iqr
upper_bound <- q3 + 1.5 * iqr

# 6.6删除售价异常值
ames_clean <- ames %>% filter(Sale_Price >= lower_bound & Sale_Price <= upper_bound)
cat("\n=== 异常值处理结果 ===\n")
## 
## === 异常值处理结果 ===
cat("原始数据量:", nrow(ames), "\n")
## 原始数据量: 2930
cat("清洗后数据量:", nrow(ames_clean), "\n")
## 清洗后数据量: 2793
cat("删除异常值数量:", nrow(ames) - nrow(ames_clean), "\n")
## 删除异常值数量: 137
# 6.7 变量类型转换
cat_vars <- c(
  # 一、字符型分类变量(35个)
  "MS_SubClass", "MS_Zoning", "Street", "Alley", "Lot_Shape", 
  "Land_Contour", "Utilities", "Lot_Config", "Land_Slope", "Neighborhood", 
  "Condition_1", "Condition_2", "Bldg_Type", "House_Style", "Roof_Style", 
  "Roof_Matl", "Exterior_1st", "Exterior_2nd", "Mas_Vnr_Type", "Exter_Qual", 
  "Exter_Cond", "Foundation", "Bsmt_Qual", "Bsmt_Cond", "Bsmt_Exposure", 
  "BsmtFin_Type_1", "BsmtFin_Type_2", "Heating", "Heating_QC", "Central_Air", 
  "Electrical", "Kitchen_Qual", "Functional", "Fireplace_Qu", "Garage_Type", 
  "Garage_Finish", "Garage_Qual", "Garage_Cond", "Paved_Drive", "Pool_QC", 
  "Fence", "Misc_Feature", "Sale_Type", "Sale_Condition",
  # 二、数值编码分类变量
  "Overall_Qual", "Overall_Cond", "Mo_Sold"
)
ames_clean[cat_vars] <- ames_clean[cat_vars] %>% mutate(across(everything(), as.factor))

# 6.8 数据划分
train_index <- createDataPartition(ames_clean$Sale_Price, p = 0.7, list = FALSE)
train_data <- ames_clean[train_index, ]
test_data <- ames_clean[-train_index, ]

cat("\n=== 数据划分结果 ===\n")
## 
## === 数据划分结果 ===
cat("训练集数量:", nrow(train_data), "\n")
## 训练集数量: 1957
cat("测试集数量:", nrow(test_data), "\n")
## 测试集数量: 836

7.1 计算数值变量相关系数(Pearson)

num_vars_clean <- train_data %>% 
  select_if(is.numeric) %>% 
  names()
cor_matrix <- cor(train_data[num_vars_clean], use = "complete.obs")


library(dplyr)

sale_corr <- as.data.frame(cor_matrix[, "Sale_Price"]) %>%  
  rename(correlation = 1) %>% 
  mutate(variable = rownames(.)) %>% 
  filter(variable != "sale_Price") %>% 
  arrange(desc(abs(correlation))) %>% 
  head(15)



plot_vars <- c(sale_corr$variable, "Sale_Price")
cor_plot_matrix <- cor_matrix[plot_vars, plot_vars]


cat("\n=== 与房屋售价相关性Top15变量热力图 ===\n")
## 
## === 与房屋售价相关性Top15变量热力图 ===
corrplot(
  cor_plot_matrix,
  method = "color",
  type = "upper",
  addCoef.col = "black",
  number.cex = 0.6,
  tl.cex = 0.7,
  main = "与房屋售价相关性Top15变量热力图",
  mar = c(1, 1, 2, 1)
)

# 7.1.5 输出相关性结果
cat("\n=== 与房屋售价相关性Top10变量 ===\n")
## 
## === 与房屋售价相关性Top10变量 ===
print(head(sale_corr, 10))
##                correlation       variable
## Sale_Price       1.0000000     Sale_Price
## Gr_Liv_Area      0.6558209    Gr_Liv_Area
## Year_Built       0.5908400     Year_Built
## Full_Bath        0.5797702      Full_Bath
## Garage_Cars      0.5688825    Garage_Cars
## Year_Remod_Add   0.5612162 Year_Remod_Add
## Garage_Area      0.5532755    Garage_Area
## First_Flr_SF     0.5216260   First_Flr_SF
## Total_Bsmt_SF    0.5069790  Total_Bsmt_SF
## TotRms_AbvGrd    0.4546589  TotRms_AbvGrd

7.2变量筛选

#7.2.1方法1:单变量筛选
# 选择相关系数绝对值>0.3的变量
selected_vars_corr <- sale_corr %>%
  filter(abs(correlation) > 0.3) %>%
  pull(variable)  
cat("\n=== 单变量筛选结果 ===\n")
## 
## === 单变量筛选结果 ===
print(selected_vars_corr)
##  [1] "Sale_Price"     "Gr_Liv_Area"    "Year_Built"     "Full_Bath"     
##  [5] "Garage_Cars"    "Year_Remod_Add" "Garage_Area"    "First_Flr_SF"  
##  [9] "Total_Bsmt_SF"  "TotRms_AbvGrd"  "Second_Flr_SF"  "Longitude"
# 7.2.2 方法2:逐步回归
# 构建初始线性模型
initial_model <- lm(Sale_Price ~ ., data = train_data[num_vars_clean])
# 逐步回归
step_model <- stepAIC(initial_model, direction = "both", trace = FALSE)
selected_vars_step <- names(coef(step_model))[-1] 
cat("\n=== 逐步回归筛选结果 ===\n")
## 
## === 逐步回归筛选结果 ===
print(selected_vars_step)
##  [1] "Lot_Area"       "Year_Built"     "Year_Remod_Add" "BsmtFin_SF_1"  
##  [5] "BsmtFin_SF_2"   "Total_Bsmt_SF"  "First_Flr_SF"   "Second_Flr_SF" 
##  [9] "Gr_Liv_Area"    "Bsmt_Full_Bath" "Bsmt_Half_Bath" "Full_Bath"     
## [13] "Bedroom_AbvGr"  "Kitchen_AbvGr"  "TotRms_AbvGrd"  "Fireplaces"    
## [17] "Garage_Cars"    "Garage_Area"    "Wood_Deck_SF"   "Enclosed_Porch"
## [21] "Screen_Porch"   "Misc_Val"       "Year_Sold"      "Latitude"
# 7.2.3 方法3:LASSO回归
x <- model.matrix(Sale_Price ~ ., data = train_data[num_vars_clean])[, -1] 
y <- train_data$Sale_Price
lasso_cv <- cv.glmnet(x, y, alpha = 1, nfolds = 10)  
lasso_coef <- coef(lasso_cv, s = "lambda.min")
selected_vars_lasso <- rownames(lasso_coef)[lasso_coef[, 1] != 0][-1]  
cat("\n=== LASSO回归筛选结果 ===\n")
## 
## === LASSO回归筛选结果 ===
print(selected_vars_lasso)
##  [1] "Lot_Area"       "Year_Built"     "Year_Remod_Add" "BsmtFin_SF_1"  
##  [5] "BsmtFin_SF_2"   "Total_Bsmt_SF"  "First_Flr_SF"   "Second_Flr_SF" 
##  [9] "Gr_Liv_Area"    "Full_Bath"      "Kitchen_AbvGr"  "Fireplaces"    
## [13] "Garage_Cars"    "Garage_Area"    "Wood_Deck_SF"   "Misc_Val"      
## [17] "Latitude"

7.3确定最终建模变量

final_vars <- intersect(intersect(selected_vars_corr, selected_vars_step), selected_vars_lasso)
if (length(final_vars) == 0) final_vars <- unique(c(selected_vars_corr, selected_vars_step, selected_vars_lasso))
cat("\n=== 最终建模变量===\n")
## 
## === 最终建模变量===
print(final_vars)
## [1] "Gr_Liv_Area"    "Year_Built"     "Full_Bath"      "Garage_Cars"   
## [5] "Year_Remod_Add" "Garage_Area"    "First_Flr_SF"   "Total_Bsmt_SF" 
## [9] "Second_Flr_SF"

7.4准备最终建模数据

library(dplyr)

final_vars <- as.character(final_vars)

train_model_data <- train_data %>% 
  dplyr::select(one_of(c(final_vars, "Sale_Price")))  
test_model_data <- test_data %>% 
  dplyr::select(one_of(c(final_vars, "Sale_Price"))) 

# 7.4.1构建线性模型
lm_model <- lm(Sale_Price ~ ., data = train_model_data)
cat("\n=== 线性回归模型Summary ===\n")
## 
## === 线性回归模型Summary ===
print(summary(lm_model))
## 
## Call:
## lm(formula = Sale_Price ~ ., data = train_model_data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -376995  -17233   -1869   16230  133152 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    -2.248e+06  8.275e+04 -27.163  < 2e-16 ***
## Gr_Liv_Area     4.906e+01  2.556e+00  19.194  < 2e-16 ***
## Year_Built      5.779e+02  3.466e+01  16.675  < 2e-16 ***
## Full_Bath      -7.910e+02  1.984e+03  -0.399 0.690176    
## Garage_Cars     6.132e+03  2.364e+03   2.594 0.009551 ** 
## Year_Remod_Add  5.769e+02  4.503e+01  12.812  < 2e-16 ***
## Garage_Area     2.233e+01  8.060e+00   2.770 0.005660 ** 
## First_Flr_SF    1.782e+01  4.272e+00   4.173 3.14e-05 ***
## Total_Bsmt_SF   7.448e+00  4.037e+00   1.845 0.065210 .  
## Second_Flr_SF   1.870e+01  5.326e+00   3.511 0.000456 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 32200 on 1947 degrees of freedom
## Multiple R-squared:  0.7092, Adjusted R-squared:  0.7078 
## F-statistic: 527.5 on 9 and 1947 DF,  p-value: < 2.2e-16
lm_pred <- predict(lm_model, newdata = test_model_data)
lm_metrics <- data.frame(
  模型 = "线性回归",
  RMSE = sqrt(mean((lm_pred - test_model_data$Sale_Price)^2)),
  MAE = mean(abs(lm_pred - test_model_data$Sale_Price)),
  R2 = cor(lm_pred, test_model_data$Sale_Price)^2
)
cat("\n=== 线性回归模型测试集评估 ===\n")
## 
## === 线性回归模型测试集评估 ===
print(lm_metrics)
##       模型     RMSE      MAE        R2
## 1 线性回归 30260.47 22198.23 0.7244256
# 7.4.2构建多项式回归模型
poly_var <- if ("Gr_LIV_Area" %in% final_vars) "Gr_LIV_Area" else final_vars[1]
train_poly_data <- train_model_data %>%
  mutate(
    poly_2 = .data[[poly_var]] ^ 2,  
    poly_3 = .data[[poly_var]] ^ 3   
  )
test_poly_data <- test_model_data %>% 
  mutate(
    poly_2 = .data[[poly_var]] ^ 2,  
    poly_3 = .data[[poly_var]] ^ 3   
  )

# 构建多项式回归模型
train_poly_data_filtered <- train_poly_data[, !names(train_poly_data) %in% poly_var, drop = FALSE]
poly_model <- lm(Sale_Price ~ ., data = train_poly_data_filtered)
test_poly_data_filtered <- test_poly_data[, names(test_poly_data) != poly_var, drop = FALSE]


poly_pred <- predict(poly_model, newdata = test_poly_data_filtered)
poly_metrics <- data.frame(
  模型 = "多项式回归",
  RMSE = sqrt(mean((poly_pred - test_poly_data$Sale_Price)^2)),
  MAE = mean(abs(poly_pred - test_poly_data$Sale_Price)),
  R2 = cor(poly_pred, test_poly_data$Sale_Price)^2
)
cat("\n=== 多项式回归模型测试集评估 ===\n")
## 
## === 多项式回归模型测试集评估 ===
print(poly_metrics)
##         模型     RMSE      MAE        R2
## 1 多项式回归 28898.29 21293.65 0.7497456
y_test <- test_model_data$Sale_Price

if (!require(kknn)) install.packages("kknn")
## Loading required package: kknn
## Warning: package 'kknn' was built under R version 4.5.2
## 
## Attaching package: 'kknn'
## The following object is masked from 'package:caret':
## 
##     contr.dummy
library(kknn)

# 核回归
set.seed(42)
kknn_model <- train.kknn(
  Sale_Price ~ ., 
  data = train_model_data,  
  kmax = 200,  
  kernel = "epanechnikov",  
  scale = TRUE  
)



np_pred <- predict(kknn_model, newdata = test_model_data)

np_metrics <- data.frame(
  模型 = "核回归",
  RMSE = sqrt(mean((np_pred - y_test)^2)),
  MAE = mean(abs(np_pred - y_test)),
  R2 = cor(np_pred, y_test)^2
)
print(np_metrics)
##     模型    RMSE     MAE        R2
## 1 核回归 27145.1 19264.5 0.7782424

8.三种模型效果对比

cat("\n=== 三种模型效果对比 ===\n")
## 
## === 三种模型效果对比 ===
if (!require(grid)) install.packages("grid")
## Loading required package: grid
if (!require(gridExtra)) install.packages("gridExtra")
if (!require(dplyr)) install.packages("dplyr")
if (!require(tidyr)) install.packages("tidyr")
if (!require(ggplot2)) install.packages("ggplot2")

library(grid)
library(gridExtra)
library(dplyr)
library(tidyr)
library(ggplot2)

model_comparison <- bind_rows(
  lm_metrics,
  poly_metrics,
  np_metrics
) %>%
  mutate(
    RMSE = round(RMSE, 2),
    MAE = round(MAE, 2),
    R2 = round(R2, 4)
  )


cat("\n=== 模型评估指标对比表 ===\n")
## 
## === 模型评估指标对比表 ===
print(model_comparison)
##         模型     RMSE      MAE     R2
## 1   线性回归 30260.47 22198.23 0.7244
## 2 多项式回归 28898.29 21293.65 0.7497
## 3     核回归 27145.10 19264.50 0.7782
colnames(model_comparison) <- c("Model", "RMSE", "MAE", "R2")

error_metrics <- model_comparison %>%
  dplyr::select(Model, RMSE, MAE) %>%
  tidyr::pivot_longer(cols = c(RMSE, MAE),
    names_to = "Metric",
    values_to = "Value")

model_labels <- c(
  "线性回归" = "Linear Regression",
  "多项式回归" = "Polynomial Regression",
  "核回归" = "Kernel Regression"
)

error_metrics <- error_metrics %>%
  mutate(Model_CN = names(model_labels)[match(Model, model_labels)]) %>%
  mutate(Model_CN = ifelse(is.na(Model_CN), Model, Model_CN))

cat("\n=== 误差指标对比图===\n")
## 
## === 误差指标对比图===
p_error <- ggplot(error_metrics, aes(x = Model_CN, y = Value, fill = Metric)) +
  geom_bar(stat = "identity", position = position_dodge(0.8), alpha = 0.8) +
  geom_text(aes(label = Value),
    position = position_dodge(0.8),
    vjust = -0.5,
    size = 3.5
  ) +
  labs(
    title = "模型误差指标对比",
    subtitle = "数值越小表示模型预测越准确",
    x = "模型",
    y = "误差值",
    fill = "指标"
  ) +
  scale_fill_manual(values = c("#FF6B6B", "#4ECDC4"),
    labels = c("RMSE", "MAE")) +
  theme_minimal() +
  theme(
    plot.title = element_text(hjust = 0.5, size = 14, face = "bold"),
    plot.subtitle = element_text(hjust = 0.5, size = 11),
    axis.text.x = element_text(angle = 15, hjust = 1)
  )

print(p_error)

r2_data <- model_comparison %>%
  mutate(Model_CN = names(model_labels)[match(Model, model_labels)]) %>%
  mutate(Model_CN = ifelse(is.na(Model_CN), Model, Model_CN))

cat("\n=== 拟合优度对比图 ===\n")
## 
## === 拟合优度对比图 ===
p_r2 <- ggplot(r2_data, aes(x = Model_CN, y = R2, fill = Model_CN)) +
  geom_bar(stat = "identity", alpha = 0.8, width = 0.6) +
  geom_text(aes(label = paste0(round(R2 * 100, 1), "%")),
    vjust = -0.5,
    size = 4,
    fontface = "bold"
  ) +
  geom_hline(yintercept = 0.8, linetype = "dashed", color = "red", alpha = 0.5) +
  annotate("text", x = 2.5, y = 0.82,
    label = "优秀阈值 (R² > 0.8)",
    color = "red", size = 3
  ) +
  labs(
    title = "模型拟合优度对比",
    subtitle = "R²越接近1表示模型解释力越强",
    x = "模型",
    y = "决定系数 "
  ) +
  scale_fill_manual(values = c("#3498DB", "#2ECC71", "#9B59B6")) +
  theme_minimal() +
  theme(
    plot.title = element_text(hjust = 0.5, size = 14, face = "bold"),
    plot.subtitle = element_text(hjust = 0.5, size = 11),
    axis.text.x = element_text(angle = 15, hjust = 1),
    legend.position = "none"
  ) +
  ylim(0, 1)

print(p_r2)

cat("\n=== 实际值 vs 预测值对比图 ===\n")
## 
## === 实际值 vs 预测值对比图 ===
comparison_data <- data.frame(
  Actual = rep(test_model_data$Sale_Price, 3),
  Predicted = c(lm_pred, poly_pred, np_pred),
  Model = rep(c("Linear Regression", "Polynomial Regression", "Kernel Regression"),
    each = length(test_model_data$Sale_Price)
  )
)

comparison_data <- comparison_data %>%
  mutate(Model_CN = case_when(
    Model == "Linear Regression" ~ "线性回归",
    Model == "Polynomial Regression" ~ "多项式回归",
    Model == "Kernel Regression" ~ "核回归",
    TRUE ~ Model
  ))

r2_labels <- model_comparison %>%
  mutate(
    label = paste0("R² = ", round(R2, 3)),
    Model_CN = names(model_labels)[match(Model, model_labels)]
  ) %>%
  mutate(Model_CN = ifelse(is.na(Model_CN), Model, Model_CN))

p_scatter <- ggplot(comparison_data, aes(x = Actual, y = Predicted, color = Model_CN)) +
  geom_point(alpha = 0.4, size = 1.5) +
  geom_abline(slope = 1, intercept = 0,
    color = "red", linetype = "dashed", size = 0.8
  ) +
  facet_wrap(~Model_CN, ncol = 3) +
  geom_text(data = r2_labels,
    aes(x = Inf, y = -Inf, label = label),
    hjust = 1.1, vjust = -0.5,
    size = 4, color = "black", fontface = "bold"
  ) +
  labs(
    title = "实际值 vs 预测值对比",
    subtitle = "红色虚线完美预测线",
    x = "实际售价",
    y = "预测售价"
  ) +
  scale_color_manual(values = c("#3498DB", "#2ECC71", "#9B59B6")) +
  scale_x_continuous(labels = scales::dollar_format()) +
  scale_y_continuous(labels = scales::dollar_format()) +
  theme_minimal() +
  theme(
    plot.title = element_text(hjust = 0.5, size = 14, face = "bold"),
    plot.subtitle = element_text(hjust = 0.5, size = 11),
    strip.text = element_text(size = 11, face = "bold"),
    legend.position = "none"
  )
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
print(p_scatter)

if (!require(patchwork)) install.packages("patchwork")
## Loading required package: patchwork
## Warning: package 'patchwork' was built under R version 4.5.2
## 
## Attaching package: 'patchwork'
## The following object is masked from 'package:MASS':
## 
##     area
library(patchwork)

cat("\n=== 模型对比综合图 ===\n")
## 
## === 模型对比综合图 ===
combined_plot <- p_error / p_r2 / p_scatter +
  plot_layout(heights = c(1, 1, 2)) +  # 调整三个子图的高度比例
  plot_annotation(
    title = "三种回归模型效果对比",
    theme = theme(plot.title = element_text(size = 16, face = "bold", hjust = 0.5))
  )


print(combined_plot)

cat("\n=== 模型选择建议 ===\n")
## 
## === 模型选择建议 ===
best_rmse_model <- r2_data$Model_CN[which.min(r2_data$RMSE)]
best_mae_model <- r2_data$Model_CN[which.min(r2_data$MAE)]
best_r2_model <- r2_data$Model_CN[which.max(r2_data$R2)]

cat("1. 基于RMSE最优模型:", best_rmse_model, "\n")
## 1. 基于RMSE最优模型: 核回归
cat("2. 基于MAE最优模型:", best_mae_model, "\n")
## 2. 基于MAE最优模型: 核回归
cat("3. 基于R²最优模型:", best_r2_model, "\n\n")
## 3. 基于R²最优模型: 核回归
# 综合评估
if (best_rmse_model == best_mae_model && best_mae_model == best_r2_model) {
  cat("综合结论:", best_rmse_model, "在所有指标上表现最优,推荐使用。\n")
} else {
  cat("综合结论:各模型在不同指标上各有优势:\n")
  cat("  - 若关注整体预测精度,推荐", best_rmse_model, "\n")
  cat("  - 若关注预测偏差的稳定性,推荐", best_mae_model, "\n")
  cat("  - 若关注模型解释力,推荐", best_r2_model, "\n")
}
## 综合结论: 核回归 在所有指标上表现最优,推荐使用。