---
title: "探索性数据分析-ames房屋价格"
format: html
editor: visual
toc: true
toc-location: right
code-tools: true
code-fold: false
---
# 1、背景介绍
这是一份 Ames 房屋数据，您可以把它想象为房屋中介推出的成都市武侯区、锦江区以及高新区等各区县的房屋信息。

# 2、导入数据
```{r}
# Attach the tidyverse, which supplies read_csv() here and the pipe, dplyr
# verbs and ggplot2 functions used by the later chunks.
library(tidyverse)

# Load the raw house-price CSV from its absolute local path into `ames`,
# the dataset every later chunk starts from.
ames <- read_csv(
  file = "D:/project/R/ames房屋价格_20250105/ames_houseprice.csv"
)
```
# 3、数据结构
```{r}
# Packages for reading Excel workbooks and for styling HTML tables.
library(readxl)
library(kableExtra)

# Read sheet "Sheet1" of the explanation workbook
# (presumably the data dictionary for the CSV columns -- confirm).
ExcelData <- read_excel(
  path = "D:/project/R/ames房屋价格_20250105/ames_houseprice_explanation.xlsx",
  sheet = "Sheet1"
)

# Render the data frame as an HTML table and apply Bootstrap styling
# (striped rows, hover highlight, condensed spacing, responsive width).
kable_styling(
  kable(ExcelData, format = "html", booktabs = TRUE),
  bootstrap_options = c("striped", "hover", "condensed", "responsive")
)
```
```{r}
# Optional structure check of the raw data (column names and types);
# left disabled, uncomment to run it.
# str(ames)
```
# 4、数据处理
```{r}
# Count the missing values in each column of the raw dataset and show
# the per-column totals.
missing1 <- ames %>%
  is.na() %>%
  colSums()
missing1
```
```{r}
# Keep only the four columns analysed below: sale price, neighborhood,
# living area and construction year. Then display the result.
data <- select(ames, SalePrice, Neighborhood, GrLivArea, YearBuilt)
data
```
```{r}
# Re-count missing values, this time only for the selected analysis columns.
missing2 <- data %>%
  is.na() %>%
  colSums()
missing2
```
```{r}
# Drop rows that contain missing values, then re-count the NAs per column.
# This step is currently disabled, so `data` keeps any rows with NAs;
# uncomment the lines below to enable it.
# data <- data %>%
# drop_na()
# missing3 <- colSums(is.na(data))
# missing3
```
# 5、数据分析
## 5.1、各区域的房屋数量和价格均值
```{r}
# Number of houses per neighborhood, ordered from most to fewest.
# (Unsorted alternative: data %>% count(Neighborhood))
neighborhood_counts <- count(data, Neighborhood, sort = TRUE)
print(neighborhood_counts)
```
```{r}
# Horizontal bar chart of the mean sale price per neighborhood, ordered by
# that mean. Each bar is labelled with its rounded value.
data %>%
  group_by(Neighborhood) %>%
  summarise(
    # na.rm = TRUE matches the yearly summary in section 5.2; rows with NA are
    # never dropped, so without it one missing SalePrice would turn the whole
    # neighborhood's mean into NA.
    mean_sale = mean(SalePrice, na.rm = TRUE)
  ) %>%
  ggplot(
    aes(x = mean_sale, y = fct_reorder(Neighborhood, mean_sale))
  ) +
  # Two-colour fill: neighborhoods whose mean is below 180000 vs. the rest.
  geom_col(aes(fill = mean_sale < 180000), show.legend = FALSE) +
  # hjust = 1 right-aligns each label at the end of its bar.
  geom_text(aes(label = round(mean_sale, 0)), hjust = 1) +
  scale_x_continuous(
    expand = c(0, 0), # bars start flush against the axis
    labels = scales::dollar
  ) +
  # scale_fill_viridis_d(option = "D") +
  scale_fill_manual(values = c("TRUE" = "Gray", "FALSE" = "Pink")) +
  theme_classic() +
  labs(x = "价格", y = "位置")
```
## 5.2、房屋价格均值趋势
```{r}
# Summarise the data: mean sale price of the houses built in each year.
summarized_data <- data %>%
  group_by(YearBuilt) %>%
  summarise(AverageSalePrice = mean(SalePrice, na.rm = TRUE))

# Trend chart of the mean sale price by construction year.
ggplot(summarized_data, aes(x = YearBuilt, y = AverageSalePrice)) +
  # One line layer and one point layer only; the unstyled duplicates that
  # used to sit underneath the blue ones are removed.
  # `linewidth` replaces `size` for lines (deprecated since ggplot2 3.4.0).
  geom_line(color = "#1f77b4", linewidth = 0.5) +
  geom_point(color = "#1f77b4", size = 1) +
  scale_y_continuous(labels = scales::label_dollar()) +
  labs(
    title = "房屋价格均值",
    x = "年份",
    y = "均值"
  ) +
  # The complete theme must come first: theme_minimal() replaces every theme
  # setting added before it, so the tweaks below have to follow it.
  theme_minimal() +
  theme(
    # Centered bold title
    plot.title = element_text(hjust = 0.5, face = "bold", size = 16),
    # Axis title and tick label sizes
    axis.title = element_text(size = 14),
    axis.text = element_text(size = 12),
    # Light gray major grid lines
    panel.grid.major = element_line(colour = "#d3d3d3", linewidth = 0.5)
  )
```
## 5.3、房屋面积和价格的关系
```{r}
# Scatter plot of living area against sale price with a linear fit.
ggplot(data, aes(x = GrLivArea, y = SalePrice)) +
  # Semi-transparent points so dense regions remain readable.
  geom_point(alpha = 0.6, color = "#1f77b4") +
  # Straight-line fit without a confidence band. The formula is spelled out
  # (it is the default) so geom_smooth() does not print a message about it
  # into the rendered document.
  geom_smooth(
    method = "lm",
    formula = y ~ x,
    color = "#ff7f0e",
    se = FALSE
  ) +
  labs(
    title = "房屋面积与价格的关系",
    x = "房屋面积",
    y = "价格"
  ) +
  theme_minimal() +
  theme(
    # Centered bold title
    plot.title = element_text(hjust = 0.5, face = "bold", size = 16),
    # Axis title and tick label sizes
    axis.title = element_text(size = 14),
    axis.text = element_text(size = 12),
    # Light gray major grid lines; `linewidth` replaces the `size` argument
    # deprecated in ggplot2 3.4.0.
    panel.grid.major = element_line(colour = "#d3d3d3", linewidth = 0.5)
  ) +
  # Format the y axis labels as dollars.
  scale_y_continuous(labels = scales::label_dollar()) +
  # Extend both axes 10% past the largest value so points do not touch the
  # panel edge. na.rm = TRUE keeps the limits finite when values are missing
  # (rows with NA are never dropped from `data`).
  coord_cartesian(
    xlim = c(0, max(data$GrLivArea, na.rm = TRUE) * 1.1),
    ylim = c(0, max(data$SalePrice, na.rm = TRUE) * 1.1)
  )
```