##1.读取数据并查看前十行

#加载包
library(readxl)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
#读取数据
df = read_excel("C:/Users/hzx/Desktop/beijing.xlsx",sheet = 1)

#展示前10行
head(df,10)

变量解释 CATE: 城区(分类变量)- chaoyang(朝阳区)、haidian(海淀区)、fengtai(丰台区)、xicheng(西城区)、dongcheng(东城区)、shijingshan(石景山区) AREA: 房屋面积(数值变量) subway: 是否邻近地铁(二分类变量) school: 是否为学区房(二分类变量) price: 单位面积房价(数值变量)

##2.将school及subway列数据转换为因子型

#查看转换前的数据类型
cat("转换前的数据类型:\n")
## 转换前的数据类型:
str(df[, c("subway", "school")])
## tibble [16,210 × 2] (S3: tbl_df/tbl/data.frame)
##  $ subway: num [1:16210] 1 1 1 1 1 1 1 1 1 1 ...
##  $ school: num [1:16210] 0 0 1 1 1 1 1 1 1 0 ...
#将subway和school转换为因子型
df$subway <- as.factor(df$subway)
df$school <- as.factor(df$school)

#查看转换后的数据类型
cat("\n转换后的数据类型:\n")
## 
## 转换后的数据类型:
str(df[, c("subway", "school")])
## tibble [16,210 × 2] (S3: tbl_df/tbl/data.frame)
##  $ subway: Factor w/ 2 levels "0","1": 2 2 2 2 2 2 2 2 2 2 ...
##  $ school: Factor w/ 2 levels "0","1": 1 1 2 2 2 2 2 2 2 1 ...

##描述性统计

summary(df)
##      CATE                AREA        subway    school        price       
##  Length:16210       Min.   : 30.06   0: 2791   0:11297   Min.   : 18348  
##  Class :character   1st Qu.: 60.00   1:13419   1: 4913   1st Qu.: 42812  
##  Mode  :character   Median : 78.83                       Median : 57473  
##                     Mean   : 91.75                       Mean   : 61152  
##                     3rd Qu.:110.52                       3rd Qu.: 76100  
##                     Max.   :299.00                       Max.   :149871
cat("\n========== 数值变量统计 ==========\n")
## 
## ========== 数值变量统计 ==========
#面积统计
cat("AREA(面积):\n")
## AREA(面积):
cat("  最小值:", min(df$AREA), "㎡\n")
##   最小值: 30.06 ㎡
cat("  最大值:", max(df$AREA), "㎡\n")
##   最大值: 299 ㎡
cat("  平均值:", round(mean(df$AREA), 2), "㎡\n")
##   平均值: 91.75 ㎡
cat("  中位数:", median(df$AREA), "㎡\n")
##   中位数: 78.83 ㎡
#价格统计
cat("\nprice(单价):\n")
## 
## price(单价):
cat("  最小值:", min(df$price), "元/㎡\n")
##   最小值: 18348 元/㎡
cat("  最大值:", max(df$price), "元/㎡\n")
##   最大值: 149871 元/㎡
cat("  平均值:", round(mean(df$price), 2), "元/㎡\n")
##   平均值: 61151.81 元/㎡
cat("  中位数:", median(df$price), "元/㎡\n")
##   中位数: 57473 元/㎡
#分类变量统计
cat("\n========== 分类变量统计 ==========\n")
## 
## ========== 分类变量统计 ==========
cat("CATE(城区):\n")
## CATE(城区):
print(table(df$CATE))
## 
##    chaoyang   dongcheng     fengtai     haidian shijingshan     xicheng 
##        2864        2783        2947        2919        1947        2750
cat("\nsubway(是否近地铁):\n")
## 
## subway(是否近地铁):
print(table(df$subway))
## 
##     0     1 
##  2791 13419
cat("\nschool(是否学区房):\n")
## 
## school(是否学区房):
print(table(df$school))
## 
##     0     1 
## 11297  4913

##3.筛选 AREA > 250 且 price > 100000 的数据

high_end <- df[df$AREA > 250 & df$price > 100000, ]

##查看结果
cat("符合条件的房源数:", nrow(high_end), "\n")
## 符合条件的房源数: 16
high_end

##4.将数据按照 price 降序排列

df_sorted <- df[order(-df$price), ]

# 查看前10行
head(df_sorted, 10)

##5.在原数据集上增加一列房屋总价,命名为totalprice

df$totalprice <- df$AREA * df$price

#查看前几行
head(df[, c("AREA", "price", "totalprice")], 10)

##6.按照 CATE 进行分类,计算price的最大值、最小值和平均值

result <- aggregate(price ~ CATE, data = df, 
                    FUN = function(x) c(max = max(x), min = min(x), mean = mean(x)))

#查看结果
print(result)
##          CATE price.max price.min price.mean
## 1    chaoyang 124800.00  23011.00   52800.62
## 2   dongcheng 149254.00  20089.00   71883.60
## 3     fengtai  87838.00  18348.00   42500.90
## 4     haidian 135105.00  25568.00   68757.60
## 5 shijingshan 100000.00  18854.00   40286.89
## 6     xicheng 149871.00  21918.00   85674.78

##7.分别计算海淀区靠近地铁的学区房以及非学区房的totalprice均值

haidian_subway <- df[df$CATE == "haidian" & df$subway == "1", ]
#学区房均值
school_mean <- mean(haidian_subway$totalprice[haidian_subway$school == "1"])
#非学区房均值
nonschool_mean <- mean(haidian_subway$totalprice[haidian_subway$school == "0"])
#输出结果
cat("海淀区靠近地铁的学区房总价均值:", round(school_mean, 2), "\n")
## 海淀区靠近地铁的学区房总价均值: 7319466
cat("海淀区靠近地铁的非学区房总价均值:", round(nonschool_mean, 2), "\n")
## 海淀区靠近地铁的非学区房总价均值: 5731588