##1.读取数据并查看前十行
#加载包
library(readxl)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
#读取数据
df = read_excel("C:/Users/hzx/Desktop/beijing.xlsx",sheet = 1)
#展示前10行
head(df,10)
变量解释 CATE: 城区(分类变量)- chaoyang(朝阳区)、haidian(海淀区)、fengtai(丰台区)、xicheng(西城区)、dongcheng(东城区)、shijingshan(石景山区) AREA: 房屋面积(数值变量) subway: 是否邻近地铁(二分类变量) school: 是否为学区房(二分类变量) price: 单位面积房价(数值变量)
##2.将school及subway列数据转换为因子型
#查看转换前的数据类型
cat("转换前的数据类型:\n")
## 转换前的数据类型:
str(df[, c("subway", "school")])
## tibble [16,210 × 2] (S3: tbl_df/tbl/data.frame)
## $ subway: num [1:16210] 1 1 1 1 1 1 1 1 1 1 ...
## $ school: num [1:16210] 0 0 1 1 1 1 1 1 1 0 ...
#将subway和school转换为因子型
df$subway <- as.factor(df$subway)
df$school <- as.factor(df$school)
#查看转换后的数据类型
cat("\n转换后的数据类型:\n")
##
## 转换后的数据类型:
str(df[, c("subway", "school")])
## tibble [16,210 × 2] (S3: tbl_df/tbl/data.frame)
## $ subway: Factor w/ 2 levels "0","1": 2 2 2 2 2 2 2 2 2 2 ...
## $ school: Factor w/ 2 levels "0","1": 1 1 2 2 2 2 2 2 2 1 ...
##描述性统计
summary(df)
## CATE AREA subway school price
## Length:16210 Min. : 30.06 0: 2791 0:11297 Min. : 18348
## Class :character 1st Qu.: 60.00 1:13419 1: 4913 1st Qu.: 42812
## Mode :character Median : 78.83 Median : 57473
## Mean : 91.75 Mean : 61152
## 3rd Qu.:110.52 3rd Qu.: 76100
## Max. :299.00 Max. :149871
cat("\n========== 数值变量统计 ==========\n")
##
## ========== 数值变量统计 ==========
#面积统计
cat("AREA(面积):\n")
## AREA(面积):
cat(" 最小值:", min(df$AREA), "㎡\n")
## 最小值: 30.06 ㎡
cat(" 最大值:", max(df$AREA), "㎡\n")
## 最大值: 299 ㎡
cat(" 平均值:", round(mean(df$AREA), 2), "㎡\n")
## 平均值: 91.75 ㎡
cat(" 中位数:", median(df$AREA), "㎡\n")
## 中位数: 78.83 ㎡
#价格统计
cat("\nprice(单价):\n")
##
## price(单价):
cat(" 最小值:", min(df$price), "元/㎡\n")
## 最小值: 18348 元/㎡
cat(" 最大值:", max(df$price), "元/㎡\n")
## 最大值: 149871 元/㎡
cat(" 平均值:", round(mean(df$price), 2), "元/㎡\n")
## 平均值: 61151.81 元/㎡
cat(" 中位数:", median(df$price), "元/㎡\n")
## 中位数: 57473 元/㎡
#分类变量统计
cat("\n========== 分类变量统计 ==========\n")
##
## ========== 分类变量统计 ==========
cat("CATE(城区):\n")
## CATE(城区):
print(table(df$CATE))
##
## chaoyang dongcheng fengtai haidian shijingshan xicheng
## 2864 2783 2947 2919 1947 2750
cat("\nsubway(是否近地铁):\n")
##
## subway(是否近地铁):
print(table(df$subway))
##
## 0 1
## 2791 13419
cat("\nschool(是否学区房):\n")
##
## school(是否学区房):
print(table(df$school))
##
## 0 1
## 11297 4913
##3.筛选 AREA > 250 且 price > 100000 的数据
high_end <- df[df$AREA > 250 & df$price > 100000, ]
##查看结果
cat("符合条件的房源数:", nrow(high_end), "\n")
## 符合条件的房源数: 16
high_end
##4.将数据按照 price 降序排列
df_sorted <- df[order(-df$price), ]
# 查看前10行
head(df_sorted, 10)
##5.在原数据集上增加一列房屋总价,命名为totalprice
df$totalprice <- df$AREA * df$price
#查看前几行
head(df[, c("AREA", "price", "totalprice")], 10)
##6.按照 CATE 进行分类,计算price的最大值、最小值和平均值
result <- aggregate(price ~ CATE, data = df,
FUN = function(x) c(max = max(x), min = min(x), mean = mean(x)))
#查看结果
print(result)
## CATE price.max price.min price.mean
## 1 chaoyang 124800.00 23011.00 52800.62
## 2 dongcheng 149254.00 20089.00 71883.60
## 3 fengtai 87838.00 18348.00 42500.90
## 4 haidian 135105.00 25568.00 68757.60
## 5 shijingshan 100000.00 18854.00 40286.89
## 6 xicheng 149871.00 21918.00 85674.78
##7.分别计算海淀区靠近地铁的学区房以及非学区房的totalprice均值
haidian_subway <- df[df$CATE == "haidian" & df$subway == "1", ]
#学区房均值
school_mean <- mean(haidian_subway$totalprice[haidian_subway$school == "1"])
#非学区房均值
nonschool_mean <- mean(haidian_subway$totalprice[haidian_subway$school == "0"])
#输出结果
cat("海淀区靠近地铁的学区房总价均值:", round(school_mean, 2), "\n")
## 海淀区靠近地铁的学区房总价均值: 7319466
cat("海淀区靠近地铁的非学区房总价均值:", round(nonschool_mean, 2), "\n")
## 海淀区靠近地铁的非学区房总价均值: 5731588