House Price Insight
Obtain the data with Python
import requests
from pyquery import PyQuery as pq
import pandas as pd
import time
#=================================================================================
def save_to_csv(filename,variblelist):
df = pd.DataFrame([variblelist])
#df.columns = headers
df.to_csv('U:\cse\windows\zhang.11091\Rprojects\%s.csv'%filename, mode='a', header=False,encoding='GB18030')
#csv_to_xls.csv_to_excel('%s'%filename)
#==================================================================================
def getdata(city, city_dict, p):
# city为城市名称,对于上海、北京等城市需要输入'sh'或'bj'这样的开头首字母缩写
# p为需要调取的页数
start = time.clock()
baseurl = 'https://%s.lianjia.com/ershoufang/pg{page}/'%city
for i in range(1, p+1):
url = baseurl.format(page=i)
print(url)
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0;) Gecko/20100101 Firefox/62.0'}
res = requests.get(url, headers=headers)
doc = pq(res.text)
for l in doc('.sellListContent li .info.clear').items(): #.items()好用
title = l('.title').text()
address = l('.address').text()
houseinfo = l('.houseInfo').text().split('|')
flood = l('.flood').text()
follow = l('.followInfo').text()
tprice = l('.priceInfo .totalPrice span').text()
uprice = l('.priceInfo .unitPrice span').text()
li = [title,flood,follow,tprice,uprice] + houseinfo
save_to_csv('%s二手房数据'%city_dict[city],li)
end = time.clock()
print('耗时: %s Minutes' % round((end - start)/60))
#=================================================================================
city_dict = {'bj':'北京',
'sh':'上海'}
getdata('bj', city_dict, 200)Library
Data
Data Import
Initial Cleaning
fun_sub <- function(x){
as.numeric(str_extract(str_split(x, "/")[[1]][1],'\\d+'))
}
fun_date <- function(x){
str_extract(str_split(x, "/")[[1]][2],'(\\d+天|\\d+个月)')
}
fun_location_1 <- function(x){
str_split(x, " - ")[[1]][1]
}
fun_location_2 <- function(x){
str_split(x, " - ")[[1]][2]
}
data <- bj_2nd %>%
mutate(
# 建筑年份
year = str_extract(year, '....'),
# 每平米价格
unit_price = as.numeric(str_extract(price_per_square, '\\d+')),
# 几室
room = as.integer(apply(bj_2nd["type"], 1,
function(x) str_extract_all(x, "\\d")[[1]][1])),
# 几厅
living_room = as.integer(apply(bj_2nd["type"], 1,
function(x) str_extract_all(x, "\\d")[[1]][2])),
# 建筑面积
area = as.numeric(str_extract(area, "[^平米]*")), # "\\d+[\\.]?\\d+"
# 总价
Total_Price = Total_Price * 10^4,
# 关注人数
subscribe = apply(bj_2nd['pub'], 1, fun_sub),
# 发布时间
publish_date = apply(bj_2nd['pub'], 1, fun_date),
# 位置信息
location_1 = apply(bj_2nd['short_desc'], 1, fun_location_1),
location_2 = apply(bj_2nd['short_desc'], 1, fun_location_2)
) %>%
select(-short_desc, -pub, -price_per_square, -type) %>%
filter(str_detect(floor, "(中楼层|低楼层|高楼层|底层|顶层)") == T) %>%
mutate(
total_floor = str_extract(floor, "\\d+"),
floor = str_extract(floor, "(中楼层|低楼层|高楼层|底层|顶层)")
)
# 房屋朝向变量清洗
fun_ori <- function(x) {
if (str_detect(x, '(南 北|北 南)')) {
" 南 北"
} else if (str_detect(x, '(南 西 北|南 北 西|西 南 北|西 北 南|北 南 西|北 西 南)')){
'南 西 北'
} else if (str_detect(x, '(东 西|西 东)')) {
'东 西'
} else if (str_detect(x, '(东 南 北|东 北 南|南 北 东|南 东 北|北 东 南|北 南 东)')){
"东 南 北"
} else if (str_detect(x, '(东南 北|北 东南)')) {
"东南 北"
} else if (str_detect(x, '(西南 北|北 西南)')) {
"西南 北"
} else if (str_detect(x, '(东 西 北|东 北 西|北 西 东|北 东 西|西 东 北|西 北 东)')){
"东 西 北"
} else if (str_detect(x, '(东 南 西|东 西 南|南 西 东|南 东 西|西 东 南|西 南 东)')){
"东 南 西"
} else if (str_detect(x, '(西 北|北 西)')) {
"西 北"
} else if (str_detect(x, '(西 东|东 西)')) {
"西 东"
} else if (str_detect(x, '(东南 西北|西北 东南)')) {
"东南 西北"
} else {
x
}
}
data$orientation <- apply(data['orientation'], 1, fun_ori)
ori <- data.frame(table(data$orientation)) %>% arrange(desc(Freq))
data <- data %>%
filter(orientation %in% ori$Var1[1:14])Geographic Information
Baidu
get_lng <- function(text) {
lng_str <- '<lng>\\d+\\.\\d+'
str_extract(str_extract(text, lng_str), '\\d+\\.\\d+')
}
get_lat <- function(text) {
lat_str <- '<lat>\\d+\\.\\d+'
str_extract(str_extract(text, lat_str), '\\d+\\.\\d+')
}
get_geo <- function(address){
result <- data.frame()
for (add in address) {
ak <- "e14t8eKcwKl9jZgvjmxGIMfmCtpDNHMF"
url <- paste("http://api.map.baidu.com/geocoding/v3/?address=", add, "&output = json&ak=", ak, "&callback=showLocation", sep = "")
response <- GET(url, user_agent = "text")
txt <- content(response, as = "text", encoding = "UTF-8")
ini_result <- data.frame("address" = add,
"lat" = get_lat(txt),
"lng" = get_lng(txt))
result <- rbind(result, ini_result)
}
result
}
cityname <- "北京市"
data <- data %>%
mutate(location = str_c(cityname, location_2, location_1))
# 调取
location_data <- get_geo(data$location)
location_data_backup <- location_data
# location_data <- location_data_backup
location_data <- location_data %>%
mutate(
address = as.character(address),
lat = as.numeric(as.character(lat)),
lng = as.numeric(as.character(lng))
)Amap (alternative option)
get_lat_lng <- function(address) {
key <- "a410a87693c48b0f87acacf39f2d0cfb"
result <- data.frame()
for (add in address) {
url <- paste("https://restapi.amap.com/v3/geocode/geo?address=", add, "&output=XML&key=", key, sep = "")
r <- GET(url, user_agent = "test")
t <- content(r, as = "text")
ini <- str_extract(t, "<location>\\d+\\.\\d+,\\d+\\.\\d+")
ini_2 <- str_extract(ini, '[^<location>]+')
res <- data.frame("address" = add,
"lng" = str_split(ini_2, ",", simplify = T)[1],
"lat" = str_split(ini_2, ",", simplify = T)[2])
result <- rbind(result, res)
}
result
}Data Viz on Map
# Normalization
normalize <- function(x) {
min <- min(x)
max <- max(x)
(x - min)/(max-min)
}
location_data$price <- data$unit_price
location_data$n_price <- normalize(data$unit_price)
location_data$log_price <- log10(data$unit_price)
# Continuous ColorBar
Npal <- colorNumeric(palette = c("green","blue","yellow", "red"), domain = location_data$price)
## 注意leaflet索引变量时前面要加~!!
leaflet(location_data) %>%
addTiles() %>%
setView(116.4, 39.91, zoom = 9.5) %>%
addCircleMarkers(lat = ~lat, lng = ~lng,
radius = ~n_price,
color = Npal(location_data[['price']]),
fillOpacity = 0.8) %>%
addLegend(pal = Npal, values = location_data[["price"]],
opacity = 1)Data Preprocessing
# 为ML预测准备数据集================================== Step
# 1.===========================================================
ml_data <- data %>% select(-desc, -Total_Price, -location_1, -location_2, -location,
-publish_date) %>% select(unit_price, everything())
str(ml_data)Classes 'tbl_df', 'tbl' and 'data.frame': 5880 obs. of 11 variables:
$ unit_price : num 35371 100313 39850 46203 66790 ...
$ area : num 76.9 38.4 85.3 94.2 86.7 ...
$ orientation : chr " 南 北" "北" " 南 北" " 南 北" ...
$ decoration : chr "精装" "简装" "精装" "精装" ...
$ floor : chr "中楼层" "中楼层" "底层" "中楼层" ...
$ year : chr "1996" "2006" "1994" "2003" ...
$ building_type: chr "板楼" "塔楼" "板楼" "板楼" ...
$ room : int 2 1 3 2 2 2 2 3 2 1 ...
$ living_room : int 1 0 1 1 1 1 1 1 1 1 ...
$ subscribe : num 79 18 71 21 81 109 49 9 12 26 ...
$ total_floor : chr "6" "26" "7" "6" ...
# 数据类型 numeric
ml_data$room <- as.integer(ml_data$room)
ml_data$living_room <- as.integer(ml_data$living_room)
ml_data$total_floor <- as.integer(ml_data$total_floor)
ml_data$year <- (2019 - as.integer(ml_data$year))
## factor
table(ml_data$decoration)
其他 毛坯 简装 精装
3 158 2286 3433
塔楼 暂无数据 板塔结合 板楼
1357 10 1102 3396
中楼层 低楼层 底层 顶层 高楼层
2530 1226 503 504 1117
1 2 3 4 5 6
1351 3445 1029 44 9 2
0 1 2 3 4
351 4983 540 5 1
ml_data$decoration <- factor(ml_data$decoration, levels = c("精装", "简装", "毛坯",
"其他"))
ml_data$building_type <- factor(ml_data$building_type, levels = c("板楼", "板塔结合",
"塔楼", "暂无数据"))
ml_data$floor <- factor(ml_data$floor, levels = c("底层", "低楼层", "中楼层",
"高楼层", "顶层"))
ml_data$orientation <- factor(ml_data$orientation)
# 删除部分异常值
ml_data <- ml_data %>% filter(room <= 3, living_room != 3)
ml_data$room <- factor(ml_data$room)
ml_data$living_room <- factor(ml_data$living_room)
# 去除缺失值
ml_data <- na.omit(ml_data)
str(ml_data)Classes 'tbl_df', 'tbl' and 'data.frame': 5807 obs. of 11 variables:
$ unit_price : num 35371 100313 39850 46203 66790 ...
$ area : num 76.9 38.4 85.3 94.2 86.7 ...
$ orientation : Factor w/ 14 levels " 南 北","东",..: 1 8 1 1 5 2 14 4 1 2 ...
$ decoration : Factor w/ 4 levels "精装","简装",..: 1 2 1 1 1 2 2 2 1 2 ...
$ floor : Factor w/ 5 levels "底层","低楼层",..: 3 3 1 3 3 4 3 2 1 3 ...
$ year : num 23 13 25 16 9 23 36 26 18 29 ...
$ building_type: Factor w/ 4 levels "板楼","板塔结合",..: 1 3 1 1 2 3 3 1 1 3 ...
$ room : Factor w/ 3 levels "1","2","3": 2 1 3 2 2 2 2 3 2 1 ...
$ living_room : Factor w/ 3 levels "0","1","2": 2 1 2 2 2 2 2 2 2 2 ...
$ subscribe : num 79 18 71 21 81 109 49 9 12 26 ...
$ total_floor : int 6 26 7 6 11 16 16 6 6 18 ...
- attr(*, "na.action")= 'omit' Named int 384 627 888 1047 1476 1591 1725 1990 2305 2390 ...
..- attr(*, "names")= chr "384" "627" "888" "1047" ...
## Step 2. =============================================================
## Themes
ggthemr("earth", type = "outer", layout = "scientific", spacing = 2)
## 绘图函数
distribution_plot <- function(x, title, log = FALSE) {
if (log == F) {
ggplot() + geom_histogram(aes(ml_data[[x]], fill = "orange")) + ggtitle(title) +
labs(x = NULL, y = NULL) + theme(legend.position = "none")
} else {
ggplot() + geom_histogram(aes(log(ml_data[[x]]), fill = "orange")) + ggtitle(title) +
labs(x = NULL, y = NULL) + theme(legend.position = "none")
}
}
## unit_price--------------------------------------------------------------- 原始
unit_price_raw <- distribution_plot("unit_price", "Positive Skew in Unit House Price")
unit_price_raw_2 <- ggplot(ml_data, aes(sample = unit_price)) + stat_qq() + stat_qq_line(color = "gray90",
size = 1.5, linetype = 2, alpha = 0.8) + labs(title = "QQplot", x = NULL, y = NULL)
Rmisc::multiplot(unit_price_raw, unit_price_raw_2, cols = 2)### 对数处理后
unit_price_log <- distribution_plot("unit_price", "No Skewness after log transformation",
log = TRUE)
unit_price_log2 <- ggplot(ml_data, aes(sample = log(unit_price))) + stat_qq() + stat_qq_line(color = "gray90",
size = 1.5, linetype = 2, alpha = 0.8) + labs(title = "QQplot", x = NULL, y = NULL)
Rmisc::multiplot(unit_price_log, unit_price_log2, cols = 2)## area---------------------------------------------------------------------- 原始
area_raw <- distribution_plot("area", "Positive Skew in area")
area_raw2 <- ggplot(ml_data, aes(sample = area)) + stat_qq() + stat_qq_line(color = "gray90",
size = 1.5, linetype = 2, alpha = 0.8) + labs(title = "QQplot", x = NULL, y = NULL)
Rmisc::multiplot(area_raw, area_raw2, cols = 2)### 对数处理后
area_log <- distribution_plot("area", "No Skewness after log transformation", log = T)
area_log2 <- ggplot(ml_data, aes(sample = log(area))) + stat_qq() + stat_qq_line(color = "gray90",
size = 1.5, linetype = 2, alpha = 0.8) + labs(title = "QQplot", x = NULL, y = NULL)
Rmisc::multiplot(area_log, area_log2, cols = 2)## year---------------------------------------------------------------------
year_raw <- distribution_plot("year", "No transformation needed for house age")
year_raw2 <- ggplot(ml_data, aes(sample = year)) + stat_qq() + stat_qq_line(color = "gray90",
size = 1.5, linetype = 2, alpha = 0.8) + labs(title = "QQplot", x = NULL, y = NULL)
Rmisc::multiplot(year_raw, year_raw2, cols = 2)## subscribe--------------------------------------------- 原始
sub_raw <- distribution_plot("subscribe", "Positive skew in subscribes")
sub_raw2 <- ggplot(ml_data, aes(sample = subscribe)) + stat_qq() + stat_qq_line(color = "gray90",
size = 1.5, linetype = 2, alpha = 0.8) + labs(title = "QQplot", x = NULL, y = NULL)
Rmisc::multiplot(sub_raw, sub_raw2, cols = 2)### 对数处理后
sub_log <- distribution_plot("subscribe", "No Skewness after log transformation",
log = T)
sub_log2 <- ggplot(ml_data, aes(sample = log(subscribe))) + stat_qq() + stat_qq_line(color = "gray90",
size = 1.5, linetype = 2, alpha = 0.8) + labs(title = "QQplot", x = NULL, y = NULL)
Rmisc::multiplot(sub_log, sub_log2, cols = 2)Modeling
# Step 2.3 Rewrite.-------------------------------------------------------------
set.seed(4595)
data_split <- initial_split(ml_data, strata = "unit_price", p = 0.8)
traindata <- training(data_split)
testdata <- testing(data_split)
house_recipe <- training(data_split) %>% recipe(unit_price ~ .) %>% step_other(orientation) %>%
step_dummy(all_nominal()) %>% step_log(all_outcomes(), base = 10) %>% step_center(all_predictors()) %>%
step_scale(all_predictors()) %>% prep(training = traindata, retain = TRUE)
traindata <- juice(house_recipe)
testdata <- house_recipe %>% bake(testing(data_split))
# random forest
house_rf <- rand_forest(mode = "regression") %>% set_engine("ranger") %>% fit(unit_price ~
., data = traindata)
house_rfparsnip model object
Fit time: 1.6s
Ranger result
Call:
ranger::ranger(formula = formula, data = data, num.threads = 1, verbose = FALSE, seed = sample.int(10^5, 1))
Type: Regression
Number of trees: 500
Sample size: 4648
Number of independent variables: 23
Mtry: 4
Target node size: 5
Variable importance mode: none
Splitrule: variance
OOB prediction error (MSE): 0.008722543
R squared (OOB): 0.6783138
# glmn
house_glmn <- linear_reg(penalty = 0.001, mixture = 0.5) %>% set_engine("glmnet") %>%
fit(unit_price ~ ., data = juice(house_recipe))
house_glmnparsnip model object
Fit time: 14ms
Call: glmnet::glmnet(x = as.matrix(x), y = y, family = "gaussian", alpha = ~0.5)
Df %Dev Lambda
1 0 0.00000 0.116700
2 1 0.01631 0.106300
3 2 0.03848 0.096900
4 2 0.06221 0.088290
5 2 0.08307 0.080450
6 2 0.10130 0.073300
7 2 0.11720 0.066790
8 4 0.13560 0.060860
9 4 0.15230 0.055450
10 5 0.16920 0.050520
11 5 0.18810 0.046040
12 6 0.20480 0.041950
13 6 0.21980 0.038220
14 7 0.23290 0.034820
15 8 0.24540 0.031730
16 8 0.25740 0.028910
17 11 0.27060 0.026340
18 11 0.28330 0.024000
19 11 0.29410 0.021870
20 11 0.30320 0.019930
21 12 0.31120 0.018160
22 13 0.31830 0.016540
23 13 0.32420 0.015070
24 14 0.32930 0.013740
25 14 0.33380 0.012520
[ reached getOption("max.print") -- omitted 50 rows ]
# xgboost
house_xgb <- boost_tree(mode = "regression") %>% set_engine("xgboost") %>% fit(unit_price ~
., data = traindata)
test_results <- testdata %>% select(unit_price) %>% bind_cols(predict(house_rf, new_data = testdata) %>%
rename(RandomForest = .pred), predict(house_glmn, new_data = testdata) %>% rename(glmnet = .pred),
predict(house_xgb, new_data = testdata) %>% rename(xgb = .pred))
test_results# A tibble: 1,159 x 4
unit_price RandomForest glmnet xgb
<dbl> <dbl> <dbl> <dbl>
1 4.55 4.56 4.72 4.56
2 5.00 5.00 4.96 4.97
3 4.66 4.66 4.67 4.64
4 4.82 4.80 4.70 4.76
5 4.84 4.84 4.82 4.82
6 4.85 4.85 4.91 4.83
7 4.48 4.48 4.60 4.48
8 4.66 4.72 4.70 4.67
9 4.59 4.87 4.76 4.83
10 4.64 4.66 4.66 4.65
# … with 1,149 more rows
# A tibble: 3 x 3
.metric .estimator .estimate
<chr> <chr> <dbl>
1 rmse standard 0.132
2 rsq standard 0.333
3 mae standard 0.103
# A tibble: 3 x 3
.metric .estimator .estimate
<chr> <chr> <dbl>
1 rmse standard 0.0994
2 rsq standard 0.620
3 mae standard 0.0567
# A tibble: 3 x 3
.metric .estimator .estimate
<chr> <chr> <dbl>
1 rmse standard 0.104
2 rsq standard 0.598
3 mae standard 0.0670
test_results %>% gather(model, prediction, -unit_price) %>% ggplot(aes(x = prediction,
y = unit_price)) + geom_abline(col = "green", lty = 2) + geom_point(alpha = 0.4) +
facet_wrap(~model) + coord_fixed()