House Price Insight

Obtain the data with Python
Library
Data
- Data Import
- Initial Cleaning
Geographic Information
Data Preprocessing
Modeling

Obtain the data with Python

import requests
from  pyquery import PyQuery as pq
import pandas as pd
import time
#=================================================================================
def save_to_csv(filename,variblelist):
    df = pd.DataFrame([variblelist])
    #df.columns = headers
    df.to_csv('U:\cse\windows\zhang.11091\Rprojects\%s.csv'%filename, mode='a', header=False,encoding='GB18030')
    #csv_to_xls.csv_to_excel('%s'%filename)
#==================================================================================
def getdata(city, city_dict, p):
    # city为城市名称，对于上海、北京等城市需要输入'sh'或'bj'这样的开头首字母缩写
    # p为需要调取的页数
    start = time.clock()
    baseurl = 'https://%s.lianjia.com/ershoufang/pg{page}/'%city
    for i in range(1, p+1):
        url = baseurl.format(page=i)
        print(url)
        headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0;) Gecko/20100101 Firefox/62.0'}
        res = requests.get(url, headers=headers)
        doc = pq(res.text)
        for l in doc('.sellListContent li .info.clear').items(): #.items()好用
            title = l('.title').text()
            address = l('.address').text()
            houseinfo = l('.houseInfo').text().split('|')
            flood = l('.flood').text()
            follow = l('.followInfo').text()
            tprice = l('.priceInfo .totalPrice span').text()
            uprice = l('.priceInfo .unitPrice span').text()
            li = [title,flood,follow,tprice,uprice] + houseinfo
            save_to_csv('%s二手房数据'%city_dict[city],li)
            
    end = time.clock()
    print('耗时: %s Minutes' % round((end - start)/60))
#=================================================================================
city_dict = {'bj':'北京',
            'sh':'上海'}
getdata('bj', city_dict, 200)

Library

# Data Manipulation Libs
library(tidyverse)
library(stringr)
library(lubridate)
# Web & API
library(httr)
# Map
library(leaflet)
library(rgdal)
# Models
library(tidymodels)
library(caret)
library(randomForest)
library(scales)    # 处理画图标度
library(ggridges)
library(pROC)
# Plot Theme
library(ggthemr)

Data

Data Import

cityname <- "北京市"

bj_2nd <- read_csv("data.csv", col_names = FALSE)
bj_2nd <- bj_2nd[,-1]
names(bj_2nd) <- c("desc", "short_desc", "pub", "Total_Price", "price_per_square", "type", 
                   "area", "orientation", "decoration", "floor", "year", "building_type")

Initial Cleaning

fun_sub <- function(x){
  as.numeric(str_extract(str_split(x, "/")[[1]][1],'\\d+'))
}
fun_date <- function(x){
  str_extract(str_split(x, "/")[[1]][2],'(\\d+天|\\d+个月)')
}
fun_location_1 <- function(x){
  str_split(x, " - ")[[1]][1]
}
fun_location_2 <- function(x){
  str_split(x, " - ")[[1]][2]
}

data <- bj_2nd %>%
  mutate(
    # 建筑年份
         year = str_extract(year, '....'),
    # 每平米价格
         unit_price = as.numeric(str_extract(price_per_square, '\\d+')),
    # 几室
         room = as.integer(apply(bj_2nd["type"], 1, 
                                 function(x)  str_extract_all(x, "\\d")[[1]][1])),
    # 几厅
         living_room = as.integer(apply(bj_2nd["type"], 1, 
                                        function(x)  str_extract_all(x, "\\d")[[1]][2])),
    # 建筑面积
         area = as.numeric(str_extract(area, "[^平米]*")), # "\\d+[\\.]?\\d+"
    # 总价
         Total_Price = Total_Price * 10^4,
    # 关注人数
         subscribe = apply(bj_2nd['pub'], 1, fun_sub),
    # 发布时间
         publish_date = apply(bj_2nd['pub'], 1, fun_date),
    # 位置信息
         location_1 = apply(bj_2nd['short_desc'], 1, fun_location_1),
         location_2 = apply(bj_2nd['short_desc'], 1, fun_location_2)
         
  ) %>%
  select(-short_desc, -pub, -price_per_square, -type) %>%
  filter(str_detect(floor, "(中楼层|低楼层|高楼层|底层|顶层)") == T) %>%
  mutate(
    total_floor = str_extract(floor, "\\d+"),
    floor = str_extract(floor, "(中楼层|低楼层|高楼层|底层|顶层)")
  )

# 房屋朝向变量清洗
fun_ori <- function(x) {
  if (str_detect(x, '(南 北|北 南)')) {
    " 南 北"
  } else if (str_detect(x, '(南 西 北|南 北 西|西 南 北|西 北 南|北 南 西|北 西 南)')){
    '南 西 北'
  } else if (str_detect(x, '(东 西|西 东)')) {
    '东 西'
  } else if (str_detect(x, '(东 南 北|东 北 南|南 北 东|南 东 北|北 东 南|北 南 东)')){
    "东 南 北"
  } else if (str_detect(x, '(东南 北|北 东南)')) {
    "东南 北"
  } else if (str_detect(x, '(西南 北|北 西南)')) {
    "西南 北"
  } else if (str_detect(x, '(东 西 北|东 北 西|北 西 东|北 东 西|西 东 北|西 北 东)')){
    "东 西 北"
  } else if (str_detect(x, '(东 南 西|东 西 南|南 西 东|南 东 西|西 东 南|西 南 东)')){
    "东 南 西"
  } else if (str_detect(x, '(西 北|北 西)')) {
    "西 北"
  } else if (str_detect(x, '(西 东|东 西)')) {
    "西 东"
  } else if (str_detect(x, '(东南 西北|西北 东南)')) {
    "东南 西北"
  } else {
    x
  }
}
data$orientation <- apply(data['orientation'], 1, fun_ori)
ori <- data.frame(table(data$orientation)) %>% arrange(desc(Freq))
data <- data %>%
  filter(orientation %in% ori$Var1[1:14])

Geographic Information

Baidu

get_lng <- function(text) {
  lng_str <- '<lng>\\d+\\.\\d+'
  str_extract(str_extract(text, lng_str), '\\d+\\.\\d+')
}

get_lat <- function(text) {
  lat_str <- '<lat>\\d+\\.\\d+'
  str_extract(str_extract(text, lat_str), '\\d+\\.\\d+')
}

get_geo <- function(address){
  result <- data.frame()
  for (add in address) {
    ak <- "e14t8eKcwKl9jZgvjmxGIMfmCtpDNHMF"
    url <- paste("http://api.map.baidu.com/geocoding/v3/?address=", add, "&output = json&ak=", ak, "&callback=showLocation", sep = "")
    response <- GET(url, user_agent = "text")
    txt <-  content(response, as = "text", encoding = "UTF-8")
    ini_result <- data.frame("address" = add, 
                             "lat" = get_lat(txt), 
                             "lng" = get_lng(txt))
    result <- rbind(result, ini_result)
  }
  result
}

cityname <- "北京市"
data <- data %>%
  mutate(location = str_c(cityname, location_2, location_1))
# 调取
location_data <- get_geo(data$location)
location_data_backup <- location_data
# location_data <- location_data_backup
location_data <- location_data %>%
  mutate(
    address = as.character(address),
    lat = as.numeric(as.character(lat)),
    lng = as.numeric(as.character(lng))
  )

Amap (alternative option)

get_lat_lng <- function(address) {
  key <- "a410a87693c48b0f87acacf39f2d0cfb"
  result <- data.frame()
  for (add in address) {
    url <- paste("https://restapi.amap.com/v3/geocode/geo?address=", add, "&output=XML&key=", key, sep = "")
    r <- GET(url, user_agent = "test")
    t <- content(r, as = "text")
    ini <- str_extract(t, "<location>\\d+\\.\\d+,\\d+\\.\\d+")
    ini_2 <- str_extract(ini, '[^<location>]+')
    res <- data.frame("address" = add, 
                      "lng" = str_split(ini_2, ",", simplify = T)[1],
                      "lat" = str_split(ini_2, ",", simplify = T)[2])
    result <- rbind(result, res)
  }
  result
}

Data Viz on Map

# Normalization
normalize <- function(x) {
  min <- min(x)
  max <- max(x)
  (x - min)/(max-min)
}
location_data$price <- data$unit_price
location_data$n_price <- normalize(data$unit_price)
location_data$log_price <- log10(data$unit_price)
# Continuous ColorBar
Npal <- colorNumeric(palette = c("green","blue","yellow", "red"), domain = location_data$price)

## 注意leaflet索引变量时前面要加~!!
leaflet(location_data) %>%
  addTiles() %>%
  setView(116.4, 39.91, zoom = 9.5) %>%
  addCircleMarkers(lat = ~lat, lng = ~lng,
                   radius = ~n_price,
                   color = Npal(location_data[['price']]),
                   fillOpacity = 0.8) %>%
  addLegend(pal = Npal, values = location_data[["price"]], 
            opacity = 1)

Data Preprocessing

# 为ML预测准备数据集================================== Step
# 1.===========================================================
ml_data <- data %>% select(-desc, -Total_Price, -location_1, -location_2, -location, 
    -publish_date) %>% select(unit_price, everything())
str(ml_data)

Classes 'tbl_df', 'tbl' and 'data.frame':   5880 obs. of  11 variables:
 $ unit_price   : num  35371 100313 39850 46203 66790 ...
 $ area         : num  76.9 38.4 85.3 94.2 86.7 ...
 $ orientation  : chr  " 南 北" "北" " 南 北" " 南 北" ...
 $ decoration   : chr  "精装" "简装" "精装" "精装" ...
 $ floor        : chr  "中楼层" "中楼层" "底层" "中楼层" ...
 $ year         : chr  "1996" "2006" "1994" "2003" ...
 $ building_type: chr  "板楼" "塔楼" "板楼" "板楼" ...
 $ room         : int  2 1 3 2 2 2 2 3 2 1 ...
 $ living_room  : int  1 0 1 1 1 1 1 1 1 1 ...
 $ subscribe    : num  79 18 71 21 81 109 49 9 12 26 ...
 $ total_floor  : chr  "6" "26" "7" "6" ...

# 数据类型 numeric
ml_data$room <- as.integer(ml_data$room)
ml_data$living_room <- as.integer(ml_data$living_room)
ml_data$total_floor <- as.integer(ml_data$total_floor)
ml_data$year <- (2019 - as.integer(ml_data$year))
## factor
table(ml_data$decoration)


其他 毛坯 简装 精装 
   3  158 2286 3433

table(ml_data$building_type)


    塔楼 暂无数据 板塔结合     板楼 
    1357       10     1102     3396

table(ml_data$floor)


中楼层 低楼层   底层   顶层 高楼层 
  2530   1226    503    504   1117

table(ml_data$room)  # 需要进行因子化处理


   1    2    3    4    5    6 
1351 3445 1029   44    9    2

table(ml_data$living_room)


   0    1    2    3    4 
 351 4983  540    5    1

ml_data$decoration <- factor(ml_data$decoration, levels = c("精装", "简装", "毛坯", 
    "其他"))
ml_data$building_type <- factor(ml_data$building_type, levels = c("板楼", "板塔结合", 
    "塔楼", "暂无数据"))
ml_data$floor <- factor(ml_data$floor, levels = c("底层", "低楼层", "中楼层", 
    "高楼层", "顶层"))

ml_data$orientation <- factor(ml_data$orientation)

# 删除部分异常值
ml_data <- ml_data %>% filter(room <= 3, living_room != 3)
ml_data$room <- factor(ml_data$room)
ml_data$living_room <- factor(ml_data$living_room)
# 去除缺失值
ml_data <- na.omit(ml_data)
str(ml_data)

Classes 'tbl_df', 'tbl' and 'data.frame':   5807 obs. of  11 variables:
 $ unit_price   : num  35371 100313 39850 46203 66790 ...
 $ area         : num  76.9 38.4 85.3 94.2 86.7 ...
 $ orientation  : Factor w/ 14 levels " 南 北","东",..: 1 8 1 1 5 2 14 4 1 2 ...
 $ decoration   : Factor w/ 4 levels "精装","简装",..: 1 2 1 1 1 2 2 2 1 2 ...
 $ floor        : Factor w/ 5 levels "底层","低楼层",..: 3 3 1 3 3 4 3 2 1 3 ...
 $ year         : num  23 13 25 16 9 23 36 26 18 29 ...
 $ building_type: Factor w/ 4 levels "板楼","板塔结合",..: 1 3 1 1 2 3 3 1 1 3 ...
 $ room         : Factor w/ 3 levels "1","2","3": 2 1 3 2 2 2 2 3 2 1 ...
 $ living_room  : Factor w/ 3 levels "0","1","2": 2 1 2 2 2 2 2 2 2 2 ...
 $ subscribe    : num  79 18 71 21 81 109 49 9 12 26 ...
 $ total_floor  : int  6 26 7 6 11 16 16 6 6 18 ...
 - attr(*, "na.action")= 'omit' Named int  384 627 888 1047 1476 1591 1725 1990 2305 2390 ...
  ..- attr(*, "names")= chr  "384" "627" "888" "1047" ...

## Step 2. =============================================================

## Themes
ggthemr("earth", type = "outer", layout = "scientific", spacing = 2)
## 绘图函数
distribution_plot <- function(x, title, log = FALSE) {
    if (log == F) {
        ggplot() + geom_histogram(aes(ml_data[[x]], fill = "orange")) + ggtitle(title) + 
            labs(x = NULL, y = NULL) + theme(legend.position = "none")
    } else {
        ggplot() + geom_histogram(aes(log(ml_data[[x]]), fill = "orange")) + ggtitle(title) + 
            labs(x = NULL, y = NULL) + theme(legend.position = "none")
    }
}
## unit_price--------------------------------------------------------------- 原始
unit_price_raw <- distribution_plot("unit_price", "Positive Skew in Unit House Price")

unit_price_raw_2 <- ggplot(ml_data, aes(sample = unit_price)) + stat_qq() + stat_qq_line(color = "gray90", 
    size = 1.5, linetype = 2, alpha = 0.8) + labs(title = "QQplot", x = NULL, y = NULL)
Rmisc::multiplot(unit_price_raw, unit_price_raw_2, cols = 2)

### 对数处理后
unit_price_log <- distribution_plot("unit_price", "No Skewness after log transformation", 
    log = TRUE)

unit_price_log2 <- ggplot(ml_data, aes(sample = log(unit_price))) + stat_qq() + stat_qq_line(color = "gray90", 
    size = 1.5, linetype = 2, alpha = 0.8) + labs(title = "QQplot", x = NULL, y = NULL)
Rmisc::multiplot(unit_price_log, unit_price_log2, cols = 2)

## area---------------------------------------------------------------------- 原始
area_raw <- distribution_plot("area", "Positive Skew in area")
area_raw2 <- ggplot(ml_data, aes(sample = area)) + stat_qq() + stat_qq_line(color = "gray90", 
    size = 1.5, linetype = 2, alpha = 0.8) + labs(title = "QQplot", x = NULL, y = NULL)
Rmisc::multiplot(area_raw, area_raw2, cols = 2)

### 对数处理后
area_log <- distribution_plot("area", "No Skewness after log transformation", log = T)
area_log2 <- ggplot(ml_data, aes(sample = log(area))) + stat_qq() + stat_qq_line(color = "gray90", 
    size = 1.5, linetype = 2, alpha = 0.8) + labs(title = "QQplot", x = NULL, y = NULL)
Rmisc::multiplot(area_log, area_log2, cols = 2)

## year---------------------------------------------------------------------
year_raw <- distribution_plot("year", "No transformation needed for house age")
year_raw2 <- ggplot(ml_data, aes(sample = year)) + stat_qq() + stat_qq_line(color = "gray90", 
    size = 1.5, linetype = 2, alpha = 0.8) + labs(title = "QQplot", x = NULL, y = NULL)
Rmisc::multiplot(year_raw, year_raw2, cols = 2)

## subscribe--------------------------------------------- 原始
sub_raw <- distribution_plot("subscribe", "Positive skew in subscribes")
sub_raw2 <- ggplot(ml_data, aes(sample = subscribe)) + stat_qq() + stat_qq_line(color = "gray90", 
    size = 1.5, linetype = 2, alpha = 0.8) + labs(title = "QQplot", x = NULL, y = NULL)
Rmisc::multiplot(sub_raw, sub_raw2, cols = 2)

### 对数处理后
sub_log <- distribution_plot("subscribe", "No Skewness after log transformation", 
    log = T)
sub_log2 <- ggplot(ml_data, aes(sample = log(subscribe))) + stat_qq() + stat_qq_line(color = "gray90", 
    size = 1.5, linetype = 2, alpha = 0.8) + labs(title = "QQplot", x = NULL, y = NULL)
Rmisc::multiplot(sub_log, sub_log2, cols = 2)

## total_floor wasn't presenting well-------
distribution_plot("total_floor", "", log = T)

Modeling

# Step 2.3 Rewrite.-------------------------------------------------------------
set.seed(4595)
data_split <- initial_split(ml_data, strata = "unit_price", p = 0.8)
traindata <- training(data_split)
testdata <- testing(data_split)

house_recipe <- training(data_split) %>% recipe(unit_price ~ .) %>% step_other(orientation) %>% 
    step_dummy(all_nominal()) %>% step_log(all_outcomes(), base = 10) %>% step_center(all_predictors()) %>% 
    step_scale(all_predictors()) %>% prep(training = traindata, retain = TRUE)

traindata <- juice(house_recipe)
testdata <- house_recipe %>% bake(testing(data_split))

# random forest
house_rf <- rand_forest(mode = "regression") %>% set_engine("ranger") %>% fit(unit_price ~ 
    ., data = traindata)
house_rf

parsnip model object

Fit time:  1.6s 
Ranger result

Call:
 ranger::ranger(formula = formula, data = data, num.threads = 1,      verbose = FALSE, seed = sample.int(10^5, 1)) 

Type:                             Regression 
Number of trees:                  500 
Sample size:                      4648 
Number of independent variables:  23 
Mtry:                             4 
Target node size:                 5 
Variable importance mode:         none 
Splitrule:                        variance 
OOB prediction error (MSE):       0.008722543 
R squared (OOB):                  0.6783138

# glmn
house_glmn <- linear_reg(penalty = 0.001, mixture = 0.5) %>% set_engine("glmnet") %>% 
    fit(unit_price ~ ., data = juice(house_recipe))
house_glmn

parsnip model object

Fit time:  14ms 

Call:  glmnet::glmnet(x = as.matrix(x), y = y, family = "gaussian",      alpha = ~0.5) 

   Df    %Dev   Lambda
1   0 0.00000 0.116700
2   1 0.01631 0.106300
3   2 0.03848 0.096900
4   2 0.06221 0.088290
5   2 0.08307 0.080450
6   2 0.10130 0.073300
7   2 0.11720 0.066790
8   4 0.13560 0.060860
9   4 0.15230 0.055450
10  5 0.16920 0.050520
11  5 0.18810 0.046040
12  6 0.20480 0.041950
13  6 0.21980 0.038220
14  7 0.23290 0.034820
15  8 0.24540 0.031730
16  8 0.25740 0.028910
17 11 0.27060 0.026340
18 11 0.28330 0.024000
19 11 0.29410 0.021870
20 11 0.30320 0.019930
21 12 0.31120 0.018160
22 13 0.31830 0.016540
23 13 0.32420 0.015070
24 14 0.32930 0.013740
25 14 0.33380 0.012520
 [ reached getOption("max.print") -- omitted 50 rows ]

# xgboost
house_xgb <- boost_tree(mode = "regression") %>% set_engine("xgboost") %>% fit(unit_price ~ 
    ., data = traindata)
test_results <- testdata %>% select(unit_price) %>% bind_cols(predict(house_rf, new_data = testdata) %>% 
    rename(RandomForest = .pred), predict(house_glmn, new_data = testdata) %>% rename(glmnet = .pred), 
    predict(house_xgb, new_data = testdata) %>% rename(xgb = .pred))
test_results

# A tibble: 1,159 x 4
   unit_price RandomForest glmnet   xgb
        <dbl>        <dbl>  <dbl> <dbl>
 1       4.55         4.56   4.72  4.56
 2       5.00         5.00   4.96  4.97
 3       4.66         4.66   4.67  4.64
 4       4.82         4.80   4.70  4.76
 5       4.84         4.84   4.82  4.82
 6       4.85         4.85   4.91  4.83
 7       4.48         4.48   4.60  4.48
 8       4.66         4.72   4.70  4.67
 9       4.59         4.87   4.76  4.83
10       4.64         4.66   4.66  4.65
# … with 1,149 more rows

# 
test_results %>% metrics(truth = unit_price, estimate = glmnet)

# A tibble: 3 x 3
  .metric .estimator .estimate
  <chr>   <chr>          <dbl>
1 rmse    standard       0.132
2 rsq     standard       0.333
3 mae     standard       0.103

test_results %>% metrics(truth = unit_price, estimate = RandomForest)

# A tibble: 3 x 3
  .metric .estimator .estimate
  <chr>   <chr>          <dbl>
1 rmse    standard      0.0994
2 rsq     standard      0.620 
3 mae     standard      0.0567

test_results %>% metrics(truth = unit_price, estimate = xgb)

# A tibble: 3 x 3
  .metric .estimator .estimate
  <chr>   <chr>          <dbl>
1 rmse    standard      0.104 
2 rsq     standard      0.598 
3 mae     standard      0.0670

test_results %>% gather(model, prediction, -unit_price) %>% ggplot(aes(x = prediction, 
    y = unit_price)) + geom_abline(col = "green", lty = 2) + geom_point(alpha = 0.4) + 
    facet_wrap(~model) + coord_fixed()