House Price Insight

Obtain the data with Python

import requests
from  pyquery import PyQuery as pq
import pandas as pd
import time
#=================================================================================
def save_to_csv(filename,variblelist):
    df = pd.DataFrame([variblelist])
    #df.columns = headers
    df.to_csv('U:\cse\windows\zhang.11091\Rprojects\%s.csv'%filename, mode='a', header=False,encoding='GB18030')
    #csv_to_xls.csv_to_excel('%s'%filename)
#==================================================================================
def getdata(city, city_dict, p):
    # city为城市名称,对于上海、北京等城市需要输入'sh'或'bj'这样的开头首字母缩写
    # p为需要调取的页数
    start = time.clock()
    baseurl = 'https://%s.lianjia.com/ershoufang/pg{page}/'%city
    for i in range(1, p+1):
        url = baseurl.format(page=i)
        print(url)
        headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0;) Gecko/20100101 Firefox/62.0'}
        res = requests.get(url, headers=headers)
        doc = pq(res.text)
        for l in doc('.sellListContent li .info.clear').items(): #.items()好用
            title = l('.title').text()
            address = l('.address').text()
            houseinfo = l('.houseInfo').text().split('|')
            flood = l('.flood').text()
            follow = l('.followInfo').text()
            tprice = l('.priceInfo .totalPrice span').text()
            uprice = l('.priceInfo .unitPrice span').text()
            li = [title,flood,follow,tprice,uprice] + houseinfo
            save_to_csv('%s二手房数据'%city_dict[city],li)
            
    end = time.clock()
    print('耗时: %s Minutes' % round((end - start)/60))
#=================================================================================
city_dict = {'bj':'北京',
            'sh':'上海'}
getdata('bj', city_dict, 200)

Data

Initial Cleaning

fun_sub <- function(x){
  as.numeric(str_extract(str_split(x, "/")[[1]][1],'\\d+'))
}
fun_date <- function(x){
  str_extract(str_split(x, "/")[[1]][2],'(\\d+天|\\d+个月)')
}
fun_location_1 <- function(x){
  str_split(x, " - ")[[1]][1]
}
fun_location_2 <- function(x){
  str_split(x, " - ")[[1]][2]
}

data <- bj_2nd %>%
  mutate(
    # 建筑年份
         year = str_extract(year, '....'),
    # 每平米价格
         unit_price = as.numeric(str_extract(price_per_square, '\\d+')),
    # 几室
         room = as.integer(apply(bj_2nd["type"], 1, 
                                 function(x)  str_extract_all(x, "\\d")[[1]][1])),
    # 几厅
         living_room = as.integer(apply(bj_2nd["type"], 1, 
                                        function(x)  str_extract_all(x, "\\d")[[1]][2])),
    # 建筑面积
         area = as.numeric(str_extract(area, "[^平米]*")), # "\\d+[\\.]?\\d+"
    # 总价
         Total_Price = Total_Price * 10^4,
    # 关注人数
         subscribe = apply(bj_2nd['pub'], 1, fun_sub),
    # 发布时间
         publish_date = apply(bj_2nd['pub'], 1, fun_date),
    # 位置信息
         location_1 = apply(bj_2nd['short_desc'], 1, fun_location_1),
         location_2 = apply(bj_2nd['short_desc'], 1, fun_location_2)
         
  ) %>%
  select(-short_desc, -pub, -price_per_square, -type) %>%
  filter(str_detect(floor, "(中楼层|低楼层|高楼层|底层|顶层)") == T) %>%
  mutate(
    total_floor = str_extract(floor, "\\d+"),
    floor = str_extract(floor, "(中楼层|低楼层|高楼层|底层|顶层)")
  )

# 房屋朝向变量清洗
fun_ori <- function(x) {
  if (str_detect(x, '(南 北|北 南)')) {
    " 南 北"
  } else if (str_detect(x, '(南 西 北|南 北 西|西 南 北|西 北 南|北 南 西|北 西 南)')){
    '南 西 北'
  } else if (str_detect(x, '(东 西|西 东)')) {
    '东 西'
  } else if (str_detect(x, '(东 南 北|东 北 南|南 北 东|南 东 北|北 东 南|北 南 东)')){
    "东 南 北"
  } else if (str_detect(x, '(东南 北|北 东南)')) {
    "东南 北"
  } else if (str_detect(x, '(西南 北|北 西南)')) {
    "西南 北"
  } else if (str_detect(x, '(东 西 北|东 北 西|北 西 东|北 东 西|西 东 北|西 北 东)')){
    "东 西 北"
  } else if (str_detect(x, '(东 南 西|东 西 南|南 西 东|南 东 西|西 东 南|西 南 东)')){
    "东 南 西"
  } else if (str_detect(x, '(西 北|北 西)')) {
    "西 北"
  } else if (str_detect(x, '(西 东|东 西)')) {
    "西 东"
  } else if (str_detect(x, '(东南 西北|西北 东南)')) {
    "东南 西北"
  } else {
    x
  }
}
data$orientation <- apply(data['orientation'], 1, fun_ori)
ori <- data.frame(table(data$orientation)) %>% arrange(desc(Freq))
data <- data %>%
  filter(orientation %in% ori$Var1[1:14])

Geographic Information

Data Viz on Map

Data Preprocessing

Classes 'tbl_df', 'tbl' and 'data.frame':   5880 obs. of  11 variables:
 $ unit_price   : num  35371 100313 39850 46203 66790 ...
 $ area         : num  76.9 38.4 85.3 94.2 86.7 ...
 $ orientation  : chr  " 南 北" "北" " 南 北" " 南 北" ...
 $ decoration   : chr  "精装" "简装" "精装" "精装" ...
 $ floor        : chr  "中楼层" "中楼层" "底层" "中楼层" ...
 $ year         : chr  "1996" "2006" "1994" "2003" ...
 $ building_type: chr  "板楼" "塔楼" "板楼" "板楼" ...
 $ room         : int  2 1 3 2 2 2 2 3 2 1 ...
 $ living_room  : int  1 0 1 1 1 1 1 1 1 1 ...
 $ subscribe    : num  79 18 71 21 81 109 49 9 12 26 ...
 $ total_floor  : chr  "6" "26" "7" "6" ...

其他 毛坯 简装 精装 
   3  158 2286 3433 

    塔楼 暂无数据 板塔结合     板楼 
    1357       10     1102     3396 

中楼层 低楼层   底层   顶层 高楼层 
  2530   1226    503    504   1117 

   1    2    3    4    5    6 
1351 3445 1029   44    9    2 

   0    1    2    3    4 
 351 4983  540    5    1 
Classes 'tbl_df', 'tbl' and 'data.frame':   5807 obs. of  11 variables:
 $ unit_price   : num  35371 100313 39850 46203 66790 ...
 $ area         : num  76.9 38.4 85.3 94.2 86.7 ...
 $ orientation  : Factor w/ 14 levels " 南 北","东",..: 1 8 1 1 5 2 14 4 1 2 ...
 $ decoration   : Factor w/ 4 levels "精装","简装",..: 1 2 1 1 1 2 2 2 1 2 ...
 $ floor        : Factor w/ 5 levels "底层","低楼层",..: 3 3 1 3 3 4 3 2 1 3 ...
 $ year         : num  23 13 25 16 9 23 36 26 18 29 ...
 $ building_type: Factor w/ 4 levels "板楼","板塔结合",..: 1 3 1 1 2 3 3 1 1 3 ...
 $ room         : Factor w/ 3 levels "1","2","3": 2 1 3 2 2 2 2 3 2 1 ...
 $ living_room  : Factor w/ 3 levels "0","1","2": 2 1 2 2 2 2 2 2 2 2 ...
 $ subscribe    : num  79 18 71 21 81 109 49 9 12 26 ...
 $ total_floor  : int  6 26 7 6 11 16 16 6 6 18 ...
 - attr(*, "na.action")= 'omit' Named int  384 627 888 1047 1476 1591 1725 1990 2305 2390 ...
  ..- attr(*, "names")= chr  "384" "627" "888" "1047" ...

Modeling

parsnip model object

Fit time:  1.6s 
Ranger result

Call:
 ranger::ranger(formula = formula, data = data, num.threads = 1,      verbose = FALSE, seed = sample.int(10^5, 1)) 

Type:                             Regression 
Number of trees:                  500 
Sample size:                      4648 
Number of independent variables:  23 
Mtry:                             4 
Target node size:                 5 
Variable importance mode:         none 
Splitrule:                        variance 
OOB prediction error (MSE):       0.008722543 
R squared (OOB):                  0.6783138 
parsnip model object

Fit time:  14ms 

Call:  glmnet::glmnet(x = as.matrix(x), y = y, family = "gaussian",      alpha = ~0.5) 

   Df    %Dev   Lambda
1   0 0.00000 0.116700
2   1 0.01631 0.106300
3   2 0.03848 0.096900
4   2 0.06221 0.088290
5   2 0.08307 0.080450
6   2 0.10130 0.073300
7   2 0.11720 0.066790
8   4 0.13560 0.060860
9   4 0.15230 0.055450
10  5 0.16920 0.050520
11  5 0.18810 0.046040
12  6 0.20480 0.041950
13  6 0.21980 0.038220
14  7 0.23290 0.034820
15  8 0.24540 0.031730
16  8 0.25740 0.028910
17 11 0.27060 0.026340
18 11 0.28330 0.024000
19 11 0.29410 0.021870
20 11 0.30320 0.019930
21 12 0.31120 0.018160
22 13 0.31830 0.016540
23 13 0.32420 0.015070
24 14 0.32930 0.013740
25 14 0.33380 0.012520
 [ reached getOption("max.print") -- omitted 50 rows ]
# A tibble: 1,159 x 4
   unit_price RandomForest glmnet   xgb
        <dbl>        <dbl>  <dbl> <dbl>
 1       4.55         4.56   4.72  4.56
 2       5.00         5.00   4.96  4.97
 3       4.66         4.66   4.67  4.64
 4       4.82         4.80   4.70  4.76
 5       4.84         4.84   4.82  4.82
 6       4.85         4.85   4.91  4.83
 7       4.48         4.48   4.60  4.48
 8       4.66         4.72   4.70  4.67
 9       4.59         4.87   4.76  4.83
10       4.64         4.66   4.66  4.65
# … with 1,149 more rows
# A tibble: 3 x 3
  .metric .estimator .estimate
  <chr>   <chr>          <dbl>
1 rmse    standard       0.132
2 rsq     standard       0.333
3 mae     standard       0.103
# A tibble: 3 x 3
  .metric .estimator .estimate
  <chr>   <chr>          <dbl>
1 rmse    standard      0.0994
2 rsq     standard      0.620 
3 mae     standard      0.0567
# A tibble: 3 x 3
  .metric .estimator .estimate
  <chr>   <chr>          <dbl>
1 rmse    standard      0.104 
2 rsq     standard      0.598 
3 mae     standard      0.0670

2020-01-11