Packages/Libraries

suppressMessages(
  {
    library(ggplot2)
    library(dplyr)
    library(leaflet)
  }
)

Data

# Load the Beijing housing transactions. The CSV appears to be GBK-encoded:
# read without an explicit encoding, the Chinese labels in the `floor` column
# are decoded byte-by-byte and print as mojibake, so the encoding is declared.
# NOTE(review): on R < 4.2 for Windows the native locale may be unable to
# represent these characters -- confirm on the target platform.
data <- read.csv("beijing.csv", fileEncoding = "GBK")
dim(data)
## [1] 318851     26
head(data)
##                                                  url           id      Lng
## 1 https://bj.lianjia.com/chengjiao/101084782030.html 101084782030 116.4755
## 2 https://bj.lianjia.com/chengjiao/101086012217.html 101086012217 116.4539
## 3 https://bj.lianjia.com/chengjiao/101086041636.html 101086041636 116.5620
## 4 https://bj.lianjia.com/chengjiao/101086406841.html 101086406841 116.4380
## 5 https://bj.lianjia.com/chengjiao/101086920653.html 101086920653 116.4284
## 6 https://bj.lianjia.com/chengjiao/101087277815.html 101087277815 116.4663
##        Lat          Cid  tradeTime  DOM followers totalPrice price square
## 1 40.01952 1.111027e+12 2016-08-09 1464       106      415.0 31680 131.00
## 2 39.88153 1.111027e+12 2016-07-28  903       126      575.0 43436 132.38
## 3 39.87714 1.111041e+12 2016-12-11 1271        48     1030.0 52021 198.00
## 4 40.07611 1.111043e+12 2016-09-30  965       138      297.5 22202 134.00
## 5 39.88623 1.111027e+12 2016-08-28  927       286      392.0 48396  81.00
## 6 39.99136 1.111027e+12 2016-07-22  861        57      275.6 52000  53.00
##   livingRoom drawingRoom kitchen bathRoom floor buildingType constructionTime
## 1          2           1       1        1 高 26            1             2005
## 2          2           2       1        2 高 22            1             2004
## 3          3           2       1        3  中 4            4             2005
## 4          3           1       1        1 底 21            1             2008
## 5          2           1       1        1  中 6            4             1960
## 6          1           0       1        1  中 8            4             2005
##   renovationCondition buildingStructure ladderRatio elevator fiveYearsProperty
## 1                   3                 6       0.217        1                 0
## 2                   4                 6       0.667        1                 1
## 3                   3                 6       0.500        1                 0
## 4                   1                 6       0.273        1                 0
## 5                   2                 2       0.333        0                 1
## 6                   3                 6       0.333        1                 1
##   subway district communityAverage
## 1      1        7            56021
## 2      0        7            71539
## 3      0        7            48160
## 4      0        6            51238
## 5      1        1            62588
## 6      0        7            67738
# List every column name of the loaded data frame
colnames(data)
##  [1] "url"                 "id"                  "Lng"                
##  [4] "Lat"                 "Cid"                 "tradeTime"          
##  [7] "DOM"                 "followers"           "totalPrice"         
## [10] "price"               "square"              "livingRoom"         
## [13] "drawingRoom"         "kitchen"             "bathRoom"           
## [16] "floor"               "buildingType"        "constructionTime"   
## [19] "renovationCondition" "buildingStructure"   "ladderRatio"        
## [22] "elevator"            "fiveYearsProperty"   "subway"             
## [25] "district"            "communityAverage"

Unique districts in the dataset

# Count how many distinct district codes occur in the data
district_codes <- unique(data[["district"]])
length(district_codes)
## [1] 13

Missing values

# Number of missing values in each column. vapply() guarantees exactly one
# integer per column, unlike sapply(), whose return type depends on its input.
na_counts <- vapply(data, function(x) sum(is.na(x)), integer(1))
# Show only the columns that actually contain missing values
na_counts[na_counts > 0]
##               DOM      buildingType          elevator fiveYearsProperty 
##            157977              2021                32                32 
##            subway  communityAverage 
##                32               463

Unique values in each column

# Number of distinct values in each column; unique() keeps NA, so a column
# with missing entries counts NA as one of its values. vapply() pins the
# result to one integer per column.
temp <- vapply(data, function(x) length(unique(factor(x))), integer(1))
# No `> 0` filter: every column of a non-empty data frame has at least one
# distinct value, so all columns are shown.
temp
##                 url                  id                 Lng                 Lat 
##              318851              318851                3995                3993 
##                 Cid           tradeTime                 DOM           followers 
##                4035                2560                 552                 547 
##          totalPrice               price              square          livingRoom 
##                5780               80042               18298                  11 
##         drawingRoom             kitchen            bathRoom               floor 
##                  22                   5                  18                 203 
##        buildingType    constructionTime renovationCondition   buildingStructure 
##                  13                  74                   5                   7 
##         ladderRatio            elevator   fiveYearsProperty              subway 
##                 183                   3                   3                   3 
##            district    communityAverage 
##                  13                4073

Distribution of price in each district

# One box plot per district shows the spread of `price` within it. An
# identity bar chart would stack one segment per row, so its bar heights
# would be per-district price totals rather than a distribution.
ggplot(
  data,
  aes(x = factor(district), y = price, fill = factor(district))
) +
  geom_boxplot() +
  labs(x = "district", y = "price", fill = "district")

Map view of districts

# Subset the data for the map plot. dplyr::select() can rename while
# selecting, so the coordinate columns get the lower-case names `lat` and
# `lng` in one step.
data_mp <- select(data, lat = Lat, lng = Lng, district)

# Clustered markers on an interactive tile map. No coordinates are passed to
# addMarkers(), so leaflet picks them up from the `lat`/`lng` columns.
data_mp %>%
  leaflet() %>%
  addTiles() %>%
  addMarkers(clusterOptions = markerClusterOptions())
## Assuming "lng" and "lat" are longitude and latitude, respectively