Data Cleaning

knitr::opts_chunk$set(echo = TRUE)
suppressWarnings(library("jsonlite"))
suppressWarnings(library("lubridate"))
suppressWarnings(library("caTools"))
suppressWarnings(library("caret"))
json_file <- "C:/Users/dannyhuang/Desktop/train.json"

suppressWarnings(library("Hmisc"))

x1 <-fromJSON(json_file, simplifyDataFrame = TRUE)

df <- data.frame(bathrooms = as.numeric(unlist(x1$bathrooms))
                 ,bedrooms=as.numeric(unlist(x1$bedrooms))
                 ,building_id=as.factor(unlist(x1$building_id))
                 #,created=as.POSIXct(unlist(x1$created))
                 ,latitude=unlist(x1$latitude)
                 ,longitude=unlist(x1$longitude)
                 ,price=as.numeric(unlist(x1$price))
                 ,features_num = as.numeric(unlist(lapply(x1$features ,length)))
                 ,photo_num = as.numeric(unlist(lapply(x1$photos,length)))
                 ,interest_level=as.factor(unlist(x1$interest_level)))
plot(df$price)# 4 outliner

summary(df$price)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##      43    2500    3150    3830    4100 4490000

df[df== 4490000 ,]

##    bathrooms bedrooms building_id latitude longitude price features_num
## NA        NA       NA        <NA>       NA        NA    NA           NA
##    photo_num interest_level
## NA        NA           <NA>

df[df$price==4490000,]

##       bathrooms bedrooms                      building_id latitude
## 32611         1        2 cd25bbea2af848ebe9821da820b725da  40.7299
##       longitude   price features_num photo_num interest_level
## 32611  -74.0071 4490000            5         0            low

df[df$price==1150000,]

##       bathrooms bedrooms                      building_id latitude
## 12168         1        2 5d3525a5085445e7fcd64a53aac3cb0a  40.8011
##       longitude   price features_num photo_num interest_level
## 12168   -73.948 1150000            5         0            low

df[df$price==1070000,]

##       bathrooms bedrooms                      building_id latitude
## 55437         1        1 37385c8a58176b529964083315c28e32  40.7676
## 57803         1        1 37385c8a58176b529964083315c28e32  40.7676
##       longitude   price features_num photo_num interest_level
## 55437  -73.9844 1070000            3         0            low
## 57803  -73.9844 1070000            5         7            low

df["32611",] <-NA
df["12168",] <-NA
df["55437",]<-NA
df["57803",]<-NA
df<-na.omit(df)
str(df)

## 'data.frame':    49348 obs. of  9 variables:
##  $ bathrooms     : num  1 1 1 1.5 1 1 2 1 0 3 ...
##  $ bedrooms      : num  1 2 2 3 0 3 3 0 1 3 ...
##  $ building_id   : Factor w/ 7585 levels "0","00005cb939f9986300d987652c933e15",..: 3941 5494 6096 2432 5690 1390 375 6313 6232 2783 ...
##  $ latitude      : num  40.7 40.8 40.8 40.7 40.7 ...
##  $ longitude     : num  -74 -74 -74 -73.9 -74 ...
##  $ price         : num  2400 3800 3495 3000 2795 ...
##  $ features_num  : num  7 6 6 0 4 6 5 5 1 2 ...
##  $ photo_num     : num  12 6 6 5 4 5 7 5 4 11 ...
##  $ interest_level: Factor w/ 3 levels "high","low","medium": 3 2 3 3 2 2 2 1 2 2 ...
##  - attr(*, "na.action")=Class 'omit'  Named int [1:4] 4794 12948 22059 22969
##   .. ..- attr(*, "names")= chr [1:4] "12168" "32611" "55437" "57803"

plot(df$price)

Build Price Class, A~D : Price High~Price Low

summary(df$price)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##      43    2500    3150    3673    4100  135000

price_class <- function(price){
   
    
  
    if (price <= 2500 ){
        return("D")
    
   
    } else if (price>2500&price <=3150 ){
        return("C")
    
   
   } else if (price> 3150&price <=4100 ){
        return("B")
    
   
  }else if (price > 4100 ){
        return("A")
    
   
    }else{
        return("A")
    }
}

df$price_class<- sapply(df$price, function(x) price_class(x))
df$price_class<- as.factor(df$price_class)

Intlevel_class <- function(level){
   
    
  
    if (level == "high" ){
        return(3)
    
   
    } else if (level == "medium" ){
        return(2)
    
   
   } else if (level == "low" ){
        return(1)
  
    }
}
df$level_class<- sapply(df$interest_level, function(x) Intlevel_class(x))
df$level_class<- as.factor(df$level_class)
df$interest_level<-ordered(df$interest_level, levels = c("low","medium","high"))
str(df$interest_level)

##  Ord.factor w/ 3 levels "low"<"medium"<..: 2 1 2 2 1 1 1 3 1 1 ...

Multiple regression analysis/Rsquare by Price Class

library(purrr)
library(dplyr)
library(broom)
library(ggplot2)
library(MASS)
library("nnet")
library(rms)
model <- df %>% split(.$interest_level) %>% map(function(x) lm(price~bedrooms+bathrooms, data = x))
model %>% map(anova)

## $low
## Analysis of Variance Table
## 
## Response: price
##              Df     Sum Sq    Mean Sq F value    Pr(>F)    
## bedrooms      1 5.6489e+10 5.6489e+10   12618 < 2.2e-16 ***
## bathrooms     1 6.4059e+10 6.4059e+10   14309 < 2.2e-16 ***
## Residuals 34277 1.5345e+11 4.4769e+06                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## $medium
## Analysis of Variance Table
## 
## Response: price
##              Df     Sum Sq    Mean Sq F value    Pr(>F)    
## bedrooms      1 7439776012 7439776012   10725 < 2.2e-16 ***
## bathrooms     1 2140053891 2140053891    3085 < 2.2e-16 ***
## Residuals 11226 7787353149     693689                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## $high
## Analysis of Variance Table
## 
## Response: price
##             Df     Sum Sq    Mean Sq F value    Pr(>F)    
## bedrooms     1 1.9197e+09 1919660445 512.625 < 2.2e-16 ***
## bathrooms    1 3.2900e+08  328998315  87.856 < 2.2e-16 ***
## Residuals 3836 1.4365e+10    3744766                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

df %>% split(.$interest_level) %>% map(~mean(.$price))

## $low
## [1] 3950.132
## 
## $medium
## [1] 3158.767
## 
## $high
## [1] 2700.293

model %>% map(coefficients)

## $low
## (Intercept)    bedrooms   bathrooms 
##   -290.5605    322.7630   3028.6388 
## 
## $medium
## (Intercept)    bedrooms   bathrooms 
##    835.5088    496.2871   1304.4491 
## 
## $high
## (Intercept)    bedrooms   bathrooms 
##    850.8284    499.9551    964.2617

model1<-model%>% map(glance)
model1

## $low
##   r.squared adj.r.squared    sigma statistic p.value df    logLik      AIC
## 1 0.4399536     0.4399209 2115.863  13463.43       0  3 -311129.1 622266.3
##        BIC     deviance df.residual
## 1 622300.1 153453872784       34277
## 
## $medium
##   r.squared adj.r.squared  sigma statistic p.value df    logLik      AIC
## 1 0.5516053     0.5515254 832.88  6904.989       0  3 -91445.55 182899.1
##        BIC   deviance df.residual
## 1 182928.4 7787353149       11226
## 
## $high
##   r.squared adj.r.squared   sigma statistic       p.value df    logLik
## 1 0.1353506     0.1348998 1935.14  300.2403 7.230379e-122  3 -34499.11
##        AIC      BIC    deviance df.residual
## 1 69006.21 69031.22 14364921067        3836

mode1_rsq<- model1 %>%  map_dbl("r.squared")
mode1_rsq

##       low    medium      high 
## 0.4399536 0.5516053 0.1353506

Location mapping by price class , A~D : Price High to Price Low

library(ggmap)
library(plotly)


Sys.setenv('MAPBOX_TOKEN'= 'pk.eyJ1IjoiZGFubnlodWFuZyIsImEiOiJjajB2MHhxbXEwMzdpMzhscmcwbHZ0YXdtIn0.tvgmqm8TUM4gvdNGJAvNLA')
p <- df %>%
  plot_mapbox(lat = ~latitude, lon = ~longitude,
              split = ~price_class, size=2,
              mode = 'scattermapbox')
p

待租房資訊互動式圖表化/回歸分析

JHONG TING HUANG

2017年3月30日

變數解釋:

Data Cleaning

Build Price Class, A~D : Price High~Price Low

Multiple regression analysis/Rsquare by Price Class

Location mapping by price class , A~D : Price High to Price Low