bathrooms : 該房間有多少廁所 bedrooms : 該房間有多少臥房 latitude : 該房間的緯度 longitude : 該房間的經度 price : 該房間的月租金 features_num : 該房間有多少特色數 photo_num : 該房間在網路上的照片數 interest_level : 客戶感興趣的程度(High,Medium,Low)
knitr::opts_chunk$set(echo = TRUE)
suppressWarnings(library("jsonlite"))
suppressWarnings(library("lubridate"))
suppressWarnings(library("caTools"))
suppressWarnings(library("caret"))
json_file <- "C:/Users/dannyhuang/Desktop/train.json"
suppressWarnings(library("Hmisc"))
x1 <-fromJSON(json_file, simplifyDataFrame = TRUE)
df <- data.frame(bathrooms = as.numeric(unlist(x1$bathrooms))
,bedrooms=as.numeric(unlist(x1$bedrooms))
,building_id=as.factor(unlist(x1$building_id))
#,created=as.POSIXct(unlist(x1$created))
,latitude=unlist(x1$latitude)
,longitude=unlist(x1$longitude)
,price=as.numeric(unlist(x1$price))
,features_num = as.numeric(unlist(lapply(x1$features ,length)))
,photo_num = as.numeric(unlist(lapply(x1$photos,length)))
,interest_level=as.factor(unlist(x1$interest_level)))
plot(df$price)# 4 outliner
summary(df$price)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 43 2500 3150 3830 4100 4490000
df[df== 4490000 ,]
## bathrooms bedrooms building_id latitude longitude price features_num
## NA NA NA <NA> NA NA NA NA
## photo_num interest_level
## NA NA <NA>
df[df$price==4490000,]
## bathrooms bedrooms building_id latitude
## 32611 1 2 cd25bbea2af848ebe9821da820b725da 40.7299
## longitude price features_num photo_num interest_level
## 32611 -74.0071 4490000 5 0 low
df[df$price==1150000,]
## bathrooms bedrooms building_id latitude
## 12168 1 2 5d3525a5085445e7fcd64a53aac3cb0a 40.8011
## longitude price features_num photo_num interest_level
## 12168 -73.948 1150000 5 0 low
df[df$price==1070000,]
## bathrooms bedrooms building_id latitude
## 55437 1 1 37385c8a58176b529964083315c28e32 40.7676
## 57803 1 1 37385c8a58176b529964083315c28e32 40.7676
## longitude price features_num photo_num interest_level
## 55437 -73.9844 1070000 3 0 low
## 57803 -73.9844 1070000 5 7 low
df["32611",] <-NA
df["12168",] <-NA
df["55437",]<-NA
df["57803",]<-NA
df<-na.omit(df)
str(df)
## 'data.frame': 49348 obs. of 9 variables:
## $ bathrooms : num 1 1 1 1.5 1 1 2 1 0 3 ...
## $ bedrooms : num 1 2 2 3 0 3 3 0 1 3 ...
## $ building_id : Factor w/ 7585 levels "0","00005cb939f9986300d987652c933e15",..: 3941 5494 6096 2432 5690 1390 375 6313 6232 2783 ...
## $ latitude : num 40.7 40.8 40.8 40.7 40.7 ...
## $ longitude : num -74 -74 -74 -73.9 -74 ...
## $ price : num 2400 3800 3495 3000 2795 ...
## $ features_num : num 7 6 6 0 4 6 5 5 1 2 ...
## $ photo_num : num 12 6 6 5 4 5 7 5 4 11 ...
## $ interest_level: Factor w/ 3 levels "high","low","medium": 3 2 3 3 2 2 2 1 2 2 ...
## - attr(*, "na.action")=Class 'omit' Named int [1:4] 4794 12948 22059 22969
## .. ..- attr(*, "names")= chr [1:4] "12168" "32611" "55437" "57803"
plot(df$price)
summary(df$price)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 43 2500 3150 3673 4100 135000
price_class <- function(price){
if (price <= 2500 ){
return("D")
} else if (price>2500&price <=3150 ){
return("C")
} else if (price> 3150&price <=4100 ){
return("B")
}else if (price > 4100 ){
return("A")
}else{
return("A")
}
}
df$price_class<- sapply(df$price, function(x) price_class(x))
df$price_class<- as.factor(df$price_class)
Intlevel_class <- function(level){
if (level == "high" ){
return(3)
} else if (level == "medium" ){
return(2)
} else if (level == "low" ){
return(1)
}
}
df$level_class<- sapply(df$interest_level, function(x) Intlevel_class(x))
df$level_class<- as.factor(df$level_class)
df$interest_level<-ordered(df$interest_level, levels = c("low","medium","high"))
str(df$interest_level)
## Ord.factor w/ 3 levels "low"<"medium"<..: 2 1 2 2 1 1 1 3 1 1 ...
library(purrr)
library(dplyr)
library(broom)
library(ggplot2)
library(MASS)
library("nnet")
library(rms)
model <- df %>% split(.$interest_level) %>% map(function(x) lm(price~bedrooms+bathrooms, data = x))
model %>% map(anova)
## $low
## Analysis of Variance Table
##
## Response: price
## Df Sum Sq Mean Sq F value Pr(>F)
## bedrooms 1 5.6489e+10 5.6489e+10 12618 < 2.2e-16 ***
## bathrooms 1 6.4059e+10 6.4059e+10 14309 < 2.2e-16 ***
## Residuals 34277 1.5345e+11 4.4769e+06
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## $medium
## Analysis of Variance Table
##
## Response: price
## Df Sum Sq Mean Sq F value Pr(>F)
## bedrooms 1 7439776012 7439776012 10725 < 2.2e-16 ***
## bathrooms 1 2140053891 2140053891 3085 < 2.2e-16 ***
## Residuals 11226 7787353149 693689
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## $high
## Analysis of Variance Table
##
## Response: price
## Df Sum Sq Mean Sq F value Pr(>F)
## bedrooms 1 1.9197e+09 1919660445 512.625 < 2.2e-16 ***
## bathrooms 1 3.2900e+08 328998315 87.856 < 2.2e-16 ***
## Residuals 3836 1.4365e+10 3744766
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
df %>% split(.$interest_level) %>% map(~mean(.$price))
## $low
## [1] 3950.132
##
## $medium
## [1] 3158.767
##
## $high
## [1] 2700.293
model %>% map(coefficients)
## $low
## (Intercept) bedrooms bathrooms
## -290.5605 322.7630 3028.6388
##
## $medium
## (Intercept) bedrooms bathrooms
## 835.5088 496.2871 1304.4491
##
## $high
## (Intercept) bedrooms bathrooms
## 850.8284 499.9551 964.2617
model1<-model%>% map(glance)
model1
## $low
## r.squared adj.r.squared sigma statistic p.value df logLik AIC
## 1 0.4399536 0.4399209 2115.863 13463.43 0 3 -311129.1 622266.3
## BIC deviance df.residual
## 1 622300.1 153453872784 34277
##
## $medium
## r.squared adj.r.squared sigma statistic p.value df logLik AIC
## 1 0.5516053 0.5515254 832.88 6904.989 0 3 -91445.55 182899.1
## BIC deviance df.residual
## 1 182928.4 7787353149 11226
##
## $high
## r.squared adj.r.squared sigma statistic p.value df logLik
## 1 0.1353506 0.1348998 1935.14 300.2403 7.230379e-122 3 -34499.11
## AIC BIC deviance df.residual
## 1 69006.21 69031.22 14364921067 3836
mode1_rsq<- model1 %>% map_dbl("r.squared")
mode1_rsq
## low medium high
## 0.4399536 0.5516053 0.1353506
library(ggmap)
library(plotly)
Sys.setenv('MAPBOX_TOKEN'= 'pk.eyJ1IjoiZGFubnlodWFuZyIsImEiOiJjajB2MHhxbXEwMzdpMzhscmcwbHZ0YXdtIn0.tvgmqm8TUM4gvdNGJAvNLA')
p <- df %>%
plot_mapbox(lat = ~latitude, lon = ~longitude,
split = ~price_class, size=2,
mode = 'scattermapbox')
p