In this project we’ll use real estate listing description data from the CRMLS to predict the final sale price of a home. While ultimately a property’s physical characterisitics (bed, bath, squarefootage, location) are the primary determinants of a sale, perhaps a well phrased listing description may attract more interest in a home, which means more competing offers and a higher sale price.
Import Data -MLS Data -Extract listing description text -Merge with sentiment -Gen features -LM Baseline -Run XGBoost
-Followup projects, days on market
Import Libraries
library(dplyr)
## Warning: package 'dplyr' was built under R version 3.5.1
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidytext)
library(rtweet)
library(ggplot2)
library(igraph)
##
## Attaching package: 'igraph'
## The following objects are masked from 'package:dplyr':
##
## as_data_frame, groups, union
## The following objects are masked from 'package:stats':
##
## decompose, spectrum
## The following object is masked from 'package:base':
##
## union
library(ggraph)
library(devtools)
library(widyr)
library(maps)
library(tidyr)
##
## Attaching package: 'tidyr'
## The following object is masked from 'package:igraph':
##
## crossing
Riverside_Sales_2018 <- read.csv("~/ML_apps/listing_quality/Riverside_Sales_2018.csv", colClasses = "character")
#Remove excess headers
Riverside_Sales_2018 <- Riverside_Sales_2018 %>%
filter(PublicRemarks != "PublicRemarks")
#Format Parcel Number
Riverside_Sales_2018$ParcelNumber <- gsub('[[:punct:]]', '', Riverside_Sales_2018$ParcelNumber)
#Extract Listing Description and Index
publicRemarks <- Riverside_Sales_2018 %>%
select(PublicRemarks, ParcelNumber )
publicRemarks <- publicRemarks %>%
mutate(word = strsplit(as.character(PublicRemarks), " ")) %>%
unnest(word)
publicRemarks$word <- gsub('[[:punct:]]', '', publicRemarks$word)
publicRemarks$word = gsub("&", "", publicRemarks$word)
publicRemarks$word = gsub("(RT|via)((?:\\b\\W*@\\w+)+)", "", publicRemarks$word)
publicRemarks$word = gsub("@\\w+", "", publicRemarks$word)
publicRemarks$word = gsub("[[:punct:]]", "", publicRemarks$word)
publicRemarks$word = gsub("[[:digit:]]", "", publicRemarks$word)
publicRemarks$word = gsub("http\\w+", "", publicRemarks$word)
publicRemarks$word = gsub("[ \t]{2,}", "", publicRemarks$word)
publicRemarks$word = gsub("^\\s+|\\s+$", "", publicRemarks$word)
#Get Sentiments
nrc <- get_sentiments("nrc")
#Only two classes for "bing" positive/negative
bing <- get_sentiments("bing")
#Merge Bing sentiment
words_bing <- inner_join(publicRemarks, bing, by = "word")
words_bing <- words_bing %>%
select(ParcelNumber, sentiment) %>%
group_by(ParcelNumber) %>%
count(sentiment)
words_bing <- words_bing %>% spread(sentiment,n)
#Replace NA values to 0
words_bing[is.na(words_bing)] <- 0
#rename positive/negative to positive/negative_bing because of duplicate columns in NRC
words_bing$positive_bing <- words_bing$positive
words_bing$negative_bing <- words_bing$negative
words_bing$negative <- NULL
words_bing$positive <- NULL
#Merge NRC Sentiment
words_nrc <- inner_join(publicRemarks, nrc, by = "word")
words_nrc <- words_nrc %>%
select(ParcelNumber, sentiment) %>%
group_by(ParcelNumber) %>%
count(sentiment)
words_nrc <- words_nrc %>% spread(sentiment,n)
words_nrc[is.na(words_nrc)] <- 0
listings <- Riverside_Sales_2018 %>%
select(ParcelNumber, CurrentPrice, DaysOnMarket, ListPrice, ClosePrice)
as.data.frame(sort(colSums(sapply(listings, is.na)), decreasing = TRUE))
modeling <- inner_join(listings,words_bing,by="ParcelNumber")
modeling <- inner_join(modeling, words_nrc, by = "ParcelNumber")
sale_list_ratio : Ratio of sale price to listing price. Values below 1 indicate that the property sold below asking (listing) price. While values above 1 typically indicate competing offers that drove the sale price above the asking (listing) price.
dom : days on market
modeling$sale_list_ratio <- as.numeric(modeling$ClosePrice) / as.numeric(modeling$ListPrice)
modeling$dom <- as.numeric(modeling$DaysOnMarket)
Baseline LM for Sale_List_Ratio
modeling_sale_list <- modeling %>%
select(-ParcelNumber, -CurrentPrice, -DaysOnMarket, -ListPrice, -ClosePrice, -dom)
lm <- lm(sale_list_ratio ~ .,modeling_sale_list)
summary(lm)
##
## Call:
## lm(formula = sale_list_ratio ~ ., data = modeling_sale_list)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.8922 -0.0143 0.0070 0.0161 9.2823
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.9854502 0.0005600 1759.649 < 2e-16 ***
## positive_bing 0.0007938 0.0001578 5.032 4.89e-07 ***
## negative_bing 0.0003491 0.0005336 0.654 0.512957
## anger 0.0011924 0.0007951 1.500 0.133667
## anticipation 0.0001867 0.0003069 0.608 0.543086
## disgust -0.0010419 0.0007471 -1.395 0.163160
## fear -0.0027638 0.0008109 -3.408 0.000654 ***
## joy -0.0002858 0.0003355 -0.852 0.394272
## negative -0.0008303 0.0004998 -1.661 0.096657 .
## positive -0.0005333 0.0001496 -3.566 0.000363 ***
## sadness 0.0001683 0.0006257 0.269 0.787887
## surprise -0.0002054 0.0004552 -0.451 0.651924
## trust 0.0003587 0.0003072 1.168 0.242946
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.08147 on 31435 degrees of freedom
## (1 observation deleted due to missingness)
## Multiple R-squared: 0.001737, Adjusted R-squared: 0.001356
## F-statistic: 4.558 on 12 and 31435 DF, p-value: 2.088e-07
Baseline LM for Sale_List_Ratio
modeling_dom <- modeling %>%
select(-ParcelNumber, -CurrentPrice, -DaysOnMarket, -ListPrice, -ClosePrice, -sale_list_ratio)
lm <- lm(dom ~ .,modeling_dom)
summary(lm)
##
## Call:
## lm(formula = dom ~ ., data = modeling_dom)
##
## Residuals:
## Min 1Q Median 3Q Max
## -170.20 -39.36 -20.27 16.84 1366.38
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 52.70352 0.44024 119.716 < 2e-16 ***
## positive_bing -1.19399 0.12401 -9.628 < 2e-16 ***
## negative_bing -1.26307 0.41943 -3.011 0.00260 **
## anger 2.00750 0.62496 3.212 0.00132 **
## anticipation 0.21151 0.24128 0.877 0.38071
## disgust 0.34117 0.58728 0.581 0.56129
## fear 5.12138 0.63736 8.035 9.66e-16 ***
## joy 0.32876 0.26374 1.247 0.21259
## negative -0.09633 0.39289 -0.245 0.80632
## positive 0.89494 0.11757 7.612 2.77e-14 ***
## sadness 1.93196 0.49177 3.929 8.56e-05 ***
## surprise -0.29851 0.35785 -0.834 0.40419
## trust -1.31099 0.24146 -5.429 5.70e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 64.04 on 31436 degrees of freedom
## Multiple R-squared: 0.009669, Adjusted R-squared: 0.009291
## F-statistic: 25.58 on 12 and 31436 DF, p-value: < 2.2e-16