In this project we’ll use real estate listing description data from the CRMLS to predict the final sale price of a home. While ultimately a property’s physical characterisitics (bed, bath, squarefootage, location) are the primary determinants of a sale, perhaps a well phrased listing description may attract more interest in a home, which means more competing offers and a higher sale price.

Import Data -MLS Data -Extract listing description text -Merge with sentiment -Gen features -LM Baseline -Run XGBoost

-Followup projects, days on market

Data Preprocessing

Import Libraries

library(dplyr)
## Warning: package 'dplyr' was built under R version 3.5.1
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(tidytext)
library(rtweet)
library(ggplot2)
library(igraph)
## 
## Attaching package: 'igraph'
## The following objects are masked from 'package:dplyr':
## 
##     as_data_frame, groups, union
## The following objects are masked from 'package:stats':
## 
##     decompose, spectrum
## The following object is masked from 'package:base':
## 
##     union
library(ggraph)
library(devtools)
library(widyr)
library(maps)
library(tidyr)
## 
## Attaching package: 'tidyr'
## The following object is masked from 'package:igraph':
## 
##     crossing

Import Data & Extract Features

Riverside_Sales_2018 <- read.csv("~/ML_apps/listing_quality/Riverside_Sales_2018.csv", colClasses = "character")
#Remove excess headers
Riverside_Sales_2018 <- Riverside_Sales_2018 %>% 
  filter(PublicRemarks != "PublicRemarks")
#Format Parcel Number
Riverside_Sales_2018$ParcelNumber <- gsub('[[:punct:]]', '', Riverside_Sales_2018$ParcelNumber)

#Extract Listing Description and Index
publicRemarks <- Riverside_Sales_2018 %>%
  select(PublicRemarks, ParcelNumber )

Extract Words from Public Remarks

publicRemarks <- publicRemarks %>% 
    mutate(word = strsplit(as.character(PublicRemarks), " ")) %>% 
    unnest(word)

publicRemarks$word <- gsub('[[:punct:]]', '', publicRemarks$word)
publicRemarks$word = gsub("&amp", "", publicRemarks$word)
publicRemarks$word = gsub("(RT|via)((?:\\b\\W*@\\w+)+)", "", publicRemarks$word)
publicRemarks$word = gsub("@\\w+", "", publicRemarks$word)
publicRemarks$word = gsub("[[:punct:]]", "", publicRemarks$word)
publicRemarks$word = gsub("[[:digit:]]", "", publicRemarks$word)
publicRemarks$word = gsub("http\\w+", "", publicRemarks$word)
publicRemarks$word = gsub("[ \t]{2,}", "", publicRemarks$word)
publicRemarks$word = gsub("^\\s+|\\s+$", "", publicRemarks$word)

Merge Sentiments

#Get Sentiments
nrc <- get_sentiments("nrc") 
#Only two classes for "bing" positive/negative
bing <- get_sentiments("bing") 
#Merge Bing sentiment
words_bing <- inner_join(publicRemarks, bing, by = "word") 
words_bing <- words_bing %>%
  select(ParcelNumber, sentiment) %>%
  group_by(ParcelNumber) %>%
  count(sentiment)
words_bing <- words_bing %>% spread(sentiment,n)
#Replace NA values to 0
words_bing[is.na(words_bing)] <- 0
#rename positive/negative to positive/negative_bing because of duplicate columns in NRC
words_bing$positive_bing <- words_bing$positive
words_bing$negative_bing <- words_bing$negative
words_bing$negative <- NULL
words_bing$positive <- NULL



#Merge NRC Sentiment
words_nrc <- inner_join(publicRemarks, nrc, by = "word")
words_nrc <- words_nrc %>%
  select(ParcelNumber, sentiment) %>%
  group_by(ParcelNumber) %>%
  count(sentiment)
words_nrc <- words_nrc %>% spread(sentiment,n)
words_nrc[is.na(words_nrc)] <- 0

Extract Features from Listings data

listings <- Riverside_Sales_2018 %>%
  select(ParcelNumber, CurrentPrice, DaysOnMarket, ListPrice, ClosePrice)
as.data.frame(sort(colSums(sapply(listings, is.na)), decreasing = TRUE))

Merge Tidytext data with sales data

modeling <- inner_join(listings,words_bing,by="ParcelNumber")
modeling <- inner_join(modeling, words_nrc, by = "ParcelNumber")

Modeling

Feature Engineering

  • sale_list_ratio : Ratio of sale price to listing price. Values below 1 indicate that the property sold below asking (listing) price. While values above 1 typically indicate competing offers that drove the sale price above the asking (listing) price.

  • dom : days on market

modeling$sale_list_ratio <- as.numeric(modeling$ClosePrice) / as.numeric(modeling$ListPrice)
modeling$dom <- as.numeric(modeling$DaysOnMarket)

Baseline LM for Sale_List_Ratio

modeling_sale_list <- modeling %>%
  select(-ParcelNumber, -CurrentPrice, -DaysOnMarket, -ListPrice, -ClosePrice, -dom)
lm <- lm(sale_list_ratio ~ .,modeling_sale_list)
summary(lm)
## 
## Call:
## lm(formula = sale_list_ratio ~ ., data = modeling_sale_list)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -0.8922 -0.0143  0.0070  0.0161  9.2823 
## 
## Coefficients:
##                 Estimate Std. Error  t value Pr(>|t|)    
## (Intercept)    0.9854502  0.0005600 1759.649  < 2e-16 ***
## positive_bing  0.0007938  0.0001578    5.032 4.89e-07 ***
## negative_bing  0.0003491  0.0005336    0.654 0.512957    
## anger          0.0011924  0.0007951    1.500 0.133667    
## anticipation   0.0001867  0.0003069    0.608 0.543086    
## disgust       -0.0010419  0.0007471   -1.395 0.163160    
## fear          -0.0027638  0.0008109   -3.408 0.000654 ***
## joy           -0.0002858  0.0003355   -0.852 0.394272    
## negative      -0.0008303  0.0004998   -1.661 0.096657 .  
## positive      -0.0005333  0.0001496   -3.566 0.000363 ***
## sadness        0.0001683  0.0006257    0.269 0.787887    
## surprise      -0.0002054  0.0004552   -0.451 0.651924    
## trust          0.0003587  0.0003072    1.168 0.242946    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.08147 on 31435 degrees of freedom
##   (1 observation deleted due to missingness)
## Multiple R-squared:  0.001737,   Adjusted R-squared:  0.001356 
## F-statistic: 4.558 on 12 and 31435 DF,  p-value: 2.088e-07

Baseline LM for Sale_List_Ratio

modeling_dom <- modeling %>%
  select(-ParcelNumber, -CurrentPrice, -DaysOnMarket, -ListPrice, -ClosePrice, -sale_list_ratio)
lm <- lm(dom ~ .,modeling_dom)
summary(lm)
## 
## Call:
## lm(formula = dom ~ ., data = modeling_dom)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -170.20  -39.36  -20.27   16.84 1366.38 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   52.70352    0.44024 119.716  < 2e-16 ***
## positive_bing -1.19399    0.12401  -9.628  < 2e-16 ***
## negative_bing -1.26307    0.41943  -3.011  0.00260 ** 
## anger          2.00750    0.62496   3.212  0.00132 ** 
## anticipation   0.21151    0.24128   0.877  0.38071    
## disgust        0.34117    0.58728   0.581  0.56129    
## fear           5.12138    0.63736   8.035 9.66e-16 ***
## joy            0.32876    0.26374   1.247  0.21259    
## negative      -0.09633    0.39289  -0.245  0.80632    
## positive       0.89494    0.11757   7.612 2.77e-14 ***
## sadness        1.93196    0.49177   3.929 8.56e-05 ***
## surprise      -0.29851    0.35785  -0.834  0.40419    
## trust         -1.31099    0.24146  -5.429 5.70e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 64.04 on 31436 degrees of freedom
## Multiple R-squared:  0.009669,   Adjusted R-squared:  0.009291 
## F-statistic: 25.58 on 12 and 31436 DF,  p-value: < 2.2e-16