In this project we’ll use real estate listing description data from the CRMLS to predict the final sale price of a home. While ultimately a property’s physical characterisitics (bed, bath, squarefootage, location) are the primary determinants of a sale, perhaps a well phrased listing description may attract more interest in a home, which means more competing offers and a higher sale price.

Import Data -MLS Data -Extract listing description text -Merge with sentiment -Gen features -LM Baseline -Run XGBoost

-Followup projects, days on market

Data Preprocessing

Import Libraries

library(dplyr)

## Warning: package 'dplyr' was built under R version 3.5.1

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(tidytext)
library(rtweet)
library(ggplot2)
library(igraph)

## 
## Attaching package: 'igraph'

## The following objects are masked from 'package:dplyr':
## 
##     as_data_frame, groups, union

## The following objects are masked from 'package:stats':
## 
##     decompose, spectrum

## The following object is masked from 'package:base':
## 
##     union

library(ggraph)
library(devtools)
library(widyr)
library(maps)
library(tidyr)

## 
## Attaching package: 'tidyr'

## The following object is masked from 'package:igraph':
## 
##     crossing

Import Data & Extract Features

Riverside_Sales_2018 <- read.csv("~/ML_apps/listing_quality/Riverside_Sales_2018.csv", colClasses = "character")
#Remove excess headers
Riverside_Sales_2018 <- Riverside_Sales_2018 %>% 
  filter(PublicRemarks != "PublicRemarks")
#Format Parcel Number
Riverside_Sales_2018$ParcelNumber <- gsub('[[:punct:]]', '', Riverside_Sales_2018$ParcelNumber)

#Extract Listing Description and Index
publicRemarks <- Riverside_Sales_2018 %>%
  select(PublicRemarks, ParcelNumber )

Extract Words from Public Remarks

publicRemarks <- publicRemarks %>% 
    mutate(word = strsplit(as.character(PublicRemarks), " ")) %>% 
    unnest(word)

publicRemarks$word <- gsub('[[:punct:]]', '', publicRemarks$word)
publicRemarks$word = gsub("&amp", "", publicRemarks$word)
publicRemarks$word = gsub("(RT|via)((?:\\b\\W*@\\w+)+)", "", publicRemarks$word)
publicRemarks$word = gsub("@\\w+", "", publicRemarks$word)
publicRemarks$word = gsub("[[:punct:]]", "", publicRemarks$word)
publicRemarks$word = gsub("[[:digit:]]", "", publicRemarks$word)
publicRemarks$word = gsub("http\\w+", "", publicRemarks$word)
publicRemarks$word = gsub("[ \t]{2,}", "", publicRemarks$word)
publicRemarks$word = gsub("^\\s+|\\s+$", "", publicRemarks$word)

Merge Sentiments

#Get Sentiments
nrc <- get_sentiments("nrc") 
#Only two classes for "bing" positive/negative
bing <- get_sentiments("bing") 
#Merge Bing sentiment
words_bing <- inner_join(publicRemarks, bing, by = "word") 
words_bing <- words_bing %>%
  select(ParcelNumber, sentiment) %>%
  group_by(ParcelNumber) %>%
  count(sentiment)
words_bing <- words_bing %>% spread(sentiment,n)
#Replace NA values to 0
words_bing[is.na(words_bing)] <- 0
#rename positive/negative to positive/negative_bing because of duplicate columns in NRC
words_bing$positive_bing <- words_bing$positive
words_bing$negative_bing <- words_bing$negative
words_bing$negative <- NULL
words_bing$positive <- NULL



#Merge NRC Sentiment
words_nrc <- inner_join(publicRemarks, nrc, by = "word")
words_nrc <- words_nrc %>%
  select(ParcelNumber, sentiment) %>%
  group_by(ParcelNumber) %>%
  count(sentiment)
words_nrc <- words_nrc %>% spread(sentiment,n)
words_nrc[is.na(words_nrc)] <- 0

Extract Features from Listings data

listings <- Riverside_Sales_2018 %>%
  select(ParcelNumber, CurrentPrice, DaysOnMarket, ListPrice, ClosePrice)
as.data.frame(sort(colSums(sapply(listings, is.na)), decreasing = TRUE))

Merge Tidytext data with sales data

modeling <- inner_join(listings,words_bing,by="ParcelNumber")
modeling <- inner_join(modeling, words_nrc, by = "ParcelNumber")

Modeling

Feature Engineering

sale_list_ratio : Ratio of sale price to listing price. Values below 1 indicate that the property sold below asking (listing) price. While values above 1 typically indicate competing offers that drove the sale price above the asking (listing) price.
dom : days on market

modeling$sale_list_ratio <- as.numeric(modeling$ClosePrice) / as.numeric(modeling$ListPrice)
modeling$dom <- as.numeric(modeling$DaysOnMarket)

Baseline LM for Sale_List_Ratio

modeling_sale_list <- modeling %>%
  select(-ParcelNumber, -CurrentPrice, -DaysOnMarket, -ListPrice, -ClosePrice, -dom)
lm <- lm(sale_list_ratio ~ .,modeling_sale_list)
summary(lm)

## 
## Call:
## lm(formula = sale_list_ratio ~ ., data = modeling_sale_list)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -0.8922 -0.0143  0.0070  0.0161  9.2823 
## 
## Coefficients:
##                 Estimate Std. Error  t value Pr(>|t|)    
## (Intercept)    0.9854502  0.0005600 1759.649  < 2e-16 ***
## positive_bing  0.0007938  0.0001578    5.032 4.89e-07 ***
## negative_bing  0.0003491  0.0005336    0.654 0.512957    
## anger          0.0011924  0.0007951    1.500 0.133667    
## anticipation   0.0001867  0.0003069    0.608 0.543086    
## disgust       -0.0010419  0.0007471   -1.395 0.163160    
## fear          -0.0027638  0.0008109   -3.408 0.000654 ***
## joy           -0.0002858  0.0003355   -0.852 0.394272    
## negative      -0.0008303  0.0004998   -1.661 0.096657 .  
## positive      -0.0005333  0.0001496   -3.566 0.000363 ***
## sadness        0.0001683  0.0006257    0.269 0.787887    
## surprise      -0.0002054  0.0004552   -0.451 0.651924    
## trust          0.0003587  0.0003072    1.168 0.242946    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.08147 on 31435 degrees of freedom
##   (1 observation deleted due to missingness)
## Multiple R-squared:  0.001737,   Adjusted R-squared:  0.001356 
## F-statistic: 4.558 on 12 and 31435 DF,  p-value: 2.088e-07