StreetEasy Real-Estate Data

The research and findings in the following project have been completed on behalf of StreetEasy.

Preprocessing Data

# Source archives in the StreetEasy market-data repository.
# Each URL is assembled with paste0() so that the string contains no line
# break: a quoted string that wraps onto a second line keeps the newline
# character, which makes the URL invalid.
url1 <- paste0(
  "https://s3.amazonaws.com/streeteasy-market-data-api/",
  "data_repository/E1_rentalInventory_All.zip"
)

url2 <- paste0(
  "https://s3.amazonaws.com/streeteasy-market-data-api/",
  "data_repository/A9_daysOnMarket_All.zip"
)

url3 <- paste0(
  "https://s3.amazonaws.com/streeteasy-market-data-api/",
  "data_repository/A6_medianSalePrice_All.zip"
)

url4 <- paste0(
  "https://streeteasy-market-data-api.s3.amazonaws.com/",
  "data_repository/E2_medianAskingRent_All.zip"
)

# A single scratch file is reused for all four downloads and deleted at the end.
temp <- tempfile()

# mode = "wb" requests a binary transfer, which zip archives require.
download.file(url4, temp, mode = "wb")
med_AskRent <- read.csv(unz(temp, "E2_medianAskingRent_All.csv"))

download.file(url3, temp, mode = "wb")
medSalePrice <- read.csv(unz(temp, "A6_medianSalePrice_All.csv"))

download.file(url2, temp, mode = "wb")
daysOnMarket <- read.csv(unz(temp, "A9_daysOnMarket_All.csv"))

download.file(url1, temp, mode = "wb")
totalRentInv <- read.csv(unz(temp, "E1_rentalInventory_All.csv"))

unlink(temp)

Remove NAs

# Drop every row that has a missing value in any column, table by table.
med_AskRent <- na.exclude(med_AskRent)
medSalePrice <- na.exclude(medSalePrice)
daysOnMarket <- na.exclude(daysOnMarket)
totalRentInv <- na.exclude(totalRentInv)

Median Sale Price

# Neighborhood-level sale prices, limited to areas that also occur in
# daysOnMarket.
medSalePriceSub <- subset(
  medSalePrice,
  medSalePrice$AreaType == "neighborhood" &
    as.character(medSalePrice$Area) %in% as.character(daysOnMarket$Area)
)

# Wide -> long: columns 4 onward are stacked into a (Date, value) pair.
medSalePriceLong <- gather(
  medSalePriceSub, Date, "Median Sale Price",
  4:ncol(medSalePrice),
  factor_key = TRUE
)
medSalePriceLong <- medSalePriceLong[with(medSalePriceLong, order(Area, Date)), ]

# Convert the former column header to a Date: strip the first "X", replace the
# first "." with "-", then append "-1" so each month maps to its first day.
# (Headers are presumably of the form XYYYY.MM - confirm against the CSV.)
medSalePriceLong$Date <- sub("X", "", medSalePriceLong$Date)
medSalePriceLong$Date <- as.Date(
  paste(sub(".", "-", medSalePriceLong$Date, fixed = TRUE), 1, sep = "-"),
  "%Y-%m-%d"
)

# Date-first column order for the zoo representation.
medSalePrice_zoo <- as.zooreg(medSalePriceLong[, c(4, 1, 2, 3, 5)])

# Date x Area table and its xts counterpart, ordered by Date.
medSalePriceCast <- cast(medSalePriceLong, Date ~ Area)
medSalePrice_ts <- xts(medSalePriceCast, order.by = medSalePriceCast$Date)

Median Asking Rent

# Neighborhood-level asking rents, limited to areas that also occur in
# daysOnMarket.
medAskRent <- subset(
  med_AskRent,
  med_AskRent$AreaType == "neighborhood" &
    as.character(med_AskRent$Area) %in% as.character(daysOnMarket$Area)
)

# Wide -> long: columns 4 onward are stacked into a (Date, value) pair.
medAskRentLong <- gather(
  medAskRent, Date, "Median Asking Rent",
  4:ncol(med_AskRent),
  factor_key = TRUE
)
medAskRentLong <- medAskRentLong[with(medAskRentLong, order(Area, Date)), ]
# Renumber the rows after sorting.
rownames(medAskRentLong) <- 1:nrow(medAskRentLong)

# Convert the former column header to a Date: strip the first "X", replace the
# first "." with "-", then append "-1" so each month maps to its first day.
medAskRentLong$Date <- sub("X", "", medAskRentLong$Date)
medAskRentLong$Date <- as.Date(
  paste(sub(".", "-", medAskRentLong$Date, fixed = TRUE), 1, sep = "-"),
  "%Y-%m-%d"
)

# Date-first column order for the zoo representation.
med_AskRent_zoo <- as.zooreg(medAskRentLong[, c(4, 1, 2, 3, 5)])

# Date x Area table and its xts counterpart, ordered by Date.
medAskRentCast <- cast(medAskRentLong, Date ~ Area)
medAskRent_ts <- xts(medAskRentCast, order.by = medAskRentCast$Date)

Days on Market (i.e., housing supply)

# Days-on-market rows for the areas retained in medAskRent (which has already
# been limited to neighborhoods).
daysOnMktSub <- subset(
  daysOnMarket,
  as.character(daysOnMarket$Area) %in% as.character(medAskRent$Area)
)

# Wide -> long: columns 4 onward are stacked into a (Date, value) pair.
daysOnMarketLong <- gather(
  daysOnMktSub, Date, "Days on Market",
  4:ncol(daysOnMarket),
  factor_key = TRUE
)
daysOnMarketLong <- daysOnMarketLong[order(daysOnMarketLong$Area, daysOnMarketLong$Date), ]

# Renumber the rows of the sorted long table. This previously renumbered the
# raw `daysOnMarket` table, leaving the long table with shuffled row names,
# unlike the asking-rent section.
row.names(daysOnMarketLong) <- seq_len(NROW(daysOnMarketLong))

# Convert the former column header to a Date: strip the first "X", replace the
# first "." with "-", then append "-1" so each month maps to its first day.
daysOnMarketLong$Date <- sub("X", "", daysOnMarketLong[, 4])
daysOnMarketLong$Date <- sub(".", "-", daysOnMarketLong[, 4], fixed = TRUE)
daysOnMarketLong$Date <- as.Date(paste(daysOnMarketLong$Date, 1, sep = "-"), "%Y-%m-%d")

# Date-first column order for the zoo representation.
daysOnMarket_zoo <- as.zooreg(daysOnMarketLong[, c(4, 1, 2, 3, 5)])

# Date x Area table; this series is indexed by year-month rather than by Date.
daysOnMarketCast <- cast(daysOnMarketLong, Date ~ Area)
daysOnMarketCast$Date <- as.yearmon(as.character(daysOnMarketCast$Date))
daysOnMarket_ts <- as.xts(daysOnMarketCast, order.by = as.yearmon(daysOnMarketCast$Date))

Total Rental Inventory

# Neighborhood-level rental inventory, limited to areas that also occur in
# daysOnMarket.
ttlRentInv <- subset(
  totalRentInv,
  totalRentInv$AreaType == "neighborhood" &
    as.character(totalRentInv$Area) %in% as.character(daysOnMarket$Area)
)

# Wide -> long: columns 4 onward are stacked into a (Date, value) pair.
# The value column name keeps its leading space; later code refers to it
# with that exact spelling.
ttlRentIvLong <- gather(
  ttlRentInv, Date, " Total Rent Inventory",
  4:ncol(totalRentInv),
  factor_key = TRUE
)
ttlRentIvLong <- ttlRentIvLong[with(ttlRentIvLong, order(Area, Date)), ]

# Convert the former column header to a Date: strip the first "X", replace the
# first "." with "-", then append "-1" so each month maps to its first day.
ttlRentIvLong$Date <- sub("X", "", ttlRentIvLong$Date)
ttlRentIvLong$Date <- as.Date(
  paste(sub(".", "-", ttlRentIvLong$Date, fixed = TRUE), 1, sep = "-"),
  "%Y-%m-%d"
)

# Date-first column order for the zoo representation.
ttlRentInv_zoo <- as.zooreg(ttlRentIvLong[, c(4, 1, 2, 3, 5)])

# Date x Area table and its xts counterpart, ordered by Date.
ttlRentIvCast <- cast(ttlRentIvLong, Date ~ Area)
ttlRentInv_ts <- as.xts(ttlRentIvCast, order.by = ttlRentIvCast$Date)

Price-Rent Ratio calculation

# Numeric matrices of the monthly asking-rent and sale-price series.
m1 <- apply(as.matrix.noquote(medAskRent_ts), MARGIN = 2, FUN = as.numeric)
m2 <- apply(as.matrix.noquote(medSalePrice_ts), MARGIN = 2, FUN = as.numeric)

# Sale price divided by twelve months of rent, element by element.
# The division is carried out on the transposes and transposed back.
m <- t(m2) / t(12 * m1)
priceRentRatio <- xts(t(m), order.by = medAskRentCast$Date)

# Stack columns 1 to 9 of the ratio into a long (key, value) table.
priceRentIndx <- gather(
  as.data.frame(priceRentRatio), Date, "Price Rent Index",
  1:9,
  factor_key = TRUE
)

Monthly growth rate for housing supply

# Month-over-month relative change: first difference divided by the value one
# period earlier.
diffDaysOnMkt_1 <- diff(daysOnMarket_ts) / lag.xts(daysOnMarket_ts, k = 1)

# Stack columns 1 to 9 into a long (key, value) table.
diffDaysOnMkt <- gather(
  as.data.frame(diffDaysOnMkt_1), Date, "Month Over Month Homes Supply",
  1:9,
  factor_key = TRUE
)

# Undefined growth values (such as the first month of each series) become 0.
diffDaysOnMkt[is.na(diffDaysOnMkt)] <- 0

Panel data

# Assemble the panel: one row per (Area, month), starting from the
# days-on-market long table with Date moved to the first column.
#
# The other measures are attached with cbind(), which matches rows purely by
# position. Check that every long table lists the same Area/Date sequence, so
# that a mismatch stops the script instead of silently pairing values from
# different neighborhoods or months.
stopifnot(
  identical(as.character(medAskRentLong$Area), as.character(daysOnMarketLong$Area)),
  identical(as.character(medSalePriceLong$Area), as.character(daysOnMarketLong$Area)),
  identical(as.character(ttlRentIvLong$Area), as.character(daysOnMarketLong$Area)),
  identical(medAskRentLong$Date, daysOnMarketLong$Date),
  identical(medSalePriceLong$Date, daysOnMarketLong$Date),
  identical(ttlRentIvLong$Date, daysOnMarketLong$Date)
)

panelData <- daysOnMarketLong[, c(4, 1:3, 5)]

panelData <- cbind.data.frame(
  panelData,
  medAskRentLong$`Median Asking Rent`,
  medSalePriceLong$`Median Sale Price`,
  ttlRentIvLong$` Total Rent Inventory`
)

colnames(panelData) <- c(
  "Date", "Area", "Boro", "Area Type", "Days on Market",
  "Median Asking Rent", "Median Sale Price", "Total Rent Inventory"
)

# Dates are stored as year-month in the panel.
panelData$Date <- as.yearmon(as.character(daysOnMarketLong$Date))

# Second column of priceRentIndx holds the ratio values.
panelData$`Price Rent Index` <- priceRentIndx[, 2]

panelData$`Home Supply Growth` <- diffDaysOnMkt$`Month Over Month Homes Supply`

# Rebuild the factor so it carries only the areas present in the panel.
panelData$Area <- factor(panelData$Area)

rownames(panelData) <- seq_len(NROW(panelData))

# Integer bin codes: inventory split into 100 intervals, price-rent index
# into 30. (A former self-assignment of `Median Asking Rent` was a no-op and
# has been removed.)
panelData$indexTRI <- cut(panelData$`Total Rent Inventory`, breaks = 100, labels = FALSE)
panelData$indexPI <- cut(panelData$`Price Rent Index`, breaks = 30, labels = FALSE)
head(panelData)

##       Date           Area      Boro    Area Type Days on Market
## 1 Jan 2010 Central Harlem Manhattan neighborhood          207.0
## 2 Feb 2010 Central Harlem Manhattan neighborhood          210.0
## 3 Mar 2010 Central Harlem Manhattan neighborhood          193.0
## 4 Apr 2010 Central Harlem Manhattan neighborhood           79.0
## 5 May 2010 Central Harlem Manhattan neighborhood          105.0
## 6 Jun 2010 Central Harlem Manhattan neighborhood          206.5
##   Median Asking Rent Median Sale Price Total Rent Inventory
## 1             1992.5            366022                  412
## 2             1995.0            394887                  382
## 3             1895.0            393025                  333
## 4             1897.5            332085                  298
## 5             1900.0            308763                  272
## 6             1950.0            531000                  309
##   Price Rent Index Home Supply Growth indexTRI indexPI
## 1         15.30832         0.00000000        8       5
## 2         16.49486         0.01449275        7       6
## 3         17.28342        -0.08095238        6       6
## 4         14.58432        -0.59067358        5       5
## 5         13.54224         0.32911392        4       4
## 6         22.69231         0.96666667        5       9

# Scatter plot of median asking rent (y) against days on market (x). Area is
# also part of the formula and the legend is titled "Neighborhoods"; the
# regression line and grid are switched off.
# NOTE(review): the `+ Area - 1` formula form and `reg.line` depend on the
# installed scatterplot() implementation - confirm grouping is applied as intended.
scatterplot(panelData$`Median Asking Rent` ~panelData$`Days on Market` + panelData$Area-1, reg.line=F, xlab = "Days on Market", ylab = "Median Asking Rent", legend.title = "Neighborhoods", grid =F )

Covariance tests: Fixed Effects vs. Random Effects vs. Partial Pooling

# Baseline linear model: price-rent index regressed on days on market, with no
# use of the panel (Area/Date) structure.
olS <-lm(panelData$`Price Rent Index`~ panelData$`Days on Market`)
# Coefficient table rounded to three digits.
kable(summary(olS)$coef, digits=3)

	Estimate	Std. Error	t value	Pr(>\|t\|)
(Intercept)	25.963	0.466	55.769	0
panelData$`Days on Market`	-0.022	0.005	-4.008	0

# Same regression fitted with plm's "within" model, with the panel indexed by
# Area (individual) and Date (time).
fixed<- plm(panelData$`Price Rent Index`~  panelData$`Days on Market`,index = c('Area','Date'), data = panelData, model = 'within')
# Coefficient table rounded to three digits.
kable(summary(fixed)$coef, digits=3)

	Estimate	Std. Error	t-value	Pr(>\|t\|)
panelData$`Days on Market`	-0.008	0.005	-1.635	0.102

# F test for individual effects: the within model against the plain lm fit.
pFtest(fixed, olS, lower.tail=T) # p-value < .05: prefer fixed effects (FE) over the pooled fit

## 
##  F test for individual effects
## 
## data:  panelData$`Price Rent Index` ~ panelData$`Days on Market`
## F = 51.45, df1 = 8, df2 = 863, p-value < 2.2e-16
## alternative hypothesis: significant effects

# Same regression fitted with plm's "random" model on the Area/Date panel.
random <- plm(panelData$`Price Rent Index`~ panelData$`Days on Market`, index = c('Area', 'Date'), data = panelData, model = 'random')

# Coefficient table rounded to three digits.
kable(summary(random)$coef, digits=3)

	Estimate	Std. Error	t-value	Pr(>\|t\|)
(Intercept)	24.926	1.277	19.525	0.000
panelData$`Days on Market`	-0.008	0.005	-1.703	0.089

# Plot the residuals of the random-effects fit.
# NOTE(review): only the residual vector is passed, so the x axis is likely the
# observation index rather than days on market despite the xlab - confirm.
# NOTE(review): within/pooling/random look like arguments of the plot method
# for a fitted plm object, not for a residual series - confirm they take effect.
plot(random$residuals, within = F, pooling = F, random = T, 
    xlab = "Days on Market", ylab = "Residuals", sub = "Panel effect")

# Hausman test comparing the within and random fits.
phtest(fixed,random) # p-value > .05: keep random effects (RE)

## 
##  Hausman Test
## 
## data:  panelData$`Price Rent Index` ~ panelData$`Days on Market`
## chisq = 1.3684, df = 1, p-value = 0.2421
## alternative hypothesis: one model is inconsistent

# Same regression fitted with plm's "pooling" model on the Area/Date panel.
pool<- plm(panelData$`Price Rent Index`~ panelData$`Days on Market`, index = c('Area','Date'), data = panelData, model = 'pooling')

# Coefficient table rounded to three digits.
kable(summary(pool)$coef, digits=3)

	Estimate	Std. Error	t-value	Pr(>\|t\|)
(Intercept)	25.963	0.466	55.769	0
panelData$`Days on Market`	-0.022	0.005	-4.008	0

# Breusch-Pagan Lagrange multiplier test on the pooled fit.
plmtest(pool, type = c("bp")) # p-value < .05: H0 of no panel effect is rejected, so choose RE

## 
##  Lagrange Multiplier Test - (Breusch-Pagan) for balanced panels
## 
## data:  panelData$`Price Rent Index` ~ panelData$`Days on Market`
## chisq = 4013.4, df = 1, p-value < 2.2e-16
## alternative hypothesis: significant effects

# Breusch-Pagan LM test for cross-sectional dependence, run on the within fit.
pcdtest(fixed, test = "lm") # p-value < .05: cross-sectional dependence is present

## 
##  Breusch-Pagan LM test for cross-sectional dependence in panels
## 
## data:  panelData$`Price Rent Index` ~ panelData$`Days on Market`
## chisq = 408.13, df = 36, p-value < 2.2e-16
## alternative hypothesis: cross-sectional dependence

# Coefficient t tests for the random-effects fit, using the covariance matrix
# returned by vcovHC() (type "HC0", method "white1") instead of the default one.
coeftest(random, vcovHC(random, type = 'HC0', method = 'white1'))

## 
## t test of coefficients:
## 
##                              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                24.9262562  1.2691316 19.6404  < 2e-16 ***
## panelData$`Days on Market` -0.0083689  0.0043462 -1.9256  0.05448 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Construct IV using random effects model

# Predictions from the random-effects fit; they are stored both as a
# standalone vector and as a new `iv1` column of the panel.
iv1 <- predict(random)

panelData$iv1 <- iv1

Plots for exploration

# One rainbow colour per column of the asking-rent series, then a time-series
# plot of those columns.
tsRainbow <- rainbow(ncol(medAskRent_ts))
xyplot.ts(medAskRent_ts, col = tsRainbow, ylab = "Median Asking Rent")

# Pairwise plot matrix of the monthly growth rates, after dropping rows that
# contain NA.
diffDaysOnMkt_1 %>% as.data.frame() %>% na.omit() %>% GGally::ggpairs(axisLabels = "none", title = "Days on Market Monthly Growth Rate")

Sampling for softmax regression

# Reproducible 70/30 split of the panel rows.
set.seed(3214)
trainingRows <- sample(seq_len(nrow(panelData)), size = .7 * nrow(panelData))

# Training rows in their original order; every remaining row forms the test set.
trainS <- panelData[sort(trainingRows), ]
testS <- panelData[-trainingRows, ]

# Number of training observations per neighborhood.
table(trainS$Area)

## 
##    Central Harlem           Chelsea Greenwich Village      Midtown East 
##                68                72                69                68 
##      Midtown West        Park Slope   Upper East Side   Upper West Side 
##                71                69                61                66 
##      Williamsburg 
##                67

Standardize variables for classification algorithm

# Min-max scaling of a numeric vector onto [0, 1].
#
# x      Numeric vector.
# na.rm  If TRUE, missing values are ignored when locating the minimum and
#        maximum (they stay NA in the result). The default FALSE keeps the
#        earlier behaviour: any NA in `x` makes the whole result NA.
#
# Returns a numeric vector of the same length as `x`. An empty vector is
# returned unchanged, and a constant vector maps to zeros rather than the
# NaN that 0 / 0 would produce.
stdZ01 <- function(x, na.rm = FALSE) {
  if (length(x) == 0L) {
    return(x)
  }
  lo <- min(x, na.rm = na.rm)
  span <- max(x, na.rm = na.rm) - lo
  # Zero range: every (non-missing) value equals the minimum.
  if (!is.na(span) && span == 0) {
    return(x - lo)
  }
  (x - lo) / span
}
# Rescale columns 5-10 and 13 with stdZ01, then re-attach columns 1-2 and
# 11-12 unscaled.
# NOTE(review): the test set is scaled with its own minimum and maximum rather
# than the training set's - confirm this is intended.
train <- cbind.data.frame(
  apply(trainS[, c(5:10, 13)], 2, stdZ01),
  trainS[, c(1:2, 11:12)]
)


test <- cbind.data.frame(
  apply(testS[, c(5:10, 13)], 2, stdZ01),
  testS[, c(1:2, 11:12)]
)

Multinomial log-linear models via neural networks

# The `contrasts` option names the contrast *functions* used for unordered and
# ordered factors. It previously held two data column names, which makes any
# later model with a factor predictor fail when R looks up a function by that
# name. Use treatment contrasts for unordered factors and polynomial contrasts
# for ordered ones.
options(contrasts = c("contr.treatment", "contr.poly"))

# Model 1: multinomial model of Area on days on market, the inventory bin
# index, and their interaction; up to 500 iterations.
mod1<- multinom(Area ~`Days on Market` +  `Days on Market`*indexTRI +indexTRI, data = train, maxit=500) 

# Model summary and variable-importance scores.
summary(mod1)
varImp(mod1)

# Model 2: start from Area ~ indexTRI, then update() adds iv1 and the
# iv1:indexTRI interaction, raises the iteration cap to 500 and requests the
# Hessian.
mod2<- multinom(Area ~  indexTRI  , data = train) %>%
  update(.~.  + iv1*indexTRI + iv1, maxit=500, Hess= T)

# Model summary and variable-importance scores.
summary(mod2)
varImp(mod2)

# HTML regression table for model 1, written to mod1.htm; the intercept is
# listed first and model numbers are suppressed.
stargazer(
  mod1,
  type = "html",
  out = "mod1.htm",
  intercept.bottom = FALSE,
  model.numbers = FALSE,
  align = TRUE
)


	Dependent variable:

	Chelsea	Greenwich Village	Midtown East	Midtown West	Park Slope	Upper East Side	Upper West Side	Williamsburg

Constant	1.685^**	5.886^***	-5.951^***	-2.070^***	3.380^***	-20.775^***	-11.864^***	-0.127
	(0.683)	(0.827)	(1.109)	(0.745)	(0.715)	(2.929)	(2.265)	(0.733)

`Days on Market`	-7.224^***	-19.096^***	1.112	-1.351	-8.180^***	8.973	-0.701	-5.926^**
	(2.338)	(3.489)	(2.830)	(1.966)	(2.871)	(9.706)	(9.495)	(2.558)

indexTRI	-0.040	-0.336^***	0.170^***	0.071^**	-0.072^*	0.371^***	0.286^***	0.036
	(0.036)	(0.062)	(0.038)	(0.033)	(0.040)	(0.057)	(0.053)	(0.034)

`Days on Market`:indexTRI	0.224	0.839^***	0.234^*	0.253^**	-0.529^*	0.425^*	0.278	0.226
	(0.156)	(0.295)	(0.140)	(0.127)	(0.271)	(0.230)	(0.234)	(0.146)


Akaike Inf. Crit.	1,754.090	1,754.090	1,754.090	1,754.090	1,754.090	1,754.090	1,754.090	1,754.090

Note:	^*p<0.1; ^**p<0.05; ^***p<0.01

# HTML regression table for model 2, written to mod2.htm; the intercept is
# listed first and model numbers are suppressed.
stargazer(
  mod2,
  type = "html",
  out = "mod2.htm",
  intercept.bottom = FALSE,
  model.numbers = FALSE,
  align = TRUE
)


	Dependent variable:

	Chelsea	Greenwich Village	Midtown East	Midtown West	Park Slope	Upper East Side	Upper West Side	Williamsburg

Constant	-4.903^***	-10.225^***	-3.830^**	-2.752^**	-4.349^***	-7.803^*	-15.790^**	-5.277^***
	(1.522)	(2.099)	(1.800)	(1.376)	(1.545)	(4.720)	(6.967)	(1.704)

indexTRI	0.490^***	0.790^***	0.549^***	0.450^***	0.321^**	0.850^***	0.917^***	0.526^***
	(0.127)	(0.188)	(0.123)	(0.117)	(0.144)	(0.158)	(0.187)	(0.125)

iv1	6.113^***	16.639^***	-3.379	-0.154	6.806^***	-14.580^**	1.910	4.439^*
	(2.175)	(2.998)	(2.710)	(2.021)	(2.209)	(6.880)	(9.409)	(2.398)

indexTRI:iv1	-0.588^***	-1.278^***	-0.389^**	-0.387^***	-0.470^**	-0.496^**	-0.670^***	-0.534^***
	(0.161)	(0.258)	(0.154)	(0.146)	(0.183)	(0.199)	(0.239)	(0.156)


Akaike Inf. Crit.	1,853.530	1,853.530	1,853.530	1,853.530	1,853.530	1,853.530	1,853.530	1,853.530

Note:	^*p<0.1; ^**p<0.05; ^***p<0.01

# Exponentiate the model 1 coefficients: the fitted coefficients are on the
# log scale, so the table holds ratios, one row per non-reference class.
oddsRatioTable1 <- exp(coef(mod1))
kable(oddsRatioTable1, digits = 5, align = "c") # exponentiated coefficients (odds ratios), not log-odds

	(Intercept)	`Days on Market`	indexTRI	`Days on Market`:indexTRI
Chelsea	5.39237	0.00073	0.96083	1.25101
Greenwich Village	359.83946	0.00000	0.71441	2.31483
Midtown East	0.00260	3.03910	1.18535	1.26317
Midtown West	0.12618	0.25892	1.07317	1.28728
Park Slope	29.38536	0.00028	0.93064	0.58925
Upper East Side	0.00000	7887.04534	1.44874	1.52988
Upper West Side	0.00001	0.49622	1.33067	1.32015
Williamsburg	0.88064	0.00267	1.03625	1.25363

# Exponentiate the model 2 coefficients: the fitted coefficients are on the
# log scale, so the table holds ratios, one row per non-reference class.
oddsRatioTable2 <- exp(coef(mod2))
kable(oddsRatioTable2, digits = 3, align = "c") # exponentiated coefficients (odds ratios), not log-odds

	(Intercept)	indexTRI	iv1	indexTRI:iv1
Chelsea	0.007	1.632	451.639	0.555
Greenwich Village	0.000	2.204	16838453.373	0.279
Midtown East	0.022	1.732	0.034	0.678
Midtown West	0.064	1.568	0.857	0.679
Park Slope	0.013	1.379	903.449	0.625
Upper East Side	0.000	2.339	0.000	0.609
Upper West Side	0.000	2.501	6.752	0.512
Williamsburg	0.005	1.691	84.649	0.586

For this m x m confusion matrix, the negative recall (specificity) is high for every class, which is a good sign for our prediction goals. Balanced accuracy is the preferred metric here for assessing the algorithm's performance.

# Predicted neighborhood for each test row under model 1, compared with the
# true labels.
t1 <- predict(mod1, newdata = test, type = "class")
cfMat <- confusionMatrix(data = t1, reference = test$Area)
print(cfMat)

## Confusion Matrix and Statistics
## 
##                    Reference
## Prediction          Central Harlem Chelsea Greenwich Village Midtown East
##   Central Harlem                 9       3                 5            0
##   Chelsea                       10       6                 1            0
##   Greenwich Village              1       1                17            0
##   Midtown East                   0       0                 0           15
##   Midtown West                   2       5                 0            6
##   Park Slope                     5       4                 5            0
##   Upper East Side                0       0                 0            2
##   Upper West Side                0       0                 0            6
##   Williamsburg                   2       6                 0            0
##                    Reference
## Prediction          Midtown West Park Slope Upper East Side
##   Central Harlem               3          1               0
##   Chelsea                      3          3               0
##   Greenwich Village            0         12               0
##   Midtown East                 4          0               1
##   Midtown West                 7          0               0
##   Park Slope                   0         10               0
##   Upper East Side              0          0              28
##   Upper West Side              0          0               7
##   Williamsburg                 9          2               0
##                    Reference
## Prediction          Upper West Side Williamsburg
##   Central Harlem                  0            5
##   Chelsea                         0            0
##   Greenwich Village               0            5
##   Midtown East                   10            5
##   Midtown West                    0            4
##   Park Slope                      0            4
##   Upper East Side                 3            0
##   Upper West Side                18            0
##   Williamsburg                    0            7
## 
## Overall Statistics
##                                          
##                Accuracy : 0.4466         
##                  95% CI : (0.3854, 0.509)
##     No Information Rate : 0.1374         
##     P-Value [Acc > NIR] : < 2.2e-16      
##                                          
##                   Kappa : 0.3768         
##  Mcnemar's Test P-Value : NA             
## 
## Statistics by Class:
## 
##                      Class: Central Harlem Class: Chelsea
## Sensitivity                        0.31034        0.24000
## Specificity                        0.92704        0.92827
## Pos Pred Value                     0.34615        0.26087
## Neg Pred Value                     0.91525        0.92050
## Prevalence                         0.11069        0.09542
## Detection Rate                     0.03435        0.02290
## Detection Prevalence               0.09924        0.08779
## Balanced Accuracy                  0.61869        0.58414
##                      Class: Greenwich Village Class: Midtown East
## Sensitivity                           0.60714             0.51724
## Specificity                           0.91880             0.91416
## Pos Pred Value                        0.47222             0.42857
## Neg Pred Value                        0.95133             0.93833
## Prevalence                            0.10687             0.11069
## Detection Rate                        0.06489             0.05725
## Detection Prevalence                  0.13740             0.13359
## Balanced Accuracy                     0.76297             0.71570
##                      Class: Midtown West Class: Park Slope
## Sensitivity                      0.26923           0.35714
## Specificity                      0.92797           0.92308
## Pos Pred Value                   0.29167           0.35714
## Neg Pred Value                   0.92017           0.92308
## Prevalence                       0.09924           0.10687
## Detection Rate                   0.02672           0.03817
## Detection Prevalence             0.09160           0.10687
## Balanced Accuracy                0.59860           0.64011
##                      Class: Upper East Side Class: Upper West Side
## Sensitivity                          0.7778                 0.5806
## Specificity                          0.9779                 0.9437
## Pos Pred Value                       0.8485                 0.5806
## Neg Pred Value                       0.9651                 0.9437
## Prevalence                           0.1374                 0.1183
## Detection Rate                       0.1069                 0.0687
## Detection Prevalence                 0.1260                 0.1183
## Balanced Accuracy                    0.8778                 0.7622
##                      Class: Williamsburg
## Sensitivity                      0.23333
## Specificity                      0.91810
## Pos Pred Value                   0.26923
## Neg Pred Value                   0.90254
## Prevalence                       0.11450
## Detection Rate                   0.02672
## Detection Prevalence             0.09924
## Balanced Accuracy                0.57572

# Model 1 confusion table (rows: predicted class, columns: reference class),
# with centred columns.
kable(cfMat$table, align = "c")

	Central Harlem	Chelsea	Greenwich Village	Midtown East	Midtown West	Park Slope	Upper East Side	Upper West Side	Williamsburg
Central Harlem	9	3	5	0	3	1	0	0	5
Chelsea	10	6	1	0	3	3	0	0	0
Greenwich Village	1	1	17	0	0	12	0	0	5
Midtown East	0	0	0	15	4	0	1	10	5
Midtown West	2	5	0	6	7	0	0	0	4
Park Slope	5	4	5	0	0	10	0	0	4
Upper East Side	0	0	0	2	0	0	28	3	0
Upper West Side	0	0	0	6	0	0	7	18	0
Williamsburg	2	6	0	0	9	2	0	0	7

# Predicted neighborhood for each test row under model 2, compared with the
# true labels.
t2 <- predict(mod2, newdata = test, type = "class")
cfMat2 <- confusionMatrix(data = t2, reference = test$Area)
print(cfMat2)

## Confusion Matrix and Statistics
## 
##                    Reference
## Prediction          Central Harlem Chelsea Greenwich Village Midtown East
##   Central Harlem                14       0                 4            0
##   Chelsea                        3      10                 1            0
##   Greenwich Village              3       1                18            0
##   Midtown East                   0       0                 0           21
##   Midtown West                   1       5                 0            3
##   Park Slope                     7       3                 5            0
##   Upper East Side                0       0                 0            0
##   Upper West Side                0       0                 0            5
##   Williamsburg                   1       6                 0            0
##                    Reference
## Prediction          Midtown West Park Slope Upper East Side
##   Central Harlem               6          3               0
##   Chelsea                      7          6               0
##   Greenwich Village            0         13               0
##   Midtown East                 2          0               2
##   Midtown West                10          0               0
##   Park Slope                   0          2               0
##   Upper East Side              0          0              26
##   Upper West Side              0          0               8
##   Williamsburg                 1          4               0
##                    Reference
## Prediction          Upper West Side Williamsburg
##   Central Harlem                  0            6
##   Chelsea                         0            0
##   Greenwich Village               0            6
##   Midtown East                    8            7
##   Midtown West                    0            8
##   Park Slope                      0            1
##   Upper East Side                 9            0
##   Upper West Side                14            1
##   Williamsburg                    0            1
## 
## Overall Statistics
##                                           
##                Accuracy : 0.4427          
##                  95% CI : (0.3816, 0.5052)
##     No Information Rate : 0.1374          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.3727          
##  Mcnemar's Test P-Value : NA              
## 
## Statistics by Class:
## 
##                      Class: Central Harlem Class: Chelsea
## Sensitivity                        0.48276        0.40000
## Specificity                        0.91845        0.92827
## Pos Pred Value                     0.42424        0.37037
## Neg Pred Value                     0.93450        0.93617
## Prevalence                         0.11069        0.09542
## Detection Rate                     0.05344        0.03817
## Detection Prevalence               0.12595        0.10305
## Balanced Accuracy                  0.70061        0.66414
##                      Class: Greenwich Village Class: Midtown East
## Sensitivity                            0.6429             0.72414
## Specificity                            0.9017             0.91845
## Pos Pred Value                         0.4390             0.52500
## Neg Pred Value                         0.9548             0.96396
## Prevalence                             0.1069             0.11069
## Detection Rate                         0.0687             0.08015
## Detection Prevalence                   0.1565             0.15267
## Balanced Accuracy                      0.7723             0.82130
##                      Class: Midtown West Class: Park Slope
## Sensitivity                      0.38462          0.071429
## Specificity                      0.92797          0.931624
## Pos Pred Value                   0.37037          0.111111
## Neg Pred Value                   0.93191          0.893443
## Prevalence                       0.09924          0.106870
## Detection Rate                   0.03817          0.007634
## Detection Prevalence             0.10305          0.068702
## Balanced Accuracy                0.65629          0.501526
##                      Class: Upper East Side Class: Upper West Side
## Sensitivity                         0.72222                0.45161
## Specificity                         0.96018                0.93939
## Pos Pred Value                      0.74286                0.50000
## Neg Pred Value                      0.95595                0.92735
## Prevalence                          0.13740                0.11832
## Detection Rate                      0.09924                0.05344
## Detection Prevalence                0.13359                0.10687
## Balanced Accuracy                   0.84120                0.69550
##                      Class: Williamsburg
## Sensitivity                     0.033333
## Specificity                     0.948276
## Pos Pred Value                  0.076923
## Neg Pred Value                  0.883534
## Prevalence                      0.114504
## Detection Rate                  0.003817
## Detection Prevalence            0.049618
## Balanced Accuracy               0.490805

# Model 2 confusion table (rows: predicted class, columns: reference class),
# with centred columns.
kable(cfMat2$table, align = "c")

	Central Harlem	Chelsea	Greenwich Village	Midtown East	Midtown West	Park Slope	Upper East Side	Upper West Side	Williamsburg
Central Harlem	14	0	4	0	6	3	0	0	6
Chelsea	3	10	1	0	7	6	0	0	0
Greenwich Village	3	1	18	0	0	13	0	0	6
Midtown East	0	0	0	21	2	0	2	8	7
Midtown West	1	5	0	3	10	0	0	0	8
Park Slope	7	3	5	0	0	2	0	0	1
Upper East Side	0	0	0	0	0	0	26	9	0
Upper West Side	0	0	0	5	0	0	8	14	1
Williamsburg	1	6	0	0	1	4	0	0	1

Price-to-Rent Ratio plot

# Monthly series starting in 2010, wrapped as a hierarchical time series.
priceRentTs <- ts(priceRentRatio, start = 2010, frequency = 12)
priceRent_hts <- hts(priceRentTs)

# Level 0 is plotted under the title "Total"; level 1 is split over two panels.
plot0 <- aggts(priceRent_hts, levels = 0)
plot1 <- aggts(priceRent_hts, levels = 1)

# Level-1 columns 1 to 5.
p1 <- autoplot(plot1[, 1:5]) +
  xlab("Year") +
  ylab("Price to Rent Index") +
  scale_color_discrete(guide = guide_legend("Neighborhood"))

# Level-1 columns 6 to 9.
p2 <- autoplot(plot1[, 6:9]) +
  xlab("Year") +
  ylab("Price to Rent Index") +
  scale_color_discrete(guide = guide_legend("Neighborhood"))

p0 <- autoplot(plot0) +
  xlab("Year") +
  ylab("Price to Rent Index") +
  ggtitle("Total")

# Three stacked rows, each spanning both layout columns.
lay <- rbind(c(1, 1), c(2, 2), c(3, 3))
gridExtra::grid.arrange(p0, p1, p2, layout_matrix = lay)

StreetEasy Real-Estate Data

Celia Marraro

9 March, 2018

Preprocessing Data