Step 1 - Data Description

How many dimensions? Answer: There are 8548 observations * 13 variables. What are the variables types? Answer: 10 of them are numeric, 3 are character (region_name, region_state, region_type) What are the variable names? Answer: See below Remove all non-numeric variables and create a dataset called data_X.

dim(data_complete)
## [1] 8548   14
str(data_complete)
## tibble [8,548 x 14] (S3: tbl_df/tbl/data.frame)
##  $ region_id                    : num [1:8548] 534 2696 1887 1492 1358 ...
##  $ region_name                  : chr [1:8548] "Clayton County" "Dallas County" "Merrimack County" "Ramsey County" ...
##  $ region_state                 : chr [1:8548] "GA" "TX" "NH" "MN" ...
##  $ region_type                  : chr [1:8548] "county" "county" "county" "county" ...
##  $ period_begin                 : POSIXct[1:8548], format: "2020-08-03" "2020-04-13" ...
##  $ total_homes_sold             : num [1:8548] 71 375 81 165 104 194 332 287 232 247 ...
##  $ median_sale_price            : num [1:8548] 160000 275000 330000 280000 249500 ...
##  $ total_new_listings           : num [1:8548] 70 486 58 191 61 315 406 265 281 277 ...
##  $ median_new_listing_price     : num [1:8548] 185900 275000 304750 274900 230000 ...
##  $ active_listings              : num [1:8548] 590 6402 563 1183 834 ...
##  $ median_active_list_price     : num [1:8548] 180000 314900 324900 275000 269700 ...
##  $ average_of_median_list_price : num [1:8548] 149000 238000 329000 544450 419000 ...
##  $ average_of_median_offer_price: num [1:8548] 152000 249000 300000 527450 395000 ...
##  $ median_days_on_market        : num [1:8548] 20 31 47 17 63 10 51.5 22 7 7 ...
##  - attr(*, "na.action")= 'omit' Named int [1:43026] 1 2 3 4 5 6 8 9 10 11 ...
##   ..- attr(*, "names")= chr [1:43026] "1" "2" "3" "4" ...
names(data_complete)
##  [1] "region_id"                     "region_name"                  
##  [3] "region_state"                  "region_type"                  
##  [5] "period_begin"                  "total_homes_sold"             
##  [7] "median_sale_price"             "total_new_listings"           
##  [9] "median_new_listing_price"      "active_listings"              
## [11] "median_active_list_price"      "average_of_median_list_price" 
## [13] "average_of_median_offer_price" "median_days_on_market"
data_X <- select(data_complete, -c(1:5))
str(data_X)
## tibble [8,548 x 9] (S3: tbl_df/tbl/data.frame)
##  $ total_homes_sold             : num [1:8548] 71 375 81 165 104 194 332 287 232 247 ...
##  $ median_sale_price            : num [1:8548] 160000 275000 330000 280000 249500 ...
##  $ total_new_listings           : num [1:8548] 70 486 58 191 61 315 406 265 281 277 ...
##  $ median_new_listing_price     : num [1:8548] 185900 275000 304750 274900 230000 ...
##  $ active_listings              : num [1:8548] 590 6402 563 1183 834 ...
##  $ median_active_list_price     : num [1:8548] 180000 314900 324900 275000 269700 ...
##  $ average_of_median_list_price : num [1:8548] 149000 238000 329000 544450 419000 ...
##  $ average_of_median_offer_price: num [1:8548] 152000 249000 300000 527450 395000 ...
##  $ median_days_on_market        : num [1:8548] 20 31 47 17 63 10 51.5 22 7 7 ...
##  - attr(*, "na.action")= 'omit' Named int [1:43026] 1 2 3 4 5 6 8 9 10 11 ...
##   ..- attr(*, "names")= chr [1:43026] "1" "2" "3" "4" ...

Step 2 - Correlation Matrix

  1. active_listings and total_new_listings are highly correlated.
  2. active_listings and total_homes_sold are highly correlated.
  3. total_homes_sold and total_new_listings are highly correlated.
  4. median_sale_price and median_new_listing_price are highly correlated.
  5. median_sale_price and median_active_list_price are highly correlated.
datamatrix <- cor(data_X)
corrplot(datamatrix, order="hclust", type='upper', tl.srt = 45)

#ggcorrplot(datamatrix, p.mat = cor_pmat(data_X), hc.order=TRUE, type='lower')
#corrplot(datamatrix, is.corr = TRUE, win.asp = 1, method = "color", type='lower')
res2 <- rcorr(as.matrix(data_X), type="pearson")
# Extract the correlation coefficients
res2$r
##                               total_homes_sold median_sale_price
## total_homes_sold                   1.000000000        0.01488874
## median_sale_price                  0.014888742        1.00000000
## total_new_listings                 0.930001291        0.03674503
## median_new_listing_price           0.013822109        0.96148668
## active_listings                    0.828954932       -0.01779088
## median_active_list_price           0.020982650        0.96433240
## average_of_median_list_price       0.010647286        0.70945268
## average_of_median_offer_price      0.009417138        0.56716260
## median_days_on_market             -0.067720633       -0.09394526
##                               total_new_listings median_new_listing_price
## total_homes_sold                      0.93000129              0.013822109
## median_sale_price                     0.03674503              0.961486683
## total_new_listings                    1.00000000              0.051483548
## median_new_listing_price              0.05148355              1.000000000
## active_listings                       0.90251493             -0.002368816
## median_active_list_price              0.05197751              0.981059948
## average_of_median_list_price          0.03244726              0.721110264
## average_of_median_offer_price         0.02438527              0.575681284
## median_days_on_market                 0.00486223             -0.088338649
##                               active_listings median_active_list_price
## total_homes_sold                  0.828954932              0.020982650
## median_sale_price                -0.017790879              0.964332401
## total_new_listings                0.902514931              0.051977512
## median_new_listing_price         -0.002368816              0.981059948
## active_listings                   1.000000000             -0.003662179
## median_active_list_price         -0.003662179              1.000000000
## average_of_median_list_price     -0.005718765              0.714973064
## average_of_median_offer_price    -0.009827100              0.572153538
## median_days_on_market             0.207782640             -0.112967858
##                               average_of_median_list_price
## total_homes_sold                               0.010647286
## median_sale_price                              0.709452677
## total_new_listings                             0.032447257
## median_new_listing_price                       0.721110264
## active_listings                               -0.005718765
## median_active_list_price                       0.714973064
## average_of_median_list_price                   1.000000000
## average_of_median_offer_price                  0.766356259
## median_days_on_market                         -0.071201590
##                               average_of_median_offer_price
## total_homes_sold                                0.009417138
## median_sale_price                               0.567162597
## total_new_listings                              0.024385269
## median_new_listing_price                        0.575681284
## active_listings                                -0.009827100
## median_active_list_price                        0.572153538
## average_of_median_list_price                    0.766356259
## average_of_median_offer_price                   1.000000000
## median_days_on_market                          -0.076249786
##                               median_days_on_market
## total_homes_sold                        -0.06772063
## median_sale_price                       -0.09394526
## total_new_listings                       0.00486223
## median_new_listing_price                -0.08833865
## active_listings                          0.20778264
## median_active_list_price                -0.11296786
## average_of_median_list_price            -0.07120159
## average_of_median_offer_price           -0.07624979
## median_days_on_market                    1.00000000
# Extract p-values
res2$P
##                               total_homes_sold median_sale_price
## total_homes_sold                            NA      0.1686917912
## median_sale_price                 1.686918e-01                NA
## total_new_listings                0.000000e+00      0.0006789854
## median_new_listing_price          2.013190e-01      0.0000000000
## active_listings                   0.000000e+00      0.1000208971
## median_active_list_price          5.239317e-02      0.0000000000
## average_of_median_list_price      3.249764e-01      0.0000000000
## average_of_median_offer_price     3.839960e-01      0.0000000000
## median_days_on_market             3.670846e-10      0.0000000000
##                               total_new_listings median_new_listing_price
## total_homes_sold                    0.000000e+00             2.013190e-01
## median_sale_price                   6.789854e-04             0.000000e+00
## total_new_listings                            NA             1.913098e-06
## median_new_listing_price            1.913098e-06                       NA
## active_listings                     0.000000e+00             8.266671e-01
## median_active_list_price            1.523344e-06             0.000000e+00
## average_of_median_list_price        2.697358e-03             0.000000e+00
## average_of_median_offer_price       2.416126e-02             0.000000e+00
## median_days_on_market               6.530881e-01             4.440892e-16
##                               active_listings median_active_list_price
## total_homes_sold                    0.0000000             5.239317e-02
## median_sale_price                   0.1000209             0.000000e+00
## total_new_listings                  0.0000000             1.523344e-06
## median_new_listing_price            0.8266671             0.000000e+00
## active_listings                            NA             7.349564e-01
## median_active_list_price            0.7349564                       NA
## average_of_median_list_price        0.5970429             0.000000e+00
## average_of_median_offer_price       0.3636366             0.000000e+00
## median_days_on_market               0.0000000             0.000000e+00
##                               average_of_median_list_price
## total_homes_sold                              3.249764e-01
## median_sale_price                             0.000000e+00
## total_new_listings                            2.697358e-03
## median_new_listing_price                      0.000000e+00
## active_listings                               5.970429e-01
## median_active_list_price                      0.000000e+00
## average_of_median_list_price                            NA
## average_of_median_offer_price                 0.000000e+00
## median_days_on_market                         4.387202e-11
##                               average_of_median_offer_price
## total_homes_sold                               3.839960e-01
## median_sale_price                              0.000000e+00
## total_new_listings                             2.416126e-02
## median_new_listing_price                       0.000000e+00
## active_listings                                3.636366e-01
## median_active_list_price                       0.000000e+00
## average_of_median_list_price                   0.000000e+00
## average_of_median_offer_price                            NA
## median_days_on_market                          1.677769e-12
##                               median_days_on_market
## total_homes_sold                       3.670846e-10
## median_sale_price                      0.000000e+00
## total_new_listings                     6.530881e-01
## median_new_listing_price               4.440892e-16
## active_listings                        0.000000e+00
## median_active_list_price               0.000000e+00
## average_of_median_list_price           4.387202e-11
## average_of_median_offer_price          1.677769e-12
## median_days_on_market                            NA
# Insignificant correlations are leaved blank
corrplot(res2$r, type="upper", order="hclust",
         p.mat = res2$P, sig.level = 0.01, insig = "blank")

#Recall assumptions of Linear Regression
model <- lm(median_sale_price ~., data = data_X)
vif(model)
##              total_homes_sold            total_new_listings 
##                      7.831238                     13.263203 
##      median_new_listing_price               active_listings 
##                     27.849463                      6.998424 
##      median_active_list_price  average_of_median_list_price 
##                     27.369133                      3.389441 
## average_of_median_offer_price         median_days_on_market 
##                      2.432315                      1.361067
plot(model)

VIF High Variable Inflation Factor (VIF) is a test of multicollinearity. VIF value greater than 2.5 may be a cause of concern. The result shows most variables have a VIF of over 2.5, except for average_of_median_offer_price and median_days_on_market.

Step 3 - KMO

The MSA is 0.69 > 0.5, which means the Factor Analysis is appropriate on this data.

data_fa <- data_X[,-2] #remove the Dependent Variable - median_sale_price
datamatrix <- cor(data_fa)
KMO(r=datamatrix)
## Kaiser-Meyer-Olkin factor adequacy
## Call: KMO(r = datamatrix)
## Overall MSA =  0.69
## MSA for each item = 
##              total_homes_sold            total_new_listings 
##                          0.74                          0.64 
##      median_new_listing_price               active_listings 
##                          0.65                          0.71 
##      median_active_list_price  average_of_median_list_price 
##                          0.66                          0.80 
## average_of_median_offer_price         median_days_on_market 
##                          0.77                          0.21

Step 4 - Number of Factors

By examining the “scree” plot of the successive eigenvalues, we can see there are 4 factors. We will use 4 factors to perform the factor analysis. Also, let’s use orthogonal rotation (varimax) because in orthogonal rotation the rotated factors will remain uncorrelated.

ev <- eigen(cor(data_fa))
ev$values
## [1] 3.19605280 2.77311500 1.02901933 0.60218172 0.20407811 0.12730761 0.04985460
## [8] 0.01839082
plot(ev$values)

Step 5 - Run Factor Analysis

The output gives us the summary for loadings (weights of variable for each factor), the cumulative proportion of factors, which is 90.6% of explaining data variance. We effectively reduce dimensionality from 9 to 4 when only losing about 9.4% of the variance.

nfactors <- 4
fit1 <-factanal(data_fa,nfactors,scores = c("regression"),rotation = "varimax")
print(fit1)
## 
## Call:
## factanal(x = data_fa, factors = nfactors, scores = c("regression"),     rotation = "varimax")
## 
## Uniquenesses:
##              total_homes_sold            total_new_listings 
##                         0.124                         0.005 
##      median_new_listing_price               active_listings 
##                         0.005                         0.139 
##      median_active_list_price  average_of_median_list_price 
##                         0.032                         0.167 
## average_of_median_offer_price         median_days_on_market 
##                         0.274                         0.006 
## 
## Loadings:
##                               Factor1 Factor2 Factor3 Factor4
## total_homes_sold               0.931                         
## total_new_listings             0.997                         
## median_new_listing_price               0.926   0.369         
## active_listings                0.910                   0.181 
## median_active_list_price               0.910   0.370         
## average_of_median_list_price           0.465   0.785         
## average_of_median_offer_price          0.304   0.795         
## median_days_on_market                                  0.994 
## 
##                Factor1 Factor2 Factor3 Factor4
## SS loadings      2.690   1.998   1.524   1.037
## Proportion Var   0.336   0.250   0.190   0.130
## Cumulative Var   0.336   0.586   0.776   0.906
## 
## Test of the hypothesis that 4 factors are sufficient.
## The chi square statistic is 44.14 on 2 degrees of freedom.
## The p-value is 2.6e-10
fa_var <- fa(r=data_fa, nfactors = 4, rotate="varimax", fm="pa")
fa.diagram(fa_var)

Step 6 - Regression

The factors number_of_listing, avg_offerlist_price, median_days_on_market are highly significant and price_of_listing is not significant in the model.

head(fa_var$scores)
##               PA2        PA1        PA3         PA4
## [1,] -0.641644761 -0.8008472 -0.9150289 -0.04476018
## [2,]  0.949713400 -0.2229838 -0.7273582  0.83383110
## [3,] -0.685135322 -0.2570445 -0.4293704  0.09444202
## [4,] -0.274558940 -0.9537548  0.6684606 -0.38062604
## [5,] -0.633189370 -0.8384564  0.1358717  0.21188338
## [6,] -0.007143516  0.4963289  0.1456217 -0.98243052
regdata <- cbind(data_X[2], fa_var$scores)
#Labeling the data

names(regdata) <- c("median_sale_price", "price_of_listing", "number_of_listing",
                    "avg_offerlist_price", "median_days_on_market")

#Split data in train 0.7 and test 0.3
#Train model
set.seed(100)
indices= sample(1:nrow(regdata), 0.7*nrow(regdata))
train=regdata[indices,]
test = regdata[-indices,]

#Regression Model using train data
model1 = lm(median_sale_price ~., train)
summary(model1)
## 
## Call:
## lm(formula = median_sale_price ~ ., data = train)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -800641  -16877    1925   17362 1905712 
## 
## Coefficients:
##                       Estimate Std. Error t value Pr(>|t|)    
## (Intercept)           367520.2      695.0  528.83   <2e-16 ***
## price_of_listing         244.0      697.5    0.35    0.726    
## number_of_listing     177254.3      712.9  248.65   <2e-16 ***
## avg_offerlist_price    73835.1      758.3   97.37   <2e-16 ***
## median_days_on_market -15469.7      763.8  -20.25   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 53740 on 5978 degrees of freedom
## Multiple R-squared:  0.9271, Adjusted R-squared:  0.9271 
## F-statistic: 1.901e+04 on 4 and 5978 DF,  p-value: < 2.2e-16
#Checking for multicollinearity VIF
vif(model1)
##      price_of_listing     number_of_listing   avg_offerlist_price 
##              1.003285              1.006250              1.006116 
## median_days_on_market 
##              1.003252

The vif of each and every variable is about 1.00, not exceeding 2.5.

Step 7 - Prediction

The factors number_of_listing, avg_offerlist_price, median_days_on_market are highly significant and price_of_listing is not significant in the model.

pred_test1 <- predict(model1, newdata = test, type = "response")

test$Satisfaction_Predicted <- pred_test1
head(test[c(1,6)], 10)
##    median_sale_price Satisfaction_Predicted
## 3             330000               288627.2
## 4             280000               253640.1
## 8             510000               487462.3
## 11            160000               131538.8
## 12            259000               252882.1
## 15            197990               199380.6
## 16            425000               406432.5
## 17            290000               277456.6
## 18            150000               149193.8
## 24            225000               234277.0