How many dimensions? Answer: There are 8548 observations * 13 variables. What are the variables types? Answer: 10 of them are numeric, 3 are character (region_name, region_state, region_type) What are the variable names? Answer: See below Remove all non-numeric variables and create a dataset called data_X.
dim(data_complete)
## [1] 8548 14
str(data_complete)
## tibble [8,548 x 14] (S3: tbl_df/tbl/data.frame)
## $ region_id : num [1:8548] 534 2696 1887 1492 1358 ...
## $ region_name : chr [1:8548] "Clayton County" "Dallas County" "Merrimack County" "Ramsey County" ...
## $ region_state : chr [1:8548] "GA" "TX" "NH" "MN" ...
## $ region_type : chr [1:8548] "county" "county" "county" "county" ...
## $ period_begin : POSIXct[1:8548], format: "2020-08-03" "2020-04-13" ...
## $ total_homes_sold : num [1:8548] 71 375 81 165 104 194 332 287 232 247 ...
## $ median_sale_price : num [1:8548] 160000 275000 330000 280000 249500 ...
## $ total_new_listings : num [1:8548] 70 486 58 191 61 315 406 265 281 277 ...
## $ median_new_listing_price : num [1:8548] 185900 275000 304750 274900 230000 ...
## $ active_listings : num [1:8548] 590 6402 563 1183 834 ...
## $ median_active_list_price : num [1:8548] 180000 314900 324900 275000 269700 ...
## $ average_of_median_list_price : num [1:8548] 149000 238000 329000 544450 419000 ...
## $ average_of_median_offer_price: num [1:8548] 152000 249000 300000 527450 395000 ...
## $ median_days_on_market : num [1:8548] 20 31 47 17 63 10 51.5 22 7 7 ...
## - attr(*, "na.action")= 'omit' Named int [1:43026] 1 2 3 4 5 6 8 9 10 11 ...
## ..- attr(*, "names")= chr [1:43026] "1" "2" "3" "4" ...
names(data_complete)
## [1] "region_id" "region_name"
## [3] "region_state" "region_type"
## [5] "period_begin" "total_homes_sold"
## [7] "median_sale_price" "total_new_listings"
## [9] "median_new_listing_price" "active_listings"
## [11] "median_active_list_price" "average_of_median_list_price"
## [13] "average_of_median_offer_price" "median_days_on_market"
data_X <- select(data_complete, -c(1:5))
str(data_X)
## tibble [8,548 x 9] (S3: tbl_df/tbl/data.frame)
## $ total_homes_sold : num [1:8548] 71 375 81 165 104 194 332 287 232 247 ...
## $ median_sale_price : num [1:8548] 160000 275000 330000 280000 249500 ...
## $ total_new_listings : num [1:8548] 70 486 58 191 61 315 406 265 281 277 ...
## $ median_new_listing_price : num [1:8548] 185900 275000 304750 274900 230000 ...
## $ active_listings : num [1:8548] 590 6402 563 1183 834 ...
## $ median_active_list_price : num [1:8548] 180000 314900 324900 275000 269700 ...
## $ average_of_median_list_price : num [1:8548] 149000 238000 329000 544450 419000 ...
## $ average_of_median_offer_price: num [1:8548] 152000 249000 300000 527450 395000 ...
## $ median_days_on_market : num [1:8548] 20 31 47 17 63 10 51.5 22 7 7 ...
## - attr(*, "na.action")= 'omit' Named int [1:43026] 1 2 3 4 5 6 8 9 10 11 ...
## ..- attr(*, "names")= chr [1:43026] "1" "2" "3" "4" ...
datamatrix <- cor(data_X)
corrplot(datamatrix, order="hclust", type='upper', tl.srt = 45)
#ggcorrplot(datamatrix, p.mat = cor_pmat(data_X), hc.order=TRUE, type='lower')
#corrplot(datamatrix, is.corr = TRUE, win.asp = 1, method = "color", type='lower')
res2 <- rcorr(as.matrix(data_X), type="pearson")
# Extract the correlation coefficients
res2$r
## total_homes_sold median_sale_price
## total_homes_sold 1.000000000 0.01488874
## median_sale_price 0.014888742 1.00000000
## total_new_listings 0.930001291 0.03674503
## median_new_listing_price 0.013822109 0.96148668
## active_listings 0.828954932 -0.01779088
## median_active_list_price 0.020982650 0.96433240
## average_of_median_list_price 0.010647286 0.70945268
## average_of_median_offer_price 0.009417138 0.56716260
## median_days_on_market -0.067720633 -0.09394526
## total_new_listings median_new_listing_price
## total_homes_sold 0.93000129 0.013822109
## median_sale_price 0.03674503 0.961486683
## total_new_listings 1.00000000 0.051483548
## median_new_listing_price 0.05148355 1.000000000
## active_listings 0.90251493 -0.002368816
## median_active_list_price 0.05197751 0.981059948
## average_of_median_list_price 0.03244726 0.721110264
## average_of_median_offer_price 0.02438527 0.575681284
## median_days_on_market 0.00486223 -0.088338649
## active_listings median_active_list_price
## total_homes_sold 0.828954932 0.020982650
## median_sale_price -0.017790879 0.964332401
## total_new_listings 0.902514931 0.051977512
## median_new_listing_price -0.002368816 0.981059948
## active_listings 1.000000000 -0.003662179
## median_active_list_price -0.003662179 1.000000000
## average_of_median_list_price -0.005718765 0.714973064
## average_of_median_offer_price -0.009827100 0.572153538
## median_days_on_market 0.207782640 -0.112967858
## average_of_median_list_price
## total_homes_sold 0.010647286
## median_sale_price 0.709452677
## total_new_listings 0.032447257
## median_new_listing_price 0.721110264
## active_listings -0.005718765
## median_active_list_price 0.714973064
## average_of_median_list_price 1.000000000
## average_of_median_offer_price 0.766356259
## median_days_on_market -0.071201590
## average_of_median_offer_price
## total_homes_sold 0.009417138
## median_sale_price 0.567162597
## total_new_listings 0.024385269
## median_new_listing_price 0.575681284
## active_listings -0.009827100
## median_active_list_price 0.572153538
## average_of_median_list_price 0.766356259
## average_of_median_offer_price 1.000000000
## median_days_on_market -0.076249786
## median_days_on_market
## total_homes_sold -0.06772063
## median_sale_price -0.09394526
## total_new_listings 0.00486223
## median_new_listing_price -0.08833865
## active_listings 0.20778264
## median_active_list_price -0.11296786
## average_of_median_list_price -0.07120159
## average_of_median_offer_price -0.07624979
## median_days_on_market 1.00000000
# Extract p-values
res2$P
## total_homes_sold median_sale_price
## total_homes_sold NA 0.1686917912
## median_sale_price 1.686918e-01 NA
## total_new_listings 0.000000e+00 0.0006789854
## median_new_listing_price 2.013190e-01 0.0000000000
## active_listings 0.000000e+00 0.1000208971
## median_active_list_price 5.239317e-02 0.0000000000
## average_of_median_list_price 3.249764e-01 0.0000000000
## average_of_median_offer_price 3.839960e-01 0.0000000000
## median_days_on_market 3.670846e-10 0.0000000000
## total_new_listings median_new_listing_price
## total_homes_sold 0.000000e+00 2.013190e-01
## median_sale_price 6.789854e-04 0.000000e+00
## total_new_listings NA 1.913098e-06
## median_new_listing_price 1.913098e-06 NA
## active_listings 0.000000e+00 8.266671e-01
## median_active_list_price 1.523344e-06 0.000000e+00
## average_of_median_list_price 2.697358e-03 0.000000e+00
## average_of_median_offer_price 2.416126e-02 0.000000e+00
## median_days_on_market 6.530881e-01 4.440892e-16
## active_listings median_active_list_price
## total_homes_sold 0.0000000 5.239317e-02
## median_sale_price 0.1000209 0.000000e+00
## total_new_listings 0.0000000 1.523344e-06
## median_new_listing_price 0.8266671 0.000000e+00
## active_listings NA 7.349564e-01
## median_active_list_price 0.7349564 NA
## average_of_median_list_price 0.5970429 0.000000e+00
## average_of_median_offer_price 0.3636366 0.000000e+00
## median_days_on_market 0.0000000 0.000000e+00
## average_of_median_list_price
## total_homes_sold 3.249764e-01
## median_sale_price 0.000000e+00
## total_new_listings 2.697358e-03
## median_new_listing_price 0.000000e+00
## active_listings 5.970429e-01
## median_active_list_price 0.000000e+00
## average_of_median_list_price NA
## average_of_median_offer_price 0.000000e+00
## median_days_on_market 4.387202e-11
## average_of_median_offer_price
## total_homes_sold 3.839960e-01
## median_sale_price 0.000000e+00
## total_new_listings 2.416126e-02
## median_new_listing_price 0.000000e+00
## active_listings 3.636366e-01
## median_active_list_price 0.000000e+00
## average_of_median_list_price 0.000000e+00
## average_of_median_offer_price NA
## median_days_on_market 1.677769e-12
## median_days_on_market
## total_homes_sold 3.670846e-10
## median_sale_price 0.000000e+00
## total_new_listings 6.530881e-01
## median_new_listing_price 4.440892e-16
## active_listings 0.000000e+00
## median_active_list_price 0.000000e+00
## average_of_median_list_price 4.387202e-11
## average_of_median_offer_price 1.677769e-12
## median_days_on_market NA
# Insignificant correlations are leaved blank
corrplot(res2$r, type="upper", order="hclust",
p.mat = res2$P, sig.level = 0.01, insig = "blank")
#Recall assumptions of Linear Regression
model <- lm(median_sale_price ~., data = data_X)
vif(model)
## total_homes_sold total_new_listings
## 7.831238 13.263203
## median_new_listing_price active_listings
## 27.849463 6.998424
## median_active_list_price average_of_median_list_price
## 27.369133 3.389441
## average_of_median_offer_price median_days_on_market
## 2.432315 1.361067
plot(model)
VIF High Variable Inflation Factor (VIF) is a test of multicollinearity. VIF value greater than 2.5 may be a cause of concern. The result shows most variables have a VIF of over 2.5, except for average_of_median_offer_price and median_days_on_market.
The MSA is 0.69 > 0.5, which means the Factor Analysis is appropriate on this data.
data_fa <- data_X[,-2] #remove the Dependent Variable - median_sale_price
datamatrix <- cor(data_fa)
KMO(r=datamatrix)
## Kaiser-Meyer-Olkin factor adequacy
## Call: KMO(r = datamatrix)
## Overall MSA = 0.69
## MSA for each item =
## total_homes_sold total_new_listings
## 0.74 0.64
## median_new_listing_price active_listings
## 0.65 0.71
## median_active_list_price average_of_median_list_price
## 0.66 0.80
## average_of_median_offer_price median_days_on_market
## 0.77 0.21
By examining the “scree” plot of the successive eigenvalues, we can see there are 4 factors. We will use 4 factors to perform the factor analysis. Also, let’s use orthogonal rotation (varimax) because in orthogonal rotation the rotated factors will remain uncorrelated.
ev <- eigen(cor(data_fa))
ev$values
## [1] 3.19605280 2.77311500 1.02901933 0.60218172 0.20407811 0.12730761 0.04985460
## [8] 0.01839082
plot(ev$values)
The output gives us the summary for loadings (weights of variable for each factor), the cumulative proportion of factors, which is 90.6% of explaining data variance. We effectively reduce dimensionality from 9 to 4 when only losing about 9.4% of the variance.
nfactors <- 4
fit1 <-factanal(data_fa,nfactors,scores = c("regression"),rotation = "varimax")
print(fit1)
##
## Call:
## factanal(x = data_fa, factors = nfactors, scores = c("regression"), rotation = "varimax")
##
## Uniquenesses:
## total_homes_sold total_new_listings
## 0.124 0.005
## median_new_listing_price active_listings
## 0.005 0.139
## median_active_list_price average_of_median_list_price
## 0.032 0.167
## average_of_median_offer_price median_days_on_market
## 0.274 0.006
##
## Loadings:
## Factor1 Factor2 Factor3 Factor4
## total_homes_sold 0.931
## total_new_listings 0.997
## median_new_listing_price 0.926 0.369
## active_listings 0.910 0.181
## median_active_list_price 0.910 0.370
## average_of_median_list_price 0.465 0.785
## average_of_median_offer_price 0.304 0.795
## median_days_on_market 0.994
##
## Factor1 Factor2 Factor3 Factor4
## SS loadings 2.690 1.998 1.524 1.037
## Proportion Var 0.336 0.250 0.190 0.130
## Cumulative Var 0.336 0.586 0.776 0.906
##
## Test of the hypothesis that 4 factors are sufficient.
## The chi square statistic is 44.14 on 2 degrees of freedom.
## The p-value is 2.6e-10
fa_var <- fa(r=data_fa, nfactors = 4, rotate="varimax", fm="pa")
fa.diagram(fa_var)
The factors number_of_listing, avg_offerlist_price, median_days_on_market are highly significant and price_of_listing is not significant in the model.
head(fa_var$scores)
## PA2 PA1 PA3 PA4
## [1,] -0.641644761 -0.8008472 -0.9150289 -0.04476018
## [2,] 0.949713400 -0.2229838 -0.7273582 0.83383110
## [3,] -0.685135322 -0.2570445 -0.4293704 0.09444202
## [4,] -0.274558940 -0.9537548 0.6684606 -0.38062604
## [5,] -0.633189370 -0.8384564 0.1358717 0.21188338
## [6,] -0.007143516 0.4963289 0.1456217 -0.98243052
regdata <- cbind(data_X[2], fa_var$scores)
#Labeling the data
names(regdata) <- c("median_sale_price", "price_of_listing", "number_of_listing",
"avg_offerlist_price", "median_days_on_market")
#Split data in train 0.7 and test 0.3
#Train model
set.seed(100)
indices= sample(1:nrow(regdata), 0.7*nrow(regdata))
train=regdata[indices,]
test = regdata[-indices,]
#Regression Model using train data
model1 = lm(median_sale_price ~., train)
summary(model1)
##
## Call:
## lm(formula = median_sale_price ~ ., data = train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -800641 -16877 1925 17362 1905712
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 367520.2 695.0 528.83 <2e-16 ***
## price_of_listing 244.0 697.5 0.35 0.726
## number_of_listing 177254.3 712.9 248.65 <2e-16 ***
## avg_offerlist_price 73835.1 758.3 97.37 <2e-16 ***
## median_days_on_market -15469.7 763.8 -20.25 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 53740 on 5978 degrees of freedom
## Multiple R-squared: 0.9271, Adjusted R-squared: 0.9271
## F-statistic: 1.901e+04 on 4 and 5978 DF, p-value: < 2.2e-16
#Checking for multicollinearity VIF
vif(model1)
## price_of_listing number_of_listing avg_offerlist_price
## 1.003285 1.006250 1.006116
## median_days_on_market
## 1.003252
The vif of each and every variable is about 1.00, not exceeding 2.5.
The factors number_of_listing, avg_offerlist_price, median_days_on_market are highly significant and price_of_listing is not significant in the model.
pred_test1 <- predict(model1, newdata = test, type = "response")
test$Satisfaction_Predicted <- pred_test1
head(test[c(1,6)], 10)
## median_sale_price Satisfaction_Predicted
## 3 330000 288627.2
## 4 280000 253640.1
## 8 510000 487462.3
## 11 160000 131538.8
## 12 259000 252882.1
## 15 197990 199380.6
## 16 425000 406432.5
## 17 290000 277456.6
## 18 150000 149193.8
## 24 225000 234277.0