1. Exploratory Data Analysis

  1. Missing values
  2. EDA Visualization
# 1. Describe the missing data and provide a summary, similar to the analysis in Chapter 2 (Table 3): count/percent of missing data per variable, type of missing data (NA, null), and total percent of missingness for the dataset
summary(data)
##  region_state        period_begin                 total_homes_sold
##  Length:8564        Min.   :2020-01-06 00:00:00   Min.   :   1.0  
##  Class :character   1st Qu.:2020-03-23 00:00:00   1st Qu.:  81.0  
##  Mode  :character   Median :2020-06-08 00:00:00   Median : 152.0  
##                     Mean   :2020-06-05 12:25:23   Mean   : 216.3  
##                     3rd Qu.:2020-08-17 00:00:00   3rd Qu.: 270.0  
##                     Max.   :2020-10-26 00:00:00   Max.   :2363.0  
##                                                   NA's   :14      
##  median_sale_price total_new_listings median_new_listing_price
##  Min.   :  68450   Min.   :   1.0     Min.   :  89000         
##  1st Qu.: 254900   1st Qu.:  98.0     1st Qu.: 265000         
##  Median : 319000   Median : 185.0     Median : 329900         
##  Mean   : 367335   Mean   : 258.9     Mean   : 379304         
##  3rd Qu.: 417500   3rd Qu.: 316.0     3rd Qu.: 429900         
##  Max.   :3022500   Max.   :2513.0     Max.   :1650000         
##  NA's   :14        NA's   :2          NA's   :2               
##  active_listings   median_active_list_price average_of_median_list_price
##  Min.   :   16.0   Min.   : 100000          Min.   :  15000             
##  1st Qu.:  852.8   1st Qu.: 289755          1st Qu.: 307400             
##  Median : 1541.0   Median : 360000          Median : 399900             
##  Mean   : 2480.9   Mean   : 411724          Mean   : 451867             
##  3rd Qu.: 2849.0   3rd Qu.: 472413          3rd Qu.: 529125             
##  Max.   :21084.0   Max.   :1599999          Max.   :3498000             
##                                                                         
##  average_of_median_offer_price median_days_on_market
##  Min.   :    9000              Min.   :  3.00       
##  1st Qu.:  300000              1st Qu.: 20.00       
##  Median :  398700              Median : 33.25       
##  Mean   :  453276              Mean   : 36.66       
##  3rd Qu.:  530000              3rd Qu.: 49.00       
##  Max.   :16825370              Max.   :363.00       
##                                NA's   :14
sum(is.na(data))
## [1] 46
(colMeans(is.na(data)))*100
##                  region_state                  period_begin 
##                    0.00000000                    0.00000000 
##              total_homes_sold             median_sale_price 
##                    0.16347501                    0.16347501 
##            total_new_listings      median_new_listing_price 
##                    0.02335357                    0.02335357 
##               active_listings      median_active_list_price 
##                    0.00000000                    0.00000000 
##  average_of_median_list_price average_of_median_offer_price 
##                    0.00000000                    0.00000000 
##         median_days_on_market 
##                    0.16347501
miss_case_summary(data)
## # A tibble: 8,564 x 3
##     case n_miss pct_miss
##    <int>  <int>    <dbl>
##  1  1082      3     27.3
##  2  1480      3     27.3
##  3  1980      3     27.3
##  4  2461      3     27.3
##  5  3630      3     27.3
##  6  4251      3     27.3
##  7  5074      3     27.3
##  8  6541      3     27.3
##  9  7021      3     27.3
## 10  7229      3     27.3
## # ... with 8,554 more rows
miss_var_summary(data)
## # A tibble: 11 x 3
##    variable                      n_miss pct_miss
##    <chr>                          <int>    <dbl>
##  1 total_homes_sold                  14   0.163 
##  2 median_sale_price                 14   0.163 
##  3 median_days_on_market             14   0.163 
##  4 total_new_listings                 2   0.0234
##  5 median_new_listing_price           2   0.0234
##  6 region_state                       0   0     
##  7 period_begin                       0   0     
##  8 active_listings                    0   0     
##  9 median_active_list_price           0   0     
## 10 average_of_median_list_price       0   0     
## 11 average_of_median_offer_price      0   0
vis_miss(data)

# 2. Plot visualization of missing data pattern
gg_miss_upset(data)

vis_dat(data)

# 3. Describe if you have observed any patterns
ggplot(data, aes(x = region_state, y = average_of_median_list_price)) +
  geom_miss_point()

#4. Run statistical analysis to determine if your data is MCAR or MAR. For example, LittleMCAR
md.pattern(data)

##      region_state period_begin active_listings median_active_list_price
## 8548            1            1               1                        1
## 14              1            1               1                        1
## 2               1            1               1                        1
##                 0            0               0                        0
##      average_of_median_list_price average_of_median_offer_price
## 8548                            1                             1
## 14                              1                             1
## 2                               1                             1
##                                 0                             0
##      total_new_listings median_new_listing_price total_homes_sold
## 8548                  1                        1                1
## 14                    1                        1                0
## 2                     0                        0                1
##                       2                        2               14
##      median_sale_price median_days_on_market   
## 8548                 1                     1  0
## 14                   0                     0  3
## 2                    1                     1  2
##                     14                    14 46
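#As a hedged sketch of the statistical test mentioned above (Little's MCAR test):
#assuming the installed naniar version provides mcar_test(), it can be run on the
#numeric columns. A small p-value would argue against MCAR (suggesting MAR/MNAR),
#while a large p-value is consistent with MCAR.
mcar_test(data[sapply(data, is.numeric)])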
#5. Explain what type of imputation will be performed: list-wise/pair-wise deletion, mean imputation, regression imputation, etc.
#Because overall missingness is very low (46 cells in total; no variable exceeds 0.17% missing), list-wise deletion with na.omit() is used below. A mean-imputation alternative is sketched first for comparison, before the deletion step.
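#Sketch of the mean-imputation alternative (applied to a copy only, and not used in
#the rest of the analysis; mean imputation shrinks variance, so it is shown purely
#for illustration).
impute_mean <- function(x) { x[is.na(x)] <- mean(x, na.rm = TRUE); x }  # fill NAs with the column mean
data_mean_imp <- data                                                   # work on a copy of the data
num_cols <- sapply(data_mean_imp, is.numeric)                           # impute numeric columns only
data_mean_imp[num_cols] <- lapply(data_mean_imp[num_cols], impute_mean)
sum(is.na(data_mean_imp))                                               # numeric missingness is now 0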
data <- na.omit(data)
sum(is.na(data))
## [1] 0
# 1. Create Univariate analysis for the variable of your interest (your Y variable). Calculate skewness and kurtosis and describe the results.
#My Y variable is the median sale price. The skewness of the Y variable is 3.39, indicating a substantially positively (right-) skewed distribution.
skewness(data$median_sale_price)
## [1] 3.389518
hist(data$median_sale_price)

kurtosis(data$median_sale_price)
## [1] 22.17451
#The kurtosis is 22.17, indicating that the distribution is heavily peaked (leptokurtic). It is the result of infrequent extreme deviations (many scattered very high prices).
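#Given the strong right skew, a log transform is a common remedy. The sketch below
#(not applied to the downstream analysis) shows how the shape and skewness change;
#it assumes the same skewness() function loaded above.
hist(log(data$median_sale_price), main = "Histogram of log(median_sale_price)")
skewness(log(data$median_sale_price))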

# 2. Create a bivariate box plot for your Y variable and one of the other important metrics (your X). Describe the figure.
#My X variable is the start date of the listing period; in the dataset it is period_begin. For most periods, the median sale price stays under $500,000, with some outliers above $500,000. In April and August there are extreme outliers that exceed $2,500,000.
#Out of curiosity, I also used the state as X; in the dataset it is region_state. Median sale prices vary from state to state: California's median is above $500,000 with many outliers around $1,500,000, while New York, although its volume is not as high as California's, has many extreme outliers above $2,000,000 and even $2,500,000.
boxplot(data$median_sale_price ~ data$period_begin, data = data)

boxplot(data$median_sale_price ~ data$region_state, data = data)

# 4. Create a multivariate plot - Use the same plot as in 3 but add another important variable using colored symbols. Describe Figure. Make sure to add legend.
#I embedded both date and state into the plot. Most high-priced homes are in CA, and some are in NY. The limitation is that with this many states as a categorical variable, the colors are hard to tell apart. I will work on trimming the dataset in a future iteration (a trimmed sketch follows the plot below).
ggplot(data, aes(period_begin, median_sale_price, color = region_state)) + geom_point()
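#A sketch of the "trimming" mentioned above: restricting the colour aesthetic to a
#few high-price states keeps the legend readable. The state codes come from the
#data; which states to keep is my own (illustrative) choice.
ggplot(subset(data, region_state %in% c("CA", "NY", "WA", "MA")),
       aes(period_begin, median_sale_price, color = region_state)) +
  geom_point(alpha = 0.5)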

#5. Dive into the states with exceptionally high price & large population: CA, NY; RI, DMV
dataCA <- read_excel("C:/Users/junol/OneDrive/2 Harrisburg University/Semester 5_2020 Late Fall - fee, transcript/699 Edmund Maher, Tue 6PM, 1024, 1205 0103 7-11AM/House Sale Data/Redfin/Final Project/11variablesCA.xlsx")
boxplot(dataCA$median_sale_price ~ dataCA$region_id, data = dataCA)

dataNY <- read_excel("C:/Users/junol/OneDrive/2 Harrisburg University/Semester 5_2020 Late Fall - fee, transcript/699 Edmund Maher, Tue 6PM, 1024, 1205 0103 7-11AM/House Sale Data/Redfin/Final Project/11variablesNY.xlsx")
boxplot(dataNY$median_sale_price ~ dataNY$region_id, data = dataNY)

dataRI <- read_excel("C:/Users/junol/OneDrive/2 Harrisburg University/Semester 5_2020 Late Fall - fee, transcript/699 Edmund Maher, Tue 6PM, 1024, 1205 0103 7-11AM/House Sale Data/Redfin/Final Project/11variablesRI.xlsx")
boxplot(dataRI$median_sale_price ~ dataRI$region_name, data = dataRI)

dataDMV <- read_excel("C:/Users/junol/OneDrive/2 Harrisburg University/Semester 5_2020 Late Fall - fee, transcript/699 Edmund Maher, Tue 6PM, 1024, 1205 0103 7-11AM/House Sale Data/Redfin/Final Project/11variablesDMV.xlsx")
boxplot(dataDMV$median_sale_price ~ dataDMV$region_name, data = dataDMV)

dataVA <- read_excel("C:/Users/junol/OneDrive/2 Harrisburg University/Semester 5_2020 Late Fall - fee, transcript/699 Edmund Maher, Tue 6PM, 1024, 1205 0103 7-11AM/House Sale Data/Redfin/Final Project/11variablesVA.xlsx")
boxplot(dataVA$median_sale_price ~ dataVA$region_id, data = dataVA)

2. Correlation Analysis

  1. Linear regression
  2. Logistic regression
# Step 1. Scale or normalize your data. Make sure to apply imputation if needed
#The output shows that the three selected numeric variables (median_sale_price, total_new_listings, median_days_on_market) have been standardized to mean zero and unit variance.
preproc1 <- preProcess(data[,c(4,5,11)], method=c("center", "scale"))
data_scaled <- predict(preproc1, data[,c(4,5,11)])
summary(data_scaled)
##  median_sale_price total_new_listings median_days_on_market
##  Min.   :-1.5193   Min.   :-0.9273    Min.   :-1.4734      
##  1st Qu.:-0.5716   1st Qu.:-0.5789    1st Qu.:-0.7290      
##  Median :-0.2458   Median :-0.2629    Median :-0.1598      
##  Mean   : 0.0000   Mean   : 0.0000    Mean   : 0.0000      
##  3rd Qu.: 0.2548   3rd Qu.: 0.2039    3rd Qu.: 0.5408      
##  Max.   :13.4952   Max.   : 8.0932    Max.   :14.2904
#or we can use the scale function
#data_scaled <- as.data.frame(scale(data[,c(4,5,11)]))
#summary(data_scaled)

# Step 2. Build a multiple linear regression model or logistic regression (based on your Y)
#The multiple linear regression model explores the relationship between the median sale price (Y) and two predictors: median days on market (X1) and total new listings in the area (X2).
linear_model <- lm(data_scaled$median_sale_price ~ data_scaled$median_days_on_market + data_scaled$total_new_listings, data=data_scaled)

# Step 3. Print summary and interpret table (see lecture slides). Describe the summary
#The summary shows that both median days on market (X1) and total new listings in the area (X2) are statistically significant predictors of the median sale price (Y), as both p-values (Pr(>|t|)) are less than 0.05. Note, however, that the R-squared is only about 0.01, so these two predictors explain very little of the variance in price.
summary(linear_model)
## 
## Call:
## lm(formula = data_scaled$median_sale_price ~ data_scaled$median_days_on_market + 
##     data_scaled$total_new_listings, data = data_scaled)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1.5697 -0.5671 -0.2375  0.2390 14.8289 
## 
## Coefficients:
##                                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                       -3.326e-17  1.076e-02   0.000  1.00000    
## data_scaled$median_days_on_market -9.413e-02  1.076e-02  -8.746  < 2e-16 ***
## data_scaled$total_new_listings     3.720e-02  1.076e-02   3.457  0.00055 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.995 on 8545 degrees of freedom
## Multiple R-squared:  0.01021,    Adjusted R-squared:  0.009978 
## F-statistic: 44.07 on 2 and 8545 DF,  p-value: < 2.2e-16
explanatory = c("median_days_on_market", "total_new_listings")
dependent = 'median_sale_price'
data_scaled %>%
  finalfit(dependent, explanatory)
##  Dependent: median_sale_price                  unit      value
##         median_days_on_market [-1.5,14.3] Mean (sd) -0.0 (1.0)
##            total_new_listings  [-0.9,8.1] Mean (sd) -0.0 (1.0)
##        Coefficient (univariable)     Coefficient (multivariable)
##  -0.09 (-0.12 to -0.07, p<0.001) -0.09 (-0.12 to -0.07, p<0.001)
##     0.04 (0.02 to 0.06, p=0.001)    0.04 (0.02 to 0.06, p=0.001)
plot(linear_model)

#For interpretation of the diagnostic plots, see:
#https://data.library.virginia.edu/diagnostic-plots/

# Step 5. I would also like to explore the correlation between the variables
#From the plot below, there is almost no correlation between median days on market (X1) and total new listings in the area (X2), which is good because it means multicollinearity is not a concern. It also makes sense that a house's price would be lower when it has been listed for a long time, since a long time on the market signals less buyer interest.
corr <- round(cor(data_scaled),1)
ggcorrplot(corr, hc.order = TRUE, type = "lower", lab = TRUE, outline.col = "white",
           ggtheme = ggplot2::theme_grey,
           colors = c("#6D9EC1", "white", "#E46726"))

ggcorrplot(cor(data_scaled), p.mat = cor_pmat(data_scaled), hc.order=TRUE, type='lower')
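#A quick numeric check of the claim above (sketch): the Pearson correlation between
#the two predictors should be close to zero.
cor.test(data_scaled$median_days_on_market, data_scaled$total_new_listings)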

# # Step 4. Perform another model and evaluate which model performs better.
# ##I attempted a logistic regression of the median sale price (Y) on the region state (X3), a categorical variable. The result was not significant, indicating no strong relationship between Y and X3; the linear model performs better. Note that a binomial GLM strictly needs a binary outcome, so Y would have to be dichotomized first (a runnable sketch follows the commented code below).

# data2 <- data
# #data_complete2$median_sale_price <- log(data_complete2$median_sale_price)
# preproc2 <- preProcess(data2[, c(1,4)], method=c("range"))
# data_scaled2 <- predict(preproc2, data2)
# summary(data_scaled2)
# 
# logit_model = glm(formula = data_scaled2$median_sale_price ~ data_scaled2$region_state, data = data_scaled2, 
#               family = binomial)
# 
# summary(logit_model)
# 
# #Print the table
# explanatory2 = c("region_state")
# dependent2 = 'median_sale_price'
# data_scaled2 %>%
#   finalfit(dependent2, explanatory2)
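#A hedged sketch of a logistic model that satisfies glm()'s binomial requirement:
#dichotomize the outcome first (here a hypothetical flag, high_price, for sales above
#the overall median price) and regress it on the scaled predictors. This is only an
#illustration of the mechanics, not a model used elsewhere in the project.
data_logit <- data_scaled
data_logit$high_price <- as.integer(data$median_sale_price >
                                      median(data$median_sale_price))  # 1 = above-median sale price
logit_sketch <- glm(high_price ~ median_days_on_market + total_new_listings,
                    data = data_logit, family = binomial)
summary(logit_sketch)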

3. Cluster Analysis

  1. K-means cluster analysis
  2. Hierarchical analysis
#Use this dataframe: df
# Step 1. Find the optimal number of clusters (elbow, gap or silhouette methods)
##https://uc-r.github.io/kmeans_clustering#elbow
#The elbow plot suggests that 6 is the optimal number of clusters, as that is where the bend in the knee/elbow appears. The average silhouette plot points to a smaller number (about 3; see below), and the gap statistic method suggests 1, which I ignore. I choose 6 as the number of clusters.
#method 1: elbow method
set.seed(123)

# function to compute total within-cluster sum of square 
wss <- function(k) {
  kmeans(df, k, nstart = 10 )$tot.withinss
}

# Compute and plot wss for k = 1 to k = 15
k.values <- 1:15

# extract wss for 1-15 clusters
wss_values <- map_dbl(k.values, wss)

plot(k.values, wss_values,
       type="b", pch = 19, frame = FALSE, 
       xlab="Number of clusters K",
       ylab="Total within-clusters sum of squares")

#6 is the optimal number of clusters
set.seed(123)
fviz_nbclust(df, kmeans, method = "wss")

#method 2: average silhouette method
avg_sil <- function(k) {
  km.res <- kmeans(df, centers = k, nstart = 25)
  ss <- silhouette(km.res$cluster, dist(df))
  mean(ss[, 3])
}

# Compute and plot the average silhouette for k = 2 to k = 15
k.values <- 2:15

# extract avg silhouette for 2-15 clusters
avg_sil_values <- map_dbl(k.values, avg_sil)

plot(k.values, avg_sil_values,
       type = "b", pch = 19, frame = FALSE, 
       xlab = "Number of clusters K",
       ylab = "Average Silhouettes")

fviz_nbclust(df, kmeans, method = "silhouette")

#The silhouette plot suggests 3 as the optimal number; I still choose 6 based on the elbow plot.

#method 3: gap statistic
set.seed(123)
gap_stat <- clusGap(df, FUN = kmeans, nstart = 25,
                    K.max = 10, B = 50)
print(gap_stat, method = "firstmax")
## Clustering Gap statistic ["clusGap"] from call:
## clusGap(x = df, FUNcluster = kmeans, K.max = 10, B = 50, nstart = 25)
## B=50 simulated reference sets, k = 1..10; spaceH0="scaledPCA"
##  --> Number of clusters (method 'firstmax'): 1
##           logW   E.logW       gap     SE.sim
##  [1,] 2.853226 3.319462 0.4662363 0.05856958
##  [2,] 2.536996 2.990989 0.4539934 0.05926627
##  [3,] 2.272295 2.734368 0.4620727 0.06141266
##  [4,] 2.038943 2.530756 0.4918135 0.06476126
##  [5,] 1.900035 2.373808 0.4737725 0.07111306
##  [6,] 1.748845 2.238805 0.4899595 0.07314937
##  [7,] 1.609755 2.113089 0.5033344 0.07281482
##  [8,] 1.513977 2.001064 0.4870869 0.06957714
##  [9,] 1.374078 1.893535 0.5194576 0.07168146
## [10,] 1.257140 1.796631 0.5394912 0.07193585
fviz_gap_stat(gap_stat)

# Step 2. Perform the K-Means cluster analysis
distance <- get_dist(df)
fviz_dist(distance, gradient = list(low = "#00AFBB", mid = "white", high = "#FC4E07"))

k6 <- kmeans(df, centers = 6, nstart = 25)
str(k6)
## List of 9
##  $ cluster     : Named int [1:44] 3 3 3 4 6 2 5 6 3 1 ...
##   ..- attr(*, "names")= chr [1:44] "AK" "AL" "AR" "AZ" ...
##  $ centers     : num [1:6, 1:2] -0.295 0.717 -0.416 -0.348 -0.669 ...
##   ..- attr(*, "dimnames")=List of 2
##   .. ..$ : chr [1:6] "1" "2" "3" "4" ...
##   .. ..$ : chr [1:2] "mean_7msp" "mean_8tnl"
##  $ totss       : num 86
##  $ withinss    : num [1:6] 1.02 1.47 2.59 0 2.15 ...
##  $ tot.withinss: num 9.01
##  $ betweenss   : num 77
##  $ size        : int [1:6] 4 8 15 1 12 4
##  $ iter        : int 3
##  $ ifault      : int 0
##  - attr(*, "class")= chr "kmeans"
cluster_plot <- fviz_cluster(k6, data = df)
cluster_plot
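#Sketch: list which states fall into each of the six clusters; the state codes are
#taken from the names of the k-means cluster assignments (see str(k6) above).
split(names(k6$cluster), k6$cluster)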

#try when number of clusters (k) is 3, 4, and 5
k3 <- kmeans(df, centers = 3, nstart = 25)
k4 <- kmeans(df, centers = 4, nstart = 25)
k5 <- kmeans(df, centers = 5, nstart = 25)
#plot to compare
p1 <- fviz_cluster(k6, geom = "point", data = df) + ggtitle("k = 6")
p2 <- fviz_cluster(k3, geom = "point",  data = df) + ggtitle("k = 3")
p3 <- fviz_cluster(k4, geom = "point",  data = df) + ggtitle("k = 4")
p4 <- fviz_cluster(k5, geom = "point",  data = df) + ggtitle("k = 5")

#library(gridExtra)
grid.arrange(p1, p2, p3, p4, nrow = 2)

#Most of the descriptions are embedded in the separate steps. The column/axis name “mean_7msp” is the mean of the median sale price for each state; “mean_8tnl” is the mean of total new listings for each state. From the k-means clustering result, we see that NY, DC, CA, and HI have the highest average sale prices. The hierarchical analysis below suggests the same. A sketch of how df could be rebuilt from the main dataset follows.
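#For reproducibility, a sketch of how a per-state frame like df could be built from
#the main dataset (assumption: the actual df was constructed along these lines and
#then standardized; df_sketch below is only illustrative).
df_sketch <- aggregate(cbind(mean_7msp = median_sale_price,
                             mean_8tnl = total_new_listings) ~ region_state,
                       data = data, FUN = mean)               # per-state means of price and new listings
rownames(df_sketch) <- df_sketch$region_state                 # use the state codes as row names
df_sketch <- scale(df_sketch[, c("mean_7msp", "mean_8tnl")])  # standardize both columns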
# Step 3. Perform the hierarchical analysis
##https://uc-r.github.io/hc_clustering
# Dissimilarity matrix
d <- dist(df, method = "euclidean")

# Hierarchical clustering using Complete Linkage
hc1 <- hclust(d, method = "complete" )

# Plot the obtained dendrogram
plot(hc1, cex = 0.6, hang = -1)

# set.seed(1234)
# x <- rnorm(12, mean = rep(1:3, each = 4), sd = 0.2)
# y <- rnorm(12, mean = rep(c(1, 2, 1), each = 4), sd = 0.2)
# plot(x, y, col = "blue", pch = 19, cex = 2)
# #text(x + 0.05, y + 0.05, labels = as.character(1:12))
# 
# dataFrame <- data.frame(x, y)
# kmeansObj <- kmeans(dataFrame, centers = 3)
# names(kmeansObj)
# kmeansObj$cluster
# 
# plot(kmeansObj$cluster)

##Alternatively, we can use the agnes function.
# Compute with agnes
hc2 <- agnes(df, method = "complete")
# Agglomerative coefficient
hc2$ac
## [1] 0.9276342
# methods to assess
m <- c( "average", "single", "complete", "ward")
names(m) <- c( "average", "single", "complete", "ward")

# function to compute coefficient
ac <- function(x) {
  agnes(df, method = x)$ac
}

map_dbl(m, ac)
##   average    single  complete      ward 
## 0.9114313 0.8612660 0.9276342 0.9509801
hc3 <- agnes(df, method = "ward")
pltree(hc3, cex = 0.6, hang = -1, main = "Dendrogram of agnes")

# compute divisive hierarchical clustering
hc4 <- diana(df)

# Divisive coefficient; amount of clustering structure found
hc4$dc
## [1] 0.9182127
# plot dendrogram
pltree(hc4, cex = 0.6, hang = -1, main = "Dendrogram of diana")

# Ward's method
hc5 <- hclust(d, method = "ward.D2" )

# Cut tree into 4 groups
sub_grp <- cutree(hc5, k = 4)

# Number of members in each cluster
table(sub_grp)
## sub_grp
##  1  2  3  4 
## 26  5  4  9
plot(hc5, cex = 0.6)

#rect.hclust(hc5, k = 6, border = 1:6)
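#Sketch: the four hierarchical groups from the Ward cut above can also be drawn in
#the same style as the k-means plot by passing the cut assignments to fviz_cluster().
fviz_cluster(list(data = df, cluster = sub_grp))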

# Cut agnes() tree into 6 groups
hc_a <- agnes(df, method = "ward")
cutree(as.hclust(hc_a), k = 6)
## AK AL AR AZ CA CO CT DC DE FL GA HI IA ID IL IN KS KY LA MA MD ME MI MN MO NC 
##  1  1  1  2  3  4  5  3  1  6  5  3  5  4  6  5  1  5  1  4  4  1  5  5  5  5 
## NE NH NJ NM NV NY OH OK OR PA RI SC TN TX UT VA WA WI 
##  1  1  4  1  6  3  5  5  4  5  1  1  1  6  4  4  4  1
# Cut diana() tree into 6 groups
hc_d <- diana(df)
cutree(as.hclust(hc_d), k = 6)
## AK AL AR AZ CA CO CT DC DE FL GA HI IA ID IL IN KS KY LA MA MD ME MI MN MO NC 
##  1  2  2  3  4  1  2  5  2  6  2  5  2  1  6  2  2  2  2  5  1  1  2  2  2  2 
## NE NH NJ NM NV NY OH OK OR PA RI SC TN TX UT VA WA WI 
##  2  1  1  2  6  5  2  2  1  2  2  2  2  6  1  1  1  2
# Compute distance matrix
res.dist <- dist(df, method = "euclidean")

# Compute 2 hierarchical clusterings
hc1 <- hclust(res.dist, method = "complete")
hc2 <- hclust(res.dist, method = "ward.D2")

# Create two dendrograms
dend1 <- as.dendrogram (hc1)
dend2 <- as.dendrogram (hc2)

tanglegram(dend1, dend2)

#compute the entanglement
dend_list <- dendlist(dend1, dend2)

# tanglegram(dend1, dend2,
#   highlight_distinct_edges = FALSE, # Turn-off dashed lines
#   common_subtrees_color_lines = FALSE, # Turn-off line colors
#   common_subtrees_color_branches = TRUE, # Color common branches 
#   main = paste("entanglement =", round(entanglement(dend_list), 2))
#   )

tanglegram(dend1, dend2,
  highlight_distinct_edges = TRUE, # Draw dashed lines for edges unique to one tree
  common_subtrees_color_lines = TRUE, # Color the connecting lines of common subtrees
  common_subtrees_color_branches = TRUE, # Color common branches
  main = paste("entanglement =", round(entanglement(dend_list), 2))
  )
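#Sketch: besides the visual tanglegram, the agreement between the two trees can be
#summarised numerically with the cophenetic correlation from dendextend.
cor.dendlist(dend_list, method = "cophenetic")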

#plot all 4 dendrograms in one graph
# d1 <- plot(hc1, cex = 0.6, hang = -1)
# d2 <- pltree(hc3, cex = 0.6, hang = -1, main = "Dendrogram of agnes")
# d3 <- pltree(hc4, cex = 0.6, hang = -1, main = "Dendrogram of diana")
# d4 <- plot(hc5, cex = 0.6)
# grid.arrange(d1, d2, d3, d4, nrow = 2)

4. Factor Analysis

There is no need to scale the data in this step; the original data are used. Reason: the factor-analysis result is the same either way, and the original (unscaled) data are needed later for prediction.

# ## Step 1 - Data Description
# How many dimensions?
# Answer: There are 8,548 observations * 11 variables (after the list-wise deletion above).
# What are the variable types?
# Answer: 9 of them are numeric, 1 is character (region_state), and 1 is a date-time (period_begin).
# What are the variable names?
# Answer: See below
# Remove all non-numeric variables and create a dataset called data_X.

#Here we remove the non-numeric variables (region_state and period_begin) and create a dataset called data_X
dim(data)
## [1] 8548   11
str(data)
## tibble [8,548 x 11] (S3: tbl_df/tbl/data.frame)
##  $ region_state                 : chr [1:8548] "GA" "TX" "NH" "MN" ...
##  $ period_begin                 : POSIXct[1:8548], format: "2020-08-03" "2020-04-13" ...
##  $ total_homes_sold             : num [1:8548] 71 375 81 165 104 194 332 287 232 247 ...
##  $ median_sale_price            : num [1:8548] 160000 275000 330000 280000 249500 ...
##  $ total_new_listings           : num [1:8548] 70 486 58 191 61 315 406 265 281 277 ...
##  $ median_new_listing_price     : num [1:8548] 185900 275000 304750 274900 230000 ...
##  $ active_listings              : num [1:8548] 590 6402 563 1183 834 ...
##  $ median_active_list_price     : num [1:8548] 180000 314900 324900 275000 269700 ...
##  $ average_of_median_list_price : num [1:8548] 149000 238000 329000 544450 419000 ...
##  $ average_of_median_offer_price: num [1:8548] 152000 249000 300000 527450 395000 ...
##  $ median_days_on_market        : num [1:8548] 20 31 47 17 63 10 51.5 22 7 7 ...
##  - attr(*, "na.action")= 'omit' Named int [1:16] 1082 1480 1980 2461 3630 4251 5074 5469 5890 6541 ...
##   ..- attr(*, "names")= chr [1:16] "1082" "1480" "1980" "2461" ...
names(data)
##  [1] "region_state"                  "period_begin"                 
##  [3] "total_homes_sold"              "median_sale_price"            
##  [5] "total_new_listings"            "median_new_listing_price"     
##  [7] "active_listings"               "median_active_list_price"     
##  [9] "average_of_median_list_price"  "average_of_median_offer_price"
## [11] "median_days_on_market"
data_X <- select(data, -c(1:2))
str(data_X)
## tibble [8,548 x 9] (S3: tbl_df/tbl/data.frame)
##  $ total_homes_sold             : num [1:8548] 71 375 81 165 104 194 332 287 232 247 ...
##  $ median_sale_price            : num [1:8548] 160000 275000 330000 280000 249500 ...
##  $ total_new_listings           : num [1:8548] 70 486 58 191 61 315 406 265 281 277 ...
##  $ median_new_listing_price     : num [1:8548] 185900 275000 304750 274900 230000 ...
##  $ active_listings              : num [1:8548] 590 6402 563 1183 834 ...
##  $ median_active_list_price     : num [1:8548] 180000 314900 324900 275000 269700 ...
##  $ average_of_median_list_price : num [1:8548] 149000 238000 329000 544450 419000 ...
##  $ average_of_median_offer_price: num [1:8548] 152000 249000 300000 527450 395000 ...
##  $ median_days_on_market        : num [1:8548] 20 31 47 17 63 10 51.5 22 7 7 ...
##  - attr(*, "na.action")= 'omit' Named int [1:16] 1082 1480 1980 2461 3630 4251 5074 5469 5890 6541 ...
##   ..- attr(*, "names")= chr [1:16] "1082" "1480" "1980" "2461" ...
## Step 2 - Correlation Matrix
# 1. active_listings and total_new_listings are highly correlated.
# 2. active_listings and total_homes_sold are highly correlated.
# 3. total_homes_sold and total_new_listings are highly correlated.
# 4. median_sale_price and median_new_listing_price are highly correlated.
# 5. median_sale_price and median_active_list_price are highly correlated.
# The Variance Inflation Factor (VIF) is a test of multicollinearity; a VIF value greater than 2.5 may be a cause for concern.
# The result shows most variables have a VIF of over 2.5, except for average_of_median_offer_price and median_days_on_market.


datamatrix <- cor(data_X)
corrplot(datamatrix, order="hclust", type='upper', tl.srt = 45)

#ggcorrplot(datamatrix, p.mat = cor_pmat(data_X), hc.order=TRUE, type='lower')
#corrplot(datamatrix, is.corr = TRUE, win.asp = 1, method = "color", type='lower')
res2 <- rcorr(as.matrix(data_X), type="pearson")
# Extract the correlation coefficients
res2$r
##                               total_homes_sold median_sale_price
## total_homes_sold                   1.000000000        0.01488874
## median_sale_price                  0.014888742        1.00000000
## total_new_listings                 0.930001291        0.03674503
## median_new_listing_price           0.013822109        0.96148668
## active_listings                    0.828954932       -0.01779088
## median_active_list_price           0.020982650        0.96433240
## average_of_median_list_price       0.010647286        0.70945268
## average_of_median_offer_price      0.009417138        0.56716260
## median_days_on_market             -0.067720633       -0.09394526
##                               total_new_listings median_new_listing_price
## total_homes_sold                      0.93000129              0.013822109
## median_sale_price                     0.03674503              0.961486683
## total_new_listings                    1.00000000              0.051483548
## median_new_listing_price              0.05148355              1.000000000
## active_listings                       0.90251493             -0.002368816
## median_active_list_price              0.05197751              0.981059948
## average_of_median_list_price          0.03244726              0.721110264
## average_of_median_offer_price         0.02438527              0.575681284
## median_days_on_market                 0.00486223             -0.088338649
##                               active_listings median_active_list_price
## total_homes_sold                  0.828954932              0.020982650
## median_sale_price                -0.017790879              0.964332401
## total_new_listings                0.902514931              0.051977512
## median_new_listing_price         -0.002368816              0.981059948
## active_listings                   1.000000000             -0.003662179
## median_active_list_price         -0.003662179              1.000000000
## average_of_median_list_price     -0.005718765              0.714973064
## average_of_median_offer_price    -0.009827100              0.572153538
## median_days_on_market             0.207782640             -0.112967858
##                               average_of_median_list_price
## total_homes_sold                               0.010647286
## median_sale_price                              0.709452677
## total_new_listings                             0.032447257
## median_new_listing_price                       0.721110264
## active_listings                               -0.005718765
## median_active_list_price                       0.714973064
## average_of_median_list_price                   1.000000000
## average_of_median_offer_price                  0.766356259
## median_days_on_market                         -0.071201590
##                               average_of_median_offer_price
## total_homes_sold                                0.009417138
## median_sale_price                               0.567162597
## total_new_listings                              0.024385269
## median_new_listing_price                        0.575681284
## active_listings                                -0.009827100
## median_active_list_price                        0.572153538
## average_of_median_list_price                    0.766356259
## average_of_median_offer_price                   1.000000000
## median_days_on_market                          -0.076249786
##                               median_days_on_market
## total_homes_sold                        -0.06772063
## median_sale_price                       -0.09394526
## total_new_listings                       0.00486223
## median_new_listing_price                -0.08833865
## active_listings                          0.20778264
## median_active_list_price                -0.11296786
## average_of_median_list_price            -0.07120159
## average_of_median_offer_price           -0.07624979
## median_days_on_market                    1.00000000
# Extract p-values
res2$P
##                               total_homes_sold median_sale_price
## total_homes_sold                            NA      0.1686917912
## median_sale_price                 1.686918e-01                NA
## total_new_listings                0.000000e+00      0.0006789854
## median_new_listing_price          2.013190e-01      0.0000000000
## active_listings                   0.000000e+00      0.1000208971
## median_active_list_price          5.239317e-02      0.0000000000
## average_of_median_list_price      3.249764e-01      0.0000000000
## average_of_median_offer_price     3.839960e-01      0.0000000000
## median_days_on_market             3.670846e-10      0.0000000000
##                               total_new_listings median_new_listing_price
## total_homes_sold                    0.000000e+00             2.013190e-01
## median_sale_price                   6.789854e-04             0.000000e+00
## total_new_listings                            NA             1.913098e-06
## median_new_listing_price            1.913098e-06                       NA
## active_listings                     0.000000e+00             8.266671e-01
## median_active_list_price            1.523344e-06             0.000000e+00
## average_of_median_list_price        2.697358e-03             0.000000e+00
## average_of_median_offer_price       2.416126e-02             0.000000e+00
## median_days_on_market               6.530881e-01             4.440892e-16
##                               active_listings median_active_list_price
## total_homes_sold                    0.0000000             5.239317e-02
## median_sale_price                   0.1000209             0.000000e+00
## total_new_listings                  0.0000000             1.523344e-06
## median_new_listing_price            0.8266671             0.000000e+00
## active_listings                            NA             7.349564e-01
## median_active_list_price            0.7349564                       NA
## average_of_median_list_price        0.5970429             0.000000e+00
## average_of_median_offer_price       0.3636366             0.000000e+00
## median_days_on_market               0.0000000             0.000000e+00
##                               average_of_median_list_price
## total_homes_sold                              3.249764e-01
## median_sale_price                             0.000000e+00
## total_new_listings                            2.697358e-03
## median_new_listing_price                      0.000000e+00
## active_listings                               5.970429e-01
## median_active_list_price                      0.000000e+00
## average_of_median_list_price                            NA
## average_of_median_offer_price                 0.000000e+00
## median_days_on_market                         4.387202e-11
##                               average_of_median_offer_price
## total_homes_sold                               3.839960e-01
## median_sale_price                              0.000000e+00
## total_new_listings                             2.416126e-02
## median_new_listing_price                       0.000000e+00
## active_listings                                3.636366e-01
## median_active_list_price                       0.000000e+00
## average_of_median_list_price                   0.000000e+00
## average_of_median_offer_price                            NA
## median_days_on_market                          1.677769e-12
##                               median_days_on_market
## total_homes_sold                       3.670846e-10
## median_sale_price                      0.000000e+00
## total_new_listings                     6.530881e-01
## median_new_listing_price               4.440892e-16
## active_listings                        0.000000e+00
## median_active_list_price               0.000000e+00
## average_of_median_list_price           4.387202e-11
## average_of_median_offer_price          1.677769e-12
## median_days_on_market                            NA
# # Insignificant correlations are left blank
# corrplot(res2$r, type="upper", order="hclust",
#          p.mat = res2$P, sig.level = 0.01, insig = "blank")
#Recall assumptions of Linear Regression
model <- lm(median_sale_price ~., data = data_X)
plot(model)

vif(model)
##              total_homes_sold            total_new_listings 
##                      7.831238                     13.263203 
##      median_new_listing_price               active_listings 
##                     27.849463                      6.998424 
##      median_active_list_price  average_of_median_list_price 
##                     27.369133                      3.389441 
## average_of_median_offer_price         median_days_on_market 
##                      2.432315                      1.361067
## Step 3 - KMO
# The overall MSA is 0.69 > 0.5, which means factor analysis is appropriate for this data.
data_fa <- data_X[,-2] #remove the Dependent Variable - median_sale_price
datamatrix <- cor(data_fa)
KMO(r=datamatrix)
## Kaiser-Meyer-Olkin factor adequacy
## Call: KMO(r = datamatrix)
## Overall MSA =  0.69
## MSA for each item = 
##              total_homes_sold            total_new_listings 
##                          0.74                          0.64 
##      median_new_listing_price               active_listings 
##                          0.65                          0.71 
##      median_active_list_price  average_of_median_list_price 
##                          0.66                          0.80 
## average_of_median_offer_price         median_days_on_market 
##                          0.77                          0.21
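#As a complementary check to the KMO (sketch), Bartlett's test of sphericity from the
#psych package tests whether the correlation matrix differs from an identity matrix;
#a small p-value supports running a factor analysis.
cortest.bartlett(datamatrix, n = nrow(data_fa))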
# ## Step 4 - Number of Factors
# By examining the "scree" plot of the successive eigenvalues, we can see an elbow after the first few factors; 4 factors are retained.
# We will use 4 factors to perform the factor analysis, with an orthogonal rotation (varimax), because with orthogonal rotation the rotated factors remain uncorrelated.
ev <- eigen(cor(data_fa))
ev$values
## [1] 3.19605280 2.77311500 1.02901933 0.60218172 0.20407811 0.12730761 0.04985460
## [8] 0.01839082
plot(ev$values)
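#Sketch: parallel analysis (psych) is another common way to choose the number of
#factors; it compares the observed eigenvalues with eigenvalues from random data.
fa.parallel(data_fa, fm = "pa", fa = "fa")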

# Step 5 - Run Factor Analysis
# The output gives us the loadings (the weight of each variable on each factor) and the cumulative proportion of variance explained, which is 90.6%.
# We effectively reduce the dimensionality from 8 variables to 4 factors while losing only about 9.4% of the variance.
nfactors <- 4
fit1 <-factanal(data_fa,nfactors,scores = c("regression"),rotation = "varimax")
print(fit1)
## 
## Call:
## factanal(x = data_fa, factors = nfactors, scores = c("regression"),     rotation = "varimax")
## 
## Uniquenesses:
##              total_homes_sold            total_new_listings 
##                         0.124                         0.005 
##      median_new_listing_price               active_listings 
##                         0.005                         0.139 
##      median_active_list_price  average_of_median_list_price 
##                         0.032                         0.167 
## average_of_median_offer_price         median_days_on_market 
##                         0.274                         0.006 
## 
## Loadings:
##                               Factor1 Factor2 Factor3 Factor4
## total_homes_sold               0.931                         
## total_new_listings             0.997                         
## median_new_listing_price               0.926   0.369         
## active_listings                0.910                   0.181 
## median_active_list_price               0.910   0.370         
## average_of_median_list_price           0.465   0.785         
## average_of_median_offer_price          0.304   0.795         
## median_days_on_market                                  0.994 
## 
##                Factor1 Factor2 Factor3 Factor4
## SS loadings      2.690   1.998   1.524   1.037
## Proportion Var   0.336   0.250   0.190   0.130
## Cumulative Var   0.336   0.586   0.776   0.906
## 
## Test of the hypothesis that 4 factors are sufficient.
## The chi square statistic is 44.14 on 2 degrees of freedom.
## The p-value is 2.6e-10
fa_var <- fa(r=data_fa, nfactors = 4, rotate="varimax", fm="pa")
fa.diagram(fa_var)

## Step 6 - Regression
##In the fitted model, the factors market_activeness, avg_deal_price, and median_days_on_market are highly significant, while price_of_listing is not significant.
head(fa_var$scores)
##               PA2        PA1        PA3         PA4
## [1,] -0.641644761 -0.8008472 -0.9150289 -0.04476018
## [2,]  0.949713400 -0.2229838 -0.7273582  0.83383110
## [3,] -0.685135322 -0.2570445 -0.4293704  0.09444202
## [4,] -0.274558940 -0.9537548  0.6684606 -0.38062604
## [5,] -0.633189370 -0.8384564  0.1358717  0.21188338
## [6,] -0.007143516  0.4963289  0.1456217 -0.98243052
regdata <- cbind(data_X[2], fa_var$scores)
#Labeling the data
names(regdata) <- c("median_sale_price", "price_of_listing", "market_activeness",
                    "avg_deal_price", "median_days_on_market")
#Split the data into train (0.7) and test (0.3) sets
#Train model
set.seed(100)
indices= sample(1:nrow(regdata), 0.7*nrow(regdata))
train=regdata[indices,]
test = regdata[-indices,]
#Regression Model using train data
model1 = lm(median_sale_price ~., train)
plot(model1)

summary(model1)
## 
## Call:
## lm(formula = median_sale_price ~ ., data = train)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -800641  -16877    1925   17362 1905712 
## 
## Coefficients:
##                       Estimate Std. Error t value Pr(>|t|)    
## (Intercept)           367520.2      695.0  528.83   <2e-16 ***
## price_of_listing         244.0      697.5    0.35    0.726    
## market_activeness     177254.3      712.9  248.65   <2e-16 ***
## avg_deal_price         73835.1      758.3   97.37   <2e-16 ***
## median_days_on_market -15469.7      763.8  -20.25   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 53740 on 5978 degrees of freedom
## Multiple R-squared:  0.9271, Adjusted R-squared:  0.9271 
## F-statistic: 1.901e+04 on 4 and 5978 DF,  p-value: < 2.2e-16
#Checking for multicollinearity VIF
vif(model1)
##      price_of_listing     market_activeness        avg_deal_price 
##              1.003285              1.006250              1.006116 
## median_days_on_market 
##              1.003252

5. Prediction

## Step 7 - Prediction
##As in Step 6, market_activeness, avg_deal_price, and median_days_on_market were the significant factors and price_of_listing was not. Below, model1 predicts the median sale price for the held-out test set, and the predictions are compared with the actual values.
pred_test1 <- predict(model1, newdata = test, type = "response")
test$predicted_sale_price <- pred_test1   # predicted median sale price for the test set
head(test[c(1,6)], 10)
##    median_sale_price predicted_sale_price
## 3             330000               288627.2
## 4             280000               253640.1
## 8             510000               487462.3
## 11            160000               131538.8
## 12            259000               252882.1
## 15            197990               199380.6
## 16            425000               406432.5
## 17            290000               277456.6
## 18            150000               149193.8
## 24            225000               234277.0
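#Sketch: summarise hold-out accuracy with the RMSE and the squared correlation
#between actual and predicted prices on the test set.
rmse_test <- sqrt(mean((test$median_sale_price - pred_test1)^2))
r2_test <- cor(test$median_sale_price, pred_test1)^2
c(RMSE = rmse_test, R_squared = r2_test)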