Updated_Mean_Sale_Data <- read_csv("~/Desktop/Data-200/Metro_mlp_uc_sfrcondo_sm_month.csv")
## Rows: 928 Columns: 73
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (3): RegionName, RegionType, StateName
## dbl (70): RegionID, SizeRank, 2018-03-31, 2018-04-30, 2018-05-31, 2018-06-30...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Updated_Median_Sale_Data <- read_csv("~/Desktop/Data-200/Metro_median_sale_price_uc_sfrcondo_sm_sa_month.csv")
## Rows: 771 Columns: 150
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (3): RegionName, RegionType, StateName
## dbl (147): RegionID, SizeRank, 2011-09-30, 2011-10-31, 2011-11-30, 2011-12-3...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Updated_Days_on_Zillow <- read_csv("~/Desktop/Data-200/Metro_mean_doz_pending_uc_sfrcondo_sm_month.csv")
## Rows: 726 Columns: 73
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (3): RegionName, RegionType, StateName
## dbl (70): RegionID, SizeRank, 2018-03-31, 2018-04-30, 2018-05-31, 2018-06-30...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
January_2019_Prices <- Updated_Mean_Sale_Data %>%
dplyr::select(RegionID, RegionName, `2019-01-31`)
January_2019_Days_on_Zillow <- Updated_Days_on_Zillow %>%
dplyr::select(RegionID, RegionName, `2019-01-31`)
January_Merge <- merge(January_2019_Prices, January_2019_Days_on_Zillow,
by = c("RegionID", "RegionName"))
January_Merge_Clean <- na.omit(January_Merge)
January_Merge_Isolate <- January_Merge_Clean %>%
dplyr::select('2019-01-31.x', '2019-01-31.y')
ggplot(data = January_Merge_Clean, aes(x = `2019-01-31.x`, y = `2019-01-31.y`)) +
geom_point() +
labs(title = "January 2019 Zillow Snapshot",
x = "Mean Price",
y = "Avg Days on Zillow")

October_2023_Prices <- Updated_Mean_Sale_Data %>%
dplyr::select(RegionID, RegionName, StateName, `2023-10-31`)
October_2023_Days_on_Zillow <- Updated_Days_on_Zillow %>%
dplyr::select(RegionID, RegionName, StateName, `2023-10-31`)
October_2023_Merge <- merge(October_2023_Prices, October_2023_Days_on_Zillow,
by = c("RegionID", "RegionName", "StateName"))
October_2023_Merge_Clean <- na.omit(October_2023_Merge)
October_2023_Merge_Isolate <- October_2023_Merge_Clean %>%
dplyr::select(`2023-10-31.x`, `2023-10-31.y`)
ggplot(data = October_2023_Merge_Clean, aes(x = `2023-10-31.x`, y = `2023-10-31.y`)) +
geom_point() +
labs(title = "October 2023 Zillow Snapshot",
x = "Mean Price",
y = "Avg Days on Zillow")

library(stats)
wcss <- numeric(length = 10) # Assuming a maximum of 10 clusters
for (i in 1:10) {
model <- kmeans(January_Merge_Isolate, centers = i)
wcss[i] <- model$tot.withinss
}
plot(1:10, wcss, type = "b", xlab = "Number of clusters", ylab = "Within-cluster sum of squares")

# Scale the data
scaled_January_Merge_Isolate <- scale(January_Merge_Isolate)
# Perform k-means clustering on the scaled data
k <- 4
set.seed(123)
January_2019_model <- kmeans(scaled_January_Merge_Isolate, centers = k)
# Plot the clustered data
plot(
January_Merge_Isolate,
col = January_2019_model$cluster,
ylab = "Days on Zillow",
xlab = "Price",
main = "January 2019"
)
points(January_2019_model$centers, col = 1:k, pch = 8, cex = 2)
abline(h = 60, col = "orange")

library(stats)
wcss <- numeric(length = 10) # Assuming a maximum of 10 clusters
for (i in 1:10) {
model <- kmeans(October_2023_Merge_Isolate, centers = i)
wcss[i] <- model$tot.withinss
}
plot(1:10, wcss, type = "b", xlab = "Number of clusters", ylab = "Within-cluster sum of squares")

k <- 4
set.seed(123)
October_2023_Merge_Isolate_scaled <- scale(October_2023_Merge_Isolate)
October_2023_model <- kmeans(October_2023_Merge_Isolate_scaled, centers = k)
plot(October_2023_Merge_Isolate,
col = October_2023_model$cluster,
ylab = "Days on Zillow",
xlab = "Price",
main = "October 2023")
points(October_2023_model$centers, col = 1:k, pch = 8, cex = 2)
abline(h = 60, col = "orange")

January_2021_Prices <- Updated_Mean_Sale_Data %>%
dplyr::select(RegionID, RegionName, `2021-01-31`)
January_2021_Days_on_Zillow <- Updated_Days_on_Zillow %>%
dplyr::select(RegionID, RegionName, `2021-01-31`)
January21_Merge <- merge(January_2021_Prices, January_2021_Days_on_Zillow,
by = c("RegionID", "RegionName"))
January21_Merge_Clean <- na.omit(January21_Merge)
January21_Merge_Isolate <- January21_Merge_Clean %>%
dplyr::select('2021-01-31.x', '2021-01-31.y')
ggplot(data = January21_Merge_Clean, aes(x = `2021-01-31.x`, y = `2021-01-31.y`)) +
geom_point() +
labs(title = "January 2021 Zillow Snapshot",
x = "Mean Price",
y = "Avg Days on Zillow")

wcss <- numeric(length = 10) # Assuming a maximum of 10 clusters
for (i in 1:10) {
model <- kmeans(January21_Merge_Isolate, centers = i)
wcss[i] <- model$tot.withinss
}
plot(1:10, wcss, type = "b", xlab = "Number of clusters", ylab = "Within-cluster sum of squares")

k <- 4
set.seed(123)
January21_Merge_Isolate_scaled <- scale(January21_Merge_Isolate)
January_2021_model <- kmeans(January21_Merge_Isolate_scaled, centers = k)
plot(January21_Merge_Isolate,
col = January_2021_model$cluster,
ylab = "Days on Zillow",
xlab = "Price",
main = "January 2021")
points(January_2021_model$centers, col = 1:k, pch = 8, cex = 2)
abline(h = 60, col = "orange")

cluster_assignments_2021 <- January_2021_model$cluster
January_2021_w_cluster <- cbind(January21_Merge_Clean, Cluster = cluster_assignments_2021)
cluster_assignments_2019 <- January_2019_model$cluster
January_2019_w_cluster <- cbind(January_Merge_Clean, Cluster = cluster_assignments_2019)
cluster_assignments_2023 <- October_2023_model$cluster
October_2023_w_cluster <- cbind(October_2023_Merge_Clean, Cluster = cluster_assignments_2023)
Linear_model_2019 <- lm(`2019-01-31.y` ~ `2019-01-31.x` , data = January_Merge_Clean)
summary(Linear_model_2019)
##
## Call:
## lm(formula = `2019-01-31.y` ~ `2019-01-31.x`, data = January_Merge_Clean)
##
## Residuals:
## Min 1Q Median 3Q Max
## -41.480 -17.524 -4.706 13.096 81.629
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 9.187e+01 2.353e+00 39.041 < 2e-16 ***
## `2019-01-31.x` -3.462e-05 7.692e-06 -4.501 8.81e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 23.52 on 411 degrees of freedom
## Multiple R-squared: 0.04698, Adjusted R-squared: 0.04466
## F-statistic: 20.26 on 1 and 411 DF, p-value: 8.81e-06
plot(`2019-01-31.y` ~ `2019-01-31.x`, data = January_Merge_Clean, main = "January 2019",
ylab = "Days on Zillow", xlab = "Price")
abline(Linear_model_2019, col = "red")

Linear_model_2021 <- lm(`2021-01-31.y` ~ `2021-01-31.x`, data = January21_Merge_Clean)
summary(Linear_model_2021)
##
## Call:
## lm(formula = `2021-01-31.y` ~ `2021-01-31.x`, data = January21_Merge_Clean)
##
## Residuals:
## Min 1Q Median 3Q Max
## -34.707 -16.867 -5.815 12.165 139.441
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.490e+01 1.976e+00 27.778 <2e-16 ***
## `2021-01-31.x` -1.048e-05 6.144e-06 -1.706 0.0886 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 23.5 on 530 degrees of freedom
## Multiple R-squared: 0.005462, Adjusted R-squared: 0.003585
## F-statistic: 2.911 on 1 and 530 DF, p-value: 0.08858
plot(`2021-01-31.y` ~ `2021-01-31.x`, data = January21_Merge_Clean, main = "January 2021",
ylab = "Days on Zillow", xlab = "Price")
abline(Linear_model_2021, col = "red")

Linear_model_2023 <- lm(`2023-10-31.y` ~ `2023-10-31.x`, data = October_2023_Merge_Clean)
summary(Linear_model_2023)
##
## Call:
## lm(formula = `2023-10-31.y` ~ `2023-10-31.x`, data = October_2023_Merge_Clean)
##
## Residuals:
## Min 1Q Median 3Q Max
## -32.007 -12.969 -3.453 9.119 82.872
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.755e+01 1.267e+00 29.647 < 2e-16 ***
## `2023-10-31.x` 1.264e-05 3.126e-06 4.043 0.0000585 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 17.84 on 723 degrees of freedom
## Multiple R-squared: 0.02211, Adjusted R-squared: 0.02075
## F-statistic: 16.34 on 1 and 723 DF, p-value: 0.00005849
plot(`2023-10-31.y` ~ `2023-10-31.x`, data = October_2023_Merge_Clean, main = "October 2023",
ylab = "Days on Zillow", xlab = "Price")
abline(Linear_model_2023, col = "red")

Sold_Above_List <- read_csv("~/Desktop/Data-200/Metro_pct_sold_above_list_uc_sfrcondo_sm_month.csv")
## Rows: 619 Columns: 72
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (3): RegionName, RegionType, StateName
## dbl (69): RegionID, SizeRank, 2018-03-31, 2018-04-30, 2018-05-31, 2018-06-30...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
January_2019_PSA <- Sold_Above_List %>%
dplyr::select(RegionID, RegionName, `2019-01-31`)
January_2019_PSA_Merge<- merge(January_Merge, January_2019_PSA,
by = c("RegionID", "RegionName"))
num_cols <- ncol(January_2019_PSA_Merge)
all_col_names <- colnames(January_2019_PSA_Merge)
new_names <- c("Price", "Days_On_Zillow", "PSA")
colnames(January_2019_PSA_Merge)[(num_cols - 2):num_cols] <- new_names
January_2019_PSA_Merge_Clean <- na.omit(January_2019_PSA_Merge)
January2019_3D_Data <- January_2019_PSA_Merge_Clean %>%
dplyr::select(Price, Days_On_Zillow, PSA)
wcss <- numeric(length = 10)
for (i in 1:10) {
model <- kmeans(January2019_3D_Data, centers = i)
wcss[i] <- model$tot.withinss
}
plot(1:10, wcss, type = "b", xlab = "Number of clusters", ylab = "Within-cluster sum of squares")

k <- 4
set.seed(123)
January2019_3D_Data <- January2019_3D_Data[, sapply(January2019_3D_Data, is.numeric)]
# Scale the numeric columns
January2019_3D_Data <- scale(January2019_3D_Data)
January2019_3D_Data <- kmeans(January2019_3D_Data, centers = k)
Cluster_Assign_3d1 <- January2019_3D_Data$cluster
data_with_clusters_2019_3D <- cbind(January_2019_PSA_Merge_Clean, Cluster = Cluster_Assign_3d1)
library(plotly)
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
data_with_clusters_2019_3D <- as.data.frame(data_with_clusters_2019_3D)
plot_3d2019 <- plot_ly(data_with_clusters_2019_3D,
x = ~Price,
y = ~Days_On_Zillow,
z = ~PSA,
color = ~Cluster,
type = "scatter3d",
mode = "markers",
marker = list(size = 5))
plot_3d2019 <- plot_3d2019 %>% layout(
scene = list(
xaxis = list(title = "Price"),
yaxis = list(title = "Days_On_Zillow"),
zaxis = list(title = "PSA")
)
)
# Show the plot
plot_3d2019
January_2021_PSA <- Sold_Above_List %>%
dplyr::select(RegionID, RegionName, `2021-01-31`)
January_2021_PSA_Merge<- merge(January21_Merge, January_2021_PSA,
by = c("RegionID", "RegionName"))
num_cols <- ncol(January_2021_PSA_Merge)
all_col_names <- colnames(January_2021_PSA_Merge)
new_names <- c("Price", "Days_On_Zillow", "PSA")
colnames(January_2021_PSA_Merge)[(num_cols - 2):num_cols] <- new_names
January_2021_PSA_Merge_Clean <- na.omit(January_2021_PSA_Merge)
January2021_3D_Data <- January_2021_PSA_Merge_Clean %>%
dplyr::select(Price, Days_On_Zillow, PSA)
wcss <- numeric(length = 10)
for (i in 1:10) {
model <- kmeans(January2021_3D_Data, centers = i)
wcss[i] <- model$tot.withinss
}
plot(1:10, wcss, type = "b", xlab = "Number of clusters", ylab = "Within-cluster sum of squares")

k <- 4
set.seed(123)
January2021_3D_Data <- January2021_3D_Data[, sapply(January2021_3D_Data, is.numeric)]
# Scale the numeric columns
January2021_3D_Data <- scale(January2021_3D_Data)
January2021_3D_Data <- kmeans(January2021_3D_Data, centers = k)
Cluster_Assign_3d1 <- January2021_3D_Data$cluster
data_with_clusters_2021_3D <- cbind(January_2021_PSA_Merge_Clean, Cluster = Cluster_Assign_3d1)
library(plotly)
data_with_clusters_2021_3D <- as.data.frame(data_with_clusters_2021_3D)
plot_3d2021 <- plot_ly(data_with_clusters_2021_3D,
x = ~Price,
y = ~Days_On_Zillow,
z = ~PSA,
color = ~Cluster,
type = "scatter3d",
mode = "markers",
marker = list(size = 5))
plot_3d2021 <- plot_3d2021 %>% layout(
scene = list(
xaxis = list(title = "Price"),
yaxis = list(title = "Days_On_Zillow"),
zaxis = list(title = "PSA")
)
)
# Show the plot
plot_3d2021
Sold_Above_List_October <- read_csv("~/Desktop/Data-200/Metro_pct_sold_above_list_uc_sfrcondo_sm_week.csv")
## Rows: 225 Columns: 303
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (3): RegionName, RegionType, StateName
## dbl (300): RegionID, SizeRank, 2018-01-27, 2018-02-03, 2018-02-10, 2018-02-1...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
October_2023_PSA <- Sold_Above_List_October %>%
dplyr::select(RegionID, RegionName, `2023-10-07`)
October_2023_PSA_Merge<- merge(October_2023_Merge, October_2023_PSA,
by = c("RegionID", "RegionName"))
num_cols <- ncol(October_2023_PSA_Merge)
all_col_names <- colnames(October_2023_PSA_Merge)
new_names <- c("Price", "Days_On_Zillow", "PSA")
colnames(October_2023_PSA_Merge)[(num_cols - 2):num_cols] <- new_names
October_2023_PSA_Merge_Clean <- na.omit(October_2023_PSA_Merge)
October2023_3D_Data <- October_2023_PSA_Merge_Clean %>%
dplyr::select(Price, Days_On_Zillow, PSA)
wcss <- numeric(length = 10)
for (i in 1:10) {
model <- kmeans(October2023_3D_Data, centers = i)
wcss[i] <- model$tot.withinss
}
plot(1:10, wcss, type = "b", xlab = "Number of clusters", ylab = "Within-cluster sum of squares")

k <- 4
set.seed(123)
October2023_3D_Data <- October2023_3D_Data[, sapply(October2023_3D_Data, is.numeric)]
# Scale the numeric columns
October2023_3D_Data <- scale(October2023_3D_Data)
October2023_3D_Data <- kmeans(October2023_3D_Data, centers = k)
Cluster_Assign_3d1 <- October2023_3D_Data$cluster
data_with_clusters_2023_3D <- cbind(October_2023_PSA_Merge_Clean, Cluster = Cluster_Assign_3d1)
library(plotly)
data_with_clusters_2023_3D <- as.data.frame(data_with_clusters_2023_3D)
plot_3d2023 <- plot_ly(data_with_clusters_2023_3D,
x = ~Price,
y = ~Days_On_Zillow,
z = ~PSA,
color = ~Cluster,
type = "scatter3d",
mode = "markers",
marker = list(size = 5))
plot_3d2023 <- plot_3d2023 %>% layout(
scene = list(
xaxis = list(title = "Price"),
yaxis = list(title = "Days_On_Zillow"),
zaxis = list(title = "PSA")
)
)
# Show the plot
plot_3d2023
mean(data_with_clusters_2019_3D$Price, na.rm = TRUE)
## [1] 279623.6
mean(data_with_clusters_2019_3D$PSA, na.rm = TRUE)
## [1] 0.1630137
hotspots_2019 <- data_with_clusters_2019_3D %>%
dplyr::filter(Days_On_Zillow <= 60)
hotspots_2019 <- hotspots_2019 %>%
mutate(Is_Hotspot = 1)
Not_hotspots_2019 <- data_with_clusters_2019_3D %>%
dplyr::filter(Days_On_Zillow >= 60)
Not_hotspots_2019 <- Not_hotspots_2019 %>%
mutate(Is_Hotspot = 0)
hotspot_2019_Merge <- rbind(hotspots_2019, Not_hotspots_2019)
mean(data_with_clusters_2021_3D$Price, na.rm = TRUE)
## [1] 288709.1
mean(data_with_clusters_2021_3D$PSA, na.rm = TRUE)
## [1] 0.2887255
hotspots_2021 <- data_with_clusters_2021_3D %>%
dplyr::filter(Days_On_Zillow <= 60)
hotspots_2021 <- hotspots_2021 %>%
mutate(Is_Hotspot = 1)
Not_hotspots_2021 <- data_with_clusters_2021_3D %>%
dplyr::filter(Days_On_Zillow >= 60)
Not_hotspots_2021 <- Not_hotspots_2021 %>%
mutate(Is_Hotspot = 0)
hotspot_2021_Merge <- rbind(hotspots_2021, Not_hotspots_2021)
mean(data_with_clusters_2023_3D$Price, na.rm = TRUE)
## [1] 409055.4
mean(data_with_clusters_2023_3D$PSA, na.rm = TRUE)
## [1] 0.3268164
hotspots_2023 <- data_with_clusters_2023_3D %>%
dplyr::filter(Days_On_Zillow <= 60)
hotspots_2023 <- hotspots_2023 %>%
mutate(Is_Hotspot = 1)
Not_hotspots_2023 <- data_with_clusters_2023_3D %>%
dplyr::filter(Days_On_Zillow >= 60)
Not_hotspots_2023 <- Not_hotspots_2023 %>%
mutate(Is_Hotspot = 0)
hotspot_2023_Merge <- rbind(hotspots_2023, Not_hotspots_2023)
logistic_model_2019 <- glm(Is_Hotspot ~ Price + PSA, data = hotspot_2019_Merge, family = binomial)
summary(logistic_model_2019)
##
## Call:
## glm(formula = Is_Hotspot ~ Price + PSA, family = binomial, data = hotspot_2019_Merge)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.9257 -0.5800 -0.4102 -0.2619 2.4498
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -4.988e+00 5.911e-01 -8.440 < 2e-16 ***
## Price 2.439e-06 9.661e-07 2.524 0.0116 *
## PSA 1.537e+01 2.360e+00 6.512 7.4e-11 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 332.63 on 341 degrees of freedom
## Residual deviance: 263.68 on 339 degrees of freedom
## AIC: 269.68
##
## Number of Fisher Scoring iterations: 5
logistic_model_2021 <- glm(Is_Hotspot ~ Price + PSA, data = hotspot_2021_Merge, family = binomial)
summary(logistic_model_2021)
##
## Call:
## glm(formula = Is_Hotspot ~ Price + PSA, family = binomial, data = hotspot_2021_Merge)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.8766 -0.6352 0.3772 0.7093 1.8334
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.865e+00 3.813e-01 -4.891 1e-06 ***
## Price -1.118e-06 8.012e-07 -1.395 0.163
## PSA 1.304e+01 1.509e+00 8.646 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 523.24 on 458 degrees of freedom
## Residual deviance: 403.21 on 456 degrees of freedom
## AIC: 409.21
##
## Number of Fisher Scoring iterations: 5
logistic_model_2023 <- glm(Is_Hotspot ~ Price + PSA, data = hotspot_2023_Merge, family = binomial)
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
summary(logistic_model_2023)
##
## Call:
## glm(formula = Is_Hotspot ~ Price + PSA, family = binomial, data = hotspot_2023_Merge)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.52236 0.00026 0.00444 0.05053 1.23994
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -4.488e+00 2.334e+00 -1.923 0.05452 .
## Price -3.600e-07 1.852e-06 -0.194 0.84591
## PSA 4.840e+01 1.592e+01 3.041 0.00236 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 75.493 on 223 degrees of freedom
## Residual deviance: 32.847 on 221 degrees of freedom
## AIC: 38.847
##
## Number of Fisher Scoring iterations: 10