This notebook uses US Real Estate sample dataset from ZenRow, by way of r/datasets. The dataset contains records of 10,000 houses on sale in U.S. cities from ttps://www.zillow.com/, scrapped by ZenRow.
# Load libaries
library(tidyverse)
library(scales)
library(skimr)
library(janitor)
library(psych)
library(gghalves)
library(ggstatsplot)
library(dendextend)
library(factoextra)
library(wesanderson)
library(ggsci)
library(colorspace)
library(geofacet)
library(gt)
# set theme
theme_set(theme_minimal(base_size = 10))
theme_update(panel.grid.minor=element_blank(),
plot.title.position="plot",
axis.title=element_text(face="bold",size=9))
# import data
real_estate = read_csv("us-cities-real-estate-sample.csv", na="NULL") %>%
clean_names()
Missing column names filled in: 'X47' [47]
── Column specification ───────────────────────────────────────────────────────────────────────
cols(
.default = col_character(),
zpid = col_double(),
id = col_double(),
hasImage = col_logical(),
unformattedPrice = col_double(),
addressZipcode = col_double(),
isUndisclosedAddress = col_logical(),
beds = col_double(),
baths = col_double(),
area = col_double(),
latitude = col_double(),
longitude = col_double(),
isZillowOwned = col_logical(),
has3DModel = col_logical(),
hasVideo = col_logical(),
hasAdditionalAttributions = col_logical(),
isFeaturedListing = col_logical(),
list = col_logical(),
relaxed = col_logical(),
openHouseStartDate = col_datetime(format = ""),
X47 = col_logical()
)
ℹ Use `spec()` for the full column specifications.
glimpse(real_estate)
Rows: 10,000
Columns: 47
$ zpid <dbl> 115423997, 2100098805, 14543206, 72977167, 17599151, 1…
$ id <dbl> 115423997, 2100098805, 14543206, 72977167, 17599151, 1…
$ provider_listing_id <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
$ img_src <chr> "https://photos.zillowstatic.com/fp/688dbfb9d9af6a37bb…
$ has_image <lgl> TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, …
$ detail_url <chr> "https://www.zillow.com/homedetails/1053-Lutheran-Chur…
$ status_type <chr> "FOR_SALE", "FOR_SALE", "FOR_SALE", "FOR_SALE", "FOR_S…
$ status_text <chr> "House for sale", "Active", "Townhouse for sale", "Hou…
$ country_currency <chr> "$", "$", "$", "$", "$", "$", "$", "$", "$", "$", "$",…
$ price <chr> "$330,000", "$99,900", "$390,000", "$254,900", "$648,7…
$ unformatted_price <dbl> 330000, 99900, 390000, 254900, 648700, 339900, 149000,…
$ address <chr> "1053 Lutheran Church Rd, Bardstown, KY 40004", "0 Old…
$ address_street <chr> "1053 Lutheran Church Rd", "0 Old Swanzey Rd", "1718 W…
$ address_city <chr> "Bardstown", "Chesterfield", "Atlanta", "Wilmington", …
$ address_state <chr> "KY", "NH", "GA", "DE", "CA", "VA", "MS", "OK", "NV", …
$ address_zipcode <dbl> 40004, 3443, 30329, 19809, 91730, 24431, 39531, 73090,…
$ is_undisclosed_address <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE…
$ beds <dbl> 3, NA, 3, 3, 4, 3, NA, 2, 3, NA, NA, 2, 3, 3, 2, 3, 2,…
$ baths <dbl> 3, NA, 3, 1, 2, 3, NA, 1, 3, NA, NA, 2, 2, 3, 2, 2, 1,…
$ area <dbl> 2054, NA, 2154, 1025, 1322, 2026, 3600, 925, 1828, NA,…
$ latitude <dbl> 37.85539, 42.88115, 33.83270, 39.75484, 34.11825, 38.1…
$ longitude <dbl> -85.53178, -72.39908, -84.32765, -75.50329, -117.58391…
$ is_zillow_owned <lgl> FALSE, FALSE, FALSE, FALSE, TRUE, FALSE, FALSE, FALSE,…
$ variable_data_type <chr> "DAYS_ON", "DAYS_ON", "PRICE_REDUCTION", "PRICE_REDUCT…
$ variable_data_text <chr> "1 day on Zillow", "36 days on Zillow", "$5,000 (Jun 1…
$ variable_data_is_fresh <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
$ badge_info <chr> "ForSale", "ForSale", NA, "ForSale", NA, NA, NA, "ForS…
$ pgapt <chr> "For Sale (Broker)", "For Sale (Broker)", "ForSale", "…
$ sgapt <chr> "291700", NA, "For Sale (Broker)", "255100", "For Sale…
$ zestimate <chr> "false", "false", "397900", "false", "681500", "342400…
$ should_show_zestimate_as_price <chr> "false", "false", "false", "false", "false", "false", …
$ has3d_model <lgl> FALSE, FALSE, FALSE, FALSE, TRUE, FALSE, FALSE, FALSE,…
$ has_video <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE…
$ is_home_rec <chr> NA, "Listing provided by NEREN", "false", NA, "false",…
$ info2string <chr> "https://photos.zillowstatic.com/fp/44097c1919ccfd9c21…
$ info3string <chr> "Demaree & Hubbard", NA, "https://photos.zillowstatic.…
$ broker_name <chr> "true", "false", "Opendoor Brokerage, LLC", "true", NA…
$ has_additional_attributions <lgl> FALSE, FALSE, TRUE, FALSE, FALSE, TRUE, TRUE, FALSE, F…
$ is_featured_listing <lgl> TRUE, TRUE, FALSE, TRUE, FALSE, FALSE, FALSE, TRUE, TR…
$ list <lgl> FALSE, FALSE, TRUE, FALSE, TRUE, TRUE, TRUE, FALSE, FA…
$ relaxed <lgl> NA, NA, FALSE, NA, FALSE, FALSE, FALSE, NA, NA, FALSE,…
$ has_open_house <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, "t…
$ open_house_start_date <dttm> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 2…
$ open_house_end_date <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, "2…
$ open_house_description <chr> "", "", NA, "", NA, NA, NA, "", "", NA, NA, "", NA, "O…
$ info6string <chr> "", "", NA, "", NA, NA, "Richard Reynolds", "", "", "P…
$ x47 <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
# skim
skim(real_estate)
── Data Summary ────────────────────────
Values
Name real_estate
Number of rows 10000
Number of columns 47
_______________________
Column type frequency:
character 27
logical 10
numeric 9
POSIXct 1
________________________
Group variables None
── Variable type: character ───────────────────────────────────────────────────────────────────
skim_variable n_missing complete_rate min max empty n_unique whitespace
1 provider_listing_id 8663 0.134 0 14 25 1293 0
2 img_src 0 1 75 231 0 9940 0
3 detail_url 0 1 57 121 0 10000 0
4 status_type 0 1 8 8 0 1 0
5 status_text 0 1 3 26 0 24 0
6 country_currency 0 1 1 1 0 1 0
7 price 0 1 0 12 8 2019 0
8 address 0 1 17 72 0 10000 0
9 address_street 0 1 0 64 1 9980 0
10 address_city 0 1 0 26 1 4096 0
11 address_state 0 1 2 2 0 51 0
12 variable_data_type 1001 0.900 7 17 0 8 0
13 variable_data_text 1001 0.900 7 26 0 1518 0
14 variable_data_is_fresh 9820 0.018 5 11 0 8 0
15 badge_info 6458 0.354 7 10 0 3 0
16 pgapt 5 1.00 7 17 0 9 0
17 sgapt 1688 0.831 5 17 0 1503 0
18 zestimate 3229 0.677 5 16 0 2412 0
19 should_show_zestimate_as_price 3 1.00 4 6 0 4 0
20 is_home_rec 2731 0.727 5 43 0 36 0
21 info2string 6887 0.311 5 88 0 140 0
22 info3string 4004 0.600 6 88 0 1831 0
23 broker_name 1776 0.822 3 73 0 3194 0
24 has_open_house 9391 0.0609 4 19 0 31 0
25 open_house_end_date 9396 0.0604 19 31 0 72 0
26 open_house_description 6057 0.394 0 31 3542 63 0
27 info6string 5646 0.435 0 41 3547 762 0
── Variable type: logical ─────────────────────────────────────────────────────────────────────
skim_variable n_missing complete_rate mean count
1 has_image 100 0.99 1 "TRU: 9900"
2 is_undisclosed_address 0 1 0.0015 "FAL: 9985, TRU: 15"
3 is_zillow_owned 0 1 0.0067 "FAL: 9933, TRU: 67"
4 has3d_model 0 1 0.0449 "FAL: 9551, TRU: 449"
5 has_video 0 1 0.009 "FAL: 9910, TRU: 90"
6 has_additional_attributions 5 1.00 0.491 "FAL: 5089, TRU: 4906"
7 is_featured_listing 0 1 0.355 "FAL: 6453, TRU: 3547"
8 list 0 1 0.645 "TRU: 6453, FAL: 3547"
9 relaxed 3339 0.666 0.0312 "FAL: 6453, TRU: 208"
10 x47 10000 0 NaN ": "
── Variable type: numeric ─────────────────────────────────────────────────────────────────────
skim_variable n_missing complete_rate mean sd p0 p25
1 zpid 0 1 865572492. 968581736. 82 63347023.
2 id 0 1 865572492. 968581736. 82 63347023.
3 unformatted_price 8 0.999 421134. 4917697. 1 110000
4 address_zipcode 2 1.00 49847. 30054. 1001 24739.
5 beds 2696 0.730 3.11 3.26 0 2
6 baths 2933 0.707 2.42 1.26 1 2
7 area 3054 0.695 2085. 7719. 1 1200
8 latitude 393 0.961 39.0 5.67 19.1 35.2
9 longitude 393 0.961 -92.1 17.5 -160. -104.
p50 p75 p100 hist
1 174180452 2071098630. 2146980069 ▇▁▁▁▅
2 174180452 2071098630. 2146980069 ▇▁▁▁▅
3 225000 388250 474900000 ▇▁▁▁▁
4 48959 77488. 99921 ▇▇▆▅▇
5 3 4 234 ▇▁▁▁▁
6 2 3 24 ▇▁▁▁▁
7 1612 2208. 435600 ▇▁▁▁▁
8 39.6 42.3 65.1 ▁▅▇▁▁
9 -87.8 -78.6 -67.0 ▁▁▃▆▇
── Variable type: POSIXct ─────────────────────────────────────────────────────────────────────
skim_variable n_missing complete_rate min max
1 open_house_start_date 9396 0.0604 2021-06-26 16:00:00 2021-07-11 15:00:00
median n_unique
1 2021-06-27 13:00:00 43
# check for duplicates: rows
n_distinct(real_estate$id)
[1] 10000
real_estate %>% get_dupes() %>% flatten() # no duplicates found
No variable names specified - using all columns.
No duplicate combinations found of: zpid, id, provider_listing_id, img_src, has_image, detail_url, status_type, status_text, country_currency, ... and 38 other variables
list()
# listing count by state
tab_state = real_estate %>% count(address_state)
# function reference: https://medium.com/@NickDoesData/visualizing-geographic-data-in-r-fb2e0f5b59c5
create_gradient_state_tile_map <- function(state, value, title, subtitle, caption, legend_title, state_grid='us_state_grid2') {
df <- as.tibble(data.frame(state, value))
fig <- df %>%
mutate(x = 1) %>%
mutate(label_y = .5) %>%
mutate(label_x = 1) %>%
ggplot()+
geom_bar(mapping=aes(x=x, fill=value)) +
facet_geo(~ state, grid=state_grid) +
ggtitle(title) +
geom_text(aes(x=label_x, y=label_y, label=state), color='#ffffff', size=3.5)
return(fig)
}
create_gradient_state_tile_map(tab_state$address_state, tab_state$n, title='Sale listing count by state\n', legend_title = "", caption="") +
scale_fill_continuous_sequential(palette="Heat", limits=c(1,323), breaks=c(1,100,200,323)) +
theme_void(base_size=10) +
theme(strip.text.x = element_blank(),
plot.margin = unit(c(1,1.5,1,1.5), "cm"),
plot.title=element_text(size=11),
legend.title=element_text(size=9)) +
guides(fill = guide_colorbar(title="Count",
title.position = "top",
barwidth = unit(.5, "lines"),
barheight = unit(10, "lines")))
# median price by state
re_st = real_estate %>% drop_na(unformatted_price) %>%
group_by(address_state) %>%
filter(address_state!= "DC") %>%
summarise(median_price=median(unformatted_price))
create_gradient_state_tile_map(re_st$address_state, re_st$median_price, title='Median sale price by state\n', legend_title = "", caption="") +
scale_fill_continuous_sequential(palette="Batlow",
labels = unit_format(unit = "K", scale = 1e-3)) +
theme_void(base_size=10) +
theme(strip.text.x = element_blank(),
plot.margin = unit(c(1,1.5,1,1.5), "cm"),
plot.title=element_text(size=11),
legend.title=element_text(size=9)) +
guides(fill = guide_colorbar(title="Median price",
title.position = "top",
barwidth = unit(.5, "lines"),
barheight = unit(10, "lines")))
# median price by address_city (cities with more than 40 listings)
real_estate %>% group_by(address_city) %>%
summarise(listing_count=n(), med = median(unformatted_price)) %>%
arrange(desc(listing_count)) %>%
filter(listing_count>=40) %>%
arrange(desc(med)) %>%
mutate(lab = paste0(address_city," ","(","n=",listing_count,")")) %>%
mutate(lab= fct_inorder(lab)) %>%
ggplot(aes(y= fct_rev(lab), x= med, fill=I(ifelse(med==max(med), "#f6bd60","#adb5bd")))) +
geom_col(width=0.75) +
geom_text(aes(label=scales::dollar(med)),size=3, color="black", hjust=1.2) +
scale_x_continuous(labels = unit_format(unit = "K", scale = 1e-3), expand=c(0,0),
limits=c(0,700000)) +
labs(x= "Median sale price (in USD)", y= "City (listing count)",
subtitle="Median sale price by city (cities with >=40 sale listings)")
# is_featured_listing and price
psych::describeBy(real_estate$unformatted_price, real_estate$is_featured_listing, mat=T)
# status text and price
real_estate %>% count(status_text) %>% arrange(desc(n))
real_estate %>% drop_na(unformatted_price) %>%
mutate(status_grp = fct_lump(factor(status_text), 6,other_level = "Other status text")) %>%
mutate(status_grp = str_remove_all(status_grp,"for sale")) %>%
mutate(status_grp = if_else(status_grp=="Home ","Other status text",status_grp)) %>%
group_by(status_grp) %>%
summarise(median=median(unformatted_price),
min=min(unformatted_price), max=max(unformatted_price), range=max-min,
mean = mean(unformatted_price)) %>%
rename(status_text= status_grp) %>%
arrange(desc(median)) %>%
gt() %>%
fmt_number(
columns = c("median","min","max","range","mean"),
decimals=0
) %>%
data_color(
columns =c("median","min","max","range","mean"),
colors = scales::col_numeric(
palette = c("#ffffff", "#f2fbd2", "#c9ecb4", "#93d3ab", "#35b0ab"),
domain = NULL
)
) %>%
tab_style(
style = list(
cell_borders(
sides = "bottom",
color = "black",
weight = px(3)
)
),
locations = list(
cells_column_labels(
columns = gt::everything()
)
)
) %>%
tab_options(table.font.size=14) %>%
tab_header(title=md("**Sale price by status text**"))
| Sale price by status text | |||||
|---|---|---|---|---|---|
| status_text | median | min | max | range | mean |
| Townhouse | 315,000 | 700 | 25,995,000 | 25,994,300 | 551,101 |
| Multi-family home | 299,999 | 3,750 | 9,950,000 | 9,946,250 | 481,909 |
| House | 290,000 | 5,500 | 474,900,000 | 474,894,500 | 639,913 |
| Condo | 265,000 | 29,495 | 65,750,000 | 65,720,505 | 523,100 |
| Other status text | 230,000 | 1 | 20,150,000 | 20,149,999 | 351,428 |
| Lot / Land | 72,000 | 350 | 9,200,000 | 9,199,650 | 134,977 |
# boxplot (price, beds, baths, area)
real_estate %>% select (zpid, unformatted_price, beds, baths, area) %>%
pivot_longer(!zpid) %>%
ggplot(aes(x=value, color=name)) +
geom_boxplot(outlier.alpha = 0.8, outlier.shape = 21, show.legend = F) +
facet_wrap(~name, scales = "free", ncol=1, strip.position = "left") +
theme_light(base_size=10) +
theme(strip.placement = "outside",
axis.text.y = element_blank(),
axis.ticks.y=element_blank()
) +
scale_x_continuous(labels=scales::comma) +
scale_y_continuous(breaks=c(0)) +
scale_color_npg()
# area and price
real_estate %>%
select(unformatted_price, area) %>%
drop_na() %>%
mutate(price_zs = (unformatted_price- mean(unformatted_price))/ sd(unformatted_price),
area_zs=(area- mean(area))/ sd(area)) %>%
filter(between(price_zs,-3,3)) %>%
filter(between(area_zs,-3,3)) %>%
ggplot(aes(x=area, y=unformatted_price)) +
geom_point(shape=21, alpha=0.5) +
geom_smooth(method='lm', se=FALSE, color="#f6bd60") +
stat_cor(method="pearson",label.x=15000, size=3.5) +
scale_y_continuous(labels=scales::label_number_si()) +
scale_x_continuous(labels = unit_format(unit = "K", scale = 1e-3, accuracy = 1)) +
labs(x="Area", y="Sale price (in USD)", subtitle="Area and sale price")
# drop price outliers
re_price = real_estate %>%
drop_na(unformatted_price) %>%
mutate(price_zs = (unformatted_price- mean(unformatted_price))/ sd(unformatted_price)) %>%
filter(between(price_zs,-3,3))
summary(re_price$unformatted_price)
Min. 1st Qu. Median Mean 3rd Qu. Max.
1 109900 225000 341653 386400 15000000
# listing count by number of baths
re_price %>% mutate(baths2 = as.integer(floor(baths))) %>%
count(baths2) %>%
ggplot(aes(y=fct_rev(factor(baths2)), x=n)) +
geom_segment(aes(x=0, xend=n, y=fct_rev(factor(baths2)),yend=fct_rev(factor(baths2)))) +
geom_point() +
geom_text(aes(label=n),size=3, hjust=-0.7) +
scale_x_continuous(limits=c(0,3500)) +
labs(x="Sale listing count", y="Number of baths", subtitle="Sale listing count by number of baths")
# baths and price
re_price %>% mutate(baths2 = as.integer(floor(baths))) %>%
mutate(baths2_grp = fct_lump(factor(baths2), 6,other_level = "7 - 24")) %>%
ggplot(aes(y=fct_rev(baths2_grp), x=unformatted_price, color=fct_rev(baths2_grp))) +
geom_boxplot(outlier.shape = 21, outlier.alpha = 0.5, show.legend = F) +
scale_x_continuous(labels=scales::label_number_si()) +
labs(x="Sale price (in USD)", y="Number of baths", subtitle="Number of baths and sale price") +
scale_color_aaas(na.value="black")
# listing count by number of beds
re_price %>% mutate(beds_grp = fct_lump(factor(beds), 7,other_level = ">7")) %>%
count(beds_grp) %>%
ggplot(aes(y=fct_rev(beds_grp), x=n, fill=n)) +
geom_col(width=0.75, show.legend=F) +
geom_text(aes(label=n),size=3, hjust=1.2, color="white") +
scale_fill_continuous_sequential(palette="Red-Blue") +
labs(x="Sale listing count", y="Number of beds", subtitle="Sale listing count by number of beds")
# beds and price
re_price %>% mutate(beds_grp = fct_lump(factor(beds), 7,other_level = ">7")) %>%
ggplot(aes(y=fct_rev(beds_grp), x=unformatted_price, color=fct_rev(beds_grp))) +
geom_boxplot(outlier.shape = 21, outlier.alpha = 0.5, show.legend=F) +
scale_x_continuous(labels=scales::label_number_si()) +
labs(x="Sale price (in USD)", y="Number of beds", subtitle="Number of beds and sale price") +
scale_color_aaas(na.value="black")
# correlation
re1 = real_estate %>%
select(unformatted_price, baths, beds, area) %>%
drop_na()
dim(re1)
[1] 6571 4
set.seed(123)
ggcorrmat(
data=re1,
cor.vars=c(unformatted_price:area),
title="Correlation",
)
# drop outliers
re2 = re1 %>% mutate(price_zs = (unformatted_price- mean(unformatted_price))/ sd(unformatted_price),
area_zs=(area- mean(area))/ sd(area),
beds_zs=(beds- mean(beds))/ sd(beds),
baths_zs=(baths- mean(baths))/ sd(baths)) %>%
filter(between(price_zs,-3,3)) %>%
filter(between(area_zs,-3,3)) %>%
filter(between(beds_zs,-3,3)) %>%
filter(between(baths_zs,-3,3)) %>%
select(unformatted_price, baths, beds, area)
dim(re2)
[1] 6430 4
# scale
re2_scaled = scale(re2)
# correlation after dropping outliers
set.seed(123)
ggcorrmat(
data=re2,
cor.vars=c(unformatted_price:area),
title="Correlation",
subtitle="After dropping outliers"
)
# check optimal clusters: elbow method
set.seed(123)
fviz_nbclust(re2_scaled,kmeans,method="wss")
# k means clustering
# 4 clusters
set.seed(123)
km4= kmeans(re2_scaled,centers=4,nstart=50)
km4
K-means clustering with 4 clusters of sizes 1019, 18, 3310, 2083
Cluster means:
unformatted_price baths beds area
1 0.45324379 1.37863176 1.2998296 1.58383154
2 15.03809329 2.92301511 1.0855574 3.02158826
3 -0.09365508 0.09967272 0.1995130 -0.01750613
4 -0.20285299 -0.85806853 -0.9622921 -0.77310016
Clustering vector:
[1] 3 3 4 3 3 4 3 4 3 3 4 3 4 4 3 4 3 4 1 3 3 4 1 3 1 3 3 3 4 3 3 4 4 3 3 4 3 3 4 3 4 3 4 4
[45] 4 3 1 4 4 3 1 4 4 3 1 4 1 4 3 4 4 1 4 3 3 4 4 3 3 3 1 4 1 4 3 1 3 3 3 4 4 1 4 3 4 4 4 4
[89] 1 1 4 4 4 4 4 4 3 3 4 3 3 4 3 1 3 3 4 3 4 4 3 4 3 4 4 4 3 3 3 3 1 3 4 4 3 3 4 1 3 1 3 3
[133] 3 3 3 3 4 1 3 4 3 3 3 4 1 3 3 3 3 3 4 3 4 3 4 1 4 3 4 4 3 1 4 3 3 3 3 3 1 3 4 4 1 4 3 3
[177] 4 4 3 1 4 3 4 3 3 3 4 3 4 3 1 3 1 3 4 4 3 3 3 3 4 4 3 3 3 3 4 3 3 3 4 3 4 3 4 1 4 3 4 3
[221] 4 3 4 3 1 4 4 1 3 3 3 1 3 3 3 4 3 4 3 1 3 4 3 3 3 3 4 3 3 3 4 3 3 3 3 1 1 3 3 1 4 1 4 4
[265] 3 4 3 1 4 3 3 4 3 1 3 1 4 4 1 3 3 3 4 4 4 3 3 4 1 1 3 1 4 4 1 4 4 4 4 3 3 1 3 3 4 3 3 3
[309] 3 1 3 3 4 4 1 3 1 4 3 4 1 3 4 3 4 4 3 4 4 1 3 4 3 3 3 4 4 3 3 3 3 3 3 1 3 3 3 4 3 3 3 1
[353] 1 4 4 4 3 4 4 4 3 1 4 4 4 4 3 3 4 1 3 4 4 3 3 1 4 3 3 3 1 3 4 3 3 3 3 3 4 3 1 3 3 3 4 3
[397] 3 3 3 3 4 3 3 4 3 3 3 3 3 3 4 3 4 3 3 3 1 3 1 4 3 3 3 4 4 3 1 4 4 3 3 4 4 4 3 4 3 3 3 3
[441] 3 3 1 4 3 4 4 3 3 1 1 4 3 4 3 4 3 4 3 4 3 4 4 3 3 4 4 3 4 3 1 4 3 3 1 3 3 1 4 3 4 3 4 4
[485] 3 1 1 4 4 3 3 4 3 4 3 1 3 3 3 4 1 4 4 1 4 4 3 4 1 3 4 1 3 1 3 3 4 4 3 3 1 3 4 1 3 4 4 4
[529] 3 3 3 1 4 3 3 3 3 3 3 4 3 4 4 4 3 4 1 1 4 4 3 1 3 4 3 1 4 4 4 1 3 2 3 1 4 4 4 4 1 4 3 4
[573] 1 3 3 3 4 3 3 4 3 3 3 1 1 3 4 1 3 3 3 3 4 4 3 4 4 4 4 1 4 3 3 3 3 3 1 3 4 3 4 3 4 4 1 4
[617] 1 3 4 1 3 3 1 4 3 1 3 4 4 4 3 3 3 3 3 3 3 3 4 3 3 3 3 3 3 3 3 3 4 3 3 3 3 4 1 3 3 4 4 1
[661] 3 3 3 4 3 1 3 3 4 3 4 4 4 4 3 4 3 3 1 1 3 3 3 4 3 3 3 4 4 3 3 3 1 3 3 3 4 3 1 4 3 3 3 1
[705] 3 1 3 3 4 3 4 1 3 3 3 4 3 4 1 3 3 1 4 1 4 3 3 4 3 3 3 4 3 1 3 4 3 3 3 4 4 1 4 3 3 3 1 3
[749] 4 3 3 1 1 3 3 1 3 3 1 3 4 4 4 3 3 3 3 4 3 4 3 3 4 3 1 1 3 3 1 4 1 4 1 4 4 3 3 4 3 3 3 4
[793] 1 3 4 3 1 1 1 3 3 1 3 4 1 3 3 4 4 4 4 3 3 3 1 3 4 4 4 4 3 3 3 3 3 1 4 3 4 3 4 4 3 3 1 3
[837] 3 1 4 1 4 1 3 4 3 3 4 4 4 3 3 4 3 3 4 3 3 3 1 3 3 3 3 3 3 3 3 4 4 4 4 3 3 3 4 1 4 4 1 3
[881] 4 3 1 3 3 4 3 4 1 3 4 3 3 4 3 3 3 3 4 1 4 1 3 1 1 3 4 4 4 1 3 3 1 3 4 3 3 3 3 1 4 4 1 3
[925] 4 4 3 3 3 1 1 1 4 1 3 3 3 1 3 4 1 4 4 4 4 4 3 4 3 3 3 3 3 4 4 1 3 4 4 3 3 4 4 3 3 4 4 3
[969] 4 3 4 4 3 3 1 4 1 3 1 3 1 3 3 4 3 3 4 4 4 3 3 1 3 4 3 3 3 4 3 4
[ reached getOption("max.print") -- omitted 5430 entries ]
Within cluster sum of squares by cluster:
[1] 4164.1092 488.3428 3584.7652 1657.0068
(between_SS / total_SS = 61.5 %)
Available components:
[1] "cluster" "centers" "totss" "withinss" "tot.withinss" "betweenss"
[7] "size" "iter" "ifault"
fviz_cluster(km4, data=re2_scaled, labelsize=0) #cluster plot
with(re2,pairs(re2_scaled,col=(1:4)[km4$cluster])) #pair plot
#summary by k4 cluster ID
re2_cid = re2 %>% mutate(cluster_id = as.factor(km4$cluster))
by(re2_cid, re2_cid$cluster_id, summary)
re2_cid$cluster_id: 1
unformatted_price baths beds area cluster_id
Min. : 9000 Min. :1.000 Min. :0.000 Min. : 1060 1:1019
1st Qu.: 350000 1st Qu.:3.000 1st Qu.:4.000 1st Qu.: 2538 2: 0
Median : 499900 Median :4.000 Median :4.000 Median : 2939 3: 0
Mean : 689508 Mean :3.584 Mean :4.406 Mean : 3155 4: 0
3rd Qu.: 729000 3rd Qu.:4.000 3rd Qu.:5.000 3rd Qu.: 3476
Max. :5000000 Max. :6.000 Max. :7.000 Max. :11326
-----------------------------------------------------------------------
re2_cid$cluster_id: 2
unformatted_price baths beds area cluster_id
Min. : 6500000 Min. :3.00 Min. :3.000 Min. :1782 1: 0
1st Qu.: 7845000 1st Qu.:5.00 1st Qu.:4.000 1st Qu.:3540 2:18
Median : 9444000 Median :5.00 Median :4.000 Median :4237 3: 0
Mean :10268778 Mean :5.00 Mean :4.167 Mean :4419 4: 0
3rd Qu.:11998750 3rd Qu.:5.75 3rd Qu.:4.750 3rd Qu.:5080
Max. :17750000 Max. :6.00 Max. :6.000 Max. :8500
-----------------------------------------------------------------------
re2_cid$cluster_id: 3
unformatted_price baths beds area cluster_id
Min. : 3750 Min. :1.000 Min. :1.000 Min. : 1 1: 0
1st Qu.: 184924 1st Qu.:2.000 1st Qu.:3.000 1st Qu.:1438 2: 0
Median : 278000 Median :2.000 Median :3.000 Median :1702 3:3310
Mean : 330307 Mean :2.412 Mean :3.176 Mean :1746 4: 0
3rd Qu.: 399999 3rd Qu.:3.000 3rd Qu.:3.000 3rd Qu.:2020
Max. :3899990 Max. :4.000 Max. :6.000 Max. :3463
-----------------------------------------------------------------------
re2_cid$cluster_id: 4
unformatted_price baths beds area cluster_id
Min. : 700 Min. :1.000 Min. :0.000 Min. : 192 1: 0
1st Qu.: 135000 1st Qu.:1.000 1st Qu.:2.000 1st Qu.: 864 2: 0
Median : 215000 Median :2.000 Median :2.000 Median :1067 3: 0
Mean : 258586 Mean :1.534 Mean :1.877 Mean :1082 4:2083
3rd Qu.: 325000 3rd Qu.:2.000 3rd Qu.:2.000 3rd Qu.:1290
Max. :2500000 Max. :3.000 Max. :3.000 Max. :3120
# clean df
clean_df = real_estate[!duplicated(as.list(real_estate))] %>% # drop id col
drop_na(unformatted_price) %>% # drop listings without price
mutate_at(vars(baths, beds), ~replace_na(., 0)) #replace NA with 0
dim(clean_df)
[1] 9992 46
# count of missing in area col
sum(is.na(clean_df$area))
[1] 3048
# df without na area col
cdf2a = clean_df %>% drop_na(area)
dim(cdf2a)
[1] 6944 46
# df without na area col and outliers
cdf2b = cdf2a %>% #select(unformatted_price, area, beds, baths) %>%
# get zscore
mutate(price_zs = (unformatted_price- mean(unformatted_price))/ sd(unformatted_price),
area_zs=(area- mean(area))/ sd(area),
beds_zs=(beds- mean(beds))/ sd(beds),
baths_zs=(baths- mean(baths))/ sd(baths)) %>%
# drop outliers
filter(between(price_zs,-3,3)) %>%
filter(between(area_zs,-3,3)) %>%
filter(between(beds_zs,-3,3)) %>%
filter(between(baths_zs,-3,3))
dim(cdf2b)
[1] 6849 50
nrow(cdf2a) - nrow(cdf2b)
[1] 95
# df with missing area imputation and without outliers
cdf3 = clean_df %>%
# impute missing vals using median of bed and bath
group_by(beds, baths) %>%
mutate(area2=ifelse(is.na(area),median(area,na.rm=TRUE),area)) %>%
drop_na(area2) %>%
ungroup() %>%
# get zscore
mutate(price_zs = (unformatted_price- mean(unformatted_price))/ sd(unformatted_price),
area_zs=(area2- mean(area2))/ sd(area2),
beds_zs=(beds- mean(beds))/ sd(beds),
baths_zs=(baths- mean(baths))/ sd(baths)) %>%
# drop outliers
filter(between(price_zs,-3,3)) %>%
filter(between(area_zs,-3,3)) %>%
filter(between(beds_zs,-3,3)) %>%
filter(between(baths_zs,-3,3))
dim(cdf3)
[1] 9880 51