Goal: to predict rental prices in the SF rental market. [Click here for the data.](https://github.com/rfordatascience/tidytuesday/blob/main/data/2022/2022-07-05/readme.md)
# Load the TidyTuesday 2022-07-05 rental listings CSV directly from GitHub.
rent <- read_csv(
  file = "https://raw.githubusercontent.com/rfordatascience/tidytuesday/main/data/2022/2022-07-05/rent.csv"
)
# Build the analysis table from the raw listings.
data <- rent %>%
  # Drop columns that are not carried forward (presumably too sparse or
  # free-text to be useful -- confirm); this runs before the NA filter below.
  select(
    -c(address, descr, details, lat, lon, date, year, room_in_apt)
  ) %>%
  # Keep only rows that are complete in every remaining column.
  na.omit() %>%
  # Replace price with its natural log (author's note: the variable has a
  # positively skewed distribution).
  mutate(price = log(price))
# Scatter plot: log-transformed price on x, square footage on a log10 y axis.
data %>%
  ggplot(mapping = aes(x = price, y = sqft)) +
  geom_point() +
  scale_y_log10()
# Box plot of log-transformed price for each bedroom count (beds as a factor).
data %>%
  ggplot(mapping = aes(x = price, y = as.factor(beds))) +
  geom_boxplot()
# Title ----
# Which words in the listing titles go with the highest average (log) price?
data %>%
  # Tokenize title: one row per word.
  unnest_tokens(output = word, input = title) %>%
  # Average (log) price and number of occurrences per word.
  group_by(word) %>%
  summarise(
    price = mean(price),
    n = n()
  ) %>%
  ungroup() %>%
  # Keep words seen more than 10 times and drop tokens containing a digit.
  # The pattern has to be "\\d" (the regex digit class); the earlier "//d"
  # only matched the literal text "//d", so numeric tokens were not removed.
  filter(n > 10, !str_detect(word, "\\d")) %>%
  # Top 20 words by average (log) price.
  slice_max(order_by = price, n = 20) %>%
  # Plot, with words ordered by their average price.
  ggplot(aes(price, fct_reorder(word, price))) +
  geom_point() +
  labs(y = "words on Title")
# Drop the listing id and the free-text title, then binarize what is left
# (the glimpse output below shows one 0/1 column per category or numeric bin).
data_binarized_tbl <- data %>%
  select(-c(post_id, title)) %>%
  binarize()

# Inspect the resulting columns.
glimpse(data_binarized_tbl)
## Rows: 14,394
## Columns: 85
## $ nhood__campbell <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ `nhood__concord_/_pleasant_hill_/_martinez` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ nhood__cupertino <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ nhood__daly_city <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ `nhood__danville_/_san_ramon` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ `nhood__dublin_/_pleasanton` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ `nhood__fairfield_/_vacaville` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ nhood__foster_city <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ `nhood__hayward_/_castro_valley` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ nhood__milpitas <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ nhood__mountain_view <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ nhood__napa_county <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ nhood__palo_alto <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ nhood__petaluma <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ `nhood__pittsburg_/_antioch` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ `nhood__rohnert_pk_/_cotati` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ nhood__san_francisco <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ nhood__san_jose_central <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ nhood__san_jose_east <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ nhood__san_jose_north <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ nhood__san_jose_south <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ nhood__san_jose_west <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ nhood__san_mateo <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ nhood__san_rafael <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ nhood__santa_clara <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ nhood__santa_cruz <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ nhood__santa_rosa <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ `nhood__SOMA_/_south_beach` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ nhood__sunnyvale <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ nhood__union_city <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ `nhood__vallejo_/_benicia` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ `nhood__willow_glen_/_cambrian` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ `nhood__-OTHER` <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1,…
## $ city__cambrian <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ city__campbell <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ city__concord <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ city__cupertino <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ city__daly_city <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ city__dublin <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ city__fairfield <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ city__foster_city <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ city__hayward <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ city__milpitas <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ city__mountain_view <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ city__napa_county <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ city__oakland <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ city__palo_alto <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ city__petaluma <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ city__pittsburg <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ city__rohnert_park <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ city__san_francisco <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ city__san_jose <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ city__san_mateo <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ city__san_rafael <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ city__san_ramon <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ city__santa_clara <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ city__santa_cruz <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ city__santa_rosa <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ city__sunnyvale <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ city__union_city <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ city__vallejo <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ `city__-OTHER` <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1,…
## $ county__alameda <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1,…
## $ county__contra_costa <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ county__marin <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ county__napa <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ county__san_francisco <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ county__san_mateo <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ county__santa_clara <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ county__santa_cruz <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ county__solano <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ county__sonoma <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ `price__-Inf_7.52294091807237` <dbl> 0, 1, 0, 1, 0, 1, 1, 0, 0,…
## $ price__7.52294091807237_7.80384330353877 <dbl> 0, 0, 1, 0, 0, 0, 0, 1, 1,…
## $ price__7.80384330353877_8.07868822922987 <dbl> 1, 0, 0, 0, 1, 0, 0, 0, 0,…
## $ price__8.07868822922987_Inf <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ `beds__-Inf_2` <dbl> 0, 1, 0, 1, 1, 1, 0, 0, 1,…
## $ beds__2_3 <dbl> 0, 0, 1, 0, 0, 0, 1, 1, 0,…
## $ beds__3_Inf <dbl> 1, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ `baths__-Inf_2` <dbl> 0, 1, 1, 1, 1, 1, 0, 0, 1,…
## $ baths__2_Inf <dbl> 1, 0, 0, 0, 0, 0, 1, 1, 0,…
## $ `sqft__-Inf_887` <dbl> 0, 1, 0, 0, 0, 0, 0, 0, 1,…
## $ sqft__887_1100 <dbl> 0, 0, 0, 1, 0, 0, 0, 1, 0,…
## $ sqft__1100_1500 <dbl> 0, 0, 1, 0, 1, 1, 0, 0, 0,…
## $ sqft__1500_Inf <dbl> 1, 0, 0, 0, 0, 0, 1, 0, 0,…
# Correlate ----
# Correlate every binarized feature with the highest price bin.
# NOTE(review): the target column name embeds the bin boundary computed from
# this particular data set; it will need updating if the data change.
data_corr_tbl <- correlate(
  data_binarized_tbl,
  target = price__8.07868822922987_Inf
)

data_corr_tbl
## # A tibble: 85 × 3
## feature bin correlation
## <fct> <chr> <dbl>
## 1 price 8.07868822922987_Inf 1
## 2 city san_francisco 0.389
## 3 county san_francisco 0.389
## 4 price -Inf_7.52294091807237 -0.342
## 5 price 7.80384330353877_8.07868822922987 -0.330
## 6 price 7.52294091807237_7.80384330353877 -0.328
## 7 sqft 1500_Inf 0.324
## 8 beds -Inf_2 -0.254
## 9 beds 3_Inf 0.241
## 10 sqft -Inf_887 -0.240
## # ℹ 75 more rows
# Plot ----
# Draw the correlation funnel for the table computed above.
plot_correlation_funnel(data_corr_tbl)
## Warning: ggrepel: 69 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps