Goal: predict rental prices in the SF (San Francisco) rental market. Click here for the data.
# Import data: the rent.csv file from the TidyTuesday repository (2022-07-05).
rent <- readr::read_csv(
  "https://raw.githubusercontent.com/rfordatascience/tidytuesday/main/data/2022/2022-07-05/rent.csv"
)
## Rows: 200796 Columns: 17
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (8): post_id, nhood, city, county, address, title, descr, details
## dbl (9): date, year, price, beds, baths, sqft, room_in_apt, lat, lon
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Summarise every column of `rent` (type, missing counts, distribution) to
# decide which variables are usable.
skimr::skim(rent)
Name | rent |
Number of rows | 200796 |
Number of columns | 17 |
_______________________ | |
Column type frequency: | |
character | 8 |
numeric | 9 |
________________________ | |
Group variables | None |
Variable type: character
skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
---|---|---|---|---|---|---|---|
post_id | 0 | 1.00 | 9 | 14 | 0 | 200796 | 0 |
nhood | 0 | 1.00 | 4 | 43 | 0 | 167 | 0 |
city | 0 | 1.00 | 5 | 19 | 0 | 104 | 0 |
county | 1394 | 0.99 | 4 | 13 | 0 | 10 | 0 |
address | 196888 | 0.02 | 1 | 38 | 0 | 2869 | 0 |
title | 2517 | 0.99 | 2 | 298 | 0 | 184961 | 0 |
descr | 197542 | 0.02 | 13 | 16975 | 0 | 3025 | 0 |
details | 192780 | 0.04 | 4 | 595 | 0 | 7667 | 0 |
Variable type: numeric
skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
---|---|---|---|---|---|---|---|---|---|---|
date | 0 | 1.00 | 20095718.38 | 44694.07 | 20000902.00 | 20050227.00 | 20110924.00 | 20120805.0 | 20180717.00 | ▁▇▁▆▃ |
year | 0 | 1.00 | 2009.51 | 4.48 | 2000.00 | 2005.00 | 2011.00 | 2012.0 | 2018.00 | ▁▇▁▆▃ |
price | 0 | 1.00 | 2135.36 | 1427.75 | 220.00 | 1295.00 | 1800.00 | 2505.0 | 40000.00 | ▇▁▁▁▁ |
beds | 6608 | 0.97 | 1.89 | 1.08 | 0.00 | 1.00 | 2.00 | 3.0 | 12.00 | ▇▂▁▁▁ |
baths | 158121 | 0.21 | 1.68 | 0.69 | 1.00 | 1.00 | 2.00 | 2.0 | 8.00 | ▇▁▁▁▁ |
sqft | 136117 | 0.32 | 1201.83 | 5000.22 | 80.00 | 750.00 | 1000.00 | 1360.0 | 900000.00 | ▇▁▁▁▁ |
room_in_apt | 0 | 1.00 | 0.00 | 0.04 | 0.00 | 0.00 | 0.00 | 0.0 | 1.00 | ▇▁▁▁▁ |
lat | 193145 | 0.04 | 37.67 | 0.35 | 33.57 | 37.40 | 37.76 | 37.8 | 40.43 | ▁▁▅▇▁ |
lon | 196484 | 0.02 | -122.21 | 0.78 | -123.20 | -122.42 | -122.26 | -122.0 | -74.20 | ▇▁▁▁▁ |
# Build the modelling data set:
#   1. drop the mostly-missing columns (address, descr, details, lat, lon)
#      along with date, room_in_apt and year;
#   2. keep only rows with no missing values in the remaining columns;
#   3. put price on the natural-log scale.
data <- rent %>%
  select(
    -c(address, descr, details, lat, lon, date, room_in_apt, year)
  ) %>%
  na.omit() %>%
  mutate(price = log(price))
# Explore the data to identify good predictors.
# sqft: scatter of log price (x) against square footage (y, log10 axis).
data %>%
  ggplot(aes(x = price, y = sqft)) +
  geom_point() +
  scale_y_log10()
beds
# Box plot of log price for each bedroom count (beds treated as a factor).
data %>%
  ggplot(aes(x = price, y = as.factor(beds))) +
  geom_boxplot()
title
# Title words associated with the highest average log price.
# Tokenise each title into words, average the log price per word, keep words
# that occur more than 10 times, and plot the 20 words with the highest mean.
data %>%
  unnest_tokens(output = word, input = title) %>%
  group_by(word) %>%
  summarise(
    price = mean(price),
    n = n()
  ) %>%
  ungroup() %>%
  # Drop tokens that contain a digit (e.g. "2"). The digit class is written
  # with backslashes ("\\d"); forward slashes would only match the literal
  # text "//d" and leave numeric tokens in place.
  filter(n > 10, !str_detect(word, "\\d")) %>%
  slice_max(order_by = price, n = 20) %>%
  ggplot(aes(price, fct_reorder(word, price))) +
  geom_point() +
  labs(y = "words in Title")
EDA Shortcut
# Preview the data without the listing identifier and the free-text title.
data %>%
  select(-c(post_id, title))
## # A tibble: 14,394 × 7
## nhood city county price beds baths sqft
## <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl>
## 1 alameda alameda alameda 7.86 4 3 1756
## 2 alameda alameda alameda 7.23 2 1 700
## 3 alameda alameda alameda 7.58 3 2 1400
## 4 alameda alameda alameda 7.40 2 1.5 895
## 5 alameda alameda alameda 8.04 2 1 1200
## 6 alameda alameda alameda 6.91 2 1 1185
## 7 alameda alameda alameda 7.35 3 2.5 1626
## 8 alameda alameda alameda 7.67 3 3 1014
## 9 alameda alameda alameda 7.65 1 1 752
## 10 alameda alameda alameda 8.34 3 2.5 2205
## # ℹ 14,384 more rows
# Compact column-by-column overview of the prepared data.
glimpse(data)
## Rows: 14,394
## Columns: 9
## $ post_id <chr> "4168358289", "pre2013_59350", "pre2013_72024", "pre2013_64956…
## $ nhood <chr> "alameda", "alameda", "alameda", "alameda", "alameda", "alamed…
## $ city <chr> "alameda", "alameda", "alameda", "alameda", "alameda", "alamed…
## $ county <chr> "alameda", "alameda", "alameda", "alameda", "alameda", "alamed…
## $ price <dbl> 7.861342, 7.226209, 7.575585, 7.402452, 8.039157, 6.907755, 7.…
## $ beds <dbl> 4, 2, 3, 2, 2, 2, 3, 3, 1, 3, 3, 2, 2, 2, 2, 2, 4, 2, 2, 1, 3,…
## $ baths <dbl> 3.0, 1.0, 2.0, 1.5, 1.0, 1.0, 2.5, 3.0, 1.0, 2.5, 2.5, 2.5, 1.…
## $ sqft <dbl> 1756, 700, 1400, 895, 1200, 1185, 1626, 1014, 752, 2205, 1500,…
## $ title <chr> "Nov 2 Newly remodeled 4br/3ba (2 mastersuites) in quiet Frem…
# Step 2: correlate
# Preprocess data
# Build models
# Evaluate models
# Make predictions