Goal: predict rental prices in the SF rental market. Click here for the data.

#import data

# Read the rental listings CSV (TidyTuesday, 2022-07-05) directly from GitHub.
rent <- readr::read_csv(
  "https://raw.githubusercontent.com/rfordatascience/tidytuesday/main/data/2022/2022-07-05/rent.csv"
)
## Rows: 200796 Columns: 17
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (8): post_id, nhood, city, county, address, title, descr, details
## dbl (9): date, year, price, beds, baths, sqft, room_in_apt, lat, lon
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Per-column overview of `rent`: missingness and summary statistics.
skimr::skim(rent)
Data summary
Name rent
Number of rows 200796
Number of columns 17
_______________________
Column type frequency:
character 8
numeric 9
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
post_id 0 1.00 9 14 0 200796 0
nhood 0 1.00 4 43 0 167 0
city 0 1.00 5 19 0 104 0
county 1394 0.99 4 13 0 10 0
address 196888 0.02 1 38 0 2869 0
title 2517 0.99 2 298 0 184961 0
descr 197542 0.02 13 16975 0 3025 0
details 192780 0.04 4 595 0 7667 0

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
date 0 1.00 20095718.38 44694.07 20000902.00 20050227.00 20110924.00 20120805.0 20180717.00 ▁▇▁▆▃
year 0 1.00 2009.51 4.48 2000.00 2005.00 2011.00 2012.0 2018.00 ▁▇▁▆▃
price 0 1.00 2135.36 1427.75 220.00 1295.00 1800.00 2505.0 40000.00 ▇▁▁▁▁
beds 6608 0.97 1.89 1.08 0.00 1.00 2.00 3.0 12.00 ▇▂▁▁▁
baths 158121 0.21 1.68 0.69 1.00 1.00 2.00 2.0 8.00 ▇▁▁▁▁
sqft 136117 0.32 1201.83 5000.22 80.00 750.00 1000.00 1360.0 900000.00 ▇▁▁▁▁
room_in_apt 0 1.00 0.00 0.04 0.00 0.00 0.00 0.0 1.00 ▇▁▁▁▁
lat 193145 0.04 37.67 0.35 33.57 37.40 37.76 37.8 40.43 ▁▁▅▇▁
lon 196484 0.02 -122.21 0.78 -123.20 -122.42 -122.26 -122.0 -74.20 ▇▁▁▁▁
# Build the working table: drop the sparse or unused columns, keep only rows
# with no missing values, and put price on the natural-log scale.
data <- rent %>%
  select(
    -c(address, descr, details, lat, lon, date, room_in_apt, year)
  ) %>%
  na.omit() %>%
  mutate(price = log(price))

#explore data: identify good predictors — sqft

# Scatter plot of log price (x) against square footage (y, log10 axis).
ggplot(data, aes(x = price, y = sqft)) +
  geom_point() +
  scale_y_log10()

beds

# Box plot of log price (x) for each bedroom count (y, treated as a factor).
ggplot(data, aes(x = price, y = as.factor(beds))) +
  geom_boxplot()

title

# Which title words go with the highest prices?
# Split each listing title into words, average log price per word, keep words
# that occur more than 10 times and contain no digits, then plot the 20 words
# with the highest mean log price.
data %>%
  unnest_tokens(output = word, input = title) %>%
  group_by(word) %>%
  summarise(
    price = mean(price),
    n = n()
  ) %>%
  ungroup() %>%
  # "\\d" is the regex for any digit, so tokens containing numbers
  # (e.g. "1200") are dropped; only alphabetic words remain.
  filter(n > 10, !str_detect(word, "\\d")) %>%
  slice_max(order_by = price, n = 20) %>%
  ggplot(aes(price, fct_reorder(word, price))) +
  geom_point() +
  labs(y = "words in Title")

EDA Shortcut

# Show the table without the identifier and free-text title columns.
select(data, -c(post_id, title))
## # A tibble: 14,394 × 7
##    nhood   city    county  price  beds baths  sqft
##    <chr>   <chr>   <chr>   <dbl> <dbl> <dbl> <dbl>
##  1 alameda alameda alameda  7.86     4   3    1756
##  2 alameda alameda alameda  7.23     2   1     700
##  3 alameda alameda alameda  7.58     3   2    1400
##  4 alameda alameda alameda  7.40     2   1.5   895
##  5 alameda alameda alameda  8.04     2   1    1200
##  6 alameda alameda alameda  6.91     2   1    1185
##  7 alameda alameda alameda  7.35     3   2.5  1626
##  8 alameda alameda alameda  7.67     3   3    1014
##  9 alameda alameda alameda  7.65     1   1     752
## 10 alameda alameda alameda  8.34     3   2.5  2205
## # ℹ 14,384 more rows
# Compact column-by-column preview of the cleaned table.
glimpse(data)
## Rows: 14,394
## Columns: 9
## $ post_id <chr> "4168358289", "pre2013_59350", "pre2013_72024", "pre2013_64956…
## $ nhood   <chr> "alameda", "alameda", "alameda", "alameda", "alameda", "alamed…
## $ city    <chr> "alameda", "alameda", "alameda", "alameda", "alameda", "alamed…
## $ county  <chr> "alameda", "alameda", "alameda", "alameda", "alameda", "alamed…
## $ price   <dbl> 7.861342, 7.226209, 7.575585, 7.402452, 8.039157, 6.907755, 7.…
## $ beds    <dbl> 4, 2, 3, 2, 2, 2, 3, 3, 1, 3, 3, 2, 2, 2, 2, 2, 4, 2, 2, 1, 3,…
## $ baths   <dbl> 3.0, 1.0, 2.0, 1.5, 1.0, 1.0, 2.5, 3.0, 1.0, 2.5, 2.5, 2.5, 1.…
## $ sqft    <dbl> 1756, 700, 1400, 895, 1200, 1185, 1626, 1014, 752, 2205, 1500,…
## $ title   <chr> "Nov  2 Newly remodeled 4br/3ba (2 mastersuites) in quiet Frem…
 #step 2 correlate

#preprocess data

#build models

#evaluate models

#make Predictions