Get the Data

raw_pdx <- read.csv("C:/Users/karol/Desktop/PORTLAND HOUSE.csv", stringsAsFactors=TRUE)

Prepare the Data

This data has 25731 obs. of 32 variables

## raw_pdx <- raw_pdx%>%select(-id)

head(raw_pdx)
##   id yearBuilt     City latitude longitude zipcode bathrooms bedrooms
## 1  1      2007 Fairview 45.54357 -122.4418   97024         3        3
## 2  2      2001 Fairview 45.54758 -122.4532   97024         3        3
## 3  3      1982  Gresham 45.48823 -122.4444   97080         3        4
## 4  4      1953 Portland 45.52663 -122.4641   97230         1        3
## 5  5      1967  Gresham 45.51124 -122.4315   97030         3        6
## 6  6      1967  Gresham 45.48799 -122.4162   97080         2        3
##   DateListed  DateSold daysOnZillow      homeType lastSoldPrice livingArea
## 1  4/26/2021 5/21/2021           25     TOWNHOUSE        315400       1806
## 2   3/1/2021 4/23/2021           53 SINGLE_FAMILY        400000       1518
## 3  5/24/2021  6/4/2021           11 SINGLE_FAMILY        512000       2724
## 4  5/24/2021  6/4/2021           11 SINGLE_FAMILY        348000       1217
## 5  5/18/2021  6/1/2021           14     APARTMENT        510000       2400
## 6  5/18/2021  6/1/2021           14 SINGLE_FAMILY        404200       1150
##   lotSize  price priceHistory.1.price propertyTaxRate hasCooling hasFireplace
## 1    1555 315400               212000            1.12      FALSE         TRUE
## 2    3484 400000               375000            1.12       TRUE         TRUE
## 3    9583 512000               479000            1.12       TRUE         TRUE
## 4   13939 348000               339500            1.12         NA         TRUE
## 5    8545 510000               252450            1.12       TRUE           NA
## 6    7000 404200               204500            1.12      FALSE         TRUE
##   hasGarage hasHeating hasView schools0distance schools0level schools0rating
## 1     FALSE       TRUE   FALSE              0.4    Elementary              5
## 2     FALSE       TRUE    TRUE              1.2    Elementary              5
## 3     FALSE       TRUE    TRUE              0.8    Elementary              5
## 4     FALSE       TRUE   FALSE              0.8    Elementary              2
## 5     FALSE       TRUE   FALSE              0.3    Elementary              2
## 6     FALSE       TRUE   FALSE              0.4    Elementary              2
##   schools1distance schools1level schools1rating schools2distance school2level
## 1              1.1        Middle              2              2.6         High
## 2              1.0        Middle              2              3.4         High
## 3              1.7        Middle              6              1.4         High
## 4              0.7        Middle              2              3.8         High
## 5              0.9        Middle              6              0.3         High
## 6              0.4        Middle              6              1.4         High
##   schools2rating
## 1              3
## 2              3
## 3              3
## 4              3
## 5              3
## 6              3
# convert variables

raw_pdx <-  raw_pdx %>% 
  
  mutate(
    
    yearBuilt = as.numeric(yearBuilt),
    bathrooms = as.numeric(bathrooms),
    bedrooms = as.numeric(bedrooms),
    daysOnZillow = as.numeric(daysOnZillow),
    lastSoldPrice = as.numeric(lastSoldPrice ),
    livingArea = as.numeric(livingArea),
    lotSize= as.numeric(lotSize),
    price = as.numeric(price),
    priceHistory.1.price= as.numeric(priceHistory.1.price),
    
    
    schools0rating = as.factor(schools0rating),
    schools1rating = as.factor(schools1rating),
    schools2rating= as.factor(schools2rating),
    zipcode = as.factor(zipcode)

    
  )

Missing data

is.na(raw_pdx) %>% colSums()
##                   id            yearBuilt                 City 
##                    0                  546                    0 
##             latitude            longitude              zipcode 
##                   13                   13                    0 
##            bathrooms             bedrooms           DateListed 
##                  484                  770                    0 
##             DateSold         daysOnZillow             homeType 
##                    0                    6                    0 
##        lastSoldPrice           livingArea              lotSize 
##                    0                  465                 2890 
##                price priceHistory.1.price      propertyTaxRate 
##                    0                 1419                   11 
##           hasCooling         hasFireplace            hasGarage 
##                 4496                 4624                    0 
##           hasHeating              hasView     schools0distance 
##                    1                    0                   25 
##        schools0level       schools0rating     schools1distance 
##                    0                   25                   36 
##        schools1level       schools1rating     schools2distance 
##                    0                   36                 2242 
##         school2level       schools2rating 
##                    0                 2242
clean_data <- raw_pdx %>%
  filter(!is.na(yearBuilt))%>%
  filter(!is.na(longitude))%>%
  filter(!is.na(bedrooms))%>%
  filter(!is.na(daysOnZillow))%>%
  filter(!is.na(livingArea))%>%
  filter(!is.na(priceHistory.1.price))%>%
  filter(!is.na(hasFireplace))%>%
  filter(!is.na(latitude))%>%
  filter(!is.na(hasHeating))%>%
  filter(!is.na(hasCooling))%>%
  filter(!is.na(bathrooms))%>%
  filter(!is.na(lotSize))%>%
  filter(!is.na(propertyTaxRate))%>%
  filter(!is.na(schools0distance))%>%
  filter(!is.na(schools1distance))%>%
  filter(!is.na(schools2distance))%>%
  filter(!is.na(schools0rating))%>%
  filter(!is.na(schools2rating))%>%
  filter(!is.na(schools1rating))%>%
  filter(!is.na(schools1rating))
summary(clean_data)
##        id          yearBuilt             City         latitude    
##  Min.   :    1   Min.   :   0   Portland   :5232   Min.   :45.26  
##  1st Qu.: 6207   1st Qu.:1965   Beaverton  :1456   1st Qu.:45.42  
##  Median :15156   Median :1989   Hillsboro  :1194   Median :45.47  
##  Mean   :13622   Mean   :1981   Lake Oswego: 942   Mean   :45.47  
##  3rd Qu.:20440   3rd Qu.:2003   Tigard     : 913   3rd Qu.:45.52  
##  Max.   :25730   Max.   :2021   Gresham    : 845   Max.   :45.62  
##                                 (Other)    :3906                  
##    longitude         zipcode        bathrooms        bedrooms     
##  Min.   :-123.1   97229  :  834   Min.   : 0.00   Min.   : 0.000  
##  1st Qu.:-122.8   97045  :  713   1st Qu.: 2.00   1st Qu.: 3.000  
##  Median :-122.7   97007  :  706   Median : 3.00   Median : 3.000  
##  Mean   :-122.7   97086  :  632   Mean   : 2.78   Mean   : 3.568  
##  3rd Qu.:-122.6   97123  :  599   3rd Qu.: 3.00   3rd Qu.: 4.000  
##  Max.   :-122.3   97068  :  573   Max.   :10.00   Max.   :10.000  
##                   (Other):10431                                   
##      DateListed          DateSold      daysOnZillow              homeType    
##  8/9/2019 :   71   5/28/2021 :  159   Min.   :  1   APARTMENT        :   38  
##  6/4/2021 :   66   6/30/2021 :  142   1st Qu.: 80   CONDO            :  136  
##  3/30/2021:   65   10/30/2020:  137   Median :192   HOME_TYPE_UNKNOWN:    2  
##  8/6/2019 :   64   7/31/2020 :  128   Mean   :187   SINGLE_FAMILY    :13659  
##  8/19/2019:   62   4/30/2021 :  127   3rd Qu.:294   TOWNHOUSE        :  653  
##  8/23/2019:   60   9/30/2020 :  122   Max.   :422                            
##  (Other)  :14100   (Other)   :13673                                          
##  lastSoldPrice       livingArea       lotSize             price        
##  Min.   :    443   Min.   :  416   Min.   :       0   Min.   :    500  
##  1st Qu.: 450000   1st Qu.: 1664   1st Qu.:    4791   1st Qu.: 450000  
##  Median : 551000   Median : 2206   Median :    7405   Median : 551000  
##  Mean   : 634458   Mean   : 2399   Mean   :   17065   Mean   : 634827  
##  3rd Qu.: 710000   3rd Qu.: 2892   3rd Qu.:   10018   3rd Qu.: 710000  
##  Max.   :6300000   Max.   :14014   Max.   :18992160   Max.   :6300000  
##                                                                        
##  priceHistory.1.price propertyTaxRate hasCooling      hasFireplace   
##  Min.   :    895      Min.   :1.010   Mode :logical   Mode :logical  
##  1st Qu.: 415000      1st Qu.:1.080   FALSE:807       FALSE:645      
##  Median : 525000      Median :1.120   TRUE :13681     TRUE :13843    
##  Mean   : 594080      Mean   :1.113                                  
##  3rd Qu.: 679992      3rd Qu.:1.130                                  
##  Max.   :6888000      Max.   :1.130                                  
##                                                                      
##  hasGarage       hasHeating       hasView        schools0distance
##  Mode :logical   Mode :logical   Mode :logical   Min.   :0.0000  
##  FALSE:14150     FALSE:24        FALSE:8574      1st Qu.:0.4000  
##  TRUE :338       TRUE :14464     TRUE :5914      Median :0.6000  
##                                                  Mean   :0.8057  
##                                                  3rd Qu.:1.0000  
##                                                  Max.   :9.4000  
##                                                                  
##     schools0level   schools0rating schools1distance    schools1level  
##            :    0   7      :2862   Min.   : 0.000             :    0  
##  Elementary:13077   5      :2359   1st Qu.: 0.800   Elementary:    3  
##  Primary   : 1411   6      :2323   Median : 1.300   High      :    0  
##                     8      :1606   Mean   : 1.546   Middle    :14485  
##                     3      :1368   3rd Qu.: 2.100                     
##                     4      :1332   Max.   :11.900                     
##                     (Other):2638                                      
##  schools1rating schools2distance school2level schools2rating
##  5      :2347   Min.   : 0.100       :    0   5      :3804  
##  8      :2262   1st Qu.: 1.000   High:14488   8      :2365  
##  3      :2238   Median : 1.700                3      :1962  
##  6      :2104   Mean   : 1.911                6      :1787  
##  7      :1655   3rd Qu.: 2.500                9      :1746  
##  4      :1595   Max.   :10.800                4      :1230  
##  (Other):2287                                 (Other):1594
clean_data <- 
  clean_data %>% 
  mutate(price_category = case_when( 
    price < 551000 ~ "below",
    price >= 551000 ~ "above")) %>% 
  mutate(price_category = as.factor(price_category)) 

Take a look at the Data

library(gt) ## tables

clean_data %>% 
  count(price_category, 
        name ="total") %>%
  mutate(percent = total/sum(total)*100,
         percent = round(percent, 2)) %>%
 gt() %>%
  tab_header(
    title = "Portland, OR and its Metropolitan Area Median House Prices",
    subtitle = "Above and below 551,000$"
  ) %>%
  cols_label(
    price_category = "Price",
    total = "Total",
    percent = "Percent"
  ) %>% 
  fmt_number(
    columns = vars(total),
    suffixing = TRUE
  ) 
Portland, OR and its Metropolitan Area Median House Prices
Above and below 551,000$
Price Total Percent
above 7.25K 50.02
below 7.24K 49.98
library(ggmap)

qmplot(x = longitude, 
       y = latitude, 
       data = clean_data, 
       geom = "point", 
       color = price_category, 
       alpha = 0.4) +
  scale_alpha(guide = 'none')

houses_pdx <-
  clean_data %>% 
  select( # select our predictors
    longitude, 
    latitude, 
    price_category,
    bathrooms, 
    yearBuilt, 
    homeType,
    bedrooms, 
    livingArea, 
    lotSize,
    schools2distance,
    schools1distance,
    schools0distance,
    schools1distance)

glimpse(houses_pdx)
## Rows: 14,488
## Columns: 12
## $ longitude        <dbl> -122.4418, -122.4532, -122.4444, -122.4162, -122.4513…
## $ latitude         <dbl> 45.54357, 45.54758, 45.48823, 45.48799, 45.49818, 45.…
## $ price_category   <fct> below, below, below, below, below, below, below, belo…
## $ bathrooms        <dbl> 3.0, 3.0, 3.0, 2.0, 2.0, 2.0, 3.0, 3.0, 3.0, 4.0, 3.0…
## $ yearBuilt        <dbl> 2007, 2001, 1982, 1967, 1978, 2018, 2006, 2017, 1958,…
## $ homeType         <fct> TOWNHOUSE, SINGLE_FAMILY, SINGLE_FAMILY, SINGLE_FAMIL…
## $ bedrooms         <dbl> 3, 3, 4, 3, 3, 4, 3, 3, 4, 4, 3, 2, 4, 3, 4, 3, 3, 4,…
## $ livingArea       <dbl> 1806, 1518, 2724, 1150, 2036, 1947, 1548, 2209, 2838,…
## $ lotSize          <dbl> 1555, 3484, 9583, 7000, 6969, 4791, 5009, 5227, 6480,…
## $ schools2distance <dbl> 2.6, 3.4, 1.4, 1.4, 1.4, 2.2, 1.5, 1.4, 0.9, 2.3, 2.2…
## $ schools1distance <dbl> 1.1, 1.0, 1.7, 0.4, 2.1, 2.5, 0.5, 1.8, 0.3, 2.6, 2.5…
## $ schools0distance <dbl> 0.4, 1.2, 0.8, 0.4, 1.0, 0.3, 0.5, 1.0, 0.1, 0.4, 0.6…
pdx_long <- houses_pdx %>% 
  select(-longitude,-latitude, -homeType, -yearBuilt, -lotSize)%>%
    pivot_longer(!price_category, names_to = "features", values_to = "values")


# Print the first 10 rows
pdx_long %>% 
  slice_head(n = 10)
## # A tibble: 10 × 3
##    price_category features         values
##    <fct>          <chr>             <dbl>
##  1 below          bathrooms           3  
##  2 below          bedrooms            3  
##  3 below          livingArea       1806  
##  4 below          schools2distance    2.6
##  5 below          schools1distance    1.1
##  6 below          schools0distance    0.4
##  7 below          bathrooms           3  
##  8 below          bedrooms            3  
##  9 below          livingArea       1518  
## 10 below          schools2distance    3.4
theme_set(theme_light())

# Make a box plot for each predictor feature
pdx_long %>% 
  ggplot(mapping = aes(x = price_category, y = values, fill = features)) +
  geom_boxplot() + 
  facet_wrap(~ features, scales = "free", ncol = 4) +
  scale_color_viridis_d(option = "plasma", end = .7) +
  theme(legend.position = "none")

Data Splitting

# Fix the random numbers by setting the seed 
# This enables the analysis to be reproducible 
set.seed(504)

# Put 3/4 of the data into the training set 
data_split <- initial_split(houses_pdx, 
                           prop = 3/4)

# Create dataframes for the two sets:
train_data <- training(data_split) 
test_data <- testing(data_split)

Validaton Set

house_folds <-
 vfold_cv(train_data, 
          v = 5, 
          strata = price_category) 
pdx_rec <-
  recipe(price_category ~ .,
         data = train_data) %>%
  update_role(longitude, latitude, 
              new_role = "ID") %>% 
  
  step_log(bathrooms, livingArea) %>% ## step_log() will log transform data
  
  step_naomit(everything(), skip = TRUE) %>% 
  
  step_novel(all_nominal(), -all_outcomes()) %>% # converts all nominal variables to factors and takes care of other issues related to categorical variables.
  
  step_normalize(all_numeric(), -all_outcomes(), 
                 -longitude, -latitude) %>% # step_normalize() normalizes (center and scales) the numeric variables to have a standard deviation of one and a mean of zero
  
  step_dummy(all_nominal(), -all_outcomes()) %>% #converts our factor columns into numeric binary (0 and 1) variables.
  
  step_zv(all_numeric(), -all_outcomes()) %>% ## step_zv(): removes any numeric variables that have zero variance.
  
  step_corr(all_predictors(), threshold = 0.7, method = "spearman") # step_corr(): will remove predictor variables that have large correlations with other predictor variables.
summary(pdx_rec)
## # A tibble: 12 × 4
##    variable         type    role      source  
##    <chr>            <chr>   <chr>     <chr>   
##  1 longitude        numeric ID        original
##  2 latitude         numeric ID        original
##  3 bathrooms        numeric predictor original
##  4 yearBuilt        numeric predictor original
##  5 homeType         nominal predictor original
##  6 bedrooms         numeric predictor original
##  7 livingArea       numeric predictor original
##  8 lotSize          numeric predictor original
##  9 schools2distance numeric predictor original
## 10 schools1distance numeric predictor original
## 11 schools0distance numeric predictor original
## 12 price_category   nominal outcome   original
prep_data <- 
  pdx_rec %>% # use the recipe object
  prep() %>% # perform the recipe on training data
  juice() # extract only the preprocessed dataframe 

The Model- Logistic regression

log_spec <- # your model specification
  logistic_reg() %>%  # model type
  set_engine(engine = "glm") %>%  # model engine
  set_mode("classification") # model mode

# Show your model specification
log_spec
## Logistic Regression Model Specification (classification)
## 
## Computational engine: glm
pdx_wflow <- # new workflow object
 workflow() %>% # use workflow function
 add_recipe(pdx_rec) %>%   # use the new recipe
 add_model(log_spec)   # add your model spec

pdx_wflow
## ══ Workflow ════════════════════════════════════════════════════════════════════
## Preprocessor: Recipe
## Model: logistic_reg()
## 
## ── Preprocessor ────────────────────────────────────────────────────────────────
## 7 Recipe Steps
## 
## • step_log()
## • step_naomit()
## • step_novel()
## • step_normalize()
## • step_dummy()
## • step_zv()
## • step_corr()
## 
## ── Model ───────────────────────────────────────────────────────────────────────
## Logistic Regression Model Specification (classification)
## 
## Computational engine: glm
# save model coefficients for a fitted model object from a workflow

get_model <- function(x) {
  pull_workflow_fit(x) %>% tidy()
}

# same as before with one exception
log_res_2 <- 
  pdx_wflow %>% 
  fit_resamples(
    resamples = house_folds, 
    metrics = metric_set(
      recall, precision, f_meas, 
      accuracy, kap,
      roc_auc, sens, spec),
      control = control_resamples(
      save_pred = TRUE,
      extract = get_model) # use extract and our new function
    ) 

log_res_2$.extracts[[1]]
## NULL

To get the results use:

log_res_2$.extracts[[1]][[1]]
## NULL

All of the results can be flattened and collected using:

all_coef <- map_dfr(log_res_2$.extracts, ~ .x[[1]][[1]])
filter(all_coef, term == "bedrooms")
## # A tibble: 4 × 5
##   term     estimate std.error statistic  p.value
##   <chr>       <dbl>     <dbl>     <dbl>    <dbl>
## 1 bedrooms    0.118    0.0418      2.82 0.00474 
## 2 bedrooms    0.123    0.0409      3.00 0.00269 
## 3 bedrooms    0.122    0.0409      2.98 0.00288 
## 4 bedrooms    0.152    0.0416      3.65 0.000262

Performance metrics

Show performance for every single fold:

log_res_2 %>%  collect_metrics(summarize = FALSE)
## # A tibble: 32 × 5
##    id    .metric   .estimator .estimate .config             
##    <chr> <chr>     <chr>          <dbl> <chr>               
##  1 Fold2 recall    binary         0.808 Preprocessor1_Model1
##  2 Fold2 precision binary         0.813 Preprocessor1_Model1
##  3 Fold2 f_meas    binary         0.811 Preprocessor1_Model1
##  4 Fold2 accuracy  binary         0.811 Preprocessor1_Model1
##  5 Fold2 kap       binary         0.622 Preprocessor1_Model1
##  6 Fold2 sens      binary         0.808 Preprocessor1_Model1
##  7 Fold2 spec      binary         0.814 Preprocessor1_Model1
##  8 Fold2 roc_auc   binary         0.886 Preprocessor1_Model1
##  9 Fold3 recall    binary         0.834 Preprocessor1_Model1
## 10 Fold3 precision binary         0.819 Preprocessor1_Model1
## # … with 22 more rows

Collect predictions

To obtain the actual model predictions, we use the function collect_predictions and save the result as log_pred:

log_pred <- 
  log_res_2 %>%
  collect_predictions()


log_pred %>% 
  conf_mat(price_category, .pred_class) 
##           Truth
## Prediction above below
##      above  3595   790
##      below   754  3553
log_pred %>% 
  conf_mat(price_category, .pred_class) %>% 
  autoplot(type = "heatmap")+
  theme_minimal()

ROC Curve

log_pred %>% 
  group_by(id) %>% # id contains our folds
  roc_curve(price_category, .pred_above) %>% 
  autoplot()+
  theme_minimal()

Use the workflow to train our model

pdx_fit <- fit(pdx_wflow, train_data)

This allows us to use the model trained by this workflow to predict labels for our test set, and compare the performance metrics with the basic model we created previously.

pdx_fit %>% ## display results
pull_workflow_fit() %>%
tidy()%>%
  filter(p.value < 0.05)
## # A tibble: 7 × 5
##   term               estimate std.error statistic      p.value
##   <chr>                 <dbl>     <dbl>     <dbl>        <dbl>
## 1 yearBuilt            0.201     0.0419      4.80 0.00000160  
## 2 bedrooms             0.123     0.0368      3.33 0.000858    
## 3 livingArea          -2.53      0.0548    -46.1  0           
## 4 lotSize             -0.0562    0.0202     -2.78 0.00543     
## 5 schools2distance     0.0798    0.0303      2.63 0.00845     
## 6 schools1distance    -0.174     0.0311     -5.61 0.0000000206
## 7 homeType_TOWNHOUSE   0.945     0.179       5.28 0.000000128
library(vip)

pdx_fit %>%
extract_fit_parsnip() %>%
   vip()+
  theme_minimal()

# Make predictions on the test set
pred_results <- test_data %>% 
  select(price_category) %>% 
  bind_cols(pdx_fit %>% 
              predict(new_data = test_data)) %>% 
  bind_cols(pdx_fit %>% 
              predict(new_data = test_data, type = "prob"))

# Print the results
pred_results %>% 
  slice_head(n = 10)
##    price_category .pred_class .pred_above .pred_below
## 1           below       above 0.796748877   0.2032511
## 2           below       below 0.298118188   0.7018818
## 3           below       above 0.516464311   0.4835357
## 4           below       below 0.005322731   0.9946773
## 5           below       below 0.035205673   0.9647943
## 6           below       below 0.084535990   0.9154640
## 7           below       below 0.049387971   0.9506120
## 8           below       below 0.009201868   0.9907981
## 9           below       below 0.021328081   0.9786719
## 10          below       above 0.818118929   0.1818811

Let’s take a look at the confusion matrix:

pred_results%>% 
  conf_mat(price_category, .pred_class) %>% 
  autoplot(type = "heatmap")+
  theme_minimal()

# Evaluate other desired metrics

eval_metrics <- metric_set(ppv, recall, accuracy, f_meas)
eval_metrics(data = pred_results, truth = price_category, estimate = .pred_class)
## # A tibble: 4 × 3
##   .metric  .estimator .estimate
##   <chr>    <chr>          <dbl>
## 1 ppv      binary         0.813
## 2 recall   binary         0.823
## 3 accuracy binary         0.817
## 4 f_meas   binary         0.818
pred_results %>% 
  roc_auc(price_category, .pred_above)
## # A tibble: 1 × 3
##   .metric .estimator .estimate
##   <chr>   <chr>          <dbl>
## 1 roc_auc binary         0.895

Make a roc_chart

pred_results %>% 
  roc_curve(truth = price_category, .pred_above) %>% 
  autoplot()+
  theme_minimal()