raw_pdx <- read.csv("C:/Users/karol/Desktop/PORTLAND HOUSE.csv", stringsAsFactors=TRUE)
The data set contains 25,731 observations of 32 variables.
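A quick check of the dimensions confirms this:
dim(raw_pdx) # should return 25731 32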
# raw_pdx <- raw_pdx %>% select(-id) # optional: drop the id column
head(raw_pdx)
## id yearBuilt City latitude longitude zipcode bathrooms bedrooms
## 1 1 2007 Fairview 45.54357 -122.4418 97024 3 3
## 2 2 2001 Fairview 45.54758 -122.4532 97024 3 3
## 3 3 1982 Gresham 45.48823 -122.4444 97080 3 4
## 4 4 1953 Portland 45.52663 -122.4641 97230 1 3
## 5 5 1967 Gresham 45.51124 -122.4315 97030 3 6
## 6 6 1967 Gresham 45.48799 -122.4162 97080 2 3
## DateListed DateSold daysOnZillow homeType lastSoldPrice livingArea
## 1 4/26/2021 5/21/2021 25 TOWNHOUSE 315400 1806
## 2 3/1/2021 4/23/2021 53 SINGLE_FAMILY 400000 1518
## 3 5/24/2021 6/4/2021 11 SINGLE_FAMILY 512000 2724
## 4 5/24/2021 6/4/2021 11 SINGLE_FAMILY 348000 1217
## 5 5/18/2021 6/1/2021 14 APARTMENT 510000 2400
## 6 5/18/2021 6/1/2021 14 SINGLE_FAMILY 404200 1150
## lotSize price priceHistory.1.price propertyTaxRate hasCooling hasFireplace
## 1 1555 315400 212000 1.12 FALSE TRUE
## 2 3484 400000 375000 1.12 TRUE TRUE
## 3 9583 512000 479000 1.12 TRUE TRUE
## 4 13939 348000 339500 1.12 NA TRUE
## 5 8545 510000 252450 1.12 TRUE NA
## 6 7000 404200 204500 1.12 FALSE TRUE
## hasGarage hasHeating hasView schools0distance schools0level schools0rating
## 1 FALSE TRUE FALSE 0.4 Elementary 5
## 2 FALSE TRUE TRUE 1.2 Elementary 5
## 3 FALSE TRUE TRUE 0.8 Elementary 5
## 4 FALSE TRUE FALSE 0.8 Elementary 2
## 5 FALSE TRUE FALSE 0.3 Elementary 2
## 6 FALSE TRUE FALSE 0.4 Elementary 2
## schools1distance schools1level schools1rating schools2distance school2level
## 1 1.1 Middle 2 2.6 High
## 2 1.0 Middle 2 3.4 High
## 3 1.7 Middle 6 1.4 High
## 4 0.7 Middle 2 3.8 High
## 5 0.9 Middle 6 0.3 High
## 6 0.4 Middle 6 1.4 High
## schools2rating
## 1 3
## 2 3
## 3 3
## 4 3
## 5 3
## 6 3
# convert variables
raw_pdx <- raw_pdx %>%
  mutate(
    yearBuilt = as.numeric(yearBuilt),
    bathrooms = as.numeric(bathrooms),
    bedrooms = as.numeric(bedrooms),
    daysOnZillow = as.numeric(daysOnZillow),
    lastSoldPrice = as.numeric(lastSoldPrice),
    livingArea = as.numeric(livingArea),
    lotSize = as.numeric(lotSize),
    price = as.numeric(price),
    priceHistory.1.price = as.numeric(priceHistory.1.price),
    schools0rating = as.factor(schools0rating),
    schools1rating = as.factor(schools1rating),
    schools2rating = as.factor(schools2rating),
    zipcode = as.factor(zipcode)
  )
is.na(raw_pdx) %>% colSums()
## id yearBuilt City
## 0 546 0
## latitude longitude zipcode
## 13 13 0
## bathrooms bedrooms DateListed
## 484 770 0
## DateSold daysOnZillow homeType
## 0 6 0
## lastSoldPrice livingArea lotSize
## 0 465 2890
## price priceHistory.1.price propertyTaxRate
## 0 1419 11
## hasCooling hasFireplace hasGarage
## 4496 4624 0
## hasHeating hasView schools0distance
## 1 0 25
## schools0level schools0rating schools1distance
## 0 25 36
## schools1level schools1rating schools2distance
## 0 36 2242
## school2level schools2rating
## 0 2242
clean_data <- raw_pdx %>%
  filter(
    !is.na(yearBuilt), !is.na(latitude), !is.na(longitude),
    !is.na(bathrooms), !is.na(bedrooms), !is.na(daysOnZillow),
    !is.na(livingArea), !is.na(lotSize), !is.na(priceHistory.1.price),
    !is.na(propertyTaxRate), !is.na(hasCooling), !is.na(hasFireplace),
    !is.na(hasHeating), !is.na(schools0distance), !is.na(schools1distance),
    !is.na(schools2distance), !is.na(schools0rating), !is.na(schools1rating),
    !is.na(schools2rating)
  )
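An equivalent, more compact way to drop the same rows is tidyr::drop_na() over the listed columns (a sketch, assuming tidyr is loaded):
clean_data <- raw_pdx %>%
  drop_na(yearBuilt, latitude, longitude, bathrooms, bedrooms, daysOnZillow,
          livingArea, lotSize, priceHistory.1.price, propertyTaxRate,
          hasCooling, hasFireplace, hasHeating,
          schools0distance, schools1distance, schools2distance,
          schools0rating, schools1rating, schools2rating)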
summary(clean_data)
## id yearBuilt City latitude
## Min. : 1 Min. : 0 Portland :5232 Min. :45.26
## 1st Qu.: 6207 1st Qu.:1965 Beaverton :1456 1st Qu.:45.42
## Median :15156 Median :1989 Hillsboro :1194 Median :45.47
## Mean :13622 Mean :1981 Lake Oswego: 942 Mean :45.47
## 3rd Qu.:20440 3rd Qu.:2003 Tigard : 913 3rd Qu.:45.52
## Max. :25730 Max. :2021 Gresham : 845 Max. :45.62
## (Other) :3906
## longitude zipcode bathrooms bedrooms
## Min. :-123.1 97229 : 834 Min. : 0.00 Min. : 0.000
## 1st Qu.:-122.8 97045 : 713 1st Qu.: 2.00 1st Qu.: 3.000
## Median :-122.7 97007 : 706 Median : 3.00 Median : 3.000
## Mean :-122.7 97086 : 632 Mean : 2.78 Mean : 3.568
## 3rd Qu.:-122.6 97123 : 599 3rd Qu.: 3.00 3rd Qu.: 4.000
## Max. :-122.3 97068 : 573 Max. :10.00 Max. :10.000
## (Other):10431
## DateListed DateSold daysOnZillow homeType
## 8/9/2019 : 71 5/28/2021 : 159 Min. : 1 APARTMENT : 38
## 6/4/2021 : 66 6/30/2021 : 142 1st Qu.: 80 CONDO : 136
## 3/30/2021: 65 10/30/2020: 137 Median :192 HOME_TYPE_UNKNOWN: 2
## 8/6/2019 : 64 7/31/2020 : 128 Mean :187 SINGLE_FAMILY :13659
## 8/19/2019: 62 4/30/2021 : 127 3rd Qu.:294 TOWNHOUSE : 653
## 8/23/2019: 60 9/30/2020 : 122 Max. :422
## (Other) :14100 (Other) :13673
## lastSoldPrice livingArea lotSize price
## Min. : 443 Min. : 416 Min. : 0 Min. : 500
## 1st Qu.: 450000 1st Qu.: 1664 1st Qu.: 4791 1st Qu.: 450000
## Median : 551000 Median : 2206 Median : 7405 Median : 551000
## Mean : 634458 Mean : 2399 Mean : 17065 Mean : 634827
## 3rd Qu.: 710000 3rd Qu.: 2892 3rd Qu.: 10018 3rd Qu.: 710000
## Max. :6300000 Max. :14014 Max. :18992160 Max. :6300000
##
## priceHistory.1.price propertyTaxRate hasCooling hasFireplace
## Min. : 895 Min. :1.010 Mode :logical Mode :logical
## 1st Qu.: 415000 1st Qu.:1.080 FALSE:807 FALSE:645
## Median : 525000 Median :1.120 TRUE :13681 TRUE :13843
## Mean : 594080 Mean :1.113
## 3rd Qu.: 679992 3rd Qu.:1.130
## Max. :6888000 Max. :1.130
##
## hasGarage hasHeating hasView schools0distance
## Mode :logical Mode :logical Mode :logical Min. :0.0000
## FALSE:14150 FALSE:24 FALSE:8574 1st Qu.:0.4000
## TRUE :338 TRUE :14464 TRUE :5914 Median :0.6000
## Mean :0.8057
## 3rd Qu.:1.0000
## Max. :9.4000
##
## schools0level schools0rating schools1distance schools1level
## : 0 7 :2862 Min. : 0.000 : 0
## Elementary:13077 5 :2359 1st Qu.: 0.800 Elementary: 3
## Primary : 1411 6 :2323 Median : 1.300 High : 0
## 8 :1606 Mean : 1.546 Middle :14485
## 3 :1368 3rd Qu.: 2.100
## 4 :1332 Max. :11.900
## (Other):2638
## schools1rating schools2distance school2level schools2rating
## 5 :2347 Min. : 0.100 : 0 5 :3804
## 8 :2262 1st Qu.: 1.000 High:14488 8 :2365
## 3 :2238 Median : 1.700 3 :1962
## 6 :2104 Mean : 1.911 6 :1787
## 7 :1655 3rd Qu.: 2.500 9 :1746
## 4 :1595 Max. :10.800 4 :1230
## (Other):2287 (Other):1594
The cutoff of 551,000 comes from the summary above: it is the median sale price, so the two classes end up roughly balanced.
clean_data <- clean_data %>%
  mutate(price_category = factor(
    case_when(
      price < 551000 ~ "below",
      price >= 551000 ~ "above"
    )
  ))
library(gt) ## tables
clean_data %>%
count(price_category,
name ="total") %>%
mutate(percent = total/sum(total)*100,
percent = round(percent, 2)) %>%
gt() %>%
tab_header(
title = "Portland, OR and its Metropolitan Area Median House Prices",
    subtitle = "Above and below $551,000"
) %>%
cols_label(
price_category = "Price",
total = "Total",
percent = "Percent"
) %>%
  fmt_number(
    columns = total, # vars() is deprecated in current gt; tidyselect works directly
    suffixing = TRUE
  )
Portland, OR and its Metropolitan Area Median House Prices
Above and below $551,000

| Price | Total | Percent |
|---|---|---|
| above | 7.25K | 50.02 |
| below | 7.24K | 49.98 |
library(ggmap)
qmplot(x = longitude,
y = latitude,
data = clean_data,
geom = "point",
color = price_category,
alpha = 0.4) +
scale_alpha(guide = 'none')
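qmplot() downloads basemap tiles before plotting; if no tile source is available (recent ggmap releases require an API key for Stadia/Stamen tiles), a plain ggplot scatter of the coordinates shows the same spatial pattern (a sketch):
clean_data %>%
  ggplot(aes(x = longitude, y = latitude, color = price_category)) +
  geom_point(alpha = 0.4) +
  coord_quickmap() # approximately correct aspect ratio for latitude/longitude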
houses_pdx <-
clean_data %>%
select( # select our predictors
longitude,
latitude,
price_category,
bathrooms,
yearBuilt,
homeType,
bedrooms,
livingArea,
lotSize,
    schools2distance,
    schools1distance,
    schools0distance)
glimpse(houses_pdx)
## Rows: 14,488
## Columns: 12
## $ longitude <dbl> -122.4418, -122.4532, -122.4444, -122.4162, -122.4513…
## $ latitude <dbl> 45.54357, 45.54758, 45.48823, 45.48799, 45.49818, 45.…
## $ price_category <fct> below, below, below, below, below, below, below, belo…
## $ bathrooms <dbl> 3.0, 3.0, 3.0, 2.0, 2.0, 2.0, 3.0, 3.0, 3.0, 4.0, 3.0…
## $ yearBuilt <dbl> 2007, 2001, 1982, 1967, 1978, 2018, 2006, 2017, 1958,…
## $ homeType <fct> TOWNHOUSE, SINGLE_FAMILY, SINGLE_FAMILY, SINGLE_FAMIL…
## $ bedrooms <dbl> 3, 3, 4, 3, 3, 4, 3, 3, 4, 4, 3, 2, 4, 3, 4, 3, 3, 4,…
## $ livingArea <dbl> 1806, 1518, 2724, 1150, 2036, 1947, 1548, 2209, 2838,…
## $ lotSize <dbl> 1555, 3484, 9583, 7000, 6969, 4791, 5009, 5227, 6480,…
## $ schools2distance <dbl> 2.6, 3.4, 1.4, 1.4, 1.4, 2.2, 1.5, 1.4, 0.9, 2.3, 2.2…
## $ schools1distance <dbl> 1.1, 1.0, 1.7, 0.4, 2.1, 2.5, 0.5, 1.8, 0.3, 2.6, 2.5…
## $ schools0distance <dbl> 0.4, 1.2, 0.8, 0.4, 1.0, 0.3, 0.5, 1.0, 0.1, 0.4, 0.6…
pdx_long <- houses_pdx %>%
  select(-longitude, -latitude, -homeType, -yearBuilt, -lotSize) %>%
  pivot_longer(!price_category, names_to = "features", values_to = "values")
# Print the first 10 rows
pdx_long %>%
slice_head(n = 10)
## # A tibble: 10 × 3
## price_category features values
## <fct> <chr> <dbl>
## 1 below bathrooms 3
## 2 below bedrooms 3
## 3 below livingArea 1806
## 4 below schools2distance 2.6
## 5 below schools1distance 1.1
## 6 below schools0distance 0.4
## 7 below bathrooms 3
## 8 below bedrooms 3
## 9 below livingArea 1518
## 10 below schools2distance 3.4
theme_set(theme_light())
# Make a box plot for each predictor feature
pdx_long %>%
ggplot(mapping = aes(x = price_category, y = values, fill = features)) +
geom_boxplot() +
facet_wrap(~ features, scales = "free", ncol = 4) +
  scale_fill_viridis_d(option = "plasma", end = .7) + # fill is the mapped aesthetic, so use the fill scale
theme(legend.position = "none")
# Fix the random numbers by setting the seed
# This enables the analysis to be reproducible
set.seed(504)
# Put 3/4 of the data into the training set
data_split <- initial_split(houses_pdx,
prop = 3/4)
# Create dataframes for the two sets:
train_data <- training(data_split)
test_data <- testing(data_split)
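The two price classes are nearly balanced here, but for an imbalanced outcome the split itself can be stratified (a sketch):
data_split <- initial_split(houses_pdx,
                            prop = 3/4,
                            strata = price_category) # preserve the class ratio in both sets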
house_folds <-
vfold_cv(train_data,
v = 5,
strata = price_category)
pdx_rec <-
  recipe(price_category ~ .,
         data = train_data) %>%
  update_role(longitude, latitude,
              new_role = "ID") %>% # keep the coordinates for identification, not as predictors
  step_log(bathrooms, livingArea) %>% # log-transform the skewed numeric predictors
  step_naomit(everything(), skip = TRUE) %>% # drop rows with missing values (skipped when baking new data)
  step_novel(all_nominal(), -all_outcomes()) %>% # add a catch-all level for factor values not seen in training
  step_normalize(all_numeric(), -all_outcomes(),
                 -longitude, -latitude) %>% # center and scale numeric variables (mean 0, sd 1)
  step_dummy(all_nominal(), -all_outcomes()) %>% # convert factor columns to binary (0/1) indicator variables
  step_zv(all_numeric(), -all_outcomes()) %>% # remove numeric variables with zero variance
  step_corr(all_predictors(), threshold = 0.7, method = "spearman") # drop predictors highly correlated (Spearman) with others
summary(pdx_rec)
## # A tibble: 12 × 4
## variable type role source
## <chr> <chr> <chr> <chr>
## 1 longitude numeric ID original
## 2 latitude numeric ID original
## 3 bathrooms numeric predictor original
## 4 yearBuilt numeric predictor original
## 5 homeType nominal predictor original
## 6 bedrooms numeric predictor original
## 7 livingArea numeric predictor original
## 8 lotSize numeric predictor original
## 9 schools2distance numeric predictor original
## 10 schools1distance numeric predictor original
## 11 schools0distance numeric predictor original
## 12 price_category nominal outcome original
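To see which predictors step_corr() actually removes, you could tidy the prepped recipe (a sketch; number = 7 refers to step_corr(), the seventh step defined above):
pdx_rec %>%
  prep() %>%
  tidy(number = 7) # lists the terms removed by step_corr()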
prep_data <-
  pdx_rec %>% # use the recipe object
  prep() %>% # estimate the preprocessing parameters on the training data
  juice() # extract the preprocessed training set as a dataframe
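Note that juice() is superseded in newer releases of recipes; bake() with new_data = NULL returns the same preprocessed training set:
prep_data <- pdx_rec %>%
  prep() %>%
  bake(new_data = NULL) # equivalent to juice() on the prepped recipe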
log_spec <- # our model specification
  logistic_reg() %>% # model type
  set_engine(engine = "glm") %>% # model engine
  set_mode("classification") # model mode
# Show the model specification
log_spec
## Logistic Regression Model Specification (classification)
##
## Computational engine: glm
pdx_wflow <- # new workflow object
workflow() %>% # use workflow function
add_recipe(pdx_rec) %>% # use the new recipe
add_model(log_spec) # add your model spec
pdx_wflow
## ══ Workflow ════════════════════════════════════════════════════════════════════
## Preprocessor: Recipe
## Model: logistic_reg()
##
## ── Preprocessor ────────────────────────────────────────────────────────────────
## 7 Recipe Steps
##
## • step_log()
## • step_naomit()
## • step_novel()
## • step_normalize()
## • step_dummy()
## • step_zv()
## • step_corr()
##
## ── Model ───────────────────────────────────────────────────────────────────────
## Logistic Regression Model Specification (classification)
##
## Computational engine: glm
# save model coefficients for a fitted model object from a workflow
get_model <- function(x) {
  extract_fit_parsnip(x) %>% tidy() # extract_fit_parsnip() supersedes pull_workflow_fit()
}
# the same fit_resamples() call as before, now also extracting the fitted model from each fold
log_res_2 <-
pdx_wflow %>%
fit_resamples(
resamples = house_folds,
metrics = metric_set(
recall, precision, f_meas,
accuracy, kap,
roc_auc, sens, spec),
control = control_resamples(
save_pred = TRUE,
extract = get_model) # use extract and our new function
)
log_res_2$.extracts[[1]]
## NULL
To get the results use:
log_res_2$.extracts[[1]][[1]]
## NULL
All of the results can be flattened and collected using:
all_coef <- map_dfr(log_res_2$.extracts, ~ .x[[1]][[1]])
filter(all_coef, term == "bedrooms")
## # A tibble: 4 × 5
## term estimate std.error statistic p.value
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 bedrooms 0.118 0.0418 2.82 0.00474
## 2 bedrooms 0.123 0.0409 3.00 0.00269
## 3 bedrooms 0.122 0.0409 2.98 0.00288
## 4 bedrooms 0.152 0.0416 3.65 0.000262
Show performance for every single fold:
log_res_2 %>% collect_metrics(summarize = FALSE)
## # A tibble: 32 × 5
## id .metric .estimator .estimate .config
## <chr> <chr> <chr> <dbl> <chr>
## 1 Fold2 recall binary 0.808 Preprocessor1_Model1
## 2 Fold2 precision binary 0.813 Preprocessor1_Model1
## 3 Fold2 f_meas binary 0.811 Preprocessor1_Model1
## 4 Fold2 accuracy binary 0.811 Preprocessor1_Model1
## 5 Fold2 kap binary 0.622 Preprocessor1_Model1
## 6 Fold2 sens binary 0.808 Preprocessor1_Model1
## 7 Fold2 spec binary 0.814 Preprocessor1_Model1
## 8 Fold2 roc_auc binary 0.886 Preprocessor1_Model1
## 9 Fold3 recall binary 0.834 Preprocessor1_Model1
## 10 Fold3 precision binary 0.819 Preprocessor1_Model1
## # … with 22 more rows
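Averaging across the folds gives a single estimate per metric (collect_metrics() summarizes by default):
log_res_2 %>%
  collect_metrics(summarize = TRUE) # mean and standard error over the 5 folds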
To obtain the actual model predictions, we use the function collect_predictions and save the result as log_pred:
log_pred <-
log_res_2 %>%
collect_predictions()
log_pred %>%
conf_mat(price_category, .pred_class)
## Truth
## Prediction above below
## above 3595 790
## below 754 3553
log_pred %>%
conf_mat(price_category, .pred_class) %>%
autoplot(type = "heatmap")+
theme_minimal()
log_pred %>%
group_by(id) %>% # id contains our folds
roc_curve(price_category, .pred_above) %>%
autoplot()+
theme_minimal()
pdx_fit <- fit(pdx_wflow, train_data)
This allows us to use the model trained by this workflow to predict labels for our test set, and compare the performance metrics with the basic model we created previously.
pdx_fit %>% ## display results
  extract_fit_parsnip() %>% # supersedes pull_workflow_fit()
  tidy() %>%
  filter(p.value < 0.05) # keep terms significant at the 5% level
## # A tibble: 7 × 5
## term estimate std.error statistic p.value
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 yearBuilt 0.201 0.0419 4.80 0.00000160
## 2 bedrooms 0.123 0.0368 3.33 0.000858
## 3 livingArea -2.53 0.0548 -46.1 0
## 4 lotSize -0.0562 0.0202 -2.78 0.00543
## 5 schools2distance 0.0798 0.0303 2.63 0.00845
## 6 schools1distance -0.174 0.0311 -5.61 0.0000000206
## 7 homeType_TOWNHOUSE 0.945 0.179 5.28 0.000000128
library(vip)
pdx_fit %>%
extract_fit_parsnip() %>%
vip()+
theme_minimal()
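If you want the scores behind the plot rather than the bars, vip also exposes vi(), which returns the importance values as a tibble:
pdx_fit %>%
  extract_fit_parsnip() %>%
  vi() # variable importance scores as a tibble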
# Make predictions on the test set
pred_results <- test_data %>%
select(price_category) %>%
bind_cols(pdx_fit %>%
predict(new_data = test_data)) %>%
bind_cols(pdx_fit %>%
predict(new_data = test_data, type = "prob"))
# Print the results
pred_results %>%
slice_head(n = 10)
## price_category .pred_class .pred_above .pred_below
## 1 below above 0.796748877 0.2032511
## 2 below below 0.298118188 0.7018818
## 3 below above 0.516464311 0.4835357
## 4 below below 0.005322731 0.9946773
## 5 below below 0.035205673 0.9647943
## 6 below below 0.084535990 0.9154640
## 7 below below 0.049387971 0.9506120
## 8 below below 0.009201868 0.9907981
## 9 below below 0.021328081 0.9786719
## 10 below above 0.818118929 0.1818811
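Newer tidymodels releases also provide augment(), which attaches the class and probability predictions in a single call (a sketch; it returns all test-set columns plus the .pred_* columns, not just price_category):
augment(pdx_fit, new_data = test_data) %>%
  slice_head(n = 10)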
Let’s take a look at the confusion matrix:
pred_results %>%
  conf_mat(price_category, .pred_class) %>%
  autoplot(type = "heatmap") +
  theme_minimal()
# Evaluate other desired metrics
eval_metrics <- metric_set(ppv, recall, accuracy, f_meas)
eval_metrics(data = pred_results, truth = price_category, estimate = .pred_class)
## # A tibble: 4 × 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 ppv binary 0.813
## 2 recall binary 0.823
## 3 accuracy binary 0.817
## 4 f_meas binary 0.818
pred_results %>%
roc_auc(price_category, .pred_above)
## # A tibble: 1 × 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 roc_auc binary 0.895
pred_results %>%
roc_curve(truth = price_category, .pred_above) %>%
autoplot()+
theme_minimal()