# Dataset: House sales in King County, USA. Task: predict house price using regression.
#This data contains information on homes sold in King County (the Seattle, Washington area) between May 2014 and May 2015.
#create training and test datasets from the home_sales data
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✔ ggplot2 3.3.6 ✔ purrr 0.3.4
## ✔ tibble 3.1.7 ✔ dplyr 1.0.9
## ✔ tidyr 1.2.0 ✔ stringr 1.4.0
## ✔ readr 2.1.2 ✔ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
library(knitr)
library(tidymodels)
## ── Attaching packages ────────────────────────────────────── tidymodels 0.2.0 ──
## ✔ broom 1.0.0 ✔ rsample 1.0.0
## ✔ dials 1.0.0 ✔ tune 1.0.0
## ✔ infer 1.0.2 ✔ workflows 1.0.0
## ✔ modeldata 1.0.0 ✔ workflowsets 0.2.1
## ✔ parsnip 1.0.0 ✔ yardstick 1.0.0
## ✔ recipes 1.0.1
## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ scales::discard() masks purrr::discard()
## ✖ dplyr::filter() masks stats::filter()
## ✖ recipes::fixed() masks stringr::fixed()
## ✖ dplyr::lag() masks stats::lag()
## ✖ yardstick::spec() masks readr::spec()
## ✖ recipes::step() masks stats::step()
## • Learn how to get started at https://www.tidymodels.org/start/
#home_sales <- read_rds("data/home_sales.rds")
#telecom_df <- read_rds("data/telecom_df.rds")
#loans_df <- read_rds("data/loan_df.rds")
library(readr)
# Load the King County house-sales CSV into the home_sales tibble.
# NOTE(review): this is an absolute path on the author's machine -- adjust it before running elsewhere.
home_sales <- read_csv(file = "/home/kn/Documents/TIDY/Modelling_tidymodels/kc_house_data.csv")
## Rows: 21613 Columns: 21
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): id
## dbl (19): price, bedrooms, bathrooms, sqft_living, sqft_lot, floors, waterf...
## dttm (1): date
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Open the spreadsheet-style data viewer only in an interactive session;
# View() needs a GUI and would break non-interactive runs (Rscript, knitting).
if (interactive()) {
  View(home_sales)
}
#Create an rsample object, home_split, that contains the instructions for randomly splitting the home_sales data into a training and test dataset
#Allocate 70% of the data into training and stratify the results by price
#Fix the random seed first so the split (and every number derived from it) is the same on each run.
#NOTE: the outputs recorded in this file came from an unseeded run, so values that depend on the split may differ slightly.
set.seed(2022)
home_split <- initial_split(home_sales,
                            prop = 0.7,
                            strata = price)
# Extract the training portion of home_split into home_training
home_training <- training(home_split)
# Extract the held-out portion of home_split into home_test; testing() is the counterpart of training()
home_test <- testing(home_split)
# Count the rows in each tibble to confirm the split proportions
# Test set
home_test %>% nrow()
## [1] 6486
# EXERCISE FOR YOU: count the rows in the training tibble
home_training %>% nrow()
## [1] 15127
# Rows in the full dataset, for comparison
# With prop = 0.7, about 70% of the rows go to training and 30% to test;
# the test set still keeps several thousand rows for the model evaluation step.
home_sales %>% nrow()
## [1] 21613
#What is stratified sampling in R?
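#One commonly used sampling method is stratified random sampling, in which a population is split into groups and a certain number of members from each group are randomly selected to be included in the sample. Here, strata = price makes rsample bin the numeric price into quantile-based groups and sample within each group, so the training and test sets have similar price distributions.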
#EXERCISE ON DATA WRANGLING
# Compare the distribution of price in the training and test tibbles
# (home_training and home_test) using min, max, mean and standard deviation.
# Price summary for the training data; columns are named <stat>_price
home_training %>%
  summarise(across(price,
                   list(min = min, max = max, mean = mean, sd = sd),
                   .names = "{.fn}_{.col}"))
## # A tibble: 1 × 4
## min_price max_price mean_price sd_price
## <dbl> <dbl> <dbl> <dbl>
## 1 75000 7700000 540275. 366108.
# Price summary for the test data; columns are named <stat>_price
home_test %>%
  summarise(across(price,
                   list(min = min, max = max, mean = mean, sd = sd),
                   .names = "{.fn}_{.col}"))
## # A tibble: 1 × 4
## min_price max_price mean_price sd_price
## <dbl> <dbl> <dbl> <dbl>
## 1 78000 6885000 539652. 369521.
#In this exercise, you will define a parsnip linear regression object and train your model to predict price using yr_built and sqft_living as predictor variables from the home_sales data.
# Build the parsnip model specification step by step
linear_model <- linear_reg()                          # linear regression spec
linear_model <- set_engine(linear_model, 'lm')        # fit with the lm engine
linear_model <- set_mode(linear_model, 'regression')  # regression mode
# Fit the specification on home_training: price is the outcome, yr_built and sqft_living are the predictors
lm_fit <- fit(linear_model,
              price ~ yr_built + sqft_living,
              data = home_training)
#EXERCISE
# Print lm_fit to view model information
# The recorded output below shows the underlying stats::lm() call and the estimated coefficients.
print(lm_fit)
## parsnip model object
##
##
## Call:
## stats::lm(formula = price ~ yr_built + sqft_living, data = data)
##
## Coefficients:
## (Intercept) yr_built sqft_living
## 4630850.9 -2395.7 303.7
#Excellent work!
#You have defined your model with linear_reg() and trained it to predict price using yr_built and sqft_living.
#Printing a parsnip model fit object displays useful model information, such as the model call (including the formula used during training) and the estimated model parameters
# Inspect the estimated parameters as a tibble (one row per term)
lm_fit %>%
  tidy()
## # A tibble: 3 × 5
## term estimate std.error statistic p.value
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) 4630851. 143356. 32.3 1.95e-221
## 2 yr_built -2396. 73.5 -32.6 1.68e-225
## 3 sqft_living 304. 2.35 129. 0
#WHAT DO WE OBSERVE FROM THE TIBBLE??
#Since sqft_living has a positive estimated parameter, the price of homes increases with the square footage.
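#Conversely, yr_built is the year the home was built (not its age). Its negative estimated parameter means that, for a fixed sqft_living, more recently built homes have lower predicted prices, i.e. older homes tend to sell for more.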
#Now let us predict home prices
#Before you can evaluate model performance, you must add your predictions to the test dataset
#EXERCISE
#use your trained model, lm_fit, to predict price in the home_test dataset
#HINT: Create a tibble, home_predictions (OR ANY NAME), that contains the predicted selling prices of homes in the test dataset.
# Predict price for the held-out homes with lm_fit, the model trained on home_training.
# Do NOT refit on home_test here: training on the rows you then predict leaks the test
# data into the model and makes the evaluation over-optimistic.
# NOTE: the .pred values recorded in this file came from a model that had been refit on
# home_test, so they will differ from what this code now produces.
home_predictions <- predict(lm_fit,
                            new_data = home_test)
home_predictions
## # A tibble: 6,486 × 1
## .pred
## <dbl>
## 1 515710.
## 2 372928.
## 3 307803.
## 4 787548.
## 5 620764.
## 6 612415.
## 7 710341.
## 8 330291.
## 9 510257.
## 10 495966.
## # … with 6,476 more rows
# Put the actual price and the two predictors from home_test next to the predicted prices (.pred)
home_test_results <- bind_cols(
  select(home_test, price, yr_built, sqft_living),
  home_predictions
)
# Show the combined tibble
home_test_results
## # A tibble: 6,486 × 4
## price yr_built sqft_living .pred
## <dbl> <dbl> <dbl> <dbl>
## 1 604000 1965 1960 515710.
## 2 257500 1995 1715 372928.
## 3 400000 1977 1370 307803.
## 4 650000 1979 2950 787548.
## 5 329000 1985 2450 620764.
## 6 719000 2005 2570 612415.
## 7 687500 1929 2330 710341.
## 8 535000 1929 1090 330291.
## 9 322500 1981 2060 510257.
## 10 550000 1933 1660 495966.
## # … with 6,476 more rows
#Congratulations!
# You have trained a linear regression model and used it to predict the prices of homes in the test dataset! The model only used two predictor variables, but the predicted values in the .pred column seem reasonable!
#NEXT WEEK
#Evaluating model performance