# Dataset: House sales in King County, USA. Task: predict house price using regression.

#This data contains information on homes sold in the Seattle, Washington area between 2015 and 2016.
#create training and test datasets from the home_sales data

library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✔ ggplot2 3.3.6     ✔ purrr   0.3.4
## ✔ tibble  3.1.7     ✔ dplyr   1.0.9
## ✔ tidyr   1.2.0     ✔ stringr 1.4.0
## ✔ readr   2.1.2     ✔ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
library(knitr)
library(tidymodels)
## ── Attaching packages ────────────────────────────────────── tidymodels 0.2.0 ──
## ✔ broom        1.0.0     ✔ rsample      1.0.0
## ✔ dials        1.0.0     ✔ tune         1.0.0
## ✔ infer        1.0.2     ✔ workflows    1.0.0
## ✔ modeldata    1.0.0     ✔ workflowsets 0.2.1
## ✔ parsnip      1.0.0     ✔ yardstick    1.0.0
## ✔ recipes      1.0.1
## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ scales::discard() masks purrr::discard()
## ✖ dplyr::filter()   masks stats::filter()
## ✖ recipes::fixed()  masks stringr::fixed()
## ✖ dplyr::lag()      masks stats::lag()
## ✖ yardstick::spec() masks readr::spec()
## ✖ recipes::step()   masks stats::step()
## • Learn how to get started at https://www.tidymodels.org/start/
#home_sales <- read_rds("data/home_sales.rds")
#telecom_df <- read_rds("data/telecom_df.rds")
#loans_df <- read_rds("data/loan_df.rds")

library(readr)
# Load the King County house sales data from a local CSV file.
# NOTE(review): this absolute path is specific to one machine; point it at
# your own copy of kc_house_data.csv when running the script elsewhere.
home_sales <- read_csv("/home/kn/Documents/TIDY/Modelling_tidymodels/kc_house_data.csv")
## Rows: 21613 Columns: 21
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr   (1): id
## dbl  (19): price, bedrooms, bathrooms, sqft_living, sqft_lot, floors, waterf...
## dttm  (1): date
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

# Open the spreadsheet-style viewer only in an interactive session: View()
# needs a GUI data viewer, which may be unavailable when the script is run
# with Rscript or knitted.
if (interactive()) {
  View(home_sales)
}
#Create an rsample object, home_split, that contains the instructions for randomly splitting the home_sales data into a training and test dataset

#Allocate 70% of the data into training and stratify the results by price

# Fix the random-number seed so the random split, and every result derived
# from it (row counts, coefficients, predictions), is the same on each run.
# NOTE: output recorded elsewhere in this file came from an unseeded run, so
# the exact numbers will not match.
set.seed(123)

# rsample split object: 70% of the rows go to training, stratified by price
# so both partitions have a similar price distribution.
home_split <- initial_split(home_sales,
                            prop = 0.7,
                            strata = price)
# Extract the training partition of home_split as home_training.
home_training <- training(home_split)
# Extract the test partition of home_split as home_test; testing() is the
# counterpart of training() for generating the test dataset.
home_test <- testing(home_split)
# Check how many rows ended up in the test dataset.
home_test %>%
  nrow()
## [1] 6486
# EXERCISE FOR YOU: check the number of rows in the training dataset.
home_training %>%
  nrow()
## [1] 15127
# Number of rows in the initial dataset, just for comparison.
# With prop = 0.7 a little more data lands in the test set than with
# initial_split()'s default of 0.75, which gives the model evaluation step
# more rows to work with.

home_sales %>%
  nrow()
## [1] 21613
#What is stratified sampling in R?
#One commonly used sampling method is stratified random sampling, in which a population is split into groups and a certain number of members from each group are randomly selected to be included in the sample.
#EXERCISE ON DATA WRANGLING 
# calculate summary statistics for the price variable in the training and test datasets. The home_training and home_test tibbles

# Price distribution in the training data: min, max, mean and standard
# deviation, returned as min_price, max_price, mean_price and sd_price.
home_training %>%
  summarise(across(price,
                   list(min = min, max = max, mean = mean, sd = sd),
                   .names = "{.fn}_{.col}"))
## # A tibble: 1 × 4
##   min_price max_price mean_price sd_price
##       <dbl>     <dbl>      <dbl>    <dbl>
## 1     75000   7700000    540275.  366108.
# The same four statistics for the test data, to confirm the two partitions
# have similar price distributions.
home_test %>%
  summarise(across(price,
                   list(min = min, max = max, mean = mean, sd = sd),
                   .names = "{.fn}_{.col}"))
## # A tibble: 1 × 4
##   min_price max_price mean_price sd_price
##       <dbl>     <dbl>      <dbl>    <dbl>
## 1     78000   6885000    539652.  369521.
#In this exercise, you will define a parsnip linear regression object and train your model to predict price using yr_built and sqft_living as predictor variables from the home_sales data.

# Model specification: a linear regression, using the 'lm' engine, in
# regression mode. Nothing is fitted yet.
linear_model <- set_mode(
  set_engine(linear_reg(), 'lm'),
  'regression'
)
# Train the model on home_training to predict price from yr_built and
# sqft_living.

lm_fit <- fit(
  linear_model,
  price ~ yr_built + sqft_living,
  data = home_training
)

#EXERCISE
# Show the fitted parsnip model: the underlying lm() call and the estimated
# coefficients.

print(x = lm_fit)
## parsnip model object
## 
## 
## Call:
## stats::lm(formula = price ~ yr_built + sqft_living, data = data)
## 
## Coefficients:
## (Intercept)     yr_built  sqft_living  
##   4630850.9      -2395.7        303.7
#Excellent work! 
#You have defined your model with linear_reg() and trained it to predict price using yr_built and sqft_living. 

#Printing a parsnip model fit object displays useful model information, such as the training time, model formula used during training, and the estimated model parameters
# Explore the model's estimated parameters as a tibble: one row per term,
# with its estimate, standard error, test statistic and p-value.
lm_fit %>%
  tidy()
## # A tibble: 3 × 5
##   term        estimate std.error statistic   p.value
##   <chr>          <dbl>     <dbl>     <dbl>     <dbl>
## 1 (Intercept) 4630851. 143356.        32.3 1.95e-221
## 2 yr_built      -2396.     73.5      -32.6 1.68e-225
## 3 sqft_living     304.      2.35     129.  0
#WHAT DO WE OBSERVE FROM THE TIBBLE??
#Since sqft_living has a positive estimated parameter, the price of homes increases with the square footage.

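#Conversely, yr_built has a negative estimated parameter. yr_built is the year of construction (a larger value means a newer home), so for a given sqft_living, newer homes tend to have lower predicted prices and older homes higher ones.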
#Now let us predict home prices
#Before you can evaluate model performance, you must add your predictions to the test dataset

#EXERCISE
#use your trained model, lm_fit, to predict price in the home_test dataset
#HINT: Create a tibble, home_predictions (OR ANY NAME), that contains the predicted selling prices of homes in the test dataset.

# Predict prices for the test set with lm_fit, the model that was trained on
# home_training. The model must NOT be re-fit on home_test here: fitting on
# the same rows that are used for evaluation leaks the test data into the
# model, so the predictions would no longer show how it performs on homes it
# has never seen.
# NOTE: the recorded output that follows came from a run that re-fit the model
# on the test set, so the exact .pred values will differ.
home_predictions <- predict(lm_fit,
                            new_data = home_test)

# One row per test-set home, with a single .pred column of predicted prices.
home_predictions
## # A tibble: 6,486 × 1
##      .pred
##      <dbl>
##  1 515710.
##  2 372928.
##  3 307803.
##  4 787548.
##  5 620764.
##  6 612415.
##  7 710341.
##  8 330291.
##  9 510257.
## 10 495966.
## # … with 6,476 more rows
# Put the actual price and the two predictors from the test dataset side by
# side with the predicted home prices.

home_test_results <- bind_cols(
  select(home_test, price, yr_built, sqft_living),
  home_predictions
)

# View the results.
home_test_results
## # A tibble: 6,486 × 4
##     price yr_built sqft_living   .pred
##     <dbl>    <dbl>       <dbl>   <dbl>
##  1 604000     1965        1960 515710.
##  2 257500     1995        1715 372928.
##  3 400000     1977        1370 307803.
##  4 650000     1979        2950 787548.
##  5 329000     1985        2450 620764.
##  6 719000     2005        2570 612415.
##  7 687500     1929        2330 710341.
##  8 535000     1929        1090 330291.
##  9 322500     1981        2060 510257.
## 10 550000     1933        1660 495966.
## # … with 6,476 more rows
                        #Congratulations!

# You have trained a linear regression model and used it to predict the prices of homes in the test dataset! The model only used two predictor variables, but the predicted values in the .pred column seem reasonable!
#NEXT WEEK
                  #Evaluating model performance