Viewing The Data

library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.4.0     ✔ purrr   1.0.1
## ✔ tibble  3.1.8     ✔ dplyr   1.1.0
## ✔ tidyr   1.3.0     ✔ stringr 1.5.0
## ✔ readr   2.1.3     ✔ forcats 1.0.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
library(corrplot)
## corrplot 0.92 loaded
library(rpart.plot)
## Warning: package 'rpart.plot' was built under R version 4.2.3
## Loading required package: rpart
## Warning: package 'rpart' was built under R version 4.2.3
Info <- read.csv("https://raw.githubusercontent.com/AldataSci/FinalProject-621-/main/kc_house_data.csv")

str(Info)
## 'data.frame':    21613 obs. of  21 variables:
##  $ id           : num  7.13e+09 6.41e+09 5.63e+09 2.49e+09 1.95e+09 ...
##  $ date         : chr  "20141013T000000" "20141209T000000" "20150225T000000" "20141209T000000" ...
##  $ price        : num  221900 538000 180000 604000 510000 ...
##  $ bedrooms     : int  3 3 2 4 3 4 3 3 3 3 ...
##  $ bathrooms    : num  1 2.25 1 3 2 4.5 2.25 1.5 1 2.5 ...
##  $ sqft_living  : int  1180 2570 770 1960 1680 5420 1715 1060 1780 1890 ...
##  $ sqft_lot     : int  5650 7242 10000 5000 8080 101930 6819 9711 7470 6560 ...
##  $ floors       : num  1 2 1 1 1 1 2 1 1 2 ...
##  $ waterfront   : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ view         : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ condition    : int  3 3 3 5 3 3 3 3 3 3 ...
##  $ grade        : int  7 7 6 7 8 11 7 7 7 7 ...
##  $ sqft_above   : int  1180 2170 770 1050 1680 3890 1715 1060 1050 1890 ...
##  $ sqft_basement: int  0 400 0 910 0 1530 0 0 730 0 ...
##  $ yr_built     : int  1955 1951 1933 1965 1987 2001 1995 1963 1960 2003 ...
##  $ yr_renovated : int  0 1991 0 0 0 0 0 0 0 0 ...
##  $ zipcode      : int  98178 98125 98028 98136 98074 98053 98003 98198 98146 98038 ...
##  $ lat          : num  47.5 47.7 47.7 47.5 47.6 ...
##  $ long         : num  -122 -122 -122 -122 -122 ...
##  $ sqft_living15: int  1340 1690 2720 1360 1800 4760 2238 1650 1780 2390 ...
##  $ sqft_lot15   : int  5650 7639 8062 5000 7503 101930 6819 9711 8113 7570 ...

The housing dataset contains no empty values..

## There are no empty values.. 
sapply(Info,function(x){sum(is.na(x))})
##            id          date         price      bedrooms     bathrooms 
##             0             0             0             0             0 
##   sqft_living      sqft_lot        floors    waterfront          view 
##             0             0             0             0             0 
##     condition         grade    sqft_above sqft_basement      yr_built 
##             0             0             0             0             0 
##  yr_renovated       zipcode           lat          long sqft_living15 
##             0             0             0             0             0 
##    sqft_lot15 
##             0

Almost all of the columns are either numeric or integer columns but the date column is character and contains the date in a weird format.

summary(Info)
##        id                date               price            bedrooms     
##  Min.   :1.000e+06   Length:21613       Min.   :  75000   Min.   : 0.000  
##  1st Qu.:2.123e+09   Class :character   1st Qu.: 321950   1st Qu.: 3.000  
##  Median :3.905e+09   Mode  :character   Median : 450000   Median : 3.000  
##  Mean   :4.580e+09                      Mean   : 540088   Mean   : 3.371  
##  3rd Qu.:7.309e+09                      3rd Qu.: 645000   3rd Qu.: 4.000  
##  Max.   :9.900e+09                      Max.   :7700000   Max.   :33.000  
##    bathrooms      sqft_living       sqft_lot           floors     
##  Min.   :0.000   Min.   :  290   Min.   :    520   Min.   :1.000  
##  1st Qu.:1.750   1st Qu.: 1427   1st Qu.:   5040   1st Qu.:1.000  
##  Median :2.250   Median : 1910   Median :   7618   Median :1.500  
##  Mean   :2.115   Mean   : 2080   Mean   :  15107   Mean   :1.494  
##  3rd Qu.:2.500   3rd Qu.: 2550   3rd Qu.:  10688   3rd Qu.:2.000  
##  Max.   :8.000   Max.   :13540   Max.   :1651359   Max.   :3.500  
##    waterfront            view          condition         grade       
##  Min.   :0.000000   Min.   :0.0000   Min.   :1.000   Min.   : 1.000  
##  1st Qu.:0.000000   1st Qu.:0.0000   1st Qu.:3.000   1st Qu.: 7.000  
##  Median :0.000000   Median :0.0000   Median :3.000   Median : 7.000  
##  Mean   :0.007542   Mean   :0.2343   Mean   :3.409   Mean   : 7.657  
##  3rd Qu.:0.000000   3rd Qu.:0.0000   3rd Qu.:4.000   3rd Qu.: 8.000  
##  Max.   :1.000000   Max.   :4.0000   Max.   :5.000   Max.   :13.000  
##    sqft_above   sqft_basement       yr_built     yr_renovated   
##  Min.   : 290   Min.   :   0.0   Min.   :1900   Min.   :   0.0  
##  1st Qu.:1190   1st Qu.:   0.0   1st Qu.:1951   1st Qu.:   0.0  
##  Median :1560   Median :   0.0   Median :1975   Median :   0.0  
##  Mean   :1788   Mean   : 291.5   Mean   :1971   Mean   :  84.4  
##  3rd Qu.:2210   3rd Qu.: 560.0   3rd Qu.:1997   3rd Qu.:   0.0  
##  Max.   :9410   Max.   :4820.0   Max.   :2015   Max.   :2015.0  
##     zipcode           lat             long        sqft_living15 
##  Min.   :98001   Min.   :47.16   Min.   :-122.5   Min.   : 399  
##  1st Qu.:98033   1st Qu.:47.47   1st Qu.:-122.3   1st Qu.:1490  
##  Median :98065   Median :47.57   Median :-122.2   Median :1840  
##  Mean   :98078   Mean   :47.56   Mean   :-122.2   Mean   :1987  
##  3rd Qu.:98118   3rd Qu.:47.68   3rd Qu.:-122.1   3rd Qu.:2360  
##  Max.   :98199   Max.   :47.78   Max.   :-121.3   Max.   :6210  
##    sqft_lot15    
##  Min.   :   651  
##  1st Qu.:  5100  
##  Median :  7620  
##  Mean   : 12768  
##  3rd Qu.: 10083  
##  Max.   :871200

Visualizing the data we can see the house prices are skewed so we log transformed the price.

ggplot(Info,aes(x=price,color="lightblue")) + geom_histogram(fill="lightblue") +
scale_x_log10()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Data_Cleaning

In this section I’ve cleaned the date column and remove unecessary characters and put it in a proper time-date format.

library(stringr)

Info$date <- str_replace(Info$date,"T000000","")
head(Info$date)
## [1] "20141013" "20141209" "20150225" "20141209" "20150218" "20140512"
## okay now convert it to date

Info$date <- as.numeric(Info$date)

Info$date <- as.Date(as.character(Info$date),format = "%Y%m%d")
head(Info$date)
## [1] "2014-10-13" "2014-12-09" "2015-02-25" "2014-12-09" "2015-02-18"
## [6] "2014-05-12"
## Order the date.. Sort the Date Frame by the date column.. and call it by Info 2.. 

Info2 <- Info[order(Info$date),]
head(Info2)
##              id       date   price bedrooms bathrooms sqft_living sqft_lot
## 173  1999700045 2014-05-02  313000        3      1.50        1340     7912
## 313  1860600135 2014-05-02 2384000        5      2.50        3650     9050
## 503  5467900070 2014-05-02  342000        3      2.00        1930    11947
## 776  4040800810 2014-05-02  420000        3      2.25        2000     8030
## 1042 7197300105 2014-05-02  550000        4      2.50        1940    10500
## 1394 5100401414 2014-05-02  490000        2      1.00         880     6380
##      floors waterfront view condition grade sqft_above sqft_basement yr_built
## 173     1.5          0    0         3     7       1340             0     1955
## 313     2.0          0    4         5    10       3370           280     1921
## 503     1.0          0    0         4     8       1930             0     1966
## 776     1.0          0    0         4     8       1000          1000     1963
## 1042    1.0          0    0         4     7       1140           800     1976
## 1394    1.0          0    0         3     7        880             0     1938
##      yr_renovated zipcode     lat     long sqft_living15 sqft_lot15
## 173             0   98133 47.7658 -122.339          1480       7940
## 313             0   98119 47.6345 -122.367          2880       5400
## 503             0   98042 47.3672 -122.151          2200      12825
## 776             0   98008 47.6188 -122.114          2070       8250
## 1042            0   98052 47.6830 -122.114          2200      10500
## 1394         1994   98115 47.6924 -122.322          1340       6380
## Omit the id column since it is irrelevant to predicting the price I assume..

head(Info2 %>%
  select(-id) -> Info2)
##            date   price bedrooms bathrooms sqft_living sqft_lot floors
## 173  2014-05-02  313000        3      1.50        1340     7912    1.5
## 313  2014-05-02 2384000        5      2.50        3650     9050    2.0
## 503  2014-05-02  342000        3      2.00        1930    11947    1.0
## 776  2014-05-02  420000        3      2.25        2000     8030    1.0
## 1042 2014-05-02  550000        4      2.50        1940    10500    1.0
## 1394 2014-05-02  490000        2      1.00         880     6380    1.0
##      waterfront view condition grade sqft_above sqft_basement yr_built
## 173           0    0         3     7       1340             0     1955
## 313           0    4         5    10       3370           280     1921
## 503           0    0         4     8       1930             0     1966
## 776           0    0         4     8       1000          1000     1963
## 1042          0    0         4     7       1140           800     1976
## 1394          0    0         3     7        880             0     1938
##      yr_renovated zipcode     lat     long sqft_living15 sqft_lot15
## 173             0   98133 47.7658 -122.339          1480       7940
## 313             0   98119 47.6345 -122.367          2880       5400
## 503             0   98042 47.3672 -122.151          2200      12825
## 776             0   98008 47.6188 -122.114          2070       8250
## 1042            0   98052 47.6830 -122.114          2200      10500
## 1394         1994   98115 47.6924 -122.322          1340       6380

I have visualized the various distribution of the many columns in the dataset, we can see that the data points are mostly skewed to the right.

## look at the distribution with the density plot.. 
Info2 %>%
  keep(is.numeric) %>%                     # Keep only numeric columns
  gather() %>%                             # Convert to key-value pairs
  ggplot(aes(value)) +                     # Plot the values
    facet_wrap(~ key, scales = "free") +   # In separate panels
    geom_density(color = "red")

## convert it to a long format..
Info3 <- gather(Info2)
## Warning: attributes are not identical across measure variables; they will be
## dropped
ggplot(Info3,aes(x=key,y=value)) + geom_boxplot(fill="lightblue") +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5)) +
  ylim(c(0,2e+06))
## Warning: Removed 21811 rows containing non-finite values (`stat_boxplot()`).

## zoom on the boxplot further
ggplot(Info3,aes(x=key,y=value)) + geom_boxplot(fill="lightblue") +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5)) +
  ylim(c(0,15000))
## Warning: Removed 92340 rows containing non-finite values (`stat_boxplot()`).


Looking at the correlation matrix we can see that some columns are correlated with some of the other columns. For price we see that bathroom,sqft_living,grade and sqft_above may play a role in determining the price of a house.

## Correlation Matrix
Info_num <- Info2[,sapply(Info2,is.numeric)]

cor_matrix <- cor(Info_num)

## addCoef and number.cex to adjust the text.. 
corrplot(cor_matrix,method="shade",type="upper", addCoef.col = TRUE,tl.cex= 0.5,number.cex = 0.5)

Model Creation With Tidy Models..

Here we will use tidymodels to create a bunch of decision tree models to find the best model to predict the house prices.

library(tidymodels)
## Warning: package 'tidymodels' was built under R version 4.2.3
## ── Attaching packages ────────────────────────────────────── tidymodels 1.0.0 ──
## ✔ broom        1.0.3     ✔ rsample      1.1.1
## ✔ dials        1.1.0     ✔ tune         1.0.1
## ✔ infer        1.0.4     ✔ workflows    1.1.3
## ✔ modeldata    1.1.0     ✔ workflowsets 1.0.0
## ✔ parsnip      1.0.3     ✔ yardstick    1.1.0
## ✔ recipes      1.0.5
## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ scales::discard() masks purrr::discard()
## ✖ dplyr::filter()   masks stats::filter()
## ✖ recipes::fixed()  masks stringr::fixed()
## ✖ dplyr::lag()      masks stats::lag()
## ✖ dials::prune()    masks rpart::prune()
## ✖ yardstick::spec() masks readr::spec()
## ✖ recipes::step()   masks stats::step()
## • Search for functions across packages at https://www.tidymodels.org/find/
## used startifed sampling as Julia Silge did in her video to prevent class imbalance.. (80/20 split is recommended..)
set.seed(234)
Info_split <- initial_split(Info2,strata = price)
Info_train <- training(Info_split)
Info_test <- testing(Info_split)


## created the startifed sampling.. 
set.seed(345)
Info_folds <- vfold_cv(Info_train,strata = price)
# how complicated the tree will be, how deep the tree will be, how many data points do we need to split the nodes.. 
tree_spec <- decision_tree(
  cost_complexity = tune(),
  tree_depth = tune(),
  min_n = tune()
) %>%
  set_engine("rpart") %>%
  set_mode("regression")

tree_spec
## Decision Tree Model Specification (regression)
## 
## Main Arguments:
##   cost_complexity = tune()
##   tree_depth = tune()
##   min_n = tune()
## 
## Computational engine: rpart
#Try all same combinations of the same values use helper functions to dial the first value in the tibble is the amount of models we are trying

# set of parameters we are going to try on the model
tree_grid <- grid_regular(cost_complexity(),tree_depth(),min_n(),levels=4)
## creating the workflow.
doParallel::registerDoParallel()
set.seed(456)
Info_rs <- tune_grid(
  tree_spec,
  price~.,
  resamples = Info_folds,
  grid = tree_grid,
  metrics = metric_set(rmse,rsq)
)

Info_rs
## # Tuning results
## # 10-fold cross-validation using stratification 
## # A tibble: 10 × 4
##    splits               id     .metrics           .notes          
##    <list>               <chr>  <list>             <list>          
##  1 <split [14586/1623]> Fold01 <tibble [128 × 7]> <tibble [0 × 3]>
##  2 <split [14586/1623]> Fold02 <tibble [128 × 7]> <tibble [0 × 3]>
##  3 <split [14587/1622]> Fold03 <tibble [128 × 7]> <tibble [0 × 3]>
##  4 <split [14588/1621]> Fold04 <tibble [128 × 7]> <tibble [0 × 3]>
##  5 <split [14588/1621]> Fold05 <tibble [128 × 7]> <tibble [0 × 3]>
##  6 <split [14589/1620]> Fold06 <tibble [128 × 7]> <tibble [0 × 3]>
##  7 <split [14589/1620]> Fold07 <tibble [128 × 7]> <tibble [0 × 3]>
##  8 <split [14589/1620]> Fold08 <tibble [128 × 7]> <tibble [0 × 3]>
##  9 <split [14589/1620]> Fold09 <tibble [128 × 7]> <tibble [0 × 3]>
## 10 <split [14590/1619]> Fold10 <tibble [128 × 7]> <tibble [0 × 3]>

Collect Metrics

In the plot we see that a tree with low depth has bad rmse and rsq, while a tree with better depth and more nodes has higher rmse and rsq. We can choose a decision tree somewhere in the middle to prevent overfitting.

autoplot(Info_rs) + theme_light()

head(collect_metrics(Info_rs))
## # A tibble: 6 × 9
##   cost_complexity tree_depth min_n .metric .esti…¹    mean     n std_err .config
##             <dbl>      <int> <int> <chr>   <chr>     <dbl> <int>   <dbl> <chr>  
## 1    0.0000000001          1     2 rmse    standa… 3.02e+5    10 7.46e+3 Prepro…
## 2    0.0000000001          1     2 rsq     standa… 3.25e-1    10 7.97e-3 Prepro…
## 3    0.0000001             1     2 rmse    standa… 3.02e+5    10 7.46e+3 Prepro…
## 4    0.0000001             1     2 rsq     standa… 3.25e-1    10 7.97e-3 Prepro…
## 5    0.0001                1     2 rmse    standa… 3.02e+5    10 7.46e+3 Prepro…
## 6    0.0001                1     2 rsq     standa… 3.25e-1    10 7.97e-3 Prepro…
## # … with abbreviated variable name ¹​.estimator
show_best(Info_rs,"rmse")
## # A tibble: 5 × 9
##   cost_complexity tree_depth min_n .metric .estim…¹   mean     n std_err .config
##             <dbl>      <int> <int> <chr>   <chr>     <dbl> <int>   <dbl> <chr>  
## 1    0.0000000001         15    14 rmse    standard 1.68e5    10   5999. Prepro…
## 2    0.0000001            15    14 rmse    standard 1.68e5    10   5999. Prepro…
## 3    0.0001               15    14 rmse    standard 1.68e5    10   6438. Prepro…
## 4    0.0000000001         10    14 rmse    standard 1.69e5    10   6134. Prepro…
## 5    0.0000001            10    14 rmse    standard 1.69e5    10   6134. Prepro…
## # … with abbreviated variable name ¹​.estimator
# These are the values that maxmizes the rmse in this dataset of house prices.. 
select_best(Info_rs,"rmse")
## # A tibble: 1 × 4
##   cost_complexity tree_depth min_n .config              
##             <dbl>      <int> <int> <chr>                
## 1    0.0000000001         15    14 Preprocessor1_Model29
show_best(Info_rs,"rsq")
## # A tibble: 5 × 9
##   cost_complexity tree_depth min_n .metric .estima…¹  mean     n std_err .config
##             <dbl>      <int> <int> <chr>   <chr>     <dbl> <int>   <dbl> <chr>  
## 1    0.0000000001         15    14 rsq     standard  0.795    10  0.0122 Prepro…
## 2    0.0000001            15    14 rsq     standard  0.795    10  0.0122 Prepro…
## 3    0.0001               15    14 rsq     standard  0.793    10  0.0136 Prepro…
## 4    0.0000000001         10    14 rsq     standard  0.791    10  0.0125 Prepro…
## 5    0.0000001            10    14 rsq     standard  0.791    10  0.0125 Prepro…
## # … with abbreviated variable name ¹​.estimator

Final Tree Model:

After viewing many parameters of the model we can update our workflow object with the values from select_best for the rmse.

# tree specification that has values.. choose the best tree model.. 
final_info <- finalize_model(tree_spec,select_best(Info_rs,"rmse"))

Fitting

final_fit <- fit(final_info,price~.,Info_train)
final_rs <- last_fit(final_info,price~.,Info_split)
# Metrics collected in the testing data.. 
collect_metrics(final_rs)
## # A tibble: 2 × 4
##   .metric .estimator  .estimate .config             
##   <chr>   <chr>           <dbl> <chr>               
## 1 rmse    standard   161330.    Preprocessor1_Model1
## 2 rsq     standard        0.807 Preprocessor1_Model1

Plotting

Here we can see which predictors were important in the creation of the final model.

library(vip)
## Warning: package 'vip' was built under R version 4.2.3
## 
## Attaching package: 'vip'
## The following object is masked from 'package:utils':
## 
##     vi
final_fit %>%
  vip(geom = "col",aesthetics = list(fill = "lightblue",alpha = 0.8)) +
  scale_y_continuous(expand = c(0,0))


Actual Price and Expected Price

There may be issues of overplotting in the plot.

collect_predictions(final_rs) %>%
  ggplot(aes(price,.pred)) + 
  geom_abline(slope = 1,lty = 2,alpha= 0.5,color="red") + 
  geom_point(alpha = 0.6, color = "midnightblue")

Write to CSV

Final_predictions<- as.data.frame(cbind(collect_predictions(final_rs)$price,collect_predictions(final_rs)$.pred))
colnames(Final_predictions) <- c("Price","Predictions")

head(Final_predictions)
##    Price Predictions
## 1 342000    241400.0
## 2 482000    455904.5
## 3 242500    269891.7
## 4 435000    431677.3
## 5 626000    505556.7
## 6 675000    624349.3
write_csv(Final_predictions,"C:\\Users\\alhaq\\OneDrive\\Desktop\\Data 621\\predictions.csv")

Thank you!