library(tidyverse)## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.4.0 ✔ purrr 1.0.1
## ✔ tibble 3.1.8 ✔ dplyr 1.1.0
## ✔ tidyr 1.3.0 ✔ stringr 1.5.0
## ✔ readr 2.1.3 ✔ forcats 1.0.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
library(corrplot)## corrplot 0.92 loaded
library(rpart.plot)## Warning: package 'rpart.plot' was built under R version 4.2.3
## Loading required package: rpart
## Warning: package 'rpart' was built under R version 4.2.3
Info <- read.csv("https://raw.githubusercontent.com/AldataSci/FinalProject-621-/main/kc_house_data.csv")
str(Info)## 'data.frame': 21613 obs. of 21 variables:
## $ id : num 7.13e+09 6.41e+09 5.63e+09 2.49e+09 1.95e+09 ...
## $ date : chr "20141013T000000" "20141209T000000" "20150225T000000" "20141209T000000" ...
## $ price : num 221900 538000 180000 604000 510000 ...
## $ bedrooms : int 3 3 2 4 3 4 3 3 3 3 ...
## $ bathrooms : num 1 2.25 1 3 2 4.5 2.25 1.5 1 2.5 ...
## $ sqft_living : int 1180 2570 770 1960 1680 5420 1715 1060 1780 1890 ...
## $ sqft_lot : int 5650 7242 10000 5000 8080 101930 6819 9711 7470 6560 ...
## $ floors : num 1 2 1 1 1 1 2 1 1 2 ...
## $ waterfront : int 0 0 0 0 0 0 0 0 0 0 ...
## $ view : int 0 0 0 0 0 0 0 0 0 0 ...
## $ condition : int 3 3 3 5 3 3 3 3 3 3 ...
## $ grade : int 7 7 6 7 8 11 7 7 7 7 ...
## $ sqft_above : int 1180 2170 770 1050 1680 3890 1715 1060 1050 1890 ...
## $ sqft_basement: int 0 400 0 910 0 1530 0 0 730 0 ...
## $ yr_built : int 1955 1951 1933 1965 1987 2001 1995 1963 1960 2003 ...
## $ yr_renovated : int 0 1991 0 0 0 0 0 0 0 0 ...
## $ zipcode : int 98178 98125 98028 98136 98074 98053 98003 98198 98146 98038 ...
## $ lat : num 47.5 47.7 47.7 47.5 47.6 ...
## $ long : num -122 -122 -122 -122 -122 ...
## $ sqft_living15: int 1340 1690 2720 1360 1800 4760 2238 1650 1780 2390 ...
## $ sqft_lot15 : int 5650 7639 8062 5000 7503 101930 6819 9711 8113 7570 ...
The housing dataset contains no missing values, as the check below confirms.
## count the missing values in each column
sapply(Info,function(x){sum(is.na(x))})## id date price bedrooms bathrooms
## 0 0 0 0 0
## sqft_living sqft_lot floors waterfront view
## 0 0 0 0 0
## condition grade sqft_above sqft_basement yr_built
## 0 0 0 0 0
## yr_renovated zipcode lat long sqft_living15
## 0 0 0 0 0
## sqft_lot15
## 0
Almost all of the columns are numeric or integer, but the date column is a character vector that stores the date in an awkward timestamp format (e.g. "20141013T000000").
summary(Info)## id date price bedrooms
## Min. :1.000e+06 Length:21613 Min. : 75000 Min. : 0.000
## 1st Qu.:2.123e+09 Class :character 1st Qu.: 321950 1st Qu.: 3.000
## Median :3.905e+09 Mode :character Median : 450000 Median : 3.000
## Mean :4.580e+09 Mean : 540088 Mean : 3.371
## 3rd Qu.:7.309e+09 3rd Qu.: 645000 3rd Qu.: 4.000
## Max. :9.900e+09 Max. :7700000 Max. :33.000
## bathrooms sqft_living sqft_lot floors
## Min. :0.000 Min. : 290 Min. : 520 Min. :1.000
## 1st Qu.:1.750 1st Qu.: 1427 1st Qu.: 5040 1st Qu.:1.000
## Median :2.250 Median : 1910 Median : 7618 Median :1.500
## Mean :2.115 Mean : 2080 Mean : 15107 Mean :1.494
## 3rd Qu.:2.500 3rd Qu.: 2550 3rd Qu.: 10688 3rd Qu.:2.000
## Max. :8.000 Max. :13540 Max. :1651359 Max. :3.500
## waterfront view condition grade
## Min. :0.000000 Min. :0.0000 Min. :1.000 Min. : 1.000
## 1st Qu.:0.000000 1st Qu.:0.0000 1st Qu.:3.000 1st Qu.: 7.000
## Median :0.000000 Median :0.0000 Median :3.000 Median : 7.000
## Mean :0.007542 Mean :0.2343 Mean :3.409 Mean : 7.657
## 3rd Qu.:0.000000 3rd Qu.:0.0000 3rd Qu.:4.000 3rd Qu.: 8.000
## Max. :1.000000 Max. :4.0000 Max. :5.000 Max. :13.000
## sqft_above sqft_basement yr_built yr_renovated
## Min. : 290 Min. : 0.0 Min. :1900 Min. : 0.0
## 1st Qu.:1190 1st Qu.: 0.0 1st Qu.:1951 1st Qu.: 0.0
## Median :1560 Median : 0.0 Median :1975 Median : 0.0
## Mean :1788 Mean : 291.5 Mean :1971 Mean : 84.4
## 3rd Qu.:2210 3rd Qu.: 560.0 3rd Qu.:1997 3rd Qu.: 0.0
## Max. :9410 Max. :4820.0 Max. :2015 Max. :2015.0
## zipcode lat long sqft_living15
## Min. :98001 Min. :47.16 Min. :-122.5 Min. : 399
## 1st Qu.:98033 1st Qu.:47.47 1st Qu.:-122.3 1st Qu.:1490
## Median :98065 Median :47.57 Median :-122.2 Median :1840
## Mean :98078 Mean :47.56 Mean :-122.2 Mean :1987
## 3rd Qu.:98118 3rd Qu.:47.68 3rd Qu.:-122.1 3rd Qu.:2360
## Max. :98199 Max. :47.78 Max. :-121.3 Max. :6210
## sqft_lot15
## Min. : 651
## 1st Qu.: 5100
## Median : 7620
## Mean : 12768
## 3rd Qu.: 10083
## Max. :871200
Visualizing the data, we can see that house prices are heavily right-skewed, so we display price on a log10 scale.
ggplot(Info, aes(x = price)) + geom_histogram(fill = "lightblue") +
  scale_x_log10()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
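The histogram above only changes the display axis; the price column itself is left untouched. If we wanted to model on the log scale instead, an explicit transformed column could be added. This is only a sketch and is not used in the rest of the analysis (log_price is a hypothetical column name).
## sketch: add a log10-transformed price column and plot it directly
Info %>%
  mutate(log_price = log10(price)) %>%
  ggplot(aes(x = log_price)) +
  geom_histogram(fill = "lightblue", bins = 30)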
In this section I clean the date column, removing the unnecessary trailing characters and converting it to a proper Date format.
library(stringr)
Info$date <- str_replace(Info$date,"T000000","")
head(Info$date)## [1] "20141013" "20141209" "20150225" "20141209" "20150218" "20140512"
## now convert the cleaned string to a Date
Info$date <- as.Date(Info$date, format = "%Y%m%d")
head(Info$date)## [1] "2014-10-13" "2014-12-09" "2015-02-25" "2014-12-09" "2015-02-18"
## [6] "2014-05-12"
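For reference, strptime-style parsing stops at the end of the supplied format and ignores any trailing text, so the raw strings could have been converted in a single step without the str_replace() call. This is only a sketch (raw_date stands in for an uncleaned value) and is not how the column was processed above.
## sketch: %Y%m%d matches the leading eight digits and the trailing "T000000" is ignored
raw_date <- "20141013T000000"
as.Date(raw_date, format = "%Y%m%d")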
## sort the data frame by the date column and store the result as Info2
Info2 <- Info[order(Info$date),]
head(Info2)## id date price bedrooms bathrooms sqft_living sqft_lot
## 173 1999700045 2014-05-02 313000 3 1.50 1340 7912
## 313 1860600135 2014-05-02 2384000 5 2.50 3650 9050
## 503 5467900070 2014-05-02 342000 3 2.00 1930 11947
## 776 4040800810 2014-05-02 420000 3 2.25 2000 8030
## 1042 7197300105 2014-05-02 550000 4 2.50 1940 10500
## 1394 5100401414 2014-05-02 490000 2 1.00 880 6380
## floors waterfront view condition grade sqft_above sqft_basement yr_built
## 173 1.5 0 0 3 7 1340 0 1955
## 313 2.0 0 4 5 10 3370 280 1921
## 503 1.0 0 0 4 8 1930 0 1966
## 776 1.0 0 0 4 8 1000 1000 1963
## 1042 1.0 0 0 4 7 1140 800 1976
## 1394 1.0 0 0 3 7 880 0 1938
## yr_renovated zipcode lat long sqft_living15 sqft_lot15
## 173 0 98133 47.7658 -122.339 1480 7940
## 313 0 98119 47.6345 -122.367 2880 5400
## 503 0 98042 47.3672 -122.151 2200 12825
## 776 0 98008 47.6188 -122.114 2070 8250
## 1042 0 98052 47.6830 -122.114 2200 10500
## 1394 1994 98115 47.6924 -122.322 1340 6380
## drop the id column since it is not useful for predicting price
Info2 <- Info2 %>% select(-id)
head(Info2)
## date price bedrooms bathrooms sqft_living sqft_lot floors
## 173 2014-05-02 313000 3 1.50 1340 7912 1.5
## 313 2014-05-02 2384000 5 2.50 3650 9050 2.0
## 503 2014-05-02 342000 3 2.00 1930 11947 1.0
## 776 2014-05-02 420000 3 2.25 2000 8030 1.0
## 1042 2014-05-02 550000 4 2.50 1940 10500 1.0
## 1394 2014-05-02 490000 2 1.00 880 6380 1.0
## waterfront view condition grade sqft_above sqft_basement yr_built
## 173 0 0 3 7 1340 0 1955
## 313 0 4 5 10 3370 280 1921
## 503 0 0 4 8 1930 0 1966
## 776 0 0 4 8 1000 1000 1963
## 1042 0 0 4 7 1140 800 1976
## 1394 0 0 3 7 880 0 1938
## yr_renovated zipcode lat long sqft_living15 sqft_lot15
## 173 0 98133 47.7658 -122.339 1480 7940
## 313 0 98119 47.6345 -122.367 2880 5400
## 503 0 98042 47.3672 -122.151 2200 12825
## 776 0 98008 47.6188 -122.114 2070 8250
## 1042 0 98052 47.6830 -122.114 2200 10500
## 1394 1994 98115 47.6924 -122.322 1340 6380
Visualizing the distributions of the numeric columns, we can see that most of them are skewed to the right.
## look at the distribution with the density plot..
Info2 %>%
keep(is.numeric) %>% # Keep only numeric columns
gather() %>% # Convert to key-value pairs
ggplot(aes(value)) + # Plot the values
facet_wrap(~ key, scales = "free") + # In separate panels
geom_density(color = "red")
## convert it to a long format
Info3 <- gather(Info2)## Warning: attributes are not identical across measure variables; they will be
## dropped
ggplot(Info3,aes(x=key,y=value)) + geom_boxplot(fill="lightblue") +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5)) +
ylim(c(0,2e+06))## Warning: Removed 21811 rows containing non-finite values (`stat_boxplot()`).
## zoom in on the boxplots further
ggplot(Info3,aes(x=key,y=value)) + geom_boxplot(fill="lightblue") +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5)) +
ylim(c(0,15000))## Warning: Removed 92340 rows containing non-finite values (`stat_boxplot()`).
Looking at the correlation matrix, we can see that several columns are correlated with one another. For price, the variables bathrooms, sqft_living, grade, and sqft_above show the strongest positive correlations and likely play a role in determining the price of a house.
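As a quick numeric check of that claim, the correlations with price can be ranked directly; this is only a sketch that recomputes the same correlations used in the plot below (num_cols is a hypothetical name).
## sketch: rank the numeric columns by their correlation with price
num_cols <- Info2[, sapply(Info2, is.numeric)]
sort(cor(num_cols)[, "price"], decreasing = TRUE)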
## Correlation Matrix
Info_num <- Info2[,sapply(Info2,is.numeric)]
cor_matrix <- cor(Info_num)
## addCoef.col adds the coefficients; tl.cex and number.cex shrink the label and coefficient text
corrplot(cor_matrix, method = "shade", type = "upper", addCoef.col = "black", tl.cex = 0.5, number.cex = 0.5)
Here we use tidymodels to tune a set of decision tree models and find the one that best predicts house prices.
library(tidymodels)## Warning: package 'tidymodels' was built under R version 4.2.3
## ── Attaching packages ────────────────────────────────────── tidymodels 1.0.0 ──
## ✔ broom 1.0.3 ✔ rsample 1.1.1
## ✔ dials 1.1.0 ✔ tune 1.0.1
## ✔ infer 1.0.4 ✔ workflows 1.1.3
## ✔ modeldata 1.1.0 ✔ workflowsets 1.0.0
## ✔ parsnip 1.0.3 ✔ yardstick 1.1.0
## ✔ recipes 1.0.5
## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ scales::discard() masks purrr::discard()
## ✖ dplyr::filter() masks stats::filter()
## ✖ recipes::fixed() masks stringr::fixed()
## ✖ dplyr::lag() masks stats::lag()
## ✖ dials::prune() masks rpart::prune()
## ✖ yardstick::spec() masks readr::spec()
## ✖ recipes::step() masks stats::step()
## • Search for functions across packages at https://www.tidymodels.org/find/
## use stratified sampling, as Julia Silge did in her video, so the price distribution is similar in the training and testing sets (initial_split() keeps its default 75/25 split here)
set.seed(234)
Info_split <- initial_split(Info2,strata = price)
Info_train <- training(Info_split)
Info_test <- testing(Info_split)
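As a quick sanity check on the stratified split (a sketch, not part of the original workflow), the price quantiles in the two sets should be close to each other:
## sketch: compare the price distribution across the training and testing sets
quantile(Info_train$price, probs = c(0.25, 0.50, 0.75))
quantile(Info_test$price, probs = c(0.25, 0.50, 0.75))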
## create stratified 10-fold cross-validation resamples
set.seed(345)
Info_folds <- vfold_cv(Info_train, strata = price)
# tune three hyperparameters: cost complexity (how heavily the tree is pruned), tree depth, and the minimum number of data points required to split a node
tree_spec <- decision_tree(
cost_complexity = tune(),
tree_depth = tune(),
min_n = tune()
) %>%
set_engine("rpart") %>%
set_mode("regression")
tree_spec## Decision Tree Model Specification (regression)
##
## Main Arguments:
## cost_complexity = tune()
## tree_depth = tune()
## min_n = tune()
##
## Computational engine: rpart
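For context, the dials parameter objects passed to grid_regular() below come with default ranges, and printing them shows the values the grid will be drawn from. This is only an optional check, not a required step.
## sketch: inspect the default tuning ranges used by grid_regular() below
cost_complexity()   # varies on a log10 scale from 1e-10 to 0.1 by default
tree_depth()        # 1 to 15 by default
min_n()             # 2 to 40 by default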
# build a regular grid of parameter combinations using the dials helper functions;
# levels = 4 gives 4 values per parameter, so 4^3 = 64 candidate models
tree_grid <- grid_regular(cost_complexity(), tree_depth(), min_n(), levels = 4)
## register a parallel backend and tune the grid across the resamples
doParallel::registerDoParallel()
set.seed(456)
Info_rs <- tune_grid(
tree_spec,
price~.,
resamples = Info_folds,
grid = tree_grid,
metrics = metric_set(rmse,rsq)
)
Info_rs## # Tuning results
## # 10-fold cross-validation using stratification
## # A tibble: 10 × 4
## splits id .metrics .notes
## <list> <chr> <list> <list>
## 1 <split [14586/1623]> Fold01 <tibble [128 × 7]> <tibble [0 × 3]>
## 2 <split [14586/1623]> Fold02 <tibble [128 × 7]> <tibble [0 × 3]>
## 3 <split [14587/1622]> Fold03 <tibble [128 × 7]> <tibble [0 × 3]>
## 4 <split [14588/1621]> Fold04 <tibble [128 × 7]> <tibble [0 × 3]>
## 5 <split [14588/1621]> Fold05 <tibble [128 × 7]> <tibble [0 × 3]>
## 6 <split [14589/1620]> Fold06 <tibble [128 × 7]> <tibble [0 × 3]>
## 7 <split [14589/1620]> Fold07 <tibble [128 × 7]> <tibble [0 × 3]>
## 8 <split [14589/1620]> Fold08 <tibble [128 × 7]> <tibble [0 × 3]>
## 9 <split [14589/1620]> Fold09 <tibble [128 × 7]> <tibble [0 × 3]>
## 10 <split [14590/1619]> Fold10 <tibble [128 × 7]> <tibble [0 × 3]>
In the plot we see that shallow trees have high RMSE and low R-squared, while deeper trees with more splits achieve lower RMSE and higher R-squared. We can choose a tree somewhere in the middle of the range to limit overfitting.
autoplot(Info_rs) + theme_light()
head(collect_metrics(Info_rs))
## # A tibble: 6 × 9
## cost_complexity tree_depth min_n .metric .esti…¹ mean n std_err .config
## <dbl> <int> <int> <chr> <chr> <dbl> <int> <dbl> <chr>
## 1 0.0000000001 1 2 rmse standa… 3.02e+5 10 7.46e+3 Prepro…
## 2 0.0000000001 1 2 rsq standa… 3.25e-1 10 7.97e-3 Prepro…
## 3 0.0000001 1 2 rmse standa… 3.02e+5 10 7.46e+3 Prepro…
## 4 0.0000001 1 2 rsq standa… 3.25e-1 10 7.97e-3 Prepro…
## 5 0.0001 1 2 rmse standa… 3.02e+5 10 7.46e+3 Prepro…
## 6 0.0001 1 2 rsq standa… 3.25e-1 10 7.97e-3 Prepro…
## # … with abbreviated variable name ¹​.estimator
show_best(Info_rs,"rmse")## # A tibble: 5 × 9
## cost_complexity tree_depth min_n .metric .estim…¹ mean n std_err .config
## <dbl> <int> <int> <chr> <chr> <dbl> <int> <dbl> <chr>
## 1 0.0000000001 15 14 rmse standard 1.68e5 10 5999. Prepro…
## 2 0.0000001 15 14 rmse standard 1.68e5 10 5999. Prepro…
## 3 0.0001 15 14 rmse standard 1.68e5 10 6438. Prepro…
## 4 0.0000000001 10 14 rmse standard 1.69e5 10 6134. Prepro…
## 5 0.0000001 10 14 rmse standard 1.69e5 10 6134. Prepro…
## # … with abbreviated variable name ¹​.estimator
# these are the parameter values that minimize RMSE on the resampled house-price data
select_best(Info_rs,"rmse")## # A tibble: 1 × 4
## cost_complexity tree_depth min_n .config
## <dbl> <int> <int> <chr>
## 1 0.0000000001 15 14 Preprocessor1_Model29
show_best(Info_rs,"rsq")## # A tibble: 5 × 9
## cost_complexity tree_depth min_n .metric .estima…¹ mean n std_err .config
## <dbl> <int> <int> <chr> <chr> <dbl> <int> <dbl> <chr>
## 1 0.0000000001 15 14 rsq standard 0.795 10 0.0122 Prepro…
## 2 0.0000001 15 14 rsq standard 0.795 10 0.0122 Prepro…
## 3 0.0001 15 14 rsq standard 0.793 10 0.0136 Prepro…
## 4 0.0000000001 10 14 rsq standard 0.791 10 0.0125 Prepro…
## 5 0.0000001 10 14 rsq standard 0.791 10 0.0125 Prepro…
## # … with abbreviated variable name ¹​.estimator
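Because the gains flatten out at larger depths, tune can also pick a simpler candidate whose RMSE is within one standard error of the best. The line below is only a sketch of that option; it is not the model finalized in the next step.
## sketch: prefer a shallower tree within one standard error of the best RMSE
select_by_one_std_err(Info_rs, tree_depth, metric = "rmse")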
After comparing the candidate parameter values, we finalize the model specification with the values chosen by select_best() for RMSE.
# finalize the tree specification with the best parameter values, then fit it on the training data
final_info <- finalize_model(tree_spec, select_best(Info_rs, "rmse"))
final_fit <- fit(final_info, price ~ ., Info_train)
final_rs <- last_fit(final_info, price ~ ., Info_split)
# metrics collected on the testing data
collect_metrics(final_rs)## # A tibble: 2 × 4
## .metric .estimator .estimate .config
## <chr> <chr> <dbl> <chr>
## 1 rmse standard 161330. Preprocessor1_Model1
## 2 rsq standard 0.807 Preprocessor1_Model1
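Equivalently, the finalized specification could be bundled with the formula in a workflow object and passed to last_fit(). This is a sketch of that alternative (tree_wf and final_rs_wf are hypothetical names); it should reproduce the same test-set metrics.
## sketch: the same finalize-and-evaluate step expressed as a workflow
tree_wf <- workflow() %>%
  add_formula(price ~ .) %>%
  add_model(final_info)
final_rs_wf <- last_fit(tree_wf, Info_split)
collect_metrics(final_rs_wf)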
Here we can see which predictors were most important to the final model.
library(vip)## Warning: package 'vip' was built under R version 4.2.3
##
## Attaching package: 'vip'
## The following object is masked from 'package:utils':
##
## vi
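Before plotting, the raw importance scores can also be listed as a table with vi() from the same package; a quick sketch using the fitted model from above:
## sketch: variable importance scores as a tibble rather than a plot
vi(final_fit)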
final_fit %>%
vip(geom = "col",aesthetics = list(fill = "lightblue",alpha = 0.8)) +
scale_y_continuous(expand = c(0, 0))
Plotting the predicted prices against the actual prices on the test set, there may be some overplotting with this many points, so the point alpha is reduced.
collect_predictions(final_rs) %>%
ggplot(aes(price,.pred)) +
geom_abline(slope = 1,lty = 2,alpha= 0.5,color="red") +
geom_point(alpha = 0.6, color = "midnightblue")
Final_predictions <- as.data.frame(cbind(collect_predictions(final_rs)$price, collect_predictions(final_rs)$.pred))
colnames(Final_predictions) <- c("Price","Predictions")
head(Final_predictions)## Price Predictions
## 1 342000 241400.0
## 2 482000 455904.5
## 3 242500 269891.7
## 4 435000 431677.3
## 5 626000 505556.7
## 6 675000 624349.3
write_csv(Final_predictions,"C:\\Users\\alhaq\\OneDrive\\Desktop\\Data 621\\predictions.csv")
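As a final sanity check (a sketch in base R), the test RMSE can be recomputed from the saved predictions; it should match the value reported by collect_metrics() above, roughly 161,330.
## sketch: recompute the test RMSE directly from the prediction data frame
sqrt(mean((Final_predictions$Price - Final_predictions$Predictions)^2))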