#load libraries to use
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.2.2
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.3     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.4     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.0
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(tidymodels)
## ── Attaching packages ────────────────────────────────────── tidymodels 1.1.1 ──
## ✔ broom        1.0.5     ✔ rsample      1.2.0
## ✔ dials        1.2.0     ✔ tune         1.1.2
## ✔ infer        1.0.5     ✔ workflows    1.1.3
## ✔ modeldata    1.2.0     ✔ workflowsets 1.0.1
## ✔ parsnip      1.1.1     ✔ yardstick    1.2.0
## ✔ recipes      1.0.8     
## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ scales::discard() masks purrr::discard()
## ✖ dplyr::filter()   masks stats::filter()
## ✖ recipes::fixed()  masks stringr::fixed()
## ✖ dplyr::lag()      masks stats::lag()
## ✖ yardstick::spec() masks readr::spec()
## ✖ recipes::step()   masks stats::step()
## • Learn how to get started at https://www.tidymodels.org/start/
library(caret)
## Warning: package 'caret' was built under R version 4.2.3
## Loading required package: lattice
## Warning: package 'lattice' was built under R version 4.3.2
## 
## Attaching package: 'caret'
## 
## The following objects are masked from 'package:yardstick':
## 
##     precision, recall, sensitivity, specificity
## 
## The following object is masked from 'package:purrr':
## 
##     lift
library(leaflet)
#creates an interactive map

library(GGally)
## Registered S3 method overwritten by 'GGally':
##   method from   
##   +.gg   ggplot2
#extends ggplot2 by adding several functions to reduce the complexity of combining geoms with transformed data.

library(Amelia)
## Loading required package: Rcpp
## 
## Attaching package: 'Rcpp'
## The following object is masked from 'package:rsample':
## 
##     populate
## ## 
## ## Amelia II: Multiple Imputation
## ## (Version 1.8.1, built: 2022-11-18)
## ## Copyright (C) 2005-2023 James Honaker, Gary King and Matthew Blackwell
## ## Refer to http://gking.harvard.edu/amelia/ for more information
## ##
# used for imputing missing data

library(corrplot)
## corrplot 0.92 loaded
#visualizing a correlation matrix in R

library(cluster)
library(caTools)
## Warning: package 'caTools' was built under R version 4.2.0

load the Kings County data set and investigate its structure

#load the data set to use
kc_house_data <- read_csv("housing dataset/kc_house_data.csv")
## Rows: 21613 Columns: 21
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr   (1): id
## dbl  (19): price, bedrooms, bathrooms, sqft_living, sqft_lot, floors, waterf...
## dttm  (1): date
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
View(kc_house_data)

dim(kc_house_data)
## [1] 21613    21
str(kc_house_data)
## spc_tbl_ [21,613 × 21] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ id           : chr [1:21613] "7129300520" "6414100192" "5631500400" "2487200875" ...
##  $ date         : POSIXct[1:21613], format: "2014-10-13" "2014-12-09" ...
##  $ price        : num [1:21613] 221900 538000 180000 604000 510000 ...
##  $ bedrooms     : num [1:21613] 3 3 2 4 3 4 3 3 3 3 ...
##  $ bathrooms    : num [1:21613] 1 2.25 1 3 2 4.5 2.25 1.5 1 2.5 ...
##  $ sqft_living  : num [1:21613] 1180 2570 770 1960 1680 ...
##  $ sqft_lot     : num [1:21613] 5650 7242 10000 5000 8080 ...
##  $ floors       : num [1:21613] 1 2 1 1 1 1 2 1 1 2 ...
##  $ waterfront   : num [1:21613] 0 0 0 0 0 0 0 0 0 0 ...
##  $ view         : num [1:21613] 0 0 0 0 0 0 0 0 0 0 ...
##  $ condition    : num [1:21613] 3 3 3 5 3 3 3 3 3 3 ...
##  $ grade        : num [1:21613] 7 7 6 7 8 11 7 7 7 7 ...
##  $ sqft_above   : num [1:21613] 1180 2170 770 1050 1680 ...
##  $ sqft_basement: num [1:21613] 0 400 0 910 0 1530 0 0 730 0 ...
##  $ yr_built     : num [1:21613] 1955 1951 1933 1965 1987 ...
##  $ yr_renovated : num [1:21613] 0 1991 0 0 0 ...
##  $ zipcode      : num [1:21613] 98178 98125 98028 98136 98074 ...
##  $ lat          : num [1:21613] 47.5 47.7 47.7 47.5 47.6 ...
##  $ long         : num [1:21613] -122 -122 -122 -122 -122 ...
##  $ sqft_living15: num [1:21613] 1340 1690 2720 1360 1800 ...
##  $ sqft_lot15   : num [1:21613] 5650 7639 8062 5000 7503 ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   id = col_character(),
##   ..   date = col_datetime(format = ""),
##   ..   price = col_double(),
##   ..   bedrooms = col_double(),
##   ..   bathrooms = col_double(),
##   ..   sqft_living = col_double(),
##   ..   sqft_lot = col_double(),
##   ..   floors = col_double(),
##   ..   waterfront = col_double(),
##   ..   view = col_double(),
##   ..   condition = col_double(),
##   ..   grade = col_double(),
##   ..   sqft_above = col_double(),
##   ..   sqft_basement = col_double(),
##   ..   yr_built = col_double(),
##   ..   yr_renovated = col_double(),
##   ..   zipcode = col_double(),
##   ..   lat = col_double(),
##   ..   long = col_double(),
##   ..   sqft_living15 = col_double(),
##   ..   sqft_lot15 = col_double()
##   .. )
##  - attr(*, "problems")=<externalptr>
head(kc_house_data)
## # A tibble: 6 × 21
##   id         date                  price bedrooms bathrooms sqft_living sqft_lot
##   <chr>      <dttm>                <dbl>    <dbl>     <dbl>       <dbl>    <dbl>
## 1 7129300520 2014-10-13 00:00:00  221900        3      1           1180     5650
## 2 6414100192 2014-12-09 00:00:00  538000        3      2.25        2570     7242
## 3 5631500400 2015-02-25 00:00:00  180000        2      1            770    10000
## 4 2487200875 2014-12-09 00:00:00  604000        4      3           1960     5000
## 5 1954400510 2015-02-18 00:00:00  510000        3      2           1680     8080
## 6 7237550310 2014-05-12 00:00:00 1225000        4      4.5         5420   101930
## # ℹ 14 more variables: floors <dbl>, waterfront <dbl>, view <dbl>,
## #   condition <dbl>, grade <dbl>, sqft_above <dbl>, sqft_basement <dbl>,
## #   yr_built <dbl>, yr_renovated <dbl>, zipcode <dbl>, lat <dbl>, long <dbl>,
## #   sqft_living15 <dbl>, sqft_lot15 <dbl>

This data set contains house sale prices for King County, Seattle Washington State which includes Seattle. It includes homes sold between May 2014 and May 2015.

to predict housing prices in King County using a linear regression model, as well as find out which attribute plays the key role in determining the housing price.

id: unique ID per house sale

date: date of the house sale

price: price of house in USD

bedrooms: number of bedrooms

bathrooms: number of bathrooms where 0.5 represents a bathroom with a toilets but with no shower

sqft_living: square footage of the apartments interior living space

sqft_lot: square footage of the land space

floors number of floors

waterfront: an index indicate if the house was overlooking the waterfront or not. 0 reps no waterfront 1, reps with waterfront

view: an index from 0 to 4 of how good the view of the property was. 0 reps no good view, 4 reps very good view.

conditions: an index from 1 to 5 on the condition of the house. 1 reps poorer conditions and 5 reps superb condition

grade: an index from 1 to 3. 1 to 3 falls short of building construction and design, 7 has an average level construction and design, and 11 to 13 has higher quality level of construction and design.

sqft_above: The square footage of the interior housing space that is above the ground level

sqft_basement: The square footage of the interior housing space that is below the ground level

yr_built: The year of house built

yr_renovated: The year of the house’s last renovation

zipcode: The zipcode is the postal code to indicate the area the house is in

sqft_living15: The average square footage of interior housing living space for the nearest 15 neighboring houses

sqft_lot 15: The average square footage of land space for the nearest 15 neighboring houses

#Next, we get understanding about different statiscal features using summary()

summary(kc_house_data)
##       id                 date                         price        
##  Length:21613       Min.   :2014-05-02 00:00:00   Min.   :  75000  
##  Class :character   1st Qu.:2014-07-22 00:00:00   1st Qu.: 321950  
##  Mode  :character   Median :2014-10-16 00:00:00   Median : 450000  
##                     Mean   :2014-10-29 04:38:01   Mean   : 540088  
##                     3rd Qu.:2015-02-17 00:00:00   3rd Qu.: 645000  
##                     Max.   :2015-05-27 00:00:00   Max.   :7700000  
##     bedrooms        bathrooms      sqft_living       sqft_lot      
##  Min.   : 0.000   Min.   :0.000   Min.   :  290   Min.   :    520  
##  1st Qu.: 3.000   1st Qu.:1.750   1st Qu.: 1427   1st Qu.:   5040  
##  Median : 3.000   Median :2.250   Median : 1910   Median :   7618  
##  Mean   : 3.371   Mean   :2.115   Mean   : 2080   Mean   :  15107  
##  3rd Qu.: 4.000   3rd Qu.:2.500   3rd Qu.: 2550   3rd Qu.:  10688  
##  Max.   :33.000   Max.   :8.000   Max.   :13540   Max.   :1651359  
##      floors        waterfront            view          condition    
##  Min.   :1.000   Min.   :0.000000   Min.   :0.0000   Min.   :1.000  
##  1st Qu.:1.000   1st Qu.:0.000000   1st Qu.:0.0000   1st Qu.:3.000  
##  Median :1.500   Median :0.000000   Median :0.0000   Median :3.000  
##  Mean   :1.494   Mean   :0.007542   Mean   :0.2343   Mean   :3.409  
##  3rd Qu.:2.000   3rd Qu.:0.000000   3rd Qu.:0.0000   3rd Qu.:4.000  
##  Max.   :3.500   Max.   :1.000000   Max.   :4.0000   Max.   :5.000  
##      grade          sqft_above   sqft_basement       yr_built   
##  Min.   : 1.000   Min.   : 290   Min.   :   0.0   Min.   :1900  
##  1st Qu.: 7.000   1st Qu.:1190   1st Qu.:   0.0   1st Qu.:1951  
##  Median : 7.000   Median :1560   Median :   0.0   Median :1975  
##  Mean   : 7.657   Mean   :1788   Mean   : 291.5   Mean   :1971  
##  3rd Qu.: 8.000   3rd Qu.:2210   3rd Qu.: 560.0   3rd Qu.:1997  
##  Max.   :13.000   Max.   :9410   Max.   :4820.0   Max.   :2015  
##   yr_renovated       zipcode           lat             long       
##  Min.   :   0.0   Min.   :98001   Min.   :47.16   Min.   :-122.5  
##  1st Qu.:   0.0   1st Qu.:98033   1st Qu.:47.47   1st Qu.:-122.3  
##  Median :   0.0   Median :98065   Median :47.57   Median :-122.2  
##  Mean   :  84.4   Mean   :98078   Mean   :47.56   Mean   :-122.2  
##  3rd Qu.:   0.0   3rd Qu.:98118   3rd Qu.:47.68   3rd Qu.:-122.1  
##  Max.   :2015.0   Max.   :98199   Max.   :47.78   Max.   :-121.3  
##  sqft_living15    sqft_lot15    
##  Min.   : 399   Min.   :   651  
##  1st Qu.:1490   1st Qu.:  5100  
##  Median :1840   Median :  7620  
##  Mean   :1987   Mean   : 12768  
##  3rd Qu.:2360   3rd Qu.: 10083  
##  Max.   :6210   Max.   :871200

data cleaning

Missing Value Detection: Amelia Package Missingness Map Function was used to identify the missing data in the dataset. From the map below, it can be observed that the dataset does not consist of any missing data for any of the variables.

missmap(kc_house_data)
## Warning: Unknown or uninitialised column: `arguments`.
## Unknown or uninitialised column: `arguments`.
## Warning: Unknown or uninitialised column: `imputations`.

# Outlier Detection: Outliers are detected and analyzed using the Outlier Boxplots. From the outlier boxplot we inferred that the data consists of many outliers for the target variable, Price.
# However, the outliers variable corresponded to outliers for Number of Bedrooms, Number of Bathrooms and Square Feet Living. Upon further investigation, we found that the outliers correspond to high value of condition, view and grade.
# Thus, we concluded that these outliers are legitimate outliers and we decided to retain them in the data.

boxplot(kc_house_data$price)

boxplot(kc_house_data$bedrooms)

boxplot(kc_house_data$bathrooms)

boxplot(kc_house_data$sqft_living)

# There were two findings:-
# All data clean up will be performed at the copy of original data set, namely house_clean
#clean data chunk
house_clean <- kc_house_data 
nrow(house_clean)
## [1] 21613
#An observation with 33 bedrooms in 1620 ft, and come with only 1.75 bathrooms. The data was imputed to only 3 bedrooms.
max(house_clean$bedrooms)
## [1] 33
house_clean$bedrooms[house_clean$bedrooms == 33] = 3
nrow(house_clean)
## [1] 21613
max(house_clean$bedrooms)
## [1] 11
# 13 observations with 0 bedroom, and come with 0.75 to 2.5 bathroom. Since it is not common to have houses without bedrooms, thus we decided to exclude these observations.

min(house_clean$bedrooms)
## [1] 0
house_clean <- house_clean [house_clean$bedrooms != 0,]

nrow(house_clean)
## [1] 21600
#In total, there are 10 observations with 0 bathroom. After the step above, there are only 3 remaining observation with 0 bathroom. Since it is not common to have houses without bathrooms, thus we decided to exclude these observations.

?min()

min(house_clean$bathrooms)
## [1] 0
house_clean <- house_clean[house_clean$bathrooms != 0,]
nrow(house_clean)
## [1] 21597

Data transformation

while traversing the data we found that some of the columns need to have their data types adjusted in order to meet our goal. Thus we made the decision to retain all 21 original columns along with the transformed data.

# 1.Date: Changing the Date Format for Regression

house_clean$date<-(substr(house_clean$date, 1, 8))

house_clean$date<- ymd(house_clean$date)
## Warning: All formats failed to parse. No formats found.
house_clean$date<-as.numeric(as.Date(house_clean$date, origin = "1900-11-01"))

head(house_clean$date)
## [1] NA NA NA NA NA NA
# b. Age: new column, data types as numeric continuous
house_clean$age = 2015 - house_clean$yr_built + 1
head(house_clean$age)
## [1] 61 65 83 51 29 15
#c. renovated: new column, data types as numeric nominal
#convert "yr_renovated into a simpler categorical variable o indicate of a house has been renovated in the past

# legend to refer: 0 means no renovation has been done
#                  !=0 means renovation has been done


house_clean$renovated <- cut(house_clean$yr_renovated, breaks = c(-1, 0, 3000), labels = c("1", "2"))

house_clean$renovated <- as.numeric(house_clean$renovated)

head(house_clean$renovated)
## [1] 1 2 1 1 1 1
# d. price category: new column, data types as numerical nominal
# convert price variable from numeric continuous variable to a numeric nominal variable for categorical modelling purpose.

house_clean$price_cat = cut(house_clean$price, breaks = c(0,350000,450000,700000,10000000), labels=c("1","2","3","4"))

house_clean$price[1:10]
##  [1]  221900  538000  180000  604000  510000 1225000  257500  291850  229500
## [10]  323000
house_clean$price_cat[1:10]
##  [1] 1 3 1 3 3 4 1 1 1 1
## Levels: 1 2 3 4
view(house_clean)

EDA

objective of data visualization and pattern discovery is to reveal the relationships between the house features and the target variable, price. We want to identify the house features which affect the price variable and could be potential predictors. Through visualization, we gathered the following information about the data.

1. The correlation matrix gives a summary of correlations between the variables in the data set.

The objective behind analyzing the correlation between the continuous variables in the data was to identify variables that have significant linear relationship with price and those which do not.

#house_clean.cor = cor(house_clean[sapply(house_clean, function(x) !is.factor(x))])
#corrplot(house_clean.cor)
ggcorr(house_clean, geom = "blank", label = T, label_size = 3, hjust = 1, size = 3, layout.exp = 2) +
  geom_point(size = 8, aes(color = coefficient > 0, alpha = abs(coefficient) >= 0.5)) +
  scale_alpha_manual(values = c("TRUE" = 0.25, "FALSE" = 0)) +
  guides(color = F, alpha = F)
## Warning in ggcorr(house_clean, geom = "blank", label = T, label_size = 3, :
## data in column(s) 'id', 'price_cat' are not numeric and were ignored
## Warning: The `<scale>` argument of `guides()` cannot be `FALSE`. Use "none" instead as
## of ggplot2 3.3.4.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

#correlation matrix, we can see that all variables are positively correlated with price. Furthermore, sqft_living, grade, sqft_above, and sqft_living15 seems to have strong correlations with price
# b. scatterplot matrix - high posiive correlation
pairs(~price+bathrooms+sqft_living+grade+sqft_above+sqft_living15, data=house_clean, main="High Positive Corr. ScatterPlot Matrix")

# low positive correlation
pairs(~price+bedrooms+floors+waterfront+view+sqft_basement+lat, data=house_clean, main="Low Positive Corr. ScatterPlot Matrix")

#c. Latitude vs Longitude, colored by Price: 
# We can see that the area without the color points are not incorporated cities. We observe that price increases as we move from southern area to northern area across the latitude but has little variation as we move across the longitude.

coordinates_data <- dplyr::select(house_clean, price, lat, long)

head(coordinates_data)
## # A tibble: 6 × 3
##     price   lat  long
##     <dbl> <dbl> <dbl>
## 1  221900  47.5 -122.
## 2  538000  47.7 -122.
## 3  180000  47.7 -122.
## 4  604000  47.5 -122.
## 5  510000  47.6 -122.
## 6 1225000  47.7 -122.
pal = colorNumeric("YlOrRd", domain = coordinates_data$price)
int_map <- coordinates_data %>%
leaflet()%>%
addProviderTiles(providers$OpenStreetMap.Mapnik)%>%
addCircleMarkers(col = ~pal(price), opacity = 1.1, radius = 0.3) %>% 
addLegend(pal = pal, values = ~price)
## Assuming "long" and "lat" are longitude and latitude, respectively
int_map
# Price vs Square Feet Living, colour by Number of Bathroom: The scatterplot below indicates the house price increases as square foot living and number of bathroom increase.

plot(house_clean$sqft_living15, house_clean$price, pch=19, col=house_clean$bathrooms, xlab='Square foot living+No.of bathrooms',ylab='House Price')

# predictive modelling
# a. multiple regression
# The first model is built by having the high positive correlation variables based on the corr plot.
library(dplyr)

hr <- select(house_clean,price,bathrooms,sqft_living,grade,sqft_above,sqft_living15)

set.seed(123)

split <- sample.split(hr$price, SplitRatio = 0.8)

training_set <- subset(hr, split==TRUE)

test_set <- subset(hr, split==FALSE)

regressor <- lm(formula=price~., data=training_set)

summary(regressor)
## 
## Call:
## lm(formula = price ~ ., data = training_set)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1005385  -137443   -22645   100515  4736642 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   -6.664e+05  1.525e+04 -43.702  < 2e-16 ***
## bathrooms     -3.493e+04  3.859e+03  -9.053  < 2e-16 ***
## sqft_living    2.561e+02  5.070e+00  50.504  < 2e-16 ***
## grade          1.126e+05  2.777e+03  40.542  < 2e-16 ***
## sqft_above    -8.285e+01  5.010e+00 -16.537  < 2e-16 ***
## sqft_living15  1.788e+01  4.506e+00   3.967  7.3e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 253800 on 17819 degrees of freedom
## Multiple R-squared:  0.5496, Adjusted R-squared:  0.5495 
## F-statistic:  4349 on 5 and 17819 DF,  p-value: < 2.2e-16
# As shown, the multiple R-squared returned the value of 0.5495 which are not consider strong for the model even though all the variables are highly positive correlated to the output. 

let us fit our trained model to the train set we split to know how well it can perform

lm.fit <- lm(price~., data=test_set)

summary(lm.fit)
## 
## Call:
## lm(formula = price ~ ., data = test_set)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -719874 -128037  -18360   89293 2007325 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   -5.718e+05  2.818e+04 -20.291  < 2e-16 ***
## bathrooms     -3.772e+04  7.234e+03  -5.214 1.95e-07 ***
## sqft_living    1.880e+02  9.612e+00  19.562  < 2e-16 ***
## grade          1.080e+05  5.147e+03  20.984  < 2e-16 ***
## sqft_above    -7.728e+01  9.308e+00  -8.303  < 2e-16 ***
## sqft_living15  5.133e+01  8.652e+00   5.933 3.25e-09 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 213200 on 3766 degrees of freedom
## Multiple R-squared:  0.5238, Adjusted R-squared:  0.5232 
## F-statistic: 828.6 on 5 and 3766 DF,  p-value: < 2.2e-16
# we can concluded, the value of multiple R-square drops as we eliminate the variables. The highest multiple R-square value is gained when the variables bedrooms, bathrooms, sqft_living, waterfront, view, condition, grade, sqft_above, yr_built, yr_renovated, zipcode, lat, long, and sqft_lot15 are considered.

Accuracy check and Prediction of the model:

#Accuracy of the model on the train dataset

pred=regressor$fitted.values

tally_table=data.frame(actual=training_set$price, predicted=pred)
mape=mean(abs(tally_table$actual-tally_table$predicted)/tally_table$actual)
accuracy=1-mape

cat("The accuracy on the train data is:",accuracy)
## The accuracy on the train data is: 0.6630708

#Prediction on the test data
pred_test=predict(newdata=test_set, regressor)
tally_table=data.frame(actual=test_set$price, predicted=pred_test)
mape=mean(abs(tally_table$actual-tally_table$predicted)/tally_table$actual)
accuracy=1-mape
cat(" and the accuracy on the test data is:",accuracy)
##  and the accuracy on the test data is: 0.6604951