load the Kings County data set and investigate its structure
#load the data set to use
kc_house_data <- read_csv("housing dataset/kc_house_data.csv")
## Rows: 21613 Columns: 21
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): id
## dbl (19): price, bedrooms, bathrooms, sqft_living, sqft_lot, floors, waterf...
## dttm (1): date
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
View(kc_house_data)
dim(kc_house_data)
## [1] 21613 21
str(kc_house_data)
## spc_tbl_ [21,613 × 21] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ id : chr [1:21613] "7129300520" "6414100192" "5631500400" "2487200875" ...
## $ date : POSIXct[1:21613], format: "2014-10-13" "2014-12-09" ...
## $ price : num [1:21613] 221900 538000 180000 604000 510000 ...
## $ bedrooms : num [1:21613] 3 3 2 4 3 4 3 3 3 3 ...
## $ bathrooms : num [1:21613] 1 2.25 1 3 2 4.5 2.25 1.5 1 2.5 ...
## $ sqft_living : num [1:21613] 1180 2570 770 1960 1680 ...
## $ sqft_lot : num [1:21613] 5650 7242 10000 5000 8080 ...
## $ floors : num [1:21613] 1 2 1 1 1 1 2 1 1 2 ...
## $ waterfront : num [1:21613] 0 0 0 0 0 0 0 0 0 0 ...
## $ view : num [1:21613] 0 0 0 0 0 0 0 0 0 0 ...
## $ condition : num [1:21613] 3 3 3 5 3 3 3 3 3 3 ...
## $ grade : num [1:21613] 7 7 6 7 8 11 7 7 7 7 ...
## $ sqft_above : num [1:21613] 1180 2170 770 1050 1680 ...
## $ sqft_basement: num [1:21613] 0 400 0 910 0 1530 0 0 730 0 ...
## $ yr_built : num [1:21613] 1955 1951 1933 1965 1987 ...
## $ yr_renovated : num [1:21613] 0 1991 0 0 0 ...
## $ zipcode : num [1:21613] 98178 98125 98028 98136 98074 ...
## $ lat : num [1:21613] 47.5 47.7 47.7 47.5 47.6 ...
## $ long : num [1:21613] -122 -122 -122 -122 -122 ...
## $ sqft_living15: num [1:21613] 1340 1690 2720 1360 1800 ...
## $ sqft_lot15 : num [1:21613] 5650 7639 8062 5000 7503 ...
## - attr(*, "spec")=
## .. cols(
## .. id = col_character(),
## .. date = col_datetime(format = ""),
## .. price = col_double(),
## .. bedrooms = col_double(),
## .. bathrooms = col_double(),
## .. sqft_living = col_double(),
## .. sqft_lot = col_double(),
## .. floors = col_double(),
## .. waterfront = col_double(),
## .. view = col_double(),
## .. condition = col_double(),
## .. grade = col_double(),
## .. sqft_above = col_double(),
## .. sqft_basement = col_double(),
## .. yr_built = col_double(),
## .. yr_renovated = col_double(),
## .. zipcode = col_double(),
## .. lat = col_double(),
## .. long = col_double(),
## .. sqft_living15 = col_double(),
## .. sqft_lot15 = col_double()
## .. )
## - attr(*, "problems")=<externalptr>
head(kc_house_data)
## # A tibble: 6 × 21
## id date price bedrooms bathrooms sqft_living sqft_lot
## <chr> <dttm> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 7129300520 2014-10-13 00:00:00 221900 3 1 1180 5650
## 2 6414100192 2014-12-09 00:00:00 538000 3 2.25 2570 7242
## 3 5631500400 2015-02-25 00:00:00 180000 2 1 770 10000
## 4 2487200875 2014-12-09 00:00:00 604000 4 3 1960 5000
## 5 1954400510 2015-02-18 00:00:00 510000 3 2 1680 8080
## 6 7237550310 2014-05-12 00:00:00 1225000 4 4.5 5420 101930
## # ℹ 14 more variables: floors <dbl>, waterfront <dbl>, view <dbl>,
## # condition <dbl>, grade <dbl>, sqft_above <dbl>, sqft_basement <dbl>,
## # yr_built <dbl>, yr_renovated <dbl>, zipcode <dbl>, lat <dbl>, long <dbl>,
## # sqft_living15 <dbl>, sqft_lot15 <dbl>
sqft_lot: square footage of the land space
floors number of floors
waterfront: an index indicate if the house was overlooking the
waterfront or not. 0 reps no waterfront 1, reps with waterfront
view: an index from 0 to 4 of how good the view of the property was.
0 reps no good view, 4 reps very good view.
conditions: an index from 1 to 5 on the condition of the house. 1
reps poorer conditions and 5 reps superb condition
grade: an index from 1 to 3. 1 to 3 falls short of building
construction and design, 7 has an average level construction and design,
and 11 to 13 has higher quality level of construction and design.
yr_built: The year of house built
yr_renovated: The year of the house’s last renovation
zipcode: The zipcode is the postal code to indicate the area the
house is in
data cleaning
Missing Value Detection: Amelia Package Missingness Map Function was
used to identify the missing data in the dataset. From the map below, it
can be observed that the dataset does not consist of any missing data
for any of the variables.
missmap(kc_house_data)
## Warning: Unknown or uninitialised column: `arguments`.
## Unknown or uninitialised column: `arguments`.
## Warning: Unknown or uninitialised column: `imputations`.

# Outlier Detection: Outliers are detected and analyzed using the Outlier Boxplots. From the outlier boxplot we inferred that the data consists of many outliers for the target variable, Price.
# However, the outliers variable corresponded to outliers for Number of Bedrooms, Number of Bathrooms and Square Feet Living. Upon further investigation, we found that the outliers correspond to high value of condition, view and grade.
# Thus, we concluded that these outliers are legitimate outliers and we decided to retain them in the data.
boxplot(kc_house_data$price)
boxplot(kc_house_data$bedrooms)

boxplot(kc_house_data$bathrooms)
boxplot(kc_house_data$sqft_living)

# There were two findings:-
# All data clean up will be performed at the copy of original data set, namely house_clean
#clean data chunk
house_clean <- kc_house_data
nrow(house_clean)
## [1] 21613
#An observation with 33 bedrooms in 1620 ft, and come with only 1.75 bathrooms. The data was imputed to only 3 bedrooms.
max(house_clean$bedrooms)
## [1] 33
house_clean$bedrooms[house_clean$bedrooms == 33] = 3
nrow(house_clean)
## [1] 21613
max(house_clean$bedrooms)
## [1] 11
# 13 observations with 0 bedroom, and come with 0.75 to 2.5 bathroom. Since it is not common to have houses without bedrooms, thus we decided to exclude these observations.
min(house_clean$bedrooms)
## [1] 0
house_clean <- house_clean [house_clean$bedrooms != 0,]
nrow(house_clean)
## [1] 21600
#In total, there are 10 observations with 0 bathroom. After the step above, there are only 3 remaining observation with 0 bathroom. Since it is not common to have houses without bathrooms, thus we decided to exclude these observations.
?min()
min(house_clean$bathrooms)
## [1] 0
house_clean <- house_clean[house_clean$bathrooms != 0,]
nrow(house_clean)
## [1] 21597
Data transformation
while traversing the data we found that some of the columns need to
have their data types adjusted in order to meet our goal. Thus we made
the decision to retain all 21 original columns along with the
transformed data.
# 1.Date: Changing the Date Format for Regression
house_clean$date<-(substr(house_clean$date, 1, 8))
house_clean$date<- ymd(house_clean$date)
## Warning: All formats failed to parse. No formats found.
house_clean$date<-as.numeric(as.Date(house_clean$date, origin = "1900-11-01"))
head(house_clean$date)
## [1] NA NA NA NA NA NA
# b. Age: new column, data types as numeric continuous
house_clean$age = 2015 - house_clean$yr_built + 1
head(house_clean$age)
## [1] 61 65 83 51 29 15
#c. renovated: new column, data types as numeric nominal
#convert "yr_renovated into a simpler categorical variable o indicate of a house has been renovated in the past
# legend to refer: 0 means no renovation has been done
# !=0 means renovation has been done
house_clean$renovated <- cut(house_clean$yr_renovated, breaks = c(-1, 0, 3000), labels = c("1", "2"))
house_clean$renovated <- as.numeric(house_clean$renovated)
head(house_clean$renovated)
## [1] 1 2 1 1 1 1
# d. price category: new column, data types as numerical nominal
# convert price variable from numeric continuous variable to a numeric nominal variable for categorical modelling purpose.
house_clean$price_cat = cut(house_clean$price, breaks = c(0,350000,450000,700000,10000000), labels=c("1","2","3","4"))
house_clean$price[1:10]
## [1] 221900 538000 180000 604000 510000 1225000 257500 291850 229500
## [10] 323000
house_clean$price_cat[1:10]
## [1] 1 3 1 3 3 4 1 1 1 1
## Levels: 1 2 3 4
view(house_clean)
EDA
objective of data visualization and pattern discovery is to reveal
the relationships between the house features and the target variable,
price. We want to identify the house features which affect the price
variable and could be potential predictors. Through visualization, we
gathered the following information about the data.
1. The correlation matrix gives a summary of correlations between
the variables in the data set.
The objective behind analyzing the correlation between the
continuous variables in the data was to identify variables that have
significant linear relationship with price and those which do not.
#house_clean.cor = cor(house_clean[sapply(house_clean, function(x) !is.factor(x))])
#corrplot(house_clean.cor)
ggcorr(house_clean, geom = "blank", label = T, label_size = 3, hjust = 1, size = 3, layout.exp = 2) +
geom_point(size = 8, aes(color = coefficient > 0, alpha = abs(coefficient) >= 0.5)) +
scale_alpha_manual(values = c("TRUE" = 0.25, "FALSE" = 0)) +
guides(color = F, alpha = F)
## Warning in ggcorr(house_clean, geom = "blank", label = T, label_size = 3, :
## data in column(s) 'id', 'price_cat' are not numeric and were ignored
## Warning: The `<scale>` argument of `guides()` cannot be `FALSE`. Use "none" instead as
## of ggplot2 3.3.4.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

#correlation matrix, we can see that all variables are positively correlated with price. Furthermore, sqft_living, grade, sqft_above, and sqft_living15 seems to have strong correlations with price
# b. scatterplot matrix - high posiive correlation
pairs(~price+bathrooms+sqft_living+grade+sqft_above+sqft_living15, data=house_clean, main="High Positive Corr. ScatterPlot Matrix")

# low positive correlation
pairs(~price+bedrooms+floors+waterfront+view+sqft_basement+lat, data=house_clean, main="Low Positive Corr. ScatterPlot Matrix")

#c. Latitude vs Longitude, colored by Price:
# We can see that the area without the color points are not incorporated cities. We observe that price increases as we move from southern area to northern area across the latitude but has little variation as we move across the longitude.
coordinates_data <- dplyr::select(house_clean, price, lat, long)
head(coordinates_data)
## # A tibble: 6 × 3
## price lat long
## <dbl> <dbl> <dbl>
## 1 221900 47.5 -122.
## 2 538000 47.7 -122.
## 3 180000 47.7 -122.
## 4 604000 47.5 -122.
## 5 510000 47.6 -122.
## 6 1225000 47.7 -122.
pal = colorNumeric("YlOrRd", domain = coordinates_data$price)
int_map <- coordinates_data %>%
leaflet()%>%
addProviderTiles(providers$OpenStreetMap.Mapnik)%>%
addCircleMarkers(col = ~pal(price), opacity = 1.1, radius = 0.3) %>%
addLegend(pal = pal, values = ~price)
## Assuming "long" and "lat" are longitude and latitude, respectively
int_map
# Price vs Square Feet Living, colour by Number of Bathroom: The scatterplot below indicates the house price increases as square foot living and number of bathroom increase.
plot(house_clean$sqft_living15, house_clean$price, pch=19, col=house_clean$bathrooms, xlab='Square foot living+No.of bathrooms',ylab='House Price')

# predictive modelling
# a. multiple regression
# The first model is built by having the high positive correlation variables based on the corr plot.
library(dplyr)
hr <- select(house_clean,price,bathrooms,sqft_living,grade,sqft_above,sqft_living15)
set.seed(123)
split <- sample.split(hr$price, SplitRatio = 0.8)
training_set <- subset(hr, split==TRUE)
test_set <- subset(hr, split==FALSE)
regressor <- lm(formula=price~., data=training_set)
summary(regressor)
##
## Call:
## lm(formula = price ~ ., data = training_set)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1005385 -137443 -22645 100515 4736642
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -6.664e+05 1.525e+04 -43.702 < 2e-16 ***
## bathrooms -3.493e+04 3.859e+03 -9.053 < 2e-16 ***
## sqft_living 2.561e+02 5.070e+00 50.504 < 2e-16 ***
## grade 1.126e+05 2.777e+03 40.542 < 2e-16 ***
## sqft_above -8.285e+01 5.010e+00 -16.537 < 2e-16 ***
## sqft_living15 1.788e+01 4.506e+00 3.967 7.3e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 253800 on 17819 degrees of freedom
## Multiple R-squared: 0.5496, Adjusted R-squared: 0.5495
## F-statistic: 4349 on 5 and 17819 DF, p-value: < 2.2e-16
# As shown, the multiple R-squared returned the value of 0.5495 which are not consider strong for the model even though all the variables are highly positive correlated to the output.
Accuracy check and Prediction of the model:
#Accuracy of the model on the train dataset
pred=regressor$fitted.values
tally_table=data.frame(actual=training_set$price, predicted=pred)
mape=mean(abs(tally_table$actual-tally_table$predicted)/tally_table$actual)
accuracy=1-mape
cat("The accuracy on the train data is:",accuracy)
## The accuracy on the train data is: 0.6630708
#Prediction on the test data
pred_test=predict(newdata=test_set, regressor)
tally_table=data.frame(actual=test_set$price, predicted=pred_test)
mape=mean(abs(tally_table$actual-tally_table$predicted)/tally_table$actual)
accuracy=1-mape
cat(" and the accuracy on the test data is:",accuracy)
## and the accuracy on the test data is: 0.6604951