Data exploration
Data cleaning
Explore more
Model Building
### NOTE(review): rm(list = ls()) deletes every visible object in the calling workspace, so it only gives a "clean start" when the script is run in its own session.
rm(list = ls())
### NOTE(review): absolute, machine-specific path. The relative read of "house_data.csv" below depends on it, so the script only runs where this directory exists.
setwd("C:/Users/MANISHA/Desktop/CISC-Project/FinalProjectWork")
## All required libraries should be mentioned here.
library(ggplot2)
library(caret)
## Loading required package: lattice
library(corrplot)
## corrplot 0.84 loaded
library(rpart)
library(Metrics)
##
## Attaching package: 'Metrics'
## The following objects are masked from 'package:caret':
##
## precision, recall
library(mlr)
## Loading required package: ParamHelpers
##
## Attaching package: 'mlr'
## The following object is masked from 'package:caret':
##
## train
library(dummies)
## dummies-1.5.6 provided by Decision Patterns
library(rpart)
library(caret)
library(randomForest)
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:ggplot2':
##
## margin
library(gbm)
## Loaded gbm 2.1.5
library(factoextra)
## Welcome! Related Books: `Practical Guide To Cluster Analysis in R` at https://goo.gl/13EFCZ
library(tidyverse)
## -- Attaching packages -------------------------------------------------------------------------------------------------- tidyverse 1.2.1 --
## v tibble 2.0.1 v purrr 0.3.0
## v tidyr 0.8.2 v dplyr 0.7.8
## v readr 1.3.1 v stringr 1.4.0
## v tibble 2.0.1 v forcats 0.3.0
## -- Conflicts ----------------------------------------------------------------------------------------------------- tidyverse_conflicts() --
## x dplyr::combine() masks randomForest::combine()
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
## x purrr::lift() masks caret::lift()
## x randomForest::margin() masks ggplot2::margin()
library(gridExtra)
##
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
##
## combine
## The following object is masked from 'package:randomForest':
##
## combine
################## Library Section Ends Here ####################
## Read input House Data file in csv format.
### `header = TRUE` is written out because `T` is an ordinary variable that can be reassigned.
housedata <- read.csv("house_data.csv", header = TRUE)
## Dimension of the input data.
dim(housedata)
## [1] 42703 20
### 42703 observations.
### 20 attributes.
## Structure of the file: column names, storage types and the first few values of each column.
str(housedata)
## 'data.frame': 42703 obs. of 20 variables:
## $ year_built : int 1978 1958 2002 2004 2006 2005 1979 1958 1958 1961 ...
## $ stories : int 1 1 1 1 1 1 1 1 1 1 ...
## $ num_bedrooms : int 4 3 3 4 4 3 3 5 5 1 ...
## $ full_bathrooms : int 1 1 2 2 2 2 2 2 2 1 ...
## $ half_bathrooms : int 1 1 0 0 0 0 1 0 0 0 ...
## $ livable_sqft : int 1689 1984 1581 1829 1580 1621 2285 1745 1747 998 ...
## $ total_sqft : int 1859 2002 1578 2277 1749 1672 2365 1741 1745 1161 ...
## $ garage_type : Factor w/ 3 levels "attached","detached",..: 1 1 3 1 1 1 2 3 3 3 ...
## $ garage_sqft : int 508 462 0 479 430 430 532 0 0 0 ...
## $ carport_sqft : int 0 0 625 0 0 0 0 0 0 242 ...
## $ has_fireplace : logi TRUE TRUE FALSE TRUE TRUE TRUE ...
## $ has_pool : logi FALSE FALSE FALSE FALSE FALSE FALSE ...
## $ has_central_heating: logi TRUE TRUE TRUE TRUE TRUE TRUE ...
## $ has_central_cooling: logi TRUE TRUE TRUE TRUE TRUE TRUE ...
## $ house_number : int 42670 5194 4366 3302 582 78445 246 35725 35725 73327 ...
## $ street_name : Factor w/ 11124 levels "Aaron Cliff",..: 6335 3532 3933 7172 4467 7172 3963 4902 4902 5959 ...
## $ unit_number : int NA NA NA NA NA NA NA NA NA NA ...
## $ city : Factor w/ 47 levels "Amystad","Brownport",..: 13 13 20 20 20 20 27 20 20 20 ...
## $ zip_code : int 10907 10907 11203 11203 11203 11203 10924 11203 11203 11203 ...
## $ sale_price : num 270897 302404 2519996 197193 207897 ...
### Attributes treated as categorical (qualitative).
qualitative_var <- c(
  "stories", "num_bedrooms", "full_bathrooms", "half_bathrooms",
  "garage_type", "has_fireplace", "has_pool", "has_central_heating",
  "has_central_cooling", "house_number", "street_name", "unit_number",
  "city", "zip_code"
)
### Attributes treated as numerical (quantitative).
quantitative_var <- c(
  "year_built", "livable_sqft", "total_sqft",
  "garage_sqft", "carport_sqft", "sale_price"
)
### R has identified garage_type, street_name and city as factors.
### R has identified has_fireplace, has_pool, has_central_heating and has_central_cooling as logical (boolean) type. I will convert these four attributes to factors.
## Summary statistics.
summary(housedata)
## year_built stories num_bedrooms full_bathrooms
## Min. :1852 Min. :0.000 Min. : 0.000 Min. :0.000
## 1st Qu.:1980 1st Qu.:1.000 1st Qu.: 3.000 1st Qu.:1.000
## Median :1994 Median :1.000 Median : 3.000 Median :2.000
## Mean :1991 Mean :1.366 Mean : 3.209 Mean :1.924
## 3rd Qu.:2005 3rd Qu.:2.000 3rd Qu.: 4.000 3rd Qu.:2.000
## Max. :2017 Max. :4.000 Max. :31.000 Max. :8.000
##
## half_bathrooms livable_sqft total_sqft garage_type
## Min. :0.0000 Min. : -3 Min. : 5 attached:34079
## 1st Qu.:0.0000 1st Qu.: 1380 1st Qu.: 1466 detached: 2712
## Median :1.0000 Median : 1808 Median : 1937 none : 5912
## Mean :0.5272 Mean : 1988 Mean : 2127
## 3rd Qu.:1.0000 3rd Qu.: 2486 3rd Qu.: 2640
## Max. :1.0000 Max. :12406 Max. :15449
##
## garage_sqft carport_sqft has_fireplace has_pool
## Min. : -4.0 Min. : 0.00 Mode :logical Mode :logical
## 1st Qu.: 412.0 1st Qu.: 0.00 FALSE:15717 FALSE:35101
## Median : 464.0 Median : 0.00 TRUE :26986 TRUE :7602
## Mean : 455.9 Mean : 41.66
## 3rd Qu.: 606.0 3rd Qu.: 0.00
## Max. :8318.0 Max. :9200.00
##
## has_central_heating has_central_cooling house_number
## Mode :logical Mode :logical Min. : 0
## FALSE:2609 FALSE:4141 1st Qu.: 674
## TRUE :40094 TRUE :38562 Median : 4530
## Mean :18212
## 3rd Qu.:24845
## Max. :99971
##
## street_name unit_number city
## Matthew Points : 128 Min. : 3 Chadstad : 4962
## Sanders Inlet : 98 1st Qu.:1063 Coletown : 3739
## Jessica Highway: 95 Median :2033 Jeffreyhaven : 2981
## Jordan Points : 94 Mean :2027 North Erinville: 2868
## Andrea Glen : 88 3rd Qu.:2921 Port Andrealand: 2669
## Mckenzie Trace : 88 Max. :3998 Hallfort : 2448
## (Other) :42112 NA's :39615 (Other) :23036
## zip_code sale_price
## Min. :10004 Min. : 626
## 1st Qu.:10537 1st Qu.: 270899
## Median :11071 Median : 378001
## Mean :11031 Mean : 413507
## 3rd Qu.:11510 3rd Qu.: 497697
## Max. :11989 Max. :21041998
##
### Few things about summary statistics.
### Minimum livable_sqft = -3, minimum total_sqft = 5, garage_sqft minimum is -4 and maximum is 8318, maximum carport_sqft is 9200, sale_price minimum is 626 and maximum is more than 21 million.
### City and street_name labels are numeric values. Good for PCA or regression but not good for classification. I will have to factorize these two variables.
### I can get rid of Zip code, unit number and house number as of now as those attributes do not have any direct need in the analysis. If I need to consolidate the address of house, I will use them. For now, I will not use them in regression, classification, clustering.
### Identifier-like columns that are excluded from the analysis data set.
dontneed <- c("unit_number", "house_number", "zip_code")
## Create new data frame with the attributes I need for the analysis.
names(housedata)
## [1] "year_built" "stories" "num_bedrooms"
## [4] "full_bathrooms" "half_bathrooms" "livable_sqft"
## [7] "total_sqft" "garage_type" "garage_sqft"
## [10] "carport_sqft" "has_fireplace" "has_pool"
## [13] "has_central_heating" "has_central_cooling" "house_number"
## [16] "street_name" "unit_number" "city"
## [19] "zip_code" "sale_price"
### Drop the columns by name (via `dontneed`) instead of by position (15, 17, 19),
### so the selection stays correct if the column order of the input file changes.
mydata <- housedata[, !(names(housedata) %in% dontneed)]
dim(mydata)
## [1] 42703 17
str(mydata)
## 'data.frame': 42703 obs. of 17 variables:
## $ year_built : int 1978 1958 2002 2004 2006 2005 1979 1958 1958 1961 ...
## $ stories : int 1 1 1 1 1 1 1 1 1 1 ...
## $ num_bedrooms : int 4 3 3 4 4 3 3 5 5 1 ...
## $ full_bathrooms : int 1 1 2 2 2 2 2 2 2 1 ...
## $ half_bathrooms : int 1 1 0 0 0 0 1 0 0 0 ...
## $ livable_sqft : int 1689 1984 1581 1829 1580 1621 2285 1745 1747 998 ...
## $ total_sqft : int 1859 2002 1578 2277 1749 1672 2365 1741 1745 1161 ...
## $ garage_type : Factor w/ 3 levels "attached","detached",..: 1 1 3 1 1 1 2 3 3 3 ...
## $ garage_sqft : int 508 462 0 479 430 430 532 0 0 0 ...
## $ carport_sqft : int 0 0 625 0 0 0 0 0 0 242 ...
## $ has_fireplace : logi TRUE TRUE FALSE TRUE TRUE TRUE ...
## $ has_pool : logi FALSE FALSE FALSE FALSE FALSE FALSE ...
## $ has_central_heating: logi TRUE TRUE TRUE TRUE TRUE TRUE ...
## $ has_central_cooling: logi TRUE TRUE TRUE TRUE TRUE TRUE ...
## $ street_name : Factor w/ 11124 levels "Aaron Cliff",..: 6335 3532 3933 7172 4467 7172 3963 4902 4902 5959 ...
## $ city : Factor w/ 47 levels "Amystad","Brownport",..: 13 13 20 20 20 20 27 20 20 20 ...
## $ sale_price : num 270897 302404 2519996 197193 207897 ...
### Columns that are re-encoded as factors (counts, flags and labels).
new_qualitative_var <- c(
  "stories", "num_bedrooms", "full_bathrooms", "half_bathrooms",
  "garage_type", "has_fireplace", "has_pool", "has_central_heating",
  "has_central_cooling", "street_name", "city"
)
## Convert boolean and categorical attributes into factor.
mydata[new_qualitative_var] <- lapply(mydata[new_qualitative_var], factor)
## Missing values.
### Number of NA entries per column, shown as a one-column data frame.
as.data.frame(colSums(is.na(mydata)))
## colSums(is.na(mydata))
## year_built 0
## stories 0
## num_bedrooms 0
## full_bathrooms 0
## half_bathrooms 0
## livable_sqft 0
## total_sqft 0
## garage_type 0
## garage_sqft 0
## carport_sqft 0
## has_fireplace 0
## has_pool 0
## has_central_heating 0
## has_central_cooling 0
## street_name 0
## city 0
## sale_price 0
### There is no missing values in any columns.
## Outliers.
### Density-scaled histogram of sale price, with a rug of the individual
### observations and a kernel density curve drawn on top.
hist(
  mydata$sale_price,
  freq = FALSE,
  breaks = 100,
  col = "grey",
  xlab = "Sale Price",
  main = "Histogram, rug plot, density curve"
)
rug(jitter(mydata$sale_price))
lines(density(mydata$sale_price), col = "red", lwd = 1)
box()
### TRUE/FALSE are written out: `T` and `F` are plain variables and can be overwritten.
boxplot(
  mydata$sale_price,
  horizontal = FALSE,
  col = "dark grey",
  main = "Box Plot of Sales Price",
  varwidth = TRUE
)
### Box plots of sale price against each categorical attribute.
### aes_string() expects column names as strings; the columns are mapped here with
### aes() and bare column names, which ggplot2 looks up in `mydata`.
ggplot(mydata, aes(x = stories, y = sale_price)) +
  geom_boxplot() +
  xlab("Stories") +
  ylab("Sale Price")
ggplot(mydata, aes(x = num_bedrooms, y = sale_price)) +
  geom_boxplot() +
  xlab("Number of Bedrooms") +
  ylab("Sale Price")
ggplot(mydata, aes(x = full_bathrooms, y = sale_price)) +
  geom_boxplot() +
  xlab("Full Bathrooms") +
  ylab("Sale Price")
ggplot(mydata, aes(x = half_bathrooms, y = sale_price)) +
  geom_boxplot() +
  xlab("Half Bathrooms") +
  ylab("Sale Price")
ggplot(mydata, aes(x = garage_type, y = sale_price)) +
  geom_boxplot() +
  xlab("Garage Type") +
  ylab("Sale Price")
ggplot(mydata, aes(x = has_fireplace, y = sale_price)) +
  geom_boxplot() +
  xlab("Has Fire Place") +
  ylab("Sale Price")
ggplot(mydata, aes(x = has_pool, y = sale_price)) +
  geom_boxplot() +
  xlab("Has Pool") +
  ylab("Sale Price")
ggplot(mydata, aes(x = has_central_heating, y = sale_price)) +
  geom_boxplot() +
  xlab("Has Central Heating") +
  ylab("Sale Price")
ggplot(mydata, aes(x = has_central_cooling, y = sale_price)) +
  geom_boxplot() +
  xlab("Has Central Cooling") +
  ylab("Sale Price")
### Scatter plots of sale price against each continuous attribute, each with a
### straight-line least-squares fit (no confidence band).
### Columns are mapped with aes() and bare names rather than passing data vectors to aes_string().
ggplot(mydata, aes(x = year_built, y = sale_price)) +
  geom_point() +
  geom_smooth(method = "lm", formula = y ~ x, se = FALSE) +
  xlab("Built Year") +
  ylab("Sale Price")
ggplot(mydata, aes(x = livable_sqft, y = sale_price)) +
  geom_point() +
  geom_smooth(method = "lm", formula = y ~ x, se = FALSE) +
  xlab("Livable area") +
  ylab("Sale Price")
ggplot(mydata, aes(x = total_sqft, y = sale_price)) +
  geom_point() +
  geom_smooth(method = "lm", formula = y ~ x, se = FALSE) +
  xlab("Total area") +
  ylab("Sale Price")
ggplot(mydata, aes(x = garage_sqft, y = sale_price)) +
  geom_point() +
  geom_smooth(method = "lm", formula = y ~ x, se = FALSE) +
  xlab("Garage area") +
  ylab("Sale Price")
ggplot(mydata, aes(x = carport_sqft, y = sale_price)) +
  geom_point() +
  geom_smooth(method = "lm", formula = y ~ x, se = FALSE) +
  xlab("Carport area") +
  ylab("Sale Price")
### Distribution of data
### Count of house sales based on house built year.
countperyr_df <- as.data.frame(table(as.factor(mydata$year_built)))
### Most frequent build years first (`TRUE` written out instead of the reassignable `T`).
countperyr_ordered <- countperyr_df[order(countperyr_df$Freq, decreasing = TRUE), ]
### Interested to see whether very old houses sell more often than houses built in recent years. The 10 most frequent build years fall between 1988 and 2016.
head(countperyr_ordered, 10)
## Var1 Freq
## 117 2005 2046
## 116 2004 1914
## 118 2006 1720
## 126 2014 1691
## 127 2015 1671
## 115 2003 1589
## 101 1989 1324
## 100 1988 1307
## 114 2002 1233
## 128 2016 1203
### Check variables one by one and remove outliers.
### sale_price has many outliers.
summary(mydata$sale_price)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 626 270899 378001 413507 497697 21041998
### Minimum price 626 and maximum price 21 million
### highest 30 sale price
#mydata[order(mydata$sale_price, decreasing = T), ][1:30,c(1,2,3,4,5,6,7,17)]
### The first one is definitely an error: livable_sqft 1446 and total_sqft = 1438 with a price of 21 million is definitely an odd one.
### Collect every observation whose livable area exceeds its total area.
odd_data <- subset(mydata, livable_sqft > total_sqft)
### Keep only the rows whose livable area does not exceed the total area.
mydata <- subset(mydata, livable_sqft <= total_sqft)
### Lowest 30 sale price
#mydata[order(mydata$sale_price, decreasing = F), ][1:30,c(1,2,3,4,5,6,7,9, 17)]
#subset(mydata, mydata$sale_price < 100000, c(1,2,3,4,5,6,7,9, 17))
### Looks like there are many odd values in sale_price. For simplicity let's remove the outliers and store them for future reference.
### Number of sale_price outliers as flagged by the boxplot rule.
outlier_values <- boxplot.stats(mydata$sale_price)$out
length(outlier_values)
## [1] 1698
### Outlier fences: 1.5 * IQR above the third and below the first quartile.
Q3 <- quantile(mydata$sale_price, probs = 0.75)
Q1 <- quantile(mydata$sale_price, probs = 0.25)
maximum <- Q3 + 1.5 * IQR(mydata$sale_price)
minimum <- Q1 - 1.5 * IQR(mydata$sale_price)
#Maximum = 834758.8
#Minimum = -47255.25
### Set the rows outside the fences aside in odd_data ...
outlier_price <- subset(mydata, sale_price < minimum | sale_price > maximum)
odd_data <- rbind(odd_data, outlier_price)
### ... and keep only the rows inside the fences.
mydata <- subset(mydata, sale_price >= minimum & sale_price <= maximum)
### Same histogram / rug / density view as before, now on the data with the
### sale_price outliers removed.
hist(
  mydata$sale_price,
  freq = FALSE,
  breaks = 100,
  col = "grey",
  xlab = "Sale Price",
  main = "Histogram, rug plot, density curve"
)
rug(jitter(mydata$sale_price))
lines(density(mydata$sale_price), col = "red", lwd = 1)
box()
### `FALSE` written out: `F` is a plain variable and can be overwritten.
boxplot(
  mydata$sale_price,
  horizontal = FALSE,
  col = "dark grey",
  main = "Box Plot of Sales Price"
)
### Anomaly reduction for sale price.
### Outlier fences recomputed on the reduced data. They are not used by the filter
### below, which applies the fixed 25000 threshold instead of `minimum`/`maximum`.
Q3 <- quantile(mydata$sale_price)[4]
Q1 <- quantile(mydata$sale_price)[2]
# Q3 + IQR * 1.5
maximum <- Q3 + IQR(mydata$sale_price) * 1.5
# Q1 - IQR * 1.5
minimum <- Q1 - IQR(mydata$sale_price) * 1.5
#mydata[order(mydata$sale_price, decreasing = F),]
### Set aside implausibly cheap sales. `<=` makes the two subsets complementary, so a
### sale at exactly 25000 is recorded in odd_data instead of vanishing from both.
temp_data <- subset(mydata, mydata$sale_price <= 25000)
mydata <- subset(mydata, mydata$sale_price > 25000)
odd_data <- rbind(odd_data, temp_data)
summary(mydata)
## year_built stories num_bedrooms full_bathrooms half_bathrooms
## Min. :1852 0: 1 3 :13995 2 :19714 0:16070
## 1st Qu.:1981 1:22270 4 : 9832 1 :10123 1:19560
## Median :1996 2:13016 2 : 7452 3 : 4807
## Mean :1991 3: 225 5 : 3134 4 : 747
## 3rd Qu.:2005 4: 118 1 : 765 0 : 180
## Max. :2017 6 : 368 5 : 49
## (Other): 84 (Other): 10
## livable_sqft total_sqft garage_type garage_sqft
## Min. : -2 Min. : 6 attached:29466 Min. : -4.0
## 1st Qu.: 1403 1st Qu.: 1515 detached: 2236 1st Qu.: 419.0
## Median : 1828 Median : 1969 none : 3928 Median : 466.0
## Mean : 1962 Mean : 2104 Mean : 462.9
## 3rd Qu.: 2452 3rd Qu.: 2617 3rd Qu.: 604.0
## Max. :12406 Max. :15449 Max. :5040.0
##
## carport_sqft has_fireplace has_pool has_central_heating
## Min. : 0.00 FALSE:12621 FALSE:29951 FALSE: 2242
## 1st Qu.: 0.00 TRUE :23009 TRUE : 5679 TRUE :33388
## Median : 0.00
## Mean : 28.43
## 3rd Qu.: 0.00
## Max. :9200.00
##
## has_central_cooling street_name city
## FALSE: 3327 Sanders Inlet : 71 Chadstad : 4248
## TRUE :32303 Jeremy Knolls : 68 Coletown : 3239
## Richardson Throughway: 62 North Erinville: 2405
## Kenneth Plains : 60 Jeffreyhaven : 2316
## Johnson Ville : 56 Port Andrealand: 2210
## Michelle Streets : 56 Hallfort : 2201
## (Other) :35257 (Other) :19011
## sale_price
## Min. : 25196
## 1st Qu.:283496
## Median :381776
## Mean :387050
## 3rd Qu.:486992
## Max. :834751
##
### Box plot of sale price on the current (filtered) data.
boxplot(mydata$sale_price)
### Check summary of the data
summary(mydata)
## year_built stories num_bedrooms full_bathrooms half_bathrooms
## Min. :1852 0: 1 3 :13995 2 :19714 0:16070
## 1st Qu.:1981 1:22270 4 : 9832 1 :10123 1:19560
## Median :1996 2:13016 2 : 7452 3 : 4807
## Mean :1991 3: 225 5 : 3134 4 : 747
## 3rd Qu.:2005 4: 118 1 : 765 0 : 180
## Max. :2017 6 : 368 5 : 49
## (Other): 84 (Other): 10
## livable_sqft total_sqft garage_type garage_sqft
## Min. : -2 Min. : 6 attached:29466 Min. : -4.0
## 1st Qu.: 1403 1st Qu.: 1515 detached: 2236 1st Qu.: 419.0
## Median : 1828 Median : 1969 none : 3928 Median : 466.0
## Mean : 1962 Mean : 2104 Mean : 462.9
## 3rd Qu.: 2452 3rd Qu.: 2617 3rd Qu.: 604.0
## Max. :12406 Max. :15449 Max. :5040.0
##
## carport_sqft has_fireplace has_pool has_central_heating
## Min. : 0.00 FALSE:12621 FALSE:29951 FALSE: 2242
## 1st Qu.: 0.00 TRUE :23009 TRUE : 5679 TRUE :33388
## Median : 0.00
## Mean : 28.43
## 3rd Qu.: 0.00
## Max. :9200.00
##
## has_central_cooling street_name city
## FALSE: 3327 Sanders Inlet : 71 Chadstad : 4248
## TRUE :32303 Jeremy Knolls : 68 Coletown : 3239
## Richardson Throughway: 62 North Erinville: 2405
## Kenneth Plains : 60 Jeffreyhaven : 2316
## Johnson Ville : 56 Port Andrealand: 2210
## Michelle Streets : 56 Hallfort : 2201
## (Other) :35257 (Other) :19011
## sale_price
## Min. : 25196
## 1st Qu.:283496
## Median :381776
## Mean :387050
## 3rd Qu.:486992
## Max. :834751
##
### Plot continuous variables
### One box plot per area column with every observation overlaid as a point.
### Columns are referenced by bare name inside aes(). The box plot's own outlier
### markers are hidden with outlier.shape = NA because geom_point() already draws
### every observation.
ggplot(mydata, aes(x = " ", y = livable_sqft)) +
  geom_boxplot(outlier.shape = NA) +
  geom_point() +
  ylab("Livable Area") +
  xlab("Box Plot of Livable Area")
ggplot(mydata, aes(x = " ", y = total_sqft)) +
  geom_boxplot(outlier.shape = NA) +
  geom_point() +
  ylab("Total Area") +
  xlab("Box Plot of Total Area")
ggplot(mydata, aes(x = " ", y = garage_sqft)) +
  geom_boxplot(outlier.shape = NA) +
  geom_point() +
  ylab("Garage Area") +
  xlab("Box Plot of Garage Area")
ggplot(mydata, aes(x = " ", y = carport_sqft)) +
  geom_boxplot(outlier.shape = NA) +
  geom_point() +
  ylab("Carport Area") +
  xlab("Box Plot of Carport Area")
### Remove the observations related to outliers for each variable and store them in the odd_data data frame.
#### livable_sqft
summary(mydata$livable_sqft)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -2 1403 1828 1962 2452 12406
### Fences at 1.5 * IQR beyond the first and third quartiles.
Q1 <- quantile(mydata$livable_sqft, probs = 0.25)
Q3 <- quantile(mydata$livable_sqft, probs = 0.75)
maximum <- Q3 + 1.5 * IQR(mydata$livable_sqft)
minimum <- Q1 - 1.5 * IQR(mydata$livable_sqft)
### Rows outside the fences go to odd_data; rows inside stay in mydata.
odd_data <- rbind(odd_data, subset(mydata, livable_sqft < minimum | livable_sqft > maximum))
mydata <- subset(mydata, livable_sqft >= minimum & livable_sqft <= maximum)
#mydata[order(mydata$livable_sqft, decreasing = F), ]
#### Move rows with a negative livable_sqft to odd_data as well.
odd_data <- rbind(odd_data, subset(mydata, livable_sqft < 0))
mydata <- subset(mydata, livable_sqft >= 0)
#### total_sqft
summary(mydata$total_sqft)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 87 1513 1959 2084 2594 6351
### Fences at 1.5 * IQR beyond the first and third quartiles.
Q1 <- quantile(mydata$total_sqft, probs = 0.25)
Q3 <- quantile(mydata$total_sqft, probs = 0.75)
maximum <- Q3 + 1.5 * IQR(mydata$total_sqft)
minimum <- Q1 - 1.5 * IQR(mydata$total_sqft)
### Rows outside the fences go to odd_data; rows inside stay in mydata.
odd_data <- rbind(odd_data, subset(mydata, total_sqft < minimum | total_sqft > maximum))
mydata <- subset(mydata, total_sqft >= minimum & total_sqft <= maximum)
#mydata[order(mydata$livable_sqft, decreasing = F), ]
#### Move rows with a total_sqft below 100 to odd_data as well.
#mydata[order(mydata$total_sqft<100, decreasing = T),]
odd_data <- rbind(odd_data, subset(mydata, total_sqft < 100))
mydata <- subset(mydata, total_sqft >= 100)
#### garage_sqft
summary(mydata$garage_sqft)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -4.0 418.0 465.0 460.2 601.0 5040.0
### Garage area by garage type. Bare column names inside aes() also give the axes
### readable default titles.
ggplot(mydata, aes(x = garage_type, y = garage_sqft)) +
  geom_boxplot()
### Fences at 1.5 * IQR beyond the first and third quartiles.
Q1 <- quantile(mydata$garage_sqft)[2]
Q3 <- quantile(mydata$garage_sqft)[4]
maximum <- Q3 + IQR(mydata$garage_sqft) * 1.5
minimum <- Q1 - IQR(mydata$garage_sqft) * 1.5
### The lower-fence test must read `garage_sqft`. The former `mydata$totgarage_sqft`
### does not exist (NULL), which turned the whole condition into a zero-length
### vector, so none of the removed garage outliers were stored in odd_data.
odd_data <- rbind(odd_data, subset(mydata, mydata$garage_sqft < minimum | mydata$garage_sqft > maximum))
mydata <- subset(mydata, !(mydata$garage_sqft < minimum | mydata$garage_sqft > maximum))
#mydata[order(mydata$livable_sqft, decreasing = F), ]
#### Move rows with a garage_sqft below 50 (including negative values) to odd_data.
#### NOTE(review): the summary below shows no remaining garage_type "none" rows
#### after this step - confirm that excluding houses without a garage is intended.
#mydata[order(mydata$garage_sqft, decreasing = F),]
odd_data <- rbind(odd_data, subset(mydata, (mydata$garage_sqft < 50)))
mydata <- subset(mydata, !(mydata$garage_sqft < 50))
summary(mydata)
## year_built stories num_bedrooms full_bathrooms half_bathrooms
## Min. :1852 0: 1 3 :12836 2 :17934 0:13160
## 1st Qu.:1985 1:18361 4 : 9432 1 : 7844 1:17762
## Median :1999 2:12263 2 : 5253 3 : 4557
## Mean :1994 3: 200 5 : 2854 4 : 502
## 3rd Qu.:2006 4: 97 6 : 278 0 : 53
## Max. :2017 1 : 211 5 : 30
## (Other): 58 (Other): 2
## livable_sqft total_sqft garage_type garage_sqft
## Min. : 124 Min. : 128 attached:28800 Min. :144
## 1st Qu.:1485 1st Qu.:1600 detached: 2122 1st Qu.:432
## Median :1905 Median :2046 none : 0 Median :480
## Mean :2022 Mean :2156 Mean :513
## 3rd Qu.:2506 3rd Qu.:2656 3rd Qu.:610
## Max. :4025 Max. :4215 Max. :875
##
## carport_sqft has_fireplace has_pool has_central_heating
## Min. : 0.000 FALSE: 9231 FALSE:25681 FALSE: 1325
## 1st Qu.: 0.000 TRUE :21691 TRUE : 5241 TRUE :29597
## Median : 0.000
## Mean : 3.803
## 3rd Qu.: 0.000
## Max. :1534.000
##
## has_central_cooling street_name city
## FALSE: 2120 Richardson Throughway: 62 Chadstad : 3683
## TRUE :28802 Johnson Ville : 47 Coletown : 2959
## Booker Pines : 45 North Erinville: 2238
## Boyle Brooks : 44 Hallfort : 2124
## Avery Islands : 43 Port Andrealand: 2116
## Olson Fort : 42 Jeffreyhaven : 1887
## (Other) :30639 (Other) :15915
## sale_price
## Min. : 25830
## 1st Qu.:304290
## Median :396899
## Mean :404523
## 3rd Qu.:493294
## Max. :834751
##
### Investigation of categorical data
### stories
### Frequency of each stories level.
table(mydata$stories)
##
## 0 1 2 3 4
## 1 18361 12263 200 97
### Show the single record with zero stories.
mydata[mydata$stories == 0,] ## May be an underground building. Odd one.
## year_built stories num_bedrooms full_bathrooms half_bathrooms
## 35436 1991 0 3 2 0
## livable_sqft total_sqft garage_type garage_sqft carport_sqft
## 35436 1365 1375 attached 550 0
## has_fireplace has_pool has_central_heating has_central_cooling
## 35436 TRUE FALSE TRUE TRUE
## street_name city sale_price
## 35436 Bobby Views Chadstad 390597
### Move the zero-story record to odd_data, then rebuild the factor so the
### now-unused level "0" is dropped.
odd_data <- rbind(odd_data, mydata[mydata$stories == 0, ])
mydata <- mydata[mydata$stories != 0, ]
mydata$stories <- factor(mydata$stories)
### num_bedrooms
table(mydata$num_bedrooms)
##
## 0 1 2 3 4 5 6 7 8 9 10 11
## 13 211 5253 12835 9432 2854 278 28 10 1 0 0
## 13 14 21 31
## 1 2 0 3
### Rebuild the factor so that levels without any remaining rows are dropped.
mydata$num_bedrooms <- factor(mydata$num_bedrooms)
### Check 0 bedrooms houses.
table(mydata$num_bedrooms)
##
## 0 1 2 3 4 5 6 7 8 9 13 14
## 13 211 5253 12835 9432 2854 278 28 10 1 1 2
## 31
## 3
### Move the zero-bedroom houses to odd_data and drop the emptied level.
odd_data <- rbind(odd_data, subset(mydata, num_bedrooms == 0))
mydata <- subset(mydata, num_bedrooms != 0)
mydata$num_bedrooms <- factor(mydata$num_bedrooms)
### Inspect the houses recorded with 31 bedrooms.
subset(mydata, num_bedrooms == 31)
## year_built stories num_bedrooms full_bathrooms half_bathrooms
## 6331 2014 2 31 2 1
## 23846 2015 2 31 2 1
## 33539 2016 1 31 2 0
## livable_sqft total_sqft garage_type garage_sqft carport_sqft
## 6331 1717 1758 attached 434 0
## 23846 2549 2654 attached 612 0
## 33539 1856 1890 attached 412 0
## has_fireplace has_pool has_central_heating has_central_cooling
## 6331 FALSE FALSE TRUE TRUE
## 23846 FALSE FALSE TRUE TRUE
## 33539 FALSE FALSE TRUE TRUE
## street_name city sale_price
## 6331 Stanley Islands Coletown 497073
## 23846 Barbara Roads Davidfort 470607
## 33539 Steven Cape Davidfort 406349
### Move the 31-bedroom records to odd_data and drop the emptied level.
odd_data <- rbind(odd_data, subset(mydata, num_bedrooms == 31))
mydata <- subset(mydata, num_bedrooms != 31)
mydata$num_bedrooms <- factor(mydata$num_bedrooms)
### Inspect the houses recorded with 8 to 14 bedrooms.
subset(mydata, num_bedrooms %in% 8:14)
## year_built stories num_bedrooms full_bathrooms half_bathrooms
## 2096 2001 2 8 2 1
## 4670 2000 2 8 2 1
## 4673 1999 2 8 3 1
## 4674 1996 2 8 3 1
## 5888 2013 1 14 3 1
## 6577 2011 1 13 3 1
## 10044 1998 2 8 3 1
## 14285 1972 1 9 3 1
## 15466 1998 2 8 3 1
## 16879 2016 2 14 3 0
## 35049 2000 2 8 2 1
## 35052 2003 2 8 2 1
## 36807 2001 2 8 2 1
## 36812 1997 2 8 3 1
## livable_sqft total_sqft garage_type garage_sqft carport_sqft
## 2096 2904 2954 attached 579 0
## 4670 2678 2724 attached 580 0
## 4673 2700 2745 attached 776 0
## 4674 2709 2751 attached 774 0
## 5888 2761 3303 attached 585 0
## 6577 2397 2888 attached 546 0
## 10044 2709 2748 attached 776 0
## 14285 3392 3484 attached 444 0
## 15466 2711 2748 attached 776 0
## 16879 2917 3131 attached 475 0
## 35049 2665 2707 attached 582 0
## 35052 2668 2715 attached 584 0
## 36807 2684 2724 attached 589 0
## 36812 2711 2752 attached 775 0
## has_fireplace has_pool has_central_heating has_central_cooling
## 2096 TRUE TRUE TRUE TRUE
## 4670 TRUE FALSE TRUE TRUE
## 4673 TRUE TRUE TRUE TRUE
## 4674 TRUE TRUE TRUE TRUE
## 5888 TRUE FALSE TRUE TRUE
## 6577 FALSE TRUE TRUE TRUE
## 10044 TRUE TRUE TRUE TRUE
## 14285 FALSE FALSE TRUE TRUE
## 15466 TRUE TRUE TRUE TRUE
## 16879 TRUE FALSE TRUE TRUE
## 35049 TRUE FALSE TRUE TRUE
## 35052 TRUE FALSE TRUE TRUE
## 36807 TRUE TRUE TRUE TRUE
## 36812 TRUE TRUE TRUE TRUE
## street_name city sale_price
## 2096 Gilbert Valley East Lucas 409496
## 4670 Obrien Pine North Erinville 453601
## 4673 Kennedy Hill North Erinville 398157
## 4674 Edward Viaduct North Erinville 554397
## 5888 Nancy Bridge South Anthony 513451
## 6577 Clarke Hollow South Anthony 793796
## 10044 Mark Circles North Erinville 485097
## 14285 Cooper Point Chadstad 553138
## 15466 Obrien Pine North Erinville 522904
## 16879 Stephen Cove Coletown 612363
## 35049 Nelson Haven Hallfort 403197
## 35052 Joseph Stream Hallfort 403204
## 36807 Reid Junctions North Erinville 409502
## 36812 Tina View North Erinville 530464
### Move the 8-to-14-bedroom records to odd_data and drop the emptied levels.
odd_data <- rbind(odd_data, subset(mydata, num_bedrooms %in% 8:14))
mydata <- subset(mydata, !(num_bedrooms %in% 8:14))
mydata$num_bedrooms <- factor(mydata$num_bedrooms)
### full_bathrooms
### Frequency of each full_bathrooms level.
table(mydata$full_bathrooms)
##
## 0 1 2 3 4 5 6 7 8
## 50 7838 17922 4547 502 30 2 0 0
### Houses with neither a full nor a half bathroom.
subset(mydata, full_bathrooms == 0 & half_bathrooms == 0)
## [1] year_built stories num_bedrooms
## [4] full_bathrooms half_bathrooms livable_sqft
## [7] total_sqft garage_type garage_sqft
## [10] carport_sqft has_fireplace has_pool
## [13] has_central_heating has_central_cooling street_name
## [16] city sale_price
## <0 rows> (or 0-length row.names)
### Houses without any full bathroom.
subset(mydata, full_bathrooms == 0)
## year_built stories num_bedrooms full_bathrooms half_bathrooms
## 51 1955 1 1 0 1
## 506 2002 1 2 0 1
## 1543 1937 1 1 0 1
## 1902 1947 1 1 0 1
## 2234 1948 1 1 0 1
## 2293 1922 1 1 0 1
## 2800 1928 1 1 0 1
## 3351 1930 1 3 0 1
## 3579 2000 1 2 0 1
## 5430 1954 1 1 0 1
## 7057 1950 1 2 0 1
## 7552 1924 1 1 0 1
## 10163 1902 1 3 0 1
## 10876 1947 1 2 0 1
## 12545 1957 1 1 0 1
## 12920 1932 1 2 0 1
## 12982 1924 1 1 0 1
## 13993 1968 2 1 0 1
## 14117 1921 1 2 0 1
## 14137 1909 1 2 0 1
## 14335 1950 1 1 0 1
## 14627 1977 1 1 0 1
## 15829 1966 1 2 0 1
## 17481 1949 1 2 0 1
## 18092 1962 1 2 0 1
## 18099 1953 1 2 0 1
## 19973 1948 1 1 0 1
## 19988 1941 1 2 0 1
## 21120 1993 1 1 0 1
## 24828 1945 1 3 0 1
## 24829 1946 1 3 0 1
## 26547 1994 1 1 0 1
## 27099 1982 1 1 0 1
## 28794 1938 1 1 0 1
## 30339 1954 1 3 0 1
## 30395 1946 1 1 0 1
## 30443 1953 1 2 0 1
## 30902 1939 2 1 0 1
## 32738 1921 1 2 0 1
## 33879 1948 1 1 0 1
## 34352 1938 1 2 0 1
## 35507 1920 1 3 0 1
## 36935 1996 2 1 0 1
## 38362 1938 1 1 0 1
## 39249 1946 1 1 0 1
## 39468 1938 1 2 0 1
## 39469 1948 1 3 0 1
## 39625 1949 1 2 0 1
## 42369 1994 1 1 0 1
## 42378 1958 1 2 0 1
## livable_sqft total_sqft garage_type garage_sqft carport_sqft
## 51 813 980 detached 700 0
## 506 1297 1300 attached 416 0
## 1543 596 645 detached 249 0
## 1902 406 588 detached 401 0
## 2234 571 826 detached 218 0
## 2293 693 1022 detached 289 0
## 2800 429 723 detached 422 0
## 3351 1239 1640 detached 442 0
## 3579 954 956 attached 572 0
## 5430 758 817 attached 222 0
## 7057 943 1084 detached 397 0
## 7552 835 929 attached 224 0
## 10163 917 1125 detached 181 0
## 10876 723 1112 detached 361 0
## 12545 608 754 attached 213 0
## 12920 620 775 detached 322 0
## 12982 732 836 detached 397 0
## 13993 796 841 attached 844 0
## 14117 921 1059 detached 194 0
## 14137 1162 1470 detached 358 0
## 14335 602 842 attached 221 0
## 14627 478 482 detached 575 0
## 15829 1061 1411 attached 312 0
## 17481 997 1004 detached 275 0
## 18092 703 808 attached 395 0
## 18099 687 976 detached 283 0
## 19973 646 782 detached 404 0
## 19988 1228 1236 attached 332 0
## 21120 1066 1175 attached 326 0
## 24828 989 1289 detached 315 0
## 24829 989 1290 detached 310 0
## 26547 1047 1170 attached 324 0
## 27099 817 879 attached 424 0
## 28794 556 566 detached 216 0
## 30339 966 1116 detached 764 0
## 30395 594 863 detached 477 0
## 30443 826 1026 attached 292 0
## 30902 707 850 attached 680 0
## 32738 762 981 detached 277 0
## 33879 543 696 detached 244 0
## 34352 892 1033 detached 183 0
## 35507 1534 1760 detached 519 0
## 36935 1445 1446 attached 764 0
## 38362 806 1035 detached 305 307
## 39249 888 1218 detached 440 0
## 39468 962 1186 detached 440 0
## 39469 1011 1146 detached 324 263
## 39625 817 903 detached 172 0
## 42369 1053 1164 attached 329 0
## 42378 772 1184 detached 478 690
## has_fireplace has_pool has_central_heating has_central_cooling
## 51 FALSE FALSE FALSE FALSE
## 506 TRUE TRUE TRUE TRUE
## 1543 FALSE FALSE FALSE FALSE
## 1902 FALSE FALSE FALSE FALSE
## 2234 FALSE FALSE FALSE FALSE
## 2293 FALSE FALSE FALSE FALSE
## 2800 FALSE FALSE FALSE FALSE
## 3351 FALSE FALSE FALSE FALSE
## 3579 FALSE FALSE FALSE FALSE
## 5430 FALSE TRUE FALSE TRUE
## 7057 FALSE TRUE FALSE FALSE
## 7552 FALSE FALSE FALSE FALSE
## 10163 FALSE FALSE FALSE FALSE
## 10876 FALSE FALSE FALSE FALSE
## 12545 TRUE FALSE FALSE FALSE
## 12920 FALSE FALSE FALSE FALSE
## 12982 FALSE FALSE FALSE FALSE
## 13993 FALSE FALSE TRUE TRUE
## 14117 TRUE FALSE TRUE TRUE
## 14137 FALSE FALSE FALSE FALSE
## 14335 FALSE FALSE FALSE FALSE
## 14627 FALSE FALSE FALSE FALSE
## 15829 FALSE FALSE FALSE FALSE
## 17481 FALSE FALSE FALSE FALSE
## 18092 FALSE FALSE FALSE FALSE
## 18099 FALSE FALSE FALSE FALSE
## 19973 FALSE FALSE FALSE FALSE
## 19988 TRUE FALSE FALSE FALSE
## 21120 FALSE FALSE TRUE TRUE
## 24828 FALSE FALSE FALSE FALSE
## 24829 FALSE FALSE FALSE FALSE
## 26547 FALSE FALSE TRUE TRUE
## 27099 FALSE FALSE TRUE TRUE
## 28794 FALSE FALSE FALSE FALSE
## 30339 FALSE FALSE FALSE FALSE
## 30395 FALSE FALSE FALSE FALSE
## 30443 FALSE FALSE FALSE FALSE
## 30902 FALSE FALSE FALSE FALSE
## 32738 FALSE FALSE FALSE FALSE
## 33879 FALSE FALSE FALSE FALSE
## 34352 FALSE FALSE FALSE FALSE
## 35507 FALSE FALSE TRUE FALSE
## 36935 FALSE FALSE FALSE FALSE
## 38362 TRUE FALSE FALSE FALSE
## 39249 FALSE FALSE FALSE FALSE
## 39468 FALSE FALSE FALSE FALSE
## 39469 FALSE FALSE FALSE FALSE
## 39625 FALSE FALSE FALSE FALSE
## 42369 FALSE FALSE TRUE TRUE
## 42378 FALSE FALSE FALSE FALSE
## street_name city sale_price
## 51 William Valley Lake Christinaport 126001
## 506 Darius Row South Anthony 321302
## 1543 Smith View Chadstad 157504
## 1902 Adams Stream Scottberg 283498
## 2234 Roger Cliffs Davidfort 214199
## 2293 Patterson Centers Davidfort 233103
## 2800 Fields Groves Lake Dariusborough 207899
## 3351 Ana Glen Chadstad 409500
## 3579 Fuller Light Chadstad 466196
## 5430 Angela Harbor Lake Christinaport 283501
## 7057 Cheyenne Park Jeffreyhaven 103318
## 7552 Lewis Hollow Davidfort 277203
## 10163 Adams Stream North Erinville 173879
## 10876 Wanda Crest Lake Christinaport 119704
## 12545 Mitchell Lodge Scottberg 214200
## 12920 Davies Centers Davidfort 126001
## 12982 Price Freeway Davidfort 440996
## 13993 Richardson Shores Chadstad 810806
## 14117 King Meadow Chadstad 333901
## 14137 Reginald Circle Chadstad 197819
## 14335 Hunter Passage West Gregoryview 114659
## 14627 Tammy Manor Chadstad 459897
## 15829 Hernandez Fort Leahview 37800
## 17481 Smith Expressway Hallfort 207901
## 18092 Robinson Canyon East Janiceville 220496
## 18099 Alexis Fork East Lucas 504002
## 19973 Ryan Lights Justinport 396898
## 19988 Woodard Junction Justinport 390600
## 21120 Katherine Passage Lewishaven 233101
## 24828 Billy Brook Chadstad 313736
## 24829 Billy Brook Chadstad 200339
## 26547 Katherine Passage Lewishaven 233098
## 27099 Kevin Wall Lewishaven 195302
## 28794 Little Bypass East Lucas 149308
## 30339 Ryan Lights Chadstad 352796
## 30395 Victor Springs Chadstad 95757
## 30443 Scott Brook Chadstad 274676
## 30902 Mitchell Lodge Coletown 520384
## 32738 Cody Roads West Terrence 176396
## 33879 Harrell Avenue Jeffreyhaven 90088
## 34352 Elizabeth Walk Davidfort 251998
## 35507 Abbott View Chadstad 472502
## 36935 Gonzalez Hollow North Erinville 535501
## 38362 Pham Station Lake Carolyn 239398
## 39249 Stephanie Lake Scottberg 103948
## 39468 Aaron Mill East Lucas 113397
## 39469 Jennifer Isle East Lucas 437216
## 39625 Velasquez Track Davidfort 214203
## 42369 Perez Mount Lewishaven 233099
## 42378 Hernandez Fort Leahview 65517
## Inspect the houses that report five full bathrooms, alongside size and price.
subset(
  mydata,
  full_bathrooms == 5,
  select = c(full_bathrooms, half_bathrooms, total_sqft, sale_price)
)
## full_bathrooms half_bathrooms total_sqft sale_price
## 1510 5 0 3803 560700
## 3224 5 0 3763 718203
## 4193 5 0 3636 693002
## 4765 5 1 3004 441000
## 6340 5 1 4215 803250
## 6729 5 0 3696 584012
## 6730 5 0 3690 620554
## 6752 5 0 3695 650164
## 8694 5 0 3136 585902
## 12969 5 0 2776 440996
## 13778 5 0 2915 566998
## 17034 5 1 4209 715680
## 17476 5 0 3815 631264
## 17538 5 0 3802 440999
## 22782 5 0 3813 598497
## 22783 5 0 3700 612362
## 22784 5 0 3693 752849
## 27864 5 0 3614 706860
## 28202 5 0 3700 663391
## 33223 5 0 3642 740884
## 33546 5 0 3817 636934
## 33568 5 0 3813 652051
## 36792 5 0 3781 522903
## 38545 5 0 3693 779310
## 38736 5 1 2507 415800
## 38737 5 1 2509 415171
## 38791 5 0 3731 618656
## 38812 5 0 3814 645121
## 38826 5 0 3813 729538
## 39681 5 0 2777 289800
## Treat the number of full bathrooms as a categorical variable.
mydata$full_bathrooms <- factor(mydata$full_bathrooms)
### half_bathrooms
table(mydata[["half_bathrooms"]])
##
## 0 1
## 13149 17742
### garage_type
table(mydata[["garage_type"]])
##
## attached detached none
## 28771 2120 0
## factor() rebuilds the levels from the values present, so the empty
## "none" level shown in the table above is dropped.
mydata$garage_type <- factor(mydata$garage_type)
### has_fireplace
table(mydata[["has_fireplace"]])
##
## FALSE TRUE
## 9218 21673
### has_pool
table(mydata[["has_pool"]])
##
## FALSE TRUE
## 25665 5226
### has_central_heating
table(mydata[["has_central_heating"]])
##
## FALSE TRUE
## 1320 29571
### has_central_cooling
table(mydata[["has_central_cooling"]])
##
## FALSE TRUE
## 2115 28776
### Data clean up is done.
### I will check Multicollinearity in the data.
## Corplot/Corrplotmatrix
names(mydata)
## [1] "year_built" "stories" "num_bedrooms"
## [4] "full_bathrooms" "half_bathrooms" "livable_sqft"
## [7] "total_sqft" "garage_type" "garage_sqft"
## [10] "carport_sqft" "has_fireplace" "has_pool"
## [13] "has_central_heating" "has_central_cooling" "street_name"
## [16] "city" "sale_price"
## Pairwise correlation matrix of the quantitative columns.
m <- cor(mydata[quantitative_var])
## Lower triangle only, with the coefficients printed as numbers.
corrplot(m, type = "lower", method = "number")
## Same matrix reordered by hierarchical clustering, with 3 cluster rectangles.
corrplot(
  m,
  method = "number",
  order = "hclust",
  addrect = 3,
  tl.col = "black",
  tl.srt = 30
)
### sale_price has very low correlation with carport_sqft. So, I will remove carport_sqft.
### total_sqft and livable_sqft highly correlated. I will remove livable_sqft.
names(mydata)
## [1] "year_built" "stories" "num_bedrooms"
## [4] "full_bathrooms" "half_bathrooms" "livable_sqft"
## [7] "total_sqft" "garage_type" "garage_sqft"
## [10] "carport_sqft" "has_fireplace" "has_pool"
## [13] "has_central_heating" "has_central_cooling" "street_name"
## [16] "city" "sale_price"
## Drop livable_sqft (highly correlated with total_sqft) and carport_sqft
## (very low correlation with sale_price). The columns are removed by name
## rather than by position (6 and 10), so the step stays correct if the
## column order ever changes and is a no-op if it is re-run.
mydata <- mydata[, !(names(mydata) %in% c("livable_sqft", "carport_sqft")), drop = FALSE]
## One-way ANOVA of sale_price against each categorical variable, to see
## whether the mean sale price differs between the levels of that variable.
summary(aov(sale_price ~ stories, data = mydata))
## Df Sum Sq Mean Sq F value Pr(>F)
## stories 3 5.225e+13 1.742e+13 913.9 <2e-16 ***
## Residuals 30887 5.886e+14 1.906e+10
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summary(aov(sale_price ~ num_bedrooms, data = mydata))
## Df Sum Sq Mean Sq F value Pr(>F)
## num_bedrooms 6 1.061e+14 1.768e+13 1021 <2e-16 ***
## Residuals 30884 5.348e+14 1.732e+10
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summary(aov(sale_price ~ full_bathrooms, data = mydata))
## Df Sum Sq Mean Sq F value Pr(>F)
## full_bathrooms 6 1.329e+14 2.215e+13 1347 <2e-16 ***
## Residuals 30884 5.079e+14 1.645e+10
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summary(aov(sale_price ~ half_bathrooms, data = mydata))
## Df Sum Sq Mean Sq F value Pr(>F)
## half_bathrooms 1 1.442e+12 1.442e+12 69.67 <2e-16 ***
## Residuals 30889 6.394e+14 2.070e+10
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summary(aov(sale_price ~ garage_type, data = mydata))
## Df Sum Sq Mean Sq F value Pr(>F)
## garage_type 1 1.816e+13 1.816e+13 900.6 <2e-16 ***
## Residuals 30889 6.227e+14 2.016e+10
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summary(aov(sale_price ~ has_fireplace, data = mydata))
## Df Sum Sq Mean Sq F value Pr(>F)
## has_fireplace 1 2.958e+13 2.958e+13 1495 <2e-16 ***
## Residuals 30889 6.113e+14 1.979e+10
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summary(aov(sale_price ~ has_pool, data = mydata))
## Df Sum Sq Mean Sq F value Pr(>F)
## has_pool 1 4.100e+13 4.100e+13 2111 <2e-16 ***
## Residuals 30889 5.998e+14 1.942e+10
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summary(aov(sale_price ~ has_central_cooling, data = mydata))
## Df Sum Sq Mean Sq F value Pr(>F)
## has_central_cooling 1 1.951e+13 1.951e+13 970 <2e-16 ***
## Residuals 30889 6.213e+14 2.011e+10
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summary(aov(sale_price ~ has_central_heating, data = mydata))
## Df Sum Sq Mean Sq F value Pr(>F)
## has_central_heating 1 2.156e+13 2.156e+13 1075 <2e-16 ***
## Residuals 30889 6.193e+14 2.005e+10
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#summary(aov(formula = sale_price ~ mydata$street_name,data = mydata))
#summary(aov(formula = mydata$sale_price ~ mydata$city,data = mydata))
### Relabel the levels of the two-level columns as 1 and 2, so that no level
### is labelled with a value that is not greater than 0.
binary_level_cols <- c(
  "has_fireplace", "has_pool", "has_central_heating",
  "has_central_cooling", "garage_type"
)
mydata[binary_level_cols] <- lapply(
  mydata[binary_level_cols],
  function(column) {
    # Existing levels are replaced in their current order: first -> 1, second -> 2
    levels(column) <- c(1, 2)
    column
  }
)
### Write the cleaned data set and the set-aside odd observations to CSV.
### row.names is spelled FALSE rather than F: F is an ordinary variable that
### can be reassigned, whereas FALSE is a reserved word.
write.csv(mydata, "houseprice_model.csv", row.names = FALSE)
write.csv(odd_data, "houseprice_odd_observations.csv", row.names = FALSE)
# model -
### Price prediction
### Classfication 1. Type of house, city
### Clustering
### Regression with PCA
## Name of the cleaned data file written by the data-cleaning step
filename <- "houseprice_model.csv"
## Load the CSV file from the working directory. header is spelled TRUE rather
## than T: T is an ordinary variable that can be reassigned, TRUE is reserved.
inputdata <- read.csv(filename, header = TRUE)
## Show the column types as read from disk
str(inputdata)
## 'data.frame': 30891 obs. of 15 variables:
## $ year_built : int 1978 1958 2004 2006 2005 1979 2005 2006 2003 2004 ...
## $ stories : int 1 1 1 1 1 1 1 1 1 1 ...
## $ num_bedrooms : int 4 3 4 4 3 3 4 4 3 4 ...
## $ full_bathrooms : int 1 1 2 2 2 2 4 2 1 2 ...
## $ half_bathrooms : int 1 1 0 0 0 1 0 1 1 1 ...
## $ total_sqft : int 1859 2002 2277 1749 1672 2365 2254 2679 2000 2197 ...
## $ garage_type : int 1 1 1 1 1 2 1 1 1 1 ...
## $ garage_sqft : int 508 462 479 430 430 532 502 624 428 397 ...
## $ has_fireplace : int 2 2 2 2 2 2 2 2 2 2 ...
## $ has_pool : int 1 1 1 1 1 1 1 1 1 2 ...
## $ has_central_heating: int 2 2 2 2 2 2 2 2 2 2 ...
## $ has_central_cooling: int 2 2 2 2 2 2 2 2 2 2 ...
## $ street_name : Factor w/ 9511 levels "Aaron Cliff",..: 5438 3030 6133 3840 6133 3410 2047 6901 1298 9436 ...
## $ city : Factor w/ 46 levels "Amystad","Brownport",..: 13 13 20 20 20 27 20 20 20 20 ...
## $ sale_price : num 270897 302404 197193 207897 196559 ...
## Columns that were read back from the CSV as integers but are categorical.
categorical_var <- c(
  "stories", "num_bedrooms", "full_bathrooms", "half_bathrooms",
  "garage_type", "has_fireplace", "has_pool",
  "has_central_heating", "has_central_cooling"
)
## Convert each of those columns to a factor in place.
inputdata[categorical_var] <- lapply(inputdata[categorical_var], factor)
dim(inputdata)
## [1] 30891 15
names(inputdata)
## [1] "year_built" "stories" "num_bedrooms"
## [4] "full_bathrooms" "half_bathrooms" "total_sqft"
## [7] "garage_type" "garage_sqft" "has_fireplace"
## [10] "has_pool" "has_central_heating" "has_central_cooling"
## [13] "street_name" "city" "sale_price"
## 30891 observations and 15 attributes.
## Randomly select 5000 observations (about 16% of the data set) for this analysis.
## I will go for validation set approach. 70% training set and 30% test set.
## Min-max normalize a continuous variable: (x - min(x)) / (max(x) - min(x)).
##
## Arguments:
##   x      numeric vector to rescale.
##   na.rm  if TRUE, missing values are ignored when computing the minimum and
##          maximum and stay NA in the result. The default FALSE keeps the
##          previous behaviour, where any NA makes the whole result NA.
## Returns a numeric vector of the same length as x, scaled to [0, 1].
## Edge cases:
##   * empty input returns numeric(0) without the min/max warnings;
##   * a constant vector returns zeros instead of NaN (0 / 0).
normalize <- function(x, na.rm = FALSE) {
  if (length(x) == 0L) {
    return(numeric(0))
  }
  if (na.rm && all(is.na(x))) {
    # Nothing to scale; avoids range() warning about no non-missing values
    return(rep(NA_real_, length(x)))
  }
  bounds <- range(x, na.rm = na.rm)
  lo <- bounds[1]
  span <- bounds[2] - lo
  if (isTRUE(span == 0)) {
    # All observed values are equal: map them to 0 and keep any NA as NA
    return(0 * (x - lo))
  }
  (x - lo) / span
}
## Drop the columns that are not used for modelling. They are removed by name
## (previously by positions 1, 13 and 14) so the step does not silently drop
## the wrong columns if the column order of the input file changes.
workdata <- inputdata[, !(names(inputdata) %in% c("year_built", "street_name", "city")), drop = FALSE]
#workdata <- scale(workdata, center = T, scale = T)
## Draw a reproducible random sample of 5000 rows.
set.seed(125)
index <- sample(seq_len(nrow(workdata)), 5000)
sampledata <- workdata[index, ]
sampledata <- as.data.frame(sampledata)
## Normalize sale_price, total_sqft, garage_sqft to the [0, 1] range.
## NOTE(review): the min/max are taken over the whole sample, before the
## train/test split further down, so test rows influence the scaling.
## Consider deriving the scaling from the training rows only.
sampledata$sale_price <- normalize(sampledata$sale_price)
sampledata$total_sqft <- normalize(sampledata$total_sqft)
sampledata$garage_sqft <- normalize(sampledata$garage_sqft)
summary(sampledata)
## stories num_bedrooms full_bathrooms half_bathrooms total_sqft
## 1:2942 1: 27 0: 9 0:2118 Min. :0.0000
## 2:2012 2: 867 1:1274 1:2882 1st Qu.:0.3043
## 3: 29 3:2027 2:2888 Median :0.4256
## 4: 17 4:1572 3: 752 Mean :0.4531
## 5: 455 4: 73 3rd Qu.:0.5875
## 6: 48 5: 4 Max. :1.0000
## 7: 4 6: 0
## garage_type garage_sqft has_fireplace has_pool has_central_heating
## 1:4657 Min. :0.0000 1:1446 1:4159 1: 205
## 2: 343 1st Qu.:0.3940 2:3554 2: 841 2:4795
## Median :0.4583
## Mean :0.5046
## 3rd Qu.:0.6402
## Max. :1.0000
##
## has_central_cooling sale_price
## 1: 342 Min. :0.0000
## 2:4658 1st Qu.:0.3406
## Median :0.4583
## Mean :0.4669
## 3rd Qu.:0.5802
## Max. :1.0000
##
# Expand every categorical column into one 0/1 indicator column per level.
sampledata <- dummy.data.frame(
  sampledata,
  names = c(
    "stories", "num_bedrooms", "full_bathrooms", "half_bathrooms",
    "garage_type", "has_fireplace", "has_pool",
    "has_central_cooling", "has_central_heating"
  )
)
names(sampledata)
## [1] "stories1" "stories2" "stories3"
## [4] "stories4" "num_bedrooms1" "num_bedrooms2"
## [7] "num_bedrooms3" "num_bedrooms4" "num_bedrooms5"
## [10] "num_bedrooms6" "num_bedrooms7" "full_bathrooms0"
## [13] "full_bathrooms1" "full_bathrooms2" "full_bathrooms3"
## [16] "full_bathrooms4" "full_bathrooms5" "half_bathrooms0"
## [19] "half_bathrooms1" "total_sqft" "garage_type1"
## [22] "garage_type2" "garage_sqft" "has_fireplace1"
## [25] "has_fireplace2" "has_pool1" "has_pool2"
## [28] "has_central_heating1" "has_central_heating2" "has_central_cooling1"
## [31] "has_central_cooling2" "sale_price"
# Check the structure of the encoded data frame.
str(sampledata)
## 'data.frame': 5000 obs. of 32 variables:
## $ stories1 : int 1 0 0 0 1 0 0 1 0 0 ...
## $ stories2 : int 0 1 1 1 0 1 1 0 1 1 ...
## $ stories3 : int 0 0 0 0 0 0 0 0 0 0 ...
## $ stories4 : int 0 0 0 0 0 0 0 0 0 0 ...
## $ num_bedrooms1 : int 0 0 0 0 0 0 0 0 0 0 ...
## $ num_bedrooms2 : int 0 1 0 0 0 0 0 1 0 0 ...
## $ num_bedrooms3 : int 1 0 0 1 1 0 0 0 0 0 ...
## $ num_bedrooms4 : int 0 0 1 0 0 1 1 0 0 1 ...
## $ num_bedrooms5 : int 0 0 0 0 0 0 0 0 1 0 ...
## $ num_bedrooms6 : int 0 0 0 0 0 0 0 0 0 0 ...
## $ num_bedrooms7 : int 0 0 0 0 0 0 0 0 0 0 ...
## $ full_bathrooms0 : int 0 0 0 0 0 0 0 0 0 0 ...
## $ full_bathrooms1 : int 1 0 0 0 0 0 0 1 0 0 ...
## $ full_bathrooms2 : int 0 1 1 1 1 1 1 0 0 0 ...
## $ full_bathrooms3 : int 0 0 0 0 0 0 0 0 1 1 ...
## $ full_bathrooms4 : int 0 0 0 0 0 0 0 0 0 0 ...
## $ full_bathrooms5 : int 0 0 0 0 0 0 0 0 0 0 ...
## $ half_bathrooms0 : int 0 1 0 0 0 0 0 1 1 1 ...
## $ half_bathrooms1 : int 1 0 1 1 1 1 1 0 0 0 ...
## $ total_sqft : num 0.536 0.615 0.637 0.332 0.573 ...
## $ garage_type1 : int 1 1 1 1 1 1 1 0 1 1 ...
## $ garage_type2 : int 0 0 0 0 0 0 0 1 0 0 ...
## $ garage_sqft : num 0.317 0.443 0.635 0.436 0.668 ...
## $ has_fireplace1 : int 0 0 0 0 0 0 0 0 0 0 ...
## $ has_fireplace2 : int 1 1 1 1 1 1 1 1 1 1 ...
## $ has_pool1 : int 1 1 1 1 1 1 1 1 1 0 ...
## $ has_pool2 : int 0 0 0 0 0 0 0 0 0 1 ...
## $ has_central_heating1: int 0 0 0 0 0 0 0 1 0 0 ...
## $ has_central_heating2: int 1 1 1 1 1 1 1 0 1 1 ...
## $ has_central_cooling1: int 1 0 0 0 0 0 0 1 0 0 ...
## $ has_central_cooling2: int 0 1 1 1 1 1 1 0 1 1 ...
## $ sale_price : num 0.615 0.647 0.686 0.482 0.666 ...
## - attr(*, "dummies")=List of 9
## ..$ stories : int 1 2 3 4
## ..$ num_bedrooms : int 5 6 7 8 9 10 11
## ..$ full_bathrooms : int 12 13 14 15 16 17
## ..$ half_bathrooms : int 18 19
## ..$ garage_type : int 21 22
## ..$ has_fireplace : int 24 25
## ..$ has_pool : int 26 27
## ..$ has_central_heating: int 28 29
## ..$ has_central_cooling: int 30 31
## Now we have 32 variables (31 predictors plus sale_price) and 5000 observations.
## Model Development
## Validation-set approach: 70% of the sampled rows train, the rest test.
set.seed(125)
training_index <- sample(seq_len(nrow(sampledata)), nrow(sampledata) * .7)
training <- sampledata[training_index, ]
testing <- sampledata[-training_index, ]
## Decision tree (regression tree: method = "anova" with a numeric response)
# Fit the tree on the training data using every other column as a predictor
fit_DT <- rpart(formula = sale_price ~ ., data = training, method = "anova")
# Print the complexity table, variable importance and per-node details
summary(fit_DT)
## Call:
## rpart(formula = sale_price ~ ., data = training, method = "anova")
## n= 3500
##
## CP nsplit rel error xerror xstd
## 1 0.31943386 0 1.0000000 1.0006793 0.02384119
## 2 0.05168657 1 0.6805661 0.6999094 0.01804805
## 3 0.05060668 2 0.6288796 0.6444641 0.01684658
## 4 0.01448357 3 0.5782729 0.5974777 0.01622982
## 5 0.01071923 4 0.5637893 0.5821069 0.01595812
## 6 0.01044152 5 0.5530701 0.5727082 0.01583220
## 7 0.01000000 6 0.5426286 0.5702178 0.01581326
##
## Variable importance
## total_sqft garage_sqft stories1 stories2
## 39 16 10 10
## full_bathrooms1 full_bathrooms3 num_bedrooms2 full_bathrooms2
## 9 8 1 1
## num_bedrooms5 has_pool1 has_pool2 num_bedrooms3
## 1 1 1 1
## num_bedrooms4
## 1
##
## Node number 1: 3500 observations, complexity param=0.3194339
## mean=0.4679249, MSE=0.03241327
## left son=2 (1839 obs) right son=3 (1661 obs)
## Primary splits:
## total_sqft < 0.4403462 to the left, improve=0.31943390, (0 missing)
## garage_sqft < 0.5314637 to the left, improve=0.17990530, (0 missing)
## full_bathrooms1 < 0.5 to the right, improve=0.13682310, (0 missing)
## num_bedrooms2 < 0.5 to the right, improve=0.09494075, (0 missing)
## full_bathrooms3 < 0.5 to the left, improve=0.09296820, (0 missing)
## Surrogate splits:
## garage_sqft < 0.5506156 to the left, agree=0.758, adj=0.490, (0 split)
## stories1 < 0.5 to the right, agree=0.691, adj=0.348, (0 split)
## stories2 < 0.5 to the left, agree=0.691, adj=0.348, (0 split)
## full_bathrooms1 < 0.5 to the right, agree=0.669, adj=0.303, (0 split)
## full_bathrooms3 < 0.5 to the left, agree=0.653, adj=0.270, (0 split)
##
## Node number 2: 1839 observations, complexity param=0.05060668
## mean=0.3712206, MSE=0.01881842
## left son=4 (976 obs) right son=5 (863 obs)
## Primary splits:
## total_sqft < 0.3149134 to the left, improve=0.16589520, (0 missing)
## num_bedrooms2 < 0.5 to the right, improve=0.09454217, (0 missing)
## garage_sqft < 0.2414501 to the left, improve=0.06922495, (0 missing)
## has_fireplace1 < 0.5 to the right, improve=0.06033045, (0 missing)
## has_fireplace2 < 0.5 to the left, improve=0.06033045, (0 missing)
## Surrogate splits:
## full_bathrooms1 < 0.5 to the right, agree=0.622, adj=0.195, (0 split)
## num_bedrooms4 < 0.5 to the left, agree=0.620, adj=0.191, (0 split)
## garage_sqft < 0.4562244 to the left, agree=0.607, adj=0.162, (0 split)
## full_bathrooms2 < 0.5 to the left, agree=0.604, adj=0.156, (0 split)
## stories1 < 0.5 to the right, agree=0.591, adj=0.129, (0 split)
##
## Node number 3: 1661 observations, complexity param=0.05168657
## mean=0.5749925, MSE=0.02564764
## left son=6 (977 obs) right son=7 (684 obs)
## Primary splits:
## total_sqft < 0.6315579 to the left, improve=0.13764220, (0 missing)
## garage_sqft < 0.5164159 to the left, improve=0.04964865, (0 missing)
## has_pool1 < 0.5 to the right, improve=0.04330838, (0 missing)
## has_pool2 < 0.5 to the left, improve=0.04330838, (0 missing)
## full_bathrooms3 < 0.5 to the left, improve=0.02000909, (0 missing)
## Surrogate splits:
## num_bedrooms5 < 0.5 to the left, agree=0.673, adj=0.206, (0 split)
## full_bathrooms3 < 0.5 to the left, agree=0.655, adj=0.162, (0 split)
## garage_sqft < 0.6436389 to the left, agree=0.644, adj=0.135, (0 split)
## full_bathrooms2 < 0.5 to the right, agree=0.636, adj=0.117, (0 split)
## full_bathrooms4 < 0.5 to the left, agree=0.613, adj=0.060, (0 split)
##
## Node number 4: 976 observations, complexity param=0.01448357
## mean=0.3186807, MSE=0.01487658
## left son=8 (361 obs) right son=9 (615 obs)
## Primary splits:
## num_bedrooms2 < 0.5 to the right, improve=0.11316540, (0 missing)
## has_fireplace2 < 0.5 to the left, improve=0.09729332, (0 missing)
## has_fireplace1 < 0.5 to the right, improve=0.09729332, (0 missing)
## garage_sqft < 0.2414501 to the left, improve=0.08566344, (0 missing)
## num_bedrooms3 < 0.5 to the left, improve=0.07126648, (0 missing)
## Surrogate splits:
## num_bedrooms3 < 0.5 to the left, agree=0.880, adj=0.676, (0 split)
## total_sqft < 0.1993342 to the left, agree=0.707, adj=0.208, (0 split)
## garage_sqft < 0.3426813 to the left, agree=0.694, adj=0.172, (0 split)
## has_fireplace1 < 0.5 to the right, agree=0.647, adj=0.044, (0 split)
## has_fireplace2 < 0.5 to the left, agree=0.647, adj=0.044, (0 split)
##
## Node number 5: 863 observations
## mean=0.43064, MSE=0.01662386
##
## Node number 6: 977 observations, complexity param=0.01044152
## mean=0.5252783, MSE=0.02001787
## left son=12 (783 obs) right son=13 (194 obs)
## Primary splits:
## has_pool2 < 0.5 to the left, improve=0.060567860, (0 missing)
## has_pool1 < 0.5 to the right, improve=0.060567860, (0 missing)
## total_sqft < 0.5757656 to the left, improve=0.044638620, (0 missing)
## garage_sqft < 0.5225718 to the left, improve=0.034419710, (0 missing)
## has_central_cooling1 < 0.5 to the left, improve=0.009487644, (0 missing)
## Surrogate splits:
## has_pool1 < 0.5 to the right, agree=1.000, adj=1.00, (0 split)
## garage_sqft < 0.1826265 to the right, agree=0.803, adj=0.01, (0 split)
##
## Node number 7: 684 observations, complexity param=0.01071923
## mean=0.6460024, MSE=0.0251164
## left son=14 (463 obs) right son=15 (221 obs)
## Primary splits:
## total_sqft < 0.7789614 to the left, improve=0.07078494, (0 missing)
## has_pool2 < 0.5 to the left, improve=0.03501981, (0 missing)
## has_pool1 < 0.5 to the right, improve=0.03501981, (0 missing)
## stories1 < 0.5 to the left, improve=0.02514110, (0 missing)
## stories2 < 0.5 to the right, improve=0.02211745, (0 missing)
## Surrogate splits:
## full_bathrooms4 < 0.5 to the left, agree=0.709, adj=0.100, (0 split)
## garage_sqft < 0.8071135 to the left, agree=0.694, adj=0.054, (0 split)
## num_bedrooms6 < 0.5 to the left, agree=0.687, adj=0.032, (0 split)
## full_bathrooms5 < 0.5 to the left, agree=0.681, adj=0.014, (0 split)
##
## Node number 8: 361 observations
## mean=0.2651267, MSE=0.01315519
##
## Node number 9: 615 observations
## mean=0.3501165, MSE=0.0132153
##
## Node number 12: 783 observations
## mean=0.5079463, MSE=0.01777941
##
## Node number 13: 194 observations
## mean=0.5952319, MSE=0.02294652
##
## Node number 14: 463 observations
## mean=0.6168714, MSE=0.02321827
##
## Node number 15: 221 observations
## mean=0.7070324, MSE=0.02359049
# Predict on the training data (predictor columns only). The columns are
# selected from training's own names rather than from names(testing).
pred_DT_train <- predict(fit_DT, training[, names(training) != "sale_price"])
#rpart.plot::rpart.plot(fit_DT)
# Predict on the testing data
pred_DT_test <- predict(fit_DT, testing[, names(testing) != "sale_price"])
# RMSE / R-squared / MAE on the training data. The response is referenced by
# name instead of by position 32, so this keeps working if the number of
# dummy columns changes.
print(postResample(pred = pred_DT_train, obs = training$sale_price))
## RMSE Rsquared MAE
## 0.1326211 0.4573714 0.1036356
# RMSE Rsquared MAE
# 0.1310250 0.4232061 0.1030601
# RMSE / R-squared / MAE on the testing data (response referenced by name
# rather than by hard-coded column position 32)
print(postResample(pred = pred_DT_test, obs = testing$sale_price))
## RMSE Rsquared MAE
## 0.1367510 0.4325352 0.1065761
## Linear regression
#set.seed(125)
# Ordinary least squares fit of sale_price on all other training columns
fit_LR <- lm(formula = sale_price ~ ., data = training)
# Coefficient table and goodness-of-fit statistics
summary(fit_LR)
##
## Call:
## lm(formula = sale_price ~ ., data = training)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.63185 -0.08246 -0.00696 0.08371 0.60077
##
## Coefficients: (9 not defined because of singularities)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.358477 0.112551 3.185 0.00146 **
## stories1 -0.001875 0.035711 -0.053 0.95812
## stories2 -0.025648 0.035690 -0.719 0.47241
## stories3 -0.021127 0.046123 -0.458 0.64695
## stories4 NA NA NA NA
## num_bedrooms1 0.115640 0.080971 1.428 0.15333
## num_bedrooms2 0.075720 0.074790 1.012 0.31140
## num_bedrooms3 0.092329 0.074551 1.238 0.21563
## num_bedrooms4 0.080023 0.074451 1.075 0.28252
## num_bedrooms5 0.062485 0.074571 0.838 0.40213
## num_bedrooms6 0.027197 0.077420 0.351 0.72539
## num_bedrooms7 NA NA NA NA
## full_bathrooms0 -0.147983 0.094601 -1.564 0.11784
## full_bathrooms1 -0.202890 0.075571 -2.685 0.00729 **
## full_bathrooms2 -0.141605 0.074707 -1.895 0.05811 .
## full_bathrooms3 -0.110856 0.074294 -1.492 0.13576
## full_bathrooms4 -0.088406 0.076422 -1.157 0.24743
## full_bathrooms5 NA NA NA NA
## half_bathrooms0 -0.037954 0.006164 -6.158 8.22e-10 ***
## half_bathrooms1 NA NA NA NA
## total_sqft 0.531363 0.018570 28.614 < 2e-16 ***
## garage_type1 0.001582 0.009563 0.165 0.86860
## garage_type2 NA NA NA NA
## garage_sqft 0.049067 0.016646 2.948 0.00322 **
## has_fireplace1 -0.023393 0.005086 -4.599 4.40e-06 ***
## has_fireplace2 NA NA NA NA
## has_pool1 -0.068618 0.005975 -11.484 < 2e-16 ***
## has_pool2 NA NA NA NA
## has_central_heating1 -0.064806 0.017905 -3.619 0.00030 ***
## has_central_heating2 NA NA NA NA
## has_central_cooling1 0.084015 0.014710 5.711 1.21e-08 ***
## has_central_cooling2 NA NA NA NA
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1277 on 3477 degrees of freedom
## Multiple R-squared: 0.5, Adjusted R-squared: 0.4968
## F-statistic: 158 on 22 and 3477 DF, p-value: < 2.2e-16
#Tune LR model with significant attributes
#fit_LR = lm(sale_price ~ half_bathrooms0 + total_sqft + garage_sqft + has_fireplace1 + has_pool1 + has_central_heating1 + #has_central_cooling1, data = training)
#summary(fit_LR)
#fit_LR = lm(sale_price ~ total_sqft + garage_sqft + has_fireplace1 + has_pool1 + has_central_heating1 + has_central_cooling1, data = #training)
#summary(fit_LR)
# Predict on the training data (predictor columns selected from training's
# own names rather than from names(testing))
pred_LR_train <- predict(fit_LR, training[, names(training) != "sale_price"])
## Warning in predict.lm(fit_LR, training[, names(testing) != "sale_price"]):
## prediction from a rank-deficient fit may be misleading
# Predict on the testing data
pred_LR_test <- predict(fit_LR, testing[, names(testing) != "sale_price"])
## Warning in predict.lm(fit_LR, testing[, names(testing) != "sale_price"]):
## prediction from a rank-deficient fit may be misleading
# RMSE / R-squared / MAE on the training data; the response is referenced by
# name instead of by hard-coded column position 32
print(postResample(pred = pred_LR_train, obs = training$sale_price))
## RMSE Rsquared MAE
## 0.1273099 0.4999640 0.1001539
# RMSE / R-squared / MAE on the testing data
print(postResample(pred = pred_LR_test, obs = testing$sale_price))
## RMSE Rsquared MAE
## 0.1296649 0.4897216 0.1006418
## Random Forest
set.seed(125)
# Fit the forest on the training data with the package defaults
fit_RF <- randomForest(sale_price ~ ., data = training)
# Predict on the training data (predictor columns selected from training's
# own names rather than from names(testing))
pred_RF_train <- predict(fit_RF, training[, names(training) != "sale_price"])
# Predict on the testing data
pred_RF_test <- predict(fit_RF, testing[, names(testing) != "sale_price"])
# RMSE / R-squared / MAE on the training data; the response is referenced by
# name instead of by hard-coded column position 32
print(postResample(pred = pred_RF_train, obs = training$sale_price))
## RMSE Rsquared MAE
## 0.07697968 0.83599538 0.06028191
# RMSE / R-squared / MAE on the testing data
print(postResample(pred = pred_RF_test, obs = testing$sale_price))
## RMSE Rsquared MAE
## 0.12621684 0.51661004 0.09614311
## Gradient boosting. The model is fitted with gbm(), not with the xgboost
## library; the *_XGB object names are kept as they are.
set.seed(125)
# 500 boosted trees on the training data, each allowing up to 2-way interactions
fit_XGB <- gbm(
  formula = sale_price ~ .,
  data = training,
  n.trees = 500,
  interaction.depth = 2
)
## Distribution not specified, assuming gaussian ...
# Relative influence of each predictor
summary(fit_XGB)
## var rel.inf
## total_sqft total_sqft 72.51642947
## garage_sqft garage_sqft 11.67893823
## has_pool2 has_pool2 2.95749330
## num_bedrooms2 num_bedrooms2 1.51573004
## full_bathrooms1 full_bathrooms1 1.36716574
## stories1 stories1 1.15302987
## has_pool1 has_pool1 1.15121070
## has_fireplace2 has_fireplace2 1.03776150
## num_bedrooms6 num_bedrooms6 0.96464375
## has_central_cooling1 has_central_cooling1 0.64381098
## half_bathrooms0 half_bathrooms0 0.49125280
## full_bathrooms2 full_bathrooms2 0.45902342
## num_bedrooms3 num_bedrooms3 0.45736319
## garage_type2 garage_type2 0.41879147
## full_bathrooms3 full_bathrooms3 0.39931495
## garage_type1 garage_type1 0.38331197
## stories2 stories2 0.37498045
## has_fireplace1 has_fireplace1 0.35101053
## num_bedrooms5 num_bedrooms5 0.29200133
## has_central_heating1 has_central_heating1 0.28846560
## full_bathrooms4 full_bathrooms4 0.27843334
## has_central_cooling2 has_central_cooling2 0.24889987
## half_bathrooms1 half_bathrooms1 0.22046680
## has_central_heating2 has_central_heating2 0.19988186
## num_bedrooms1 num_bedrooms1 0.09070008
## num_bedrooms4 num_bedrooms4 0.05988875
## stories3 stories3 0.00000000
## stories4 stories4 0.00000000
## num_bedrooms7 num_bedrooms7 0.00000000
## full_bathrooms0 full_bathrooms0 0.00000000
## full_bathrooms5 full_bathrooms5 0.00000000
# Predict on the training data with all 500 trees (predictor columns selected
# from training's own names rather than from names(testing))
pred_XGB_train <- predict(fit_XGB, training[, names(training) != "sale_price"], n.trees = 500)
# Predict on the testing data with all 500 trees
pred_XGB_test <- predict(fit_XGB, testing[, names(testing) != "sale_price"], n.trees = 500)
# RMSE / R-squared / MAE on the training data; the response is referenced by
# name instead of by hard-coded column position 32
print(postResample(pred = pred_XGB_train, obs = training$sale_price))
## RMSE Rsquared MAE
## 0.11815175 0.57008116 0.09244493
# RMSE / R-squared / MAE on the testing data
print(postResample(pred = pred_XGB_test, obs = testing$sale_price))
## RMSE Rsquared MAE
## 0.1303687 0.4848113 0.1012367
## Dimensionality Reduction using PCA
# Fit the principal components on the predictor columns only. The response
# (sale_price) must not take part in the rotation: otherwise the components
# later used as regressors already encode the value being predicted, and
# projecting new data would require knowing its sale price.
# predict() on this object picks the columns it needs by name, so passing a
# data frame that still contains sale_price as newdata remains valid.
# NOTE: the component summary recorded below was produced while sale_price was
# still included (32 components); re-running now yields 31 components.
prin_comp <- prcomp(training[, names(training) != "sale_price", drop = FALSE])
summary(prin_comp)
## Importance of components:
## PC1 PC2 PC3 PC4 PC5 PC6 PC7
## Standard deviation 0.8517 0.7386 0.6619 0.6233 0.53045 0.50885 0.43017
## Proportion of Variance 0.2150 0.1617 0.1299 0.1152 0.08341 0.07676 0.05485
## Cumulative Proportion 0.2150 0.3767 0.5066 0.6218 0.70520 0.78196 0.83681
## PC8 PC9 PC10 PC11 PC12 PC13
## Standard deviation 0.41580 0.31951 0.29892 0.24655 0.17730 0.14314
## Proportion of Variance 0.05125 0.03026 0.02649 0.01802 0.00932 0.00607
## Cumulative Proportion 0.88806 0.91832 0.94481 0.96283 0.97215 0.97822
## PC14 PC15 PC16 PC17 PC18 PC19
## Standard deviation 0.12827 0.12573 0.09819 0.09596 0.09260 0.07595
## Proportion of Variance 0.00488 0.00469 0.00286 0.00273 0.00254 0.00171
## Cumulative Proportion 0.98310 0.98779 0.99064 0.99337 0.99592 0.99763
## PC20 PC21 PC22 PC23 PC24 PC25
## Standard deviation 0.06515 0.04294 0.03134 0.03060 8.692e-15 6.774e-15
## Proportion of Variance 0.00126 0.00055 0.00029 0.00028 0.000e+00 0.000e+00
## Cumulative Proportion 0.99888 0.99943 0.99972 1.00000 1.000e+00 1.000e+00
## PC26 PC27 PC28 PC29 PC30
## Standard deviation 5.153e-15 3.407e-15 5.741e-16 3.708e-16 3.637e-16
## Proportion of Variance 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00
## Cumulative Proportion 1.000e+00 1.000e+00 1.000e+00 1.000e+00 1.000e+00
## PC31 PC32
## Standard deviation 2.781e-16 1.513e-16
## Proportion of Variance 0.000e+00 0.000e+00
## Cumulative Proportion 1.000e+00 1.000e+00
# Standard deviation of each principal component
std_dev <- prin_comp$sdev
# Variance of each component
pr_var <- std_dev^2
# Share of the total variance carried by each component
prop_varex <- pr_var / sum(pr_var)
# Cumulative scree plot: variance explained by the first k components
plot(
  cumsum(prop_varex),
  type = "b",
  xlab = "Principal Component",
  ylab = "Cumulative Proportion of Variance Explained"
)
# Number of principal components kept: the first 7 explain a bit over 80% of
# the variance according to the cumulative plot above.
n_components <- 7
# Training frame: the response plus the first n_components component scores.
# Previously the frame was cut with [, 1:7] after sale_price had been
# prepended, which kept only PC1..PC6 while the test frame kept PC1..PC7.
train.data <- data.frame(
  sale_price = training$sale_price,
  prin_comp$x[, seq_len(n_components), drop = FALSE]
)
# Project the test rows onto the components fitted on the training rows
test.data <- predict(prin_comp, newdata = testing)
test.data <- as.data.frame(test.data)
# Keep the same first n_components components as in the training frame
test.data <- test.data[, seq_len(n_components), drop = FALSE]
# NOTE: the tree output recorded below was produced with PC1..PC6 only.
## Decision tree on the principal components (regression tree: method = "anova")
# Fit the tree on the PCA-transformed training data
fit_DT <- rpart(formula = sale_price ~ ., data = train.data, method = "anova")
# Print the complexity table, variable importance and per-node details
summary(fit_DT)
## Call:
## rpart(formula = sale_price ~ ., data = train.data, method = "anova")
## n= 3500
##
## CP nsplit rel error xerror xstd
## 1 0.19938976 0 1.0000000 1.0006180 0.02384784
## 2 0.04185881 1 0.8006102 0.8029023 0.02040718
## 3 0.03701896 3 0.7168926 0.7526404 0.01967466
## 4 0.02629120 4 0.6798737 0.7230709 0.01895893
## 5 0.01450642 5 0.6535825 0.6787105 0.01843399
## 6 0.01376076 6 0.6390760 0.6725201 0.01845176
## 7 0.01264261 7 0.6253153 0.6568818 0.01816089
## 8 0.01000000 9 0.6000301 0.6311420 0.01771516
##
## Variable importance
## PC1 PC6 PC2 PC4 PC3 PC5
## 38 21 17 13 8 3
##
## Node number 1: 3500 observations, complexity param=0.1993898
## mean=0.4679249, MSE=0.03241327
## left son=2 (1379 obs) right son=3 (2121 obs)
## Primary splits:
## PC1 < -0.3956497 to the left, improve=0.19938980, (0 missing)
## PC2 < -0.7045554 to the left, improve=0.08676619, (0 missing)
## PC6 < -0.8567894 to the right, improve=0.06508839, (0 missing)
## PC4 < 0.3340385 to the left, improve=0.04248052, (0 missing)
## PC3 < 0.7199597 to the right, improve=0.03457135, (0 missing)
## Surrogate splits:
## PC2 < -0.680802 to the left, agree=0.724, adj=0.299, (0 split)
## PC6 < 0.3787574 to the right, agree=0.660, adj=0.138, (0 split)
## PC3 < -0.5653524 to the left, agree=0.639, adj=0.083, (0 split)
## PC4 < -0.9244089 to the left, agree=0.619, adj=0.032, (0 split)
##
## Node number 2: 1379 observations, complexity param=0.03701896
## mean=0.3682236, MSE=0.02526514
## left son=4 (1166 obs) right son=5 (213 obs)
## Primary splits:
## PC6 < -0.3736896 to the right, improve=0.12053940, (0 missing)
## PC1 < -1.070884 to the left, improve=0.11684440, (0 missing)
## PC3 < 0.6401534 to the right, improve=0.04935712, (0 missing)
## PC2 < -0.8271217 to the left, improve=0.04071553, (0 missing)
## PC4 < 0.275964 to the left, improve=0.03082658, (0 missing)
## Surrogate splits:
## PC3 < -0.8649294 to the right, agree=0.891, adj=0.291, (0 split)
## PC4 < 1.114466 to the left, agree=0.860, adj=0.094, (0 split)
##
## Node number 3: 2121 observations, complexity param=0.04185881
## mean=0.5327473, MSE=0.02639593
## left son=6 (458 obs) right son=7 (1663 obs)
## Primary splits:
## PC6 < 0.3515094 to the right, improve=0.06893502, (0 missing)
## PC4 < 0.2324192 to the left, improve=0.06546717, (0 missing)
## PC2 < -0.4165453 to the left, improve=0.03682105, (0 missing)
## PC5 < 0.5967187 to the left, improve=0.03615933, (0 missing)
## PC1 < 0.8507392 to the left, improve=0.02774672, (0 missing)
## Surrogate splits:
## PC2 < -0.7444782 to the left, agree=0.792, adj=0.037, (0 split)
##
## Node number 4: 1166 observations, complexity param=0.0262912
## mean=0.3446369, MSE=0.02125769
## left son=8 (691 obs) right son=9 (475 obs)
## Primary splits:
## PC1 < -0.721545 to the left, improve=0.12033350, (0 missing)
## PC3 < 0.1877787 to the right, improve=0.04939080, (0 missing)
## PC2 < 1.075145 to the left, improve=0.04535920, (0 missing)
## PC4 < 0.2749883 to the left, improve=0.04030743, (0 missing)
## PC5 < 0.4711739 to the left, improve=0.03565906, (0 missing)
## Surrogate splits:
## PC2 < 0.7670407 to the left, agree=0.854, adj=0.642, (0 split)
## PC3 < -0.7038985 to the right, agree=0.791, adj=0.486, (0 split)
## PC4 < -0.3971587 to the right, agree=0.712, adj=0.293, (0 split)
## PC6 < 0.2197383 to the left, agree=0.708, adj=0.284, (0 split)
## PC5 < -0.5441839 to the right, agree=0.644, adj=0.126, (0 split)
##
## Node number 5: 213 observations
## mean=0.497341, MSE=0.02748589
##
## Node number 6: 458 observations
## mean=0.4514639, MSE=0.01985569
##
## Node number 7: 1663 observations, complexity param=0.04185881
## mean=0.5551332, MSE=0.02587641
## left son=14 (530 obs) right son=15 (1133 obs)
## Primary splits:
## PC4 < -0.5265816 to the left, improve=0.13101940, (0 missing)
## PC1 < 0.9421486 to the left, improve=0.07440345, (0 missing)
## PC5 < 0.5957716 to the left, improve=0.04313677, (0 missing)
## PC6 < 0.2899075 to the left, improve=0.04265583, (0 missing)
## PC2 < -0.4145367 to the left, improve=0.03811003, (0 missing)
## Surrogate splits:
## PC2 < -0.5660033 to the left, agree=0.775, adj=0.294, (0 split)
## PC5 < 0.4645153 to the right, agree=0.740, adj=0.183, (0 split)
## PC3 < 0.7496727 to the right, agree=0.707, adj=0.081, (0 split)
## PC6 < -1.55537 to the left, agree=0.683, adj=0.006, (0 split)
##
## Node number 8: 691 observations
## mean=0.3027036, MSE=0.01879866
##
## Node number 9: 475 observations, complexity param=0.01376076
## mean=0.4056389, MSE=0.01855567
## left son=18 (381 obs) right son=19 (94 obs)
## Primary splits:
## PC4 < 0.2820162 to the left, improve=0.17711820, (0 missing)
## PC5 < 0.6763835 to the left, improve=0.10337390, (0 missing)
## PC6 < 0.2518676 to the right, improve=0.10007340, (0 missing)
## PC2 < 1.075145 to the left, improve=0.08818329, (0 missing)
## PC3 < -0.6859192 to the left, improve=0.08549251, (0 missing)
## Surrogate splits:
## PC5 < 0.473236 to the left, agree=0.895, adj=0.468, (0 split)
## PC2 < 0.9995193 to the left, agree=0.846, adj=0.223, (0 split)
## PC3 < 1.194617 to the left, agree=0.823, adj=0.106, (0 split)
## PC6 < 0.6186065 to the left, agree=0.808, adj=0.032, (0 split)
##
## Node number 14: 530 observations, complexity param=0.01450642
## mean=0.4700004, MSE=0.01550114
## left son=28 (139 obs) right son=29 (391 obs)
## Primary splits:
## PC6 < 0.260784 to the right, improve=0.20031420, (0 missing)
## PC1 < 0.8796377 to the left, improve=0.19173190, (0 missing)
## PC4 < -0.6631524 to the left, improve=0.10831870, (0 missing)
## PC2 < -0.2848248 to the right, improve=0.04906147, (0 missing)
## PC5 < -0.2042186 to the right, improve=0.03720161, (0 missing)
## Surrogate splits:
## PC3 < -0.5485734 to the left, agree=0.796, adj=0.223, (0 split)
## PC5 < 0.5733684 to the right, agree=0.745, adj=0.029, (0 split)
##
## Node number 15: 1133 observations, complexity param=0.01264261
## mean=0.594957, MSE=0.02575356
## left son=30 (1082 obs) right son=31 (51 obs)
## Primary splits:
## PC6 < -0.987985 to the right, improve=0.04185659, (0 missing)
## PC1 < -0.1246466 to the left, improve=0.04132996, (0 missing)
## PC2 < -0.7385828 to the left, improve=0.04030826, (0 missing)
## PC5 < 0.4921896 to the left, improve=0.02653610, (0 missing)
## PC4 < 0.443252 to the left, improve=0.01779070, (0 missing)
## Surrogate splits:
## PC2 < 1.082758 to the left, agree=0.960, adj=0.118, (0 split)
## PC5 < -1.141208 to the right, agree=0.957, adj=0.039, (0 split)
##
## Node number 18: 381 observations
## mean=0.3771634, MSE=0.01314705
##
## Node number 19: 94 observations
## mean=0.5210555, MSE=0.02387027
##
## Node number 28: 139 observations
## mean=0.3765419, MSE=0.006215923
##
## Node number 29: 391 observations
## mean=0.5032248, MSE=0.01459306
##
## Node number 30: 1082 observations, complexity param=0.01264261
## mean=0.5878289, MSE=0.02477831
## left son=60 (185 obs) right son=61 (897 obs)
## Primary splits:
## PC1 < -0.1246466 to the left, improve=0.06143927, (0 missing)
## PC6 < -0.9380561 to the left, improve=0.04366694, (0 missing)
## PC2 < -0.7385828 to the left, improve=0.04039399, (0 missing)
## PC3 < -1.04195 to the left, improve=0.03039344, (0 missing)
## PC5 < 0.2242114 to the left, improve=0.02102874, (0 missing)
## Surrogate splits:
## PC2 < 0.9707428 to the right, agree=0.920, adj=0.530, (0 split)
## PC4 < -0.4336536 to the left, agree=0.861, adj=0.189, (0 split)
## PC3 < -1.04195 to the left, agree=0.854, adj=0.146, (0 split)
## PC6 < -0.9590436 to the left, agree=0.834, adj=0.027, (0 split)
##
## Node number 31: 51 observations
## mean=0.7461839, MSE=0.02249651
##
## Node number 60: 185 observations
## mean=0.5019139, MSE=0.02263828
##
## Node number 61: 897 observations
## mean=0.6055483, MSE=0.02338334
# Tree predictions for the training rows, then for the test rows.
pred_DT_train <- predict(fit_DT, newdata = train.data)
pred_DT_test <- predict(fit_DT, newdata = test.data)
# RMSE, R-squared and MAE on the training data.
print(postResample(obs = training$sale_price, pred = pred_DT_train))
## RMSE Rsquared MAE
## 0.1394594 0.3999699 0.1083676
# RMSE, R-squared and MAE of the decision tree on the test data.
print(postResample(obs = testing$sale_price, pred = pred_DT_test))
## RMSE Rsquared MAE
## 0.1392642 0.4115576 0.1085944
## Linear regression
# Least-squares fit of sale_price on every other column of the training frame.
fit_LR <- lm(formula = sale_price ~ ., data = train.data)
summary(fit_LR)
##
## Call:
## lm(formula = sale_price ~ ., data = train.data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.66425 -0.10623 -0.01402 0.09119 0.64428
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.467925 0.002577 181.545 < 2e-16 ***
## PC1 0.096500 0.003027 31.882 < 2e-16 ***
## PC2 0.018497 0.003490 5.300 1.23e-07 ***
## PC3 -0.016207 0.003895 -4.162 3.24e-05 ***
## PC4 0.045123 0.004136 10.910 < 2e-16 ***
## PC5 0.011854 0.004860 2.439 0.0148 *
## PC6 -0.071513 0.005066 -14.116 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1525 on 3493 degrees of freedom
## Multiple R-squared: 0.2841, Adjusted R-squared: 0.2829
## F-statistic: 231 on 6 and 3493 DF, p-value: < 2.2e-16
# Linear-model predictions for the training rows, then for the test rows.
pred_LR_train <- predict(fit_LR, newdata = train.data)
pred_LR_test <- predict(fit_LR, newdata = test.data)
# RMSE, R-squared and MAE on the training data.
print(postResample(obs = training$sale_price, pred = pred_LR_train))
## RMSE Rsquared MAE
## 0.1523318 0.2840902 0.1197623
# RMSE, R-squared and MAE of the linear model on the test data.
print(postResample(obs = testing$sale_price, pred = pred_LR_test))
## RMSE Rsquared MAE
## 0.1524032 0.2947531 0.1207978
## Random forest
# Seed before fitting: the forest is built from random resamples of the rows,
# so without a seed the metrics below change from run to run.
set.seed(125)
# Fit on the training frame: sale_price against all other columns.
fit_RF <- randomForest(sale_price ~ ., data = train.data)
# Predictions for the training rows and for the test rows.
# NOTE(review): passing the training frame as newdata scores rows the trees
# were grown on, so the training metrics are optimistic compared with the
# test metrics; predict(fit_RF) without newdata should give out-of-bag
# predictions instead -- confirm which one is wanted.
pred_RF_train <- predict(fit_RF, newdata = train.data)
pred_RF_test <- predict(fit_RF, newdata = test.data)
# RMSE, R-squared and MAE on the training data.
print(postResample(pred = pred_RF_train, obs = training$sale_price))
## RMSE Rsquared MAE
## 0.03800913 0.96136663 0.02514029
# RMSE, R-squared and MAE of the random forest on the test data.
print(postResample(obs = testing$sale_price, pred = pred_RF_test))
## RMSE Rsquared MAE
## 0.08330477 0.79210076 0.05516627
## Gradient boosting (gbm package; object names keep the XGB suffix)
# Seed before fitting so the boosted model and its metrics are reproducible;
# gbm's default settings subsample the training rows for each tree.
set.seed(125)
# 500 trees with interaction depth 2. No distribution is passed, so gbm
# reports that it assumes gaussian.
fit_XGB <- gbm(
  sale_price ~ .,
  data = train.data,
  n.trees = 500,
  interaction.depth = 2
)
## Distribution not specified, assuming gaussian ...
# Boosted-model predictions using all 500 trees: training rows, then test rows.
pred_XGB_train <- predict(fit_XGB, newdata = train.data, n.trees = 500)
pred_XGB_test <- predict(fit_XGB, newdata = test.data, n.trees = 500)
# RMSE, R-squared and MAE on the training data.
print(postResample(obs = training$sale_price, pred = pred_XGB_train))
## RMSE Rsquared MAE
## 0.10125217 0.69408487 0.07612766
# RMSE, R-squared and MAE of the boosted model on the test data.
print(postResample(obs = testing$sale_price, pred = pred_XGB_test))
## RMSE Rsquared MAE
## 0.11165618 0.62704084 0.08318098
## City-level analysis: relate each city to its sale price and house size.
# Number of houses recorded per city.
cityhomes <- as.data.frame(table(inputdata$city))
# Per-house sale price, total and garage square footage, and city.
citydata <- data.frame(
  sale_price = inputdata$sale_price,
  total_sqft = inputdata$total_sqft,
  garage_sqft = inputdata$garage_sqft,
  city = inputdata$city
)
str(citydata)
## 'data.frame': 30891 obs. of 4 variables:
## $ sale_price : num 270897 302404 197193 207897 196559 ...
## $ total_sqft : int 1859 2002 2277 1749 1672 2365 2254 2679 2000 2197 ...
## $ garage_sqft: int 508 462 479 430 430 532 502 624 428 397 ...
## $ city : Factor w/ 46 levels "Amystad","Brownport",..: 13 13 20 20 20 27 20 20 20 20 ...
# Collapse to one row per city: the median of each numeric column.
# The grouping column comes back from aggregate() as "Group.1".
numeric_cols <- c("sale_price", "total_sqft", "garage_sqft")
citydata <- aggregate(
  citydata[, numeric_cols],
  by = list(citydata$city),
  FUN = median
)
str(citydata)
## 'data.frame': 46 obs. of 4 variables:
## $ Group.1 : Factor w/ 46 levels "Amystad","Brownport",..: 1 2 3 4 5 6 7 8 9 10 ...
## $ sale_price : num 381148 422101 415802 213253 541800 ...
## $ total_sqft : num 2196 2012 1778 1583 2242 ...
## $ garage_sqft: num 579 405 458 456 487 ...
# Show the per-city medians.
print(citydata)
## Group.1 sale_price total_sqft garage_sqft
## 1 Amystad 381148.0 2196.0 579.0
## 2 Brownport 422101.0 2012.0 405.0
## 3 Chadstad 415802.0 1778.0 458.0
## 4 Clarkberg 213252.5 1583.0 456.5
## 5 Coletown 541800.0 2242.0 487.0
## 6 Davidfort 400050.5 2202.0 477.0
## 7 Davidtown 443518.0 2926.0 575.0
## 8 East Amychester 327601.5 1933.5 476.5
## 9 East Janiceville 398793.5 2095.5 462.0
## 10 East Justin 343350.5 1944.5 559.5
## 11 East Lucas 346501.5 2137.0 486.0
## 12 Fosterberg 279716.0 2054.0 486.0
## 13 Hallfort 323823.0 1787.0 484.0
## 14 Jeffreyhaven 244439.0 1738.0 457.0
## 15 Jenniferberg 274683.0 1862.0 456.0
## 16 Joshuafurt 463054.0 2701.0 616.0
## 17 Julieberg 88203.0 1717.0 418.0
## 18 Justinport 566999.0 2291.0 507.0
## 19 Lake Carolyn 251054.5 1507.0 460.5
## 20 Lake Christinaport 196558.0 1730.5 462.0
## 21 Lake Dariusborough 321298.0 2112.0 451.0
## 22 Lake Jack 485101.0 1938.0 438.0
## 23 Lake Jennifer 351539.0 2001.0 574.0
## 24 Leahview 154980.0 1411.0 423.0
## 25 Lewishaven 388082.0 1970.0 483.0
## 26 Martinezfort 189000.0 1289.0 481.0
## 27 Morrisport 497070.0 2295.5 482.0
## 28 New Michele 302402.0 1876.0 418.0
## 29 North Erinville 451076.0 2360.0 518.0
## 30 Port Adamtown 407608.0 2055.0 538.5
## 31 Port Andrealand 485103.5 2234.0 496.5
## 32 Port Daniel 176396.0 1498.0 450.0
## 33 Port Jonathanborough 415796.0 2438.5 551.5
## 34 Richardport 579603.0 2852.0 592.0
## 35 Rickytown 627481.0 2906.0 723.0
## 36 Scottberg 277202.5 2080.0 536.0
## 37 South Anthony 359100.0 2189.0 501.0
## 38 South Stevenfurt 365396.0 2179.0 574.0
## 39 Toddshire 144267.5 1546.0 443.5
## 40 Wendybury 409615.0 2264.5 601.0
## 41 West Ann 409496.0 1976.0 479.0
## 42 West Brittanyview 315000.0 1717.0 579.0
## 43 West Gerald 175141.5 1614.5 448.5
## 44 West Gregoryview 573301.5 2691.0 589.0
## 45 West Lydia 275939.0 1598.0 456.0
## 46 West Terrence 355321.5 1927.5 506.0
# Give the grouping column a meaningful name, then summarise the numeric ones.
colnames(citydata)[1] <- "city"
summary(citydata[, -1])
## sale_price total_sqft garage_sqft
## Min. : 88203 Min. :1289 Min. :405.0
## 1st Qu.:276255 1st Qu.:1748 1st Qu.:456.6
## Median :357211 Median :2006 Median :483.5
## Mean :356843 Mean :2032 Mean :502.2
## 3rd Qu.:420526 3rd Qu.:2226 3rd Qu.:548.2
## Max. :627481 Max. :2926 Max. :723.0
# Rescale each numeric column with normalize() (defined elsewhere) so the
# columns are comparable; the summary shows each one spanning 0 to 1.
feature_cols <- c("sale_price", "total_sqft", "garage_sqft")
citydata[feature_cols] <- lapply(citydata[feature_cols], normalize)
summary(citydata)
## city sale_price total_sqft garage_sqft
## Amystad : 1 Min. :0.0000 Min. :0.0000 Min. :0.0000
## Brownport: 1 1st Qu.:0.3487 1st Qu.:0.2804 1st Qu.:0.1623
## Chadstad : 1 Median :0.4988 Median :0.4383 Median :0.2469
## Clarkberg: 1 Mean :0.4981 Mean :0.4537 Mean :0.3056
## Coletown : 1 3rd Qu.:0.6162 3rd Qu.:0.5724 3rd Qu.:0.4505
## Davidfort: 1 Max. :1.0000 Max. :1.0000 Max. :1.0000
## (Other) :40
## K-means clustering of cities
# Move the city names into the row names so only numeric columns remain.
rownames(citydata) <- citydata$city
citydata <- citydata[, -1]
print(citydata)
## sale_price total_sqft garage_sqft
## Amystad 0.5432170 0.55406231 0.54716981
## Brownport 0.6191575 0.44166158 0.00000000
## Chadstad 0.6074770 0.29871717 0.16666667
## Clarkberg 0.2318832 0.17959682 0.16194969
## Coletown 0.8411191 0.58216249 0.25786164
## Davidfort 0.5782685 0.55772755 0.22641509
## Davidtown 0.6588717 1.00000000 0.53459119
## East Amychester 0.4439241 0.39370800 0.22484277
## East Janiceville 0.5759376 0.49266952 0.17924528
## East Justin 0.4731280 0.40042761 0.48584906
## East Lucas 0.4789710 0.51802077 0.25471698
## Fosterberg 0.3551285 0.46731827 0.25471698
## Hallfort 0.4369175 0.30421503 0.24842767
## Jeffreyhaven 0.2897133 0.27428222 0.16352201
## Jenniferberg 0.3457957 0.35003054 0.16037736
## Joshuafurt 0.6950979 0.86255345 0.66352201
## Julieberg 0.0000000 0.26145388 0.04088050
## Justinport 0.8878463 0.61209530 0.32075472
## Lake Carolyn 0.3019806 0.13317043 0.17452830
## Lake Christinaport 0.2009261 0.26970067 0.17924528
## Lake Dariusborough 0.4322353 0.50274893 0.14465409
## Lake Jack 0.7359803 0.39645693 0.10377358
## Lake Jennifer 0.4883122 0.43494197 0.53144654
## Leahview 0.1238267 0.07452657 0.05660377
## Lewishaven 0.5560750 0.41600489 0.24528302
## Martinezfort 0.1869110 0.00000000 0.23899371
## Morrisport 0.7581748 0.61484423 0.24213836
## New Michele 0.3971959 0.35858277 0.04088050
## North Erinville 0.6728867 0.65424557 0.35534591
## Port Adamtown 0.5922826 0.46792914 0.41981132
## Port Andrealand 0.7359850 0.57727550 0.28773585
## Port Daniel 0.1635390 0.12767257 0.14150943
## Port Jonathanborough 0.6074659 0.70219914 0.46069182
## Richardport 0.9112183 0.95479536 0.58805031
## Rickytown 1.0000000 0.98778253 1.00000000
## Scottberg 0.3504677 0.48320098 0.41194969
## South Anthony 0.5023327 0.54978619 0.30188679
## South Stevenfurt 0.5140076 0.54367746 0.53144654
## Toddshire 0.1039621 0.15699450 0.12106918
## Wendybury 0.5960043 0.59590715 0.61635220
## West Ann 0.5957836 0.41967013 0.23270440
## West Brittanyview 0.4205567 0.26145388 0.54716981
## West Gerald 0.1612128 0.19883934 0.13679245
## West Gregoryview 0.8995333 0.85644472 0.57861635
## West Lydia 0.3481247 0.18875993 0.16037736
## West Terrence 0.4953262 0.39004276 0.31761006
# Exploratory k-means on the normalized city medians for k = 2 to 5.
# Seed first: kmeans() starts from randomly chosen centers, so without a seed
# the cluster numbering and the plots can change between runs.
set.seed(125)
k2 <- kmeans(citydata, centers = 2, nstart = 25)
fviz_cluster(k2, data = citydata)
k3 <- kmeans(citydata, centers = 3, nstart = 25)
k4 <- kmeans(citydata, centers = 4, nstart = 25)
k5 <- kmeans(citydata, centers = 5, nstart = 25)

# Point-only cluster plot for one fit, titled with its number of clusters.
cluster_panel <- function(fit) {
  fviz_cluster(fit, geom = "point", data = citydata) +
    ggtitle(paste("k =", nrow(fit$centers)))
}

# Side-by-side comparison of the four fits.
p1 <- cluster_panel(k2)
p2 <- cluster_panel(k3)
p3 <- cluster_panel(k4)
p4 <- cluster_panel(k5)
grid.arrange(p1, p2, p3, p4, nrow = 2)
set.seed(125)
# Total within-cluster sum of squares of a k-means fit with k clusters
# on citydata.
wss <- function(k) {
  fit <- kmeans(citydata, centers = k, nstart = 10)
  fit$tot.withinss
}
# Elbow curve: evaluate wss for k = 1 to k = 15 and plot it.
k.values <- 1:15
wss_values <- vapply(k.values, wss, numeric(1))
plot(
  k.values, wss_values,
  type = "b", pch = 19, frame = FALSE,
  xlab = "Number of clusters K",
  ylab = "Total within-clusters sum of squares"
)
# The same elbow diagnostic via factoextra.
set.seed(125)
fviz_nbclust(citydata, kmeans, method = "wss")
# Final k-means clustering with k = 5, seeded so the result is repeatable.
set.seed(125)
final <- kmeans(citydata, centers = 5, nstart = 25)
print(final)
## K-means clustering with 5 clusters of sizes 8, 6, 11, 5, 16
##
## Cluster means:
## sale_price total_sqft garage_sqft
## 1 0.4972470 0.4677001 0.5113994
## 2 0.7505796 0.6238037 0.3207547
## 3 0.1920072 0.1695452 0.1432247
## 4 0.8329442 0.9323152 0.6729560
## 5 0.5097817 0.4285851 0.1938876
##
## Clustering vector:
## Amystad Brownport Chadstad
## 1 5 5
## Clarkberg Coletown Davidfort
## 3 2 5
## Davidtown East Amychester East Janiceville
## 4 5 5
## East Justin East Lucas Fosterberg
## 1 5 5
## Hallfort Jeffreyhaven Jenniferberg
## 5 3 5
## Joshuafurt Julieberg Justinport
## 4 3 2
## Lake Carolyn Lake Christinaport Lake Dariusborough
## 3 3 5
## Lake Jack Lake Jennifer Leahview
## 5 1 3
## Lewishaven Martinezfort Morrisport
## 5 3 2
## New Michele North Erinville Port Adamtown
## 5 2 1
## Port Andrealand Port Daniel Port Jonathanborough
## 2 3 2
## Richardport Rickytown Scottberg
## 4 4 1
## South Anthony South Stevenfurt Toddshire
## 5 1 3
## Wendybury West Ann West Brittanyview
## 1 5 1
## West Gerald West Gregoryview West Lydia
## 3 4 3
## West Terrence
## 5
##
## Within cluster sum of squares by cluster:
## [1] 0.16061998 0.09702199 0.20153819 0.24885822 0.38679605
## (between_SS / total_SS = 83.9 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss"
## [5] "tot.withinss" "betweenss" "size" "iter"
## [9] "ifault"
# Plot the final clustering.
fviz_cluster(final, data = citydata)
# Attach each city's cluster label, then average every normalized feature
# within each cluster.
city_clusters <- mutate(citydata, Cluster = final$cluster)
city_clusters %>%
  group_by(Cluster) %>%
  summarise_all(mean)
## # A tibble: 5 x 4
## Cluster sale_price total_sqft garage_sqft
## <int> <dbl> <dbl> <dbl>
## 1 1 0.497 0.468 0.511
## 2 2 0.751 0.624 0.321
## 3 3 0.192 0.170 0.143
## 4 4 0.833 0.932 0.673
## 5 5 0.510 0.429 0.194
End of Analysis
```