Exercising Data Mining for Real Estate Investment

Steps of project execution

Data exploration
Data cleaning
Explore more
Model Building
- Predict price
- Predict Location
- Predict type of house

## Start from an empty global environment so objects left over from an
## earlier session cannot leak into the analysis.
## NOTE(review): this removes objects only; attached packages and options stay.
rm(list = ls())

## Switch to the project folder used by the original author. The path only
## exists on that machine, so change directory only when it is present;
## anywhere else, start R in the folder that contains house_data.csv.
project_dir <- "C:/Users/MANISHA/Desktop/CISC-Project/FinalProjectWork"
if (dir.exists(project_dir)) {
  setwd(project_dir)
}

## All required libraries should be mentioned here.

library(ggplot2)

library(caret)

## Loading required package: lattice

library(corrplot)

## corrplot 0.84 loaded

library(rpart)

library(Metrics)

## 
## Attaching package: 'Metrics'

## The following objects are masked from 'package:caret':
## 
##     precision, recall

library(mlr)

## Loading required package: ParamHelpers

## 
## Attaching package: 'mlr'

## The following object is masked from 'package:caret':
## 
##     train

library(dummies)

## dummies-1.5.6 provided by Decision Patterns

library(rpart)

library(caret)

library(randomForest)

## randomForest 4.6-14

## Type rfNews() to see new features/changes/bug fixes.

## 
## Attaching package: 'randomForest'

## The following object is masked from 'package:ggplot2':
## 
##     margin

library(gbm)

## Loaded gbm 2.1.5

library(factoextra)

## Welcome! Related Books: `Practical Guide To Cluster Analysis in R` at https://goo.gl/13EFCZ

library(tidyverse)

## -- Attaching packages -------------------------------------------------------------------------------------------------- tidyverse 1.2.1 --

## v tibble  2.0.1     v purrr   0.3.0
## v tidyr   0.8.2     v dplyr   0.7.8
## v readr   1.3.1     v stringr 1.4.0
## v tibble  2.0.1     v forcats 0.3.0

## -- Conflicts ----------------------------------------------------------------------------------------------------- tidyverse_conflicts() --
## x dplyr::combine()       masks randomForest::combine()
## x dplyr::filter()        masks stats::filter()
## x dplyr::lag()           masks stats::lag()
## x purrr::lift()          masks caret::lift()
## x randomForest::margin() masks ggplot2::margin()

library(gridExtra)

## 
## Attaching package: 'gridExtra'

## The following object is masked from 'package:dplyr':
## 
##     combine

## The following object is masked from 'package:randomForest':
## 
##     combine

################## Library Section Ends Here ####################

## Read the input house data file (CSV; the first row holds the column names).
## stringsAsFactors = TRUE keeps the text columns (garage_type, street_name,
## city) as factors on every R version; the default became FALSE in R 4.0.0.

housedata <- read.csv(
  "house_data.csv",
  header = TRUE,
  stringsAsFactors = TRUE
)

## Dimensions of the input data (rows, columns).

dim(housedata)

## [1] 42703    20

### 42703 observations. 
### 20 attributes.

## Structure of the file. 

str(housedata)

## 'data.frame':    42703 obs. of  20 variables:
##  $ year_built         : int  1978 1958 2002 2004 2006 2005 1979 1958 1958 1961 ...
##  $ stories            : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ num_bedrooms       : int  4 3 3 4 4 3 3 5 5 1 ...
##  $ full_bathrooms     : int  1 1 2 2 2 2 2 2 2 1 ...
##  $ half_bathrooms     : int  1 1 0 0 0 0 1 0 0 0 ...
##  $ livable_sqft       : int  1689 1984 1581 1829 1580 1621 2285 1745 1747 998 ...
##  $ total_sqft         : int  1859 2002 1578 2277 1749 1672 2365 1741 1745 1161 ...
##  $ garage_type        : Factor w/ 3 levels "attached","detached",..: 1 1 3 1 1 1 2 3 3 3 ...
##  $ garage_sqft        : int  508 462 0 479 430 430 532 0 0 0 ...
##  $ carport_sqft       : int  0 0 625 0 0 0 0 0 0 242 ...
##  $ has_fireplace      : logi  TRUE TRUE FALSE TRUE TRUE TRUE ...
##  $ has_pool           : logi  FALSE FALSE FALSE FALSE FALSE FALSE ...
##  $ has_central_heating: logi  TRUE TRUE TRUE TRUE TRUE TRUE ...
##  $ has_central_cooling: logi  TRUE TRUE TRUE TRUE TRUE TRUE ...
##  $ house_number       : int  42670 5194 4366 3302 582 78445 246 35725 35725 73327 ...
##  $ street_name        : Factor w/ 11124 levels "Aaron Cliff",..: 6335 3532 3933 7172 4467 7172 3963 4902 4902 5959 ...
##  $ unit_number        : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ city               : Factor w/ 47 levels "Amystad","Brownport",..: 13 13 20 20 20 20 27 20 20 20 ...
##  $ zip_code           : int  10907 10907 11203 11203 11203 11203 10924 11203 11203 11203 ...
##  $ sale_price         : num  270897 302404 2519996 197193 207897 ...

### Attributes treated as categorical (qualitative) variables.

qualitative_var <- c(
  "stories", "num_bedrooms", "full_bathrooms", "half_bathrooms",
  "garage_type", "has_fireplace", "has_pool", "has_central_heating",
  "has_central_cooling", "house_number", "street_name", "unit_number",
  "city", "zip_code"
)

### Attributes treated as numerical (quantitative) variables.
quantitative_var <- c(
  "year_built", "livable_sqft", "total_sqft",
  "garage_sqft", "carport_sqft", "sale_price"
)

### R has identified garage_type, street_name and city as factors. 
### R has identified has_fireplace, has_pool, has_central_heating and has_central_cooling as logical (boolean) type. I will convert these four attributes to factors.

## Summary statistics. 

summary(housedata)

##    year_built      stories       num_bedrooms    full_bathrooms 
##  Min.   :1852   Min.   :0.000   Min.   : 0.000   Min.   :0.000  
##  1st Qu.:1980   1st Qu.:1.000   1st Qu.: 3.000   1st Qu.:1.000  
##  Median :1994   Median :1.000   Median : 3.000   Median :2.000  
##  Mean   :1991   Mean   :1.366   Mean   : 3.209   Mean   :1.924  
##  3rd Qu.:2005   3rd Qu.:2.000   3rd Qu.: 4.000   3rd Qu.:2.000  
##  Max.   :2017   Max.   :4.000   Max.   :31.000   Max.   :8.000  
##                                                                 
##  half_bathrooms    livable_sqft     total_sqft      garage_type   
##  Min.   :0.0000   Min.   :   -3   Min.   :    5   attached:34079  
##  1st Qu.:0.0000   1st Qu.: 1380   1st Qu.: 1466   detached: 2712  
##  Median :1.0000   Median : 1808   Median : 1937   none    : 5912  
##  Mean   :0.5272   Mean   : 1988   Mean   : 2127                   
##  3rd Qu.:1.0000   3rd Qu.: 2486   3rd Qu.: 2640                   
##  Max.   :1.0000   Max.   :12406   Max.   :15449                   
##                                                                   
##   garage_sqft      carport_sqft     has_fireplace    has_pool      
##  Min.   :  -4.0   Min.   :   0.00   Mode :logical   Mode :logical  
##  1st Qu.: 412.0   1st Qu.:   0.00   FALSE:15717     FALSE:35101    
##  Median : 464.0   Median :   0.00   TRUE :26986     TRUE :7602     
##  Mean   : 455.9   Mean   :  41.66                                  
##  3rd Qu.: 606.0   3rd Qu.:   0.00                                  
##  Max.   :8318.0   Max.   :9200.00                                  
##                                                                    
##  has_central_heating has_central_cooling  house_number  
##  Mode :logical       Mode :logical       Min.   :    0  
##  FALSE:2609          FALSE:4141          1st Qu.:  674  
##  TRUE :40094         TRUE :38562         Median : 4530  
##                                          Mean   :18212  
##                                          3rd Qu.:24845  
##                                          Max.   :99971  
##                                                         
##           street_name     unit_number                 city      
##  Matthew Points :  128   Min.   :   3    Chadstad       : 4962  
##  Sanders Inlet  :   98   1st Qu.:1063    Coletown       : 3739  
##  Jessica Highway:   95   Median :2033    Jeffreyhaven   : 2981  
##  Jordan Points  :   94   Mean   :2027    North Erinville: 2868  
##  Andrea Glen    :   88   3rd Qu.:2921    Port Andrealand: 2669  
##  Mckenzie Trace :   88   Max.   :3998    Hallfort       : 2448  
##  (Other)        :42112   NA's   :39615   (Other)        :23036  
##     zip_code       sale_price      
##  Min.   :10004   Min.   :     626  
##  1st Qu.:10537   1st Qu.:  270899  
##  Median :11071   Median :  378001  
##  Mean   :11031   Mean   :  413507  
##  3rd Qu.:11510   3rd Qu.:  497697  
##  Max.   :11989   Max.   :21041998  
##

### A few things stand out in the summary statistics. 
### The minimum livable_sqft is -3, the minimum total_sqft is 5, garage_sqft ranges from -4 to 8318, the maximum carport_sqft is 9200, and sale_price ranges from 626 to more than 21 million. 

### City and street_name labels are numeric values. Good for PCA or regression but not good for classification. I will have to factorize these two variables. 

### I can get rid of Zip code, unit number and house number as of now as those attributes do not have any direct need in the analysis. If I need to consolidate the address of house, I will use them. For now, I will not use them in regression, classification, clustering.

dontneed = c("unit_number", "house_number", "zip_code")

## Create new data frame with the attributes I need for the analysis. 

names(housedata)

##  [1] "year_built"          "stories"             "num_bedrooms"       
##  [4] "full_bathrooms"      "half_bathrooms"      "livable_sqft"       
##  [7] "total_sqft"          "garage_type"         "garage_sqft"        
## [10] "carport_sqft"        "has_fireplace"       "has_pool"           
## [13] "has_central_heating" "has_central_cooling" "house_number"       
## [16] "street_name"         "unit_number"         "city"               
## [19] "zip_code"            "sale_price"

## Drop the address-only columns listed in `dontneed` by name rather than by
## position, so the selection stays correct if the column order of the input
## file ever changes. drop = FALSE guarantees the result is a data frame.
mydata <- housedata[, !(names(housedata) %in% dontneed), drop = FALSE]

dim(mydata)

## [1] 42703    17

str(mydata)

## 'data.frame':    42703 obs. of  17 variables:
##  $ year_built         : int  1978 1958 2002 2004 2006 2005 1979 1958 1958 1961 ...
##  $ stories            : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ num_bedrooms       : int  4 3 3 4 4 3 3 5 5 1 ...
##  $ full_bathrooms     : int  1 1 2 2 2 2 2 2 2 1 ...
##  $ half_bathrooms     : int  1 1 0 0 0 0 1 0 0 0 ...
##  $ livable_sqft       : int  1689 1984 1581 1829 1580 1621 2285 1745 1747 998 ...
##  $ total_sqft         : int  1859 2002 1578 2277 1749 1672 2365 1741 1745 1161 ...
##  $ garage_type        : Factor w/ 3 levels "attached","detached",..: 1 1 3 1 1 1 2 3 3 3 ...
##  $ garage_sqft        : int  508 462 0 479 430 430 532 0 0 0 ...
##  $ carport_sqft       : int  0 0 625 0 0 0 0 0 0 242 ...
##  $ has_fireplace      : logi  TRUE TRUE FALSE TRUE TRUE TRUE ...
##  $ has_pool           : logi  FALSE FALSE FALSE FALSE FALSE FALSE ...
##  $ has_central_heating: logi  TRUE TRUE TRUE TRUE TRUE TRUE ...
##  $ has_central_cooling: logi  TRUE TRUE TRUE TRUE TRUE TRUE ...
##  $ street_name        : Factor w/ 11124 levels "Aaron Cliff",..: 6335 3532 3933 7172 4467 7172 3963 4902 4902 5959 ...
##  $ city               : Factor w/ 47 levels "Amystad","Brownport",..: 13 13 20 20 20 20 27 20 20 20 ...
##  $ sale_price         : num  270897 302404 2519996 197193 207897 ...

## Columns of mydata that are treated as categorical in the analysis.

new_qualitative_var <- c(
  "stories", "num_bedrooms", "full_bathrooms", "half_bathrooms",
  "garage_type", "has_fireplace", "has_pool", "has_central_heating",
  "has_central_cooling", "street_name", "city"
)

## Re-encode every column listed above as a factor (integer counts, logical
## flags and existing factors alike).

mydata[new_qualitative_var] <- lapply(mydata[new_qualitative_var], factor)

## Missing values. 

as.data.frame(colSums(is.na(mydata)))

##                     colSums(is.na(mydata))
## year_built                               0
## stories                                  0
## num_bedrooms                             0
## full_bathrooms                           0
## half_bathrooms                           0
## livable_sqft                             0
## total_sqft                               0
## garage_type                              0
## garage_sqft                              0
## carport_sqft                             0
## has_fireplace                            0
## has_pool                                 0
## has_central_heating                      0
## has_central_cooling                      0
## street_name                              0
## city                                     0
## sale_price                               0

### There are no missing values in any column.

## Outliers. 

## Density-scaled histogram of the sale price (100 breaks), overlaid with a
## rug of the jittered observations and a kernel density estimate in red.
with(mydata, {
  hist(
    sale_price,
    breaks = 100, freq = FALSE, col = "grey",
    main = "Histogram, rug plot, density curve", xlab = "Sale Price"
  )
  rug(jitter(sale_price))
  lines(density(sale_price), lwd = 1, col = "red")
  box()
})

## Vertical box plot of the same variable.
with(
  mydata,
  boxplot(
    sale_price,
    main = "Box Plot of Sales Price", col = "dark grey",
    varwidth = TRUE, horizontal = FALSE
  )
)

## Sale price against every predictor.
##
## The original code passed `mydata$...` vectors to aes_string(), which is
## meant for column names given as strings and is deprecated in current
## ggplot2. The helpers below map columns by name through the `.data` pronoun
## and replace fourteen copies of the same plot recipe.

## Box plot of sale_price for each level of one categorical column.
##   data  : data frame containing `sale_price` and the column named by `var`
##   var   : name of the categorical column, as a string
##   label : x-axis label
## Returns the ggplot object (auto-printed when called at top level).
plot_price_by_category <- function(data, var, label) {
  ggplot(data, aes(x = .data[[var]], y = .data[["sale_price"]])) +
    geom_boxplot() +
    xlab(label) +
    ylab("Sale Price")
}

## Scatter plot of sale_price against one numeric column with a straight
## least-squares line (no confidence band).
##   data  : data frame containing `sale_price` and the column named by `var`
##   var   : name of the numeric column, as a string
##   label : x-axis label
## Returns the ggplot object (auto-printed when called at top level).
plot_price_by_measure <- function(data, var, label) {
  ggplot(data, aes(x = .data[[var]], y = .data[["sale_price"]])) +
    geom_point() +
    geom_smooth(method = "lm", formula = y ~ x, se = FALSE) +
    xlab(label) +
    ylab("Sale Price")
}

## Categorical predictors.

plot_price_by_category(mydata, "stories", "Stories")

plot_price_by_category(mydata, "num_bedrooms", "Number of Bedrooms")

plot_price_by_category(mydata, "full_bathrooms", "Full Bathrooms")

plot_price_by_category(mydata, "half_bathrooms", "Half Bathrooms")

plot_price_by_category(mydata, "garage_type", "Garage Type")

plot_price_by_category(mydata, "has_fireplace", "Has Fire Place")

plot_price_by_category(mydata, "has_pool", "Has Pool")

plot_price_by_category(mydata, "has_central_heating", "Has Central Heating")

plot_price_by_category(mydata, "has_central_cooling", "Has Central Cooling")

## Numeric predictors.

plot_price_by_measure(mydata, "year_built", "Built Year")

plot_price_by_measure(mydata, "livable_sqft", "Livable area")

plot_price_by_measure(mydata, "total_sqft", "Total area")

plot_price_by_measure(mydata, "garage_sqft", "Garage area")

plot_price_by_measure(mydata, "carport_sqft", "Carport area")

### Distribution of data

### Number of sold houses per construction year (one row per year).
countperyr_df <- as.data.frame(table(as.factor(mydata[["year_built"]])))

### Same table, construction year with the most sales first.
countperyr_ordered <- countperyr_df[
  order(countperyr_df[["Freq"]], decreasing = TRUE),
]


### Are very old houses sold more often than houses built in recent years? The 10 construction years with the most sales all fall between 1988 and 2016. 

head(countperyr_ordered, 10)

##     Var1 Freq
## 117 2005 2046
## 116 2004 1914
## 118 2006 1720
## 126 2014 1691
## 127 2015 1671
## 115 2003 1589
## 101 1989 1324
## 100 1988 1307
## 114 2002 1233
## 128 2016 1203

### Check the variables one by one and remove outliers. 

### sale_price has many outliers. 

summary(mydata$sale_price)

##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
##      626   270899   378001   413507   497697 21041998

### Minimum price 626 and maximum price 21 million 

### highest 30 sale price 

#mydata[order(mydata$sale_price, decreasing = T), ][1:30,c(1,2,3,4,5,6,7,17)]

### The first one is definitely an error: livable_sqft = 1446 and total_sqft = 1438 with a price of 21 million is definitely an odd one. 
### I will find out all observations where livable_sqft > total_sqft.

### A livable area larger than the total area is inconsistent: set those
### rows aside in odd_data ...
odd_data <- subset(mydata, livable_sqft > total_sqft)

### ... and keep only the consistent rows in the working data.

mydata <- subset(mydata, livable_sqft <= total_sqft)

### Lowest 30 sale price

#mydata[order(mydata$sale_price, decreasing = F), ][1:30,c(1,2,3,4,5,6,7,9, 17)]

#subset(mydata, mydata$sale_price < 100000, c(1,2,3,4,5,6,7,9, 17))

### Looks like there are many odd values in sale_price. For simplicity let's remove the outliers and store them for future reference.


### Number of outliers in sales_price 

outlier_values = boxplot.stats(mydata$sale_price)$out

length(outlier_values)

## [1] 1698

#Outlier range

### First and third quartile of the sale price.
Q1 <- quantile(mydata$sale_price, probs = 0.25)
Q3 <- quantile(mydata$sale_price, probs = 0.75)

### Upper fence: Q3 + 1.5 * IQR

maximum <- Q3 + 1.5 * IQR(mydata$sale_price)

### Lower fence: Q1 - 1.5 * IQR
minimum <- Q1 - 1.5 * IQR(mydata$sale_price)

#Maximum = 834758.8
#Minimum = -47255.25

### Collect the observations whose sale_price lies outside the fences and
### append them to odd_data for future reference.

outlier_price <- subset(mydata, sale_price < minimum | sale_price > maximum)

odd_data <- rbind(odd_data, outlier_price)

### Keep only the observations inside the fences.

mydata <- subset(mydata, sale_price >= minimum & sale_price <= maximum)


## Same distribution plots as before, now on the data without price outliers:
## density-scaled histogram, rug of jittered observations, density curve.
with(mydata, {
  hist(
    sale_price,
    breaks = 100, freq = FALSE, col = "grey",
    main = "Histogram, rug plot, density curve", xlab = "Sale Price"
  )
  rug(jitter(sale_price))
  lines(density(sale_price), lwd = 1, col = "red")
  box()
})

## Vertical box plot of the trimmed sale price.
with(
  mydata,
  boxplot(
    sale_price,
    main = "Box Plot of Sales Price", col = "dark grey", horizontal = FALSE
  )
)

### Anomaly reduction for sale price. 

#Outlier range

### The IQR fences used to be recomputed at this point, but nothing read them
### before the next section overwrote them; the cut below is a fixed price
### floor, so that dead computation has been removed.

#mydata[order(mydata$sale_price, decreasing = F),]

### Sales below 25,000 are treated as anomalies and moved to odd_data.
### The two filters are exact complements: a sale priced at exactly 25,000
### stays in mydata instead of disappearing from both data frames, which the
### previous `< 25000` / `> 25000` pair allowed.

temp_data <- subset(mydata, sale_price < 25000)

mydata <- subset(mydata, sale_price >= 25000)

odd_data <- rbind(odd_data, temp_data)

summary(mydata)

##    year_built   stories    num_bedrooms   full_bathrooms  half_bathrooms
##  Min.   :1852   0:    1   3      :13995   2      :19714   0:16070       
##  1st Qu.:1981   1:22270   4      : 9832   1      :10123   1:19560       
##  Median :1996   2:13016   2      : 7452   3      : 4807                 
##  Mean   :1991   3:  225   5      : 3134   4      :  747                 
##  3rd Qu.:2005   4:  118   1      :  765   0      :  180                 
##  Max.   :2017             6      :  368   5      :   49                 
##                           (Other):   84   (Other):   10                 
##   livable_sqft     total_sqft      garage_type     garage_sqft    
##  Min.   :   -2   Min.   :    6   attached:29466   Min.   :  -4.0  
##  1st Qu.: 1403   1st Qu.: 1515   detached: 2236   1st Qu.: 419.0  
##  Median : 1828   Median : 1969   none    : 3928   Median : 466.0  
##  Mean   : 1962   Mean   : 2104                    Mean   : 462.9  
##  3rd Qu.: 2452   3rd Qu.: 2617                    3rd Qu.: 604.0  
##  Max.   :12406   Max.   :15449                    Max.   :5040.0  
##                                                                   
##   carport_sqft     has_fireplace  has_pool     has_central_heating
##  Min.   :   0.00   FALSE:12621   FALSE:29951   FALSE: 2242        
##  1st Qu.:   0.00   TRUE :23009   TRUE : 5679   TRUE :33388        
##  Median :   0.00                                                  
##  Mean   :  28.43                                                  
##  3rd Qu.:   0.00                                                  
##  Max.   :9200.00                                                  
##                                                                   
##  has_central_cooling                street_name                 city      
##  FALSE: 3327         Sanders Inlet        :   71   Chadstad       : 4248  
##  TRUE :32303         Jeremy Knolls        :   68   Coletown       : 3239  
##                      Richardson Throughway:   62   North Erinville: 2405  
##                      Kenneth Plains       :   60   Jeffreyhaven   : 2316  
##                      Johnson Ville        :   56   Port Andrealand: 2210  
##                      Michelle Streets     :   56   Hallfort       : 2201  
##                      (Other)              :35257   (Other)        :19011  
##    sale_price    
##  Min.   : 25196  
##  1st Qu.:283496  
##  Median :381776  
##  Mean   :387050  
##  3rd Qu.:486992  
##  Max.   :834751  
##

boxplot(mydata$sale_price)

### Check summary of the data

summary(mydata)

##    year_built   stories    num_bedrooms   full_bathrooms  half_bathrooms
##  Min.   :1852   0:    1   3      :13995   2      :19714   0:16070       
##  1st Qu.:1981   1:22270   4      : 9832   1      :10123   1:19560       
##  Median :1996   2:13016   2      : 7452   3      : 4807                 
##  Mean   :1991   3:  225   5      : 3134   4      :  747                 
##  3rd Qu.:2005   4:  118   1      :  765   0      :  180                 
##  Max.   :2017             6      :  368   5      :   49                 
##                           (Other):   84   (Other):   10                 
##   livable_sqft     total_sqft      garage_type     garage_sqft    
##  Min.   :   -2   Min.   :    6   attached:29466   Min.   :  -4.0  
##  1st Qu.: 1403   1st Qu.: 1515   detached: 2236   1st Qu.: 419.0  
##  Median : 1828   Median : 1969   none    : 3928   Median : 466.0  
##  Mean   : 1962   Mean   : 2104                    Mean   : 462.9  
##  3rd Qu.: 2452   3rd Qu.: 2617                    3rd Qu.: 604.0  
##  Max.   :12406   Max.   :15449                    Max.   :5040.0  
##                                                                   
##   carport_sqft     has_fireplace  has_pool     has_central_heating
##  Min.   :   0.00   FALSE:12621   FALSE:29951   FALSE: 2242        
##  1st Qu.:   0.00   TRUE :23009   TRUE : 5679   TRUE :33388        
##  Median :   0.00                                                  
##  Mean   :  28.43                                                  
##  3rd Qu.:   0.00                                                  
##  Max.   :9200.00                                                  
##                                                                   
##  has_central_cooling                street_name                 city      
##  FALSE: 3327         Sanders Inlet        :   71   Chadstad       : 4248  
##  TRUE :32303         Jeremy Knolls        :   68   Coletown       : 3239  
##                      Richardson Throughway:   62   North Erinville: 2405  
##                      Kenneth Plains       :   60   Jeffreyhaven   : 2316  
##                      Johnson Ville        :   56   Port Andrealand: 2210  
##                      Michelle Streets     :   56   Hallfort       : 2201  
##                      (Other)              :35257   (Other)        :19011  
##    sale_price    
##  Min.   : 25196  
##  1st Qu.:283496  
##  Median :381776  
##  Mean   :387050  
##  3rd Qu.:486992  
##  Max.   :834751  
##

### Plot continuous variables 

## Box plot of one numeric column with every observation drawn on top.
## Columns are mapped by name through the `.data` pronoun rather than with
## `mydata$...` inside aes(). The box plot's own outlier markers are hidden
## with outlier.shape = NA (the documented way), because geom_point() already
## draws every observation.
##   data  : data frame containing the column named by `var`
##   var   : name of the numeric column, as a string
##   label : y-axis label; the x-axis label is "Box Plot of <label>"
## Returns the ggplot object (auto-printed when called at top level).
plot_area_box <- function(data, var, label) {
  ggplot(data, aes(x = " ", y = .data[[var]])) +
    geom_boxplot(outlier.shape = NA) +
    geom_point() +
    ylab(label) +
    xlab(paste("Box Plot of", label))
}

plot_area_box(mydata, "livable_sqft", "Livable Area")

plot_area_box(mydata, "total_sqft", "Total Area")

plot_area_box(mydata, "garage_sqft", "Garage Area")

plot_area_box(mydata, "carport_sqft", "Carport Area")

### Remove the observations related to outliers for each variable and store them in the odd_data data frame. 

#### livable_sqft

summary(mydata$livable_sqft)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##      -2    1403    1828    1962    2452   12406

#### Quartiles of livable_sqft and fences at 1.5 * IQR beyond them.

Q1 <- quantile(mydata$livable_sqft, probs = 0.25)

Q3 <- quantile(mydata$livable_sqft, probs = 0.75)

maximum <- Q3 + 1.5 * IQR(mydata$livable_sqft)

minimum <- Q1 - 1.5 * IQR(mydata$livable_sqft)

#### Set the rows outside the fences aside in odd_data, keep the rest.

odd_data <- rbind(
  odd_data,
  subset(mydata, livable_sqft < minimum | livable_sqft > maximum)
)

mydata <- subset(mydata, livable_sqft >= minimum & livable_sqft <= maximum)

#mydata[order(mydata$livable_sqft, decreasing = F), ]

#### A negative livable area is impossible: move such rows to odd_data too.

odd_data <- rbind(odd_data, subset(mydata, livable_sqft < 0))

mydata <- subset(mydata, livable_sqft >= 0)


#### total_sqft

summary(mydata$total_sqft)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##      87    1513    1959    2084    2594    6351

#### Quartiles of total_sqft and fences at 1.5 * IQR beyond them.

Q1 <- quantile(mydata$total_sqft, probs = 0.25)

Q3 <- quantile(mydata$total_sqft, probs = 0.75)

maximum <- Q3 + 1.5 * IQR(mydata$total_sqft)

minimum <- Q1 - 1.5 * IQR(mydata$total_sqft)

#### Set the rows outside the fences aside in odd_data, keep the rest.

odd_data <- rbind(
  odd_data,
  subset(mydata, total_sqft < minimum | total_sqft > maximum)
)

mydata <- subset(mydata, total_sqft >= minimum & total_sqft <= maximum)

#mydata[order(mydata$livable_sqft, decreasing = F), ]

#### Remove rows whose total_sqft is below 100 and store them in odd_data.

#mydata[order(mydata$total_sqft<100, decreasing = T),]

odd_data <- rbind(odd_data, subset(mydata, total_sqft < 100))

mydata <- subset(mydata, total_sqft >= 100)


#### garage_sqft

summary(mydata$garage_sqft)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    -4.0   418.0   465.0   460.2   601.0  5040.0

#### Garage area per garage type (columns mapped by name, not via mydata$).

ggplot(mydata, aes(x = garage_type, y = garage_sqft)) +
    geom_boxplot()

#### Quartiles of garage_sqft and fences at 1.5 * IQR beyond them.

Q1 <- quantile(mydata$garage_sqft)[2]

Q3 <- quantile(mydata$garage_sqft)[4]

maximum <- Q3 + IQR(mydata$garage_sqft) * 1.5

minimum <- Q1 - IQR(mydata$garage_sqft) * 1.5

#### Store the rows outside the fences in odd_data, then drop them.
#### The stored subset used to test `mydata$totgarage_sqft`, a column that
#### does not exist: the NULL made the whole condition zero-length, so no
#### garage outlier was ever saved in odd_data although the rows were still
#### removed from mydata. Both steps now use the same condition.
#### NOTE(review): with the quartiles printed above (418 and 601) the lower
#### fence is about 143.5, so every house with garage_sqft = 0 (garage_type
#### "none") is removed here as well - confirm that this is intended.

odd_data <- rbind(
  odd_data,
  subset(mydata, garage_sqft < minimum | garage_sqft > maximum)
)

mydata <- subset(mydata, !(garage_sqft < minimum | garage_sqft > maximum))

#mydata[order(mydata$livable_sqft, decreasing = F), ]

#### Remove rows whose garage_sqft is below 50 (this includes the negative
#### values) and store them in odd_data.

#mydata[order(mydata$garage_sqft, decreasing = F),]

odd_data <- rbind(odd_data, subset(mydata, garage_sqft < 50))

mydata <- subset(mydata, !(garage_sqft < 50))


summary(mydata)

##    year_built   stories    num_bedrooms   full_bathrooms  half_bathrooms
##  Min.   :1852   0:    1   3      :12836   2      :17934   0:13160       
##  1st Qu.:1985   1:18361   4      : 9432   1      : 7844   1:17762       
##  Median :1999   2:12263   2      : 5253   3      : 4557                 
##  Mean   :1994   3:  200   5      : 2854   4      :  502                 
##  3rd Qu.:2006   4:   97   6      :  278   0      :   53                 
##  Max.   :2017             1      :  211   5      :   30                 
##                           (Other):   58   (Other):    2                 
##   livable_sqft    total_sqft     garage_type     garage_sqft 
##  Min.   : 124   Min.   : 128   attached:28800   Min.   :144  
##  1st Qu.:1485   1st Qu.:1600   detached: 2122   1st Qu.:432  
##  Median :1905   Median :2046   none    :    0   Median :480  
##  Mean   :2022   Mean   :2156                    Mean   :513  
##  3rd Qu.:2506   3rd Qu.:2656                    3rd Qu.:610  
##  Max.   :4025   Max.   :4215                    Max.   :875  
##                                                              
##   carport_sqft      has_fireplace  has_pool     has_central_heating
##  Min.   :   0.000   FALSE: 9231   FALSE:25681   FALSE: 1325        
##  1st Qu.:   0.000   TRUE :21691   TRUE : 5241   TRUE :29597        
##  Median :   0.000                                                  
##  Mean   :   3.803                                                  
##  3rd Qu.:   0.000                                                  
##  Max.   :1534.000                                                  
##                                                                    
##  has_central_cooling                street_name                 city      
##  FALSE: 2120         Richardson Throughway:   62   Chadstad       : 3683  
##  TRUE :28802         Johnson Ville        :   47   Coletown       : 2959  
##                      Booker Pines         :   45   North Erinville: 2238  
##                      Boyle Brooks         :   44   Hallfort       : 2124  
##                      Avery Islands        :   43   Port Andrealand: 2116  
##                      Olson Fort           :   42   Jeffreyhaven   : 1887  
##                      (Other)              :30639   (Other)        :15915  
##    sale_price    
##  Min.   : 25830  
##  1st Qu.:304290  
##  Median :396899  
##  Mean   :404523  
##  3rd Qu.:493294  
##  Max.   :834751  
##

### Investigation of categorical data

### stories

table(mydata$stories)

## 
##     0     1     2     3     4 
##     1 18361 12263   200    97

### May be an underground building. Odd one.
mydata[mydata$stories == 0, ]

##       year_built stories num_bedrooms full_bathrooms half_bathrooms
## 35436       1991       0            3              2              0
##       livable_sqft total_sqft garage_type garage_sqft carport_sqft
## 35436         1365       1375    attached         550            0
##       has_fireplace has_pool has_central_heating has_central_cooling
## 35436          TRUE    FALSE                TRUE                TRUE
##       street_name     city sale_price
## 35436 Bobby Views Chadstad     390597

### Move the rows recorded with 0 stories into odd_data.
zero_story <- mydata$stories == 0
odd_data <- rbind(odd_data, mydata[zero_story, ])
mydata <- mydata[!zero_story, ]

### Rebuild the factor so the now-empty "0" level is dropped.
mydata$stories <- factor(mydata$stories)

### num_bedrooms

table(mydata$num_bedrooms)

## 
##     0     1     2     3     4     5     6     7     8     9    10    11 
##    13   211  5253 12835  9432  2854   278    28    10     1     0     0 
##    13    14    21    31 
##     1     2     0     3

### Rebuild the factor so that levels with no remaining rows are dropped.
mydata$num_bedrooms <- factor(mydata$num_bedrooms)

### Tabulate again to check the houses recorded with 0 bedrooms.
table(mydata$num_bedrooms)

## 
##     0     1     2     3     4     5     6     7     8     9    13    14 
##    13   211  5253 12835  9432  2854   278    28    10     1     1     2 
##    31 
##     3

### Move the houses recorded with 0 bedrooms into odd_data.
no_bedroom <- mydata$num_bedrooms == 0
odd_data <- rbind(odd_data, subset(mydata, no_bedroom))
mydata <- subset(mydata, !no_bedroom)

### Drop the now-empty "0" level.
mydata$num_bedrooms <- factor(mydata$num_bedrooms)

subset(mydata, mydata$num_bedrooms == 31)

##       year_built stories num_bedrooms full_bathrooms half_bathrooms
## 6331        2014       2           31              2              1
## 23846       2015       2           31              2              1
## 33539       2016       1           31              2              0
##       livable_sqft total_sqft garage_type garage_sqft carport_sqft
## 6331          1717       1758    attached         434            0
## 23846         2549       2654    attached         612            0
## 33539         1856       1890    attached         412            0
##       has_fireplace has_pool has_central_heating has_central_cooling
## 6331          FALSE    FALSE                TRUE                TRUE
## 23846         FALSE    FALSE                TRUE                TRUE
## 33539         FALSE    FALSE                TRUE                TRUE
##           street_name      city sale_price
## 6331  Stanley Islands  Coletown     497073
## 23846   Barbara Roads Davidfort     470607
## 33539     Steven Cape Davidfort     406349

### Move the houses recorded with 31 bedrooms into odd_data.
has_31_bedrooms <- mydata$num_bedrooms == 31
odd_data <- rbind(odd_data, subset(mydata, has_31_bedrooms))
mydata <- subset(mydata, !has_31_bedrooms)

### Drop the now-empty "31" level.
mydata$num_bedrooms <- factor(mydata$num_bedrooms)

subset(mydata, mydata$num_bedrooms %in% c(8:14))

##       year_built stories num_bedrooms full_bathrooms half_bathrooms
## 2096        2001       2            8              2              1
## 4670        2000       2            8              2              1
## 4673        1999       2            8              3              1
## 4674        1996       2            8              3              1
## 5888        2013       1           14              3              1
## 6577        2011       1           13              3              1
## 10044       1998       2            8              3              1
## 14285       1972       1            9              3              1
## 15466       1998       2            8              3              1
## 16879       2016       2           14              3              0
## 35049       2000       2            8              2              1
## 35052       2003       2            8              2              1
## 36807       2001       2            8              2              1
## 36812       1997       2            8              3              1
##       livable_sqft total_sqft garage_type garage_sqft carport_sqft
## 2096          2904       2954    attached         579            0
## 4670          2678       2724    attached         580            0
## 4673          2700       2745    attached         776            0
## 4674          2709       2751    attached         774            0
## 5888          2761       3303    attached         585            0
## 6577          2397       2888    attached         546            0
## 10044         2709       2748    attached         776            0
## 14285         3392       3484    attached         444            0
## 15466         2711       2748    attached         776            0
## 16879         2917       3131    attached         475            0
## 35049         2665       2707    attached         582            0
## 35052         2668       2715    attached         584            0
## 36807         2684       2724    attached         589            0
## 36812         2711       2752    attached         775            0
##       has_fireplace has_pool has_central_heating has_central_cooling
## 2096           TRUE     TRUE                TRUE                TRUE
## 4670           TRUE    FALSE                TRUE                TRUE
## 4673           TRUE     TRUE                TRUE                TRUE
## 4674           TRUE     TRUE                TRUE                TRUE
## 5888           TRUE    FALSE                TRUE                TRUE
## 6577          FALSE     TRUE                TRUE                TRUE
## 10044          TRUE     TRUE                TRUE                TRUE
## 14285         FALSE    FALSE                TRUE                TRUE
## 15466          TRUE     TRUE                TRUE                TRUE
## 16879          TRUE    FALSE                TRUE                TRUE
## 35049          TRUE    FALSE                TRUE                TRUE
## 35052          TRUE    FALSE                TRUE                TRUE
## 36807          TRUE     TRUE                TRUE                TRUE
## 36812          TRUE     TRUE                TRUE                TRUE
##          street_name            city sale_price
## 2096  Gilbert Valley      East Lucas     409496
## 4670     Obrien Pine North Erinville     453601
## 4673    Kennedy Hill North Erinville     398157
## 4674  Edward Viaduct North Erinville     554397
## 5888    Nancy Bridge   South Anthony     513451
## 6577   Clarke Hollow   South Anthony     793796
## 10044   Mark Circles North Erinville     485097
## 14285   Cooper Point        Chadstad     553138
## 15466    Obrien Pine North Erinville     522904
## 16879   Stephen Cove        Coletown     612363
## 35049   Nelson Haven        Hallfort     403197
## 35052  Joseph Stream        Hallfort     403204
## 36807 Reid Junctions North Erinville     409502
## 36812      Tina View North Erinville     530464

### Move the houses recorded with 8 to 14 bedrooms into odd_data.
many_bedrooms <- mydata$num_bedrooms %in% 8:14
odd_data <- rbind(odd_data, subset(mydata, many_bedrooms))
mydata <- subset(mydata, !many_bedrooms)

### Drop the levels that no longer have any rows.
mydata$num_bedrooms <- factor(mydata$num_bedrooms)



### full_bathrooms

table(mydata$full_bathrooms)

## 
##     0     1     2     3     4     5     6     7     8 
##    50  7838 17922  4547   502    30     2     0     0

subset(mydata, mydata$full_bathrooms == 0 & mydata$half_bathrooms == 0)

##  [1] year_built          stories             num_bedrooms       
##  [4] full_bathrooms      half_bathrooms      livable_sqft       
##  [7] total_sqft          garage_type         garage_sqft        
## [10] carport_sqft        has_fireplace       has_pool           
## [13] has_central_heating has_central_cooling street_name        
## [16] city                sale_price         
## <0 rows> (or 0-length row.names)

subset(mydata, mydata$full_bathrooms == 0)

##       year_built stories num_bedrooms full_bathrooms half_bathrooms
## 51          1955       1            1              0              1
## 506         2002       1            2              0              1
## 1543        1937       1            1              0              1
## 1902        1947       1            1              0              1
## 2234        1948       1            1              0              1
## 2293        1922       1            1              0              1
## 2800        1928       1            1              0              1
## 3351        1930       1            3              0              1
## 3579        2000       1            2              0              1
## 5430        1954       1            1              0              1
## 7057        1950       1            2              0              1
## 7552        1924       1            1              0              1
## 10163       1902       1            3              0              1
## 10876       1947       1            2              0              1
## 12545       1957       1            1              0              1
## 12920       1932       1            2              0              1
## 12982       1924       1            1              0              1
## 13993       1968       2            1              0              1
## 14117       1921       1            2              0              1
## 14137       1909       1            2              0              1
## 14335       1950       1            1              0              1
## 14627       1977       1            1              0              1
## 15829       1966       1            2              0              1
## 17481       1949       1            2              0              1
## 18092       1962       1            2              0              1
## 18099       1953       1            2              0              1
## 19973       1948       1            1              0              1
## 19988       1941       1            2              0              1
## 21120       1993       1            1              0              1
## 24828       1945       1            3              0              1
## 24829       1946       1            3              0              1
## 26547       1994       1            1              0              1
## 27099       1982       1            1              0              1
## 28794       1938       1            1              0              1
## 30339       1954       1            3              0              1
## 30395       1946       1            1              0              1
## 30443       1953       1            2              0              1
## 30902       1939       2            1              0              1
## 32738       1921       1            2              0              1
## 33879       1948       1            1              0              1
## 34352       1938       1            2              0              1
## 35507       1920       1            3              0              1
## 36935       1996       2            1              0              1
## 38362       1938       1            1              0              1
## 39249       1946       1            1              0              1
## 39468       1938       1            2              0              1
## 39469       1948       1            3              0              1
## 39625       1949       1            2              0              1
## 42369       1994       1            1              0              1
## 42378       1958       1            2              0              1
##       livable_sqft total_sqft garage_type garage_sqft carport_sqft
## 51             813        980    detached         700            0
## 506           1297       1300    attached         416            0
## 1543           596        645    detached         249            0
## 1902           406        588    detached         401            0
## 2234           571        826    detached         218            0
## 2293           693       1022    detached         289            0
## 2800           429        723    detached         422            0
## 3351          1239       1640    detached         442            0
## 3579           954        956    attached         572            0
## 5430           758        817    attached         222            0
## 7057           943       1084    detached         397            0
## 7552           835        929    attached         224            0
## 10163          917       1125    detached         181            0
## 10876          723       1112    detached         361            0
## 12545          608        754    attached         213            0
## 12920          620        775    detached         322            0
## 12982          732        836    detached         397            0
## 13993          796        841    attached         844            0
## 14117          921       1059    detached         194            0
## 14137         1162       1470    detached         358            0
## 14335          602        842    attached         221            0
## 14627          478        482    detached         575            0
## 15829         1061       1411    attached         312            0
## 17481          997       1004    detached         275            0
## 18092          703        808    attached         395            0
## 18099          687        976    detached         283            0
## 19973          646        782    detached         404            0
## 19988         1228       1236    attached         332            0
## 21120         1066       1175    attached         326            0
## 24828          989       1289    detached         315            0
## 24829          989       1290    detached         310            0
## 26547         1047       1170    attached         324            0
## 27099          817        879    attached         424            0
## 28794          556        566    detached         216            0
## 30339          966       1116    detached         764            0
## 30395          594        863    detached         477            0
## 30443          826       1026    attached         292            0
## 30902          707        850    attached         680            0
## 32738          762        981    detached         277            0
## 33879          543        696    detached         244            0
## 34352          892       1033    detached         183            0
## 35507         1534       1760    detached         519            0
## 36935         1445       1446    attached         764            0
## 38362          806       1035    detached         305          307
## 39249          888       1218    detached         440            0
## 39468          962       1186    detached         440            0
## 39469         1011       1146    detached         324          263
## 39625          817        903    detached         172            0
## 42369         1053       1164    attached         329            0
## 42378          772       1184    detached         478          690
##       has_fireplace has_pool has_central_heating has_central_cooling
## 51            FALSE    FALSE               FALSE               FALSE
## 506            TRUE     TRUE                TRUE                TRUE
## 1543          FALSE    FALSE               FALSE               FALSE
## 1902          FALSE    FALSE               FALSE               FALSE
## 2234          FALSE    FALSE               FALSE               FALSE
## 2293          FALSE    FALSE               FALSE               FALSE
## 2800          FALSE    FALSE               FALSE               FALSE
## 3351          FALSE    FALSE               FALSE               FALSE
## 3579          FALSE    FALSE               FALSE               FALSE
## 5430          FALSE     TRUE               FALSE                TRUE
## 7057          FALSE     TRUE               FALSE               FALSE
## 7552          FALSE    FALSE               FALSE               FALSE
## 10163         FALSE    FALSE               FALSE               FALSE
## 10876         FALSE    FALSE               FALSE               FALSE
## 12545          TRUE    FALSE               FALSE               FALSE
## 12920         FALSE    FALSE               FALSE               FALSE
## 12982         FALSE    FALSE               FALSE               FALSE
## 13993         FALSE    FALSE                TRUE                TRUE
## 14117          TRUE    FALSE                TRUE                TRUE
## 14137         FALSE    FALSE               FALSE               FALSE
## 14335         FALSE    FALSE               FALSE               FALSE
## 14627         FALSE    FALSE               FALSE               FALSE
## 15829         FALSE    FALSE               FALSE               FALSE
## 17481         FALSE    FALSE               FALSE               FALSE
## 18092         FALSE    FALSE               FALSE               FALSE
## 18099         FALSE    FALSE               FALSE               FALSE
## 19973         FALSE    FALSE               FALSE               FALSE
## 19988          TRUE    FALSE               FALSE               FALSE
## 21120         FALSE    FALSE                TRUE                TRUE
## 24828         FALSE    FALSE               FALSE               FALSE
## 24829         FALSE    FALSE               FALSE               FALSE
## 26547         FALSE    FALSE                TRUE                TRUE
## 27099         FALSE    FALSE                TRUE                TRUE
## 28794         FALSE    FALSE               FALSE               FALSE
## 30339         FALSE    FALSE               FALSE               FALSE
## 30395         FALSE    FALSE               FALSE               FALSE
## 30443         FALSE    FALSE               FALSE               FALSE
## 30902         FALSE    FALSE               FALSE               FALSE
## 32738         FALSE    FALSE               FALSE               FALSE
## 33879         FALSE    FALSE               FALSE               FALSE
## 34352         FALSE    FALSE               FALSE               FALSE
## 35507         FALSE    FALSE                TRUE               FALSE
## 36935         FALSE    FALSE               FALSE               FALSE
## 38362          TRUE    FALSE               FALSE               FALSE
## 39249         FALSE    FALSE               FALSE               FALSE
## 39468         FALSE    FALSE               FALSE               FALSE
## 39469         FALSE    FALSE               FALSE               FALSE
## 39625         FALSE    FALSE               FALSE               FALSE
## 42369         FALSE    FALSE                TRUE                TRUE
## 42378         FALSE    FALSE               FALSE               FALSE
##             street_name               city sale_price
## 51       William Valley Lake Christinaport     126001
## 506          Darius Row      South Anthony     321302
## 1543         Smith View           Chadstad     157504
## 1902       Adams Stream          Scottberg     283498
## 2234       Roger Cliffs          Davidfort     214199
## 2293  Patterson Centers          Davidfort     233103
## 2800      Fields Groves Lake Dariusborough     207899
## 3351           Ana Glen           Chadstad     409500
## 3579       Fuller Light           Chadstad     466196
## 5430      Angela Harbor Lake Christinaport     283501
## 7057      Cheyenne Park       Jeffreyhaven     103318
## 7552       Lewis Hollow          Davidfort     277203
## 10163      Adams Stream    North Erinville     173879
## 10876       Wanda Crest Lake Christinaport     119704
## 12545    Mitchell Lodge          Scottberg     214200
## 12920    Davies Centers          Davidfort     126001
## 12982     Price Freeway          Davidfort     440996
## 13993 Richardson Shores           Chadstad     810806
## 14117       King Meadow           Chadstad     333901
## 14137   Reginald Circle           Chadstad     197819
## 14335    Hunter Passage   West Gregoryview     114659
## 14627       Tammy Manor           Chadstad     459897
## 15829    Hernandez Fort           Leahview      37800
## 17481  Smith Expressway           Hallfort     207901
## 18092   Robinson Canyon   East Janiceville     220496
## 18099       Alexis Fork         East Lucas     504002
## 19973       Ryan Lights         Justinport     396898
## 19988  Woodard Junction         Justinport     390600
## 21120 Katherine Passage         Lewishaven     233101
## 24828       Billy Brook           Chadstad     313736
## 24829       Billy Brook           Chadstad     200339
## 26547 Katherine Passage         Lewishaven     233098
## 27099        Kevin Wall         Lewishaven     195302
## 28794     Little Bypass         East Lucas     149308
## 30339       Ryan Lights           Chadstad     352796
## 30395    Victor Springs           Chadstad      95757
## 30443       Scott Brook           Chadstad     274676
## 30902    Mitchell Lodge           Coletown     520384
## 32738        Cody Roads      West Terrence     176396
## 33879    Harrell Avenue       Jeffreyhaven      90088
## 34352    Elizabeth Walk          Davidfort     251998
## 35507       Abbott View           Chadstad     472502
## 36935   Gonzalez Hollow    North Erinville     535501
## 38362      Pham Station       Lake Carolyn     239398
## 39249    Stephanie Lake          Scottberg     103948
## 39468        Aaron Mill         East Lucas     113397
## 39469     Jennifer Isle         East Lucas     437216
## 39625   Velasquez Track          Davidfort     214203
## 42369       Perez Mount         Lewishaven     233099
## 42378    Hernandez Fort           Leahview      65517

subset(mydata, mydata$full_bathrooms == 5, select = c(full_bathrooms, half_bathrooms, total_sqft, sale_price))

##       full_bathrooms half_bathrooms total_sqft sale_price
## 1510               5              0       3803     560700
## 3224               5              0       3763     718203
## 4193               5              0       3636     693002
## 4765               5              1       3004     441000
## 6340               5              1       4215     803250
## 6729               5              0       3696     584012
## 6730               5              0       3690     620554
## 6752               5              0       3695     650164
## 8694               5              0       3136     585902
## 12969              5              0       2776     440996
## 13778              5              0       2915     566998
## 17034              5              1       4209     715680
## 17476              5              0       3815     631264
## 17538              5              0       3802     440999
## 22782              5              0       3813     598497
## 22783              5              0       3700     612362
## 22784              5              0       3693     752849
## 27864              5              0       3614     706860
## 28202              5              0       3700     663391
## 33223              5              0       3642     740884
## 33546              5              0       3817     636934
## 33568              5              0       3813     652051
## 36792              5              0       3781     522903
## 38545              5              0       3693     779310
## 38736              5              1       2507     415800
## 38737              5              1       2509     415171
## 38791              5              0       3731     618656
## 38812              5              0       3814     645121
## 38826              5              0       3813     729538
## 39681              5              0       2777     289800

### Rebuild the factor so that levels with a zero count are dropped.
mydata$full_bathrooms <- factor(mydata$full_bathrooms)

### half_bathrooms

table(mydata$half_bathrooms)

## 
##     0     1 
## 13149 17742

### garage_type

table(mydata$garage_type)

## 
## attached detached     none 
##    28771     2120        0

### Rebuild the factor so the empty "none" level is dropped.
mydata$garage_type <- factor(mydata$garage_type)

### has_fireplace

table(mydata$has_fireplace)

## 
## FALSE  TRUE 
##  9218 21673

table(mydata$has_pool)

## 
## FALSE  TRUE 
## 25665  5226

table(mydata$has_central_heating)

## 
## FALSE  TRUE 
##  1320 29571

table(mydata$has_central_cooling)

## 
## FALSE  TRUE 
##  2115 28776

### Data clean up is done.

### Next, I will check for multicollinearity in the data.

## Corrplot / correlation matrix



names(mydata)

##  [1] "year_built"          "stories"             "num_bedrooms"       
##  [4] "full_bathrooms"      "half_bathrooms"      "livable_sqft"       
##  [7] "total_sqft"          "garage_type"         "garage_sqft"        
## [10] "carport_sqft"        "has_fireplace"       "has_pool"           
## [13] "has_central_heating" "has_central_cooling" "street_name"        
## [16] "city"                "sale_price"

### Pairwise correlations between the columns listed in quantitative_var.
m <- cor(mydata[quantitative_var])

### Lower triangle only, with the coefficients printed as numbers.
corrplot(m, type = "lower", method = "number")

### Full matrix, variables ordered by hierarchical clustering, with
### rectangles drawn around 3 clusters.
corrplot(
  m,
  method = "number",
  order = "hclust",
  addrect = 3,
  tl.col = "black",
  tl.srt = 30
)

### sale_price has a very low correlation with carport_sqft, so I will remove carport_sqft.
### total_sqft and livable_sqft are highly correlated, so I will remove livable_sqft.

names(mydata)

##  [1] "year_built"          "stories"             "num_bedrooms"       
##  [4] "full_bathrooms"      "half_bathrooms"      "livable_sqft"       
##  [7] "total_sqft"          "garage_type"         "garage_sqft"        
## [10] "carport_sqft"        "has_fireplace"       "has_pool"           
## [13] "has_central_heating" "has_central_cooling" "street_name"        
## [16] "city"                "sale_price"

### Drop livable_sqft and carport_sqft (columns 6 and 10 of mydata at this
### point). Selecting them by name keeps the step correct if the column order
### changes, and makes it harmless to run this line a second time.
drop_cols <- c("livable_sqft", "carport_sqft")
mydata <- mydata[, !(names(mydata) %in% drop_cols), drop = FALSE]

## How is sale_price related to each categorical variable? (one-way ANOVA per variable)

summary(aov(formula = sale_price ~ stories,data = mydata))

##                Df    Sum Sq   Mean Sq F value Pr(>F)    
## stories         3 5.225e+13 1.742e+13   913.9 <2e-16 ***
## Residuals   30887 5.886e+14 1.906e+10                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

summary(aov(formula = sale_price ~ num_bedrooms,data = mydata))

##                 Df    Sum Sq   Mean Sq F value Pr(>F)    
## num_bedrooms     6 1.061e+14 1.768e+13    1021 <2e-16 ***
## Residuals    30884 5.348e+14 1.732e+10                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

summary(aov(formula = sale_price ~ full_bathrooms,data = mydata))

##                   Df    Sum Sq   Mean Sq F value Pr(>F)    
## full_bathrooms     6 1.329e+14 2.215e+13    1347 <2e-16 ***
## Residuals      30884 5.079e+14 1.645e+10                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

summary(aov(formula = sale_price ~ half_bathrooms,data = mydata))

##                   Df    Sum Sq   Mean Sq F value Pr(>F)    
## half_bathrooms     1 1.442e+12 1.442e+12   69.67 <2e-16 ***
## Residuals      30889 6.394e+14 2.070e+10                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

summary(aov(formula = sale_price ~ garage_type,data = mydata))

##                Df    Sum Sq   Mean Sq F value Pr(>F)    
## garage_type     1 1.816e+13 1.816e+13   900.6 <2e-16 ***
## Residuals   30889 6.227e+14 2.016e+10                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

summary(aov(formula = sale_price ~ has_fireplace,data = mydata))

##                  Df    Sum Sq   Mean Sq F value Pr(>F)    
## has_fireplace     1 2.958e+13 2.958e+13    1495 <2e-16 ***
## Residuals     30889 6.113e+14 1.979e+10                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

summary(aov(formula = sale_price ~ has_pool,data = mydata))

##                Df    Sum Sq   Mean Sq F value Pr(>F)    
## has_pool        1 4.100e+13 4.100e+13    2111 <2e-16 ***
## Residuals   30889 5.998e+14 1.942e+10                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

summary(aov(formula = sale_price ~ has_central_cooling,data = mydata))

##                        Df    Sum Sq   Mean Sq F value Pr(>F)    
## has_central_cooling     1 1.951e+13 1.951e+13     970 <2e-16 ***
## Residuals           30889 6.213e+14 2.011e+10                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

summary(aov(formula = sale_price ~ has_central_heating,data = mydata))

##                        Df    Sum Sq   Mean Sq F value Pr(>F)    
## has_central_heating     1 2.156e+13 2.156e+13    1075 <2e-16 ***
## Residuals           30889 6.193e+14 2.005e+10                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

#summary(aov(formula = sale_price ~ mydata$street_name,data = mydata))
#summary(aov(formula = mydata$sale_price ~ mydata$city,data = mydata))

### Change factor levels to something greater than 0. 

# Relabel the levels of each flag column as 1 and 2. Levels are renamed in
# their existing order; the underlying observations are not changed.
for (flag_col in c("has_fireplace", "has_pool", "has_central_heating",
                   "has_central_cooling", "garage_type")) {
  levels(mydata[[flag_col]]) <- c(1, 2)
}
rm(flag_col)  # keep the workspace free of the loop variable

### Write final data set 

# Write the modelling data frame and the odd_data frame to CSV files in the
# working directory. Row names are not written. `FALSE` is spelled out because
# `F` is an ordinary variable that can be reassigned.
write.csv(mydata, "houseprice_model.csv", row.names = FALSE)

write.csv(odd_data, "houseprice_odd_observations.csv", row.names = FALSE)

# model - 
### Price prediction
### Classification 1. Type of house, city
### Clustering
### Regression with PCA

## Define file name 

filename <- "houseprice_model.csv"


## Load CSV file from local directory

# header = TRUE: the first row of the file holds the column names.
# stringsAsFactors = TRUE keeps the text columns (street_name, city) as
# factors on R >= 4.0, where the default changed to FALSE.
inputdata <- read.csv(filename, header = TRUE, stringsAsFactors = TRUE)

str(inputdata)

## 'data.frame':    30891 obs. of  15 variables:
##  $ year_built         : int  1978 1958 2004 2006 2005 1979 2005 2006 2003 2004 ...
##  $ stories            : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ num_bedrooms       : int  4 3 4 4 3 3 4 4 3 4 ...
##  $ full_bathrooms     : int  1 1 2 2 2 2 4 2 1 2 ...
##  $ half_bathrooms     : int  1 1 0 0 0 1 0 1 1 1 ...
##  $ total_sqft         : int  1859 2002 2277 1749 1672 2365 2254 2679 2000 2197 ...
##  $ garage_type        : int  1 1 1 1 1 2 1 1 1 1 ...
##  $ garage_sqft        : int  508 462 479 430 430 532 502 624 428 397 ...
##  $ has_fireplace      : int  2 2 2 2 2 2 2 2 2 2 ...
##  $ has_pool           : int  1 1 1 1 1 1 1 1 1 2 ...
##  $ has_central_heating: int  2 2 2 2 2 2 2 2 2 2 ...
##  $ has_central_cooling: int  2 2 2 2 2 2 2 2 2 2 ...
##  $ street_name        : Factor w/ 9511 levels "Aaron Cliff",..: 5438 3030 6133 3840 6133 3410 2047 6901 1298 9436 ...
##  $ city               : Factor w/ 46 levels "Amystad","Brownport",..: 13 13 20 20 20 27 20 20 20 20 ...
##  $ sale_price         : num  270897 302404 197193 207897 196559 ...

# Columns holding discrete counts/codes that are treated as categories.
categorical_var <- c(
  "stories", "num_bedrooms", "full_bathrooms", "half_bathrooms",
  "garage_type", "has_fireplace", "has_pool",
  "has_central_heating", "has_central_cooling"
)

# Convert every listed column to a factor.
inputdata[categorical_var] <- lapply(inputdata[categorical_var], factor)

# Number of rows and columns in the loaded data set
dim(inputdata)

## [1] 30891    15

# Column names of the loaded data set
names(inputdata)

##  [1] "year_built"          "stories"             "num_bedrooms"       
##  [4] "full_bathrooms"      "half_bathrooms"      "total_sqft"         
##  [7] "garage_type"         "garage_sqft"         "has_fireplace"      
## [10] "has_pool"            "has_central_heating" "has_central_cooling"
## [13] "street_name"         "city"                "sale_price"

## 30891 observations and 15 attributes. 

## Randomly select 5000 observations (about 16% of the data set) for this analysis. 

## I will use the validation-set approach: 70% training set and 30% test set.


## Normalize continuous variable using formula (x - min(x))/(max(x) - min(x))

# Min-max scale a numeric vector onto [0, 1]: (x - min(x)) / (max(x) - min(x)).
#
# Arguments:
#   x      numeric vector to rescale.
#   na.rm  if TRUE, NA values are ignored when computing the range and stay NA
#          in the result. The default (FALSE) keeps the previous behaviour:
#          any NA in x makes the whole result NA.
#
# Returns a numeric vector of the same length as x. A constant vector has a
# zero range; it is mapped to 0 rather than the NaN that 0 / 0 would produce.
normalize <- function(x, na.rm = FALSE) {
    rng <- range(x, na.rm = na.rm)
    span <- rng[2] - rng[1]
    if (isTRUE(span == 0)) {
        # Every non-missing value equals the minimum, so x - min is already 0.
        return(x - rng[1])
    }
    (x - rng[1]) / span
}


# Drop columns 1, 13 and 14 (year_built, street_name, city); the remaining
# columns are the modelling variables.
workdata <- inputdata[, -c(1, 13, 14)]

#workdata <- scale(workdata, center = T, scale = T)

# Fix the RNG, then draw 5000 row positions without replacement.
set.seed(125)

index <- sample(seq_len(nrow(workdata)), 5000)

sampledata <- as.data.frame(workdata[index, ])

## Min-max scale the continuous columns: sale_price, total_sqft, garage_sqft.

sampledata[c("sale_price", "total_sqft", "garage_sqft")] <- lapply(
  sampledata[c("sale_price", "total_sqft", "garage_sqft")],
  normalize
)

summary(sampledata)

##  stories  num_bedrooms full_bathrooms half_bathrooms   total_sqft    
##  1:2942   1:  27       0:   9         0:2118         Min.   :0.0000  
##  2:2012   2: 867       1:1274         1:2882         1st Qu.:0.3043  
##  3:  29   3:2027       2:2888                        Median :0.4256  
##  4:  17   4:1572       3: 752                        Mean   :0.4531  
##           5: 455       4:  73                        3rd Qu.:0.5875  
##           6:  48       5:   4                        Max.   :1.0000  
##           7:   4       6:   0                                        
##  garage_type  garage_sqft     has_fireplace has_pool has_central_heating
##  1:4657      Min.   :0.0000   1:1446        1:4159   1: 205             
##  2: 343      1st Qu.:0.3940   2:3554        2: 841   2:4795             
##              Median :0.4583                                             
##              Mean   :0.5046                                             
##              3rd Qu.:0.6402                                             
##              Max.   :1.0000                                             
##                                                                         
##  has_central_cooling   sale_price    
##  1: 342              Min.   :0.0000  
##  2:4658              1st Qu.:0.3406  
##                      Median :0.4583  
##                      Mean   :0.4669  
##                      3rd Qu.:0.5802  
##                      Max.   :1.0000  
##

# Expand every categorical column into one 0/1 indicator column per level

sampledata <- dummy.data.frame(
  sampledata,
  names = c(
    "stories", "num_bedrooms", "full_bathrooms", "half_bathrooms",
    "garage_type", "has_fireplace", "has_pool",
    "has_central_cooling", "has_central_heating"
  )
)

names(sampledata)

##  [1] "stories1"             "stories2"             "stories3"            
##  [4] "stories4"             "num_bedrooms1"        "num_bedrooms2"       
##  [7] "num_bedrooms3"        "num_bedrooms4"        "num_bedrooms5"       
## [10] "num_bedrooms6"        "num_bedrooms7"        "full_bathrooms0"     
## [13] "full_bathrooms1"      "full_bathrooms2"      "full_bathrooms3"     
## [16] "full_bathrooms4"      "full_bathrooms5"      "half_bathrooms0"     
## [19] "half_bathrooms1"      "total_sqft"           "garage_type1"        
## [22] "garage_type2"         "garage_sqft"          "has_fireplace1"      
## [25] "has_fireplace2"       "has_pool1"            "has_pool2"           
## [28] "has_central_heating1" "has_central_heating2" "has_central_cooling1"
## [31] "has_central_cooling2" "sale_price"

# Inspect the structure of the dummy-encoded sample
str(sampledata)

## 'data.frame':    5000 obs. of  32 variables:
##  $ stories1            : int  1 0 0 0 1 0 0 1 0 0 ...
##  $ stories2            : int  0 1 1 1 0 1 1 0 1 1 ...
##  $ stories3            : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ stories4            : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ num_bedrooms1       : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ num_bedrooms2       : int  0 1 0 0 0 0 0 1 0 0 ...
##  $ num_bedrooms3       : int  1 0 0 1 1 0 0 0 0 0 ...
##  $ num_bedrooms4       : int  0 0 1 0 0 1 1 0 0 1 ...
##  $ num_bedrooms5       : int  0 0 0 0 0 0 0 0 1 0 ...
##  $ num_bedrooms6       : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ num_bedrooms7       : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ full_bathrooms0     : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ full_bathrooms1     : int  1 0 0 0 0 0 0 1 0 0 ...
##  $ full_bathrooms2     : int  0 1 1 1 1 1 1 0 0 0 ...
##  $ full_bathrooms3     : int  0 0 0 0 0 0 0 0 1 1 ...
##  $ full_bathrooms4     : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ full_bathrooms5     : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ half_bathrooms0     : int  0 1 0 0 0 0 0 1 1 1 ...
##  $ half_bathrooms1     : int  1 0 1 1 1 1 1 0 0 0 ...
##  $ total_sqft          : num  0.536 0.615 0.637 0.332 0.573 ...
##  $ garage_type1        : int  1 1 1 1 1 1 1 0 1 1 ...
##  $ garage_type2        : int  0 0 0 0 0 0 0 1 0 0 ...
##  $ garage_sqft         : num  0.317 0.443 0.635 0.436 0.668 ...
##  $ has_fireplace1      : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ has_fireplace2      : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ has_pool1           : int  1 1 1 1 1 1 1 1 1 0 ...
##  $ has_pool2           : int  0 0 0 0 0 0 0 0 0 1 ...
##  $ has_central_heating1: int  0 0 0 0 0 0 0 1 0 0 ...
##  $ has_central_heating2: int  1 1 1 1 1 1 1 0 1 1 ...
##  $ has_central_cooling1: int  1 0 0 0 0 0 0 1 0 0 ...
##  $ has_central_cooling2: int  0 1 1 1 1 1 1 0 1 1 ...
##  $ sale_price          : num  0.615 0.647 0.686 0.482 0.666 ...
##  - attr(*, "dummies")=List of 9
##   ..$ stories            : int  1 2 3 4
##   ..$ num_bedrooms       : int  5 6 7 8 9 10 11
##   ..$ full_bathrooms     : int  12 13 14 15 16 17
##   ..$ half_bathrooms     : int  18 19
##   ..$ garage_type        : int  21 22
##   ..$ has_fireplace      : int  24 25
##   ..$ has_pool           : int  26 27
##   ..$ has_central_heating: int  28 29
##   ..$ has_central_cooling: int  30 31

## Now we have 32 variables (31 predictors plus sale_price) and 5000 observations.

## Model Development 


## Training and Validation set

# Fix the RNG so the 70/30 split is reproducible.
set.seed(125)

# Row positions assigned to the training set (70% of the sample).
training_index <- sample(seq_len(nrow(sampledata)), nrow(sampledata) * 0.7)

training <- sampledata[training_index, ]

# Every row not drawn for training forms the hold-out set.
testing <- sampledata[-training_index, ]

## Decision tree for regression

# Fit a regression tree (method = "anova") of sale_price on all other columns
fit_DT <- rpart(
  formula = sale_price ~ .,
  data = training,
  method = "anova"
)

# Print the CP table, variable importance and per-node details
summary(fit_DT)

## Call:
## rpart(formula = sale_price ~ ., data = training, method = "anova")
##   n= 3500 
## 
##           CP nsplit rel error    xerror       xstd
## 1 0.31943386      0 1.0000000 1.0006793 0.02384119
## 2 0.05168657      1 0.6805661 0.6999094 0.01804805
## 3 0.05060668      2 0.6288796 0.6444641 0.01684658
## 4 0.01448357      3 0.5782729 0.5974777 0.01622982
## 5 0.01071923      4 0.5637893 0.5821069 0.01595812
## 6 0.01044152      5 0.5530701 0.5727082 0.01583220
## 7 0.01000000      6 0.5426286 0.5702178 0.01581326
## 
## Variable importance
##      total_sqft     garage_sqft        stories1        stories2 
##              39              16              10              10 
## full_bathrooms1 full_bathrooms3   num_bedrooms2 full_bathrooms2 
##               9               8               1               1 
##   num_bedrooms5       has_pool1       has_pool2   num_bedrooms3 
##               1               1               1               1 
##   num_bedrooms4 
##               1 
## 
## Node number 1: 3500 observations,    complexity param=0.3194339
##   mean=0.4679249, MSE=0.03241327 
##   left son=2 (1839 obs) right son=3 (1661 obs)
##   Primary splits:
##       total_sqft      < 0.4403462 to the left,  improve=0.31943390, (0 missing)
##       garage_sqft     < 0.5314637 to the left,  improve=0.17990530, (0 missing)
##       full_bathrooms1 < 0.5       to the right, improve=0.13682310, (0 missing)
##       num_bedrooms2   < 0.5       to the right, improve=0.09494075, (0 missing)
##       full_bathrooms3 < 0.5       to the left,  improve=0.09296820, (0 missing)
##   Surrogate splits:
##       garage_sqft     < 0.5506156 to the left,  agree=0.758, adj=0.490, (0 split)
##       stories1        < 0.5       to the right, agree=0.691, adj=0.348, (0 split)
##       stories2        < 0.5       to the left,  agree=0.691, adj=0.348, (0 split)
##       full_bathrooms1 < 0.5       to the right, agree=0.669, adj=0.303, (0 split)
##       full_bathrooms3 < 0.5       to the left,  agree=0.653, adj=0.270, (0 split)
## 
## Node number 2: 1839 observations,    complexity param=0.05060668
##   mean=0.3712206, MSE=0.01881842 
##   left son=4 (976 obs) right son=5 (863 obs)
##   Primary splits:
##       total_sqft     < 0.3149134 to the left,  improve=0.16589520, (0 missing)
##       num_bedrooms2  < 0.5       to the right, improve=0.09454217, (0 missing)
##       garage_sqft    < 0.2414501 to the left,  improve=0.06922495, (0 missing)
##       has_fireplace1 < 0.5       to the right, improve=0.06033045, (0 missing)
##       has_fireplace2 < 0.5       to the left,  improve=0.06033045, (0 missing)
##   Surrogate splits:
##       full_bathrooms1 < 0.5       to the right, agree=0.622, adj=0.195, (0 split)
##       num_bedrooms4   < 0.5       to the left,  agree=0.620, adj=0.191, (0 split)
##       garage_sqft     < 0.4562244 to the left,  agree=0.607, adj=0.162, (0 split)
##       full_bathrooms2 < 0.5       to the left,  agree=0.604, adj=0.156, (0 split)
##       stories1        < 0.5       to the right, agree=0.591, adj=0.129, (0 split)
## 
## Node number 3: 1661 observations,    complexity param=0.05168657
##   mean=0.5749925, MSE=0.02564764 
##   left son=6 (977 obs) right son=7 (684 obs)
##   Primary splits:
##       total_sqft      < 0.6315579 to the left,  improve=0.13764220, (0 missing)
##       garage_sqft     < 0.5164159 to the left,  improve=0.04964865, (0 missing)
##       has_pool1       < 0.5       to the right, improve=0.04330838, (0 missing)
##       has_pool2       < 0.5       to the left,  improve=0.04330838, (0 missing)
##       full_bathrooms3 < 0.5       to the left,  improve=0.02000909, (0 missing)
##   Surrogate splits:
##       num_bedrooms5   < 0.5       to the left,  agree=0.673, adj=0.206, (0 split)
##       full_bathrooms3 < 0.5       to the left,  agree=0.655, adj=0.162, (0 split)
##       garage_sqft     < 0.6436389 to the left,  agree=0.644, adj=0.135, (0 split)
##       full_bathrooms2 < 0.5       to the right, agree=0.636, adj=0.117, (0 split)
##       full_bathrooms4 < 0.5       to the left,  agree=0.613, adj=0.060, (0 split)
## 
## Node number 4: 976 observations,    complexity param=0.01448357
##   mean=0.3186807, MSE=0.01487658 
##   left son=8 (361 obs) right son=9 (615 obs)
##   Primary splits:
##       num_bedrooms2  < 0.5       to the right, improve=0.11316540, (0 missing)
##       has_fireplace2 < 0.5       to the left,  improve=0.09729332, (0 missing)
##       has_fireplace1 < 0.5       to the right, improve=0.09729332, (0 missing)
##       garage_sqft    < 0.2414501 to the left,  improve=0.08566344, (0 missing)
##       num_bedrooms3  < 0.5       to the left,  improve=0.07126648, (0 missing)
##   Surrogate splits:
##       num_bedrooms3  < 0.5       to the left,  agree=0.880, adj=0.676, (0 split)
##       total_sqft     < 0.1993342 to the left,  agree=0.707, adj=0.208, (0 split)
##       garage_sqft    < 0.3426813 to the left,  agree=0.694, adj=0.172, (0 split)
##       has_fireplace1 < 0.5       to the right, agree=0.647, adj=0.044, (0 split)
##       has_fireplace2 < 0.5       to the left,  agree=0.647, adj=0.044, (0 split)
## 
## Node number 5: 863 observations
##   mean=0.43064, MSE=0.01662386 
## 
## Node number 6: 977 observations,    complexity param=0.01044152
##   mean=0.5252783, MSE=0.02001787 
##   left son=12 (783 obs) right son=13 (194 obs)
##   Primary splits:
##       has_pool2            < 0.5       to the left,  improve=0.060567860, (0 missing)
##       has_pool1            < 0.5       to the right, improve=0.060567860, (0 missing)
##       total_sqft           < 0.5757656 to the left,  improve=0.044638620, (0 missing)
##       garage_sqft          < 0.5225718 to the left,  improve=0.034419710, (0 missing)
##       has_central_cooling1 < 0.5       to the left,  improve=0.009487644, (0 missing)
##   Surrogate splits:
##       has_pool1   < 0.5       to the right, agree=1.000, adj=1.00, (0 split)
##       garage_sqft < 0.1826265 to the right, agree=0.803, adj=0.01, (0 split)
## 
## Node number 7: 684 observations,    complexity param=0.01071923
##   mean=0.6460024, MSE=0.0251164 
##   left son=14 (463 obs) right son=15 (221 obs)
##   Primary splits:
##       total_sqft < 0.7789614 to the left,  improve=0.07078494, (0 missing)
##       has_pool2  < 0.5       to the left,  improve=0.03501981, (0 missing)
##       has_pool1  < 0.5       to the right, improve=0.03501981, (0 missing)
##       stories1   < 0.5       to the left,  improve=0.02514110, (0 missing)
##       stories2   < 0.5       to the right, improve=0.02211745, (0 missing)
##   Surrogate splits:
##       full_bathrooms4 < 0.5       to the left,  agree=0.709, adj=0.100, (0 split)
##       garage_sqft     < 0.8071135 to the left,  agree=0.694, adj=0.054, (0 split)
##       num_bedrooms6   < 0.5       to the left,  agree=0.687, adj=0.032, (0 split)
##       full_bathrooms5 < 0.5       to the left,  agree=0.681, adj=0.014, (0 split)
## 
## Node number 8: 361 observations
##   mean=0.2651267, MSE=0.01315519 
## 
## Node number 9: 615 observations
##   mean=0.3501165, MSE=0.0132153 
## 
## Node number 12: 783 observations
##   mean=0.5079463, MSE=0.01777941 
## 
## Node number 13: 194 observations
##   mean=0.5952319, MSE=0.02294652 
## 
## Node number 14: 463 observations
##   mean=0.6168714, MSE=0.02321827 
## 
## Node number 15: 221 observations
##   mean=0.7070324, MSE=0.02359049

# Predict on the training rows (response column removed)
pred_DT_train <- predict(fit_DT, training[, names(training) != "sale_price"])


#rpart.plot::rpart.plot(fit_DT)

# Predict on the held-out testing rows (response column removed)
pred_DT_test <- predict(fit_DT, testing[, names(testing) != "sale_price"])


# RMSE, R-squared and MAE on the training data
print(postResample(pred = pred_DT_train, obs = training$sale_price))

##      RMSE  Rsquared       MAE 
## 0.1326211 0.4573714 0.1036356

    #   RMSE       Rsquared    MAE 
    #   0.1310250  0.4232061   0.1030601 

# RMSE, R-squared and MAE on the testing data
print(postResample(pred = pred_DT_test, obs = testing$sale_price))

##      RMSE  Rsquared       MAE 
## 0.1367510 0.4325352 0.1065761

## Linear regression 

#set.seed(125)

# Ordinary least squares fit of sale_price on all other columns
fit_LR <- lm(formula = sale_price ~ ., data = training)

# Coefficient table and goodness-of-fit statistics
summary(fit_LR)

## 
## Call:
## lm(formula = sale_price ~ ., data = training)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.63185 -0.08246 -0.00696  0.08371  0.60077 
## 
## Coefficients: (9 not defined because of singularities)
##                       Estimate Std. Error t value Pr(>|t|)    
## (Intercept)           0.358477   0.112551   3.185  0.00146 ** 
## stories1             -0.001875   0.035711  -0.053  0.95812    
## stories2             -0.025648   0.035690  -0.719  0.47241    
## stories3             -0.021127   0.046123  -0.458  0.64695    
## stories4                    NA         NA      NA       NA    
## num_bedrooms1         0.115640   0.080971   1.428  0.15333    
## num_bedrooms2         0.075720   0.074790   1.012  0.31140    
## num_bedrooms3         0.092329   0.074551   1.238  0.21563    
## num_bedrooms4         0.080023   0.074451   1.075  0.28252    
## num_bedrooms5         0.062485   0.074571   0.838  0.40213    
## num_bedrooms6         0.027197   0.077420   0.351  0.72539    
## num_bedrooms7               NA         NA      NA       NA    
## full_bathrooms0      -0.147983   0.094601  -1.564  0.11784    
## full_bathrooms1      -0.202890   0.075571  -2.685  0.00729 ** 
## full_bathrooms2      -0.141605   0.074707  -1.895  0.05811 .  
## full_bathrooms3      -0.110856   0.074294  -1.492  0.13576    
## full_bathrooms4      -0.088406   0.076422  -1.157  0.24743    
## full_bathrooms5             NA         NA      NA       NA    
## half_bathrooms0      -0.037954   0.006164  -6.158 8.22e-10 ***
## half_bathrooms1             NA         NA      NA       NA    
## total_sqft            0.531363   0.018570  28.614  < 2e-16 ***
## garage_type1          0.001582   0.009563   0.165  0.86860    
## garage_type2                NA         NA      NA       NA    
## garage_sqft           0.049067   0.016646   2.948  0.00322 ** 
## has_fireplace1       -0.023393   0.005086  -4.599 4.40e-06 ***
## has_fireplace2              NA         NA      NA       NA    
## has_pool1            -0.068618   0.005975 -11.484  < 2e-16 ***
## has_pool2                   NA         NA      NA       NA    
## has_central_heating1 -0.064806   0.017905  -3.619  0.00030 ***
## has_central_heating2        NA         NA      NA       NA    
## has_central_cooling1  0.084015   0.014710   5.711 1.21e-08 ***
## has_central_cooling2        NA         NA      NA       NA    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1277 on 3477 degrees of freedom
## Multiple R-squared:    0.5,  Adjusted R-squared:  0.4968 
## F-statistic:   158 on 22 and 3477 DF,  p-value: < 2.2e-16

#Tune LR model with significant attributes
#fit_LR = lm(sale_price ~ half_bathrooms0 + total_sqft + garage_sqft + has_fireplace1 + has_pool1 + has_central_heating1 + #has_central_cooling1, data = training)

#summary(fit_LR)

#fit_LR = lm(sale_price ~ total_sqft + garage_sqft + has_fireplace1 + has_pool1 + has_central_heating1 + has_central_cooling1, data = #training)

#summary(fit_LR)


# In-sample predictions of the linear model (response column removed)
pred_LR_train <- predict(fit_LR, training[, names(testing) != "sale_price"])

## Warning in predict.lm(fit_LR, training[, names(testing) != "sale_price"]):
## prediction from a rank-deficient fit may be misleading

# Hold-out predictions of the linear model (response column removed)
pred_LR_test <- predict(fit_LR, testing[, names(testing) != "sale_price"])

## Warning in predict.lm(fit_LR, testing[, names(testing) != "sale_price"]):
## prediction from a rank-deficient fit may be misleading

# RMSE, R-squared and MAE on the training data
print(postResample(pred = pred_LR_train, obs = training$sale_price))

##      RMSE  Rsquared       MAE 
## 0.1273099 0.4999640 0.1001539

# RMSE, R-squared and MAE on the testing data
print(postResample(pred = pred_LR_test, obs = testing$sale_price))

##      RMSE  Rsquared       MAE 
## 0.1296649 0.4897216 0.1006418

## Random Forest

# Fix the RNG so the forest is reproducible
set.seed(125)

# Random forest of sale_price on all other columns, package defaults
fit_RF <- randomForest(sale_price ~ ., data = training)

# Predict on the training rows (response column removed)
pred_RF_train <- predict(fit_RF, training[, names(training) != "sale_price"])

# Predict on the held-out testing rows (response column removed)
pred_RF_test <- predict(fit_RF, testing[, names(testing) != "sale_price"])

# RMSE, R-squared and MAE on the training data
print(postResample(pred = pred_RF_train, obs = training$sale_price))

##       RMSE   Rsquared        MAE 
## 0.07697968 0.83599538 0.06028191

# RMSE, R-squared and MAE on the testing data
print(postResample(pred = pred_RF_test, obs = testing$sale_price))

##       RMSE   Rsquared        MAE 
## 0.12621684 0.51661004 0.09614311

## Gradient Boosting (gbm)

# Fix the RNG so the boosted model is reproducible
set.seed(125)

# Gradient boosted regression of sale_price on all other columns:
# 500 trees with interaction depth 2. The distribution is stated
# explicitly (squared-error loss) instead of leaving gbm to guess it.
fit_XGB <- gbm(sale_price ~ ., data = training, distribution = "gaussian",
               n.trees = 500, interaction.depth = 2)

## Distribution not specified, assuming gaussian ...

# Relative influence of each predictor in the boosted model
summary(fit_XGB)

##                                       var     rel.inf
## total_sqft                     total_sqft 72.51642947
## garage_sqft                   garage_sqft 11.67893823
## has_pool2                       has_pool2  2.95749330
## num_bedrooms2               num_bedrooms2  1.51573004
## full_bathrooms1           full_bathrooms1  1.36716574
## stories1                         stories1  1.15302987
## has_pool1                       has_pool1  1.15121070
## has_fireplace2             has_fireplace2  1.03776150
## num_bedrooms6               num_bedrooms6  0.96464375
## has_central_cooling1 has_central_cooling1  0.64381098
## half_bathrooms0           half_bathrooms0  0.49125280
## full_bathrooms2           full_bathrooms2  0.45902342
## num_bedrooms3               num_bedrooms3  0.45736319
## garage_type2                 garage_type2  0.41879147
## full_bathrooms3           full_bathrooms3  0.39931495
## garage_type1                 garage_type1  0.38331197
## stories2                         stories2  0.37498045
## has_fireplace1             has_fireplace1  0.35101053
## num_bedrooms5               num_bedrooms5  0.29200133
## has_central_heating1 has_central_heating1  0.28846560
## full_bathrooms4           full_bathrooms4  0.27843334
## has_central_cooling2 has_central_cooling2  0.24889987
## half_bathrooms1           half_bathrooms1  0.22046680
## has_central_heating2 has_central_heating2  0.19988186
## num_bedrooms1               num_bedrooms1  0.09070008
## num_bedrooms4               num_bedrooms4  0.05988875
## stories3                         stories3  0.00000000
## stories4                         stories4  0.00000000
## num_bedrooms7               num_bedrooms7  0.00000000
## full_bathrooms0           full_bathrooms0  0.00000000
## full_bathrooms5           full_bathrooms5  0.00000000

# Predict on the training rows using all 500 trees
pred_XGB_train <- predict(
  fit_XGB,
  training[, names(training) != "sale_price"],
  n.trees = 500
)

# Predict on the held-out testing rows using all 500 trees
pred_XGB_test <- predict(
  fit_XGB,
  testing[, names(testing) != "sale_price"],
  n.trees = 500
)

# RMSE, R-squared and MAE on the training data
print(postResample(pred = pred_XGB_train, obs = training$sale_price))

##       RMSE   Rsquared        MAE 
## 0.11815175 0.57008116 0.09244493

# RMSE, R-squared and MAE on the testing data
print(postResample(pred = pred_XGB_test, obs = testing$sale_price))

##      RMSE  Rsquared       MAE 
## 0.1303687 0.4848113 0.1012367

## Dimensionality Reduction using PCA


#principal component analysis
# Fit the PCA on the predictor columns only. Passing the whole training frame
# would include sale_price, leaking the response into the components and
# requiring the true price when projecting new data.
prin_comp <- prcomp(training[, names(training) != "sale_price"])
summary(prin_comp)

## Importance of components:
##                           PC1    PC2    PC3    PC4     PC5     PC6     PC7
## Standard deviation     0.8517 0.7386 0.6619 0.6233 0.53045 0.50885 0.43017
## Proportion of Variance 0.2150 0.1617 0.1299 0.1152 0.08341 0.07676 0.05485
## Cumulative Proportion  0.2150 0.3767 0.5066 0.6218 0.70520 0.78196 0.83681
##                            PC8     PC9    PC10    PC11    PC12    PC13
## Standard deviation     0.41580 0.31951 0.29892 0.24655 0.17730 0.14314
## Proportion of Variance 0.05125 0.03026 0.02649 0.01802 0.00932 0.00607
## Cumulative Proportion  0.88806 0.91832 0.94481 0.96283 0.97215 0.97822
##                           PC14    PC15    PC16    PC17    PC18    PC19
## Standard deviation     0.12827 0.12573 0.09819 0.09596 0.09260 0.07595
## Proportion of Variance 0.00488 0.00469 0.00286 0.00273 0.00254 0.00171
## Cumulative Proportion  0.98310 0.98779 0.99064 0.99337 0.99592 0.99763
##                           PC20    PC21    PC22    PC23      PC24      PC25
## Standard deviation     0.06515 0.04294 0.03134 0.03060 8.692e-15 6.774e-15
## Proportion of Variance 0.00126 0.00055 0.00029 0.00028 0.000e+00 0.000e+00
## Cumulative Proportion  0.99888 0.99943 0.99972 1.00000 1.000e+00 1.000e+00
##                             PC26      PC27      PC28      PC29      PC30
## Standard deviation     5.153e-15 3.407e-15 5.741e-16 3.708e-16 3.637e-16
## Proportion of Variance 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00
## Cumulative Proportion  1.000e+00 1.000e+00 1.000e+00 1.000e+00 1.000e+00
##                             PC31      PC32
## Standard deviation     2.781e-16 1.513e-16
## Proportion of Variance 0.000e+00 0.000e+00
## Cumulative Proportion  1.000e+00 1.000e+00

# Standard deviation of each principal component, and its square (variance)
std_dev <- prin_comp$sdev
pr_var <- std_dev^2

# Share of the total variance carried by each component
prop_varex <- pr_var / sum(pr_var)

# Cumulative scree plot
plot(
  cumsum(prop_varex),
  xlab = "Principal Component",
  ylab = "Cumulative Proportion of Variance Explained",
  type = "b"
)

# Number of leading principal components kept as predictors; the cumulative
# variance plot shows the first 7 explain a little over 80% of the variance.
n_comp <- 7

#add a training set with principal components
# sale_price comes first, followed by PC1..PC7. The components are selected
# before sale_price is attached: slicing the combined frame with 1:7 would
# keep only six components, because sale_price occupies column 1.
train.data <- data.frame(
  sale_price = training$sale_price,
  prin_comp$x[, seq_len(n_comp), drop = FALSE]
)

#transform test into PCA
test.data <- as.data.frame(predict(prin_comp, newdata = testing))

#select the same first 7 components that the training frame uses
test.data <- test.data[, seq_len(n_comp), drop = FALSE]

## Decision tree for regression

# Fit a regression tree (method = "anova") of sale_price on the components
fit_DT <- rpart(
  formula = sale_price ~ .,
  data = train.data,
  method = "anova"
)
summary(fit_DT)

## Call:
## rpart(formula = sale_price ~ ., data = train.data, method = "anova")
##   n= 3500 
## 
##           CP nsplit rel error    xerror       xstd
## 1 0.19938976      0 1.0000000 1.0006180 0.02384784
## 2 0.04185881      1 0.8006102 0.8029023 0.02040718
## 3 0.03701896      3 0.7168926 0.7526404 0.01967466
## 4 0.02629120      4 0.6798737 0.7230709 0.01895893
## 5 0.01450642      5 0.6535825 0.6787105 0.01843399
## 6 0.01376076      6 0.6390760 0.6725201 0.01845176
## 7 0.01264261      7 0.6253153 0.6568818 0.01816089
## 8 0.01000000      9 0.6000301 0.6311420 0.01771516
## 
## Variable importance
## PC1 PC6 PC2 PC4 PC3 PC5 
##  38  21  17  13   8   3 
## 
## Node number 1: 3500 observations,    complexity param=0.1993898
##   mean=0.4679249, MSE=0.03241327 
##   left son=2 (1379 obs) right son=3 (2121 obs)
##   Primary splits:
##       PC1 < -0.3956497 to the left,  improve=0.19938980, (0 missing)
##       PC2 < -0.7045554 to the left,  improve=0.08676619, (0 missing)
##       PC6 < -0.8567894 to the right, improve=0.06508839, (0 missing)
##       PC4 < 0.3340385  to the left,  improve=0.04248052, (0 missing)
##       PC3 < 0.7199597  to the right, improve=0.03457135, (0 missing)
##   Surrogate splits:
##       PC2 < -0.680802  to the left,  agree=0.724, adj=0.299, (0 split)
##       PC6 < 0.3787574  to the right, agree=0.660, adj=0.138, (0 split)
##       PC3 < -0.5653524 to the left,  agree=0.639, adj=0.083, (0 split)
##       PC4 < -0.9244089 to the left,  agree=0.619, adj=0.032, (0 split)
## 
## Node number 2: 1379 observations,    complexity param=0.03701896
##   mean=0.3682236, MSE=0.02526514 
##   left son=4 (1166 obs) right son=5 (213 obs)
##   Primary splits:
##       PC6 < -0.3736896 to the right, improve=0.12053940, (0 missing)
##       PC1 < -1.070884  to the left,  improve=0.11684440, (0 missing)
##       PC3 < 0.6401534  to the right, improve=0.04935712, (0 missing)
##       PC2 < -0.8271217 to the left,  improve=0.04071553, (0 missing)
##       PC4 < 0.275964   to the left,  improve=0.03082658, (0 missing)
##   Surrogate splits:
##       PC3 < -0.8649294 to the right, agree=0.891, adj=0.291, (0 split)
##       PC4 < 1.114466   to the left,  agree=0.860, adj=0.094, (0 split)
## 
## Node number 3: 2121 observations,    complexity param=0.04185881
##   mean=0.5327473, MSE=0.02639593 
##   left son=6 (458 obs) right son=7 (1663 obs)
##   Primary splits:
##       PC6 < 0.3515094  to the right, improve=0.06893502, (0 missing)
##       PC4 < 0.2324192  to the left,  improve=0.06546717, (0 missing)
##       PC2 < -0.4165453 to the left,  improve=0.03682105, (0 missing)
##       PC5 < 0.5967187  to the left,  improve=0.03615933, (0 missing)
##       PC1 < 0.8507392  to the left,  improve=0.02774672, (0 missing)
##   Surrogate splits:
##       PC2 < -0.7444782 to the left,  agree=0.792, adj=0.037, (0 split)
## 
## Node number 4: 1166 observations,    complexity param=0.0262912
##   mean=0.3446369, MSE=0.02125769 
##   left son=8 (691 obs) right son=9 (475 obs)
##   Primary splits:
##       PC1 < -0.721545  to the left,  improve=0.12033350, (0 missing)
##       PC3 < 0.1877787  to the right, improve=0.04939080, (0 missing)
##       PC2 < 1.075145   to the left,  improve=0.04535920, (0 missing)
##       PC4 < 0.2749883  to the left,  improve=0.04030743, (0 missing)
##       PC5 < 0.4711739  to the left,  improve=0.03565906, (0 missing)
##   Surrogate splits:
##       PC2 < 0.7670407  to the left,  agree=0.854, adj=0.642, (0 split)
##       PC3 < -0.7038985 to the right, agree=0.791, adj=0.486, (0 split)
##       PC4 < -0.3971587 to the right, agree=0.712, adj=0.293, (0 split)
##       PC6 < 0.2197383  to the left,  agree=0.708, adj=0.284, (0 split)
##       PC5 < -0.5441839 to the right, agree=0.644, adj=0.126, (0 split)
## 
## Node number 5: 213 observations
##   mean=0.497341, MSE=0.02748589 
## 
## Node number 6: 458 observations
##   mean=0.4514639, MSE=0.01985569 
## 
## Node number 7: 1663 observations,    complexity param=0.04185881
##   mean=0.5551332, MSE=0.02587641 
##   left son=14 (530 obs) right son=15 (1133 obs)
##   Primary splits:
##       PC4 < -0.5265816 to the left,  improve=0.13101940, (0 missing)
##       PC1 < 0.9421486  to the left,  improve=0.07440345, (0 missing)
##       PC5 < 0.5957716  to the left,  improve=0.04313677, (0 missing)
##       PC6 < 0.2899075  to the left,  improve=0.04265583, (0 missing)
##       PC2 < -0.4145367 to the left,  improve=0.03811003, (0 missing)
##   Surrogate splits:
##       PC2 < -0.5660033 to the left,  agree=0.775, adj=0.294, (0 split)
##       PC5 < 0.4645153  to the right, agree=0.740, adj=0.183, (0 split)
##       PC3 < 0.7496727  to the right, agree=0.707, adj=0.081, (0 split)
##       PC6 < -1.55537   to the left,  agree=0.683, adj=0.006, (0 split)
## 
## Node number 8: 691 observations
##   mean=0.3027036, MSE=0.01879866 
## 
## Node number 9: 475 observations,    complexity param=0.01376076
##   mean=0.4056389, MSE=0.01855567 
##   left son=18 (381 obs) right son=19 (94 obs)
##   Primary splits:
##       PC4 < 0.2820162  to the left,  improve=0.17711820, (0 missing)
##       PC5 < 0.6763835  to the left,  improve=0.10337390, (0 missing)
##       PC6 < 0.2518676  to the right, improve=0.10007340, (0 missing)
##       PC2 < 1.075145   to the left,  improve=0.08818329, (0 missing)
##       PC3 < -0.6859192 to the left,  improve=0.08549251, (0 missing)
##   Surrogate splits:
##       PC5 < 0.473236   to the left,  agree=0.895, adj=0.468, (0 split)
##       PC2 < 0.9995193  to the left,  agree=0.846, adj=0.223, (0 split)
##       PC3 < 1.194617   to the left,  agree=0.823, adj=0.106, (0 split)
##       PC6 < 0.6186065  to the left,  agree=0.808, adj=0.032, (0 split)
## 
## Node number 14: 530 observations,    complexity param=0.01450642
##   mean=0.4700004, MSE=0.01550114 
##   left son=28 (139 obs) right son=29 (391 obs)
##   Primary splits:
##       PC6 < 0.260784   to the right, improve=0.20031420, (0 missing)
##       PC1 < 0.8796377  to the left,  improve=0.19173190, (0 missing)
##       PC4 < -0.6631524 to the left,  improve=0.10831870, (0 missing)
##       PC2 < -0.2848248 to the right, improve=0.04906147, (0 missing)
##       PC5 < -0.2042186 to the right, improve=0.03720161, (0 missing)
##   Surrogate splits:
##       PC3 < -0.5485734 to the left,  agree=0.796, adj=0.223, (0 split)
##       PC5 < 0.5733684  to the right, agree=0.745, adj=0.029, (0 split)
## 
## Node number 15: 1133 observations,    complexity param=0.01264261
##   mean=0.594957, MSE=0.02575356 
##   left son=30 (1082 obs) right son=31 (51 obs)
##   Primary splits:
##       PC6 < -0.987985  to the right, improve=0.04185659, (0 missing)
##       PC1 < -0.1246466 to the left,  improve=0.04132996, (0 missing)
##       PC2 < -0.7385828 to the left,  improve=0.04030826, (0 missing)
##       PC5 < 0.4921896  to the left,  improve=0.02653610, (0 missing)
##       PC4 < 0.443252   to the left,  improve=0.01779070, (0 missing)
##   Surrogate splits:
##       PC2 < 1.082758   to the left,  agree=0.960, adj=0.118, (0 split)
##       PC5 < -1.141208  to the right, agree=0.957, adj=0.039, (0 split)
## 
## Node number 18: 381 observations
##   mean=0.3771634, MSE=0.01314705 
## 
## Node number 19: 94 observations
##   mean=0.5210555, MSE=0.02387027 
## 
## Node number 28: 139 observations
##   mean=0.3765419, MSE=0.006215923 
## 
## Node number 29: 391 observations
##   mean=0.5032248, MSE=0.01459306 
## 
## Node number 30: 1082 observations,    complexity param=0.01264261
##   mean=0.5878289, MSE=0.02477831 
##   left son=60 (185 obs) right son=61 (897 obs)
##   Primary splits:
##       PC1 < -0.1246466 to the left,  improve=0.06143927, (0 missing)
##       PC6 < -0.9380561 to the left,  improve=0.04366694, (0 missing)
##       PC2 < -0.7385828 to the left,  improve=0.04039399, (0 missing)
##       PC3 < -1.04195   to the left,  improve=0.03039344, (0 missing)
##       PC5 < 0.2242114  to the left,  improve=0.02102874, (0 missing)
##   Surrogate splits:
##       PC2 < 0.9707428  to the right, agree=0.920, adj=0.530, (0 split)
##       PC4 < -0.4336536 to the left,  agree=0.861, adj=0.189, (0 split)
##       PC3 < -1.04195   to the left,  agree=0.854, adj=0.146, (0 split)
##       PC6 < -0.9590436 to the left,  agree=0.834, adj=0.027, (0 split)
## 
## Node number 31: 51 observations
##   mean=0.7461839, MSE=0.02249651 
## 
## Node number 60: 185 observations
##   mean=0.5019139, MSE=0.02263828 
## 
## Node number 61: 897 observations
##   mean=0.6055483, MSE=0.02338334

# Score the fitted tree model on the training set
pred_DT_train <- predict(fit_DT, newdata = train.data)

# Score the fitted tree model on the testing set
pred_DT_test <- predict(fit_DT, newdata = test.data)


# RMSE, R-squared and MAE on the training set
print(caret::postResample(pred = pred_DT_train, obs = training$sale_price))

##      RMSE  Rsquared       MAE 
## 0.1394594 0.3999699 0.1083676

# RMSE, R-squared and MAE on the testing set
print(caret::postResample(pred = pred_DT_test, obs = testing$sale_price))

##      RMSE  Rsquared       MAE 
## 0.1392642 0.4115576 0.1085944

## Linear regression

# Fit ordinary least squares of sale_price on every other column of train.data
fit_LR <- lm(formula = sale_price ~ ., data = train.data)

# Coefficient table, residual summary and R-squared of the fit
print(summary(fit_LR))

## 
## Call:
## lm(formula = sale_price ~ ., data = train.data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.66425 -0.10623 -0.01402  0.09119  0.64428 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  0.467925   0.002577 181.545  < 2e-16 ***
## PC1          0.096500   0.003027  31.882  < 2e-16 ***
## PC2          0.018497   0.003490   5.300 1.23e-07 ***
## PC3         -0.016207   0.003895  -4.162 3.24e-05 ***
## PC4          0.045123   0.004136  10.910  < 2e-16 ***
## PC5          0.011854   0.004860   2.439   0.0148 *  
## PC6         -0.071513   0.005066 -14.116  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1525 on 3493 degrees of freedom
## Multiple R-squared:  0.2841, Adjusted R-squared:  0.2829 
## F-statistic:   231 on 6 and 3493 DF,  p-value: < 2.2e-16

# Linear-model predictions for the training set
pred_LR_train <- predict(fit_LR, newdata = train.data)

# Linear-model predictions for the testing set
pred_LR_test <- predict(fit_LR, newdata = test.data)

# RMSE, R-squared and MAE on the training set
print(caret::postResample(pred = pred_LR_train, obs = training$sale_price))

##      RMSE  Rsquared       MAE 
## 0.1523318 0.2840902 0.1197623

# RMSE, R-squared and MAE on the testing set
print(caret::postResample(pred = pred_LR_test, obs = testing$sale_price))

##      RMSE  Rsquared       MAE 
## 0.1524032 0.2947531 0.1207978

## Random forest


# Fit a random forest regressing sale_price on every other column of
# train.data.
# The forest is grown from random resamples of the data, so the RNG is seeded
# first to make the fitted model (and the metrics reported for it) repeatable.
set.seed(125)
fit_RF <- randomForest(sale_price ~ ., data = train.data)



# Random-forest predictions for the training set
pred_RF_train <- predict(fit_RF, newdata = train.data)

# Random-forest predictions for the testing set
pred_RF_test <- predict(fit_RF, newdata = test.data)

# RMSE, R-squared and MAE on the training set
print(caret::postResample(pred = pred_RF_train, obs = training$sale_price))

##       RMSE   Rsquared        MAE 
## 0.03800913 0.96136663 0.02514029

# RMSE, R-squared and MAE on the testing set
print(caret::postResample(pred = pred_RF_test, obs = testing$sale_price))

##       RMSE   Rsquared        MAE 
## 0.08330477 0.79210076 0.05516627

## Gradient boosting (gbm)

# Fit a gradient boosting model on the training data.
# NOTE: despite the `fit_XGB` name this is gbm::gbm(), not the xgboost package;
# the name is kept because later statements refer to it.
# The distribution is stated explicitly: when it is omitted gbm guesses one and
# prints "Distribution not specified, assuming gaussian", so "gaussian" keeps
# the same loss while removing the guesswork.
# gbm subsamples rows for each tree by default, so the RNG is seeded to make
# the fit repeatable.
set.seed(125)
fit_XGB <- gbm(
  sale_price ~ .,
  data = train.data,
  distribution = "gaussian",
  n.trees = 500,
  interaction.depth = 2
)

## Distribution not specified, assuming gaussian ...

# Boosted-model predictions for the training set, using all 500 trees
pred_XGB_train <- predict(fit_XGB, newdata = train.data, n.trees = 500)

# Boosted-model predictions for the testing set, using all 500 trees
pred_XGB_test <- predict(fit_XGB, newdata = test.data, n.trees = 500)

# RMSE, R-squared and MAE on the training set
print(caret::postResample(pred = pred_XGB_train, obs = training$sale_price))

##       RMSE   Rsquared        MAE 
## 0.10125217 0.69408487 0.07612766

# RMSE, R-squared and MAE on the testing set
print(caret::postResample(pred = pred_XGB_test, obs = testing$sale_price))

##       RMSE   Rsquared        MAE 
## 0.11165618 0.62704084 0.08318098

## In this section I will group the cities into clusters based on their median sale_price, total_sqft and garage_sqft.

# Number of rows (homes) recorded for each city
cityhomes <- as.data.frame(table(inputdata$city))

# Keep only the columns used in the city-level analysis
citydata <- data.frame(
  sale_price = inputdata$sale_price,
  total_sqft = inputdata$total_sqft,
  garage_sqft = inputdata$garage_sqft,
  city = inputdata$city
)

str(citydata)

## 'data.frame':    30891 obs. of  4 variables:
##  $ sale_price : num  270897 302404 197193 207897 196559 ...
##  $ total_sqft : int  1859 2002 2277 1749 1672 2365 2254 2679 2000 2197 ...
##  $ garage_sqft: int  508 462 479 430 430 532 502 624 428 397 ...
##  $ city       : Factor w/ 46 levels "Amystad","Brownport",..: 13 13 20 20 20 27 20 20 20 20 ...

# Collapse to one row per city, taking the median of each numeric column
citydata <- aggregate(
  x = citydata[, c("sale_price", "total_sqft", "garage_sqft")],
  by = list(citydata$city),
  FUN = median
)

str(citydata)

## 'data.frame':    46 obs. of  4 variables:
##  $ Group.1    : Factor w/ 46 levels "Amystad","Brownport",..: 1 2 3 4 5 6 7 8 9 10 ...
##  $ sale_price : num  381148 422101 415802 213253 541800 ...
##  $ total_sqft : num  2196 2012 1778 1583 2242 ...
##  $ garage_sqft: num  579 405 458 456 487 ...

# Show the per-city medians
print(citydata)

##                 Group.1 sale_price total_sqft garage_sqft
## 1               Amystad   381148.0     2196.0       579.0
## 2             Brownport   422101.0     2012.0       405.0
## 3              Chadstad   415802.0     1778.0       458.0
## 4             Clarkberg   213252.5     1583.0       456.5
## 5              Coletown   541800.0     2242.0       487.0
## 6             Davidfort   400050.5     2202.0       477.0
## 7             Davidtown   443518.0     2926.0       575.0
## 8       East Amychester   327601.5     1933.5       476.5
## 9      East Janiceville   398793.5     2095.5       462.0
## 10          East Justin   343350.5     1944.5       559.5
## 11           East Lucas   346501.5     2137.0       486.0
## 12           Fosterberg   279716.0     2054.0       486.0
## 13             Hallfort   323823.0     1787.0       484.0
## 14         Jeffreyhaven   244439.0     1738.0       457.0
## 15         Jenniferberg   274683.0     1862.0       456.0
## 16           Joshuafurt   463054.0     2701.0       616.0
## 17            Julieberg    88203.0     1717.0       418.0
## 18           Justinport   566999.0     2291.0       507.0
## 19         Lake Carolyn   251054.5     1507.0       460.5
## 20   Lake Christinaport   196558.0     1730.5       462.0
## 21   Lake Dariusborough   321298.0     2112.0       451.0
## 22            Lake Jack   485101.0     1938.0       438.0
## 23        Lake Jennifer   351539.0     2001.0       574.0
## 24             Leahview   154980.0     1411.0       423.0
## 25           Lewishaven   388082.0     1970.0       483.0
## 26         Martinezfort   189000.0     1289.0       481.0
## 27           Morrisport   497070.0     2295.5       482.0
## 28          New Michele   302402.0     1876.0       418.0
## 29      North Erinville   451076.0     2360.0       518.0
## 30        Port Adamtown   407608.0     2055.0       538.5
## 31      Port Andrealand   485103.5     2234.0       496.5
## 32          Port Daniel   176396.0     1498.0       450.0
## 33 Port Jonathanborough   415796.0     2438.5       551.5
## 34          Richardport   579603.0     2852.0       592.0
## 35            Rickytown   627481.0     2906.0       723.0
## 36            Scottberg   277202.5     2080.0       536.0
## 37        South Anthony   359100.0     2189.0       501.0
## 38     South Stevenfurt   365396.0     2179.0       574.0
## 39            Toddshire   144267.5     1546.0       443.5
## 40            Wendybury   409615.0     2264.5       601.0
## 41             West Ann   409496.0     1976.0       479.0
## 42    West Brittanyview   315000.0     1717.0       579.0
## 43          West Gerald   175141.5     1614.5       448.5
## 44     West Gregoryview   573301.5     2691.0       589.0
## 45           West Lydia   275939.0     1598.0       456.0
## 46        West Terrence   355321.5     1927.5       506.0

# aggregate() labelled the grouping column "Group.1"; call it "city" instead
colnames(citydata)[1] <- "city"

# Five-number summary (plus mean) of the numeric columns
summary(citydata[, c("sale_price", "total_sqft", "garage_sqft")])

##    sale_price       total_sqft    garage_sqft   
##  Min.   : 88203   Min.   :1289   Min.   :405.0  
##  1st Qu.:276255   1st Qu.:1748   1st Qu.:456.6  
##  Median :357211   Median :2006   Median :483.5  
##  Mean   :356843   Mean   :2032   Mean   :502.2  
##  3rd Qu.:420526   3rd Qu.:2226   3rd Qu.:548.2  
##  Max.   :627481   Max.   :2926   Max.   :723.0

# Rescale each numeric column with normalize() so the three features are on a
# comparable scale. NOTE(review): normalize() is defined outside this section;
# presumably min-max scaling -- confirm.
citydata[c("sale_price", "total_sqft", "garage_sqft")] <- lapply(
  citydata[c("sale_price", "total_sqft", "garage_sqft")],
  normalize
)

summary(citydata)

##         city      sale_price       total_sqft      garage_sqft    
##  Amystad  : 1   Min.   :0.0000   Min.   :0.0000   Min.   :0.0000  
##  Brownport: 1   1st Qu.:0.3487   1st Qu.:0.2804   1st Qu.:0.1623  
##  Chadstad : 1   Median :0.4988   Median :0.4383   Median :0.2469  
##  Clarkberg: 1   Mean   :0.4981   Mean   :0.4537   Mean   :0.3056  
##  Coletown : 1   3rd Qu.:0.6162   3rd Qu.:0.5724   3rd Qu.:0.4505  
##  Davidfort: 1   Max.   :1.0000   Max.   :1.0000   Max.   :1.0000  
##  (Other)  :40

## K-means clustering

# Carry the city name as the row label so that only numeric columns remain
rownames(citydata) <- citydata$city
citydata$city <- NULL

print(citydata)

##                      sale_price total_sqft garage_sqft
## Amystad               0.5432170 0.55406231  0.54716981
## Brownport             0.6191575 0.44166158  0.00000000
## Chadstad              0.6074770 0.29871717  0.16666667
## Clarkberg             0.2318832 0.17959682  0.16194969
## Coletown              0.8411191 0.58216249  0.25786164
## Davidfort             0.5782685 0.55772755  0.22641509
## Davidtown             0.6588717 1.00000000  0.53459119
## East Amychester       0.4439241 0.39370800  0.22484277
## East Janiceville      0.5759376 0.49266952  0.17924528
## East Justin           0.4731280 0.40042761  0.48584906
## East Lucas            0.4789710 0.51802077  0.25471698
## Fosterberg            0.3551285 0.46731827  0.25471698
## Hallfort              0.4369175 0.30421503  0.24842767
## Jeffreyhaven          0.2897133 0.27428222  0.16352201
## Jenniferberg          0.3457957 0.35003054  0.16037736
## Joshuafurt            0.6950979 0.86255345  0.66352201
## Julieberg             0.0000000 0.26145388  0.04088050
## Justinport            0.8878463 0.61209530  0.32075472
## Lake Carolyn          0.3019806 0.13317043  0.17452830
## Lake Christinaport    0.2009261 0.26970067  0.17924528
## Lake Dariusborough    0.4322353 0.50274893  0.14465409
## Lake Jack             0.7359803 0.39645693  0.10377358
## Lake Jennifer         0.4883122 0.43494197  0.53144654
## Leahview              0.1238267 0.07452657  0.05660377
## Lewishaven            0.5560750 0.41600489  0.24528302
## Martinezfort          0.1869110 0.00000000  0.23899371
## Morrisport            0.7581748 0.61484423  0.24213836
## New Michele           0.3971959 0.35858277  0.04088050
## North Erinville       0.6728867 0.65424557  0.35534591
## Port Adamtown         0.5922826 0.46792914  0.41981132
## Port Andrealand       0.7359850 0.57727550  0.28773585
## Port Daniel           0.1635390 0.12767257  0.14150943
## Port Jonathanborough  0.6074659 0.70219914  0.46069182
## Richardport           0.9112183 0.95479536  0.58805031
## Rickytown             1.0000000 0.98778253  1.00000000
## Scottberg             0.3504677 0.48320098  0.41194969
## South Anthony         0.5023327 0.54978619  0.30188679
## South Stevenfurt      0.5140076 0.54367746  0.53144654
## Toddshire             0.1039621 0.15699450  0.12106918
## Wendybury             0.5960043 0.59590715  0.61635220
## West Ann              0.5957836 0.41967013  0.23270440
## West Brittanyview     0.4205567 0.26145388  0.54716981
## West Gerald           0.1612128 0.19883934  0.13679245
## West Gregoryview      0.8995333 0.85644472  0.57861635
## West Lydia            0.3481247 0.18875993  0.16037736
## West Terrence         0.4953262 0.39004276  0.31761006

# Two-cluster k-means on the scaled city features.
# kmeans() picks its starting centres at random (25 restarts here), so the RNG
# is seeded first to keep the cluster numbering and the plot repeatable.
set.seed(125)
k2 <- kmeans(citydata, centers = 2, nstart = 25)

# Uncomment to inspect the fitted object
# k2
# str(k2)

# Plot the cities coloured by their assigned cluster
fviz_cluster(k2, data = citydata)

# Fit k-means for k = 3, 4 and 5 with the same settings used for k2.
# Seeded because kmeans() draws its starting centres at random; without a seed
# the cluster numbering (and plot colours) can change between runs.
set.seed(125)
k3 <- kmeans(citydata, centers = 3, nstart = 25)
k4 <- kmeans(citydata, centers = 4, nstart = 25)
k5 <- kmeans(citydata, centers = 5, nstart = 25)

# One cluster plot per value of k, for side-by-side comparison
p1 <- fviz_cluster(k2, geom = "point", data = citydata) + ggtitle("k = 2")
p2 <- fviz_cluster(k3, geom = "point", data = citydata) + ggtitle("k = 3")
p3 <- fviz_cluster(k4, geom = "point", data = citydata) + ggtitle("k = 4")
p4 <- fviz_cluster(k5, geom = "point", data = citydata) + ggtitle("k = 5")

# Arrange the four plots in a 2 x 2 grid
grid.arrange(p1, p2, p3, p4, nrow = 2)

set.seed(125)

# Total within-cluster sum of squares of a k-means fit.
#   k      number of clusters to fit
#   data   numeric data to cluster; defaults to the global `citydata`
#   nstart number of random starts passed on to kmeans()
# Returns the `tot.withinss` component of the fit (a single number).
wss <- function(k, data = citydata, nstart = 10) {
  kmeans(data, centers = k, nstart = nstart)$tot.withinss
}

# Candidate numbers of clusters: k = 1 to k = 15
k.values <- 1:15

# Compute wss for each candidate k (1-15 clusters)
wss_values <- map_dbl(k.values, wss)

# Elbow plot: wss against the number of clusters
plot(k.values, wss_values,
     type = "b", pch = 19, frame.plot = FALSE,
     xlab = "Number of clusters K",
     ylab = "Total within-clusters sum of squares")

# Same within-cluster sum of squares diagnostic, drawn by factoextra
set.seed(125)

fviz_nbclust(x = citydata, FUNcluster = kmeans, method = "wss")

# Compute the final k-means clustering with k = 5
set.seed(125)
final <- kmeans(x = citydata, centers = 5, nstart = 25)
print(final)

## K-means clustering with 5 clusters of sizes 8, 6, 11, 5, 16
## 
## Cluster means:
##   sale_price total_sqft garage_sqft
## 1  0.4972470  0.4677001   0.5113994
## 2  0.7505796  0.6238037   0.3207547
## 3  0.1920072  0.1695452   0.1432247
## 4  0.8329442  0.9323152   0.6729560
## 5  0.5097817  0.4285851   0.1938876
## 
## Clustering vector:
##              Amystad            Brownport             Chadstad 
##                    1                    5                    5 
##            Clarkberg             Coletown            Davidfort 
##                    3                    2                    5 
##            Davidtown      East Amychester     East Janiceville 
##                    4                    5                    5 
##          East Justin           East Lucas           Fosterberg 
##                    1                    5                    5 
##             Hallfort         Jeffreyhaven         Jenniferberg 
##                    5                    3                    5 
##           Joshuafurt            Julieberg           Justinport 
##                    4                    3                    2 
##         Lake Carolyn   Lake Christinaport   Lake Dariusborough 
##                    3                    3                    5 
##            Lake Jack        Lake Jennifer             Leahview 
##                    5                    1                    3 
##           Lewishaven         Martinezfort           Morrisport 
##                    5                    3                    2 
##          New Michele      North Erinville        Port Adamtown 
##                    5                    2                    1 
##      Port Andrealand          Port Daniel Port Jonathanborough 
##                    2                    3                    2 
##          Richardport            Rickytown            Scottberg 
##                    4                    4                    1 
##        South Anthony     South Stevenfurt            Toddshire 
##                    5                    1                    3 
##            Wendybury             West Ann    West Brittanyview 
##                    1                    5                    1 
##          West Gerald     West Gregoryview           West Lydia 
##                    3                    4                    3 
##        West Terrence 
##                    5 
## 
## Within cluster sum of squares by cluster:
## [1] 0.16061998 0.09702199 0.20153819 0.24885822 0.38679605
##  (between_SS / total_SS =  83.9 %)
## 
## Available components:
## 
## [1] "cluster"      "centers"      "totss"        "withinss"    
## [5] "tot.withinss" "betweenss"    "size"         "iter"        
## [9] "ifault"

# Plot the final clustering
fviz_cluster(object = final, data = citydata)

# Mean of each scaled feature within each final cluster
citydata %>%
  mutate(Cluster = final$cluster) %>%
  group_by(Cluster) %>%
  summarise_all(mean)

## # A tibble: 5 x 4
##   Cluster sale_price total_sqft garage_sqft
##     <int>      <dbl>      <dbl>       <dbl>
## 1       1      0.497      0.468       0.511
## 2       2      0.751      0.624       0.321
## 3       3      0.192      0.170       0.143
## 4       4      0.833      0.932       0.673
## 5       5      0.510      0.429       0.194

End of Analysis

```

Exercising Data Mining for Real Estate Investment

Himangshu Pal

February 8, 2019

Step of project execution