#Project- House Price of Saratoga County, New York

#1. Descriptive Statistics #Read the Dataset.

# Load the house-price data; the first row of the CSV holds the column names.
HP <- read.csv("C:\\Users\\Sweta\\Documents\\Data\\house price.csv", header = TRUE)
# NOTE(review): attach() places a snapshot of HP's columns, as loaded here, on
# the search path. Bare column names used later in this script (for example
# range(Price)) read that snapshot and do not see later changes made to HP.
# It is kept because later statements rely on it; prefer HP$col or data = HP.
attach(HP)
# List the column names.
names(HP)
##  [1] "Price"         "Lot.Size"      "Waterfront"    "Age"          
##  [5] "Land.Value"    "New.Construct" "Central.Air"   "Fuel.Type"    
##  [9] "Heat.Type"     "Sewer.Type"    "Living.Area"   "Pct.College"  
## [13] "Bedrooms"      "Fireplaces"    "Bathrooms"     "Rooms"

# Drop every row that contains at least one missing value.

HP <- stats::na.omit(HP)

# Preview the first six rows of the data frame.

utils::head(HP, n = 6L)
##    Price Lot.Size Waterfront Age Land.Value New.Construct Central.Air Fuel.Type
## 1 132500     0.09          0  42      50000             0           0  Electric
## 2 181115     0.92          0   0      22300             0           0       Gas
## 3 109000     0.19          0 133       7300             0           0       Gas
## 4 155000     0.41          0  13      18700             0           0       Gas
## 5  86060     0.11          0   0      15000             1           1       Gas
## 6 120000     0.68          0  31      14000             0           0       Gas
##   Heat.Type Sewer.Type Living.Area Pct.College Bedrooms Fireplaces Bathrooms
## 1  Electric    Private         906          35        2          1       1.0
## 2 Hot Water    Private        1953          51        3          0       2.5
## 3 Hot Water     Public        1944          51        4          1       1.0
## 4   Hot Air    Private        1944          51        3          1       1.5
## 5   Hot Air     Public         840          51        2          0       1.0
## 6   Hot Air    Private        1152          22        4          1       1.0
##   Rooms
## 1     5
## 2     6
## 3     8
## 4     5
## 5     3
## 6     8

# Show the structure of the data frame: one line per column with its type and
# first few values.

utils::str(HP)
## 'data.frame':    1731 obs. of  16 variables:
##  $ Price        : int  132500 181115 109000 155000 86060 120000 153000 170000 90000 122900 ...
##  $ Lot.Size     : num  0.09 0.92 0.19 0.41 0.11 0.68 0.4 1.21 0.83 1.94 ...
##  $ Waterfront   : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Age          : int  42 0 133 13 0 31 33 23 36 4 ...
##  $ Land.Value   : int  50000 22300 7300 18700 15000 14000 23300 14600 22200 21200 ...
##  $ New.Construct: int  0 0 0 0 1 0 0 0 0 0 ...
##  $ Central.Air  : int  0 0 0 0 1 0 0 0 0 0 ...
##  $ Fuel.Type    : chr  "Electric" "Gas" "Gas" "Gas" ...
##  $ Heat.Type    : chr  "Electric" "Hot Water" "Hot Water" "Hot Air" ...
##  $ Sewer.Type   : chr  "Private" "Private" "Public" "Private" ...
##  $ Living.Area  : int  906 1953 1944 1944 840 1152 2752 1662 1632 1416 ...
##  $ Pct.College  : int  35 51 51 51 51 22 51 35 51 44 ...
##  $ Bedrooms     : int  2 3 4 3 2 4 4 4 3 3 ...
##  $ Fireplaces   : int  1 0 1 1 0 1 1 1 0 0 ...
##  $ Bathrooms    : num  1 2.5 1 1.5 1 1 1.5 1.5 1.5 1.5 ...
##  $ Rooms        : int  5 6 8 5 3 8 8 9 8 6 ...

# Treat Fireplaces, Bedrooms, Bathrooms and Rooms as categorical (factor)
# variables.

count_cols <- c("Fireplaces", "Bedrooms", "Bathrooms", "Rooms")
HP[count_cols] <- lapply(HP[count_cols], as.factor)

# Recode the 0/1 indicator columns Waterfront, Central.Air and New.Construct as
# factors labelled "No"/"Yes". The levels are spelled out so that 0 always maps
# to "No" and 1 to "Yes", even if a column happens to hold only one of the two
# values; any other value stops the script instead of being mislabelled.

flag_cols <- c("Waterfront", "Central.Air", "New.Construct")
stopifnot(all(unlist(HP[flag_cols]) %in% c(0, 1)))
HP[flag_cols] <- lapply(
  HP[flag_cols],
  factor,
  levels = c(0, 1),
  labels = c("No", "Yes")
)

# Strongly prefer fixed over scientific notation when printing numbers.
options(scipen = 999)

# Summary statistics are computed over the numeric columns of HP only.
# vapply() is used so each result has a fixed, checked shape.
numeric_cols <- Filter(is.numeric, HP)

#Mean

vapply(numeric_cols, FUN = mean, FUN.VALUE = numeric(1), na.rm = TRUE)
##          Price       Lot.Size            Age     Land.Value    Living.Area 
## 211710.3362218      0.5007972     28.1473137  34549.7515887   1753.6915078 
##    Pct.College 
##     55.5696129

#Median

vapply(numeric_cols, FUN = median, FUN.VALUE = numeric(1), na.rm = TRUE)
##       Price    Lot.Size         Age  Land.Value Living.Area Pct.College 
##   189900.00        0.37       19.00    25000.00     1632.00       57.00

#five number summary (minimum, lower hinge, median, upper hinge, maximum)

vapply(numeric_cols, FUN = fivenum, FUN.VALUE = numeric(5), na.rm = TRUE)
##       Price Lot.Size Age Land.Value Living.Area Pct.College
## [1,]   5000     0.00   0        200       616.0          20
## [2,] 145000     0.17  13      15100      1300.0          52
## [3,] 189900     0.37  19      25000      1632.0          57
## [4,] 258193     0.54  34      40200      2135.5          64
## [5,] 775000    12.20 225     412600      5228.0          82

# Range (minimum and maximum) of house prices in Saratoga, read from the
# cleaned HP data frame rather than from the attach()ed copy of the raw file.

range(HP$Price)
## [1]   5000 775000

#Histogram of continuous variables

library(ggplot2)

# Draw a count histogram of one column of `data`.
#   data     - data frame holding the column
#   column   - unquoted column name to bin
#   binwidth - bin width, in the units of that column
#   title    - plot title (centred)
#   x_label  - x-axis label
# Returns the ggplot object; it is drawn when printed at top level.
# after_stat(count) replaces the deprecated ..count.. notation.
plot_histogram <- function(data, column, binwidth, title, x_label) {
  ggplot(data, aes(x = {{ column }}, fill = after_stat(count))) +
    geom_histogram(binwidth = binwidth) +
    ggtitle(title) +
    ylab("Frequency") +
    xlab(x_label) +
    theme(plot.title = element_text(hjust = 0.5))
}

#Histogram for House Price

plot_histogram(HP, Price, 20000, "Histogram of House Price", "Housing Price")

#Histogram for Lot.Size

plot_histogram(HP, Lot.Size, 0.5, "Histogram of Lot Size", "Lot Size")

#Histogram for Age

plot_histogram(HP, Age, 10, "Histogram of Age", "Age")

#Histogram for Land value

plot_histogram(HP, Land.Value, 20000, "Histogram of Land Value", "Land Value")

#Histogram of percent of neighborhood that graduated college

plot_histogram(
  HP, Pct.College, 5,
  "Histogram of percent of neighborhood that graduated college",
  "Percent of neighborhood that graduated college"
)

#Histogram of Living Area

plot_histogram(HP, Living.Area, 200, "Histogram of Living Area", "Living Area")

#Bar chart of Factor variables

# Draw a bar chart of the category counts of one column of `data`, coloured by
# category, with each count printed just above its bar.
#   data   - data frame holding the column
#   column - unquoted column name to count
#   title  - plot title (centred)
# Returns the ggplot object; it is drawn when printed at top level.
# after_stat(count) replaces the deprecated ..count.. notation.
plot_bar <- function(data, column, title) {
  ggplot(data, aes(x = {{ column }}, fill = {{ column }})) +
    geom_bar() +
    ggtitle(title) +
    theme(plot.title = element_text(hjust = 0.5)) +
    geom_text(stat = "count", aes(label = after_stat(count)), vjust = -0.25)
}

#Distribution of waterfront

plot_bar(HP, Waterfront, "Distribution of Waterfront")

#Distribution of New Construction

plot_bar(HP, New.Construct, "Distribution of New Construction")

#Distribution of Central Air

plot_bar(HP, Central.Air, "Distribution of Central Air")

#Distribution of different Fuel type

plot_bar(HP, Fuel.Type, "Distribution of different Fuel type")

#Distribution of different Heat type

plot_bar(HP, Heat.Type, "Distribution of different Heat type")

#Distribution of different Sewer type

plot_bar(HP, Sewer.Type, "Distribution of different Sewer type")

#Distribution of Bedrooms

plot_bar(HP, Bedrooms, "Distribution of Bedrooms")

#Distribution of Fireplaces

plot_bar(HP, Fireplaces, "Distribution of fireplaces")

#Distribution of Bathrooms

plot_bar(HP, Bathrooms, "Distribution of Bathrooms")

#Distribution of Rooms

plot_bar(HP, Rooms, "Distribution of rooms")

#Correlation Analysis

library(ggcorrplot)
## Warning: package 'ggcorrplot' was built under R version 4.0.2
library(car)
## Warning: package 'car' was built under R version 4.0.2
## Loading required package: carData
# Build the correlation input from the cleaned HP data frame instead of the
# attach()ed raw columns. Bedrooms, Fireplaces, Bathrooms and Rooms are factors
# in HP, so they are converted back to numbers through their labels
# (as.numeric() applied directly to a factor would return level codes).
corr_vars <- c(
  "Price", "Lot.Size", "Age", "Land.Value", "Living.Area", "Pct.College",
  "Bedrooms", "Fireplaces", "Bathrooms", "Rooms"
)
HPCorr <- as.data.frame(lapply(HP[corr_vars], function(col) {
  if (is.factor(col)) as.numeric(as.character(col)) else col
}))
# Pairwise correlation matrix, printed to two decimals.
corr <- cor(HPCorr)
round(corr, 2)
##             Price Lot.Size   Age Land.Value Living.Area Pct.College Bedrooms
## Price        1.00     0.16 -0.19       0.58        0.71        0.20     0.40
## Lot.Size     0.16     1.00 -0.01       0.06        0.16       -0.03     0.11
## Age         -0.19    -0.01  1.00      -0.02       -0.18       -0.04     0.02
## Land.Value   0.58     0.06 -0.02       1.00        0.42        0.23     0.20
## Living.Area  0.71     0.16 -0.18       0.42        1.00        0.21     0.66
## Pct.College  0.20    -0.03 -0.04       0.23        0.21        1.00     0.16
## Bedrooms     0.40     0.11  0.02       0.20        0.66        0.16     1.00
## Fireplaces   0.38     0.08 -0.18       0.21        0.48        0.25     0.29
## Bathrooms    0.60     0.08 -0.36       0.30        0.72        0.18     0.46
## Rooms        0.53     0.14 -0.09       0.30        0.73        0.16     0.67
##             Fireplaces Bathrooms Rooms
## Price             0.38      0.60  0.53
## Lot.Size          0.08      0.08  0.14
## Age              -0.18     -0.36 -0.09
## Land.Value        0.21      0.30  0.30
## Living.Area       0.48      0.72  0.73
## Pct.College       0.25      0.18  0.16
## Bedrooms          0.29      0.46  0.67
## Fireplaces        1.00      0.44  0.32
## Bathrooms         0.44      1.00  0.52
## Rooms             0.32      0.52  1.00
# Heat map of the correlation matrix with the coefficients printed in the cells.
ggcorrplot(corr, lab = TRUE)

# A few predictors show a strong relationship with house price: Living Area
# (0.71), Bathrooms (0.60), Land Value (0.58) and Rooms (0.53) are the most
# highly correlated with the response variable Price. At the same time, Rooms,
# Bedrooms, Bathrooms and Living Area are correlated with each other.

#Variance Inflation Factor

# The count variables are factors in HP. as.numeric() on a factor returns the
# level codes rather than the values, so a numeric copy is built through the
# factor labels and the model is fitted on that copy.
vif_data <- HP
count_vars <- c("Bedrooms", "Fireplaces", "Bathrooms", "Rooms")
vif_data[count_vars] <- lapply(
  vif_data[count_vars],
  function(col) as.numeric(as.character(col))
)
model <- lm(
  Price ~ Lot.Size + Age + Land.Value + Living.Area + Pct.College +
    Bedrooms + Fireplaces + Bathrooms + Rooms,
  data = vif_data
)
car::vif(model)
# Term names below follow the formula above. The values are those recorded
# for the level-code version of the model; they are expected to be unchanged
# because each count's levels are evenly spaced (see the factor-model
# summaries), which makes the level codes a linear recoding of the values.
##    Lot.Size         Age  Land.Value Living.Area Pct.College    Bedrooms 
##    1.035819    1.222638    1.276047    4.108891    1.114403    2.147644 
##  Fireplaces   Bathrooms       Rooms 
##    1.370883    2.402161    2.525071

#Scatter plot of Price vs Lot Size & Analysis of Price ~ Lot.Size

# Scatter of Price against Lot.Size with a fitted least-squares line. The
# smoothing formula is stated explicitly so ggplot2 does not print a message
# about the formula it chose.
ggplot(HP, aes(x = Lot.Size, y = Price)) +
  geom_point(color = "blue", size = 2) +
  geom_smooth(method = lm, formula = y ~ x, color = "#2C3E50")

# Simple linear regression of Price on Lot.Size.
model1 <- lm(Price ~ Lot.Size, data = HP)
summary(model1)
## 
## Call:
## lm(formula = Price ~ Lot.Size, data = HP)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -313017  -64785  -22936   44801  574357 
## 
## Coefficients:
##             Estimate Std. Error t value             Pr(>|t|)    
## (Intercept)   200643       2878  69.706 < 0.0000000000000002 ***
## Lot.Size       22100       3349   6.599       0.000000000055 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 97330 on 1729 degrees of freedom
## Multiple R-squared:  0.02456,    Adjusted R-squared:  0.024 
## F-statistic: 43.54 on 1 and 1729 DF,  p-value: 0.000000000055
# Residuals-vs-fitted diagnostic plot for model1.
plot(model1, which = 1)

#Scatter plot of Price vs Age & Analysis of Price ~ Age

# Scatter of Price against Age with a fitted least-squares line. The smoothing
# formula is stated explicitly so ggplot2 does not print a message about it.
ggplot(HP, aes(x = Age, y = Price)) +
  geom_point(color = "blue", size = 2) +
  geom_smooth(method = lm, formula = y ~ x, color = "#2C3E50")

# Simple linear regression of Price on Age.
model2 <- lm(Price ~ Age, data = HP)
summary(model2)
## 
## Call:
## lm(formula = Price ~ Age, data = HP)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -222321  -66295  -22169   43109  565134 
## 
## Coefficients:
##              Estimate Std. Error t value             Pr(>|t|)    
## (Intercept) 229906.65    3200.76  71.829 < 0.0000000000000002 ***
## Age           -646.47      78.22  -8.264 0.000000000000000277 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 96650 on 1729 degrees of freedom
## Multiple R-squared:  0.038,  Adjusted R-squared:  0.03744 
## F-statistic:  68.3 on 1 and 1729 DF,  p-value: 0.0000000000000002765
# Residuals-vs-fitted diagnostic plot for model2.
plot(model2, which = 1)

#Scatter plot of Price vs Land Value & Analysis of Price ~ Land Value

# Scatter of Price against Land.Value with a fitted least-squares line. The
# smoothing formula is stated explicitly so ggplot2 does not print a message
# about it.
ggplot(HP, aes(x = Land.Value, y = Price)) +
  geom_point(color = "blue", size = 2) +
  geom_smooth(method = lm, formula = y ~ x, color = "#2C3E50")

# Simple linear regression of Price on Land.Value.
model3 <- lm(Price ~ Land.Value, data = HP)
summary(model3)
## 
## Call:
## lm(formula = Price ~ Land.Value, data = HP)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -267746  -49152  -14017   36236  501092 
## 
## Coefficients:
##                 Estimate   Std. Error t value            Pr(>|t|)    
## (Intercept) 155234.41155   2709.83679   57.29 <0.0000000000000002 ***
## Land.Value       1.63463      0.05511   29.66 <0.0000000000000002 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 80220 on 1729 degrees of freedom
## Multiple R-squared:  0.3373, Adjusted R-squared:  0.3369 
## F-statistic: 879.8 on 1 and 1729 DF,  p-value: < 0.00000000000000022
# Residuals-vs-fitted diagnostic plot for model3.
plot(model3, which = 1)

#Scatter plot of Price vs Living Area & Analysis of Price ~ Living Area

# Scatter of Price against Living.Area with a fitted least-squares line. The
# smoothing formula is stated explicitly so ggplot2 does not print a message
# about it.
ggplot(HP, aes(x = Living.Area, y = Price)) +
  geom_point(color = "brown", size = 2) +
  geom_smooth(method = lm, formula = y ~ x, color = "#2C3E50")

# Simple linear regression of Price on Living.Area.
model4 <- lm(Price ~ Living.Area, data = HP)
summary(model4)
## 
## Call:
## lm(formula = Price ~ Living.Area, data = HP)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -277098  -39352   -7638   28354  553580 
## 
## Coefficients:
##             Estimate Std. Error t value             Pr(>|t|)    
## (Intercept) 13069.90    4984.18   2.622              0.00881 ** 
## Living.Area   113.27       2.68  42.271 < 0.0000000000000002 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 69110 on 1729 degrees of freedom
## Multiple R-squared:  0.5082, Adjusted R-squared:  0.5079 
## F-statistic:  1787 on 1 and 1729 DF,  p-value: < 0.00000000000000022
# Residuals-vs-fitted diagnostic plot for model4.
plot(model4, which = 1)

#Scatter plot of Price vs Pct.College & Analysis of Price ~ Pct.College

# Scatter of Price against Pct.College with a fitted least-squares line. The
# smoothing formula is stated explicitly so ggplot2 does not print a message
# about it.
ggplot(HP, aes(x = Pct.College, y = Price)) +
  geom_point(color = "brown", size = 2) +
  geom_smooth(method = lm, formula = y ~ x, color = "#2C3E50")

# Simple linear regression of Price on Pct.College.
model5 <- lm(Price ~ Pct.College, data = HP)
summary(model5)
## 
## Call:
## lm(formula = Price ~ Pct.College, data = HP)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -220827  -64665  -22231   42120  560572 
## 
## Coefficients:
##             Estimate Std. Error t value            Pr(>|t|)    
## (Intercept) 106139.0    12710.6   8.350 <0.0000000000000002 ***
## Pct.College   1899.8      224.9   8.448 <0.0000000000000002 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 96570 on 1729 degrees of freedom
## Multiple R-squared:  0.03964,    Adjusted R-squared:  0.03908 
## F-statistic: 71.37 on 1 and 1729 DF,  p-value: < 0.00000000000000022
# Residuals-vs-fitted diagnostic plot for model5.
plot(model5, which = 1)

# Box plot of house price by Bedrooms & Analysis of Price ~ Bedrooms

# One box per Bedrooms level; the red dot marks the group mean. `fun` replaces
# the deprecated `fun.y` argument of stat_summary().
ggplot(HP, aes(x = Bedrooms, y = Price, fill = Bedrooms)) +
  geom_boxplot(alpha = 0.3) +
  stat_summary(
    fun = mean, geom = "point", shape = 20, size = 4,
    color = "red", fill = "red"
  ) +
  ggtitle("Boxplot of House Price by Bedrooms") +
  theme(plot.title = element_text(hjust = 0.5))

# Regression of Price on Bedrooms, which is a factor in HP, so each level gets
# its own coefficient relative to the first level.
model6 <- lm(Price ~ Bedrooms, data = HP)
summary(model6)
## 
## Call:
## lm(formula = Price ~ Bedrooms, data = HP)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -200551  -55407  -16007   32697  574593 
## 
## Coefficients:
##             Estimate Std. Error t value     Pr(>|t|)    
## (Intercept)   192771      33789   5.705 0.0000000136 ***
## Bedrooms2     -40468      34125  -1.186       0.2358    
## Bedrooms3       7635      33932   0.225       0.8220    
## Bedrooms4      72779      34031   2.139       0.0326 *  
## Bedrooms5      83806      35951   2.331       0.0199 *  
## Bedrooms6      84557      46267   1.828       0.0678 .  
## Bedrooms7      33895      61689   0.549       0.5828    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 89400 on 1724 degrees of freedom
## Multiple R-squared:  0.1794, Adjusted R-squared:  0.1766 
## F-statistic: 62.83 on 6 and 1724 DF,  p-value: < 0.00000000000000022
# Residuals-vs-fitted diagnostic plot for model6.
plot(model6, which = 1)

# Box plot of house price by Fireplaces & Analysis of Price ~ Fireplaces

# One box per Fireplaces level; the red dot marks the group mean. `fun`
# replaces the deprecated `fun.y` argument of stat_summary().
ggplot(HP, aes(x = Fireplaces, y = Price, fill = Fireplaces)) +
  geom_boxplot(alpha = 0.3) +
  stat_summary(
    fun = mean, geom = "point", shape = 20, size = 4,
    color = "red", fill = "red"
  ) +
  ggtitle("Boxplot of House Price by Fireplaces") +
  theme(plot.title = element_text(hjust = 0.5))

# Regression of Price on Fireplaces, which is a factor in HP, so each level
# gets its own coefficient relative to the first level.
model7 <- lm(Price ~ Fireplaces, data = HP)
summary(model7)
## 
## Call:
## lm(formula = Price ~ Fireplaces, data = HP)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -230205  -57705  -18305   42753  585764 
## 
## Coefficients:
##             Estimate Std. Error t value             Pr(>|t|)    
## (Intercept)   174236       3328  52.354 < 0.0000000000000002 ***
## Fireplaces1    60969       4453  13.690 < 0.0000000000000002 ***
## Fireplaces2   144586      14397  10.043 < 0.0000000000000002 ***
## Fireplaces3   186264      64275   2.898               0.0038 ** 
## Fireplaces4   525764      64275   8.180 0.000000000000000545 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 90780 on 1726 degrees of freedom
## Multiple R-squared:  0.1529, Adjusted R-squared:  0.1509 
## F-statistic: 77.88 on 4 and 1726 DF,  p-value: < 0.00000000000000022
# Residuals-vs-fitted diagnostic plot for model7.
plot(model7, which = 1)

# Box plot of house price by Bathrooms & Analysis of Price ~ Bathrooms

# One box per Bathrooms level; the red dot marks the group mean. `fun`
# replaces the deprecated `fun.y` argument of stat_summary().
ggplot(HP, aes(x = Bathrooms, y = Price, fill = Bathrooms)) +
  geom_boxplot(alpha = 0.3) +
  stat_summary(
    fun = mean, geom = "point", shape = 20, size = 4,
    color = "red", fill = "red"
  ) +
  ggtitle("Boxplot of House Price by Bathrooms") +
  theme(plot.title = element_text(hjust = 0.5))

# Regression of Price on Bathrooms, which is a factor in HP, so each level
# gets its own coefficient relative to the first level.
model8 <- lm(Price ~ Bathrooms, data = HP)
summary(model8)
## 
## Call:
## lm(formula = Price ~ Bathrooms, data = HP)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -263345  -46979   -9345   31655  506655 
## 
## Coefficients:
##              Estimate Std. Error t value             Pr(>|t|)    
## (Intercept)    147751       4316  34.231 < 0.0000000000000002 ***
## Bathrooms1.5    20278       5558   3.649             0.000272 ***
## Bathrooms2      52861       6507   8.124 0.000000000000000849 ***
## Bathrooms2.5   120594       5452  22.118 < 0.0000000000000002 ***
## Bathrooms3     173228      11288  15.346 < 0.0000000000000002 ***
## Bathrooms3.5   235362      13706  17.172 < 0.0000000000000002 ***
## Bathrooms4     300699      27931  10.766 < 0.0000000000000002 ***
## Bathrooms4.5   264749      78172   3.387             0.000723 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 78050 on 1723 degrees of freedom
## Multiple R-squared:  0.3748, Adjusted R-squared:  0.3723 
## F-statistic: 147.6 on 7 and 1723 DF,  p-value: < 0.00000000000000022
# Residuals-vs-fitted diagnostic plot for model8.
plot(model8, which = 1)

#Box plot of house price by Rooms & Analysis of Price ~ Rooms

# One box per Rooms level; the red dot marks the group mean. `fun` replaces
# the deprecated `fun.y` argument of stat_summary().
ggplot(HP, aes(x = Rooms, y = Price, fill = Rooms)) +
  geom_boxplot(alpha = 0.3) +
  stat_summary(
    fun = mean, geom = "point", shape = 20, size = 4,
    color = "red", fill = "red"
  ) +
  ggtitle("Boxplot of House Price by Rooms") +
  theme(plot.title = element_text(hjust = 0.5))

# Regression of Price on Rooms, which is a factor in HP, so each level gets
# its own coefficient relative to the first level.
model9 <- lm(Price ~ Rooms, data = HP)
summary(model9)
## 
## Call:
## lm(formula = Price ~ Rooms, data = HP)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -294719  -50850  -11450   36525  529721 
## 
## Coefficients:
##             Estimate Std. Error t value   Pr(>|t|)    
## (Intercept)    94500      57878   1.633   0.102708    
## Rooms3         39656      58597   0.677   0.498651    
## Rooms4         73745      58194   1.267   0.205242    
## Rooms5         71950      58136   1.238   0.216032    
## Rooms6         90814      58112   1.563   0.118300    
## Rooms7         97329      58068   1.676   0.093895 .  
## Rooms8        126097      58102   2.170   0.030124 *  
## Rooms9        150779      58285   2.587   0.009765 ** 
## Rooms10       194067      58287   3.329   0.000888 ***
## Rooms11       211414      58677   3.603   0.000323 ***
## Rooms12       278719      58625   4.754 0.00000216 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 81850 on 1720 degrees of freedom
## Multiple R-squared:  0.3137, Adjusted R-squared:  0.3097 
## F-statistic: 78.61 on 10 and 1720 DF,  p-value: < 0.00000000000000022
# Residuals-vs-fitted diagnostic plot for model9.
plot(model9, which = 1)

#Boxplot of House Price by Waterfront and t-test for waterfront

# One box per Waterfront level; the red dot marks the group mean. `fun`
# replaces the deprecated `fun.y` argument of stat_summary().
ggplot(HP, aes(x = Waterfront, y = Price, fill = Waterfront)) +
  geom_boxplot(alpha = 0.3) +
  stat_summary(
    fun = mean, geom = "point", shape = 20, size = 4,
    color = "red", fill = "red"
  ) +
  ggtitle("Boxplot of House Price by Waterfront") +
  theme(plot.title = element_text(hjust = 0.5))

# Hypothesis test (alpha = 0.05)
# Null hypothesis: there is no significant difference between the mean price
#   of homes with a waterfront and homes without one.
# Alternative hypothesis: there is a significant difference between the mean
#   price of homes with a waterfront and homes without one.

# Group means and Welch two-sample t-test, both read from the cleaned HP data
# frame (where Waterfront is the "No"/"Yes" factor) rather than from the
# attach()ed copy of the raw columns.
tapply(HP$Price, HP$Waterfront, mean)
##       No      Yes 
## 210291.8 373991.7
t.test(Price ~ Waterfront, data = HP)
## 
##  Welch Two Sample t-test
## 
## data:  Price by Waterfront
## t = -4.0825, df = 14.095, p-value = 0.001105
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -249647.61  -77752.13
## sample estimates:
##  mean in group No mean in group Yes 
##          210291.8          373991.7

#Boxplot of House Price by New Construction and t-test for new.construct

# One box per New.Construct level; the red dot marks the group mean. `fun`
# replaces the deprecated `fun.y` argument of stat_summary().
ggplot(HP, aes(x = New.Construct, y = Price, fill = New.Construct)) +
  geom_boxplot(alpha = 0.3) +
  stat_summary(
    fun = mean, geom = "point", shape = 20, size = 4,
    color = "red", fill = "red"
  ) +
  ggtitle("Boxplot of House Price by New Construction")+
  theme(plot.title = element_text(hjust = 0.5))

# Hypothesis test (alpha = 0.05)
# Null hypothesis: there is no significant difference between the mean price
#   of newly constructed homes and old homes.
# Alternative hypothesis: there is a significant difference between the mean
#   price of newly constructed homes and old homes.

# Group means and Welch two-sample t-test, both read from the cleaned HP data
# frame (where New.Construct is the "No"/"Yes" factor) rather than from the
# attach()ed copy of the raw columns.
tapply(HP$Price, HP$New.Construct, mean)
##       No      Yes 
## 208244.7 282306.8
t.test(Price ~ New.Construct, data = HP)
## 
##  Welch Two Sample t-test
## 
## data:  Price by New.Construct
## t = -7.6912, df = 91.03, p-value = 0.00000000001659
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -93189.84 -54934.41
## sample estimates:
##  mean in group No mean in group Yes 
##          208244.7          282306.8

#Boxplot of House Price by Central Air and t-test for Central.Air

# One box per Central.Air level; the red dot marks the group mean. `fun`
# replaces the deprecated `fun.y` argument of stat_summary().
ggplot(HP, aes(x = Central.Air, y = Price, fill = Central.Air)) +
  geom_boxplot(alpha = 0.3) +
  stat_summary(
    fun = mean, geom = "point", shape = 20, size = 4,
    color = "red", fill = "red"
  ) +
  ggtitle("Boxplot of House Price by Central Air") +
  theme(plot.title = element_text(hjust = 0.5))

# Hypothesis test (alpha = 0.05)
# Null hypothesis: there is no significant difference between the mean price
#   of homes with central air and homes without it.
# Alternative hypothesis: there is a significant difference between the mean
#   price of homes with central air and homes without it.

# Group means and Welch two-sample t-test, both read from the cleaned HP data
# frame (where Central.Air is the "No"/"Yes" factor) rather than from the
# attach()ed copy of the raw columns.
tapply(HP$Price, HP$Central.Air, mean)
##       No      Yes 
## 186684.9 254903.8
t.test(Price ~ Central.Air, data = HP)
## 
##  Welch Two Sample t-test
## 
## data:  Price by Central.Air
## t = -13.387, df = 987.09, p-value < 0.00000000000000022
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -78219.22 -58218.46
## sample estimates:
##  mean in group No mean in group Yes 
##          186684.9          254903.8

#ANOVA test based on fuel type

# Mean price per fuel type, read from the cleaned HP data frame rather than
# from the attach()ed copy of the raw columns.
tapply(HP$Price, HP$Fuel.Type, mean)
##  Electric       Gas       Oil   Unknown 
## 164937.57 228562.38 188734.40  97006.25
# One-way ANOVA of Price by fuel type and the equivalent linear model. The aov
# fit is stored as aov_fuel so that the name of the stats::anova() function is
# not reused for a data object.
aov_fuel <- aov(Price ~ Fuel.Type, data = HP)
modelaov1 <- lm(Price ~ Fuel.Type, data = HP)
summary(aov_fuel)
##               Df         Sum Sq      Mean Sq F value              Pr(>F)    
## Fuel.Type      3  1195429652300 398476550767   44.13 <0.0000000000000002 ***
## Residuals   1727 15594921381251   9030064494                                
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Coefficients: the intercept is the mean of the first fuel type; the others
# are differences from it.
coef(modelaov1)
##      (Intercept)     Fuel.TypeGas     Fuel.TypeOil Fuel.TypeUnknown 
##        164937.57         63624.81         23796.83        -67931.32

#ANOVA test based on heat type

# Mean price per heat type, read from the cleaned HP data frame rather than
# from the attach()ed copy of the raw columns.
tapply(HP$Price, HP$Heat.Type, mean)
##  Electric   Hot Air Hot Water   Unknown 
## 161888.63 226382.62 209132.46  97006.25
# One-way ANOVA of Price by heat type and the equivalent linear model. The aov
# fit is stored as aov_heat so that the name of the stats::anova() function is
# not reused for a data object.
aov_heat <- aov(Price ~ Heat.Type, data = HP)
modelaov2 <- lm(Price ~ Heat.Type, data = HP)
summary(aov_heat)
##               Df         Sum Sq      Mean Sq F value              Pr(>F)    
## Heat.Type      3  1052815863245 350938621082   38.51 <0.0000000000000002 ***
## Residuals   1727 15737535170305   9112643411                                
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Coefficients: the intercept is the mean of the first heat type; the others
# are differences from it.
coef(modelaov2)
##        (Intercept)   Heat.TypeHot Air Heat.TypeHot Water   Heat.TypeUnknown 
##          161888.63           64493.99           47243.83          -64882.38

#ANOVA test based on Sewer type

# Mean price per sewer type, read from the cleaned HP data frame rather than
# from the attach()ed copy of the raw columns.
tapply(HP$Price, HP$Sewer.Type, mean)
##  Private   Public  Unknown 
## 199597.0 216375.2 250952.3
# One-way ANOVA of Price by sewer type and the equivalent linear model. The
# aov fit is stored as aov_sewer so that the name of the stats::anova()
# function is not reused for a data object.
aov_sewer <- aov(Price ~ Sewer.Type, data = HP)
modelaov3 <- lm(Price ~ Sewer.Type, data = HP)
summary(aov_sewer)
##               Df         Sum Sq     Mean Sq F value  Pr(>F)   
## Sewer.Type     2   119121637669 59560818834   6.174 0.00213 **
## Residuals   1728 16671229395882  9647702197                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Coefficients: the intercept is the mean of the first sewer type; the others
# are differences from it.
coef(modelaov3)
##       (Intercept)  Sewer.TypePublic Sewer.TypeUnknown 
##         199597.01          16778.16          51355.33

# Multiple regression: Price on Land.Value, Living.Area, Bathrooms and Rooms
# (Bathrooms and Rooms are factors in HP, so each level gets a coefficient).

modelReg1 <- lm(
  formula = Price ~ Land.Value + Living.Area + Bathrooms + Rooms,
  data = HP
)
summary(modelReg1)
## 
## Call:
## lm(formula = Price ~ Land.Value + Living.Area + Bathrooms + Rooms, 
##     data = HP)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -220736  -35720   -6107   28504  457895 
## 
## Coefficients:
##                 Estimate  Std. Error t value             Pr(>|t|)    
## (Intercept)  24825.25652 43014.12234   0.577               0.5639    
## Land.Value       0.95881     0.04629  20.712 < 0.0000000000000002 ***
## Living.Area     67.46644     4.68388  14.404 < 0.0000000000000002 ***
## Bathrooms1.5  1677.97579  4491.69855   0.374               0.7088    
## Bathrooms2   26553.62811  5267.60117   5.041      0.0000005123389 ***
## Bathrooms2.5 32673.01750  5553.74474   5.883      0.0000000048343 ***
## Bathrooms3   49127.70068  9916.16941   4.954      0.0000007976127 ***
## Bathrooms3.5 81669.26773 12460.23993   6.554      0.0000000000737 ***
## Bathrooms4   40977.90717 24100.84532   1.700               0.0893 .  
## Bathrooms4.5 15921.28166 61701.91746   0.258               0.7964    
## Rooms3        9721.93717 43478.88320   0.224               0.8231    
## Rooms4       18230.61419 43209.73475   0.422               0.6731    
## Rooms5       11032.54585 43198.55445   0.255               0.7985    
## Rooms6       19219.74637 43184.54016   0.445               0.6563    
## Rooms7       14479.55576 43176.74178   0.335               0.7374    
## Rooms8       20716.82258 43259.57426   0.479               0.6321    
## Rooms9       10091.37351 43507.16571   0.232               0.8166    
## Rooms10      25262.03373 43612.66065   0.579               0.5625    
## Rooms11      24218.91394 44021.46186   0.550               0.5823    
## Rooms12      24528.64878 44314.69881   0.554               0.5800    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 60630 on 1711 degrees of freedom
## Multiple R-squared:  0.6254, Adjusted R-squared:  0.6213 
## F-statistic: 150.4 on 19 and 1711 DF,  p-value: < 0.00000000000000022
# Default lm diagnostic plots for the four-predictor model.
plot(modelReg1)
## Warning: not plotting observations with leverage one:
##   397

# Refit the model with the Rooms term removed.
modelReg2 <- update(modelReg1, ~ . - Rooms)
summary(modelReg2)
## 
## Call:
## lm(formula = Price ~ Land.Value + Living.Area + Bathrooms, data = HP)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -218676  -34780   -6004   28300  464106 
## 
## Coefficients:
##                 Estimate  Std. Error t value             Pr(>|t|)    
## (Intercept)  36449.67197  5450.93655   6.687      0.0000000000307 ***
## Land.Value       0.95584     0.04616  20.708 < 0.0000000000000002 ***
## Living.Area     71.00893     3.69160  19.235 < 0.0000000000000002 ***
## Bathrooms1.5   605.56407  4437.33361   0.136               0.8915    
## Bathrooms2   25603.15485  5202.53630   4.921      0.0000009419736 ***
## Bathrooms2.5 32292.06265  5484.66552   5.888      0.0000000046983 ***
## Bathrooms3   48811.90798  9869.02259   4.946      0.0000008315425 ***
## Bathrooms3.5 82548.55485 12226.62423   6.752      0.0000000000199 ***
## Bathrooms4   40523.12361 23641.26148   1.714               0.0867 .  
## Bathrooms4.5 15285.53485 61440.36902   0.249               0.8036    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 60620 on 1721 degrees of freedom
## Multiple R-squared:  0.6234, Adjusted R-squared:  0.6214 
## F-statistic: 316.5 on 9 and 1721 DF,  p-value: < 0.00000000000000022
# Default lm diagnostic plots for the model without Rooms.
plot(modelReg2)
## Warning: not plotting observations with leverage one:
##   397

# Refit again with the Bathrooms term removed as well.
modelReg3 <- update(modelReg2, ~ . - Bathrooms)
summary(modelReg3)
## 
## Call:
## lm(formula = Price ~ Land.Value + Living.Area, data = HP)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -241131  -37208   -6267   28046  465813 
## 
## Coefficients:
##                Estimate  Std. Error t value             Pr(>|t|)    
## (Intercept) 20075.72881  4491.81120   4.469           0.00000835 ***
## Land.Value      0.95713     0.04707  20.333 < 0.0000000000000002 ***
## Living.Area    90.41836     2.65719  34.028 < 0.0000000000000002 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 62100 on 1728 degrees of freedom
## Multiple R-squared:  0.6032, Adjusted R-squared:  0.6027 
## F-statistic:  1313 on 2 and 1728 DF,  p-value: < 0.00000000000000022
# Default lm diagnostic plots for the two-predictor model.
plot(modelReg3)

# F-test comparing the nested models with and without Bathrooms.
stats::anova(modelReg2, modelReg3)
## Analysis of Variance Table
## 
## Model 1: Price ~ Land.Value + Living.Area + Bathrooms
## Model 2: Price ~ Land.Value + Living.Area
##   Res.Df           RSS Df     Sum of Sq      F                Pr(>F)    
## 1   1721 6324003469402                                                  
## 2   1728 6663044967528 -7 -339041498126 13.181 < 0.00000000000000022 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Fitted prices from modelReg3 for the rows it was trained on, appended to HP
# as a new Pred.Price column; then preview the first six rows.
Pred.Price <- stats::predict(modelReg3)
PredHP <- cbind(HP, Pred.Price = Pred.Price)
utils::head(PredHP, n = 6L)
##    Price Lot.Size Waterfront Age Land.Value New.Construct Central.Air Fuel.Type
## 1 132500     0.09         No  42      50000            No          No  Electric
## 2 181115     0.92         No   0      22300            No          No       Gas
## 3 109000     0.19         No 133       7300            No          No       Gas
## 4 155000     0.41         No  13      18700            No          No       Gas
## 5  86060     0.11         No   0      15000           Yes         Yes       Gas
## 6 120000     0.68         No  31      14000            No          No       Gas
##   Heat.Type Sewer.Type Living.Area Pct.College Bedrooms Fireplaces Bathrooms
## 1  Electric    Private         906          35        2          1         1
## 2 Hot Water    Private        1953          51        3          0       2.5
## 3 Hot Water     Public        1944          51        4          1         1
## 4   Hot Air    Private        1944          51        3          1       1.5
## 5   Hot Air     Public         840          51        2          0         1
## 6   Hot Air    Private        1152          22        4          1         1
##   Rooms Pred.Price
## 1     5   149851.4
## 2     6   218006.8
## 3     8   202836.1
## 4     5   213747.4
## 5     3   110384.1
## 6     8   137637.5
library(rpart)
library(rpart.plot)
## Warning: package 'rpart.plot' was built under R version 4.0.2
# Regression tree for the fitted prices, with every other column of PredHP
# offered as a candidate splitter.
# NOTE(review): the candidate predictors include the observed Price itself;
# confirm that this is intended.
modeltree <- rpart(formula = Pred.Price ~ ., data = PredHP)
# Draw the tree; digits = -3 controls how node values are formatted.
rpart.plot(modeltree, digits = -3)

# Show all 16 columns of row 179 of HP.
HP[179, seq_len(16)]
##      Price Lot.Size Waterfront Age Land.Value New.Construct Central.Air
## 179 247000     0.46         No  13      36600            No         Yes
##     Fuel.Type Heat.Type Sewer.Type Living.Area Pct.College Bedrooms Fireplaces
## 179       Gas   Hot Air     Public        2114          64        4          1
##     Bathrooms Rooms
## 179       3.5     8
# Predicted price from modelReg3 for a house with Land.Value 36600 and
# Living.Area 2114; the new observation is passed as a one-row data frame, the
# form predict() documents for `newdata`.
predict(
  modelReg3,
  newdata = data.frame(Land.Value = 36600, Living.Area = 2114)
)
##        1 
## 246251.2
# Fitted price against Living.Area with a least-squares line. The smoothing
# formula is stated explicitly so ggplot2 does not print a message about it.
ggplot(PredHP, aes(x = Living.Area, y = Pred.Price)) +
  geom_point(color = "blue", size = 2) +
  geom_smooth(method = lm, formula = y ~ x, color = "#2C3E50")

# Fitted price against Land.Value with a least-squares line.
ggplot(PredHP, aes(x = Land.Value, y = Pred.Price)) +
  geom_point(color = "blue", size = 2) +
  geom_smooth(method = lm, formula = y ~ x, color = "#2C3E50")