#Project- House Price of Saratoga County, New York
#1. Descriptive Statistics #Read the Dataset.
# Read the house-price data set; the first row of the CSV holds column names.
HP <- read.csv("C:\\Users\\Sweta\\Documents\\Data\\house price.csv", header = TRUE)
# NOTE(review): attach() is retained so that code referring to columns by bare
# name (e.g. `Price`) keeps working. The attached copy is a snapshot of HP as
# read here: later changes to HP (na.omit(), factor conversion) are not
# reflected in it. Prefer `HP$column` or `data = HP` in new code.
attach(HP)
names(HP)
## [1] "Price" "Lot.Size" "Waterfront" "Age"
## [5] "Land.Value" "New.Construct" "Central.Air" "Fuel.Type"
## [9] "Heat.Type" "Sewer.Type" "Living.Area" "Pct.College"
## [13] "Bedrooms" "Fireplaces" "Bathrooms" "Rooms"
# Remove rows that have missing values in the data frame
HP <- na.omit(HP)
# Check the first few rows of the data frame
head(HP)
## Price Lot.Size Waterfront Age Land.Value New.Construct Central.Air Fuel.Type
## 1 132500 0.09 0 42 50000 0 0 Electric
## 2 181115 0.92 0 0 22300 0 0 Gas
## 3 109000 0.19 0 133 7300 0 0 Gas
## 4 155000 0.41 0 13 18700 0 0 Gas
## 5 86060 0.11 0 0 15000 1 1 Gas
## 6 120000 0.68 0 31 14000 0 0 Gas
## Heat.Type Sewer.Type Living.Area Pct.College Bedrooms Fireplaces Bathrooms
## 1 Electric Private 906 35 2 1 1.0
## 2 Hot Water Private 1953 51 3 0 2.5
## 3 Hot Water Public 1944 51 4 1 1.0
## 4 Hot Air Private 1944 51 3 1 1.5
## 5 Hot Air Public 840 51 2 0 1.0
## 6 Hot Air Private 1152 22 4 1 1.0
## Rooms
## 1 5
## 2 6
## 3 8
## 4 5
## 5 3
## 6 8
# Show the type and leading values of every column in the data frame
utils::str(HP)
## 'data.frame': 1731 obs. of 16 variables:
## $ Price : int 132500 181115 109000 155000 86060 120000 153000 170000 90000 122900 ...
## $ Lot.Size : num 0.09 0.92 0.19 0.41 0.11 0.68 0.4 1.21 0.83 1.94 ...
## $ Waterfront : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Age : int 42 0 133 13 0 31 33 23 36 4 ...
## $ Land.Value : int 50000 22300 7300 18700 15000 14000 23300 14600 22200 21200 ...
## $ New.Construct: int 0 0 0 0 1 0 0 0 0 0 ...
## $ Central.Air : int 0 0 0 0 1 0 0 0 0 0 ...
## $ Fuel.Type : chr "Electric" "Gas" "Gas" "Gas" ...
## $ Heat.Type : chr "Electric" "Hot Water" "Hot Water" "Hot Air" ...
## $ Sewer.Type : chr "Private" "Private" "Public" "Private" ...
## $ Living.Area : int 906 1953 1944 1944 840 1152 2752 1662 1632 1416 ...
## $ Pct.College : int 35 51 51 51 51 22 51 35 51 44 ...
## $ Bedrooms : int 2 3 4 3 2 4 4 4 3 3 ...
## $ Fireplaces : int 1 0 1 1 0 1 1 1 0 0 ...
## $ Bathrooms : num 1 2.5 1 1.5 1 1 1.5 1.5 1.5 1.5 ...
## $ Rooms : int 5 6 8 5 3 8 8 9 8 6 ...
# Treat Fireplaces, Bedrooms, Bathrooms and Rooms as categorical variables.
count_cols <- c("Fireplaces", "Bedrooms", "Bathrooms", "Rooms")
HP[count_cols] <- lapply(HP[count_cols], as.factor)

# Recode the 0/1 indicators Waterfront, Central.Air and New.Construct as
# "No"/"Yes" factors. The levels are given explicitly so the 0 -> "No",
# 1 -> "Yes" mapping does not depend on which values happen to occur, and a
# column that is already a factor is left untouched so the step can be re-run.
to_yes_no <- function(x) {
  if (is.factor(x)) {
    x
  } else {
    factor(x, levels = c(0, 1), labels = c("No", "Yes"))
  }
}
flag_cols <- c("Waterfront", "Central.Air", "New.Construct")
HP[flag_cols] <- lapply(HP[flag_cols], to_yes_no)

# Print numbers in fixed notation instead of scientific notation.
options(scipen = 999)
# Summary statistics of the numeric columns of HP. vapply() is used instead of
# sapply() so the shape of each result is fixed (one value per column for the
# mean/median, five values per column for the five-number summary).
numeric_cols <- Filter(is.numeric, HP)
# Mean
vapply(numeric_cols, mean, numeric(1), na.rm = TRUE)
## Price Lot.Size Age Land.Value Living.Area
## 211710.3362218 0.5007972 28.1473137 34549.7515887 1753.6915078
## Pct.College
## 55.5696129
# Median
vapply(numeric_cols, median, numeric(1), na.rm = TRUE)
## Price Lot.Size Age Land.Value Living.Area Pct.College
## 189900.00 0.37 19.00 25000.00 1632.00 57.00
# Five-number summary (minimum, lower hinge, median, upper hinge, maximum)
vapply(numeric_cols, fivenum, numeric(5), na.rm = TRUE)
## Price Lot.Size Age Land.Value Living.Area Pct.College
## [1,] 5000 0.00 0 200 616.0 20
## [2,] 145000 0.17 13 15100 1300.0 52
## [3,] 189900 0.37 19 25000 1632.0 57
## [4,] 258193 0.54 34 40200 2135.5 64
## [5,] 775000 12.20 225 412600 5228.0 82
# Range of house prices in Saratoga, taken from the cleaned data frame rather
# than from the attach()ed copy of the raw data.
range(HP$Price)
## [1] 5000 775000
# Histograms of the continuous variables
library(ggplot2)

# Build a histogram of one column of `data`, with bars shaded by bin count.
#   data      data frame holding the column
#   column    name of the column to plot, as a string
#   binwidth  bin width in the units of `column`
#   title     plot title (centred)
#   x_label   x-axis label
# Returns the ggplot object; it is drawn when printed (as at top level).
plot_histogram <- function(data, column, binwidth, title, x_label) {
  # after_stat(count) replaces the deprecated `..count..` notation.
  ggplot(data, aes(x = .data[[column]], fill = after_stat(count))) +
    geom_histogram(binwidth = binwidth) +
    ggtitle(title) +
    ylab("Frequency") +
    xlab(x_label) +
    theme(plot.title = element_text(hjust = 0.5))
}

# Histogram for House Price
plot_histogram(HP, "Price", 20000, "Histogram of House Price", "Housing Price")
# Histogram for Lot.Size
plot_histogram(HP, "Lot.Size", 0.5, "Histogram of Lot Size", "Lot Size")
# Histogram for Age
plot_histogram(HP, "Age", 10, "Histogram of Age", "Age")
# Histogram for Land value
plot_histogram(HP, "Land.Value", 20000, "Histogram of Land Value", "Land Value")
# Histogram of percent of neighborhood that graduated college
plot_histogram(
  HP, "Pct.College", 5,
  "Histogram of percent of neighborhood that graduated college",
  "Percent of neighborhood that graduated college"
)
# Histogram of Living Area
plot_histogram(HP, "Living.Area", 200, "Histogram of Living Area", "Living Area")
# Bar charts of the categorical variables

# Build a bar chart of the number of rows per category of one column, with
# each bar coloured by category and labelled with its count.
#   data    data frame holding the column
#   column  name of the categorical column, as a string
#   title   plot title (centred)
# Returns the ggplot object; it is drawn when printed (as at top level).
plot_category_counts <- function(data, column, title) {
  ggplot(data, aes(x = .data[[column]], fill = .data[[column]])) +
    geom_bar() +
    ggtitle(title) +
    theme(plot.title = element_text(hjust = 0.5)) +
    # after_stat(count) replaces the deprecated `..count..` notation.
    geom_text(stat = "count", aes(label = after_stat(count)), vjust = -0.25) +
    # Label the axis and legend with the column name itself.
    labs(x = column, fill = column)
}

# Distribution of waterfront
plot_category_counts(HP, "Waterfront", "Distribution of Waterfront")
# Distribution of New Construction
plot_category_counts(HP, "New.Construct", "Distribution of New Construction")
# Distribution of Central Air
plot_category_counts(HP, "Central.Air", "Distribution of Central Air")
# Distribution of different Fuel type
plot_category_counts(HP, "Fuel.Type", "Distribution of different Fuel type")
# Distribution of different Heat type
plot_category_counts(HP, "Heat.Type", "Distribution of different Heat type")
# Distribution of different Sewer type
plot_category_counts(HP, "Sewer.Type", "Distribution of different Sewer type")
# Distribution of Bedrooms
plot_category_counts(HP, "Bedrooms", "Distribution of Bedrooms")
# Distribution of Fireplaces
plot_category_counts(HP, "Fireplaces", "Distribution of fireplaces")
# Distribution of Bathrooms
plot_category_counts(HP, "Bathrooms", "Distribution of Bathrooms")
# Distribution of Rooms
plot_category_counts(HP, "Rooms", "Distribution of rooms")
#Correlation Analysis
library(ggcorrplot)
## Warning: package 'ggcorrplot' was built under R version 4.0.2
library(car)
## Warning: package 'car' was built under R version 4.0.2
## Loading required package: carData
# Correlation matrix of Price and the numeric / count predictors.
# Columns are taken from the cleaned HP data frame instead of the attach()ed
# copies of the raw data. Columns stored as factors are converted back to
# numbers through their labels (as.character), not their internal level codes.
factor_to_number <- function(x) {
  if (is.factor(x)) as.numeric(as.character(x)) else as.numeric(x)
}
corr_cols <- c(
  "Price", "Lot.Size", "Age", "Land.Value", "Living.Area", "Pct.College",
  "Bedrooms", "Fireplaces", "Bathrooms", "Rooms"
)
HPCorr <- data.frame(lapply(HP[corr_cols], factor_to_number))
corr <- cor(HPCorr)
round(corr, 2)
## Price Lot.Size Age Land.Value Living.Area Pct.College Bedrooms
## Price 1.00 0.16 -0.19 0.58 0.71 0.20 0.40
## Lot.Size 0.16 1.00 -0.01 0.06 0.16 -0.03 0.11
## Age -0.19 -0.01 1.00 -0.02 -0.18 -0.04 0.02
## Land.Value 0.58 0.06 -0.02 1.00 0.42 0.23 0.20
## Living.Area 0.71 0.16 -0.18 0.42 1.00 0.21 0.66
## Pct.College 0.20 -0.03 -0.04 0.23 0.21 1.00 0.16
## Bedrooms 0.40 0.11 0.02 0.20 0.66 0.16 1.00
## Fireplaces 0.38 0.08 -0.18 0.21 0.48 0.25 0.29
## Bathrooms 0.60 0.08 -0.36 0.30 0.72 0.18 0.46
## Rooms 0.53 0.14 -0.09 0.30 0.73 0.16 0.67
## Fireplaces Bathrooms Rooms
## Price 0.38 0.60 0.53
## Lot.Size 0.08 0.08 0.14
## Age -0.18 -0.36 -0.09
## Land.Value 0.21 0.30 0.30
## Living.Area 0.48 0.72 0.73
## Pct.College 0.25 0.18 0.16
## Bedrooms 0.29 0.46 0.67
## Fireplaces 1.00 0.44 0.32
## Bathrooms 0.44 1.00 0.52
## Rooms 0.32 0.52 1.00
# Heat map of the correlation matrix with the coefficients printed in the cells
ggcorrplot(corr, lab = TRUE)
# A few predictors show a strong relationship with house price: Living Area
# (0.71), Land Value (0.58), Bathrooms (0.60) and Rooms (0.53) are highly
# correlated with the response variable Price. On the other hand, Rooms,
# Bedrooms, Bathrooms and Living Area are correlated with each other.

# Variance Inflation Factor
# The count variables are factors in HP. as.numeric() on a factor returns the
# internal level codes (1, 2, 3, ...) rather than the recorded values, so they
# are converted through their labels before fitting the all-numeric model.
factor_to_number <- function(x) {
  if (is.factor(x)) as.numeric(as.character(x)) else as.numeric(x)
}
model <- lm(
  Price ~ Lot.Size + Age + Land.Value + Living.Area + Pct.College +
    Bedrooms + Fireplaces + Bathrooms + Rooms,
  data = transform(
    HP,
    Bedrooms = factor_to_number(Bedrooms),
    Fireplaces = factor_to_number(Fireplaces),
    Bathrooms = factor_to_number(Bathrooms),
    Rooms = factor_to_number(Rooms)
  )
)
car::vif(model)
## Lot.Size Age Land.Value
## 1.035819 1.222638 1.276047
## Living.Area Pct.College as.numeric(Bedrooms)
## 4.108891 1.114403 2.147644
## as.numeric(Fireplaces) as.numeric(Bathrooms) as.numeric(Rooms)
## 1.370883 2.402161 2.525071
# Price against Lot.Size: scatter plot with a fitted straight line, followed
# by the simple linear regression of Price on Lot.Size
ggplot(data = HP, mapping = aes(x = Lot.Size, y = Price)) +
  geom_point(colour = "blue", size = 2) +
  geom_smooth(method = lm, colour = "#2C3E50")
## `geom_smooth()` using formula 'y ~ x'
model1 <- lm(formula = Price ~ Lot.Size, data = HP)
summary(model1)
##
## Call:
## lm(formula = Price ~ Lot.Size, data = HP)
##
## Residuals:
## Min 1Q Median 3Q Max
## -313017 -64785 -22936 44801 574357
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 200643 2878 69.706 < 0.0000000000000002 ***
## Lot.Size 22100 3349 6.599 0.000000000055 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 97330 on 1729 degrees of freedom
## Multiple R-squared: 0.02456, Adjusted R-squared: 0.024
## F-statistic: 43.54 on 1 and 1729 DF, p-value: 0.000000000055
# Residuals-vs-fitted diagnostic plot for the Lot.Size model
plot(model1, which = 1)
# Price against Age: scatter plot with a fitted straight line, followed by the
# simple linear regression of Price on Age
ggplot(data = HP, mapping = aes(x = Age, y = Price)) +
  geom_point(colour = "blue", size = 2) +
  geom_smooth(method = lm, colour = "#2C3E50")
## `geom_smooth()` using formula 'y ~ x'
model2 <- lm(formula = Price ~ Age, data = HP)
summary(model2)
##
## Call:
## lm(formula = Price ~ Age, data = HP)
##
## Residuals:
## Min 1Q Median 3Q Max
## -222321 -66295 -22169 43109 565134
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 229906.65 3200.76 71.829 < 0.0000000000000002 ***
## Age -646.47 78.22 -8.264 0.000000000000000277 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 96650 on 1729 degrees of freedom
## Multiple R-squared: 0.038, Adjusted R-squared: 0.03744
## F-statistic: 68.3 on 1 and 1729 DF, p-value: 0.0000000000000002765
# Residuals-vs-fitted diagnostic plot for the Age model
plot(model2, which = 1)
# Price against Land.Value: scatter plot with a fitted straight line, followed
# by the simple linear regression of Price on Land.Value
ggplot(data = HP, mapping = aes(x = Land.Value, y = Price)) +
  geom_point(colour = "blue", size = 2) +
  geom_smooth(method = lm, colour = "#2C3E50")
## `geom_smooth()` using formula 'y ~ x'
model3 <- lm(formula = Price ~ Land.Value, data = HP)
summary(model3)
##
## Call:
## lm(formula = Price ~ Land.Value, data = HP)
##
## Residuals:
## Min 1Q Median 3Q Max
## -267746 -49152 -14017 36236 501092
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 155234.41155 2709.83679 57.29 <0.0000000000000002 ***
## Land.Value 1.63463 0.05511 29.66 <0.0000000000000002 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 80220 on 1729 degrees of freedom
## Multiple R-squared: 0.3373, Adjusted R-squared: 0.3369
## F-statistic: 879.8 on 1 and 1729 DF, p-value: < 0.00000000000000022
# Residuals-vs-fitted diagnostic plot for the Land.Value model
plot(model3, which = 1)
# Price against Living.Area: scatter plot with a fitted straight line,
# followed by the simple linear regression of Price on Living.Area
ggplot(data = HP, mapping = aes(x = Living.Area, y = Price)) +
  geom_point(colour = "brown", size = 2) +
  geom_smooth(method = lm, colour = "#2C3E50")
## `geom_smooth()` using formula 'y ~ x'
model4 <- lm(formula = Price ~ Living.Area, data = HP)
summary(model4)
##
## Call:
## lm(formula = Price ~ Living.Area, data = HP)
##
## Residuals:
## Min 1Q Median 3Q Max
## -277098 -39352 -7638 28354 553580
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 13069.90 4984.18 2.622 0.00881 **
## Living.Area 113.27 2.68 42.271 < 0.0000000000000002 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 69110 on 1729 degrees of freedom
## Multiple R-squared: 0.5082, Adjusted R-squared: 0.5079
## F-statistic: 1787 on 1 and 1729 DF, p-value: < 0.00000000000000022
# Residuals-vs-fitted diagnostic plot for the Living.Area model
plot(model4, which = 1)
# Price against Pct.College: scatter plot with a fitted straight line,
# followed by the simple linear regression of Price on Pct.College
ggplot(data = HP, mapping = aes(x = Pct.College, y = Price)) +
  geom_point(colour = "brown", size = 2) +
  geom_smooth(method = lm, colour = "#2C3E50")
## `geom_smooth()` using formula 'y ~ x'
model5 <- lm(formula = Price ~ Pct.College, data = HP)
summary(model5)
##
## Call:
## lm(formula = Price ~ Pct.College, data = HP)
##
## Residuals:
## Min 1Q Median 3Q Max
## -220827 -64665 -22231 42120 560572
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 106139.0 12710.6 8.350 <0.0000000000000002 ***
## Pct.College 1899.8 224.9 8.448 <0.0000000000000002 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 96570 on 1729 degrees of freedom
## Multiple R-squared: 0.03964, Adjusted R-squared: 0.03908
## F-statistic: 71.37 on 1 and 1729 DF, p-value: < 0.00000000000000022
# Residuals-vs-fitted diagnostic plot for the Pct.College model
plot(model5, which = 1)
# Box plot of house price by Bedrooms; the red point marks each group's mean.
# `fun` replaces the deprecated `fun.y` argument of stat_summary().
ggplot(HP, aes(x = Bedrooms, y = Price, fill = Bedrooms)) +
  geom_boxplot(alpha = 0.3) +
  stat_summary(fun = mean, geom = "point", shape = 20, size = 4, color = "red", fill = "red") +
  ggtitle("Boxplot of House Price by Bedrooms") +
  theme(plot.title = element_text(hjust = 0.5))
# Regression of Price on Bedrooms (one coefficient per factor level)
model6 <- lm(Price ~ Bedrooms, data = HP)
summary(model6)
##
## Call:
## lm(formula = Price ~ Bedrooms, data = HP)
##
## Residuals:
## Min 1Q Median 3Q Max
## -200551 -55407 -16007 32697 574593
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 192771 33789 5.705 0.0000000136 ***
## Bedrooms2 -40468 34125 -1.186 0.2358
## Bedrooms3 7635 33932 0.225 0.8220
## Bedrooms4 72779 34031 2.139 0.0326 *
## Bedrooms5 83806 35951 2.331 0.0199 *
## Bedrooms6 84557 46267 1.828 0.0678 .
## Bedrooms7 33895 61689 0.549 0.5828
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 89400 on 1724 degrees of freedom
## Multiple R-squared: 0.1794, Adjusted R-squared: 0.1766
## F-statistic: 62.83 on 6 and 1724 DF, p-value: < 0.00000000000000022
# Residuals-vs-fitted diagnostic plot for the Bedrooms model
plot(model6, which = 1)
# Box plot of house price by Fireplaces; the red point marks each group's mean.
# `fun` replaces the deprecated `fun.y` argument of stat_summary().
ggplot(HP, aes(x = Fireplaces, y = Price, fill = Fireplaces)) +
  geom_boxplot(alpha = 0.3) +
  stat_summary(fun = mean, geom = "point", shape = 20, size = 4, color = "red", fill = "red") +
  ggtitle("Boxplot of House Price by Fireplaces") +
  theme(plot.title = element_text(hjust = 0.5))
# Regression of Price on Fireplaces (one coefficient per factor level)
model7 <- lm(Price ~ Fireplaces, data = HP)
summary(model7)
##
## Call:
## lm(formula = Price ~ Fireplaces, data = HP)
##
## Residuals:
## Min 1Q Median 3Q Max
## -230205 -57705 -18305 42753 585764
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 174236 3328 52.354 < 0.0000000000000002 ***
## Fireplaces1 60969 4453 13.690 < 0.0000000000000002 ***
## Fireplaces2 144586 14397 10.043 < 0.0000000000000002 ***
## Fireplaces3 186264 64275 2.898 0.0038 **
## Fireplaces4 525764 64275 8.180 0.000000000000000545 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 90780 on 1726 degrees of freedom
## Multiple R-squared: 0.1529, Adjusted R-squared: 0.1509
## F-statistic: 77.88 on 4 and 1726 DF, p-value: < 0.00000000000000022
# Residuals-vs-fitted diagnostic plot for the Fireplaces model
plot(model7, which = 1)
# Box plot of house price by Bathrooms; the red point marks each group's mean.
# `fun` replaces the deprecated `fun.y` argument of stat_summary().
ggplot(HP, aes(x = Bathrooms, y = Price, fill = Bathrooms)) +
  geom_boxplot(alpha = 0.3) +
  stat_summary(fun = mean, geom = "point", shape = 20, size = 4, color = "red", fill = "red") +
  ggtitle("Boxplot of House Price by Bathrooms") +
  theme(plot.title = element_text(hjust = 0.5))
# Regression of Price on Bathrooms (one coefficient per factor level)
model8 <- lm(Price ~ Bathrooms, data = HP)
summary(model8)
##
## Call:
## lm(formula = Price ~ Bathrooms, data = HP)
##
## Residuals:
## Min 1Q Median 3Q Max
## -263345 -46979 -9345 31655 506655
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 147751 4316 34.231 < 0.0000000000000002 ***
## Bathrooms1.5 20278 5558 3.649 0.000272 ***
## Bathrooms2 52861 6507 8.124 0.000000000000000849 ***
## Bathrooms2.5 120594 5452 22.118 < 0.0000000000000002 ***
## Bathrooms3 173228 11288 15.346 < 0.0000000000000002 ***
## Bathrooms3.5 235362 13706 17.172 < 0.0000000000000002 ***
## Bathrooms4 300699 27931 10.766 < 0.0000000000000002 ***
## Bathrooms4.5 264749 78172 3.387 0.000723 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 78050 on 1723 degrees of freedom
## Multiple R-squared: 0.3748, Adjusted R-squared: 0.3723
## F-statistic: 147.6 on 7 and 1723 DF, p-value: < 0.00000000000000022
# Residuals-vs-fitted diagnostic plot for the Bathrooms model
plot(model8, which = 1)
# Box plot of house price by Rooms; the red point marks each group's mean.
# `fun` replaces the deprecated `fun.y` argument of stat_summary().
ggplot(HP, aes(x = Rooms, y = Price, fill = Rooms)) +
  geom_boxplot(alpha = 0.3) +
  stat_summary(fun = mean, geom = "point", shape = 20, size = 4, color = "red", fill = "red") +
  ggtitle("Boxplot of House Price by Rooms") +
  theme(plot.title = element_text(hjust = 0.5))
# Regression of Price on Rooms (one coefficient per factor level)
model9 <- lm(Price ~ Rooms, data = HP)
summary(model9)
##
## Call:
## lm(formula = Price ~ Rooms, data = HP)
##
## Residuals:
## Min 1Q Median 3Q Max
## -294719 -50850 -11450 36525 529721
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 94500 57878 1.633 0.102708
## Rooms3 39656 58597 0.677 0.498651
## Rooms4 73745 58194 1.267 0.205242
## Rooms5 71950 58136 1.238 0.216032
## Rooms6 90814 58112 1.563 0.118300
## Rooms7 97329 58068 1.676 0.093895 .
## Rooms8 126097 58102 2.170 0.030124 *
## Rooms9 150779 58285 2.587 0.009765 **
## Rooms10 194067 58287 3.329 0.000888 ***
## Rooms11 211414 58677 3.603 0.000323 ***
## Rooms12 278719 58625 4.754 0.00000216 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 81850 on 1720 degrees of freedom
## Multiple R-squared: 0.3137, Adjusted R-squared: 0.3097
## F-statistic: 78.61 on 10 and 1720 DF, p-value: < 0.00000000000000022
# Residuals-vs-fitted diagnostic plot for the Rooms model
plot(model9, which = 1)
# Box plot of house price by Waterfront; the red point marks each group's mean.
# `fun` replaces the deprecated `fun.y` argument of stat_summary().
ggplot(HP, aes(x = Waterfront, y = Price, fill = Waterfront)) +
  geom_boxplot(alpha = 0.3) +
  stat_summary(fun = mean, geom = "point", shape = 20, size = 4, color = "red", fill = "red") +
  ggtitle("Boxplot of House Price by Waterfront") +
  theme(plot.title = element_text(hjust = 0.5))
# Hypothesis test (alpha = 0.05)
# Null hypothesis: there is no significant difference between the mean prices
# of homes where a waterfront is present and homes where it is absent.
# Alternative hypothesis: there is a significant difference between the mean
# prices of homes where a waterfront is present and homes where it is absent.
# Group means are computed from the cleaned HP data frame (not the attach()ed
# copy), so the groups carry the recoded Waterfront labels.
tapply(HP$Price, HP$Waterfront, mean)
## 0 1
## 210291.8 373991.7
# Two-sample t-test of Price by Waterfront, evaluated on the cleaned HP data
# frame instead of the attach()ed copy of the raw data.
t.test(Price ~ Waterfront, data = HP)
##
## Welch Two Sample t-test
##
## data: Price by Waterfront
## t = -4.0825, df = 14.095, p-value = 0.001105
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -249647.61 -77752.13
## sample estimates:
## mean in group 0 mean in group 1
## 210291.8 373991.7
# Box plot of house price by New.Construct; the red point marks each group's
# mean. `fun` replaces the deprecated `fun.y` argument of stat_summary().
ggplot(HP, aes(x = New.Construct, y = Price, fill = New.Construct)) +
  geom_boxplot(alpha = 0.3) +
  stat_summary(fun = mean, geom = "point", shape = 20, size = 4, color = "red", fill = "red") +
  ggtitle("Boxplot of House Price by New Construction") +
  theme(plot.title = element_text(hjust = 0.5))
# Hypothesis test (alpha = 0.05)
# Null hypothesis: there is no significant difference between the mean prices
# of newly constructed homes and old homes.
# Alternative hypothesis: there is a significant difference between the mean
# prices of newly constructed homes and old homes.
# Group means are computed from the cleaned HP data frame (not the attach()ed
# copy), so the groups carry the recoded New.Construct labels.
tapply(HP$Price, HP$New.Construct, mean)
## 0 1
## 208244.7 282306.8
# Two-sample t-test of Price by New.Construct, evaluated on the cleaned HP data
# frame instead of the attach()ed copy of the raw data.
t.test(Price ~ New.Construct, data = HP)
##
## Welch Two Sample t-test
##
## data: Price by New.Construct
## t = -7.6912, df = 91.03, p-value = 0.00000000001659
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -93189.84 -54934.41
## sample estimates:
## mean in group 0 mean in group 1
## 208244.7 282306.8
# Box plot of house price by Central.Air; the red point marks each group's
# mean. `fun` replaces the deprecated `fun.y` argument of stat_summary().
ggplot(HP, aes(x = Central.Air, y = Price, fill = Central.Air)) +
  geom_boxplot(alpha = 0.3) +
  stat_summary(fun = mean, geom = "point", shape = 20, size = 4, color = "red", fill = "red") +
  ggtitle("Boxplot of House Price by Central Air") +
  theme(plot.title = element_text(hjust = 0.5))
# Hypothesis test (alpha = 0.05)
# Null hypothesis: there is no significant difference between the mean prices
# of homes where central air is present and homes where it is not present.
# Alternative hypothesis: there is a significant difference between the mean
# prices of homes where central air is present and homes where it is not
# present.
# Group means are computed from the cleaned HP data frame (not the attach()ed
# copy), so the groups carry the recoded Central.Air labels.
tapply(HP$Price, HP$Central.Air, mean)
## 0 1
## 186684.9 254903.8
# Two-sample t-test of Price by Central.Air, evaluated on the cleaned HP data
# frame instead of the attach()ed copy of the raw data.
t.test(Price ~ Central.Air, data = HP)
##
## Welch Two Sample t-test
##
## data: Price by Central.Air
## t = -13.387, df = 987.09, p-value < 0.00000000000000022
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -78219.22 -58218.46
## sample estimates:
## mean in group 0 mean in group 1
## 186684.9 254903.8
# One-way ANOVA of Price by fuel type. All calls read the cleaned HP data
# frame explicitly instead of the attach()ed copy of the raw data.
tapply(HP$Price, HP$Fuel.Type, mean)
## Electric Gas Oil Unknown
## 164937.57 228562.38 188734.40 97006.25
# NOTE: the object name `anova` is kept as in the original script; it shadows
# the stats::anova() function name, but calls such as anova(a, b) still
# resolve to the function.
anova <- aov(Price ~ Fuel.Type, data = HP)
# Equivalent linear model, fitted to read off the per-level coefficients
modelaov1 <- lm(Price ~ Fuel.Type, data = HP)
summary(anova)
## Df Sum Sq Mean Sq F value Pr(>F)
## Fuel.Type 3 1195429652300 398476550767 44.13 <0.0000000000000002 ***
## Residuals 1727 15594921381251 9030064494
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
coef(modelaov1)
## (Intercept) Fuel.TypeGas Fuel.TypeOil Fuel.TypeUnknown
## 164937.57 63624.81 23796.83 -67931.32
# One-way ANOVA of Price by heat type. All calls read the cleaned HP data
# frame explicitly instead of the attach()ed copy of the raw data.
tapply(HP$Price, HP$Heat.Type, mean)
## Electric Hot Air Hot Water Unknown
## 161888.63 226382.62 209132.46 97006.25
# NOTE: the object name `anova` is kept as in the original script; it shadows
# the stats::anova() function name, but calls such as anova(a, b) still
# resolve to the function.
anova <- aov(Price ~ Heat.Type, data = HP)
# Equivalent linear model, fitted to read off the per-level coefficients
modelaov2 <- lm(Price ~ Heat.Type, data = HP)
summary(anova)
## Df Sum Sq Mean Sq F value Pr(>F)
## Heat.Type 3 1052815863245 350938621082 38.51 <0.0000000000000002 ***
## Residuals 1727 15737535170305 9112643411
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
coef(modelaov2)
## (Intercept) Heat.TypeHot Air Heat.TypeHot Water Heat.TypeUnknown
## 161888.63 64493.99 47243.83 -64882.38
# One-way ANOVA of Price by sewer type. All calls read the cleaned HP data
# frame explicitly instead of the attach()ed copy of the raw data.
tapply(HP$Price, HP$Sewer.Type, mean)
## Private Public Unknown
## 199597.0 216375.2 250952.3
# NOTE: the object name `anova` is kept as in the original script; it shadows
# the stats::anova() function name, but calls such as anova(a, b) still
# resolve to the function.
anova <- aov(Price ~ Sewer.Type, data = HP)
# Equivalent linear model, fitted to read off the per-level coefficients
modelaov3 <- lm(Price ~ Sewer.Type, data = HP)
summary(anova)
## Df Sum Sq Mean Sq F value Pr(>F)
## Sewer.Type 2 119121637669 59560818834 6.174 0.00213 **
## Residuals 1728 16671229395882 9647702197
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
coef(modelaov3)
## (Intercept) Sewer.TypePublic Sewer.TypeUnknown
## 199597.01 16778.16 51355.33
# Multiple regression: Price on Land.Value, Living.Area, Bathrooms and Rooms
modelReg1 <- lm(
  formula = Price ~ Land.Value + Living.Area + Bathrooms + Rooms,
  data = HP
)
summary(modelReg1)
##
## Call:
## lm(formula = Price ~ Land.Value + Living.Area + Bathrooms + Rooms,
## data = HP)
##
## Residuals:
## Min 1Q Median 3Q Max
## -220736 -35720 -6107 28504 457895
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 24825.25652 43014.12234 0.577 0.5639
## Land.Value 0.95881 0.04629 20.712 < 0.0000000000000002 ***
## Living.Area 67.46644 4.68388 14.404 < 0.0000000000000002 ***
## Bathrooms1.5 1677.97579 4491.69855 0.374 0.7088
## Bathrooms2 26553.62811 5267.60117 5.041 0.0000005123389 ***
## Bathrooms2.5 32673.01750 5553.74474 5.883 0.0000000048343 ***
## Bathrooms3 49127.70068 9916.16941 4.954 0.0000007976127 ***
## Bathrooms3.5 81669.26773 12460.23993 6.554 0.0000000000737 ***
## Bathrooms4 40977.90717 24100.84532 1.700 0.0893 .
## Bathrooms4.5 15921.28166 61701.91746 0.258 0.7964
## Rooms3 9721.93717 43478.88320 0.224 0.8231
## Rooms4 18230.61419 43209.73475 0.422 0.6731
## Rooms5 11032.54585 43198.55445 0.255 0.7985
## Rooms6 19219.74637 43184.54016 0.445 0.6563
## Rooms7 14479.55576 43176.74178 0.335 0.7374
## Rooms8 20716.82258 43259.57426 0.479 0.6321
## Rooms9 10091.37351 43507.16571 0.232 0.8166
## Rooms10 25262.03373 43612.66065 0.579 0.5625
## Rooms11 24218.91394 44021.46186 0.550 0.5823
## Rooms12 24528.64878 44314.69881 0.554 0.5800
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 60630 on 1711 degrees of freedom
## Multiple R-squared: 0.6254, Adjusted R-squared: 0.6213
## F-statistic: 150.4 on 19 and 1711 DF, p-value: < 0.00000000000000022
# Diagnostic plots for the four-predictor model
plot(modelReg1)
## Warning: not plotting observations with leverage one:
## 397
# Refit without Rooms, keeping the same response and remaining predictors
modelReg2 <- update(modelReg1, . ~ . - Rooms)
summary(modelReg2)
##
## Call:
## lm(formula = Price ~ Land.Value + Living.Area + Bathrooms, data = HP)
##
## Residuals:
## Min 1Q Median 3Q Max
## -218676 -34780 -6004 28300 464106
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 36449.67197 5450.93655 6.687 0.0000000000307 ***
## Land.Value 0.95584 0.04616 20.708 < 0.0000000000000002 ***
## Living.Area 71.00893 3.69160 19.235 < 0.0000000000000002 ***
## Bathrooms1.5 605.56407 4437.33361 0.136 0.8915
## Bathrooms2 25603.15485 5202.53630 4.921 0.0000009419736 ***
## Bathrooms2.5 32292.06265 5484.66552 5.888 0.0000000046983 ***
## Bathrooms3 48811.90798 9869.02259 4.946 0.0000008315425 ***
## Bathrooms3.5 82548.55485 12226.62423 6.752 0.0000000000199 ***
## Bathrooms4 40523.12361 23641.26148 1.714 0.0867 .
## Bathrooms4.5 15285.53485 61440.36902 0.249 0.8036
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 60620 on 1721 degrees of freedom
## Multiple R-squared: 0.6234, Adjusted R-squared: 0.6214
## F-statistic: 316.5 on 9 and 1721 DF, p-value: < 0.00000000000000022
# Diagnostic plots for the model without Rooms
plot(modelReg2)
## Warning: not plotting observations with leverage one:
## 397
# Refit without Bathrooms, leaving Land.Value and Living.Area as predictors
modelReg3 <- update(modelReg2, . ~ . - Bathrooms)
summary(modelReg3)
##
## Call:
## lm(formula = Price ~ Land.Value + Living.Area, data = HP)
##
## Residuals:
## Min 1Q Median 3Q Max
## -241131 -37208 -6267 28046 465813
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 20075.72881 4491.81120 4.469 0.00000835 ***
## Land.Value 0.95713 0.04707 20.333 < 0.0000000000000002 ***
## Living.Area 90.41836 2.65719 34.028 < 0.0000000000000002 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 62100 on 1728 degrees of freedom
## Multiple R-squared: 0.6032, Adjusted R-squared: 0.6027
## F-statistic: 1313 on 2 and 1728 DF, p-value: < 0.00000000000000022
# Diagnostic plots for the two-predictor model
plot(modelReg3)
# F-test comparing the nested models with and without Bathrooms
stats::anova(modelReg2, modelReg3)
## Analysis of Variance Table
##
## Model 1: Price ~ Land.Value + Living.Area + Bathrooms
## Model 2: Price ~ Land.Value + Living.Area
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 1721 6324003469402
## 2 1728 6663044967528 -7 -339041498126 13.181 < 0.00000000000000022 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Fitted prices from modelReg3 for the rows it was fitted on, appended to the
# data as a new Pred.Price column
Pred.Price <- predict(object = modelReg3)
PredHP <- cbind(HP, Pred.Price = Pred.Price)
head(x = PredHP)
## Price Lot.Size Waterfront Age Land.Value New.Construct Central.Air Fuel.Type
## 1 132500 0.09 No 42 50000 No No Electric
## 2 181115 0.92 No 0 22300 No No Gas
## 3 109000 0.19 No 133 7300 No No Gas
## 4 155000 0.41 No 13 18700 No No Gas
## 5 86060 0.11 No 0 15000 Yes Yes Gas
## 6 120000 0.68 No 31 14000 No No Gas
## Heat.Type Sewer.Type Living.Area Pct.College Bedrooms Fireplaces Bathrooms
## 1 Electric Private 906 35 2 1 1
## 2 Hot Water Private 1953 51 3 0 2.5
## 3 Hot Water Public 1944 51 4 1 1
## 4 Hot Air Private 1944 51 3 1 1.5
## 5 Hot Air Public 840 51 2 0 1
## 6 Hot Air Private 1152 22 4 1 1
## Rooms Pred.Price
## 1 5 149851.4
## 2 6 218006.8
## 3 8 202836.1
## 4 5 213747.4
## 5 3 110384.1
## 6 8 137637.5
library(rpart)
library(rpart.plot)
## Warning: package 'rpart.plot' was built under R version 4.0.2
# Regression tree for the predicted price using every other column of PredHP.
# NOTE(review): the predictors therefore include the observed Price as well as
# Land.Value and Living.Area, from which Pred.Price was computed - confirm
# that this is intended.
modeltree <- rpart(formula = Pred.Price ~ ., data = PredHP)
rpart.plot(modeltree, digits = -3)
# Show all 16 columns of one example house (row 179)
HP[179, seq_len(16)]
## Price Lot.Size Waterfront Age Land.Value New.Construct Central.Air
## 179 247000 0.46 No 13 36600 No Yes
## Fuel.Type Heat.Type Sewer.Type Living.Area Pct.College Bedrooms Fireplaces
## 179 Gas Hot Air Public 2114 64 4 1
## Bathrooms Rooms
## 179 3.5 8
# Predicted price for a house with the Land.Value and Living.Area of row 179
predict(
  modelReg3,
  newdata = list(Land.Value = 36600, Living.Area = 2114)
)
## 1
## 246251.2
# Predicted price against Living.Area, with a fitted straight line
ggplot(data = PredHP, mapping = aes(x = Living.Area, y = Pred.Price)) +
  geom_point(colour = "blue", size = 2) +
  geom_smooth(method = lm, colour = "#2C3E50")
## `geom_smooth()` using formula 'y ~ x'
# Predicted price against Land.Value, with a fitted straight line
ggplot(data = PredHP, mapping = aes(x = Land.Value, y = Pred.Price)) +
  geom_point(colour = "blue", size = 2) +
  geom_smooth(method = lm, colour = "#2C3E50")
## `geom_smooth()` using formula 'y ~ x'