First, I load the packages and the data.
library(tidyverse)
## ── Attaching packages ─────────────────────────────── tidyverse 1.3.0 ──
## ✓ ggplot2 3.3.2 ✓ purrr 0.3.4
## ✓ tibble 3.0.3 ✓ dplyr 1.0.2
## ✓ tidyr 1.1.2 ✓ stringr 1.4.0
## ✓ readr 1.3.1 ✓ forcats 0.5.0
## ── Conflicts ────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(leaps)
library(scales)
##
## Attaching package: 'scales'
## The following object is masked from 'package:purrr':
##
## discard
## The following object is masked from 'package:readr':
##
## col_factor
library(readr)
# Read the raw Ames housing export; the first CSV row supplies the column names.
# NOTE(review): the path is absolute and machine-specific - adjust it when
# running this analysis on another computer.
AmesHousing <- read.csv(
  file = "/Users/timogunsalus/Downloads/AmesHousing.csv",
  header = TRUE
)
Select only the useful variables that don't give errors with my methods.
# Keep the sale price plus the fourteen predictors used in the models below.
AH <- AmesHousing[, c(
  "SalePrice", "MS.Zoning", "Lot.Area", "Neighborhood", "Gr.Liv.Area",
  "Overall.Cond", "Overall.Qual", "Land.Slope", "Lot.Config", "Condition.1",
  "Bldg.Type", "Year.Built", "Year.Remod.Add", "Garage.Cars", "TotRms.AbvGrd"
)]
# I only want residential buildings: keep the RH, RM and RL zoning classes.
AH1 <- filter(AH, MS.Zoning %in% c("RH", "RM", "RL"))
Make the data usable
# Change characters into factors and set the reference (baseline) level of
# each categorical predictor, so that model coefficients are measured against
# that level. Neighborhood is converted to a factor but keeps its default
# level order.
AH2 <- AH1 %>%
  mutate(
    MS.Zoning = relevel(factor(MS.Zoning, ordered = FALSE), ref = "RL"),
    Neighborhood = as.factor(Neighborhood),
    Land.Slope = relevel(factor(Land.Slope, ordered = FALSE), ref = "Gtl"),
    Lot.Config = relevel(factor(Lot.Config, ordered = FALSE), ref = "Inside"),
    Condition.1 = relevel(factor(Condition.1, ordered = FALSE), ref = "Norm"),
    Bldg.Type = relevel(factor(Bldg.Type, ordered = FALSE), ref = "1Fam")
  )
The structure of my data set:
## 'data.frame': 2762 obs. of 15 variables:
## $ SalePrice : int 215000 105000 172000 244000 189900 195500 213500 191500 236500 189000 ...
## $ MS.Zoning : Factor w/ 3 levels "RL","RH","RM": 1 2 1 1 1 1 1 1 1 1 ...
## $ Lot.Area : int 31770 11622 14267 11160 13830 9978 4920 5005 5389 7500 ...
## $ Neighborhood : Factor w/ 28 levels "Blmngtn","Blueste",..: 16 16 16 16 9 9 25 25 25 9 ...
## $ Gr.Liv.Area : int 1656 896 1329 2110 1629 1604 1338 1280 1616 1804 ...
## $ Overall.Cond : int 5 6 6 5 5 6 5 5 5 5 ...
## $ Overall.Qual : int 6 5 6 7 5 6 8 8 8 7 ...
## $ Land.Slope : Factor w/ 3 levels "Gtl","Mod","Sev": 1 1 1 1 1 1 1 1 1 1 ...
## $ Lot.Config : Factor w/ 5 levels "Inside","Corner",..: 2 1 2 2 1 1 1 1 1 1 ...
## $ Condition.1 : Factor w/ 9 levels "Norm","Artery",..: 1 3 1 1 1 1 1 1 1 1 ...
## $ Bldg.Type : Factor w/ 5 levels "1Fam","2fmCon",..: 1 1 1 1 1 1 5 5 5 1 ...
## $ Year.Built : int 1960 1961 1958 1968 1997 1998 2001 1992 1995 1999 ...
## $ Year.Remod.Add: int 1960 1961 1958 1968 1998 1998 2001 1992 1996 1999 ...
## $ Garage.Cars : int 2 1 1 2 2 2 2 2 2 2 ...
## $ TotRms.AbvGrd : int 7 5 6 8 6 7 6 5 5 7 ...
Next I created a training and testing dataset.
# Create the partitioning variable: 0 marks a training row, 1 a testing row.
# The row count is read from the data instead of being hard-coded, so the
# split keeps working if the number of rows in AH2 ever changes.
AH2$part <- rep(0, nrow(AH2))
set.seed(1) # fixed seed so the split is reproducible
# Draw half of the row indices (rounded down) without replacement.
test <- sample(seq_len(nrow(AH2)), nrow(AH2) %/% 2, replace = FALSE)
AH2$part[test] <- 1
# Training df: rows that were not drawn into the test sample.
# Note: `part` stays in both data frames and is constant within each of them.
AHTrain <- AH2 %>%
  filter(part == 0)
# Testing df: rows that were drawn into the test sample.
AHTest <- AH2 %>%
  filter(part == 1)
First, I looked at all the variables to view their significance, and then looked for the best linear model to fit to the data.
# Linear model of sale price on every column except Neighborhood.
# NOTE(review): this fit uses the testing split (AHTest), and the `part`
# column is constant within that split, so its coefficient is reported as NA.
lmAll <- lm(formula = SalePrice ~ . - Neighborhood, data = AHTest)
Summary of all the variables
##
## Call:
## lm(formula = SalePrice ~ . - Neighborhood, data = AHTest)
##
## Residuals:
## Min 1Q Median 3Q Max
## -355643 -21060 -3010 15632 290625
##
## Coefficients: (1 not defined because of singularities)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.164e+06 1.357e+05 -8.578 < 2e-16 ***
## MS.ZoningRH 2.803e+03 1.161e+04 0.241 0.809343
## MS.ZoningRM -5.705e+03 3.430e+03 -1.664 0.096443 .
## Lot.Area 1.044e+00 1.519e-01 6.875 9.46e-12 ***
## Gr.Liv.Area 7.121e+01 4.090e+00 17.412 < 2e-16 ***
## Overall.Cond 3.294e+03 1.113e+03 2.958 0.003148 **
## Overall.Qual 2.218e+04 1.219e+03 18.196 < 2e-16 ***
## Land.SlopeMod 2.275e+04 4.930e+03 4.614 4.33e-06 ***
## Land.SlopeSev -2.340e+04 1.433e+04 -1.633 0.102751
## Lot.ConfigCorner -2.266e+02 2.698e+03 -0.084 0.933060
## Lot.ConfigCulDSac 9.007e+03 4.262e+03 2.113 0.034772 *
## Lot.ConfigFR2 -5.783e+03 5.758e+03 -1.004 0.315371
## Lot.ConfigFR3 1.309e+03 1.888e+04 0.069 0.944751
## Condition.1Artery -1.444e+04 6.125e+03 -2.358 0.018515 *
## Condition.1Feedr -7.027e+03 4.413e+03 -1.592 0.111524
## Condition.1PosA -8.842e+03 1.204e+04 -0.734 0.462788
## Condition.1PosN 5.267e+03 7.821e+03 0.673 0.500767
## Condition.1RRAe -2.555e+04 1.206e+04 -2.119 0.034244 *
## Condition.1RRAn -1.328e+04 8.325e+03 -1.595 0.110907
## Condition.1RRNe -1.562e+04 1.695e+04 -0.921 0.356964
## Condition.1RRNn -4.522e+03 1.553e+04 -0.291 0.770946
## Bldg.Type2fmCon 2.140e+03 7.293e+03 0.293 0.769281
## Bldg.TypeDuplex -1.742e+04 5.751e+03 -3.029 0.002498 **
## Bldg.TypeTwnhs -2.556e+04 6.938e+03 -3.684 0.000238 ***
## Bldg.TypeTwnhsE -5.850e+03 4.647e+03 -1.259 0.208286
## Year.Built 4.043e+02 6.637e+01 6.092 1.45e-09 ***
## Year.Remod.Add 1.479e+02 7.157e+01 2.067 0.038933 *
## Garage.Cars 9.716e+03 1.887e+03 5.150 2.99e-07 ***
## TotRms.AbvGrd -4.712e+03 1.198e+03 -3.933 8.81e-05 ***
## part NA NA NA NA
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 37530 on 1352 degrees of freedom
## Multiple R-squared: 0.7963, Adjusted R-squared: 0.7921
## F-statistic: 188.7 on 28 and 1352 DF, p-value: < 2.2e-16
Neighborhood has no easily discernible baseline and should not be used in the LM
# Linear model of sale price on every column except Neighborhood, fit on the
# training split. The `part` column is constant within AHTrain, so its
# coefficient is reported as NA.
lmz <- lm(formula = SalePrice ~ . - Neighborhood, data = AHTrain)
Summary of the lm with all variables except the Neighborhood variable.
##
## Call:
## lm(formula = SalePrice ~ . - Neighborhood, data = AHTrain)
##
## Residuals:
## Min 1Q Median 3Q Max
## -350476 -19741 -2899 15339 234734
##
## Coefficients: (1 not defined because of singularities)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.196e+06 1.243e+05 -9.621 < 2e-16 ***
## MS.ZoningRH -8.168e+03 9.217e+03 -0.886 0.37563
## MS.ZoningRM -8.987e+03 3.147e+03 -2.856 0.00436 **
## Lot.Area 4.348e-01 1.680e-01 2.588 0.00976 **
## Gr.Liv.Area 5.275e+01 3.911e+00 13.488 < 2e-16 ***
## Overall.Cond 4.274e+03 1.022e+03 4.181 3.09e-05 ***
## Overall.Qual 2.346e+04 1.122e+03 20.916 < 2e-16 ***
## Land.SlopeMod 1.893e+04 4.880e+03 3.879 0.00011 ***
## Land.SlopeSev 1.603e+04 1.621e+04 0.989 0.32287
## Lot.ConfigCorner -5.355e+03 2.688e+03 -1.992 0.04655 *
## Lot.ConfigCulDSac 3.215e+03 4.167e+03 0.772 0.44047
## Lot.ConfigFR2 -1.142e+04 6.598e+03 -1.731 0.08370 .
## Lot.ConfigFR3 8.599e+02 1.244e+04 0.069 0.94491
## Condition.1Artery -1.247e+04 5.408e+03 -2.305 0.02133 *
## Condition.1Feedr -1.097e+04 4.449e+03 -2.466 0.01378 *
## Condition.1PosA 2.811e+04 1.142e+04 2.461 0.01399 *
## Condition.1PosN 4.547e+03 9.356e+03 0.486 0.62705
## Condition.1RRAe -1.319e+04 8.572e+03 -1.539 0.12411
## Condition.1RRAn -2.263e+04 7.000e+03 -3.233 0.00125 **
## Condition.1RRNe -4.902e+04 3.592e+04 -1.365 0.17261
## Condition.1RRNn -6.172e+04 2.602e+04 -2.372 0.01783 *
## Bldg.Type2fmCon -3.154e+03 6.966e+03 -0.453 0.65077
## Bldg.TypeDuplex -1.499e+04 5.236e+03 -2.862 0.00427 **
## Bldg.TypeTwnhs -2.344e+04 5.895e+03 -3.977 7.36e-05 ***
## Bldg.TypeTwnhsE -1.329e+04 4.317e+03 -3.078 0.00212 **
## Year.Built 3.960e+02 5.746e+01 6.892 8.39e-12 ***
## Year.Remod.Add 1.707e+02 6.476e+01 2.637 0.00847 **
## Garage.Cars 1.553e+04 1.672e+03 9.290 < 2e-16 ***
## TotRms.AbvGrd -2.283e+03 1.147e+03 -1.991 0.04668 *
## part NA NA NA NA
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 35650 on 1351 degrees of freedom
## (1 observation deleted due to missingness)
## Multiple R-squared: 0.7958, Adjusted R-squared: 0.7916
## F-statistic: 188 on 28 and 1351 DF, p-value: < 2.2e-16
First, I explored the Condition.1 variable.
# Sale price explained by Condition.1 alone, fit on the training split.
LMD <- lm(formula = SalePrice ~ Condition.1, data = AHTrain)
Summary of the lm for sale price and condition 1 and a graph.
##
## Call:
## lm(formula = SalePrice ~ Condition.1, data = AHTrain)
##
## Residuals:
## Min 1Q Median 3Q Max
## -169304 -49593 -16110 27907 442907
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 182093 2220 82.012 < 2e-16 ***
## Condition.1Artery -49419 11138 -4.437 9.85e-06 ***
## Condition.1Feedr -42983 9097 -4.725 2.54e-06 ***
## Condition.1PosA 91102 24262 3.755 0.000181 ***
## Condition.1PosN 54136 19851 2.727 0.006470 **
## Condition.1RRAe -41126 18144 -2.267 0.023566 *
## Condition.1RRAn -9006 14870 -0.606 0.544827
## Condition.1RRNe 12407 76432 0.162 0.871069
## Condition.1RRNn -41093 54068 -0.760 0.447376
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 76400 on 1372 degrees of freedom
## Multiple R-squared: 0.04796, Adjusted R-squared: 0.04241
## F-statistic: 8.64 on 8 and 1372 DF, p-value: 1.488e-11
Only proximity to arterial or feeder streets, proximity to positive off-site features, and adjacency to the east-west railroad are statistically significant. The average cost of a home with no conditions, the baseline which the other levels are measured against, is 182,093 dollars (p-value < 2e-16). A home on an arterial street is on average 49,419 dollars less than the baseline (p-value = 9.85e-06). A home on a feeder street is on average 42,983 dollars less than the baseline (p-value = 2.54e-06). A home adjacent to a positive off-site feature is on average 91,102 dollars more expensive than the baseline (p-value = 0.000181). A home that is near a positive off-site feature is on average 54,136 dollars more than the baseline (p-value = 0.006470). A home that is adjacent to an east-west railroad (RRAe) is on average 41,126 dollars less than the baseline (p-value = 0.023566).
Next I looked at the effect that the size of the lot had on the price.
# Sale price explained by lot area alone, fit on the training split.
lmArea <- lm(formula = SalePrice ~ Lot.Area, data = AHTrain)
The summary of the lm and the graph
##
## Call:
## lm(formula = SalePrice ~ Lot.Area, data = AHTrain)
##
## Residuals:
## Min 1Q Median 3Q Max
## -355034 -46776 -20439 28069 430030
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.516e+05 3.382e+03 44.816 <2e-16 ***
## Lot.Area 2.626e+00 2.638e-01 9.954 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 75440 on 1379 degrees of freedom
## Multiple R-squared: 0.06703, Adjusted R-squared: 0.06636
## F-statistic: 99.08 on 1 and 1379 DF, p-value: < 2.2e-16
Each additional square foot of lot area adds 2.626 dollars on average, but I suspect that this is skewed. The adjusted R-squared is very low (0.066) even though the p-value is also low (p-value < 2e-16).
Next, I looked at the impact that the number of rooms in a home has on sale price.
# Sale price explained by the number of rooms above ground alone, fit on the
# training split.
lmTotRoom <- lm(formula = SalePrice ~ TotRms.AbvGrd, data = AHTrain)
The summary of the lm and the graph
##
## Call:
## lm(formula = SalePrice ~ TotRms.AbvGrd, data = AHTrain)
##
## Residuals:
## Min 1Q Median 3Q Max
## -228354 -38713 -12766 25793 424264
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 30972 7756 3.993 6.87e-05 ***
## TotRms.AbvGrd 22824 1165 19.590 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 69080 on 1379 degrees of freedom
## Multiple R-squared: 0.2177, Adjusted R-squared: 0.2171
## F-statistic: 383.7 on 1 and 1379 DF, p-value: < 2.2e-16
Each additional room adds 22,824 dollars to home value on average, with an extremely significant p-value (p-value < 2e-16).
Then I looked at the lot configuration and the impact it has on sale price.
# Sale price explained by lot configuration alone, fit on the training split.
lmLotConfig <- lm(formula = SalePrice ~ Lot.Config, data = AHTrain)
The summary of the lm and the graph
##
## Call:
## lm(formula = SalePrice ~ Lot.Config, data = AHTrain)
##
## Residuals:
## Min 1Q Median 3Q Max
## -163341 -49130 -20664 27870 438870
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 176129.8 2425.3 72.621 < 2e-16 ***
## Lot.ConfigCorner 1534.6 5665.4 0.271 0.787
## Lot.ConfigCulDSac 37262.0 8717.3 4.274 2.05e-05 ***
## Lot.ConfigFR2 -9879.3 14155.5 -0.698 0.485
## Lot.ConfigFR3 914.7 25996.4 0.035 0.972
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 77650 on 1376 degrees of freedom
## Multiple R-squared: 0.01371, Adjusted R-squared: 0.01084
## F-statistic: 4.78 on 4 and 1376 DF, p-value: 0.0007847
The baseline that the other levels are compared against is the most common type, an inside lot, in which there is a home on either side of the home. The only significant difference is for homes on a cul-de-sac, which were on average 37,262 dollars more expensive (p-value = 2.05e-05).
Finally, I looked at the impact that the number of cars a garage can hold has on sale price.
# Sale price explained by garage capacity (in cars) alone, fit on the
# training split.
lmCar <- lm(formula = SalePrice ~ Garage.Cars, data = AHTrain)
The summary of the lm and the graph
##
## Call:
## lm(formula = SalePrice ~ Garage.Cars, data = AHTrain)
##
## Residuals:
## Min 1Q Median 3Q Max
## -261647 -37247 -5600 25753 366120
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 64980 4074 15.95 <2e-16 ***
## Garage.Cars 64633 2126 30.40 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 60450 on 1378 degrees of freedom
## (1 observation deleted due to missingness)
## Multiple R-squared: 0.4014, Adjusted R-squared: 0.401
## F-statistic: 924.1 on 1 and 1378 DF, p-value: < 2.2e-16
## Warning: Ignoring unknown parameters: intersept
## Warning: Removed 1 rows containing missing values (geom_point).
The average price of a home that either did not have a garage or had one that could not fit any cars is 64,980 dollars (p-value < 2e-16). Every additional car the garage could fit added on average 64,633 dollars (p-value < 2e-16).