Setup

First I load in the packages and the data

library(tidyverse)
## ── Attaching packages ─────────────────────────────── tidyverse 1.3.0 ──
## ✓ ggplot2 3.3.2     ✓ purrr   0.3.4
## ✓ tibble  3.0.3     ✓ dplyr   1.0.2
## ✓ tidyr   1.1.2     ✓ stringr 1.4.0
## ✓ readr   1.3.1     ✓ forcats 0.5.0
## ── Conflicts ────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(leaps)
library(scales)
## 
## Attaching package: 'scales'
## The following object is masked from 'package:purrr':
## 
##     discard
## The following object is masked from 'package:readr':
## 
##     col_factor
library(readr)
# Load the Ames housing data. The CSV location defaults to the original
# path; set the AMES_HOUSING_CSV environment variable to point at a copy
# stored elsewhere so the script can run on other machines.
ames_csv_path <- Sys.getenv(
  "AMES_HOUSING_CSV",
  unset = "/Users/timogunsalus/Downloads/AmesHousing.csv"
)
# Fail early with a readable message instead of a low-level connection error.
if (!file.exists(ames_csv_path)) {
  stop("AmesHousing.csv not found at: ", ames_csv_path, call. = FALSE)
}
AmesHousing <- read.csv(ames_csv_path, header = TRUE)

Data wrangling

Select only the useful variables that don't give errors with my methods.

# Keep the response (SalePrice) and the predictors used in this analysis.
AH <- AmesHousing[, c(
  "SalePrice", "MS.Zoning", "Lot.Area", "Neighborhood", "Gr.Liv.Area",
  "Overall.Cond", "Overall.Qual", "Land.Slope", "Lot.Config", "Condition.1",
  "Bldg.Type", "Year.Built", "Year.Remod.Add", "Garage.Cars", "TotRms.AbvGrd"
)]

# Keep only rows whose zoning class is one of RH, RM or RL (residential).
AH1 <- AH %>%
  filter(MS.Zoning %in% c("RH", "RM", "RL"))

Make the data usable

# Change characters into factors and set the baseline level of each factor.
AH2 <- local({
  converted <- AH1

  # Neighborhood is only converted to a factor; its level order is untouched.
  converted$Neighborhood <- as.factor(converted$Neighborhood)

  # Factor control: for each of these columns, build an unordered factor and
  # move the named level to the front so lm() uses it as the reference level.
  reference_levels <- c(
    Condition.1 = "Norm",
    MS.Zoning = "RL",
    Land.Slope = "Gtl",
    Lot.Config = "Inside",
    Bldg.Type = "1Fam"
  )
  for (column in names(reference_levels)) {
    unordered <- factor(as.factor(converted[[column]]), ordered = FALSE)
    converted[[column]] <- relevel(unordered, ref = reference_levels[[column]])
  }

  converted
})

The structure of my data set:

## 'data.frame':    2762 obs. of  15 variables:
##  $ SalePrice     : int  215000 105000 172000 244000 189900 195500 213500 191500 236500 189000 ...
##  $ MS.Zoning     : Factor w/ 3 levels "RL","RH","RM": 1 2 1 1 1 1 1 1 1 1 ...
##  $ Lot.Area      : int  31770 11622 14267 11160 13830 9978 4920 5005 5389 7500 ...
##  $ Neighborhood  : Factor w/ 28 levels "Blmngtn","Blueste",..: 16 16 16 16 9 9 25 25 25 9 ...
##  $ Gr.Liv.Area   : int  1656 896 1329 2110 1629 1604 1338 1280 1616 1804 ...
##  $ Overall.Cond  : int  5 6 6 5 5 6 5 5 5 5 ...
##  $ Overall.Qual  : int  6 5 6 7 5 6 8 8 8 7 ...
##  $ Land.Slope    : Factor w/ 3 levels "Gtl","Mod","Sev": 1 1 1 1 1 1 1 1 1 1 ...
##  $ Lot.Config    : Factor w/ 5 levels "Inside","Corner",..: 2 1 2 2 1 1 1 1 1 1 ...
##  $ Condition.1   : Factor w/ 9 levels "Norm","Artery",..: 1 3 1 1 1 1 1 1 1 1 ...
##  $ Bldg.Type     : Factor w/ 5 levels "1Fam","2fmCon",..: 1 1 1 1 1 1 5 5 5 1 ...
##  $ Year.Built    : int  1960 1961 1958 1968 1997 1998 2001 1992 1995 1999 ...
##  $ Year.Remod.Add: int  1960 1961 1958 1968 1998 1998 2001 1992 1996 1999 ...
##  $ Garage.Cars   : int  2 1 1 2 2 2 2 2 2 2 ...
##  $ TotRms.AbvGrd : int  7 5 6 8 6 7 6 5 5 7 ...

Next I created a training and testing dataset.

# Create the partitioning variable: 0 marks a training row, 1 a testing row.
# Sizes come from the data instead of hard-coded counts, so the split stays a
# random half of the rows even if the upstream filtering changes.
n_obs <- nrow(AH2)
AH2$part <- rep(0, n_obs)
set.seed(1)
test <- sample(seq_len(n_obs), n_obs %/% 2, replace = FALSE)
AH2$part[test] <- 1

# Training df.
# `part` is dropped from the split: it is constant within each split, so a
# `SalePrice ~ .` model would otherwise get an undefined (NA) coefficient.
AHTrain <- AH2 %>%
  filter(part == 0) %>%
  dplyr::select(-part)

# Testing df (same treatment as the training df).
AHTest <- AH2 %>%
  filter(part == 1) %>%
  dplyr::select(-part)

Exploratory analysis

First I looked at all the variables to view their significance, and then looked for the best linear model to fit to the data.

# Regress sale price on every other column except Neighborhood.
# NOTE(review): this fit uses the testing split (AHTest), while the otherwise
# identical `lmz` fit uses AHTrain -- confirm that exploring on the test data
# is intended.
lmAll <- lm(formula = SalePrice ~ . - Neighborhood, data = AHTest)

Summary of all the variables

## 
## Call:
## lm(formula = SalePrice ~ . - Neighborhood, data = AHTest)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -355643  -21060   -3010   15632  290625 
## 
## Coefficients: (1 not defined because of singularities)
##                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       -1.164e+06  1.357e+05  -8.578  < 2e-16 ***
## MS.ZoningRH        2.803e+03  1.161e+04   0.241 0.809343    
## MS.ZoningRM       -5.705e+03  3.430e+03  -1.664 0.096443 .  
## Lot.Area           1.044e+00  1.519e-01   6.875 9.46e-12 ***
## Gr.Liv.Area        7.121e+01  4.090e+00  17.412  < 2e-16 ***
## Overall.Cond       3.294e+03  1.113e+03   2.958 0.003148 ** 
## Overall.Qual       2.218e+04  1.219e+03  18.196  < 2e-16 ***
## Land.SlopeMod      2.275e+04  4.930e+03   4.614 4.33e-06 ***
## Land.SlopeSev     -2.340e+04  1.433e+04  -1.633 0.102751    
## Lot.ConfigCorner  -2.266e+02  2.698e+03  -0.084 0.933060    
## Lot.ConfigCulDSac  9.007e+03  4.262e+03   2.113 0.034772 *  
## Lot.ConfigFR2     -5.783e+03  5.758e+03  -1.004 0.315371    
## Lot.ConfigFR3      1.309e+03  1.888e+04   0.069 0.944751    
## Condition.1Artery -1.444e+04  6.125e+03  -2.358 0.018515 *  
## Condition.1Feedr  -7.027e+03  4.413e+03  -1.592 0.111524    
## Condition.1PosA   -8.842e+03  1.204e+04  -0.734 0.462788    
## Condition.1PosN    5.267e+03  7.821e+03   0.673 0.500767    
## Condition.1RRAe   -2.555e+04  1.206e+04  -2.119 0.034244 *  
## Condition.1RRAn   -1.328e+04  8.325e+03  -1.595 0.110907    
## Condition.1RRNe   -1.562e+04  1.695e+04  -0.921 0.356964    
## Condition.1RRNn   -4.522e+03  1.553e+04  -0.291 0.770946    
## Bldg.Type2fmCon    2.140e+03  7.293e+03   0.293 0.769281    
## Bldg.TypeDuplex   -1.742e+04  5.751e+03  -3.029 0.002498 ** 
## Bldg.TypeTwnhs    -2.556e+04  6.938e+03  -3.684 0.000238 ***
## Bldg.TypeTwnhsE   -5.850e+03  4.647e+03  -1.259 0.208286    
## Year.Built         4.043e+02  6.637e+01   6.092 1.45e-09 ***
## Year.Remod.Add     1.479e+02  7.157e+01   2.067 0.038933 *  
## Garage.Cars        9.716e+03  1.887e+03   5.150 2.99e-07 ***
## TotRms.AbvGrd     -4.712e+03  1.198e+03  -3.933 8.81e-05 ***
## part                      NA         NA      NA       NA    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 37530 on 1352 degrees of freedom
## Multiple R-squared:  0.7963, Adjusted R-squared:  0.7921 
## F-statistic: 188.7 on 28 and 1352 DF,  p-value: < 2.2e-16

Neighborhood has no easily discernible baseline and should not be used in the LM

# Regress sale price on every other column except Neighborhood, using the
# training split.
lmz <- lm(formula = SalePrice ~ . - Neighborhood, data = AHTrain)

Summary of the lm with all variables except the Neighborhood variable.

## 
## Call:
## lm(formula = SalePrice ~ . - Neighborhood, data = AHTrain)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -350476  -19741   -2899   15339  234734 
## 
## Coefficients: (1 not defined because of singularities)
##                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       -1.196e+06  1.243e+05  -9.621  < 2e-16 ***
## MS.ZoningRH       -8.168e+03  9.217e+03  -0.886  0.37563    
## MS.ZoningRM       -8.987e+03  3.147e+03  -2.856  0.00436 ** 
## Lot.Area           4.348e-01  1.680e-01   2.588  0.00976 ** 
## Gr.Liv.Area        5.275e+01  3.911e+00  13.488  < 2e-16 ***
## Overall.Cond       4.274e+03  1.022e+03   4.181 3.09e-05 ***
## Overall.Qual       2.346e+04  1.122e+03  20.916  < 2e-16 ***
## Land.SlopeMod      1.893e+04  4.880e+03   3.879  0.00011 ***
## Land.SlopeSev      1.603e+04  1.621e+04   0.989  0.32287    
## Lot.ConfigCorner  -5.355e+03  2.688e+03  -1.992  0.04655 *  
## Lot.ConfigCulDSac  3.215e+03  4.167e+03   0.772  0.44047    
## Lot.ConfigFR2     -1.142e+04  6.598e+03  -1.731  0.08370 .  
## Lot.ConfigFR3      8.599e+02  1.244e+04   0.069  0.94491    
## Condition.1Artery -1.247e+04  5.408e+03  -2.305  0.02133 *  
## Condition.1Feedr  -1.097e+04  4.449e+03  -2.466  0.01378 *  
## Condition.1PosA    2.811e+04  1.142e+04   2.461  0.01399 *  
## Condition.1PosN    4.547e+03  9.356e+03   0.486  0.62705    
## Condition.1RRAe   -1.319e+04  8.572e+03  -1.539  0.12411    
## Condition.1RRAn   -2.263e+04  7.000e+03  -3.233  0.00125 ** 
## Condition.1RRNe   -4.902e+04  3.592e+04  -1.365  0.17261    
## Condition.1RRNn   -6.172e+04  2.602e+04  -2.372  0.01783 *  
## Bldg.Type2fmCon   -3.154e+03  6.966e+03  -0.453  0.65077    
## Bldg.TypeDuplex   -1.499e+04  5.236e+03  -2.862  0.00427 ** 
## Bldg.TypeTwnhs    -2.344e+04  5.895e+03  -3.977 7.36e-05 ***
## Bldg.TypeTwnhsE   -1.329e+04  4.317e+03  -3.078  0.00212 ** 
## Year.Built         3.960e+02  5.746e+01   6.892 8.39e-12 ***
## Year.Remod.Add     1.707e+02  6.476e+01   2.637  0.00847 ** 
## Garage.Cars        1.553e+04  1.672e+03   9.290  < 2e-16 ***
## TotRms.AbvGrd     -2.283e+03  1.147e+03  -1.991  0.04668 *  
## part                      NA         NA      NA       NA    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 35650 on 1351 degrees of freedom
##   (1 observation deleted due to missingness)
## Multiple R-squared:  0.7958, Adjusted R-squared:  0.7916 
## F-statistic:   188 on 28 and 1351 DF,  p-value: < 2.2e-16

Data analysis

Condition 1

First, I explored the Condition 1 variable.

# Single-predictor model: sale price by Condition.1 (baseline level "Norm"),
# fitted on the training split.
LMD <- lm(formula = SalePrice ~ Condition.1, data = AHTrain)

Summary of the lm for sale price and condition 1 and a graph.

## 
## Call:
## lm(formula = SalePrice ~ Condition.1, data = AHTrain)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -169304  -49593  -16110   27907  442907 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)         182093       2220  82.012  < 2e-16 ***
## Condition.1Artery   -49419      11138  -4.437 9.85e-06 ***
## Condition.1Feedr    -42983       9097  -4.725 2.54e-06 ***
## Condition.1PosA      91102      24262   3.755 0.000181 ***
## Condition.1PosN      54136      19851   2.727 0.006470 ** 
## Condition.1RRAe     -41126      18144  -2.267 0.023566 *  
## Condition.1RRAn      -9006      14870  -0.606 0.544827    
## Condition.1RRNe      12407      76432   0.162 0.871069    
## Condition.1RRNn     -41093      54068  -0.760 0.447376    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 76400 on 1372 degrees of freedom
## Multiple R-squared:  0.04796,    Adjusted R-squared:  0.04241 
## F-statistic:  8.64 on 8 and 1372 DF,  p-value: 1.488e-11

Only the proximity to close roads (arterial and feeder streets), proximity to positive off-site features, and adjacency to the east-west railroad are statistically significant. The average cost of a home with no conditions, the baseline which the other levels are measured against, is 182,093 dollars (p-value < 2e-16). A home on an arterial street is on average 49,419 dollars less than the baseline (p-value = 9.85e-06). A home on a feeder street is on average 42,983 dollars less than the baseline (p-value = 2.54e-06). A home adjacent to a positive off-site feature is on average 91,102 dollars more expensive than the baseline (p-value = 0.000181). A home that is near a positive off-site feature is on average 54,136 dollars more than the baseline (p-value = 0.006470). A home that is adjacent to an east-west railroad (RRAe) is on average 41,126 dollars less than the baseline (p-value = 0.023566).

Lot Area

Next I looked at the effect that the size of the lot had on the price.

# Single-predictor model: sale price by lot area, fitted on the training
# split.
lmArea <- lm(formula = SalePrice ~ Lot.Area, data = AHTrain)

The summary of the lm and the graph

## 
## Call:
## lm(formula = SalePrice ~ Lot.Area, data = AHTrain)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -355034  -46776  -20439   28069  430030 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 1.516e+05  3.382e+03  44.816   <2e-16 ***
## Lot.Area    2.626e+00  2.638e-01   9.954   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 75440 on 1379 degrees of freedom
## Multiple R-squared:  0.06703,    Adjusted R-squared:  0.06636 
## F-statistic: 99.08 on 1 and 1379 DF,  p-value: < 2.2e-16

Each additional square foot adds 2.626 dollars, but I suspect that this is skewed. The adjusted R-squared is very low even though the p-value is also low (p-value < 2e-16).

Total Rooms Above Ground

Next I looked at the impact that the number of rooms in a home has on sale price.

# Single-predictor model: sale price by total rooms above ground, fitted on
# the training split.
lmTotRoom <- lm(formula = SalePrice ~ TotRms.AbvGrd, data = AHTrain)

The summary of the lm and the graph

## 
## Call:
## lm(formula = SalePrice ~ TotRms.AbvGrd, data = AHTrain)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -228354  -38713  -12766   25793  424264 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      30972       7756   3.993 6.87e-05 ***
## TotRms.AbvGrd    22824       1165  19.590  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 69080 on 1379 degrees of freedom
## Multiple R-squared:  0.2177, Adjusted R-squared:  0.2171 
## F-statistic: 383.7 on 1 and 1379 DF,  p-value: < 2.2e-16

Each additional room adds 22,824 dollars to home value, with an extremely significant p-value (p-value < 2e-16).

Lot Config

Then I looked at the lot configuration and the impact it has on sale price.

# Single-predictor model: sale price by lot configuration (baseline level
# "Inside"), fitted on the training split.
lmLotConfig <- lm(formula = SalePrice ~ Lot.Config, data = AHTrain)

The summary of the lm and the graph

## 
## Call:
## lm(formula = SalePrice ~ Lot.Config, data = AHTrain)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -163341  -49130  -20664   27870  438870 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       176129.8     2425.3  72.621  < 2e-16 ***
## Lot.ConfigCorner    1534.6     5665.4   0.271    0.787    
## Lot.ConfigCulDSac  37262.0     8717.3   4.274 2.05e-05 ***
## Lot.ConfigFR2      -9879.3    14155.5  -0.698    0.485    
## Lot.ConfigFR3        914.7    25996.4   0.035    0.972    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 77650 on 1376 degrees of freedom
## Multiple R-squared:  0.01371,    Adjusted R-squared:  0.01084 
## F-statistic:  4.78 on 4 and 1376 DF,  p-value: 0.0007847

The baseline that the other levels are compared against is the most common type, an inside lot, in which there is a home on either side of the home. The only significant difference is for homes on a cul-de-sac, which were on average 37,262 dollars more expensive (p-value = 2.05e-05).

Number of cars in garage

Finally, I looked at the impact that the number of cars a garage can hold has on sale price.

# Single-predictor model: sale price by garage capacity in cars, fitted on
# the training split.
lmCar <- lm(formula = SalePrice ~ Garage.Cars, data = AHTrain)

The summary of the lm and the graph

## 
## Call:
## lm(formula = SalePrice ~ Garage.Cars, data = AHTrain)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -261647  -37247   -5600   25753  366120 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    64980       4074   15.95   <2e-16 ***
## Garage.Cars    64633       2126   30.40   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 60450 on 1378 degrees of freedom
##   (1 observation deleted due to missingness)
## Multiple R-squared:  0.4014, Adjusted R-squared:  0.401 
## F-statistic: 924.1 on 1 and 1378 DF,  p-value: < 2.2e-16
## Warning: Ignoring unknown parameters: intersept
## Warning: Removed 1 rows containing missing values (geom_point).

The average price of a home that either did not have a garage or had one that could not fit any cars is 64,980 dollars (p-value < 2e-16). Every additional car the garage could fit added on average 64,633 dollars (p-value < 2e-16).