library( ggplot2 )
# Download the Nanaimo housing data set (CSV with a header row).
url <- "http://latul.be/mbaa_531/data/nanaimo.csv"
nanaimo <- read.csv(url, header = TRUE)

# Keep only rows with a strictly positive area. A logical index that contains
# NA makes `[` return an all-NA row for every NA, so missing areas are mapped
# to FALSE explicitly and those rows are dropped instead of kept as blanks.
index <- !is.na(nanaimo$area) & nanaimo$area > 0
nanaimo <- nanaimo[index, ]
Derived variable: areaPerBed = area per bedroom (area divided by the number of bedrooms).
# Area per bedroom: element-wise ratio of the area and bed columns
nanaimo$areaPerBed <- nanaimo$area / nanaimo$bed
We can see that houses with a larger area generally seem to have a higher price.
# Scatter plot of price (y) against area (x)
base <- ggplot(nanaimo, mapping = aes(x = area, y = price))
base + geom_point()

# Simple linear regression of price on area, followed by its summary table
mhousing <- lm(formula = price ~ area, data = nanaimo)
summary(mhousing)
##
## Call:
## lm(formula = price ~ area, data = nanaimo)
##
## Residuals:
## Min 1Q Median 3Q Max
## -359736 -108143 -23071 78713 578644
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 106124.05 19109.34 5.554 5.35e-08 ***
## area 138.95 9.17 15.153 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 165300 on 371 degrees of freedom
## (104 observations deleted due to missingness)
## Multiple R-squared: 0.3823, Adjusted R-squared: 0.3806
## F-statistic: 229.6 on 1 and 371 DF, p-value: < 2.2e-16
The prediction formula relating price to area is: $y_i = 106{,}124.1 + 138.9489\,x_i + e_i$, where $y_i$ is the price and $x_i$ is the area of house $i$.
# Overlay a fitted straight line on the price-versus-area scatter plot.
# The line layer is added first so the observations are drawn on top of it.
base +
  geom_smooth(
    method = "lm",   # linear model fit
    formula = y ~ x, # price (y) explained by area (x)
    se = FALSE       # no standard-error band
  ) +
  geom_point()
We can see that houses with higher taxes generally seem to have a higher price.
# Scatter plot of price (y) against taxes (x)
base <- ggplot(nanaimo, mapping = aes(x = taxes, y = price))
base + geom_point()

# Simple linear regression of price on taxes, followed by its summary table
mhousing <- lm(formula = price ~ taxes, data = nanaimo)
summary(mhousing)
##
## Call:
## lm(formula = price ~ taxes, data = nanaimo)
##
## Residuals:
## Min 1Q Median 3Q Max
## -408214 -112316 -28914 67032 709869
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 95838.413 20404.245 4.697 3.83e-06 ***
## taxes 103.164 7.018 14.701 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 165400 on 343 degrees of freedom
## (132 observations deleted due to missingness)
## Multiple R-squared: 0.3865, Adjusted R-squared: 0.3847
## F-statistic: 216.1 on 1 and 343 DF, p-value: < 2.2e-16
The prediction formula relating price to taxes is: $y_i = 95{,}838.413 + 103.164\,x_i + e_i$, where $y_i$ is the price and $x_i$ is the taxes of house $i$.
# Overlay a fitted straight line on the price-versus-taxes scatter plot.
# The line layer is added first so the observations are drawn on top of it.
base +
  geom_smooth(
    method = "lm",   # linear model fit
    formula = y ~ x, # y and x are the plot aesthetics: price and taxes
    se = FALSE       # no standard-error band
  ) +
  geom_point()
We can see that houses with a larger area per bedroom generally seem to have a higher price.
# Replace non-finite ratios with NA before plotting and modelling, so the plot
# object and the model both see the cleaned column. is.finite() is FALSE for
# NaN, Inf and -Inf (and NA), so a single assignment covers every case.
nanaimo$areaPerBed[!is.finite(nanaimo$areaPerBed)] <- NA

# Scatter plot of price (y) against area per bedroom (x)
base <- ggplot(data = nanaimo, aes(x = areaPerBed, y = price))
base + geom_point()

# NOTE(review): this model explains areaPerBed by area, whereas the plots in
# this section show price against areaPerBed. If the intent is to model price,
# the formula should presumably be price ~ areaPerBed -- confirm before
# changing it, since the printed summary below belongs to the formula as
# written here.
mhousing <- lm(
  formula = areaPerBed ~ area,
  data = nanaimo,
  na.action = na.exclude # rows with NA are left out of the fit
)
summary(mhousing)
##
## Call:
## lm(formula = areaPerBed ~ area, data = nanaimo, na.action = na.exclude)
##
## Residuals:
## Min 1Q Median 3Q Max
## -352.46 -137.49 -28.42 67.81 1357.28
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 328.22459 22.27859 14.73 <2e-16 ***
## area 0.17215 0.01012 17.01 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 201.3 on 383 degrees of freedom
## (92 observations deleted due to missingness)
## Multiple R-squared: 0.4304, Adjusted R-squared: 0.4289
## F-statistic: 289.4 on 1 and 383 DF, p-value: < 2.2e-16
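The model fitted above regresses area per bedroom on area (not price on area per bedroom). Its prediction formula is: $y_i = 328.22459 + 0.17215\,x_i + e_i$, where $y_i$ is the area per bedroom and $x_i$ is the area of house $i$.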
# Overlay a fitted straight line on the price-versus-areaPerBed scatter plot.
# geom_smooth() fits this line itself from the plot's x and y aesthetics.
# The line layer is added first so the observations are drawn on top of it.
base +
  geom_smooth(
    method = "lm",   # linear model fit
    formula = y ~ x, # y and x are the plot aesthetics: price and areaPerBed
    se = FALSE       # no standard-error band
  ) +
  geom_point()