library( ggplot2 )

# Location of the data set (CSV file with a header row).
url <- "http://latul.be/mbaa_531/data/nanaimo.csv"

nanaimo <- read.csv(url, header = TRUE)

# Keep only the rows with a strictly positive area.
# The is.na() guard matters: a logical index containing NA does not drop the
# row, it inserts a row made entirely of NA values into the result.
index <- !is.na(nanaimo$area) & nanaimo$area > 0

nanaimo <- nanaimo[index, ]

Derived variable: areaPerBed = area per bedroom (area divided by the number of bedrooms)

# Derived variable: area per bedroom.
nanaimo$areaPerBed <- nanaimo$area / nanaimo$bed
# A bedroom count of 0 makes the ratio Inf (or NaN for 0 / 0). Store those as
# NA right away so every later plot and model treats them as missing.
nanaimo$areaPerBed[!is.finite(nanaimo$areaPerBed)] <- NA

AREA

We can see that houses with a larger area generally seem to have a higher price.

# Scatter plot of price against area; `base` is kept so that further layers
# can be added to the same axes later.
base <- ggplot(nanaimo, mapping = aes(x = area, y = price))

base +
  geom_point()

# Simple linear regression: price explained by area.
mhousing <- lm(price ~ area, data = nanaimo)
summary(mhousing)
## 
## Call:
## lm(formula = price ~ area, data = nanaimo)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -359736 -108143  -23071   78713  578644 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 106124.05   19109.34   5.554 5.35e-08 ***
## area           138.95       9.17  15.153  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 165300 on 371 degrees of freedom
##   (104 observations deleted due to missingness)
## Multiple R-squared:  0.3823, Adjusted R-squared:  0.3806 
## F-statistic: 229.6 on 1 and 371 DF,  p-value: < 2.2e-16

The prediction formula for price as a function of area is: $y_i = 106{,}124.1 + 138.9489 \cdot x_i + e_i$

# Fitted line first, observations drawn on top of it.
base +
  geom_smooth(
    formula = y ~ x, # price (y) explained by area (x)
    method = "lm",   # straight-line fit from a linear model
    se = FALSE       # no standard error band
  ) +
  geom_point()

TAXES

We can see that houses with higher taxes generally seem to have a higher price.

# Scatter plot of price against taxes; `base` is kept so that further layers
# can be added to the same axes later.
base <- ggplot(nanaimo, mapping = aes(x = taxes, y = price))

base +
  geom_point()

# Simple linear regression: price explained by taxes.
mhousing <- lm(price ~ taxes, data = nanaimo)
summary(mhousing)
## 
## Call:
## lm(formula = price ~ taxes, data = nanaimo)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -408214 -112316  -28914   67032  709869 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 95838.413  20404.245   4.697 3.83e-06 ***
## taxes         103.164      7.018  14.701  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 165400 on 343 degrees of freedom
##   (132 observations deleted due to missingness)
## Multiple R-squared:  0.3865, Adjusted R-squared:  0.3847 
## F-statistic: 216.1 on 1 and 343 DF,  p-value: < 2.2e-16

The prediction formula for price as a function of taxes is: $y_i = 95838.413 + 103.164 \cdot x_i + e_i$

# Fitted line first, observations drawn on top of it.
base +
  geom_smooth(
    formula = y ~ x, # y explained by x, using the axes mapped in `base`
    method = "lm",   # straight-line fit from a linear model
    se = FALSE       # no standard error band
  ) +
  geom_point()

AREA PER BED

We can see that houses with a larger area per bedroom generally seem to have a higher price.

# Replace non-finite ratios (Inf / NaN, e.g. from a bedroom count of 0) with NA
# *before* building the plot: ggplot() keeps the data it is given, so cleaning
# afterwards would leave the non-finite values inside `base`.
nanaimo$areaPerBed[!is.finite(nanaimo$areaPerBed)] <- NA

# Scatter plot of price against area per bedroom.
base <- ggplot(data = nanaimo, aes(x = areaPerBed, y = price))

base + geom_point()

# NOTE(review): this model regresses areaPerBed on area, whereas the plots in
# this section show price against areaPerBed. The formula is left unchanged so
# that the recorded summary output that follows stays valid; confirm whether
# price ~ areaPerBed was intended.
mhousing <- lm(
  formula = areaPerBed ~ area,
  data = nanaimo,
  na.action = na.exclude
)
summary(mhousing)
## 
## Call:
## lm(formula = areaPerBed ~ area, data = nanaimo, na.action = na.exclude)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -352.46 -137.49  -28.42   67.81 1357.28 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 328.22459   22.27859   14.73   <2e-16 ***
## area          0.17215    0.01012   17.01   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 201.3 on 383 degrees of freedom
##   (92 observations deleted due to missingness)
## Multiple R-squared:  0.4304, Adjusted R-squared:  0.4289 
## F-statistic: 289.4 on 1 and 383 DF,  p-value: < 2.2e-16

The prediction formula from the model fitted above (area per bedroom explained by area) is: $y_i = 328.22459 + 0.17215 \cdot x_i + e_i$

# Fitted line first, observations drawn on top of it.
base +
  geom_smooth(
    formula = y ~ x, # y explained by x, using the axes mapped in `base`
    method = "lm",   # straight-line fit from a linear model
    se = FALSE       # no standard error band
  ) +
  geom_point()