Week 4 Project of the course Statistics with R Capstone under the course track Statistics with R

Submitted by Olusola Afuwape

January 2nd, 2020

Load the data and necessary packages:

# Load the data and required packages

load("ames_train.Rdata")

library(BAS)
library(dplyr)
library(ggplot2)
library(statsr)
library(ggridges)
library(gridExtra)
library(grid)

0.1 Exploratory data analysis

# Check the data dimension, description and variables

dim(ames_train)
[1] 1000   81
names(ames_train)
 [1] "PID"             "area"            "price"          
 [4] "MS.SubClass"     "MS.Zoning"       "Lot.Frontage"   
 [7] "Lot.Area"        "Street"          "Alley"          
[10] "Lot.Shape"       "Land.Contour"    "Utilities"      
[13] "Lot.Config"      "Land.Slope"      "Neighborhood"   
[16] "Condition.1"     "Condition.2"     "Bldg.Type"      
[19] "House.Style"     "Overall.Qual"    "Overall.Cond"   
[22] "Year.Built"      "Year.Remod.Add"  "Roof.Style"     
[25] "Roof.Matl"       "Exterior.1st"    "Exterior.2nd"   
[28] "Mas.Vnr.Type"    "Mas.Vnr.Area"    "Exter.Qual"     
[31] "Exter.Cond"      "Foundation"      "Bsmt.Qual"      
[34] "Bsmt.Cond"       "Bsmt.Exposure"   "BsmtFin.Type.1" 
[37] "BsmtFin.SF.1"    "BsmtFin.Type.2"  "BsmtFin.SF.2"   
[40] "Bsmt.Unf.SF"     "Total.Bsmt.SF"   "Heating"        
[43] "Heating.QC"      "Central.Air"     "Electrical"     
[46] "X1st.Flr.SF"     "X2nd.Flr.SF"     "Low.Qual.Fin.SF"
[49] "Bsmt.Full.Bath"  "Bsmt.Half.Bath"  "Full.Bath"      
[52] "Half.Bath"       "Bedroom.AbvGr"   "Kitchen.AbvGr"  
[55] "Kitchen.Qual"    "TotRms.AbvGrd"   "Functional"     
[58] "Fireplaces"      "Fireplace.Qu"    "Garage.Type"    
[61] "Garage.Yr.Blt"   "Garage.Finish"   "Garage.Cars"    
[64] "Garage.Area"     "Garage.Qual"     "Garage.Cond"    
[67] "Paved.Drive"     "Wood.Deck.SF"    "Open.Porch.SF"  
[70] "Enclosed.Porch"  "X3Ssn.Porch"     "Screen.Porch"   
[73] "Pool.Area"       "Pool.QC"         "Fence"          
[76] "Misc.Feature"    "Misc.Val"        "Mo.Sold"        
[79] "Yr.Sold"         "Sale.Type"       "Sale.Condition" 
#str(ames_train)
#summary(ames_train)

1

1.1 Question 1

Make a labeled histogram (with 30 bins) of the ages of the houses in the data set, and describe the distribution.

# Create a new column age_built for the age of buildings

ames_train <- ames_train %>% mutate(age_built = sapply(ames_train$Year.Built, function(x) 2020 - x))

# Extract the columns for year of buildings and age of buildings

data_1 <- ames_train %>% select(Year.Built, age_built)

str(data_1)
Classes 'tbl_df', 'tbl' and 'data.frame':   1000 obs. of  2 variables:
 $ Year.Built: int  1939 1984 1930 1900 2001 2003 1953 2007 1984 2005 ...
 $ age_built : num  81 36 90 120 19 17 67 13 36 15 ...
age_mean <- mean(data_1$age_built)
age_sd <- sd(data_1$age_built)
age_median <- median(data_1$age_built)
age_minimum <- min(data_1$age_built)
age_maximum <- max(data_1$age_built)

ggplot(data_1) + geom_histogram(aes(x = age_built, y = ..density..), bins = 30, fill = "linen", color = "black") + geom_vline(mapping = aes(xintercept = age_mean, color = "purple")) + geom_vline(mapping = aes(xintercept = age_sd, color = "blue")) + geom_vline(mapping = aes(xintercept = age_median, color = "green"))

age_mean
[1] 47.797
age_median
[1] 45
age_sd
[1] 29.63741
age_minimum
[1] 10
age_maximum
[1] 148

1.1.1 Discussion

The histogram of the age distribution is right skewed displaying the mean, median and standard deviation. Since the distribution is right skewed the mean(47.797 years) is slightly larger than the median(45 years). The standard deviation is 29.63741 while the minimum and the oldest building is 148 years and the newest was built 10 years ago respectively.


2

2.1 Question 2

The mantra in real estate is “Location, Location, Location!” Make a graphical display that relates a home price to its neighborhood in Ames, Iowa. Which summary statistics are most appropriate to use for determining the most expensive, least expensive, and most heterogeneous (having the most variation in housing price) neighborhoods? Report which neighborhoods these are based on the summary statistics of your choice. Report the value of your chosen summary statistics for these neighborhoods.

# Extract price and Neighborhood columns from the data

mantra <- ames_train %>%  select(Neighborhood, price) %>% group_by(Neighborhood) %>% summarise(p_mean = mean(price, na.rm = TRUE), p_sd = sd(price, na.rm = TRUE), most_exp = max(price, na.rm = TRUE), least_exp = min(price, na.rm = TRUE)) %>% arrange(desc(p_sd))

# View the data summary and output

summary(mantra)
  Neighborhood     p_mean            p_sd           most_exp     
 Blmngtn: 1    Min.   : 92947   Min.   : 10381   Min.   :125500  
 Blueste: 1    1st Qu.:133471   1st Qu.: 27322   1st Qu.:210500  
 BrDale : 1    Median :193154   Median : 39683   Median :278000  
 BrkSide: 1    Mean   :187191   Mean   : 46138   Mean   :308588  
 ClearCr: 1    3rd Qu.:218923   3rd Qu.: 60026   3rd Qu.:398750  
 CollgCr: 1    Max.   :339316   Max.   :123459   Max.   :615000  
 (Other):21                                                      
   least_exp     
 Min.   : 12789  
 1st Qu.: 74750  
 Median :107500  
 Mean   :113496  
 3rd Qu.:152500  
 Max.   :235000  
                 
mantra
# A tibble: 27 x 5
   Neighborhood  p_mean    p_sd most_exp least_exp
   <fct>          <dbl>   <dbl>    <int>     <int>
 1 StoneBr      339316. 123459.   591587    180000
 2 NridgHt      333647. 105089.   615000    173000
 3 Timber       265192.  84030.   425000    175000
 4 Veenker      233650   72545.   385000    150000
 5 Crawfor      204197.  71268.   392500     96500
 6 GrnHill      280000   70711.   330000    230000
 7 Somerst      234596.  65199.   468000    139000
 8 Edwards      136322.  54852.   415000     61500
 9 CollgCr      196951.  52786.   475000    110000
10 SawyerW      183101   48354.   320000     67500
# ... with 17 more rows
# Observe the minimum and maximium mean

min(mantra$p_mean)
[1] 92946.88
max(mantra$p_mean)
[1] 339316
#Observe the first six and last six outputs of the data

head(mantra)
# A tibble: 6 x 5
  Neighborhood  p_mean    p_sd most_exp least_exp
  <fct>          <dbl>   <dbl>    <int>     <int>
1 StoneBr      339316. 123459.   591587    180000
2 NridgHt      333647. 105089.   615000    173000
3 Timber       265192.  84030.   425000    175000
4 Veenker      233650   72545.   385000    150000
5 Crawfor      204197.  71268.   392500     96500
6 GrnHill      280000   70711.   330000    230000
tail(mantra)
# A tibble: 6 x 5
  Neighborhood  p_mean   p_sd most_exp least_exp
  <fct>          <dbl>  <dbl>    <int>     <int>
1 Blmngtn      198635. 26455.   246990    172500
2 Sawyer       139313. 21216.   219000     63900
3 MeadowV       92947. 18940.   129500     73000
4 BrDale        98930  13338.   125500     83000
5 NPkVill      139275  11958.   149900    123000
6 Blueste      125800  10381.   137000    116500
# Filter for the most expensive, least expensive, and most heterogeneous neighborhoods

ames_train %>%  select(Neighborhood, price) %>% group_by(Neighborhood)
# A tibble: 1,000 x 2
# Groups:   Neighborhood [27]
   Neighborhood  price
   <fct>         <int>
 1 SWISU        126000
 2 Edwards      139500
 3 IDOTRR       124900
 4 OldTown      114000
 5 NWAmes       227000
 6 Edwards      198500
 7 OldTown       93000
 8 Blmngtn      187687
 9 Mitchel      137500
10 Edwards      140000
# ... with 990 more rows

2.1.1 Graphical presentation

#Compare colnames(mantra) and colnames(ames_train)
# Boxplot of price against neighborhood

ggplot(ames_train, aes(x = Neighborhood, y = price, fill = Neighborhood)) + geom_boxplot() + theme(axis.text.x = element_text(angle = 90, hjust = 1))


2.1.2 Discussion

  1. The most expensive neighborhood is StoneBr with a mean price of 339316
  2. The least expensive neighborhood is MeadowV with a mean price of 92947
  3. The most heterogeneous neighborhood is StoneBr with a standard deviation of 123459

3

3.1 Question 3

Which variable has the largest number of missing values? Explain why it makes sense that there are so many missing values for this variable.

# Reload data

load("ames_train.Rdata")

#data_na <- sapply(ames_train, function(x) sum(is.na(x)))
#class(data_na)
#data_na <- data.frame(data_na)
#class(data_na)
#sort(head(data_na[1,], decreasing = TRUE))

data_na <- ames_train %>% summarise_all(list(~ sum(is.na(.))))
head(sort(unlist(data_na[1,]), decreasing = TRUE), 10)
      Pool.QC  Misc.Feature         Alley         Fence  Fireplace.Qu 
          997           971           933           798           491 
 Lot.Frontage Garage.Yr.Blt   Garage.Qual   Garage.Cond   Garage.Type 
          167            48            47            47            46 

3.1.1 Discussion

The variable Pool quality (Pool. QC) has the highest number of NAs. The high number of NAs in the variable Pool. QC may be due to cost and socioeconomic status. A building with a swimming pool will mostly cost more than a similar building without a pool.


4

4.1 Question 4

We want to predict the natural log of the home prices. Candidate explanatory variables are lot size in square feet (Lot.Area), slope of property (Land.Slope), original construction date (Year.Built), remodel date (Year.Remod.Add), and the number of bedrooms above grade (Bedroom.AbvGr). Pick a model selection or model averaging method covered in the Specialization, and describe how this method works. Then, use this method to find the best multiple regression model for predicting the natural log of the home prices.

4.1.1 Discussion

Linear models can be used for prediction or to evaluate whether there is a linear relationship between two numerical variables or between a variable and several others. The relationship between the explanatory and the response variables should be linear. Scatterplot or residuals plot can be used to validate linearity.

Adjusted R2 describes the strength of a model fit, and it is a useful tool for evaluating which predictors are adding value to the model.

Two common strategies for adding or removing variables in a multiple regression model are called backward elimination and forward selection. These techniques are often referred to as stepwise model selection strategies, because they add or delete one variable at a time as they “step” through the candidate predictors.

Backward elimination starts with the model that includes all potential predictor variables. Variables are eliminated one-at-a-time from the model until we cannot improve the adjusted R2. The strategy within each elimination step is to eliminate the variable that leads to the largest improvement in adjusted R2.

This model prediction will use backwards elimination method to get at the model that gives the best and more parsimonious model with the highest adjusted R2 value.

# Get the required variable from the dataset

model_data <- ames_train %>% filter(!is.na(price), !is.na(Lot.Area), !is.na(Land.Slope), !is.na(Year.Built), !is.na(Year.Remod.Add), !is.na(Bedroom.AbvGr)) %>% select(price, Lot.Area, Land.Slope, Year.Built, Year.Remod.Add, Bedroom.AbvGr)


# Model with all selected data variables

full_model <- lm(log(price) ~ Lot.Area + Land.Slope + Year.Built + Year.Remod.Add + Bedroom.AbvGr, data = model_data)
summary(full_model)

Call:
lm(formula = log(price) ~ Lot.Area + Land.Slope + Year.Built + 
    Year.Remod.Add + Bedroom.AbvGr, data = model_data)

Residuals:
    Min      1Q  Median      3Q     Max 
-2.0878 -0.1651 -0.0211  0.1657  0.9945 

Coefficients:
                 Estimate Std. Error t value Pr(>|t|)    
(Intercept)    -1.371e+01  8.574e-01 -15.996  < 2e-16 ***
Lot.Area        1.028e-05  1.106e-06   9.296  < 2e-16 ***
Land.SlopeMod   1.384e-01  4.991e-02   2.773  0.00565 ** 
Land.SlopeSev  -4.567e-01  1.514e-01  -3.016  0.00263 ** 
Year.Built      6.049e-03  3.788e-04  15.968  < 2e-16 ***
Year.Remod.Add  6.778e-03  5.468e-04  12.395  < 2e-16 ***
Bedroom.AbvGr   8.686e-02  1.077e-02   8.063 2.12e-15 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 0.279 on 993 degrees of freedom
Multiple R-squared:  0.5625,    Adjusted R-squared:  0.5598 
F-statistic: 212.8 on 6 and 993 DF,  p-value: < 2.2e-16
# Stepwise removal of explanatory variables

model_1 <- lm(log(price) ~ Lot.Area + Land.Slope + Year.Built + Year.Remod.Add, data = model_data)
summary(model_1)

Call:
lm(formula = log(price) ~ Lot.Area + Land.Slope + Year.Built + 
    Year.Remod.Add, data = model_data)

Residuals:
     Min       1Q   Median       3Q      Max 
-2.16270 -0.16779 -0.02396  0.17883  0.98794 

Coefficients:
                 Estimate Std. Error t value Pr(>|t|)    
(Intercept)    -1.352e+01  8.842e-01 -15.291  < 2e-16 ***
Lot.Area        1.177e-05  1.125e-06  10.472  < 2e-16 ***
Land.SlopeMod   1.117e-01  5.138e-02   2.174 0.029962 *  
Land.SlopeSev  -5.383e-01  1.559e-01  -3.454 0.000577 ***
Year.Built      5.911e-03  3.904e-04  15.138  < 2e-16 ***
Year.Remod.Add  6.934e-03  5.638e-04  12.298  < 2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 0.2879 on 994 degrees of freedom
Multiple R-squared:  0.5338,    Adjusted R-squared:  0.5315 
F-statistic: 227.7 on 5 and 994 DF,  p-value: < 2.2e-16
model_2 <- lm(log(price) ~ Lot.Area + Land.Slope + Year.Built + Bedroom.AbvGr, data = model_data)
summary(model_2)

Call:
lm(formula = log(price) ~ Lot.Area + Land.Slope + Year.Built + 
    Bedroom.AbvGr, data = model_data)

Residuals:
     Min       1Q   Median       3Q      Max 
-2.03845 -0.17426 -0.02837  0.15621  1.17565 

Coefficients:
                Estimate Std. Error t value Pr(>|t|)    
(Intercept)   -5.997e+00  6.331e-01  -9.473  < 2e-16 ***
Lot.Area       1.090e-05  1.186e-06   9.188  < 2e-16 ***
Land.SlopeMod  1.466e-01  5.361e-02   2.735 0.006342 ** 
Land.SlopeSev -5.494e-01  1.624e-01  -3.382 0.000748 ***
Year.Built     8.946e-03  3.202e-04  27.939  < 2e-16 ***
Bedroom.AbvGr  9.157e-02  1.156e-02   7.920 6.35e-15 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 0.2997 on 994 degrees of freedom
Multiple R-squared:  0.4948,    Adjusted R-squared:  0.4922 
F-statistic: 194.7 on 5 and 994 DF,  p-value: < 2.2e-16
model_3 <- lm(log(price) ~ Lot.Area + Land.Slope + Year.Remod.Add + Bedroom.AbvGr, data = model_data)
summary(model_3)

Call:
lm(formula = log(price) ~ Lot.Area + Land.Slope + Year.Remod.Add + 
    Bedroom.AbvGr, data = model_data)

Residuals:
    Min      1Q  Median      3Q     Max 
-2.3150 -0.1680  0.0036  0.1779  1.0793 

Coefficients:
                 Estimate Std. Error t value Pr(>|t|)    
(Intercept)    -1.245e+01  9.566e-01 -13.016  < 2e-16 ***
Lot.Area        1.014e-05  1.239e-06   8.187 8.17e-16 ***
Land.SlopeMod   1.253e-01  5.592e-02   2.240   0.0253 *  
Land.SlopeSev  -4.342e-01  1.697e-01  -2.559   0.0106 *  
Year.Remod.Add  1.217e-02  4.822e-04  25.229  < 2e-16 ***
Bedroom.AbvGr   7.905e-02  1.206e-02   6.556 8.86e-11 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 0.3127 on 994 degrees of freedom
Multiple R-squared:  0.4501,    Adjusted R-squared:  0.4474 
F-statistic: 162.7 on 5 and 994 DF,  p-value: < 2.2e-16
model_4 <- lm(log(price) ~ Lot.Area + Year.Built + Year.Remod.Add + Bedroom.AbvGr, data = model_data)
summary(model_4)

Call:
lm(formula = log(price) ~ Lot.Area + Year.Built + Year.Remod.Add + 
    Bedroom.AbvGr, data = model_data)

Residuals:
     Min       1Q   Median       3Q      Max 
-2.09102 -0.16488 -0.02135  0.16605  1.13359 

Coefficients:
                 Estimate Std. Error t value Pr(>|t|)    
(Intercept)    -1.385e+01  8.633e-01 -16.049  < 2e-16 ***
Lot.Area        8.697e-06  9.168e-07   9.486  < 2e-16 ***
Year.Built      6.019e-03  3.819e-04  15.761  < 2e-16 ***
Year.Remod.Add  6.889e-03  5.505e-04  12.512  < 2e-16 ***
Bedroom.AbvGr   8.704e-02  1.082e-02   8.047  2.4e-15 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 0.2813 on 995 degrees of freedom
Multiple R-squared:  0.5544,    Adjusted R-squared:  0.5526 
F-statistic: 309.5 on 4 and 995 DF,  p-value: < 2.2e-16
model_5 <- lm(log(price) ~ Land.Slope + Year.Built + Year.Remod.Add + Bedroom.AbvGr, data = model_data)
summary(model_5)

Call:
lm(formula = log(price) ~ Land.Slope + Year.Built + Year.Remod.Add + 
    Bedroom.AbvGr, data = model_data)

Residuals:
     Min       1Q   Median       3Q      Max 
-2.07347 -0.16922 -0.02455  0.16585  1.07012 

Coefficients:
                 Estimate Std. Error t value Pr(>|t|)    
(Intercept)    -1.406e+01  8.926e-01 -15.758  < 2e-16 ***
Land.SlopeMod   2.009e-01  5.154e-02   3.898 0.000104 ***
Land.SlopeSev   3.345e-01  1.305e-01   2.562 0.010542 *  
Year.Built      6.022e-03  3.948e-04  15.255  < 2e-16 ***
Year.Remod.Add  7.009e-03  5.693e-04  12.312  < 2e-16 ***
Bedroom.AbvGr   1.037e-01  1.107e-02   9.369  < 2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 0.2908 on 994 degrees of freedom
Multiple R-squared:  0.5244,    Adjusted R-squared:  0.522 
F-statistic: 219.2 on 5 and 994 DF,  p-value: < 2.2e-16
# Tabulate adjusted R squared 

s_full <- summary(full_model)$adj.r.squared
s_1 <- summary(model_1)$adj.r.squared
s_2 <- summary(model_2)$adj.r.squared
s_3 <- summary(model_3)$adj.r.squared
s_4 <- summary(model_4)$adj.r.squared
s_5 <- summary(model_5)$adj.r.squared

R_squared <- rbind(s_full, s_1, s_2, s_3, s_4, s_5)
model <- c("full_model", "model_1", "model_2", "model_3", "model_4", "model_5")
colu <- cbind(model, R_squared)
df <- data.frame(colu)
names(df) <- c("Models", "R_squared")
head(df)
           Models         R_squared
s_full full_model 0.559834474177341
s_1       model_1 0.531485946233459
s_2       model_2 0.492238381396363
s_3       model_3 0.447366335111107
s_4       model_4 0.552606180049213
s_5       model_5 0.522009031037836
# Pair plot to observe the relationship of response and explantory variables

panel.cor <- function(x, y, ...)
{
par(usr = c(0, 2, 0, 2))
txt <- as.character(format(cor(x, y), digits = 2))
text(0.7, 0.7, txt, cex = 2*abs(cor(x, y)))
}
pairs(model_data[1:6], upper.panel = panel.cor)

# Test for multicollinearity

#YB_model <- lm(Year.Built ~ Lot.Area + Land.Slope + Year.Remod.Add + Bedroom.AbvGr, data = model_data)
#summary(YB_model)

#YR_model <- lm(Year.Remod.Add ~ Lot.Area + Land.Slope + Year.Built + Bedroom.AbvGr, data = model_data)
#summary(YR_model)

#b_model1 <- lm(log(price) ~ Land.Slope + Year.Built + Year.Remod.Add + Bedroom.AbvGr, data = model_data)
#summary(best_model1)

#b_model2 <- lm(log(price) ~ Land.Slope + Year.Built + Year.Remod.Add + + Bedroom.AbvGr, data = model_data)
#summary(best_model2)

4.1.2 Discussion

After checking for collinearity and regressing the explanatory variables as the rsponse variables, the full model still gives the highest value of adjusted R2.

Conditions for linear regression include:

  1. Linearity - relationship between the explanatory and the response variable should be linear
  2. Residuals normality - nearly normal residuals
  3. Constant variability - variability of points around the least squares line should be roughly constant also called homoscedasticity
  4. Residuals independence

The model adequately met all the conditions for linear regression.

# Linear relationship

plot(full_model$residuals ~ model_data$Year.Built)

plot(full_model$residuals ~ model_data$Year.Remod.Add)

# Residuals normality

hist(full_model$residuals)

qqnorm(full_model$residuals)
qqline(full_model$residuals)

# Constant variability of residuals

plot(full_model$residuals ~ full_model$fitted)

plot(abs(full_model$residuals) ~ full_model$fitted)

# Independence of residuals

plot(full_model$residuals)


The full model with the high adjusted R2 value model adequately met all the conditions for linear regression listed above.


5

5.1 Question 5

Which home has the largest squared residual in the previous analysis (Question 4)? Looking at all the variables in the data set, can you explain why this home stands out from the rest (what factors contribute to the high squared residual and why are those factors relevant)?

# Summary of the full model and the maximum residuals

summary(full_model)

Call:
lm(formula = log(price) ~ Lot.Area + Land.Slope + Year.Built + 
    Year.Remod.Add + Bedroom.AbvGr, data = model_data)

Residuals:
    Min      1Q  Median      3Q     Max 
-2.0878 -0.1651 -0.0211  0.1657  0.9945 

Coefficients:
                 Estimate Std. Error t value Pr(>|t|)    
(Intercept)    -1.371e+01  8.574e-01 -15.996  < 2e-16 ***
Lot.Area        1.028e-05  1.106e-06   9.296  < 2e-16 ***
Land.SlopeMod   1.384e-01  4.991e-02   2.773  0.00565 ** 
Land.SlopeSev  -4.567e-01  1.514e-01  -3.016  0.00263 ** 
Year.Built      6.049e-03  3.788e-04  15.968  < 2e-16 ***
Year.Remod.Add  6.778e-03  5.468e-04  12.395  < 2e-16 ***
Bedroom.AbvGr   8.686e-02  1.077e-02   8.063 2.12e-15 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 0.279 on 993 degrees of freedom
Multiple R-squared:  0.5625,    Adjusted R-squared:  0.5598 
F-statistic: 212.8 on 6 and 993 DF,  p-value: < 2.2e-16
which(abs(resid(full_model)) == max(abs(resid(full_model))))
428 
428 
# Display the specific row. The which() stated 428 as the highest residuals

as.data.frame(model_data[428, ])
  price Lot.Area Land.Slope Year.Built Year.Remod.Add Bedroom.AbvGr
1 12789     9656        Gtl       1923           1970             2
# Prediction

predict_model <- model_data %>% as.data.frame(model_data)
#class(predict_model)

predict_model$prediction <- exp(predict(full_model))
predict_model$residuals <- residuals(full_model)
predict_model[428, ]
    price Lot.Area Land.Slope Year.Built Year.Remod.Add Bedroom.AbvGr
428 12789     9656        Gtl       1923           1970             2
    prediction residuals
428   103176.2 -2.087853

5.1.1 Discussion

House 428 has the largest residuals with a actual price of $12,789 and predicted price $103176.2 which is much higher. A number of factors may be responsible for this high price difference which include Year.Built(1923) and Year.Remod.Add(1970). The building was built 93 years ago and remodeled 50 years ago. The overall quality and condition of the building might be low.


6

6.1 Question 6

Use the same model selection method you chose in Question 4 to again find the best multiple regression model to predict the natural log of home prices, but this time replacing Lot.Area with log(Lot.Area). Do you arrive at a model including the same set of predictors?

# Model with log of Lot.Area

full_model_log <- lm(log(price) ~ log(Lot.Area) + Land.Slope + Year.Built + Year.Remod.Add + Bedroom.AbvGr, data = model_data)
summary(full_model_log)

Call:
lm(formula = log(price) ~ log(Lot.Area) + Land.Slope + Year.Built + 
    Year.Remod.Add + Bedroom.AbvGr, data = model_data)

Residuals:
     Min       1Q   Median       3Q      Max 
-2.14050 -0.15650 -0.01561  0.15350  0.90854 

Coefficients:
                 Estimate Std. Error t value Pr(>|t|)    
(Intercept)    -1.553e+01  8.197e-01 -18.947  < 2e-16 ***
log(Lot.Area)   2.442e-01  1.708e-02  14.297  < 2e-16 ***
Land.SlopeMod   1.151e-01  4.734e-02   2.431   0.0152 *  
Land.SlopeSev  -6.554e-02  1.222e-01  -0.536   0.5917    
Year.Built      5.981e-03  3.597e-04  16.628  < 2e-16 ***
Year.Remod.Add  6.734e-03  5.190e-04  12.975  < 2e-16 ***
Bedroom.AbvGr   5.909e-02  1.055e-02   5.599  2.8e-08 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 0.2649 on 993 degrees of freedom
Multiple R-squared:  0.6056,    Adjusted R-squared:  0.6032 
F-statistic: 254.1 on 6 and 993 DF,  p-value: < 2.2e-16

6.1.1 Discussion

The adjusted R2 is 0.6032. The highest adjusted R2(0.6032) is arrived at only when all the predictors were included.


7

7.1 Question 7

Do you think it is better to log transform Lot.Area, in terms of assumptions for linear regression? Make graphs of the predicted values of log home price versus the true values of log home price for the regression models selected for Lot.Area and log(Lot.Area). Referencing these two plots, provide a written support that includes a quantitative justification for your answer in the first part of question 7.

# Model without log transformation
model_nolog <- full_model

# Model with log transformation
model_log <- full_model_log

# Display adjusted R squared values


m_log <- summary(model_log)$adj.r.squared
m_nolog <- summary(model_nolog)$adj.r.squared

a.R_squared <- rbind(m_log, m_nolog)
modell <- c("m_log", "m_nolog")
coln <- cbind(modell, a.R_squared)
dfl <- data.frame(coln)
names(dfl) <- c("Model", "R_squared")
head(dfl)
          Model         R_squared
m_log     m_log 0.603201839590637
m_nolog m_nolog 0.559834474177341
# Select entities and graphics

pri_data <- model_data[, "price"]
pri_t <- subset(model_data, select = -c(price))
pri_log <- predict(full_model_log, pri_t, interval = "prediction", level = 0.95)
pri_log <- data.frame(pri_log)
pd_log <- sapply(pri_log[1], function(x) exp(x))

pri_nolog <- predict(full_model, pri_t, interval = "prediction", level = 0.95)
pri_nolog <- data.frame(pri_nolog)
pd_nolog <- sapply(pri_nolog[1], function(x) exp(x))

pd_log <- cbind(pd_log, pri_data)
names(pd_log) <- c("pd", "value")

pd_nolog <- cbind(pd_nolog, pri_data)
names(pd_nolog) <- c("pd", "value")

ggplot(pd_log, aes(x = pd, y = value)) + geom_point() + stat_smooth(method=lm, level=0.95) + geom_abline(intercept = 0, slope = 1)

ggplot(pd_nolog, aes(x = pd, y = value)) + geom_point() + stat_smooth(method=lm, level=0.95) + geom_abline(intercept = 0, slope = 1)


7.1.1 Discussion

The scatterplot of the log model shows more evenly distributed points than the model without log transformation. The log model exhibited homoscedasticity. Thus the log model will give a better prediction.