Week 4 Project of the course Statistics with R Capstone under the course track Statistics with R
Submitted by Olusola Afuwape
January 2nd, 2020
Load the data and necessary packages:
# Load the data and required packages
load("ames_train.Rdata")
library(BAS)
library(dplyr)
library(ggplot2)
library(statsr)
library(ggridges)
library(gridExtra)
library(grid)
# Check the data dimension, description and variables
dim(ames_train)
[1] 1000 81
names(ames_train)
[1] "PID" "area" "price"
[4] "MS.SubClass" "MS.Zoning" "Lot.Frontage"
[7] "Lot.Area" "Street" "Alley"
[10] "Lot.Shape" "Land.Contour" "Utilities"
[13] "Lot.Config" "Land.Slope" "Neighborhood"
[16] "Condition.1" "Condition.2" "Bldg.Type"
[19] "House.Style" "Overall.Qual" "Overall.Cond"
[22] "Year.Built" "Year.Remod.Add" "Roof.Style"
[25] "Roof.Matl" "Exterior.1st" "Exterior.2nd"
[28] "Mas.Vnr.Type" "Mas.Vnr.Area" "Exter.Qual"
[31] "Exter.Cond" "Foundation" "Bsmt.Qual"
[34] "Bsmt.Cond" "Bsmt.Exposure" "BsmtFin.Type.1"
[37] "BsmtFin.SF.1" "BsmtFin.Type.2" "BsmtFin.SF.2"
[40] "Bsmt.Unf.SF" "Total.Bsmt.SF" "Heating"
[43] "Heating.QC" "Central.Air" "Electrical"
[46] "X1st.Flr.SF" "X2nd.Flr.SF" "Low.Qual.Fin.SF"
[49] "Bsmt.Full.Bath" "Bsmt.Half.Bath" "Full.Bath"
[52] "Half.Bath" "Bedroom.AbvGr" "Kitchen.AbvGr"
[55] "Kitchen.Qual" "TotRms.AbvGrd" "Functional"
[58] "Fireplaces" "Fireplace.Qu" "Garage.Type"
[61] "Garage.Yr.Blt" "Garage.Finish" "Garage.Cars"
[64] "Garage.Area" "Garage.Qual" "Garage.Cond"
[67] "Paved.Drive" "Wood.Deck.SF" "Open.Porch.SF"
[70] "Enclosed.Porch" "X3Ssn.Porch" "Screen.Porch"
[73] "Pool.Area" "Pool.QC" "Fence"
[76] "Misc.Feature" "Misc.Val" "Mo.Sold"
[79] "Yr.Sold" "Sale.Type" "Sale.Condition"
#str(ames_train)
#summary(ames_train)
Make a labeled histogram (with 30 bins) of the ages of the houses in the data set, and describe the distribution.
# Create a new column age_built for the age of buildings
ames_train <- ames_train %>% mutate(age_built = sapply(ames_train$Year.Built, function(x) 2020 - x))
# Extract the columns for year of buildings and age of buildings
data_1 <- ames_train %>% select(Year.Built, age_built)
str(data_1)
Classes 'tbl_df', 'tbl' and 'data.frame': 1000 obs. of 2 variables:
$ Year.Built: int 1939 1984 1930 1900 2001 2003 1953 2007 1984 2005 ...
$ age_built : num 81 36 90 120 19 17 67 13 36 15 ...
age_mean <- mean(data_1$age_built)
age_sd <- sd(data_1$age_built)
age_median <- median(data_1$age_built)
age_minimum <- min(data_1$age_built)
age_maximum <- max(data_1$age_built)
ggplot(data_1) + geom_histogram(aes(x = age_built, y = ..density..), bins = 30, fill = "linen", color = "black") + geom_vline(mapping = aes(xintercept = age_mean, color = "purple")) + geom_vline(mapping = aes(xintercept = age_sd, color = "blue")) + geom_vline(mapping = aes(xintercept = age_median, color = "green"))
age_mean
[1] 47.797
age_median
[1] 45
age_sd
[1] 29.63741
age_minimum
[1] 10
age_maximum
[1] 148
The histogram of the age distribution is right skewed displaying the mean, median and standard deviation. Since the distribution is right skewed the mean(47.797 years) is slightly larger than the median(45 years). The standard deviation is 29.63741 while the minimum and the oldest building is 148 years and the newest was built 10 years ago respectively.
The mantra in real estate is “Location, Location, Location!” Make a graphical display that relates a home price to its neighborhood in Ames, Iowa. Which summary statistics are most appropriate to use for determining the most expensive, least expensive, and most heterogeneous (having the most variation in housing price) neighborhoods? Report which neighborhoods these are based on the summary statistics of your choice. Report the value of your chosen summary statistics for these neighborhoods.
# Extract price and Neighborhood columns from the data
mantra <- ames_train %>% select(Neighborhood, price) %>% group_by(Neighborhood) %>% summarise(p_mean = mean(price, na.rm = TRUE), p_sd = sd(price, na.rm = TRUE), most_exp = max(price, na.rm = TRUE), least_exp = min(price, na.rm = TRUE)) %>% arrange(desc(p_sd))
# View the data summary and output
summary(mantra)
Neighborhood p_mean p_sd most_exp
Blmngtn: 1 Min. : 92947 Min. : 10381 Min. :125500
Blueste: 1 1st Qu.:133471 1st Qu.: 27322 1st Qu.:210500
BrDale : 1 Median :193154 Median : 39683 Median :278000
BrkSide: 1 Mean :187191 Mean : 46138 Mean :308588
ClearCr: 1 3rd Qu.:218923 3rd Qu.: 60026 3rd Qu.:398750
CollgCr: 1 Max. :339316 Max. :123459 Max. :615000
(Other):21
least_exp
Min. : 12789
1st Qu.: 74750
Median :107500
Mean :113496
3rd Qu.:152500
Max. :235000
mantra
# A tibble: 27 x 5
Neighborhood p_mean p_sd most_exp least_exp
<fct> <dbl> <dbl> <int> <int>
1 StoneBr 339316. 123459. 591587 180000
2 NridgHt 333647. 105089. 615000 173000
3 Timber 265192. 84030. 425000 175000
4 Veenker 233650 72545. 385000 150000
5 Crawfor 204197. 71268. 392500 96500
6 GrnHill 280000 70711. 330000 230000
7 Somerst 234596. 65199. 468000 139000
8 Edwards 136322. 54852. 415000 61500
9 CollgCr 196951. 52786. 475000 110000
10 SawyerW 183101 48354. 320000 67500
# ... with 17 more rows
# Observe the minimum and maximium mean
min(mantra$p_mean)
[1] 92946.88
max(mantra$p_mean)
[1] 339316
#Observe the first six and last six outputs of the data
head(mantra)
# A tibble: 6 x 5
Neighborhood p_mean p_sd most_exp least_exp
<fct> <dbl> <dbl> <int> <int>
1 StoneBr 339316. 123459. 591587 180000
2 NridgHt 333647. 105089. 615000 173000
3 Timber 265192. 84030. 425000 175000
4 Veenker 233650 72545. 385000 150000
5 Crawfor 204197. 71268. 392500 96500
6 GrnHill 280000 70711. 330000 230000
tail(mantra)
# A tibble: 6 x 5
Neighborhood p_mean p_sd most_exp least_exp
<fct> <dbl> <dbl> <int> <int>
1 Blmngtn 198635. 26455. 246990 172500
2 Sawyer 139313. 21216. 219000 63900
3 MeadowV 92947. 18940. 129500 73000
4 BrDale 98930 13338. 125500 83000
5 NPkVill 139275 11958. 149900 123000
6 Blueste 125800 10381. 137000 116500
# Filter for the most expensive, least expensive, and most heterogeneous neighborhoods
ames_train %>% select(Neighborhood, price) %>% group_by(Neighborhood)
# A tibble: 1,000 x 2
# Groups: Neighborhood [27]
Neighborhood price
<fct> <int>
1 SWISU 126000
2 Edwards 139500
3 IDOTRR 124900
4 OldTown 114000
5 NWAmes 227000
6 Edwards 198500
7 OldTown 93000
8 Blmngtn 187687
9 Mitchel 137500
10 Edwards 140000
# ... with 990 more rows
#Compare colnames(mantra) and colnames(ames_train)
# Boxplot of price against neighborhood
ggplot(ames_train, aes(x = Neighborhood, y = price, fill = Neighborhood)) + geom_boxplot() + theme(axis.text.x = element_text(angle = 90, hjust = 1))
Which variable has the largest number of missing values? Explain why it makes sense that there are so many missing values for this variable.
# Reload data
load("ames_train.Rdata")
#data_na <- sapply(ames_train, function(x) sum(is.na(x)))
#class(data_na)
#data_na <- data.frame(data_na)
#class(data_na)
#sort(head(data_na[1,], decreasing = TRUE))
data_na <- ames_train %>% summarise_all(list(~ sum(is.na(.))))
head(sort(unlist(data_na[1,]), decreasing = TRUE), 10)
Pool.QC Misc.Feature Alley Fence Fireplace.Qu
997 971 933 798 491
Lot.Frontage Garage.Yr.Blt Garage.Qual Garage.Cond Garage.Type
167 48 47 47 46
The variable Pool quality (Pool. QC) has the highest number of NAs. The high number of NAs in the variable Pool. QC may be due to cost and socioeconomic status. A building with a swimming pool will mostly cost more than a similar building without a pool.
We want to predict the natural log of the home prices. Candidate explanatory variables are lot size in square feet (Lot.Area), slope of property (Land.Slope), original construction date (Year.Built), remodel date (Year.Remod.Add), and the number of bedrooms above grade (Bedroom.AbvGr). Pick a model selection or model averaging method covered in the Specialization, and describe how this method works. Then, use this method to find the best multiple regression model for predicting the natural log of the home prices.
Linear models can be used for prediction or to evaluate whether there is a linear relationship between two numerical variables or between a variable and several others. The relationship between the explanatory and the response variables should be linear. Scatterplot or residuals plot can be used to validate linearity.
Adjusted R2 describes the strength of a model fit, and it is a useful tool for evaluating which predictors are adding value to the model.
Two common strategies for adding or removing variables in a multiple regression model are called backward elimination and forward selection. These techniques are often referred to as stepwise model selection strategies, because they add or delete one variable at a time as they “step” through the candidate predictors.
Backward elimination starts with the model that includes all potential predictor variables. Variables are eliminated one-at-a-time from the model until we cannot improve the adjusted R2. The strategy within each elimination step is to eliminate the variable that leads to the largest improvement in adjusted R2.
This model prediction will use backwards elimination method to get at the model that gives the best and more parsimonious model with the highest adjusted R2 value.
# Get the required variable from the dataset
model_data <- ames_train %>% filter(!is.na(price), !is.na(Lot.Area), !is.na(Land.Slope), !is.na(Year.Built), !is.na(Year.Remod.Add), !is.na(Bedroom.AbvGr)) %>% select(price, Lot.Area, Land.Slope, Year.Built, Year.Remod.Add, Bedroom.AbvGr)
# Model with all selected data variables
full_model <- lm(log(price) ~ Lot.Area + Land.Slope + Year.Built + Year.Remod.Add + Bedroom.AbvGr, data = model_data)
summary(full_model)
Call:
lm(formula = log(price) ~ Lot.Area + Land.Slope + Year.Built +
Year.Remod.Add + Bedroom.AbvGr, data = model_data)
Residuals:
Min 1Q Median 3Q Max
-2.0878 -0.1651 -0.0211 0.1657 0.9945
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -1.371e+01 8.574e-01 -15.996 < 2e-16 ***
Lot.Area 1.028e-05 1.106e-06 9.296 < 2e-16 ***
Land.SlopeMod 1.384e-01 4.991e-02 2.773 0.00565 **
Land.SlopeSev -4.567e-01 1.514e-01 -3.016 0.00263 **
Year.Built 6.049e-03 3.788e-04 15.968 < 2e-16 ***
Year.Remod.Add 6.778e-03 5.468e-04 12.395 < 2e-16 ***
Bedroom.AbvGr 8.686e-02 1.077e-02 8.063 2.12e-15 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 0.279 on 993 degrees of freedom
Multiple R-squared: 0.5625, Adjusted R-squared: 0.5598
F-statistic: 212.8 on 6 and 993 DF, p-value: < 2.2e-16
# Stepwise removal of explanatory variables
model_1 <- lm(log(price) ~ Lot.Area + Land.Slope + Year.Built + Year.Remod.Add, data = model_data)
summary(model_1)
Call:
lm(formula = log(price) ~ Lot.Area + Land.Slope + Year.Built +
Year.Remod.Add, data = model_data)
Residuals:
Min 1Q Median 3Q Max
-2.16270 -0.16779 -0.02396 0.17883 0.98794
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -1.352e+01 8.842e-01 -15.291 < 2e-16 ***
Lot.Area 1.177e-05 1.125e-06 10.472 < 2e-16 ***
Land.SlopeMod 1.117e-01 5.138e-02 2.174 0.029962 *
Land.SlopeSev -5.383e-01 1.559e-01 -3.454 0.000577 ***
Year.Built 5.911e-03 3.904e-04 15.138 < 2e-16 ***
Year.Remod.Add 6.934e-03 5.638e-04 12.298 < 2e-16 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 0.2879 on 994 degrees of freedom
Multiple R-squared: 0.5338, Adjusted R-squared: 0.5315
F-statistic: 227.7 on 5 and 994 DF, p-value: < 2.2e-16
model_2 <- lm(log(price) ~ Lot.Area + Land.Slope + Year.Built + Bedroom.AbvGr, data = model_data)
summary(model_2)
Call:
lm(formula = log(price) ~ Lot.Area + Land.Slope + Year.Built +
Bedroom.AbvGr, data = model_data)
Residuals:
Min 1Q Median 3Q Max
-2.03845 -0.17426 -0.02837 0.15621 1.17565
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -5.997e+00 6.331e-01 -9.473 < 2e-16 ***
Lot.Area 1.090e-05 1.186e-06 9.188 < 2e-16 ***
Land.SlopeMod 1.466e-01 5.361e-02 2.735 0.006342 **
Land.SlopeSev -5.494e-01 1.624e-01 -3.382 0.000748 ***
Year.Built 8.946e-03 3.202e-04 27.939 < 2e-16 ***
Bedroom.AbvGr 9.157e-02 1.156e-02 7.920 6.35e-15 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 0.2997 on 994 degrees of freedom
Multiple R-squared: 0.4948, Adjusted R-squared: 0.4922
F-statistic: 194.7 on 5 and 994 DF, p-value: < 2.2e-16
model_3 <- lm(log(price) ~ Lot.Area + Land.Slope + Year.Remod.Add + Bedroom.AbvGr, data = model_data)
summary(model_3)
Call:
lm(formula = log(price) ~ Lot.Area + Land.Slope + Year.Remod.Add +
Bedroom.AbvGr, data = model_data)
Residuals:
Min 1Q Median 3Q Max
-2.3150 -0.1680 0.0036 0.1779 1.0793
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -1.245e+01 9.566e-01 -13.016 < 2e-16 ***
Lot.Area 1.014e-05 1.239e-06 8.187 8.17e-16 ***
Land.SlopeMod 1.253e-01 5.592e-02 2.240 0.0253 *
Land.SlopeSev -4.342e-01 1.697e-01 -2.559 0.0106 *
Year.Remod.Add 1.217e-02 4.822e-04 25.229 < 2e-16 ***
Bedroom.AbvGr 7.905e-02 1.206e-02 6.556 8.86e-11 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 0.3127 on 994 degrees of freedom
Multiple R-squared: 0.4501, Adjusted R-squared: 0.4474
F-statistic: 162.7 on 5 and 994 DF, p-value: < 2.2e-16
model_4 <- lm(log(price) ~ Lot.Area + Year.Built + Year.Remod.Add + Bedroom.AbvGr, data = model_data)
summary(model_4)
Call:
lm(formula = log(price) ~ Lot.Area + Year.Built + Year.Remod.Add +
Bedroom.AbvGr, data = model_data)
Residuals:
Min 1Q Median 3Q Max
-2.09102 -0.16488 -0.02135 0.16605 1.13359
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -1.385e+01 8.633e-01 -16.049 < 2e-16 ***
Lot.Area 8.697e-06 9.168e-07 9.486 < 2e-16 ***
Year.Built 6.019e-03 3.819e-04 15.761 < 2e-16 ***
Year.Remod.Add 6.889e-03 5.505e-04 12.512 < 2e-16 ***
Bedroom.AbvGr 8.704e-02 1.082e-02 8.047 2.4e-15 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 0.2813 on 995 degrees of freedom
Multiple R-squared: 0.5544, Adjusted R-squared: 0.5526
F-statistic: 309.5 on 4 and 995 DF, p-value: < 2.2e-16
model_5 <- lm(log(price) ~ Land.Slope + Year.Built + Year.Remod.Add + Bedroom.AbvGr, data = model_data)
summary(model_5)
Call:
lm(formula = log(price) ~ Land.Slope + Year.Built + Year.Remod.Add +
Bedroom.AbvGr, data = model_data)
Residuals:
Min 1Q Median 3Q Max
-2.07347 -0.16922 -0.02455 0.16585 1.07012
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -1.406e+01 8.926e-01 -15.758 < 2e-16 ***
Land.SlopeMod 2.009e-01 5.154e-02 3.898 0.000104 ***
Land.SlopeSev 3.345e-01 1.305e-01 2.562 0.010542 *
Year.Built 6.022e-03 3.948e-04 15.255 < 2e-16 ***
Year.Remod.Add 7.009e-03 5.693e-04 12.312 < 2e-16 ***
Bedroom.AbvGr 1.037e-01 1.107e-02 9.369 < 2e-16 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 0.2908 on 994 degrees of freedom
Multiple R-squared: 0.5244, Adjusted R-squared: 0.522
F-statistic: 219.2 on 5 and 994 DF, p-value: < 2.2e-16
# Tabulate adjusted R squared
s_full <- summary(full_model)$adj.r.squared
s_1 <- summary(model_1)$adj.r.squared
s_2 <- summary(model_2)$adj.r.squared
s_3 <- summary(model_3)$adj.r.squared
s_4 <- summary(model_4)$adj.r.squared
s_5 <- summary(model_5)$adj.r.squared
R_squared <- rbind(s_full, s_1, s_2, s_3, s_4, s_5)
model <- c("full_model", "model_1", "model_2", "model_3", "model_4", "model_5")
colu <- cbind(model, R_squared)
df <- data.frame(colu)
names(df) <- c("Models", "R_squared")
head(df)
Models R_squared
s_full full_model 0.559834474177341
s_1 model_1 0.531485946233459
s_2 model_2 0.492238381396363
s_3 model_3 0.447366335111107
s_4 model_4 0.552606180049213
s_5 model_5 0.522009031037836
# Pair plot to observe the relationship of response and explantory variables
panel.cor <- function(x, y, ...)
{
par(usr = c(0, 2, 0, 2))
txt <- as.character(format(cor(x, y), digits = 2))
text(0.7, 0.7, txt, cex = 2*abs(cor(x, y)))
}
pairs(model_data[1:6], upper.panel = panel.cor)
# Test for multicollinearity
#YB_model <- lm(Year.Built ~ Lot.Area + Land.Slope + Year.Remod.Add + Bedroom.AbvGr, data = model_data)
#summary(YB_model)
#YR_model <- lm(Year.Remod.Add ~ Lot.Area + Land.Slope + Year.Built + Bedroom.AbvGr, data = model_data)
#summary(YR_model)
#b_model1 <- lm(log(price) ~ Land.Slope + Year.Built + Year.Remod.Add + Bedroom.AbvGr, data = model_data)
#summary(best_model1)
#b_model2 <- lm(log(price) ~ Land.Slope + Year.Built + Year.Remod.Add + + Bedroom.AbvGr, data = model_data)
#summary(best_model2)
After checking for collinearity and regressing the explanatory variables as the rsponse variables, the full model still gives the highest value of adjusted R2.
Conditions for linear regression include:
The model adequately met all the conditions for linear regression.
# Linear relationship
plot(full_model$residuals ~ model_data$Year.Built)
plot(full_model$residuals ~ model_data$Year.Remod.Add)
# Residuals normality
hist(full_model$residuals)
qqnorm(full_model$residuals)
qqline(full_model$residuals)
# Constant variability of residuals
plot(full_model$residuals ~ full_model$fitted)
plot(abs(full_model$residuals) ~ full_model$fitted)
# Independence of residuals
plot(full_model$residuals)
The full model with the high adjusted R2 value model adequately met all the conditions for linear regression listed above.
Which home has the largest squared residual in the previous analysis (Question 4)? Looking at all the variables in the data set, can you explain why this home stands out from the rest (what factors contribute to the high squared residual and why are those factors relevant)?
# Summary of the full model and the maximum residuals
summary(full_model)
Call:
lm(formula = log(price) ~ Lot.Area + Land.Slope + Year.Built +
Year.Remod.Add + Bedroom.AbvGr, data = model_data)
Residuals:
Min 1Q Median 3Q Max
-2.0878 -0.1651 -0.0211 0.1657 0.9945
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -1.371e+01 8.574e-01 -15.996 < 2e-16 ***
Lot.Area 1.028e-05 1.106e-06 9.296 < 2e-16 ***
Land.SlopeMod 1.384e-01 4.991e-02 2.773 0.00565 **
Land.SlopeSev -4.567e-01 1.514e-01 -3.016 0.00263 **
Year.Built 6.049e-03 3.788e-04 15.968 < 2e-16 ***
Year.Remod.Add 6.778e-03 5.468e-04 12.395 < 2e-16 ***
Bedroom.AbvGr 8.686e-02 1.077e-02 8.063 2.12e-15 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 0.279 on 993 degrees of freedom
Multiple R-squared: 0.5625, Adjusted R-squared: 0.5598
F-statistic: 212.8 on 6 and 993 DF, p-value: < 2.2e-16
which(abs(resid(full_model)) == max(abs(resid(full_model))))
428
428
# Display the specific row. The which() stated 428 as the highest residuals
as.data.frame(model_data[428, ])
price Lot.Area Land.Slope Year.Built Year.Remod.Add Bedroom.AbvGr
1 12789 9656 Gtl 1923 1970 2
# Prediction
predict_model <- model_data %>% as.data.frame(model_data)
#class(predict_model)
predict_model$prediction <- exp(predict(full_model))
predict_model$residuals <- residuals(full_model)
predict_model[428, ]
price Lot.Area Land.Slope Year.Built Year.Remod.Add Bedroom.AbvGr
428 12789 9656 Gtl 1923 1970 2
prediction residuals
428 103176.2 -2.087853
House 428 has the largest residuals with a actual price of $12,789 and predicted price $103176.2 which is much higher. A number of factors may be responsible for this high price difference which include Year.Built(1923) and Year.Remod.Add(1970). The building was built 93 years ago and remodeled 50 years ago. The overall quality and condition of the building might be low.
Use the same model selection method you chose in Question 4 to again find the best multiple regression model to predict the natural log of home prices, but this time replacing Lot.Area with log(Lot.Area). Do you arrive at a model including the same set of predictors?
# Model with log of Lot.Area
full_model_log <- lm(log(price) ~ log(Lot.Area) + Land.Slope + Year.Built + Year.Remod.Add + Bedroom.AbvGr, data = model_data)
summary(full_model_log)
Call:
lm(formula = log(price) ~ log(Lot.Area) + Land.Slope + Year.Built +
Year.Remod.Add + Bedroom.AbvGr, data = model_data)
Residuals:
Min 1Q Median 3Q Max
-2.14050 -0.15650 -0.01561 0.15350 0.90854
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -1.553e+01 8.197e-01 -18.947 < 2e-16 ***
log(Lot.Area) 2.442e-01 1.708e-02 14.297 < 2e-16 ***
Land.SlopeMod 1.151e-01 4.734e-02 2.431 0.0152 *
Land.SlopeSev -6.554e-02 1.222e-01 -0.536 0.5917
Year.Built 5.981e-03 3.597e-04 16.628 < 2e-16 ***
Year.Remod.Add 6.734e-03 5.190e-04 12.975 < 2e-16 ***
Bedroom.AbvGr 5.909e-02 1.055e-02 5.599 2.8e-08 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 0.2649 on 993 degrees of freedom
Multiple R-squared: 0.6056, Adjusted R-squared: 0.6032
F-statistic: 254.1 on 6 and 993 DF, p-value: < 2.2e-16
The adjusted R2 is 0.6032. The highest adjusted R2(0.6032) is arrived at only when all the predictors were included.
Do you think it is better to log transform Lot.Area, in terms of assumptions for linear regression? Make graphs of the predicted values of log home price versus the true values of log home price for the regression models selected for Lot.Area and log(Lot.Area). Referencing these two plots, provide a written support that includes a quantitative justification for your answer in the first part of question 7.
# Model without log transformation
model_nolog <- full_model
# Model with log transformation
model_log <- full_model_log
# Display adjusted R squared values
m_log <- summary(model_log)$adj.r.squared
m_nolog <- summary(model_nolog)$adj.r.squared
a.R_squared <- rbind(m_log, m_nolog)
modell <- c("m_log", "m_nolog")
coln <- cbind(modell, a.R_squared)
dfl <- data.frame(coln)
names(dfl) <- c("Model", "R_squared")
head(dfl)
Model R_squared
m_log m_log 0.603201839590637
m_nolog m_nolog 0.559834474177341
# Select entities and graphics
pri_data <- model_data[, "price"]
pri_t <- subset(model_data, select = -c(price))
pri_log <- predict(full_model_log, pri_t, interval = "prediction", level = 0.95)
pri_log <- data.frame(pri_log)
pd_log <- sapply(pri_log[1], function(x) exp(x))
pri_nolog <- predict(full_model, pri_t, interval = "prediction", level = 0.95)
pri_nolog <- data.frame(pri_nolog)
pd_nolog <- sapply(pri_nolog[1], function(x) exp(x))
pd_log <- cbind(pd_log, pri_data)
names(pd_log) <- c("pd", "value")
pd_nolog <- cbind(pd_nolog, pri_data)
names(pd_nolog) <- c("pd", "value")
ggplot(pd_log, aes(x = pd, y = value)) + geom_point() + stat_smooth(method=lm, level=0.95) + geom_abline(intercept = 0, slope = 1)
ggplot(pd_nolog, aes(x = pd, y = value)) + geom_point() + stat_smooth(method=lm, level=0.95) + geom_abline(intercept = 0, slope = 1)
The scatterplot of the log model shows more evenly distributed points than the model without log transformation. The log model exhibited homoscedasticity. Thus the log model will give a better prediction.