```{r setup, include=FALSE} knitr::opts_chunk$set(echo = TRUE, message = FALSE, warning = FALSE)
library(ggplot2)
library(car) # For diagnostic tests like VIF
## Loading required package: carData
library(MASS) # For Cook's Distance
library(dplyr) # For data manipulation
##
## Attaching package: 'dplyr'
## The following object is masked from 'package:MASS':
##
## select
## The following object is masked from 'package:car':
##
## recode
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Read the dataset
ames <- read.csv('D:/Stats for DS/ames.csv', header = TRUE)
str(ames)
## 'data.frame': 2930 obs. of 82 variables:
## $ Order : int 1 2 3 4 5 6 7 8 9 10 ...
## $ PID : int 526301100 526350040 526351010 526353030 527105010 527105030 527127150 527145080 527146030 527162130 ...
## $ MS.SubClass : int 20 20 20 20 60 60 120 120 120 60 ...
## $ MS.Zoning : chr "RL" "RH" "RL" "RL" ...
## $ Lot.Frontage : int 141 80 81 93 74 78 41 43 39 60 ...
## $ Lot.Area : int 31770 11622 14267 11160 13830 9978 4920 5005 5389 7500 ...
## $ Street : chr "Pave" "Pave" "Pave" "Pave" ...
## $ Alley : chr NA NA NA NA ...
## $ Lot.Shape : chr "IR1" "Reg" "IR1" "Reg" ...
## $ Land.Contour : chr "Lvl" "Lvl" "Lvl" "Lvl" ...
## $ Utilities : chr "AllPub" "AllPub" "AllPub" "AllPub" ...
## $ Lot.Config : chr "Corner" "Inside" "Corner" "Corner" ...
## $ Land.Slope : chr "Gtl" "Gtl" "Gtl" "Gtl" ...
## $ Neighborhood : chr "NAmes" "NAmes" "NAmes" "NAmes" ...
## $ Condition.1 : chr "Norm" "Feedr" "Norm" "Norm" ...
## $ Condition.2 : chr "Norm" "Norm" "Norm" "Norm" ...
## $ Bldg.Type : chr "1Fam" "1Fam" "1Fam" "1Fam" ...
## $ House.Style : chr "1Story" "1Story" "1Story" "1Story" ...
## $ Overall.Qual : int 6 5 6 7 5 6 8 8 8 7 ...
## $ Overall.Cond : int 5 6 6 5 5 6 5 5 5 5 ...
## $ Year.Built : int 1960 1961 1958 1968 1997 1998 2001 1992 1995 1999 ...
## $ Year.Remod.Add : int 1960 1961 1958 1968 1998 1998 2001 1992 1996 1999 ...
## $ Roof.Style : chr "Hip" "Gable" "Hip" "Hip" ...
## $ Roof.Matl : chr "CompShg" "CompShg" "CompShg" "CompShg" ...
## $ Exterior.1st : chr "BrkFace" "VinylSd" "Wd Sdng" "BrkFace" ...
## $ Exterior.2nd : chr "Plywood" "VinylSd" "Wd Sdng" "BrkFace" ...
## $ Mas.Vnr.Type : chr "Stone" "None" "BrkFace" "None" ...
## $ Mas.Vnr.Area : int 112 0 108 0 0 20 0 0 0 0 ...
## $ Exter.Qual : chr "TA" "TA" "TA" "Gd" ...
## $ Exter.Cond : chr "TA" "TA" "TA" "TA" ...
## $ Foundation : chr "CBlock" "CBlock" "CBlock" "CBlock" ...
## $ Bsmt.Qual : chr "TA" "TA" "TA" "TA" ...
## $ Bsmt.Cond : chr "Gd" "TA" "TA" "TA" ...
## $ Bsmt.Exposure : chr "Gd" "No" "No" "No" ...
## $ BsmtFin.Type.1 : chr "BLQ" "Rec" "ALQ" "ALQ" ...
## $ BsmtFin.SF.1 : int 639 468 923 1065 791 602 616 263 1180 0 ...
## $ BsmtFin.Type.2 : chr "Unf" "LwQ" "Unf" "Unf" ...
## $ BsmtFin.SF.2 : int 0 144 0 0 0 0 0 0 0 0 ...
## $ Bsmt.Unf.SF : int 441 270 406 1045 137 324 722 1017 415 994 ...
## $ Total.Bsmt.SF : int 1080 882 1329 2110 928 926 1338 1280 1595 994 ...
## $ Heating : chr "GasA" "GasA" "GasA" "GasA" ...
## $ Heating.QC : chr "Fa" "TA" "TA" "Ex" ...
## $ Central.Air : chr "Y" "Y" "Y" "Y" ...
## $ Electrical : chr "SBrkr" "SBrkr" "SBrkr" "SBrkr" ...
## $ X1st.Flr.SF : int 1656 896 1329 2110 928 926 1338 1280 1616 1028 ...
## $ X2nd.Flr.SF : int 0 0 0 0 701 678 0 0 0 776 ...
## $ Low.Qual.Fin.SF: int 0 0 0 0 0 0 0 0 0 0 ...
## $ Gr.Liv.Area : int 1656 896 1329 2110 1629 1604 1338 1280 1616 1804 ...
## $ Bsmt.Full.Bath : int 1 0 0 1 0 0 1 0 1 0 ...
## $ Bsmt.Half.Bath : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Full.Bath : int 1 1 1 2 2 2 2 2 2 2 ...
## $ Half.Bath : int 0 0 1 1 1 1 0 0 0 1 ...
## $ Bedroom.AbvGr : int 3 2 3 3 3 3 2 2 2 3 ...
## $ Kitchen.AbvGr : int 1 1 1 1 1 1 1 1 1 1 ...
## $ Kitchen.Qual : chr "TA" "TA" "Gd" "Ex" ...
## $ TotRms.AbvGrd : int 7 5 6 8 6 7 6 5 5 7 ...
## $ Functional : chr "Typ" "Typ" "Typ" "Typ" ...
## $ Fireplaces : int 2 0 0 2 1 1 0 0 1 1 ...
## $ Fireplace.Qu : chr "Gd" NA NA "TA" ...
## $ Garage.Type : chr "Attchd" "Attchd" "Attchd" "Attchd" ...
## $ Garage.Yr.Blt : int 1960 1961 1958 1968 1997 1998 2001 1992 1995 1999 ...
## $ Garage.Finish : chr "Fin" "Unf" "Unf" "Fin" ...
## $ Garage.Cars : int 2 1 1 2 2 2 2 2 2 2 ...
## $ Garage.Area : int 528 730 312 522 482 470 582 506 608 442 ...
## $ Garage.Qual : chr "TA" "TA" "TA" "TA" ...
## $ Garage.Cond : chr "TA" "TA" "TA" "TA" ...
## $ Paved.Drive : chr "P" "Y" "Y" "Y" ...
## $ Wood.Deck.SF : int 210 140 393 0 212 360 0 0 237 140 ...
## $ Open.Porch.SF : int 62 0 36 0 34 36 0 82 152 60 ...
## $ Enclosed.Porch : int 0 0 0 0 0 0 170 0 0 0 ...
## $ X3Ssn.Porch : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Screen.Porch : int 0 120 0 0 0 0 0 144 0 0 ...
## $ Pool.Area : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Pool.QC : chr NA NA NA NA ...
## $ Fence : chr NA "MnPrv" NA NA ...
## $ Misc.Feature : chr NA NA "Gar2" NA ...
## $ Misc.Val : int 0 0 12500 0 0 0 0 0 0 0 ...
## $ Mo.Sold : int 5 6 6 4 3 6 4 1 3 6 ...
## $ Yr.Sold : int 2010 2010 2010 2010 2010 2010 2010 2010 2010 2010 ...
## $ Sale.Type : chr "WD " "WD " "WD " "WD " ...
## $ Sale.Condition : chr "Normal" "Normal" "Normal" "Normal" ...
## $ SalePrice : int 215000 105000 172000 244000 189900 195500 213500 191500 236500 189000 ...
# Build a simple linear regression model: SalePrice ~ Gr.Liv.Area
linear_model <- lm(SalePrice ~ Gr.Liv.Area, data = ames)
# Display the model summary
summary(linear_model)
##
## Call:
## lm(formula = SalePrice ~ Gr.Liv.Area, data = ames)
##
## Residuals:
## Min 1Q Median 3Q Max
## -483467 -30219 -1966 22728 334323
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 13289.634 3269.703 4.064 4.94e-05 ***
## Gr.Liv.Area 111.694 2.066 54.061 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 56520 on 2928 degrees of freedom
## Multiple R-squared: 0.4995, Adjusted R-squared: 0.4994
## F-statistic: 2923 on 1 and 2928 DF, p-value: < 2.2e-16
expanded_model <- lm(SalePrice ~ Gr.Liv.Area + Lot.Area + Total.Bsmt.SF, data = ames)
summary(expanded_model)
##
## Call:
## lm(formula = SalePrice ~ Gr.Liv.Area + Lot.Area + Total.Bsmt.SF,
## data = ames)
##
## Residuals:
## Min 1Q Median 3Q Max
## -735699 -21485 180 21084 267138
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -2.080e+04 3.039e+03 -6.846 9.19e-12 ***
## Gr.Liv.Area 8.326e+01 2.037e+00 40.877 < 2e-16 ***
## Lot.Area 1.633e-01 1.210e-01 1.350 0.177
## Total.Bsmt.SF 7.141e+01 2.315e+00 30.846 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 48900 on 2925 degrees of freedom
## (1 observation deleted due to missingness)
## Multiple R-squared: 0.6257, Adjusted R-squared: 0.6253
## F-statistic: 1630 on 3 and 2925 DF, p-value: < 2.2e-16
# We hypothesize that Gr.Liv.Area and Total.Bsmt.SF together may have an interactive effect on SalePrice
expanded_model_interaction <- lm(SalePrice ~ Gr.Liv.Area * Total.Bsmt.SF + Lot.Area, data = ames)
summary(expanded_model_interaction)
##
## Call:
## lm(formula = SalePrice ~ Gr.Liv.Area * Total.Bsmt.SF + Lot.Area,
## data = ames)
##
## Residuals:
## Min 1Q Median 3Q Max
## -324139 -22966 63 21888 303102
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -6.994e+04 4.799e+03 -14.575 <2e-16 ***
## Gr.Liv.Area 1.103e+02 2.874e+00 38.387 <2e-16 ***
## Total.Bsmt.SF 1.151e+02 4.045e+00 28.451 <2e-16 ***
## Lot.Area 2.550e-01 1.178e-01 2.164 0.0306 *
## Gr.Liv.Area:Total.Bsmt.SF -2.285e-02 1.758e-03 -12.996 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 47550 on 2924 degrees of freedom
## (1 observation deleted due to missingness)
## Multiple R-squared: 0.6461, Adjusted R-squared: 0.6456
## F-statistic: 1335 on 4 and 2924 DF, p-value: < 2.2e-16
1. `Lot.Area`: Included because the land size can directly influence property value.
2. `Total.Bsmt.SF`: Represents the total basement area; it may add value to the home.
3. Interaction term `Gr.Liv.Area:Total.Bsmt.SF`: Investigates if combined living area and basement area impact SalePrice.
# Using VIF to check for multicollinearity in the expanded model
vif_values <- vif(expanded_model_interaction)
## there are higher-order terms (interactions) in this model
## consider setting type = 'predictor'; see ?vif
print(vif_values)
## Gr.Liv.Area Total.Bsmt.SF Lot.Area
## 2.732521 4.113217 1.116951
## Gr.Liv.Area:Total.Bsmt.SF
## 7.169596
# If any VIF > 5-10, it suggests potential multicollinearity, indicating variables may need adjustment.
Before creating the diagnostic plots, filter the dataset to include only complete cases:
# Filter the dataset for complete cases
ames_complete <- ames[, c("SalePrice", "Gr.Liv.Area", "Lot.Area", "Total.Bsmt.SF")]
ames_complete <- na.omit(ames_complete) # Remove rows with missing values
# Fit the expanded linear regression model
expanded_model <- lm(SalePrice ~ Gr.Liv.Area + Lot.Area + Total.Bsmt.SF, data = ames_complete)
library(lindia)
library(gridExtra)
##
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
##
## combine
# Create Residuals vs. Fitted plot
residuals_vs_fitted <- ggplot(data = data.frame(fitted = fitted(expanded_model), residuals = residuals(expanded_model)),
aes(x = fitted, y = residuals)) +
geom_point() +
geom_hline(yintercept = 0, linetype = "dashed", color = "red") +
labs(title = "Residuals vs. Fitted", x = "Fitted Values", y = "Residuals") +
theme_minimal()
print(residuals_vs_fitted)
Ideal Outcome: Random scatter around the horizontal line indicates that assumptions of linearity and constant variance are likely met.
Issues Indicated: Patterns or “funnel shapes” suggest heteroscedasticity or model misspecification.
# Create Residuals vs. Gr.Liv.Area plot
residuals_vs_grlivarea <- ggplot(data = data.frame(Gr.Liv.Area = ames_complete$Gr.Liv.Area,
residuals = residuals(expanded_model)),
aes(x = Gr.Liv.Area, y = residuals)) +
geom_point() +
geom_hline(yintercept = 0, linetype = "dashed", color = "red") +
labs(title = "Residuals vs. Gr.Liv.Area", x = "Gr.Liv.Area", y = "Residuals") +
theme_minimal()
print(residuals_vs_grlivarea)
Explanation:
Here, we check if residuals correlate with Gr.Liv.Area. A random scatter
implies that the model adequately represents the influence of living
area on sale price. However, patterns suggest that our model might not
fully capture this relationship, indicating that we may need to explore
interaction terms or transformations.
# Create Residuals vs. LotArea plot
residuals_vs_lotarea <- ggplot(data = data.frame(Lot.Area = ames_complete$Lot.Area,
residuals = residuals(expanded_model)),
aes(x = Lot.Area, y = residuals)) +
geom_point() +
geom_hline(yintercept = 0, linetype = "dashed", color = "red") +
labs(title = "Residuals vs. LotArea", x = "LotArea", y = "Residuals") +
theme_minimal()
print(residuals_vs_lotarea)
Explanation:
This plot evaluates the influence of LotArea on residuals. If the
residuals show random behavior, our model likely captures the impact of
lot size well. Patterns could indicate a more complex relationship,
warranting further exploration of interactions or transformations
involving LotArea.
# Create Residuals vs. TotalBsmtSF plot
residuals_vs_totalbsmtsf <- ggplot(data = data.frame(Total.Bsmt.SF = ames_complete$Total.Bsmt.SF,
residuals = residuals(expanded_model)),
aes(x = Total.Bsmt.SF, y = residuals)) +
geom_point() +
geom_hline(yintercept = 0, linetype = "dashed", color = "red") +
labs(title = "Residuals vs. TotalBsmtSF", x = "TotalBsmtSF", y = "Residuals") +
theme_minimal()
print(residuals_vs_totalbsmtsf)
Explanation:
In this plot, we assess how well the model accounts for TotalBsmtSF.
Randomly distributed residuals indicate proper modeling. However, trends
(like increasing residuals with larger basement sizes) may point to
missed interactions, suggesting a need to refine the model.
# Create Residual Histogram
residual_histogram <- ggplot(data = data.frame(residuals = residuals(expanded_model)),
aes(x = residuals)) +
geom_histogram(bins = 30, fill = "blue", color = "black", alpha = 0.7) +
labs(title = "Histogram of Residuals", x = "Residuals", y = "Frequency") +
theme_minimal()
print(residual_histogram)
Explanation:
The histogram visualizes the distribution of residuals. A normal
distribution (bell curve) indicates that the model fits well. Skewness
or heavy tails signal potential outliers or missed influential factors,
requiring further investigation into the model structure.
# Create Q-Q Plot
qq_plot <- ggplot(data = data.frame(residuals = residuals(expanded_model)),
aes(sample = residuals)) +
stat_qq() +
stat_qq_line(color = "red") +
labs(title = "Q-Q Plot of Residuals") +
theme_minimal()
print(qq_plot)
Explanation:
The Q-Q plot assesses the normality of residuals. Points following the
diagonal line suggest normally distributed residuals, which is an
assumption of linear regression. Significant deviations may indicate
issues with model fit, prompting further investigation into
transformations.
# Calculate Cook's Distance
cooksd <- cooks.distance(expanded_model)
# Create Cook's Distance plot
cooksd_plot <- ggplot(data = data.frame(index = seq_along(cooksd), cooksd = cooksd),
aes(x = index, y = cooksd)) +
geom_bar(stat = "identity", fill = "blue") +
labs(title = "Cook's Distance", x = "Observation Index", y = "Cook's Distance") +
theme_minimal()
print(cooksd_plot)
Explanation:
Cook’s Distance identifies influential observations that can
disproportionately affect model results. High values (greater than 1)
suggest these observations should be examined further, as they may skew
coefficients and overall model performance.
Conclusion:
In this data dive, we expanded our regression model to include additional predictors and an interaction term. Diagnostic plots indicate whether our model meets the assumptions of linear regression. Addressing any violations found in these assumptions is crucial for improving model performance and validity.