```{r setup, include=FALSE} knitr::opts_chunk$set(echo = TRUE, message = FALSE, warning = FALSE)

Load necessary libraries

library(ggplot2) 
library(car) # For diagnostic tests like VIF 
## Loading required package: carData
library(MASS) # For Cook's Distance 
library(dplyr) # For data manipulation
## 
## Attaching package: 'dplyr'
## The following object is masked from 'package:MASS':
## 
##     select
## The following object is masked from 'package:car':
## 
##     recode
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

Step 1: Load the data

# Read the dataset
ames <- read.csv('D:/Stats for DS/ames.csv', header = TRUE)
str(ames)
## 'data.frame':    2930 obs. of  82 variables:
##  $ Order          : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ PID            : int  526301100 526350040 526351010 526353030 527105010 527105030 527127150 527145080 527146030 527162130 ...
##  $ MS.SubClass    : int  20 20 20 20 60 60 120 120 120 60 ...
##  $ MS.Zoning      : chr  "RL" "RH" "RL" "RL" ...
##  $ Lot.Frontage   : int  141 80 81 93 74 78 41 43 39 60 ...
##  $ Lot.Area       : int  31770 11622 14267 11160 13830 9978 4920 5005 5389 7500 ...
##  $ Street         : chr  "Pave" "Pave" "Pave" "Pave" ...
##  $ Alley          : chr  NA NA NA NA ...
##  $ Lot.Shape      : chr  "IR1" "Reg" "IR1" "Reg" ...
##  $ Land.Contour   : chr  "Lvl" "Lvl" "Lvl" "Lvl" ...
##  $ Utilities      : chr  "AllPub" "AllPub" "AllPub" "AllPub" ...
##  $ Lot.Config     : chr  "Corner" "Inside" "Corner" "Corner" ...
##  $ Land.Slope     : chr  "Gtl" "Gtl" "Gtl" "Gtl" ...
##  $ Neighborhood   : chr  "NAmes" "NAmes" "NAmes" "NAmes" ...
##  $ Condition.1    : chr  "Norm" "Feedr" "Norm" "Norm" ...
##  $ Condition.2    : chr  "Norm" "Norm" "Norm" "Norm" ...
##  $ Bldg.Type      : chr  "1Fam" "1Fam" "1Fam" "1Fam" ...
##  $ House.Style    : chr  "1Story" "1Story" "1Story" "1Story" ...
##  $ Overall.Qual   : int  6 5 6 7 5 6 8 8 8 7 ...
##  $ Overall.Cond   : int  5 6 6 5 5 6 5 5 5 5 ...
##  $ Year.Built     : int  1960 1961 1958 1968 1997 1998 2001 1992 1995 1999 ...
##  $ Year.Remod.Add : int  1960 1961 1958 1968 1998 1998 2001 1992 1996 1999 ...
##  $ Roof.Style     : chr  "Hip" "Gable" "Hip" "Hip" ...
##  $ Roof.Matl      : chr  "CompShg" "CompShg" "CompShg" "CompShg" ...
##  $ Exterior.1st   : chr  "BrkFace" "VinylSd" "Wd Sdng" "BrkFace" ...
##  $ Exterior.2nd   : chr  "Plywood" "VinylSd" "Wd Sdng" "BrkFace" ...
##  $ Mas.Vnr.Type   : chr  "Stone" "None" "BrkFace" "None" ...
##  $ Mas.Vnr.Area   : int  112 0 108 0 0 20 0 0 0 0 ...
##  $ Exter.Qual     : chr  "TA" "TA" "TA" "Gd" ...
##  $ Exter.Cond     : chr  "TA" "TA" "TA" "TA" ...
##  $ Foundation     : chr  "CBlock" "CBlock" "CBlock" "CBlock" ...
##  $ Bsmt.Qual      : chr  "TA" "TA" "TA" "TA" ...
##  $ Bsmt.Cond      : chr  "Gd" "TA" "TA" "TA" ...
##  $ Bsmt.Exposure  : chr  "Gd" "No" "No" "No" ...
##  $ BsmtFin.Type.1 : chr  "BLQ" "Rec" "ALQ" "ALQ" ...
##  $ BsmtFin.SF.1   : int  639 468 923 1065 791 602 616 263 1180 0 ...
##  $ BsmtFin.Type.2 : chr  "Unf" "LwQ" "Unf" "Unf" ...
##  $ BsmtFin.SF.2   : int  0 144 0 0 0 0 0 0 0 0 ...
##  $ Bsmt.Unf.SF    : int  441 270 406 1045 137 324 722 1017 415 994 ...
##  $ Total.Bsmt.SF  : int  1080 882 1329 2110 928 926 1338 1280 1595 994 ...
##  $ Heating        : chr  "GasA" "GasA" "GasA" "GasA" ...
##  $ Heating.QC     : chr  "Fa" "TA" "TA" "Ex" ...
##  $ Central.Air    : chr  "Y" "Y" "Y" "Y" ...
##  $ Electrical     : chr  "SBrkr" "SBrkr" "SBrkr" "SBrkr" ...
##  $ X1st.Flr.SF    : int  1656 896 1329 2110 928 926 1338 1280 1616 1028 ...
##  $ X2nd.Flr.SF    : int  0 0 0 0 701 678 0 0 0 776 ...
##  $ Low.Qual.Fin.SF: int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Gr.Liv.Area    : int  1656 896 1329 2110 1629 1604 1338 1280 1616 1804 ...
##  $ Bsmt.Full.Bath : int  1 0 0 1 0 0 1 0 1 0 ...
##  $ Bsmt.Half.Bath : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Full.Bath      : int  1 1 1 2 2 2 2 2 2 2 ...
##  $ Half.Bath      : int  0 0 1 1 1 1 0 0 0 1 ...
##  $ Bedroom.AbvGr  : int  3 2 3 3 3 3 2 2 2 3 ...
##  $ Kitchen.AbvGr  : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ Kitchen.Qual   : chr  "TA" "TA" "Gd" "Ex" ...
##  $ TotRms.AbvGrd  : int  7 5 6 8 6 7 6 5 5 7 ...
##  $ Functional     : chr  "Typ" "Typ" "Typ" "Typ" ...
##  $ Fireplaces     : int  2 0 0 2 1 1 0 0 1 1 ...
##  $ Fireplace.Qu   : chr  "Gd" NA NA "TA" ...
##  $ Garage.Type    : chr  "Attchd" "Attchd" "Attchd" "Attchd" ...
##  $ Garage.Yr.Blt  : int  1960 1961 1958 1968 1997 1998 2001 1992 1995 1999 ...
##  $ Garage.Finish  : chr  "Fin" "Unf" "Unf" "Fin" ...
##  $ Garage.Cars    : int  2 1 1 2 2 2 2 2 2 2 ...
##  $ Garage.Area    : int  528 730 312 522 482 470 582 506 608 442 ...
##  $ Garage.Qual    : chr  "TA" "TA" "TA" "TA" ...
##  $ Garage.Cond    : chr  "TA" "TA" "TA" "TA" ...
##  $ Paved.Drive    : chr  "P" "Y" "Y" "Y" ...
##  $ Wood.Deck.SF   : int  210 140 393 0 212 360 0 0 237 140 ...
##  $ Open.Porch.SF  : int  62 0 36 0 34 36 0 82 152 60 ...
##  $ Enclosed.Porch : int  0 0 0 0 0 0 170 0 0 0 ...
##  $ X3Ssn.Porch    : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Screen.Porch   : int  0 120 0 0 0 0 0 144 0 0 ...
##  $ Pool.Area      : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Pool.QC        : chr  NA NA NA NA ...
##  $ Fence          : chr  NA "MnPrv" NA NA ...
##  $ Misc.Feature   : chr  NA NA "Gar2" NA ...
##  $ Misc.Val       : int  0 0 12500 0 0 0 0 0 0 0 ...
##  $ Mo.Sold        : int  5 6 6 4 3 6 4 1 3 6 ...
##  $ Yr.Sold        : int  2010 2010 2010 2010 2010 2010 2010 2010 2010 2010 ...
##  $ Sale.Type      : chr  "WD " "WD " "WD " "WD " ...
##  $ Sale.Condition : chr  "Normal" "Normal" "Normal" "Normal" ...
##  $ SalePrice      : int  215000 105000 172000 244000 189900 195500 213500 191500 236500 189000 ...

Original Model

# Build a simple linear regression model: SalePrice ~ Gr.Liv.Area
linear_model <- lm(SalePrice ~ Gr.Liv.Area, data = ames)

# Display the model summary
summary(linear_model)
## 
## Call:
## lm(formula = SalePrice ~ Gr.Liv.Area, data = ames)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -483467  -30219   -1966   22728  334323 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 13289.634   3269.703   4.064 4.94e-05 ***
## Gr.Liv.Area   111.694      2.066  54.061  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 56520 on 2928 degrees of freedom
## Multiple R-squared:  0.4995, Adjusted R-squared:  0.4994 
## F-statistic:  2923 on 1 and 2928 DF,  p-value: < 2.2e-16

Expanded Model: Including Lot.Area and Total.Bsmt.SF

expanded_model <- lm(SalePrice ~ Gr.Liv.Area + Lot.Area + Total.Bsmt.SF, data = ames)
summary(expanded_model)
## 
## Call:
## lm(formula = SalePrice ~ Gr.Liv.Area + Lot.Area + Total.Bsmt.SF, 
##     data = ames)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -735699  -21485     180   21084  267138 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   -2.080e+04  3.039e+03  -6.846 9.19e-12 ***
## Gr.Liv.Area    8.326e+01  2.037e+00  40.877  < 2e-16 ***
## Lot.Area       1.633e-01  1.210e-01   1.350    0.177    
## Total.Bsmt.SF  7.141e+01  2.315e+00  30.846  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 48900 on 2925 degrees of freedom
##   (1 observation deleted due to missingness)
## Multiple R-squared:  0.6257, Adjusted R-squared:  0.6253 
## F-statistic:  1630 on 3 and 2925 DF,  p-value: < 2.2e-16

Step 2: Add Interaction Term

# We hypothesize that Gr.Liv.Area and Total.Bsmt.SF together may have an interactive effect on SalePrice

expanded_model_interaction <- lm(SalePrice ~ Gr.Liv.Area * Total.Bsmt.SF + Lot.Area, data = ames)
summary(expanded_model_interaction)
## 
## Call:
## lm(formula = SalePrice ~ Gr.Liv.Area * Total.Bsmt.SF + Lot.Area, 
##     data = ames)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -324139  -22966      63   21888  303102 
## 
## Coefficients:
##                             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)               -6.994e+04  4.799e+03 -14.575   <2e-16 ***
## Gr.Liv.Area                1.103e+02  2.874e+00  38.387   <2e-16 ***
## Total.Bsmt.SF              1.151e+02  4.045e+00  28.451   <2e-16 ***
## Lot.Area                   2.550e-01  1.178e-01   2.164   0.0306 *  
## Gr.Liv.Area:Total.Bsmt.SF -2.285e-02  1.758e-03 -12.996   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 47550 on 2924 degrees of freedom
##   (1 observation deleted due to missingness)
## Multiple R-squared:  0.6461, Adjusted R-squared:  0.6456 
## F-statistic:  1335 on 4 and 2924 DF,  p-value: < 2.2e-16

# Explanation of Variables:

1. `Lot.Area`: Included because the land size can directly influence property value.

2. `Total.Bsmt.SF`: Represents the total basement area; it may add value to the home.

3. Interaction term `Gr.Liv.Area:Total.Bsmt.SF`: Investigates if combined living area and basement area impact SalePrice.

Step 3: Multicollinearity Check

# Using VIF to check for multicollinearity in the expanded model

vif_values <- vif(expanded_model_interaction)
## there are higher-order terms (interactions) in this model
## consider setting type = 'predictor'; see ?vif
print(vif_values)
##               Gr.Liv.Area             Total.Bsmt.SF                  Lot.Area 
##                  2.732521                  4.113217                  1.116951 
## Gr.Liv.Area:Total.Bsmt.SF 
##                  7.169596

# Interpretation:

# If any VIF > 5-10, it suggests potential multicollinearity, indicating variables may need adjustment.

Create a Data Frame with Complete Cases

Before creating the diagnostic plots, filter the dataset to include only complete cases:

# Filter the dataset for complete cases
ames_complete <- ames[, c("SalePrice", "Gr.Liv.Area", "Lot.Area", "Total.Bsmt.SF")]
ames_complete <- na.omit(ames_complete)  # Remove rows with missing values

# Fit the expanded linear regression model
expanded_model <- lm(SalePrice ~ Gr.Liv.Area + Lot.Area + Total.Bsmt.SF, data = ames_complete)

Step 4: Diagnostic Plots to Evaluate Model Assumptions

library(lindia)
library(gridExtra)
## 
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
## 
##     combine

Diagnostic Plot 1: Residuals vs. Fitted Values

# Create Residuals vs. Fitted plot
residuals_vs_fitted <- ggplot(data = data.frame(fitted = fitted(expanded_model), residuals = residuals(expanded_model)), 
                               aes(x = fitted, y = residuals)) +
  geom_point() +
  geom_hline(yintercept = 0, linetype = "dashed", color = "red") +
  labs(title = "Residuals vs. Fitted", x = "Fitted Values", y = "Residuals") +
  theme_minimal()

print(residuals_vs_fitted)

Interpretation:

  • Ideal Outcome: Random scatter around the horizontal line indicates that assumptions of linearity and constant variance are likely met.

  • Issues Indicated: Patterns or “funnel shapes” suggest heteroscedasticity or model misspecification.

Diagnostic Plot 2: Residuals vs. Predictor Values

# Create Residuals vs. Gr.Liv.Area plot
residuals_vs_grlivarea <- ggplot(data = data.frame(Gr.Liv.Area = ames_complete$Gr.Liv.Area, 
                                                     residuals = residuals(expanded_model)), 
                                  aes(x = Gr.Liv.Area, y = residuals)) +
  geom_point() +
  geom_hline(yintercept = 0, linetype = "dashed", color = "red") +
  labs(title = "Residuals vs. Gr.Liv.Area", x = "Gr.Liv.Area", y = "Residuals") +
  theme_minimal()

print(residuals_vs_grlivarea)

Explanation:
Here, we check if residuals correlate with Gr.Liv.Area. A random scatter implies that the model adequately represents the influence of living area on sale price. However, patterns suggest that our model might not fully capture this relationship, indicating that we may need to explore interaction terms or transformations.

# Create Residuals vs. LotArea plot
residuals_vs_lotarea <- ggplot(data = data.frame(Lot.Area = ames_complete$Lot.Area, 
                                                  residuals = residuals(expanded_model)), 
                               aes(x = Lot.Area, y = residuals)) +
  geom_point() +
  geom_hline(yintercept = 0, linetype = "dashed", color = "red") +
  labs(title = "Residuals vs. LotArea", x = "LotArea", y = "Residuals") +
  theme_minimal()

print(residuals_vs_lotarea)

Explanation:
This plot evaluates the influence of LotArea on residuals. If the residuals show random behavior, our model likely captures the impact of lot size well. Patterns could indicate a more complex relationship, warranting further exploration of interactions or transformations involving LotArea.

# Create Residuals vs. TotalBsmtSF plot
residuals_vs_totalbsmtsf <- ggplot(data = data.frame(Total.Bsmt.SF = ames_complete$Total.Bsmt.SF, 
                                                      residuals = residuals(expanded_model)), 
                                   aes(x = Total.Bsmt.SF, y = residuals)) +
  geom_point() +
  geom_hline(yintercept = 0, linetype = "dashed", color = "red") +
  labs(title = "Residuals vs. TotalBsmtSF", x = "TotalBsmtSF", y = "Residuals") +
  theme_minimal()

print(residuals_vs_totalbsmtsf)

Explanation:
In this plot, we assess how well the model accounts for TotalBsmtSF. Randomly distributed residuals indicate proper modeling. However, trends (like increasing residuals with larger basement sizes) may point to missed interactions, suggesting a need to refine the model.

Diagnostic Plot 3: Residual Histogram

# Create Residual Histogram
residual_histogram <- ggplot(data = data.frame(residuals = residuals(expanded_model)), 
                              aes(x = residuals)) +
  geom_histogram(bins = 30, fill = "blue", color = "black", alpha = 0.7) +
  labs(title = "Histogram of Residuals", x = "Residuals", y = "Frequency") +
  theme_minimal()

print(residual_histogram)

Explanation:
The histogram visualizes the distribution of residuals. A normal distribution (bell curve) indicates that the model fits well. Skewness or heavy tails signal potential outliers or missed influential factors, requiring further investigation into the model structure.

Diagnostic Plot 4: Q-Q Plot

# Create Q-Q Plot
qq_plot <- ggplot(data = data.frame(residuals = residuals(expanded_model)), 
                   aes(sample = residuals)) +
  stat_qq() +
  stat_qq_line(color = "red") +
  labs(title = "Q-Q Plot of Residuals") +
  theme_minimal()

print(qq_plot)

Explanation:
The Q-Q plot assesses the normality of residuals. Points following the diagonal line suggest normally distributed residuals, which is an assumption of linear regression. Significant deviations may indicate issues with model fit, prompting further investigation into transformations.

Diagnostic Plot 5: Cook’s Distance by Observation

# Calculate Cook's Distance
cooksd <- cooks.distance(expanded_model)

# Create Cook's Distance plot
cooksd_plot <- ggplot(data = data.frame(index = seq_along(cooksd), cooksd = cooksd), 
                      aes(x = index, y = cooksd)) +
  geom_bar(stat = "identity", fill = "blue") +
  labs(title = "Cook's Distance", x = "Observation Index", y = "Cook's Distance") +
  theme_minimal()

print(cooksd_plot)

Explanation:
Cook’s Distance identifies influential observations that can disproportionately affect model results. High values (greater than 1) suggest these observations should be examined further, as they may skew coefficients and overall model performance.

Conclusion:

In this data dive, we expanded our regression model to include additional predictors and an interaction term. Diagnostic plots indicate whether our model meets the assumptions of linear regression. Addressing any violations found in these assumptions is crucial for improving model performance and validity.