set.seed(123)

n <- 200  

sqft <- round(rnorm(n, mean = 2000, sd = 500))         # House size (sqft)
bedrooms <- sample(2:6, n, replace = TRUE)             # Number of bedrooms
bathrooms <- sample(1:4, n, replace = TRUE)            # Number of bathrooms
age <- round(runif(n, 0, 50))                          # Age of house (years)
garage <- sample(0:1, n, replace = TRUE, prob = c(0.3, 0.7)) # Garage (0=no, 1=yes)
distance_city <- round(runif(n, 1, 30), 1)             # Distance to city center (km)

price <- 50000 + 
  150 * sqft + 
  10000 * bedrooms + 
  12000 * bathrooms - 
  800 * age + 
  15000 * garage - 
  2000 * distance_city +
  rnorm(n, mean = 0, sd = 20000)  # random noise


house_data <- data.frame(
  price,
  sqft,
  bedrooms,
  bathrooms,
  age,
  garage,
  distance_city
)


## Set up
summary(house_data)
##      price             sqft         bedrooms      bathrooms         age       
##  Min.   :164674   Min.   : 845   Min.   :2.00   Min.   :1.00   Min.   : 0.00  
##  1st Qu.:324351   1st Qu.:1687   1st Qu.:3.00   1st Qu.:1.00   1st Qu.:13.75  
##  Median :379585   Median :1970   Median :4.00   Median :3.00   Median :24.00  
##  Mean   :380861   Mean   :1996   Mean   :3.97   Mean   :2.57   Mean   :24.32  
##  3rd Qu.:438076   3rd Qu.:2284   3rd Qu.:5.00   3rd Qu.:4.00   3rd Qu.:36.00  
##  Max.   :644525   Max.   :3621   Max.   :6.00   Max.   :4.00   Max.   :50.00  
##      garage     distance_city   
##  Min.   :0.00   Min.   : 1.100  
##  1st Qu.:0.00   1st Qu.: 7.875  
##  Median :1.00   Median :14.850  
##  Mean   :0.71   Mean   :15.658  
##  3rd Qu.:1.00   3rd Qu.:22.700  
##  Max.   :1.00   Max.   :30.000
head(house_data)
##      price sqft bedrooms bathrooms age garage distance_city
## 1 313700.8 1720        4         3  48      0          13.7
## 2 352112.1 1885        2         2   8      0           5.9
## 3 555406.8 2779        6         4  26      1          21.7
## 4 288291.4 2035        2         1  44      1          22.7
## 5 378613.4 2065        5         4  43      1          25.5
## 6 535049.2 2858        3         3   1      1          16.5
names(house_data)
## [1] "price"         "sqft"          "bedrooms"      "bathrooms"    
## [5] "age"           "garage"        "distance_city"

Let’s review: Simple Linear Regression

price ~ sqft

m_simple <- lm(price ~ sqft, data = house_data)
summary(m_simple)
## 
## Call:
## lm(formula = price ~ sqft, data = house_data)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -98447 -23562  -1310  23668  92697 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 82325.85   11010.32   7.477 2.41e-12 ***
## sqft          149.59       5.37  27.857  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 35720 on 198 degrees of freedom
## Multiple R-squared:  0.7967, Adjusted R-squared:  0.7957 
## F-statistic:   776 on 1 and 198 DF,  p-value: < 2.2e-16
coef(m_simple)
## (Intercept)        sqft 
##  82325.8539    149.5885

Visualization

library(ggplot2)

# Fit the model
m_simple <- lm(price ~ sqft, data = house_data)

# Scatterplot with regression line
ggplot(house_data, aes(x = sqft, y = price)) +
  geom_point(color = "blue", size = 3) +
  geom_smooth(method = "lm", se = TRUE, color = "darkred") +
  labs(
    title = "Simple Linear Regression: price ~ sqft",
    x = "Displacement (cu.in.)",
    y = "Miles per Gallon (mpg)"
  ) +
  theme_light()

1) Correlation Check

We use correlation checks to avoid multicollinearity in a regression model because high correlation between predictor variables can lead to unstable coefficient estimates, inflated standard errors, and difficulty in interpreting the individual significance of each predictor.

vars <- house_data[, c("price", "sqft", "distance_city", "age")]
round(cor(vars), 4)
##                 price    sqft distance_city     age
## price          1.0000  0.8926       -0.2153 -0.1878
## sqft           0.8926  1.0000       -0.0172 -0.0350
## distance_city -0.2153 -0.0172        1.0000  0.0137
## age           -0.1878 -0.0350        0.0137  1.0000
  • Note: size (sqft) is the dominant driver of price, while distance and age have only weak negative associations, and the predictors themselves are nearly uncorrelated—good news for regression, since it reduces multicollinearity concerns.

  • Note: Predictors are generally independent –> no serious multicollinearity

2) Multiple Linear Regression (all predictors)

m_multiple <- lm(price ~ sqft + age + distance_city,data = house_data)

summary(m_multiple)
## 
## Call:
## lm(formula = price ~ sqft + age + distance_city, data = house_data)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -73218 -18590   1242  20194  71269 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   136131.726  10813.431  12.589  < 2e-16 ***
## sqft             148.113      4.476  33.094  < 2e-16 ***
## age             -893.906    154.828  -5.774 3.00e-08 ***
## distance_city  -1859.788    250.796  -7.416 3.57e-12 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 29750 on 196 degrees of freedom
## Multiple R-squared:  0.8604, Adjusted R-squared:  0.8583 
## F-statistic: 402.8 on 3 and 196 DF,  p-value: < 2.2e-16
coef(m_multiple)
##   (Intercept)          sqft           age distance_city 
##   136131.7255      148.1126     -893.9064    -1859.7877
  • Interpret β_j: Average effect on price of a one-unit increase in X_j, holding the other predictors fixed.
  • Compare coefficients/p-values here vs. simple regressions above.

3) Visual Relationship

To begin with,

ggplot(house_data, aes(x = sqft, y = price,
                       color = distance_city, size = age)) +
  geom_point(alpha = 0.8) +
  scale_size_continuous(range = c(2, 6)) +
  scale_color_viridis_c(option = "plasma", end = 0.9) +  # nicer color scale
  labs(
    title = "House Price vs. Square Footage",
    x = "Square Footage (sqft)",
    y = "House Price ($)",
    color = "Distance from the city (km)",
    size  = "Age (years)"
  ) +
  theme_minimal(base_size = 10) +
  theme(
    plot.title = element_text(face = "bold"),
    legend.position = "bottom"
  )

Or in another way,

house_data$price_fitted <- fitted(m_multiple)

ggplot(house_data, aes(x = price_fitted, y = price)) +
  geom_point(color = "steelblue", size = 2.5, alpha = 0.7) +
  geom_abline(slope = 1, intercept = 0, color = "red", linetype = "dashed") +
  labs(
    title = "Fitted vs. Actual House Prices",
    subtitle = "Red dashed line = perfect fit",
    x = "Fitted Price (from model)",
    y = "Observed Price"
  ) +
  theme_minimal(base_size = 10) +
  theme(plot.title = element_text(face = "bold"))

- Note: When you fit a regression model in R with lm(), the function fitted() extracts the predicted values of the response variable from that model.

-Note: Each blue point represents a house: its x-coordinate is the fitted (predicted) price from your regression model, and its y-coordinate is the observed (actual) price in the dataset.

The red dashed 45° line represents perfect prediction—if the model predicted exactly correctly, all points would fall on this line.

Or in another another way,

#install.packages("scatterplot3d")
library(scatterplot3d)
s3d <- scatterplot3d(
  x = house_data$sqft,
  y = house_data$price,
  z = house_data$distance_city,
  pch = 16,
  color = "steelblue",
  cex.symbols = scales::rescale(house_data$age, to = c(0.5, 1.5)),      
  angle = 40,               
  xlab = "Square Footage (sqft)",
  ylab = "House Price ($)",
  zlab = "Distance from City (km)",
  main = "3D Plot: Price vs. Sqft vs. Distance",
  grid = TRUE,                
  box = TRUE
)

-Note: If house price only depended on sqft, the points would form a rising band straight across.

If it only depended on distance, the points would form a declining band.

Because both work at the same time (sqft ↑ increases price, distance ↑ decreases price), the points spread across a sloping surface in 3D space — like laying a tilted sheet of paper inside the box. Plus, age ↑ decreases price.

4) Relationship Test (Overall F-test)

Question: Is at least one predictor useful?
Hypotheses:
- H0: β_hp = β_wt = β_disp = 0
- Ha: at least one β_j ≠ 0

The overall F-statistic and p-value appear in summary(m_multiple)

summary(m_multiple)$fstatistic
##  value  numdf  dendf 
## 402.76   3.00 196.00
summary(m_multiple)$coefficients 
##                  Estimate   Std. Error   t value     Pr(>|t|)
## (Intercept)   136131.7255 10813.431487 12.589133 5.102024e-27
## sqft             148.1126     4.475547 33.093743 3.584959e-82
## age             -893.9064   154.827503 -5.773563 3.000304e-08
## distance_city  -1859.7877   250.796315 -7.415530 3.566130e-12
pf(summary(m_multiple)$fstatistic[1],
   summary(m_multiple)$fstatistic[2],
   summary(m_multiple)$fstatistic[3],
   lower.tail = FALSE)
##        value 
## 1.614601e-83

Interpretation: Very small p-value → reject H0 → at least one predictor relates to price.

5) t-tests in multiple regression

summary(m_multiple)$coefficients 
##                  Estimate   Std. Error   t value     Pr(>|t|)
## (Intercept)   136131.7255 10813.431487 12.589133 5.102024e-27
## sqft             148.1126     4.475547 33.093743 3.584959e-82
## age             -893.9064   154.827503 -5.773563 3.000304e-08
## distance_city  -1859.7877   250.796315 -7.415530 3.566130e-12
summary(lm(price ~ sqft, data = house_data))
## 
## Call:
## lm(formula = price ~ sqft, data = house_data)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -98447 -23562  -1310  23668  92697 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 82325.85   11010.32   7.477 2.41e-12 ***
## sqft          149.59       5.37  27.857  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 35720 on 198 degrees of freedom
## Multiple R-squared:  0.7967, Adjusted R-squared:  0.7957 
## F-statistic:   776 on 1 and 198 DF,  p-value: < 2.2e-16
summary(lm(price ~ age, data = house_data))
## 
## Call:
## lm(formula = price ~ age, data = house_data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -216535  -56269    -819   51260  270939 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 407346.2    11276.3  36.124  < 2e-16 ***
## age          -1089.0      404.7  -2.691  0.00774 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 77820 on 198 degrees of freedom
## Multiple R-squared:  0.03528,    Adjusted R-squared:  0.03041 
## F-statistic: 7.241 on 1 and 198 DF,  p-value: 0.007735
summary(lm(price ~ distance_city, data = house_data))
## 
## Call:
## lm(formula = price ~ distance_city, data = house_data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -201336  -51998   -6613   50466  245747 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   412533.0    11583.9  35.613   <2e-16 ***
## distance_city  -2022.7      652.1  -3.102   0.0022 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 77370 on 198 degrees of freedom
## Multiple R-squared:  0.04634,    Adjusted R-squared:  0.04153 
## F-statistic: 9.622 on 1 and 198 DF,  p-value: 0.002204
  • Individual t-tests in summary(m_multiple) evaluate a single predictor given others.
m_price_sqft <- lm(price ~ sqft + age, data = house_data)   # drop distance_city
anova(m_price_sqft, m_multiple)  
## Analysis of Variance Table
## 
## Model 1: price ~ sqft + age
## Model 2: price ~ sqft + age + distance_city
##   Res.Df        RSS Df  Sum of Sq     F    Pr(>F)    
## 1    197 2.2216e+11                                  
## 2    196 1.7348e+11  1 4.8673e+10 54.99 3.566e-12 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Note: ANOVA asks: Does adding the extra predictor(s) significantly improve the model fit?

Interpretation: Adding distance_city reduces the residual error substantially.

6) Model Fit: RSE and R²

# Full model
summary(m_multiple)$r.squared        # R²
## [1] 0.8604267
summary(m_multiple)$adj.r.squared    # Adjusted R²
## [1] 0.8582904
# Let's play with that
m_sqft <- lm(price ~ sqft, data = house_data)
m_age  <- lm(price ~ age,  data = house_data)
r2_results <- data.frame(
  Model = c("sqft only", "age only", "multiple"),
  R2    = c(summary(m_sqft)$r.squared,
            summary(m_age)$r.squared,
            summary(m_multiple)$r.squared))

r2_results
##       Model         R2
## 1 sqft only 0.79671877
## 2  age only 0.03528022
## 3  multiple 0.86042670

7) Predictions: Point, Confidence Interval, Prediction Interval

Okay, let’s put it into a fictional scenario. There are two houses:

House 1: 1600 sqft, 10 years old, 8 km from city. House 2: 2400 sqft, 5 years old, 18 km from city.

# data
your_new_home <- data.frame(
  sqft          = c(1600, 2400),
  age           = c(10, 5),
  distance_city = c(8, 18)
)

# Point predictions
pred_point <- predict(m_multiple, newdata = your_new_home)

# 95% Confidence Interval --> mean price at those features
pred_ci <- predict(m_multiple, newdata = your_new_home,
                   interval = "confidence", level = 0.95)

# 95% Prediction Interval --> individual house price
pred_pi <- predict(m_multiple, newdata = your_new_home,
                   interval = "prediction", level = 0.95)

pred_point
##        1        2 
## 349294.5 453656.3
pred_ci
##        fit      lwr      upr
## 1 349294.5 341297.8 357291.3
## 2 453656.3 445597.6 461715.0
pred_pi
##        fit      lwr      upr
## 1 349294.5 290078.9 408510.1
## 2 453656.3 394432.2 512880.3

8) Visual Diagnostics

par(mfrow = c(2, 2))
plot(m_multiple)  # residuals vs fitted, QQ-plot, scale-location, Cook's distance/leverage

par(mfrow = c(1, 1))

1. Residuals vs Fitted

  • Purpose: Checks linearity and constant variance.
  • Ideal: Points scattered randomly around 0 (no curve, no funnel).
  • Interpretation: The plot looks fairly flat, suggesting linearity holds. Some spread is visible but no strong heteroskedasticity problem.

2. Normal Q-Q

  • Purpose: Checks whether residuals are normally distributed.
  • Ideal: Points lie on the 45° line.
  • Interpretation: Most points follow the line, with only slight deviation at the tails → normality assumption is reasonable.

3. Scale-Location (Spread-Location)

  • Purpose: Checks homoscedasticity (equal variance across fitted values).
  • Ideal: Horizontal line with random scatter.
  • Interpretation: The spread looks fairly even, no strong pattern → variance appears stable.

4. Residuals vs Leverage

  • Purpose: Checks for influential points (cases that strongly affect the model).
  • Ideal: Most points in the middle, no extreme outliers.
  • Interpretation: A few cases (e.g., #144, #44) may have higher leverage → worth checking, but not severe.