set.seed(123)
n <- 200
sqft <- round(rnorm(n, mean = 2000, sd = 500)) # House size (sqft)
bedrooms <- sample(2:6, n, replace = TRUE) # Number of bedrooms
bathrooms <- sample(1:4, n, replace = TRUE) # Number of bathrooms
age <- round(runif(n, 0, 50)) # Age of house (years)
garage <- sample(0:1, n, replace = TRUE, prob = c(0.3, 0.7)) # Garage (0=no, 1=yes)
distance_city <- round(runif(n, 1, 30), 1) # Distance to city center (km)
price <- 50000 +
150 * sqft +
10000 * bedrooms +
12000 * bathrooms -
800 * age +
15000 * garage -
2000 * distance_city +
rnorm(n, mean = 0, sd = 20000) # random noise
house_data <- data.frame(
price,
sqft,
bedrooms,
bathrooms,
age,
garage,
distance_city
)
## Set up
summary(house_data)
## price sqft bedrooms bathrooms age
## Min. :164674 Min. : 845 Min. :2.00 Min. :1.00 Min. : 0.00
## 1st Qu.:324351 1st Qu.:1687 1st Qu.:3.00 1st Qu.:1.00 1st Qu.:13.75
## Median :379585 Median :1970 Median :4.00 Median :3.00 Median :24.00
## Mean :380861 Mean :1996 Mean :3.97 Mean :2.57 Mean :24.32
## 3rd Qu.:438076 3rd Qu.:2284 3rd Qu.:5.00 3rd Qu.:4.00 3rd Qu.:36.00
## Max. :644525 Max. :3621 Max. :6.00 Max. :4.00 Max. :50.00
## garage distance_city
## Min. :0.00 Min. : 1.100
## 1st Qu.:0.00 1st Qu.: 7.875
## Median :1.00 Median :14.850
## Mean :0.71 Mean :15.658
## 3rd Qu.:1.00 3rd Qu.:22.700
## Max. :1.00 Max. :30.000
head(house_data)
## price sqft bedrooms bathrooms age garage distance_city
## 1 313700.8 1720 4 3 48 0 13.7
## 2 352112.1 1885 2 2 8 0 5.9
## 3 555406.8 2779 6 4 26 1 21.7
## 4 288291.4 2035 2 1 44 1 22.7
## 5 378613.4 2065 5 4 43 1 25.5
## 6 535049.2 2858 3 3 1 1 16.5
names(house_data)
## [1] "price" "sqft" "bedrooms" "bathrooms"
## [5] "age" "garage" "distance_city"
price ~ sqft
m_simple <- lm(price ~ sqft, data = house_data)
summary(m_simple)
##
## Call:
## lm(formula = price ~ sqft, data = house_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -98447 -23562 -1310 23668 92697
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 82325.85 11010.32 7.477 2.41e-12 ***
## sqft 149.59 5.37 27.857 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 35720 on 198 degrees of freedom
## Multiple R-squared: 0.7967, Adjusted R-squared: 0.7957
## F-statistic: 776 on 1 and 198 DF, p-value: < 2.2e-16
coef(m_simple)
## (Intercept) sqft
## 82325.8539 149.5885
Visualization
library(ggplot2)
# Fit the model
m_simple <- lm(price ~ sqft, data = house_data)
# Scatterplot with regression line
ggplot(house_data, aes(x = sqft, y = price)) +
geom_point(color = "blue", size = 3) +
geom_smooth(method = "lm", se = TRUE, color = "darkred") +
labs(
title = "Simple Linear Regression: price ~ sqft",
x = "Displacement (cu.in.)",
y = "Miles per Gallon (mpg)"
) +
theme_light()
We use correlation checks to avoid multicollinearity in a regression model because high correlation between predictor variables can lead to unstable coefficient estimates, inflated standard errors, and difficulty in interpreting the individual significance of each predictor.
vars <- house_data[, c("price", "sqft", "distance_city", "age")]
round(cor(vars), 4)
## price sqft distance_city age
## price 1.0000 0.8926 -0.2153 -0.1878
## sqft 0.8926 1.0000 -0.0172 -0.0350
## distance_city -0.2153 -0.0172 1.0000 0.0137
## age -0.1878 -0.0350 0.0137 1.0000
Note: size (sqft) is the dominant driver of price, while distance and age have only weak negative associations, and the predictors themselves are nearly uncorrelated—good news for regression, since it reduces multicollinearity concerns.
Note: Predictors are generally independent –> no serious multicollinearity
m_multiple <- lm(price ~ sqft + age + distance_city,data = house_data)
summary(m_multiple)
##
## Call:
## lm(formula = price ~ sqft + age + distance_city, data = house_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -73218 -18590 1242 20194 71269
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 136131.726 10813.431 12.589 < 2e-16 ***
## sqft 148.113 4.476 33.094 < 2e-16 ***
## age -893.906 154.828 -5.774 3.00e-08 ***
## distance_city -1859.788 250.796 -7.416 3.57e-12 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 29750 on 196 degrees of freedom
## Multiple R-squared: 0.8604, Adjusted R-squared: 0.8583
## F-statistic: 402.8 on 3 and 196 DF, p-value: < 2.2e-16
coef(m_multiple)
## (Intercept) sqft age distance_city
## 136131.7255 148.1126 -893.9064 -1859.7877
price
of a one-unit increase in X_j, holding the other
predictors fixed.To begin with,
ggplot(house_data, aes(x = sqft, y = price,
color = distance_city, size = age)) +
geom_point(alpha = 0.8) +
scale_size_continuous(range = c(2, 6)) +
scale_color_viridis_c(option = "plasma", end = 0.9) + # nicer color scale
labs(
title = "House Price vs. Square Footage",
x = "Square Footage (sqft)",
y = "House Price ($)",
color = "Distance from the city (km)",
size = "Age (years)"
) +
theme_minimal(base_size = 10) +
theme(
plot.title = element_text(face = "bold"),
legend.position = "bottom"
)
Or in another way,
house_data$price_fitted <- fitted(m_multiple)
ggplot(house_data, aes(x = price_fitted, y = price)) +
geom_point(color = "steelblue", size = 2.5, alpha = 0.7) +
geom_abline(slope = 1, intercept = 0, color = "red", linetype = "dashed") +
labs(
title = "Fitted vs. Actual House Prices",
subtitle = "Red dashed line = perfect fit",
x = "Fitted Price (from model)",
y = "Observed Price"
) +
theme_minimal(base_size = 10) +
theme(plot.title = element_text(face = "bold"))
- Note: When you fit a regression model in R with lm(),
the function fitted() extracts the predicted values of the response
variable from that model.
-Note: Each blue point represents a house: its x-coordinate is the fitted (predicted) price from your regression model, and its y-coordinate is the observed (actual) price in the dataset.
The red dashed 45° line represents perfect prediction—if the model predicted exactly correctly, all points would fall on this line.
Or in another another way,
#install.packages("scatterplot3d")
library(scatterplot3d)
s3d <- scatterplot3d(
x = house_data$sqft,
y = house_data$price,
z = house_data$distance_city,
pch = 16,
color = "steelblue",
cex.symbols = scales::rescale(house_data$age, to = c(0.5, 1.5)),
angle = 40,
xlab = "Square Footage (sqft)",
ylab = "House Price ($)",
zlab = "Distance from City (km)",
main = "3D Plot: Price vs. Sqft vs. Distance",
grid = TRUE,
box = TRUE
)
-Note: If house price only depended on sqft, the points
would form a rising band straight across.
If it only depended on distance, the points would form a declining band.
Because both work at the same time (sqft ↑ increases price, distance ↑ decreases price), the points spread across a sloping surface in 3D space — like laying a tilted sheet of paper inside the box. Plus, age ↑ decreases price.
Question: Is at least one predictor
useful?
Hypotheses:
- H0: β_hp = β_wt = β_disp = 0
- Ha: at least one β_j ≠ 0
The overall F-statistic and p-value appear in summary(m_multiple)
summary(m_multiple)$fstatistic
## value numdf dendf
## 402.76 3.00 196.00
summary(m_multiple)$coefficients
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 136131.7255 10813.431487 12.589133 5.102024e-27
## sqft 148.1126 4.475547 33.093743 3.584959e-82
## age -893.9064 154.827503 -5.773563 3.000304e-08
## distance_city -1859.7877 250.796315 -7.415530 3.566130e-12
pf(summary(m_multiple)$fstatistic[1],
summary(m_multiple)$fstatistic[2],
summary(m_multiple)$fstatistic[3],
lower.tail = FALSE)
## value
## 1.614601e-83
Interpretation: Very small p-value → reject H0 → at least one predictor relates to
price.
summary(m_multiple)$coefficients
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 136131.7255 10813.431487 12.589133 5.102024e-27
## sqft 148.1126 4.475547 33.093743 3.584959e-82
## age -893.9064 154.827503 -5.773563 3.000304e-08
## distance_city -1859.7877 250.796315 -7.415530 3.566130e-12
summary(lm(price ~ sqft, data = house_data))
##
## Call:
## lm(formula = price ~ sqft, data = house_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -98447 -23562 -1310 23668 92697
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 82325.85 11010.32 7.477 2.41e-12 ***
## sqft 149.59 5.37 27.857 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 35720 on 198 degrees of freedom
## Multiple R-squared: 0.7967, Adjusted R-squared: 0.7957
## F-statistic: 776 on 1 and 198 DF, p-value: < 2.2e-16
summary(lm(price ~ age, data = house_data))
##
## Call:
## lm(formula = price ~ age, data = house_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -216535 -56269 -819 51260 270939
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 407346.2 11276.3 36.124 < 2e-16 ***
## age -1089.0 404.7 -2.691 0.00774 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 77820 on 198 degrees of freedom
## Multiple R-squared: 0.03528, Adjusted R-squared: 0.03041
## F-statistic: 7.241 on 1 and 198 DF, p-value: 0.007735
summary(lm(price ~ distance_city, data = house_data))
##
## Call:
## lm(formula = price ~ distance_city, data = house_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -201336 -51998 -6613 50466 245747
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 412533.0 11583.9 35.613 <2e-16 ***
## distance_city -2022.7 652.1 -3.102 0.0022 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 77370 on 198 degrees of freedom
## Multiple R-squared: 0.04634, Adjusted R-squared: 0.04153
## F-statistic: 9.622 on 1 and 198 DF, p-value: 0.002204
summary(m_multiple) evaluate a single predictor
given others.m_price_sqft <- lm(price ~ sqft + age, data = house_data) # drop distance_city
anova(m_price_sqft, m_multiple)
## Analysis of Variance Table
##
## Model 1: price ~ sqft + age
## Model 2: price ~ sqft + age + distance_city
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 197 2.2216e+11
## 2 196 1.7348e+11 1 4.8673e+10 54.99 3.566e-12 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Note: ANOVA asks: Does adding the extra predictor(s) significantly improve the model fit?
Interpretation: Adding distance_city reduces the residual error substantially.
# Full model
summary(m_multiple)$r.squared # R²
## [1] 0.8604267
summary(m_multiple)$adj.r.squared # Adjusted R²
## [1] 0.8582904
# Let's play with that
m_sqft <- lm(price ~ sqft, data = house_data)
m_age <- lm(price ~ age, data = house_data)
r2_results <- data.frame(
Model = c("sqft only", "age only", "multiple"),
R2 = c(summary(m_sqft)$r.squared,
summary(m_age)$r.squared,
summary(m_multiple)$r.squared))
r2_results
## Model R2
## 1 sqft only 0.79671877
## 2 age only 0.03528022
## 3 multiple 0.86042670
Okay, let’s put it into a fictional scenario. There are two houses:
House 1: 1600 sqft, 10 years old, 8 km from city. House 2: 2400 sqft, 5 years old, 18 km from city.
# data
your_new_home <- data.frame(
sqft = c(1600, 2400),
age = c(10, 5),
distance_city = c(8, 18)
)
# Point predictions
pred_point <- predict(m_multiple, newdata = your_new_home)
# 95% Confidence Interval --> mean price at those features
pred_ci <- predict(m_multiple, newdata = your_new_home,
interval = "confidence", level = 0.95)
# 95% Prediction Interval --> individual house price
pred_pi <- predict(m_multiple, newdata = your_new_home,
interval = "prediction", level = 0.95)
pred_point
## 1 2
## 349294.5 453656.3
pred_ci
## fit lwr upr
## 1 349294.5 341297.8 357291.3
## 2 453656.3 445597.6 461715.0
pred_pi
## fit lwr upr
## 1 349294.5 290078.9 408510.1
## 2 453656.3 394432.2 512880.3
par(mfrow = c(2, 2))
plot(m_multiple) # residuals vs fitted, QQ-plot, scale-location, Cook's distance/leverage
par(mfrow = c(1, 1))