housing_prices = read.csv("C:\\Users\\User\\Documents\\UTS\\AUTUMN 2024\\TRM\\Data Analyst R Int\\Housing_prices_data.csv")
Study Design: A cross-sectional investigation of house conditions in 506 suburbs in the Boston area conducted to develop a model for predicting housing prices.
library(caret)
## Loading required package: ggplot2
## Loading required package: lattice
set.seed(1234)
index = createDataPartition(housing_prices$MEDV, p = 0.75, list = FALSE)
training = housing_prices[index, ]
dim(training)
## [1] 381 14
testing = housing_prices[-index, ]
dim(testing)
## [1] 125 14
library(table1)
##
## Attaching package: 'table1'
## The following objects are masked from 'package:base':
##
## units, units<-
table1(~ CRIM + ZN + INDUS + CHAS + NOX + RM + AGE + DIS + RAD + TAX + PTRATIO + B + LSTAT + MEDV, data = training)
| Overall (N=381) |
|
|---|---|
| CRIM | |
| Mean (SD) | 3.85 (9.36) |
| Median [Min, Max] | 0.250 [0.00906, 89.0] |
| ZN | |
| Mean (SD) | 12.6 (24.5) |
| Median [Min, Max] | 0 [0, 100] |
| INDUS | |
| Mean (SD) | 10.9 (6.88) |
| Median [Min, Max] | 8.56 [0.460, 27.7] |
| CHAS | |
| Mean (SD) | 0.0604 (0.238) |
| Median [Min, Max] | 0 [0, 1.00] |
| NOX | |
| Mean (SD) | 0.552 (0.117) |
| Median [Min, Max] | 0.538 [0.385, 0.871] |
| RM | |
| Mean (SD) | 6.29 (0.722) |
| Median [Min, Max] | 6.22 [3.56, 8.73] |
| AGE | |
| Mean (SD) | 68.1 (27.7) |
| Median [Min, Max] | 76.5 [6.00, 100] |
| DIS | |
| Mean (SD) | 3.86 (2.16) |
| Median [Min, Max] | 3.36 [1.13, 10.7] |
| RAD | |
| Mean (SD) | 9.41 (8.66) |
| Median [Min, Max] | 5.00 [1.00, 24.0] |
| TAX | |
| Mean (SD) | 405 (168) |
| Median [Min, Max] | 330 [188, 711] |
| PTRATIO | |
| Mean (SD) | 18.5 (2.15) |
| Median [Min, Max] | 19.0 [12.6, 22.0] |
| B | |
| Mean (SD) | 359 (87.0) |
| Median [Min, Max] | 392 [0.320, 397] |
| LSTAT | |
| Mean (SD) | 12.6 (7.20) |
| Median [Min, Max] | 10.9 [1.73, 38.0] |
| MEDV | |
| Mean (SD) | 22.6 (9.43) |
| Median [Min, Max] | 21.2 [5.00, 50.0] |
table1(~ CRIM + ZN + INDUS + CHAS + NOX + RM + AGE + DIS + RAD + TAX + PTRATIO + B + LSTAT + MEDV, data = testing)
| Overall (N=125) |
|
|---|---|
| CRIM | |
| Mean (SD) | 2.88 (5.69) |
| Median [Min, Max] | 0.318 [0.00632, 45.7] |
| ZN | |
| Mean (SD) | 7.71 (18.8) |
| Median [Min, Max] | 0 [0, 90.0] |
| INDUS | |
| Mean (SD) | 11.8 (6.80) |
| Median [Min, Max] | 10.0 [1.38, 27.7] |
| CHAS | |
| Mean (SD) | 0.0960 (0.296) |
| Median [Min, Max] | 0 [0, 1.00] |
| NOX | |
| Mean (SD) | 0.562 (0.114) |
| Median [Min, Max] | 0.538 [0.398, 0.871] |
| RM | |
| Mean (SD) | 6.28 (0.642) |
| Median [Min, Max] | 6.19 [4.52, 8.78] |
| AGE | |
| Mean (SD) | 70.0 (29.5) |
| Median [Min, Max] | 82.6 [2.90, 100] |
| DIS | |
| Mean (SD) | 3.59 (1.93) |
| Median [Min, Max] | 2.89 [1.33, 12.1] |
| RAD | |
| Mean (SD) | 9.96 (8.87) |
| Median [Min, Max] | 5.00 [1.00, 24.0] |
| TAX | |
| Mean (SD) | 417 (171) |
| Median [Min, Max] | 384 [187, 711] |
| PTRATIO | |
| Mean (SD) | 18.4 (2.23) |
| Median [Min, Max] | 19.1 [13.0, 21.2] |
| B | |
| Mean (SD) | 351 (104) |
| Median [Min, Max] | 391 [2.52, 397] |
| LSTAT | |
| Mean (SD) | 12.8 (6.98) |
| Median [Min, Max] | 12.3 [1.92, 37.0] |
| MEDV | |
| Mean (SD) | 22.4 (8.49) |
| Median [Min, Max] | 21.2 [7.00, 50.0] |
# Distribution plot for housing price
ggplot(data = housing_prices, aes(x = MEDV)) +
geom_histogram(fill = "skyblue", color = "black", bins = 30) +
labs(title = "Distribution of Housing Prices",
x = "Median Housing Price ($1000s)",
y = "Frequency")
# Distribution plots for candidate predictor variables + Housing prices
for (variable in c("CRIM", "ZN", "INDUS", "CHAS", "NOX", "RM", "AGE", "DIS", "RAD", "TAX", "PTRATIO", "B", "LSTAT")) {
p <- ggplot(data = housing_prices, aes_string(x = variable)) +
geom_histogram(fill = "skyblue", color = "black", bins = 30) +
labs(title = paste("Distribution of", variable),
x = variable,
y = "Frequency")
print(p)
}
## Warning: `aes_string()` was deprecated in ggplot2 3.0.0.
## ℹ Please use tidy evaluation idioms with `aes()`.
## ℹ See also `vignette("ggplot2-in-packages")` for more information.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
library(BMA)
## Loading required package: survival
##
## Attaching package: 'survival'
## The following object is masked from 'package:caret':
##
## cluster
## Loading required package: leaps
## Loading required package: robustbase
##
## Attaching package: 'robustbase'
## The following object is masked from 'package:survival':
##
## heart
## Loading required package: inline
## Loading required package: rrcov
## Scalable Robust Estimators with High Breakdown Point (version 1.7-5)
xvars = training[, c("CRIM", "ZN", "INDUS", "CHAS", "NOX", "RM", "AGE", "DIS", "RAD", "TAX", "PTRATIO", "B", "LSTAT")]
yvar = training[,c("MEDV")]
bma = bicreg(x = xvars, y = yvar, strict = FALSE, OR = 20)
summary(bma)
##
## Call:
## bicreg(x = xvars, y = yvar, strict = FALSE, OR = 20)
##
##
## 11 models were selected
## Best 5 models (cumulative posterior probability = 0.8562 ):
##
## p!=0 EV SD model 1 model 2 model 3
## Intercept 100.0 2.974e+01 6.119946 2.931e+01 3.077e+01 3.068e+01
## CRIM 96.7 -1.185e-01 0.041066 -1.252e-01 -1.269e-01 -1.174e-01
## ZN 84.2 3.771e-02 0.021626 4.096e-02 4.901e-02 .
## INDUS 3.0 -1.947e-03 0.015780 . . .
## CHAS 97.6 3.445e+00 1.153681 3.601e+00 3.337e+00 3.591e+00
## NOX 100.0 -1.614e+01 4.296008 -1.727e+01 -1.511e+01 -1.820e+01
## RM 100.0 4.378e+00 0.463747 4.394e+00 4.249e+00 4.553e+00
## AGE 4.7 -5.942e-04 0.004280 . . .
## DIS 100.0 -1.378e+00 0.238078 -1.403e+00 -1.450e+00 -1.110e+00
## RAD 91.4 1.834e-01 0.098191 1.445e-01 2.719e-01 1.654e-01
## TAX 38.1 -3.349e-03 0.004889 . -8.862e-03 .
## PTRATIO 100.0 -9.713e-01 0.165651 -9.791e-01 -9.347e-01 -1.127e+00
## B 98.3 1.058e-02 0.003435 1.099e-02 1.056e-02 1.101e-02
## LSTAT 100.0 -5.393e-01 0.053736 -5.423e-01 -5.351e-01 -5.390e-01
##
## nVar 10 11 9
## r2 0.763 0.766 0.758
## BIC -4.884e+02 -4.880e+02 -4.864e+02
## post prob 0.345 0.294 0.132
## model 4 model 5
## Intercept 2.363e+01 2.491e+01
## CRIM -8.073e-02 .
## ZN 4.828e-02 4.183e-02
## INDUS . .
## CHAS 3.773e+00 3.887e+00
## NOX -1.250e+01 -1.337e+01
## RM 4.578e+00 4.529e+00
## AGE . .
## DIS -1.428e+00 -1.347e+00
## RAD . .
## TAX . .
## PTRATIO -7.849e-01 -8.534e-01
## B 9.398e-03 1.085e-02
## LSTAT -5.378e-01 -5.612e-01
##
## nVar 9 8
## r2 0.756 0.752
## BIC -4.846e+02 -4.836e+02
## post prob 0.053 0.033
imageplot.bma(bma)
m1 = lm(MEDV ~ ZN + CHAS + RM + RAD + B, data = training)
summary(m1)
##
## Call:
## lm(formula = MEDV ~ ZN + CHAS + RM + RAD + B, data = training)
##
## Residuals:
## Min 1Q Median 3Q Max
## -15.6379 -3.2515 -0.5942 2.3860 30.7314
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -35.268016 3.235095 -10.902 < 2e-16 ***
## ZN 0.040059 0.013567 2.953 0.003350 **
## CHAS 5.066499 1.271591 3.984 8.13e-05 ***
## RM 8.279230 0.446047 18.561 < 2e-16 ***
## RAD -0.141558 0.040528 -3.493 0.000535 ***
## B 0.017671 0.003867 4.570 6.63e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5.873 on 375 degrees of freedom
## Multiple R-squared: 0.6171, Adjusted R-squared: 0.612
## F-statistic: 120.9 on 5 and 375 DF, p-value: < 2.2e-16
par(mfrow = c(2,2))
plot(m1)
fit <- lm(MEDV ~ ZN + CHAS + RM + RAD + B, data = training)
coefficients <- coef(fit)
intercept <- coefficients[1]
predictors <- names(coefficients)[-1]
coefs <- coefficients[-1]
equation <- paste("MEDV =", round(intercept, 2), "+", paste(round(coefs, 2), predictors, collapse = " + "))
print(equation)
## [1] "MEDV = -35.27 + 0.04 ZN + 5.07 CHAS + 8.28 RM + -0.14 RAD + 0.02 B"
fit.m1 = train(MEDV ~ ZN + CHAS + RM + RAD + B, data = training, method = "lm", metric = "Rsquared")
summary(fit.m1)
##
## Call:
## lm(formula = .outcome ~ ., data = dat)
##
## Residuals:
## Min 1Q Median 3Q Max
## -15.6379 -3.2515 -0.5942 2.3860 30.7314
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -35.268016 3.235095 -10.902 < 2e-16 ***
## ZN 0.040059 0.013567 2.953 0.003350 **
## CHAS 5.066499 1.271591 3.984 8.13e-05 ***
## RM 8.279230 0.446047 18.561 < 2e-16 ***
## RAD -0.141558 0.040528 -3.493 0.000535 ***
## B 0.017671 0.003867 4.570 6.63e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5.873 on 375 degrees of freedom
## Multiple R-squared: 0.6171, Adjusted R-squared: 0.612
## F-statistic: 120.9 on 5 and 375 DF, p-value: < 2.2e-16
pred.m1 = predict(fit.m1, testing)
testing.data = data.frame(obs = testing$MEDV, pred = pred.m1)
defaultSummary(testing.data)
## RMSE Rsquared MAE
## 6.1874989 0.4849193 3.8340495
library(BMA)
xvars = housing_prices[, c("CRIM", "ZN", "INDUS", "CHAS", "NOX", "RM", "AGE", "DIS", "RAD", "TAX", "PTRATIO", "B", "LSTAT")]
yvar = housing_prices[,c("MEDV")]
bma.2 = bicreg(x = xvars, y = yvar, strict = FALSE, OR = 20)
summary(bma.2)
##
## Call:
## bicreg(x = xvars, y = yvar, strict = FALSE, OR = 20)
##
##
## 5 models were selected
## Best 5 models (cumulative posterior probability = 1 ):
##
## p!=0 EV SD model 1 model 2 model 3
## Intercept 100.0 36.520642 5.174370 3.634e+01 3.662e+01 3.523e+01
## CRIM 93.4 -0.101775 0.041894 -1.084e-01 -1.141e-01 .
## ZN 95.0 0.043273 0.016564 4.584e-02 4.574e-02 4.173e-02
## INDUS 0.0 0.000000 0.000000 . . .
## CHAS 90.1 2.465386 1.152744 2.719e+00 . 2.872e+00
## NOX 100.0 -17.325300 3.574303 -1.738e+01 -1.647e+01 -1.651e+01
## RM 100.0 3.813179 0.410411 3.802e+00 3.845e+00 3.832e+00
## AGE 0.0 0.000000 0.000000 . . .
## DIS 100.0 -1.476279 0.198410 -1.493e+00 -1.526e+00 -1.420e+00
## RAD 100.0 0.295811 0.065488 2.996e-01 3.155e-01 2.389e-01
## TAX 100.0 -0.011751 0.003427 -1.178e-02 -1.267e-02 -1.143e-02
## PTRATIO 100.0 -0.955902 0.133379 -9.465e-01 -9.784e-01 -9.355e-01
## B 96.3 0.009058 0.003188 9.291e-03 9.730e-03 1.032e-02
## LSTAT 100.0 -0.525639 0.048080 -5.226e-01 -5.281e-01 -5.479e-01
##
## nVar 11 10 10
## r2 0.741 0.735 0.735
## BIC -6.143e+02 -6.102e+02 -6.094e+02
## post prob 0.747 0.099 0.066
## model 4 model 5
## Intercept 3.703e+01 4.145e+01
## CRIM -9.819e-02 -1.217e-01
## ZN . 4.619e-02
## INDUS . .
## CHAS 2.712e+00 2.872e+00
## NOX -1.863e+01 -1.826e+01
## RM 4.003e+00 3.673e+00
## AGE . .
## DIS -1.178e+00 -1.516e+00
## RAD 2.844e-01 2.839e-01
## TAX -9.548e-03 -1.229e-02
## PTRATIO -1.097e+00 -9.310e-01
## B 9.358e-03 .
## LSTAT -5.219e-01 -5.465e-01
##
## nVar 10 10
## r2 0.735 0.734
## BIC -6.089e+02 -6.083e+02
## post prob 0.050 0.037
imageplot.bma(bma.2)
### 1.4.2.2 Present the model
m2 = lm(MEDV ~ ZN + CHAS + RM + RAD + B, data = housing_prices)
summary(m2)
##
## Call:
## lm(formula = MEDV ~ ZN + CHAS + RM + RAD + B, data = housing_prices)
##
## Residuals:
## Min 1Q Median 3Q Max
## -21.373 -3.239 -0.676 2.197 40.498
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -32.094065 2.835567 -11.318 < 2e-16 ***
## ZN 0.040391 0.012416 3.253 0.00122 **
## CHAS 4.186574 1.049381 3.990 7.61e-05 ***
## RM 7.857518 0.401563 19.567 < 2e-16 ***
## RAD -0.157055 0.035333 -4.445 1.08e-05 ***
## B 0.016812 0.003241 5.188 3.09e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5.94 on 500 degrees of freedom
## Multiple R-squared: 0.587, Adjusted R-squared: 0.5829
## F-statistic: 142.1 on 5 and 500 DF, p-value: < 2.2e-16
### 1.4.3.3 Check the model’s assumptions
par(mfrow = c(2,2))
plot(m2)
library(rms)
## Loading required package: Hmisc
##
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:table1':
##
## label, label<-, units
## The following objects are masked from 'package:base':
##
## format.pval, units
fit.m2 = ols(MEDV ~ ZN + CHAS + RM + RAD + B, data = housing_prices, x = TRUE, y = TRUE)
fit.m2
## Linear Regression Model
##
## ols(formula = MEDV ~ ZN + CHAS + RM + RAD + B, data = housing_prices,
## x = TRUE, y = TRUE)
##
## Model Likelihood Discrimination
## Ratio Test Indexes
## Obs 506 LR chi2 447.44 R2 0.587
## sigma5.9401 d.f. 5 R2 adj 0.583
## d.f. 500 Pr(> chi2) 0.0000 g 7.676
##
## Residuals
##
## Min 1Q Median 3Q Max
## -21.3729 -3.2386 -0.6765 2.1973 40.4982
##
##
## Coef S.E. t Pr(>|t|)
## Intercept -32.0941 2.8356 -11.32 <0.0001
## ZN 0.0404 0.0124 3.25 0.0012
## CHAS 4.1866 1.0494 3.99 <0.0001
## RM 7.8575 0.4016 19.57 <0.0001
## RAD -0.1571 0.0353 -4.45 <0.0001
## B 0.0168 0.0032 5.19 <0.0001
### 1.4.3.2 Validate the model
set.seed(1234)
v.m2 = validate(fit.m2, B = 500)
v.m2
## index.orig training test optimism index.corrected n
## R-square 0.5870 0.5926 0.5788 0.0138 0.5732 500
## MSE 34.8665 34.4890 35.5556 -1.0666 35.9331 500
## g 7.6757 7.7088 7.6409 0.0678 7.6079 500
## Intercept 0.0000 0.0000 0.1737 -0.1737 0.1737 500
## Slope 1.0000 1.0000 0.9913 0.0087 0.9913 500