Import údajov
# Location of the Boston Housing CSV (served from raw.githubusercontent.com).
url <- "https://raw.githubusercontent.com/selva86/datasets/master/BostonHousing.csv"

# Download and parse the CSV into a data frame, then list its column names.
boston <- read.csv(file = url)
names(boston)
## [1] "crim" "zn" "indus" "chas" "nox" "rm" "age"
## [8] "dis" "rad" "tax" "ptratio" "b" "lstat" "medv"
# Show the first six rows as a quick sanity check of the import.
head(boston, n = 6L)
## crim zn indus chas nox rm age dis rad tax ptratio b lstat
## 1 0.00632 18 2.31 0 0.538 6.575 65.2 4.0900 1 296 15.3 396.90 4.98
## 2 0.02731 0 7.07 0 0.469 6.421 78.9 4.9671 2 242 17.8 396.90 9.14
## 3 0.02729 0 7.07 0 0.469 7.185 61.1 4.9671 2 242 17.8 392.83 4.03
## 4 0.03237 0 2.18 0 0.458 6.998 45.8 6.0622 3 222 18.7 394.63 2.94
## 5 0.06905 0 2.18 0 0.458 7.147 54.2 6.0622 3 222 18.7 396.90 5.33
## 6 0.02985 0 2.18 0 0.458 6.430 58.7 6.0622 3 222 18.7 394.12 5.21
## medv
## 1 24.0
## 2 21.6
## 3 34.7
## 4 33.4
## 5 36.2
## 6 28.7
Grafy
Scatterplot: rm vs medv
library(ggplot2)

# Scatterplot of median home value (medv) against average room count (rm).
ggplot(data = boston, mapping = aes(x = rm, y = medv)) +
  geom_point(colour = "darkblue") +
  labs(
    x = "Average Number of Rooms (rm)",
    y = "Median Value (medv, $1000s)",
    title = "Relationship between Average Rooms (rm) and Median Value (medv)"
  ) +
  theme_minimal()

Boxplot: medv by chas
# Boxplot of medv split by the chas indicator; chas is wrapped in factor()
# so that ggplot treats it as a discrete grouping variable.
ggplot(data = boston, mapping = aes(x = factor(chas), y = medv)) +
  geom_boxplot(colour = "darkblue", fill = "lightblue") +
  labs(
    x = "chas (1 = bounds river, 0 = otherwise)",
    y = "Median Value (medv, $1000s)",
    title = "House Prices (medv) by Charles River Proximity (chas)"
  ) +
  theme_minimal()

Základné štatistiky
library(dplyr)
library(knitr)
library(kableExtra)

# One-row summary: sample size plus mean and standard deviation of crim, rm
# and medv. across() names its outputs "<column>_<function>", which yields
# crim_mean, crim_sd, rm_mean, rm_sd, medv_mean, medv_sd in that order.
stats <- boston %>%
  summarise(
    n = n(),
    across(c(crim, rm, medv), list(mean = mean, sd = sd))
  )

# Render the summary as a compact, styled table rounded to two digits.
stats %>%
  kable(
    caption = "Basic statistics for selected Boston Housing variables",
    digits = 2
  ) %>%
  kable_styling(
    bootstrap_options = c("striped", "hover", "condensed"),
    full_width = FALSE
  )
Basic statistics for selected Boston Housing variables

| n | crim_mean | crim_sd | rm_mean | rm_sd | medv_mean | medv_sd |
|---|---|---|---|---|---|---|
| 506 | 3.61 | 8.6 | 6.28 | 0.7 | 22.53 | 9.2 |
Korelačná matica a Heatmap
library(corrplot)

# Keep only the numeric columns. vapply() always returns a logical mask (even
# for a zero-column data frame), and drop = FALSE keeps the result a data
# frame when exactly one column is numeric, so cor() always gets a
# two-dimensional input.
num_vars <- boston[, vapply(boston, is.numeric, logical(1)), drop = FALSE]

# Pairwise correlation matrix of the numeric variables.
cor_mat <- cor(num_vars)

# Upper-triangle heatmap with the correlation coefficients printed in the cells.
corrplot(
  cor_mat,
  method = "color",
  type = "upper",
  tl.col = "black",
  addCoef.col = "black",
  number.cex = 0.7
)

Testovanie hypotéz
t-test: medv medzi chas = 0 a chas = 1
# Compare mean medv between the chas = 0 and chas = 1 groups; the result
# printed below is a Welch two-sample t-test.
t.test(medv ~ chas, data = boston)
##
## Welch Two Sample t-test
##
## data: medv by chas
## t = -3.1133, df = 36.876, p-value = 0.003567
## alternative hypothesis: true difference in means between group 0 and group 1 is not equal to 0
## 95 percent confidence interval:
## -10.476831 -2.215483
## sample estimates:
## mean in group 0 mean in group 1
## 22.09384 28.44000
ANOVA: medv medzi rad skupinami
# One-way ANOVA of medv across the levels of rad (converted to a factor so
# each rad value is its own group); summary() prints the ANOVA table.
anova_result <- aov(medv ~ factor(rad), data = boston)
summary(anova_result)
## Df Sum Sq Mean Sq F value Pr(>F)
## factor(rad) 8 9767 1220.9 18.42 <2e-16 ***
## Residuals 497 32949 66.3
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Linear Regression: medv ~ rm + lstat + ptratio
# Fit an OLS regression of medv on rm, lstat and ptratio and print the
# standard coefficient / fit summary.
model <- lm(medv ~ rm + lstat + ptratio, data = boston)
summary(model)
##
## Call:
## lm(formula = medv ~ rm + lstat + ptratio, data = boston)
##
## Residuals:
## Min 1Q Median 3Q Max
## -14.4871 -3.1047 -0.7976 1.8129 29.6559
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 18.56711 3.91320 4.745 2.73e-06 ***
## rm 4.51542 0.42587 10.603 < 2e-16 ***
## lstat -0.57181 0.04223 -13.540 < 2e-16 ***
## ptratio -0.93072 0.11765 -7.911 1.64e-14 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5.229 on 502 degrees of freedom
## Multiple R-squared: 0.6786, Adjusted R-squared: 0.6767
## F-statistic: 353.3 on 3 and 502 DF, p-value: < 2.2e-16
library(broom)

# Publication-style coefficient table built from the fitted `model`:
# estimate, standard error, t statistic, p-value, 95% confidence interval
# and significance marker for every term.
coef_tbl <- tidy(model, conf.int = TRUE) %>%
  transmute(
    Term = term,
    Estimate = estimate,
    `Std. Error` = std.error,
    `t value` = statistic,
    # Format p-values as text: rounding them to 3 digits in kable() would
    # print every very small p-value as a literal 0.
    `p value` = format.pval(p.value, digits = 3, eps = 0.001),
    # sprintf() keeps trailing zeros, so both bounds always show 2 decimals
    # (round() would print e.g. -0.70 as -0.7).
    `95% CI` = sprintf("[%.2f, %.2f]", conf.low, conf.high),
    # Significance markers use the same thresholds as summary.lm().
    Sig = case_when(
      p.value < 0.001 ~ "***",
      p.value < 0.01 ~ "**",
      p.value < 0.05 ~ "*",
      p.value < 0.1 ~ "·",
      TRUE ~ ""
    )
  )

coef_tbl %>%
  kable(
    digits = 3,
    caption = "OLS Regression Coefficients",
    # The p-value column is now character; align it right like the other
    # numeric columns.
    align = c("l", "r", "r", "r", "r", "l", "l")
  ) %>%
  kable_styling(
    full_width = FALSE,
    bootstrap_options = c("striped", "hover", "condensed")
  ) %>%
  column_spec(1, bold = TRUE) %>%
  row_spec(0, bold = TRUE, background = "#f2f2f2")
OLS Regression Coefficients

| Term | Estimate | Std. Error | t value | p value | 95% CI | Sig |
|---|---|---|---|---|---|---|
| (Intercept) | 18.567 | 3.913 | 4.745 | 0 | [10.88, 26.26] | *** |
| rm | 4.515 | 0.426 | 10.603 | 0 | [3.68, 5.35] | *** |
| lstat | -0.572 | 0.042 | -13.540 | 0 | [-0.65, -0.49] | *** |
| ptratio | -0.931 | 0.118 | -7.911 | 0 | [-1.16, -0.7] | *** |
# One-row table of overall fit statistics for `model`.
fit_tbl <- glance(model) %>%
  transmute(
    `R-squared` = r.squared,
    `Adj. R-squared` = adj.r.squared,
    `F-statistic` = statistic,
    # Format the p-value as text: rounding it to 3 digits in kable() would
    # print a very small p-value as a literal 0.
    `F p-value` = format.pval(p.value, digits = 3, eps = 0.001),
    `AIC` = AIC,
    `BIC` = BIC,
    `Num. obs.` = nobs
  )

fit_tbl %>%
  kable(
    digits = 3,
    caption = "Model Fit Statistics",
    # Keep the (now character) p-value column right-aligned like the rest.
    align = "r"
  ) %>%
  kable_styling(full_width = FALSE, bootstrap_options = c("condensed"))
Model Fit Statistics

| R-squared | Adj. R-squared | F-statistic | F p-value | AIC | BIC | Num. obs. |
|---|---|---|---|---|---|---|
| 0.679 | 0.677 | 353.345 | 0 | 3116.097 | 3137.23 | 506 |