Import údajov

# Location of the Boston Housing CSV file (public GitHub mirror).
url <- "https://raw.githubusercontent.com/selva86/datasets/master/BostonHousing.csv"
# Download and parse the file; the first row supplies the column names.
boston <- read.csv(file = url, header = TRUE)

# List the variable names available in the data frame.
names(boston)
##  [1] "crim"    "zn"      "indus"   "chas"    "nox"     "rm"      "age"    
##  [8] "dis"     "rad"     "tax"     "ptratio" "b"       "lstat"   "medv"
# Preview the first six observations.
head(boston, n = 6L)
##      crim zn indus chas   nox    rm  age    dis rad tax ptratio      b lstat
## 1 0.00632 18  2.31    0 0.538 6.575 65.2 4.0900   1 296    15.3 396.90  4.98
## 2 0.02731  0  7.07    0 0.469 6.421 78.9 4.9671   2 242    17.8 396.90  9.14
## 3 0.02729  0  7.07    0 0.469 7.185 61.1 4.9671   2 242    17.8 392.83  4.03
## 4 0.03237  0  2.18    0 0.458 6.998 45.8 6.0622   3 222    18.7 394.63  2.94
## 5 0.06905  0  2.18    0 0.458 7.147 54.2 6.0622   3 222    18.7 396.90  5.33
## 6 0.02985  0  2.18    0 0.458 6.430 58.7 6.0622   3 222    18.7 394.12  5.21
##   medv
## 1 24.0
## 2 21.6
## 3 34.7
## 4 33.4
## 5 36.2
## 6 28.7

Grafy

Scatterplot: rm vs medv

library(ggplot2)

# Scatterplot of medv (y axis) against rm (x axis), one dark-blue point
# per row of `boston`.
ggplot(data = boston, mapping = aes(x = rm, y = medv)) +
  geom_point(colour = "darkblue") +
  labs(
    title = "Relationship between Average Rooms (rm) and Median Value (medv)",
    x = "Average Number of Rooms (rm)",
    y = "Median Value (medv, $1000s)"
  ) +
  theme_minimal()

Boxplot: medv by chas

# Boxplot of medv split by chas. chas is converted to a factor so that
# ggplot2 treats it as a discrete grouping variable (one box per level).
ggplot(data = boston, mapping = aes(x = factor(chas), y = medv)) +
  geom_boxplot(colour = "darkblue", fill = "lightblue") +
  labs(
    title = "House Prices (medv) by Charles River Proximity (chas)",
    x = "chas (1 = bounds river, 0 = otherwise)",
    y = "Median Value (medv, $1000s)"
  ) +
  theme_minimal()

Základné štatistiky

library(dplyr)
library(knitr)
library(kableExtra)

# One-row summary: sample size plus the mean and standard deviation of
# crim, rm and medv. across() names its outputs "<column>_<function>",
# giving crim_mean, crim_sd, rm_mean, rm_sd, medv_mean, medv_sd.
stats <- summarise(
  boston,
  n = n(),
  across(
    c(crim, rm, medv),
    list(mean = mean, sd = sd),
    .names = "{.col}_{.fn}"
  )
)

# Render the summary as a compact, striped table rounded to two decimals.
kable_styling(
  kable(
    stats,
    digits = 2,
    caption = "Basic statistics for selected Boston Housing variables"
  ),
  full_width = FALSE,
  bootstrap_options = c("striped", "hover", "condensed")
)
Basic statistics for selected Boston Housing variables
n crim_mean crim_sd rm_mean rm_sd medv_mean medv_sd
506 3.61 8.6 6.28 0.7 22.53 9.2

Korelačná matica a Heatmap

library(corrplot)

# Keep only the numeric columns. vapply() always returns a logical mask
# (sapply()'s return type depends on its input), and drop = FALSE keeps a
# data frame even if only a single column qualifies.
num_vars <- boston[, vapply(boston, is.numeric, logical(1)), drop = FALSE]

# Pairwise correlation matrix of the numeric columns (cor() defaults).
cor_mat <- cor(num_vars)

# Heatmap of the upper triangle with the coefficients printed in each cell.
corrplot(
  cor_mat,
  method = "color",
  type = "upper",
  tl.col = "black",
  addCoef.col = "black",
  number.cex = 0.7
)

Testovanie hypotéz

t-test: medv medzi chas = 0 a chas = 1

# Two-sample t-test comparing medv between the chas = 0 and chas = 1 groups.
# The defaults are spelled out: unequal variances (Welch), two-sided
# alternative, 95 % confidence interval.
t.test(
  medv ~ chas,
  data = boston,
  alternative = "two.sided",
  var.equal = FALSE,
  conf.level = 0.95
)
## 
##  Welch Two Sample t-test
## 
## data:  medv by chas
## t = -3.1133, df = 36.876, p-value = 0.003567
## alternative hypothesis: true difference in means between group 0 and group 1 is not equal to 0
## 95 percent confidence interval:
##  -10.476831  -2.215483
## sample estimates:
## mean in group 0 mean in group 1 
##        22.09384        28.44000

ANOVA: medv medzi skupinami podľa rad

# One-way ANOVA of medv across the levels of rad; rad is wrapped in
# factor() so it enters the model as a categorical grouping variable.
anova_result <- aov(formula = medv ~ factor(rad), data = boston)
# Print the ANOVA table (degrees of freedom, sums of squares, F test).
summary(object = anova_result)
##              Df Sum Sq Mean Sq F value Pr(>F)    
## factor(rad)   8   9767  1220.9   18.42 <2e-16 ***
## Residuals   497  32949    66.3                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Lineárna regresia: medv ~ rm + lstat + ptratio

# Ordinary least squares fit of medv on rm, lstat and ptratio.
model <- lm(formula = medv ~ rm + lstat + ptratio, data = boston)
# Print coefficients, residual summary and overall fit statistics.
summary(object = model)
## 
## Call:
## lm(formula = medv ~ rm + lstat + ptratio, data = boston)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -14.4871  -3.1047  -0.7976   1.8129  29.6559 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 18.56711    3.91320   4.745 2.73e-06 ***
## rm           4.51542    0.42587  10.603  < 2e-16 ***
## lstat       -0.57181    0.04223 -13.540  < 2e-16 ***
## ptratio     -0.93072    0.11765  -7.911 1.64e-14 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 5.229 on 502 degrees of freedom
## Multiple R-squared:  0.6786, Adjusted R-squared:  0.6767 
## F-statistic: 353.3 on 3 and 502 DF,  p-value: < 2.2e-16
library(broom)

# Tidy coefficient table for `model`: estimate, standard error, t statistic,
# p-value, a formatted 95 % confidence interval and a significance marker.
coef_tbl <- tidy(model, conf.int = TRUE) %>%
  mutate(
    # Significance marker by p-value threshold; case_when() takes the first
    # matching branch, so the thresholds must stay in ascending order.
    stars = case_when(
      p.value < 0.001 ~ "***",
      p.value < 0.01  ~ "**",
      p.value < 0.05  ~ "*",
      p.value < 0.1   ~ "·",
      TRUE            ~ ""
    )
  ) %>%
  transmute(
    Term = term,
    Estimate = estimate,
    `Std. Error` = std.error,
    `t value` = statistic,
    `p value` = p.value,
    # sprintf("%.2f") always prints two decimals; round() + paste0() dropped
    # trailing zeros and produced ragged intervals such as "[-1.16, -0.7]".
    `95% CI` = sprintf("[%.2f, %.2f]", conf.low, conf.high),
    Sig = stars
  )

# Render the coefficient table. The p-values are formatted for display only
# (coef_tbl itself keeps them numeric): with digits = 3 alone every tiny
# p-value was printed as "0", which is misleading; format.pval() shows
# "<0.001" instead.
coef_tbl %>%
  mutate(`p value` = format.pval(`p value`, digits = 3, eps = 0.001)) %>%
  kable(
    digits = 3,
    caption = "OLS Regression Coefficients",
    # `p value` is now character; keep it right-aligned like the other
    # numeric columns. Text columns stay left-aligned as before.
    align = c("l", "r", "r", "r", "r", "l", "l")
  ) %>%
  kable_styling(full_width = FALSE, bootstrap_options = c("striped", "hover", "condensed")) %>%
  column_spec(1, bold = TRUE) %>%
  row_spec(0, bold = TRUE, background = "#f2f2f2")
OLS Regression Coefficients
Term Estimate Std. Error t value p value 95% CI Sig
(Intercept) 18.567 3.913 4.745 0 [10.88, 26.26] ***
rm 4.515 0.426 10.603 0 [3.68, 5.35] ***
lstat -0.572 0.042 -13.540 0 [-0.65, -0.49] ***
ptratio -0.931 0.118 -7.911 0 [-1.16, -0.7] ***
# One-row table of model-level fit statistics taken from glance(model),
# renamed to presentation-friendly column headers.
fit_tbl <- transmute(
  glance(model),
  `R-squared` = r.squared,
  `Adj. R-squared` = adj.r.squared,
  `F-statistic` = statistic,
  `F p-value` = p.value,
  AIC = AIC,
  BIC = BIC,
  `Num. obs.` = nobs
)

# Render the fit statistics. The F-test p-value is formatted for display
# only (fit_tbl keeps it numeric): with digits = 3 alone it was printed as
# "0"; format.pval() shows "<0.001" instead.
fit_tbl %>%
  mutate(`F p-value` = format.pval(`F p-value`, digits = 3, eps = 0.001)) %>%
  kable(
    digits = 3,
    caption = "Model Fit Statistics",
    # Every column was numeric (right-aligned) before the p-value became
    # text; keep that alignment.
    align = rep("r", 7)
  ) %>%
  kable_styling(full_width = FALSE, bootstrap_options = c("condensed"))
Model Fit Statistics
R-squared Adj. R-squared F-statistic F p-value AIC BIC Num. obs.
0.679 0.677 353.345 0 3116.097 3137.23 506