Úvod

V tomto cvičení pracujeme s pripravenými dátami z predchádzajúcej úlohy. Cieľom je vytvoriť základné grafy a tabuľky opisných štatistík, vykonať jednoduché testovanie hypotéz a odhadnúť lineárny regresný model.

Import údajov

# Import the raw data set. The file is parsed with ";" as the field separator
# and "." as the decimal mark (the read.csv() default); the first line is the
# header. Empty fields and the literal string "NA" are read as missing values,
# and text columns are kept as character vectors rather than factors.
udaje1 <- read.csv(
  file = "cviko4/udaje/ChybnaDatabaza.csv",
  sep = ";",
  na.strings = c("", "NA"),
  stringsAsFactors = FALSE
)

# Show the first rows as a quick sanity check of the import.
head(udaje1)

Úprava názvov premenných

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# Remember the original column names so they can be listed next to the
# shortened ones below.
old_names <- names(udaje1)

# Shorten every column name with abbreviate() (default minimum length).
# De-duplication happens inside the renaming function: rename_with() rejects
# duplicated names, so calling make.unique() afterwards would never be reached
# if two abbreviations collided. unname() drops the names that abbreviate()
# attaches to its result, leaving a plain character vector.
udaje1 <- udaje1 %>%
  rename_with(~ make.unique(unname(abbreviate(.x, strict = FALSE))))

# Lookup table: original column name -> shortened column name.
comparison <- data.frame(
  Original_Name = old_names,
  Shortened_Name = names(udaje1)
)

print(comparison)
##                     Original_Name Shortened_Name
## 1                           YEARS           YEAR
## 2                       COMPANIES           COMP
## 3                 EXCHANGE.SECTOR           EXCH
## 4                PRIMARY.BUSINESS           PRIM
## 5                         TOBIN.Q           TOBI
## 6           MARKET.CAPITALIZATION           MARK
## 7                RETURN.ON.ASSETS           RETU
## 8                   DEBT.TO.ASSET           DEBT
## 9                       FIRM.SIZE           FIRM
## 10        SOCIAL.DISCLOSURE.INDEX           SOCI
## 11 ENVIRONMENTAL.DISCLOSURE.INDEX           ENVI
## 12    GOVERNANCE.DISCLOSURE.INDEX           GOVE
## 13                      ESG.INDEX           ESG.

Príprava dát

# Build the 2013 working subset: keep only rows whose YEAR is 2013 and only
# the four abbreviated columns RETU, ESG., DEBT and FIRM.
udaje.2013 <- udaje1 %>%
  filter(YEAR %in% 2013) %>%
  select(all_of(c("RETU", "ESG.", "DEBT", "FIRM")))

# Preview the first rows of the subset.
head(udaje.2013)

Grafy

Scatter plot

library(ggplot2)

# Scatter plot of the ESG index (y) against firm size (x) for the 2013 subset.
ggplot(data = udaje.2013, mapping = aes(x = FIRM, y = ESG.)) +
  geom_point() +
  ggtitle("ESG index v závislosti od veľkosti firmy (2013)") +
  xlab("Veľkosť firmy") +
  ylab("ESG index") +
  theme_minimal()

Boxplot

# Box plots of the ESG index with one box per year. Rows missing either the
# year or the ESG value are removed before plotting; YEAR is converted to a
# factor so that each year gets its own box on the x axis.
udaje1 %>%
  filter(!(is.na(YEAR) | is.na(ESG.))) %>%
  ggplot(mapping = aes(x = factor(YEAR), y = ESG.)) +
  geom_boxplot(fill = "lightblue", color = "darkblue") +
  ggtitle("ESG index podľa rokov") +
  xlab("Rok") +
  ylab("ESG index") +
  theme_minimal()

Základné štatistiky

library(knitr)

# Descriptive statistics of the ESG index for each year from 2013 to 2016.
# Every statistic ignores missing ESG values (na.rm = TRUE), so `n` counts the
# non-missing ESG observations rather than all rows in the group; using n()
# here would overstate the sample size for any year with a missing ESG value.
esg.stats <- udaje1 %>%
  filter(YEAR %in% 2013:2016) %>%
  group_by(YEAR) %>%
  summarise(
    n      = sum(!is.na(ESG.)),
    mean   = mean(ESG., na.rm = TRUE),
    sd     = sd(ESG., na.rm = TRUE),
    min    = min(ESG., na.rm = TRUE),
    q25    = quantile(ESG., 0.25, na.rm = TRUE),
    median = median(ESG., na.rm = TRUE),
    q75    = quantile(ESG., 0.75, na.rm = TRUE),
    max    = max(ESG., na.rm = TRUE),
    .groups = "drop"
  )

# Render the summary as a formatted table, rounded to two decimals.
kable(esg.stats, digits = 2, caption = "Základné štatistiky ESG indexu (2013–2016)")
Základné štatistiky ESG indexu (2013–2016)
YEAR n mean sd min q25 median q75 max
2013 76 0.23 0.10 0 0.15 0.22 0.29 0.58
2014 76 0.24 0.11 0 0.16 0.24 0.31 0.57
2015 76 0.25 0.12 0 0.17 0.25 0.31 0.65
2016 75 0.26 0.12 0 0.17 0.27 0.32 0.64

Testovanie hypotéz

t-test

# Two-sided Welch two-sample t-test comparing the mean ESG index of 2013 with
# that of 2015. The settings below are the t.test() defaults, spelled out so
# the test variant is explicit. The x/y expressions are kept exactly as
# written because they form the "data:" line of the printed result.
t.test.result <- t.test(
  x = udaje1$ESG.[udaje1$YEAR == 2013],
  y = udaje1$ESG.[udaje1$YEAR == 2015],
  alternative = "two.sided",
  mu = 0,
  paired = FALSE,
  var.equal = FALSE,
  conf.level = 0.95
)

print(t.test.result)
## 
##  Welch Two Sample t-test
## 
## data:  udaje1$ESG.[udaje1$YEAR == 2013] and udaje1$ESG.[udaje1$YEAR == 2015]
## t = -1.3118, df = 146.15, p-value = 0.1916
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -0.05974418  0.01207401
## sample estimates:
## mean of x mean of y 
## 0.2276316 0.2514667

ANOVA

# One-way ANOVA of the ESG index across years; YEAR is converted to a factor
# so that each year is treated as a separate group.
anova.result <- aov(formula = ESG. ~ factor(YEAR), data = udaje1)

# Print the ANOVA table (degrees of freedom, sums of squares, F test).
print(summary(anova.result))
##               Df Sum Sq Mean Sq F value   Pr(>F)    
## factor(YEAR)   9  0.636 0.07064   3.381 0.000444 ***
## Residuals    748 15.626 0.02089                     
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 2 observations deleted due to missingness

Lineárna regresia

# Linear regression on the 2013 subset: the ESG index is explained by
# RETU, FIRM and DEBT (term order kept so the coefficient table is unchanged).
model <- lm(
  formula = ESG. ~ RETU + FIRM + DEBT,
  data = udaje.2013
)

# Print coefficient estimates, significance tests and fit statistics.
print(summary(model))
## 
## Call:
## lm(formula = ESG. ~ RETU + FIRM + DEBT, data = udaje.2013)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -0.182294 -0.060003  0.001775  0.051708  0.246346 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -3.000e-01  7.981e-02  -3.759 0.000347 ***
## RETU         1.661e-04  9.086e-05   1.828 0.071807 .  
## FIRM         7.832e-02  1.083e-02   7.234 4.37e-10 ***
## DEBT        -5.500e-04  4.211e-04  -1.306 0.195724    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.0807 on 71 degrees of freedom
##   (1 observation deleted due to missingness)
## Multiple R-squared:  0.4328, Adjusted R-squared:  0.4088 
## F-statistic: 18.06 on 3 and 71 DF,  p-value: 8.234e-09