Úvod

V tomto cvičení pracujeme s pripravenými dátami z predchádzajúcej úlohy. Cieľom je vytvoriť základné grafy a tabuľky opisných štatistík, vykonať jednoduché testovanie hypotéz a odhadnúť lineárny regresný model.

Import údajov

# Read the raw data as a semicolon-separated file with "." as the decimal
# mark. Empty strings and the literal "NA" are both treated as missing
# values, and text columns are kept as character vectors.
udaje1 <- read.csv(
  file = "cviko4/udaje/ChybnaDatabaza.csv",
  sep = ";",
  na.strings = c("", "NA"),
  stringsAsFactors = FALSE
)

# Preview the first rows of the imported table.
head(udaje1)

Úprava názvov premenných

library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

# Remember the original column names so they can be listed next to the
# shortened ones.
old_names <- names(udaje1)

# Shorten every column name with abbreviate() (non-strict mode) and then run
# the result through make.unique() so that no two columns share a name.
names(udaje1) <- make.unique(
  unname(abbreviate(old_names, strict = FALSE))
)

# Side-by-side lookup table: original name -> shortened name.
comparison <- data.frame(
  Original_Name = old_names,
  Shortened_Name = names(udaje1)
)

print(comparison)

##                     Original_Name Shortened_Name
## 1                           YEARS           YEAR
## 2                       COMPANIES           COMP
## 3                 EXCHANGE.SECTOR           EXCH
## 4                PRIMARY.BUSINESS           PRIM
## 5                         TOBIN.Q           TOBI
## 6           MARKET.CAPITALIZATION           MARK
## 7                RETURN.ON.ASSETS           RETU
## 8                   DEBT.TO.ASSET           DEBT
## 9                       FIRM.SIZE           FIRM
## 10        SOCIAL.DISCLOSURE.INDEX           SOCI
## 11 ENVIRONMENTAL.DISCLOSURE.INDEX           ENVI
## 12    GOVERNANCE.DISCLOSURE.INDEX           GOVE
## 13                      ESG.INDEX           ESG.

Príprava dát

# Keep only the rows for the year 2013 and the four variables used in the
# scatter plot and the regression model below.
udaje.2013 <- select(
  filter(udaje1, YEAR == 2013),
  RETU, ESG., DEBT, FIRM
)

head(udaje.2013)

Grafy

Scatter plot

library(ggplot2)

# Scatter plot of the ESG index (y) against firm size (x) for the 2013 data.
ggplot(data = udaje.2013, mapping = aes(x = FIRM, y = ESG.)) +
  geom_point() +
  labs(
    title = "ESG index v závislosti od veľkosti firmy (2013)",
    x = "Veľkosť firmy",
    y = "ESG index"
  ) +
  theme_minimal()

Boxplot

# Box plot of the ESG index for each year. Rows with a missing year or a
# missing ESG value are removed first, and YEAR is converted to a factor so
# that every year gets its own box.
ggplot(
  data = filter(udaje1, !is.na(YEAR) & !is.na(ESG.)),
  mapping = aes(x = factor(YEAR), y = ESG.)
) +
  geom_boxplot(colour = "darkblue", fill = "lightblue") +
  labs(
    title = "ESG index podľa rokov",
    x = "Rok",
    y = "ESG index"
  ) +
  theme_minimal()

Základné štatistiky

library(knitr)

# Descriptive statistics of the ESG index for each year from 2013 to 2016.
#
# `n` counts only the non-missing ESG values. All other statistics are
# computed with na.rm = TRUE, so counting rows with n() would overstate the
# sample size whenever a year contains a row with a missing ESG value.
esg.stats <- udaje1 %>%
  filter(YEAR %in% 2013:2016) %>%
  group_by(YEAR) %>%
  summarise(
    n      = sum(!is.na(ESG.)),
    mean   = mean(ESG., na.rm = TRUE),
    sd     = sd(ESG., na.rm = TRUE),
    min    = min(ESG., na.rm = TRUE),
    q25    = quantile(ESG., 0.25, na.rm = TRUE),
    median = median(ESG., na.rm = TRUE),
    q75    = quantile(ESG., 0.75, na.rm = TRUE),
    max    = max(ESG., na.rm = TRUE),
    .groups = "drop"
  )

# Render the summary as a table, rounded to two decimal places.
kable(esg.stats, digits = 2, caption = "Základné štatistiky ESG indexu (2013–2016)")

Základné štatistiky ESG indexu (2013–2016)
YEAR	n	mean	sd	q25	median	q75	max
2013	76	0.23	0.10	0.15	0.22	0.29	0.58
2014	76	0.24	0.11	0.16	0.24	0.31	0.57
2015	76	0.25	0.12	0.17	0.25	0.31	0.65
2016	75	0.26	0.12	0.17	0.27	0.32	0.64

Testovanie hypotéz

t-test

# Two-sample t-test comparing the mean ESG index in 2013 with that in 2015.
# The settings below are the t.test() defaults, written out explicitly:
# two-sided alternative, unequal variances (Welch), 95 % confidence level.
t.test.result <- t.test(
  x = udaje1$ESG.[udaje1$YEAR == 2013],
  y = udaje1$ESG.[udaje1$YEAR == 2015],
  alternative = "two.sided",
  var.equal = FALSE,
  conf.level = 0.95
)

t.test.result

## 
##  Welch Two Sample t-test
## 
## data:  udaje1$ESG.[udaje1$YEAR == 2013] and udaje1$ESG.[udaje1$YEAR == 2015]
## t = -1.3118, df = 146.15, p-value = 0.1916
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -0.05974418  0.01207401
## sample estimates:
## mean of x mean of y 
## 0.2276316 0.2514667

ANOVA

# One-way ANOVA: does the mean ESG index differ between years?
# YEAR is wrapped in factor() so it enters the model as a grouping variable
# rather than as a numeric trend.
anova.result <- aov(
  formula = ESG. ~ factor(YEAR),
  data = udaje1
)
summary(anova.result)

##               Df Sum Sq Mean Sq F value   Pr(>F)    
## factor(YEAR)   9  0.636 0.07064   3.381 0.000444 ***
## Residuals    748 15.626 0.02089                     
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 2 observations deleted due to missingness

Lineárna regresia

# Linear regression on the 2013 subset: the ESG index is explained by the
# RETU, FIRM and DEBT columns.
model <- lm(
  formula = ESG. ~ RETU + FIRM + DEBT,
  data = udaje.2013
)
summary(model)

## 
## Call:
## lm(formula = ESG. ~ RETU + FIRM + DEBT, data = udaje.2013)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -0.182294 -0.060003  0.001775  0.051708  0.246346 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -3.000e-01  7.981e-02  -3.759 0.000347 ***
## RETU         1.661e-04  9.086e-05   1.828 0.071807 .  
## FIRM         7.832e-02  1.083e-02   7.234 4.37e-10 ***
## DEBT        -5.500e-04  4.211e-04  -1.306 0.195724    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.0807 on 71 degrees of freedom
##   (1 observation deleted due to missingness)
## Multiple R-squared:  0.4328, Adjusted R-squared:  0.4088 
## F-statistic: 18.06 on 3 and 71 DF,  p-value: 8.234e-09

Moderné regresné metódy – grafy, tabuľky a jednoduché štatistiky