# Global knitr chunk options: echo the code of every chunk, but keep messages
# and warnings out of the rendered document.
knitr::opts_chunk$set(echo = TRUE, message = FALSE, warning = FALSE)

Import údajov z .csv alebo .xls

Pre účely tejto úlohy som si vybrala databázu movies.csv, ktorá zobrazuje údaje o filmoch vydaných v rokoch 1986-2016, hlavné obsadenie, režisérov, hodnotenie, trvanie, budget,… Dáta som stiahla z https://www.kaggle.com/datasets/danielgrijalvas/movies?select=movies.csv

Importujeme si údaje do `data.frame`, a to nasledovne:

# Load the movies data set into a data.frame.
# The file is comma-separated and uses "." as the decimal mark, which is what
# read.csv() assumes by default; read.csv2() is the variant intended for
# ";"-separated files with "," decimals, so it needed both defaults overridden.
# NOTE(review): the path is relative and resolves against the working
# directory in effect when the document is knit.
udaje <- read.csv("Ekonometria/movies.csv")
head(udaje)      # first rows, as a quick check of the import
colnames(udaje)  # names of the available columns
##  [1] "name"     "rating"   "genre"    "year"     "released" "score"   
##  [7] "votes"    "director" "writer"   "star"     "country"  "budget"  
## [13] "gross"    "company"  "runtime"

Grafy

ggplot2 - knižnica pre grafy

Výber a následné triedenie

library(dplyr)

# Keep only the films directed by Steven Spielberg and, of those, only the
# columns used below: title, genre, release year, score, budget and gross.
udaje.spielberg <- udaje %>%
  filter(director == "Steven Spielberg") %>%
  select(
    name,
    genre,
    year,
    score,
    budget,
    gross
  )

# Preview the first rows of the selection.
head(udaje.spielberg)

Z dát sme vyselektovali údaje týkajúce sa filmov Stevena Spielberga. Zamerali sme sa na názov, žáner, rok vydania, hodnotenie, budget a zárobok filmu.

Scatter plot

# Scatter plot of gross against budget for the Spielberg films, with every
# point labelled by the film title.
library(ggplot2)
ggplot(udaje.spielberg, aes(budget, gross)) +  # x = budget, y = gross
  geom_point() +  # one point per film
  geom_text(aes(label = name), size = 2, vjust = -0.8) +  # title above point
  ggtitle("pomer zárobku k budgetu") +  # plot title
  xlab("Budget") +  # axis labels
  ylab("Zárobok") +
  theme_minimal()

Na grafe sme znázornili zárobok filmu v závislosti od budgetu pre daný film Spielberga. Vidíme, že E.T. bol Spielbergov najúspešnejší film v tejto kategórii, najvyšší zárobok pri najnižšom budgete. Z grafu sa nám ťažko identifikuje, či je medzi budgetom a zárobkom nejaká závislosť. Vieme si preto overiť koreláciu medzi týmito dvoma údajmi.

# Pearson correlation between budget and gross for the Spielberg films.
# use = "complete.obs" restricts the computation to films where both values
# are present; without it a single NA in either column makes cor() return NA.
cor(
  udaje.spielberg$budget,
  udaje.spielberg$gross,
  use = "complete.obs",
  method = "pearson"
)
## [1] 0.3335166

Výsledok 0.3335166 nám hovorí, že je mierna pozitívna korelácia medzi budgetom a zárobkom.

Boxplot

Tento typ grafu využijeme na zobrazenie budgetov pre filmy rôznych žánrov.

# Box plots of budget by genre
library(ggplot2)

# Count the films in each genre so the most frequent genres can be picked
# for the plot.
table(udaje[["genre"]])
## 
##    Action Adventure Animation Biography    Comedy     Crime     Drama    Family 
##      1705       427       338       443      2245       551      1518        11 
##   Fantasy   History    Horror     Music   Musical   Mystery   Romance    Sci-Fi 
##        44         1       322         1         2        20        10        10 
##     Sport  Thriller   Western 
##         1        16         3
# Genres kept for the per-genre comparisons (also reused by later chunks).
len <- c(
  "Action", "Adventure", "Animation", "Biography", "Comedy",
  "Crime", "Drama", "Fantasy", "Horror"
)

# One box per selected genre showing the distribution of film budgets.
ggplot(udaje[udaje$genre %in% len, ], aes(genre, budget)) +  # x = genre, y = budget
  geom_boxplot(colour = "brown", fill = "pink") +  # box-plot layer
  theme_minimal() +
  labs(  # title and axis labels
    x = "žáner",
    y = "budget",
    title = "budget pre jednotlivé žánre"
  )

Z boxplotov pre jednotlivé žánre vidíme, že akčné filmy majú najväčšie horné odchýlky zo všetkých žánrov. V priemere najnákladnejšie sú animované filmy a po nich akčné filmy. Toto pravdepodobne vyplýva z nákladov na digitálnu prácu, animácie a CGI efekty.

Základné štatistiky.

knitr - tabuľka

Zo zvedavosti si zostavíme tabuľku so štatistikou o dĺžke filmu v jednotlivých žánroch - priemerné hodnoty, odchýlky. Takáto tabuľka nám môže pomôcť zamyslieť sa nad tým, ako tvorba a marketing takýchto filmov fungujú.

library(dplyr)
library(knitr)

# Descriptive statistics of film runtime for each genre listed in `len`.
# Every statistic except `n` ignores missing runtimes; `n` is the number of
# films in the genre regardless of whether their runtime is known.
runtime.stats <- udaje %>%
  filter(genre %in% len) %>%
  group_by(genre) %>%
  summarise(
    n = n(),
    mean = mean(runtime, na.rm = TRUE),
    sd = sd(runtime, na.rm = TRUE),
    min = min(runtime, na.rm = TRUE),
    q25 = quantile(runtime, probs = 0.25, na.rm = TRUE),
    median = median(runtime, na.rm = TRUE),
    q75 = quantile(runtime, probs = 0.75, na.rm = TRUE),
    max = max(runtime, na.rm = TRUE),
    .groups = "drop"  # return an ungrouped result
  )

# Render the summary as a plain knitr table, numbers rounded to 2 decimals.
# NOTE(review): the caption spells "źánroch"; "žánroch" is probably intended.
# The text is kept verbatim here so the rendered output does not change.
runtime.stats %>%
  kable(caption = "štatistika dĺžky filmu v jednotlivých źánroch", digits = 2)
štatistika dĺžky filmu v jednotlivých źánroch
genre n mean sd min q25 median q75 max
Action 1705 110.21 17.96 75 97.0 107.0 120.00 201
Adventure 427 107.98 19.68 73 95.0 103.0 116.50 219
Animation 338 92.20 11.79 63 84.0 91.0 99.00 137
Biography 443 119.88 20.52 79 106.5 117.0 129.00 219
Comedy 2245 101.49 12.99 77 93.0 100.0 108.00 188
Crime 551 111.75 18.19 69 99.0 108.0 120.00 229
Drama 1518 112.97 22.04 71 99.0 109.0 122.00 366
Fantasy 44 99.36 13.37 85 91.0 95.5 104.25 152
Horror 322 96.30 10.36 72 89.0 94.5 102.00 135

alebo krajšie tabuľky s pomocou `kableExtra`:

library(dplyr)
library(knitr)
library(kableExtra)

# Same runtime summary, rendered with kableExtra styling.
runtime.stats %>%
  kable(caption = "Štatistika dĺžky filmov v jednotlivých žánroch", digits = 2) %>%
  kable_styling(
    bootstrap_options = c("striped", "hover", "condensed"),
    full_width = FALSE
  ) %>%
  column_spec(1, bold = TRUE) %>%  # first column (genre) in bold
  row_spec(0, background = "#f2f2f2", bold = TRUE) %>%  # row 0 = header row
  # Spanning header: blank over the first 2 columns, one label over the
  # remaining 7 statistic columns.
  add_header_above(c(" " = 2, "Runtime Statistics" = 7))
Štatistika dĺžky filmov v jednotlivých žánroch
Runtime Statistics
genre n mean sd min q25 median q75 max
Action 1705 110.21 17.96 75 97.0 107.0 120.00 201
Adventure 427 107.98 19.68 73 95.0 103.0 116.50 219
Animation 338 92.20 11.79 63 84.0 91.0 99.00 137
Biography 443 119.88 20.52 79 106.5 117.0 129.00 219
Comedy 2245 101.49 12.99 77 93.0 100.0 108.00 188
Crime 551 111.75 18.19 69 99.0 108.0 120.00 229
Drama 1518 112.97 22.04 71 99.0 109.0 122.00 366
Fantasy 44 99.36 13.37 85 91.0 95.5 104.25 152
Horror 322 96.30 10.36 72 89.0 94.5 102.00 135

Vidíme, že všetky žánre sa v priemere držia pod dvoma hodinami. Animované filmy a horory sú v priemere najkratšie. Biografické filmy majú najvyššiu hodnotu q75, a teda majú pomerne viac dlhších filmov.

Testovanie hypotéz

t-test: Porovnanie priemeru zárobku v rokoch 1990 a 2010

# Two-sample t-test comparing the mean gross of films from 1990 with that of
# films from 2010. Unequal variances (Welch) and a two-sided alternative are
# the t.test() defaults and are spelled out here.
# The x/y expressions are left exactly as written because t.test() deparses
# them into the "data:" line of the printed result.
t.test.result <- t.test(
  x = udaje$gross[udaje$year == 1990],
  y = udaje$gross[udaje$year == 2010],
  alternative = "two.sided",
  var.equal = FALSE
)

print(t.test.result)
## 
##  Welch Two Sample t-test
## 
## data:  udaje$gross[udaje$year == 1990] and udaje$gross[udaje$year == 2010]
## t = -5.6329, df = 269.16, p-value = 4.457e-08
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -109413216  -52738196
## sample estimates:
## mean of x mean of y 
##  35624272 116699977

Výsledok testu nám hovorí, že nulovú hypotézu o rovnosti priemerných zárobkov zamietame, pretože rozdiel priemerov vyšiel štatisticky významný (p-hodnota 4.457e-08). Filmy z roku 1990 majú štatisticky významne nižší priemerný zárobok ako filmy z roku 2010.

ANOVA: Porovnávame hodnotenie filmov podľa žánrov

# Prepare the data for the ANOVA: make sure `score` is numeric, turn `genre`
# into a factor, and keep only rows that are usable for the model.
udaje_clean <- udaje %>%
  mutate(
    # In case score arrives as text with decimal commas: swap "," for "."
    # before converting to numeric.
    score = as.numeric(gsub(",", ".", score, fixed = TRUE)),
    genre = as.factor(genre)
  ) %>%
  # Keep rows with a finite score and a known genre.
  filter(is.finite(score) & !is.na(genre))

# One-way ANOVA of score by genre. The call is written exactly like this
# because it is echoed in the output of later post-hoc tests.
fit <- aov(score ~ genre, data = udaje_clean)
summary(fit)
##               Df Sum Sq Mean Sq F value Pr(>F)    
## genre         18    719   39.96    47.2 <2e-16 ***
## Residuals   7646   6474    0.85                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Výsledok ANOVA testu nám ukazuje, že medzi priemernými hodnoteniami filmov v jednotlivých žánroch je štatisticky významný rozdiel. Nasledujúcim kódom (Tukeyho HSD test) sa vieme pozrieť, medzi ktorými dvojicami žánrov k takýmto rozdielom dochádza.

# Tukey HSD post-hoc test: pairwise differences in mean score between genres,
# at the (default) 95% family-wise confidence level.
TukeyHSD(fit, conf.level = 0.95)
##   Tukey multiple comparisons of means
##     95% family-wise confidence level
## 
## Fit: aov(formula = score ~ genre, data = udaje_clean)
## 
## $genre
##                             diff          lwr         upr     p adj
## Adventure-Action     0.088752185 -0.086459313  0.26396368 0.9585520
## Animation-Action     0.566413868  0.373637259  0.75919048 0.0000000
## Biography-Action     0.828108606  0.655445728  1.00077148 0.0000000
## Comedy-Action       -0.008830264 -0.112851070  0.09519054 1.0000000
## Crime-Action         0.468689451  0.310024029  0.62735487 0.0000000
## Drama-Action         0.490850645  0.376546191  0.60515510 0.0000000
## Family-Action        0.160819462 -0.818491320  1.14013024 1.0000000
## Fantasy-Action      -0.195998720 -0.690342634  0.29834520 0.9971668
## History-Action       2.097183099 -1.141340069  5.33570627 0.7255634
## Horror-Action       -0.452195783 -0.648928506 -0.25546306 0.0000000
## Music-Action         0.997183099 -2.241340069  4.23570627 0.9999047
## Musical-Action       1.847183099 -0.443470045  4.13783624 0.3100964
## Mystery-Action       0.462183099 -0.265996403  1.19036260 0.7567299
## Romance-Action       0.207183099 -0.819627223  1.23399342 0.9999999
## Sci-Fi-Action        0.007183099 -1.019627223  1.03399342 1.0000000
## Sport-Action        -0.302816901 -3.541340069  2.93570627 1.0000000
## Thriller-Action     -0.290316901 -1.103501320  0.52286752 0.9992599
## Western-Action      -0.536150235 -2.407008771  1.33470830 0.9999666
## Animation-Adventure  0.477661683  0.241951664  0.71337170 0.0000000
## Biography-Adventure  0.739356421  0.519791257  0.95892159 0.0000000
## Comedy-Adventure    -0.097582450 -0.268511441  0.07334654 0.8837618
## Crime-Adventure      0.379937265  0.171200330  0.58867420 0.0000000
## Drama-Adventure      0.402098460  0.224723433  0.57947349 0.0000000
## Family-Adventure     0.072067277 -0.916591411  1.06072596 1.0000000
## Fantasy-Adventure   -0.284750905 -0.797364110  0.22786230 0.9065221
## History-Adventure    2.008430913 -1.232931254  5.24979308 0.7911749
## Horror-Adventure    -0.540947969 -0.779904358 -0.30199158 0.0000000
## Music-Adventure      0.908430913 -2.332931254  4.14979308 0.9999761
## Musical-Adventure    1.758430913 -0.536234251  4.05309608 0.4073767
## Mystery-Adventure    0.373430913 -0.367272657  1.11413448 0.9603671
## Romance-Adventure    0.118430913 -0.917298698  1.15416052 1.0000000
## Sci-Fi-Adventure    -0.081569087 -1.117298698  0.95416052 1.0000000
## Sport-Adventure     -0.391569087 -3.632931254  2.84979308 1.0000000
## Thriller-Adventure  -0.379069087 -1.203487240  0.44534907 0.9843782
## Western-Adventure   -0.624902420 -2.500671077  1.25086624 0.9997103
## Biography-Animation  0.261694739  0.027872984  0.49551649 0.0112969
## Comedy-Animation    -0.575244132 -0.764136883 -0.38635138 0.0000000
## Crime-Animation     -0.097724417 -0.321409150  0.12596032 0.9910913
## Drama-Animation     -0.075563223 -0.270308317  0.11918187 0.9978331
## Family-Animation    -0.405594406 -1.397516657  0.58632785 0.9958788
## Fantasy-Animation   -0.762412587 -1.281292197 -0.24353298 0.0000398
## History-Animation    1.530769231 -1.711589857  4.77312832 0.9793694
## Horror-Animation    -1.018609651 -1.270728487 -0.76649082 0.0000000
## Music-Animation      0.430769231 -2.811589857  3.67312832 1.0000000
## Musical-Animation    1.280769231 -1.015303932  3.57684239 0.9032987
## Mystery-Animation   -0.104230769 -0.849284815  0.64082328 1.0000000
## Romance-Animation   -0.359230769 -1.398076080  0.67961454 0.9995175
## Sci-Fi-Animation    -0.559230769 -1.598076080  0.47961454 0.9281710
## Sport-Animation     -0.869230769 -4.111589857  2.37312832 0.9999877
## Thriller-Animation  -0.856730769 -1.685059836 -0.02840170 0.0334493
## Western-Animation   -1.102564103 -2.980054930  0.77492672 0.8561005
## Comedy-Biography    -0.836938871 -1.005254408 -0.66862333 0.0000000
## Crime-Biography     -0.359419156 -0.566021451 -0.15281686 0.0000002
## Drama-Biography     -0.337257962 -0.512115905 -0.16240002 0.0000000
## Family-Biography    -0.667289144 -1.655499345  0.32092106 0.6540825
## Fantasy-Biography   -1.024107326 -1.535855016 -0.51235964 0.0000000
## History-Biography    1.269074492 -1.972150909  4.51029989 0.9975739
## Horror-Biography    -1.280304390 -1.517398372 -1.04321041 0.0000000
## Music-Biography      0.169074492 -3.072150909  3.41029989 1.0000000
## Musical-Biography    1.019074492 -1.275397477  3.31354646 0.9892880
## Mystery-Biography   -0.365925508 -1.106030351  0.37417934 0.9671600
## Romance-Biography   -0.620925508 -1.656227023  0.41437601 0.8325847
## Sci-Fi-Biography    -0.820925508 -1.856227023  0.21437601 0.3413311
## Sport-Biography     -1.130925508 -4.372150909  2.11029989 0.9994553
## Thriller-Biography  -1.118425508 -1.942305773 -0.29454524 0.0002920
## Western-Biography   -1.364258841 -3.239791154  0.51127347 0.5113875
## Crime-Comedy         0.477519715  0.323596466  0.63144296 0.0000000
## Drama-Comedy         0.499680909  0.392055825  0.60730599 0.0000000
## Family-Comedy        0.169649727 -0.808903930  1.14820338 1.0000000
## Fantasy-Comedy      -0.187168455 -0.680010780  0.30567387 0.9983472
## History-Comedy       2.106013363 -1.132280935  5.34430766 0.7186532
## Horror-Comedy       -0.443365519 -0.636294041 -0.25043700 0.0000000
## Music-Comedy         1.006013363 -2.232280935  4.24430766 0.9998917
## Musical-Comedy       1.856013363 -0.434316193  4.14634292 0.3011995
## Mystery-Comedy       0.471013363 -0.256147581  1.19817431 0.7251440
## Romance-Comedy       0.216013363 -0.810074882  1.24210161 0.9999997
## Sci-Fi-Comedy        0.016013363 -1.010074882  1.04210161 1.0000000
## Sport-Comedy        -0.293986637 -3.532280935  2.94430766 1.0000000
## Thriller-Comedy     -0.281486637 -1.093759097  0.53078582 0.9995033
## Western-Comedy      -0.527319970 -2.397782296  1.34314236 0.9999738
## Drama-Crime          0.022161194 -0.138890186  0.18321257 1.0000000
## Family-Crime        -0.307869988 -1.293730855  0.67799088 0.9998833
## Fantasy-Crime       -0.664688170 -1.171884331 -0.15749201 0.0006413
## History-Crime        1.628493648 -1.612016242  4.86900354 0.9615612
## Horror-Crime        -0.920885234 -1.147988301 -0.69378217 0.0000000
## Music-Crime          0.528493648 -2.712016242  3.76900354 1.0000000
## Musical-Crime        1.378493648 -0.914967462  3.67195476 0.8300226
## Mystery-Crime       -0.006506352 -0.743471362  0.73045866 1.0000000
## Romance-Crime       -0.261506352 -1.294565631  0.77155293 0.9999949
## Sci-Fi-Crime        -0.461506352 -1.494565631  0.57155293 0.9885754
## Sport-Crime         -0.771506352 -4.012016242  2.46900354 0.9999981
## Thriller-Crime      -0.759006352 -1.580067214  0.06205451 0.1119965
## Western-Crime       -1.004839685 -2.879135207  0.86945584 0.9306731
## Family-Drama        -0.330031183 -1.309731361  0.64966900 0.9996623
## Fantasy-Drama       -0.686849364 -1.181964237 -0.19173449 0.0001747
## History-Drama        1.606332454 -1.632308487  4.84497339 0.9661316
## Horror-Drama        -0.943046428 -1.141708440 -0.74438442 0.0000000
## Music-Drama          0.506332454 -2.732308487  3.74497339 1.0000000
## Musical-Drama        1.356332454 -0.934487193  3.64715210 0.8472745
## Mystery-Drama       -0.028667546 -0.757370653  0.70003556 1.0000000
## Romance-Drama       -0.283667546 -1.310849257  0.74351416 0.9999808
## Sci-Fi-Drama        -0.483667546 -1.510849257  0.54351416 0.9799281
## Sport-Drama         -0.793667546 -4.032308487  2.44497339 0.9999969
## Thriller-Drama      -0.781167546 -1.594820869  0.03248578 0.0776425
## Western-Drama       -1.027000880 -2.898063276  0.84406152 0.9153044
## Fantasy-Family      -0.356818182 -1.448203924  0.73456756 0.9997769
## History-Family       1.936363636 -1.445171406  5.31789868 0.8810001
## Horror-Family       -0.613015246 -1.605713939  0.37968345 0.7957760
## Music-Family         0.836363636 -2.545171406  4.21789868 0.9999965
## Musical-Family       1.686363636 -0.802378768  4.17510604 0.6478075
## Mystery-Family       0.301363636 -0.913952092  1.51667936 0.9999963
## Romance-Family       0.046363636 -1.368233963  1.46096124 1.0000000
## Sci-Fi-Family       -0.153636364 -1.568233963  1.26096124 1.0000000
## Sport-Family        -0.463636364 -3.845171406  2.91789868 1.0000000
## Thriller-Family     -0.451136364 -1.719212004  0.81693928 0.9992936
## Western-Family      -0.696969697 -2.805727292  1.41178790 0.9997404
## History-Fantasy      2.293181818 -0.980975408  5.56733904 0.5857892
## Horror-Fantasy      -0.256197064 -0.776559431  0.26416530 0.9684893
## Music-Fantasy        1.193181818 -2.080975408  4.46733904 0.9990297
## Musical-Fantasy      2.043181818 -0.297579821  4.38394346 0.1815119
## Mystery-Fantasy      0.658181818 -0.214926775  1.53129041 0.4399647
## Romance-Fantasy      0.403181818 -0.731019515  1.53738315 0.9993012
## Sci-Fi-Fantasy       0.203181818 -0.931019515  1.33738315 1.0000000
## Sport-Fantasy       -0.106818182 -3.380975408  3.16733904 1.0000000
## Thriller-Fantasy    -0.094318182 -1.039485960  0.85084960 1.0000000
## Western-Fantasy     -0.340151515 -2.272037871  1.59173484 1.0000000
## Horror-History      -2.549378882 -5.791975588  0.69321782 0.3574912
## Music-History       -1.100000000 -5.678620092  3.47862009 0.9999978
## Musical-History     -0.250000000 -4.215201314  3.71520131 1.0000000
## Mystery-History     -1.635000000 -4.952525436  1.68252544 0.9681776
## Romance-History     -1.890000000 -5.285595540  1.50559554 0.9049823
## Sci-Fi-History      -2.090000000 -5.485595540  1.30559554 0.8001523
## Sport-History       -2.400000000 -6.978620092  2.17862009 0.9431768
## Thriller-History    -2.387500000 -5.724714187  0.94971419 0.5443583
## Western-History     -2.633333333 -6.371760983  1.10509432 0.5747559
## Music-Horror         1.449378882 -1.793217824  4.69197559 0.9885066
## Musical-Horror       2.299378882  0.002970183  4.59578758 0.0492671
## Mystery-Horror       0.914378882  0.168291438  1.66046633 0.0024544
## Romance-Horror       0.659378882 -0.380207826  1.69896559 0.7577686
## Sci-Fi-Horror        0.459378882 -0.580207826  1.49896559 0.9898782
## Sport-Horror         0.149378882 -3.093217824  3.39197559 1.0000000
## Thriller-Horror      0.161878882 -0.667379815  0.99113758 0.9999999
## Western-Horror      -0.083954451 -1.961855607  1.79394670 1.0000000
## Musical-Music        0.850000000 -3.115201314  4.81520131 0.9999996
## Mystery-Music       -0.535000000 -3.852525436  2.78252544 1.0000000
## Romance-Music       -0.790000000 -4.185595540  2.60559554 0.9999986
## Sci-Fi-Music        -0.990000000 -4.385595540  2.40559554 0.9999569
## Sport-Music         -1.300000000 -5.878620092  3.27862009 0.9999709
## Thriller-Music      -1.287500000 -4.624714187  2.04971419 0.9979830
## Western-Music       -1.533333333 -5.271760983  2.20509432 0.9957244
## Mystery-Musical     -1.385000000 -3.786048632  1.01604863 0.8741383
## Romance-Musical     -1.640000000 -4.147813506  0.86781351 0.7095630
## Sci-Fi-Musical      -1.840000000 -4.347813506  0.66781351 0.4941312
## Sport-Musical       -2.150000000 -6.115201314  1.81520131 0.9235936
## Thriller-Musical    -2.137500000 -4.565679986  0.29067999 0.1699261
## Western-Musical     -2.383333333 -5.338819894  0.57215323 0.3100747
## Romance-Mystery     -0.255000000 -1.508906753  0.99890675 0.9999998
## Sci-Fi-Mystery      -0.455000000 -1.708906753  0.79890675 0.9990827
## Sport-Mystery       -0.765000000 -4.082525436  2.55252544 0.9999988
## Thriller-Mystery    -0.752500000 -1.838415102  0.33341510 0.6061582
## Western-Mystery     -0.998333333 -3.002841845  1.00617518 0.9647456
## Sci-Fi-Romance      -0.200000000 -1.647886803  1.24788680 1.0000000
## Sport-Romance       -0.510000000 -3.905595540  2.88559554 1.0000000
## Thriller-Romance    -0.497500000 -1.802607527  0.80760753 0.9982669
## Western-Romance     -0.743333333 -2.874565001  1.38789833 0.9994582
## Sport-Sci-Fi        -0.310000000 -3.705595540  3.08559554 1.0000000
## Thriller-Sci-Fi     -0.297500000 -1.602607527  1.00760753 0.9999990
## Western-Sci-Fi      -0.543333333 -2.674565001  1.58789833 0.9999943
## Thriller-Sport       0.012500000 -3.324714187  3.34971419 1.0000000
## Western-Sport       -0.233333333 -3.971760983  3.50509432 1.0000000
## Western-Thriller    -0.245833333 -2.282761875  1.79109521 1.0000000

Linear Regression

# Linear regression of score on budget, gross and runtime. Rows with a missing
# value in any of these variables are dropped by lm() (reported by summary()
# as observations deleted due to missingness).
model <- lm(formula = score ~ budget + gross + runtime, data = udaje)

# Coefficient table, residual summary and goodness-of-fit statistics.
summary(model)
## 
## Call:
## lm(formula = score ~ budget + gross + runtime, data = udaje)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -4.1632 -0.4673  0.0678  0.5550  2.5502 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  4.053e+00  7.091e-02   57.16   <2e-16 ***
## budget      -7.227e-09  4.181e-10  -17.29   <2e-16 ***
## gross        1.734e-09  9.154e-11   18.94   <2e-16 ***
## runtime      2.238e-02  6.703e-04   33.39   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.8471 on 5431 degrees of freedom
##   (2233 observations deleted due to missingness)
## Multiple R-squared:  0.227,  Adjusted R-squared:  0.2266 
## F-statistic: 531.6 on 3 and 5431 DF,  p-value: < 2.2e-16

V modeli lineárnej regresie sledujeme, či sú budget, box office zárobok a dĺžka filmu štatisticky významné pre hodnotenie filmu. Z výsledku modelu vidíme, že všetky tri premenné sú pre hodnotenie štatisticky významné. Zárobok a dĺžka filmu ovplyvňujú hodnotenie kladne, ale, prekvapivo, vyšší budget filmu zvykne “viesť” skôr k nižšiemu finálnemu hodnoteniu. Toto by mohlo naznačovať, že na natočenie dobrého filmu netreba ani tak veľa peňazí ako skôr nápad a talent. Výsledky lineárnej regresie si môžeme vypísať aj do tabuľky:

library(broom)
library(dplyr)
library(kableExtra)
library(stringr)

# Tidy coefficient table of `model` with 95% confidence intervals and
# significance stars. The numeric columns stay numeric here; they are turned
# into display strings only when the table is rendered below.
coef.tbl <- tidy(model, conf.int = TRUE) %>%
  mutate(
    # Only the intercept needs a friendlier label; recode() leaves every
    # other term name unchanged.
    term = recode(term, "(Intercept)" = "Intercept"),
    stars = case_when(
      p.value < 0.001 ~ "***",
      p.value < 0.01  ~ "**",
      p.value < 0.05  ~ "*",
      p.value < 0.1   ~ "·",
      TRUE            ~ ""
    )
  ) %>%
  transmute(
    Term = term,
    Estimate = estimate,
    `Std. Error` = std.error,
    `t value` = statistic,
    `p value` = p.value,
    # Use 3 significant digits rather than 3 decimals: the budget and gross
    # coefficients are of order 1e-9, so fixed-decimal rounding collapsed
    # their intervals to "[0, 0]".
    `95% CI` = str_c(
      "[", formatC(conf.low, digits = 3, format = "g"),
      ", ", formatC(conf.high, digits = 3, format = "g"), "]"
    ),
    Sig = stars
  )

# Render the table. Estimates and standard errors are formatted to 3
# significant digits (so tiny coefficients are not shown as 0.000) and
# p-values below 0.001 are shown as "<0.001" instead of 0.
coef.tbl %>%
  mutate(
    Estimate = formatC(Estimate, digits = 3, format = "g"),
    `Std. Error` = formatC(`Std. Error`, digits = 3, format = "g"),
    `p value` = format.pval(`p value`, digits = 3, eps = 0.001)
  ) %>%
  kable(
    digits = 3,  # still applies to the numeric `t value` column
    # Keep the formatted (now character) number columns right-aligned.
    align = c("l", "r", "r", "r", "r", "l", "l"),
    caption = "Výsledok lineárnej regresie"
  ) %>%
  kable_styling(full_width = FALSE, bootstrap_options = c("striped", "hover", "condensed")) %>%
  column_spec(1, bold = TRUE) %>%  # term names in bold
  row_spec(0, bold = TRUE, background = "#f2f2f2") %>%  # header row
  footnote(
    general = "Signif. codes: *** p<0.001, ** p<0.01, * p<0.05, · p<0.1.",
    threeparttable = TRUE
  )
Výsledok lineárnej regresie
Term Estimate Std. Error t value p value 95% CI Sig
Intercept 4.053 0.071 57.165 0 [3.914, 4.192] ***
budget 0.000 0.000 -17.287 0 [0, 0] ***
gross 0.000 0.000 18.944 0 [0, 0] ***
runtime 0.022 0.001 33.389 0 [0.021, 0.024] ***
Note:
Signif. codes: *** p<0.001, ** p<0.01, * p<0.05, · p<0.1.
library(corrplot)

# Pearson correlation matrix of the four numeric variables, computed on the
# rows where none of them is missing.
num_cols <- udaje[c("score", "budget", "gross", "runtime")]
corr <- cor(num_cols, method = "pearson", use = "complete.obs")

# Heat map of the upper triangle with the coefficients printed in the cells;
# the palette runs from blue through white to red.
corrplot(
  corr,
  method = "color",
  type = "upper",
  col = colorRampPalette(c("blue", "white", "red"))(200),
  addCoef.col = "black",
  tl.col = "black",
  tl.srt = 45,
  title = "Korelačná matica numerických premenných",
  mar = c(0, 0, 2, 0)  # extra top margin so the title is not clipped
)

Na heatmape vidíme vizuálne zobrazené závislosti medzi jednotlivými numerickými premennými. Kde je políčko červenšie, tam je kladná závislosť silnejšia. Keďže nemáme žiadnu zápornú koreláciu, nemáme žiadne modré políčka.