PONAVLJANJE: STATISTIČKI TESTOVI

Hrvatski studiji

dr.sc. Luka Šikić

05 ožujak, 2020

CILJEVI PREDAVANJA

PRETPOSTAVKE STATISTIČKIH TESTOVA

  1. Da li su dvije ili više varijabli međusobno korelirane?
  2. Da se dvije ili više grupa međusobno razlikuju?
  3. Da li postoje razlike u varijabilnosti između dva uzorka?
  4. Da li postoje razlike između proporcija?
  1. Korelacijski test/korelacijska matrica.
  2. Studentov t-test;Wilcoxon test/ANOVA;Kruskal-Wallis test.
  3. F-test;Fligner-Kileen test.
  4. Chi-sq test.
  1. Normalnost distribucije.
  2. Homogenost varijance.

NORMALNOST DISTRIBUCIJE

dta <- ToothGrowth # Učitaj podatke
dplyr::sample_n(dta, 10) # Pregled podataka
##     len supp dose
## 1  26.4   OJ  1.0
## 2  24.5   OJ  2.0
## 3  11.5   VC  0.5
## 4  25.5   OJ  2.0
## 5  17.3   VC  1.0
## 6  14.5   OJ  0.5
## 7  23.6   OJ  1.0
## 8   8.2   OJ  0.5
## 9  14.5   VC  1.0
## 10 18.8   VC  1.0
summary(dta) # Pregled podataka
##       len        supp         dose      
##  Min.   : 4.20   OJ:30   Min.   :0.500  
##  1st Qu.:13.07   VC:30   1st Qu.:0.500  
##  Median :19.25           Median :1.000  
##  Mean   :18.81           Mean   :1.167  
##  3rd Qu.:25.27           3rd Qu.:2.000  
##  Max.   :33.90           Max.   :2.000
# Pogledaj distribuciju
ggdensity(dta$len,
          main = "Distribucija varijable duljina zuba (len)",
          xlab = "Duljina zuba") 

# Pogledaj QQ plot
ggqqplot(dta$len)

# Provedi Shapiro-Wilk tets
shapiro.test(dta$len)
## 
##  Shapiro-Wilk normality test
## 
## data:  dta$len
## W = 0.96743, p-value = 0.1091

TESTOVI KORELACIJE

  1. Pearson
  2. Spearman
# Učitaj podatke
cor_dta <- mtcars
head(cor_dta, 10) # Pregled podataka
##                    mpg cyl  disp  hp drat    wt  qsec vs am gear carb
## Mazda RX4         21.0   6 160.0 110 3.90 2.620 16.46  0  1    4    4
## Mazda RX4 Wag     21.0   6 160.0 110 3.90 2.875 17.02  0  1    4    4
## Datsun 710        22.8   4 108.0  93 3.85 2.320 18.61  1  1    4    1
## Hornet 4 Drive    21.4   6 258.0 110 3.08 3.215 19.44  1  0    3    1
## Hornet Sportabout 18.7   8 360.0 175 3.15 3.440 17.02  0  0    3    2
## Valiant           18.1   6 225.0 105 2.76 3.460 20.22  1  0    3    1
## Duster 360        14.3   8 360.0 245 3.21 3.570 15.84  0  0    3    4
## Merc 240D         24.4   4 146.7  62 3.69 3.190 20.00  1  0    4    2
## Merc 230          22.8   4 140.8  95 3.92 3.150 22.90  1  0    4    2
## Merc 280          19.2   6 167.6 123 3.92 3.440 18.30  1  0    4    4
summary(cor_dta)  # Pregled podataka
##       mpg             cyl             disp             hp       
##  Min.   :10.40   Min.   :4.000   Min.   : 71.1   Min.   : 52.0  
##  1st Qu.:15.43   1st Qu.:4.000   1st Qu.:120.8   1st Qu.: 96.5  
##  Median :19.20   Median :6.000   Median :196.3   Median :123.0  
##  Mean   :20.09   Mean   :6.188   Mean   :230.7   Mean   :146.7  
##  3rd Qu.:22.80   3rd Qu.:8.000   3rd Qu.:326.0   3rd Qu.:180.0  
##  Max.   :33.90   Max.   :8.000   Max.   :472.0   Max.   :335.0  
##       drat             wt             qsec             vs        
##  Min.   :2.760   Min.   :1.513   Min.   :14.50   Min.   :0.0000  
##  1st Qu.:3.080   1st Qu.:2.581   1st Qu.:16.89   1st Qu.:0.0000  
##  Median :3.695   Median :3.325   Median :17.71   Median :0.0000  
##  Mean   :3.597   Mean   :3.217   Mean   :17.85   Mean   :0.4375  
##  3rd Qu.:3.920   3rd Qu.:3.610   3rd Qu.:18.90   3rd Qu.:1.0000  
##  Max.   :4.930   Max.   :5.424   Max.   :22.90   Max.   :1.0000  
##        am              gear            carb      
##  Min.   :0.0000   Min.   :3.000   Min.   :1.000  
##  1st Qu.:0.0000   1st Qu.:3.000   1st Qu.:2.000  
##  Median :0.0000   Median :4.000   Median :2.000  
##  Mean   :0.4062   Mean   :3.688   Mean   :2.812  
##  3rd Qu.:1.0000   3rd Qu.:4.000   3rd Qu.:4.000  
##  Max.   :1.0000   Max.   :5.000   Max.   :8.000
str(cor_dta)      # Pregled podataka
## 'data.frame':    32 obs. of  11 variables:
##  $ mpg : num  21 21 22.8 21.4 18.7 18.1 14.3 24.4 22.8 19.2 ...
##  $ cyl : num  6 6 4 6 8 6 8 4 4 6 ...
##  $ disp: num  160 160 108 258 360 ...
##  $ hp  : num  110 110 93 110 175 105 245 62 95 123 ...
##  $ drat: num  3.9 3.9 3.85 3.08 3.15 2.76 3.21 3.69 3.92 3.92 ...
##  $ wt  : num  2.62 2.88 2.32 3.21 3.44 ...
##  $ qsec: num  16.5 17 18.6 19.4 17 ...
##  $ vs  : num  0 0 1 1 0 1 0 1 1 1 ...
##  $ am  : num  1 1 1 0 0 0 0 0 0 0 ...
##  $ gear: num  4 4 4 3 3 3 3 4 4 4 ...
##  $ carb: num  4 4 1 1 2 1 4 2 2 4 ...
  1. Linearnost odnosa.
# Dijagram raspršivanja
ggscatter(cor_dta, x = "mpg", y = "wt",
          add = "reg.line", conf.int = T,
          cor.coef = T, cor.method = "pearson",
          xlab = "Milje/galon", ylab = "Težina(1k lbs)")

  1. Normalnost distribucije(QQ plot;Shapiro-Wilk).
# QQ plot za "mpg"
ggqqplot(cor_dta$mpg, ylab = "MPG")

# QQ plot za "wg
ggqqplot(cor_dta$wt, ylab = "WT")

# Test za "mpg"
shapiro.test(cor_dta$mpg)
## 
##  Shapiro-Wilk normality test
## 
## data:  cor_dta$mpg
## W = 0.94756, p-value = 0.1229
# Test za "wt"
shapiro.test(cor_dta$wt)
## 
##  Shapiro-Wilk normality test
## 
## data:  cor_dta$wt
## W = 0.94326, p-value = 0.09265
# Provedi Pearsonov korelacijski test

pearson <- cor.test(cor_dta$mpg, cor_dta$wt,
                    method = "pearson") 

print(pearson) # Pregled rezultata testa
## 
##  Pearson's product-moment correlation
## 
## data:  cor_dta$mpg and cor_dta$wt
## t = -9.559, df = 30, p-value = 1.294e-10
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.9338264 -0.7440872
## sample estimates:
##        cor 
## -0.8676594
print(pearson$p.value)  # Pregled p-vrijednosti
## [1] 1.293959e-10
print(pearson$estimate) # Pregled korelacijskog koeficijenta
##        cor 
## -0.8676594
# Provedi Spearmanov korelacijski test

spearman <- cor.test(cor_dta$mpg, cor_dta$wt,
                    method = "spearman")
print(spearman) # Pregled rezultata testa
## 
##  Spearman's rank correlation rho
## 
## data:  cor_dta$mpg and cor_dta$wt
## S = 10292, p-value = 1.488e-11
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
##       rho 
## -0.886422
print(spearman$p.value)  # Pregled p-vrijednosti
## [1] 1.487595e-11
print(spearman$estimate) # Pregled korelacijskog koeficijenta
##       rho 
## -0.886422
# Učitaj podatke
m_cor_dta <- mtcars[, c(1,3,4,5,6,7)]
head(m_cor_dta,10) #Pregled podataka
##                    mpg  disp  hp drat    wt  qsec
## Mazda RX4         21.0 160.0 110 3.90 2.620 16.46
## Mazda RX4 Wag     21.0 160.0 110 3.90 2.875 17.02
## Datsun 710        22.8 108.0  93 3.85 2.320 18.61
## Hornet 4 Drive    21.4 258.0 110 3.08 3.215 19.44
## Hornet Sportabout 18.7 360.0 175 3.15 3.440 17.02
## Valiant           18.1 225.0 105 2.76 3.460 20.22
## Duster 360        14.3 360.0 245 3.21 3.570 15.84
## Merc 240D         24.4 146.7  62 3.69 3.190 20.00
## Merc 230          22.8 140.8  95 3.92 3.150 22.90
## Merc 280          19.2 167.6 123 3.92 3.440 18.30
cor_mtx <- cor(m_cor_dta) # Izračunaj korelacijsku matricu
round(cor_mtx,2) # Pregled korelacijskih koeficijenata
##        mpg  disp    hp  drat    wt  qsec
## mpg   1.00 -0.85 -0.78  0.68 -0.87  0.42
## disp -0.85  1.00  0.79 -0.71  0.89 -0.43
## hp   -0.78  0.79  1.00 -0.45  0.66 -0.71
## drat  0.68 -0.71 -0.45  1.00 -0.71  0.09
## wt   -0.87  0.89  0.66 -0.71  1.00 -0.17
## qsec  0.42 -0.43 -0.71  0.09 -0.17  1.00
cor_mtx_pv <- Hmisc::rcorr(as.matrix(m_cor_dta)) # Pripadajuće p-vrijednosti
print(cor_mtx_pv) # Pregled rezultata
##        mpg  disp    hp  drat    wt  qsec
## mpg   1.00 -0.85 -0.78  0.68 -0.87  0.42
## disp -0.85  1.00  0.79 -0.71  0.89 -0.43
## hp   -0.78  0.79  1.00 -0.45  0.66 -0.71
## drat  0.68 -0.71 -0.45  1.00 -0.71  0.09
## wt   -0.87  0.89  0.66 -0.71  1.00 -0.17
## qsec  0.42 -0.43 -0.71  0.09 -0.17  1.00
## 
## n= 32 
## 
## 
## P
##      mpg    disp   hp     drat   wt     qsec  
## mpg         0.0000 0.0000 0.0000 0.0000 0.0171
## disp 0.0000        0.0000 0.0000 0.0000 0.0131
## hp   0.0000 0.0000        0.0100 0.0000 0.0000
## drat 0.0000 0.0000 0.0100        0.0000 0.6196
## wt   0.0000 0.0000 0.0000 0.0000        0.3389
## qsec 0.0171 0.0131 0.0000 0.6196 0.3389
# Vizualizacija 1

library(corrplot) # Učitaj paket
corrplot(cor_mtx, type = "upper", order = "hclust", # Korelacijska matrica je input
         tl.col = "black", tl.srt = 45)

# Vizualizacija 2

library(PerformanceAnalytics)
chart.Correlation(m_cor_dta, histogram = T, pch = 19)

# Vizualizacija 3

col <- colorRampPalette(c("blue", "green", "red"))(20) # Definiraj boje
heatmap(cor_mtx, col = col, symm = T) # Prikaži heatmap/ input je korelacijska matrica

USPOREDBA PROSJEKA

  1. Jedan uzorak: t-test(parametarski), Wilcox(neparametarski)
  2. Dva nezavisna uzorka: t-test(parametarski), Wilcox(neparametarski)
  3. Dva zavisna uzorka: t-test(parametarski), Wilcox(neparametarski)
  4. Više od dva uzorka: Jednostrana i dvostrana ANOVA(param), Kruskal-Wallis(nparam)
## t-test ##

set.seed(1234)
# Stvori podatke
t1_dta <- data.frame(
  naziv = paste0(rep("M_", 10),1:10),
  mjera = round(rnorm(10,20,2),1)
)

# Pogledaj podatke
head(t1_dta, 10)
##    naziv mjera
## 1    M_1  17.6
## 2    M_2  20.6
## 3    M_3  22.2
## 4    M_4  15.3
## 5    M_5  20.9
## 6    M_6  21.0
## 7    M_7  18.9
## 8    M_8  18.9
## 9    M_9  18.9
## 10  M_10  18.2
# Deskriptivna statistika
summary(t1_dta$mjera)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   15.30   18.38   18.90   19.25   20.82   22.20
# Pregled podataka
ggboxplot(t1_dta$mjera,
          ylab = "Mjera",
          xlab = F,
          ggtheme = theme_minimal())

# Test normalnosti
shapiro.test(t1_dta$mjera)
## 
##  Shapiro-Wilk normality test
## 
## data:  t1_dta$mjera
## W = 0.9526, p-value = 0.6993
# QQ plot
ggqqplot(t1_dta$mjera, ylab = "Mjera",
         ggtheme = theme_minimal())

# Provedi t-test (prosjek za usporedbu je 25)
t1_test <- t.test(t1_dta$mjera, mu = 25)
# Prikaži rezultate
print(t1_test)
## 
##  One Sample t-test
## 
## data:  t1_dta$mjera
## t = -9.0783, df = 9, p-value = 7.953e-06
## alternative hypothesis: true mean is not equal to 25
## 95 percent confidence interval:
##  17.8172 20.6828
## sample estimates:
## mean of x 
##     19.25
# Pristupi elementima rezultata
print(t1_test$p.value)
## [1] 7.953383e-06
print(t1_test$conf.int)
## [1] 17.8172 20.6828
## attr(,"conf.level")
## [1] 0.95
## Willcoxon test ##

# Provedi test
w1_test <- wilcox.test(t1_dta$mjera, mu = 25)
# Pogledaj rezultate
print(w1_test)
## 
##  Wilcoxon signed rank test with continuity correction
## 
## data:  t1_dta$mjera
## V = 0, p-value = 0.005793
## alternative hypothesis: true location is not equal to 25
## t-test ##

# Stvori podatke
visina_zene <- c(38.9, 61.2, 73.3, 21.8, 63.4, 64.6, 48.4, 48.8, 48.5)
visina_muskarci <- c(67.8, 60, 63.4, 76, 89.4, 73.3, 67.3, 61.3, 62.4) 
# Poveži u DF
nu2 <- data.frame(
  grupa = rep(c("Muskarci", "Zene"), each = 9),
  tezina = c(visina_zene, visina_muskarci)
)
# Pogledaj podatke
print(nu2)
##       grupa tezina
## 1  Muskarci   38.9
## 2  Muskarci   61.2
## 3  Muskarci   73.3
## 4  Muskarci   21.8
## 5  Muskarci   63.4
## 6  Muskarci   64.6
## 7  Muskarci   48.4
## 8  Muskarci   48.8
## 9  Muskarci   48.5
## 10     Zene   67.8
## 11     Zene   60.0
## 12     Zene   63.4
## 13     Zene   76.0
## 14     Zene   89.4
## 15     Zene   73.3
## 16     Zene   67.3
## 17     Zene   61.3
## 18     Zene   62.4
# Deskriptivna statistika
nu2 %>%
  group_by(grupa) %>%
  summarise(
    broj = n(),
    mean = mean(tezina, na.rm = T),
    sd = sd(tezina, na.rm = T)
          )
## # A tibble: 2 x 4
##   grupa     broj  mean    sd
##   <fct>    <int> <dbl> <dbl>
## 1 Muskarci     9  52.1 15.6 
## 2 Zene         9  69.0  9.38
# Vizualiziraj podatke
ggboxplot(nu2, x = "grupa", y = "tezina",
          color = "grupa",
          ylab = "tezina", xlab = "grupe")

# Testiraj normalnost distribucije
with(nu2, shapiro.test(tezina[grupa == "Muskarci"]))
## 
##  Shapiro-Wilk normality test
## 
## data:  tezina[grupa == "Muskarci"]
## W = 0.94266, p-value = 0.6101
with(nu2,shapiro.test(tezina[grupa == "Zene"]))
## 
##  Shapiro-Wilk normality test
## 
## data:  tezina[grupa == "Zene"]
## W = 0.86425, p-value = 0.1066
# Testiraj jednakost varijanci
eq_var_nu2 <- var.test(tezina ~ grupa, data = nu2)
print(eq_var_nu2)
## 
##  F test to compare two variances
## 
## data:  tezina by grupa
## F = 2.7675, num df = 8, denom df = 8, p-value = 0.1714
## alternative hypothesis: true ratio of variances is not equal to 1
## 95 percent confidence interval:
##   0.6242536 12.2689506
## sample estimates:
## ratio of variances 
##           2.767478
# Provedi test
t_nu2 <- t.test(visina_muskarci, visina_zene, var.equal = T)
print(t_nu2)
## 
##  Two Sample t-test
## 
## data:  visina_muskarci and visina_zene
## t = 2.7842, df = 16, p-value = 0.01327
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##   4.029759 29.748019
## sample estimates:
## mean of x mean of y 
##  68.98889  52.10000
# Alternativni nacin
a_t_nu2 <- t.test(tezina ~ grupa, data = nu2, var.equal = T)
print(a_t_nu2)
## 
##  Two Sample t-test
## 
## data:  tezina by grupa
## t = -2.7842, df = 16, p-value = 0.01327
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -29.748019  -4.029759
## sample estimates:
## mean in group Muskarci     mean in group Zene 
##               52.10000               68.98889
## Wilcoxon test ##

# Provedi test
w_nu2 <- wilcox.test(visina_muskarci, visina_zene)
print(w_nu2, 10)
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  visina_muskarci and visina_zene
## W = 66, p-value = 0.02711657
## alternative hypothesis: true location shift is not equal to 0
## t-test ##

# Napravi podatke
prije <- c(200.1, 190.9, 192.7, 213, 241.4, 196.9, 172.2, 185.5, 205.2, 193.7)
nakon <- c(392.9, 393.2, 345.1, 393, 434, 427.9, 422, 383.9, 392.3, 352.2)
# Poveži u DF
zu2 <- data.frame(
  grupa = rep(c("prije", "nakon"), each = 10),
  tezina = c(prije, nakon)
)
# Pregled podataka
head(zu2,10)
##    grupa tezina
## 1  prije  200.1
## 2  prije  190.9
## 3  prije  192.7
## 4  prije  213.0
## 5  prije  241.4
## 6  prije  196.9
## 7  prije  172.2
## 8  prije  185.5
## 9  prije  205.2
## 10 prije  193.7
# Deskriptivna statistika
zu2 %>%
  group_by(grupa) %>%
  summarise(
    broj = n(),
    mean = mean(tezina, na.rm = T),
    sd = sd(tezina, na.rm = T)
          )
## # A tibble: 2 x 4
##   grupa  broj  mean    sd
##   <fct> <int> <dbl> <dbl>
## 1 nakon    10  394.  29.4
## 2 prije    10  199.  18.5
# Vizualizacija 1

ggboxplot(zu2, x = "grupa", y = "tezina", 
          color = "grupa", palette = c("#00AFBB", "#E7B800"),
          order = c("prije", "nakon"),
          ylab = "Tezina", xlab = "Grupe")

# Vizualizacija 2

prije <- subset(zu2, grupa == "prije", tezina, drop =  T)
nakon <- subset(zu2, grupa == "nakon", tezina, drop = T)

pd <- paired(prije, nakon)
plot(pd, type = "profile") + theme_bw()

# Provjeri normalnost

razlika <- with(zu2, tezina[grupa == "prije"] - tezina[grupa == "nakon"])
shapiro.test(razlika)
## 
##  Shapiro-Wilk normality test
## 
## data:  razlika
## W = 0.94536, p-value = 0.6141
# Provedi test
t_zu2 <- t.test(prije, nakon, paired = T)
t_zu2
## 
##  Paired t-test
## 
## data:  prije and nakon
## t = -20.883, df = 9, p-value = 6.2e-09
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -215.5581 -173.4219
## sample estimates:
## mean of the differences 
##                 -194.49
## Willcox ##
w_zu2 <- wilcox.test(prije, nakon, paired = T)
w_zu2
## 
##  Wilcoxon signed rank test
## 
## data:  prije and nakon
## V = 0, p-value = 0.001953
## alternative hypothesis: true location shift is not equal to 0
## JEDNOSTRANA ANOVA ##

# Podatci
anova_dta <- PlantGrowth
# Pregled podataka
set.seed(1234)
dplyr::sample_n(anova_dta,10)
##    weight group
## 1    6.15  trt2
## 2    3.83  trt1
## 3    5.29  trt2
## 4    5.12  trt2
## 5    4.50  ctrl
## 6    4.17  trt1
## 7    5.87  trt1
## 8    5.33  ctrl
## 9    5.26  trt2
## 10   4.61  ctrl
glimpse(anova_dta)
## Observations: 30
## Variables: 2
## $ weight <dbl> 4.17, 5.58, 5.18, 6.11, 4.50, 4.61, 5.17, 4.53, 5.33, 5.14, ...
## $ group  <fct> ctrl, ctrl, ctrl, ctrl, ctrl, ctrl, ctrl, ctrl, ctrl, ctrl, ...
levels(anova_dta$group)
## [1] "ctrl" "trt1" "trt2"
# Uredi redosljed faktora
anova_dta$group <- ordered(anova_dta$group,
                            levels = c("ctrl", "trt1", "trt2"))
# Deskriptivna statistika
group_by(anova_dta, group) %>%
  summarise(
    count = n(),
    mean = mean(weight, na.rm = TRUE),
    sd = sd(weight, na.rm = TRUE)
  )
## # A tibble: 3 x 4
##   group count  mean    sd
##   <ord> <int> <dbl> <dbl>
## 1 ctrl     10  5.03 0.583
## 2 trt1     10  4.66 0.794
## 3 trt2     10  5.53 0.443
# Vizualizacija 1

ggboxplot(anova_dta, x = "group", y = "weight", 
          color = "group", palette = c("#00AFBB", "#E7B800", "#FC4E07"),
          order = c("ctrl", "trt1", "trt2"),
          ylab = "Težina", xlab = "Tretman")

# Vizualizacija 2

ggline(anova_dta, x = "group", y = "weight", 
       add = c("mean_se", "jitter"), 
       order = c("ctrl", "trt1", "trt2"),
       ylab = "Weight", xlab = "Treatment")

# Provedi ANOVA test
procjena_anova_dta <- aov(weight ~ group, data = anova_dta)
summary(procjena_anova_dta)
##             Df Sum Sq Mean Sq F value Pr(>F)  
## group        2  3.766  1.8832   4.846 0.0159 *
## Residuals   27 10.492  0.3886                 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Međusobna usporedba prosjeka (varijabli)
TukeyHSD((procjena_anova_dta)) # 1. način
##   Tukey multiple comparisons of means
##     95% family-wise confidence level
## 
## Fit: aov(formula = weight ~ group, data = anova_dta)
## 
## $group
##             diff        lwr       upr     p adj
## trt1-ctrl -0.371 -1.0622161 0.3202161 0.3908711
## trt2-ctrl  0.494 -0.1972161 1.1852161 0.1979960
## trt2-trt1  0.865  0.1737839 1.5562161 0.0120064
summary(glht(procjena_anova_dta, linfct = mcp(group = "Tukey"))) # 2.način
## 
##   Simultaneous Tests for General Linear Hypotheses
## 
## Multiple Comparisons of Means: Tukey Contrasts
## 
## 
## Fit: aov(formula = weight ~ group, data = anova_dta)
## 
## Linear Hypotheses:
##                  Estimate Std. Error t value Pr(>|t|)  
## trt1 - ctrl == 0  -0.3710     0.2788  -1.331   0.3908  
## trt2 - ctrl == 0   0.4940     0.2788   1.772   0.1979  
## trt2 - trt1 == 0   0.8650     0.2788   3.103   0.0121 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## (Adjusted p values reported -- single-step method)
pairwise.t.test(anova_dta$weight, anova_dta$group, # 3.način
               p.adjust.method = "BH")
## 
##  Pairwise comparisons using t tests with pooled SD 
## 
## data:  anova_dta$weight and anova_dta$group 
## 
##      ctrl  trt1 
## trt1 0.194 -    
## trt2 0.132 0.013
## 
## P value adjustment method: BH
# Provjera pretpostavki

# Homogenost varijance
plot(procjena_anova_dta,1) # Vizualna inspekcija

leveneTest(weight ~ group, data = anova_dta) # Formalni test
## Levene's Test for Homogeneity of Variance (center = median)
##       Df F value Pr(>F)
## group  2  1.1192 0.3412
##       27
# Normalnost distribucije
plot(procjena_anova_dta,2) # Vizualna inspekcija

reziduali <- residuals(object = procjena_anova_dta) # Uzmi rezidualne vrijednosti
shapiro.test(x = reziduali) # Provedi S-H test
## 
##  Shapiro-Wilk normality test
## 
## data:  reziduali
## W = 0.96607, p-value = 0.4379
# Provedi neparametarski test(Kruskal-Wallis)
kruskal.test(weight ~ group, data = anova_dta)
## 
##  Kruskal-Wallis rank sum test
## 
## data:  weight by group
## Kruskal-Wallis chi-squared = 7.9882, df = 2, p-value = 0.01842
## VIŠESTRUKA ANOVA ##

# Učitaj podatke
anova2_dta <- ToothGrowth
# Pregled podataka
dplyr::sample_n(anova2_dta, 10)
##     len supp dose
## 1  25.8   OJ  1.0
## 2  32.5   VC  2.0
## 3  15.2   OJ  0.5
## 4  17.3   VC  1.0
## 5  26.4   OJ  2.0
## 6  29.5   VC  2.0
## 7  10.0   VC  0.5
## 8  23.6   OJ  1.0
## 9  11.2   VC  0.5
## 10 18.5   VC  2.0
str(anova2_dta)
## 'data.frame':    60 obs. of  3 variables:
##  $ len : num  4.2 11.5 7.3 5.8 6.4 10 11.2 11.2 5.2 7 ...
##  $ supp: Factor w/ 2 levels "OJ","VC": 2 2 2 2 2 2 2 2 2 2 ...
##  $ dose: num  0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 ...
# Uredi podatke za analizu
anova2_dta$dose <- factor(anova2_dta$dose,
                          levels = c(0.5, 1, 2),
                          labels = c("D_0.5", "D_1", "D_2"))
dplyr::sample_n(anova2_dta, 10) # Pregledaj
##     len supp  dose
## 1  18.5   VC   D_2
## 2  16.5   OJ D_0.5
## 3  15.2   OJ D_0.5
## 4  21.2   OJ   D_1
## 5  13.6   VC   D_1
## 6  24.5   OJ   D_2
## 7  27.3   OJ   D_1
## 8  14.5   OJ   D_1
## 9   9.7   OJ D_0.5
## 10  5.8   VC D_0.5
# Deskriptivna statistika
group_by(anova2_dta, supp, dose) %>%
  summarise(
    count = n(),
    mean = mean(len, na.rm = TRUE),
    sd = sd(len, na.rm = TRUE)
  )
## # A tibble: 6 x 5
## # Groups:   supp [2]
##   supp  dose  count  mean    sd
##   <fct> <fct> <int> <dbl> <dbl>
## 1 OJ    D_0.5    10 13.2   4.46
## 2 OJ    D_1      10 22.7   3.91
## 3 OJ    D_2      10 26.1   2.66
## 4 VC    D_0.5    10  7.98  2.75
## 5 VC    D_1      10 16.8   2.52
## 6 VC    D_2      10 26.1   4.80
# Tabuliraj podatke
table(anova2_dta$supp, anova2_dta$dose)
##     
##      D_0.5 D_1 D_2
##   OJ    10  10  10
##   VC    10  10  10
# Vizualizacija 1
ggboxplot(anova2_dta, x = "dose", y = "len", color = "supp",
          palette = c("#00AFBB", "#E7B800"))

# Vizualizacija 2 
ggline(anova2_dta, x = "dose", y = "len", color = "supp",
       add = c("mean_se", "dotplot"),
       palette = c("#00AFBB", "#E7B800"))
## `stat_bindot()` using `bins = 30`. Pick better value with `binwidth`.

# Provedi test
procjena_anova2_dta <- aov(len ~ supp + dose, data = anova2_dta)
summary(procjena_anova2_dta)
##             Df Sum Sq Mean Sq F value   Pr(>F)    
## supp         1  205.4   205.4   14.02 0.000429 ***
## dose         2 2426.4  1213.2   82.81  < 2e-16 ***
## Residuals   56  820.4    14.7                     
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Test sa interakcijskim regresorom
procjena_anova3_dta <- aov(len ~ supp + dose + supp:dose, data = anova2_dta)
summary(procjena_anova3_dta)
##             Df Sum Sq Mean Sq F value   Pr(>F)    
## supp         1  205.4   205.4  15.572 0.000231 ***
## dose         2 2426.4  1213.2  92.000  < 2e-16 ***
## supp:dose    2  108.3    54.2   4.107 0.021860 *  
## Residuals   54  712.1    13.2                     
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Međusobna usporedba prosjeka (varijabli)
TukeyHSD(procjena_anova3_dta, which = "dose") # 1. način
##   Tukey multiple comparisons of means
##     95% family-wise confidence level
## 
## Fit: aov(formula = len ~ supp + dose + supp:dose, data = anova2_dta)
## 
## $dose
##             diff       lwr       upr   p adj
## D_1-D_0.5  9.130  6.362488 11.897512 0.0e+00
## D_2-D_0.5 15.495 12.727488 18.262512 0.0e+00
## D_2-D_1    6.365  3.597488  9.132512 2.7e-06
summary(glht(procjena_anova3_dta, lincft = mcp(dose = "Tukey"))) # 2. način
## 
##   Simultaneous Tests for General Linear Hypotheses
## 
## Fit: aov(formula = len ~ supp + dose + supp:dose, data = anova2_dta)
## 
## Linear Hypotheses:
##                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept) == 0      13.230      1.148  11.521  < 0.001 ***
## suppVC == 0           -5.250      1.624  -3.233  0.00958 ** 
## doseD_1 == 0           9.470      1.624   5.831  < 0.001 ***
## doseD_2 == 0          12.830      1.624   7.900  < 0.001 ***
## suppVC:doseD_1 == 0   -0.680      2.297  -0.296  0.99854    
## suppVC:doseD_2 == 0    5.330      2.297   2.321  0.09388 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## (Adjusted p values reported -- single-step method)
# Provjera pretpostavki
plot(procjena_anova3_dta,1) # Homogenost varijance

leveneTest(len ~ supp*dose, data = anova2_dta) # Formalni tetst
## Levene's Test for Homogeneity of Variance (center = median)
##       Df F value Pr(>F)
## group  5  1.7086 0.1484
##       54
plot(procjena_anova3_dta,2) # Normalnost distribucije

reziduali_2 <- residuals(procjena_anova3_dta) # Reziduali za SH test
shapiro.test(reziduali_2)
## 
##  Shapiro-Wilk normality test
## 
## data:  reziduali_2
## W = 0.98499, p-value = 0.6694

USPOREDBA VARIJANCI

  1. F-test
  2. Balett test;Levene test
## DVIJE VARIJANCE ##

# Učitaj podatke
F_dta <- ToothGrowth
# Pregled podataka
dplyr::sample_n(F_dta,10)
##     len supp dose
## 1  26.4   VC  2.0
## 2  23.6   VC  2.0
## 3  20.0   OJ  1.0
## 4  14.5   VC  1.0
## 5  23.3   OJ  1.0
## 6  15.2   OJ  0.5
## 7  10.0   VC  0.5
## 8  11.2   VC  0.5
## 9  19.7   OJ  1.0
## 10 21.2   OJ  1.0
# Provedi F-test
procjena_F_dta <- var.test(len ~ supp, data = F_dta)
print(procjena_F_dta)
## 
##  F test to compare two variances
## 
## data:  len by supp
## F = 0.6386, num df = 29, denom df = 29, p-value = 0.2331
## alternative hypothesis: true ratio of variances is not equal to 1
## 95 percent confidence interval:
##  0.3039488 1.3416857
## sample estimates:
## ratio of variances 
##          0.6385951
# Pogledaj omjer varijanci
procjena_F_dta$estimate
## ratio of variances 
##          0.6385951
# p-vrijednost
procjena_F_dta$p.value
## [1] 0.2331433
## VIŠE OD DVIJE VARIJANCE ##

# Učitaj podatke
mv_dta <- PlantGrowth
# Pregledaj podatke
str(mv_dta)
## 'data.frame':    30 obs. of  2 variables:
##  $ weight: num  4.17 5.58 5.18 6.11 4.5 4.61 5.17 4.53 5.33 5.14 ...
##  $ group : Factor w/ 3 levels "ctrl","trt1",..: 1 1 1 1 1 1 1 1 1 1 ...
# Provedi Barlett test
b_mv_dta <- bartlett.test(weight ~ group, data = mv_dta)
b_mv_dta
## 
##  Bartlett test of homogeneity of variances
## 
## data:  weight by group
## Bartlett's K-squared = 2.8786, df = 2, p-value = 0.2371
# Provedi Barlett test za više nezavisnih varijabli; interaction() to collapse
bartlett.test(len ~ interaction(supp,dose), data = ToothGrowth)
## 
##  Bartlett test of homogeneity of variances
## 
## data:  len by interaction(supp, dose)
## Bartlett's K-squared = 6.9273, df = 5, p-value = 0.2261
# Provedi Levene test za jednu nezavisnu varijablu
leveneTest(weight ~ group, data = mv_dta)
## Levene's Test for Homogeneity of Variance (center = median)
##       Df F value Pr(>F)
## group  2  1.1192 0.3412
##       27
# Provedi Levene test za više nezavisnih varijabli
leveneTest(len ~ interaction(supp,dose), data = ToothGrowth)
## Levene's Test for Homogeneity of Variance (center = median)
##       Df F value Pr(>F)
## group  5  1.7086 0.1484
##       54
# Provedi Fligner-Killen test
fligner.test(weight ~ group, data = mv_dta)
## 
##  Fligner-Killeen test of homogeneity of variances
## 
## data:  weight by group
## Fligner-Killeen:med chi-squared = 2.3499, df = 2, p-value = 0.3088

USPOREDBA PROPORCIJA

  1. Chi-sq goodnes of fit
  2. Chi-sq test nezavisnosti
## Chi-sq GOF ##

# Stvori podatke
sparoge <- c(81,50,27)
# Provedi test
procjena_sparoge <- chisq.test(sparoge, p = c(1/3, 1/3, 1/3))
# Pogledaj rezulatate
procjena_sparoge
## 
##  Chi-squared test for given probabilities
## 
## data:  sparoge
## X-squared = 27.886, df = 2, p-value = 8.803e-07
# Pogledaj očekivane vrijednosti
procjena_sparoge$expected
## [1] 52.66667 52.66667 52.66667
# Odredi duge vjerojatnosti
procjena_sparoge2 <- chisq.test(sparoge, p = c(1/2, 1/3, 1/6))
procjena_sparoge2
## 
##  Chi-squared test for given probabilities
## 
## data:  sparoge
## X-squared = 0.20253, df = 2, p-value = 0.9037
# Pogledaj p.vrijednosti
procjena_sparoge2$p.value
## [1] 0.9036928
## Chi-sq test nezavisnosti ##


# Ucitaj podatke
file_path <- "http://www.sthda.com/sthda/RDoc/data/housetasks.txt"
kucniPoslovi <- read.delim(file_path, row.names = 1)
kucniPoslovi # Pogledaj podatke
##            Wife Alternating Husband Jointly
## Laundry     156          14       2       4
## Main_meal   124          20       5       4
## Dinner       77          11       7      13
## Breakfeast   82          36      15       7
## Tidying      53          11       1      57
## Dishes       32          24       4      53
## Shopping     33          23       9      55
## Official     12          46      23      15
## Driving      10          51      75       3
## Finances     13          13      21      66
## Insurance     8           1      53      77
## Repairs       0           3     160       2
## Holidays      0           1       6     153
# Pogledaj kontigencijsku tablicu
kt <- as.table(as.matrix(kucniPoslovi))

gplots::balloonplot(t(kt), main = "kucniPoslovi", # 1. način
           xlab = "", ylab = "",
           label = F, show.margins = F)

graphics::mosaicplot(kt,shade = T, las = 2,       # 2. način
                     main = "kucniPoslovi")

# Provedi Chi-sq
procjena_kucniPoslovi <- chisq.test(kucniPoslovi)
procjena_kucniPoslovi
## 
##  Pearson's Chi-squared test
## 
## data:  kucniPoslovi
## X-squared = 1944.5, df = 36, p-value < 2.2e-16
# Opažene vrijednosti
procjena_kucniPoslovi$observed
##            Wife Alternating Husband Jointly
## Laundry     156          14       2       4
## Main_meal   124          20       5       4
## Dinner       77          11       7      13
## Breakfeast   82          36      15       7
## Tidying      53          11       1      57
## Dishes       32          24       4      53
## Shopping     33          23       9      55
## Official     12          46      23      15
## Driving      10          51      75       3
## Finances     13          13      21      66
## Insurance     8           1      53      77
## Repairs       0           3     160       2
## Holidays      0           1       6     153
# Očekivane vrijednosti
round(procjena_kucniPoslovi$expected,2)
##             Wife Alternating Husband Jointly
## Laundry    60.55       25.63   38.45   51.37
## Main_meal  52.64       22.28   33.42   44.65
## Dinner     37.16       15.73   23.59   31.52
## Breakfeast 48.17       20.39   30.58   40.86
## Tidying    41.97       17.77   26.65   35.61
## Dishes     38.88       16.46   24.69   32.98
## Shopping   41.28       17.48   26.22   35.02
## Official   33.03       13.98   20.97   28.02
## Driving    47.82       20.24   30.37   40.57
## Finances   38.88       16.46   24.69   32.98
## Insurance  47.82       20.24   30.37   40.57
## Repairs    56.77       24.03   36.05   48.16
## Holidays   55.05       23.30   34.95   46.70
# Prikaži reziduale
round(procjena_kucniPoslovi$residuals,2)
##             Wife Alternating Husband Jointly
## Laundry    12.27       -2.30   -5.88   -6.61
## Main_meal   9.84       -0.48   -4.92   -6.08
## Dinner      6.54       -1.19   -3.42   -3.30
## Breakfeast  4.88        3.46   -2.82   -5.30
## Tidying     1.70       -1.61   -4.97    3.59
## Dishes     -1.10        1.86   -4.16    3.49
## Shopping   -1.29        1.32   -3.36    3.38
## Official   -3.66        8.56    0.44   -2.46
## Driving    -5.47        6.84    8.10   -5.90
## Finances   -4.15       -0.85   -0.74    5.75
## Insurance  -5.76       -4.28    4.11    5.72
## Repairs    -7.53       -4.29   20.65   -6.65
## Holidays   -7.42       -4.62   -4.90   15.56
# Grafički prikaz
corrplot(procjena_kucniPoslovi$residuals, is.cor = F)

# Izracunaj doprinos Chi_sq statistici
doprinos <- 100*procjena_kucniPoslovi$residuals^2/procjena_kucniPoslovi$statistic
round(doprinos,2)
##            Wife Alternating Husband Jointly
## Laundry    7.74        0.27    1.78    2.25
## Main_meal  4.98        0.01    1.24    1.90
## Dinner     2.20        0.07    0.60    0.56
## Breakfeast 1.22        0.61    0.41    1.44
## Tidying    0.15        0.13    1.27    0.66
## Dishes     0.06        0.18    0.89    0.63
## Shopping   0.09        0.09    0.58    0.59
## Official   0.69        3.77    0.01    0.31
## Driving    1.54        2.40    3.37    1.79
## Finances   0.89        0.04    0.03    1.70
## Insurance  1.71        0.94    0.87    1.68
## Repairs    2.92        0.95   21.92    2.28
## Holidays   2.83        1.10    1.23   12.45
corrplot(doprinos, is.cor = F)

LINEARNA REGRESIJA

Dijagram rasipanja koji pokazuje "raspolozenje" kao funkciju "sati spavanja".

Dijagram rasipanja koji pokazuje “raspolozenje” kao funkciju “sati spavanja”.

  # Najbolji regresijski pravac
    drawBasicScatterplot( emphGrey, "Najbolji regresijski pravac" )
    good.coef <- lm( dan.grump ~ dan.sleep, parenthood)$coef
    abline( good.coef, col=ifelse(colour,emphCol,"black"), lwd=3 )
Regresijski pravac koji prikazuje odnos "raspolozenja" i "sati spavanja".

Regresijski pravac koji prikazuje odnos “raspolozenja” i “sati spavanja”.

Regresijski pravac koji loše prikazuje odnos "raspolozenja" i "sati spavanja".

Regresijski pravac koji loše prikazuje odnos “raspolozenja” i “sati spavanja”.

\[ \hat{Y_i} = b_1 X_i + b_0 \]

\[ \epsilon_i = Y_i - \hat{Y}_i \]

\[ Y_i = b_1 X_i + b_0 + \epsilon_i \] - OLS model

\[ \sum_i (Y_i - \hat{Y}_i)^2 \] - optimizacija modela uz uvjet(ograničenje)

\[ \sum_i {\epsilon_i}^2 \]

Prikaz reziduala vezanih uz regresijski pravac.

Prikaz reziduala vezanih uz regresijski pravac.

Prikaz reziduala vezanih uz "loš" regresijski pravac.

Prikaz reziduala vezanih uz “loš” regresijski pravac.

# Procjeni regresijski model i spremi rezultate u objekt
regression.1 <- lm( formula = dan.grump ~ dan.sleep,  
                    data = parenthood ) 
# Pogledaj rezultate
print( regression.1 )
## 
## Call:
## lm(formula = dan.grump ~ dan.sleep, data = parenthood)
## 
## Coefficients:
## (Intercept)    dan.sleep  
##     125.956       -8.937

\[ \hat{Y}_i = -8.94 \ X_i + 125.96 \]

VIŠESTRUKA LINEARNA REGRESIJA

\[ Y_i = b_2 X_{i2} + b_1 X_{i1} + b_0 + \epsilon_i \]

      dan.grump ~ dan.sleep + baby.sleep
knitr::include_graphics(file.path("../GRAFIKE/scatter3d.png"))

# Provedi višestruku regresiju u R
regression.2 <- lm( formula = dan.grump ~ dan.sleep + baby.sleep,  
                     data = parenthood )
# Pregledaj rezultate
print(regression.2)
## 
## Call:
## lm(formula = dan.grump ~ dan.sleep + baby.sleep, data = parenthood)
## 
## Coefficients:
## (Intercept)    dan.sleep   baby.sleep  
##   125.96557     -8.95025      0.01052

\[ Y_i = \left( \sum_{k=1}^K b_{k} X_{ik} \right) + b_0 + \epsilon_i \]

  1. Rezidualna odstupanja

\[ \mbox{SS}_{res} = \sum_i (Y_i - \hat{Y}_i)^2 \]

  1. Ukupna odstupanja

\[ \mbox{SS}_{tot} = \sum_i (Y_i - \bar{Y})^2 \]

  1. Izračunaj u programu
X <- parenthood$dan.sleep  # Nezavisna varijabla
Y <- parenthood$dan.grump  # Zavisna varijabla
# Procijenjene vrijednosti
Y.pred <- -8.94 * X  +  125.97
# Izračunaj sumu rezidualnih odstupanja
SS.resid <- sum((Y - Y.pred)^2)
print(SS.resid) # Prikaži
## [1] 1838.722
# Izračunaj sumu ukupnih odstupanja
SS.tot <- sum((Y - mean(Y)^2))
print(SS.tot) # Prikaži
## [1] -399525.4
  1. Formula

\[ R^2 = 1 - \frac{\mbox{SS}_{res}}{\mbox{SS}_{tot}} \] 5. Izračunaj vrijednost

# Unesi vrijednsoti
R.squared <- 1 - (SS.resid / SS.tot)
print(R.squared) # Prikaži 
## [1] 1.004602
  1. Usporedi sa korelacijom
r <- cor(X, Y)  # Izračunaj korelaciju
print( r^2 )    # Prikaži
## [1] 0.8161027

\[ \mbox{adj. } R^2 = 1 - \left(\frac{\mbox{SS}_{res}}{\mbox{SS}_{tot}} \times \frac{N-1}{N-K-1} \right) \]

  1. Za cijeli model

\[ H_0: Y_i = b_0 + \epsilon_i \]

\[ H_1: Y_i = \left( \sum_{k=1}^K b_{k} X_{ik} \right) + b_0 + \epsilon_i \]

\[ \mbox{SS}_{mod} = \mbox{SS}_{tot} - \mbox{SS}_{res} \]

\[ \begin{array}{rcl} \mbox{MS}_{mod} &=& \displaystyle\frac{\mbox{SS}_{mod} }{df_{mod}} \\ \\ \mbox{MS}_{res} &=& \displaystyle\frac{\mbox{SS}_{res} }{df_{res} } \end{array} \]

\[ F = \frac{\mbox{MS}_{mod}}{\mbox{MS}_{res}} \]

  1. Za pojedinačne koeficijente

\[ \begin{array}{rl} H_0: & b = 0 \\ H_1: & b \neq 0 \end{array} \]

\[ t = \frac{\hat{b}}{\mbox{SE}({\hat{b})}} \]

# Pogledaj rezultate modela
print( regression.2 ) 
## 
## Call:
## lm(formula = dan.grump ~ dan.sleep + baby.sleep, data = parenthood)
## 
## Coefficients:
## (Intercept)    dan.sleep   baby.sleep  
##   125.96557     -8.95025      0.01052
# Pogledaj rezultate
summary(regression.2)
## 
## Call:
## lm(formula = dan.grump ~ dan.sleep + baby.sleep, data = parenthood)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -11.0345  -2.2198  -0.4016   2.6775  11.7496 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 125.96557    3.04095  41.423   <2e-16 ***
## dan.sleep    -8.95025    0.55346 -16.172   <2e-16 ***
## baby.sleep    0.01052    0.27106   0.039    0.969    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.354 on 97 degrees of freedom
## Multiple R-squared:  0.8161, Adjusted R-squared:  0.8123 
## F-statistic: 215.2 on 2 and 97 DF,  p-value: < 2.2e-16
  1. Outlieri

  2. Leverage

  3. Utjecaj opservacije

\[ D_i = \frac{{\epsilon_i^*}^2 }{K+1} \times \frac{h_i}{1-h_i} \]

# Izračunaj mjeru Cook-ove udaljenosti
cooks.distance( model = regression.2 )
# Prikaži Cook-ovu mjeru grafički
plot(x = regression.2, which = 4)

# Provedi procjenu bez ekstremnih opservacija

lm(formula = dan.grump ~ dan.sleep + baby.sleep,
   data = parenthood,
   subset = -64)
## 
## Call:
## lm(formula = dan.grump ~ dan.sleep + baby.sleep, data = parenthood, 
##     subset = -64)
## 
## Coefficients:
## (Intercept)    dan.sleep   baby.sleep  
##    126.3553      -8.8283      -0.1319
# Prikaži grafički
plot(x = regression.2, which = 1 ) # Resid vs. fitted

plot(x = regression.2, which = 2 ) # QQ-plot

# Prikaži reziduale na histogramu
hist( x = residuals(regression.2),
      xlab = "Vrijednost reziduala",
      main = "",
      breaks = 20)

# Spremi fit vrijednosti u objekt
yhat.2 <- fitted.values(object = regression.2)
# Prikaži grafički
plot(x = yhat.2,
     y = parenthood$dan.grump,
     xlab = "Fit",
     ylab = "Observed")

# Prikaži reyidualne vs. fitted vrijednosti
plot(x = regression.2, which = 1)

# Prikaži rezidualne vs fitted vrijednosti
car::residualPlots(model = regression.2)

##            Test stat Pr(>|Test stat|)  
## dan.sleep     2.1604          0.03323 *
## baby.sleep   -0.5445          0.58733  
## Tukey test    2.1615          0.03066 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
plot(x = regression.2, which = 3)

# Provedi test homogenosti varijance
car::ncvTest(regression.2)
## Non-constant Variance Score Test 
## Variance formula: ~ fitted.values 
## Chisquare = 0.09317511, Df = 1, p = 0.76018
# Provedi drugi test varijance
library(car)
lmtest::coeftest( regression.2, vcov= hccm )
## 
## t test of coefficients:
## 
##               Estimate Std. Error  t value Pr(>|t|)    
## (Intercept) 125.965566   3.247285  38.7910   <2e-16 ***
## dan.sleep    -8.950250   0.615820 -14.5339   <2e-16 ***
## baby.sleep    0.010524   0.291565   0.0361   0.9713    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

\[ \mbox{VIF}_k = \frac{1}{1-{R^2_{(-k)}}} \]

# Provedi test
vif( mod = regression.2 )
##  dan.sleep baby.sleep 
##   1.651038   1.651038
# Pogledaj korelaciju
cor( parenthood )
##              dan.sleep  baby.sleep   dan.grump         day
## dan.sleep   1.00000000  0.62794934 -0.90338404 -0.09840768
## baby.sleep  0.62794934  1.00000000 -0.56596373 -0.01043394
## dan.grump  -0.90338404 -0.56596373  1.00000000  0.07647926
## day        -0.09840768 -0.01043394  0.07647926  1.00000000
  1. Informacijski kriterij (AIC)

\[ \mbox{AIC} = \displaystyle\frac{\mbox{SS}_{res}}{\hat{\sigma}}^2+ 2K \] 2. Selekcija unatrag

# Specificiraj puni model
full.model <- lm(formula = dan.grump ~ dan.sleep + baby.sleep + day,
                 data = parenthood)

# Selekcija unatrag
step(object = full.model,
     direction = "backward")
## Start:  AIC=299.08
## dan.grump ~ dan.sleep + baby.sleep + day
## 
##              Df Sum of Sq    RSS    AIC
## - baby.sleep  1       0.1 1837.2 297.08
## - day         1       1.6 1838.7 297.16
## <none>                    1837.1 299.08
## - dan.sleep   1    4909.0 6746.1 427.15
## 
## Step:  AIC=297.08
## dan.grump ~ dan.sleep + day
## 
##             Df Sum of Sq    RSS    AIC
## - day        1       1.6 1838.7 295.17
## <none>                   1837.2 297.08
## - dan.sleep  1    8103.0 9940.1 463.92
## 
## Step:  AIC=295.17
## dan.grump ~ dan.sleep
## 
##             Df Sum of Sq    RSS    AIC
## <none>                   1838.7 295.17
## - dan.sleep  1    8159.9 9998.6 462.50
## 
## Call:
## lm(formula = dan.grump ~ dan.sleep, data = parenthood)
## 
## Coefficients:
## (Intercept)    dan.sleep  
##     125.956       -8.937
  1. Selekcija unaprijed
# Procijeni osnovni model
nul.model <- lm(dan.grump ~ 1, parenthood)
# Definiraj selekcijsku funkciju (unaprijed)
step(object = nul.model,
     direction = "forward",
     scope = dan.grump ~ dan.sleep + baby.sleep + day)
## Start:  AIC=462.5
## dan.grump ~ 1
## 
##              Df Sum of Sq    RSS    AIC
## + dan.sleep   1    8159.9 1838.7 295.17
## + baby.sleep  1    3202.7 6795.9 425.89
## <none>                    9998.6 462.50
## + day         1      58.5 9940.1 463.92
## 
## Step:  AIC=295.17
## dan.grump ~ dan.sleep
## 
##              Df Sum of Sq    RSS    AIC
## <none>                    1838.7 295.17
## + day         1   1.55760 1837.2 297.08
## + baby.sleep  1   0.02858 1838.7 297.16
## 
## Call:
## lm(formula = dan.grump ~ dan.sleep, data = parenthood)
## 
## Coefficients:
## (Intercept)    dan.sleep  
##     125.956       -8.937
# Procjeni dva ugnježdena modela
M0 <- lm( dan.grump ~ dan.sleep + day, parenthood )
M1 <- lm( dan.grump ~ dan.sleep + day + baby.sleep, parenthood )
# Usporedi modele
AIC( M0, M1 )
##    df      AIC
## M0  4 582.8681
## M1  5 584.8646

\[ F = \frac{(\mbox{SS}_{res}^{(0)} - \mbox{SS}_{res}^{(1)})/k}{(\mbox{SS}_{res}^{(1)})/(N-p-1)} \]

\[ \mbox{SS}_\Delta = \mbox{SS}_{res}^{(0)} - \mbox{SS}_{res}^{(1)} \]

\[ \mbox{SS}_\Delta = \sum_{i} \left( \hat{y}_i^{(1)} - \hat{y}_i^{(0)} \right)^2 \]

# Provedi hijerarhijsku regresiju
anova(M0, M1)
## Analysis of Variance Table
## 
## Model 1: dan.grump ~ dan.sleep + day
## Model 2: dan.grump ~ dan.sleep + day + baby.sleep
##   Res.Df    RSS Df Sum of Sq      F Pr(>F)
## 1     97 1837.2                           
## 2     96 1837.1  1  0.063688 0.0033 0.9541