STATISTIČKI ALATI: KATEGORIJALNA ANALIZA

Hrvatski studiji

dr.sc. Luka Šikić

18. studenoga 2019.

CILJEVI PREDAVANJA

GOODNESS-OF-FIT TEST

library( lsr ) # Učitaj paket
load( file.path("randomness.Rdata" )) # Uvezi podatke
str(cards) # Pregledaj podatke
## 'data.frame':    200 obs. of  3 variables:
##  $ id      : Factor w/ 200 levels "subj1","subj10",..: 1 112 124 135 146 157 168 179 190 2 ...
##  $ choice_1: Factor w/ 4 levels "clubs","diamonds",..: 4 2 3 4 3 1 3 2 4 2 ...
##  $ choice_2: Factor w/ 4 levels "clubs","diamonds",..: 1 1 1 1 4 3 2 1 1 4 ...
head(cards) # Pregledaj podatke
##      id choice_1 choice_2
## 1 subj1   spades    clubs
## 2 subj2 diamonds    clubs
## 3 subj3   hearts    clubs
## 4 subj4   spades    clubs
## 5 subj5   hearts   spades
## 6 subj6    clubs   hearts
podatci <- table(cards$choice_1) # Tabuliraj prvu varijablu
podatci # Pregledaj tabuliranu varijablu
## 
##    clubs diamonds   hearts   spades 
##       35       51       64       50
naziv indeks \(i\) simbol R naredba vrijednost
clubs \(\clubsuit\) 1 \(O_1\) podatci[1] 35
diamonds \(\diamondsuit\) 2 \(O_2\) podatci[2] 51
hearts \(\heartsuit\) 3 \(O_3\) podatci[3] 64
spades \(\spadesuit\) 4 \(O_4\) podatci[4] 50

\[ O = (O_1, O_2, O_3, O_4) \]

\[ O = (35, 51, 64, 50) \]

Hipoteza Mat.Hipoteza
\(H_0\) : Sve četiri karte su izabrane s jednakom vjerojatnošću \(H_0: {P} = (.25, .25, .25, .25)\)
Nulta.Hipoteza Alternativna.Hipoteza
\(H_0\) : Sve četiri karte su izabrane s jednakom vjerojatnošću \(H_1\) : Barem jedna karta nije izabrana s vjerojatnošću 0.25
\(H_0\) \(H_1\)
\(P = (.25, .25, .25, .25)\) \(P \neq (.25,.25,.25,.25)\)
# Pripiši vjerojatnosti varijabli
probabilities <- c(clubs = .25, diamonds = .25, hearts = .25, spades = .25) 
probabilities # Pogledaj varijablu
##    clubs diamonds   hearts   spades 
##     0.25     0.25     0.25     0.25

\[ E_i = N \times P_i \]

N <- 200  # Veličina uzorka
expected <- N * probabilities # Očekivane frekvencije
expected # Pogledaj podatke
##    clubs diamonds   hearts   spades 
##       50       50       50       50
# Izračunaj razliku izmedju podataka i očekivanih frekvencija
podatci - expected 
## 
##    clubs diamonds   hearts   spades 
##      -15        1       14        0
# Kvadriraj razlike kako bi sve vrijednosti bile nenegativne
(podatci - expected)^2
## 
##    clubs diamonds   hearts   spades 
##      225        1      196        0
# Podijeli kvadrirane razlike očekivanim frekvencijama
(podatci - expected)^2 / expected
## 
##    clubs diamonds   hearts   spades 
##     4.50     0.02     3.92     0.00
# Zbroji podatke za sve karte
sum( (podatci - expected)^2 / expected )
## [1] 8.44

\[ X^2 = \sum_{i=1}^k \frac{(O_i - E_i)^2}{E_i} \]

\[ O_i \sim \mbox{Binomial}(P_i, N) \]

- Stupnjevi slobode

Chi-square distribucija za različite vrijednosti stupnjeva slobode.

Chi-square distribucija za različite vrijednosti stupnjeva slobode.

Illustration of how the hypothesis testing works for the chi-square goodness of fit test.

Illustration of how the hypothesis testing works for the chi-square goodness of fit test.

# Izračunaj 95. centil chi-sq distribucije
qchisq( p = .95, df = 3 )
## [1] 7.814728
# Izračunaj p-vrijednost za podatke iz radnog primjera
pchisq( q = 8.44, df = 3, lower.tail = FALSE )
## [1] 0.03774185
# Alternativna varijanta izračuna
1-pchisq( q = 8.44, df = 3 )
## [1] 0.03774185
# Izvedi test u lsr paketu
goodnessOfFitTest( cards$choice_1 )
## 
##      Chi-square test against specified probabilities
## 
## Data variable:   cards$choice_1 
## 
## Hypotheses: 
##    null:        true probabilities are as specified
##    alternative: true probabilities differ from those specified
## 
## Descriptives: 
##          observed freq. expected freq. specified prob.
## clubs                35             50            0.25
## diamonds             51             50            0.25
## hearts               64             50            0.25
## spades               50             50            0.25
## 
## Test results: 
##    X-squared statistic:  8.44 
##    degrees of freedom:  3 
##    p-value:  0.038
# Formiraj druge vjerojatnosti
nullProbs <- c(clubs = .2, diamonds = .3, hearts = .3, spades = .2)
nullProbs # Pregledaj varijablu
##    clubs diamonds   hearts   spades 
##      0.2      0.3      0.3      0.2
# Izvedi test s drugačije specificiranim vjerojatnostima
goodnessOfFitTest( x = cards$choice_1, p = nullProbs )
## 
##      Chi-square test against specified probabilities
## 
## Data variable:   cards$choice_1 
## 
## Hypotheses: 
##    null:        true probabilities are as specified
##    alternative: true probabilities differ from those specified
## 
## Descriptives: 
##          observed freq. expected freq. specified prob.
## clubs                35             40             0.2
## diamonds             51             60             0.3
## hearts               64             60             0.3
## spades               50             40             0.2
## 
## Test results: 
##    X-squared statistic:  4.742 
##    degrees of freedom:  3 
##    p-value:  0.192
  1. Opis podataka i nulte hipoteze.
  2. Statistički blok.
  3. Interpretacija rezultata.

TEST NEZAVISNOSTI

# Uvezi podatke
load( file.path("./chapek9.Rdata" ))
str(chapek9) # Pregledaj strukturu podataka
## 'data.frame':    180 obs. of  2 variables:
##  $ species: Factor w/ 2 levels "robot","human": 1 2 2 2 1 2 2 1 2 1 ...
##  $ choice : Factor w/ 3 levels "puppy","flower",..: 2 3 3 3 3 2 3 3 1 2 ...
# Pregledaj podatke
head(chapek9)
##   species choice
## 1   robot flower
## 2   human   data
## 3   human   data
## 4   human   data
## 5   robot   data
## 6   human flower
# Pregledaj podatke
summary(chapek9)
##   species      choice   
##  robot:87   puppy : 28  
##  human:93   flower: 43  
##             data  :109
# Tabuliraj podatke
chapekFrequencies <- xtabs( ~ choice + species, data = chapek9)
chapekFrequencies # Pregledaj tabulirane podatke 
##         species
## choice   robot human
##   puppy     13    15
##   flower    30    13
##   data      44    65
\(H_0\): Sve od sljedećeg je točno:
\(P_{11} = P_{12}\) (jednaka vjerojatnost izbora puppy)
\(P_{21} = P_{22}\) (jednaka vjerojatnost izbora flower) i
\(P_{31} = P_{32}\) (jednaka vjerojatnost izbora data).

\[ E_{ij} = C_j \times P_i \]

\[ \hat{P}_i = \frac{R_i}{N} \]

\[ E_{ij} = \frac{R_i \times C_j}{N} \]

\[ X^2 = \sum_{i=1}^r \sum_{j=1}^c \frac{({E}_{ij} - O_{ij})^2}{{E}_{ij}} \]

\[ df = (r-1)(c-1) \]

\[ \begin{array}{rcl} df &=& \mbox{(broj opservacija)} - \mbox{(broj ograničenja)} \\ &=& (rc) - (c + (r-1)) \\ &=& rc - c - r + 1 \\ &=& (r - 1)(c - 1) \end{array} \]

\[ \begin{array}{rcl} df &=& \mbox{(broj opservacija)} - \mbox{(broj ograničenja)} \\ &=& rc - ( (c-1) + (r-1) + 1) \\ &=& rc - c - r + 1 \\ &=& (r - 1)(c - 1) \end{array} \]

# Tabuliraj podatke
xtabs( formula = ~choice+species, data = chapek9 )
##         species
## choice   robot human
##   puppy     13    15
##   flower    30    13
##   data      44    65
# Izvedi test nezavisnosti pomoću funkcije iz paketa lsr
associationTest( formula = ~choice+species, data = chapek9 )
## 
##      Chi-square test of categorical association
## 
## Variables:   choice, species 
## 
## Hypotheses: 
##    null:        variables are independent of one another
##    alternative: some contingency exists between variables
## 
## Observed contingency table:
##         species
## choice   robot human
##   puppy     13    15
##   flower    30    13
##   data      44    65
## 
## Expected contingency table under the null hypothesis:
##         species
## choice   robot human
##   puppy   13.5  14.5
##   flower  20.8  22.2
##   data    52.7  56.3
## 
## Test results: 
##    X-squared statistic:  10.722 
##    degrees of freedom:  2 
##    p-value:  0.005 
## 
## Other information: 
##    estimated effect size (Cramer's v):  0.244

KOREKCIJA KONTINUIRANOSTI

\[ X^2 = \sum_{i} \frac{(|E_i - O_i| - 0.5)^2}{E_i} \]

VELIČINA EFEKTA

\[ \phi = \sqrt{\frac{X^2}{N}} \]

\[ V = \sqrt{\frac{X^2}{N(k-1)}} \]

cramersV( chapekFrequencies )
## [1] 0.244058

PRETPOSTAVKE TESTA

IZVOĐENJE CHI-SQ TESTOVA U R

# Pregledaj podatke
podatci
## 
##    clubs diamonds   hearts   spades 
##       35       51       64       50
# Izvedi chi-sq test na standardni način
chisq.test(x = podatci)
## 
##  Chi-squared test for given probabilities
## 
## data:  podatci
## X-squared = 8.44, df = 3, p-value = 0.03774
# Izvedi chi-sq test na standardni način; drugačije definirane vjerojatnosti
chisq.test(x = podatci, p = c(.2,.3,.3,.2))
## 
##  Chi-squared test for given probabilities
## 
## data:  podatci
## X-squared = 4.7417, df = 3, p-value = 0.1917
# Test s drugim podatcima
# Pregledaj podatke
chapekFrequencies
##         species
## choice   robot human
##   puppy     13    15
##   flower    30    13
##   data      44    65
# Izvedi chi-sq test

chisq.test(chapekFrequencies)
## 
##  Pearson's Chi-squared test
## 
## data:  chapekFrequencies
## X-squared = 10.722, df = 2, p-value = 0.004697

FISHEROV TEST

# Učitaj nove podatke
load(file.path("./salem.Rdata"))
# Tabuliraj podatke
salem.tabs <- table( trial )
# Prikaži podatke
print( salem.tabs )
##        on.fire
## happy   FALSE TRUE
##   FALSE     3    3
##   TRUE     10    0
# Izvedi chi-sq test na podatcima
chisq.test( salem.tabs )
## Warning in chisq.test(salem.tabs): Chi-squared approximation may be
## incorrect
## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  salem.tabs
## X-squared = 3.3094, df = 1, p-value = 0.06888

\[ P(O_{11}, O_{12}, O_{21}, O_{22} \ | \ R_1, R_2, C_1, C_2) \]

# Izvedi Fisherov test u R
fisher.test( salem.tabs )
## 
##  Fisher's Exact Test for Count Data
## 
## data:  salem.tabs
## p-value = 0.03571
## alternative hypothesis: true odds ratio is not equal to 1
## 95 percent confidence interval:
##  0.000000 1.202913
## sample estimates:
## odds ratio 
##          0

McNEMAROV TEST

\[ X^2 = \frac{(|b-c| - 1)^2}{b+c} \]

\[ X^2 = \frac{(|O_{12}-O_{21}| - 1)^2}{O_{12} + O_{21}} \]

# Učitaj podatke
load(file.path("./agpp.Rdata"))
str(agpp)      # Pregled podataka
## 'data.frame':    100 obs. of  3 variables:
##  $ id             : Factor w/ 100 levels "subj.1","subj.10",..: 1 13 24 35 46 57 68 79 90 2 ...
##  $ response_before: Factor w/ 2 levels "no","yes": 1 2 2 2 1 1 1 1 1 1 ...
##  $ response_after : Factor w/ 2 levels "no","yes": 2 1 1 1 1 1 1 2 1 1 ...
head(agpp)     # Pregled podataka
##       id response_before response_after
## 1 subj.1              no            yes
## 2 subj.2             yes             no
## 3 subj.3             yes             no
## 4 subj.4             yes             no
## 5 subj.5              no             no
## 6 subj.6              no             no
summary(agpp ) # Pregled podataka
##         id     response_before response_after
##  subj.1  : 1   no :70          no :90        
##  subj.10 : 1   yes:30          yes:10        
##  subj.100: 1                                 
##  subj.11 : 1                                 
##  subj.12 : 1                                 
##  subj.13 : 1                                 
##  (Other) :94
# Tabuliraj podatke
right.table <- xtabs( ~ response_before + response_after, data = agpp)
print( right.table ) # Pregled podataka
##                response_after
## response_before no yes
##             no  65   5
##             yes 25   5
# Provedi test u R
mcnemar.test( right.table )
## 
##  McNemar's Chi-squared test with continuity correction
## 
## data:  right.table
## McNemar's chi-squared = 12.033, df = 1, p-value = 0.0005226
# Tabuliraj podatke
cardChoices <- xtabs( ~ choice_1 + choice_2, data = cards )
cardChoices # Pregled podataka
##           choice_2
## choice_1   clubs diamonds hearts spades
##   clubs       10        9     10      6
##   diamonds    20        4     13     14
##   hearts      20       18      3     23
##   spades      18       13     15      4
# Provedi standardni chi-sq test
chisq.test(cardChoices)
## 
##  Pearson's Chi-squared test
## 
## data:  cardChoices
## X-squared = 29.237, df = 9, p-value = 0.0005909
# Provedi McNemarov test
mcnemar.test(cardChoices)
## 
##  McNemar's Chi-squared test
## 
## data:  cardChoices
## McNemar's chi-squared = 16.033, df = 6, p-value = 0.01358