# install the binom package if it is not already installed
temp.packages <- c("binom")
temp.packages <- temp.packages[!(temp.packages %in% installed.packages()[,"Package"])]
if(length(temp.packages)>0) install.packages(temp.packages)
remove("temp.packages")
# load the segmentation data used throughout
seg.df <- read.csv("http://goo.gl/qw303p")
summary(seg.df)
##       age           gender        income            kids        ownHome   
##  Min.   :19.26   Female:157   Min.   : -5183   Min.   :0.00   ownNo :159  
##  1st Qu.:33.01   Male  :143   1st Qu.: 39656   1st Qu.:0.00   ownYes:141  
##  Median :39.49                Median : 52014   Median :1.00               
##  Mean   :41.20                Mean   : 50937   Mean   :1.27               
##  3rd Qu.:47.90                3rd Qu.: 61403   3rd Qu.:2.00               
##  Max.   :80.49                Max.   :114278   Max.   :7.00               
##   subscribe         Segment   
##  subNo :260   Moving up : 70  
##  subYes: 40   Suburb mix:100  
##               Travelers : 80  
##               Urban hip : 50  
##                               
## 

chi-square tests

(tmp.tab <- table(rep(c(1:4), times=c(25,25,25,20))))
## 
##  1  2  3  4 
## 25 25 25 20
chisq.test(tmp.tab)
## 
##  Chi-squared test for given probabilities
## 
## data:  tmp.tab
## X-squared = 0.78947, df = 3, p-value = 0.852
(tmp.tab <- table(rep(c(1:4), times=c(10,10,10,20))))
## 
##  1  2  3  4 
## 10 10 10 20
chisq.test(tmp.tab)
## 
##  Chi-squared test for given probabilities
## 
## data:  tmp.tab
## X-squared = 6, df = 3, p-value = 0.1116
# same proportions but one-fifth the counts: the statistic shrinks and an approximation warning appears
tmp.tab <- tmp.tab/5
tmp.tab
## 
## 1 2 3 4 
## 2 2 2 4
chisq.test(tmp.tab)
## Warning in chisq.test(tmp.tab): Chi-squared approximation may be incorrect
## 
##  Chi-squared test for given probabilities
## 
## data:  tmp.tab
## X-squared = 1.2, df = 3, p-value = 0.753
# one-way chi-square on our data
chisq.test(table(seg.df$Segment))
## 
##  Chi-squared test for given probabilities
## 
## data:  table(seg.df$Segment)
## X-squared = 17.333, df = 3, p-value = 0.0006035
# one-way chi-square by hand: under equal probabilities the expected count is n/k
observedV <- table(seg.df$Segment)
expectedV <- sum(observedV) / length(observedV)   # 300 respondents / 4 segments = 75 each
(ChiSquared <- sum((observedV - expectedV)^2 / expectedV))
## [1] 17.33333
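# quick check: the p-value is the upper tail of a chi-square distribution with
# k - 1 degrees of freedom; this should reproduce the 0.0006035 reported above
pchisq(ChiSquared, df = length(observedV) - 1, lower.tail = FALSE)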
# two-way chi-square
table(seg.df$subscribe, seg.df$ownHome)
##         
##          ownNo ownYes
##   subNo    137    123
##   subYes    22     18
# for a 2x2 contingency table, chisq.test() defaults to Yates' continuity correction (correct=TRUE)
chisq.test(table(seg.df$subscribe, seg.df$ownHome))  
## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  table(seg.df$subscribe, seg.df$ownHome)
## X-squared = 0.010422, df = 1, p-value = 0.9187
# chisq.test two-way formula (see the link below; a hand computation follows it)
#https://www.khanacademy.org/math/probability/statistics-inferential/chi-square/v/contingency-table-chi-square-test
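# a hand check of the Pearson statistic (not part of the original analysis):
# expected counts are (row total * column total) / grand total; the result
# should match the uncorrected X-squared from chisq.test(..., correct=FALSE) below
obs <- table(seg.df$subscribe, seg.df$ownHome)
expected <- outer(rowSums(obs), colSums(obs)) / sum(obs)
sum((obs - expected)^2 / expected)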

# two-way chi-square without correction (matches traditional formula)
chisq.test(table(seg.df$subscribe, seg.df$ownHome), correct=FALSE)
## 
##  Pearson's Chi-squared test
## 
## data:  table(seg.df$subscribe, seg.df$ownHome)
## X-squared = 0.074113, df = 1, p-value = 0.7854
# two-way test with a simulated (Monte Carlo) p-value
chisq.test(table(seg.df$subscribe, seg.df$ownHome), sim=TRUE, B=10000)  # 10000 replicates; sim= partially matches simulate.p.value=
## 
##  Pearson's Chi-squared test with simulated p-value (based on 10000
##  replicates)
## 
## data:  table(seg.df$subscribe, seg.df$ownHome)
## X-squared = 0.074113, df = NA, p-value = 0.8645
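# sketch of the idea behind the simulated p-value (not chisq.test()'s exact
# internals): draw tables with the same margins via r2dtable() and see how often
# their Pearson statistic is at least as large as the observed one
obs <- table(seg.df$subscribe, seg.df$ownHome)
expected <- outer(rowSums(obs), colSums(obs)) / sum(obs)
obs.stat <- sum((obs - expected)^2 / expected)
sim.stat <- sapply(r2dtable(10000, rowSums(obs), colSums(obs)),
                   function(tab) sum((tab - expected)^2 / expected))
mean(sim.stat >= obs.stat)   # should land near the simulated p-value above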
### binomial
# the confidence interval includes 50%, so we cannot reject the null hypothesis
binom.test(12, 20, p=0.5)
## 
##  Exact binomial test
## 
## data:  12 and 20
## number of successes = 12, number of trials = 20, p-value = 0.5034
## alternative hypothesis: true probability of success is not equal to 0.5
## 95 percent confidence interval:
##  0.3605426 0.8088099
## sample estimates:
## probability of success 
##                    0.6
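# hand check of the exact two-sided p-value: with p = 0.5 the binomial is
# symmetric, so it equals P(X >= 12) + P(X <= 8)
2 * pbinom(8, 20, 0.5)   # ~0.5034, matching binom.test() above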
# with more observations at the same 60% rate, the confidence interval no longer includes 50% and the p-value is below 0.05
binom.test(120, 200, p=0.5)
## 
##  Exact binomial test
## 
## data:  120 and 200
## number of successes = 120, number of trials = 200, p-value =
## 0.005685
## alternative hypothesis: true probability of success is not equal to 0.5
## 95 percent confidence interval:
##  0.5285357 0.6684537
## sample estimates:
## probability of success 
##                    0.6
# with even more observations, the confidence interval narrows further around 0.6
binom.test(36000,60000, p=0.5)
## 
##  Exact binomial test
## 
## data:  36000 and 60000
## number of successes = 36000, number of trials = 60000, p-value <
## 2.2e-16
## alternative hypothesis: true probability of success is not equal to 0.5
## 95 percent confidence interval:
##  0.5960670 0.6039233
## sample estimates:
## probability of success 
##                    0.6
# collect 95% CIs for an observed 60% success rate at increasing sample sizes (n = 5, 10, ..., 150)
test.mat <- matrix(data = NA, nrow=30, ncol=2)
for(i in seq_len(30)) test.mat[i,] <- binom.test(i*3, i*5, p=0.5)$conf.int
# stack the lower and upper bounds into long format for plotting
testdata <- data.frame(X2 = rep(1:30, 2), ci = c(test.mat[,1], test.mat[,2]))
boxplot(ci ~ X2, data = testdata, col = "lightblue",
        ylab = "95 percent confidence interval",
        xlab = "sample size",
        xaxt = "n",
        main = "Probability of success (0.6) by sample size"
        )

axis(side=1, at=row(test.mat)[,1], labels = row(test.mat)[,1]*5)
# reference line at the true probability of success (0.6)
abline(h=0.6, col ="grey", lwd = 3, lty = 3)
# reference line at 0.5: does the interval include it?
abline(h=0.5, col ="red", lwd =3, lty = 3 )

# If we observe 20 fans and the true split is 50%, there is a 73.7% chance that we would observe between 8 and 12 fans (and thus a 1 - p, or 26.3%, chance of observing fewer than 8 or more than 12)
sum(dbinom(8:12, 20, 0.5))
## [1] 0.736824
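# same quantity via the cumulative distribution: P(X <= 12) - P(X <= 7)
pbinom(12, 20, 0.5) - pbinom(7, 20, 0.5)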
# the Agresti-Coull interval may be more appropriate for small N
# install.packages("binom")
binom::binom.confint(12, 20, method="ac")
##          method  x  n mean     lower     upper
## 1 agresti-coull 12 20  0.6 0.3860304 0.7817446
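# rough sketch of the Agresti-Coull idea (a hand check, not the binom package's
# code): add about z^2/2 successes and z^2/2 failures, then use the usual
# normal-approximation interval; it reproduces the bounds reported above
z <- qnorm(0.975)
n.adj <- 20 + z^2
p.adj <- (12 + z^2/2) / n.adj
p.adj + c(-1, 1) * z * sqrt(p.adj * (1 - p.adj) / n.adj)   # ~ (0.386, 0.782)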
# The negative lower bound may be ignored as an artifact, and we conclude that although Chris observed 0 cases, the occurrence of mixed fandom groups is likely to be somewhere between 0 and 19%.
binom::binom.confint(0, 20, method="ac")
##          method x  n mean      lower     upper
## 1 agresti-coull 0 20    0 -0.0286844 0.1898096

t-test

# first do some plotting to check income
hist(seg.df$income)

with(seg.df, hist(income[ownHome=="ownYes"]))

with(seg.df, hist(income[ownHome=="ownNo"]))

# then the t-tests
t.test(income ~ ownHome, data=seg.df)
## 
##  Welch Two Sample t-test
## 
## data:  income by ownHome
## t = -3.2731, df = 285.25, p-value = 0.001195
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -12080.155  -3007.193
## sample estimates:
##  mean in group ownNo mean in group ownYes 
##             47391.01             54934.68
# computing the Welch t statistic by hand
tMean = aggregate(income ~ ownHome, data=seg.df, mean)[,2]
# note: the reported df = 285.25 is fractional because Welch's t-test uses the
# Welch-Satterthwaite approximation for the degrees of freedom (checked below)
tVar = aggregate(income ~ ownHome, data=seg.df, var)[,2]
tCount = with(seg.df, table(ownHome))
tSem = sqrt(sum(tVar/tCount))
(tMean[1]-tMean[2])/tSem
## [1] -3.273094
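# Welch-Satterthwaite degrees of freedom (a hand check; this is why t.test()
# reports df = 285.25 rather than n1 + n2 - 2)
(tDF <- sum(tVar/tCount)^2 / sum((tVar/tCount)^2 / (tCount - 1)))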
# within the Travelers segment, income does not differ significantly by home ownership
t.test(income ~ ownHome, data=subset(seg.df, Segment=="Travelers"))
## 
##  Welch Two Sample t-test
## 
## data:  income by ownHome
## t = 0.26561, df = 53.833, p-value = 0.7916
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -8508.993 11107.604
## sample estimates:
##  mean in group ownNo mean in group ownYes 
##             63188.42             61889.12

ANOVA

seg.aov.own <- aov(income ~ ownHome, data=seg.df)
# within-group sum of squares for the ownNo group
sum((seg.df[seg.df$ownHome=="ownNo", "income"] - mean(seg.df[seg.df$ownHome=="ownNo", "income"]))^2)
## [1] 56673474312
# diagnostic plots for the one-way model
plot(aov(income ~ ownHome, data=seg.df))

# mean income by home ownership
aggregate(income ~ ownHome, data=seg.df, mean)[2]
##     income
## 1 47391.01
## 2 54934.68
# each group mean deviates from the overall mean; those deviations drive the between-group sum of squares
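# hand check of the between-group ("ownHome") sum of squares; it should match
# the ownHome Sum Sq row in the anova() table below
grand.mean <- mean(seg.df$income)
group.mean <- aggregate(income ~ ownHome, data=seg.df, mean)$income
group.n    <- table(seg.df$ownHome)
sum(group.n * (group.mean - grand.mean)^2)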


anova(seg.aov.own)
## Analysis of Variance Table
## 
## Response: income
##            Df     Sum Sq    Mean Sq F value   Pr(>F)   
## ownHome     1 4.2527e+09 4252661211  10.832 0.001118 **
## Residuals 298 1.1700e+11  392611030                    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
seg.aov.seg <- aov(income ~ Segment, data=seg.df)
anova(seg.aov.seg)
## Analysis of Variance Table
## 
## Response: income
##            Df     Sum Sq    Mean Sq F value    Pr(>F)    
## Segment     3 5.4970e+10 1.8323e+10  81.828 < 2.2e-16 ***
## Residuals 296 6.6281e+10 2.2392e+08                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# two-way aov
anova(aov(income ~ Segment + ownHome, data=seg.df))
## Analysis of Variance Table
## 
## Response: income
##            Df     Sum Sq    Mean Sq F value Pr(>F)    
## Segment     3 5.4970e+10 1.8323e+10 81.6381 <2e-16 ***
## ownHome     1 6.9918e+07 6.9918e+07  0.3115 0.5772    
## Residuals 295 6.6211e+10 2.2444e+08                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
anova(aov(income ~ Segment * ownHome, data=seg.df))
## Analysis of Variance Table
## 
## Response: income
##                  Df     Sum Sq    Mean Sq F value Pr(>F)    
## Segment           3 5.4970e+10 1.8323e+10 81.1305 <2e-16 ***
## ownHome           1 6.9918e+07 6.9918e+07  0.3096 0.5784    
## Segment:ownHome   3 2.6329e+08 8.7762e+07  0.3886 0.7613    
## Residuals       292 6.5948e+10 2.2585e+08                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# compare models
anova(aov(income ~ Segment, data=seg.df),
      aov(income ~ Segment + ownHome, data=seg.df))
## Analysis of Variance Table
## 
## Model 1: income ~ Segment
## Model 2: income ~ Segment + ownHome
##   Res.Df        RSS Df Sum of Sq      F Pr(>F)
## 1    296 6.6281e+10                           
## 2    295 6.6211e+10  1  69918004 0.3115 0.5772
### Visualize ANOVA group means

# create an aov model; the intercept term makes the default glht() plot hard to read
seg.aov <- aov(income ~ Segment, data=seg.df)
multcomp::glht(seg.aov)
## 
##   General Linear Hypotheses
## 
## Linear Hypotheses:
##                        Estimate
## (Intercept) == 0          53091
## SegmentSuburb mix == 0     1943
## SegmentTravelers == 0      9123
## SegmentUrban hip == 0    -31409
# make new AOV without intercept, as a *convenience for plotting* (not for modeling)
# it helps with plotting because it puts all segment means on the same scale
seg.aov <- aov(income ~ -1 + Segment, data=seg.df)

multcomp::glht(seg.aov)
## 
##   General Linear Hypotheses
## 
## Linear Hypotheses:
##                        Estimate
## SegmentMoving up == 0     53091
## SegmentSuburb mix == 0    55034
## SegmentTravelers == 0     62214
## SegmentUrban hip == 0     21682
par(mar=c(6,10,2,2))   # adjusts margins to preserve axis labels
plot(multcomp::glht(seg.aov), xlab="Income", main="Average Income by Segment (95% CI)")
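# the numeric intervals behind the plot can also be printed directly, since
# confint() works on glht objects
confint(multcomp::glht(seg.aov))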

### stepwise ANOVA

seg.aov.step <- step(aov(income ~ ., data=seg.df))
## Start:  AIC=5779.17
## income ~ age + gender + kids + ownHome + subscribe + Segment
## 
##             Df  Sum of Sq        RSS    AIC
## - age        1 4.7669e+06 6.5661e+10 5777.2
## - ownHome    1 1.0337e+08 6.5759e+10 5777.6
## - kids       1 1.3408e+08 6.5790e+10 5777.8
## - subscribe  1 1.5970e+08 6.5816e+10 5777.9
## - gender     1 2.6894e+08 6.5925e+10 5778.4
## <none>                    6.5656e+10 5779.2
## - Segment    3 1.9303e+10 8.4959e+10 5850.5
## 
## Step:  AIC=5777.19
## income ~ gender + kids + ownHome + subscribe + Segment
## 
##             Df  Sum of Sq        RSS    AIC
## - ownHome    1 1.0159e+08 6.5762e+10 5775.7
## - kids       1 1.3205e+08 6.5793e+10 5775.8
## - subscribe  1 1.5794e+08 6.5819e+10 5775.9
## - gender     1 2.7009e+08 6.5931e+10 5776.4
## <none>                    6.5661e+10 5777.2
## - Segment    3 4.9044e+10 1.1470e+11 5938.6
## 
## Step:  AIC=5775.66
## income ~ gender + kids + subscribe + Segment
## 
##             Df  Sum of Sq        RSS    AIC
## - kids       1 1.0707e+08 6.5869e+10 5774.1
## - subscribe  1 1.6370e+08 6.5926e+10 5774.4
## - gender     1 2.5520e+08 6.6017e+10 5774.8
## <none>                    6.5762e+10 5775.7
## - Segment    3 5.2897e+10 1.1866e+11 5946.7
## 
## Step:  AIC=5774.15
## income ~ gender + subscribe + Segment
## 
##             Df  Sum of Sq        RSS    AIC
## - subscribe  1 1.6226e+08 6.6032e+10 5772.9
## - gender     1 2.4390e+08 6.6113e+10 5773.3
## <none>                    6.5869e+10 5774.1
## - Segment    3 5.3005e+10 1.1887e+11 5945.3
## 
## Step:  AIC=5772.88
## income ~ gender + Segment
## 
##           Df  Sum of Sq        RSS    AIC
## - gender   1 2.4949e+08 6.6281e+10 5772.0
## <none>                  6.6032e+10 5772.9
## - Segment  3 5.4001e+10 1.2003e+11 5946.2
## 
## Step:  AIC=5772.02
## income ~ Segment
## 
##           Df Sum of Sq        RSS    AIC
## <none>                 6.6281e+10 5772.0
## - Segment  3 5.497e+10 1.2125e+11 5947.2
anova(seg.aov.step)
## Analysis of Variance Table
## 
## Response: income
##            Df     Sum Sq    Mean Sq F value    Pr(>F)    
## Segment     3 5.4970e+10 1.8323e+10  81.828 < 2.2e-16 ***
## Residuals 296 6.6281e+10 2.2392e+08                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# clean up -- not shown in book
rm(seg.aov.own, seg.aov.seg, seg.aov.step, seg.aov)

Bayesian ANOVA

set.seed(96761)
library(BayesFactor)
## ************
## Welcome to BayesFactor 0.9.12-2. If you have questions, please contact Richard Morey (richarddmorey@gmail.com).
## 
## Type BFManual() to open the manual.
## ************
seg.bf1 <- lmBF(income ~ Segment, data=seg.df)
seg.bf2 <- lmBF(income ~ Segment + ownHome, data=seg.df)
seg.bf1 / seg.bf2
## Bayes factor analysis
## --------------
## [1] Segment : 6.579729 ±1.62%
## 
## Against denominator:
##   income ~ Segment + ownHome 
## ---
## Bayes factor type: BFlinearModel, JZS
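# the reciprocal states the same comparison the other way around: evidence for
# Segment + ownHome relative to Segment alone (about 1/6.6)
seg.bf2 / seg.bf1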
seg.bf.chain <- posterior(seg.bf1, 1, iterations = 10000)

# plot the trace for the posterior draw chain
plot(seg.bf.chain[, 1:6])   # note console: may need to hit <Return> to see all

summary(seg.bf.chain)
## 
## Iterations = 1:10000
## Thinning interval = 1 
## Number of chains = 1 
## Sample size per chain = 10000 
## 
## 1. Empirical mean and standard deviation for each variable,
##    plus standard error of the mean:
## 
##                          Mean        SD  Naive SE Time-series SE
## mu                  4.806e+04 8.969e+02 8.969e+00      8.804e+00
## Segment-Moving up   4.951e+03 1.548e+03 1.548e+01      1.548e+01
## Segment-Suburb mix  6.927e+03 1.373e+03 1.373e+01      1.373e+01
## Segment-Travelers   1.398e+04 1.487e+03 1.487e+01      1.518e+01
## Segment-Urban hip  -2.586e+04 1.777e+03 1.777e+01      1.956e+01
## sig2                2.259e+08 1.856e+07 1.856e+05      1.856e+05
## g_Segment           2.138e+00 3.359e+00 3.359e-02      3.359e-02
## 
## 2. Quantiles for each variable:
## 
##                          2.5%        25%        50%        75%      97.5%
## mu                  4.631e+04  4.745e+04  4.805e+04  4.868e+04  4.982e+04
## Segment-Moving up   1.925e+03  3.916e+03  4.968e+03  5.961e+03  8.054e+03
## Segment-Suburb mix  4.243e+03  5.996e+03  6.934e+03  7.857e+03  9.608e+03
## Segment-Travelers   1.104e+04  1.297e+04  1.399e+04  1.499e+04  1.690e+04
## Segment-Urban hip  -2.934e+04 -2.703e+04 -2.586e+04 -2.466e+04 -2.239e+04
## sig2                1.923e+08  2.128e+08  2.249e+08  2.378e+08  2.647e+08
## g_Segment           3.765e-01  7.949e-01  1.298e+00  2.284e+00  8.738e+00
head(seg.bf.chain)
## Markov Chain Monte Carlo (MCMC) output:
## Start = 1 
## End = 7 
## Thinning interval = 1 
##            mu Segment-Moving up Segment-Suburb mix Segment-Travelers
## [1,] 48055.75         4964.3105           6909.032          13983.21
## [2,] 47706.52         6478.1497           7816.873          12160.32
## [3,] 48362.90         5228.0718           6654.030          12565.87
## [4,] 49417.43         5300.9543           7249.228          12218.89
## [5,] 48177.21         6150.6339           6025.763          15589.03
## [6,] 49440.51          186.6233           6259.676          14172.76
## [7,] 46650.93         3530.4188           6921.267          14348.78
##      Segment-Urban hip      sig2 g_Segment
## [1,]         -25856.55 223195155 0.7316422
## [2,]         -26455.34 242898723 1.0773890
## [3,]         -24447.97 212407122 0.4827941
## [4,]         -24769.08 210490536 5.9279560
## [5,]         -27765.43 249129453 1.8348678
## [6,]         -20619.06 249498924 2.8711583
## [7,]         -24800.47 220446304 2.5618715
seg.bf.chain[1:4, 1:5]
##            mu Segment-Moving up Segment-Suburb mix Segment-Travelers
## [1,] 48055.75          4964.310           6909.032          13983.21
## [2,] 47706.52          6478.150           7816.873          12160.32
## [3,] 48362.90          5228.072           6654.030          12565.87
## [4,] 49417.43          5300.954           7249.228          12218.89
##      Segment-Urban hip
## [1,]         -25856.55
## [2,]         -26455.34
## [3,]         -24447.97
## [4,]         -24769.08
seg.bf.chain[1:4, 2:5] + seg.bf.chain[1:4, 1]
##      Segment-Moving up Segment-Suburb mix Segment-Travelers
## [1,]          53020.06           54964.78          62038.95
## [2,]          54184.67           55523.40          59866.84
## [3,]          53590.97           55016.93          60928.77
## [4,]          54718.38           56666.66          61636.32
##      Segment-Urban hip
## [1,]          22199.20
## [2,]          21251.18
## [3,]          23914.93
## [4,]          24648.35
seg.bf.chain.total <- seg.bf.chain[, 2:5] + seg.bf.chain[, 1]
seg.bf.ci <- t(apply(seg.bf.chain.total, 2, 
                     quantile, pr=c(0.025, 0.5, 0.975)))
seg.bf.ci
##                        2.5%      50%    97.5%
## Segment-Moving up  49582.08 53020.98 56522.05
## Segment-Suburb mix 52039.66 54988.99 57867.29
## Segment-Travelers  58799.46 62048.33 65355.62
## Segment-Urban hip  17992.85 22216.26 26450.56
### plot the credible intervals
# ggplot2 version
# install.packages("ggplot2")
library(ggplot2)

# make a data frame from the posterior summary
seg.bf.df <- data.frame(seg.bf.ci)
seg.bf.df$Segment <- rownames(seg.bf.df)

# basic plot: median (50th percentile) point with 95% interval limits for each segment
p <- ggplot(seg.bf.df, aes(x=Segment, y=X50., ymax=X97.5., ymin=X2.5.))
# add the points and error bars
p <- p + geom_point(size=4) + geom_errorbar(width=0.2) + ylab("Income")

# plot it, adding a title, and flipping to horizontal

p + ggtitle("95% CI for Mean Income by Segment") + coord_flip()