temp.packages <- c("binom")
temp.packages <- temp.packages[!(temp.packages %in% installed.packages()[,"Package"])]
if(length(temp.packages)>0) install.packages(temp.packages)
remove("temp.packages")
seg.df <- read.csv("http://goo.gl/qw303p")
summary(seg.df)
## age gender income kids ownHome
## Min. :19.26 Female:157 Min. : -5183 Min. :0.00 ownNo :159
## 1st Qu.:33.01 Male :143 1st Qu.: 39656 1st Qu.:0.00 ownYes:141
## Median :39.49 Median : 52014 Median :1.00
## Mean :41.20 Mean : 50937 Mean :1.27
## 3rd Qu.:47.90 3rd Qu.: 61403 3rd Qu.:2.00
## Max. :80.49 Max. :114278 Max. :7.00
## subscribe Segment
## subNo :260 Moving up : 70
## subYes: 40 Suburb mix:100
## Travelers : 80
## Urban hip : 50
##
##
chi-square tests
(tmp.tab <- table(rep(c(1:4), times=c(25,25,25,20))))
##
## 1 2 3 4
## 25 25 25 20
chisq.test(tmp.tab)
##
## Chi-squared test for given probabilities
##
## data: tmp.tab
## X-squared = 0.78947, df = 3, p-value = 0.852
(tmp.tab <- table(rep(c(1:4), times=c(10,10,10,20))))
##
## 1 2 3 4
## 10 10 10 20
chisq.test(tmp.tab)
##
## Chi-squared test for given probabilities
##
## data: tmp.tab
## X-squared = 6, df = 3, p-value = 0.1116
tmp.tab <- tmp.tab/5
tmp.tab
##
## 1 2 3 4
## 2 2 2 4
chisq.test(tmp.tab)
## Warning in chisq.test(tmp.tab): Chi-squared approximation may be incorrect
##
## Chi-squared test for given probabilities
##
## data: tmp.tab
## X-squared = 1.2, df = 3, p-value = 0.753
# one way chi-square in our data
chisq.test(table(seg.df$Segment))
##
## Chi-squared test for given probabilities
##
## data: table(seg.df$Segment)
## X-squared = 17.333, df = 3, p-value = 0.0006035
# chisq.test one way formula
observedV = table(seg.df$Segment)
ExpectedV <- sum(observedV/sum(observedV)*observedV)
(ChiSquared <- sum((ExpectedV-observedV)^2/ExpectedV))
## [1] 17.33333
# two-way chi-square
table(seg.df$subscribe, seg.df$ownHome)
##
## ownNo ownYes
## subNo 137 123
## subYes 22 18
#contingency tables = defaults to using Yates’ correction(correct=TRUE)
chisq.test(table(seg.df$subscribe, seg.df$ownHome))
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: table(seg.df$subscribe, seg.df$ownHome)
## X-squared = 0.010422, df = 1, p-value = 0.9187
# chisq.test tow way formula
#https://www.khanacademy.org/math/probability/statistics-inferential/chi-square/v/contingency-table-chi-square-test
# two-way chi-square without correction (matches traditional formula)
chisq.test(table(seg.df$subscribe, seg.df$ownHome), correct=FALSE)
##
## Pearson's Chi-squared test
##
## data: table(seg.df$subscribe, seg.df$ownHome)
## X-squared = 0.074113, df = 1, p-value = 0.7854
# two-way with simulation to establish p value
#Monte Carlo's Simulation
chisq.test(table(seg.df$subscribe, seg.df$ownHome), sim=TRUE, B=10000) #based on 2000 replicates
##
## Pearson's Chi-squared test with simulated p-value (based on 10000
## replicates)
##
## data: table(seg.df$subscribe, seg.df$ownHome)
## X-squared = 0.074113, df = NA, p-value = 0.8645
### binomial
#the confidence interval includes 50 %. You can't reject NULL Hypothesis
binom.test(12, 20, p=0.5)
##
## Exact binomial test
##
## data: 12 and 20
## number of successes = 12, number of trials = 20, p-value = 0.5034
## alternative hypothesis: true probability of success is not equal to 0.5
## 95 percent confidence interval:
## 0.3605426 0.8088099
## sample estimates:
## probability of success
## 0.6
#more samples, the confidence interval no longer includes 50 %. p-value is less than 0.05
binom.test(120, 200, p=0.5)
##
## Exact binomial test
##
## data: 120 and 200
## number of successes = 120, number of trials = 200, p-value =
## 0.005685
## alternative hypothesis: true probability of success is not equal to 0.5
## 95 percent confidence interval:
## 0.5285357 0.6684537
## sample estimates:
## probability of success
## 0.6
#more samples
binom.test(36000,60000, p=0.5)
##
## Exact binomial test
##
## data: 36000 and 60000
## number of successes = 36000, number of trials = 60000, p-value <
## 2.2e-16
## alternative hypothesis: true probability of success is not equal to 0.5
## 95 percent confidence interval:
## 0.5960670 0.6039233
## sample estimates:
## probability of success
## 0.6
test.mat = matrix(data = NA, nrow=30, ncol=2)
for(i in seq_along(1:30)) test.mat[i,] <- binom.test(i*3, i*5, p=0.5)$conf.int
testdata = rbind(data.frame(row(test.mat),ci= test.mat[,1])[,2:3],data.frame(row(test.mat),ci=test.mat[,2])[,2:3])
boxplot(testdata$ci ~ testdata$X2, col= "lightblue",
ylab = "95 percent confidence interval",
xlab = "sampleing size",
xaxt = "n",
main = "probability of success(0.6) based on sample size"
)
axis(side=1, at=row(test.mat)[,1], labels = row(test.mat)[,1]*5)
#probability of success(0.6)
abline(h=0.6, col ="grey", lwd = 3, lty = 3)
#include 0.5 or not
abline(h=0.5, col ="red", lwd =3, lty = 3 )

#If we observe 20 fans, and the true split is 50 %, there is a 73.7 % chance that we would observe between 8 and 12 fans(and thus a 1 − p or 27.3 % chance of observ- ing fewer than 8 or more than 12)
sum(dbinom(8:12, 20, 0.5))
## [1] 0.736824
# agresti-coull might be more applicable for small N
# install.packages("binom")
binom::binom.confint(12, 20, method="ac")
## method x n mean lower upper
## 1 agresti-coull 12 20 0.6 0.3860304 0.7817446
#The negative lower bound may be ignored as an artifact, and we conclude that al- though Chris observed 0 cases, the occurrence of mixed fandom groups is likely to be somewhere between 0 and 19 %.
binom::binom.confint(0, 20, method="ac")
## method x n mean lower upper
## 1 agresti-coull 0 20 0 -0.0286844 0.1898096
t-test
# first do some plotting to check income
hist(seg.df$income)

with(seg.df, hist(income[ownHome=="ownYes"]))

with(seg.df, hist(income[ownHome=="ownNo"]))

# then the t-tests
t.test(income ~ ownHome, data=seg.df)
##
## Welch Two Sample t-test
##
## data: income by ownHome
## t = -3.2731, df = 285.25, p-value = 0.001195
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -12080.155 -3007.193
## sample estimates:
## mean in group ownNo mean in group ownYes
## 47391.01 54934.68
#t-test formula
tMean = aggregate(income ~ ownHome, data=seg.df, mean)[,2]
#?????#?????#?????#?????#?????
#????? why df = 285.25 #?????
#?????#?????#?????#?????#?????
tVar = aggregate(income ~ ownHome, data=seg.df, var)[,2]
tCount = with(seg.df, table(ownHome))
tSem = sqrt(sum(tVar/tCount))
(tMean[1]-tMean[2])/tSem
## [1] -3.273094
#Travelers are not
t.test(income ~ ownHome, data=subset(seg.df, Segment=="Travelers"))
##
## Welch Two Sample t-test
##
## data: income by ownHome
## t = 0.26561, df = 53.833, p-value = 0.7916
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -8508.993 11107.604
## sample estimates:
## mean in group ownNo mean in group ownYes
## 63188.42 61889.12
ANOVA
seg.aov.own <- aov(income ~ ownHome, data=seg.df)
sum((seg.df[seg.df$ownHome== "ownNo","income"] - mean(seg.df[seg.df$ownHome== "ownNo","income"]))^2)
## [1] 56673474312
plot(aov(income ~ ownHome, data=seg.df))




aggregate(income ~ ownHome, data=seg.df, mean)[2]
## income
## 1 47391.01
## 2 54934.68
#total mean - vaue
anova(seg.aov.own)
## Analysis of Variance Table
##
## Response: income
## Df Sum Sq Mean Sq F value Pr(>F)
## ownHome 1 4.2527e+09 4252661211 10.832 0.001118 **
## Residuals 298 1.1700e+11 392611030
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
seg.aov.seg <- aov(income ~ Segment, data=seg.df)
anova(seg.aov.seg)
## Analysis of Variance Table
##
## Response: income
## Df Sum Sq Mean Sq F value Pr(>F)
## Segment 3 5.4970e+10 1.8323e+10 81.828 < 2.2e-16 ***
## Residuals 296 6.6281e+10 2.2392e+08
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# two-way aov
anova(aov(income ~ Segment + ownHome, data=seg.df))
## Analysis of Variance Table
##
## Response: income
## Df Sum Sq Mean Sq F value Pr(>F)
## Segment 3 5.4970e+10 1.8323e+10 81.6381 <2e-16 ***
## ownHome 1 6.9918e+07 6.9918e+07 0.3115 0.5772
## Residuals 295 6.6211e+10 2.2444e+08
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
anova(aov(income ~ Segment * ownHome, data=seg.df))
## Analysis of Variance Table
##
## Response: income
## Df Sum Sq Mean Sq F value Pr(>F)
## Segment 3 5.4970e+10 1.8323e+10 81.1305 <2e-16 ***
## ownHome 1 6.9918e+07 6.9918e+07 0.3096 0.5784
## Segment:ownHome 3 2.6329e+08 8.7762e+07 0.3886 0.7613
## Residuals 292 6.5948e+10 2.2585e+08
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# compare models
anova(aov(income ~ Segment, data=seg.df),
aov(income ~ Segment + ownHome, data=seg.df))
## Analysis of Variance Table
##
## Model 1: income ~ Segment
## Model 2: income ~ Segment + ownHome
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 296 6.6281e+10
## 2 295 6.6211e+10 1 69918004 0.3115 0.5772
### Visualize ANOVA group means
# create an aov model. problem for glht() plotting because of the intercept term
seg.aov <- aov(income ~ Segment, data=seg.df)
multcomp::glht(seg.aov)
##
## General Linear Hypotheses
##
## Linear Hypotheses:
## Estimate
## (Intercept) == 0 53091
## SegmentSuburb mix == 0 1943
## SegmentTravelers == 0 9123
## SegmentUrban hip == 0 -31409
# make new AOV without intercept, as a *convenience for plotting* (not for modeling)
# it helps with plotting because it keeps all segments on same scale
seg.aov <- aov(income ~ -1 + Segment, data=seg.df)
multcomp::glht(seg.aov)
##
## General Linear Hypotheses
##
## Linear Hypotheses:
## Estimate
## SegmentMoving up == 0 53091
## SegmentSuburb mix == 0 55034
## SegmentTravelers == 0 62214
## SegmentUrban hip == 0 21682
par(mar=c(6,10,2,2)) # adjusts margins to preserve axis labels
plot(multcomp::glht(seg.aov), xlab="Income", main="Average Income by Segment (95% CI)")

### stepwise ANOVA
seg.aov.step <- step(aov(income ~ ., data=seg.df))
## Start: AIC=5779.17
## income ~ age + gender + kids + ownHome + subscribe + Segment
##
## Df Sum of Sq RSS AIC
## - age 1 4.7669e+06 6.5661e+10 5777.2
## - ownHome 1 1.0337e+08 6.5759e+10 5777.6
## - kids 1 1.3408e+08 6.5790e+10 5777.8
## - subscribe 1 1.5970e+08 6.5816e+10 5777.9
## - gender 1 2.6894e+08 6.5925e+10 5778.4
## <none> 6.5656e+10 5779.2
## - Segment 3 1.9303e+10 8.4959e+10 5850.5
##
## Step: AIC=5777.19
## income ~ gender + kids + ownHome + subscribe + Segment
##
## Df Sum of Sq RSS AIC
## - ownHome 1 1.0159e+08 6.5762e+10 5775.7
## - kids 1 1.3205e+08 6.5793e+10 5775.8
## - subscribe 1 1.5794e+08 6.5819e+10 5775.9
## - gender 1 2.7009e+08 6.5931e+10 5776.4
## <none> 6.5661e+10 5777.2
## - Segment 3 4.9044e+10 1.1470e+11 5938.6
##
## Step: AIC=5775.66
## income ~ gender + kids + subscribe + Segment
##
## Df Sum of Sq RSS AIC
## - kids 1 1.0707e+08 6.5869e+10 5774.1
## - subscribe 1 1.6370e+08 6.5926e+10 5774.4
## - gender 1 2.5520e+08 6.6017e+10 5774.8
## <none> 6.5762e+10 5775.7
## - Segment 3 5.2897e+10 1.1866e+11 5946.7
##
## Step: AIC=5774.15
## income ~ gender + subscribe + Segment
##
## Df Sum of Sq RSS AIC
## - subscribe 1 1.6226e+08 6.6032e+10 5772.9
## - gender 1 2.4390e+08 6.6113e+10 5773.3
## <none> 6.5869e+10 5774.1
## - Segment 3 5.3005e+10 1.1887e+11 5945.3
##
## Step: AIC=5772.88
## income ~ gender + Segment
##
## Df Sum of Sq RSS AIC
## - gender 1 2.4949e+08 6.6281e+10 5772.0
## <none> 6.6032e+10 5772.9
## - Segment 3 5.4001e+10 1.2003e+11 5946.2
##
## Step: AIC=5772.02
## income ~ Segment
##
## Df Sum of Sq RSS AIC
## <none> 6.6281e+10 5772.0
## - Segment 3 5.497e+10 1.2125e+11 5947.2
anova(seg.aov.step)
## Analysis of Variance Table
##
## Response: income
## Df Sum Sq Mean Sq F value Pr(>F)
## Segment 3 5.4970e+10 1.8323e+10 81.828 < 2.2e-16 ***
## Residuals 296 6.6281e+10 2.2392e+08
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# clean up -- not shown in book
rm(seg.aov.own, seg.aov.seg, seg.aov.step, seg.aov)
*Bayesian ANOVA
set.seed(96761)
library(BayesFactor)
## ************
## Welcome to BayesFactor 0.9.12-2. If you have questions, please contact Richard Morey (richarddmorey@gmail.com).
##
## Type BFManual() to open the manual.
## ************
seg.bf1 <- lmBF(income ~ Segment, data=seg.df)
seg.bf2 <- lmBF(income ~ Segment + ownHome, data=seg.df)
seg.bf1 / seg.bf2
## Bayes factor analysis
## --------------
## [1] Segment : 6.579729 ±1.62%
##
## Against denominator:
## income ~ Segment + ownHome
## ---
## Bayes factor type: BFlinearModel, JZS
seg.bf.chain <- posterior(seg.bf1, 1, iterations = 10000)
# plot the trace for posterior draw chain
plot(seg.bf.chain[, 1:6]) # note console: may need to hit <Return> to see all


summary(seg.bf.chain)
##
## Iterations = 1:10000
## Thinning interval = 1
## Number of chains = 1
## Sample size per chain = 10000
##
## 1. Empirical mean and standard deviation for each variable,
## plus standard error of the mean:
##
## Mean SD Naive SE Time-series SE
## mu 4.806e+04 8.969e+02 8.969e+00 8.804e+00
## Segment-Moving up 4.951e+03 1.548e+03 1.548e+01 1.548e+01
## Segment-Suburb mix 6.927e+03 1.373e+03 1.373e+01 1.373e+01
## Segment-Travelers 1.398e+04 1.487e+03 1.487e+01 1.518e+01
## Segment-Urban hip -2.586e+04 1.777e+03 1.777e+01 1.956e+01
## sig2 2.259e+08 1.856e+07 1.856e+05 1.856e+05
## g_Segment 2.138e+00 3.359e+00 3.359e-02 3.359e-02
##
## 2. Quantiles for each variable:
##
## 2.5% 25% 50% 75% 97.5%
## mu 4.631e+04 4.745e+04 4.805e+04 4.868e+04 4.982e+04
## Segment-Moving up 1.925e+03 3.916e+03 4.968e+03 5.961e+03 8.054e+03
## Segment-Suburb mix 4.243e+03 5.996e+03 6.934e+03 7.857e+03 9.608e+03
## Segment-Travelers 1.104e+04 1.297e+04 1.399e+04 1.499e+04 1.690e+04
## Segment-Urban hip -2.934e+04 -2.703e+04 -2.586e+04 -2.466e+04 -2.239e+04
## sig2 1.923e+08 2.128e+08 2.249e+08 2.378e+08 2.647e+08
## g_Segment 3.765e-01 7.949e-01 1.298e+00 2.284e+00 8.738e+00
head(seg.bf.chain)
## Markov Chain Monte Carlo (MCMC) output:
## Start = 1
## End = 7
## Thinning interval = 1
## mu Segment-Moving up Segment-Suburb mix Segment-Travelers
## [1,] 48055.75 4964.3105 6909.032 13983.21
## [2,] 47706.52 6478.1497 7816.873 12160.32
## [3,] 48362.90 5228.0718 6654.030 12565.87
## [4,] 49417.43 5300.9543 7249.228 12218.89
## [5,] 48177.21 6150.6339 6025.763 15589.03
## [6,] 49440.51 186.6233 6259.676 14172.76
## [7,] 46650.93 3530.4188 6921.267 14348.78
## Segment-Urban hip sig2 g_Segment
## [1,] -25856.55 223195155 0.7316422
## [2,] -26455.34 242898723 1.0773890
## [3,] -24447.97 212407122 0.4827941
## [4,] -24769.08 210490536 5.9279560
## [5,] -27765.43 249129453 1.8348678
## [6,] -20619.06 249498924 2.8711583
## [7,] -24800.47 220446304 2.5618715
seg.bf.chain[1:4, 1:5]
## mu Segment-Moving up Segment-Suburb mix Segment-Travelers
## [1,] 48055.75 4964.310 6909.032 13983.21
## [2,] 47706.52 6478.150 7816.873 12160.32
## [3,] 48362.90 5228.072 6654.030 12565.87
## [4,] 49417.43 5300.954 7249.228 12218.89
## Segment-Urban hip
## [1,] -25856.55
## [2,] -26455.34
## [3,] -24447.97
## [4,] -24769.08
seg.bf.chain[1:4, 2:5] + seg.bf.chain[1:4, 1]
## Segment-Moving up Segment-Suburb mix Segment-Travelers
## [1,] 53020.06 54964.78 62038.95
## [2,] 54184.67 55523.40 59866.84
## [3,] 53590.97 55016.93 60928.77
## [4,] 54718.38 56666.66 61636.32
## Segment-Urban hip
## [1,] 22199.20
## [2,] 21251.18
## [3,] 23914.93
## [4,] 24648.35
seg.bf.chain.total <- seg.bf.chain[, 2:5] + seg.bf.chain[, 1]
seg.bf.ci <- t(apply(seg.bf.chain.total, 2,
quantile, pr=c(0.025, 0.5, 0.975)))
seg.bf.ci
## 2.5% 50% 97.5%
## Segment-Moving up 49582.08 53020.98 56522.05
## Segment-Suburb mix 52039.66 54988.99 57867.29
## Segment-Travelers 58799.46 62048.33 65355.62
## Segment-Urban hip 17992.85 22216.26 26450.56
### plot the credible intervals
# ggplot2 version
# install.packages("ggplot2")
library(ggplot2)
# make a data frame from the posterior summary
seg.bf.df <- data.frame(seg.bf.ci)
seg.bf.df$Segment <- rownames(seg.bf.df)
# basic plot with segment, 50% and 95% limits
p <- ggplot(seg.bf.df, aes(x=Segment, y=X50., ymax=X97.5., ymin=X2.5.))
# add the points and error bars
p <- p + geom_point(size=4) + geom_errorbar(width=0.2) + ylab("Income")
# plot it, adding a title, and flipping to horizontal
p + ggtitle("95% CI for Mean Income by Segment") + coord_flip()
