Hi all! This section is about hypothesis testing.
- one sample t.test
- one sample prop.test
- two group comparisons
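Download McDonalds.csv here
Download PERatio.csv here
The examples below also read MShopping.csv and Cola.csv; all four files must be in the R working directory.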
one sample t.test
# example 1
# One-sample t-test on the `Time` column of McDonalds.csv (read from the
# working directory).
McDonalds <- read.csv("McDonalds.csv")
str(McDonalds)
## 'data.frame': 25 obs. of 1 variable:
## $ Time: num 155 143 140 152 188 ...
# Extract the column explicitly instead of attach()/detach(): attach() changes
# the search path, stacks up when the script is re-run after an error, and is
# silently masked by any global object that is also called `Time`.
Time <- McDonalds$Time
# t.test with the default null hypothesis mu = 0
t.test(Time)
##
## One Sample t-test
##
## data: Time
## t = 44.257, df = 24, p-value < 2.2e-16
## alternative hypothesis: true mean is not equal to 0
## 95 percent confidence interval:
## 151.6068 166.4388
## sample estimates:
## mean of x
## 159.0228
## manual calculation
m <- mean(Time)
s <- sd(Time)
n <- length(Time)
df <- n - 1 # degrees of freedom
# qt(p, df) returns the value x with P(X <= x) = p for a t distribution with
# df degrees of freedom; 0.975 leaves 2.5% in each tail
t95 <- qt(0.975, df)
## manual calculation of the t statistic against mu = 0
# stored as t_stat so that base::t() is not shadowed
t_stat <- (m - 0) / (s / sqrt(n))
## manual calculation of confidence interval at 0.05 significance level
ci <- m + c(-1, 1) * t95 * s / sqrt(n)
ci
## [1] 151.6068 166.4388
# example 3
# two-sided test against a hypothesised mean of 174.22
t.test(Time, mu = 174.22)
##
## One Sample t-test
##
## data: Time
## t = -4.2294, df = 24, p-value = 0.0002946
## alternative hypothesis: true mean is not equal to 174.22
## 95 percent confidence interval:
## 151.6068 166.4388
## sample estimates:
## mean of x
## 159.0228
# one-sided test: the alternative is that the true mean is less than 174.22
t.test(Time, mu = 174.22, alternative = "less")
##
## One Sample t-test
##
## data: Time
## t = -4.2294, df = 24, p-value = 0.0001473
## alternative hypothesis: true mean is less than 174.22
## 95 percent confidence interval:
## -Inf 165.1703
## sample estimates:
## mean of x
## 159.0228
one sample prop.test
# example 2
# One-sample proportion test on the Yes/No answers in MShopping.csv (read from
# the working directory).
MShopping <- read.csv("MShopping.csv")
str(MShopping)
## 'data.frame': 465 obs. of 1 variable:
## $ MShopping: Factor w/ 2 levels "No","Yes": 1 1 2 2 2 2 1 1 2 1 ...
# table() counts the "Yes" & "No" answers; the result is stored as `counts`
# so that the base function table() is not shadowed
counts <- table(MShopping)
counts
## MShopping
## No Yes
## 140 325
n <- nrow(MShopping)
# proportion test, "correct = FALSE" for Yates continuity correction not applied
prop.test(counts[2], n, correct = FALSE)
##
## 1-sample proportions test without continuity correction
##
## data: counts[2] out of n, null probability 0.5
## X-squared = 73.602, df = 1, p-value < 2.2e-16
## alternative hypothesis: true p is not equal to 0.5
## 95 percent confidence interval:
## 0.6557399 0.7388497
## sample estimates:
## p
## 0.6989247
# manual calculation of p
p <- counts[2] / n
p
## Yes
## 0.6989247
# manual observed-vs-expected table; under the null probability 0.5 each
# answer is expected n/2 times
ct <- rbind(counts, c(n / 2, n / 2))
rownames(ct) <- c("Observed", "Expected")
ct
## No Yes
## Observed 140.0 325.0
## Expected 232.5 232.5
# manual chi square calculation, sum of (observed - expected)^2 / expected,
# taken from `ct` rather than from re-typed counts
chi2 <- sum((ct["Observed", ] - ct["Expected", ])^2 / ct["Expected", ])
chi2
## [1] 73.60215
# manual confidence interval: prop.test() reports the Wilson score interval,
# which is built from the normal quantile (not a t quantile with n - 1 degrees
# of freedom), so this reproduces the interval printed above
z <- qnorm(0.975)
centre <- (p + z^2 / (2 * n)) / (1 + z^2 / n)
half_width <- z * sqrt(p * (1 - p) / n + z^2 / (4 * n^2)) / (1 + z^2 / n)
centre + c(-1, 1) * half_width
## [1] 0.6557399 0.7388497
two group comparisons
# example 4
# Two-sample comparison of the `Normal` and `EndAisle` columns of Cola.csv
# (read from the working directory).
cola <- read.csv("Cola.csv")
str(cola)
## 'data.frame': 10 obs. of 2 variables:
## $ Normal : int 22 34 52 62 30 40 64 84 56 59
## $ EndAisle: int 52 71 76 54 67 83 66 90 77 84
# Extract the columns explicitly instead of attach()/detach(): attach() changes
# the search path, stacks up when the script is re-run after an error, and is
# silently masked by global objects with the same names.
Normal <- cola$Normal
EndAisle <- cola$EndAisle
# normality check by Shapiro-Wilk test; both p-values are large (not < 0.05),
# so normality is not rejected for either sample
shapiro.test(Normal)
##
## Shapiro-Wilk normality test
##
## data: Normal
## W = 0.96721, p-value = 0.8638
shapiro.test(EndAisle)
##
## Shapiro-Wilk normality test
##
## data: EndAisle
## W = 0.9534, p-value = 0.7088
# variance equality check for normal distributions
var.test(x = Normal, y = EndAisle)
##
## F test to compare two variances
##
## data: Normal and EndAisle
## F = 2.2289, num df = 9, denom df = 9, p-value = 0.2482
## alternative hypothesis: true ratio of variances is not equal to 1
## 95 percent confidence interval:
## 0.5536233 8.9734747
## sample estimates:
## ratio of variances
## 2.228884
# t.test with var.equal = TRUE, since the F test above does not reject equal
# variances
t.test(Normal, EndAisle, var.equal = TRUE)
##
## Two Sample t-test
##
## data: Normal and EndAisle
## t = -3.0446, df = 18, p-value = 0.006975
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -36.6743 -6.7257
## sample estimates:
## mean of x mean of y
## 50.3 72.0
# for comparison: var.equal = FALSE runs the Welch test instead (same t,
# smaller df, slightly larger p-value)
t.test(Normal, EndAisle, var.equal = FALSE)
##
## Welch Two Sample t-test
##
## data: Normal and EndAisle
## t = -3.0446, df = 15.723, p-value = 0.007849
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -36.831299 -6.568701
## sample estimates:
## mean of x mean of y
## 50.3 72.0
# example 5
# Paired comparison of the `Year1` and `Year2` columns of PERatio.csv (read
# from the working directory); each row is one company.
peratio <- read.csv("PERatio.csv")
str(peratio)
## 'data.frame': 9 obs. of 3 variables:
## $ Company: int 1 2 3 4 5 6 7 8 9
## $ Year1 : num 8.9 38.1 43 34 34.5 15.2 20.3 19.9 61.9
## $ Year2 : num 12.7 45.4 10 27.2 22.8 ...
# Extract the columns explicitly instead of attach()/detach(): attach() changes
# the search path, stacks up when the script is re-run after an error, and is
# silently masked by global objects with the same names.
Year1 <- peratio$Year1
Year2 <- peratio$Year2
# normality test; the Year2 p-value is < 0.05, so normality is rejected for it
shapiro.test(Year1)
##
## Shapiro-Wilk normality test
##
## data: Year1
## W = 0.95123, p-value = 0.7035
shapiro.test(Year2)
##
## Shapiro-Wilk normality test
##
## data: Year2
## W = 0.76551, p-value = 0.008197
# ansari test for distributions that are not normal
# note p-value here is large and not <0.05 for significance
ansari.test(Year1, Year2)
##
## Ansari-Bradley test
##
## data: Year1 and Year2
## AB = 43, p-value = 0.795
## alternative hypothesis: true ratio of scales is not equal to 1
# t.test with paired = TRUE; var.equal is not passed because it only applies
# to the unpaired two-sample test and has no effect on a paired test
t.test(Year1, Year2, paired = TRUE)
##
## Paired t-test
##
## data: Year1 and Year2
## t = -0.69909, df = 8, p-value = 0.5043
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -21.63607 11.56941
## sample estimates:
## mean of the differences
## -5.033333
# example 6
# Two-sample proportion test on a 2x2 table of counts. Each column vector
# holds the "Yes" count, the "No" count and their total for one group.
Beachcomber <- c(163, 64, 227)
Windsurfer <- c(154, 108, 262)
survey <- data.frame(
  Beachcomber,
  Windsurfer,
  Total = Beachcomber + Windsurfer,
  row.names = c("Yes", "No", "Total")
)
survey
## Beachcomber Windsurfer Total
## Yes 163 154 317
## No 64 108 172
## Total 227 262 489
# proportion test: "Yes" counts (row 1) out of the group totals (row 3)
prop.test(as.integer(survey[1, 1:2]), as.integer(survey[3, 1:2]), correct = FALSE)
##
## 2-sample test for equality of proportions without continuity
## correction
##
## data: as.integer(survey[1, 1:2]) out of as.integer(survey[3, 1:2])
## X-squared = 9.0526, df = 1, p-value = 0.002623
## alternative hypothesis: two.sided
## 95 percent confidence interval:
## 0.04673792 0.21381291
## sample estimates:
## prop 1 prop 2
## 0.7180617 0.5877863
# manual calculation of chi^2 for a 2x2 table:
#   (a*d - b*c)^2 * N / (product of row totals * product of column totals)
# The cells and margins are taken from `survey` instead of being re-typed, so
# the result stays in sync with the data above.
obs <- as.matrix(survey[c("Yes", "No"), c("Beachcomber", "Windsurfer")])
chi2 <- (obs[1, 1] * obs[2, 2] - obs[1, 2] * obs[2, 1])^2 * sum(obs) /
  (prod(rowSums(obs)) * prod(colSums(obs)))
chi2
## [1] 9.052598