R Markdown
## T-test for two independent samples
## data in navajo.csv
## Assignment: perform a two-sample independent t-test on the navajo.csv file from eCourseware (also attached).
data <- read.csv("navajo.csv", header=FALSE)
head(data)
## Coerce each column to integer; non-numeric entries become NA (hence the warnings below)
X1 <- as.integer(data$V1)
## Warning: NAs introduced by coercion
X2 <- as.integer(data$V2)
## Warning: NAs introduced by coercion
## Drop the NAs produced by the coercion
X1 <- X1[!is.na(X1)]
X2 <- X2[!is.na(X2)]
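## The coercion NAs suggest the first row of navajo.csv may hold text labels (an assumption, not
## verified here); a minimal alternative sketch, using hypothetical names X1b/X2b, would be:
data2 <- read.csv("navajo.csv", header = TRUE)
X1b <- as.integer(na.omit(data2[[1]]))
X2b <- as.integer(na.omit(data2[[2]]))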
## Check the normality assumption: the Q-Q plots and Shapiro-Wilk tests below reject normality for X1 (p = 0.019) but not for X2 (p = 0.193)
qqnorm(X1)

qqnorm(X2)

shapiro.test(X1)
##
## Shapiro-Wilk normality test
##
## data: X1
## W = 0.78368, p-value = 0.0191
shapiro.test(X2)
##
## Shapiro-Wilk normality test
##
## data: X2
## W = 0.88113, p-value = 0.1931
## e1071 provides skewness() and kurtosis()
library(e1071)
skewness(X1)
## [1] 0.596518
skewness(X2)
## [1] 0.4433483
kurtosis(X1)
## [1] -1.638805
kurtosis(X2)
## [1] -1.620382
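## Note: e1071's kurtosis() reports excess kurtosis (approximately 0 for a normal sample), so the
## negative values above indicate tails lighter/flatter than the normal; a quick sketch to confirm:
kurtosis(rnorm(1e5))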
hist(X1)

hist(X2)

mean(X1)
## [1] 25.25
mean(X2)
## [1] 31.25
var(X1)
## [1] 207.3571
var(X2)
## [1] 482.5
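## A small added sketch (not in the original analysis): side-by-side boxplots make the difference
## in spread between the two raw samples easy to see before trying a transformation.
boxplot(list(X1 = X1, X2 = X2), main = "Raw scale")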
## Try a log transformation, since normality was rejected for X1 on the raw scale
X1log <- log(X1)
X2log <- log(X2)
qqnorm(X1log)

qqnorm(X2log)

shapiro.test(X1log)
##
## Shapiro-Wilk normality test
##
## data: X1log
## W = 0.83394, p-value = 0.06523
shapiro.test(X2log)
##
## Shapiro-Wilk normality test
##
## data: X2log
## W = 0.92877, p-value = 0.5049
skewness(X1log)
## [1] 0.4381175
skewness(X2log)
## [1] 0.02925934
hist(X1log)

hist(X2log)

var(X1log)
## [1] 0.2849778
var(X2log)
## [1] 0.5711664
## F-test for equal variances on the log scale: the p-value (0.379) is large, so we fail to reject that the variances are equal
var.test(X1log,X2log)
##
## F test to compare two variances
##
## data: X1log and X2log
## F = 0.49894, num df = 7, denom df = 7, p-value = 0.3793
## alternative hypothesis: true ratio of variances is not equal to 1
## 95 percent confidence interval:
## 0.0998897 2.4921600
## sample estimates:
## ratio of variances
## 0.49894
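## Cross-check (a minimal sketch, not part of the original output): the F statistic above is just
## the ratio of the sample variances, and the confidence interval follows from the F quantiles.
F.ratio <- var(X1log)/var(X2log)
F.ratio
c(F.ratio/qf(0.975, 7, 7), F.ratio*qf(0.975, 7, 7))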
## Pooled two-sample t-test of H0: mean_1 = mean_2 on the log scale; p = 0.749, so we fail to reject that the means are equal
t.test(X1log,X2log,alternative = "two.sided", var.equal = TRUE)
##
## Two Sample t-test
##
## data: X1log and X2log
## t = -0.32619, df = 14, p-value = 0.7491
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -0.8083461 0.5949282
## sample estimates:
## mean of x mean of y
## 3.097521 3.204230
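## A minimal hand computation of the pooled t statistic reported above (a sketch reusing X1log, X2log)
n1 <- length(X1log); n2 <- length(X2log)
sp2 <- ((n1 - 1)*var(X1log) + (n2 - 1)*var(X2log))/(n1 + n2 - 2)
t.stat <- (mean(X1log) - mean(X2log))/sqrt(sp2*(1/n1 + 1/n2))
t.stat
2*pt(-abs(t.stat), df = n1 + n2 - 2)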
## If the variances were unequal, we would use Welch's t-test instead
t.test(X1log,X2log,alternative = "two.sided", var.equal = FALSE)
##
## Welch Two Sample t-test
##
## data: X1log and X2log
## t = -0.32619, df = 12.593, p-value = 0.7496
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -0.8157732 0.6023553
## sample estimates:
## mean of x mean of y
## 3.097521 3.204230
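## Sketch of the Satterthwaite degrees of freedom reported by the Welch test above (df = 12.593)
v1 <- var(X1log)/length(X1log)
v2 <- var(X2log)/length(X2log)
(v1 + v2)^2/(v1^2/(length(X1log) - 1) + v2^2/(length(X2log) - 1))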
## Wilcoxon rank-sum test on the raw data: p = 0.674, so we fail to reject the null hypothesis of no location shift between the two samples
wilcox.test(X1,X2,alternative= "two.sided")
## Warning in wilcox.test.default(X1, X2, alternative = "two.sided"): cannot
## compute exact p-value with ties
##
## Wilcoxon rank sum test with continuity correction
##
## data: X1 and X2
## W = 27.5, p-value = 0.674
## alternative hypothesis: true location shift is not equal to 0
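## Quick sketch: the W statistic reported above is the rank sum of X1 in the pooled sample minus n(n+1)/2
r <- rank(c(X1, X2))
sum(r[seq_along(X1)]) - length(X1)*(length(X1) + 1)/2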
## Median test via Fisher's exact test: we cannot reject the null that the two groups have the same median
cs <- c(X1,X2)
med_combined <- median(cs)
## Count observations below the combined median in each group
a <- sum(X1 < med_combined)
b <- length(X1) - a
c <- sum(X2 < med_combined)
d <- length(X2) - c
## 2x2 table: rows = group (X1, X2), columns = below / at-or-above the combined median
x <- matrix(c(a, c, b, d), nrow = 2)
fisher.test(x)
##
## Fisher's Exact Test for Count Data
##
## data: x
## p-value = 1
## alternative hypothesis: true odds ratio is not equal to 1
## 95 percent confidence interval:
## 0.1557766 18.7666075
## sample estimates:
## odds ratio
## 1.613716
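## An equivalent construction of the same 2x2 table with table() (a sketch; the group labels are illustrative)
grp <- rep(c("X1", "X2"), c(length(X1), length(X2)))
below <- c(X1, X2) < med_combined
fisher.test(table(grp, below))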
## Permutation test
n <- length(X1)
m <- length(X2)
choose(n+m,n)
## [1] 12870
## 12870 assignments is too many to enumerate here, so use a random sample of permutations instead
## Pooled variance and observed t statistic for the original group assignment
pvar <- ((n-1)*var(X1) + (m-1)*var(X2))/(n+m-2)
t.obs <- (mean(X1)-mean(X2))/sqrt(pvar*(1/n+1/m))
cs <- c(X1,X2)
S <- sum(cs)
SS <- sum(cs^2)
M <- 10000
tdist <- numeric(M)
for(i in 1:M){
  ## Randomly reassign n of the pooled observations to group 1
  nx1 <- sample(cs, n, replace = FALSE)
  ybar1 <- mean(nx1)
  ybar2 <- (S - sum(nx1))/m
  ## Use the fixed totals S and SS to get the pooled variance of the permuted groups
  SS1 <- sum(nx1^2)
  SS2 <- SS - SS1
  pvart <- ((SS1 - n*ybar1^2) + (SS2 - m*ybar2^2))/(n+m-2)
  tdist[i] <- (ybar1 - ybar2)/sqrt(pvart*(1/n + 1/m))
}
## Two-sided Monte Carlo p-value: proportion of permuted |t| values at least as large as |t.obs|
pvalue <- sum(abs(tdist) >= abs(t.obs))/M
hist(tdist,breaks=20)
abline(v=t.obs, col="red")

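## An alternative sketch: permuting the raw difference in means gives an equivalent two-sided
## permutation test, since |t| is a monotone function of |ybar1 - ybar2| once the pooled data are fixed.
diff.obs <- mean(X1) - mean(X2)
diff.dist <- replicate(M, {
  idx <- sample(n + m, n)
  mean(cs[idx]) - mean(cs[-idx])
})
mean(abs(diff.dist) >= abs(diff.obs))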