R Markdown

## T-test for two independent samples
## data in pottery.csv
## Please perform a two sample  independent t.test on the file navaho in ecourseware.  I am also attaching it here.

data <-  read.csv("navajo.csv", header=FALSE)
head(data)
head(data)
X1 <- as.integer(data$V1)
## Warning: NAs introduced by coercion
X2 <- as.integer(data$V2)
## Warning: NAs introduced by coercion
X1 <- X1[!is.na(X1)]
X2 <- X2[!is.na(X2)]

## Checking assumptions for normality- will reject normality
qqnorm(X1)

qqnorm(X2)

shapiro.test(X1)
## 
##  Shapiro-Wilk normality test
## 
## data:  X1
## W = 0.78368, p-value = 0.0191
shapiro.test(X2)
## 
##  Shapiro-Wilk normality test
## 
## data:  X2
## W = 0.88113, p-value = 0.1931
library(e1071)

skewness(X1)
## [1] 0.596518
skewness(X2)
## [1] 0.4433483
kurtosis(X1)
## [1] -1.638805
kurtosis(X2)
## [1] -1.620382
hist(X1)

hist(X2)

mean(X1)
## [1] 25.25
mean(X2)
## [1] 31.25
var(X1)
## [1] 207.3571
var(X2)
## [1] 482.5
## Try transformations
X1log <- log(X1)
X2log <- log(X2)

qqnorm(X1log)

qqnorm(X2log)

shapiro.test(X1log)
## 
##  Shapiro-Wilk normality test
## 
## data:  X1log
## W = 0.83394, p-value = 0.06523
shapiro.test(X2log)
## 
##  Shapiro-Wilk normality test
## 
## data:  X2log
## W = 0.92877, p-value = 0.5049
skewness(X1log)
## [1] 0.4381175
skewness(X2log)
## [1] 0.02925934
hist(X1log)

hist(X2log)

var(X1log)
## [1] 0.2849778
var(X2log)
## [1] 0.5711664
## since p-value is high then we fail to reject that the variances are the same
var.test(X1log,X2log)
## 
##  F test to compare two variances
## 
## data:  X1log and X2log
## F = 0.49894, num df = 7, denom df = 7, p-value = 0.3793
## alternative hypothesis: true ratio of variances is not equal to 1
## 95 percent confidence interval:
##  0.0998897 2.4921600
## sample estimates:
## ratio of variances 
##            0.49894
## to test mean_1 = mean_2. So, we fail to reject that they have the same mean
t.test(X1log,X2log,alternative = "two.sided", var.equal = TRUE)
## 
##  Two Sample t-test
## 
## data:  X1log and X2log
## t = -0.32619, df = 14, p-value = 0.7491
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -0.8083461  0.5949282
## sample estimates:
## mean of x mean of y 
##  3.097521  3.204230
## if variances are not equal then we use
t.test(X1log,X2log,alternative = "two.sided", var.equal = FALSE)
## 
##  Welch Two Sample t-test
## 
## data:  X1log and X2log
## t = -0.32619, df = 12.593, p-value = 0.7496
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -0.8157732  0.6023553
## sample estimates:
## mean of x mean of y 
##  3.097521  3.204230
## Wilcox test- fail to reject null hypothesis that the means are equal
wilcox.test(X1,X2,alternative= "two.sided")
## Warning in wilcox.test.default(X1, X2, alternative = "two.sided"): cannot
## compute exact p-value with ties
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  X1 and X2
## W = 27.5, p-value = 0.674
## alternative hypothesis: true location shift is not equal to 0
## Fischers test- cannot reject the null that the medians are the same
cs <- c(X1,X2)
med_combined <- median(cs)

a <- sum(X1<med_combined)
b <- length(X1)-a

c <- sum(X2<med_combined)
d <- length(X2)-c

x <- matrix(c(a,c,b,d), nrow=2)

fisher.test(x)
## 
##  Fisher's Exact Test for Count Data
## 
## data:  x
## p-value = 1
## alternative hypothesis: true odds ratio is not equal to 1
## 95 percent confidence interval:
##   0.1557766 18.7666075
## sample estimates:
## odds ratio 
##   1.613716
## Permutation test

n <- length(X1)
m <- length(X2)

choose(n+m,n)
## [1] 12870
## too many to do all permutations so choose a random sample of permutations
pvar <- ((n-1)*var(X1) + (m-1)*var(X2))/(n+m-2)

t.obs <- (mean(X1)-mean(X2))/sqrt(pvar*(1/n+1/m))

cs <- c(X1,X2)
S <- sum(cs)
SS <- sum(cs^2)

tdist <- NULL
M <- 10000
for(i in 1:M){
  nx1 <- sample(cs,n,replace=FALSE)
  ybar1 <- mean(nx1)
  ybar2 <- (S-sum(nx1))/m
  SS1 <- sum(nx1^2)
  SS2 <- SS-SS1
  pvart <- ((SS1-n*ybar1^2)+(SS2-m*ybar2^2))/(n+m-2)
  tdist[i] <- (ybar1-ybar2)/(sqrt(pvart*(1/n + 1/m)))
}

pvalue <- sum(tdist>abs(t.obs))/M

hist(tdist,breaks=20)
abline(v=t.obs, col="red")