d <- read.table(file="D:/College/Datasets_csv_files/scores_data.csv", header = T, sep=",")
View(d)
names(d)
## [1] "scores" "remark"
levels(d$remark)
## NULL
table(d$remark)
## 
##  A  B 
## 10 13
#box plot
boxplot(d$scores~d$remark, las=1, ylab= "scores", xlab="remark", main="scores by remark")

#mean

mean(d$scores[d$remark=="A"])
## [1] 429.8
mean(d$scores[d$remark=="B"])
## [1] 226.2308
#OR other way
with(d,tapply(scores, remark, mean))
##        A        B 
## 429.8000 226.2308
#calculate the absolute difference in means
abs(diff(with(d,tapply(scores, remark, mean))))
##        B 
## 203.5692
#OR OTHER WAY 
test.stat1 <- abs(mean(d$scores[d$remark=="A"])- mean(d$scores[d$remark=="B"]))
test.stat1
## [1] 203.5692

#median

median(d$scores[d$remark=="A"])
## [1] 447.5
median(d$scores[d$remark=="B"])
## [1] 228
#OR ANOTHER WAY
with(d,tapply(scores, remark, median))
##     A     B 
## 447.5 228.0
#calculate the absolute diff in medians
abs(diff(with(d, tapply(scores, remark, median))))
##     B 
## 219.5
#OR OTHER WAY
test.stat2 <- abs(median(d$scores[d$remark=="A"]) - median(d$scores[d$remark=="B"]))
test.stat2
## [1] 219.5

Let’s take a look at the 3 “Classic” hyp tests we could consider

each of which comes with their own limitations

#let's look at independent 2-sample t-test
t.test(d$scores~d$remark, paired=F, var.eq=F)
## 
##  Welch Two Sample t-test
## 
## data:  d$scores by d$remark
## t = 6.0372, df = 20.485, p-value = 6.034e-06
## alternative hypothesis: true difference in means between group A and group B is not equal to 0
## 95 percent confidence interval:
##  133.3385 273.8000
## sample estimates:
## mean in group A mean in group B 
##        429.8000        226.2308
#look at wilcoxon aka mann-Whitney U
wilcox.test(d$scores~d$remark,paired=F) #tests Ho:medians are equal
## Warning in wilcox.test.default(x = c(450L, 480L, 445L, 435L, 340L, 400L, :
## cannot compute exact p-value with ties
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  d$scores by d$remark
## W = 125, p-value = 0.0002234
## alternative hypothesis: true location shift is not equal to 0
#look at kolmogorov-smirnov 2-sample test
ks.test(d$scores[d$remark=="A"],d$scores[d$remark=="B"], paired = F)
## Warning in ks.test(d$scores[d$remark == "A"], d$scores[d$remark == "B"], :
## cannot compute exact p-value with ties
## 
##  Two-sample Kolmogorov-Smirnov test
## 
## data:  d$scores[d$remark == "A"] and d$scores[d$remark == "B"]
## D = 0.8, p-value = 0.001442
## alternative hypothesis: two-sided
#############################################################
########### BOOT STRAP#######################################
##############################################################
set.seed(112358) # for reproducibility
n <-length(d$remark)
n
## [1] 23
BS <- 10000   # number of bootstrap samples
variable <- d$scores  #variable we will resample from
# now get bootstrap samples ( without loops!)
BootstrapSamples <- matrix(sample(variable,size=n*BS, replace=TRUE), nrow=n,ncol=BS)
dim(BootstrapSamples)
## [1]    23 10000
# now calculate the means (Yc and Ym) for each of the bootstrap samples
#( the ineffficient but transparent way... best to start simple and once working
#well then make code efficient)
#initialize the vector to store the TEST-STATS
Boot.test.stat1 <- rep(0,BS)
Boot.test.stat2 <- rep(0,BS)
#run through a loop, each time calculating the bootstrap test.stat
# NOTE: could make this faster by writing a "function" and then
# using apply to apply it to columns of "BootSamples"
for (i in 1:BS) {
  #calculate the boot-test-stat1 and save it
  Boot.test.stat1[i] <- abs(mean(BootstrapSamples[1:12,i]) - 
                              mean(BootstrapSamples[13:23,i]))
  #calculate the boot-test-stat2 and save it
  Boot.test.stat2[i] <- abs(median(BootstrapSamples[1:12,i]) - 
                              median(BootstrapSamples[13:23,i]))
}
#OBSERVED TEST STATS
test.stat1;test.stat2
## [1] 203.5692
## [1] 219.5
#take a look at first 20 bootstrap -TEST Stats for 1 and 2
round(Boot.test.stat1[1:20],1)
##  [1] 41.0  9.9 42.7 92.7 17.2 21.9 82.6 67.8  4.5 59.9  9.6 71.9  2.5 58.7 22.2
## [16] 54.9 19.3 52.0 39.6 78.4
round(Boot.test.stat2[1:20],1)
##  [1]  90.0  11.0  68.0  48.0  70.0  25.5  85.5  70.0  26.5 116.0   9.0  83.5
## [13]  81.0   7.0  38.0  32.5  49.0 121.5  19.5 140.0
# let's calculate the bootstrap p-value
#Notice how we can ask R for a try/false question ( for first 20)
(Boot.test.stat1 >=test.stat1)[1:20]
##  [1] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [13] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
# if we ask for the mean of all those, it treats 0=FALSE, 1=TRUE
#..calculate p-value
mean(Boot.test.stat1 >=test.stat1)
## [1] 4e-04
# let's calculate the p-value for test statistic 2 (abs diff in medians)
mean(Boot.test.stat2 >= test.stat2)
## [1] 0.0068
table(d$remark)
## 
##  A  B 
## 10 13
#DENSITY PLOT
plot(density(Boot.test.stat1),
     xlab=expression(group("|", bar(Yc)-bar(Ym),"|")))