d <- read.table(file="D:/College/Datasets_csv_files/scores_data.csv", header = T, sep=",")
View(d)
names(d)
## [1] "scores" "remark"
levels(d$remark)
## NULL
table(d$remark)
##
## A B
## 10 13
#box plot
boxplot(d$scores~d$remark, las=1, ylab= "scores", xlab="remark", main="scores by remark")
#mean
mean(d$scores[d$remark=="A"])
## [1] 429.8
mean(d$scores[d$remark=="B"])
## [1] 226.2308
#OR other way
with(d,tapply(scores, remark, mean))
## A B
## 429.8000 226.2308
#calculate the absolute difference in means
abs(diff(with(d,tapply(scores, remark, mean))))
## B
## 203.5692
#OR OTHER WAY
test.stat1 <- abs(mean(d$scores[d$remark=="A"])- mean(d$scores[d$remark=="B"]))
test.stat1
## [1] 203.5692
#median
median(d$scores[d$remark=="A"])
## [1] 447.5
median(d$scores[d$remark=="B"])
## [1] 228
#OR ANOTHER WAY
with(d,tapply(scores, remark, median))
## A B
## 447.5 228.0
#calculate the absolute diff in medians
abs(diff(with(d, tapply(scores, remark, median))))
## B
## 219.5
#OR OTHER WAY
test.stat2 <- abs(median(d$scores[d$remark=="A"]) - median(d$scores[d$remark=="B"]))
test.stat2
## [1] 219.5
#let's look at independent 2-sample t-test
t.test(d$scores~d$remark, paired=F, var.eq=F)
##
## Welch Two Sample t-test
##
## data: d$scores by d$remark
## t = 6.0372, df = 20.485, p-value = 6.034e-06
## alternative hypothesis: true difference in means between group A and group B is not equal to 0
## 95 percent confidence interval:
## 133.3385 273.8000
## sample estimates:
## mean in group A mean in group B
## 429.8000 226.2308
#look at wilcoxon aka mann-Whitney U
wilcox.test(d$scores~d$remark,paired=F) #tests Ho:medians are equal
## Warning in wilcox.test.default(x = c(450L, 480L, 445L, 435L, 340L, 400L, :
## cannot compute exact p-value with ties
##
## Wilcoxon rank sum test with continuity correction
##
## data: d$scores by d$remark
## W = 125, p-value = 0.0002234
## alternative hypothesis: true location shift is not equal to 0
#look at kolmogorov-smirnov 2-sample test
ks.test(d$scores[d$remark=="A"],d$scores[d$remark=="B"], paired = F)
## Warning in ks.test(d$scores[d$remark == "A"], d$scores[d$remark == "B"], :
## cannot compute exact p-value with ties
##
## Two-sample Kolmogorov-Smirnov test
##
## data: d$scores[d$remark == "A"] and d$scores[d$remark == "B"]
## D = 0.8, p-value = 0.001442
## alternative hypothesis: two-sided
#############################################################
########### BOOT STRAP#######################################
##############################################################
set.seed(112358) # for reproducibility
n <-length(d$remark)
n
## [1] 23
BS <- 10000 # number of bootstrap samples
variable <- d$scores #variable we will resample from
# now get bootstrap samples ( without loops!)
BootstrapSamples <- matrix(sample(variable,size=n*BS, replace=TRUE), nrow=n,ncol=BS)
dim(BootstrapSamples)
## [1] 23 10000
# now calculate the means (Yc and Ym) for each of the bootstrap samples
#( the ineffficient but transparent way... best to start simple and once working
#well then make code efficient)
#initialize the vector to store the TEST-STATS
Boot.test.stat1 <- rep(0,BS)
Boot.test.stat2 <- rep(0,BS)
#run through a loop, each time calculating the bootstrap test.stat
# NOTE: could make this faster by writing a "function" and then
# using apply to apply it to columns of "BootSamples"
for (i in 1:BS) {
#calculate the boot-test-stat1 and save it
Boot.test.stat1[i] <- abs(mean(BootstrapSamples[1:12,i]) -
mean(BootstrapSamples[13:23,i]))
#calculate the boot-test-stat2 and save it
Boot.test.stat2[i] <- abs(median(BootstrapSamples[1:12,i]) -
median(BootstrapSamples[13:23,i]))
}
#OBSERVED TEST STATS
test.stat1;test.stat2
## [1] 203.5692
## [1] 219.5
#take a look at first 20 bootstrap -TEST Stats for 1 and 2
round(Boot.test.stat1[1:20],1)
## [1] 41.0 9.9 42.7 92.7 17.2 21.9 82.6 67.8 4.5 59.9 9.6 71.9 2.5 58.7 22.2
## [16] 54.9 19.3 52.0 39.6 78.4
round(Boot.test.stat2[1:20],1)
## [1] 90.0 11.0 68.0 48.0 70.0 25.5 85.5 70.0 26.5 116.0 9.0 83.5
## [13] 81.0 7.0 38.0 32.5 49.0 121.5 19.5 140.0
# let's calculate the bootstrap p-value
#Notice how we can ask R for a try/false question ( for first 20)
(Boot.test.stat1 >=test.stat1)[1:20]
## [1] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [13] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
# if we ask for the mean of all those, it treats 0=FALSE, 1=TRUE
#..calculate p-value
mean(Boot.test.stat1 >=test.stat1)
## [1] 4e-04
# let's calculate the p-value for test statistic 2 (abs diff in medians)
mean(Boot.test.stat2 >= test.stat2)
## [1] 0.0068
table(d$remark)
##
## A B
## 10 13
#DENSITY PLOT
plot(density(Boot.test.stat1),
xlab=expression(group("|", bar(Yc)-bar(Ym),"|")))