setwd("~/Desktop/S's Digital lab/courseware yr 3/0310 bioinfo/Data_for_Stats")
# install.packages(c("beanplot", "car", "fBasics", "gmodels", "plotrix","pastecs", "reshape2", "gplots","ggplot2","corrplot","readr"))
power.prop.test(p1 = .3, p2 = .7, power = .8, sig.level = 0.05)
##
## Two-sample comparison of proportions power calculation
##
## n = 23.31288
## p1 = 0.3
## p2 = 0.7
## sig.level = 0.05
## power = 0.8
## alternative = two.sided
##
## NOTE: n is number in *each* group
# Question 1: What power would we have for the above example if we had 100 cats in each group?
power.prop.test(p1 = .3, p2 = .7, sig.level = 0.05, n = 100)
##
## Two-sample comparison of proportions power calculation
##
## n = 100
## p1 = 0.3
## p2 = 0.7
## sig.level = 0.05
## power = 0.9999725
## alternative = two.sided
##
## NOTE: n is number in *each* group
# Question 2: If we wanted the significance threshold to be 1% and 80% power, how many cats would we need in each group?
power.prop.test(p1 = .3, p2 = .7, power = .8, sig.level = 0.01)
##
## Two-sample comparison of proportions power calculation
##
## n = 35.01146
## p1 = 0.3
## p2 = 0.7
## sig.level = 0.01
## power = 0.8
## alternative = two.sided
##
## NOTE: n is number in *each* group
setwd("C:/Users/User/Desktop/S's Digital lab/courseware yr 3/0310 bioinfo/Data_for_Stats")
cats.data<-read.table("cats.dat", sep="\t",header=T)
head(cats.data)
## Training Dance
## 1 Food as Reward Yes
## 2 Food as Reward Yes
## 3 Food as Reward Yes
## 4 Food as Reward Yes
## 5 Food as Reward Yes
## 6 Food as Reward Yes
table(cats.data)
## Dance
## Training No Yes
## Affection as Reward 114 48
## Food as Reward 10 28
contingency.table <- table(cats.data)
contingency.table100<-prop.table(contingency.table,1)
contingency.table100<-round(contingency.table100*100)
contingency.table100
## Dance
## Training No Yes
## Affection as Reward 70 30
## Food as Reward 26 74
plot(contingency.table100, col=c("white","darkgrey"),cex.axis=1)
library(gplots)
## Warning: package 'gplots' was built under R version 4.5.2
##
## Attaching package: 'gplots'
## The following object is masked from 'package:stats':
##
## lowess
balloonplot(t(contingency.table))
chisq <- chisq.test(contingency.table, correct=FALSE)
chisq
##
## Pearson's Chi-squared test
##
## data: contingency.table
## X-squared = 25.356, df = 1, p-value = 4.767e-07
# Observed counts
chisq$observed
## Dance
## Training No Yes
## Affection as Reward 114 48
## Food as Reward 10 28
# Expected counts
round(chisq$expected,2)
## Dance
## Training No Yes
## Affection as Reward 100.44 61.56
## Food as Reward 23.56 14.44
# Question 3: How do you calculate these expected values yourself?
# Total affection = 114 + 48 = 162, Total food = 10 + 28 = 38, Total No = 114 + 10 = 124, Total Yes = 48 + 28 = 76, Total = 162 + 38 = 200. Affection no = (162 * 124) / 200 = 100.44, Affection yes = (162 * 76) / 200 = 61.56, Food no = (38 * 124) / 200 = 23.56, Food yes = (38 * 76) / 200 = 14.44
# Question 4: How do you calculate the chi-squared value yourself?
# chisq = (114 - 100.44)^2 / 100.44 + (48 - 61.56)^2 / 61.56 +...
library(corrplot)
## Warning: package 'corrplot' was built under R version 4.5.2
## corrplot 0.95 loaded
corrplot(chisq$residuals, is.cor = FALSE)
fisher.test(contingency.table)
##
## Fisher's Exact Test for Count Data
##
## data: contingency.table
## p-value = 1.312e-06
## alternative hypothesis: true odds ratio is not equal to 1
## 95 percent confidence interval:
## 2.837773 16.429686
## sample estimates:
## odds ratio
## 6.579265
# we input the data from the literature:
mean1<-92
mean2<-87.4 #(5% less than 92cm)
# we compute the difference
delta<-92-87.4
sd<-7 #standard deviation
# Now we can use these data to perform a power analysis
power.t.test(delta=92-87.4, sd = 7, sig.level = 0.05, power = 0.8)
##
## Two-sample t test power calculation
##
## n = 37.33624
## delta = 4.6
## sd = 7
## sig.level = 0.05
## power = 0.8
## alternative = two.sided
##
## NOTE: n is number in *each* group
setwd("C:/Users/User/Desktop/S's Digital lab/courseware yr 3/0310 bioinfo/Data_for_Stats")
coyote<-read.csv("coyote.csv",header=TRUE)
head(coyote)
## length gender
## 1 93.0 female
## 2 97.0 female
## 3 92.0 female
## 4 101.6 female
## 5 93.0 female
## 6 84.5 female
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.5.2
ggplot(coyote, aes(gender, length))+geom_boxplot()+geom_jitter(aes(color=gender))
library(beanplot)
## Warning: package 'beanplot' was built under R version 4.5.2
beanplot(coyote$length~coyote$gender, las=1, ylab="Length (cm)")
par(mfrow=c(1,2))
hist(coyote[coyote$gender=="male",]$length,main="Male", xlab="Length",col="lightblue")
hist(coyote[coyote$gender=="female",]$length,main="Female", xlab="Length",col="pink")
library(pastecs)
## Warning: package 'pastecs' was built under R version 4.5.2
tapply(coyote$length,coyote$gender, stat.desc, basic=F, desc=F, norm=T)
## $female
## skewness skew.2SE kurtosis kurt.2SE normtest.W normtest.p
## -0.5290403 -0.7320175 0.5029741 0.3546892 0.9700101 0.3164448
##
## $male
## skewness skew.2SE kurtosis kurt.2SE normtest.W normtest.p
## -0.08453876 -0.11697379 -0.67756124 -0.47780521 0.98445697 0.81898309
# Question 5: The boxplots seemed to show some outliers - can you manipulate the above function to print out the median and mean of each distributions, and consider whether they are very different from each other?
library(car)
## Warning: package 'car' was built under R version 4.5.2
## Loading required package: carData
## Warning: package 'carData' was built under R version 4.5.2
leveneTest(coyote$length, coyote$gender,centre=mean)
## Warning in leveneTest.default(coyote$length, coyote$gender, centre = mean):
## coyote$gender coerced to factor.
## Levene's Test for Homogeneity of Variance (center = median: mean)
## Df F value Pr(>F)
## group 1 0.1679 0.683
## 84
# Question 6: For which of the following examples would we use a paired t-test: a) comparing the number of cars parked on each street at two different times of year; b) comparing the lengths of orange and black fish in a lake; c) comparing the dexterity of left and right hands across a group of people; d) comparing the test scores of students in two different assignments?
# paired, independent, paired, paired
# Independent t-test
res <- t.test(coyote$length~coyote$gender, var.equal=T)
res
##
## Two Sample t-test
##
## data: coyote$length by coyote$gender
## t = -1.6411, df = 84, p-value = 0.1045
## alternative hypothesis: true difference in means between group female and group male is not equal to 0
## 95 percent confidence interval:
## -5.184747 0.496375
## sample estimates:
## mean in group female mean in group male
## 89.71163 92.05581
# printing the p-value
res$p.value
## [1] 0.104514
# get the degrees of freedom
res$parameter
## df
## 84
# get the t.test statistics
res$statistic
## t
## -1.641109
# printing the mean
res$estimate
## mean in group female mean in group male
## 89.71163 92.05581
# Question 7: What does the confidence interval tell us for the body lengths of female and male coyotes?
# No significant difference.
setwd("C:/Users/User/Desktop/S's Digital lab/courseware yr 3/0310 bioinfo/Data_for_Stats")
working.memory<-read.csv("Working memory.csv", header=T)
head(working.memory)
## Subject Placebo DA.depletion
## 1 M1 9 7
## 2 M2 10 7
## 3 M3 15 10
## 4 M4 18 12
## 5 M5 19 13
## 6 M6 22 15
library(reshape2)
## Warning: package 'reshape2' was built under R version 4.5.2
working_memory_m=melt(working.memory, id.vars="Subject")
ggplot(working_memory_m, aes(variable, value))+geom_boxplot()
stat.desc(working.memory$Placebo, basic=F, desc=F, norm=T)
## skewness skew.2SE kurtosis kurt.2SE normtest.W normtest.p
## 0.3338775 0.2877662 -1.0318530 -0.4602800 0.9591353 0.6773687
stat.desc(working.memory$DA.depletion, basic=F, desc=F, norm=T)
## skewness skew.2SE kurtosis kurt.2SE normtest.W normtest.p
## 0.4334375 0.3735762 -0.9792473 -0.4368141 0.9427404 0.4180816
t.test(working.memory$Placebo, working.memory$DA.depletion,paired=T)
##
## Paired t-test
##
## data: working.memory$Placebo and working.memory$DA.depletion
## t = 8.6161, df = 14, p-value = 5.715e-07
## alternative hypothesis: true mean difference is not equal to 0
## 95 percent confidence interval:
## 6.308997 10.491003
## sample estimates:
## mean difference
## 8.4
# Question 8: Is there a significant difference between the groups if the data is not paired?
t.test(working.memory$Placebo, working.memory$DA.depletion,paired=F)
##
## Welch Two Sample t-test
##
## data: working.memory$Placebo and working.memory$DA.depletion
## t = 2.1029, df = 25.153, p-value = 0.04564
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 0.1758379 16.6241621
## sample estimates:
## mean of x mean of y
## 27.26667 18.86667
# Yes
# check the distribution of the differences.
working.memory$Difference<- working.memory$Placebo-working.memory$DA.depletion
head(working.memory)
## Subject Placebo DA.depletion Difference
## 1 M1 9 7 2
## 2 M2 10 7 3
## 3 M3 15 10 5
## 4 M4 18 12 6
## 5 M5 19 13 6
## 6 M6 22 15 7
stat.desc(working.memory$Difference, basic=F, desc=F, norm=T)
## skewness skew.2SE kurtosis kurt.2SE normtest.W normtest.p
## 0.08857017 0.07633789 -1.04670910 -0.46690688 0.97726706 0.94740752
stripchart(working.memory$Difference,vertical=TRUE, method="jitter",
las=1, ylab="Differences",pch=16, col=c("blue"), cex=2)
diff.mean <- mean(working.memory$Difference)
library(plotrix)
## Warning: package 'plotrix' was built under R version 4.5.2
##
## Attaching package: 'plotrix'
## The following object is masked from 'package:gplots':
##
## plotCI
diff.se <- std.error(working.memory$Difference)
loc.strip=1
lower<-diff.mean-1.96*diff.se
upper<-diff.mean+1.96*diff.se
arrows(x0=loc.strip, y0=lower, x1=loc.strip, y1=upper, length=0.3, angle=90,code=3,lwd=3)
segments(loc.strip-0.15,diff.mean, loc.strip+0.15, diff.mean, col="black", lwd=3)
library("readr")
## Warning: package 'readr' was built under R version 4.5.2
breast_cancer_continents <- read_csv("breast_cancer_continents.csv")
## Rows: 173 Columns: 17
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): country, continent
## dbl (15): incomeperperson, alcconsumption, armedforcesrate, breastcancer, co...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
means<- round(tapply(breast_cancer_continents$breastcancer,
breast_cancer_continents$continent, mean), digits=2)
# note that I round values to just 2 decimal places
means
## AF AS EE LATAM NORAM OC WE
## 24.02 24.51 49.44 36.70 71.73 45.80 74.80
# Question 9: Can you edit the function to also calculate the variance for each group? Do they look similar?
variance<- round(tapply(breast_cancer_continents$breastcancer,
breast_cancer_continents$continent, var), digits=2)
variance
## AF AS EE LATAM NORAM OC WE
## 161.02 105.95 79.05 289.49 1389.36 759.27 302.73
# No
library(gplots) #I load the “gplots” package to plot means
plotmeans(breast_cancer_continents$breastcancer~breast_cancer_continents$continent, digits=2, col="red", mean.labels=T, main="Plot of breast cancer means by continent")
## Warning in arrows(x, li, x, pmax(y - gap, li), col = barcol, lwd = lwd, :
## zero-length arrow is of indeterminate angle and so skipped
## Warning in arrows(x, li, x, pmax(y - gap, li), col = barcol, lwd = lwd, :
## zero-length arrow is of indeterminate angle and so skipped
## Warning in arrows(x, li, x, pmax(y - gap, li), col = barcol, lwd = lwd, :
## zero-length arrow is of indeterminate angle and so skipped
## Warning in arrows(x, li, x, pmax(y - gap, li), col = barcol, lwd = lwd, :
## zero-length arrow is of indeterminate angle and so skipped
## Warning in arrows(x, li, x, pmax(y - gap, li), col = barcol, lwd = lwd, :
## zero-length arrow is of indeterminate angle and so skipped
## Warning in arrows(x, ui, x, pmin(y + gap, ui), col = barcol, lwd = lwd, :
## zero-length arrow is of indeterminate angle and so skipped
## Warning in arrows(x, ui, x, pmin(y + gap, ui), col = barcol, lwd = lwd, :
## zero-length arrow is of indeterminate angle and so skipped
## Warning in arrows(x, ui, x, pmin(y + gap, ui), col = barcol, lwd = lwd, :
## zero-length arrow is of indeterminate angle and so skipped
## Warning in arrows(x, ui, x, pmin(y + gap, ui), col = barcol, lwd = lwd, :
## zero-length arrow is of indeterminate angle and so skipped
## Warning in arrows(x, ui, x, pmin(y + gap, ui), col = barcol, lwd = lwd, :
## zero-length arrow is of indeterminate angle and so skipped
aov_cont<- aov(breast_cancer_continents$breastcancer ~ breast_cancer_continents$continent)
summary(aov_cont) # here I see results for my ANOVA test'
## Df Sum Sq Mean Sq F value Pr(>F)
## breast_cancer_continents$continent 6 52531 8755 40.28 <2e-16 ***
## Residuals 166 36083 217
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
tuk<- TukeyHSD(aov_cont)
tuk
## Tukey multiple comparisons of means
## 95% family-wise confidence level
##
## Fit: aov(formula = breast_cancer_continents$breastcancer ~ breast_cancer_continents$continent)
##
## $`breast_cancer_continents$continent`
## diff lwr upr p adj
## AS-AF 0.4953571 -8.986848 9.9775626 0.9999987
## EE-AF 25.4248377 14.352007 36.4976680 0.0000000
## LATAM-AF 12.6875000 2.501977 22.8730225 0.0050172
## NORAM-AF 47.7172619 21.638434 73.7960896 0.0000035
## OC-AF 21.7839286 5.151040 38.4168172 0.0025337
## WE-AF 50.7886905 39.528172 62.0492093 0.0000000
## EE-AS 24.9294805 12.956321 36.9026399 0.0000001
## LATAM-AS 12.1921429 1.034462 23.3498237 0.0223712
## NORAM-AS 47.2219048 20.748253 73.6955563 0.0000067
## OC-AS 21.2885714 4.043225 38.5339174 0.0056343
## WE-AS 50.2933333 38.146389 62.4402777 0.0000000
## LATAM-EE -12.7373377 -25.274849 -0.1998261 0.0437993
## NORAM-EE 22.2924242 -4.791696 49.3765447 0.1822328
## OC-EE -3.6409091 -21.809489 14.5276712 0.9967979
## WE-EE 25.3638528 11.938369 38.7893364 0.0000015
## NORAM-LATAM 35.0297619 8.296134 61.7633901 0.0025162
## OC-LATAM 9.0964286 -8.545414 26.7382711 0.7208506
## WE-LATAM 38.1011905 25.397612 50.8047690 0.0000000
## OC-NORAM -25.9333333 -55.725866 3.8591991 0.1332198
## WE-NORAM 3.0714286 -24.089965 30.2328219 0.9998787
## WE-OC 29.0047619 10.721189 47.2883344 0.0000943
plot (tuk)
# Question 10: Would the ANOVA test still be significant if we just included continent groups AS,EE,NORAM,OC and WE?
trap_experiment <- read.csv("trap_experiment.csv")
#Inspect structure of the data
head(trap_experiment)
## number_of_moths location type_of_lure
## 1 32 Top Chemical
## 2 29 Top Chemical
## 3 16 Top Chemical
## 4 18 Top Chemical
## 5 20 Top Chemical
## 6 37 Middle Chemical
# visualize the data with a boxplot
ggplot(trap_experiment, aes(x=location,y=number_of_moths, fill = type_of_lure)) + geom_boxplot()
# check for normality using a shapiro test. The shapiro test shows our data is not normally distributed (it is significant).
shapiro.test(trap_experiment$number_of_moths)
##
## Shapiro-Wilk normality test
##
## data: trap_experiment$number_of_moths
## W = 0.94533, p-value = 0.009448
# check for equality of variance using a lavene test. The levene test shows no difference between groups.
leveneTest(trap_experiment$number_of_moths~trap_experiment$location *
trap_experiment$type_of_lure)
## Levene's Test for Homogeneity of Variance (center = median)
## Df F value Pr(>F)
## group 11 0.6377 0.7875
## 48
# do a log transformation of our data
no_of_moth_log = log(trap_experiment$number_of_moths)
trap_experiment$no_of_moth_log = no_of_moth_log
shapiro.test(trap_experiment$no_of_moth_log)
##
## Shapiro-Wilk normality test
##
## data: trap_experiment$no_of_moth_log
## W = 0.94746, p-value = 0.01185
# check for equality of variance using a lavene test. The shapiro test shows our data is not normally distributed (it is significant).
leveneTest(trap_experiment$no_of_moth_log~trap_experiment$location*
trap_experiment$type_of_lure)
## Levene's Test for Homogeneity of Variance (center = median)
## Df F value Pr(>F)
## group 11 0.5978 0.8211
## 48
# assume it is normally distribution (actually we should do non-parametric ver.)
moth_anova = aov(trap_experiment$no_of_moth_log~trap_experiment$location*
trap_experiment$type_of_lure)
summary(moth_anova)
## Df Sum Sq Mean Sq F value
## trap_experiment$location 3 2.966 0.9886 9.828
## trap_experiment$type_of_lure 2 0.262 0.1308 1.301
## trap_experiment$location:trap_experiment$type_of_lure 6 0.196 0.0326 0.324
## Residuals 48 4.828 0.1006
## Pr(>F)
## trap_experiment$location 3.64e-05 ***
## trap_experiment$type_of_lure 0.282
## trap_experiment$location:trap_experiment$type_of_lure 0.921
## Residuals
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Question 11: If we didn’t transform the count data, would the test still be significant?
moth_anova = aov(trap_experiment$no_of_moth~trap_experiment$location*
trap_experiment$type_of_lure)
summary(moth_anova)
## Df Sum Sq Mean Sq F value
## trap_experiment$location 3 2.966 0.9886 9.828
## trap_experiment$type_of_lure 2 0.262 0.1308 1.301
## trap_experiment$location:trap_experiment$type_of_lure 6 0.196 0.0326 0.324
## Residuals 48 4.828 0.1006
## Pr(>F)
## trap_experiment$location 3.64e-05 ***
## trap_experiment$type_of_lure 0.282
## trap_experiment$location:trap_experiment$type_of_lure 0.921
## Residuals
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Yes
exam.anxiety<-read.table("Exam Anxiety.dat", sep="\t",header=T)
colours<-c(exam.anxiety$Gender)
colours[colours == "Female"] <- "1"
colours[colours == "Male"] <- "2"
plot(exam.anxiety$Anxiety~exam.anxiety$Revise,col= colours,
pch=16)
legend("topright", title="Gender",inset=.05, c("Female","Male"),
horiz=TRUE, pch=16,col=1:2)
fit.male<-lm(Anxiety~Revise,data=exam.anxiety[exam.anxiety$Gender=="Male",])
fit.female<-lm(Anxiety~Revise,data=exam.anxiety[exam.anxiety$Gender=="Female",])
plot(exam.anxiety$Anxiety~exam.anxiety$Revise,col=colours,
pch=16)
legend("topright", title="Gender",inset=.05, c("Female","Male"),
horiz=TRUE, pch=16,col=1:2)
#New lines
abline((fit.male), col="red")
abline((fit.female), col="black")
# Question 12: If we suspect that an outlier is influencing our Pearson’s correlation statistic, is there another test we can try?