Set up of working directory and package installation

setwd("~/Desktop/S's Digital lab/courseware yr 3/0310 bioinfo/Data_for_Stats")
# install.packages(c("beanplot", "car", "fBasics", "gmodels", "plotrix","pastecs", "reshape2", "gplots","ggplot2","corrplot","readr"))

Power analysis with qualitative data

power.prop.test(p1 = .3, p2 = .7, power = .8, sig.level = 0.05)
## 
##      Two-sample comparison of proportions power calculation 
## 
##               n = 23.31288
##              p1 = 0.3
##              p2 = 0.7
##       sig.level = 0.05
##           power = 0.8
##     alternative = two.sided
## 
## NOTE: n is number in *each* group
# Question 1: What power would we have for the above example if we had 100 cats in each group?

power.prop.test(p1 = .3, p2 = .7, sig.level = 0.05, n = 100)
## 
##      Two-sample comparison of proportions power calculation 
## 
##               n = 100
##              p1 = 0.3
##              p2 = 0.7
##       sig.level = 0.05
##           power = 0.9999725
##     alternative = two.sided
## 
## NOTE: n is number in *each* group
# Question 2: If we wanted the significance threshold to be 1% and 80% power, how many cats would we need in each group?

power.prop.test(p1 = .3, p2 = .7, power = .8, sig.level = 0.01)
## 
##      Two-sample comparison of proportions power calculation 
## 
##               n = 35.01146
##              p1 = 0.3
##              p2 = 0.7
##       sig.level = 0.01
##           power = 0.8
##     alternative = two.sided
## 
## NOTE: n is number in *each* group

Working with qualitative data

setwd("C:/Users/User/Desktop/S's Digital lab/courseware yr 3/0310 bioinfo/Data_for_Stats")
cats.data<-read.table("cats.dat", sep="\t",header=T)
head(cats.data)
##         Training Dance
## 1 Food as Reward   Yes
## 2 Food as Reward   Yes
## 3 Food as Reward   Yes
## 4 Food as Reward   Yes
## 5 Food as Reward   Yes
## 6 Food as Reward   Yes
table(cats.data)
##                      Dance
## Training               No Yes
##   Affection as Reward 114  48
##   Food as Reward       10  28
contingency.table <- table(cats.data)
contingency.table100<-prop.table(contingency.table,1)
contingency.table100<-round(contingency.table100*100)
contingency.table100
##                      Dance
## Training              No Yes
##   Affection as Reward 70  30
##   Food as Reward      26  74
plot(contingency.table100, col=c("white","darkgrey"),cex.axis=1)

library(gplots)
## Warning: package 'gplots' was built under R version 4.5.2
## 
## Attaching package: 'gplots'
## The following object is masked from 'package:stats':
## 
##     lowess
balloonplot(t(contingency.table))

chisq <- chisq.test(contingency.table, correct=FALSE)
chisq
## 
##  Pearson's Chi-squared test
## 
## data:  contingency.table
## X-squared = 25.356, df = 1, p-value = 4.767e-07
# Observed counts
chisq$observed
##                      Dance
## Training               No Yes
##   Affection as Reward 114  48
##   Food as Reward       10  28
# Expected counts
round(chisq$expected,2)
##                      Dance
## Training                  No   Yes
##   Affection as Reward 100.44 61.56
##   Food as Reward       23.56 14.44
# Question 3: How do you calculate these expected values yourself?
# Total affection = 114 + 48 = 162, Total food = 10 + 28 = 38, Total No = 114 + 10 = 124, Total Yes = 48 + 28 = 76, Total = 162 + 38 = 200. Affection no = (162 * 124) / 200 = 100.44, Affection yes = (162 * 76) / 200 = 61.56, Food no = (38 * 124) / 200 = 23.56, Food yes = (38 * 76) / 200 = 14.44

# Question 4: How do you calculate the chi-squared value yourself?
# chisq = (114 - 100.44)^2 / 100.44 + (48 - 61.56)^2 / 61.56 +...

library(corrplot)
## Warning: package 'corrplot' was built under R version 4.5.2
## corrplot 0.95 loaded
corrplot(chisq$residuals, is.cor = FALSE)

fisher.test(contingency.table)
## 
##  Fisher's Exact Test for Count Data
## 
## data:  contingency.table
## p-value = 1.312e-06
## alternative hypothesis: true odds ratio is not equal to 1
## 95 percent confidence interval:
##   2.837773 16.429686
## sample estimates:
## odds ratio 
##   6.579265

Working with quantitative data

Power analysis with quantitative data

# we input the data from the literature:
mean1<-92
mean2<-87.4 #(5% less than 92cm)

# we compute the difference
delta<-92-87.4
sd<-7 #standard deviation

# Now we can use these data to perform a power analysis
power.t.test(delta=92-87.4, sd = 7, sig.level = 0.05, power = 0.8)
## 
##      Two-sample t test power calculation 
## 
##               n = 37.33624
##           delta = 4.6
##              sd = 7
##       sig.level = 0.05
##           power = 0.8
##     alternative = two.sided
## 
## NOTE: n is number in *each* group

Assumptions of parametric data

setwd("C:/Users/User/Desktop/S's Digital lab/courseware yr 3/0310 bioinfo/Data_for_Stats")
coyote<-read.csv("coyote.csv",header=TRUE)
head(coyote)
##   length gender
## 1   93.0 female
## 2   97.0 female
## 3   92.0 female
## 4  101.6 female
## 5   93.0 female
## 6   84.5 female
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.5.2
ggplot(coyote, aes(gender, length))+geom_boxplot()+geom_jitter(aes(color=gender))

library(beanplot)
## Warning: package 'beanplot' was built under R version 4.5.2
beanplot(coyote$length~coyote$gender, las=1, ylab="Length (cm)")

par(mfrow=c(1,2))
hist(coyote[coyote$gender=="male",]$length,main="Male", xlab="Length",col="lightblue")
hist(coyote[coyote$gender=="female",]$length,main="Female", xlab="Length",col="pink")

library(pastecs)
## Warning: package 'pastecs' was built under R version 4.5.2
tapply(coyote$length,coyote$gender, stat.desc, basic=F, desc=F, norm=T)
## $female
##   skewness   skew.2SE   kurtosis   kurt.2SE normtest.W normtest.p 
## -0.5290403 -0.7320175  0.5029741  0.3546892  0.9700101  0.3164448 
## 
## $male
##    skewness    skew.2SE    kurtosis    kurt.2SE  normtest.W  normtest.p 
## -0.08453876 -0.11697379 -0.67756124 -0.47780521  0.98445697  0.81898309
# Question 5: The boxplots seemed to show some outliers - can you manipulate the above function to print out the median and mean of each distributions, and consider whether they are very different from each other?

library(car)
## Warning: package 'car' was built under R version 4.5.2
## Loading required package: carData
## Warning: package 'carData' was built under R version 4.5.2
leveneTest(coyote$length, coyote$gender,centre=mean)
## Warning in leveneTest.default(coyote$length, coyote$gender, centre = mean):
## coyote$gender coerced to factor.
## Levene's Test for Homogeneity of Variance (center = median: mean)
##       Df F value Pr(>F)
## group  1  0.1679  0.683
##       84

t-test

# Question 6: For which of the following examples would we use a paired t-test: a) comparing the number of cars parked on each street at two different times of year; b) comparing the lengths of orange and black fish in a lake; c) comparing the dexterity of left and right hands across a group of people; d) comparing the test scores of students in two different assignments?
# paired, independent, paired, paired

# Independent t-test
res <- t.test(coyote$length~coyote$gender, var.equal=T)
res
## 
##  Two Sample t-test
## 
## data:  coyote$length by coyote$gender
## t = -1.6411, df = 84, p-value = 0.1045
## alternative hypothesis: true difference in means between group female and group male is not equal to 0
## 95 percent confidence interval:
##  -5.184747  0.496375
## sample estimates:
## mean in group female   mean in group male 
##             89.71163             92.05581
# printing the p-value
res$p.value
## [1] 0.104514
# get the degrees of freedom
res$parameter
## df 
## 84
# get the t.test statistics
res$statistic
##         t 
## -1.641109
# printing the mean
res$estimate
## mean in group female   mean in group male 
##             89.71163             92.05581
# Question 7: What does the confidence interval tell us for the body lengths of female and male coyotes?
# No significant difference.

Paired t-test

setwd("C:/Users/User/Desktop/S's Digital lab/courseware yr 3/0310 bioinfo/Data_for_Stats")
working.memory<-read.csv("Working memory.csv", header=T)
head(working.memory)
##   Subject Placebo DA.depletion
## 1      M1       9            7
## 2      M2      10            7
## 3      M3      15           10
## 4      M4      18           12
## 5      M5      19           13
## 6      M6      22           15
library(reshape2)
## Warning: package 'reshape2' was built under R version 4.5.2
working_memory_m=melt(working.memory, id.vars="Subject")
ggplot(working_memory_m, aes(variable, value))+geom_boxplot()

stat.desc(working.memory$Placebo, basic=F, desc=F, norm=T)
##   skewness   skew.2SE   kurtosis   kurt.2SE normtest.W normtest.p 
##  0.3338775  0.2877662 -1.0318530 -0.4602800  0.9591353  0.6773687
stat.desc(working.memory$DA.depletion, basic=F, desc=F, norm=T)
##   skewness   skew.2SE   kurtosis   kurt.2SE normtest.W normtest.p 
##  0.4334375  0.3735762 -0.9792473 -0.4368141  0.9427404  0.4180816
t.test(working.memory$Placebo, working.memory$DA.depletion,paired=T)
## 
##  Paired t-test
## 
## data:  working.memory$Placebo and working.memory$DA.depletion
## t = 8.6161, df = 14, p-value = 5.715e-07
## alternative hypothesis: true mean difference is not equal to 0
## 95 percent confidence interval:
##   6.308997 10.491003
## sample estimates:
## mean difference 
##             8.4
# Question 8: Is there a significant difference between the groups if the data is not paired?
t.test(working.memory$Placebo, working.memory$DA.depletion,paired=F)
## 
##  Welch Two Sample t-test
## 
## data:  working.memory$Placebo and working.memory$DA.depletion
## t = 2.1029, df = 25.153, p-value = 0.04564
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##   0.1758379 16.6241621
## sample estimates:
## mean of x mean of y 
##  27.26667  18.86667
# Yes

#  check the distribution of the differences.
working.memory$Difference<- working.memory$Placebo-working.memory$DA.depletion
head(working.memory)
##   Subject Placebo DA.depletion Difference
## 1      M1       9            7          2
## 2      M2      10            7          3
## 3      M3      15           10          5
## 4      M4      18           12          6
## 5      M5      19           13          6
## 6      M6      22           15          7
stat.desc(working.memory$Difference, basic=F, desc=F, norm=T)
##    skewness    skew.2SE    kurtosis    kurt.2SE  normtest.W  normtest.p 
##  0.08857017  0.07633789 -1.04670910 -0.46690688  0.97726706  0.94740752
stripchart(working.memory$Difference,vertical=TRUE, method="jitter",
las=1, ylab="Differences",pch=16, col=c("blue"), cex=2)
diff.mean <- mean(working.memory$Difference)
library(plotrix)
## Warning: package 'plotrix' was built under R version 4.5.2
## 
## Attaching package: 'plotrix'
## The following object is masked from 'package:gplots':
## 
##     plotCI
diff.se <- std.error(working.memory$Difference)
loc.strip=1
lower<-diff.mean-1.96*diff.se
upper<-diff.mean+1.96*diff.se
arrows(x0=loc.strip, y0=lower, x1=loc.strip, y1=upper, length=0.3, angle=90,code=3,lwd=3)
segments(loc.strip-0.15,diff.mean, loc.strip+0.15, diff.mean, col="black", lwd=3)

Analysis of variance

One-way ANOVA

library("readr")
## Warning: package 'readr' was built under R version 4.5.2
breast_cancer_continents <- read_csv("breast_cancer_continents.csv")
## Rows: 173 Columns: 17
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (2): country, continent
## dbl (15): incomeperperson, alcconsumption, armedforcesrate, breastcancer, co...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
means<- round(tapply(breast_cancer_continents$breastcancer,
breast_cancer_continents$continent, mean), digits=2)
# note that I round values to just 2 decimal places
means
##    AF    AS    EE LATAM NORAM    OC    WE 
## 24.02 24.51 49.44 36.70 71.73 45.80 74.80
# Question 9: Can you edit the function to also calculate the variance for each group? Do they look similar?
variance<- round(tapply(breast_cancer_continents$breastcancer,
breast_cancer_continents$continent, var), digits=2)
variance
##      AF      AS      EE   LATAM   NORAM      OC      WE 
##  161.02  105.95   79.05  289.49 1389.36  759.27  302.73
# No

library(gplots) #I load the “gplots” package to plot means
plotmeans(breast_cancer_continents$breastcancer~breast_cancer_continents$continent, digits=2, col="red", mean.labels=T, main="Plot of breast cancer means by continent")
## Warning in arrows(x, li, x, pmax(y - gap, li), col = barcol, lwd = lwd, :
## zero-length arrow is of indeterminate angle and so skipped
## Warning in arrows(x, li, x, pmax(y - gap, li), col = barcol, lwd = lwd, :
## zero-length arrow is of indeterminate angle and so skipped
## Warning in arrows(x, li, x, pmax(y - gap, li), col = barcol, lwd = lwd, :
## zero-length arrow is of indeterminate angle and so skipped
## Warning in arrows(x, li, x, pmax(y - gap, li), col = barcol, lwd = lwd, :
## zero-length arrow is of indeterminate angle and so skipped
## Warning in arrows(x, li, x, pmax(y - gap, li), col = barcol, lwd = lwd, :
## zero-length arrow is of indeterminate angle and so skipped
## Warning in arrows(x, ui, x, pmin(y + gap, ui), col = barcol, lwd = lwd, :
## zero-length arrow is of indeterminate angle and so skipped
## Warning in arrows(x, ui, x, pmin(y + gap, ui), col = barcol, lwd = lwd, :
## zero-length arrow is of indeterminate angle and so skipped
## Warning in arrows(x, ui, x, pmin(y + gap, ui), col = barcol, lwd = lwd, :
## zero-length arrow is of indeterminate angle and so skipped
## Warning in arrows(x, ui, x, pmin(y + gap, ui), col = barcol, lwd = lwd, :
## zero-length arrow is of indeterminate angle and so skipped
## Warning in arrows(x, ui, x, pmin(y + gap, ui), col = barcol, lwd = lwd, :
## zero-length arrow is of indeterminate angle and so skipped

aov_cont<- aov(breast_cancer_continents$breastcancer ~ breast_cancer_continents$continent)
summary(aov_cont) # here I see results for my ANOVA test'
##                                     Df Sum Sq Mean Sq F value Pr(>F)    
## breast_cancer_continents$continent   6  52531    8755   40.28 <2e-16 ***
## Residuals                          166  36083     217                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
tuk<- TukeyHSD(aov_cont)
tuk
##   Tukey multiple comparisons of means
##     95% family-wise confidence level
## 
## Fit: aov(formula = breast_cancer_continents$breastcancer ~ breast_cancer_continents$continent)
## 
## $`breast_cancer_continents$continent`
##                    diff        lwr        upr     p adj
## AS-AF         0.4953571  -8.986848  9.9775626 0.9999987
## EE-AF        25.4248377  14.352007 36.4976680 0.0000000
## LATAM-AF     12.6875000   2.501977 22.8730225 0.0050172
## NORAM-AF     47.7172619  21.638434 73.7960896 0.0000035
## OC-AF        21.7839286   5.151040 38.4168172 0.0025337
## WE-AF        50.7886905  39.528172 62.0492093 0.0000000
## EE-AS        24.9294805  12.956321 36.9026399 0.0000001
## LATAM-AS     12.1921429   1.034462 23.3498237 0.0223712
## NORAM-AS     47.2219048  20.748253 73.6955563 0.0000067
## OC-AS        21.2885714   4.043225 38.5339174 0.0056343
## WE-AS        50.2933333  38.146389 62.4402777 0.0000000
## LATAM-EE    -12.7373377 -25.274849 -0.1998261 0.0437993
## NORAM-EE     22.2924242  -4.791696 49.3765447 0.1822328
## OC-EE        -3.6409091 -21.809489 14.5276712 0.9967979
## WE-EE        25.3638528  11.938369 38.7893364 0.0000015
## NORAM-LATAM  35.0297619   8.296134 61.7633901 0.0025162
## OC-LATAM      9.0964286  -8.545414 26.7382711 0.7208506
## WE-LATAM     38.1011905  25.397612 50.8047690 0.0000000
## OC-NORAM    -25.9333333 -55.725866  3.8591991 0.1332198
## WE-NORAM      3.0714286 -24.089965 30.2328219 0.9998787
## WE-OC        29.0047619  10.721189 47.2883344 0.0000943
plot (tuk)

# Question 10: Would the ANOVA test still be significant if we just included continent groups AS,EE,NORAM,OC and WE?

Two-way ANOVA

trap_experiment <- read.csv("trap_experiment.csv")
#Inspect structure of the data
head(trap_experiment)
##   number_of_moths location type_of_lure
## 1              32      Top     Chemical
## 2              29      Top     Chemical
## 3              16      Top     Chemical
## 4              18      Top     Chemical
## 5              20      Top     Chemical
## 6              37   Middle     Chemical
# visualize the data with a boxplot
ggplot(trap_experiment, aes(x=location,y=number_of_moths, fill = type_of_lure)) + geom_boxplot()

# check for normality using a shapiro test. The shapiro test shows our data is not normally distributed (it is significant).
shapiro.test(trap_experiment$number_of_moths)
## 
##  Shapiro-Wilk normality test
## 
## data:  trap_experiment$number_of_moths
## W = 0.94533, p-value = 0.009448
# check for equality of variance using a lavene test. The levene test shows no difference between groups.
leveneTest(trap_experiment$number_of_moths~trap_experiment$location *
trap_experiment$type_of_lure)
## Levene's Test for Homogeneity of Variance (center = median)
##       Df F value Pr(>F)
## group 11  0.6377 0.7875
##       48
# do a log transformation of our data
no_of_moth_log = log(trap_experiment$number_of_moths)
trap_experiment$no_of_moth_log = no_of_moth_log
shapiro.test(trap_experiment$no_of_moth_log)
## 
##  Shapiro-Wilk normality test
## 
## data:  trap_experiment$no_of_moth_log
## W = 0.94746, p-value = 0.01185
# check for equality of variance using a lavene test. The shapiro test shows our data is not normally distributed (it is significant).
leveneTest(trap_experiment$no_of_moth_log~trap_experiment$location*
trap_experiment$type_of_lure)
## Levene's Test for Homogeneity of Variance (center = median)
##       Df F value Pr(>F)
## group 11  0.5978 0.8211
##       48
# assume it is normally distribution (actually we should do non-parametric ver.)
moth_anova = aov(trap_experiment$no_of_moth_log~trap_experiment$location*
trap_experiment$type_of_lure)
summary(moth_anova)
##                                                       Df Sum Sq Mean Sq F value
## trap_experiment$location                               3  2.966  0.9886   9.828
## trap_experiment$type_of_lure                           2  0.262  0.1308   1.301
## trap_experiment$location:trap_experiment$type_of_lure  6  0.196  0.0326   0.324
## Residuals                                             48  4.828  0.1006        
##                                                         Pr(>F)    
## trap_experiment$location                              3.64e-05 ***
## trap_experiment$type_of_lure                             0.282    
## trap_experiment$location:trap_experiment$type_of_lure    0.921    
## Residuals                                                         
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Question 11: If we didn’t transform the count data, would the test still be significant?
moth_anova = aov(trap_experiment$no_of_moth~trap_experiment$location*
trap_experiment$type_of_lure)
summary(moth_anova)
##                                                       Df Sum Sq Mean Sq F value
## trap_experiment$location                               3  2.966  0.9886   9.828
## trap_experiment$type_of_lure                           2  0.262  0.1308   1.301
## trap_experiment$location:trap_experiment$type_of_lure  6  0.196  0.0326   0.324
## Residuals                                             48  4.828  0.1006        
##                                                         Pr(>F)    
## trap_experiment$location                              3.64e-05 ***
## trap_experiment$type_of_lure                             0.282    
## trap_experiment$location:trap_experiment$type_of_lure    0.921    
## Residuals                                                         
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Yes

Correlation

exam.anxiety<-read.table("Exam Anxiety.dat", sep="\t",header=T)

colours<-c(exam.anxiety$Gender)
colours[colours == "Female"] <- "1"
colours[colours == "Male"] <- "2"
plot(exam.anxiety$Anxiety~exam.anxiety$Revise,col= colours,
pch=16)
legend("topright", title="Gender",inset=.05, c("Female","Male"),
horiz=TRUE, pch=16,col=1:2)

fit.male<-lm(Anxiety~Revise,data=exam.anxiety[exam.anxiety$Gender=="Male",])
fit.female<-lm(Anxiety~Revise,data=exam.anxiety[exam.anxiety$Gender=="Female",])
plot(exam.anxiety$Anxiety~exam.anxiety$Revise,col=colours,
pch=16)
legend("topright", title="Gender",inset=.05, c("Female","Male"),
horiz=TRUE, pch=16,col=1:2)
#New lines
abline((fit.male), col="red")
abline((fit.female), col="black")

# Question 12: If we suspect that an outlier is influencing our Pearson’s correlation statistic, is there another test we can try?