First, Set up working Directory.
#Get Working Directory (aka what directory are you currently working from)
# getwd()
#Set Working Directory
setwd("C:/Users/ual-laptop/Desktop/ENVS 567_R")
The following are the necessary Packages and libraries used for this homework (some will be included again in the problem’s code)
# install.packages("tidyverse")
# install.packages("ISwR")
# install.packages("readxl")
# install.packages("ggplot2")
# install.packages("reshape2")
# install.packages("performance")
# install.packages("lme4")
# install.packages("see")
# install.packages("lmerTest")
# install.packages("car")
# install.packages("ggeffects")
# install.packages("GGally")
# install.packages("MASS")
# install.packages("relaimpo")
# install.packages("sjPlot")
# install.packages("dplyr")
# install.packages("nycflights13")
# install.packages('plyr', repos = "http://cran.us.r-project.org")
# install.packages("gapminder")
# install.packages("Lahman")
# install.packages("ggord")
# install.packages("patchwork")
# install.packages("qqplotr")
# install.packages("ggplot2movies")
# installed.packages("Rtools")
# install.packages("rgbif")
# # Load libraries
# library(tidyverse)
# library(ISwR)
# library(readxl)
# library(ggplot2)
# library(reshape2)
# library(performance)
# library(lme4)
# library(see)
# library(lmerTest)
# library(car)
# library(ggeffects)
# library(GGally)
# library(MASS)
# library(relaimpo)
# library(sjPlot)
# library(dplyr)
# library(nycflights13)
# library(plyr)
# library(gapminder)
# library(Lahman)
# library(performance)
# library(see)
# library(qqplotr)
# library(ggrepel)
# library(ggplot2movies)
# library(rgbif)
##1.1 - Install the ISwR R package. #Write the built-in dataset thuesen to a tab-separated text file. #View it with a text editor. #Change the NA to . (period), and #read the changed file back into R.
thuesen <- read.table(file = "thuesen.txt", header = FALSE, sep = "\t", na.strings = NA)
print(thuesen)
## V1
## 1 x
## 2 thuesen_tabsep
thuesen1 <- thuesen
thuesen1[is.na(thuesen1)]<-"."
print(thuesen1)
## V1
## 1 x
## 2 thuesen_tabsep
##1.2 - The exponential growth of a population is described by this mathematical function: Nt=No*e^(rt); where Nt is the population size at time t, No is the initial population size, and r is the rate of growth or reproductive rate. #Write an exponential growth function in R that also generates a plot with time on the x axis and Nt on the y axis. #Using that function create plots for 20 days assuming an initial population size of 10 individuals under three growth rate scenarios (0.5, 0.8, -0.1).
exp.growth <- function(No, r, t){
No <- No
r <- r
t <- 0:t
Nt <- No*exp(r*t)
plot(x = t, y = Nt, xlab = "Time (days)", ylab = "Polulation Size", type = "l")}
exp.growth(No = 10, r = 0.5, t = 20)
exp.growth(No = 10, r=0.8, t = 20)
exp.growth(No = 10, r=-0.1, t = 20)
##1.3 - Under highly favorable conditions, populations grow exponentially. However resources will eventually limit population growth and exponential growth cannot continue indefinitely. This phenomenon is described by the logistic growth function: Nt=(KNo)/((No)+(K-No)exp(-rt)); where K is the carrying capacity. #Write a logistic growth function in R that also generates a plot with time on the x axis and Nt on the y axis. Using that function create plots for 20 days assuming an initial population size of 10 individuals and a carrying capacity of 1,000 individuals under three growth rate scenarios (0.5, 0.8, 0.4).
logis.growth <- function(No, r, t, K){
No <- No
r <- r
t <- 0:t
K <- K
Nt.l <- (K*No)/(No+(K-No)*exp(-r*t))
plot(x = t, y = Nt.l, xlab = "Time (days)", ylab = "Polulation Size", main = "Logistic Growth Graph", type = "l")}
logis.growth(No = 10, r = 0.5, t = 20, K=1000)
logis.growth(No = 10, r=0.8, t = 20, K=1000)
logis.growth(No = 10, r=0.4, t = 20, K=1000)
##1.4 - Write a function (sum_n) that for any given value, say n,
computes the sum of the integers from 1 to n (inclusive). #Use the
function to determine the sum of integers from 1 to 5,000.
sum_n <- function(n){
n <- 1:n
x <- n-(n-1)
sum_n_eq <- sum(c(from = 1, to = n, by = 1))}
sum_n(n = 5000)
print(sum_n(n = 5000))
## [1] 12502502
##1.5 - Write a function (sqrt_round) #that takes a number x as input, #takes the square root, rounds it to the nearest whole number and then returns the result.
sqrt_round <- function(x){
x <- x
round(sqrt(x))}
print(sqrt_round(x=6.5))
## [1] 3
##1.6 - Install the R package ‘nycflights13’, and load the ‘weather’ data. #a) Explore the columns names and the top part of the dataset to get a sense of the data #b) Make a subset of the data from just the first month (1) and then save that subset as ‘weather1’ #c) Using ggplot2, make a beautiful histogram of the variable ‘temp’ #d) Using ggplot2, make a beautiful line plot of ‘temp’ as a function of ‘time_hour’ #e) Using ggplot2, make a beautiful boxplot of ‘temp’ as a function of ‘origin’
# install.packages("nycflights13")
library(nycflights13)
data(weather)
head(weather) #a
## # A tibble: 6 x 15
## origin year month day hour temp dewp humid wind_dir wind_speed wind_gust
## <chr> <int> <int> <int> <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 EWR 2013 1 1 1 39.0 26.1 59.4 270 10.4 NA
## 2 EWR 2013 1 1 2 39.0 27.0 61.6 250 8.06 NA
## 3 EWR 2013 1 1 3 39.0 28.0 64.4 240 11.5 NA
## 4 EWR 2013 1 1 4 39.9 28.0 62.2 250 12.7 NA
## 5 EWR 2013 1 1 5 39.0 28.0 64.4 260 12.7 NA
## 6 EWR 2013 1 1 6 37.9 28.0 67.2 240 11.5 NA
## # ... with 4 more variables: precip <dbl>, pressure <dbl>, visib <dbl>,
## # time_hour <dttm>
weather1 <- subset(weather, month == 1) #b
library(ggplot2)
ggplot(data = weather1, aes(x = temp)) + geom_histogram(bins = 30)
ggplot() + geom_histogram(data = weather1, aes(x = temp, fill = ..x..), binwidth = 1) + scale_colour_gradient2() + xlab("Temperature") + ylab("Count")
ggplot(weather1, aes(x=time_hour, y=temp))+geom_line(colour = "red")
ggplot(weather1, aes(x=origin, y=temp))+geom_boxplot()
ggplot(weather1, aes(x=origin, y=temp))+geom_boxplot(color = "blue")
ggplot(weather1, aes(x=origin, y=temp, fill="Group")) + geom_boxplot(alpha=0.6)
##1.7 - Write content in markdown syntax to replicate, as much as possible, the format of the following sample text.
some text using Markdown syntax
Some text in code format:
computing with data
The main data objects in R are:
The R website is: https://www.r-project.org
##2.1 - A researcher videotaped the glides of 8 tree snakes leaping from a 10-m tower. Undulation rates of the snakes measured in hertz (cycles per second) were as follows: 0.9, 1.4, 1.2, 1.2, 1.3, 2.0, 1.4, 1.6. #a) Draw a histogram of the undulation rate #b) Calculate the sample mean #c) Calculate the range #d) Calculate the standard deviation #e) Write a function to express the standard deviation as a percentage of the mean (that is, the coefficient of variation) and calculate it.
snake_rates <- c(0.9, 1.4, 1.2, 1.2, 1.3, 2.0, 1.4, 1.6)
hist(snake_rates)
snake_mean <- mean(snake_rates)
range(snake_rates)
## [1] 0.9 2.0
snake_sd <- sd(snake_rates)
Coef_v <- function(snake_mean, snake_sd){
snake_mean <- snake_mean
snake_sd <- snake_sd
}
(snake_sd/snake_mean)*100 #Coefficient of variation (%) is the ratio of the sd to the mean
## [1] 23.56633
snake_mean #1.375 mean
## [1] 1.375
cv <- function(snake_rates){
paste("mean = ", mean(snake_rates), ", cv = ",round(sd(snake_rates)/mean(snake_rates)*100), "%", sep="")}
cv(snake_rates)
## [1] "mean = 1.375, cv = 24%"
##2.2 - Blood pressure was measured (in units of mm Hg). Here are the measurements: 112, 128, 108, 129, 125, 153, 155, 132, 137. #a) How many individuals are in the sample? #b) What is the mean of this sample? #c) What is the variance? #d) What is the standard deviation? #e) What is the coefficient of variation?
blood_pressure <- c(112, 128, 108, 129, 125, 153, 155, 132, 137) #Blood pressure measurements (mm Hg)
length(blood_pressure) #amount of people in sample
## [1] 9
mean(blood_pressure) #mean of sample
## [1] 131
var(blood_pressure) #variance
## [1] 254.5
sd(blood_pressure) #standard deviation
## [1] 15.95306
paste(round(sd(blood_pressure)/mean(blood_pressure)*100), "%", sep="") #Coefficient of variation is the ratio of sd to the mean 131
## [1] "12%"
cv(blood_pressure)
## [1] "mean = 131, cv = 12%"
##2.3 - The data in the file DesertBirdAbundance.csv are from a survey of the breeding birds of Organ Pipe Cactus National Monument in southern Arizona. #a) Draw a histogram of the abundance data. #b) Calculate the median and the mean of the bird abundance data. #c) In this particular case, which do you think is the best measure of center, the mean or the median? #d) Calculate the range, standard deviation, variance and coefficient of variation of the bird abundance data.
dba_data <- readr::read_csv(file = "DesertBirdAbundance.csv")
## Rows: 43 Columns: 2
## -- Column specification --------------------------------------------------------
## Delimiter: ","
## chr (1): species
## dbl (1): abundance
##
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.
hist(dba_data$abundance, xlab = "Desert Bird Abundance", main = "Desert Bird Abundance Histogram")
mean(dba_data$abundance)
## [1] 74.76744
median(dba_data$abundance) #better measure of center
## [1] 18
range(dba_data$abundance)
## [1] 1 625
sd(dba_data$abundance)
## [1] 121.9473
var(dba_data$abundance)
## [1] 14871.14
paste("mean = ", mean(dba_data$abundance), ", cv = ",round(sd(dba_data$abundance)/mean(dba_data$abundance)*100), "%", sep="") #coefficient of variation of bird abundance data
## [1] "mean = 74.7674418604651, cv = 163%"
cv(dba_data$abundance)
## [1] "mean = 74.7674418604651, cv = 163%"
##2.4 - Calculate the probability of each of the following events: #a) A standard normally distributed variable is larger than 3 #b) A normally distributed variable with mean 35 and standard deviation 6 is larger than 42 #c) Getting 10 out of 10 successes in a binomial distribution with probability 0.8 #d) X > 6.5 in a Chi-squared distribution with 2 degrees of freedom
1-pnorm(3) #since 3 is > pnorm(q, mean = 0, sd = 1)
## [1] 0.001349898
1-pnorm(mean = 35, sd = 6, q = 42) #q= a normally distributed variable larger than "q"
## [1] 0.1216725
dbinom(x=10, size = 10, prob = 0.8) #x=possible outcomes; size=number of trials; prob=the probability of success
## [1] 0.1073742
1-pchisq(q=6.5, df=2) #Chi-sqrd distribution with 2 degrees of freedom x>6.5
## [1] 0.03877421
##2.5 - Demonstrate graphically the central limit theorem (by sampling and calculating the mean) using a Binomial distribution with 10 trials (size=10) and 0.9 probability of success (prob=0.9) and a sample size of 5.
c.l.thm_ten <- numeric(10) # sampling 10
for (i in 1:10) {
c.l.thm_ten[i]<-mean(rbinom(n=5, size=10, prob=0.9)) # calculating mean using binom dist
}
hist(c.l.thm_ten, main = "Histogram of the Central Limit Theorem", xlab = "C.L.T. of sample size 10") # represent in graph
c.l.thm_hunnid<-numeric(100) # sampling 100
for (i in 1:100){
c.l.thm_hunnid[i]<-mean(rbinom(n=5, size=10, prob=0.9))
}
hist(c.l.thm_hunnid)
c.l.thm_thous<-numeric(1000) # sampling 1,000
for (i in 1:1000){
c.l.thm_thous[i]<-mean(rbinom(n=5, size=10, prob=0.9))
}
hist(c.l.thm_thous)
##2.6 - Imagine height is genetically determined by the combined (that is, the sum) effect of several genes (polygenic trait). Assume that each gene has an effect on height as a uniform distribution with min=1 and max=3. Simulate stochastic model of height for 1,000 random people based on 1 gene, 2 genes, and 5 genes. As we increase the number of genes, what is the resulting height distribution?
one_gene <- runif(1000, min = 1, max = 3)
hist(one_gene, main = "One Gene", xlab = "Height", breaks = 16) #Histogram not distributed
two_gene <- runif(1000, min = 1, max = 3)+runif(1000, min = 1, max = 3)
hist(two_gene, main = "Two Gene", xlab = "Height", breaks = 16)
five_gene <- runif(1000, min = 1, max = 3)+runif(1000, min = 1, max = 3)+runif(1000, min = 1, max = 3)+runif(1000, min = 1, max = 3)+runif(1000, min = 1, max = 3)
hist(five_gene, main = "Five Gene", xlab = "Height", breaks = 16) #Becoming more normally distributed and symmetric shaped
##2.7 - Do the same as in the previous problem but now assuming that the combined effect of the genes is multiplicative (not the sum). As we increase the number of genes, what is the resulting height distribution?
one_gene <- runif(1000, min = 1, max = 3)
hist(one_gene, main = "One Gene", xlab = "Height", breaks = 16)
two_gene <- runif(1000, min = 1, max = 3)*runif(1000, min = 1, max = 3)
hist(two_gene, main = "Two Gene", xlab = "Height", breaks = 16)
five_gene <- runif(1000, min = 1, max = 3)*runif(1000, min = 1, max = 3)*runif(1000, min = 1, max = 3)*runif(1000, min = 1, max = 3)*runif(1000, min = 1, max = 3)
hist(five_gene, main = "Five Gene", xlab = "Height", breaks = 16) #becoming more skewed right
##3.1 - Normal human body temperature is 98.6 F. Researchers obtained body-temperature measurements on randomly chosen healthy people: see code #a) Make a histogram of the data #b) Make a normal quantile plot #c) Perform a Shapiro-Wilk test to test for normality #d) Are the data normally distributed? #e) Are these measurements consistent with a population mean of 98.6 F?
rchp <- c(98.4, 98.6, 97.8, 98.8, 97.9, 99.0, 98.2, 98.8, 98.8, 99.0, 98.0, 99.2, 99.5, 99.4, 98.4, 99.1, 98.4, 97.6, 97.4, 97.5, 97.5, 98.8, 98.6, 100.0, 98.4)
hist(rchp)
qqnorm(rchp) #Normal quantile plot
qqnorm(rchp)
qqline(rchp) #run in console to get plot with line
shapiro.test(rchp) #We fail to reject the null hypothesis
##
## Shapiro-Wilk normality test
##
## data: rchp
## W = 0.97216, p-value = 0.7001
t.test(rchp, mu = 98.6) #Measurements are not consistent with the population mean of 98.6
##
## One Sample t-test
##
## data: rchp
## t = -0.56065, df = 24, p-value = 0.5802
## alternative hypothesis: true mean is not equal to 98.6
## 95 percent confidence interval:
## 98.24422 98.80378
## sample estimates:
## mean of x
## 98.524
##3.2 - The brown recluse spider often lives in houses throughout central North America. A diet-preference study gave each of 41 spiders a choice between two crickets, one live and one dead. 31 of the 41 spiders chose the dead cricket over the live one. Does this represent evidence for a diet preference?
#of the 41 brown recluse spiders, 31 spiders chose to eat dead crickets instead of live crickets
barplot(c(31,10), xlab="Brown Recluse Diet (dead vs live cricket)")
binom.test(x=31, n=41, p=0.5) #binom test is assessing the probability @ p=50% of a binom distribution
##
## Exact binomial test
##
## data: 31 and 41
## number of successes = 31, number of trials = 41, p-value = 0.00145
## alternative hypothesis: true probability of success is not equal to 0.5
## 95 percent confidence interval:
## 0.5969538 0.8763675
## sample estimates:
## probability of success
## 0.7560976
#There is evidence for a diet preference according to a significant binomial test with p-value=0.00145
sum(dbinom(x=31, size=41, p=0.5))*2 #*2 as a 2-tail test which is the p-value from the binom.test()
## [1] 0.001019634
##3.3 - Ten epileptic patients participated in a study of a new anticonvulsant drug. During the first 8-week period, half the patients received a placebo and half were given the drug, and the number of seizures were recorded. Following this, the same patients were given the opposite treatment and the number of seizures were recorded. Assuming that the distribution of the difference between the placebo and drug meets the assumption of normality, perform an appropriate test to determine whether there were differences in the number of epileptic seizures with and without the drug. #a) Make a boxplot of the data #b) Test the difference.
epil.study <- data.frame(c(1:10), c(37, 52, 68,4,29,32,19,52,19,12), c(5,23,40,3,38,19,9,24,17,14))
names(epil.study) <- c("Patient_No.", "Placebo (amount of seizures)", "Drug (amount of seizures)")
head(epil.study)
## Patient_No. Placebo (amount of seizures) Drug (amount of seizures)
## 1 1 37 5
## 2 2 52 23
## 3 3 68 40
## 4 4 4 3
## 5 5 29 38
## 6 6 32 19
epil.study.long <- tidyr::pivot_longer(data = epil.study, cols = "Placebo (amount of seizures)":"Drug (amount of seizures)", names_to = "Treatment", values_to = "Seizures") #transforms the 2 columns of trials into 1 column
epil.study.long
## # A tibble: 20 x 3
## Patient_No. Treatment Seizures
## <int> <chr> <dbl>
## 1 1 Placebo (amount of seizures) 37
## 2 1 Drug (amount of seizures) 5
## 3 2 Placebo (amount of seizures) 52
## 4 2 Drug (amount of seizures) 23
## 5 3 Placebo (amount of seizures) 68
## 6 3 Drug (amount of seizures) 40
## 7 4 Placebo (amount of seizures) 4
## 8 4 Drug (amount of seizures) 3
## 9 5 Placebo (amount of seizures) 29
## 10 5 Drug (amount of seizures) 38
## 11 6 Placebo (amount of seizures) 32
## 12 6 Drug (amount of seizures) 19
## 13 7 Placebo (amount of seizures) 19
## 14 7 Drug (amount of seizures) 9
## 15 8 Placebo (amount of seizures) 52
## 16 8 Drug (amount of seizures) 24
## 17 9 Placebo (amount of seizures) 19
## 18 9 Drug (amount of seizures) 17
## 19 10 Placebo (amount of seizures) 12
## 20 10 Drug (amount of seizures) 14
boxplot(epil.study.long$Seizures ~ epil.study.long$Treatment + epil.study.long$Patient_No.)
boxplot(epil.study.long$Seizures ~ epil.study.long$Treatment)
hist(epil.study.long$Seizures) #maybe normal
shapiro.test(epil.study.long$Seizures) #normal
##
## Shapiro-Wilk normality test
##
## data: epil.study.long$Seizures
## W = 0.93723, p-value = 0.2124
t.test(epil.study$"Placebo (amount of seizures)", epil.study$"Drug (amount of seizures)", paired = TRUE)
##
## Paired t-test
##
## data: epil.study$"Placebo (amount of seizures)" and epil.study$"Drug (amount of seizures)"
## t = 2.7661, df = 9, p-value = 0.02189
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 2.404666 23.995334
## sample estimates:
## mean of the differences
## 13.2
#the number of seizures after patients received the drug versus after receiving the placebo is significant given the p-value of 0.02189
##3.4 - A bee biologist is analyzing whether there was an association between bee colony number and type of forest habitat. We expect that there is no habitat preference for bee colony number. Is this true based on this data? #a) Make a barplot of the data #b) Test the hypothesis of no-association
bee <- (c("Oak"=33, "Hickory"=30, "Maple"=29, "Red Cedar"=4, "Poplar"=4))
barplot(bee)
chisq.test(bee)
##
## Chi-squared test for given probabilities
##
## data: bee
## X-squared = 43.1, df = 4, p-value = 9.865e-09
#Accept hypothesis of no association between colony and habitat of the 5 tree types
##3.5 - Perform 10 one-sample t-tests for mu=0 on simulated standard normally distributed data of 25 observations each and get the P-value. Repeat the experiment, but instead simulate samples of 25 observations from a t-distribution with 2 degrees of freedom. Find a way to automate the first experiment to do it a 1,000 times and applying a false discovery rate correction to the P-values (check ?replicate).
replicate(10, t.test(rnorm(25), mu=0)$p.value) #repeating the t.test 10x
## [1] 0.2834226 0.8126056 0.6469057 0.6990219 0.2237562 0.7454398 0.7293084
## [8] 0.2133691 0.2897533 0.3734600
replicate(10, t.test(rt(25, df=2), mu=0)$p.value)
## [1] 0.8300742 0.5970417 0.7391379 0.9025699 0.7132827 0.1968971 0.4098774
## [8] 0.2221683 0.6354194 0.7154488
p.adjust(replicate(1000, t.test(rnorm(25), mu=0)$p.value), method="fdr")
## [1] 0.99411174 0.98727289 0.99411174 0.92270050 0.92270050 0.99411174
## [7] 0.86497521 0.86980590 0.92270050 0.99411174 0.90638528 0.86497521
## [13] 0.99411174 0.99411174 0.99411174 0.86980590 0.98654661 0.99411174
## [19] 0.99411174 0.90638528 0.92270050 0.99411174 0.92270050 0.99411174
## [25] 0.99411174 0.99411174 0.92270050 0.98727289 0.92270050 0.99411174
## [31] 0.92270050 0.92270050 0.99411174 0.99411174 0.86980590 0.99411174
## [37] 0.99411174 0.99411174 0.98041106 0.86497521 0.99411174 0.92270050
## [43] 0.98727289 0.99411174 0.98654661 0.92270050 0.99411174 0.96269220
## [49] 0.99411174 0.99411174 0.99411174 0.99411174 0.99411174 0.99411174
## [55] 0.99411174 0.99411174 0.99411174 0.90638528 0.99411174 0.99411174
## [61] 0.86980590 0.92270050 0.86497521 0.98105932 0.92270050 0.92270050
## [67] 0.92270050 0.99411174 0.92270050 0.99411174 0.99411174 0.92270050
## [73] 0.99411174 0.90638528 0.92270050 0.92151555 0.99411174 0.92270050
## [79] 0.86980590 0.99411174 0.99411174 0.99411174 0.99411174 0.98654661
## [85] 0.99411174 0.99411174 0.91651405 0.99411174 0.99411174 0.99411174
## [91] 0.99411174 0.99411174 0.99411174 0.99411174 0.99411174 0.99411174
## [97] 0.86980590 0.99411174 0.92270050 0.99411174 0.91651405 0.91651405
## [103] 0.99411174 0.92270050 0.98727289 0.99411174 0.86497521 0.99411174
## [109] 0.99411174 0.99411174 0.99411174 0.99411174 0.99411174 0.92270050
## [115] 0.92270050 0.92270050 0.99411174 0.99411174 0.86980590 0.99411174
## [121] 0.99411174 0.99411174 0.92270050 0.99411174 0.99411174 0.99411174
## [127] 0.99411174 0.87852503 0.99411174 0.99411174 0.86497521 0.86497521
## [133] 0.92270050 0.92270050 0.99411174 0.92270050 0.86497521 0.99411174
## [139] 0.91651405 0.99411174 0.99411174 0.92270050 0.99411174 0.98727289
## [145] 0.99411174 0.99411174 0.99411174 0.99411174 0.99411174 0.99411174
## [151] 0.86497521 0.99411174 0.99411174 0.99411174 0.99411174 0.99411174
## [157] 0.99411174 0.99411174 0.91651405 0.99411174 0.99411174 0.92270050
## [163] 0.92151555 0.99411174 0.99411174 0.86497521 0.92270050 0.99411174
## [169] 0.92270050 0.99411174 0.99411174 0.99411174 0.99411174 0.99411174
## [175] 0.86497521 0.92270050 0.99411174 0.99411174 0.91651405 0.92270050
## [181] 0.99411174 0.99411174 0.99411174 0.86497521 0.99411174 0.99411174
## [187] 0.99411174 0.86980590 0.99411174 0.99411174 0.92270050 0.99411174
## [193] 0.98654661 0.99411174 0.92270050 0.99411174 0.90638528 0.99411174
## [199] 0.92270050 0.99411174 0.92270050 0.92270050 0.99411174 0.91651405
## [205] 0.86497521 0.99411174 0.99411174 0.92270050 0.86497521 0.99411174
## [211] 0.92270050 0.98727289 0.99411174 0.99411174 0.86980590 0.99411174
## [217] 0.99411174 0.99411174 0.99411174 0.98654661 0.99411174 0.99411174
## [223] 0.92270050 0.99411174 0.86980590 0.92270050 0.99411174 0.98727289
## [229] 0.92270050 0.92151555 0.92270050 0.99411174 0.99411174 0.86497521
## [235] 0.99411174 0.98727289 0.99411174 0.99411174 0.98654661 0.98654661
## [241] 0.99411174 0.99411174 0.92270050 0.97226266 0.86497521 0.86497521
## [247] 0.99411174 0.99411174 0.99411174 0.99411174 0.98654661 0.90638528
## [253] 0.92270050 0.99411174 0.99411174 0.86980590 0.99411174 0.99411174
## [259] 0.99411174 0.99411174 0.90638528 0.99411174 0.99411174 0.99411174
## [265] 0.86980590 0.99411174 0.91651405 0.90638528 0.99411174 0.99411174
## [271] 0.92270050 0.95226833 0.99411174 0.99411174 0.99411174 0.92270050
## [277] 0.99411174 0.99411174 0.99411174 0.92270050 0.98654661 0.99411174
## [283] 0.99411174 0.92270050 0.94321539 0.99411174 0.92270050 0.86497521
## [289] 0.92270050 0.99411174 0.99411174 0.92270050 0.99411174 0.99411174
## [295] 0.99411174 0.99411174 0.86980590 0.99411174 0.91651405 0.94220154
## [301] 0.99411174 0.92270050 0.92270050 0.95156308 0.99411174 0.99411174
## [307] 0.86497521 0.92270050 0.99411174 0.99411174 0.91651405 0.92270050
## [313] 0.99411174 0.99411174 0.92270050 0.86980590 0.92151555 0.99411174
## [319] 0.99411174 0.99411174 0.98654661 0.99411174 0.99411174 0.86497521
## [325] 0.99411174 0.99411174 0.98654661 0.99411174 0.99411174 0.92151555
## [331] 0.98654661 0.92270050 0.98654661 0.98727289 0.91651405 0.99411174
## [337] 0.99411174 0.99411174 0.99411174 0.90638528 0.99411174 0.86980590
## [343] 0.99411174 0.99411174 0.03750134 0.92270050 0.92270050 0.92270050
## [349] 0.99411174 0.86497521 0.92270050 0.99411174 0.99411174 0.99411174
## [355] 0.92270050 0.98654661 0.99411174 0.99411174 0.99411174 0.98727289
## [361] 0.86980590 0.25998618 0.99411174 0.87852503 0.99411174 0.99411174
## [367] 0.99411174 0.99411174 0.99411174 0.99411174 0.99411174 0.99411174
## [373] 0.99411174 0.90638528 0.91651405 0.98727289 0.99411174 0.92270050
## [379] 0.99411174 0.86497521 0.99411174 0.99411174 0.99411174 0.99411174
## [385] 0.99411174 0.98727289 0.86980590 0.99411174 0.99411174 0.99411174
## [391] 0.86497521 0.92270050 0.99411174 0.99411174 0.86497521 0.92699079
## [397] 0.99411174 0.91651405 0.99411174 0.92270050 0.99411174 0.98654661
## [403] 0.99411174 0.99411174 0.99411174 0.92270050 0.90638528 0.99411174
## [409] 0.99411174 0.99411174 0.99411174 0.92270050 0.86497521 0.99411174
## [415] 0.86980590 0.99411174 0.86497521 0.98654661 0.91651405 0.99411174
## [421] 0.99411174 0.99411174 0.86980590 0.99411174 0.92270050 0.91651405
## [427] 0.92151555 0.98654661 0.99411174 0.99411174 0.45880383 0.92270050
## [433] 0.92270050 0.99411174 0.99411174 0.99411174 0.99411174 0.93551596
## [439] 0.99411174 0.99411174 0.99411174 0.99411174 0.99411174 0.99411174
## [445] 0.99411174 0.92270050 0.98654661 0.99411174 0.99411174 0.92270050
## [451] 0.99411174 0.92270050 0.99411174 0.86980590 0.90638528 0.99411174
## [457] 0.99411174 0.99411174 0.99411174 0.99411174 0.99411174 0.99411174
## [463] 0.91651405 0.99411174 0.98971095 0.99411174 0.99411174 0.99411174
## [469] 0.98727289 0.99411174 0.86980590 0.95996837 0.92270050 0.99411174
## [475] 0.99411174 0.99411174 0.90638528 0.92270050 0.92270050 0.99411174
## [481] 0.98654661 0.99411174 0.99411174 0.99411174 0.86497521 0.99411174
## [487] 0.98654661 0.99411174 0.99411174 0.99411174 0.99411174 0.99411174
## [493] 0.92270050 0.99411174 0.99411174 0.99411174 0.92270050 0.99411174
## [499] 0.92270050 0.87852503 0.99411174 0.99411174 0.99411174 0.99411174
## [505] 0.99411174 0.99411174 0.99411174 0.92270050 0.99411174 0.92491868
## [511] 0.99411174 0.99411174 0.99411174 0.99411174 0.98654661 0.99411174
## [517] 0.99411174 0.90638528 0.86497521 0.86497521 0.99411174 0.99411174
## [523] 0.86980590 0.99411174 0.92270050 0.86497521 0.92270050 0.99411174
## [529] 0.92270050 0.99411174 0.99411174 0.98654661 0.99411174 0.92270050
## [535] 0.99411174 0.99411174 0.99411174 0.99411174 0.99411174 0.98727289
## [541] 0.92270050 0.98654661 0.86980590 0.99411174 0.98654661 0.86497521
## [547] 0.91651405 0.86980590 0.99411174 0.99411174 0.92270050 0.92270050
## [553] 0.99411174 0.99411174 0.92270050 0.99411174 0.99411174 0.92270050
## [559] 0.92270050 0.92270050 0.99411174 0.92270050 0.92270050 0.86980590
## [565] 0.92270050 0.92270050 0.86497521 0.99411174 0.99411174 0.99411174
## [571] 0.99411174 0.99411174 0.86497521 0.99411174 0.99411174 0.98727289
## [577] 0.86497521 0.99411174 0.99411174 0.99411174 0.99411174 0.91651405
## [583] 0.99411174 0.98654661 0.99411174 0.93296569 0.99411174 0.99411174
## [589] 0.92270050 0.99411174 0.99411174 0.99411174 0.86980590 0.86497521
## [595] 0.99411174 0.99411174 0.90638528 0.99411174 0.99411174 0.92270050
## [601] 0.99411174 0.99411174 0.99411174 0.92270050 0.91651405 0.98654661
## [607] 0.99411174 0.92270050 0.99411174 0.99411174 0.92270050 0.86497521
## [613] 0.92270050 0.99411174 0.99411174 0.91651405 0.98654661 0.92270050
## [619] 0.92151555 0.99411174 0.99411174 0.92270050 0.99411174 0.99411174
## [625] 0.86497521 0.92270050 0.99411174 0.99411174 0.92270050 0.99411174
## [631] 0.99411174 0.99411174 0.86497521 0.92270050 0.95226833 0.91289956
## [637] 0.99411174 0.99411174 0.92270050 0.86980590 0.99411174 0.99411174
## [643] 0.99411174 0.99411174 0.92270050 0.99411174 0.99411174 0.86980590
## [649] 0.99411174 0.86980590 0.98654661 0.92270050 0.93972950 0.99411174
## [655] 0.92270050 0.90638528 0.98654661 0.99411174 0.86980590 0.92270050
## [661] 0.99411174 0.99411174 0.92270050 0.92270050 0.99411174 0.86980590
## [667] 0.99411174 0.99411174 0.99411174 0.99411174 0.92270050 0.99411174
## [673] 0.86497521 0.98727289 0.92270050 0.99411174 0.99411174 0.98654661
## [679] 0.99411174 0.90638528 0.99411174 0.99411174 0.92270050 0.99411174
## [685] 0.99411174 0.99411174 0.99411174 0.99411174 0.99411174 0.99411174
## [691] 0.99411174 0.86980590 0.86497521 0.99411174 0.92270050 0.99411174
## [697] 0.86980590 0.99411174 0.98654661 0.86980590 0.92270050 0.99411174
## [703] 0.97112313 0.99411174 0.99411174 0.86497521 0.86980590 0.99411174
## [709] 0.99411174 0.86497521 0.90638528 0.99411174 0.91105273 0.86497521
## [715] 0.99411174 0.86980590 0.92270050 0.99411174 0.99411174 0.99411174
## [721] 0.86980590 0.86497521 0.92270050 0.99411174 0.98654661 0.92270050
## [727] 0.99411174 0.92270050 0.86497521 0.99411174 0.92270050 0.99411174
## [733] 0.99411174 0.92270050 0.99411174 0.92270050 0.86980590 0.99411174
## [739] 0.86497521 0.91651405 0.92270050 0.86497521 0.99411174 0.99411174
## [745] 0.99411174 0.98654661 0.90638528 0.99411174 0.99411174 0.99411174
## [751] 0.92270050 0.99411174 0.92270050 0.99411174 0.99411174 0.90638528
## [757] 0.99411174 0.99411174 0.86980590 0.90638528 0.99411174 0.98654661
## [763] 0.99411174 0.99411174 0.86497521 0.99411174 0.99411174 0.99411174
## [769] 0.98727289 0.99411174 0.99411174 0.92270050 0.92270050 0.99411174
## [775] 0.99411174 0.99411174 0.99411174 0.86980590 0.99411174 0.86980590
## [781] 0.99411174 0.86980590 0.98727289 0.92270050 0.99411174 0.99411174
## [787] 0.99411174 0.99411174 0.99411174 0.99411174 0.99411174 0.99411174
## [793] 0.99411174 0.99411174 0.86980590 0.86497521 0.99411174 0.99411174
## [799] 0.99411174 0.92270050 0.99411174 0.91651405 0.92270050 0.99411174
## [805] 0.99411174 0.92270050 0.86980590 0.99411174 0.86497521 0.98654661
## [811] 0.98654661 0.91651405 0.99411174 0.99411174 0.99411174 0.98727289
## [817] 0.99411174 0.99411174 0.99411174 0.92270050 0.98727289 0.92270050
## [823] 0.94752030 0.99411174 0.99411174 0.98727289 0.99411174 0.99411174
## [829] 0.99411174 0.86980590 0.99411174 0.86497521 0.99411174 0.92270050
## [835] 0.99411174 0.86980590 0.92270050 0.99411174 0.99411174 0.86980590
## [841] 0.86497521 0.99411174 0.86497521 0.99411174 0.99411174 0.99411174
## [847] 0.99411174 0.99411174 0.99411174 0.92270050 0.86980590 0.92270050
## [853] 0.99411174 0.99411174 0.86497521 0.99411174 0.98727289 0.92270050
## [859] 0.99411174 0.99411174 0.93551596 0.99411174 0.99411174 0.92270050
## [865] 0.99411174 0.86980590 0.99411174 0.99411174 0.92270050 0.99411174
## [871] 0.99411174 0.99411174 0.99411174 0.92270050 0.92270050 0.99411174
## [877] 0.99411174 0.99411174 0.99411174 0.92270050 0.99411174 0.99411174
## [883] 0.99411174 0.92270050 0.91651405 0.99411174 0.99411174 0.99411174
## [889] 0.86980590 0.90638528 0.99411174 0.99411174 0.99411174 0.99411174
## [895] 0.99411174 0.92270050 0.99411174 0.99411174 0.86980590 0.99411174
## [901] 0.99411174 0.92270050 0.99411174 0.90638528 0.98727289 0.92151555
## [907] 0.86980590 0.99411174 0.98654661 0.86980590 0.99411174 0.99411174
## [913] 0.98727289 0.99411174 0.99411174 0.99411174 0.99411174 0.99411174
## [919] 0.99411174 0.99411174 0.99411174 0.99411174 0.99411174 0.90638528
## [925] 0.99411174 0.86497521 0.99411174 0.99411174 0.99411174 0.86497521
## [931] 0.92270050 0.86980590 0.90638528 0.92270050 0.86980590 0.99411174
## [937] 0.93551596 0.99411174 0.99411174 0.98654661 0.99411174 0.99411174
## [943] 0.99411174 0.99411174 0.94907325 0.99411174 0.99411174 0.92270050
## [949] 0.92270050 0.98654661 0.99411174 0.99411174 0.99411174 0.99411174
## [955] 0.86497521 0.99411174 0.99411174 0.92270050 0.92151555 0.99411174
## [961] 0.98727289 0.99411174 0.98654661 0.99411174 0.99411174 0.99411174
## [967] 0.99411174 0.92270050 0.92270050 0.98654661 0.99411174 0.97249925
## [973] 0.99411174 0.99411174 0.99411174 0.99411174 0.95996837 0.99411174
## [979] 0.99411174 0.99411174 0.92270050 0.86497521 0.92270050 0.86497521
## [985] 0.86497521 0.99411174 0.99411174 0.92270050 0.99411174 0.99411174
## [991] 0.99411174 0.99411174 0.91651405 0.99411174 0.90638528 0.99411174
## [997] 0.91651405 0.98727289 0.99411174 0.86980590
##4.1 - You are hired by the US Department of Agriculture to develop effective control practices for invasive plants (data_ANOVA.xlsx). In particular, you are asked to investigate pesticide controls for kudzu which is an invasive vine that grows in thick mats that smother underlying plants. Two of the most widely used pesticides for kudzu are glyphosate and triclopyr. To determine the effectiveness of a single application of pesticide, you conduct an experiment in 18 plots that each had 50% kudzu cover. In mid-summer, you applied equal amounts of 2% glyphosate to 6 plots, 2% triclopyr to 6 plots, and water without pesticides to 6 plots. Then you returned in autumn and measured the percent cover of kudzu in the plots. #a) Transform the data from wide to long format (Google how to do it - Hint: function melt from reshape2 package) #b) Make a boxplot of the data #c) Perform an ANOVA #d) What is the variation explained (R2)? #e) Perform a post hoc test
USDA_inv <- readxl::read_xlsx(path = "data_ANOVA.xlsx", col_names = T)
USDA_inv
## # A tibble: 6 x 3
## Glyphosate Triclopyr Control
## <dbl> <dbl> <dbl>
## 1 20 17 34
## 2 17 14 31
## 3 24 12 29
## 4 18 16 32
## 5 16 18 27
## 6 22 13 30
Um <- reshape2::melt(USDA_inv)
## No id variables; using all as measure variables
Um
## variable value
## 1 Glyphosate 20
## 2 Glyphosate 17
## 3 Glyphosate 24
## 4 Glyphosate 18
## 5 Glyphosate 16
## 6 Glyphosate 22
## 7 Triclopyr 17
## 8 Triclopyr 14
## 9 Triclopyr 12
## 10 Triclopyr 16
## 11 Triclopyr 18
## 12 Triclopyr 13
## 13 Control 34
## 14 Control 31
## 15 Control 29
## 16 Control 32
## 17 Control 27
## 18 Control 30
boxplot(value~variable, data=Um, main="Treatment Effect on Kudzu")
anova(lm(value~variable, data = Um))
## Analysis of Variance Table
##
## Response: value
## Df Sum Sq Mean Sq F value Pr(>F)
## variable 2 763 381.5 54.5 1.318e-07 ***
## Residuals 15 105 7.0
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summary(lm(value~variable, data= Um)) #r^2 87.9%
##
## Call:
## lm(formula = value ~ variable, data = Um)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.500 -1.875 0.000 1.875 4.500
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 19.500 1.080 18.053 1.38e-11 ***
## variableTriclopyr -4.500 1.528 -2.946 0.01 *
## variableControl 11.000 1.528 7.201 3.07e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.646 on 15 degrees of freedom
## Multiple R-squared: 0.879, Adjusted R-squared: 0.8629
## F-statistic: 54.5 on 2 and 15 DF, p-value: 1.318e-07
TukeyHSD(aov(value~variable, data= Um)) #Amount of Kudzu from most to least: control, gly, tri
## Tukey multiple comparisons of means
## 95% family-wise confidence level
##
## Fit: aov(formula = value ~ variable, data = Um)
##
## $variable
## diff lwr upr p adj
## Triclopyr-Glyphosate -4.5 -8.467701 -0.5322987 0.0255594
## Control-Glyphosate 11.0 7.032299 14.9677013 0.0000087
## Control-Triclopyr 15.5 11.532299 19.4677013 0.0000001
##4.2 - Data in growth.txt come from a farm-scale trial of animal diets. There are two factors: diet and supplement. Diet is a factor with three levels: barley, oats and wheat. Supplement is a factor with four levels: agrimore, control, supergain and supersupp. The response variable is weight gain after 6 weeks. #a) Inspect the data using boxplots #b) Perform a two-way ANOVA #c) Assess graphically if there exists an interaction between both factors #d) Given what you learnt about the interaction, what would be a better model? #e) What is the variation explained (R2) of this new model? #f) Perform a Tukey HSD test of this new model
farm_diets <- read.table("growth.txt", header = 1)
#a) inspect data w/ boxplots
par(mfrow = c(1,2))
boxplot(gain~diet, data = farm_diets); boxplot(gain~supplement, data = farm_diets)
par(mfrow = c(1,1))
#b) Perform a two-way ANOVA
summary(lm(gain~diet+supplement, data = farm_diets))
##
## Call:
## lm(formula = gain ~ diet + supplement, data = farm_diets)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.30792 -0.85929 -0.07713 0.92052 2.90615
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 26.1230 0.4408 59.258 < 2e-16 ***
## dietoats -3.0928 0.4408 -7.016 1.38e-08 ***
## dietwheat -5.9903 0.4408 -13.589 < 2e-16 ***
## supplementcontrol -2.6967 0.5090 -5.298 4.03e-06 ***
## supplementsupergain -3.3815 0.5090 -6.643 4.72e-08 ***
## supplementsupersupp -0.7274 0.5090 -1.429 0.16
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.247 on 42 degrees of freedom
## Multiple R-squared: 0.8531, Adjusted R-squared: 0.8356
## F-statistic: 48.76 on 5 and 42 DF, p-value: < 2.2e-16
#c) Assess graphically if there exists an interaction between both factors
#d) Given what you learnt about the interaction, what would be a better model?
interaction.plot(farm_diets$diet, farm_diets$supplement, farm_diets$gain)
interaction.plot(farm_diets$supplement, farm_diets$diet, farm_diets$gain)
#e) What is the variation explained (R2) of this new model? #no apparent interaction based on plots
anova(lm(gain~diet+supplement, data=farm_diets))
## Analysis of Variance Table
##
## Response: gain
## Df Sum Sq Mean Sq F value Pr(>F)
## diet 2 287.171 143.586 92.358 4.199e-16 ***
## supplement 3 91.881 30.627 19.700 3.980e-08 ***
## Residuals 42 65.296 1.555
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summary(lm(gain~diet+supplement, data=farm_diets))
##
## Call:
## lm(formula = gain ~ diet + supplement, data = farm_diets)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.30792 -0.85929 -0.07713 0.92052 2.90615
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 26.1230 0.4408 59.258 < 2e-16 ***
## dietoats -3.0928 0.4408 -7.016 1.38e-08 ***
## dietwheat -5.9903 0.4408 -13.589 < 2e-16 ***
## supplementcontrol -2.6967 0.5090 -5.298 4.03e-06 ***
## supplementsupergain -3.3815 0.5090 -6.643 4.72e-08 ***
## supplementsupersupp -0.7274 0.5090 -1.429 0.16
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.247 on 42 degrees of freedom
## Multiple R-squared: 0.8531, Adjusted R-squared: 0.8356
## F-statistic: 48.76 on 5 and 42 DF, p-value: < 2.2e-16
#f) Perform a Tukey HSD test of this new model
TukeyHSD(aov(gain~diet+supplement, data=farm_diets))
## Tukey multiple comparisons of means
## 95% family-wise confidence level
##
## Fit: aov(formula = gain ~ diet + supplement, data = farm_diets)
##
## $diet
## diff lwr upr p adj
## oats-barley -3.092817 -4.163817 -2.021817 0e+00
## wheat-barley -5.990298 -7.061298 -4.919297 0e+00
## wheat-oats -2.897481 -3.968481 -1.826481 2e-07
##
## $supplement
## diff lwr upr p adj
## control-agrimore -2.6967005 -4.0583332 -1.3350677 0.0000234
## supergain-agrimore -3.3814586 -4.7430914 -2.0198259 0.0000003
## supersupp-agrimore -0.7273521 -2.0889849 0.6342806 0.4888738
## supergain-control -0.6847581 -2.0463909 0.6768746 0.5400389
## supersupp-control 1.9693484 0.6077156 3.3309811 0.0020484
## supersupp-supergain 2.6541065 1.2924737 4.0157392 0.0000307
##5.1. In a study of hyena laughter, a researcher investigated whether sound spectral properties of hyenas’ giggles are associated with age. #Age(years): 2,2,2,6,9,10,13,10,14,14,12,7,11,11,14,20 #Frequency (Hz): 840,670,580,470,540,660,510,520,500,480,400,650,460,500,580,500 #a) Inspect the data using a scatterplot #b) Test the linear association between both variables #c) Assume that the data are not normally distributed, test the linear association using a non-parametric correlation coefficient
age<-c(2,2,2,6,9,10,13,10,14,14,12,7,11,11,14,20)
freq<-c(840,670,580,470,540,660,510,520,500,480,400,650,460,500,580,500)
plot(age, freq)
cor.test(age, freq, method="pearson")
##
## Pearson's product-moment correlation
##
## data: age and freq
## t = -2.8194, df = 14, p-value = 0.01365
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.8453293 -0.1511968
## sample estimates:
## cor
## -0.601798
cor.test(freq, age, method="pearson")
##
## Pearson's product-moment correlation
##
## data: freq and age
## t = -2.8194, df = 14, p-value = 0.01365
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.8453293 -0.1511968
## sample estimates:
## cor
## -0.601798
##6.1. Testosterone is known to predict aggressive behavior # and is involved in face shape during puberty. Does face # shape predict aggression? Researchers measured the face # width-to-height ratio of 21 university hockey players with # the average number of penalty minutes awarded per game for # aggressive infractions. The data are available in face.txt. # #a) How steeply does the number of penalty minutes increase # per unit increase in face ratio? Calculate the estimate of # the intercept. Write the result in the form of an equation # for the line. #b) How many degrees of freedom does this # analysis have? #c) What is the t-statistic? #d) What is your # final conclusion about the slope? #e) What is the variation # explained, R2? #f) Make a scatterplot of the data and include # a linear regression line #g) Check the model assumptions
testosterone<-read.table("face.txt", sep="\t", header=T)
testo_model <- lm(Penalty~Face, data=testosterone)
summary(testo_model)
##
## Call:
## lm(formula = Penalty ~ Face, data = testosterone)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.9338 -0.3883 -0.1260 0.3381 1.0711
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -4.505 2.089 -2.157 0.0440 *
## Face 3.189 1.150 2.774 0.0121 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.6129 on 19 degrees of freedom
## Multiple R-squared: 0.2882, Adjusted R-squared: 0.2508
## F-statistic: 7.694 on 1 and 19 DF, p-value: 0.0121
# a) Summary() says that the number of penalty minutes increase per unit increase in face shape is 3.189; the Y-intercept is -4.505; thus, the equation for the line is y = 3.189(x) - 4.505
# b) This analysis has 19 degrees of freedom
# c) The t-statistic is -2.157 on the y-intercept side and 2.774 on the other side of the graph
# d) the slope is a gradual positive slope
# e) What is the variation explained, R2? multiple R-squared = 0.2882 which is in penalty minuets and explained by face shape.
# f-g)
plot(testosterone$Face, testosterone$Penalty) + abline(testo_model, col="red", lwd=3)
## integer(0)
performance::check_model(testo_model, k=2)
##6.2. Respiratory rate (Y) is expected to depend on body mass (X) #by the power law, Y=aX??, where ?? is the scaling exponent. #The data are available in respiratoryrate_bodymass.txt. #a) Make a scatterplot of the raw data #b) Make a scatterplot of the linearized relationship. Which transformation did you use? linear model and log transformed lm #c) Use linear regression to estimate B #d) Carry out a formal test of the null hypothesis of ??=0 #e) What is the variation explained, R2? #f) Check the model assumptions
resp_rate <- read.table("respiratoryrate_bodymass.txt", sep="\t", header=T)
plot(resp_rate$BodyMass, resp_rate$Respiration) #a
plot(resp_rate$BodyMass, resp_rate$Respiration, log = "xy") #b) linear model & log transformed linear model
lm(log(Respiration) ~ log(BodyMass), data = resp_rate) #b
##
## Call:
## lm(formula = log(Respiration) ~ log(BodyMass), data = resp_rate)
##
## Coefficients:
## (Intercept) log(BodyMass)
## 1.3401 0.8574
plot(lm(log(Respiration) ~ log(BodyMass), data = resp_rate)) #b
summary(lm(log(Respiration) ~ log(BodyMass), data = resp_rate)) #c) linear regression to estimate B. B = 0.99
##
## Call:
## lm(formula = log(Respiration) ~ log(BodyMass), data = resp_rate)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.01062 -0.08150 0.00831 0.30669 0.43948
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.3401 1.2011 1.116 0.296938
## log(BodyMass) 0.8574 0.1570 5.461 0.000601 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.4449 on 8 degrees of freedom
## Multiple R-squared: 0.7885, Adjusted R-squared: 0.762
## F-statistic: 29.82 on 1 and 8 DF, p-value: 0.000601
t.test(x=(log(resp_rate$BodyMass)), y= (log(resp_rate$Respiration)), data = resp_rate) #2d) formal test of null hypothesis of B=0
##
## Welch Two Sample t-test
##
## data: (log(resp_rate$BodyMass)) and (log(resp_rate$Respiration))
## t = -0.61922, df = 17.978, p-value = 0.5435
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -1.1293946 0.6152354
## sample estimates:
## mean of x mean of y
## 7.596998 7.854078
# e) variation explained, R2? From summary(), multiple R-squared = 0.7885 or 78.85% of the data can be explained by the graph.
# f) Check the model assumptions
performance::check_model(lm(log(Respiration) ~ log(BodyMass), data = resp_rate))
*********************** #7 - Advanced Regression —
##7.1.Use the ozone.txt data to model the ozone concentration as a linear # function of wind speed, air temperature and the intensity of solar radiation. # Assume that the requirements to perform a linear regression are met. # a) Make a multiple panel bivariate scatterplot # b) Perform a multiple linear regression # c) What is the variation explained, R2? # d) Assess the collinearity of the explanatory variables using the variance # inflation factor # e) Check the model assumptions
ozone <- read.table("ozone.txt", sep="\t", header=T)
pairs(ozone) # a) Make a multiple panel bivariate scatterplot
ozoneconcentration <- lm (ozone ~ wind+temp+rad, data=ozone) # b) multiple linear regression
summary(ozoneconcentration) # c) variation explained, R2? From summary(), multiple R-squared = 0.6062 or 60.62%; Adjusted R-squared = 0.5952 or 59.52%
##
## Call:
## lm(formula = ozone ~ wind + temp + rad, data = ozone)
##
## Residuals:
## Min 1Q Median 3Q Max
## -40.485 -14.210 -3.556 10.124 95.600
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -64.23208 23.04204 -2.788 0.00628 **
## wind -3.33760 0.65384 -5.105 1.45e-06 ***
## temp 1.65121 0.25341 6.516 2.43e-09 ***
## rad 0.05980 0.02318 2.580 0.01124 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 21.17 on 107 degrees of freedom
## Multiple R-squared: 0.6062, Adjusted R-squared: 0.5952
## F-statistic: 54.91 on 3 and 107 DF, p-value: < 2.2e-16
# d) Assess the collinearity of the explanatory variables using the variance inflation factor
croc<-relaimpo::calc.relimp(ozoneconcentration)
plot(croc)
ozoneconcentration.two <- lm (ozone ~ wind+temp, data=ozone)
car::vif(ozoneconcentration.two)
## wind temp
## 1.328293 1.328293
car::vif(ozoneconcentration)
## wind temp rad
## 1.328979 1.431201 1.095241
performance::check_model(ozoneconcentration.two)# e) Check the model assumptions
performance::check_model(ozoneconcentration) # e) Check the model assumptions
##7.2. Use the diminish.txt data (xv is explanatory, yv is response variable) to: # a) Perform a simple linear regression b) Perform a polynomial (second-degree) regression # c) Compare both models with Akaike’s Information Criterion (AIC). Which model is better? # d) Make a scatterplot of the data and include both regression lines
diminish <- read.table("diminish.txt", sep="\t", header=T)
diminish.lm <- lm(yv ~ xv, data=diminish) # a) simple linear regression
summary(diminish.lm)
##
## Call:
## lm(formula = yv ~ xv, data = diminish)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.3584 -2.1206 -0.3218 1.8763 4.1782
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 27.61438 1.17315 23.54 7.66e-14 ***
## xv 0.22683 0.02168 10.46 1.45e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.386 on 16 degrees of freedom
## Multiple R-squared: 0.8725, Adjusted R-squared: 0.8646
## F-statistic: 109.5 on 1 and 16 DF, p-value: 1.455e-08
diminish.poly <- lm(yv ~ xv + I(xv^2), data=diminish) # b) polynomial (second-degree) regression
summary(diminish.poly)
##
## Call:
## lm(formula = yv ~ xv + I(xv^2), data = diminish)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.0490 -1.2890 0.6018 1.1829 2.9610
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 24.6323529 1.6916139 14.561 2.95e-10 ***
## xv 0.4057534 0.0819860 4.949 0.000175 ***
## I(xv^2) -0.0018834 0.0008386 -2.246 0.040203 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.131 on 15 degrees of freedom
## Multiple R-squared: 0.9046, Adjusted R-squared: 0.8919
## F-statistic: 71.12 on 2 and 15 DF, p-value: 2.222e-08
AIC(diminish.lm, diminish.poly) # c) Compare models AIC(); polynomial model is better because it is lower by ~ 3 points
## df AIC
## diminish.lm 3 86.26193
## diminish.poly 4 83.04403
plot(diminish$xv, diminish$yv, xlab="xv", ylab="yv") +
abline(diminish.lm, col="red", lwd=3) +
lines(diminish$xv, predict(diminish.poly), col="blue", lwd=3) # d) scatterplot of data w/ both regression lines
## integer(0)
##7.3. Stork data - logistic regression. The data in stork.txt display the # stress-induced corticosterone levels circulating in the blood of European # white storks and their survival over the subsequent five years of study. # #a) Make a scatterplot of the data #b) Which type of regression model is # suitable for these data? # c) Perform an appropriate regression to predict survival from corticosterone # d) What is the pseudo-R2 of the model? # e) What is the p-value of the model? # f) Include the predicted curve in the scatterplot # g) Check the model assumptions
my_stork <- read.table('stork.txt', header = 1)
plot(my_stork) # a) scatterplot # b) needs a logistic or binomal regression
m_s_glm <- glm(Survival ~ Corticosterone, data = my_stork, family = binomial(logit)) # c) predict survival from corticosterone
summary(m_s_glm)
##
## Call:
## glm(formula = Survival ~ Corticosterone, family = binomial(logit),
## data = my_stork)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.4316 -0.8724 -0.6703 1.2125 1.8211
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 2.70304 1.74725 1.547 0.1219
## Corticosterone -0.07980 0.04368 -1.827 0.0677 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 45.234 on 33 degrees of freedom
## Residual deviance: 41.396 on 32 degrees of freedom
## AIC: 45.396
##
## Number of Fisher Scoring iterations: 4
1-(m_s_glm$deviance/m_s_glm$null.deviance) # d) pseudo-R-squared = 0.084852
## [1] 0.084852
anova(m_s_glm, test='Chisq') # e) p-value of the model = 0.0501
## Analysis of Deviance Table
##
## Model: binomial, link: logit
##
## Response: Survival
##
## Terms added sequentially (first to last)
##
##
## Df Deviance Resid. Df Resid. Dev Pr(>Chi)
## NULL 33 45.234
## Corticosterone 1 3.8382 32 41.396 0.0501 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
ggplot(aes(x=Corticosterone, y=Survival), data=m_s_glm)+
geom_point()+
geom_smooth(method=glm, method.args = list(family = "binomial"), se=T) # f) fancier way to predicted curve
## `geom_smooth()` using formula 'y ~ x'
performance::check_model(m_s_glm) # g) check model
##7.4. The clusters.txt dataset contains the response variable Cancers (the # number of reported cancer cases per year per clinic) and the explanatory # variable Distance (the distance from a nuclear plant to the clinic in km). # a) Make a scatterplot of the data b) Which regression is the more appropriate # for these data? (Don’t take overdispersion into account for now) # c) Given your choice, is the trend significant? # d) What is the pseudo-R2 of the model? # e) What is the p-value of the model? # f) Include the predicted relationship from the model in the scatterplot # g) Do you think there might be some evidence of overdispersion? # h) Perform a new generalized linear model with a distribution that better accounts # for overdispersion # i) Check this last model assumptions
clusters <- read.table('clusters.txt', sep="\t", header = T)
plot(clusters) # a) scatterplot
hist(clusters$Cancers) # b) poisson regression needed for count data
clusters.glm <- glm(Cancers ~ Distance, poisson, clusters)
summary(clusters.glm) # c) negative trend is not significant p=0.0941 is 0.1 sig
##
## Call:
## glm(formula = Cancers ~ Distance, family = poisson, data = clusters)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.5504 -1.3491 -1.1553 0.3877 3.1304
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.186865 0.188728 0.990 0.3221
## Distance -0.006138 0.003667 -1.674 0.0941 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for poisson family taken to be 1)
##
## Null deviance: 149.48 on 93 degrees of freedom
## Residual deviance: 146.64 on 92 degrees of freedom
## AIC: 262.41
##
## Number of Fisher Scoring iterations: 5
1-(clusters.glm$deviance/clusters.glm$null.deviance) # d) PseudoR^2 = 0.019
## [1] 0.01900423
anova(clusters.glm, test='Chisq') # e) p-value of model is 0.0919
## Analysis of Deviance Table
##
## Model: poisson, link: log
##
## Response: Cancers
##
## Terms added sequentially (first to last)
##
##
## Df Deviance Resid. Df Resid. Dev Pr(>Chi)
## NULL 93 149.48
## Distance 1 2.8408 92 146.64 0.0919 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
ggplot(aes(x=Distance, y=Cancers), data=clusters.glm)+
geom_point()+
geom_smooth(method=glm, method.args = list(family = "poisson"), se=T) # f)
## `geom_smooth()` using formula 'y ~ x'
performance::check_overdispersion(clusters.glm) # g) over-dispersion is detected & in the plot, the predicted relationship misses a lot of the data points above 2 and below 1 cancers
## # Overdispersion test
##
## dispersion ratio = 1.555
## Pearson's Chi-Squared = 143.071
## p-value = 0.001
## Overdispersion detected.
clusters.glm.nb <- MASS::glm.nb(Cancers ~ Distance, data = clusters) # h) glm that includes a negative binomial estimation to account for over dispersion
summary(clusters.glm.nb) # i) model assumptions summary says that the dispersion parameter for the nb is 1.3595 family taken to be 1
##
## Call:
## MASS::glm.nb(formula = Cancers ~ Distance, data = clusters, init.theta = 1.359466981,
## link = log)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.3103 -1.1805 -1.0442 0.3065 1.9582
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.182490 0.252434 0.723 0.470
## Distance -0.006041 0.004727 -1.278 0.201
##
## (Dispersion parameter for Negative Binomial(1.3595) family taken to be 1)
##
## Null deviance: 96.647 on 93 degrees of freedom
## Residual deviance: 94.973 on 92 degrees of freedom
## AIC: 253.19
##
## Number of Fisher Scoring iterations: 1
##
##
## Theta: 1.359
## Std. Err.: 0.612
##
## 2 x log-likelihood: -247.191
performance::check_model(clusters.glm.nb)
##7.5. Use the jaws.txt data to: # a) Make a scatterplot of the data (age explanatory, bone response) # b) Perform a non-linear regression assuming an asymptotic exponential relationship eq. 1: y=a(1-e^(-cx)) # c) Perform a non-linear regression assuming a Michaelis-Menten model eq. 2: y=(ax)/(1+bx) # d) Estimate the percentage of variation explained by both models # (comparing them with a null model with only a constant) # e) Compare both models with Akaike’s Information Criterion (AIC). Which model is better? # f) Make a scatterplot of the data and include both regression lines
jaws <- read.table('jaws.txt', sep="\t", header = T)
GGally::ggpairs(jaws) # a) scatterplot (located in the bottom left) was made by ggpairs
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
jaws.nls <- nls(bone~a*(1 - exp(-c*age)), start=list(a=100, c = 0.1), jaws) # b) non linear regression with assumed eq 1
jaws.nls.m <- nls(bone~(a*age)/(1+b*age), start=list(a=100, b = 0.1), jaws) # c) non linear regression with assumed eq 2
# d) to find the estimated % of variation explained, calculate by: (100*(tss-rss)/tss) where tss= total sum of squares and rss= residual sum of squares
summary(nls(bone~a*(1 - exp(-c*age)), start=list(a=100, c = 0.1), jaws)) # eq 1, residual standard error=13.1 on 52 degrees of freedom
##
## Formula: bone ~ a * (1 - exp(-c * age))
##
## Parameters:
## Estimate Std. Error t value Pr(>|t|)
## a 115.58059 2.84365 40.645 < 2e-16 ***
## c 0.11882 0.01233 9.635 3.69e-13 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 13.1 on 52 degrees of freedom
##
## Number of iterations to convergence: 4
## Achieved convergence tolerance: 4.025e-06
rss1 <- 13.1^2*52 # rss = rse^2 * degrees of freedom for eq 1
summary(nls(bone~(a*age)/(1+b*age), start=list(a=100, b = 0.1), jaws)) # eq 2, residual standard error=13.77 on 52 degrees of freedom
##
## Formula: bone ~ (a * age)/(1 + b * age)
##
## Parameters:
## Estimate Std. Error t value Pr(>|t|)
## a 18.72524 2.52585 7.413 1.09e-09 ***
## b 0.13596 0.02339 5.814 3.79e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 13.77 on 52 degrees of freedom
##
## Number of iterations to convergence: 6
## Achieved convergence tolerance: 7.785e-06
rss2 <- 13.77^2*52 # rss = rse^2 * degrees of freedom for eq 2
null<-lm(bone~1, data=jaws) # to find tss, fit a null model
summary(null) # the residual standard error is 33.37 on 53 degrees of freedom
##
## Call:
## lm(formula = bone ~ 1, data = jaws)
##
## Residuals:
## Min 1Q Median 3Q Max
## -93.979 -8.844 9.872 21.775 48.021
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 93.979 4.541 20.7 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 33.37 on 53 degrees of freedom
tss <- 33.37^2*53 # tss = null rse^2 * degrees of freedom
100*(tss-rss1)/tss # d) calculate the estimated % of variation explained for eq 1 = 84.8798
## [1] 84.8798
100*(tss-rss2)/tss # d) calculate the estimated % of variation explained for eq 2 = 83.2936
## [1] 83.2936
AIC(nls(bone~a*(1 - exp(-c*age)), start=list(a=100, c = 0.1), jaws)) # e) AIC for eq 1 = 435.0823 (eq 1 is better because the 435 value is smaller than 440 value)
## [1] 435.0823
AIC(nls(bone~(a*age)/(1+b*age), start=list(a=100, b = 0.1), jaws)) # e) AIC for eq 2 = 440.4066
## [1] 440.4066
plot(jaws$age, jaws$bone) # f) scatterplot of data
plot(jaws$age, jaws$bone) +
lines(seq(0, 50, 0.1), predict(jaws.nls, list(age=seq(0, 50, 0.1)),
type="response"), col="red", lwd=3) # f) scatterplot regression line 1
## integer(0)
plot(jaws$age, jaws$bone)+
lines(seq(0, 50, 0.1), predict(jaws.nls.m, list(age=seq(0, 50, 0.1)),
type="response"), col="tan", lwd=3) # f) scatterplot regression line 2
## integer(0)
plot(jaws$age, jaws$bone)+lines(seq(0, 50, 0.1), predict(jaws.nls, list(age=seq(0, 50, 0.1)), type="response"), col="red", lwd=3)+
lines(seq(0, 50, 0.1), predict(jaws.nls.m, list(age=seq(0, 50, 0.1)),
type="response"), col="tan", lwd=3) # plot with both lines
## integer(0)
##7.6. In a recent paper we read: “Linear mixed effects modelling fit by restricted maximum likelihood was used to explain the variations in growth. The linear mixed effects model was generated using the lmer function in the R package lme4, with turbidity, temperature, tide, and wave action set as fixed factors and site and date set as random effects”. Write down the R code to recreate their model.
# using rats data
rats<-read.table("rats.txt", sep="\t", header=T)
rats$Treatment<-factor(rats$Treatment) #transform Treatment type into a factor
rats$Rat<-factor(rats$Rat) #transform which rat under the knife into a factor
rats$Liver<-factor(rats$Liver) #transform liver part into a factor
rats$Rat_wTreatment<-rats$Treatment:rats$Rat # the treatments varied, so the unique factor is created
rats$Liver_wRat<-rats$Treatment:rats$Rat:rats$Liver # 2 rats had their liver cut in threes, so the random effect is a unique factors of each liver piece
rats.lme<-lme4::lmer(Glycogen~Treatment+(1|Rat_wTreatment)+(1|Liver_wRat), data=rats)
summary(rats.lme)
## Linear mixed model fit by REML ['lmerMod']
## Formula: Glycogen ~ Treatment + (1 | Rat_wTreatment) + (1 | Liver_wRat)
## Data: rats
##
## REML criterion at convergence: 219.6
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -1.48212 -0.47263 0.03062 0.42934 1.82935
##
## Random effects:
## Groups Name Variance Std.Dev.
## Liver_wRat (Intercept) 14.17 3.764
## Rat_wTreatment (Intercept) 36.06 6.005
## Residual 21.17 4.601
## Number of obs: 36, groups: Liver_wRat, 18; Rat_wTreatment, 6
##
## Fixed effects:
## Estimate Std. Error t value
## (Intercept) 140.500 4.707 29.848
## Treatment2 10.500 6.657 1.577
## Treatment3 -5.333 6.657 -0.801
##
## Correlation of Fixed Effects:
## (Intr) Trtmn2
## Treatment2 -0.707
## Treatment3 -0.707 0.500
anova(rats.lme)
## Analysis of Variance Table
## npar Sum Sq Mean Sq F value
## Treatment 2 123.99 61.996 2.929
##7.7. Researchers at the University of Arizona want to assess the germination rate of saguaros using a factorial design, with 3 levels of soil type (remnant, cultivated and restored) and 2 levels of sterilization (yes or no). The same experimental design was deployed in 4 different greenhouses. Each of the unique treatments was replicated in 5 pots. 6 seeds were planted in each pot. a) How many fixed treatments (unique combinations) exist? b) What is the total number of pots? c) What is the total number of plants measured? d) Write the R code that you would use to analyze these data
3*2 # a) with 3 levels of soil type and 2 levels of sterilization = 6 unique combinations
## [1] 6
6*5*4 # b) each of the 6 treatment combos had 5 pots and 4 greenhouses = 120 pots
## [1] 120
6*120 # c) 6 unique combinations in 120 pots = 720 plants
## [1] 720
# glmer(germination ~ soil * sterilization + (1|Greenhouse) + (1|Greenhouse:Pot), family=binomial, data = problem) # d) R code to analyze data (i.e., AB notes: Notice that there are 2 fixed treatments (with interaction) and 2 nested random effects. The model should be a logistic GLMM because germination is a proportion (between 0 and 1).)
##7.8. Check these hypothetical data from 4 subjects relating number of previous performances to negative affect. [see graph]
##7.9. Load dragons.RData # a) Perform a simple linear regression with testScore as response and bodyLength as explanatory # b) Plot the data with ggplot2 and add a linear regression line with confidence # intervals. Hint: geom_smooth() # c) We collected multiple samples from 8 mountain ranges. Generate a # boxplot using (or not) ggplot2 to explore this new explanatory variable # d) Now repeat the scatterplot in b) but coloring by mountain range and without # linear regression line # e) Instead of coloring, use facet_wrap() to separate by mountain range # f) Perform a new linear model adding mountain range and assuming that it is fixed effects # g) Perform the same linear model as before but now assuming mountain range is random effects # h) How much of the variation in test scores is explained by the random effect (mountain range) # i) Estimate the R2 (conditional and marginal) of the mixed-effects model # j) Check this last model assumptions # k) Include the fitted lines for each mountain range (plot from e). [Hint: predict function within geom_line layer]
load("dragons.RData")
drag <- lm(testScore ~ bodyLength, data = dragons)# a) simple linear regression; response=testScore, explanatory=bodyLength
summary(drag)
##
## Call:
## lm(formula = testScore ~ bodyLength, data = dragons)
##
## Residuals:
## Min 1Q Median 3Q Max
## -56.962 -16.411 -0.783 15.193 55.200
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -61.31783 12.06694 -5.081 5.38e-07 ***
## bodyLength 0.55487 0.05975 9.287 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 21.2 on 478 degrees of freedom
## Multiple R-squared: 0.1529, Adjusted R-squared: 0.1511
## F-statistic: 86.25 on 1 and 478 DF, p-value: < 2.2e-16
ggplot(aes(y=testScore, x=bodyLength), data=dragons)+geom_point()+
geom_smooth(method="lm", se=T)+ theme_bw() # b) plot with linear regression line & confidence interval
## `geom_smooth()` using formula 'y ~ x'
ggplot(data = dragons, mapping = aes(y = testScore, x = mountainRange, fill = mountainRange))+
geom_boxplot()+labs(title = "Dragon Test Score and Mountain Range")+
theme_bw()+theme(panel.grid = element_blank(),legend.position = "none") # c) boxplot to explore new explanatory variable
ggplot(aes(y=testScore, x=bodyLength, color = mountainRange), data=dragons)+
geom_point()+theme_bw()+labs(title = "Dragon Test Score and Body Length by Mountain Range",
color = "Mountain Range")+theme(panel.grid = element_blank(),
legend.position = "bottom") # d) scatterplot in b with the samples from each mountain range colored
ggplot(aes(y=testScore, x=bodyLength, color = mountainRange), data=dragons)+
geom_point()+theme_bw()+labs(title = "Dragon Test Score and Body Length by Mountain Range",
color = "Mountain Range")+theme(panel.grid = element_blank(),
legend.position = "none")+facet_wrap(.~mountainRange) # e) scatterplots by each mountainrange unsing facet_wrap()
mr <- lm(testScore ~ bodyLength + mountainRange, data = dragons)
summary(mr) # f) perform new linear model assuming moountain ranges as a fixed effect
##
## Call:
## lm(formula = testScore ~ bodyLength + mountainRange, data = dragons)
##
## Residuals:
## Min 1Q Median 3Q Max
## -52.263 -9.926 0.361 9.994 44.488
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 20.83051 14.47218 1.439 0.15072
## bodyLength 0.01267 0.07974 0.159 0.87379
## mountainRangeCentral 36.58277 3.59929 10.164 < 2e-16 ***
## mountainRangeEmmental 16.20923 3.69665 4.385 1.43e-05 ***
## mountainRangeJulian 45.11469 4.19012 10.767 < 2e-16 ***
## mountainRangeLigurian 17.74779 3.67363 4.831 1.84e-06 ***
## mountainRangeMaritime 49.88133 3.13924 15.890 < 2e-16 ***
## mountainRangeSarntal 41.97841 3.19717 13.130 < 2e-16 ***
## mountainRangeSouthern 8.51961 2.73128 3.119 0.00192 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 14.96 on 471 degrees of freedom
## Multiple R-squared: 0.5843, Adjusted R-squared: 0.5773
## F-statistic: 82.76 on 8 and 471 DF, p-value: < 2.2e-16
mr2 <- lme4::lmer(testScore ~ bodyLength + (1|mountainRange), data = dragons)
summary(mr2) # g) perform new linear model assuming moountain ranges as a random effect
## Linear mixed model fit by REML ['lmerMod']
## Formula: testScore ~ bodyLength + (1 | mountainRange)
## Data: dragons
##
## REML criterion at convergence: 3991.2
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -3.4815 -0.6513 0.0066 0.6685 2.9583
##
## Random effects:
## Groups Name Variance Std.Dev.
## mountainRange (Intercept) 339.7 18.43
## Residual 223.8 14.96
## Number of obs: 480, groups: mountainRange, 8
##
## Fixed effects:
## Estimate Std. Error t value
## (Intercept) 43.70938 17.13489 2.551
## bodyLength 0.03316 0.07865 0.422
##
## Correlation of Fixed Effects:
## (Intr)
## bodyLength -0.924
# h) How much of the variation in test scores is explained by the random effect (mountain range)
100*(339.7/(339.7+223.8)) # 100*((random effects mountain range intercept variance)/((random effects mountain range intercept variance)+(random effects residual intercept variance))) = 60.28% of variation is explained with mountainRange
## [1] 60.28394
performance::r2(mr2) # i) the mixed-effects model R2 (conditional)= 0.603, R2 (marginal)= 5.128e-4
## # R2 for Mixed Models
##
## Conditional R2: 0.603
## Marginal R2: 0.001
performance::check_model(mr2) # j) Check this last model assumptions
predict <- data.frame(dragons, "predicted"=c(predict(mr2))) # set up the predicted function data frame
ggplot(aes(y=testScore, x=bodyLength, color = mountainRange), data=dragons)+
geom_point()+
geom_line(data = predict, aes(x=bodyLength, y=predicted), color = "black")+
theme_bw()+
labs(title = "Dragon Test Score and Body Length by Mountain Range",
color = "Mountain Range")+
theme(panel.grid = element_blank(),
legend.position = "none")+
facet_wrap(.~mountainRange) # k) fitted lines for each mountain range
##7.10. Data in Estuaries.csv correspond to counts of invertebrates at 3-4 sites in each of 7 (randomly chosen) estuaries. a) Fit a linear mixed model with Total as response and Modification as explanatory, controlling for Estuary b) Estimate the R2 (conditional and marginal) of this model c) Plot the data with ggplot2 in a way that helps you understand the different effects d) Include the variable Site as a random effect. Do you think this corresponds to a crossed or a nested design? # e) What are the R2 (conditional and marginal) of the model including Site # f) Check the model assumptions g) Plot the data trying to include Site # h) Transform the variable Hydroid to presence/absence data # i) Fit a generalized linear mixed model (GLMM) with this transformed variable as # Response and the same fixed and random effects as in d). [Hint: function glmer] # j) Check the model assumptions
estuaries <- read.csv("Estuaries.csv", header = T)
estu.lmer<-lme4::lmer(Total ~ Modification + (1|Estuary), data=estuaries) # a) linear mixed model, response=Total, explanatory=Modification, controlling for Estuary
summary(estu.lmer)
## Linear mixed model fit by REML ['lmerMod']
## Formula: Total ~ Modification + (1 | Estuary)
## Data: estuaries
##
## REML criterion at convergence: 394.7
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -2.3859 -0.7142 0.2766 0.5240 2.0456
##
## Random effects:
## Groups Name Variance Std.Dev.
## Estuary (Intercept) 55.12 7.424
## Residual 86.07 9.277
## Number of obs: 54, groups: Estuary, 7
##
## Fixed effects:
## Estimate Std. Error t value
## (Intercept) 40.973 4.727 8.668
## ModificationPristine -14.473 6.230 -2.323
##
## Correlation of Fixed Effects:
## (Intr)
## MdfctnPrstn -0.759
performance::r2(estu.lmer) # b) Conditional R2: 0.553, Marginal R2: 0.267
## # R2 for Mixed Models
##
## Conditional R2: 0.553
## Marginal R2: 0.267
ggplot2::ggplot(estuaries, aes(x = Modification, y = Total))+geom_boxplot()+geom_jitter()+
facet_wrap(~Estuary) # c) plot understanding the different effects
estu.lmer.site<-lme4::lmer(Total ~ Modification + (1|Estuary/Site),
data=estuaries) # d) variable site as random effect; corresponds as nested design
summary(estu.lmer.site)
## Linear mixed model fit by REML ['lmerMod']
## Formula: Total ~ Modification + (1 | Estuary/Site)
## Data: estuaries
##
## REML criterion at convergence: 386.6
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -1.8686 -0.6687 0.1504 0.6505 1.9816
##
## Random effects:
## Groups Name Variance Std.Dev.
## Site:Estuary (Intercept) 49.85 7.061
## Estuary (Intercept) 47.59 6.899
## Residual 43.65 6.607
## Number of obs: 54, groups: Site:Estuary, 27; Estuary, 7
##
## Fixed effects:
## Estimate Std. Error t value
## (Intercept) 41.053 4.739 8.662
## ModificationPristine -14.553 6.232 -2.335
##
## Correlation of Fixed Effects:
## (Intr)
## MdfctnPrstn -0.760
performance::r2(estu.lmer.site) # e) Conditional R2: 0.774, Marginal R2: 0.270 for mixed models
## # R2 for Mixed Models
##
## Conditional R2: 0.774
## Marginal R2: 0.270
performance::check_model(estu.lmer.site) # f) check model assumptions
ggplot2::ggplot(estuaries, aes(x = Modification, y = Total))+geom_boxplot()+
geom_jitter(aes(color=as.factor(Site)))+
facet_wrap(~Estuary) # g) plot data including the site
estuaries$HydroidPres <- estuaries$Hydroid > 0 # h) transform Hydroid variable to presence/absence data
hydro.glmm.site<-lme4::glmer(HydroidPres ~ Modification +
(1|Estuary/Site), family=binomial, data=estuaries) # i) fit a glmm with transformed variable
summary(hydro.glmm.site)
## Generalized linear mixed model fit by maximum likelihood (Laplace
## Approximation) [glmerMod]
## Family: binomial ( logit )
## Formula: HydroidPres ~ Modification + (1 | Estuary/Site)
## Data: estuaries
##
## AIC BIC logLik deviance df.resid
## 57.1 65.0 -24.5 49.1 50
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -1.06063 -0.25028 -0.05443 0.25475 0.98719
##
## Random effects:
## Groups Name Variance Std.Dev.
## Site:Estuary (Intercept) 14.45 3.802
## Estuary (Intercept) 1.13 1.063
## Number of obs: 54, groups: Site:Estuary, 27; Estuary, 7
##
## Fixed effects:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -5.710 2.419 -2.361 0.0182 *
## ModificationPristine 6.534 3.140 2.081 0.0375 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation of Fixed Effects:
## (Intr)
## MdfctnPrstn -0.888
performance::r2(hydro.glmm.site)
## # R2 for Mixed Models
##
## Conditional R2: 0.888
## Marginal R2: 0.358
performance::check_model(hydro.glmm.site) # j) check model assumptions
##8.1. Write a function to calculate the 3-point moving average for an input vector. The formula is: y_i=((y_i-1)+(y_i)+(y_i+1))/3
pntavg<- function (x) {
y<-numeric(length(x)-2)
for (i in 2:(length(x)-1)) {
y[i]<-(x[i-1]+x[i]+x[i+1])/3
}
y
}
##8.2. Read the temp.txt file. The data correspond to monthly average temperatures. # a) Plot the time series data. [Hint: first you need to create a monthly time series object] # b) Calculate the 5-point moving average and plot it together with the time series # c) Decompose the time series into seasonal, trend and residual error components # d) Generate a temporal correlogram to assess the autocorrelation of the time series # e) Generate a new correlogram but removing the trend and seasonal variation # f) Generate a partial correlogram plot # g) Find the best ARIMA model using the forecast package # h) Estimate future values using the previous ARIMA model and plot the results
temp <- read.table("temp.txt", sep="\t", header=T)
temp.ts <- ts(temp$temps, frequency = 12) # a) create monthly time series of temp
plot(temp.ts) # a) plot time series
library(TTR)
temp.ma <- smooth::sma(temp.ts, n = 5) # b) calc 5 pnt moving average
plot(temp.ma) # b) Calculate the 5-point moving average and plot it together with the time series
decom.temp <- decompose(temp.ts) # c) decompose ts into seasonal, trend, and residual (random & observed) error components
plot(decom.temp) # c) plotted decomposed temp data
acf(temp.ts) # d) using the "Auto-/Cross- Covariance & Correlation Function Estimation", generate a temporal correlogram to assess the autocorrelation of the time series
acf(decom.temp$random[!is.na(decom.temp$random)]) # e) Generate a new correlogram but removing the trend and seasonal variation
pacf(temp.ts) # f) pacf() creates partial autocorrelations
library(forecast)
## Registered S3 method overwritten by 'quantmod':
## method from
## as.zoo.data.frame zoo
temp.fit<-auto.arima(temp.ts)
summary(temp.fit)
## Series: temp.ts
## ARIMA(0,0,1)(2,1,0)[12]
##
## Coefficients:
## ma1 sar1 sar2
## 0.2456 -0.5212 -0.3233
## s.e. 0.0774 0.0823 0.0827
##
## sigma^2 = 3.772: log likelihood = -300.76
## AIC=609.52 AICc=609.81 BIC=621.4
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE
## Training set 0.2384692 1.846384 1.498262 0.2682136 11.90101 0.8327783
## ACF1
## Training set 0.005413047
checkresiduals(temp.fit) # g) best ARIMA model
##
## Ljung-Box test
##
## data: Residuals from ARIMA(0,0,1)(2,1,0)[12]
## Q* = 38.688, df = 21, p-value = 0.01069
##
## Model df: 3. Total lags used: 24
forecast<-forecast(temp.fit) # h) Estimate future values using the previous ARIMA model
plot(forecast) # h) plot the results
##8.3. Read the bac_dust.txt dataset. The data corresponds to proteobacterial # abundance and bacterial species in dust samples collected around the globe. # a) Make a map of the sampling points using the R libraries maps and ggplot2. # Color the points based on the continent they were collected. # b) Make a map of the sampling points using the R libraries maps and ggplot2. # Size the points based on bacterial richness. # c) Calculate Moran’s I autocorrelation for the variable Richness and Proteobacteria # d) Make a simple linear regression of Richness as a function of Proteobacteria. # Are the residuals spatially autocorrelated? # e) Perform a generalized linear mixed model (GLMM) using the coordinates # as random effect and using the Matérn covariance function (R package spaMM). # f) Report the nu and the rho values for this spatial regression model. # g) What can you say about the estimated correlation of this particular # spatial regression model across increasing distances between pairs of locations.
bac<-read.table("bac_dust.txt", header = T, sep = "\t")
world_map <- map_data("world")
ggplot(world_map)+
geom_polygon(aes(x = long, y = lat, group = group), fill = "lightgray", colour = "black", size = 0.1)+
geom_point(data = bac, aes(x = Longitude, y = Latitude, color = Continent), size = 2)+
scale_color_manual(values = c("SouthAmerica" = 'chartreuse4', "NorthAmerica" = 'chartreuse1', "Africa" = 'goldenrod1', "Oceania" = 'darkorchid3', "Europe" = 'firebrick4', "Asia" = 'cornflowerblue'))+
theme_bw(base_size = 15)+
theme(legend.title = element_blank(), axis.title.x = element_blank(),
axis.text.x = element_blank(), axis.ticks.x = element_blank(),
axis.title.y = element_blank(), axis.text.y = element_blank(),
axis.ticks.y = element_blank()) # a) map of sampling, points per continent
ggplot(world_map)+
geom_polygon(aes(x = long, y = lat, group = group), fill = "lightgray", colour = "black", size = 0.1)+
geom_point(data = bac, aes(x = Longitude, y = Latitude, color = Continent, size=Richness), alpha=0.6)+
scale_color_manual(values = c("SouthAmerica" = 'chartreuse4', "NorthAmerica" = 'chartreuse1', "Africa" = 'goldenrod1', "Oceania" = 'darkorchid3', "Europe" = 'firebrick4', "Asia" = 'cornflowerblue'))+
theme_bw(base_size = 15)+
theme(legend.title = element_blank(), axis.title.x = element_blank(),
axis.text.x = element_blank(), axis.ticks.x = element_blank(),
axis.title.y = element_blank(), axis.text.y = element_blank(),
axis.ticks.y = element_blank()) # b) same as a) but Size the points based on bacterial richness
bac.dists <- as.matrix(dist(cbind(bac$Latitude,
bac$Longitude)))
bac.dists.inv <- 1/bac.dists
diag(bac.dists.inv) <- 0
ape::Moran.I(bac$Richness, bac.dists.inv) # c) Calc Moran's I autocorr for variable Richness
## $observed
## [1] 0.1159867
##
## $expected
## [1] -0.003225806
##
## $sd
## [1] 0.03860909
##
## $p.value
## [1] 0.002017263
ape::Moran.I(bac$Proteobacteria, bac.dists.inv) # c) Calc Moran's I autocorr for variable Proteobacteria
## $observed
## [1] 0.216501
##
## $expected
## [1] -0.003225806
##
## $sd
## [1] 0.03858986
##
## $p.value
## [1] 1.241692e-08
anova(lm(Richness~Proteobacteria, data=bac)) # d) simple lg of Richness as a function of Proteobacteria.
## Analysis of Variance Table
##
## Response: Richness
## Df Sum Sq Mean Sq F value Pr(>F)
## Proteobacteria 1 38901 38901 0.5349 0.4651
## Residuals 309 22471204 72722
bac$resid <-resid(lm(Richness~Proteobacteria, data=bac))
ape::Moran.I(bac$resid, bac.dists.inv) # d) Are the residuals spatially autocorrelated? not significant
## $observed
## [1] 0.1098749
##
## $expected
## [1] -0.003225806
##
## $sd
## [1] 0.03861023
##
## $p.value
## [1] 0.003397342
spaMM::fitme(Richness~Proteobacteria+Matern(1|Latitude+Longitude),
data = bac) # e) GLMM, random effect=coordinates, Matérn covariance function
## Registered S3 methods overwritten by 'registry':
## method from
## print.registry_field proxy
## print.registry_entry proxy
## If the 'RSpectra' package were installed, an extreme eigenvalue computation could be faster.
## formula: Richness ~ Proteobacteria + Matern(1 | Latitude + Longitude)
## ML: Estimation of corrPars, lambda and phi by ML.
## Estimation of fixed effects by ML.
## Estimation of lambda and phi by 'outer' ML, maximizing p_v.
## family: gaussian( link = identity )
## ------------ Fixed effects (beta) ------------
## Estimate Cond. SE t-value
## (Intercept) 411.58 48.47 8.4910
## Proteobacteria 37.66 112.90 0.3336
## --------------- Random effects ---------------
## Family: gaussian( link = identity )
## --- Correlation parameters:
## 1.nu 1.rho
## 0.2041722 0.3145336
## --- Variance parameters ('lambda'):
## lambda = var(u) for u ~ Gaussian;
## Latitude . : 19190
## # of obs: 311; # of groups: Latitude ., 311
## -------------- Residual variance ------------
## phi estimate was 54713
## ------------- Likelihood values -------------
## logLik
## p_v(h) (marginal L): -2170.262
# f) for this spatial regression model, nu = 0.2042, rho = 0.3145
dist<-dist(bac[,c("Longitude", "Latitude")])
matern<-spaMM::MaternCorr(dist, nu=0.2041723, rho=0.3145336)
plot(as.numeric(dist), as.numeric(matern), xlab="Distance between pairs of locations", ylab="Estimated correlation")
# g) the estimated correlation of this particular spatial regression model across increasing distances between pairs of locations shows that within the groups, there is high correlation but not when comparing the groups to eachother
##8.4. Download the map of Spain. # a) Make a basic map of Spain # b) Get information on the population of Spanish cities and make a map of Spain # including the locations and population of Spanish cities. c) Visit Spain and enjoy
library(maps)
##
## Attaching package: 'maps'
## The following object is masked _by_ '.GlobalEnv':
##
## ozone
library(ggplot2)
library(viridis)
## Loading required package: viridisLite
##
## Attaching package: 'viridis'
## The following object is masked from 'package:maps':
##
## unemp
spain<-map_data("world", region="spain")
ggplot(spain)+
geom_polygon(aes(x = long, y = lat, group=group), fill='lightgray', color="black", size=0.1)+
scale_size_continuous(range=c(1,12))# a) Make a basic map of Spain
cities<-get('world.cities')
head(cities)
## name country.etc pop lat long capital
## 1 'Abasan al-Jadidah Palestine 5629 31.31 34.34 0
## 2 'Abasan al-Kabirah Palestine 18999 31.32 34.35 0
## 3 'Abdul Hakim Pakistan 47788 30.55 72.11 0
## 4 'Abdullah-as-Salam Kuwait 21817 29.36 47.98 0
## 5 'Abud Palestine 2456 32.03 35.07 0
## 6 'Abwein Palestine 3434 32.03 35.20 0
cities.spain<-cities[cities$country.etc == 'Spain',]
head(cities.spain)
## name country.etc pop lat long capital
## 128 A Coruna Spain 243088 43.33 -8.42 0
## 129 A Estrada Spain 21997 42.70 -8.50 0
## 130 A Laracha Spain 10856 43.25 -8.59 0
## 131 A Pobra do Caraminal Spain 9955 42.61 -8.94 0
## 183 Abanto Zierbena Spain 9505 43.32 -3.08 0
## 186 Abaran Spain 12890 38.20 -1.40 0
ggplot(spain)+
geom_polygon(aes(x = long, y = lat, group=group), fill='lightgray', color="black", size=0.1)+
geom_point(data=cities.spain, aes(x=long, y=lat, size=pop, color="red"), alpha=0.5)+
scale_size_continuous(range=c(1,12))+
theme_void()
##8.5. Download GBIF georeferenced occurrence data of the Pyrenean desman # (Galemys pyrenaicus) and make a map of its geographical distribution # (Hint: it is only present in Spain, Andorra, Portugal and France).
rgbif::occ_count(georeferenced = T)
## [1] 2060618296
gp_gbif<-rgbif::occ_search(scientificName = "Galemys pyrenaicus", hasCoordinate = T)
gp<-as.data.frame(gp_gbif$data[,c("decimalLatitude", "decimalLongitude")])
world <- map_data("world")
eu<-map_data("world", region=c("Spain", "Portugal", "France", "Andorra"))
ggplot(eu, aes(x=long, y=lat))+
geom_polygon(aes(group=group, fill=region))+
geom_point(data=gp, aes(x=decimalLongitude, y=decimalLatitude), color="black", alpha=0.2, size=2)+
theme_bw()
##9.1. Load the dune_bio.txt dataset. Species should be in columns and sites in rows. # a) Calculate the total number of individuals of all species # b) Calculate the total number of individuals for each species # c) Calculate the average number of individuals for each species # d) Calculate the total number of individuals for each site # e) Calculate the average number of individuals for each site # f) Write a function to report the median number of individuals for each species and the median number of individuals for each site # g) Use the vegan function decostand() to transform the dataset to relative abundances. How would you confirm the transformation worked? # h) Use the vegan function decostand() to standardize the dataset into the range 0 to 1 # i) Use the vegan function decostand() to standardize the dataset to mean=0 and variance=1
db<-read.table("dune_bio.txt", sep="\t", header=T, row.names=1)
sum(db)
## [1] 685
colSums(db)
## Belper Empnig Junbuf Junart Airpra Elepal Rumace Viclat Brarut Ranfla Cirarv
## 13 2 13 18 5 25 18 4 49 14 2
## Hyprad Leoaut Potpal Poapra Calcus Tripra Trirep Antodo Salrep Achmil Poatri
## 9 54 4 48 10 9 47 21 11 16 63
## Chealb Elyrep Sagpro Plalan Agrsto Lolper Alogen Brohor
## 1 26 20 26 48 58 36 15
colMeans(db)
## Belper Empnig Junbuf Junart Airpra Elepal Rumace Viclat Brarut Ranfla Cirarv
## 0.65 0.10 0.65 0.90 0.25 1.25 0.90 0.20 2.45 0.70 0.10
## Hyprad Leoaut Potpal Poapra Calcus Tripra Trirep Antodo Salrep Achmil Poatri
## 0.45 2.70 0.20 2.40 0.50 0.45 2.35 1.05 0.55 0.80 3.15
## Chealb Elyrep Sagpro Plalan Agrsto Lolper Alogen Brohor
## 0.05 1.30 1.00 1.30 2.40 2.90 1.80 0.75
rowSums(db)
## 2 13 4 16 6 1 8 5 17 15 10 11 9 18 3 20 14 19 12 7
## 42 33 45 33 48 18 40 43 15 23 43 32 42 27 40 31 24 31 35 40
rowMeans(db)
## 2 13 4 16 6 1 8 5
## 1.4000000 1.1000000 1.5000000 1.1000000 1.6000000 0.6000000 1.3333333 1.4333333
## 17 15 10 11 9 18 3 20
## 0.5000000 0.7666667 1.4333333 1.0666667 1.4000000 0.9000000 1.3333333 1.0333333
## 14 19 12 7
## 0.8000000 1.0333333 1.1666667 1.3333333
median_sps_sites<-function(db){
cat("The median number of individuals per sp is", apply(db, 2, median), "and for sites is", apply(db, 1, median))
}
dune.bio.relabu<-vegan::decostand(db, method = "total")
rowSums(dune.bio.relabu) # g) Use the vegan function decostand() to transform the dataset to relative abundances. How would you confirm the transformation worked? summary()
## 2 13 4 16 6 1 8 5 17 15 10 11 9 18 3 20 14 19 12 7
## 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
dune.bio.range<-vegan::decostand(db, method = "range")
dune.bio.stand<-vegan::decostand(db, method = "standardize")
summary(dune.bio.relabu)
## Belper Empnig Junbuf Junart
## Min. :0.00000 Min. :0.000000 Min. :0.00000 Min. :0.00000
## 1st Qu.:0.00000 1st Qu.:0.000000 1st Qu.:0.00000 1st Qu.:0.00000
## Median :0.00000 Median :0.000000 Median :0.00000 Median :0.00000
## Mean :0.01665 Mean :0.003226 Mean :0.01752 Mean :0.02728
## 3rd Qu.:0.04496 3rd Qu.:0.000000 3rd Qu.:0.00000 3rd Qu.:0.02273
## Max. :0.07407 Max. :0.064516 Max. :0.11429 Max. :0.13043
## Airpra Elepal Rumace Viclat
## Min. :0.00000 Min. :0.00000 Min. :0.00000 Min. :0.00000
## 1st Qu.:0.00000 1st Qu.:0.00000 1st Qu.:0.00000 1st Qu.:0.00000
## Median :0.00000 Median :0.00000 Median :0.00000 Median :0.00000
## Mean :0.01151 Mean :0.04278 Mean :0.02105 Mean :0.00614
## 3rd Qu.:0.00000 3rd Qu.:0.02500 3rd Qu.:0.01190 3rd Qu.:0.00000
## Max. :0.13333 Max. :0.24242 Max. :0.12500 Max. :0.06250
## Brarut Ranfla Cirarv Hyprad
## Min. :0.00000 Min. :0.00000 Min. :0.000000 Min. :0.00000
## 1st Qu.:0.03333 1st Qu.:0.00000 1st Qu.:0.000000 1st Qu.:0.00000
## Median :0.05000 Median :0.00000 Median :0.000000 Median :0.00000
## Mean :0.07213 Mean :0.02353 Mean :0.002222 Mean :0.01786
## 3rd Qu.:0.12216 3rd Qu.:0.05265 3rd Qu.:0.000000 3rd Qu.:0.00000
## Max. :0.22222 Max. :0.12903 Max. :0.044444 Max. :0.16129
## Leoaut Potpal Poapra Calcus
## Min. :0.00000 Min. :0.000000 Min. :0.00000 Min. :0.00000
## 1st Qu.:0.05536 1st Qu.:0.000000 1st Qu.:0.00000 1st Qu.:0.00000
## Median :0.06977 Median :0.000000 Median :0.07778 Median :0.00000
## Mean :0.08170 Mean :0.008514 Mean :0.06960 Mean :0.01772
## 3rd Qu.:0.09498 3rd Qu.:0.000000 3rd Qu.:0.10000 3rd Qu.:0.00000
## Max. :0.19355 Max. :0.086957 Max. :0.22222 Max. :0.16667
## Tripra Trirep Antodo Salrep
## Min. :0.00000 Min. :0.00000 Min. :0.00000 Min. :0.00000
## 1st Qu.:0.00000 1st Qu.:0.03816 1st Qu.:0.00000 1st Qu.:0.00000
## Median :0.00000 Median :0.05530 Median :0.00000 Median :0.00000
## Mean :0.01003 Mean :0.06625 Mean :0.03471 Mean :0.01846
## 3rd Qu.:0.00000 3rd Qu.:0.08772 3rd Qu.:0.05312 3rd Qu.:0.00000
## Max. :0.10417 Max. :0.25000 Max. :0.26667 Max. :0.16129
## Achmil Poatri Chealb Elyrep
## Min. :0.00000 Min. :0.00000 Min. :0.000000 Min. :0.00000
## 1st Qu.:0.00000 1st Qu.:0.00000 1st Qu.:0.000000 1st Qu.:0.00000
## Median :0.00000 Median :0.09651 Median :0.000000 Median :0.00000
## Mean :0.02458 Mean :0.08232 Mean :0.001515 Mean :0.03711
## 3rd Qu.:0.04738 3rd Qu.:0.12054 3rd Qu.:0.000000 3rd Qu.:0.08992
## Max. :0.13333 Max. :0.27273 Max. :0.030303 Max. :0.22222
## Sagpro Plalan Agrsto Lolper
## Min. :0.00000 Min. :0.00000 Min. :0.00000 Min. :0.00000
## 1st Qu.:0.00000 1st Qu.:0.00000 1st Qu.:0.00000 1st Qu.:0.00000
## Median :0.00000 Median :0.00000 Median :0.03571 Median :0.06085
## Mean :0.02714 Mean :0.03767 Mean :0.07145 Mean :0.08353
## 3rd Qu.:0.05265 3rd Qu.:0.09635 3rd Qu.:0.15396 3rd Qu.:0.12863
## Max. :0.11429 Max. :0.13333 Max. :0.21212 Max. :0.38889
## Alogen Brohor
## Min. :0.00000 Min. :0.00000
## 1st Qu.:0.00000 1st Qu.:0.00000
## Median :0.00000 Median :0.00000
## Mean :0.04824 Mean :0.01757
## 3rd Qu.:0.08387 3rd Qu.:0.01163
## Max. :0.22857 Max. :0.09524
rowSums(dune.bio.relabu) # i) Use the vegan function decostand() to standardize the dataset to mean=0 and variance=1
## 2 13 4 16 6 1 8 5 17 15 10 11 9 18 3 20 14 19 12 7
## 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
##9.2. Write a function to calculate the observed richness of a vector representing the abundances of different species in a community.
richnessfunct <- function(abun){
dat <- abun
relab <- decostand(dat, method = "total")
specnumber(relab)
}
##9.3. Write a function to calculate the Shannon-Wiener diversity index of a vector representing the abundances of different species in a community.
swdiversityfunct <- function(abun){
dat <- abun
relab <- decostand(dat, method = "total")
diversity(relab, index="shannon")
}
##9.4. Write a function that calculates both the observed richness and the # Shannon-Wiener diversity index for each community in a matrix, and writes an # output table with the results. You can use the vegan built-in functions. # Test your function with the dune_bio.txt dataset.
richnessswfunct <- function(abun){
dat <- abun
relab <- decostand(dat, method = "total")
rich <- specnumber(relab)
swd <- diversity(relab, index="shannon")
}
##9.5. Make a rank-abundance curve using the second site (13) of the dune_bio.txt dataset. Fit a lognormal model to the data.
dunecurve <- read.table("dune_bio.txt", sep="\t", header=T)
plot(vegan::rad.lognormal(dunecurve), lty=2, lwd=2) # fit a log normal model to the data
vegan::rad.lognormal(dunecurve)
##
## RAD model: Log-Normal
## Family: poisson
## No. of species: 217
## Total abundance: 895
##
## log.mu log.sigma Deviance AIC BIC
## 1.2207285 0.6292969 20.2518355 702.2895203 709.0493150
plot(vegan::radfit(dunecurve)) # fit lognormal with radfit()
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: algorithm did not converge
##9.6. Create a Euclidean distance matrix after standardization using the first # (A1), second (Moisture) and fifth (Manure) variables of dune_env.txt. # Then create a Bray-Curtis distance matrix using dune_bio.txt. # Perform a Mantel test on both distance matrices and plot the relationship.
duneenv <- read.table("dune_env.txt", sep="\t", header=T)
dunebio <- read.table("dune_bio.txt", sep="\t", header=T)
dune.sd <- vegan::decostand(duneenv[,c(2,3,6)], method="standardize") # standardize the 1st, 2nd,and 5th
dist.euc <- vegan::vegdist(dune.sd, method="euclidean")
dist.bray <- vegan::vegdist(dunebio[,-1], method="bray") # create a Bray-Curtis distance matrix using dune_bio.txt
vegan::mantel(dist.euc, dist.bray) # perform a mantel test on euc and bray dist matrices
##
## Mantel statistic based on Pearson's product-moment correlation
##
## Call:
## vegan::mantel(xdis = dist.euc, ydis = dist.bray)
##
## Mantel statistic r: 0.5496
## Significance: 0.001
##
## Upper quantiles of permutations (null model):
## 90% 95% 97.5% 99%
## 0.146 0.191 0.224 0.254
## Permutation: free
## Number of permutations: 999
plot(dist.euc, dist.bray) # plot the relationship
##10.1. Make dendrograms of the results of hierarchical clustering using # the single, average and complete methods. Use the dune_bio.txt dataset # after creating a Bray-Curtis distance matrix.
dune.bio<-read.table("dune_bio.txt", sep="\t", header=T, row.names=1)
dune.bio.bray<-vegan::vegdist(dune.bio, method = "bray") # Hierarchical clustering
dunebiosing<-stats::hclust(dune.bio.bray, method="single")
plot(dunebiosing) #dendrogram (tree) visualization, single
dunebioavg<-hclust(dune.bio.bray, method="average")
plot(dunebioavg) #dendrogram (tree) visualization, average
dunebiocomp<-hclust(dune.bio.bray, method="complete")
plot(dunebiocomp) #dendrogram (tree) visualization, complete
##10.2. Perform k-means partitions from 2 to 6 on the dune_bio.txt dataset and select the best partition using the Calinski criterion. Visualize the results.
dunebio.k<-vegan::cascadeKM(dune.bio, inf.gr=2, sup.gr=6) # Non-hierarchical clustering (K-means) with inf.gr= minimum number of groups, sup.gr= maximum number of groups
dunebio.k$results # calinski maximum value is the best partitioning which is 5.793253 @ 3 groups
## 2 groups 3 groups 4 groups 5 groups 6 groups
## SSE 1218.500000 950.516667 777.333333 676.500000 593.750000
## calinski 5.611243 5.793253 5.633047 5.110033 4.737482
dunebio.k3grps<-vegan::cascadeKM(dune.bio, inf.gr=2, sup.gr=8)
plot(dunebio.k3grps, sortg=T) # visualize
##11.1. Load the varechem dataset within the R package vegan. This data frame # collects soil characteristics. Given the nature of this dataset perform a PCA # or a CA ordination analysis. # a) How much variation is explained by the two first axes? # b) Make a screeplot of the results # c) Plot the ordination results of the sites # d) Plot both the sites scores and the soil characteristics scores focusing on the soil variables
library(vegan)
## Loading required package: permute
## Loading required package: lattice
## This is vegan 2.6-2
data(varechem)
varechem_pca<-vegan::rda(varechem, scale=T) # vegan function
summary(varechem_pca) # a) variation explained by the two first axes (or proportion explained) 0.3708 + 0.2281 = 0.5989
##
## Call:
## rda(X = varechem, scale = T)
##
## Partitioning of correlations:
## Inertia Proportion
## Total 14 1
## Unconstrained 14 1
##
## Eigenvalues, and their contribution to the correlations
##
## Importance of components:
## PC1 PC2 PC3 PC4 PC5 PC6 PC7
## Eigenvalue 5.1916 3.1928 1.6855 1.06899 0.81599 0.70581 0.43641
## Proportion Explained 0.3708 0.2281 0.1204 0.07636 0.05829 0.05042 0.03117
## Cumulative Proportion 0.3708 0.5989 0.7193 0.79564 0.85392 0.90434 0.93551
## PC8 PC9 PC10 PC11 PC12 PC13 PC14
## Eigenvalue 0.36884 0.1707 0.14953 0.08526 0.06986 0.035057 0.023615
## Proportion Explained 0.02635 0.0122 0.01068 0.00609 0.00499 0.002504 0.001687
## Cumulative Proportion 0.96185 0.9740 0.98473 0.99082 0.99581 0.998313 1.000000
##
## Scaling 2 for species and site scores
## * Species are scaled proportional to eigenvalues
## * Sites are unscaled: weighted dispersion equal on all dimensions
## * General scaling constant of scores: 4.236078
##
##
## Species scores
##
## PC1 PC2 PC3 PC4 PC5 PC6
## N 0.2441 0.2516 -0.23818 0.92846 -0.359554 0.30319
## P -0.9181 -0.4194 0.13356 0.09649 0.224909 0.07938
## K -0.9369 -0.3272 -0.06261 0.25051 0.180899 -0.28683
## Ca -0.9280 -0.1711 0.47982 -0.02138 -0.241479 -0.09536
## Mg -0.9087 -0.1978 0.12446 -0.04430 -0.497428 -0.10781
## S -0.8576 -0.6002 -0.31620 -0.01652 0.071629 -0.08779
## Al 0.2980 -0.9720 -0.39009 0.09092 0.005386 -0.19467
## Fe 0.5236 -0.7748 -0.23063 0.37187 -0.023377 -0.32292
## Mn -0.7418 0.3908 0.13114 0.44346 0.501064 0.06776
## Zn -0.8926 -0.3277 0.06142 -0.06121 -0.153677 0.51653
## Mo -0.1072 -0.4625 -0.85720 -0.27258 -0.030063 0.39721
## Baresoil -0.4218 0.6387 -0.40089 -0.07389 -0.416610 -0.31060
## Humdepth -0.6070 0.6825 -0.41674 0.02975 -0.047738 -0.15923
## pH 0.4286 -0.6517 0.66384 0.07599 -0.265374 0.05070
##
##
## Site scores (weighted sums of species scores)
##
## PC1 PC2 PC3 PC4 PC5 PC6
## 18 0.05126 0.84085 -0.190142 -0.40431 0.101518 -1.01997
## 15 0.20452 0.37397 0.002978 -1.06030 1.175734 -0.92257
## 24 -1.53647 -1.25830 -0.012170 -0.86690 -1.854372 1.74375
## 27 -1.25643 0.51345 0.743928 0.63835 1.097881 0.15631
## 23 -0.71203 0.86247 -0.203389 -0.05971 -0.786196 -0.80080
## 19 -0.76179 0.73785 -0.761645 -0.31728 -1.303297 -0.62264
## 22 -0.30340 0.86304 0.188484 0.54758 -0.078064 0.24567
## 16 0.62797 0.76436 -0.124301 -0.26047 0.009044 -0.02807
## 28 -1.13923 0.44909 0.229104 1.61582 0.842871 0.60852
## 13 -0.62483 -0.85407 -1.259755 1.38195 -0.157593 -1.04794
## 14 -0.04206 0.60916 -0.719743 -0.73109 -0.230147 0.25505
## 20 -0.93747 0.20753 -0.689713 0.62198 -0.282188 0.33232
## 25 -0.34451 0.90128 0.710714 0.64127 1.214897 0.48173
## 7 1.50502 -0.22114 -0.495604 0.87068 -0.769266 -0.25733
## 5 1.40889 0.60035 0.911463 0.71133 -0.488111 2.45195
## 6 1.30776 0.03604 -0.243884 -0.87792 0.543259 0.46876
## 3 1.64967 -0.82755 -0.343406 1.40028 -0.546374 -0.19580
## 4 -0.26711 -1.65198 -1.744251 -0.60763 0.947492 0.46203
## 2 0.59653 -1.21610 -0.200030 0.72815 0.799208 -0.76013
## 9 0.03271 -0.69445 -0.350028 -1.35247 0.717140 0.82965
## 12 0.47944 0.26377 0.184677 -1.27744 0.573929 0.07253
## 10 -0.32993 -0.95483 1.670469 -0.51343 0.819138 -0.48682
## 11 -0.09921 -1.52318 2.493913 -0.09044 -1.092296 -1.10654
## 21 0.49069 1.17838 0.202333 -0.73801 -1.254205 -0.85966
stats::screeplot(varechem_pca) # b) screeplot of the results
plot(varechem_pca, display="sites", xlab="PC1 (37%)", ylab="PC2 (23%)") # c) Plot the ordination results of the sites
stats::biplot(varechem_pca, scaling=1, xlab="PC1 (37%)", ylab="PC2 (23%)") # d) Plot both the sites scores and the soil characteristics scores focusing on the soil variables
##11.2. The dune_bio.txt dataset corresponds to species abundances. # Given the nature of this dataset perform a PCA or a CA ordination analysis. # a) How much variation is explained by the two first axes? # b) Make a screeplot of the results # c) Plot the ordination results of the sites
dunebio <- read.table("dune_bio.txt", sep="\t", header=T, row.names=1)
dunebio.ca<-vegan::cca(dunebio)
dunebio.ca
## Call: cca(X = dunebio)
##
## Inertia Rank
## Total 2.115
## Unconstrained 2.115 19
## Inertia is scaled Chi-square
##
## Eigenvalues for unconstrained axes:
## CA1 CA2 CA3 CA4 CA5 CA6 CA7 CA8
## 0.5360 0.4001 0.2598 0.1760 0.1448 0.1079 0.0925 0.0809
## (Showing 8 of 19 unconstrained eigenvalues)
summary(dunebio.ca) # a) variation explained by the two first axes (or proportion explained) 0.2534 + 0.1892 = 0.4426
##
## Call:
## cca(X = dunebio)
##
## Partitioning of scaled Chi-square:
## Inertia Proportion
## Total 2.115 1
## Unconstrained 2.115 1
##
## Eigenvalues, and their contribution to the scaled Chi-square
##
## Importance of components:
## CA1 CA2 CA3 CA4 CA5 CA6 CA7
## Eigenvalue 0.5360 0.4001 0.2598 0.17598 0.14476 0.10791 0.09247
## Proportion Explained 0.2534 0.1892 0.1228 0.08319 0.06844 0.05102 0.04372
## Cumulative Proportion 0.2534 0.4426 0.5654 0.64858 0.71702 0.76804 0.81175
## CA8 CA9 CA10 CA11 CA12 CA13 CA14
## Eigenvalue 0.08091 0.07332 0.05630 0.04826 0.04125 0.03523 0.020529
## Proportion Explained 0.03825 0.03466 0.02661 0.02282 0.01950 0.01665 0.009705
## Cumulative Proportion 0.85000 0.88467 0.91128 0.93410 0.95360 0.97025 0.979955
## CA15 CA16 CA17 CA18 CA19
## Eigenvalue 0.014911 0.009074 0.007938 0.007002 0.003477
## Proportion Explained 0.007049 0.004290 0.003753 0.003310 0.001644
## Cumulative Proportion 0.987004 0.991293 0.995046 0.998356 1.000000
##
## Scaling 2 for species and site scores
## * Species are scaled proportional to eigenvalues
## * Sites are unscaled: weighted dispersion equal on all dimensions
##
##
## Species scores
##
## CA1 CA2 CA3 CA4 CA5 CA6
## Belper -0.50018 -0.35503 -0.15239 -0.704153 -0.058546 -0.07308
## Empnig -0.69027 3.26420 1.95716 -0.176936 -0.073518 0.16083
## Junbuf 0.08157 -0.68074 1.00545 1.078390 0.268360 -0.24168
## Junart 1.27580 0.09963 -0.09320 0.005536 0.289410 0.78247
## Airpra -1.00434 3.06749 1.33773 0.194305 -1.081813 0.53699
## Elepal 1.76383 0.34562 -0.57336 -0.002976 -0.332396 0.14688
## Rumace -0.65289 -0.25525 -0.59728 1.160164 0.255849 0.32730
## Viclat -0.61893 0.37140 -0.46057 -1.000375 1.162652 -1.44971
## Brarut 0.18222 0.26477 -0.16606 0.064009 0.576334 -0.07741
## Ranfla 1.55886 0.30700 -0.29765 0.046974 -0.008747 0.14744
## Cirarv -0.05647 -0.76398 0.91793 -1.175919 -0.384024 0.13985
## Hyprad -0.85408 2.52821 1.13951 -0.175115 -0.311874 -0.11177
## Leoaut -0.19566 0.38884 0.03975 -0.130392 0.141232 -0.23717
## Potpal 1.91690 0.52150 -1.18215 -0.021738 -1.359988 -1.31207
## Poapra -0.38919 -0.32999 -0.02015 -0.358371 0.079296 0.05165
## Calcus 1.95199 0.56743 -0.85948 -0.098969 -0.556737 -0.23282
## Tripra -0.88116 -0.09792 -1.18172 1.282429 0.325706 0.33388
## Trirep -0.07666 -0.02032 -0.20594 0.026462 -0.186748 -0.53957
## Antodo -0.96676 1.08361 -0.17188 0.459788 -0.607533 0.30425
## Salrep 0.61035 1.54868 0.04970 -0.607136 1.429729 0.55183
## Achmil -0.90859 0.08461 -0.58636 -0.008919 -0.660183 0.18877
## Poatri -0.18185 -0.53997 0.23388 0.178834 -0.155342 0.07584
## Chealb 0.42445 -0.84402 1.59029 1.248755 -0.207480 -0.87566
## Elyrep -0.37074 -0.74148 0.26238 -0.566308 -0.270122 0.72624
## Sagpro 0.00364 0.01719 1.11570 0.066981 0.186654 -0.32463
## Plalan -0.84058 0.24886 -0.78066 0.371149 0.271377 -0.11989
## Agrsto 0.93378 -0.20651 0.28165 0.024293 -0.139326 0.02256
## Lolper -0.50272 -0.35955 -0.21821 -0.474727 0.101494 0.01594
## Alogen 0.40088 -0.61839 0.85013 0.346740 0.016509 -0.10169
## Brohor -0.65762 -0.40634 -0.30685 -0.496751 -0.561358 -0.07004
##
##
## Site scores (weighted averages of species scores)
##
## CA1 CA2 CA3 CA4 CA5 CA6
## 2 -0.63268 -0.6958357 -0.09708 -1.18695 -0.97686 -0.06575
## 13 0.42445 -0.8440195 1.59029 1.24876 -0.20748 -0.87566
## 4 -0.05647 -0.7639784 0.91793 -1.17592 -0.38402 0.13985
## 16 2.00229 0.1090627 -0.33414 0.33760 -0.50097 0.76159
## 6 -0.85633 -0.0005408 -1.39735 1.59909 0.65494 0.19386
## 1 -0.81167 -1.0826714 -0.14479 -2.10665 -0.39287 1.83462
## 8 0.76268 -0.2968459 0.35648 -0.10772 0.17507 0.36444
## 5 -0.95293 -0.1846015 -0.95609 0.86853 -0.34552 0.98333
## 17 -1.47545 2.7724102 0.40859 0.75117 -2.59425 1.10122
## 15 1.91384 0.5079036 -0.96567 0.04227 -0.50681 -0.19370
## 10 -0.87885 -0.0353136 -0.82987 -0.68053 -0.75438 -0.81070
## 11 -0.64223 0.4440332 -0.17371 -1.09684 1.37462 -2.00626
## 9 0.09693 -0.7864314 0.86492 0.40090 0.28704 1.02783
## 18 -0.31241 0.6328355 -0.66501 -1.12728 2.65575 -0.97565
## 3 -0.10148 -0.9128732 0.68815 -0.68137 -0.08709 0.28678
## 20 1.94438 1.0688809 -0.66595 -0.55317 1.59606 1.70292
## 14 1.91996 0.5351062 -1.39863 -0.08575 -2.21317 -2.43044
## 19 -0.69027 3.2642026 1.95716 -0.17694 -0.07352 0.16083
## 12 0.28557 -0.6656161 1.64423 1.71496 0.65381 -1.17376
## 7 -0.87149 -0.2547040 -0.86830 0.90468 0.17385 0.03446
sum(0.2534, 0.1892) # 44.3% variation
## [1] 0.4426
stats::screeplot(dunebio.ca) # b) screeplot
plot(dunebio.ca, display="sites", xlab="CA1 (25%)", ylab="CA2 (19%)") # c) ordination results plotted
##11.3. Perform a nonmetric multidimensional scaling (NMDS) on the dune_bio.txt # dataset after calculating Bray-Curtis distances among sites. # NMDS: ordination from any distance matrix, not an eigenvector-based method # Represent the set of objects along a predetermined number of axes while # preserving the ordering relationships among them # a) What is the stress? #Stress is the spread of points from the # monotone regression step line (stress < 0.1 is extremely good, # stress < 0.2 is very good) # b) Make a Shepard plot of the NMDS results # c) Plot the ordination results of the sites # d) [Advanced - ggplot2] Plot the ordination results using # ggplot2. Use the dune_env.txt dataset to make the size of # the points proportional to the site richness (number of # species) and the color to represent the variable Management.
#Install Packages & Load Libraries:
# install.packages("vegan")
# library(vegan)
# install.packages("tidyverse")
# library(tidyverse)
dunebio <- read.table("dune_bio.txt", sep="\t", header=T, row.names=1)
dune_bio_nmds <- vegan::metaMDS(dunebio, distance="bray", k=2) #function calcs the Bray-Curtis and dimension amount
## Run 0 stress 0.1192678
## Run 1 stress 0.1183186
## ... New best solution
## ... Procrustes: rmse 0.02027176 max resid 0.06496899
## Run 2 stress 0.1886532
## Run 3 stress 0.2075713
## Run 4 stress 0.1192678
## Run 5 stress 0.1192679
## Run 6 stress 0.1886532
## Run 7 stress 0.1808912
## Run 8 stress 0.1183186
## ... Procrustes: rmse 3.869369e-05 max resid 0.0001221982
## ... Similar to previous best
## Run 9 stress 0.1886532
## Run 10 stress 0.1192678
## Run 11 stress 0.1808911
## Run 12 stress 0.1192678
## Run 13 stress 0.1183186
## ... Procrustes: rmse 9.605114e-06 max resid 3.020403e-05
## ... Similar to previous best
## Run 14 stress 0.1183186
## ... New best solution
## ... Procrustes: rmse 9.305398e-06 max resid 3.015801e-05
## ... Similar to previous best
## Run 15 stress 0.1192679
## Run 16 stress 0.1192678
## Run 17 stress 0.2075713
## Run 18 stress 0.1183186
## ... New best solution
## ... Procrustes: rmse 5.810304e-06 max resid 1.67842e-05
## ... Similar to previous best
## Run 19 stress 0.1192678
## Run 20 stress 0.1808911
## *** Solution reached
dune_bio_nmds$stress #a) stress=0.1183186
## [1] 0.1183186
vegan::stressplot(dune_bio_nmds) # b) Make a Shepard plot of the NMDS results, Sphepard plot
plot(dune_bio_nmds, type="t", display="sites") # c) Plot the ordination results of the sites
# d) [Advanced - ggplot2] Plot the ordination results using
# ggplot2. Use the dune_env.txt dataset to make the size of
# the points proportional to the site richness (number of
# species) and the color to represent the variable Management.
#Enable the r-universe repo
# {options(repos = c(fawda123 = 'https://fawda123.r-universe.dev',
# CRAN = 'https://cloud.r-project.org'))}
# install.packages('ggord')
dune_env <- read.table("dune_env.txt", sep="\t", header=T, row.names=1)
dune_env$Richness <- vegan::specnumber(x=dunebio) #Define as Species Richness
point_data <- as.data.frame(dune_bio_nmds$points) #Define bio nmds points as a data frame
View(point_data)
dune_env$MDS1 <- point_data$MDS1 #Attaching MDS1 of dune_bio proportional to dune_bio
dune_env$MDS2 <- point_data$MDS2 #Make MDS2 of dune_bio proportional to dune_bio
ggplot(data = dune_env, aes(x=MDS1, y=MDS2))+
geom_point(aes(fill=`Management`, size=dune_env$Richness), shape = 21) #d
## Warning: Use of `dune_env$Richness` is discouraged. Use `Richness` instead.
ggplot(data=dunebio, mapping=aes(x=MDS1, y=MDS2))+
geom_point(data=dune_env, mapping=aes(fill=Management, size=dune_env$Richness), shape=24)
## Warning: Use of `dune_env$Richness` is discouraged. Use `Richness` instead.
##11.4. Perform a constrained correspondence analysis (CCA) on the dune_bio.txt # dataset using the first (A1), second (Moisture) and fifth (Manure) variables of # dune_env.txt as explanatory. # a) What is the variation explained by the two first constrained axes? # b) What is the adjusted R2 of the model? # c) Use a permutation test to assess the overall statistical significance of the model # d) Use a permutation test to assess the marginal statistical significance of # each explanatory variable of the model. Which variables are significant? # e) Make a triplot of the ordination results focusing on the sites
dune_bio<-read.table("dune_bio.txt", sep="\t", header=T)
dune_env<-read.table("dune_env.txt", sep="\t", header=T)
dune_cca<-vegan::cca(dune_bio ~ A1 + Moisture + Manure, data = dune_env)
summary(dune_cca) # a) variation explained by the two first constrained axes (or Proportion Explained) 0.2135 + 0.1236 = 0.3371 (33.71%)
##
## Call:
## cca(formula = dune_bio ~ A1 + Moisture + Manure, data = dune_env)
##
## Partitioning of scaled Chi-square:
## Inertia Proportion
## Total 1.6252 1.0000
## Constrained 0.6221 0.3828
## Unconstrained 1.0031 0.6172
##
## Eigenvalues, and their contribution to the scaled Chi-square
##
## Importance of components:
## CCA1 CCA2 CCA3 CA1 CA2 CA3 CA4
## Eigenvalue 0.3469 0.2010 0.07428 0.1879 0.15906 0.1401 0.11071
## Proportion Explained 0.2135 0.1236 0.04571 0.1156 0.09787 0.0862 0.06812
## Cumulative Proportion 0.2135 0.3371 0.38281 0.4985 0.59632 0.6825 0.75065
## CA5 CA6 CA7 CA8 CA9 CA10 CA11
## Eigenvalue 0.07916 0.07293 0.05730 0.04718 0.04572 0.03865 0.01710
## Proportion Explained 0.04871 0.04487 0.03526 0.02903 0.02813 0.02378 0.01052
## Cumulative Proportion 0.79936 0.84423 0.87949 0.90852 0.93666 0.96044 0.97096
## CA12 CA13 CA14 CA15 CA16
## Eigenvalue 0.01500 0.013210 0.008019 0.007311 0.003656
## Proportion Explained 0.00923 0.008128 0.004934 0.004499 0.002249
## Cumulative Proportion 0.98019 0.988318 0.993252 0.997751 1.000000
##
## Accumulated constrained eigenvalues
## Importance of components:
## CCA1 CCA2 CCA3
## Eigenvalue 0.3469 0.2010 0.07428
## Proportion Explained 0.5576 0.3230 0.11939
## Cumulative Proportion 0.5576 0.8806 1.00000
##
## Scaling 2 for species and site scores
## * Species are scaled proportional to eigenvalues
## * Sites are unscaled: weighted dispersion equal on all dimensions
##
##
## Species scores
##
## CCA1 CCA2 CCA3 CA1 CA2 CA3
## Sites -0.36612 -0.272851 0.03468 -0.026780 -0.05701 -0.17817
## Belper 0.83300 -0.024552 0.26178 0.249493 0.68038 -0.05697
## Empnig -1.19645 -1.015213 -1.29464 -2.381277 0.35802 -1.57136
## Junbuf -0.33355 0.392914 -0.33779 -0.612703 0.28445 1.45139
## Junart -0.98487 0.158124 -0.32922 0.777789 -0.26320 0.26276
## Airpra -0.62178 -1.218943 -0.79362 -2.003397 0.14075 -1.48745
## Elepal -1.18987 0.411338 0.36981 0.894116 -0.56011 -0.49119
## Rumace 0.68758 -0.009132 0.40907 -0.563922 -1.07097 0.83356
## Viclat 0.74779 -1.150745 0.06289 0.485533 0.45456 -0.40237
## Brarut -0.01433 -0.185135 0.11098 0.103773 -0.23756 -0.05516
## Ranfla -1.17130 0.226331 0.04852 0.751432 -0.38610 -0.11623
## Cirarv 0.88097 1.399633 -0.03436 0.096612 1.33775 -1.35317
## Hyprad -0.40538 -1.134432 -0.72287 -1.557435 0.27779 -1.35848
## Leoaut 0.11313 -0.315311 0.00254 -0.144195 0.13654 -0.09109
## Potpal -1.68519 -0.199852 2.32945 0.558279 0.23074 -0.25627
## Poapra 0.60710 0.166498 -0.15189 0.164077 0.25719 0.05963
## Calcus -1.25179 -0.024437 0.18648 1.213098 -0.60626 -0.42266
## Tripra 1.06049 -0.045435 0.48790 -0.536955 -1.77008 0.52948
## Trirep 0.09018 -0.173311 0.19104 -0.006099 0.10233 0.27931
## Antodo 0.34515 -0.648581 -0.02741 -0.934434 -0.54225 -0.27585
## Salrep -0.67734 -1.194890 -0.81458 0.470121 -0.14927 -0.49681
## Achmil 0.84311 -0.390114 0.04982 -0.120906 -0.18623 0.15482
## Poatri 0.32724 0.523661 -0.06588 -0.149603 0.14705 0.39989
## Chealb -0.87273 1.439561 -0.12589 -1.166287 0.34406 0.92844
## Elyrep 0.67402 0.439441 -0.05561 0.212329 0.81665 0.34163
## Sagpro -0.18977 0.413840 -0.38749 -0.580719 0.62725 -0.03569
## Plalan 0.87667 -0.503952 0.32002 -0.243956 -0.74970 0.07749
## Agrsto -0.53992 0.678450 0.04437 0.303519 0.14484 -0.11011
## Lolper 0.82254 0.171363 -0.12671 0.244396 0.14938 -0.08128
## Alogen -0.20392 0.933599 -0.24562 -0.149106 0.35350 0.39106
## Brohor 0.88060 0.005436 0.02269 0.126893 0.42629 0.07325
##
##
## Site scores (weighted averages of species scores)
##
## CCA1 CCA2 CCA3 CA1 CA2 CA3
## row1 1.30733 0.52050 -0.10807 0.67156 1.5229 0.54566
## row2 -0.53317 1.35011 -0.84353 -1.16629 0.3441 0.92844
## row3 0.59360 1.60088 -0.76573 0.09661 1.3378 -1.35317
## row4 -1.67234 0.87023 0.77543 1.01571 -1.3732 -1.20479
## row5 1.23344 -0.59583 1.71239 -0.52391 -2.2494 0.64215
## row6 1.82267 1.04962 -1.25008 1.04448 1.0312 -0.69674
## row7 -0.60118 1.09273 -0.57638 0.44127 -0.4551 -0.10654
## row8 1.35816 -0.36499 1.34351 -0.85365 -0.8100 0.51841
## row9 -0.23644 -2.20690 -0.82467 -1.43658 -0.1851 -1.36159
## row10 -1.66957 -0.04445 2.46397 0.29039 0.3173 -0.32879
## row11 1.04331 -0.76431 0.46182 0.28509 0.2433 0.35529
## row12 0.52779 -1.06784 -0.38141 0.38132 0.5402 -0.82315
## row13 0.02359 0.98908 -0.96098 0.15283 0.9026 2.39822
## row14 0.03511 -1.48173 0.02349 0.89441 0.4946 -0.31849
## row15 0.66962 1.59633 -0.76475 0.19191 0.8725 -0.39087
## row16 -1.71708 -0.61475 -0.47319 1.92639 -0.8400 0.04092
## row17 -1.66869 -0.15401 3.11596 0.82617 0.1442 -0.18375
## row18 -0.78671 -2.41592 -2.93782 -2.38128 0.3580 -1.57136
## row19 -0.43877 1.09271 -0.77217 -1.14296 0.5297 1.49304
## row20 1.16321 -0.33658 0.68024 -0.25287 -1.5318 0.25885
##
##
## Site constraints (linear combinations of constraining variables)
##
## CCA1 CCA2 CCA3 CA1 CA2 CA3
## row1 1.0905 -0.31742 0.0006521 0.67156 1.5229 0.54566
## row2 -0.8727 1.43956 -0.1258921 -1.16629 0.3441 0.92844
## row3 0.8810 1.39963 -0.0343648 0.09661 1.3378 -1.35317
## row4 -0.8508 1.40305 -0.2881650 1.01571 -1.3732 -1.20479
## row5 1.0322 -0.22006 0.4333799 -0.52391 -2.2494 0.64215
## row6 1.4693 1.04731 -0.4282122 1.04448 1.0312 -0.69674
## row7 -0.7414 1.22051 -1.0995296 0.44127 -0.4551 -0.10654
## row8 0.8863 0.02333 1.5151993 -0.85365 -0.8100 0.51841
## row9 0.2402 -1.52454 -0.0420918 -1.43658 -0.1851 -1.36159
## row10 -1.7654 -0.06599 2.9244526 0.29039 0.3173 -0.32879
## row11 0.4551 -0.88477 -0.4458423 0.28509 0.2433 0.35529
## row12 0.9267 -1.04237 0.0257658 0.38132 0.5402 -0.82315
## row13 -0.5464 -0.47220 -0.9563308 0.15283 0.9026 2.39822
## row14 0.6826 -1.63347 0.6458803 0.89441 0.4946 -0.31849
## row15 0.8737 1.41180 0.0197262 0.19191 0.8725 -0.39087
## row16 -1.1819 -1.03955 -1.4028253 1.92639 -0.8400 0.04092
## row17 -1.6050 -0.33372 1.7344512 0.82617 0.1442 -0.18375
## row18 -1.1965 -1.01521 -1.2946433 -2.38128 0.3580 -1.57136
## row19 -0.5358 0.50832 0.1544659 -1.14296 0.5297 1.49304
## row20 1.3054 0.32236 -0.4030985 -0.25287 -1.5318 0.25885
##
##
## Biplot scores for constraining variables
##
## CCA1 CCA2 CCA3 CA1 CA2 CA3
## A1 -0.5541 0.1536 0.8182 0 0 0
## Moisture -0.9613 0.2111 -0.1771 0 0 0
## Manure 0.4376 0.8881 -0.1408 0 0 0
0.2135 + 0.1236
## [1] 0.3371
vegan::RsquareAdj(dune_cca)$adj.r.squared # b) What is the adjusted R2 of the model? 0.267
## [1] 0.2680708
anova(dune_cca) # c) permutation test to assess the overall statistical significance of the model, 999 permutations
## Permutation test for cca under reduced model
## Permutation: free
## Number of permutations: 999
##
## Model: cca(formula = dune_bio ~ A1 + Moisture + Manure, data = dune_env)
## Df ChiSquare F Pr(>F)
## Model 3 0.62215 3.308 0.001 ***
## Residual 16 1.00307
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
anova(dune_cca, by="margin") # d) permutation test to assess the marginal statistical significance of each explanatory variable of the model. Which variables are significant? Moisture and manure
## Permutation test for cca under reduced model
## Marginal effects of terms
## Permutation: free
## Number of permutations: 999
##
## Model: cca(formula = dune_bio ~ A1 + Moisture + Manure, data = dune_env)
## Df ChiSquare F Pr(>F)
## A1 1 0.08492 1.3546 0.168
## Moisture 1 0.24521 3.9113 0.001 ***
## Manure 1 0.20790 3.3162 0.002 **
## Residual 16 1.00307
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
plot(dune_cca, scaling=2, xlab="CCA1 (21%)", ylab="CCA2 (12%)") # e) Make a triplot of the ordination results focusing on the sites
##11.5. Perform a permutational multivariate analysis of variance (PERMANOVA) # with 9,999 permutations using the dune_bio.txt dataset after calculating Bray- # Curtis distances and the first (A1), second (Moisture) and fifth (Manure) # variables of dune_env.txt as explanatory. # a) Which explanatory variables are significant? # b) What is the explanatory power (R2) of the significant variables? # c) Perform a new PERMANOVA analysis with 9,999 permutations with A1 as the # explanatory variable but constrain the permutations within the Use variable.
dune_bio<-read.table("dune_bio.txt", sep="\t", header=T)
dune_env<-read.table("dune_env.txt", sep="\t", header=T)
vegan::adonis2(vegdist(dune_bio, method="bray") ~ A1 + Moisture + Manure, data=dune_env, permutations=9999) # a) A1, Moisture, and Manure are significant
## Permutation test for adonis under reduced model
## Terms added sequentially (first to last)
## Permutation: free
## Number of permutations: 9999
##
## vegan::adonis2(formula = vegdist(dune_bio, method = "bray") ~ A1 + Moisture + Manure, data = dune_env, permutations = 9999)
## Df SumOfSqs R2 F Pr(>F)
## A1 1 0.5107 0.15417 5.8578 2e-04 ***
## Moisture 1 0.7196 0.21722 8.2537 1e-04 ***
## Manure 1 0.6874 0.20752 7.8849 3e-04 ***
## Residual 16 1.3949 0.42109
## Total 19 3.3126 1.00000
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# b) A1 R2=0.15417, Moisture R2=0.21722, Manure R2=0.20752
vegan::adonis2(vegdist(dune_bio, method="bray") ~ Use/A1, data=dune_env, permutations=9999, strata=dune_env$Use)
## Permutation test for adonis under reduced model
## Terms added sequentially (first to last)
## Blocks: strata
## Permutation: free
## Number of permutations: 9999
##
## vegan::adonis2(formula = vegdist(dune_bio, method = "bray") ~ Use/A1, data = dune_env, permutations = 9999, strata = dune_env$Use)
## Df SumOfSqs R2 F Pr(>F)
## Use 2 0.4904 0.14805 1.9394 4e-04 ***
## Use:A1 3 1.0520 0.31757 2.7733 4e-04 ***
## Residual 14 1.7702 0.53438
## Total 19 3.3126 1.00000
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1