#Problem 1
1. Install ISwR. Write the built-in dataset thuesen to a tab-separated text file. View it with a text editor. Change the NA to . (period), and read the changed file back into R
library(ISwR)
data(package = "ISwR")
data("thuesen")
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.4.0 ✔ purrr 0.3.5
## ✔ tibble 3.1.8 ✔ dplyr 1.0.10
## ✔ tidyr 1.2.1 ✔ stringr 1.5.0
## ✔ readr 2.1.3 ✔ forcats 0.5.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
write.table(thuesen, file = "thuesen.txt", sep = "\t")
# Opened the file in a text editor and replaced each NA with . (period)
thuesen_NA <- read.table("thuesen.txt", sep = "\t", header = TRUE, na.strings = ".")
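A quick sanity check that the periods were converted back to missing values on re-import (this relies on the na.strings argument above):
anyNA(thuesen_NA) # TRUE: the '.' entries are read back as NA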
#Problem 2
2. The exponential growth of a population is described by this mathematical function: \(N_t = N_0 e^{rt}\), where \(N_t\) is the population size at time \(t\), \(N_0\) is the initial population size, and \(r\) is the rate of growth or reproductive rate. Write an exponential growth function in R that also generates a plot with time on the x axis and \(N_t\) on the y axis. Using that function, create plots for 20 days assuming an initial population size of 10 individuals under three growth rate scenarios (0.5, 0.8, -0.1)
exponential_growth <- function(N0, r, days, col) {
t <- 1:days
Nt <- N0 * exp(r * t)
lines(t, Nt, type="l", lty=2, lwd=2.5, col=col)
}
# Create an empty plot with the appropriate axis labels and title
plot(0, type="n", xlim=c(1, 20), ylim=c(0, 1000), xlab="Time (days)", ylab="Population size (individuals)", main="Exponential Growth")
# Add the three scenarios to the plot with different colors and line types
exponential_growth(10, 0.5, 20, col="blue")
exponential_growth(10, 0.8, 20, col="red")
exponential_growth(10, -0.1, 20, col="green")
# Add a legend to the plot
legend("topright", legend=c("r=0.5", "r=0.8", "r=-0.1"), col=c("blue", "red", "green"), lty=1, lwd=2.5)
#Problem 3
3. Under highly favorable conditions, populations grow exponentially. However, resources will eventually limit population growth, and exponential growth cannot continue indefinitely. This phenomenon is described by the logistic growth function: \(N_t = \frac{K N_0}{N_0 + (K - N_0)e^{-rt}}\), where \(K\) is the carrying capacity. Write a logistic growth function in R that also generates a plot with time on the x axis and \(N_t\) on the y axis. Using that function, create plots for 20 days assuming an initial population size of 10 individuals and a carrying capacity of 1,000 individuals under three growth rate scenarios (0.5, 0.8, 0.4)
log.growth <- function(N0, r, t, K, col) {
Nt <- K * N0 / (N0 + (K - N0) * exp(-r * t))
lines(t, Nt, type="l", lty=2, lwd=2.5, col=col)
}
# Create an empty plot with the appropriate axis labels and title
plot(0, type="n", xlim=c(0, 20), ylim=c(0, 1000), xlab="Time (days)", ylab="Population size (individuals)", main="Logistic Growth")
log.growth(N0=10, r=0.5, t=0:20, K=1000, col="blue")
log.growth(N0=10, r=0.8, t=0:20, K=1000, col="red")
log.growth(N0=10, r=0.4, t=0:20, K=1000, col="green")
# Add a legend to the plot
legend("topleft", legend=c("r=0.5", "r=0.8", "r=0.4"), col=c("blue", "red", "green"), lty=1, lwd=2.5)
#Problem 4
4. Write a function (sum_n) that for any given value, say n, computes the sum of the integers from 1 to n (inclusive). Use the function to determine the sum of integers from 1 to 5,000
sum_n <- function(n) {sum(seq(n))}
sum_n(5000)
## [1] 12502500
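As a cross-check, the closed form \(n(n+1)/2\) gives the same value:
5000 * 5001 / 2
## [1] 12502500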
#Problem 5
5. Write a function (sqrt_round) that takes a number x as input, takes the square root, rounds it to the nearest whole number and then returns the result.
sqrt_round <- function(x) {
y <- sqrt(x)
z <- round(y, digits = 0)
return(z)
}
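A quick usage example:
sqrt_round(10) # sqrt(10) is about 3.162, which rounds to 3
## [1] 3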
#Problem 6
6. Install the R package ‘nycflights13’, and load the ‘weather’ data. a) Explore the columns names and the top part of the dataset to get a sense of the data b) Make a subset of the data from just the first month (1) and then save that subset as ‘weather1’ c) Using ggplot2, make a beautiful histogram of the variable ‘temp’ d) Using ggplot2, make a beautiful line plot of ‘temp’ as a function of ‘time_hour’ e) Using ggplot2, make a beautiful boxplot of ‘temp’ as a function of ‘origin’
library(nycflights13)
library(tidyverse)
library(ggplot2)
data(package = "nycflights13")
weather
## # A tibble: 26,115 × 15
## origin year month day hour temp dewp humid wind_dir wind_speed wind_g…¹
## <chr> <int> <int> <int> <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 EWR 2013 1 1 1 39.0 26.1 59.4 270 10.4 NA
## 2 EWR 2013 1 1 2 39.0 27.0 61.6 250 8.06 NA
## 3 EWR 2013 1 1 3 39.0 28.0 64.4 240 11.5 NA
## 4 EWR 2013 1 1 4 39.9 28.0 62.2 250 12.7 NA
## 5 EWR 2013 1 1 5 39.0 28.0 64.4 260 12.7 NA
## 6 EWR 2013 1 1 6 37.9 28.0 67.2 240 11.5 NA
## 7 EWR 2013 1 1 7 39.0 28.0 64.4 240 15.0 NA
## 8 EWR 2013 1 1 8 39.9 28.0 62.2 250 10.4 NA
## 9 EWR 2013 1 1 9 39.9 28.0 62.2 260 15.0 NA
## 10 EWR 2013 1 1 10 41 28.0 59.6 260 13.8 NA
## # … with 26,105 more rows, 4 more variables: precip <dbl>, pressure <dbl>,
## # visib <dbl>, time_hour <dttm>, and abbreviated variable name ¹wind_gust
#Part A
head(weather)
## # A tibble: 6 × 15
## origin year month day hour temp dewp humid wind_dir wind_speed wind_gust
## <chr> <int> <int> <int> <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 EWR 2013 1 1 1 39.0 26.1 59.4 270 10.4 NA
## 2 EWR 2013 1 1 2 39.0 27.0 61.6 250 8.06 NA
## 3 EWR 2013 1 1 3 39.0 28.0 64.4 240 11.5 NA
## 4 EWR 2013 1 1 4 39.9 28.0 62.2 250 12.7 NA
## 5 EWR 2013 1 1 5 39.0 28.0 64.4 260 12.7 NA
## 6 EWR 2013 1 1 6 37.9 28.0 67.2 240 11.5 NA
## # … with 4 more variables: precip <dbl>, pressure <dbl>, visib <dbl>,
## # time_hour <dttm>
#Part B
weather1 <- filter(weather, month ==1)
#Part C
ggplot(weather1, aes(x=temp))+
geom_histogram(bins = 25, color='black', fill = 'pink') +
ggtitle("Temperature for NYC Flights")+
ylab("Observations(n)")+
xlab("Temperature(F)")
#Part D
ggplot(weather1, aes(x=time_hour, y=temp))+
geom_line(color="pink")+
ggtitle("Hourly temperature in NYC")+
ylab("Temperature (F)")+
xlab("Time of the day (hour)")+
theme(plot.title = element_text(hjust = 0.5))
#Part E
ggplot(weather1, aes(x=origin, y=temp, fill=origin))+
geom_boxplot()+
xlab("Airport")+
ylab("Temperature (F)")+
ggtitle("Temperature by Airport")+
theme(plot.title = element_text(hjust = 0.5))
#Problem 7
7. Write content in markdown syntax to replicate, as much as possible, the format of the following sample text
Some text using Markdown syntex
Some text in code format: computing with data
The main data objects in R are:
vectors
arrays
lists
The R website is: https://www.r-project.org
#Problem 1
1. A researcher videotaped the glides of 8 tree snakes leaping from a 10-m tower. Undulation rates of the snakes measured in hertz (cycles per second) were as follows: 0.9, 1.4, 1.2, 1.2, 1.3, 2.0, 1.4, 1.6. a) Draw a histogram of the undulation rate b) Calculate the sample mean c) Calculate the range d) Calculate the standard deviation e) Write a function to express the standard deviation as a percentage of the mean (that is, the coefficient of variation) and calculate it.
rates <- data.frame("snake" = c(1,2,3,4,5,6,7,8),
"undulation rate" = c(0.9, 1.4, 1.2, 1.2, 1.3, 2.0, 1.4, 1.6))
#Part A
hist(rates$undulation.rate, col= 'pink', border = 'black', xlab = "Undulation Rate (cycles per second)", main = "Undulation Rates of Tree Snakes")
#Part B
mean(rates$undulation.rate)
## [1] 1.375
#Part C
range(rates$undulation.rate)
## [1] 0.9 2.0
#Part D
sd(rates$undulation.rate)
## [1] 0.324037
#Part E
coefficient_of_var <- function(x) {
sd(x)/mean(x)*100
}
coefficient_of_var(rates$undulation.rate)
## [1] 23.56633
#Problem 2
2. Blood pressure was measured (in units of mm Hg). Here are the measurements: 112, 128, 108, 129, 125, 153, 155, 132, 137. a) How many individuals are in the sample? –> 9 b) What is the mean of this sample? –> 131 c) What is the variance? –> 254.5 d) What is the standard deviation? –> 15.95306 e) What is the coefficient of variation? –> 12.17791
bp <- c(112, 128, 108, 129, 125, 153, 155, 132, 137)
#Part A
length(bp)
## [1] 9
#Part B
mean(bp)
## [1] 131
#Part C
var(bp)
## [1] 254.5
#Part D
sd(bp)
## [1] 15.95306
#Part E
coefficient_of_var <- function(x) {
sd(x)/mean(x)*100
}
coefficient_of_var(bp)
## [1] 12.17791
#Problem 3
3. The data in the file DesertBirdAbundance.csv are from a survey of the breeding birds of Organ Pipe Cactus National Monument in southern Arizona. a) Draw a histogram of the abundance data. b) Calculate the median and the mean of the bird abundance data. c) In this particular case, which do you think is the best measure of center, the mean or the median? d) Calculate the range, standard deviation, variance and coefficient of variation of the bird abundance data.
library(tidyverse)
bird <- read.csv('DesertBirdAbundance.csv', header = TRUE)
#Part A
hist(bird$abundance, col= 'pink', border = 'black', xlab = "Abundance", main = "Desert Bird Abundance")
#Part B
mean(bird$abundance)
## [1] 74.76744
median(bird$abundance)
## [1] 18
#Part C
# The best measure of center would be the median because the median is more resistant to outliers.
#Part D
range(bird$abundance)
## [1] 1 625
sd(bird$abundance)
## [1] 121.9473
var(bird$abundance)
## [1] 14871.14
coefficient_of_var(bird$abundance)
## [1] 163.1021
#Problem 4
#Part A
1-pnorm(3)
## [1] 0.001349898
#Part B
1-pnorm(42, mean=35, sd=6)
## [1] 0.1216725
#Part C
dbinom(10, size=10, prob=0.8) # this asks for a point probability (the other parts involve the cumulative distribution function)
## [1] 0.1073742
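Equivalently, since P(X = 10) is the entire upper tail beyond 9, the cumulative distribution gives the same number:
1 - pbinom(9, size=10, prob=0.8) # P(X > 9) = P(X = 10) = 0.8^10
## [1] 0.1073742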
#Part D
1-pchisq(6.5, df=2)
## [1] 0.03877421
#Problem 5
5. Demonstrate graphically the central limit theorem (by sampling and calculating the mean) using a Binomial distribution with 10 trials (size=10) and 0.9 probability of success (prob=0.9) and a sample size of 5.
# Draw one sample of 5 observations from a Binomial(size = 10, prob = 0.9) and inspect it
p1 <- rbinom(5, size=10, prob=0.9)
hist(p1)
# Repeat the sampling 10 times, storing the mean of each sample of 5
clt_10 <- numeric(10)
for (i in 1:10) {
clt_10[i] <- mean(rbinom(5, size=10, prob=0.9))
}
# create a histogram
hist(clt_10, col="lavender", main="After 10 samples")
abline(v = mean(clt_10), lty=3, lwd = 2)
# Repeat the sampling 100 times
clt_100 <- numeric(100)
for (i in 1:100) {
clt_100[i] <- mean(rbinom(5, size=10, prob=0.9))
}
# create a histogram
hist(clt_100, col="pink", main="After 100 samples")
abline(v = mean(clt_100), lty=3, lwd = 2)
#Problem 6
6. Imagine height is genetically determined by the combined (that is, the sum) effect of several genes (polygenic trait). Assume that each gene has an effect on height as a uniform distribution with min=1 and max=3. Simulate a stochastic model of height for 1,000 random people based on 1 gene, 2 genes, and 5 genes. As we increase the number of genes, what is the resulting height distribution? It becomes more normal as the number of genes increases.
hist(runif(1000, 1, 3))
hist(runif(1000, 1, 3)+runif(1000, 1, 3))
hist(runif(1000, 1, 3)+runif(1000, 1, 3)+runif(1000, 1, 3))
hist(runif(1000, 1, 3)+runif(1000, 1, 3)+runif(1000, 1, 3)+runif(1000, 1, 3)+runif(1000, 1, 3))
#Problem 7
7. Do the same as in the previous problem but now assuming that the combined effect of the genes is multiplicative (not the sum). As we increase the number of genes, what is the resulting height distribution? It becomes increasingly right-skewed (approaching a log-normal shape).
hist(runif(1000, 1, 3))
hist(runif(1000, 1, 3)*runif(1000, 1, 3))
hist(runif(1000, 1, 3)*runif(1000, 1, 3)*runif(1000, 1, 3))
hist(runif(1000, 1, 3)*runif(1000, 1, 3)*runif(1000, 1, 3)*runif(1000, 1, 3)*runif(1000, 1, 3))
#Problem 1
1. Normal human body temperature is 98.6 F. Researchers obtained body-temperature measurements on randomly chosen healthy people: 98.4, 98.6, 97.8, 98.8, 97.9, 99.0, 98.2, 98.8, 98.8, 99.0, 98.0, 99.2, 99.5, 99.4, 98.4, 99.1, 98.4, 97.6, 97.4, 97.5, 97.5, 98.8, 98.6, 100.0, 98.4. a) Make a histogram of the data b) Make a normal quantile plot c) Perform a Shapiro-Wilk test to test for normality d) Are the data normally distributed? Yes, the Shapiro-Wilk test is not significant (p = 0.70) e) Are these measurements consistent with a population mean of 98.6 F? Yes, the one-sample t-test is not significant (p = 0.58)
temp <- c(98.4, 98.6, 97.8, 98.8, 97.9, 99.0,
98.2, 98.8, 98.8, 99.0, 98.0, 99.2, 99.5, 99.4, 98.4, 99.1, 98.4, 97.6, 97.4, 97.5, 97.5,
98.8, 98.6, 100.0, 98.4)
#Part A
hist(temp, main = "Human Body Temperatures", xlab = "temp (F)", col="lavender")
#Part B
qqnorm(temp, main = "Human Body Temperatures", xlab = "temp (F)")
#Part C
shapiro.test(temp)
##
## Shapiro-Wilk normality test
##
## data: temp
## W = 0.97216, p-value = 0.7001
#Part D
# Shapiro-Wilk not significant -> Data are normal
#Part E
t.test(temp, mu=98.6)
##
## One Sample t-test
##
## data: temp
## t = -0.56065, df = 24, p-value = 0.5802
## alternative hypothesis: true mean is not equal to 98.6
## 95 percent confidence interval:
## 98.24422 98.80378
## sample estimates:
## mean of x
## 98.524
#Problem 2
2. The brown recluse spider often lives in houses throughout central North America. A diet-preference study gave each of 41 spiders a choice between two crickets, one live and one dead. 31 of the 41 spiders chose the dead cricket over the live one. Does this represent evidence for a diet preference? Yes: the p-value is 0.00145, so we reject the null hypothesis of no preference in favor of the hypothesis that there is a diet preference.
# run a binomial test of whether the proportion choosing the dead cricket differs from 0.5
binom.test(31, 41, p = 0.5)
##
## Exact binomial test
##
## data: 31 and 41
## number of successes = 31, number of trials = 41, p-value = 0.00145
## alternative hypothesis: true probability of success is not equal to 0.5
## 95 percent confidence interval:
## 0.5969538 0.8763675
## sample estimates:
## probability of success
## 0.7560976
#Problem 3
3. Ten epileptic patients participated in a study of a new anticonvulsant drug. During the first 8-week period, half the patients received a placebo and half were given the drug, and the number of seizures was recorded. Following this, the same patients were given the opposite treatment and the number of seizures was recorded. Assuming that the distribution of the difference between the placebo and drug meets the assumption of normality, perform an appropriate test to determine whether there were differences in the number of epileptic seizures with and without the drug. a) Make a boxplot of the data b) Test the difference.
placebo <- c(37, 52, 68, 4, 29, 32, 19, 52, 19, 12)
drug <- c(5, 23, 40, 3, 38, 19, 9, 24, 17 , 14)
seizure <- data.frame(placebo, drug)
#Part A
boxplot(seizure, ylab = "Frequency of Seizures", col= c('pink', 'lavender'))
#Part B
t.test(seizure$placebo, seizure$drug, paired = T)
##
## Paired t-test
##
## data: seizure$placebo and seizure$drug
## t = 2.7661, df = 9, p-value = 0.02189
## alternative hypothesis: true mean difference is not equal to 0
## 95 percent confidence interval:
## 2.404666 23.995334
## sample estimates:
## mean difference
## 13.2
#Problem 4
4. A bee biologist is analyzing whether there was an association between bee colony number and type of forest habitat. We expect that there is no habitat preference for bee colony number. Is this true based on this data? a) Make a barplot of the data b) Test the hypothesis of no-association
bee_habitat <- data.frame("Habitat"= c("Oak", "Hickory", "Maple", "Red Cedar", "Poplar"), "Bee Colonies"= c(33, 30, 29, 4, 4))
#Part A
barplot(bee_habitat$Bee.Colonies, names.arg = bee_habitat$Habitat, ylab = "Number of Bee Colonies", xlab = "Tree Species",
main = 'Bee colonies by Tree Species', col='lightyellow')
#Part B
#perform a chi-squared goodness-of-fit test against equal expected proportions across habitats
chisq.test(bee_habitat$Bee.Colonies)
##
## Chi-squared test for given probabilities
##
## data: bee_habitat$Bee.Colonies
## X-squared = 43.1, df = 4, p-value = 9.865e-09
#Problem 5
5. Perform 10 one-sample t-tests for mu=0 on simulated standard normally distributed data of 25 observations each and get the P-values. Repeat the experiment, but instead simulate samples of 25 observations from a t-distribution with 2 degrees of freedom. Finally, automate the first experiment to run 1,000 times and apply a false discovery rate correction to the P-values (check ?replicate).
replicate(10, t.test(rnorm(25), mu=0)$p.value)
## [1] 0.6644501 0.7619880 0.7130990 0.1888785 0.5940204 0.4728194 0.1164432
## [8] 0.8832654 0.4263708 0.7161937
replicate(10, t.test(rt(25, df=2), mu=0)$p.value)
## [1] 0.3725147 0.7013407 0.4792406 0.3990850 0.6505667 0.8619722 0.8340536
## [8] 0.6002098 0.1790056 0.1461821
sig <- replicate (1000, t.test(rnorm(25), mu=0)$p.value)
#Show significant values
sig[sig<0.05]
## [1] 0.013747435 0.043539156 0.024613004 0.027708818 0.032914040 0.010688793
## [7] 0.040537716 0.035485433 0.043757097 0.007334281 0.044649957 0.010301723
## [13] 0.018228632 0.023304689 0.028225881 0.043801347 0.022285319 0.001341912
## [19] 0.022832448 0.013995861 0.046788058 0.035500722 0.044633584 0.032095828
## [25] 0.046050931 0.038590321 0.032431448 0.020032178 0.041953001 0.038034602
## [31] 0.003206472 0.028327595 0.034963475 0.004957444 0.041764619 0.030625825
## [37] 0.000988568 0.005277087 0.029627619 0.003581010 0.028071689 0.049288839
## [43] 0.012486790 0.049881152 0.032015861 0.039561009 0.002107872 0.012896292
## [49] 0.016915981 0.024594987 0.034392101
51 of the 1,000 tests are significant before correction, close to the 5% expected by chance under the null.
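Rather than counting entries by eye, the number of significant tests can be computed directly:
sum(sig < 0.05) # number of p-values below 0.05; about 50 expected by chance at alpha = 0.05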
#Apply a false discovery rate correction to the P-values
sig.correct <- p.adjust(sig, method = "fdr")
#Show significant values
sig.correct[sig.correct<0.05]
## numeric(0)
No significant results remain after the FDR correction.
####Practice Problems - ANOVA
#Problem 1
1.You are hired by the US Department of Agriculture to develop effective control practices for invasive plants (data_ANOVA.xlsx). In particular, you are asked to investigate pesticide controls for kudzu which is an invasive vine that grows in thick mats that smother underlying plants. Two of the most widely used pesticides for kudzu are glyphosate and triclopyr. To determine the effectiveness of a single application of pesticide, you conduct an experiment in 18 plots that each had 50% kudzu cover. In mid-summer, you applied equal amounts of 2% glyphosate to 6 plots, 2% triclopyr to 6 plots, and water without pesticides to 6 plots. Then you returned in autumn and measured the percent cover of kudzu in the plots. a) Transform the data from wide to long format (Google how to do it - Hint: function melt from reshape2 package) b) Make a boxplot of the data c) Perform an ANOVA d) What is the variation explained (R2)? e) Perform a post hoc test
library(readxl)
library(tidyr)
# Read data
us_agri <- read_xlsx(path = "data_ANOVA.xlsx", col_names = T)
us_agri
## # A tibble: 6 × 3
## Glyphosate Triclopyr Control
## <dbl> <dbl> <dbl>
## 1 20 17 34
## 2 17 14 31
## 3 24 12 29
## 4 18 16 32
## 5 16 18 27
## 6 22 13 30
library(reshape2)
##
## Attaching package: 'reshape2'
## The following object is masked from 'package:tidyr':
##
## smiths
us_agri_trans <- melt(us_agri)
## No id variables; using all as measure variables
us_agri_trans
## variable value
## 1 Glyphosate 20
## 2 Glyphosate 17
## 3 Glyphosate 24
## 4 Glyphosate 18
## 5 Glyphosate 16
## 6 Glyphosate 22
## 7 Triclopyr 17
## 8 Triclopyr 14
## 9 Triclopyr 12
## 10 Triclopyr 16
## 11 Triclopyr 18
## 12 Triclopyr 13
## 13 Control 34
## 14 Control 31
## 15 Control 29
## 16 Control 32
## 17 Control 27
## 18 Control 30
boxplot(us_agri_trans$value~us_agri_trans$variable, col=c("lavender","pink","purple"), xlab = "Treatment", ylab="Kudzu cover (%)", main="Pesticide Control for Kudzu")
anova(lm(value~variable, data = us_agri_trans))
## Analysis of Variance Table
##
## Response: value
## Df Sum Sq Mean Sq F value Pr(>F)
## variable 2 763 381.5 54.5 1.318e-07 ***
## Residuals 15 105 7.0
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summary(lm(value~variable, data= us_agri_trans))
##
## Call:
## lm(formula = value ~ variable, data = us_agri_trans)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.500 -1.875 0.000 1.875 4.500
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 19.500 1.080 18.053 1.38e-11 ***
## variableTriclopyr -4.500 1.528 -2.946 0.01 *
## variableControl 11.000 1.528 7.201 3.07e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.646 on 15 degrees of freedom
## Multiple R-squared: 0.879, Adjusted R-squared: 0.8629
## F-statistic: 54.5 on 2 and 15 DF, p-value: 1.318e-07
R^2 = 0.879 (multiple R-squared reported by summary()); adjusted R^2 = 0.8629
TukeyHSD(aov(value~variable, data=us_agri_trans))
## Tukey multiple comparisons of means
## 95% family-wise confidence level
##
## Fit: aov(formula = value ~ variable, data = us_agri_trans)
##
## $variable
## diff lwr upr p adj
## Triclopyr-Glyphosate -4.5 -8.467701 -0.5322987 0.0255594
## Control-Glyphosate 11.0 7.032299 14.9677013 0.0000087
## Control-Triclopyr 15.5 11.532299 19.4677013 0.0000001
#Problem 2
2. Data in growth.txt come from a farm-scale trial of animal diets. There are two factors: diet and supplement. Diet is a factor with three levels: barley, oats and wheat. Supplement is a factor with four levels: agrimore, control, supergain and supersupp. The response variable is weight gain after 6 weeks. a) Inspect the data using boxplots b) Perform a two-way ANOVA c) Assess graphically if there exists an interaction between both factors d) Given what you learnt about the interaction, what would be a better model? e) What is the variation explained (R2) of this new model? f) Perform a Tukey HSD test of this new model
# Read dataset
growth <- read.table("growth.txt", sep="\t", header = T)
head(growth)
## supplement diet gain
## 1 supergain wheat 17.37125
## 2 supergain wheat 16.81489
## 3 supergain wheat 18.08184
## 4 supergain wheat 15.78175
## 5 control wheat 17.70656
## 6 control wheat 18.22717
# ggplot:
library(ggplot2)
diet_plot <- ggplot(growth, aes(x=diet, y=gain, fill=diet)) +
geom_boxplot() +
scale_fill_manual(values=c("lavender", "pink", "orchid4"))+
geom_jitter(shape=16, position=position_jitter(0.2))+
labs(x="Diet Type", y="Weight Gain (g)")
diet_plot
#Boxplot using the columns gain and supplement
supp_plot <-ggplot(growth, aes(x=supplement, y=gain, fill=supplement)) +
geom_boxplot() +
geom_jitter(shape=16, position=position_jitter(0.2))+
scale_fill_manual(values=c("aquamarine","limegreen", "olivedrab", "forestgreen"))+
labs(x="Supplement Type", y="Weight Gain (g)")
supp_plot
#Boxplots combining gain, diet and supplement
comb_plot <- ggplot(growth, aes(x=supplement, y=gain, fill=supplement)) +
geom_boxplot() +
scale_fill_brewer(palette="Dark2") +
geom_jitter(shape=16, position=position_jitter(0.2)) +
facet_wrap(~diet) +
theme(axis.text.x = element_text(angle=45))+
labs(x="Supplement Type", y="Weight Gain (g)")
comb_plot
anova(lm(gain~diet*supplement, data = growth))
## Analysis of Variance Table
##
## Response: gain
## Df Sum Sq Mean Sq F value Pr(>F)
## diet 2 287.171 143.586 83.5201 2.999e-14 ***
## supplement 3 91.881 30.627 17.8150 2.952e-07 ***
## diet:supplement 6 3.406 0.568 0.3302 0.9166
## Residuals 36 61.890 1.719
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
interaction.plot(growth$supplement, growth$diet, growth$gain, type="b", col=c("red","blue", "green"),
main = "Interaction between Diet and Supplement Type",
xlab = "Supplement", ylab = "Gain (g)", trace.label = "Diet")
interaction.plot(growth$diet, growth$supplement, growth$gain, type="b", col=c("red","blue","green","purple"),
main = "Interaction between Diet and Supplement Type",
xlab = "Diet", ylab = "Gain", trace.label = "Supplement")
anova(lm(gain ~ diet + supplement, data = growth))
## Analysis of Variance Table
##
## Response: gain
## Df Sum Sq Mean Sq F value Pr(>F)
## diet 2 287.171 143.586 92.358 4.199e-16 ***
## supplement 3 91.881 30.627 19.700 3.980e-08 ***
## Residuals 42 65.296 1.555
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summary(lm(gain ~ diet + supplement, data = growth))
##
## Call:
## lm(formula = gain ~ diet + supplement, data = growth)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.30792 -0.85929 -0.07713 0.92052 2.90615
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 26.1230 0.4408 59.258 < 2e-16 ***
## dietoats -3.0928 0.4408 -7.016 1.38e-08 ***
## dietwheat -5.9903 0.4408 -13.589 < 2e-16 ***
## supplementcontrol -2.6967 0.5090 -5.298 4.03e-06 ***
## supplementsupergain -3.3815 0.5090 -6.643 4.72e-08 ***
## supplementsupersupp -0.7274 0.5090 -1.429 0.16
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.247 on 42 degrees of freedom
## Multiple R-squared: 0.8531, Adjusted R-squared: 0.8356
## F-statistic: 48.76 on 5 and 42 DF, p-value: < 2.2e-16
TukeyHSD(aov(gain ~ diet + supplement, data = growth))
## Tukey multiple comparisons of means
## 95% family-wise confidence level
##
## Fit: aov(formula = gain ~ diet + supplement, data = growth)
##
## $diet
## diff lwr upr p adj
## oats-barley -3.092817 -4.163817 -2.021817 0e+00
## wheat-barley -5.990298 -7.061298 -4.919297 0e+00
## wheat-oats -2.897481 -3.968481 -1.826481 2e-07
##
## $supplement
## diff lwr upr p adj
## control-agrimore -2.6967005 -4.0583332 -1.3350677 0.0000234
## supergain-agrimore -3.3814586 -4.7430914 -2.0198259 0.0000003
## supersupp-agrimore -0.7273521 -2.0889849 0.6342806 0.4888738
## supergain-control -0.6847581 -2.0463909 0.6768746 0.5400389
## supersupp-control 1.9693484 0.6077156 3.3309811 0.0020484
## supersupp-supergain 2.6541065 1.2924737 4.0157392 0.0000307
#Problem 1
1. In a study of hyena laughter, a researcher investigated whether sound spectral properties of hyenas' giggles are associated with age. Age (years): 2, 2, 2, 6, 9, 10, 13, 10, 14, 14, 12, 7, 11, 11, 14, 20. Frequency (Hz): 840, 670, 580, 470, 540, 660, 510, 520, 500, 480, 400, 650, 460, 500, 580, 500. a) Inspect the data using a scatterplot b) Test the linear association between both variables c) Assuming that the data are not normally distributed, test the linear association using a non-parametric correlation coefficient
age <- c(2,2,2,6,9,10,13,10,14,14,12,7,11,11,14,20)
freq <- c(840,670,580,470,540,660,510,520,500,480,400,650,460,500,580,500)
#Part A
plot(age, freq, main = "Hyena Laughter", xlab = "Age (years)", ylab = "Frequency (Hz)")
#Part B
cor.test(age, freq, method="pearson")
##
## Pearson's product-moment correlation
##
## data: age and freq
## t = -2.8194, df = 14, p-value = 0.01365
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.8453293 -0.1511968
## sample estimates:
## cor
## -0.601798
#Part C
cor.test(age, freq, method="spearman")
## Warning in cor.test.default(age, freq, method = "spearman"): Cannot compute
## exact p-value with ties
##
## Spearman's rank correlation rho
##
## data: age and freq
## S = 1047.1, p-value = 0.03092
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
## rho
## -0.5397807
#Problem 1
1. Testosterone is known to predict aggressive behavior and is involved in face shape during puberty. Does face shape predict aggression? Researchers measured the face width-to-height ratio of 21 university hockey players along with the average number of penalty minutes awarded per game for aggressive infractions. a) How steeply does the number of penalty minutes increase per unit increase in face ratio? Calculate the estimate of the intercept. Write the result in the form of an equation for the line. b) How many degrees of freedom does this analysis have? c) What is the t-statistic? d) What is your final conclusion about the slope? e) What is the variation explained, R2? f) Make a scatterplot of the data and include a linear regression line g) Check the model assumptions
# Read Table
testosterone<-read.table("face.txt", sep="\t", header=T)
head(testosterone)
## Face Penalty
## 1 1.59 0.44
## 2 1.67 1.43
## 3 1.65 1.57
## 4 1.72 0.14
## 5 1.79 0.27
## 6 1.77 0.35
# Explore the data with a scatterplot
plot <- ggplot(testosterone, aes(x=Face, y=Penalty)) + geom_point()
plot
#write a linear model and the summary stats
model_t <- lm(Penalty ~ Face, data=testosterone)
summary(model_t)
##
## Call:
## lm(formula = Penalty ~ Face, data = testosterone)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.9338 -0.3883 -0.1260 0.3381 1.0711
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -4.505 2.089 -2.157 0.0440 *
## Face 3.189 1.150 2.774 0.0121 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.6129 on 19 degrees of freedom
## Multiple R-squared: 0.2882, Adjusted R-squared: 0.2508
## F-statistic: 7.694 on 1 and 19 DF, p-value: 0.0121
# Part A: Penalty = 3.189 * Face - 4.505 (slope = 3.189 penalty minutes per unit increase in face ratio; intercept = -4.505)
# Part B -> 19 degrees of freedom
# Part C -> t = 2.774 for the slope (7.694 is the F-statistic, not the t-statistic)
# Part D -> The slope is significantly different from zero (p = 0.0121): penalty minutes increase with face ratio.
# Part E -> R^2 = 0.2882
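As a sanity check on the fitted line, the model can predict penalty minutes for a hypothetical face ratio of 2.0 (the value 2.0 is invented here for illustration):
predict(model_t, newdata = data.frame(Face = 2.0)) # approximately 3.189*2.0 - 4.505 = 1.87 penalty minutes, using the rounded coefficients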
# Part F
plot2 <- ggplot(testosterone, aes(x=Face, y=Penalty)) +
geom_point() +
stat_smooth(method = "lm", col = "red", se=F)
plot2
## `geom_smooth()` using formula = 'y ~ x'
# Part G
library(performance)
check_model(lm(Penalty ~ Face, data=testosterone))
#Problem 2
resp <-read.table("respiratoryrate_bodymass.txt", sep="\t", header=T)
head(resp)
## BodyMass Respiration
## 1 453 666
## 2 1283 643
## 3 695 1512
## 4 1640 2198
## 5 1207 2535
## 6 2096 4176
#Part A
resp_scatter <- ggplot(resp, aes(x=BodyMass, y=Respiration)) +
geom_point()
resp_scatter
#Part B
lin_scatter <- ggplot(resp, aes(x=log(BodyMass), y=log(Respiration))) +
geom_point() +
stat_smooth(method = "lm", col = "red", se=F)
lin_scatter
## `geom_smooth()` using formula = 'y ~ x'
#Part C
# Linear regression to estimate beta
fit <- lm(log(Respiration) ~ log(BodyMass), data=resp)
beta <- coef(fit)[2]
beta
## log(BodyMass)
## 0.8574374
#Part D
summary(fit)
##
## Call:
## lm(formula = log(Respiration) ~ log(BodyMass), data = resp)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.01062 -0.08150 0.00831 0.30669 0.43948
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.3401 1.2011 1.116 0.296938
## log(BodyMass) 0.8574 0.1570 5.461 0.000601 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.4449 on 8 degrees of freedom
## Multiple R-squared: 0.7885, Adjusted R-squared: 0.762
## F-statistic: 29.82 on 1 and 8 DF, p-value: 0.000601
#The null hypothesis is that beta=0; the p-value (0.0006) is significant, so the null hypothesis is rejected.
#This indicates a statistically significant linear relationship between log(Respiration) and log(BodyMass), and that the scaling exponent beta is not zero.
#Part E -> 0.7885
summary(fit)$r.squared
## [1] 0.7884663
#Part F
library(performance)
check_model(fit)
#Problem 1
1. Use the ozone.txt data to model the ozone concentration as a linear function of wind speed, air temperature and the intensity of solar radiation. Assume that the requirements to perform a linear regression are met. a) Make a multiple panel bivariate scatterplot b) Perform a multiple linear regression c) What is the variation explained, R2? d) Assess the collinearity of the explanatory variables using the variance inflation factor e) Check the model assumptions
ozone.data<-read.table("ozone (1).txt", sep="\t", header=T)
#Part A
pairs(ozone.data)
#Part B
ozone.lm<-lm(ozone ~ rad+temp+wind, data=ozone.data)
#Part C -> 0.6062
summary(ozone.lm)
##
## Call:
## lm(formula = ozone ~ rad + temp + wind, data = ozone.data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -40.485 -14.210 -3.556 10.124 95.600
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -64.23208 23.04204 -2.788 0.00628 **
## rad 0.05980 0.02318 2.580 0.01124 *
## temp 1.65121 0.25341 6.516 2.43e-09 ***
## wind -3.33760 0.65384 -5.105 1.45e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 21.17 on 107 degrees of freedom
## Multiple R-squared: 0.6062, Adjusted R-squared: 0.5952
## F-statistic: 54.91 on 3 and 107 DF, p-value: < 2.2e-16
#Part D
library(car)
## Loading required package: carData
##
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
##
## recode
## The following object is masked from 'package:purrr':
##
## some
vif(ozone.lm)
## rad temp wind
## 1.095241 1.431201 1.328979
#Part E
library(performance)
check_model(ozone.lm)
## Variable `Component` is not in your data frame :/
#Problem 2
2. Use the diminish.txt data (xv is explanatory, yv is response variable) to: a) Perform a simple linear regression b) Perform a polynomial (second-degree) regression c) Compare both models with Akaike’s Information Criterion (AIC). Which model is better? d) Make a scatterplot of the data and include both regression lines
diminish <-read.table("diminish.txt", sep="\t", header=T)
head(diminish)
## xv yv
## 1 5 26
## 2 10 29
## 3 15 31
## 4 20 30
## 5 25 35
## 6 30 37
diminish.lm <- lm(yv ~ xv, data=diminish)
summary(diminish.lm)
##
## Call:
## lm(formula = yv ~ xv, data = diminish)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.3584 -2.1206 -0.3218 1.8763 4.1782
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 27.61438 1.17315 23.54 7.66e-14 ***
## xv 0.22683 0.02168 10.46 1.45e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.386 on 16 degrees of freedom
## Multiple R-squared: 0.8725, Adjusted R-squared: 0.8646
## F-statistic: 109.5 on 1 and 16 DF, p-value: 1.455e-08
# Make a scatterplot with the fitted linear regression line
library(ggplot2)
ggplot(aes(x=xv, y=yv), data = diminish) +
geom_point() +
stat_smooth(method = "lm", col = "red",se=F)+
ggtitle("Linear Regression")+
theme(plot.title = element_text(hjust = 0.5))
## `geom_smooth()` using formula = 'y ~ x'
diminish.poly <- lm(yv ~ xv + I(xv^2), data=diminish)
summary(diminish.poly)
##
## Call:
## lm(formula = yv ~ xv + I(xv^2), data = diminish)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.0490 -1.2890 0.6018 1.1829 2.9610
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 24.6323529 1.6916139 14.561 2.95e-10 ***
## xv 0.4057534 0.0819860 4.949 0.000175 ***
## I(xv^2) -0.0018834 0.0008386 -2.246 0.040203 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.131 on 15 degrees of freedom
## Multiple R-squared: 0.9046, Adjusted R-squared: 0.8919
## F-statistic: 71.12 on 2 and 15 DF, p-value: 2.222e-08
ggplot(diminish, aes(x=xv, y=yv)) +
geom_point() +
stat_smooth(method='lm', formula = y ~ poly(x,2), linewidth = 1) +
ggtitle("Polynomial Regression")+
theme(plot.title = element_text(hjust = 0.5))
AIC(diminish.poly,diminish.lm) #the lower AIC, the better
## df AIC
## diminish.poly 4 83.04403
## diminish.lm 3 86.26193
The polynomial regression has the lower AIC and is therefore the better model.
ggplot(diminish, aes(x=xv, y=yv)) +
geom_point() +
stat_smooth(method='lm', formula = y ~ poly(x,2), linewidth = 1) +
stat_smooth(method='lm', formula = y ~ x, linewidth = 1, color="red") +
geom_text(aes(x=8, y=25, label="Poly"), color="blue") +
geom_text(aes(x=8, y=32, label="Linear"), color="red") +
labs(x="Explanatory Variable (xv)", y="Response Variable (yv)") +
theme_classic()
#Problem 3
3. The data in stork.txt display the stress-induced corticosterone levels circulating in the blood of European white storks and their survival over the subsequent five years of study. a) Make a scatterplot of the data b) Which type of regression model is suitable for these data? c) Perform an appropriate regression to predict survival from corticosterone d) What is the pseudo-R2 of the model? e) What is the p-value of the model? f) Include the predicted curve in the scatterplot g) Check the model assumptions
# Read Data
stork<-read.table("stork.txt", sep="\t", header=T)
head(stork)
## Corticosterone Survival
## 1 26.0 1
## 2 28.2 1
## 3 29.8 1
## 4 34.9 1
## 5 34.9 1
## 6 35.9 1
ggplot(stork, aes(x=Corticosterone, y=Survival)) +
geom_point()
The response variable is binary (0/1), so a logistic regression is suitable.
stork_glm <- glm(Survival~Corticosterone, family=binomial, stork)
summary(stork_glm)
##
## Call:
## glm(formula = Survival ~ Corticosterone, family = binomial, data = stork)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.4316 -0.8724 -0.6703 1.2125 1.8211
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 2.70304 1.74725 1.547 0.1219
## Corticosterone -0.07980 0.04368 -1.827 0.0677 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 45.234 on 33 degrees of freedom
## Residual deviance: 41.396 on 32 degrees of freedom
## AIC: 45.396
##
## Number of Fisher Scoring iterations: 4
1-(stork_glm$deviance/stork_glm$null.deviance)
## [1] 0.084852
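The same McFadden pseudo-R2 can be obtained from log-likelihoods; for a binary logistic model the deviance equals -2 * logLik, so this is a quick cross-check of the deviance ratio above:
null_glm <- update(stork_glm, . ~ 1) # intercept-only model
1 - as.numeric(logLik(stork_glm)) / as.numeric(logLik(null_glm)) # approximately 0.085, matching the value above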
anova(stork_glm, test='Chisq')
## Analysis of Deviance Table
##
## Model: binomial, link: logit
##
## Response: Survival
##
## Terms added sequentially (first to last)
##
##
## Df Deviance Resid. Df Resid. Dev Pr(>Chi)
## NULL 33 45.234
## Corticosterone 1 3.8382 32 41.396 0.0501 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
ggplot(stork, aes(x=Corticosterone, y=Survival)) +
geom_point() +
geom_smooth(method = "glm",
method.args = list(family = "binomial"),
se = TRUE)
## `geom_smooth()` using formula = 'y ~ x'
check_model(stork_glm)
library(DHARMa) #model diagnostics for glm
## This is DHARMa 0.4.6. For overview type '?DHARMa'. For recent changes, type news(package = 'DHARMa')
simulateResiduals(stork_glm, plot = T)
## Object of Class DHARMa with simulated residuals based on 250 simulations with refit = FALSE . See ?DHARMa::simulateResiduals for help.
##
## Scaled residual values: 0.7302745 0.6874525 0.7807209 0.8907125 0.8503342 0.6393513 0.8988643 0.7981121 0.8541423 0.8065418 0.882481 0.9436615 0.920374 0.1942313 0.2127305 0.1329645 0.04536652 0.1618905 0.4134033 0.3203154 ...
#Problem 4
clusters<-read.table("clusters.txt", sep="\t", header=T)
hist(clusters$Cancers)
#Part A
plot(clusters$Distance, clusters$Cancers)
#Part B -> count data = Poisson regression
#Part C -> Distance effect not significant (p = 0.094)
clusters.glm<-glm(Cancers~Distance, family=poisson, clusters)
summary(clusters.glm)
##
## Call:
## glm(formula = Cancers ~ Distance, family = poisson, data = clusters)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.5504 -1.3491 -1.1553 0.3877 3.1304
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.186865 0.188728 0.990 0.3221
## Distance -0.006138 0.003667 -1.674 0.0941 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for poisson family taken to be 1)
##
## Null deviance: 149.48 on 93 degrees of freedom
## Residual deviance: 146.64 on 92 degrees of freedom
## AIC: 262.41
##
## Number of Fisher Scoring iterations: 5
#Part D
1-(clusters.glm$deviance/clusters.glm$null.deviance)
## [1] 0.01900423
library(performance)
r2(clusters.glm)
## # R2 for Generalized Linear Regression
## Nagelkerke's R2: 0.037
#Part E
anova(clusters.glm, test='Chisq')
## Analysis of Deviance Table
##
## Model: poisson, link: log
##
## Response: Cancers
##
## Terms added sequentially (first to last)
##
##
## Df Deviance Resid. Df Resid. Dev Pr(>Chi)
## NULL 93 149.48
## Distance 1 2.8408 92 146.64 0.0919 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#Part F
library(ggplot2)
ggplot(aes(x=Distance, y=Cancers), data=clusters)+
geom_point()+
geom_smooth(method="glm", method.args = list(family = "poisson"), se=T)
## `geom_smooth()` using formula = 'y ~ x'
#Part G
check_overdispersion(clusters.glm)
## # Overdispersion test
##
## dispersion ratio = 1.555
## Pearson's Chi-Squared = 143.071
## p-value = 0.001
## Overdispersion detected.
#Part H
library(DHARMa)
simulateResiduals(clusters.glm, plot = T)
## Object of Class DHARMa with simulated residuals based on 250 simulations with refit = FALSE . See ?DHARMa::simulateResiduals for help.
##
## Scaled residual values: 0.1887512 0.05144332 0.3351128 0.3629404 0.02498429 0.3534021 0.1981034 0.1886604 0.3388054 0.4883336 0.2663416 0.06092614 0.2587126 0.3506601 0.1793212 0.01476056 0.3826403 0.0440561 0.1791483 0.2383648 ...
library(MASS)
##
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
##
## select
clusters.glm.nb<-glm.nb(Cancers ~ Distance, data=clusters)
summary(clusters.glm.nb)
##
## Call:
## glm.nb(formula = Cancers ~ Distance, data = clusters, init.theta = 1.359466981,
## link = log)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.3103 -1.1805 -1.0442 0.3065 1.9582
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.182490 0.252434 0.723 0.470
## Distance -0.006041 0.004727 -1.278 0.201
##
## (Dispersion parameter for Negative Binomial(1.3595) family taken to be 1)
##
## Null deviance: 96.647 on 93 degrees of freedom
## Residual deviance: 94.973 on 92 degrees of freedom
## AIC: 253.19
##
## Number of Fisher Scoring iterations: 1
##
##
## Theta: 1.359
## Std. Err.: 0.612
##
## 2 x log-likelihood: -247.191
# Dispersion check: residual deviance / df = 94.973/92 = 1.03, close to 1, so no overdispersion remains
#Part I
check_overdispersion(clusters.glm.nb)
## # Overdispersion test
##
## dispersion ratio = 0.919
## Pearson's Chi-Squared = 84.593
## p-value = 0.696
## No overdispersion detected.
r2(clusters.glm.nb)
## # R2 for Generalized Linear Regression
## Nagelkerke's R2: 0.027
library(DHARMa)
simulateResiduals(clusters.glm.nb, plot = T)
## Object of Class DHARMa with simulated residuals based on 250 simulations with refit = FALSE . See ?DHARMa::simulateResiduals for help.
##
## Scaled residual values: 0.1691706 0.08785123 0.2964165 0.1319477 0.5308209 0.07382625 0.1803357 0.3979211 0.2507544 0.1048807 0.1318335 0.07170322 0.5260157 0.2110324 0.1405788 0.424706 0.076321 0.02451732 0.4505731 0.1809262 ...
ggplot(aes(x=Distance, y=Cancers), data=clusters)+
geom_point()+
geom_smooth(method="glm.nb", se=T)
## `geom_smooth()` using formula = 'y ~ x'
#Problem 5
5. Use the jaws.txt data to: a) Make a scatterplot of the data (age explanatory, bone response) b) Perform a non-linear regression assuming an asymptotic exponential relationship: \(y = a(1 - e^{-cx})\) c) Perform a non-linear regression assuming a Michaelis-Menten model: \(y = \frac{ax}{1 + bx}\) d) Estimate the percentage of variation explained by both models (comparing them with a null model with only a constant) e) Compare both models with Akaike's Information Criterion (AIC). Which model is better? f) Make a scatterplot of the data and include both regression lines
# Read dataset
jaws <- read.table('jaws.txt', sep="\t", header = T)
head(jaws)
## age bone
## 1 0.000000 0.00000
## 2 5.112000 20.22000
## 3 1.320000 11.11130
## 4 35.240000 140.65000
## 5 1.632931 26.15218
## 6 2.297635 10.00100
plot(bone~age, data=jaws, xlab='Age', ylab='Bone')
jaws_nls<-nls(bone~(a*(1-exp(-1*c*age))), start = list(a=100, c=0.1),jaws)
summary(jaws_nls)
##
## Formula: bone ~ (a * (1 - exp(-1 * c * age)))
##
## Parameters:
## Estimate Std. Error t value Pr(>|t|)
## a 115.58059 2.84365 40.645 < 2e-16 ***
## c 0.11882 0.01233 9.635 3.69e-13 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 13.1 on 52 degrees of freedom
##
## Number of iterations to convergence: 4
## Achieved convergence tolerance: 4.023e-06
jaws_mmm<-nls(bone~(a*age/(1+b*age)), start = list(a=100, b=0.1),jaws)
summary(jaws_mmm)
##
## Formula: bone ~ (a * age/(1 + b * age))
##
## Parameters:
## Estimate Std. Error t value Pr(>|t|)
## a 18.72524 2.52585 7.413 1.09e-09 ***
## b 0.13596 0.02339 5.814 3.79e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 13.77 on 52 degrees of freedom
##
## Number of iterations to convergence: 6
## Achieved convergence tolerance: 7.774e-06
# First, we calculate each model's residual sum of squares:
# RSS = (residual standard error)^2 * (residual degrees of freedom), both reported by summary()
rss_1<-13.1^2*52 # for asymptotic exponential relationship
rss_2<-13.77^2*52 # for Michaelis-Menten model
#Then, we calculate the total sum of squares by fitting a null model with only a constant
null<-lm(bone~1, data=jaws)
summary(null)
##
## Call:
## lm(formula = bone ~ 1, data = jaws)
##
## Residuals:
## Min 1Q Median 3Q Max
## -93.979 -8.844 9.872 21.775 48.021
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 93.979 4.541 20.7 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 33.37 on 53 degrees of freedom
tss<-33.37^2*53
#Finally, the estimated percentage of variation explained is
100*(tss-rss_1)/tss #for asymptotic exponential relationship
## [1] 84.8798
100*(tss-rss_2)/tss #for Michaelis-Menten model
## [1] 83.2936
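The same percentages can be computed without retyping rounded values, since deviance() returns the residual sum of squares for both nls and lm fits (a sketch using the objects defined above):
rss_1_exact <- deviance(jaws_nls) # RSS of the asymptotic exponential model
rss_2_exact <- deviance(jaws_mmm) # RSS of the Michaelis-Menten model
tss_exact <- deviance(null) # total sum of squares from the intercept-only model
100*(tss_exact - rss_1_exact)/tss_exact
100*(tss_exact - rss_2_exact)/tss_exact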
AIC(jaws_nls,jaws_mmm) #the lower AIC, the better
## df AIC
## jaws_nls 3 435.0823
## jaws_mmm 3 440.4066
The non-linear regression assuming an asymptotic exponential relationship is the better model, since it has the lower AIC.
# plot the jaws data
plot(jaws)
# add the fitted asymptotic exponential curve
lines(seq(0, 50, 0.1), predict(jaws_nls, list(age=seq(0, 50, 0.1))), col="red", lwd=3)
# add the fitted Michaelis-Menten curve
lines(seq(0, 50, 0.1), predict(jaws_mmm, list(age=seq(0, 50, 0.1))), col="blue", lwd=3)
legend("topleft",
legend=c("Asymptotic", "MMM"),
col=c("red", "blue"),
lwd=3)
#Problem 6
6. In a recent paper we read: “Linear mixed effects modelling fit by restricted maximum likelihood was used to explain the variations in growth. The linear mixed effects model was generated using the lmer function in the R package lme4, with turbidity, temperature, tide, and wave action set as fixed factors and site and date set as random effects”. Write down the R code to recreate their model.
library(lme4)
## Loading required package: Matrix
##
## Attaching package: 'Matrix'
## The following objects are masked from 'package:tidyr':
##
## expand, pack, unpack
#model <- lmer(growth ~ turbidity + temperature + tide + wave_action + (1|site) + (1|date), data = growth_data)
#Problem 7
7. Researchers at the University of Arizona want to assess the germination rate of saguaros using a factorial design, with 3 levels of soil type (remnant, cultivated and restored) and 2 levels of sterilization (yes or no). The same experimental design was deployed in 4 different greenhouses. Each of the unique treatments was replicated in 5 pots. 6 seeds were planted in each pot. a) How many fixed treatments (unique combinations) exist? b) What is the total number of pots? c) What is the total number of plants measured? d) Write the R code that you would use to analyze these data
a) 3 levels of soil type and 2 levels of sterilization give 6 unique combinations: \(3 \times 2 = 6\)
b) Each of the 6 treatment combinations had 5 pots in each of 4 greenhouses: \(6 \times 5 \times 4 = 120\) pots
c) 6 seeds were planted in each of the 120 pots: \(6 \times 120 = 720\) plants
d) There are 2 fixed factors (with their interaction) and nested random effects (pot within greenhouse), as in the model below.
# glmer(germination ~ soil * sterilization + (1|Greenhouse) + (1|Greenhouse:Pot), family=binomial, data = problem)
#Problem 8
8. Check these hypothetical data from 4 subjects relating number of previous performances to negative affect. a) What does the thick dashed black line represent? b) What is the solid black line depicting? c) What do the 4 thin dashed lines represent?
a) The thick dashed black line represents a simple linear regression fitted to all the data, ignoring subject.
b) The solid black line is the population-level (fixed-effect) fit from the mixed-effects model, i.e. the average of the four subject-specific lines.
c) The four thin dashed lines represent the subject-specific fits (random-intercept levels) of the mixed-effects model.
#Problem 9
load("dragons.RData")
head(dragons)
## testScore bodyLength mountainRange X site
## 1 16.147309 165.5485 Bavarian NA a
## 2 33.886183 167.5593 Bavarian NA a
## 3 6.038333 165.8830 Bavarian NA a
## 4 18.838821 167.6855 Bavarian NA a
## 5 33.862328 169.9597 Bavarian NA a
## 6 47.043246 168.6887 Bavarian NA a
dragon_lm <- lm(testScore ~ bodyLength, data = dragons)
summary(dragon_lm)
##
## Call:
## lm(formula = testScore ~ bodyLength, data = dragons)
##
## Residuals:
## Min 1Q Median 3Q Max
## -56.962 -16.411 -0.783 15.193 55.200
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -61.31783 12.06694 -5.081 5.38e-07 ***
## bodyLength 0.55487 0.05975 9.287 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 21.2 on 478 degrees of freedom
## Multiple R-squared: 0.1529, Adjusted R-squared: 0.1511
## F-statistic: 86.25 on 1 and 478 DF, p-value: < 2.2e-16
library(ggplot2)
ggplot(dragons, aes(x=bodyLength, y=testScore)) +
geom_point() +
geom_smooth(method=lm, se=T) # Add linear regression line
## `geom_smooth()` using formula = 'y ~ x'
ggplot(data = dragons, mapping = aes(y = testScore, x = mountainRange, fill = mountainRange))+
geom_boxplot()
ggplot(dragons, aes(x=bodyLength, y=testScore, color = mountainRange)) +
geom_point() +
labs(color = "Mountain Range")
ggplot(dragons, aes(x=bodyLength, y=testScore, color = mountainRange)) +
geom_point() +
facet_wrap(~mountainRange)
dragon_lm_mr <- lm(testScore ~ bodyLength + mountainRange, data = dragons)
summary(dragon_lm_mr)
##
## Call:
## lm(formula = testScore ~ bodyLength + mountainRange, data = dragons)
##
## Residuals:
## Min 1Q Median 3Q Max
## -52.263 -9.926 0.361 9.994 44.488
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 20.83051 14.47218 1.439 0.15072
## bodyLength 0.01267 0.07974 0.159 0.87379
## mountainRangeCentral 36.58277 3.59929 10.164 < 2e-16 ***
## mountainRangeEmmental 16.20923 3.69665 4.385 1.43e-05 ***
## mountainRangeJulian 45.11469 4.19012 10.767 < 2e-16 ***
## mountainRangeLigurian 17.74779 3.67363 4.831 1.84e-06 ***
## mountainRangeMaritime 49.88133 3.13924 15.890 < 2e-16 ***
## mountainRangeSarntal 41.97841 3.19717 13.130 < 2e-16 ***
## mountainRangeSouthern 8.51961 2.73128 3.119 0.00192 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 14.96 on 471 degrees of freedom
## Multiple R-squared: 0.5843, Adjusted R-squared: 0.5773
## F-statistic: 82.76 on 8 and 471 DF, p-value: < 2.2e-16
library(lme4)
dragons_lm_mr2 <- lmer(testScore ~ bodyLength + (1|mountainRange), data = dragons)
summary(dragons_lm_mr2)
## Linear mixed model fit by REML ['lmerMod']
## Formula: testScore ~ bodyLength + (1 | mountainRange)
## Data: dragons
##
## REML criterion at convergence: 3991.2
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -3.4815 -0.6513 0.0066 0.6685 2.9583
##
## Random effects:
## Groups Name Variance Std.Dev.
## mountainRange (Intercept) 339.7 18.43
## Residual 223.8 14.96
## Number of obs: 480, groups: mountainRange, 8
##
## Fixed effects:
## Estimate Std. Error t value
## (Intercept) 43.70938 17.13489 2.551
## bodyLength 0.03316 0.07865 0.422
##
## Correlation of Fixed Effects:
## (Intr)
## bodyLength -0.924
The variance explained by the mountainRange random effect is its intercept variance divided by the total variance (intercept plus residual):
\(100 \times \frac{339.7}{339.7 + 223.8} = 60.28\%\), so 60.28% of the variation is explained by mountainRange.
r2(dragons_lm_mr2)
## # R2 for Mixed Models
##
## Conditional R2: 0.603
## Marginal R2: 0.001
check_model(dragons_lm_mr2)
# Define function to calculate fitted values for each mountain range
fitted_values <- function(dragons) {
predicted <- predict(dragons_lm_mr2, newdata = data.frame(bodyLength = dragons$bodyLength, mountainRange = dragons$mountainRange))
return(predicted)
}
# Add predicted values to original data frame
dragons$predicted <- fitted_values(dragons)
# Plot scatterplot with facet_wrap() by mountain range and fitted lines for each mountain range
ggplot(aes(y = testScore, x = bodyLength, color = mountainRange), data = dragons) +
geom_point() +
geom_line(aes(y = predicted), color = "black") +
theme_bw() +
labs(title = "Dragon Test Score and Body Length by Mountain Range",
color = "Mountain Range") +
theme(panel.grid = element_blank(),
legend.position = "bottom") +
facet_wrap(~mountainRange)
#Problem 10
10. Data in Estuaries.csv correspond to counts of invertebrates at 3-4 sites in each of 7 (randomly chosen) estuaries. a) Fit a linear mixed model with Total as response and Modification as explanatory, controlling for Estuary b) Estimate the R2 (conditional and marginal) of this model c) Plot the data with ggplot2 in a way that helps you understand the different effects d) Include the variable Site as a random effect. Do you think this corresponds to a crossed or a nested design? e) What are the R2 (conditional and marginal) of the model including Site f) Check the model assumptions g) Plot the data trying to include Site h) Transform the variable Hydroid to presence/absence data i) Fit a generalized linear mixed model (GLMM) with this transformed variable as Response and the same fixed and random effects as in d). [Hint: function glmer] j) Check the model assumptions
# load libraries:
library(ggplot2)
library(lme4)
library(performance)
library(DHARMa)
# Read data:
estuaries <- read.csv("Estuaries.csv", header = T)
estuaries$Site <- as.factor(estuaries$Site) # set the Site as a factor instead of numbers
head(estuaries)
## X Modification Estuary Site Hydroid Total Schizoporella.errata
## 1 1 Modified JAK 1 0 44 15
## 2 2 Modified JAK 1 0 42 8
## 3 3 Modified JAK 2 0 32 9
## 4 4 Modified JAK 2 0 44 14
## 5 5 Modified JAK 3 1 42 6
## 6 6 Modified JAK 3 1 48 12
estu.lmer<-lmer(Total ~ Modification + (1|Estuary), data=estuaries) #library(lme4)
summary(estu.lmer)
## Linear mixed model fit by REML ['lmerMod']
## Formula: Total ~ Modification + (1 | Estuary)
## Data: estuaries
##
## REML criterion at convergence: 394.7
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -2.3859 -0.7142 0.2766 0.5240 2.0456
##
## Random effects:
## Groups Name Variance Std.Dev.
## Estuary (Intercept) 55.12 7.424
## Residual 86.07 9.277
## Number of obs: 54, groups: Estuary, 7
##
## Fixed effects:
## Estimate Std. Error t value
## (Intercept) 40.973 4.727 8.668
## ModificationPristine -14.473 6.230 -2.323
##
## Correlation of Fixed Effects:
## (Intr)
## MdfctnPrstn -0.759
r2(estu.lmer) # library(performance)
## # R2 for Mixed Models
##
## Conditional R2: 0.553
## Marginal R2: 0.267
ggplot(estuaries, aes(x = Modification, y = Total)) +
geom_boxplot()+
geom_jitter(aes(color=Estuary))
estu.lmer.site<-lmer(Total ~ Modification + (1|Estuary/Site), data=estuaries) #implicit nesting
#summary(lmer(Total ~ Modification + (1|Estuary) + (1|Estuary:Site), data=estuaries)) #explicit nesting
summary(estu.lmer.site)
## Linear mixed model fit by REML ['lmerMod']
## Formula: Total ~ Modification + (1 | Estuary/Site)
## Data: estuaries
##
## REML criterion at convergence: 386.6
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -1.8686 -0.6687 0.1504 0.6505 1.9816
##
## Random effects:
## Groups Name Variance Std.Dev.
## Site:Estuary (Intercept) 49.85 7.061
## Estuary (Intercept) 47.59 6.899
## Residual 43.65 6.607
## Number of obs: 54, groups: Site:Estuary, 27; Estuary, 7
##
## Fixed effects:
## Estimate Std. Error t value
## (Intercept) 41.053 4.739 8.662
## ModificationPristine -14.553 6.232 -2.335
##
## Correlation of Fixed Effects:
## (Intr)
## MdfctnPrstn -0.760
r2(estu.lmer.site)
## # R2 for Mixed Models
##
## Conditional R2: 0.774
## Marginal R2: 0.270
simulateResiduals(estu.lmer.site, plot = T) #library(DHARMa)
## Object of Class DHARMa with simulated residuals based on 250 simulations with refit = FALSE . See ?DHARMa::simulateResiduals for help.
##
## Scaled residual values: 0.572 0.512 0.208 0.544 0.5 0.692 0.62 0.296 0.584 0.976 0.104 0.56 0.916 0.868 0.804 0.908 0.74 0.74 0.476 0.668 ...
check_model(estu.lmer.site) # library(performance)
### as jittered points only (no boxplot):
ggplot(aes(y=Total, x=Estuary, group = Estuary), data = estuaries)+
geom_jitter(aes(shape = Site, color = Estuary), size = 2)+
facet_wrap(~Modification)+
theme(axis.title.x = element_blank(), axis.text.x = element_text(size = 12, angle = 45, vjust = 0.5))
### as box plots:
ggplot(estuaries, aes(x = Modification, y = Total)) +
geom_boxplot()+
geom_jitter(aes(color=Estuary, shape=Site), size=2.5)
estuaries$HydroidPres <- estuaries$Hydroid > 0
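A quick tabulation shows how the new binary variable splits across observations:
table(estuaries$HydroidPres) # counts of FALSE (absent) and TRUE (present)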
hydro.glmm.site<-glmer(HydroidPres ~ Modification + (1|Estuary/Site), family=binomial, data=estuaries) #implicit nesting # library(lme4)
summary(hydro.glmm.site)
## Generalized linear mixed model fit by maximum likelihood (Laplace
## Approximation) [glmerMod]
## Family: binomial ( logit )
## Formula: HydroidPres ~ Modification + (1 | Estuary/Site)
## Data: estuaries
##
## AIC BIC logLik deviance df.resid
## 57.1 65.0 -24.5 49.1 50
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -1.06063 -0.25028 -0.05443 0.25475 0.98719
##
## Random effects:
## Groups Name Variance Std.Dev.
## Site:Estuary (Intercept) 14.45 3.802
## Estuary (Intercept) 1.13 1.063
## Number of obs: 54, groups: Site:Estuary, 27; Estuary, 7
##
## Fixed effects:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -5.710 2.419 -2.360 0.0183 *
## ModificationPristine 6.534 3.140 2.081 0.0375 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation of Fixed Effects:
## (Intr)
## MdfctnPrstn -0.888
r2(hydro.glmm.site) # performance
## # R2 for Mixed Models
##
## Conditional R2: 0.888
## Marginal R2: 0.358
simulateResiduals(hydro.glmm.site, plot = T) # library(DHARMa)
## Object of Class DHARMa with simulated residuals based on 250 simulations with refit = FALSE . See ?DHARMa::simulateResiduals for help.
##
## Scaled residual values: 0.6382768 0.3853122 0.8086934 0.01626812 0.9424474 0.9426142 0.5324421 0.3949867 0.82084 0.4799594 0.8009645 0.03378411 0.3224195 0.3157432 0.602992 0.6115054 0.276876 0.8050706 0.7838545 0.5106198 ...
#Problem 1
1. Write a function to calculate the 3-point moving average for an input vector. The formula is: y'_i = (y_(i-1) + y_i + y_(i+1)) / 3
# Define function to calculate 3-point moving average
moving_average_3 <- function(x) {
y <- rep(NA_real_, length(x)) # endpoints lack two neighbours, so they stay NA rather than a misleading 0
for (i in 2:(length(x)-1)) {
y[i] <- (x[i-1] + x[i] + x[i+1]) / 3
}
return(y)
}
# Test the function with a sample input vector
x <- c(1, 2, 3, 4, 5, 6, 7)
moving_average_3(x)
## [1] NA  2  3  4  5  6 NA
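As a cross-check (an alternative, not part of the required answer), base R's stats::filter() computes the same centred average in one vectorised call; the stats:: prefix is needed because dplyr masks filter():
as.numeric(stats::filter(x, rep(1/3, 3), sides = 2)) # NA 2 3 4 5 6 NA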
#Problem 2
2. Read the temp.txt file. The data correspond to monthly average temperatures. a) Plot the time series data. [Hint: first you need to create a monthly time series object] b) Calculate the 5-point moving average and plot it together with the time series c) Decompose the time series into seasonal, trend and residual error components d) Generate a temporal correlogram to assess the autocorrelation of the time series e) Generate a new correlogram but removing the trend and seasonal variation f) Find the best ARIMA model using the forecast package g) Estimate future values using the previous ARIMA model and plot the results
temp<-read.table("temp.txt", header=T)
temp.ts<-ts(temp$temps, frequency=12) #create monthly time series
#Part A
plot(temp.ts, ylab="Temp Over Months")
#Part B
library(TTR)
library(dplyr)
temp.ma<-SMA(temp.ts, n = 5)
plot(temp.ts, ylab="Temp Over Months") # original series
lines(temp.ma, col="red", lwd=2) # 5-point moving average overlaid on the series
#Part C
temp.decomp<-decompose(temp.ts)
plot(temp.decomp)
#Part D
acf(temp.ts)
#Part E
acf(temp.decomp$random[!is.na(temp.decomp$random)])
#Part F
library(forecast)
## Registered S3 method overwritten by 'quantmod':
## method from
## as.zoo.data.frame zoo
temp.fit<-auto.arima(temp.ts)
summary(temp.fit)
## Series: temp.ts
## ARIMA(0,0,1)(2,1,0)[12]
##
## Coefficients:
## ma1 sar1 sar2
## 0.2456 -0.5212 -0.3233
## s.e. 0.0774 0.0823 0.0827
##
## sigma^2 = 3.772: log likelihood = -300.76
## AIC=609.52 AICc=609.81 BIC=621.4
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE
## Training set 0.2384692 1.846384 1.498262 0.2682136 11.90101 0.8327783
## ACF1
## Training set 0.005413047
checkresiduals(temp.fit)
##
## Ljung-Box test
##
## data: Residuals from ARIMA(0,0,1)(2,1,0)[12]
## Q* = 38.688, df = 21, p-value = 0.01069
##
## Model df: 3. Total lags used: 24
# Note: the Ljung-Box p-value (0.011) indicates some autocorrelation remains in the residuals, so even the best ARIMA found is not a perfect fit.
#Part G
temp.fcast<-forecast(temp.fit)
plot(temp.fcast)
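By default forecast() projects two seasonal cycles ahead for a monthly series; an explicit horizon can be requested instead (a sketch; h = 24 is an arbitrary choice):
plot(forecast(temp.fit, h = 24)) # forecast 24 months ahead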
#Problem 3
3. Download the map of Spain. a) Make a basic map of Spain b) Get information on the population of Spanish cities and make a map of Spain including the locations and population of Spanish cities. c) Visit Spain and enjoy
#loading libraries:
library(maps)
##
## Attaching package: 'maps'
## The following object is masked from 'package:purrr':
##
## map
library(ggplot2)
library(viridis)
## Loading required package: viridisLite
##
## Attaching package: 'viridis'
## The following object is masked from 'package:maps':
##
## unemp
#Part A
spain <-map_data("world", region="spain")
ggplot(spain)+
geom_polygon(aes(x = long, y = lat, group=group), fill = "lightgray", colour = "black", size = 0.1)+
theme_void()+
labs(title="Spain")
#Part B
cities <-get('world.cities')
cities_spain <-cities[cities$country.etc== 'Spain',]
ggplot(spain) +
geom_polygon(aes(x = long, y = lat, group=group), fill='lightgray',color="black", size=0.5) +
geom_point(data=cities_spain, aes(x=long, y=lat, size=pop), color="blue", alpha=0.4) +
labs(title="Spanish population by city") +
theme_void()
#Part C -> :)
#Problem 4
4. Download GBIF georeferenced occurrence data of the Pyrenean desman (Galemys pyrenaicus) and make a map of its geographical distribution (Hint: it is only present in Spain, Andorra, Portugal and France).
library(rgbif)
library(maps)
library(ggplot2)
desman_gbif<-occ_search(scientificName = "Galemys pyrenaicus", hasCoordinate = T)
desman<-as.data.frame(desman_gbif$data[,c("decimalLatitude", "decimalLongitude")])
southEU_map<-map_data("world", region=c("Spain", "Portugal", "France", "Andorra"))
ggplot(southEU_map)+
geom_polygon(aes(x = long, y = lat, group=group), fill="lightgray")+
geom_point(data=desman, aes(x=decimalLongitude, y=decimalLatitude), color="red", alpha=0.4, size=2)+
theme_bw()
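Note that occ_search() returns at most 500 records by default, so the map above may be incomplete; raising the limit argument retrieves more occurrences (a sketch with an arbitrary limit):
desman_gbif_more<-occ_search(scientificName = "Galemys pyrenaicus", hasCoordinate = T, limit = 2000)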
#Problem 1
1. Load the dune_bio.txt dataset. Species should be in columns and sites in rows. a) Calculate the total number of individuals of all species b) Calculate the total number of individuals for each species c) Calculate the average number of individuals for each species d) Calculate the total number of individuals for each site e) Calculate the average number of individuals for each site f) Write a function to report the median number of individuals for each species and the median number of individuals for each site g) Use the vegan function decostand() to transform the dataset to relative abundances. How would you confirm the transformation worked? h) Use the vegan function decostand() to standardize the dataset into the range 0 to 1 i) Use the vegan function decostand() to standardize the dataset to mean=0 and variance=1
dune.bio<-read.table("dune_bio.txt", sep="\t", header=T, row.names=1)
#Part A
sum(dune.bio)
## [1] 685
#Part B
colSums(dune.bio)
## Belper Empnig Junbuf Junart Airpra Elepal Rumace Viclat Brarut Ranfla Cirarv
## 13 2 13 18 5 25 18 4 49 14 2
## Hyprad Leoaut Potpal Poapra Calcus Tripra Trirep Antodo Salrep Achmil Poatri
## 9 54 4 48 10 9 47 21 11 16 63
## Chealb Elyrep Sagpro Plalan Agrsto Lolper Alogen Brohor
## 1 26 20 26 48 58 36 15
#Part C
colMeans(dune.bio)
## Belper Empnig Junbuf Junart Airpra Elepal Rumace Viclat Brarut Ranfla Cirarv
## 0.65 0.10 0.65 0.90 0.25 1.25 0.90 0.20 2.45 0.70 0.10
## Hyprad Leoaut Potpal Poapra Calcus Tripra Trirep Antodo Salrep Achmil Poatri
## 0.45 2.70 0.20 2.40 0.50 0.45 2.35 1.05 0.55 0.80 3.15
## Chealb Elyrep Sagpro Plalan Agrsto Lolper Alogen Brohor
## 0.05 1.30 1.00 1.30 2.40 2.90 1.80 0.75
#Part D
rowSums(dune.bio)
## 2 13 4 16 6 1 8 5 17 15 10 11 9 18 3 20 14 19 12 7
## 42 33 45 33 48 18 40 43 15 23 43 32 42 27 40 31 24 31 35 40
#Part E
rowMeans(dune.bio)
## 2 13 4 16 6 1 8 5
## 1.4000000 1.1000000 1.5000000 1.1000000 1.6000000 0.6000000 1.3333333 1.4333333
## 17 15 10 11 9 18 3 20
## 0.5000000 0.7666667 1.4333333 1.0666667 1.4000000 0.9000000 1.3333333 1.0333333
## 14 19 12 7
## 0.8000000 1.0333333 1.1666667 1.3333333
#Part F
median_sps_sites<-function(x){
cat("The median number of individuals per sp is", apply(x, 2, median), "and for sites is", apply(x, 1, median))
}
median_sps_sites(dune.bio)
## The median number of individuals per species is 0 0 0 0 0 0 0 0 2 0 0 0 2 0 3 0 0 2 0 0 0 4 0 0 0 0 1.5 2 0 0 and for sites is 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
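A variant that returns the medians instead of printing them is easier to reuse downstream (an optional sketch, not required by the problem):
median_sps_sites2 <- function(x){
list(species = apply(x, 2, median), sites = apply(x, 1, median))
}
str(median_sps_sites2(dune.bio))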
#Part G
library(vegan)
## Loading required package: permute
## Loading required package: lattice
## This is vegan 2.6-4
dune.bio.relabu<-decostand(dune.bio, method = "total")
rowSums(dune.bio.relabu)
## 2 13 4 16 6 1 8 5 17 15 10 11 9 18 3 20 14 19 12 7
## 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
#Part H
dune.bio.range<-decostand(dune.bio, method = "range")
#Part I
dune.bio.stand<-decostand(dune.bio, method = "standardize")
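A quick check that the standardization worked (decostand's "standardize" operates column-wise by default): every species column should now have mean ~0 and standard deviation 1.
round(colMeans(dune.bio.stand), 10) # all ~0
round(apply(dune.bio.stand, 2, sd), 10) # all 1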
#Problem 2
2. Write a function to calculate the observed richness of a vector representing the abundances of different species in a community
richness <- function(x){
# specnumber() counts the non-zero abundances directly, so raw counts can
# be passed as-is; running decostand() on a plain vector first mangles it
return(specnumber(x))
}
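A quick sanity check on a made-up community of four species:
richness(c(5, 0, 3, 1)) # returns 3 (three species with non-zero abundance)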
#Problem 3
3. Write a function to calculate the Shannon-Wiener diversity index of a vector representing the abundances of different species in a community.
shannon_diversity <- function(x){
# diversity() normalizes the abundances internally, so no prior decostand()
# transformation is needed (and decostand() on a plain vector mangles it)
return(diversity(x, index="shannon"))
}
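As a cross-check (a sketch, not part of the original answer), the index can be computed directly from its formula H' = -sum(p_i * ln(p_i)):
shannon_manual <- function(x){
p <- x[x > 0] / sum(x) # relative abundances, dropping absent species
-sum(p * log(p))
}
shannon_manual(c(10, 5, 3, 2)) # matches shannon_diversity(c(10, 5, 3, 2))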
#Problem 4
4. Write a function that calculates both the observed richness and the Shannon-Wiener diversity index for each community in a matrix, and writes an output table with the results. You can use the vegan built-in functions. Test your function with the dune_bio.txt dataset.
richness_diversity <- function(x){
richness <- specnumber(x) # counts non-zero abundances per site; no transformation needed
shannon_diversity <- diversity(x, index = "shannon") # normalizes abundances internally
results <- data.frame(richness = richness, shannon_diversity = shannon_diversity)
return(results)
}
# Calculate richness and Shannon diversity for dune_bio
results <- richness_diversity(dune.bio)
results
## richness shannon_diversity
## 2 10 2.252516
## 13 10 2.099638
## 4 13 2.426779
## 16 8 1.959795
## 6 11 2.345946
## 1 5 1.440482
## 8 12 2.434898
## 5 14 2.544421
## 17 7 1.876274
## 15 8 1.979309
## 10 12 2.398613
## 11 9 2.106065
## 9 13 2.493568
## 18 9 2.079387
## 3 10 2.193749
## 20 8 2.048270
## 14 7 1.863680
## 19 9 2.134024
## 12 9 2.114495
## 7 13 2.471733
# Write the results to a table file
write.table(results, file = "richness_diversity_results.txt", sep = "\t", quote = FALSE, row.names = TRUE)
#Problem 5
5. Make a rank-abundance curve using the second site (13) of the dune_bio.txt dataset. Fit a lognormal model to the data.
# Load data:
dune.bio<-read.table("dune_bio.txt", sep="\t", header=T, row.names=1)
# Rank-abundance curve using the second site (13)
plot(radfit(dune.bio[2,]))
radfit(dune.bio[2,])
##
## RAD models, family poisson
## No. of species 10, total abundance 33
##
## par1 par2 par3 Deviance AIC BIC
## Null 3.91659 32.99179 32.99179
## Preemption 0.20971 2.02192 33.09712 33.39970
## Lognormal 0.99822 0.71061 1.13244 34.20764 34.81281
## Zipf 0.27866 -0.79375 0.79058 33.86578 34.47095
## Mandelbrot 0.40348 -0.95679 0.51608 0.75881 35.83401 36.74176
plot(rad.lognormal(dune.bio[2,]), lty=2, lwd=2) # Plot
rad.lognormal(dune.bio[2,]) # statistic of the visualization
##
## RAD model: Log-Normal
## Family: poisson
## No. of species: 10
## Total abundance: 33
##
## log.mu log.sigma Deviance AIC BIC
## 0.9982177 0.7106063 1.1324449 34.2076395 34.8128097
#Problem 6
6. Create a Euclidean distance matrix after standardization using the first (A1), second (Moisture) and fifth (Manure) variables of dune_env.txt. Then create a Bray-Curtis distance matrix using dune_bio.txt. Perform a Mantel test on both distance matrices and plot the relationship.
dune.bio<-read.table("dune_bio.txt", sep="\t", header=T, row.names=1)
dune.env<-read.table("dune_env.txt", sep="\t", header=T, row.names=1)
dune.env.euc<-vegdist(decostand(dune.env[,c(1,2,5)], method="standardize"), method="euclidean")
dune.bio.bray<-vegdist(dune.bio, method="bray")
mantel(dune.env.euc, dune.bio.bray)
##
## Mantel statistic based on Pearson's product-moment correlation
##
## Call:
## mantel(xdis = dune.env.euc, ydis = dune.bio.bray)
##
## Mantel statistic r: 0.5496
## Significance: 0.001
##
## Upper quantiles of permutations (null model):
## 90% 95% 97.5% 99%
## 0.142 0.186 0.219 0.251
## Permutation: free
## Number of permutations: 999
plot(dune.env.euc, dune.bio.bray)
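To make the positive relationship easier to see, a simple linear trend can be overlaid on the scatterplot (purely a visual aid; the Mantel test itself is permutation-based):
abline(lm(as.vector(dune.bio.bray) ~ as.vector(dune.env.euc)), col="red", lwd=2)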
#Problem 1
1. Make dendrograms of the results of hierarchical clustering using the single, average and complete methods. Use the dune_bio.txt dataset after creating a Bray-Curtis distance matrix.
# Load data:
dune.bio<-read.table("dune_bio.txt", sep="\t", header=T, row.names=1)
# Data for the clustering
dune.bio_veg <-vegdist(dune.bio, method = "bray")
#Single method
single_dune.bio_veg <- hclust(dune.bio_veg, method="single")
plot(single_dune.bio_veg, main="Single method")
#Average method
avg_dune.bio_veg <-hclust(dune.bio_veg, method="average")
plot(avg_dune.bio_veg, main="Average method")
#Complete method
com_dune.bio_veg <-hclust(dune.bio_veg, method="complete")
plot(com_dune.bio_veg, main="Complete method")
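An optional comparison of the three trees (not asked for in the problem): the cophenetic correlation measures how faithfully each dendrogram preserves the original Bray-Curtis distances; average linkage usually scores highest.
cor(dune.bio_veg, cophenetic(single_dune.bio_veg))
cor(dune.bio_veg, cophenetic(avg_dune.bio_veg))
cor(dune.bio_veg, cophenetic(com_dune.bio_veg))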
#Problem 2
2. Perform k-means partitions from 2 to 6 on the dune_bio.txt dataset and select the best partition using the Calinski criterion. Visualize the results
# Load data:
dune.bio<-read.table("dune_bio.txt", sep="\t", header=T, row.names=1)
# Non-hierarchical clustering
dune.bio.nhc <- cascadeKM(dune.bio, inf.gr=2, sup.gr=6)
dune.bio.nhc$results
## 2 groups 3 groups 4 groups 5 groups 6 groups
## SSE 1218.500000 950.516667 777.333333 676.500000 593.750000
## calinski 5.611243 5.793253 5.633047 5.110033 4.737482
The maximum Calinski criterion value (5.793253) occurs at 3 groups, so the 3-group partition is the best.
plot(dune.bio.nhc, sortg=T)
#Problem 1
1. Load the varechem dataset within the R package vegan. This data frame collects soil characteristics. Given the nature of this dataset perform a PCA or a CA ordination analysis. a) How much variation is explained by the two first axes? b) Make a screeplot of the results c) Plot the ordination results of the sites d) Plot both the sites scores and the soil characteristics scores focusing on the soil variables
# Load Libraries:
library(vegan)
# Load data:
data(varechem)
head(varechem)
## N P K Ca Mg S Al Fe Mn Zn Mo Baresoil Humdepth
## 18 19.8 42.1 139.9 519.4 90.0 32.3 39.0 40.9 58.1 4.5 0.3 43.9 2.2
## 15 13.4 39.1 167.3 356.7 70.7 35.2 88.1 39.0 52.4 5.4 0.3 23.6 2.2
## 24 20.2 67.7 207.1 973.3 209.1 58.1 138.0 35.4 32.1 16.8 0.8 21.2 2.0
## 27 20.6 60.8 233.7 834.0 127.2 40.7 15.4 4.4 132.0 10.7 0.2 18.7 2.9
## 23 23.8 54.5 180.6 777.0 125.8 39.5 24.2 3.0 50.1 6.6 0.3 46.0 3.0
## 19 22.8 40.9 171.4 691.8 151.4 40.8 104.8 17.6 43.6 9.1 0.4 40.5 3.8
## pH
## 18 2.7
## 15 2.8
## 24 3.0
## 27 2.8
## 23 2.7
## 19 2.7
varechem_pca<-rda(varechem, scale=T) # vegan function
summary(varechem_pca)
##
## Call:
## rda(X = varechem, scale = T)
##
## Partitioning of correlations:
## Inertia Proportion
## Total 14 1
## Unconstrained 14 1
##
## Eigenvalues, and their contribution to the correlations
##
## Importance of components:
## PC1 PC2 PC3 PC4 PC5 PC6 PC7
## Eigenvalue 5.1916 3.1928 1.6855 1.06899 0.81599 0.70581 0.43641
## Proportion Explained 0.3708 0.2281 0.1204 0.07636 0.05829 0.05042 0.03117
## Cumulative Proportion 0.3708 0.5989 0.7193 0.79564 0.85392 0.90434 0.93551
## PC8 PC9 PC10 PC11 PC12 PC13 PC14
## Eigenvalue 0.36884 0.1707 0.14953 0.08526 0.06986 0.035057 0.023615
## Proportion Explained 0.02635 0.0122 0.01068 0.00609 0.00499 0.002504 0.001687
## Cumulative Proportion 0.96185 0.9740 0.98473 0.99082 0.99581 0.998313 1.000000
##
## Scaling 2 for species and site scores
## * Species are scaled proportional to eigenvalues
## * Sites are unscaled: weighted dispersion equal on all dimensions
## * General scaling constant of scores: 4.236078
##
##
## Species scores
##
## PC1 PC2 PC3 PC4 PC5 PC6
## N 0.2441 0.2516 -0.23818 0.92846 -0.359554 0.30319
## P -0.9181 -0.4194 0.13356 0.09649 0.224909 0.07938
## K -0.9369 -0.3272 -0.06261 0.25051 0.180899 -0.28683
## Ca -0.9280 -0.1711 0.47982 -0.02138 -0.241479 -0.09536
## Mg -0.9087 -0.1978 0.12446 -0.04430 -0.497428 -0.10781
## S -0.8576 -0.6002 -0.31620 -0.01652 0.071629 -0.08779
## Al 0.2980 -0.9720 -0.39009 0.09092 0.005386 -0.19467
## Fe 0.5236 -0.7748 -0.23063 0.37187 -0.023377 -0.32292
## Mn -0.7418 0.3908 0.13114 0.44346 0.501064 0.06776
## Zn -0.8926 -0.3277 0.06142 -0.06121 -0.153677 0.51653
## Mo -0.1072 -0.4625 -0.85720 -0.27258 -0.030063 0.39721
## Baresoil -0.4218 0.6387 -0.40089 -0.07389 -0.416610 -0.31060
## Humdepth -0.6070 0.6825 -0.41674 0.02975 -0.047738 -0.15923
## pH 0.4286 -0.6517 0.66384 0.07599 -0.265374 0.05070
##
##
## Site scores (weighted sums of species scores)
##
## PC1 PC2 PC3 PC4 PC5 PC6
## 18 0.05126 0.84085 -0.190142 -0.40431 0.101518 -1.01997
## 15 0.20452 0.37397 0.002978 -1.06030 1.175734 -0.92257
## 24 -1.53647 -1.25830 -0.012170 -0.86690 -1.854372 1.74375
## 27 -1.25643 0.51345 0.743928 0.63835 1.097881 0.15631
## 23 -0.71203 0.86247 -0.203389 -0.05971 -0.786196 -0.80080
## 19 -0.76179 0.73785 -0.761645 -0.31728 -1.303297 -0.62264
## 22 -0.30340 0.86304 0.188484 0.54758 -0.078064 0.24567
## 16 0.62797 0.76436 -0.124301 -0.26047 0.009044 -0.02807
## 28 -1.13923 0.44909 0.229104 1.61582 0.842871 0.60852
## 13 -0.62483 -0.85407 -1.259755 1.38195 -0.157593 -1.04794
## 14 -0.04206 0.60916 -0.719743 -0.73109 -0.230147 0.25505
## 20 -0.93747 0.20753 -0.689713 0.62198 -0.282188 0.33232
## 25 -0.34451 0.90128 0.710714 0.64127 1.214897 0.48173
## 7 1.50502 -0.22114 -0.495604 0.87068 -0.769266 -0.25733
## 5 1.40889 0.60035 0.911463 0.71133 -0.488111 2.45195
## 6 1.30776 0.03604 -0.243884 -0.87792 0.543259 0.46876
## 3 1.64967 -0.82755 -0.343406 1.40028 -0.546374 -0.19580
## 4 -0.26711 -1.65198 -1.744251 -0.60763 0.947492 0.46203
## 2 0.59653 -1.21610 -0.200030 0.72815 0.799208 -0.76013
## 9 0.03271 -0.69445 -0.350028 -1.35247 0.717140 0.82965
## 12 0.47944 0.26377 0.184677 -1.27744 0.573929 0.07253
## 10 -0.32993 -0.95483 1.670469 -0.51343 0.819138 -0.48682
## 11 -0.09921 -1.52318 2.493913 -0.09044 -1.092296 -1.10654
## 21 0.49069 1.17838 0.202333 -0.73801 -1.254205 -0.85966
#Part A Proportion Explained by first two axes: 0.3708 + 0.2281 = 0.5989 (around 60%)
#Part B
screeplot(varechem_pca, col="bisque")
#Part C
plot(varechem_pca, display="sites", xlab="PC1 (37%)", ylab="PC2 (23%)")
#Part D
biplot(varechem_pca, scaling="sites", xlab="PC1 (37%)", ylab="PC2 (23%)")
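The axis percentages hard-coded in the labels above can be derived from the eigenvalues instead (a convenience sketch using vegan's eigenvals()):
round(100 * eigenvals(varechem_pca) / sum(eigenvals(varechem_pca)), 1)[1:2] # PC1 ~37.1%, PC2 ~22.8%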
#Problem 2
2. The dune_bio.txt dataset corresponds to species abundances. Given the nature of this dataset perform a PCA or a CA ordination analysis. a) How much variation is explained by the two first axes? b) Make a screeplot of the results c) Plot the ordination results of the sites
# Load data:
dune.bio<-read.table("dune_bio.txt", sep="\t", header=T, row.names=1)
head(dune.bio)
## Belper Empnig Junbuf Junart Airpra Elepal Rumace Viclat Brarut Ranfla Cirarv
## 2 3 0 0 0 0 0 0 0 0 0 0
## 13 0 0 3 0 0 0 0 0 0 2 0
## 4 2 0 0 0 0 0 0 0 2 0 2
## 16 0 0 0 3 0 8 0 0 4 2 0
## 6 0 0 0 0 0 0 6 0 6 0 0
## 1 0 0 0 0 0 0 0 0 0 0 0
## Hyprad Leoaut Potpal Poapra Calcus Tripra Trirep Antodo Salrep Achmil Poatri
## 2 0 5 0 4 0 0 5 0 0 3 7
## 13 0 2 0 2 0 0 2 0 0 0 9
## 4 0 2 0 4 0 0 1 0 0 0 5
## 16 0 0 0 0 3 0 0 0 0 0 2
## 6 0 3 0 3 0 5 5 3 0 2 4
## 1 0 0 0 4 0 0 0 0 0 1 2
## Chealb Elyrep Sagpro Plalan Agrsto Lolper Alogen Brohor
## 2 0 4 0 0 0 5 2 4
## 13 1 0 2 0 5 0 5 0
## 4 0 4 5 0 8 5 2 3
## 16 0 0 0 0 7 0 4 0
## 6 0 0 0 5 0 6 0 0
## 1 0 4 0 0 0 7 0 0
dunebio.ca<-cca(dune.bio) # vegan package
dunebio.ca
## Call: cca(X = dune.bio)
##
## Inertia Rank
## Total 2.115
## Unconstrained 2.115 19
## Inertia is scaled Chi-square
##
## Eigenvalues for unconstrained axes:
## CA1 CA2 CA3 CA4 CA5 CA6 CA7 CA8
## 0.5360 0.4001 0.2598 0.1760 0.1448 0.1079 0.0925 0.0809
## (Showing 8 of 19 unconstrained eigenvalues)
#Part A
summary(dunebio.ca)
##
## Call:
## cca(X = dune.bio)
##
## Partitioning of scaled Chi-square:
## Inertia Proportion
## Total 2.115 1
## Unconstrained 2.115 1
##
## Eigenvalues, and their contribution to the scaled Chi-square
##
## Importance of components:
## CA1 CA2 CA3 CA4 CA5 CA6 CA7
## Eigenvalue 0.5360 0.4001 0.2598 0.17598 0.14476 0.10791 0.09247
## Proportion Explained 0.2534 0.1892 0.1228 0.08319 0.06844 0.05102 0.04372
## Cumulative Proportion 0.2534 0.4426 0.5654 0.64858 0.71702 0.76804 0.81175
## CA8 CA9 CA10 CA11 CA12 CA13 CA14
## Eigenvalue 0.08091 0.07332 0.05630 0.04826 0.04125 0.03523 0.020529
## Proportion Explained 0.03825 0.03466 0.02661 0.02282 0.01950 0.01665 0.009705
## Cumulative Proportion 0.85000 0.88467 0.91128 0.93410 0.95360 0.97025 0.979955
## CA15 CA16 CA17 CA18 CA19
## Eigenvalue 0.014911 0.009074 0.007938 0.007002 0.003477
## Proportion Explained 0.007049 0.004290 0.003753 0.003310 0.001644
## Cumulative Proportion 0.987004 0.991293 0.995046 0.998356 1.000000
##
## Scaling 2 for species and site scores
## * Species are scaled proportional to eigenvalues
## * Sites are unscaled: weighted dispersion equal on all dimensions
##
##
## Species scores
##
## CA1 CA2 CA3 CA4 CA5 CA6
## Belper -0.50018 -0.35503 -0.15239 -0.704153 -0.058546 -0.07308
## Empnig -0.69027 3.26420 1.95716 -0.176936 -0.073518 0.16083
## Junbuf 0.08157 -0.68074 1.00545 1.078390 0.268360 -0.24168
## Junart 1.27580 0.09963 -0.09320 0.005536 0.289410 0.78247
## Airpra -1.00434 3.06749 1.33773 0.194305 -1.081813 0.53699
## Elepal 1.76383 0.34562 -0.57336 -0.002976 -0.332396 0.14688
## Rumace -0.65289 -0.25525 -0.59728 1.160164 0.255849 0.32730
## Viclat -0.61893 0.37140 -0.46057 -1.000375 1.162652 -1.44971
## Brarut 0.18222 0.26477 -0.16606 0.064009 0.576334 -0.07741
## Ranfla 1.55886 0.30700 -0.29765 0.046974 -0.008747 0.14744
## Cirarv -0.05647 -0.76398 0.91793 -1.175919 -0.384024 0.13985
## Hyprad -0.85408 2.52821 1.13951 -0.175115 -0.311874 -0.11177
## Leoaut -0.19566 0.38884 0.03975 -0.130392 0.141232 -0.23717
## Potpal 1.91690 0.52150 -1.18215 -0.021738 -1.359988 -1.31207
## Poapra -0.38919 -0.32999 -0.02015 -0.358371 0.079296 0.05165
## Calcus 1.95199 0.56743 -0.85948 -0.098969 -0.556737 -0.23282
## Tripra -0.88116 -0.09792 -1.18172 1.282429 0.325706 0.33388
## Trirep -0.07666 -0.02032 -0.20594 0.026462 -0.186748 -0.53957
## Antodo -0.96676 1.08361 -0.17188 0.459788 -0.607533 0.30425
## Salrep 0.61035 1.54868 0.04970 -0.607136 1.429729 0.55183
## Achmil -0.90859 0.08461 -0.58636 -0.008919 -0.660183 0.18877
## Poatri -0.18185 -0.53997 0.23388 0.178834 -0.155342 0.07584
## Chealb 0.42445 -0.84402 1.59029 1.248755 -0.207480 -0.87566
## Elyrep -0.37074 -0.74148 0.26238 -0.566308 -0.270122 0.72624
## Sagpro 0.00364 0.01719 1.11570 0.066981 0.186654 -0.32463
## Plalan -0.84058 0.24886 -0.78066 0.371149 0.271377 -0.11989
## Agrsto 0.93378 -0.20651 0.28165 0.024293 -0.139326 0.02256
## Lolper -0.50272 -0.35955 -0.21821 -0.474727 0.101494 0.01594
## Alogen 0.40088 -0.61839 0.85013 0.346740 0.016509 -0.10169
## Brohor -0.65762 -0.40634 -0.30685 -0.496751 -0.561358 -0.07004
##
##
## Site scores (weighted averages of species scores)
##
## CA1 CA2 CA3 CA4 CA5 CA6
## 2 -0.63268 -0.6958357 -0.09708 -1.18695 -0.97686 -0.06575
## 13 0.42445 -0.8440195 1.59029 1.24876 -0.20748 -0.87566
## 4 -0.05647 -0.7639784 0.91793 -1.17592 -0.38402 0.13985
## 16 2.00229 0.1090627 -0.33414 0.33760 -0.50097 0.76159
## 6 -0.85633 -0.0005408 -1.39735 1.59909 0.65494 0.19386
## 1 -0.81167 -1.0826714 -0.14479 -2.10665 -0.39287 1.83462
## 8 0.76268 -0.2968459 0.35648 -0.10772 0.17507 0.36444
## 5 -0.95293 -0.1846015 -0.95609 0.86853 -0.34552 0.98333
## 17 -1.47545 2.7724102 0.40859 0.75117 -2.59425 1.10122
## 15 1.91384 0.5079036 -0.96567 0.04227 -0.50681 -0.19370
## 10 -0.87885 -0.0353136 -0.82987 -0.68053 -0.75438 -0.81070
## 11 -0.64223 0.4440332 -0.17371 -1.09684 1.37462 -2.00626
## 9 0.09693 -0.7864314 0.86492 0.40090 0.28704 1.02783
## 18 -0.31241 0.6328355 -0.66501 -1.12728 2.65575 -0.97565
## 3 -0.10148 -0.9128732 0.68815 -0.68137 -0.08709 0.28678
## 20 1.94438 1.0688809 -0.66595 -0.55317 1.59606 1.70292
## 14 1.91996 0.5351062 -1.39863 -0.08575 -2.21317 -2.43044
## 19 -0.69027 3.2642026 1.95716 -0.17694 -0.07352 0.16083
## 12 0.28557 -0.6656161 1.64423 1.71496 0.65381 -1.17376
## 7 -0.87149 -0.2547040 -0.86830 0.90468 0.17385 0.03446
The variation explained by the first two axes is 0.2534 + 0.1892 = 0.4426 (about 44%).
#Part B
screeplot(dunebio.ca)
#Part C
plot(dunebio.ca, display="sites", xlab="CA1 (25%)", ylab="CA2 (19%)")
#Problem 3
3. Perform a nonmetric multidimensional scaling (NMDS) on the dune_bio.txt dataset after calculating Bray-Curtis distances among sites. a) What is the stress? b) Make a Shepard plot of the NMDS results c) Plot the ordination results of the sites d) [Advanced – ggplot2] Plot the ordination results using ggplot2. Use the dune_env.txt dataset to make the size of the points proportional to the site richness (number of species) and the color to represent the variable Management.
dune.bio<-read.table("dune_bio.txt", sep="\t", header=T, row.names=1)
dune.bio.nmds<-metaMDS(dune.bio, distance="bray", k=2, trymax=100)
## Run 0 stress 0.1192678
## Run 1 stress 0.1183186
## ... New best solution
## ... Procrustes: rmse 0.02026967 max resid 0.06495955
## Run 2 stress 0.1192678
## Run 3 stress 0.1192678
## Run 4 stress 0.1192678
## Run 5 stress 0.1192678
## Run 6 stress 0.2075713
## Run 7 stress 0.1183186
## ... Procrustes: rmse 3.19601e-05 max resid 9.480994e-05
## ... Similar to previous best
## Run 8 stress 0.1192678
## Run 9 stress 0.1183186
## ... Procrustes: rmse 1.234597e-05 max resid 3.78678e-05
## ... Similar to previous best
## Run 10 stress 0.1183186
## ... New best solution
## ... Procrustes: rmse 6.55419e-06 max resid 2.111809e-05
## ... Similar to previous best
## Run 11 stress 0.1192679
## Run 12 stress 0.1183186
## ... Procrustes: rmse 2.8718e-05 max resid 8.337203e-05
## ... Similar to previous best
## Run 13 stress 0.1192678
## Run 14 stress 0.1183186
## ... Procrustes: rmse 3.70733e-06 max resid 1.088412e-05
## ... Similar to previous best
## Run 15 stress 0.2809945
## Run 16 stress 0.1192678
## Run 17 stress 0.1183186
## ... Procrustes: rmse 1.905587e-06 max resid 6.35279e-06
## ... Similar to previous best
## Run 18 stress 0.1192678
## Run 19 stress 0.1183186
## ... Procrustes: rmse 1.013181e-05 max resid 3.108071e-05
## ... Similar to previous best
## Run 20 stress 0.1808911
## *** Best solution repeated 5 times
#Part A
dune.bio.nmds$stress
## [1] 0.1183186
#Part B
stressplot(dune.bio.nmds)
#Part C
plot(dune.bio.nmds, type="t", display="sites")
dune.env<-read.table("dune_env.txt", sep="\t", header=T, row.names=1)
rownames(dune.env)==rownames(dune.bio)
## [1] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [16] TRUE TRUE TRUE TRUE TRUE
dune.env$Axis01<-dune.bio.nmds$points[,1]
dune.env$Axis02<-dune.bio.nmds$points[,2]
dune.env$Richness<-specnumber(dune.bio)
#Part D
library(ggplot2)
ggplot(dune.env, aes(x=Axis01, y=Axis02))+
geom_point(aes(fill=Management, size=Richness), alpha=0.6, pch=21)
#Problem 4
4. Perform a constrained correspondence analysis (CCA) on the dune_bio.txt dataset using the first (A1), second (Moisture) and fifth (Manure) variables of dune_env.txt as explanatory. a) What is the variation explained by the two first constrained axes? b) What is the adjusted R2 of the model? c) Use a permutation test to assess the overall statistical significance of the model d) Use a permutation test to assess the marginal statistical significance of each explanatory variable of the model. Which variables are significant? e) Make a triplot of the ordination results focusing on the sites
# load data:
dune.bio<-read.table("dune_bio.txt", sep="\t", header=T, row.names=1)
dune.env<-read.table("dune_env.txt", sep="\t", header=T, row.names=1)
# CCA:
dune_cca<-cca(dune.bio ~ A1 + Moisture + Manure, data = dune.env)
#Part A
summary(dune_cca)
##
## Call:
## cca(formula = dune.bio ~ A1 + Moisture + Manure, data = dune.env)
##
## Partitioning of scaled Chi-square:
## Inertia Proportion
## Total 2.1153 1.0000
## Constrained 0.7692 0.3637
## Unconstrained 1.3460 0.6363
##
## Eigenvalues, and their contribution to the scaled Chi-square
##
## Importance of components:
## CCA1 CCA2 CCA3 CA1 CA2 CA3 CA4
## Eigenvalue 0.4264 0.2347 0.10812 0.3124 0.2101 0.17518 0.13748
## Proportion Explained 0.2016 0.1110 0.05112 0.1477 0.0993 0.08282 0.06499
## Cumulative Proportion 0.2016 0.3125 0.36366 0.5114 0.6107 0.69348 0.75847
## CA5 CA6 CA7 CA8 CA9 CA10 CA11
## Eigenvalue 0.09442 0.09295 0.07380 0.06360 0.05541 0.04663 0.021117
## Proportion Explained 0.04464 0.04394 0.03489 0.03007 0.02620 0.02204 0.009983
## Cumulative Proportion 0.80311 0.84705 0.88194 0.91201 0.93820 0.96025 0.970230
## CA12 CA13 CA14 CA15 CA16
## Eigenvalue 0.019730 0.016183 0.012531 0.009960 0.004566
## Proportion Explained 0.009327 0.007651 0.005924 0.004709 0.002159
## Cumulative Proportion 0.979558 0.987208 0.993132 0.997841 1.000000
##
## Accumulated constrained eigenvalues
## Importance of components:
## CCA1 CCA2 CCA3
## Eigenvalue 0.4264 0.2347 0.1081
## Proportion Explained 0.5543 0.3051 0.1406
## Cumulative Proportion 0.5543 0.8594 1.0000
##
## Scaling 2 for species and site scores
## * Species are scaled proportional to eigenvalues
## * Sites are unscaled: weighted dispersion equal on all dimensions
##
##
## Species scores
##
## CCA1 CCA2 CCA3 CA1 CA2 CA3
## Belper -0.72508 0.09015 0.28678 -0.14997 -0.08941 0.64602
## Empnig 0.92386 -1.49359 -1.29239 2.87128 1.31651 0.74936
## Junbuf 0.50547 0.17496 -0.35707 0.33578 -1.58298 -0.40514
## Junart 1.08445 -0.22093 -0.33048 -0.84568 0.15861 -0.23372
## Airpra 0.32923 -1.52131 -0.75777 2.80560 1.49278 0.23227
## Elepal 1.39255 0.01967 0.39847 -0.90296 0.89564 -0.18234
## Rumace -0.56966 0.07756 0.44317 0.23147 -0.33959 -1.25112
## Viclat -0.95631 -1.05874 0.13224 -0.42277 -0.16218 0.84521
## Brarut 0.05861 -0.29093 0.14295 -0.17325 0.10170 -0.08514
## Ranfla 1.30695 -0.17793 0.06749 -0.84091 0.45935 -0.18497
## Cirarv -0.40316 1.49565 -0.09657 0.17279 0.57950 1.51909
## Hyprad 0.14257 -1.37899 -0.68907 2.13762 1.11990 0.54595
## Leoaut -0.10805 -0.39424 0.03343 0.21126 0.05153 0.18568
## Potpal 1.82080 -0.59463 2.50587 -0.53778 0.46936 0.51885
## Poapra -0.47298 0.19818 -0.15806 -0.08959 -0.09919 0.21588
## Calcus 1.32589 -0.43845 0.22643 -1.29121 0.99188 -0.18948
## Tripra -0.94282 0.14003 0.52488 0.16097 0.18159 -1.70499
## Trirep -0.03691 -0.24820 0.22610 -0.03138 -0.23464 0.04178
## Antodo -0.42848 -0.66783 0.01590 1.13407 0.58011 -0.56583
## Salrep 0.38937 -1.51269 -0.78060 -0.47447 0.69625 0.26139
## Achmil -0.84528 -0.28199 0.08034 0.23663 0.15351 -0.33457
## Poatri -0.09591 0.48672 -0.08195 0.10052 -0.39496 -0.08346
## Chealb 1.33134 1.08879 -0.17910 0.92504 -1.53277 -0.26877
## Elyrep -0.45995 0.49110 -0.07018 -0.09270 -0.32202 0.53794
## Sagpro 0.36673 0.22891 -0.41200 0.63169 -0.40176 0.48274
## Plalan -0.89463 -0.37036 0.37142 0.16434 0.15598 -0.65298
## Agrsto 0.80663 0.42851 0.03513 -0.31737 0.09257 0.16535
## Lolper -0.68266 0.25866 -0.13366 -0.17559 0.06653 0.20307
## Alogen 0.52884 0.74865 -0.28721 0.11872 -0.57814 0.07754
## Brohor -0.77674 0.11771 0.03195 -0.07993 -0.06235 0.29678
##
##
## Site scores (weighted averages of species scores)
##
## CCA1 CCA2 CCA3 CA1 CA2 CA3
## 2 -0.8544 0.57195 -0.04460 -0.44645 -0.473383 1.10419
## 13 0.7656 1.43234 -1.04660 0.92504 -1.532773 -0.26877
## 4 -0.1565 1.36916 -0.67602 0.17279 0.579503 1.51909
## 16 2.0459 0.46834 0.70504 -0.99503 1.669766 -0.75493
## 6 -1.0571 -0.29557 1.50936 0.06506 0.216688 -2.09457
## 1 -1.2439 1.24492 -0.99278 -0.48730 0.708935 1.13256
## 8 0.8113 0.66762 -0.54772 -0.28201 0.298269 -0.31177
## 5 -1.1247 -0.04619 1.17587 0.61920 0.008872 -0.95018
## 17 -0.7722 -2.94478 -1.24411 2.70708 1.757175 -0.54337
## 15 2.0065 -0.48090 2.87633 -0.26585 0.401900 0.57133
## 10 -1.0958 -0.42412 0.49762 -0.26374 -0.332785 0.08742
## 11 -0.7816 -0.90608 -0.28150 -0.26599 -0.008900 1.12676
## 9 0.1817 0.86595 -0.91242 -0.34851 -2.059004 -0.10377
## 18 -0.6053 -1.51952 0.07324 -0.89534 -0.298138 1.03991
## 3 -0.2093 1.35237 -0.66038 0.06196 0.171454 0.84659
## 20 1.8996 -1.40266 -0.55715 -2.22940 0.920734 -0.49851
## 14 1.9462 -0.67177 3.54931 -0.80971 0.536813 0.46637
## 19 0.2690 -3.39538 -3.20303 2.87128 1.316513 0.74936
## 12 0.6252 1.06204 -0.88731 0.77476 -2.069361 -0.26842
## 7 -1.0498 -0.01449 0.64118 -0.05748 0.266550 -1.48584
##
##
## Site constraints (linear combinations of constraining variables)
##
## CCA1 CCA2 CCA3 CA1 CA2 CA3
## 2 -1.0722 -0.15065 0.02248 -0.44645 -0.473383 1.10419
## 13 1.3313 1.08879 -0.17910 0.92504 -1.532773 -0.26877
## 4 -0.4032 1.49565 -0.09657 0.17279 0.579503 1.51909
## 16 1.2912 1.04854 -0.34917 -0.99503 1.669766 -0.75493
## 6 -0.9651 -0.04331 0.47601 0.06506 0.216688 -2.09457
## 1 -1.0995 1.27129 -0.50141 -0.48730 0.708935 1.13256
## 8 1.0904 0.84728 -1.19953 -0.28201 0.298269 -0.31177
## 5 -0.6973 0.22504 1.60982 0.61920 0.008872 -0.95018
## 17 -0.5627 -1.56290 0.04417 2.70708 1.757175 -0.54337
## 15 1.9681 -0.44704 3.12947 -0.26585 0.401900 0.57133
## 10 -0.6232 -0.89889 -0.41620 -0.26374 -0.332785 0.08742
## 11 -1.1054 -0.90857 0.08601 -0.26599 -0.008900 1.12676
## 9 0.4481 -0.77218 -0.96709 -0.34851 -2.059004 -0.10377
## 18 -0.9913 -1.51891 0.77314 -0.89534 -0.298138 1.03991
## 3 -0.3898 1.50907 -0.03988 0.06196 0.171454 0.84659
## 20 0.8971 -1.52042 -1.40577 -2.22940 0.920734 -0.49851
## 14 1.6735 -0.74222 1.88228 -0.80971 0.536813 0.46637
## 19 0.9239 -1.49359 -1.29239 2.87128 1.316513 0.74936
## 12 0.7625 0.26751 0.15988 0.77476 -2.069361 -0.26842
## 7 -1.1327 0.51336 -0.43788 -0.05748 0.266550 -1.48584
##
##
## Biplot scores for constraining variables
##
## CCA1 CCA2 CCA3 CA1 CA2 CA3
## A1 0.6048 0.04018 0.7954 0 0 0
## Moisture 0.9746 -0.06076 -0.2158 0 0 0
## Manure -0.2059 0.96205 -0.1791 0 0 0
The variation explained by the first two constrained axes is 0.2016 + 0.1110 = 0.3125 of the total inertia (equivalently, 0.5543 + 0.3051 = 0.8594 of the constrained variation).
#Part B
RsquareAdj(dune_cca)$adj.r.squared
## [1] 0.2448777
#Part C
anova(dune_cca)
## Permutation test for cca under reduced model
## Permutation: free
## Number of permutations: 999
##
## Model: cca(formula = dune.bio ~ A1 + Moisture + Manure, data = dune.env)
## Df ChiSquare F Pr(>F)
## Model 3 0.76925 3.048 0.001 ***
## Residual 16 1.34602
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#Part D
anova(dune_cca, by="margin")
## Permutation test for cca under reduced model
## Marginal effects of terms
## Permutation: free
## Number of permutations: 999
##
## Model: cca(formula = dune.bio ~ A1 + Moisture + Manure, data = dune.env)
## Df ChiSquare F Pr(>F)
## A1 1 0.13047 1.5508 0.113
## Moisture 1 0.30886 3.6714 0.001 ***
## Manure 1 0.23418 2.7837 0.004 **
## Residual 16 1.34602
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Moisture (p = 0.001) and Manure (p = 0.004) are the significant variables; A1 is not (p = 0.113).
#Part E
plot(dune_cca, scaling=2, xlab="CCA1 (20%)", ylab="CCA2 (11%)")
#Problem 5
5. Perform a permutational multivariate analysis of variance (PERMANOVA) with 9,999 permutations using the dune_bio.txt dataset after calculating Bray-Curtis distances and the first (A1), second (Moisture) and fifth (Manure) variables of dune_env.txt as explanatory. a) Which explanatory variables are significant? b) What is the explanatory power (R2) of the significant variables? c) Perform a new PERMANOVA analysis with 9,999 permutations with A1 as the explanatory variable but constrain the permutations within the Use variable. d) Plot a NMDS ordination to visually confirm your results in c).
dune_env <- read.table("dune_env.txt", sep = "\t", header = TRUE, row.names = 1)
dune_bio <- read.table("dune_bio.txt", sep = "\t", header = TRUE, row.names = 1)
#Check that the two files are in the same order
rownames(dune_bio)==rownames(dune_env)
## [1] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [16] TRUE TRUE TRUE TRUE TRUE
# Note: writing "permutations = 9,999" passes 9 as permutations and 999 as the
# next argument, which is why the output below reports only 9 permutations.
# The intended call uses permutations = 9999:
adonis2(vegdist(dune_bio, method = "bray")~A1+Moisture+Manure, data = dune_env, permutations = 9999)
## Permutation test for adonis under reduced model
## Terms added sequentially (first to last)
## Permutation: free
## Number of permutations: 9
##
## adonis2(formula = vegdist(dune_bio, method = "bray") ~ A1 + Moisture + Manure, data = dune_env, permutations = 9, method = 999)
## Df SumOfSqs R2 F Pr(>F)
## A1 1 0.7230 0.16817 5.8146 0.1
## Moisture 1 0.8671 0.20169 6.9736 0.1
## Manure 1 0.7197 0.16741 5.7885 0.1
## Residual 16 1.9893 0.46274
## Total 19 4.2990 1.00000
#Part A- With only 9 permutations (the result of the 9,999 typo) the smallest attainable p-value is 1/(9+1) = 0.1, so the table above cannot establish significance; the model must be rerun with permutations = 9999 before declaring A1, Moisture and Manure significant.
#Part B- A1 R2=0.16817, Moisture R2=0.20169, Manure R2=0.16741 (the R2 values do not depend on the number of permutations).
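A common companion check (not requested by the problem): PERMANOVA can confound differences in group location with differences in dispersion, and vegan's betadisper() tests the latter for a categorical factor; a sketch using Moisture as the grouping:
disp <- betadisper(vegdist(dune_bio, method = "bray"), as.factor(dune_env$Moisture))
anova(disp) # a significant result would indicate heterogeneous dispersions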
#Part C
# Same 9,999 -> 9999 fix as above; note the prompt asks for A1 alone as the
# explanatory variable, i.e. adonis2(vegdist(dune_bio, method = "bray") ~ A1,
# strata = dune_env$Use, data = dune_env, permutations = 9999)
adonis2(vegdist(dune_bio, method = "bray")~A1+Moisture+Manure, strata = dune_env$Use, data = dune_env, permutations = 9999)
## Permutation test for adonis under reduced model
## Terms added sequentially (first to last)
## Blocks: strata
## Permutation: free
## Number of permutations: 9
##
## adonis2(formula = vegdist(dune_bio, method = "bray") ~ A1 + Moisture + Manure, data = dune_env, permutations = 9, method = 999, strata = dune_env$Use)
## Df SumOfSqs R2 F Pr(>F)
## A1 1 0.7230 0.16817 5.8146 0.1
## Moisture 1 0.8671 0.20169 6.9736 0.1
## Manure 1 0.7197 0.16741 5.7885 0.1
## Residual 16 1.9893 0.46274
## Total 19 4.2990 1.00000
#Part D
library(ggplot2)
# Run the NMDS once and reuse it for both axes; calling metaMDS() separately
# for each axis (as originally written) can return two different solutions
# whose coordinates do not belong together.
dune_bio.nmds<-metaMDS(vegdist(dune_bio, method = "bray"))
dune_env$Axis01<-dune_bio.nmds$points[,1]
## Run 0 stress 0.1192678
## Run 1 stress 0.1192678
## ... New best solution
## ... Procrustes: rmse 4.656707e-06 max resid 1.437242e-05
## ... Similar to previous best
## Run 2 stress 0.1183186
## ... New best solution
## ... Procrustes: rmse 0.02027234 max resid 0.0649748
## Run 3 stress 0.1183186
## ... Procrustes: rmse 1.473262e-05 max resid 4.753161e-05
## ... Similar to previous best
## Run 4 stress 0.1812932
## Run 5 stress 0.1192679
## Run 6 stress 0.1183186
## ... Procrustes: rmse 2.97434e-05 max resid 9.284026e-05
## ... Similar to previous best
## Run 7 stress 0.1183186
## ... Procrustes: rmse 2.370564e-05 max resid 7.367693e-05
## ... Similar to previous best
## Run 8 stress 0.2686996
## Run 9 stress 0.1192679
## Run 10 stress 0.1192679
## Run 11 stress 0.1192678
## Run 12 stress 0.1192678
## Run 13 stress 0.1192678
## Run 14 stress 0.1192678
## Run 15 stress 0.1183186
## ... New best solution
## ... Procrustes: rmse 6.510386e-06 max resid 2.170637e-05
## ... Similar to previous best
## Run 16 stress 0.1183186
## ... New best solution
## ... Procrustes: rmse 1.219777e-05 max resid 4.016986e-05
## ... Similar to previous best
## Run 17 stress 0.1809577
## Run 18 stress 0.1183186
## ... New best solution
## ... Procrustes: rmse 6.824018e-06 max resid 2.11945e-05
## ... Similar to previous best
## Run 19 stress 0.2422121
## Run 20 stress 0.1183186
## ... Procrustes: rmse 6.900954e-06 max resid 2.194961e-05
## ... Similar to previous best
## *** Best solution repeated 2 times
dune_env$Axis02<-dune_bio.nmds$points[,2]
ggplot(dune_env, aes(x=Axis01, y=Axis02))+
geom_point(aes(fill=Use, size=A1), alpha=0.6, pch=21)