PRACTICE PROBLEMS R LANGUAGE 1. Install the ISwR R package. Write the built-in dataset thuesen to a tab-separated text file. View it with a text editor. Change the NA to ., and read the changed file back into R.
library(ISwR)
# write the built-in dataset to a tab-separated text file (as the problem asks)
write.table(thuesen, "thuesen.txt", sep = "\t", row.names = FALSE)
# read the file back in and replace the NA with "."
thuesen <- read.delim("thuesen.txt", header = TRUE)
thuesen$short.velocity[which(is.na(thuesen$short.velocity))] <- "."
thuesen
## blood.glucose short.velocity
## 1 15.3 1.76
## 2 10.8 1.34
## 3 8.1 1.27
## 4 19.5 1.47
## 5 7.2 1.27
## 6 5.3 1.49
## 7 9.3 1.31
## 8 11.1 1.09
## 9 7.5 1.18
## 10 12.2 1.22
## 11 6.7 1.25
## 12 5.2 1.19
## 13 19.0 1.95
## 14 15.1 1.28
## 15 6.7 1.52
## 16 8.6 .
## 17 4.2 1.12
## 18 10.3 1.37
## 19 12.5 1.19
## 20 16.1 1.05
## 21 13.3 1.32
## 22 4.9 1.03
## 23 8.8 1.12
## 24 9.5 1.7
Exponentialpopulation <- function(r, t, N, K) {
Nt<-(K*N)/(N+(K-N)*exp(r*t))
return(Nt)
}
low_growth<-data.frame(r=0.5,t=c(0:20),Nt=NA)
medium_growth<-data.frame(r=0.8,t=c(0:20), Nt=NA)
high_growth<-data.frame(r=-0.1,t=c(0:20), Nt=NA)
all_growth<-rbind(low_growth,medium_growth,high_growth)
all_growth$Nt <- Exponentialpopulation(r = all_growth$r,
t=all_growth$t,
N=10,
K=1000)
library(ggplot2)
ggplot(data = all_growth, mapping=aes(x=t,y=Nt, color=as.factor(r), group=r))+geom_line()
3. Under highly favorable conditions, populations grow exponentially.
However resources will eventually limit population growth and
exponential growth cannot continue indefinitely. This phenomenon is
described by the logistic growth function: Nt = K*N0 / (N0 + (K − N0)*e^(−r*t)),
where K is the carrying capacity. Write a logistic growth function in R
that also generates a plot with time on the x axis and Nt on the y axis.
Using that function create plots for 20 days assuming an initial
population size of 10 individuals and a carrying capacity of 1,000
individuals under three growth rate scenarios (0.5, 0.8, 0.4)
#Make the function
logisticpopulation <- function(N, K, r, t) {
  Nt <- (K * N) / (N + (K - N) * exp(-r * t))   # logistic growth
  return(Nt)
}
#Make the data frames
low_loggrowth<-data.frame(r=0.4,t=c(0:20),Nt=NA)
medium_loggrowth<-data.frame(r=0.5, t=c(0:20), Nt=NA)
high_loggrowth<-data.frame(r=0.8, t=c(0:20), Nt=NA)
#Make one big table with all data frames using rbind
all_loggrowth<-rbind(low_loggrowth,medium_loggrowth,high_loggrowth)
#compute the Nt values using the function I built
all_loggrowth$Nt<-logisticpopulation(r=all_loggrowth$r,t=all_loggrowth$t,N=10,K=1000)
ggplot(data=all_loggrowth, mapping=aes(x=t,y=Nt, color=as.factor(r), group=r))+
geom_line()
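The prompt also asks the function itself to generate the plot. A minimal sketch of one way to do that, reusing the logistic formula above and the already-loaded ggplot2 (the function name and the default t_max are illustrative):
logistic_plot <- function(N0, K, r, t_max = 20) {
  t  <- 0:t_max
  Nt <- (K * N0) / (N0 + (K - N0) * exp(-r * t))   # logistic growth
  ggplot(data.frame(t = t, Nt = Nt), aes(x = t, y = Nt)) + geom_line()
}
logistic_plot(N0 = 10, K = 1000, r = 0.5)   # repeat with r = 0.8 and r = 0.4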
4. Write a function (sum_n) that for any given value, say n, computes
the sum of the integers from 1 to n (inclusive). Use the function to
determine the sum of integers from 1 to 5,000.
sum_n <- function(n){
  total <- sum(1:n)
  return(total)
}
sum_n(5000)
## [1] 12502500
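As a quick sanity check, the closed-form sum n(n+1)/2 gives the same value:
5000 * 5001 / 2   # 12502500, matching sum_n(5000)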
5. Write a function (sqrt_round) that takes a number x as input, takes the square root, rounds it to the nearest whole number, and then returns the result.
sqrt_round<-function(x){
total<-round(sqrt(x))
return(total)
}
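For example (10 is an arbitrary test value):
sqrt_round(10)   # sqrt(10) is about 3.162, which rounds to 3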
library(nycflights13)
data(package="nycflights13")
weather
## # A tibble: 26,115 × 15
## origin year month day hour temp dewp humid wind_dir wind_speed wind_g…¹
## <chr> <int> <int> <int> <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 EWR 2013 1 1 1 39.0 26.1 59.4 270 10.4 NA
## 2 EWR 2013 1 1 2 39.0 27.0 61.6 250 8.06 NA
## 3 EWR 2013 1 1 3 39.0 28.0 64.4 240 11.5 NA
## 4 EWR 2013 1 1 4 39.9 28.0 62.2 250 12.7 NA
## 5 EWR 2013 1 1 5 39.0 28.0 64.4 260 12.7 NA
## 6 EWR 2013 1 1 6 37.9 28.0 67.2 240 11.5 NA
## 7 EWR 2013 1 1 7 39.0 28.0 64.4 240 15.0 NA
## 8 EWR 2013 1 1 8 39.9 28.0 62.2 250 10.4 NA
## 9 EWR 2013 1 1 9 39.9 28.0 62.2 260 15.0 NA
## 10 EWR 2013 1 1 10 41 28.0 59.6 260 13.8 NA
## # … with 26,105 more rows, 4 more variables: precip <dbl>, pressure <dbl>,
## # visib <dbl>, time_hour <dttm>, and abbreviated variable name ¹wind_gust
head(weather)
## # A tibble: 6 × 15
## origin year month day hour temp dewp humid wind_dir wind_speed wind_gust
## <chr> <int> <int> <int> <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 EWR 2013 1 1 1 39.0 26.1 59.4 270 10.4 NA
## 2 EWR 2013 1 1 2 39.0 27.0 61.6 250 8.06 NA
## 3 EWR 2013 1 1 3 39.0 28.0 64.4 240 11.5 NA
## 4 EWR 2013 1 1 4 39.9 28.0 62.2 250 12.7 NA
## 5 EWR 2013 1 1 5 39.0 28.0 64.4 260 12.7 NA
## 6 EWR 2013 1 1 6 37.9 28.0 67.2 240 11.5 NA
## # … with 4 more variables: precip <dbl>, pressure <dbl>, visib <dbl>,
## # time_hour <dttm>
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.0 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ lubridate 1.9.2 ✔ tibble 3.1.8
## ✔ purrr 1.0.1 ✔ tidyr 1.3.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
weather1<-filter(weather,month==1)
weather1
## # A tibble: 2,226 × 15
## origin year month day hour temp dewp humid wind_dir wind_speed wind_g…¹
## <chr> <int> <int> <int> <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 EWR 2013 1 1 1 39.0 26.1 59.4 270 10.4 NA
## 2 EWR 2013 1 1 2 39.0 27.0 61.6 250 8.06 NA
## 3 EWR 2013 1 1 3 39.0 28.0 64.4 240 11.5 NA
## 4 EWR 2013 1 1 4 39.9 28.0 62.2 250 12.7 NA
## 5 EWR 2013 1 1 5 39.0 28.0 64.4 260 12.7 NA
## 6 EWR 2013 1 1 6 37.9 28.0 67.2 240 11.5 NA
## 7 EWR 2013 1 1 7 39.0 28.0 64.4 240 15.0 NA
## 8 EWR 2013 1 1 8 39.9 28.0 62.2 250 10.4 NA
## 9 EWR 2013 1 1 9 39.9 28.0 62.2 260 15.0 NA
## 10 EWR 2013 1 1 10 41 28.0 59.6 260 13.8 NA
## # … with 2,216 more rows, 4 more variables: precip <dbl>, pressure <dbl>,
## # visib <dbl>, time_hour <dttm>, and abbreviated variable name ¹wind_gust
ggplot(weather1, aes(x=temp))+geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
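The message above suggests setting the bin width explicitly; for example (a 2-degree bin width is an arbitrary choice):
ggplot(weather1, aes(x = temp)) + geom_histogram(binwidth = 2)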
d) Using ggplot2, make a beautiful line plot of ‘temp’ as a function of
‘time_hour’
ggplot(weather1, aes(x=time_hour, y=temp))+geom_line()
ggplot(weather1, aes(x=origin, y=temp))+geom_boxplot()
7. Write content in Markdown syntax to replicate, as much as possible,
the format of the following sample text.
## Big Title
Some text using Markdown syntax
Some text in code format: computing with data
The main data objects in R are:
The R website is: http://www.r-project.org.
PRACTICE PROBLEMS – Fundamentals of statistics
undulation_rates<-c(0.9,1.4,1.2,1.2,1.3,2.0,1.4,1.6)
hist(undulation_rates)
b) Calculate the sample mean
mean(undulation_rates)
## [1] 1.375
range(undulation_rates)
## [1] 0.9 2.0
sd(undulation_rates)
## [1] 0.324037
coef_var<-function(standd,mean_undulation){
(standd/mean_undulation)*100
}
coef_var(.324037,1.375)
## [1] 23.56633
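The same coefficient of variation can be computed without hard-coding the numbers, using the coef_var function defined above:
coef_var(sd(undulation_rates), mean(undulation_rates))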
mean(c(112,128, 108, 129, 125, 153, 155, 132, 137))
## [1] 131
var(c(112,128, 108, 129, 125, 153, 155, 132, 137))
## [1] 254.5
sd(c(112,128, 108, 129, 125, 153, 155, 132, 137))
## [1] 15.95306
(15.95/131)*100   # coefficient of variation (%)
## [1] 12.17557
DesertBirdAbundance <- read.csv("DesertBirdAbundance.csv")
ggplot(DesertBirdAbundance, aes(abundance))+geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
b) Calculate the median and the mean of the bird abundance data.
mean(DesertBirdAbundance$abundance)
## [1] 74.76744
median
median(DesertBirdAbundance$abundance)
## [1] 18
range(DesertBirdAbundance$abundance)
## [1] 1 625
standard deviation
sd(DesertBirdAbundance$abundance)
## [1] 121.9473
coefficient of variation
(121.9473/74.76744)*100   # coefficient of variation (%)
## [1] 163.1021
4. Calculate the probability of each of the following events: a) A standard normally distributed variable is larger than 3
1-pnorm(3)
## [1] 0.001349898
#OR
pnorm(3, lower.tail = FALSE)
## [1] 0.001349898
pnorm(42, mean = 35, sd = 6, lower.tail = FALSE)
## [1] 0.1216725
dbinom(10,10,.8)
## [1] 0.1073742
The d prefix gives the density: for a discrete distribution, dbinom returns the exact probability mass, so no integration is needed.
pchisq(6.5, 2, lower.tail = FALSE)
## [1] 0.03877421
hist(rbinom(n = 5, size = 10, prob = 0.9))
6. Imagine height is genetically determined by the combined (that is,
the sum) effect of several genes (polygenic trait). Assume that each
gene has an effect on height as a uniform distribution with min=1 and
max=3. Simulate a stochastic model of height for 1,000 random people
based on 1 gene, 2 genes, and 5 genes. As we increase the number of
genes, what is the resulting height distribution?
hist(runif(1000,1,3))                                               # 1 gene
hist(runif(1000,1,3) + runif(1000,1,3))                             # 2 genes
hist(runif(1000,1,3) + runif(1000,1,3) + runif(1000,1,3) +
       runif(1000,1,3) + runif(1000,1,3))                           # 5 genes
As the number of genes increases, the distribution of the summed effects becomes increasingly bell-shaped (approximately normal), as expected from the central limit theorem.
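A more general sketch that works for any number of genes (the helper name simulate_height is illustrative):
simulate_height <- function(n_genes, n_people = 1000) {
  # each column is one gene's uniform(1, 3) effect; row sums give each person's height
  rowSums(matrix(runif(n_people * n_genes, min = 1, max = 3), nrow = n_people, ncol = n_genes))
}
hist(simulate_height(1))
hist(simulate_height(2))
hist(simulate_height(5))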
PRACTICE PROBLEMS – Statistical tests 1. Normal human body temperature
is 98.6 F. Researchers obtained body-temperature measurements on
randomly chosen healthy people: 98.4, 98.6, 97.8, 98.8, 97.9, 99.0,
98.2, 98.8, 98.8, 99.0, 98.0, 99.2, 99.5, 99.4, 98.4, 99.1, 98.4, 97.6,
97.4, 97.5, 97.5, 98.8, 98.6, 100.0, 98.4. a) Make a histogram of the
data
hist(c(98.4, 98.6, 97.8, 98.8, 97.9, 99.0, 98.2, 98.8, 98.8, 99.0, 98.0, 99.2, 99.5, 99.4, 98.4, 99.1, 98.4, 97.6, 97.4, 97.5, 97.5, 98.8, 98.6, 100.0, 98.4))
b) Make a normal quantile plot
qqnorm(c(98.4, 98.6, 97.8, 98.8, 97.9, 99.0, 98.2, 98.8, 98.8, 99.0, 98.0, 99.2, 99.5, 99.4, 98.4, 99.1, 98.4, 97.6, 97.4, 97.5, 97.5, 98.8, 98.6, 100.0, 98.4))
c) Perform a Shapiro-Wilk test to test for normality
shapiro.test(c(98.4, 98.6, 97.8, 98.8, 97.9, 99.0, 98.2, 98.8, 98.8, 99.0, 98.0, 99.2, 99.5, 99.4, 98.4, 99.1, 98.4, 97.6, 97.4, 97.5, 97.5, 98.8, 98.6, 100.0, 98.4))
##
## Shapiro-Wilk normality test
##
## data: c(98.4, 98.6, 97.8, 98.8, 97.9, 99, 98.2, 98.8, 98.8, 99, 98, 99.2, 99.5, 99.4, 98.4, 99.1, 98.4, 97.6, 97.4, 97.5, 97.5, 98.8, 98.6, 100, 98.4)
## W = 0.97216, p-value = 0.7001
The data are consistent with normality: the p-value is 0.7001, so we do not reject the null hypothesis of normality, and the high W statistic (0.97216) indicates that the sample quantiles fit the normal quantiles well.
t.test(c(98.4, 98.6, 97.8, 98.8, 97.9, 99.0, 98.2, 98.8, 98.8, 99.0, 98.0, 99.2, 99.5, 99.4, 98.4, 99.1, 98.4, 97.6, 97.4, 97.5, 97.5, 98.8, 98.6, 100.0, 98.4), mu=98.6)
##
## One Sample t-test
##
## data: c(98.4, 98.6, 97.8, 98.8, 97.9, 99, 98.2, 98.8, 98.8, 99, 98, 99.2, 99.5, 99.4, 98.4, 99.1, 98.4, 97.6, 97.4, 97.5, 97.5, 98.8, 98.6, 100, 98.4)
## t = -0.56065, df = 24, p-value = 0.5802
## alternative hypothesis: true mean is not equal to 98.6
## 95 percent confidence interval:
## 98.24422 98.80378
## sample estimates:
## mean of x
## 98.524
With a high p-value (0.58), we fail to reject the null hypothesis: the sample mean (98.52 °F) is not significantly different from 98.6 °F.
binom.test(31,41, p=0.5)
##
## Exact binomial test
##
## data: 31 and 41
## number of successes = 31, number of trials = 41, p-value = 0.00145
## alternative hypothesis: true probability of success is not equal to 0.5
## 95 percent confidence interval:
## 0.5969538 0.8763675
## sample estimates:
## probability of success
## 0.7560976
Yes, there is a preference: the p-value (0.00145) is well below 0.05, so we reject the null hypothesis that the probability of success is 0.5; the estimated proportion is 0.756 (95% CI 0.597–0.876).
Placebo <- c(37, 52, 68, 4, 29, 32, 19, 52, 19, 12)
Drug <- c(5, 23, 40, 3, 38, 19, 9, 24, 17, 14)
seizures <- data.frame(Placebo, Drug)
boxplot(seizures)
b) Test the difference
t.test(seizures$Placebo, seizures$Drug, paired=T)
##
## Paired t-test
##
## data: seizures$Placebo and seizures$Drug
## t = 2.7661, df = 9, p-value = 0.02189
## alternative hypothesis: true mean difference is not equal to 0
## 95 percent confidence interval:
## 2.404666 23.995334
## sample estimates:
## mean difference
## 13.2
p-value = 0.02189, so we reject the null hypothesis: the difference between placebo and drug is significant.
4. A bee biologist is analyzing whether there is an association between bee colony number and type of forest habitat. We expect that there is no habitat preference for bee colony number. Is this true based on these data? a) Make a barplot of the data
Bumblebee <-data.frame('habitat'= c('Oak','Hickory','Maple','Red cedar','Poplar'), 'bee.colonies'=c(33,30,29,4,4))
barplot(Bumblebee$bee.colonies, names.arg = Bumblebee$habitat)
b) Test the hypothesis of no-association
chisq.test(Bumblebee$bee.colonies)
##
## Chi-squared test for given probabilities
##
## data: Bumblebee$bee.colonies
## X-squared = 43.1, df = 4, p-value = 9.865e-09
Reject the null hypothesis of no association (p-value = 9.865e-09): bee colony counts differ among habitat types.
t.test(rnorm(25), mu=0)$p.value
## [1] 0.5461515
t.test(rt(25, df=2), mu=0)$p.value
## [1] 0.6911987
experiment.pval.fdr<-p.adjust(replicate(1000, t.test(rnorm(25), mu=0)$p.value), method="fdr")
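Since every replicate is simulated under the null hypothesis, any test that stays below 0.05 after the FDR adjustment is a false positive; a quick count (0.05 is the usual cutoff):
sum(experiment.pval.fdr < 0.05)   # number of false positives remaining after FDR correction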
PRACTICE PROBLEMS – ANOVA 1. You are hired by the US Department of Agriculture to develop effective control practices for invasive plants (data_ANOVA.xlsx). In particular, you are asked to investigate pesticide controls for kudzu which is an invasive vine that grows in thick mats that smother underlying plants. Two of the most widely used pesticides for kudzu are glyphosate and triclopyr. To determine the effectiveness of a single application of pesticide, you conduct an experiment in 18 plots that each had 50% kudzu cover. In mid-summer, you applied equal amounts of 2% glyphosate to 6 plots, 2% triclopyr to 6 plots, and water without pesticides to 6 plots. Then you returned in autumn and measured the percent cover of kudzu in the plots. a) Transform the data from wide to long format (Google how to do it - Hint: function melt from reshape2 package)
library(reshape2)
##
## Attaching package: 'reshape2'
## The following object is masked from 'package:tidyr':
##
## smiths
herbicide_wide <-read.csv("data_ANOVA.csv")
herbicide_experiment<-melt(herbicide_wide, id.vars=NULL)
library(ggplot2)
ggplot(herbicide_experiment, aes(x = variable, y = value)) +
geom_boxplot() +
ggtitle("Percent Cover of Kudzu After Treatment") +
xlab("Treatment") +
ylab("Percent Cover")
c) Perform an ANOVA
anova(lm(value~variable, data= herbicide_experiment))
## Analysis of Variance Table
##
## Response: value
## Df Sum Sq Mean Sq F value Pr(>F)
## variable 2 763 381.5 54.5 1.318e-07 ***
## Residuals 15 105 7.0
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summary(lm(herbicide_experiment$value~herbicide_experiment$variable))
##
## Call:
## lm(formula = herbicide_experiment$value ~ herbicide_experiment$variable)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.500 -1.875 0.000 1.875 4.500
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 19.500 1.080 18.053 1.38e-11 ***
## herbicide_experiment$variableTriclopyr -4.500 1.528 -2.946 0.01 *
## herbicide_experiment$variableControl 11.000 1.528 7.201 3.07e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.646 on 15 degrees of freedom
## Multiple R-squared: 0.879, Adjusted R-squared: 0.8629
## F-statistic: 54.5 on 2 and 15 DF, p-value: 1.318e-07
TukeyHSD(aov(value~variable, data=herbicide_experiment))
## Tukey multiple comparisons of means
## 95% family-wise confidence level
##
## Fit: aov(formula = value ~ variable, data = herbicide_experiment)
##
## $variable
## diff lwr upr p adj
## Triclopyr-Glyphosate -4.5 -8.467701 -0.5322987 0.0255594
## Control-Glyphosate 11.0 7.032299 14.9677013 0.0000087
## Control-Triclopyr 15.5 11.532299 19.4677013 0.0000001
animal_diet_study<-read.table("growth.txt", sep="\t", header=T)
boxplot(gain~diet, data=animal_diet_study)
boxplot(gain~supplement, data=animal_diet_study)
boxplot(gain~diet:supplement, data=animal_diet_study)
anova(lm(gain~diet*supplement, data=animal_diet_study))
## Analysis of Variance Table
##
## Response: gain
## Df Sum Sq Mean Sq F value Pr(>F)
## diet 2 287.171 143.586 83.5201 2.999e-14 ***
## supplement 3 91.881 30.627 17.8150 2.952e-07 ***
## diet:supplement 6 3.406 0.568 0.3302 0.9166
## Residuals 36 61.890 1.719
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
The p-values indicate that diet and supplement each have an independent effect on weight gain, but the interaction term is not significant; there is no evidence of a combined (interaction) effect of diet and supplement on weight gain.
interaction.plot(animal_diet_study$supplement, animal_diet_study$diet, animal_diet_study$gain)
Mean weight gain appears to be influenced by the supersupp and agrimore supplements, as suggested by the crossing lines in the interaction plot.
anova(lm(gain~diet+supplement, data=animal_diet_study))
## Analysis of Variance Table
##
## Response: gain
## Df Sum Sq Mean Sq F value Pr(>F)
## diet 2 287.171 143.586 92.358 4.199e-16 ***
## supplement 3 91.881 30.627 19.700 3.980e-08 ***
## Residuals 42 65.296 1.555
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summary(lm(gain~diet+supplement, data=animal_diet_study))
##
## Call:
## lm(formula = gain ~ diet + supplement, data = animal_diet_study)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.30792 -0.85929 -0.07713 0.92052 2.90615
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 26.1230 0.4408 59.258 < 2e-16 ***
## dietoats -3.0928 0.4408 -7.016 1.38e-08 ***
## dietwheat -5.9903 0.4408 -13.589 < 2e-16 ***
## supplementcontrol -2.6967 0.5090 -5.298 4.03e-06 ***
## supplementsupergain -3.3815 0.5090 -6.643 4.72e-08 ***
## supplementsupersupp -0.7274 0.5090 -1.429 0.16
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.247 on 42 degrees of freedom
## Multiple R-squared: 0.8531, Adjusted R-squared: 0.8356
## F-statistic: 48.76 on 5 and 42 DF, p-value: < 2.2e-16
Multiple R-squared: 0.8531, Adjusted R-squared: 0.8356. So approximately 85% of the variation in weight gain is explained by this additive model.
TukeyHSD(aov(gain~diet+supplement, data=animal_diet_study))
## Tukey multiple comparisons of means
## 95% family-wise confidence level
##
## Fit: aov(formula = gain ~ diet + supplement, data = animal_diet_study)
##
## $diet
## diff lwr upr p adj
## oats-barley -3.092817 -4.163817 -2.021817 0e+00
## wheat-barley -5.990298 -7.061298 -4.919297 0e+00
## wheat-oats -2.897481 -3.968481 -1.826481 2e-07
##
## $supplement
## diff lwr upr p adj
## control-agrimore -2.6967005 -4.0583332 -1.3350677 0.0000234
## supergain-agrimore -3.3814586 -4.7430914 -2.0198259 0.0000003
## supersupp-agrimore -0.7273521 -2.0889849 0.6342806 0.4888738
## supergain-control -0.6847581 -2.0463909 0.6768746 0.5400389
## supersupp-control 1.9693484 0.6077156 3.3309811 0.0020484
## supersupp-supergain 2.6541065 1.2924737 4.0157392 0.0000307
PRACTICE PROBLEMS – Correlation 1. In a study of hyena laughter, a researcher investigated whether sound spectral properties of hyenas’ giggles are associated with age. a) Inspect the data using a scatterplot
hyena_laughter <- data.frame(age = c(2,2,2,6,9,10,13,10,14,14,12,7,11,11,14,20),
                             freq = c(840,670,580,470,540,660,510,520,500,480,400,650,460,500,580,500))
plot(hyena_laughter)
cor.test(hyena_laughter$age, hyena_laughter$freq, method="pearson")
##
## Pearson's product-moment correlation
##
## data: hyena_laughter$age and hyena_laughter$freq
## t = -2.8194, df = 14, p-value = 0.01365
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.8453293 -0.1511968
## sample estimates:
## cor
## -0.601798
cor(hyena_laughter$age, hyena_laughter$freq, method="spearman")
## [1] -0.5397807
cor(hyena_laughter, method = "spearman")
##            age       freq
## age   1.0000000 -0.5397807
## freq -0.5397807  1.0000000
PRACTICE PROBLEMS – Regression 1. Testosterone is known to predict aggressive behavior and is involved in face shape during puberty. Does face shape predict aggression? Researchers measured the face width-to-height ratio of 21 university hockey players with the average number of penalty minutes awarded per game for aggressive infractions. a) How steeply does the number of penalty minutes increase per unit increase in face ratio? Calculate the estimate of the intercept. Write the result in the form of an equation for the line.
hoeckey_players<-read.table("face.txt", sep="\t", header=T)
plot(hoeckey_players$Face, hoeckey_players$Penalty)
lm(Penalty ~ Face, data=hoeckey_players)
##
## Call:
## lm(formula = Penalty ~ Face, data = hoeckey_players)
##
## Coefficients:
## (Intercept) Face
## -4.505 3.189
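The coefficients above give the fitted line for part a): Penalty = -4.505 + 3.189 * Face, i.e. penalty minutes increase by about 3.19 per unit increase in face ratio. They can also be extracted directly from the model object:
coef(lm(Penalty ~ Face, data = hoeckey_players))   # intercept and slope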
summary(lm(Penalty ~ Face, data=hoeckey_players))
##
## Call:
## lm(formula = Penalty ~ Face, data = hoeckey_players)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.9338 -0.3883 -0.1260 0.3381 1.0711
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -4.505 2.089 -2.157 0.0440 *
## Face 3.189 1.150 2.774 0.0121 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.6129 on 19 degrees of freedom
## Multiple R-squared: 0.2882, Adjusted R-squared: 0.2508
## F-statistic: 7.694 on 1 and 19 DF, p-value: 0.0121
plot(hoeckey_players$Face, hoeckey_players$Penalty)
abline(lm(Penalty ~ Face, data=hoeckey_players), col="blue", lwd=3)
library(performance)
check_model(lm(Penalty ~ Face, data=hoeckey_players))
## Not enough model terms in the conditional part of the model to check for
## multicollinearity.
2. Respiratory rate (Y) is expected to depend on body mass (X) by the
power law, Y = aX^β, where β is the scaling exponent. The data are
available in respiratoryrate_bodymass.txt. a) Make a scatterplot of the
raw data
respiratory_rates<-read.table("respiratoryrate_bodymass.txt", sep="\t", header=T)
plot(respiratory_rates)
plot(log10(respiratory_rates$BodyMass),log10(respiratory_rates$Respiration), xlab="Body mass", ylab="Respiration rate", main = "Respiration Rates")
respiratory_rate_model<-lm(log10(respiratory_rates$BodyMass) ~ log10(respiratory_rates$Respiration))
d) Carry out a formal test of the null hypothesis that beta = 0
check_model(respiratory_rate_model)
## Warning: Using `$` in model formulas can produce unexpected results. Specify your
## model using the `data` argument instead.
## Try: log10(BodyMass) ~
## log10(Respiration), data = respiratory_rates
## Not enough model terms in the conditional part of the model to check for
## multicollinearity.
e) What is the variation explained (R-squared)?
summary(respiratory_rate_model)
##
## Call:
## lm(formula = log10(respiratory_rates$BodyMass) ~ log10(respiratory_rates$Respiration))
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.24453 -0.15388 0.01637 0.11046 0.36318
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.1627 0.5779 0.282 0.785401
## log10(respiratory_rates$Respiration) 0.9196 0.1684 5.461 0.000601 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.2001 on 8 degrees of freedom
## Multiple R-squared: 0.7885, Adjusted R-squared: 0.762
## F-statistic: 29.82 on 1 and 8 DF, p-value: 0.000601
f) Check the model assumptions
check_posterior_predictions(respiratory_rate_model)
PRACTICE PROBLEMS – Statistical Modeling 1. Use the ozone.txt data to model the ozone concentration as a linear function of wind speed, air temperature and the intensity of solar radiation. Assume that the requirements to perform a linear regression are met. a) Make a multiple panel bivariate scatterplot
ozone_data<-read.table("ozone.txt", sep="\t", header=T)
pairs( ~ ozone + rad + temp + wind, data = ozone_data, row1attop=TRUE)
ozone_regression<- lm(ozone ~ rad+temp+wind,ozone_data)
summary(ozone_regression)
##
## Call:
## lm(formula = ozone ~ rad + temp + wind, data = ozone_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -40.485 -14.210 -3.556 10.124 95.600
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -64.23208 23.04204 -2.788 0.00628 **
## rad 0.05980 0.02318 2.580 0.01124 *
## temp 1.65121 0.25341 6.516 2.43e-09 ***
## wind -3.33760 0.65384 -5.105 1.45e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 21.17 on 107 degrees of freedom
## Multiple R-squared: 0.6062, Adjusted R-squared: 0.5952
## F-statistic: 54.91 on 3 and 107 DF, p-value: < 2.2e-16
What is the variation explained (R²)? Approximately 60% of the variation in ozone concentration is explained by this model (multiple R² = 0.606).
Assess the collinearity of the explanatory variables using the variance inflation factor
library(car)
## Loading required package: carData
##
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
##
## recode
## The following object is masked from 'package:purrr':
##
## some
vif(ozone_regression)
## rad temp wind
## 1.095241 1.431201 1.328979
check_model(ozone_regression)
diminish <- read.csv(file = "diminish.txt", sep = "\t", header = T)
diminish_linear_reg <- lm(yv ~ xv, data = diminish)
summary(diminish_linear_reg)
##
## Call:
## lm(formula = yv ~ xv, data = diminish)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.3584 -2.1206 -0.3218 1.8763 4.1782
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 27.61438 1.17315 23.54 7.66e-14 ***
## xv 0.22683 0.02168 10.46 1.45e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.386 on 16 degrees of freedom
## Multiple R-squared: 0.8725, Adjusted R-squared: 0.8646
## F-statistic: 109.5 on 1 and 16 DF, p-value: 1.455e-08
b) Perform a polynomial (second-degree) regression
diminish_poly_reg<-lm(yv ~ xv + I(xv^2), data = diminish)
summary(diminish_poly_reg)
##
## Call:
## lm(formula = yv ~ xv + I(xv^2), data = diminish)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.0490 -1.2890 0.6018 1.1829 2.9610
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 24.6323529 1.6916139 14.561 2.95e-10 ***
## xv 0.4057534 0.0819860 4.949 0.000175 ***
## I(xv^2) -0.0018834 0.0008386 -2.246 0.040203 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.131 on 15 degrees of freedom
## Multiple R-squared: 0.9046, Adjusted R-squared: 0.8919
## F-statistic: 71.12 on 2 and 15 DF, p-value: 2.222e-08
AIC(diminish_linear_reg)
## [1] 86.26193
AIC(diminish_poly_reg)
## [1] 83.04403
With a lower AIC value (83.0 vs 86.3), the polynomial regression is the better model.
d) Make a scatterplot of the data and include both regression lines
ggplot(data = diminish, aes(x = xv, y = yv)) +
geom_point() +
geom_smooth(method = "lm", formula = y ~ x, se = FALSE, color = "salmon") +
geom_smooth(method = "lm", formula = y ~ poly(x, 2), se = FALSE, color = "turquoise") +
labs(title = "Diminish Scatterplot",
x = "XV",
y = "YV")
3. The data in stork.txt display the stress-induced corticosterone
levels circulating in the blood of European white storks and their
survival over the subsequent five years of study. a) Make a scatterplot
of the data
stork_data <- read.csv(file = "stork.txt", sep = "\t", header = T)
plot(stork_data)
stork_glm_log_reg<-glm(Survival~Corticosterone, family=binomial, data=stork_data)
summary(stork_glm_log_reg)
##
## Call:
## glm(formula = Survival ~ Corticosterone, family = binomial, data = stork_data)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.4316 -0.8724 -0.6703 1.2125 1.8211
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 2.70304 1.74725 1.547 0.1219
## Corticosterone -0.07980 0.04368 -1.827 0.0677 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 45.234 on 33 degrees of freedom
## Residual deviance: 41.396 on 32 degrees of freedom
## AIC: 45.396
##
## Number of Fisher Scoring iterations: 4
library(ggeffects)
predicted_cort <- ggpredict(stork_glm_log_reg, terms = "Corticosterone[all]")
plot(predicted_cort)
r2(stork_glm_log_reg)
## # R2 for Logistic Regression
## Tjur's R2: 0.102
check_model(stork_glm_log_reg)
## Not enough model terms in the conditional part of the model to check for
## multicollinearity.
4. The clusters.txt dataset contains the response variable Cancers (the
number of reported cancer cases per year per clinic) and the explanatory
variable Distance (the distance from a nuclear plant to the clinic in
kilometers). a) Make a scatterplot of the data
clusters <- read.csv(file = "clusters.txt", sep = "\t", header = T)
plot(clusters)
Which regression is more appropriate for these data? (Don't take overdispersion into account for now.)
These are count data, so I am inclined to use a Poisson regression.
Given your choice, is the trend significant?
cancer_poisson <- glm(Cancers ~ Distance, data=clusters, family=poisson)
summary(cancer_poisson)
##
## Call:
## glm(formula = Cancers ~ Distance, family = poisson, data = clusters)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.5504 -1.3491 -1.1553 0.3877 3.1304
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.186865 0.188728 0.990 0.3221
## Distance -0.006138 0.003667 -1.674 0.0941 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for poisson family taken to be 1)
##
## Null deviance: 149.48 on 93 degrees of freedom
## Residual deviance: 146.64 on 92 degrees of freedom
## AIC: 262.41
##
## Number of Fisher Scoring iterations: 5
The p-value for Distance (0.0941) is greater than 0.05, so the trend is not significant. Nagelkerke's R² is 0.037, so the model explains very little of the variation.
ggplot(clusters, aes(x=Distance, y=Cancers)) +
geom_point() +
geom_smooth(method="glm", method.args=list(family="poisson"), se=FALSE)
## `geom_smooth()` using formula = 'y ~ x'
check_overdispersion(cancer_poisson)
## # Overdispersion test
##
## dispersion ratio = 1.555
## Pearson's Chi-Squared = 143.071
## p-value = 0.001
## Overdispersion detected.
Yes: overdispersion is detected (dispersion ratio = 1.555, Pearson's chi-squared = 143.071, p-value = 0.001).
library(MASS)
##
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
##
## select
cancer_negative_binomial<-glm.nb(Cancers ~ Distance, data=clusters)
summary(cancer_negative_binomial)
##
## Call:
## glm.nb(formula = Cancers ~ Distance, data = clusters, init.theta = 1.359466981,
## link = log)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.3103 -1.1805 -1.0442 0.3065 1.9582
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.182490 0.252434 0.723 0.470
## Distance -0.006041 0.004727 -1.278 0.201
##
## (Dispersion parameter for Negative Binomial(1.3595) family taken to be 1)
##
## Null deviance: 96.647 on 93 degrees of freedom
## Residual deviance: 94.973 on 92 degrees of freedom
## AIC: 253.19
##
## Number of Fisher Scoring iterations: 1
##
##
## Theta: 1.359
## Std. Err.: 0.612
##
## 2 x log-likelihood: -247.191
check_model(cancer_negative_binomial)
## Not enough model terms in the conditional part of the model to check for
## multicollinearity.
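The two count models can also be compared side by side with AIC (the values reported above are 262.41 for the Poisson fit and 253.19 for the negative binomial):
AIC(cancer_poisson, cancer_negative_binomial)   # lower AIC favours the negative binomial model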
5. Use the jaws.txt data to: a) Make a scatterplot of the data (age
explanatory, bone response)
jaws <- read.csv(file = "jaws.txt", sep = "\t", header = T)
ggplot(jaws, aes(x = age, y = bone)) +
geom_point() +
labs(x = "Age (years)", y = "Bone length (mm)",
title = "Scatterplot of Jaws")
b) Perform a non-linear regression assuming an asymptotic exponential
relationship: y = a(1 − e^(−cx))
range(jaws$bone)
## [1] 0 142
jaws_lm <- lm(bone ~ age, data = jaws)
summary(jaws_lm)
##
## Call:
## lm(formula = bone ~ age, data = jaws)
##
## Residuals:
## Min 1Q Median 3Q Max
## -53.259 -10.457 2.353 18.048 33.589
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 53.259 5.837 9.124 2.23e-12 ***
## age 1.642 0.201 8.166 6.97e-11 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 22.3 on 52 degrees of freedom
## Multiple R-squared: 0.5619, Adjusted R-squared: 0.5534
## F-statistic: 66.68 on 1 and 52 DF, p-value: 6.968e-11
exp_model <- nls(bone ~ a*(1-exp(-c*age)), data = jaws, start = list(a = 142, c = 0.6088))
summary(exp_model)
##
## Formula: bone ~ a * (1 - exp(-c * age))
##
## Parameters:
## Estimate Std. Error t value Pr(>|t|)
## a 115.58052 2.84364 40.645 < 2e-16 ***
## c 0.11882 0.01233 9.635 3.69e-13 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 13.1 on 52 degrees of freedom
##
## Number of iterations to convergence: 6
## Achieved convergence tolerance: 1.423e-06
mm_model <- nls(bone ~ a*age/(1+b*age), data = jaws, start = list(a = 142, b = 0.6088))
summary(mm_model)
##
## Formula: bone ~ a * age/(1 + b * age)
##
## Parameters:
## Estimate Std. Error t value Pr(>|t|)
## a 18.72540 2.52587 7.413 1.09e-09 ***
## b 0.13596 0.02339 5.814 3.79e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 13.77 on 52 degrees of freedom
##
## Number of iterations to convergence: 8
## Achieved convergence tolerance: 2.686e-06
jaws_null <- lm(bone ~ 1, data = jaws)
1 - (sum(residuals(exp_model)^2)/sum(residuals(jaws_null)^2))
## [1] 0.8486791
1 - (sum(residuals(mm_model)^2)/sum(residuals(jaws_null)^2))
## [1] 0.8329987
AIC(exp_model)
## [1] 435.0823
AIC(mm_model)
## [1] 440.4066
The asymptotic exponential model is better, with a lower AIC score (435.1 vs 440.4).
ggplot(jaws, aes(x = age, y = bone)) +
geom_point() +
labs(x = "Age (years)", y = "Bone length (mm)",
title = "Scatterplot of Jaws") +
geom_smooth(method = "nls",
formula = y ~ a*(1-exp(-c*x)),
se = FALSE,
method.args = list(start = c(a = 140, c = 0.1)),
color = "blue") +
geom_smooth(method = "nls",
formula = y ~ a*x/(1+b*x),
se = FALSE,
method.args = list(start = c(a = 100, b = 0.01)),
color = "orange")
6. In a recent paper we read: “Linear mixed effects modelling fit by
restricted maximum likelihood was used to explain the variations in
growth. The linear mixed effects model was generated using the lmer
function in the R package lme4, with turbidity, temperature, tide, and
wave action set as fixed factors and site and date set as random
effects”. Write down the R code to recreate their model.
lmer(growth ~ turbidity + temperature + tide + wave_action + (1 | site) + (1 | date))
glmer(germination_rate ~ soil.type + sterilization + (1 | greenhouse/pot.number),
      family = binomial)   # family is an assumption here; it depends on how germination_rate is recorded
A linear regression
What does the solid black line depict? The mean of the 4 trends.
What do the 4 thin dashed lines represent? The trend line for each subject.
load("dragons.RData")
attach(dragons)
dragon_linearreg <- lm(testScore ~ bodyLength)
summary(dragon_linearreg)
##
## Call:
## lm(formula = testScore ~ bodyLength)
##
## Residuals:
## Min 1Q Median 3Q Max
## -56.962 -16.411 -0.783 15.193 55.200
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -61.31783 12.06694 -5.081 5.38e-07 ***
## bodyLength 0.55487 0.05975 9.287 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 21.2 on 478 degrees of freedom
## Multiple R-squared: 0.1529, Adjusted R-squared: 0.1511
## F-statistic: 86.25 on 1 and 478 DF, p-value: < 2.2e-16
ggplot(dragons, aes(x = bodyLength, y = testScore)) +
  geom_point(shape = 5) +
  geom_smooth(method = lm)
## `geom_smooth()` using formula = 'y ~ x'
ggplot(dragons, aes(x = mountainRange, y = bodyLength)) +
  geom_boxplot(color = "pink")
ggplot(dragons, aes(x = testScore, y = bodyLength)) +
  geom_point(aes(color = mountainRange))
ggplot(data = dragons, aes(x = bodyLength, y = testScore)) + geom_point() + facet_wrap(. ~ mountainRange)
detach(dragons)
library(lme4)
## Loading required package: Matrix
##
## Attaching package: 'Matrix'
## The following objects are masked from 'package:tidyr':
##
## expand, pack, unpack
dragon_linear_mixed<-lmer(dragons$bodyLength~dragons$testScore+(1|dragons$mountainRange))
summary(dragon_linear_mixed)
## Linear mixed model fit by REML ['lmerMod']
## Formula: dragons$bodyLength ~ dragons$testScore + (1 | dragons$mountainRange)
##
## REML criterion at convergence: 3472.3
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -2.44503 -0.75468 -0.06233 0.72124 2.56768
##
## Random effects:
## Groups Name Variance Std.Dev.
## dragons$mountainRange (Intercept) 212.33 14.571
## Residual 74.73 8.645
## Number of obs: 480, groups: dragons$mountainRange, 8
##
## Fixed effects:
## Estimate Std. Error t value
## (Intercept) 2.009e+02 5.337e+00 37.646
## dragons$testScore 8.006e-03 2.652e-02 0.302
##
## Correlation of Fixed Effects:
## (Intr)
## drgns$tstSc -0.250
dragon_linear_fixed <- lm(dragons$testScore ~ dragons$bodyLength+dragons$mountainRange)
summary(dragon_linear_fixed)
##
## Call:
## lm(formula = dragons$testScore ~ dragons$bodyLength + dragons$mountainRange)
##
## Residuals:
## Min 1Q Median 3Q Max
## -52.263 -9.926 0.361 9.994 44.488
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 20.83051 14.47218 1.439 0.15072
## dragons$bodyLength 0.01267 0.07974 0.159 0.87379
## dragons$mountainRangeCentral 36.58277 3.59929 10.164 < 2e-16 ***
## dragons$mountainRangeEmmental 16.20923 3.69665 4.385 1.43e-05 ***
## dragons$mountainRangeJulian 45.11469 4.19012 10.767 < 2e-16 ***
## dragons$mountainRangeLigurian 17.74779 3.67363 4.831 1.84e-06 ***
## dragons$mountainRangeMaritime 49.88133 3.13924 15.890 < 2e-16 ***
## dragons$mountainRangeSarntal 41.97841 3.19717 13.130 < 2e-16 ***
## dragons$mountainRangeSouthern 8.51961 2.73128 3.119 0.00192 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 14.96 on 471 degrees of freedom
## Multiple R-squared: 0.5843, Adjusted R-squared: 0.5773
## F-statistic: 82.76 on 8 and 471 DF, p-value: < 2.2e-16
r2(dragon_linear_fixed)
## # R2 for Linear Regression
## R2: 0.584
## adj. R2: 0.577
check_model(dragon_linear_fixed)
## Warning: Using `$` in model formulas can produce unexpected results. Specify your
## model using the `data` argument instead.
## Try: testScore ~ bodyLength +
## mountainRange, data = dragons
fitlm <- lm(dragons$bodyLength ~ dragons$mountainRange)
dragons$predlm <- predict(fitlm)
ggplot(dragons, aes(x = testScore, y = bodyLength)) +
  geom_point(aes(color = mountainRange)) +
  geom_line(aes(y = predlm))
library(lme4)
library(lmerTest)
Estuaries <- read.csv("Estuaries.csv", header = TRUE, row.names = 1)
Estuary_model <- lmer(Total ~ Modification + (1 | Estuary), data = Estuaries)
summary(Estuary_model)
r2(Estuary_model)
ggplot(Estuaries, aes(x = Modification, y = Total)) + geom_boxplot()+ geom_jitter(aes(color=Estuary))
Estuary_lmer_site <- lmer(Total ~ Modification + (1 | Estuary/Site), data = Estuaries)
summary(Estuary_lmer_site)
library(performance)
r2(Estuary_lmer_site)
check_model(Estuary_lmer_site)
ggplot(Estuaries, aes(x = Modification, y = Total)) + geom_boxplot()+ geom_jitter(aes(color=Estuary))+ facet_grid(. ~ Site)
Estuaries$HydroidPres <- Estuaries$Hydroid > 0
hydro_glmm <- glmer(HydroidPres ~ Modification + (1 | Estuary/Site), family = binomial, data = Estuaries)
summary(hydro_glmm)
check_model(hydro_glmm)
PRACTICE PROBLEMS – Time series and spatial analysis 1. Write a function to calculate the 3-point moving average for an input vector. The formula is: y'_i = (y_(i−1) + y_i + y_(i+1)) / 3
moving.average <- function(x) {
  y <- numeric(length(x) - 2)                      # output is two elements shorter than the input
  for (i in 2:(length(x) - 1)) {
    y[i - 1] <- (x[i - 1] + x[i] + x[i + 1]) / 3   # average of each point and its two neighbours
  }
  return(y)
}
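For example, on a short vector the function returns one average per interior point:
moving.average(c(1, 4, 7, 10, 13))   # 4 7 10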
temp <- read.table("temp.txt", header = TRUE)
temp <- ts(temp, frequency = 12)
plot(temp)
install.packages("TTR", repos = "http://cran.us.r-project.org/")
##
## The downloaded binary packages are in
## /var/folders/mq/s2l1k_j90hqf8c3g5zymvzsh0000gn/T//RtmpTLg7mZ/downloaded_packages
library(TTR)
temp.m<-SMA(temp, n=5)
plot(temp, type="l", col="black")
lines(temp.m, col="red", lwd=2)
temp.decompseries <- decompose(temp)
plot(temp.decompseries)
acf(temp)
acf(temp.decompseries$random[!is.na(temp.decompseries$random)])
pacf(temp)
library(forecast)
temp.new <- auto.arima(temp)
summary(temp.new)
checkresiduals(temp.new)
temp.fcast <- forecast(temp.new)
plot(temp.fcast)
bac.dust <- read.table("bac_dust.txt", header = T)
library(maps)
##
## Attaching package: 'maps'
## The following object is masked from 'package:purrr':
##
## map
library(ggplot2)
world_map <- map_data("world")
ggplot(world_map)+
geom_polygon(aes(x = long, y = lat, group = group), fill = "lightgray", colour = "black", size = 0.1)+
geom_point(data = bac.dust, aes(x = Longitude, y = Latitude, color = Continent)) +
theme_bw()
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
ggplot(world_map)+
geom_polygon(aes(x = long, y = lat, group = group), fill = "lightgray", colour = "black", size = 0.1)+
geom_point(data = bac.dust, aes(x = Longitude, y = Latitude, color = Continent, size = Richness)) +
theme_bw()
library(spatial)
bac.dust.dists <- as.matrix(dist(cbind(bac.dust$Richness, bac.dust$Proteobacteria)))
bac.dust.dists.inv <- 1/bac.dust.dists
diag(bac.dust.dists.inv) <- 0
library(ape)
##
## Attaching package: 'ape'
## The following object is masked from 'package:dplyr':
##
## where
Moran.I(bac.dust$Richness, bac.dust.dists.inv)
## $observed
## [1] 0.6605168
##
## $expected
## [1] -0.003225806
##
## $sd
## [1] 0.03182156
##
## $p.value
## [1] 0
Moran.I(bac.dust$Proteobacteria, bac.dust.dists.inv)
## $observed
## [1] 0.04809948
##
## $expected
## [1] -0.003225806
##
## $sd
## [1] 0.03180571
##
## $p.value
## [1] 0.1065897
anova(lm(Richness ~ Proteobacteria, data = bac.dust))
## Analysis of Variance Table
##
## Response: Richness
## Df Sum Sq Mean Sq F value Pr(>F)
## Proteobacteria 1 38901 38901 0.5349 0.4651
## Residuals 309 22471204 72722
install.packages("spaMM", repos = "http://cran.us.r-project.org/" )
##
## The downloaded binary packages are in
## /var/folders/mq/s2l1k_j90hqf8c3g5zymvzsh0000gn/T//RtmpTLg7mZ/downloaded_packages
library(spaMM)
## Registered S3 methods overwritten by 'registry':
## method from
## print.registry_field proxy
## print.registry_entry proxy
## spaMM (Rousset & Ferdy, 2014, version 4.2.1) is loaded.
## Type 'help(spaMM)' for a short introduction,
## 'news(package='spaMM')' for news,
## and 'citation('spaMM')' for proper citation.
## Further infos, slides, etc. at https://gitlab.mbb.univ-montp2.fr/francois/spamm-ref.
fitme(Richness ~ Proteobacteria + Matern(1|Latitude + Longitude), data = bac.dust)
## If the 'RSpectra' package were installed, an extreme eigenvalue computation could be faster.
## formula: Richness ~ Proteobacteria + Matern(1 | Latitude + Longitude)
## ML: Estimation of corrPars, lambda and phi by ML.
## Estimation of fixed effects by ML.
## Estimation of lambda and phi by 'outer' ML, maximizing logL.
## family: gaussian( link = identity )
## ------------ Fixed effects (beta) ------------
## Estimate Cond. SE t-value
## (Intercept) 411.58 48.47 8.4910
## Proteobacteria 37.66 112.90 0.3336
## --------------- Random effects ---------------
## Family: gaussian( link = identity )
## --- Correlation parameters:
## 1.nu 1.rho
## 0.2041721 0.3145339
## --- Variance parameters ('lambda'):
## lambda = var(u) for u ~ Gaussian;
## Latitude . : 19190
## # of obs: 311; # of groups: Latitude ., 311
## -------------- Residual variance ------------
## phi estimate was 54712.9
## ------------- Likelihood values -------------
## logLik
## logL (p_v(h)): -2170.262
The fitted Matérn correlation parameters are nu = 0.204 and rho = 0.315; the spatial correlation is stronger between locations that are closer together.
library(maps)
Spain_map <- map_data("world", region = "spain")
head(Spain_map)
## long lat group order region subregion
## 1 1.593945 38.67207 1 1 Spain Formentera
## 2 1.571191 38.65884 1 2 Spain Formentera
## 3 1.504980 38.67100 1 3 Spain Formentera
## 4 1.405762 38.67100 1 4 Spain Formentera
## 5 1.401953 38.71143 1 5 Spain Formentera
## 6 1.417188 38.73965 1 6 Spain Formentera
ggplot(Spain_map) +
geom_polygon(aes(x = long, y = lat, group = group), fill='firebrick')+
theme_bw()
cities <- get('world.cities')
head(cities)
## name country.etc pop lat long capital
## 1 'Abasan al-Jadidah Palestine 5629 31.31 34.34 0
## 2 'Abasan al-Kabirah Palestine 18999 31.32 34.35 0
## 3 'Abdul Hakim Pakistan 47788 30.55 72.11 0
## 4 'Abdullah-as-Salam Kuwait 21817 29.36 47.98 0
## 5 'Abud Palestine 2456 32.03 35.07 0
## 6 'Abwein Palestine 3434 32.03 35.20 0
cities_Spain <- cities[cities$country.etc == 'Spain',]
ggplot(Spain_map) +
geom_polygon(aes(x = long, y = lat, group = group), fill='firebrick') +
geom_point(data = cities_Spain, aes(x = long, y = lat, size = pop), alpha = 0.5) +
theme_bw()
c) Visit Spain and enjoy! (You buying the ticket?)
PRACTICE PROBLEMS – Diversity analysis 1. Load the dune_bio.txt dataset. Species should be in columns and sites in rows.
dune_bio <- read.table("dune_bio.txt", header = T, sep = "\t", row.names = 1)
library(vegan)
## Loading required package: permute
##
## Attaching package: 'permute'
## The following object is masked from 'package:spaMM':
##
## how
## Loading required package: lattice
## This is vegan 2.6-4
sum(dune_bio)
## [1] 685
colSums(dune_bio)
## Belper Empnig Junbuf Junart Airpra Elepal Rumace Viclat Brarut Ranfla Cirarv
## 13 2 13 18 5 25 18 4 49 14 2
## Hyprad Leoaut Potpal Poapra Calcus Tripra Trirep Antodo Salrep Achmil Poatri
## 9 54 4 48 10 9 47 21 11 16 63
## Chealb Elyrep Sagpro Plalan Agrsto Lolper Alogen Brohor
## 1 26 20 26 48 58 36 15
colMeans(dune_bio)
## Belper Empnig Junbuf Junart Airpra Elepal Rumace Viclat Brarut Ranfla Cirarv
## 0.65 0.10 0.65 0.90 0.25 1.25 0.90 0.20 2.45 0.70 0.10
## Hyprad Leoaut Potpal Poapra Calcus Tripra Trirep Antodo Salrep Achmil Poatri
## 0.45 2.70 0.20 2.40 0.50 0.45 2.35 1.05 0.55 0.80 3.15
## Chealb Elyrep Sagpro Plalan Agrsto Lolper Alogen Brohor
## 0.05 1.30 1.00 1.30 2.40 2.90 1.80 0.75
rowSums(dune_bio)
## 2 13 4 16 6 1 8 5 17 15 10 11 9 18 3 20 14 19 12 7
## 42 33 45 33 48 18 40 43 15 23 43 32 42 27 40 31 24 31 35 40
rowMeans(dune_bio)
## 2 13 4 16 6 1 8 5
## 1.4000000 1.1000000 1.5000000 1.1000000 1.6000000 0.6000000 1.3333333 1.4333333
## 17 15 10 11 9 18 3 20
## 0.5000000 0.7666667 1.4333333 1.0666667 1.4000000 0.9000000 1.3333333 1.0333333
## 14 19 12 7
## 0.8000000 1.0333333 1.1666667 1.3333333
median_sps_sites <- function(x){
cat("The median number of individuals per sp is", apply(x, 2, median),
"and for sites is", apply(x, 1, median))
}
median_sps_sites(dune_bio)
## The median number of individuals per sp is 0 0 0 0 0 0 0 0 2 0 0 0 2 0 3 0 0 2 0 0 0 4 0 0 0 0 1.5 2 0 0 and for sites is 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
dune_bio.relabu <- decostand(dune_bio, method = "total")
rowSums(dune_bio.relabu)
## 2 13 4 16 6 1 8 5 17 15 10 11 9 18 3 20 14 19 12 7
## 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
dune_bio.range <- decostand(dune_bio, method = "range")
dune_bio.stand <- decostand(dune_bio, method = "standardize")
richness <- function(x) {
return(specnumber(decostand(x, method = "total")))
}
Shannon.div <- function(x) {
return(diversity(decostand(x, method = "total"), index = "shannon"))
}
library(vegan)
richness_and_Shannon.div <- function(a) {
x <- specnumber(a)
y <- diversity(a, index = "shannon")
df <- data.frame(x,y)
colnames(df) <- c("Richness", "Shannon")
return(df)
}
richness_and_Shannon.div(dune_bio)
## Richness Shannon
## 2 10 2.252516
## 13 10 2.099638
## 4 13 2.426779
## 16 8 1.959795
## 6 11 2.345946
## 1 5 1.440482
## 8 12 2.434898
## 5 14 2.544421
## 17 7 1.876274
## 15 8 1.979309
## 10 12 2.398613
## 11 9 2.106065
## 9 13 2.493568
## 18 9 2.079387
## 3 10 2.193749
## 20 8 2.048270
## 14 7 1.863680
## 19 9 2.134024
## 12 9 2.114495
## 7 13 2.471733
plot(rad.lognormal(dune_bio[2,]), lty=2, lwd=2)
rad.lognormal(dune_bio[2,])
##
## RAD model: Log-Normal
## Family: poisson
## No. of species: 10
## Total abundance: 33
##
## log.mu log.sigma Deviance AIC BIC
## 0.9982177 0.7106063 1.1324449 34.2076395 34.8128097
plot(radfit(dune_bio[2,]))
radfit(dune_bio[2,])
##
## RAD models, family poisson
## No. of species 10, total abundance 33
##
## par1 par2 par3 Deviance AIC BIC
## Null 3.91659 32.99179 32.99179
## Preemption 0.20971 2.02192 33.09712 33.39970
## Lognormal 0.99822 0.71061 1.13244 34.20764 34.81281
## Zipf 0.27866 -0.79375 0.79058 33.86578 34.47095
## Mandelbrot 0.40348 -0.95679 0.51608 0.75881 35.83401 36.74176
dune_env <- read.table("dune_env.txt", sep="\t", header=T, row.names=1)
dune_env.euc <- vegdist(decostand(dune_env[,c(1,2,5)], method = "standardize"), method = "euclidean")
dune_bio.bray <- vegdist(dune_bio, method="bray")
mantel(dune_env.euc, dune_bio.bray)
##
## Mantel statistic based on Pearson's product-moment correlation
##
## Call:
## mantel(xdis = dune_env.euc, ydis = dune_bio.bray)
##
## Mantel statistic r: 0.5496
## Significance: 0.001
##
## Upper quantiles of permutations (null model):
## 90% 95% 97.5% 99%
## 0.142 0.197 0.219 0.275
## Permutation: free
## Number of permutations: 999
plot(dune_env.euc, dune_bio.bray)
dune_bio <- read.table("dune_bio.txt", sep="\t", header=T, row.names=1)
dune_bio.bray <- vegdist(dune_bio, method="bray")
dune_bio.clust.s <- hclust(dune_bio.bray, method="single")
plot(dune_bio.clust.s)
dune_bio.clust.a <- hclust(dune_bio.bray, method="average")
plot(dune_bio.clust.a)
dune_bio.clust.c <- hclust(dune_bio.bray, method="complete")
plot(dune_bio.clust.c)
dune_bio.k <- cascadeKM(dune_bio, inf.gr=2, sup.gr=6)
plot(dune_bio.k, sortg = T)
library(vegan)
data("varechem")
varechem.pca <- rda(varechem, scale = T)
summary(varechem.pca)
##
## Call:
## rda(X = varechem, scale = T)
##
## Partitioning of correlations:
## Inertia Proportion
## Total 14 1
## Unconstrained 14 1
##
## Eigenvalues, and their contribution to the correlations
##
## Importance of components:
## PC1 PC2 PC3 PC4 PC5 PC6 PC7
## Eigenvalue 5.1916 3.1928 1.6855 1.06899 0.81599 0.70581 0.43641
## Proportion Explained 0.3708 0.2281 0.1204 0.07636 0.05829 0.05042 0.03117
## Cumulative Proportion 0.3708 0.5989 0.7193 0.79564 0.85392 0.90434 0.93551
## PC8 PC9 PC10 PC11 PC12 PC13 PC14
## Eigenvalue 0.36884 0.1707 0.14953 0.08526 0.06986 0.035057 0.023615
## Proportion Explained 0.02635 0.0122 0.01068 0.00609 0.00499 0.002504 0.001687
## Cumulative Proportion 0.96185 0.9740 0.98473 0.99082 0.99581 0.998313 1.000000
##
## Scaling 2 for species and site scores
## * Species are scaled proportional to eigenvalues
## * Sites are unscaled: weighted dispersion equal on all dimensions
## * General scaling constant of scores: 4.236078
##
##
## Species scores
##
## PC1 PC2 PC3 PC4 PC5 PC6
## N 0.2441 0.2516 -0.23818 0.92846 -0.359554 0.30319
## P -0.9181 -0.4194 0.13356 0.09649 0.224909 0.07938
## K -0.9369 -0.3272 -0.06261 0.25051 0.180899 -0.28683
## Ca -0.9280 -0.1711 0.47982 -0.02138 -0.241479 -0.09536
## Mg -0.9087 -0.1978 0.12446 -0.04430 -0.497428 -0.10781
## S -0.8576 -0.6002 -0.31620 -0.01652 0.071629 -0.08779
## Al 0.2980 -0.9720 -0.39009 0.09092 0.005386 -0.19467
## Fe 0.5236 -0.7748 -0.23063 0.37187 -0.023377 -0.32292
## Mn -0.7418 0.3908 0.13114 0.44346 0.501064 0.06776
## Zn -0.8926 -0.3277 0.06142 -0.06121 -0.153677 0.51653
## Mo -0.1072 -0.4625 -0.85720 -0.27258 -0.030063 0.39721
## Baresoil -0.4218 0.6387 -0.40089 -0.07389 -0.416610 -0.31060
## Humdepth -0.6070 0.6825 -0.41674 0.02975 -0.047738 -0.15923
## pH 0.4286 -0.6517 0.66384 0.07599 -0.265374 0.05070
##
##
## Site scores (weighted sums of species scores)
##
## PC1 PC2 PC3 PC4 PC5 PC6
## 18 0.05126 0.84085 -0.190142 -0.40431 0.101518 -1.01997
## 15 0.20452 0.37397 0.002978 -1.06030 1.175734 -0.92257
## 24 -1.53647 -1.25830 -0.012170 -0.86690 -1.854372 1.74375
## 27 -1.25643 0.51345 0.743928 0.63835 1.097881 0.15631
## 23 -0.71203 0.86247 -0.203389 -0.05971 -0.786196 -0.80080
## 19 -0.76179 0.73785 -0.761645 -0.31728 -1.303297 -0.62264
## 22 -0.30340 0.86304 0.188484 0.54758 -0.078064 0.24567
## 16 0.62797 0.76436 -0.124301 -0.26047 0.009044 -0.02807
## 28 -1.13923 0.44909 0.229104 1.61582 0.842871 0.60852
## 13 -0.62483 -0.85407 -1.259755 1.38195 -0.157593 -1.04794
## 14 -0.04206 0.60916 -0.719743 -0.73109 -0.230147 0.25505
## 20 -0.93747 0.20753 -0.689713 0.62198 -0.282188 0.33232
## 25 -0.34451 0.90128 0.710714 0.64127 1.214897 0.48173
## 7 1.50502 -0.22114 -0.495604 0.87068 -0.769266 -0.25733
## 5 1.40889 0.60035 0.911463 0.71133 -0.488111 2.45195
## 6 1.30776 0.03604 -0.243884 -0.87792 0.543259 0.46876
## 3 1.64967 -0.82755 -0.343406 1.40028 -0.546374 -0.19580
## 4 -0.26711 -1.65198 -1.744251 -0.60763 0.947492 0.46203
## 2 0.59653 -1.21610 -0.200030 0.72815 0.799208 -0.76013
## 9 0.03271 -0.69445 -0.350028 -1.35247 0.717140 0.82965
## 12 0.47944 0.26377 0.184677 -1.27744 0.573929 0.07253
## 10 -0.32993 -0.95483 1.670469 -0.51343 0.819138 -0.48682
## 11 -0.09921 -1.52318 2.493913 -0.09044 -1.092296 -1.10654
## 21 0.49069 1.17838 0.202333 -0.73801 -1.254205 -0.85966
PC1 explains about 37% of the variance and PC2 about 23%.
ev <- varechem.pca$CA$eig #extract eigenvalues
barplot(ev/sum(ev), main="Eigenvalues", col="bisque") #screeplot
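The axis percentages used in the plot labels below can be recomputed from the eigenvalues (a quick check):
round(100 * ev / sum(ev), 1)[1:2]   # roughly 37% and 23% for PC1 and PC2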
plot(varechem.pca, display="sites", xlab="PC1 (37%)", ylab="PC2 (23%)")
biplot(varechem.pca, scaling="species", xlab="PC1 (37%)", ylab="PC2 (23%)")
dune_bio <- read.table("dune_bio.txt", header = T, sep = "\t", row.names = 1)
library(vegan)
dune_bio.pca <- rda(dune_bio, scale=T)
summary(dune_bio.pca)
##
## Call:
## rda(X = dune_bio, scale = T)
##
## Partitioning of correlations:
## Inertia Proportion
## Total 30 1
## Unconstrained 30 1
##
## Eigenvalues, and their contribution to the correlations
##
## Importance of components:
## PC1 PC2 PC3 PC4 PC5 PC6 PC7
## Eigenvalue 7.0324 4.9973 3.5548 2.64405 2.1389 1.75781 1.47834
## Proportion Explained 0.2344 0.1666 0.1185 0.08813 0.0713 0.05859 0.04928
## Cumulative Proportion 0.2344 0.4010 0.5195 0.60762 0.6789 0.73751 0.78679
## PC8 PC9 PC10 PC11 PC12 PC13 PC14
## Eigenvalue 1.31640 1.10787 0.80898 0.74527 0.69660 0.57488 0.35796
## Proportion Explained 0.04388 0.03693 0.02697 0.02484 0.02322 0.01916 0.01193
## Cumulative Proportion 0.83067 0.86760 0.89456 0.91941 0.94263 0.96179 0.97372
## PC15 PC16 PC17 PC18 PC19
## Eigenvalue 0.222534 0.219744 0.150712 0.131907 0.063498
## Proportion Explained 0.007418 0.007325 0.005024 0.004397 0.002117
## Cumulative Proportion 0.981138 0.988463 0.993487 0.997883 1.000000
##
## Scaling 2 for species and site scores
## * Species are scaled proportional to eigenvalues
## * Sites are unscaled: weighted dispersion equal on all dimensions
## * General scaling constant of scores: 4.886172
##
##
## Species scores
##
## PC1 PC2 PC3 PC4 PC5 PC6
## Belper -0.47686 -0.254417 -0.065363 -0.467748 -0.09334 0.101603
## Empnig 0.01570 0.611852 -0.490145 0.060573 -0.11700 0.110506
## Junbuf 0.07304 -0.336593 -0.270540 0.572440 0.07798 -0.237750
## Junart 0.56496 -0.108411 0.175409 0.007844 0.15665 0.182690
## Airpra -0.00383 0.666918 -0.461324 0.081841 -0.27604 0.114044
## Elepal 0.69457 0.012911 0.382956 -0.137600 -0.06047 0.101890
## Rumace -0.41931 -0.008718 0.345174 0.590333 0.07281 0.291534
## Viclat -0.26587 0.211444 0.048313 -0.345497 0.50668 -0.416800
## Brarut 0.06077 0.283080 0.237426 0.131228 0.66972 0.189213
## Ranfla 0.72448 -0.015720 0.284143 -0.041979 -0.07417 -0.050091
## Cirarv -0.03939 -0.297900 -0.325356 -0.274435 0.07615 0.494720
## Hyprad -0.04145 0.703508 -0.484633 0.006194 -0.04594 -0.005173
## Leoaut -0.40433 0.462826 -0.228004 -0.156482 0.21035 -0.205059
## Potpal 0.38201 0.079690 0.314566 -0.153780 -0.25028 -0.166589
## Poapra -0.58572 -0.421592 -0.077315 -0.214655 0.24276 -0.040105
## Calcus 0.57940 0.075740 0.373983 -0.164042 -0.18122 0.021965
## Tripra -0.42518 0.091054 0.438682 0.477095 0.08450 0.275019
## Trirep -0.39602 -0.039044 0.190829 -0.032309 -0.10496 -0.381565
## Antodo -0.47855 0.504530 0.042095 0.216161 -0.33567 0.184658
## Salrep 0.27415 0.474527 -0.058351 -0.146213 0.21067 0.140351
## Achmil -0.65615 0.031721 0.288522 -0.071330 -0.38877 -0.021586
## Poatri -0.34924 -0.639286 -0.166348 0.283144 -0.18103 -0.033875
## Chealb 0.12921 -0.237490 -0.227366 0.337179 -0.15836 -0.419720
## Elyrep -0.25958 -0.479231 -0.224739 -0.135805 -0.10774 0.310247
## Sagpro 0.08780 -0.139706 -0.676217 0.120676 0.24558 0.137039
## Plalan -0.61659 0.231282 0.424325 0.220501 0.16319 0.052640
## Agrsto 0.63980 -0.449378 -0.126398 -0.033358 0.03338 0.220887
## Lolper -0.61693 -0.253253 0.089528 -0.251018 0.25974 0.022426
## Alogen 0.23508 -0.523916 -0.341214 0.266377 0.10359 -0.125309
## Brohor -0.53293 -0.236962 0.005037 -0.364163 -0.29092 0.117422
##
##
## Site scores (weighted sums of species scores)
##
## PC1 PC2 PC3 PC4 PC5 PC6
## 2 -1.34700 -0.87580 -0.23831 -1.58267 -1.42302 -6.496e-01
## 13 0.68977 -1.26785 -1.21380 1.80005 -0.84543 -2.241e+00
## 4 -0.21028 -1.59035 -1.73693 -1.46508 0.40655 2.641e+00
## 16 1.92513 -0.27484 0.95166 -0.02451 -0.08830 8.722e-01
## 6 -1.45857 0.54292 1.88699 2.06801 0.89204 1.037e+00
## 1 -0.22304 -0.54162 -0.04175 -0.53653 -0.39474 2.381e-01
## 8 0.78210 -0.60749 -0.09209 -0.10130 0.68298 -6.133e-02
## 5 -1.41354 0.02314 0.90200 0.83760 -0.93543 1.329e+00
## 17 -0.16164 1.36109 -0.40568 0.28323 -1.65437 1.857e-01
## 15 1.46036 0.32505 1.02725 -0.36354 -0.17529 -6.804e-05
## 10 -1.66958 0.08276 0.71009 -1.31231 -0.83698 -1.073e+00
## 11 -0.65855 0.83922 -0.19382 -1.00039 2.53899 -1.737e+00
## 9 0.16230 -1.14986 -0.70997 1.03446 0.50332 1.967e-01
## 18 -0.33407 0.87974 0.28097 -1.00217 2.08747 -6.589e-01
## 3 -0.27213 -1.24009 -0.61014 -0.49098 0.25415 -1.796e-03
## 20 1.97570 0.67211 1.01287 -0.56633 0.52511 9.759e-01
## 14 1.34682 0.26055 1.28433 -0.76651 -1.66391 -1.224e+00
## 19 0.08379 3.26639 -2.61666 0.32337 -0.62462 5.899e-01
## 12 0.52762 -0.66633 -1.04159 1.84314 0.80106 -5.501e-01
## 7 -1.20519 -0.03873 0.84458 1.02246 -0.04956 1.315e-01
The variation explained by the first two axes: PC1 23%, PC2 17%
ev <- dune_bio.pca$CA$eig
barplot(ev/sum(ev), main="Eigenvalues", col="bisque")
plot(dune_bio.pca, display="sites", xlab="PC1 (23%)", ylab="PC2 (17%)")
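The same check works for the dune data (a small sketch reusing the ev vector extracted above):
#percent variance explained by the first two axes
round(100 * ev / sum(ev), 1)[1:2] #roughly 23.4 (PC1) and 16.7 (PC2)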
dune_bio.bray <- vegdist(dune_bio, method = "bray")
dune_bio.nmds <- metaMDS(dune_bio.bray, k = 2, try = 100)
## Run 0 stress 0.1192678
## Run 1 stress 0.1183186
## ... New best solution
## ... Procrustes: rmse 0.02026985 max resid 0.06496025
## Run 2 stress 0.1183186
## ... Procrustes: rmse 2.456055e-06 max resid 7.036059e-06
## ... Similar to previous best
## Run 3 stress 0.1192678
## Run 4 stress 0.1192678
## Run 5 stress 0.1192678
## Run 6 stress 0.1183186
## ... New best solution
## ... Procrustes: rmse 1.075849e-06 max resid 2.256472e-06
## ... Similar to previous best
## Run 7 stress 0.1808911
## Run 8 stress 0.1183186
## ... Procrustes: rmse 1.513685e-05 max resid 5.038976e-05
## ... Similar to previous best
## Run 9 stress 0.1192678
## Run 10 stress 0.1192678
## Run 11 stress 0.1183186
## ... Procrustes: rmse 8.018497e-06 max resid 2.57491e-05
## ... Similar to previous best
## Run 12 stress 0.1192678
## Run 13 stress 0.1192678
## Run 14 stress 0.1192678
## Run 15 stress 0.1192679
## Run 16 stress 0.1192679
## Run 17 stress 0.1183186
## ... Procrustes: rmse 2.619235e-06 max resid 8.600287e-06
## ... Similar to previous best
## Run 18 stress 0.1183186
## ... Procrustes: rmse 1.958038e-05 max resid 6.401134e-05
## ... Similar to previous best
## Run 19 stress 0.1183186
## ... Procrustes: rmse 6.645989e-06 max resid 1.967145e-05
## ... Similar to previous best
## Run 20 stress 0.1183186
## ... Procrustes: rmse 4.818874e-06 max resid 1.664623e-05
## ... Similar to previous best
## *** Best solution repeated 7 times
dune_bio.nmds$stress
## [1] 0.1183186
stressplot(dune_bio.nmds)
plot(dune_bio.nmds, type = "t", display = "sites")
dune_env <- read.table("dune_env.txt", header = T, sep = "\t", row.names = 1)
row.names(dune_bio) == row.names(dune_env)
## [1] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [16] TRUE TRUE TRUE TRUE TRUE
dune_env$Axis01 = dune_bio.nmds$points[,1]
dune_env$Axis02 = dune_bio.nmds$points[,2]
dune_env$Richness <- specnumber(dune_bio)
ggplot(data = dune_env, aes(Axis01, Axis02)) +
geom_point(aes(color = Management, size = Richness)) +
labs(title="Dune Biology NMDS ordination (Stress = 0.1183)")
dune_bio <- read.table("dune_bio.txt", header = T, sep = "\t", row.names = 1)
dune_env <- read.table("dune_env.txt", header = T, sep = "\t", row.names = 1)
dune_bio.cca <- cca(dune_bio ~ A1 + Moisture + Manure, data = dune_env)
summary(dune_bio.cca)
##
## Call:
## cca(formula = dune_bio ~ A1 + Moisture + Manure, data = dune_env)
##
## Partitioning of scaled Chi-square:
## Inertia Proportion
## Total 2.1153 1.0000
## Constrained 0.7692 0.3637
## Unconstrained 1.3460 0.6363
##
## Eigenvalues, and their contribution to the scaled Chi-square
##
## Importance of components:
## CCA1 CCA2 CCA3 CA1 CA2 CA3 CA4
## Eigenvalue 0.4264 0.2347 0.10812 0.3124 0.2101 0.17518 0.13748
## Proportion Explained 0.2016 0.1110 0.05112 0.1477 0.0993 0.08282 0.06499
## Cumulative Proportion 0.2016 0.3125 0.36366 0.5114 0.6107 0.69348 0.75847
## CA5 CA6 CA7 CA8 CA9 CA10 CA11
## Eigenvalue 0.09442 0.09295 0.07380 0.06360 0.05541 0.04663 0.021117
## Proportion Explained 0.04464 0.04394 0.03489 0.03007 0.02620 0.02204 0.009983
## Cumulative Proportion 0.80311 0.84705 0.88194 0.91201 0.93820 0.96025 0.970230
## CA12 CA13 CA14 CA15 CA16
## Eigenvalue 0.019730 0.016183 0.012531 0.009960 0.004566
## Proportion Explained 0.009327 0.007651 0.005924 0.004709 0.002159
## Cumulative Proportion 0.979558 0.987208 0.993132 0.997841 1.000000
##
## Accumulated constrained eigenvalues
## Importance of components:
## CCA1 CCA2 CCA3
## Eigenvalue 0.4264 0.2347 0.1081
## Proportion Explained 0.5543 0.3051 0.1406
## Cumulative Proportion 0.5543 0.8594 1.0000
##
## Scaling 2 for species and site scores
## * Species are scaled proportional to eigenvalues
## * Sites are unscaled: weighted dispersion equal on all dimensions
##
##
## Species scores
##
## CCA1 CCA2 CCA3 CA1 CA2 CA3
## Belper -0.72508 0.09015 0.28678 -0.14997 -0.08941 0.64602
## Empnig 0.92386 -1.49359 -1.29239 2.87128 1.31651 0.74936
## Junbuf 0.50547 0.17496 -0.35707 0.33578 -1.58298 -0.40514
## Junart 1.08445 -0.22093 -0.33048 -0.84568 0.15861 -0.23372
## Airpra 0.32923 -1.52131 -0.75777 2.80560 1.49278 0.23227
## Elepal 1.39255 0.01967 0.39847 -0.90296 0.89564 -0.18234
## Rumace -0.56966 0.07756 0.44317 0.23147 -0.33959 -1.25112
## Viclat -0.95631 -1.05874 0.13224 -0.42277 -0.16218 0.84521
## Brarut 0.05861 -0.29093 0.14295 -0.17325 0.10170 -0.08514
## Ranfla 1.30695 -0.17793 0.06749 -0.84091 0.45935 -0.18497
## Cirarv -0.40316 1.49565 -0.09657 0.17279 0.57950 1.51909
## Hyprad 0.14257 -1.37899 -0.68907 2.13762 1.11990 0.54595
## Leoaut -0.10805 -0.39424 0.03343 0.21126 0.05153 0.18568
## Potpal 1.82080 -0.59463 2.50587 -0.53778 0.46936 0.51885
## Poapra -0.47298 0.19818 -0.15806 -0.08959 -0.09919 0.21588
## Calcus 1.32589 -0.43845 0.22643 -1.29121 0.99188 -0.18948
## Tripra -0.94282 0.14003 0.52488 0.16097 0.18159 -1.70499
## Trirep -0.03691 -0.24820 0.22610 -0.03138 -0.23464 0.04178
## Antodo -0.42848 -0.66783 0.01590 1.13407 0.58011 -0.56583
## Salrep 0.38937 -1.51269 -0.78060 -0.47447 0.69625 0.26139
## Achmil -0.84528 -0.28199 0.08034 0.23663 0.15351 -0.33457
## Poatri -0.09591 0.48672 -0.08195 0.10052 -0.39496 -0.08346
## Chealb 1.33134 1.08879 -0.17910 0.92504 -1.53277 -0.26877
## Elyrep -0.45995 0.49110 -0.07018 -0.09270 -0.32202 0.53794
## Sagpro 0.36673 0.22891 -0.41200 0.63169 -0.40176 0.48274
## Plalan -0.89463 -0.37036 0.37142 0.16434 0.15598 -0.65298
## Agrsto 0.80663 0.42851 0.03513 -0.31737 0.09257 0.16535
## Lolper -0.68266 0.25866 -0.13366 -0.17559 0.06653 0.20307
## Alogen 0.52884 0.74865 -0.28721 0.11872 -0.57814 0.07754
## Brohor -0.77674 0.11771 0.03195 -0.07993 -0.06235 0.29678
##
##
## Site scores (weighted averages of species scores)
##
## CCA1 CCA2 CCA3 CA1 CA2 CA3
## 2 -0.8544 0.57195 -0.04460 -0.44645 -0.473383 1.10419
## 13 0.7656 1.43234 -1.04660 0.92504 -1.532773 -0.26877
## 4 -0.1565 1.36916 -0.67602 0.17279 0.579503 1.51909
## 16 2.0459 0.46834 0.70504 -0.99503 1.669766 -0.75493
## 6 -1.0571 -0.29557 1.50936 0.06506 0.216688 -2.09457
## 1 -1.2439 1.24492 -0.99278 -0.48730 0.708935 1.13256
## 8 0.8113 0.66762 -0.54772 -0.28201 0.298269 -0.31177
## 5 -1.1247 -0.04619 1.17587 0.61920 0.008872 -0.95018
## 17 -0.7722 -2.94478 -1.24411 2.70708 1.757175 -0.54337
## 15 2.0065 -0.48090 2.87633 -0.26585 0.401900 0.57133
## 10 -1.0958 -0.42412 0.49762 -0.26374 -0.332785 0.08742
## 11 -0.7816 -0.90608 -0.28150 -0.26599 -0.008900 1.12676
## 9 0.1817 0.86595 -0.91242 -0.34851 -2.059004 -0.10377
## 18 -0.6053 -1.51952 0.07324 -0.89534 -0.298138 1.03991
## 3 -0.2093 1.35237 -0.66038 0.06196 0.171454 0.84659
## 20 1.8996 -1.40266 -0.55715 -2.22940 0.920734 -0.49851
## 14 1.9462 -0.67177 3.54931 -0.80971 0.536813 0.46637
## 19 0.2690 -3.39538 -3.20303 2.87128 1.316513 0.74936
## 12 0.6252 1.06204 -0.88731 0.77476 -2.069361 -0.26842
## 7 -1.0498 -0.01449 0.64118 -0.05748 0.266550 -1.48584
##
##
## Site constraints (linear combinations of constraining variables)
##
## CCA1 CCA2 CCA3 CA1 CA2 CA3
## 2 -1.0722 -0.15065 0.02248 -0.44645 -0.473383 1.10419
## 13 1.3313 1.08879 -0.17910 0.92504 -1.532773 -0.26877
## 4 -0.4032 1.49565 -0.09657 0.17279 0.579503 1.51909
## 16 1.2912 1.04854 -0.34917 -0.99503 1.669766 -0.75493
## 6 -0.9651 -0.04331 0.47601 0.06506 0.216688 -2.09457
## 1 -1.0995 1.27129 -0.50141 -0.48730 0.708935 1.13256
## 8 1.0904 0.84728 -1.19953 -0.28201 0.298269 -0.31177
## 5 -0.6973 0.22504 1.60982 0.61920 0.008872 -0.95018
## 17 -0.5627 -1.56290 0.04417 2.70708 1.757175 -0.54337
## 15 1.9681 -0.44704 3.12947 -0.26585 0.401900 0.57133
## 10 -0.6232 -0.89889 -0.41620 -0.26374 -0.332785 0.08742
## 11 -1.1054 -0.90857 0.08601 -0.26599 -0.008900 1.12676
## 9 0.4481 -0.77218 -0.96709 -0.34851 -2.059004 -0.10377
## 18 -0.9913 -1.51891 0.77314 -0.89534 -0.298138 1.03991
## 3 -0.3898 1.50907 -0.03988 0.06196 0.171454 0.84659
## 20 0.8971 -1.52042 -1.40577 -2.22940 0.920734 -0.49851
## 14 1.6735 -0.74222 1.88228 -0.80971 0.536813 0.46637
## 19 0.9239 -1.49359 -1.29239 2.87128 1.316513 0.74936
## 12 0.7625 0.26751 0.15988 0.77476 -2.069361 -0.26842
## 7 -1.1327 0.51336 -0.43788 -0.05748 0.266550 -1.48584
##
##
## Biplot scores for constraining variables
##
## CCA1 CCA2 CCA3 CA1 CA2 CA3
## A1 0.6048 0.04018 0.7954 0 0 0
## Moisture 0.9746 -0.06076 -0.2158 0 0 0
## Manure -0.2059 0.96205 -0.1791 0 0 0
What is the variation explained by the first two constrained axes? CCA1 20%, CCA2 11%
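These percentages can also be pulled straight from the fitted cca object rather than read off the summary (a minimal sketch; it assumes vegan's standard object layout, with the constrained eigenvalues in dune_bio.cca$CCA$eig and the total inertia in dune_bio.cca$tot.chi):
#proportion of total inertia explained by each constrained axis
round(100 * dune_bio.cca$CCA$eig / dune_bio.cca$tot.chi, 1) #CCA1 ~20.2, CCA2 ~11.1, CCA3 ~5.1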
What is the adjusted R2 of the model? About 0.25
RsquareAdj(dune_bio.cca)$adj.r.squared
## [1] 0.2454982
anova(dune_bio.cca)
## Permutation test for cca under reduced model
## Permutation: free
## Number of permutations: 999
##
## Model: cca(formula = dune_bio ~ A1 + Moisture + Manure, data = dune_env)
## Df ChiSquare F Pr(>F)
## Model 3 0.76925 3.048 0.001 ***
## Residual 16 1.34602
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
anova(dune_bio.cca, by="margin")
## Permutation test for cca under reduced model
## Marginal effects of terms
## Permutation: free
## Number of permutations: 999
##
## Model: cca(formula = dune_bio ~ A1 + Moisture + Manure, data = dune_env)
## Df ChiSquare F Pr(>F)
## A1 1 0.13047 1.5508 0.100 .
## Moisture 1 0.30886 3.6714 0.001 ***
## Manure 1 0.23418 2.7837 0.008 **
## Residual 16 1.34602
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
plot(dune_bio.cca, scaling = "sites", xlab = "CCA1", ylab = "CCA2")
dune_bio.bray <- vegdist(dune_bio, method="bray")
dune_bio.nmds <- metaMDS(dune_bio.bray, k = 2, try = 100)
## Run 0 stress 0.1192678
## Run 1 stress 0.1192678
## ... Procrustes: rmse 5.327019e-05 max resid 0.0001636741
## ... Similar to previous best
## Run 2 stress 0.1183186
## ... New best solution
## ... Procrustes: rmse 0.02026993 max resid 0.0649606
## Run 3 stress 0.1192678
## Run 4 stress 0.1183186
## ... New best solution
## ... Procrustes: rmse 8.567327e-06 max resid 2.250542e-05
## ... Similar to previous best
## Run 5 stress 0.2796059
## Run 6 stress 0.2192849
## Run 7 stress 0.1183186
## ... New best solution
## ... Procrustes: rmse 1.03919e-05 max resid 2.721875e-05
## ... Similar to previous best
## Run 8 stress 0.1183186
## ... Procrustes: rmse 7.87007e-06 max resid 2.240767e-05
## ... Similar to previous best
## Run 9 stress 0.2035424
## Run 10 stress 0.1183186
## ... Procrustes: rmse 2.517473e-05 max resid 7.579375e-05
## ... Similar to previous best
## Run 11 stress 0.1192678
## Run 12 stress 0.1192678
## Run 13 stress 0.1192678
## Run 14 stress 0.1183186
## ... New best solution
## ... Procrustes: rmse 4.176808e-06 max resid 1.291564e-05
## ... Similar to previous best
## Run 15 stress 0.1809578
## Run 16 stress 0.2045511
## Run 17 stress 0.1183186
## ... Procrustes: rmse 1.981443e-06 max resid 4.226217e-06
## ... Similar to previous best
## Run 18 stress 0.1192678
## Run 19 stress 0.1192678
## Run 20 stress 0.1192679
## *** Best solution repeated 2 times
rownames(dune_bio) = rownames(dune_env)
# perform permanova
adonis2(dune_bio.bray ~ A1 + Moisture + Manure, data = dune_env, permutations = 9999)
## Permutation test for adonis under reduced model
## Terms added sequentially (first to last)
## Permutation: free
## Number of permutations: 9999
##
## adonis2(formula = dune_bio.bray ~ A1 + Moisture + Manure, data = dune_env, permutations = 9999)
## Df SumOfSqs R2 F Pr(>F)
## A1 1 0.7230 0.16817 5.8146 2e-04 ***
## Moisture 1 0.8671 0.20169 6.9736 1e-04 ***
## Manure 1 0.7197 0.16741 5.7885 2e-04 ***
## Residual 16 1.9893 0.46274
## Total 19 4.2990 1.00000
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
All three variables are significant: A1, Moisture, and Manure.
A1: R2 = 0.17; Moisture: R2 = 0.20; Manure: R2 = 0.17
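The same R2 values can be extracted programmatically instead of read from the printed table (a short sketch; the object name dune_bio.perm is introduced only for this example, and the R2 column comes from the data frame that adonis2 returns):
dune_bio.perm <- adonis2(dune_bio.bray ~ A1 + Moisture + Manure, data = dune_env, permutations = 9999)
round(dune_bio.perm$R2[1:3], 2) #A1, Moisture, Manure: 0.17, 0.20, 0.17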
adonis2(dune_bio.bray ~ A1, strata = dune_env$Use, data = dune_env, permutations = 9999)
## Permutation test for adonis under reduced model
## Terms added sequentially (first to last)
## Blocks: strata
## Permutation: free
## Number of permutations: 9999
##
## adonis2(formula = dune_bio.bray ~ A1, data = dune_env, permutations = 9999, strata = dune_env$Use)
## Df SumOfSqs R2 F Pr(>F)
## A1 1 0.7230 0.16817 3.6389 0.0044 **
## Residual 18 3.5761 0.83183
## Total 19 4.2990 1.00000
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
dune_env$Axis01 = dune_bio.nmds$points[,1]
dune_env$Axis02 = dune_bio.nmds$points[,2]
ggplot(data = dune_env, aes(Axis01, Axis02)) +
geom_point(aes(color = Use, size = A1))