Disclaimer
This document is for educational purposes only; it was prepared to fulfill the requirements of a university statistics course.
GitHub
Individual scripts and datasets can be found on GitHub: https://github.com/hleary/statsandviz
# Install and load the ISwR R Package.
install.packages("ISwR", repos = "http://cran.us.r-project.org")## Installing package into 'C:/Users/hlear/AppData/Local/R/win-library/4.2'
## (as 'lib' is unspecified)
## package 'ISwR' successfully unpacked and MD5 sums checked
##
## The downloaded binary packages are in
## C:\Users\hlear\AppData\Local\Temp\RtmpuSVT9V\downloaded_packages
library("ISwR")
# Write the built-in dataset thuesen to a tab-separated text file.
data(package = "ISwR") # Run to see the data sets within the ISwR package.
thuesen_df <- as.data.frame(thuesen) # Convert the thuesen dataset (from the ISwR package) to a data frame
head(thuesen_df) # View the first few rows of the dataset thuesen## blood.glucose short.velocity
## 1 15.3 1.76
## 2 10.8 1.34
## 3 8.1 1.27
## 4 19.5 1.47
## 5 7.2 1.27
## 6 5.3 1.49
write.table(thuesen_df, "C:\\Users\\hlear\\Desktop\\statsandviz\\R01_r_language\\data\\thuesen_tab.txt", row.names=FALSE) # Don't export row names.
# I viewed thuesen_tab in Notepad and changed "NA" to "." (period)
# Read the changed file back into R.
thuesen_tab <- read.table(file = "C:\\Users\\hlear\\Desktop\\statsandviz\\R01_r_language\\data\\thuesen_tab.txt", header = TRUE) # Has a header
head(thuesen_tab)## blood.glucose short.velocity
## 1 15.3 1.76
## 2 10.8 1.34
## 3 8.1 1.27
## 4 19.5 1.47
## 5 7.2 1.27
## 6 5.3 1.49
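The same NA handling can also be done without hand-editing the file in Notepad; a minimal sketch (same file path as above) using the na argument of write.table() and na.strings of read.table():
# Write missing values as "." directly, then tell read.table to treat "." as NA when reading back.
write.table(thuesen_df, "C:\\Users\\hlear\\Desktop\\statsandviz\\R01_r_language\\data\\thuesen_tab.txt", row.names = FALSE, na = ".")
thuesen_tab <- read.table("C:\\Users\\hlear\\Desktop\\statsandviz\\R01_r_language\\data\\thuesen_tab.txt", header = TRUE, na.strings = ".")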
# Write an exponential growth function.
Ni <- 10
t <- c(1:20)
Nt <- function(r){
Nt <- Ni * exp(r*t)
plot(t, Nt)
}
# Test under three growth rate scenarios.
Nt(0.5)
Nt(0.8)
Nt(-0.1)
# Logistic growth function.
Ni <- 10
t <- c(1:20)
K <- 1000
log_growth <- function (r) {
Nt <- K*Ni/(Ni+(K-Ni) * exp(1)^(-r*t))
plot(t, Nt, xlab="Time", ylab="Population size")
}
# Test under three growth rate scenarios
log_growth(0.5)
log_growth(0.8)
log_growth(0.4)
# Write a function that computes the sum of integers from 1 to n (inclusive).
sum_n <- function(n){sum(1:n)}
# Quick test with n = 5.
1+2+3+4+5 # Run this to see what the sum of integers from 1 to 5 equals.## [1] 15
sum_n(5) # Now test the function with n = 5.## [1] 15
# Use the function to determine the sum of integers from 1 to 5,000.
sum_n(5000)## [1] 12502500
sqrt_round <- function(x) {
result <- round(sqrt(x))
return(result)
}
# Test
sqrt(17)## [1] 4.123106
sqrt_round(17)## [1] 4
# Install the R package ‘nycflights13’, and load the ‘weather’ data.
install.packages("nycflights13", repos = "http://cran.us.r-project.org")## Installing package into 'C:/Users/hlear/AppData/Local/R/win-library/4.2'
## (as 'lib' is unspecified)
## package 'nycflights13' successfully unpacked and MD5 sums checked
##
## The downloaded binary packages are in
## C:\Users\hlear\AppData\Local\Temp\RtmpuSVT9V\downloaded_packages
data(package = "nycflights13") # Run to see data sets within the nycflights13 package. Weather is in there.
library(nycflights13)
# a) Explore the column names and top part of the dataset to get a sense of the data.
colnames(weather) # View column names only.## [1] "origin" "year" "month" "day" "hour"
## [6] "temp" "dewp" "humid" "wind_dir" "wind_speed"
## [11] "wind_gust" "precip" "pressure" "visib" "time_hour"
head(weather) # View top part of the data.## # A tibble: 6 × 15
## origin year month day hour temp dewp humid wind_dir wind_speed wind_gust
## <chr> <int> <int> <int> <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 EWR 2013 1 1 1 39.0 26.1 59.4 270 10.4 NA
## 2 EWR 2013 1 1 2 39.0 27.0 61.6 250 8.06 NA
## 3 EWR 2013 1 1 3 39.0 28.0 64.4 240 11.5 NA
## 4 EWR 2013 1 1 4 39.9 28.0 62.2 250 12.7 NA
## 5 EWR 2013 1 1 5 39.0 28.0 64.4 260 12.7 NA
## 6 EWR 2013 1 1 6 37.9 28.0 67.2 240 11.5 NA
## # ℹ 4 more variables: precip <dbl>, pressure <dbl>, visib <dbl>,
## # time_hour <dttm>
# b) Make a subset of the data from just the first month (1) and then save that subset as ‘weather1’.
weather # View the "weather" data set.## # A tibble: 26,115 × 15
## origin year month day hour temp dewp humid wind_dir wind_speed
## <chr> <int> <int> <int> <int> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 EWR 2013 1 1 1 39.0 26.1 59.4 270 10.4
## 2 EWR 2013 1 1 2 39.0 27.0 61.6 250 8.06
## 3 EWR 2013 1 1 3 39.0 28.0 64.4 240 11.5
## 4 EWR 2013 1 1 4 39.9 28.0 62.2 250 12.7
## 5 EWR 2013 1 1 5 39.0 28.0 64.4 260 12.7
## 6 EWR 2013 1 1 6 37.9 28.0 67.2 240 11.5
## 7 EWR 2013 1 1 7 39.0 28.0 64.4 240 15.0
## 8 EWR 2013 1 1 8 39.9 28.0 62.2 250 10.4
## 9 EWR 2013 1 1 9 39.9 28.0 62.2 260 15.0
## 10 EWR 2013 1 1 10 41 28.0 59.6 260 13.8
## # ℹ 26,105 more rows
## # ℹ 5 more variables: wind_gust <dbl>, precip <dbl>, pressure <dbl>,
## # visib <dbl>, time_hour <dttm>
weather1 <- subset(weather, weather$month ==1)
# Test
# The subset is too large to print in full here, so I confirmed it outside of R.
# A quicker programmatic check is sketched below.
# weather1
# write.table(weather1, "C:\\Users\\hlear\\Desktop\\statsandviz\\r_language\\data\\weather1.tab", row.names=FALSE)
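A quicker check can be done in R itself; a small sketch using base functions only:
dim(weather1)          # rows and columns in the subset
unique(weather1$month) # should return only 1 if the subset kept just January
range(weather1$day)    # days should fall within 1 to 31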
# c) Using ggplot2, make a beautiful histogram of the variable ‘temp’
library(tidyverse)## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.1 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.4.2 ✔ tibble 3.2.1
## ✔ lubridate 1.9.2 ✔ tidyr 1.3.0
## ✔ purrr 1.0.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (http://conflicted.r-lib.org/) to force all conflicts to become errors
ggplot(data = weather, aes(x = temp)) + geom_histogram() # Not beautiful...## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 1 rows containing non-finite values (`stat_bin()`).
# Make pretty
ggplot(data = weather, aes(x = temp)) + geom_histogram(col = "white",
fill = "red") +
labs(title = "Temperatures during NYC Flight Departures (2013)",
x = "Temperature (F)",
y = "Count")## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 1 rows containing non-finite values (`stat_bin()`).
# d) Using ggplot2, make a beautiful line plot of ‘temp’ as a function of ‘time_hour’
ggplot(weather, aes(x = time_hour, y = temp)) + geom_line(color = "steelblue") +
labs(title = "NYC Departure Temperatures Over Time (2013)",
x = "Time (hrs)",
y = "Temp (F)")# e) Using ggplot2, make a beautiful boxplot of ‘temp’ as a function of ‘origin’
ggplot(weather1, aes(x = origin, y = temp)) +
geom_boxplot() +
labs(title = "Temperature by Origin (2013)",
x = "Origin",
y = "Temperature (F)")
# Load libraries
library(tidyverse)
library(ggplot2)
# Put rates into a vector
und_rates <- c(0.9, 1.4, 1.2, 1.2, 1.3, 2.0, 1.4, 1.6)
# Draw histogram
hist(und_rates, xlab = "Hertz", ylab = "Frequency", main = "Undulation Rates of Tree Snakes")
# Calculate sample mean
mean(und_rates) ## [1] 1.375
# Calculate range
range(und_rates) ## [1] 0.9 2.0
# Calculate standard deviation
sd(und_rates) ## [1] 0.324037
# Write a function to express the standard deviation
# as a percentage of the mean (that is, the coefficient of variation (CV)) and calculate it.
coef_var <- function(x) {
sd(x) / mean(x) * 100
}
coef_var(und_rates) ## [1] 23.56633
# Assign blood pressure measurements (bp) to a vector
bp <- c(112, 128, 108, 129, 125, 153, 155, 132, 137)
# a) How many individuals are in the sample?
length(bp)## [1] 9
# b) What is the mean of this sample?
mean(bp)## [1] 131
# c) What is the variance?
var(bp)## [1] 254.5
# d) What is the standard deviation?
sd(bp)## [1] 15.95306
# e) What is the coefficient of variation?
coef_var <- function(x) {
sd(x) / mean(x) * 100
}
coef_var(bp)## [1] 12.17791
# Read in the Desert Bird Abundance csv file.
birdset <- read.csv(file = "C:\\Users\\hlear\\Desktop\\statsandviz\\R02_fundamentals_statistics\\data\\DesertBirdAbundance.csv", header = TRUE)
# Explore
head(birdset)## species abundance
## 1 Black Vulture 64
## 2 Turkey Vulture 23
## 3 Harris's Hawk 3
## 4 Red-tailed Hawk 16
## 5 American Kestrel 7
## 6 Gambel's Quail 148
class(birdset)## [1] "data.frame"
# a) Histogram using ggplot2
ggplot(birdset, aes(x=abundance))+
geom_histogram(fill = "steelblue") +
labs(title = "Histogram of Desert Bird Abundance", x = "Abundance", y = "Frequency")## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# b) Median and Mean
mean(birdset$abundance)## [1] 74.76744
median(birdset$abundance)## [1] 18
# c) Best measure of center in this case?
# The median. The abundance distribution is strongly right-skewed, so the mean (74.8) is
# inflated by a few very abundant species, while the median (18) better represents a typical species.
# d) Range, SD, variance, and CV
range(birdset$abundance)## [1] 1 625
sd(birdset$abundance)## [1] 121.9473
var(birdset$abundance)## [1] 14871.14
coef_var <- function(x) {
sd(x) / mean(x) * 100
}
coef_var(birdset$abundance)## [1] 163.1021
# a) A standard normally distributed variable is larger than 3
pnorm(3, lower.tail = FALSE)## [1] 0.001349898
# b) A normally distributed variable with mean 35 and standard deviation 6 is larger than 42
pnorm(42, mean = 35, sd = 6, lower.tail = FALSE)## [1] 0.1216725
# c) Getting 10 out of 10 successes in a binomial distribution with probability 0.8
dbinom(10, size = 10, prob = 0.8) # dbinom gives the exact point probability P(X = 10); pbinom would give a cumulative probability.## [1] 0.1073742
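As a cross-check (a sketch, not part of the original answer), the same probability can be obtained from the upper tail of the cumulative distribution, since P(X = 10) = P(X > 9) for a Binomial(10, 0.8):
pbinom(9, size = 10, prob = 0.8, lower.tail = FALSE) # also 0.1073742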
# d) X > 6.5 in a Chi-squared distribution with 2 degrees of freedom
pchisq(6.5, df = 2, lower.tail = FALSE)## [1] 0.03877421
# Create a vector to store the mean of each sample
means <- numeric(1000)
# Loop to generate 1000 samples of size 5 from the Binomial distribution
for (i in 1:1000) {
sample <- rbinom(5, size= 10, prob= 0.9) # generate sample of size 5
means[i] <- mean(sample) # calculate mean of the sample
}
# Plot the histogram of the sample means
hist(means, main = "Histogram of Sample Means", xlab = "Sample Mean")
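As a quick sanity check (a sketch; values will vary between runs because no random seed is set), the mean and spread of the simulated sample means can be compared with their theoretical values:
mean(means) # should be close to n*p = 10 * 0.9 = 9
sd(means)   # should be close to sqrt(10 * 0.9 * 0.1 / 5), about 0.42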
hist(runif(1000,1,3)) # one gene
hist(runif(1000,1,3) + runif(1000,1,3)) # two genes
hist(runif(1000,1,3) + runif(1000,1,3) + runif(1000,1,3) + runif(1000,1,3) + runif(1000,1,3)) # five genes
# As we increase the number of genes, what is the resulting height distribution?
# As we increase the number of additive genes, the height distribution becomes increasingly
# bell-shaped and concentrated around its mean relative to its possible range, approaching a
# normal distribution, as expected from the central limit theorem.
hist(runif(1000,1,3)) # one gene
hist(runif(1000,1,3) * runif(1000,1,3)) # two genes
hist(runif(1000,1,3) * runif(1000,1,3) * runif(1000,1,3) * runif(1000,1,3) * runif(1000,1,3)) # five genes
# As we increase the number of genes, what is the resulting height distribution?
# With multiplicative gene action, the resulting height distribution becomes increasingly
# right-skewed (approaching a log-normal shape) as the number of genes increases.
# Put body temp values into a vector
temp <- c(98.4, 98.6, 97.8, 98.8, 97.9, 99.0, 98.2, 98.8, 98.8, 99.0, 98.0, 99.2, 99.5, 99.4, 98.4, 99.1, 98.4, 97.6, 97.4, 97.5, 97.5, 98.8, 98.6, 100.0, 98.4)
# a) Histogram
hist(temp)
# b) Normal Quantile Plot
qqnorm(temp)
qqline(temp)
# c) Shapiro-Wilk Test
shapiro.test(temp)##
## Shapiro-Wilk normality test
##
## data: temp
## W = 0.97216, p-value = 0.7001
# d) Are the data normally distributed?
# Yes. Points follow the QQ line closely, indicating the data follows a normal distribution.
# Also, the Shapiro-Wilk test p-value > 0.05, which means we fail to reject the null.
# e) Are these measurements consistent with a population mean of 98.6 F?
t.test(temp, mu=98.6)##
## One Sample t-test
##
## data: temp
## t = -0.56065, df = 24, p-value = 0.5802
## alternative hypothesis: true mean is not equal to 98.6
## 95 percent confidence interval:
## 98.24422 98.80378
## sample estimates:
## mean of x
## 98.524
# Yes, the measurements are consistent with a mean of 98.6 F.
# The p-value of 0.5802 is > 0.05, and 98.6 falls within the confidence intervals of the mean.
# We fail to reject the null.
# Define the number of spiders and the number choosing the dead cricket
n <- 41
dead <- 31
# Binomial test
# Analyzing proportions with only 2 outcomes
binom.test(dead, n, p=0.5)##
## Exact binomial test
##
## data: dead and n
## number of successes = 31, number of trials = 41, p-value = 0.00145
## alternative hypothesis: true probability of success is not equal to 0.5
## 95 percent confidence interval:
## 0.5969538 0.8763675
## sample estimates:
## probability of success
## 0.7560976
# Print the p-value
binom.test(dead, n, p=0.5)$p.value## [1] 0.001450491
# Does this represent evidence for a diet preference?
# Yes. The p-value is < 0.05, which means there is enough evidence to reject the null.
# Put values into vectors
placebo = c(37, 52, 68, 4, 29, 32, 19, 52, 19, 12)
drug = c(5, 23, 40, 3, 38, 19, 9, 24, 17, 14)
# a) Create boxplot
boxplot(placebo, drug, names=c("Placebo", "Drug"), main="Boxplot of Number of Seizures")
# b) Test the difference
# Paired t-test: Both treatments are applied to every sampled unit.
t.test(placebo, drug, paired=TRUE)##
## Paired t-test
##
## data: placebo and drug
## t = 2.7661, df = 9, p-value = 0.02189
## alternative hypothesis: true mean difference is not equal to 0
## 95 percent confidence interval:
## 2.404666 23.995334
## sample estimates:
## mean difference
## 13.2
# View p-value:
t.test(placebo, drug, paired=TRUE)$p.value## [1] 0.02189433
# The p-value is < 0.05, which indicates a significant difference.
# Put the data into a vector
data <- c("Oak"=33, "Hickory"=30, "Maple"=29, "Red Cedar"=4, "Poplar"=4)
# Barplot
barplot(data)
# Test hypothesis of no-association
# Chi-squared goodness-of-fit test:
# Measures the discrepancy between an observed frequency distribution and the frequencies expected under a random model
chisq.test(data)##
## Chi-squared test for given probabilities
##
## data: data
## X-squared = 43.1, df = 4, p-value = 9.865e-09
# Show p-value
chisq.test(data)$p.value## [1] 9.865077e-09
# The p-value is < 0.05, so we reject the null hypothesis of no association: the colonies are not
# distributed evenly among tree species, which is evidence of a habitat preference.
# Perform 10 one-sample t-tests for mu=0 on simulated standard normally distributed data of 25 observations each and get the P-value.
t.test(rnorm(25), mu=0)$p.value## [1] 0.9178738
replicate(10, t.test(rnorm(25), mu=0)$p.value) # replicates x10## [1] 0.9120613 0.5640290 0.7336884 0.1990770 0.9002373 0.4098705 0.4168711
## [8] 0.6149978 0.8976445 0.5915363
# Repeat the experiment, but instead simulate samples of 25 observations from a t-distribution with 2 degrees of freedom.
t.test(rt(25, df=2), mu=0)$p.value## [1] 0.7085971
replicate(10, t.test(rt(25, df=2), mu=0)$p.value) # replicates x10## [1] 0.4414909 0.4472713 0.7022276 0.4144562 0.3690365 0.2569152 0.4390642
## [8] 0.6624605 0.8156426 0.5470053
# Automate x1000 and apply FDR correction
experiment.pval.fdr<-p.adjust(replicate(1000, t.test(rnorm(25), mu=0)$p.value), method="fdr")
# Test
head(experiment.pval.fdr)## [1] 0.8823020 0.9255108 0.9423436 0.9423436 0.9558437 0.9423436
length(experiment.pval.fdr) # The number of elements in a vector## [1] 1000
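Because every simulated sample comes from the null distribution, very few adjusted p-values should fall below 0.05; a quick tally (a sketch) makes that concrete:
sum(experiment.pval.fdr < 0.05) # number of tests still "significant" after FDR correction (typically 0)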
# Load libraries
library(readxl) # read xlsx format
library(reshape2) # to use the melt function##
## Attaching package: 'reshape2'
## The following object is masked from 'package:tidyr':
##
## smiths
library(ggplot2) # for data viz
# Read in wide format data
data_wide <- read_xlsx("C:\\Users\\hlear\\Desktop\\statsandviz\\R04_anova\\data\\data_ANOVA.xlsx")
# Explore data
head(data_wide)## # A tibble: 6 × 3
## Glyphosate Triclopyr Control
## <dbl> <dbl> <dbl>
## 1 20 17 34
## 2 17 14 31
## 3 24 12 29
## 4 18 16 32
## 5 16 18 27
## 6 22 13 30
colnames(data_wide)## [1] "Glyphosate" "Triclopyr" "Control"
class(data_wide)## [1] "tbl_df" "tbl" "data.frame"
# Melt the data into long format
data_long <- melt(data_wide, id.vars = NULL)
# Explore data
data_long## variable value
## 1 Glyphosate 20
## 2 Glyphosate 17
## 3 Glyphosate 24
## 4 Glyphosate 18
## 5 Glyphosate 16
## 6 Glyphosate 22
## 7 Triclopyr 17
## 8 Triclopyr 14
## 9 Triclopyr 12
## 10 Triclopyr 16
## 11 Triclopyr 18
## 12 Triclopyr 13
## 13 Control 34
## 14 Control 31
## 15 Control 29
## 16 Control 32
## 17 Control 27
## 18 Control 30
colnames(data_long)## [1] "variable" "value"
class(data_long)## [1] "data.frame"
# boxplot using ggplot2
ggplot(data_long, aes(x = variable, y = value)) +
geom_boxplot() +
ggtitle("Percent Cover of Kudzu After Treatment") +
xlab("Treatment") +
ylab("Percent Cover")
In the ANOVA output below, the p-value is very small (1.318e-07), indicating a significant difference among the group means. The “***” next to the p-value marks the result as highly significant (p < 0.001).
# Fit a linear regression model
model <- lm(value ~ variable, data = data_long)
# Perform an ANOVA on the model
anova(model)## Analysis of Variance Table
##
## Response: value
## Df Sum Sq Mean Sq F value Pr(>F)
## variable 2 763 381.5 54.5 1.318e-07 ***
## Residuals 15 105 7.0
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
The R-squared value below (about 0.88) indicates that the regression model explains most of the variation in percent cover.
summary(model)$r.squared## [1] 0.8790323
# Perform Tukey's test
TukeyHSD(aov(value~variable, data=data_long))## Tukey multiple comparisons of means
## 95% family-wise confidence level
##
## Fit: aov(formula = value ~ variable, data = data_long)
##
## $variable
## diff lwr upr p adj
## Triclopyr-Glyphosate -4.5 -8.467701 -0.5322987 0.0255594
## Control-Glyphosate 11.0 7.032299 14.9677013 0.0000087
## Control-Triclopyr 15.5 11.532299 19.4677013 0.0000001
# Read in data
data <- read.table("C:\\Users\\hlear\\Desktop\\statsandviz\\R04_anova\\data\\growth.txt", sep = "\t", header = TRUE)
# Explore data
class(data)## [1] "data.frame"
head(data)## supplement diet gain
## 1 supergain wheat 17.37125
## 2 supergain wheat 16.81489
## 3 supergain wheat 18.08184
## 4 supergain wheat 15.78175
## 5 control wheat 17.70656
## 6 control wheat 18.22717
colnames(data)## [1] "supplement" "diet" "gain"
# Create Boxplot
boxplot(gain~diet, data=data) # weight gain as a function of diet
boxplot(gain~supplement, data=data) # weight gain as a function of supplement
boxplot(gain~diet+supplement, data=data) # weight gain for each combination of diet and supplement
anova(lm(gain~diet+supplement, data=data)) # without interaction## Analysis of Variance Table
##
## Response: gain
## Df Sum Sq Mean Sq F value Pr(>F)
## diet 2 287.171 143.586 92.358 4.199e-16 ***
## supplement 3 91.881 30.627 19.700 3.980e-08 ***
## Residuals 42 65.296 1.555
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summary(lm(gain~diet+supplement, data=data))$r.squared # for R-squared## [1] 0.8530521
anova(lm(gain~diet*supplement, data=data)) # includes interaction## Analysis of Variance Table
##
## Response: gain
## Df Sum Sq Mean Sq F value Pr(>F)
## diet 2 287.171 143.586 83.5201 2.999e-14 ***
## supplement 3 91.881 30.627 17.8150 2.952e-07 ***
## diet:supplement 6 3.406 0.568 0.3302 0.9166
## Residuals 36 61.890 1.719
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summary(lm(gain~diet*supplement, data=data))$r.squared # for R-squared## [1] 0.8607168
# install.packages("AICcmodavg")
# library(AICcmodavg)
# Fit the first linear regression model
two_way <- lm(gain ~ diet + supplement, data = data)
# Extract the AIC value for the first model
aic_two_way <- AIC(two_way)
# Fit the second linear regression model
interaction <- lm(gain ~ diet * supplement, data = data)
# Extract the AIC value for the second model
aic_interaction <- AIC(interaction)
# Compare the AIC values to determine which model is best
if (aic_two_way < aic_interaction) {
cat("The two_way model is better, with AIC =", aic_two_way)
} else {
cat("The interaction model is better, with AIC =", aic_interaction)
}## The two_way model is better, with AIC = 164.9892
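A more compact alternative (a sketch) is to pass both fitted models to AIC() at once, which returns a small table of degrees of freedom and AIC values for side-by-side comparison:
AIC(two_way, interaction) # the lower AIC identifies the preferred model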
interaction.plot(data$diet, data$supplement, data$gain) # supersupp and agrimore intersect at wheat
interaction.plot(data$supplement, data$diet, data$gain) # parallel (little to no interaction)
The two-way (no interaction) model is better.
anova(lm(gain~diet+supplement, data=data)) # without interaction## Analysis of Variance Table
##
## Response: gain
## Df Sum Sq Mean Sq F value Pr(>F)
## diet 2 287.171 143.586 92.358 4.199e-16 ***
## supplement 3 91.881 30.627 19.700 3.980e-08 ***
## Residuals 42 65.296 1.555
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summary(lm(gain~diet+supplement, data=data))$r.squared # for R-squared## [1] 0.8530521
TukeyHSD(aov(gain~diet+supplement, data=data))## Tukey multiple comparisons of means
## 95% family-wise confidence level
##
## Fit: aov(formula = gain ~ diet + supplement, data = data)
##
## $diet
## diff lwr upr p adj
## oats-barley -3.092817 -4.163817 -2.021817 0e+00
## wheat-barley -5.990298 -7.061298 -4.919297 0e+00
## wheat-oats -2.897481 -3.968481 -1.826481 2e-07
##
## $supplement
## diff lwr upr p adj
## control-agrimore -2.6967005 -4.0583332 -1.3350677 0.0000234
## supergain-agrimore -3.3814586 -4.7430914 -2.0198259 0.0000003
## supersupp-agrimore -0.7273521 -2.0889849 0.6342806 0.4888738
## supergain-control -0.6847581 -2.0463909 0.6768746 0.5400389
## supersupp-control 1.9693484 0.6077156 3.3309811 0.0020484
## supersupp-supergain 2.6541065 1.2924737 4.0157392 0.0000307
Packages
library(corrgram) # graphical display of a correlation matrix (correlogram)
library(cowplot) # create publication-quality figures with ggplot2##
## Attaching package: 'cowplot'
## The following object is masked from 'package:lubridate':
##
## stamp
# Put the data into a data frame for manipulation
hyena <- data.frame(age = c(2,2,2,6,9,10,13,10,14,14,12,7,11,11,14,20),
Hz = c(840,670,580,470,540,660,510,520,500,480,400,650,460,500,580,500))
# Explore data
class(hyena)## [1] "data.frame"
head(hyena)## age Hz
## 1 2 840
## 2 2 670
## 3 2 580
## 4 6 470
## 5 9 540
## 6 10 660
colnames(hyena)## [1] "age" "Hz"
# Quick inspection of the data
# Plot using base R
plot(hyena)
# Hypothesis test on the correlation coefficient
cor.test(hyena$age, hyena$Hz, method = "pearson")##
## Pearson's product-moment correlation
##
## data: hyena$age and hyena$Hz
## t = -2.8194, df = 14, p-value = 0.01365
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.8453293 -0.1511968
## sample estimates:
## cor
## -0.601798
# Plot using ggplot2
ggplot(data=hyena, aes(x=age, y=Hz)) +
geom_point(size=2.5, alpha=0.4) +
geom_smooth(method="lm", se=F, size=1.5, color="firebrick4") +
xlab("Age (years)") +
ylab("Giggle Frequency (Hz)") +
ggtitle("Relationship between hyena age and giggle frequency (Hz)") +
theme_cowplot(14)## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## `geom_smooth()` using formula = 'y ~ x'
# Non-parametric rank correlation methods (Spearman, Kendall)
cor(hyena, method="spearman")## age Hz
## age 1.0000000 -0.5397807
## Hz -0.5397807 1.0000000
cor(hyena, method="kendall")## age Hz
## age 1.0000000 -0.4211174
## Hz -0.4211174 1.0000000
# Libraries
library(performance) # assess model quality and goodness of fit
library(see) # plotting, extras for ggplot2
# Read in the data
face_rawdata <- read.table(file = "R06_regression/data/face.txt", sep = "\t", header = TRUE)
# Explore data
class(face_rawdata)## [1] "data.frame"
colnames(face_rawdata)## [1] "Face" "Penalty"
head(face_rawdata)## Face Penalty
## 1 1.59 0.44
## 2 1.67 1.43
## 3 1.65 1.57
## 4 1.72 0.14
## 5 1.79 0.27
## 6 1.77 0.35
nrow(face_rawdata)## [1] 21
# Quick scatterplot using base R
plot(face_rawdata)
plot(face_rawdata$Face, face_rawdata$Penalty) # Alternative code to assign x and y variables
# Linear model
face_lm <- lm(Penalty ~ Face, data=face_rawdata)
summary(face_lm)##
## Call:
## lm(formula = Penalty ~ Face, data = face_rawdata)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.9338 -0.3883 -0.1260 0.3381 1.0711
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -4.505 2.089 -2.157 0.0440 *
## Face 3.189 1.150 2.774 0.0121 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.6129 on 19 degrees of freedom
## Multiple R-squared: 0.2882, Adjusted R-squared: 0.2508
## F-statistic: 7.694 on 1 and 19 DF, p-value: 0.0121
abline(face_lm, col = "red", lwd = 3) # summary(face_lm) tells us answers to a-e.
# But we can extract each individually and print the answers... if we're so inclined:
face_slope <- round(coef(face_lm)[2],3)
face_intercept <- round(coef(face_lm)[1],3)
face_df <- face_lm$df.residual
face_t_value <- round(summary(face_lm)$coefficients[2, "t value"], 3)
face_p_value <- round(summary(face_lm)$coefficients[2, "Pr(>|t|)"], 3)
face_r_squared <- round(summary(face_lm)$r.squared,3)
cat("Answer to questions:", "\n",
"a) The equation is: Penalty =", face_intercept, "+", face_slope, "* Face", "\n",
"b) Degrees of Freedom:", face_df, "\n",
"c) t-value for Face variable:", face_t_value, " (p-value:", face_p_value, ")", "\n",
"d) The slope of the relationship between Face and Penalty is positive and statistically significant.", "\n",
"e) R-squared:", face_r_squared, "\n")## Answer to questions:
## a) The equation is: Penalty = -4.505 + 3.189 * Face
## b) Degrees of Freedom: 19
## c) t-value for Face variable: 2.774 (p-value: 0.012 )
## d) The slope of the relationship between Face and Penalty is positive and statistically significant.
## e) R-squared: 0.288
# Check model assumptions
check_model(face_lm)## Not enough model terms in the conditional part of the model to check for
## multicollinearity.
# Read in data
bodymass <- read.table(file = "R06_regression/data/respiratoryrate_bodymass.txt", sep = "\t", header = TRUE)
# Explore
class(bodymass)## [1] "data.frame"
colnames(bodymass)## [1] "BodyMass" "Respiration"
nrow(bodymass)## [1] 10
head(bodymass)## BodyMass Respiration
## 1 453 666
## 2 1283 643
## 3 695 1512
## 4 1640 2198
## 5 1207 2535
## 6 2096 4176
# ========================================
# Scatterplots ===========================
# ========================================
# Scatterplot using base R
plot(bodymass$BodyMass, bodymass$Respiration, # the column is named "Respiration", not "RespiratoryRate"
xlab = "Body Mass",
ylab = "Respiratory Rate")
# Take the logarithm of both variables
logdata <- log(bodymass)
# Make a scatterplot of the linearized relationship
plot(logdata$BodyMass, logdata$Respiration,
xlab = "log(Body Mass)",
ylab = "log(Respiratory Rate)")
# ==================================================
# Use linear regression to estimate beta ===========
# ==================================================
# Fit a linear regression model to the transformed data
logdata_lm <- lm(logdata$Respiration ~ logdata$BodyMass, data = logdata)
summary(logdata_lm)##
## Call:
## lm(formula = logdata$Respiration ~ logdata$BodyMass, data = logdata)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.01062 -0.08150 0.00831 0.30669 0.43948
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.3401 1.2011 1.116 0.296938
## logdata$BodyMass 0.8574 0.1570 5.461 0.000601 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.4449 on 8 degrees of freedom
## Multiple R-squared: 0.7885, Adjusted R-squared: 0.762
## F-statistic: 29.82 on 1 and 8 DF, p-value: 0.000601
# Extract the coefficient for the BodyMass predictor variable
logbeta <- coef(logdata_lm)[2]
# beta <- coef(logdata_lm)["logdata$BodyMass"] # alternative code
cat("Logged Beta:", logbeta, "\n")## Logged Beta: 0.8574374
# Exponentiate the logbeta coefficient
beta <- exp(logbeta)
cat("Beta:", beta, "\n")## Beta: 2.357112
# Note: for a power law y = a*x^beta, the slope of the log-log regression is itself the
# exponent beta (about 0.86 here); exponentiating the slope, as above, does not recover it,
# and exponentiating the intercept would instead recover the constant a.
# Carry out a formal test of the null hypothesis beta = 0:
# the t-test on the slope in summary() below (t = 5.46, p = 0.0006) rejects this null.
summary(logdata_lm)##
## Call:
## lm(formula = logdata$Respiration ~ logdata$BodyMass, data = logdata)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.01062 -0.08150 0.00831 0.30669 0.43948
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.3401 1.2011 1.116 0.296938
## logdata$BodyMass 0.8574 0.1570 5.461 0.000601 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.4449 on 8 degrees of freedom
## Multiple R-squared: 0.7885, Adjusted R-squared: 0.762
## F-statistic: 29.82 on 1 and 8 DF, p-value: 0.000601
summary(logdata_lm)$coefficients## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.3401278 1.2011499 1.115704 0.296938084
## logdata$BodyMass 0.8574374 0.1570202 5.460680 0.000601046
summary(logdata_lm)$r.squared## [1] 0.7884663
# Check model assumptions
library(gvlma)
check_model(logdata_lm)## Warning: Using `$` in model formulas can produce unexpected results. Specify your
## model using the `data` argument instead.
## Try: Respiration ~ BodyMass,
## data = logdata
## Not enough model terms in the conditional part of the model to check for
## multicollinearity.
summary(gvlma(logdata_lm))##
## Call:
## lm(formula = logdata$Respiration ~ logdata$BodyMass, data = logdata)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.01062 -0.08150 0.00831 0.30669 0.43948
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.3401 1.2011 1.116 0.296938
## logdata$BodyMass 0.8574 0.1570 5.461 0.000601 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.4449 on 8 degrees of freedom
## Multiple R-squared: 0.7885, Adjusted R-squared: 0.762
## F-statistic: 29.82 on 1 and 8 DF, p-value: 0.000601
##
##
## ASSESSMENT OF THE LINEAR MODEL ASSUMPTIONS
## USING THE GLOBAL TEST ON 4 DEGREES-OF-FREEDOM:
## Level of Significance = 0.05
##
## Call:
## gvlma(x = logdata_lm)
##
## Value p-value Decision
## Global Stat 7.3733 0.11743 Assumptions acceptable.
## Skewness 2.9044 0.08834 Assumptions acceptable.
## Kurtosis 0.9461 0.33071 Assumptions acceptable.
## Link Function 0.0425 0.83667 Assumptions acceptable.
## Heteroscedasticity 3.4803 0.06210 Assumptions acceptable.
# Libraries
library(car) # regression## Loading required package: carData
##
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
##
## recode
## The following object is masked from 'package:purrr':
##
## some
# Read in the data
ozone <- read.csv(file = "R07_statistical_modeling/data/ozone.txt", sep="\t", header=T)
# Multiple panel bivariate scatterplot
pairs(ozone)
# Multiple linear regression
model_lm <- lm(ozone ~ wind + temp + rad, data = ozone)
# R^2
summary(model_lm)##
## Call:
## lm(formula = ozone ~ wind + temp + rad, data = ozone)
##
## Residuals:
## Min 1Q Median 3Q Max
## -40.485 -14.210 -3.556 10.124 95.600
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -64.23208 23.04204 -2.788 0.00628 **
## wind -3.33760 0.65384 -5.105 1.45e-06 ***
## temp 1.65121 0.25341 6.516 2.43e-09 ***
## rad 0.05980 0.02318 2.580 0.01124 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 21.17 on 107 degrees of freedom
## Multiple R-squared: 0.6062, Adjusted R-squared: 0.5952
## F-statistic: 54.91 on 3 and 107 DF, p-value: < 2.2e-16
# Co-linearity using variation inflation factor
# If any VIF values are >5, then there may be co-linearity issues.
vif(model_lm)## wind temp rad
## 1.328979 1.431201 1.095241
# Model diagnostics
check_model(model_lm)
hist(resid(model_lm), main = "Histogram of Residuals", xlab = "Residuals")
# Read in data
bodymass <- read.table(file = "R07_statistical_modeling/data/respiratoryrate_bodymass (1).txt", sep = "\t", header = TRUE)
# Explore
class(bodymass)## [1] "data.frame"
colnames(bodymass)## [1] "BodyMass" "Respiration"
nrow(bodymass)## [1] 10
head(bodymass)## BodyMass Respiration
## 1 453 666
## 2 1283 643
## 3 695 1512
## 4 1640 2198
## 5 1207 2535
## 6 2096 4176
# ========================================
# Scatterplots ===========================
# ========================================
# Scatterplot using base R
plot(bodymass$BodyMass, bodymass$Respiration, # the column is named "Respiration", not "RespiratoryRate"
xlab = "Body Mass",
ylab = "Respiratory Rate")
# Take the logarithm of both variables
logdata <- log(bodymass)
# Make a scatterplot of the linearized relationship
plot(logdata$BodyMass, logdata$Respiration,
xlab = "log(Body Mass)",
ylab = "log(Respiratory Rate)")
# ==================================================
# Use linear regression to estimate beta ===========
# ==================================================
# Fit a linear regression model to the transformed data
logdata_lm <- lm(logdata$Respiration ~ logdata$BodyMass, data = logdata)
summary(logdata_lm)##
## Call:
## lm(formula = logdata$Respiration ~ logdata$BodyMass, data = logdata)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.01062 -0.08150 0.00831 0.30669 0.43948
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.3401 1.2011 1.116 0.296938
## logdata$BodyMass 0.8574 0.1570 5.461 0.000601 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.4449 on 8 degrees of freedom
## Multiple R-squared: 0.7885, Adjusted R-squared: 0.762
## F-statistic: 29.82 on 1 and 8 DF, p-value: 0.000601
# Extract the coefficient for the BodyMass predictor variable
logbeta <- coef(logdata_lm)[2]
# beta <- coef(logdata_lm)["logdata$BodyMass"] # alternative code
cat("Logged Beta:", logbeta, "\n")## Logged Beta: 0.8574374
# Exponentiate the logbeta coefficient
beta <- exp(logbeta)
cat("Beta:", beta, "\n")## Beta: 2.357112
# Note: for a power law y = a*x^beta, the slope of the log-log regression is itself the
# exponent beta (about 0.86 here); exponentiating the slope, as above, does not recover it,
# and exponentiating the intercept would instead recover the constant a.
# Carry out a formal test of the null hypothesis beta = 0:
# the t-test on the slope in summary() below (t = 5.46, p = 0.0006) rejects this null.
summary(logdata_lm)##
## Call:
## lm(formula = logdata$Respiration ~ logdata$BodyMass, data = logdata)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.01062 -0.08150 0.00831 0.30669 0.43948
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.3401 1.2011 1.116 0.296938
## logdata$BodyMass 0.8574 0.1570 5.461 0.000601 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.4449 on 8 degrees of freedom
## Multiple R-squared: 0.7885, Adjusted R-squared: 0.762
## F-statistic: 29.82 on 1 and 8 DF, p-value: 0.000601
summary(logdata_lm)$coefficients## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.3401278 1.2011499 1.115704 0.296938084
## logdata$BodyMass 0.8574374 0.1570202 5.460680 0.000601046
summary(logdata_lm)$r.squared## [1] 0.7884663
# Check model assumptions
check_model(logdata_lm)## Warning: Using `$` in model formulas can produce unexpected results. Specify your
## model using the `data` argument instead.
## Try: Respiration ~ BodyMass,
## data = logdata
## Not enough model terms in the conditional part of the model to check for
## multicollinearity.
summary(gvlma(logdata_lm))##
## Call:
## lm(formula = logdata$Respiration ~ logdata$BodyMass, data = logdata)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.01062 -0.08150 0.00831 0.30669 0.43948
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.3401 1.2011 1.116 0.296938
## logdata$BodyMass 0.8574 0.1570 5.461 0.000601 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.4449 on 8 degrees of freedom
## Multiple R-squared: 0.7885, Adjusted R-squared: 0.762
## F-statistic: 29.82 on 1 and 8 DF, p-value: 0.000601
##
##
## ASSESSMENT OF THE LINEAR MODEL ASSUMPTIONS
## USING THE GLOBAL TEST ON 4 DEGREES-OF-FREEDOM:
## Level of Significance = 0.05
##
## Call:
## gvlma(x = logdata_lm)
##
## Value p-value Decision
## Global Stat 7.3733 0.11743 Assumptions acceptable.
## Skewness 2.9044 0.08834 Assumptions acceptable.
## Kurtosis 0.9461 0.33071 Assumptions acceptable.
## Link Function 0.0425 0.83667 Assumptions acceptable.
## Heteroscedasticity 3.4803 0.06210 Assumptions acceptable.
# Libraries
library(ggeffects) # predicted probabilities##
## Attaching package: 'ggeffects'
## The following object is masked from 'package:cowplot':
##
## get_title
# Load in data
stork <- read.csv(file = "R07_statistical_modeling/data/stork.txt", sep = "\t", header = T)
# Scatterplot
plot(stork)
ggplot(stork, aes(x = Corticosterone, y = Survival)) +
geom_point() +
labs(title = "Scatterplot of Corticosterone and Survival",
x = "Corticosterone (ng/ml)",
y = "Survival (0 = Died, 1 = Survived)")# Binary, so logistic regression is appropriate.
# Logistic regression
model_glm <- glm(Survival ~ Corticosterone, data = stork, family = binomial)
model_glm##
## Call: glm(formula = Survival ~ Corticosterone, family = binomial, data = stork)
##
## Coefficients:
## (Intercept) Corticosterone
## 2.7030 -0.0798
##
## Degrees of Freedom: 33 Total (i.e. Null); 32 Residual
## Null Deviance: 45.23
## Residual Deviance: 41.4 AIC: 45.4
# Pseudo-R2 of the model
r2(model_glm)## # R2 for Logistic Regression
## Tjur's R2: 0.102
# P-value of the model
summary(model_glm)##
## Call:
## glm(formula = Survival ~ Corticosterone, family = binomial, data = stork)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.4316 -0.8724 -0.6703 1.2125 1.8211
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 2.70304 1.74725 1.547 0.1219
## Corticosterone -0.07980 0.04368 -1.827 0.0677 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 45.234 on 33 degrees of freedom
## Residual deviance: 41.396 on 32 degrees of freedom
## AIC: 45.396
##
## Number of Fisher Scoring iterations: 4
# Plot the predicted probabilities of survival
predicted <- ggpredict(model_glm, terms = "Corticosterone")## Data were 'prettified'. Consider using `terms="Corticosterone [all]"` to
## get smooth plots.
plot(predicted)
# Plot with logistic regression curve
ggplot(stork, aes(x = Corticosterone, y = Survival)) +
geom_point() +
labs(title = "Scatterplot of Corticosterone and Survival",
x = "Corticosterone (ng/ml)",
y = "Survival (0 = Died, 1 = Survived)") +
stat_smooth(method = "glm", method.args = list(family = "binomial"), se = FALSE)## `geom_smooth()` using formula = 'y ~ x'
# Model diagnostics
check_model(model_glm)## Not enough model terms in the conditional part of the model to check for
## multicollinearity.
hist(resid(model_glm), main = "Histogram of Residuals", xlab = "Residuals")
# Libraries
library(MASS) # perform negative binomial regression##
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
##
## select
clusters <- read.csv(file = "R07_statistical_modeling/data/clusters.txt", sep = "\t", header = T)
# Make a scatterplot of the data
ggplot(clusters, aes(x=Distance, y=Cancers)) + geom_point()
# Which regression is more appropriate for these data? (Don't take overdispersion into account for now.)
# Since the response variable is a count, a Poisson regression model is appropriate.
# Given your choice, is the trend significant?
# Fit a Poisson regression model
model_poisson <- glm(Cancers ~ Distance, data=clusters, family=poisson)
summary(model_poisson)##
## Call:
## glm(formula = Cancers ~ Distance, family = poisson, data = clusters)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.5504 -1.3491 -1.1553 0.3877 3.1304
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.186865 0.188728 0.990 0.3221
## Distance -0.006138 0.003667 -1.674 0.0941 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for poisson family taken to be 1)
##
## Null deviance: 149.48 on 93 degrees of freedom
## Residual deviance: 146.64 on 92 degrees of freedom
## AIC: 262.41
##
## Number of Fisher Scoring iterations: 5
# p-value 0.0941 > 0.05 ; not significant
# What is the pseudo-R2 of the model?
r2(model_poisson)## # R2 for Generalized Linear Regression
## Nagelkerke's R2: 0.037
# Include the predicted relationship from the model in the scatterplot
ggplot(clusters, aes(x=Distance, y=Cancers)) +
geom_point() +
geom_smooth(method="glm", method.args=list(family="poisson"), se=FALSE)## `geom_smooth()` using formula = 'y ~ x'
# Do you think there might be some evidence of overdispersion?
check_overdispersion(model_poisson)## # Overdispersion test
##
## dispersion ratio = 1.555
## Pearson's Chi-Squared = 143.071
## p-value = 0.001
## Overdispersion detected.
# Perform a new generalized linear model with a distribution that better accounts for overdispersion
# Negative binomial regression model
model_nb <- glm.nb(Cancers ~ Distance, data=clusters)
summary(model_nb)##
## Call:
## glm.nb(formula = Cancers ~ Distance, data = clusters, init.theta = 1.359466981,
## link = log)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.3103 -1.1805 -1.0442 0.3065 1.9582
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.182490 0.252434 0.723 0.470
## Distance -0.006041 0.004727 -1.278 0.201
##
## (Dispersion parameter for Negative Binomial(1.3595) family taken to be 1)
##
## Null deviance: 96.647 on 93 degrees of freedom
## Residual deviance: 94.973 on 92 degrees of freedom
## AIC: 253.19
##
## Number of Fisher Scoring iterations: 1
##
##
## Theta: 1.359
## Std. Err.: 0.612
##
## 2 x log-likelihood: -247.191
# Check this last model assumptions
# Model diagnostics
check_model(model_nb)## Not enough model terms in the conditional part of the model to check for
## multicollinearity.
hist(resid(model_nb), main = "Histogram of Residuals", xlab = "Residuals")
# Read in data
jaws <- read.csv(file = "R07_statistical_modeling/data/jaws.txt", sep = "\t", header = T)
# a) Make a scatterplot of the data (age explanatory, bone response)
ggplot(jaws, aes(x = age, y = bone)) +
geom_point() +
labs(x = "Age (years)", y = "Bone length (mm)",
title = "Scatterplot of age vs. bone length")# b) Perform a non-linear regression assuming an asymptotic exponential relationship:
# y=a(1-e^(-cx))
# "a" is the estimated maximum bone length
range(jaws$bone) # 142## [1] 0 142
# Regression to get the "c" starting value
jaws_lm <- lm(bone ~ age, data = jaws)
summary(jaws_lm) # slope = 1.642##
## Call:
## lm(formula = bone ~ age, data = jaws)
##
## Residuals:
## Min 1Q Median 3Q Max
## -53.259 -10.457 2.353 18.048 33.589
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 53.259 5.837 9.124 2.23e-12 ***
## age 1.642 0.201 8.166 6.97e-11 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 22.3 on 52 degrees of freedom
## Multiple R-squared: 0.5619, Adjusted R-squared: 0.5534
## F-statistic: 66.68 on 1 and 52 DF, p-value: 6.968e-11
# The starting value for parameter c is estimated by taking the reciprocal
# of the slope of the linear regression between age and bone length.
# c = 1/1.642 = 0.6088
exp_model <- nls(bone ~ a*(1-exp(-c*age)), data = jaws, start = list(a = 142, c = 0.6088))
summary(exp_model)##
## Formula: bone ~ a * (1 - exp(-c * age))
##
## Parameters:
## Estimate Std. Error t value Pr(>|t|)
## a 115.58052 2.84364 40.645 < 2e-16 ***
## c 0.11882 0.01233 9.635 3.69e-13 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 13.1 on 52 degrees of freedom
##
## Number of iterations to convergence: 6
## Achieved convergence tolerance: 1.423e-06
# c) Perform a non-linear regression assuming a Michaelis-Menten model:
# y=ax/(1+bx)
mm_model <- nls(bone ~ a*age/(1+b*age), data = jaws, start = list(a = 142, b = 0.6088))
summary(mm_model)##
## Formula: bone ~ a * age/(1 + b * age)
##
## Parameters:
## Estimate Std. Error t value Pr(>|t|)
## a 18.72540 2.52587 7.413 1.09e-09 ***
## b 0.13596 0.02339 5.814 3.79e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 13.77 on 52 degrees of freedom
##
## Number of iterations to convergence: 8
## Achieved convergence tolerance: 2.686e-06
# d) Estimate the percentage of variation explained by both models
# (comparing them with a null model with only a constant)
null_model <- lm(bone ~ 1, data = jaws)
1 - (sum(residuals(exp_model)^2)/sum(residuals(null_model)^2))## [1] 0.8486791
1 - (sum(residuals(mm_model)^2)/sum(residuals(null_model)^2))## [1] 0.8329987
# e) Compare both models with Akaike’s Information Criterion (AIC).
# Which model is better?
AIC(exp_model)## [1] 435.0823
AIC(mm_model) # higher AIC## [1] 440.4066
# f) Make a scatterplot of the data and include both regression lines
ggplot(jaws, aes(x = age, y = bone)) +
geom_point() +
labs(x = "Age (years)", y = "Bone length (mm)",
title = "Scatterplot of age vs. bone length") +
geom_smooth(method = "nls",
formula = y ~ a*(1-exp(-c*x)),
se = FALSE,
method.args = list(start = c(a = 140, c = 0.1)),
color = "red") +
geom_smooth(method = "nls",
formula = y ~ a*x/(1+b*x),
se = FALSE,
method.args = list(start = c(a = 100, b = 0.01)),
color = "blue") +
theme_bw()
# Alternative code
# f) Make a scatterplot of the data and include both regression lines
# Create predicted values for both models
jaws$exp_pred <- predict(exp_model)
jaws$mm_pred <- predict(mm_model)
# Create scatterplot with both regression lines
ggplot(jaws, aes(x = age, y = bone)) +
geom_point() +
geom_line(aes(y = exp_pred, color = "Exponential")) + # map colour inside aes() so the legend appears
geom_line(aes(y = mm_pred, color = "Michaelis-Menten")) +
labs(x = "Age (years)", y = "Bone length (mm)",
title = "Regression lines for age vs. bone length",
color = "Model") +
scale_color_manual(values = c("Exponential" = "blue", "Michaelis-Menten" = "red"))
# # Load libraries
# library(lme4)
#
# # Load data into dataframe. Let's call it "data"...
# data <- read.table("data.txt", sep = "\t", header = T)
#
# # Generate the linear mixed effects model
# # with turbidity, temperature, tide, and wave action as fixed factors
# # and site and date as random effects
# model <- lmer(growth ~ turbidity + temperature + tide + wave_action + (1 | site) + (1 | date), data = data)
#
# # Output
# summary(model)
# a)
# 3 soil types and 2 levels of sterilization
# 3 x 2 = 6 fixed treatments
# b)
# 6 fixed treatments, Replicated in 5 pots, 4 different greenhouses
# 6 x 5 x 4 = 120 total number of pots
# c)
# 6 seeds planted in each pot, 120 pots in total
# 6 x 120 = 720 total number of plants.
# d)
# R code to analyze these data:
# Load libraries
library(dplyr)
library(lme4)## Loading required package: Matrix
##
## Attaching package: 'Matrix'
## The following objects are masked from 'package:tidyr':
##
## expand, pack, unpack
# Create a data frame with the experimental design
saguaros <- expand.grid(soil_type = c("remnant", "cultivated", "restored"),
sterilization = c("yes", "no"),
greenhouse = 1:4,
pot = 1:5)
# Simulate germination data for each treatment
set.seed(123)
germination <- data.frame(treatment = 1:nrow(saguaros),
germination = rbinom(nrow(saguaros), 6, 0.5))
# Combine the design and germination data frames
data <- cbind(saguaros, germination)
# Fit the mixed-effects logistic regression model
mixed_effects <- glmer(germination/6 ~ soil_type * sterilization + (1 | greenhouse/pot),
data = data, family = binomial())## Warning in eval(family$initialize, rho): non-integer #successes in a binomial
## glm!
## Warning in checkConv(attr(opt, "derivs"), opt$par, ctrl = control$checkConv, :
## Model failed to converge with max|grad| = 0.00296322 (tol = 0.002, component 1)
# Summarize the results
summary(mixed_effects)## Generalized linear mixed model fit by maximum likelihood (Laplace
## Approximation) [glmerMod]
## Family: binomial ( logit )
## Formula: germination/6 ~ soil_type * sterilization + (1 | greenhouse/pot)
## Data: data
##
## AIC BIC logLik deviance df.resid
## 163.9 186.2 -73.9 147.9 112
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -0.81613 0.00002 0.33335 0.59960 2.38134
##
## Random effects:
## Groups Name Variance Std.Dev.
## pot:greenhouse (Intercept) 8.180e-08 0.0002860
## greenhouse (Intercept) 3.964e-08 0.0001991
## Number of obs: 120, groups: pot:greenhouse, 20; greenhouse, 4
##
## Fixed effects:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.4057537 0.4564483 -0.889 0.374
## soil_typecultivated -0.0006088 0.6455354 -0.001 0.999
## soil_typerestored -0.6926621 0.6891910 -1.005 0.315
## sterilizationno 0.4057159 0.6390188 0.635 0.525
## soil_typecultivated:sterilizationno -0.4042833 0.9083118 -0.445 0.656
## soil_typerestored:sterilizationno -1.0426255 1.0331193 -1.009 0.313
##
## Correlation of Fixed Effects:
## (Intr) sl_typc sl_typr strlzt sl_typc:
## sl_typcltvt -0.707
## sl_typrstrd -0.662 0.468
## steriliztnn -0.714 0.505 0.473
## sl_typcltv: 0.503 -0.711 -0.333 -0.704
## sl_typrstr: 0.442 -0.312 -0.667 -0.619 0.435
## optimizer (Nelder_Mead) convergence code: 0 (OK)
## Model failed to converge with max|grad| = 0.00296322 (tol = 0.002, component 1)
Interpretation of the three plots: (1) the multilevel model shows an overall positive relationship between previous performance and negative affect; (2) the ordinary least-squares regression shows an overall negative relationship, with an intercept equal to the average of the four subjects; (3) the subject-level fits show the relationship between previous performance and negative affect for each subject, with the same slope but different intercepts.
# Load data
dragons_data <- load("R07_statistical_modeling/data/dragons.RData")
# Explore data
str(dragons) # structure## 'data.frame': 480 obs. of 5 variables:
## $ testScore : num 16.15 33.89 6.04 18.84 33.86 ...
## $ bodyLength : num 166 168 166 168 170 ...
## $ mountainRange: Factor w/ 8 levels "Bavarian","Central",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ X : logi NA NA NA NA NA NA ...
## $ site : Factor w/ 3 levels "a","b","c": 1 1 1 1 1 1 1 1 1 1 ...
summary(dragons) # summary## testScore bodyLength mountainRange X site
## Min. : 0.00 Min. :162.3 Bavarian: 60 Mode:logical a:160
## 1st Qu.: 32.60 1st Qu.:191.8 Central : 60 NA's:480 b:160
## Median : 51.00 Median :202.9 Emmental: 60 c:160
## Mean : 50.39 Mean :201.3 Julian : 60
## 3rd Qu.: 67.51 3rd Qu.:213.1 Ligurian: 60
## Max. :100.00 Max. :236.4 Maritime: 60
## (Other) :120
# a) Perform a simple linear regression with testScore as response and bodyLength as explanatory
dragons_lm <- lm(testScore ~ bodyLength, data = dragons)
summary(dragons_lm)##
## Call:
## lm(formula = testScore ~ bodyLength, data = dragons)
##
## Residuals:
## Min 1Q Median 3Q Max
## -56.962 -16.411 -0.783 15.193 55.200
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -61.31783 12.06694 -5.081 5.38e-07 ***
## bodyLength 0.55487 0.05975 9.287 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 21.2 on 478 degrees of freedom
## Multiple R-squared: 0.1529, Adjusted R-squared: 0.1511
## F-statistic: 86.25 on 1 and 478 DF, p-value: < 2.2e-16
# b) Plot the data with ggplot2 and add a linear regression line with confidence intervals. Hint: geom_smooth()
library(ggplot2)
ggplot(dragons, aes(x = bodyLength, y = testScore)) +
geom_point() +
geom_smooth(method = "lm", se = TRUE)## `geom_smooth()` using formula = 'y ~ x'
# c) We collected multiple samples from 8 mountain ranges. Generate a boxplot using (or not) ggplot2 to explore this new explanatory variable
ggplot(dragons, aes(x = mountainRange, y = testScore)) +
geom_boxplot()
# d) Now repeat the scatterplot in b) but coloring by mountain range and without linear regression line
ggplot(dragons, aes(x = bodyLength, y = testScore, color = mountainRange)) +
geom_point()
# e) Instead of coloring, use facet_wrap() to separate by mountain range
ggplot(dragons, aes(x = bodyLength, y = testScore)) +
geom_point() +
facet_wrap(~ mountainRange)
# f) Perform a new linear model adding mountain range and assuming that it is fixed effects
dragons_lm_fixed <- lm(testScore ~ bodyLength + mountainRange, data = dragons)
summary(dragons_lm_fixed)##
## Call:
## lm(formula = testScore ~ bodyLength + mountainRange, data = dragons)
##
## Residuals:
## Min 1Q Median 3Q Max
## -52.263 -9.926 0.361 9.994 44.488
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 20.83051 14.47218 1.439 0.15072
## bodyLength 0.01267 0.07974 0.159 0.87379
## mountainRangeCentral 36.58277 3.59929 10.164 < 2e-16 ***
## mountainRangeEmmental 16.20923 3.69665 4.385 1.43e-05 ***
## mountainRangeJulian 45.11469 4.19012 10.767 < 2e-16 ***
## mountainRangeLigurian 17.74779 3.67363 4.831 1.84e-06 ***
## mountainRangeMaritime 49.88133 3.13924 15.890 < 2e-16 ***
## mountainRangeSarntal 41.97841 3.19717 13.130 < 2e-16 ***
## mountainRangeSouthern 8.51961 2.73128 3.119 0.00192 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 14.96 on 471 degrees of freedom
## Multiple R-squared: 0.5843, Adjusted R-squared: 0.5773
## F-statistic: 82.76 on 8 and 471 DF, p-value: < 2.2e-16
# g) Perform the same linear model as before but now assuming mountain range is random effects
library(lme4)
dragons_lm_random <- lmer(testScore ~ bodyLength + (1|mountainRange), data = dragons)
summary(dragons_lm_random)## Linear mixed model fit by REML ['lmerMod']
## Formula: testScore ~ bodyLength + (1 | mountainRange)
## Data: dragons
##
## REML criterion at convergence: 3991.2
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -3.4815 -0.6513 0.0066 0.6685 2.9583
##
## Random effects:
## Groups Name Variance Std.Dev.
## mountainRange (Intercept) 339.7 18.43
## Residual 223.8 14.96
## Number of obs: 480, groups: mountainRange, 8
##
## Fixed effects:
## Estimate Std. Error t value
## (Intercept) 43.70938 17.13489 2.551
## bodyLength 0.03316 0.07865 0.422
##
## Correlation of Fixed Effects:
## (Intr)
## bodyLength -0.924
# h) How much of the variation in test scores is explained by the random effect (mountain range)
VarCorr(dragons_lm_random)## Groups Name Std.Dev.
## mountainRange (Intercept) 18.43
## Residual 14.96
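To put a number on it (a sketch based on the variance components printed above), the proportion of variance attributable to mountain range, i.e. the intraclass correlation, is the between-range variance divided by the total variance:
vc <- as.data.frame(VarCorr(dragons_lm_random))
vc$vcov[1] / sum(vc$vcov) # roughly 339.7 / (339.7 + 223.8), about 0.60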
# i) Estimate the R2 (conditional and marginal) of the mixed-effects model
library(performance)
performance::r2(dragons_lm_random)## # R2 for Mixed Models
##
## Conditional R2: 0.603
## Marginal R2: 0.001
# j) Check this last model assumptions
library(DHARMa) #model diagnostics for glm## This is DHARMa 0.4.6. For overview type '?DHARMa'. For recent changes, type news(package = 'DHARMa')
simulateResiduals(dragons_lm_random, plot = T)## Object of Class DHARMa with simulated residuals based on 250 simulations with refit = FALSE . See ?DHARMa::simulateResiduals for help.
##
## Scaled residual values: 0.096 0.288 0.032 0.068 0.268 0.428 0.036 0.036 0.028 0.032 0.288 0.348 0.088 0.028 0.084 0.072 0.132 0.104 0.152 0.08 ...
# k) Include the fitted lines for each mountain range (plot from e). [Hint: predict function within geom_line layer]
dragons$predicted <- predict(dragons_lm_random) # default re.form includes the mountainRange random intercepts, giving a separate fitted line per range
ggplot(dragons, aes(x = bodyLength, y = testScore)) +
geom_point() +
geom_line(aes(y = predicted, group = mountainRange), color = "blue") +
facet_wrap(~ mountainRange)
# Load data
estuaries <- read.csv(file = "R07_statistical_modeling/data/Estuaries.csv", header = T)
head(estuaries)## X Modification Estuary Site Hydroid Total Schizoporella.errata
## 1 1 Modified JAK 1 0 44 15
## 2 2 Modified JAK 1 0 42 8
## 3 3 Modified JAK 2 0 32 9
## 4 4 Modified JAK 2 0 44 14
## 5 5 Modified JAK 3 1 42 6
## 6 6 Modified JAK 3 1 48 12
# a) Fit a linear mixed model with Total as response and Modification as explanatory, controlling for Estuary
estuary.lme <- lmer(Total~Modification+(1|Estuary), data=estuaries)
# b) Estimate the R2 (conditional and marginal) of this model
r2(estuary.lme)## # R2 for Mixed Models
##
## Conditional R2: 0.553
## Marginal R2: 0.267
# c) Plot the data with ggplot2 in a way that helps you understand the different effects
# jitterplot with the effects of estuaries
ggplot(estuaries, aes(x = Modification, y = Total, color = Estuary)) +
geom_point() +
stat_summary(fun = mean, geom = "point", shape = 18, size = 3) +
labs(title = "Total Invertebrates by Modification and Estuary")ggplot(estuaries, aes(x = Modification, y = Total)) +
geom_boxplot()+
geom_jitter(aes(color=Estuary))
# d) Include the variable Site as a random effect.
# Do you think this corresponds to a crossed or a nested design?
# Fit the linear mixed model with Site as a random effect
model_site <- lmer(Total ~ Modification + (1 | Estuary) + (1 | Estuary:Site), data = estuaries)
# Nested design because each site is nested within a specific estuary.
# e) What are the R2 (conditional and marginal) of the model including Site
r2(model_site)## # R2 for Mixed Models
##
## Conditional R2: 0.774
## Marginal R2: 0.270
# f) Check the model assumptions
library(DHARMa)
simulateResiduals(model_site, plot = T)## Object of Class DHARMa with simulated residuals based on 250 simulations with refit = FALSE . See ?DHARMa::simulateResiduals for help.
##
## Scaled residual values: 0.584 0.552 0.208 0.576 0.528 0.732 0.604 0.292 0.588 0.976 0.08 0.628 0.916 0.868 0.812 0.904 0.74 0.74 0.464 0.62 ...
# g) Plot the data trying to include Site
ggplot(estuaries, aes(x=Site, y=Total, fill=Modification))+
geom_boxplot()+
geom_point()+
facet_grid(~Estuary)
ggplot(estuaries, aes(x = Modification, y = Total, color = Estuary)) +
geom_point(aes(shape = factor(Site)), size = 3) +
labs(title = "Total Invertebrates by Modification, Estuary, and Site") +
scale_shape_discrete(name = "Site")
# h) Transform the variable Hydroid to presence/absence data
estuaries$Hydroid <- ifelse(estuaries$Hydroid == 0, 0, 1) # Not recommended since it changes the raw data
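# A non-destructive alternative (sketch; "Hydroid_pa" is a hypothetical column name):
# keep the raw counts and derive a separate presence/absence column, then model
# that column instead of overwriting Hydroid.
# estuaries$Hydroid_pa <- as.integer(estuaries$Hydroid > 0)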
# i) Fit a generalized linear mixed model (GLMM) with this transformed variable as Response and the same fixed and random effects as in d). [Hint: function glmer]
hydroid.GLM <- glmer(Hydroid~Modification + (1|Estuary/Site), data = estuaries, family = "binomial")
summary(hydroid.GLM)## Generalized linear mixed model fit by maximum likelihood (Laplace
## Approximation) [glmerMod]
## Family: binomial ( logit )
## Formula: Hydroid ~ Modification + (1 | Estuary/Site)
## Data: estuaries
##
## AIC BIC logLik deviance df.resid
## 57.1 65.0 -24.5 49.1 50
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -1.06063 -0.25028 -0.05443 0.25475 0.98719
##
## Random effects:
## Groups Name Variance Std.Dev.
## Site:Estuary (Intercept) 14.45 3.802
## Estuary (Intercept) 1.13 1.063
## Number of obs: 54, groups: Site:Estuary, 27; Estuary, 7
##
## Fixed effects:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -5.710 2.419 -2.360 0.0183 *
## ModificationPristine 6.534 3.140 2.081 0.0375 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation of Fixed Effects:
## (Intr)
## MdfctnPrstn -0.888
# j) Check the model assumptions
simulateResiduals(hydroid.GLM, plot = T)## Object of Class DHARMa with simulated residuals based on 250 simulations with refit = FALSE . See ?DHARMa::simulateResiduals for help.
##
## Scaled residual values: 0.6382768 0.3853122 0.8086934 0.01626812 0.9424474 0.9426142 0.5324421 0.3949867 0.82084 0.4799594 0.8009645 0.03378411 0.3224195 0.3157432 0.602992 0.6115054 0.276876 0.8050706 0.7838545 0.5106198 ...
r2(hydroid.GLM)## # R2 for Mixed Models
##
## Conditional R2: 0.888
## Marginal R2: 0.358
# Centered 3-point moving average: each value is replaced by the mean of itself
# and its two neighbours; the first and last positions have no full window and
# stay at their initialized value of 0.
moving_average <- function(x) {
n <- length(x)
y <- numeric(n) # Initialize an empty vector to store the moving average values
for (i in 2:(n - 1)) {
y[i] <- (x[i - 1] + x[i] + x[i + 1]) / 3
}
return(y)
}
# Example:
input_vector <- c(1, 2, 3, 4, 5, 6, 7)
result <- moving_average(input_vector)
print(result)## [1] 0 2 3 4 5 6 0
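# For comparison, a base-R sketch of the same centered 3-point moving average
# using stats::filter(); here the endpoints come back as NA rather than 0.
as.numeric(stats::filter(input_vector, rep(1/3, 3), sides = 2))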
# Load packages
library(TTR)
library(forecast)## Registered S3 method overwritten by 'quantmod':
## method from
## as.zoo.data.frame zoo
# 2. Read the temp.txt file. The data correspond to monthly average temperatures.
temp<-read.table("R08_timeseries_spatialanalysis/temp.txt", header=T)
# a) Plot the time series data. [Hint: first you need to create a monthly time series object]
temp_ts<-ts(temp$temps, frequency=12)
plot(temp_ts)
# b) Calculate the 5-point moving average and plot it together with the time series
temp_ma<-SMA(temp_ts, n = 5)
lines(temp_ma, col = "red", lwd = 2)
# c) Decompose the time series into seasonal, trend and residual error components
temp_decomp <- decompose(temp_ts)
plot(temp_decomp)
# d) Generate a temporal correlogram to assess the autocorrelation of the time series
acf(temp_ts)
# e) Generate a new correlogram but removing the trend and seasonal variation
acf(temp_decomp$random[!is.na(temp_decomp$random)])
# f) Find the best ARIMA model using the forecast package
temp_fit<-auto.arima(temp_ts)
summary(temp_fit)## Series: temp_ts
## ARIMA(0,0,1)(2,1,0)[12]
##
## Coefficients:
## ma1 sar1 sar2
## 0.2456 -0.5212 -0.3233
## s.e. 0.0774 0.0823 0.0827
##
## sigma^2 = 3.772: log likelihood = -300.76
## AIC=609.52 AICc=609.81 BIC=621.4
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE
## Training set 0.2384692 1.846384 1.498262 0.2682136 11.90101 0.8327783
## ACF1
## Training set 0.005413047
checkresiduals(temp_fit)##
## Ljung-Box test
##
## data: Residuals from ARIMA(0,0,1)(2,1,0)[12]
## Q* = 38.688, df = 21, p-value = 0.01069
##
## Model df: 3. Total lags used: 24
# g) Estimate future values using the previous ARIMA model and plot the results
temp_fcast<-forecast(temp_fit)
plot(temp_fcast)
# Load packages
library(maps)##
## Attaching package: 'maps'
## The following object is masked _by_ '.GlobalEnv':
##
## ozone
## The following object is masked from 'package:purrr':
##
## map
library(ggplot2)
library(viridis)## Loading required package: viridisLite
##
## Attaching package: 'viridis'
## The following object is masked from 'package:maps':
##
## unemp
# Download the map of Spain.
spain_map<-map_data("world", region="spain")
# a) Make a basic map of Spain
ggplot(spain_map) +
geom_polygon(aes(x = long, y = lat, group = group, fill = subregion)) +
theme_bw()
# b) Get information on the population of Spanish cities
# and make a map of Spain including the locations and population of Spanish cities.
cities<-get('world.cities')
head(cities)## name country.etc pop lat long capital
## 1 'Abasan al-Jadidah Palestine 5629 31.31 34.34 0
## 2 'Abasan al-Kabirah Palestine 18999 31.32 34.35 0
## 3 'Abdul Hakim Pakistan 47788 30.55 72.11 0
## 4 'Abdullah-as-Salam Kuwait 21817 29.36 47.98 0
## 5 'Abud Palestine 2456 32.03 35.07 0
## 6 'Abwein Palestine 3434 32.03 35.20 0
cities_spain<-cities[cities$country.etc == 'Spain',]
ggplot(spain_map) +
geom_polygon(aes(x = long, y = lat, group = group, fill = subregion)) +
geom_point(data = cities_spain, aes(x = long, y = lat), size = 0.1) +
theme_bw()
ggplot(spain_map) +
geom_polygon(aes(x = long, y = lat, group = group), fill = 'lightgray', color = "black", size = 0.1) +
geom_point(data = cities_spain, aes(x = long, y = lat, size = pop, color = pop), alpha = 0.8) +
scale_size_continuous(range = c(1, 12)) +
scale_color_viridis(trans = "log") +
theme_void()
# Load packages
library(rgbif)
# Download from GBIF
desman_gbif<-occ_search(scientificName = "Galemys pyrenaicus", hasCoordinate = T)
# Convert to data frame
desman<-as.data.frame(desman_gbif$data[,c("decimalLatitude", "decimalLongitude")])
# Download maps for geographical distribution
southEU_map<-map_data("world", region=c("Spain", "Portugal", "France", "Andorra"))
# Plot distribution
ggplot(southEU_map)+
geom_polygon(aes(x = long, y = lat, group=group), fill="lightgray")+
geom_point(data=desman, aes(x=decimalLongitude, y=decimalLatitude), color="red", alpha=0.4, size=2)+
theme_bw()
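# Note (assumption about defaults): occ_search() caps the number of records it
# returns (500 by default), so for a fuller picture of the distribution the
# limit argument can be raised, e.g.
# desman_gbif <- occ_search(scientificName = "Galemys pyrenaicus", hasCoordinate = TRUE, limit = 2000)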
# Load the dune_bio.txt dataset.
# Species should be in columns and sites in rows.
dune <-read.table("R09_diversity_analysis/data/dune_bio.txt", header=T)
head(dune)## Sites Belper Empnig Junbuf Junart Airpra Elepal Rumace Viclat Brarut Ranfla
## 1 2 3 0 0 0 0 0 0 0 0 0
## 2 13 0 0 3 0 0 0 0 0 0 2
## 3 4 2 0 0 0 0 0 0 0 2 0
## 4 16 0 0 0 3 0 8 0 0 4 2
## 5 6 0 0 0 0 0 0 6 0 6 0
## 6 1 0 0 0 0 0 0 0 0 0 0
## Cirarv Hyprad Leoaut Potpal Poapra Calcus Tripra Trirep Antodo Salrep Achmil
## 1 0 0 5 0 4 0 0 5 0 0 3
## 2 0 0 2 0 2 0 0 2 0 0 0
## 3 2 0 2 0 4 0 0 1 0 0 0
## 4 0 0 0 0 0 3 0 0 0 0 0
## 5 0 0 3 0 3 0 5 5 3 0 2
## 6 0 0 0 0 4 0 0 0 0 0 1
## Poatri Chealb Elyrep Sagpro Plalan Agrsto Lolper Alogen Brohor
## 1 7 0 4 0 0 0 5 2 4
## 2 9 1 0 2 0 5 0 5 0
## 3 5 0 4 5 0 8 5 2 3
## 4 2 0 0 0 0 7 0 4 0
## 5 4 0 0 0 5 0 6 0 0
## 6 2 0 4 0 0 0 7 0 0
# Load packages
library(vegan)## Loading required package: permute
## Loading required package: lattice
##
## Attaching package: 'lattice'
## The following object is masked from 'package:corrgram':
##
## panel.fill
## This is vegan 2.6-4
# a) Calculate the total number of individuals of all species
sum(dune)## [1] 895
# b) Calculate the total number of individuals for each species
colSums(dune)## Sites Belper Empnig Junbuf Junart Airpra Elepal Rumace Viclat Brarut Ranfla
## 210 13 2 13 18 5 25 18 4 49 14
## Cirarv Hyprad Leoaut Potpal Poapra Calcus Tripra Trirep Antodo Salrep Achmil
## 2 9 54 4 48 10 9 47 21 11 16
## Poatri Chealb Elyrep Sagpro Plalan Agrsto Lolper Alogen Brohor
## 63 1 26 20 26 48 58 36 15
# c) Calculate the average number of individuals for each species
colMeans(dune)## Sites Belper Empnig Junbuf Junart Airpra Elepal Rumace Viclat Brarut Ranfla
## 10.50 0.65 0.10 0.65 0.90 0.25 1.25 0.90 0.20 2.45 0.70
## Cirarv Hyprad Leoaut Potpal Poapra Calcus Tripra Trirep Antodo Salrep Achmil
## 0.10 0.45 2.70 0.20 2.40 0.50 0.45 2.35 1.05 0.55 0.80
## Poatri Chealb Elyrep Sagpro Plalan Agrsto Lolper Alogen Brohor
## 3.15 0.05 1.30 1.00 1.30 2.40 2.90 1.80 0.75
# d) Calculate the total number of individuals for each site
rowSums(dune)## [1] 44 46 49 49 54 19 48 48 32 38 53 43 51 45 43 51 38 50 47 47
# e) Calculate the average number of individuals for each site
rowMeans(dune)## [1] 1.4193548 1.4838710 1.5806452 1.5806452 1.7419355 0.6129032 1.5483871
## [8] 1.5483871 1.0322581 1.2258065 1.7096774 1.3870968 1.6451613 1.4516129
## [15] 1.3870968 1.6451613 1.2258065 1.6129032 1.5161290 1.5161290
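# Caveat (sketch of a possible correction): the first column of dune_bio.txt
# ("Sites") holds site IDs, not species abundances, so the totals and means
# above, and later analyses that reuse this data frame, include it. Dropping it
# first would restrict the calculations to the species columns; "dune_spp" is a
# hypothetical object name.
# dune_spp <- dune[, -1]
# sum(dune_spp); colSums(dune_spp); rowSums(dune_spp)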
# f) Function to report the median number of individuals for each species and each site
median_sps_sites<-function(x){
cat("The median number of inds per sp is", apply(x, 2, median), "and for sites is", apply(x, 1, median))
}
# Median number of individuals for each species
median_sps_sites(dune)## The median number of inds per sp is 10.5 0 0 0 0 0 0 0 0 2 0 0 0 2 0 3 0 0 2 0 0 0 4 0 0 0 0 1.5 2 0 0 and for sites is 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
# g) Transform the dataset to relative abundances using decostand()
dune_relabun <-decostand(dune, method = "total")
rowSums(dune_relabun)## [1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
# h) Standardize the dataset into the range 0 to 1 using decostand()
dune_range<-decostand(dune, method = "range")
# i) Standardize the dataset to mean=0 and variance=1 using decostand()
dune_stand<-decostand(dune, method = "standardize")
# Function to compute species richness: the number of species with abundance > 0
richness <- function(abundances) {
sum(abundances > 0)
}
# Test
abundances <- c(0, 2, 0, 1, 4, 0, 3) # abundances of diff species
richness(abundances) # four species with >0 abundances## [1] 4
shannon_diversity <- function(abundances) {
abundances <- abundances[abundances > 0] # Filter out zero abundance values
prop <- abundances / sum(abundances) # Calculate proportion of each species
log_prop <- log(prop) # Take the natural logarithm of each proportion
-sum(prop * log_prop) # Calculate the Shannon-Wiener index
}
abundances <- c(5, 2, 0, 1, 4, 5, 3)
shannon_diversity(abundances)## [1] 1.679648
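# Quick cross-check (sketch): vegan provides equivalent built-ins, which should
# reproduce the values from the hand-written functions above.
diversity(abundances, index = "shannon") # Shannon-Wiener index
specnumber(abundances) # observed richness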
diversity_summary <- function(community_matrix) {
# Calculate observed richness
richness <- apply(community_matrix, 1, function(row) sum(row > 0))
# Calculate Shannon-Wiener diversity index
shannon_wiener <- apply(community_matrix, 1, function(row) {
# Use the vegan built-in function to calculate Shannon-Wiener diversity index
diversity(row, index = "shannon")
})
# Create an output table with the results
output_table <- data.frame(Observed_Richness = richness,
Shannon_Wiener = shannon_wiener)
return(output_table)
}
# Test with dune.bio dataset
dune <- read.table("R09_diversity_analysis/data/dune_bio.txt", header = T)
head(dune)## Sites Belper Empnig Junbuf Junart Airpra Elepal Rumace Viclat Brarut Ranfla
## 1 2 3 0 0 0 0 0 0 0 0 0
## 2 13 0 0 3 0 0 0 0 0 0 2
## 3 4 2 0 0 0 0 0 0 0 2 0
## 4 16 0 0 0 3 0 8 0 0 4 2
## 5 6 0 0 0 0 0 0 6 0 6 0
## 6 1 0 0 0 0 0 0 0 0 0 0
## Cirarv Hyprad Leoaut Potpal Poapra Calcus Tripra Trirep Antodo Salrep Achmil
## 1 0 0 5 0 4 0 0 5 0 0 3
## 2 0 0 2 0 2 0 0 2 0 0 0
## 3 2 0 2 0 4 0 0 1 0 0 0
## 4 0 0 0 0 0 3 0 0 0 0 0
## 5 0 0 3 0 3 0 5 5 3 0 2
## 6 0 0 0 0 4 0 0 0 0 0 1
## Poatri Chealb Elyrep Sagpro Plalan Agrsto Lolper Alogen Brohor
## 1 7 0 4 0 0 0 5 2 4
## 2 9 1 0 2 0 5 0 5 0
## 3 5 0 4 5 0 8 5 2 3
## 4 2 0 0 0 0 7 0 4 0
## 5 4 0 0 0 5 0 6 0 0
## 6 2 0 4 0 0 0 7 0 0
diversity_summary(dune)## Observed_Richness Shannon_Wiener
## 1 11 2.335037
## 2 11 2.101662
## 3 14 2.511413
## 4 9 1.951556
## 5 12 2.434118
## 6 6 1.570859
## 7 13 2.479643
## 8 15 2.613520
## 9 8 1.570696
## 10 9 1.868823
## 11 13 2.430346
## 12 10 2.135937
## 13 14 2.519526
## 14 10 1.920644
## 15 11 2.293734
## 16 9 1.914730
## 17 8 1.835171
## 18 10 1.987159
## 19 10 2.142728
## 20 14 2.524462
dune <- read.table("R09_diversity_analysis/data/dune_bio.txt", header = T)
head(dune)## Sites Belper Empnig Junbuf Junart Airpra Elepal Rumace Viclat Brarut Ranfla
## 1 2 3 0 0 0 0 0 0 0 0 0
## 2 13 0 0 3 0 0 0 0 0 0 2
## 3 4 2 0 0 0 0 0 0 0 2 0
## 4 16 0 0 0 3 0 8 0 0 4 2
## 5 6 0 0 0 0 0 0 6 0 6 0
## 6 1 0 0 0 0 0 0 0 0 0 0
## Cirarv Hyprad Leoaut Potpal Poapra Calcus Tripra Trirep Antodo Salrep Achmil
## 1 0 0 5 0 4 0 0 5 0 0 3
## 2 0 0 2 0 2 0 0 2 0 0 0
## 3 2 0 2 0 4 0 0 1 0 0 0
## 4 0 0 0 0 0 3 0 0 0 0 0
## 5 0 0 3 0 3 0 5 5 3 0 2
## 6 0 0 0 0 4 0 0 0 0 0 1
## Poatri Chealb Elyrep Sagpro Plalan Agrsto Lolper Alogen Brohor
## 1 7 0 4 0 0 0 5 2 4
## 2 9 1 0 2 0 5 0 5 0
## 3 5 0 4 5 0 8 5 2 3
## 4 2 0 0 0 0 7 0 4 0
## 5 4 0 0 0 5 0 6 0 0
## 6 2 0 4 0 0 0 7 0 0
# Rank-abundance curves
plot(rad.lognormal(dune[2,]), lty = 2, lwd = 2)
rad.lognormal(dune[2,])##
## RAD model: Log-Normal
## Family: poisson
## No. of species: 11
## Total abundance: 46
##
## log.mu log.sigma Deviance AIC BIC
## 1.1320236 0.8336663 1.6198617 39.1107008 39.9064914
plot(radfit(dune[2,]))
radfit(dune[2,])##
## RAD models, family poisson
## No. of species 11, total abundance 46
##
## par1 par2 par3 Deviance AIC BIC
## Null 3.44463 36.93547 36.93547
## Preemption 0.22882 2.53015 38.02098 38.41888
## Lognormal 1.132 0.83367 1.61986 39.11070 39.90649
## Zipf 0.30895 -0.92977 1.21853 38.70937 39.50516
## Mandelbrot 1.0083 -1.4185 1.4101 0.88721 40.37805 41.57173
dune_bio <- read.table("R10_cluster_analysis/data/dune_bio.txt", sep="\t", header = T, row.names = 1)
head(dune_bio)## Belper Empnig Junbuf Junart Airpra Elepal Rumace Viclat Brarut Ranfla Cirarv
## 2 3 0 0 0 0 0 0 0 0 0 0
## 13 0 0 3 0 0 0 0 0 0 2 0
## 4 2 0 0 0 0 0 0 0 2 0 2
## 16 0 0 0 3 0 8 0 0 4 2 0
## 6 0 0 0 0 0 0 6 0 6 0 0
## 1 0 0 0 0 0 0 0 0 0 0 0
## Hyprad Leoaut Potpal Poapra Calcus Tripra Trirep Antodo Salrep Achmil Poatri
## 2 0 5 0 4 0 0 5 0 0 3 7
## 13 0 2 0 2 0 0 2 0 0 0 9
## 4 0 2 0 4 0 0 1 0 0 0 5
## 16 0 0 0 0 3 0 0 0 0 0 2
## 6 0 3 0 3 0 5 5 3 0 2 4
## 1 0 0 0 4 0 0 0 0 0 1 2
## Chealb Elyrep Sagpro Plalan Agrsto Lolper Alogen Brohor
## 2 0 4 0 0 0 5 2 4
## 13 1 0 2 0 5 0 5 0
## 4 0 4 5 0 8 5 2 3
## 16 0 0 0 0 7 0 4 0
## 6 0 0 0 5 0 6 0 0
## 1 0 4 0 0 0 7 0 0
#Hierarchical clustering
dune_bray<-vegdist(dune_bio, method="bray")
clust_single<-hclust(dune_bray, method="single")
clust_avg<-hclust(dune_bray, method="average")
clust_complete<-hclust(dune_bray, method="complete")
plot(clust_single) # dendrogram (tree) visualization
plot(clust_avg)
plot(clust_complete)
# Read in data
dune_bio <- read.table("R10_cluster_analysis/data/dune_bio.txt", sep="\t", header = T, row.names = 1)
head(dune_bio)## Belper Empnig Junbuf Junart Airpra Elepal Rumace Viclat Brarut Ranfla Cirarv
## 2 3 0 0 0 0 0 0 0 0 0 0
## 13 0 0 3 0 0 0 0 0 0 2 0
## 4 2 0 0 0 0 0 0 0 2 0 2
## 16 0 0 0 3 0 8 0 0 4 2 0
## 6 0 0 0 0 0 0 6 0 6 0 0
## 1 0 0 0 0 0 0 0 0 0 0 0
## Hyprad Leoaut Potpal Poapra Calcus Tripra Trirep Antodo Salrep Achmil Poatri
## 2 0 5 0 4 0 0 5 0 0 3 7
## 13 0 2 0 2 0 0 2 0 0 0 9
## 4 0 2 0 4 0 0 1 0 0 0 5
## 16 0 0 0 0 3 0 0 0 0 0 2
## 6 0 3 0 3 0 5 5 3 0 2 4
## 1 0 0 0 4 0 0 0 0 0 1 2
## Chealb Elyrep Sagpro Plalan Agrsto Lolper Alogen Brohor
## 2 0 4 0 0 0 5 2 4
## 13 1 0 2 0 5 0 5 0
## 4 0 4 5 0 8 5 2 3
## 16 0 0 0 0 7 0 4 0
## 6 0 0 0 5 0 6 0 0
## 1 0 4 0 0 0 7 0 0
# Load packages
library(vegan) # cluster analysis
library(tidyverse) # data manipulation
library(fpc) # calinski criterion
# Calculate the Bray-Curtis distance matrix
distance_matrix <- vegdist(dune_bio, method = "bray")
# Create an empty list to store k-means models
kmeans_models <- list()
calinski_scores <- c()
# Iterate through k = 2 to 6
for (k in 2:6) {
# Perform k-means clustering using the distance matrix
model <- kmeans(distance_matrix, centers = k)
# Store the model in the list
kmeans_models[[k - 1]] <- model
# Calculate dissimilarity sums of squares between and within clusters
cluster_diss <- cluster.stats(distance_matrix, model$cluster)
# Calculate the Calinski-Harabasz index
calinski <- cluster_diss$ch
calinski_scores <- c(calinski_scores, calinski)
}
# Find the optimal number of clusters with the highest Calinski-Harabasz index
optimal_k <- which.max(calinski_scores) + 1
# Select the best k-means model
best_kmeans_model <- kmeans_models[[optimal_k - 1]]
# Output the optimal number of clusters and the best k-means model
optimal_k## [1] 4
best_kmeans_model## K-means clustering with 4 clusters of sizes 6, 4, 2, 8
##
## Cluster means:
## 2 13 4 16 6 1 8
## 1 0.5041411 0.3456026 0.3499140 0.6016977 0.6308567 0.6658206 0.3106168
## 2 0.8835275 0.6631262 0.7405550 0.3111881 0.8270814 0.9803922 0.4771717
## 3 0.8163903 0.8437500 0.8447368 0.9531250 0.7020293 0.9393939 0.8186940
## 4 0.4071785 0.7268698 0.5310972 0.8862785 0.3666832 0.5257059 0.5822448
## 5 17 15 10 11 9 18
## 1 0.5818057 0.8952592 0.6398251 0.5906853 0.6146386 0.3003049 0.6905822
## 2 0.8789276 0.9263041 0.2537853 0.8490769 0.8133325 0.6927694 0.7798648
## 3 0.6616962 0.2826087 0.8362573 0.6616962 0.6288416 0.8377794 0.6568144
## 4 0.3868861 0.7205035 0.8392754 0.3389453 0.4196274 0.5682040 0.4710198
## 3 20 14 19 12 7
## 1 0.3002849 0.6908522 0.7018692 0.7762050 0.3514318 0.5440316
## 2 0.7275416 0.2736479 0.3400268 0.8201272 0.6502025 0.8735012
## 3 0.8609475 0.8274895 0.8759907 0.2826087 0.8084848 0.7096031
## 4 0.4925220 0.8704492 0.8468489 0.7236128 0.7159836 0.3400911
##
## Clustering vector:
## 2 13 4 16 6 1 8 5 17 15 10 11 9 18 3 20 14 19 12 7
## 4 1 1 2 4 4 1 4 3 2 4 4 1 4 1 2 2 3 1 4
##
## Within cluster sum of squares by cluster:
## [1] 1.5937748 0.6676346 0.4484247 2.9580899
## (between_SS / total_SS = 70.5 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss" "tot.withinss"
## [6] "betweenss" "size" "iter" "ifault"
library(vegan)
# Load the varechem dataset
data(varechem)
# View
head(varechem)## N P K Ca Mg S Al Fe Mn Zn Mo Baresoil Humdepth
## 18 19.8 42.1 139.9 519.4 90.0 32.3 39.0 40.9 58.1 4.5 0.3 43.9 2.2
## 15 13.4 39.1 167.3 356.7 70.7 35.2 88.1 39.0 52.4 5.4 0.3 23.6 2.2
## 24 20.2 67.7 207.1 973.3 209.1 58.1 138.0 35.4 32.1 16.8 0.8 21.2 2.0
## 27 20.6 60.8 233.7 834.0 127.2 40.7 15.4 4.4 132.0 10.7 0.2 18.7 2.9
## 23 23.8 54.5 180.6 777.0 125.8 39.5 24.2 3.0 50.1 6.6 0.3 46.0 3.0
## 19 22.8 40.9 171.4 691.8 151.4 40.8 104.8 17.6 43.6 9.1 0.4 40.5 3.8
## pH
## 18 2.7
## 15 2.8
## 24 3.0
## 27 2.8
## 23 2.7
## 19 2.7
# Given the nature of this dataset perform a PCA or a CA ordination analysis.
varechem.pca <- rda(varechem)
# a) How much variation is explained by the two first axes?
# eigenvalues of the PCA
eigenvalues <- varechem.pca$CA$eig
# proportion of variation explained by each axis
summary(varechem.pca)##
## Call:
## rda(X = varechem)
##
## Partitioning of variance:
## Inertia Proportion
## Total 85655 1
## Unconstrained 85655 1
##
## Eigenvalues, and their contribution to the variance
##
## Importance of components:
## PC1 PC2 PC3 PC4 PC5
## Eigenvalue 6.406e+04 1.696e+04 2.425e+03 952.75912 7.162e+02
## Proportion Explained 7.478e-01 1.980e-01 2.832e-02 0.01112 8.362e-03
## Cumulative Proportion 7.478e-01 9.459e-01 9.742e-01 0.98533 9.937e-01
## PC6 PC7 PC8 PC9 PC10 PC11
## Eigenvalue 248.4113 1.960e+02 6.122e+01 2.293e+01 9.8096415 1.55e+00
## Proportion Explained 0.0029 2.288e-03 7.147e-04 2.677e-04 0.0001145 1.81e-05
## Cumulative Proportion 0.9966 9.989e-01 9.996e-01 9.999e-01 0.9999799 1.00e+00
## PC12 PC13 PC14
## Eigenvalue 1.536e-01 1.365e-02 6.111e-03
## Proportion Explained 1.794e-06 1.594e-07 7.134e-08
## Cumulative Proportion 1.000e+00 1.000e+00 1.000e+00
##
## Scaling 2 for species and site scores
## * Species are scaled proportional to eigenvalues
## * Sites are unscaled: weighted dispersion equal on all dimensions
## * General scaling constant of scores: 37.46449
##
##
## Species scores
##
## PC1 PC2 PC3 PC4 PC5 PC6
## N 1.848e-01 0.06565 -0.064579 0.242489 0.049798 -0.058950
## P -1.406e+00 -0.49338 -0.586447 0.098794 -0.247471 -0.340161
## K -5.590e+00 -2.71621 -5.415076 0.133394 0.441849 0.851878
## Ca -3.107e+01 -2.16215 1.364667 0.031580 -0.392100 0.076706
## Mg -4.214e+00 -0.62786 -0.292713 0.086101 2.889419 -0.946727
## S -7.922e-01 -0.77736 -0.743928 -0.246051 0.306013 -0.229167
## Al 4.286e+00 -14.99249 0.103184 -1.278459 -0.459067 -0.410995
## Fe 3.055e+00 -6.09427 1.202549 3.396852 0.461394 0.414400
## Mn -2.123e+00 1.46971 -2.425290 1.461107 -1.483031 -1.381559
## Zn -2.610e-01 -0.04535 -0.073993 -0.082057 0.053418 -0.115676
## Mo 5.506e-03 -0.01362 -0.004219 -0.011709 0.003363 -0.012748
## Baresoil -4.538e-01 0.77137 -0.475825 -0.377334 0.505105 -0.087138
## Humdepth -2.491e-02 0.03660 -0.029715 0.001922 0.020091 -0.023395
## pH -8.582e-04 -0.01208 0.016641 0.001791 -0.006876 0.001844
##
##
## Site scores (weighted sums of species scores)
##
## PC1 PC2 PC3 PC4 PC5 PC6
## 18 1.134 6.506 0.22023 6.6422 4.8465 0.4275
## 15 6.097 4.849 -8.29769 1.7951 4.5293 4.9573
## 24 -12.745 -3.490 6.83734 -2.7002 19.9436 -9.8485
## 27 -9.202 5.394 -7.85159 9.7672 -4.0512 -10.7682
## 23 -7.030 5.575 2.20539 -0.1090 6.7307 4.2154
## 19 -4.158 1.572 1.63109 -4.2528 13.8684 -8.5953
## 22 -5.753 6.362 1.30446 2.8881 -4.0457 1.9607
## 16 5.163 6.195 -1.87558 -6.2570 -0.8808 14.2735
## 28 -6.686 5.598 -12.93031 6.9102 -2.8017 5.1512
## 13 0.829 -12.710 -20.68496 -0.1316 8.8986 5.5632
## 14 1.547 4.261 0.05426 -8.2069 1.5852 6.2083
## 20 -5.570 0.107 -3.92228 -0.1163 -9.1245 -7.3301
## 25 -2.990 8.022 -1.83433 7.3358 -14.9367 -12.3328
## 7 12.819 -6.135 4.23067 -1.3655 3.5539 -9.4099
## 5 10.191 10.296 6.17062 -3.8974 -4.1207 0.3553
## 6 10.185 2.024 7.11856 1.9300 -2.2812 -0.7472
## 3 12.217 -8.641 8.25696 16.1822 3.0774 -3.6812
## 4 2.654 -17.193 -7.52309 -13.6917 -10.8525 -4.5842
## 2 5.225 -11.817 1.62040 16.6311 -0.2547 8.4378
## 9 2.630 -3.420 3.06818 -12.4091 -4.8046 -11.7627
## 12 3.026 3.760 2.34446 -8.5600 -2.3325 5.4528
## 10 -4.452 -2.765 -2.98307 -5.1036 -1.3049 4.8802
## 11 -17.513 -12.376 17.78560 0.9704 -9.2785 12.4853
## 21 2.380 8.025 5.05469 -4.2513 4.0366 4.6918
(prop.var <- round(eigenvalues/sum(eigenvalues)*100, 1))## PC1 PC2 PC3 PC4 PC5 PC6 PC7 PC8 PC9 PC10 PC11 PC12 PC13 PC14
## 74.8 19.8 2.8 1.1 0.8 0.3 0.2 0.1 0.0 0.0 0.0 0.0 0.0 0.0
# b) Make a screeplot of the results
plot(prop.var, type = "b", xlab = "Principal component", ylab = "Proportion of variance", main = "Screeplot of PCA")
# c) Plot the ordination results of the sites
plot(varechem.pca, type = "n")
text(varechem.pca, display = "sites", cex = 0.7)
# d) Plot both the site scores and the soil characteristics scores, focusing on the soil variables
biplot(varechem.pca, scaling = 2)
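# Side note (a sketch, not required by the exercise): the soil variables are on
# very different scales, so the unscaled PCA above is dominated by the
# high-variance elements (e.g. Ca). A PCA on standardized variables can be
# obtained with scale = TRUE; "varechem.pca.scaled" is an illustrative name.
varechem.pca.scaled <- rda(varechem, scale = TRUE)
screeplot(varechem.pca.scaled, main = "Screeplot of PCA on standardized variables")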
# Read in data
dune_bio <- read.table("R10_cluster_analysis/data/dune_bio.txt", sep="\t", header = T, row.names = 1)
# Load vegan library
library(vegan)
# Perform CA
dune.ca <- cca(dune_bio)
# a) How much variation is explained by the two first axes?
# Proportion of variation explained by each axis
summary(dune.ca)##
## Call:
## cca(X = dune_bio)
##
## Partitioning of scaled Chi-square:
## Inertia Proportion
## Total 2.115 1
## Unconstrained 2.115 1
##
## Eigenvalues, and their contribution to the scaled Chi-square
##
## Importance of components:
## CA1 CA2 CA3 CA4 CA5 CA6 CA7
## Eigenvalue 0.5360 0.4001 0.2598 0.17598 0.14476 0.10791 0.09247
## Proportion Explained 0.2534 0.1892 0.1228 0.08319 0.06844 0.05102 0.04372
## Cumulative Proportion 0.2534 0.4426 0.5654 0.64858 0.71702 0.76804 0.81175
## CA8 CA9 CA10 CA11 CA12 CA13 CA14
## Eigenvalue 0.08091 0.07332 0.05630 0.04826 0.04125 0.03523 0.020529
## Proportion Explained 0.03825 0.03466 0.02661 0.02282 0.01950 0.01665 0.009705
## Cumulative Proportion 0.85000 0.88467 0.91128 0.93410 0.95360 0.97025 0.979955
## CA15 CA16 CA17 CA18 CA19
## Eigenvalue 0.014911 0.009074 0.007938 0.007002 0.003477
## Proportion Explained 0.007049 0.004290 0.003753 0.003310 0.001644
## Cumulative Proportion 0.987004 0.991293 0.995046 0.998356 1.000000
##
## Scaling 2 for species and site scores
## * Species are scaled proportional to eigenvalues
## * Sites are unscaled: weighted dispersion equal on all dimensions
##
##
## Species scores
##
## CA1 CA2 CA3 CA4 CA5 CA6
## Belper -0.50018 -0.35503 -0.15239 -0.704153 -0.058546 -0.07308
## Empnig -0.69027 3.26420 1.95716 -0.176936 -0.073518 0.16083
## Junbuf 0.08157 -0.68074 1.00545 1.078390 0.268360 -0.24168
## Junart 1.27580 0.09963 -0.09320 0.005536 0.289410 0.78247
## Airpra -1.00434 3.06749 1.33773 0.194305 -1.081813 0.53699
## Elepal 1.76383 0.34562 -0.57336 -0.002976 -0.332396 0.14688
## Rumace -0.65289 -0.25525 -0.59728 1.160164 0.255849 0.32730
## Viclat -0.61893 0.37140 -0.46057 -1.000375 1.162652 -1.44971
## Brarut 0.18222 0.26477 -0.16606 0.064009 0.576334 -0.07741
## Ranfla 1.55886 0.30700 -0.29765 0.046974 -0.008747 0.14744
## Cirarv -0.05647 -0.76398 0.91793 -1.175919 -0.384024 0.13985
## Hyprad -0.85408 2.52821 1.13951 -0.175115 -0.311874 -0.11177
## Leoaut -0.19566 0.38884 0.03975 -0.130392 0.141232 -0.23717
## Potpal 1.91690 0.52150 -1.18215 -0.021738 -1.359988 -1.31207
## Poapra -0.38919 -0.32999 -0.02015 -0.358371 0.079296 0.05165
## Calcus 1.95199 0.56743 -0.85948 -0.098969 -0.556737 -0.23282
## Tripra -0.88116 -0.09792 -1.18172 1.282429 0.325706 0.33388
## Trirep -0.07666 -0.02032 -0.20594 0.026462 -0.186748 -0.53957
## Antodo -0.96676 1.08361 -0.17188 0.459788 -0.607533 0.30425
## Salrep 0.61035 1.54868 0.04970 -0.607136 1.429729 0.55183
## Achmil -0.90859 0.08461 -0.58636 -0.008919 -0.660183 0.18877
## Poatri -0.18185 -0.53997 0.23388 0.178834 -0.155342 0.07584
## Chealb 0.42445 -0.84402 1.59029 1.248755 -0.207480 -0.87566
## Elyrep -0.37074 -0.74148 0.26238 -0.566308 -0.270122 0.72624
## Sagpro 0.00364 0.01719 1.11570 0.066981 0.186654 -0.32463
## Plalan -0.84058 0.24886 -0.78066 0.371149 0.271377 -0.11989
## Agrsto 0.93378 -0.20651 0.28165 0.024293 -0.139326 0.02256
## Lolper -0.50272 -0.35955 -0.21821 -0.474727 0.101494 0.01594
## Alogen 0.40088 -0.61839 0.85013 0.346740 0.016509 -0.10169
## Brohor -0.65762 -0.40634 -0.30685 -0.496751 -0.561358 -0.07004
##
##
## Site scores (weighted averages of species scores)
##
## CA1 CA2 CA3 CA4 CA5 CA6
## 2 -0.63268 -0.6958357 -0.09708 -1.18695 -0.97686 -0.06575
## 13 0.42445 -0.8440195 1.59029 1.24876 -0.20748 -0.87566
## 4 -0.05647 -0.7639784 0.91793 -1.17592 -0.38402 0.13985
## 16 2.00229 0.1090627 -0.33414 0.33760 -0.50097 0.76159
## 6 -0.85633 -0.0005408 -1.39735 1.59909 0.65494 0.19386
## 1 -0.81167 -1.0826714 -0.14479 -2.10665 -0.39287 1.83462
## 8 0.76268 -0.2968459 0.35648 -0.10772 0.17507 0.36444
## 5 -0.95293 -0.1846015 -0.95609 0.86853 -0.34552 0.98333
## 17 -1.47545 2.7724102 0.40859 0.75117 -2.59425 1.10122
## 15 1.91384 0.5079036 -0.96567 0.04227 -0.50681 -0.19370
## 10 -0.87885 -0.0353136 -0.82987 -0.68053 -0.75438 -0.81070
## 11 -0.64223 0.4440332 -0.17371 -1.09684 1.37462 -2.00626
## 9 0.09693 -0.7864314 0.86492 0.40090 0.28704 1.02783
## 18 -0.31241 0.6328355 -0.66501 -1.12728 2.65575 -0.97565
## 3 -0.10148 -0.9128732 0.68815 -0.68137 -0.08709 0.28678
## 20 1.94438 1.0688809 -0.66595 -0.55317 1.59606 1.70292
## 14 1.91996 0.5351062 -1.39863 -0.08575 -2.21317 -2.43044
## 19 -0.69027 3.2642026 1.95716 -0.17694 -0.07352 0.16083
## 12 0.28557 -0.6656161 1.64423 1.71496 0.65381 -1.17376
## 7 -0.87149 -0.2547040 -0.86830 0.90468 0.17385 0.03446
# b) Make a screeplot of the results
screeplot(dune.ca)
# c) Plot the ordination results of the sites
plot(dune.ca, display = "sites")
# Read in data
dune_bio <- read.table("R11_ordination/data/dune_bio.txt", sep="\t", header = T, row.names = 1)
# Calculate Bray-Curtis dissimilarity matrix
dune_dist <- vegdist(dune_bio, method = "bray")
# Perform NMDS
set.seed(123)
dune_nmds <- metaMDS(dune_dist, k = 2, trymax = 100)## Run 0 stress 0.1192678
## Run 1 stress 0.1183186
## ... New best solution
## ... Procrustes: rmse 0.02027109 max resid 0.06496529
## Run 2 stress 0.1192679
## Run 3 stress 0.1886532
## Run 4 stress 0.1192678
## Run 5 stress 0.1812933
## Run 6 stress 0.1183186
## ... New best solution
## ... Procrustes: rmse 4.157097e-06 max resid 1.240169e-05
## ... Similar to previous best
## Run 7 stress 0.1183186
## ... Procrustes: rmse 2.144307e-05 max resid 6.794802e-05
## ... Similar to previous best
## Run 8 stress 0.1183186
## ... Procrustes: rmse 2.813403e-05 max resid 8.990945e-05
## ... Similar to previous best
## Run 9 stress 0.1183186
## ... New best solution
## ... Procrustes: rmse 3.255259e-06 max resid 1.061564e-05
## ... Similar to previous best
## Run 10 stress 0.1183186
## ... Procrustes: rmse 1.089752e-05 max resid 3.52784e-05
## ... Similar to previous best
## Run 11 stress 0.119268
## Run 12 stress 0.2341212
## Run 13 stress 0.1192678
## Run 14 stress 0.1192678
## Run 15 stress 0.1183186
## ... Procrustes: rmse 3.36777e-06 max resid 1.031387e-05
## ... Similar to previous best
## Run 16 stress 0.1192679
## Run 17 stress 0.1183186
## ... Procrustes: rmse 4.14244e-06 max resid 1.292607e-05
## ... Similar to previous best
## Run 18 stress 0.1886532
## Run 19 stress 0.1192678
## Run 20 stress 0.1192678
## *** Best solution repeated 4 times
# a) What is the stress?
dune_nmds$stress## [1] 0.1183186
# b) Make a Shepard plot of the NMDS results
stressplot(dune_nmds)
# c) Plot the ordination results of the sites
plot(dune_nmds, display = "sites")
# d) [Advanced – ggplot2] Plot the ordination results using ggplot2.
# Use the dune_env.txt dataset to make the size of the points
# proportional to the site richness (number of species) and
# the color to represent the variable Management.
# Load ggvegan package
library(ggvegan)
# Load data
dune_env <- read.table("R11_ordination/data/dune_env.txt", sep="\t", header = T, row.names = 1)
# Extract site scores from NMDS object and name columns
dune_scores <- data.frame(NMDS1 = dune_nmds$points[,1], NMDS2 = dune_nmds$points[,2])
# Add site richness and Management variables
dune_scores$Richness <- rowSums(dune_bio > 0)
dune_scores$Management <- dune_env$Management
# Plot using ggplot2
ggplot(dune_scores, aes(x = NMDS1, y = NMDS2)) +
geom_point(aes(size = Richness, color = Management)) +
scale_color_manual(values = c("black", "red", "blue", "green")) +
labs(x = "NMDS1", y = "NMDS2", size = "Richness", color = "Management") +
theme_bw()
# Load packages
library(vegan)
# Read in data
dune_bio <- read.table("R11_ordination/data/dune_bio.txt", sep="\t", header = T, row.names = 1)
# Perform CCA
dune_cca <- cca(dune_bio ~ A1 + Moisture + Manure, data = dune_env)
# a) Variation explained by first two constrained axes
dune_cca$CCA## $eig
## CCA1 CCA2 CCA3
## 0.4264268 0.2346962 0.1081232
##
## $poseig
## NULL
##
## $u
## CCA1 CCA2 CCA3
## 2 -1.0721645 -0.15064571 0.02248439
## 13 1.3313412 1.08879312 -0.17909863
## 4 -0.4031599 1.49565056 -0.09656978
## 16 1.2911794 1.04854117 -0.34917010
## 6 -0.9650664 -0.04330717 0.47600830
## 1 -1.0994893 1.27128858 -0.50140763
## 8 1.0903705 0.84728141 -1.19952742
## 5 -0.6973213 0.22503917 1.60981806
## 17 -0.5627064 -1.56289510 0.04416645
## 15 1.9680612 -0.44703770 3.12946612
## 10 -0.6232242 -0.89888856 -0.41619626
## 11 -1.1053575 -0.90857346 0.08601369
## 9 0.4481405 -0.77218022 -0.96709228
## 18 -0.9912907 -1.51891073 0.77313836
## 3 -0.3897726 1.50906787 -0.03987929
## 20 0.8970807 -1.52042308 -1.40577293
## 14 1.6735416 -0.74221868 1.88227538
## 19 0.9238553 -1.49358844 -1.29239196
## 12 0.7624658 0.26751119 0.15987867
## 7 -1.1326823 0.51336083 -0.43787833
##
## $v
## CCA1 CCA2 CCA3
## Belper -1.11035852 0.18609025 0.87213139
## Empnig 1.41475642 -3.08303103 -3.93038265
## Junbuf 0.77405428 0.36113967 -1.08590959
## Junart 1.66068409 -0.45604291 -1.00505278
## Airpra 0.50417108 -3.14025552 -2.30450253
## Elepal 2.13249526 0.04059818 1.21182929
## Rumace -0.87235194 0.16009955 1.34775799
## Viclat -1.46445244 -2.18541973 0.40217143
## Brarut 0.08975166 -0.60052127 0.43474540
## Ranfla 2.00141417 -0.36727220 0.20523525
## Cirarv -0.61738351 3.08728760 -0.29368503
## Hyprad 0.21832938 -2.84647252 -2.09556817
## Leoaut -0.16546412 -0.81378986 0.10166803
## Potpal 2.78830529 -1.22741789 7.62077701
## Poapra -0.72429786 0.40907813 -0.48069770
## Calcus 2.03042414 -0.90504267 0.68860356
## Tripra -1.44379134 0.28904537 1.59624923
## Trirep -0.05651707 -0.51233033 0.68760165
## Antodo -0.65616291 -1.37852937 0.04834784
## Salrep 0.59627008 -3.12246433 -2.37394373
## Achmil -1.29442470 -0.58207308 0.24433694
## Poatri -0.14687918 1.00467965 -0.24923437
## Chealb 2.03876466 2.24746180 -0.54466926
## Elyrep -0.70435488 1.01371106 -0.21343986
## Sagpro 0.56159769 0.47250808 -1.25294946
## Plalan -1.37000195 -0.76449532 1.12955346
## Agrsto 1.23524007 0.88453078 0.10685086
## Lolper -1.04540461 0.53392609 -0.40648055
## Alogen 0.80985066 1.54535597 -0.87343934
## Brohor -1.18946377 0.24296941 0.09718008
##
## $wa
## CCA1 CCA2 CCA3
## 2 -0.8543611 0.57195324 -0.04460370
## 13 0.7655849 1.43233992 -1.04659850
## 4 -0.1564792 1.36916386 -0.67602369
## 16 2.0458733 0.46834288 0.70504152
## 6 -1.0570772 -0.29556547 1.50936078
## 1 -1.2438592 1.24491960 -0.99278055
## 8 0.8113423 0.66761981 -0.54771974
## 5 -1.1246789 -0.04618834 1.17586596
## 17 -0.7721870 -2.94478418 -1.24410804
## 15 2.0065157 -0.48089852 2.87632931
## 10 -1.0957607 -0.42411945 0.49762350
## 11 -0.7815660 -0.90607598 -0.28150392
## 9 0.1816930 0.86594581 -0.91241803
## 18 -0.6052799 -1.51952249 0.07323869
## 3 -0.2093323 1.35236532 -0.66038354
## 20 1.8996429 -1.40266348 -0.55715093
## 14 1.9462359 -0.67176895 3.54931127
## 19 0.2690286 -3.39538093 -3.20303342
## 12 0.6251753 1.06203820 -0.88731293
## 7 -1.0497706 -0.01449462 0.64117751
##
## $alias
## NULL
##
## $biplot
## CCA1 CCA2 CCA3
## A1 0.6048068 0.04017953 0.7953580
## Moisture 0.9745548 -0.06076467 -0.2157560
## Manure -0.2058644 0.96205486 -0.1790818
##
## $rank
## [1] 3
##
## $qrank
## [1] 3
##
## $tot.chi
## [1] 0.7692462
##
## $QR
## $qr
## A1 Moisture Manure
## 2 1.861329532 0.719095938 -0.31114048
## 13 -0.155069441 -1.574794968 0.18822090
## 4 0.066780053 -0.087761040 -1.31352615
## 16 -0.119693364 0.229923505 0.19568149
## 6 0.054748424 -0.267822436 0.02412991
## 1 0.164161248 -0.080514036 0.22764492
## 8 0.062960838 0.377601525 0.16482461
## 5 -0.217394231 -0.425552647 0.09280834
## 17 0.054455846 -0.040506411 -0.22381227
## 15 -0.670908896 -0.172983324 -0.10673997
## 10 0.186424936 -0.008360368 -0.21268749
## 11 0.137597775 -0.159303274 -0.16898949
## 9 0.131031656 0.272201785 -0.21573538
## 18 0.009062452 -0.221318362 -0.27587875
## 3 0.049978245 -0.091039507 0.38051708
## 20 0.135430745 0.383551090 -0.36155931
## 14 -0.464100682 -0.035303280 -0.16653658
## 19 0.112572531 0.368941648 -0.35561834
## 12 -0.135411189 0.085489419 0.04143180
## 7 0.244717140 -0.120023238 0.15538345
## attr(,"assign")
## [1] 1 2 3
## attr(,"scaled:center")
## A1 Moisture Manure
## 4.684964 2.801460 1.902190
##
## $rank
## [1] 3
##
## $qraux
## [1] 1.157638 1.207313 1.400020
##
## $pivot
## [1] 1 2 3
##
## attr(,"class")
## [1] "qr"
##
## $envcentre
## A1 Moisture Manure
## 4.684964 2.801460 1.902190
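# A more compact way to answer a) (sketch): take the first two constrained
# eigenvalues relative to the total inertia of the model.
sum(dune_cca$CCA$eig[1:2]) / dune_cca$tot.chi # proportion of total variation explained by CCA1-CCA2
# summary(dune_cca) also reports these figures under "Proportion Explained".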
# b) Adjusted R2 of the model
RsquareAdj(dune_cca)$adj.r.squared## [1] 0.2452277
# c) Permutation test for overall statistical significance
anova(dune_cca)## Permutation test for cca under reduced model
## Permutation: free
## Number of permutations: 999
##
## Model: cca(formula = dune_bio ~ A1 + Moisture + Manure, data = dune_env)
## Df ChiSquare F Pr(>F)
## Model 3 0.76925 3.048 0.001 ***
## Residual 16 1.34602
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# d) Permutation test for marginal statistical significance of explanatory variables
anova(dune_cca, by="margin")## Permutation test for cca under reduced model
## Marginal effects of terms
## Permutation: free
## Number of permutations: 999
##
## Model: cca(formula = dune_bio ~ A1 + Moisture + Manure, data = dune_env)
## Df ChiSquare F Pr(>F)
## A1 1 0.13047 1.5508 0.106
## Moisture 1 0.30886 3.6714 0.001 ***
## Manure 1 0.23418 2.7837 0.002 **
## Residual 16 1.34602
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# e) Triplot of ordination results focusing on sites
plot(dune_cca, display = "sites")
# Load packages
library(vegan)
# Read in data
dune_bio <- read.table("R11_ordination/data/dune_bio.txt", sep = "\t", header = TRUE, row.names = 1)
dune_env <- read.table("R11_ordination/data/dune_env.txt", sep = "\t", header = TRUE)
# Convert Use variable to a factor
dune_env$Use <- factor(dune_env$Use)
# Calculate Bray-Curtis distances
dune_bc <- vegdist(dune_bio, method = "bray")
# Perform PERMANOVA with A1, Moisture, and Manure as explanatory variables
dune_perm <- adonis2(dune_bc ~ A1 + Moisture + Manure, data = dune_env, permutations = 9999)
# a) Which explanatory variables are significant?
print(summary(dune_perm))## Df SumOfSqs R2 F
## Min. : 1.0 Min. :0.7197 Min. :0.1674 Min. :5.788
## 1st Qu.: 1.0 1st Qu.:0.7230 1st Qu.:0.1682 1st Qu.:5.802
## Median : 1.0 Median :0.8670 Median :0.2017 Median :5.815
## Mean : 7.6 Mean :1.7196 Mean :0.4000 Mean :6.192
## 3rd Qu.:16.0 3rd Qu.:1.9893 3rd Qu.:0.4627 3rd Qu.:6.394
## Max. :19.0 Max. :4.2990 Max. :1.0000 Max. :6.974
## NA's :2
## Pr(>F)
## Min. :0.00010
## 1st Qu.:0.00015
## Median :0.00020
## Mean :0.00020
## 3rd Qu.:0.00025
## Max. :0.00030
## NA's :2
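# Note (sketch): summary() on an adonis2 result collapses the anova table into
# the column summaries shown above. Printing the object itself shows the
# per-term F tests and p-values, which answers a) directly (the same applies to
# dune_perm2 in c).
dune_perm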
# b) What is the explanatory power (R2) of the significant variables?
print(dune_perm$R2)## [1] 0.1681666 0.2016854 0.1674089 0.4627391 1.0000000
# c) Perform a new PERMANOVA analysis with 9,999 permutations
# with A1 as the explanatory variable but constrain the permutations within the Use variable.
dune_perm2 <- adonis2(dune_bc ~ A1, data = dune_env, permutations = 9999, strata = dune_env$Use)
# Print results of PERMANOVA with constrained permutations
print(summary(dune_perm2))## Df SumOfSqs R2 F
## Min. : 1.00 Min. :0.723 Min. :0.1682 Min. :3.639
## 1st Qu.: 9.50 1st Qu.:2.150 1st Qu.:0.5000 1st Qu.:3.639
## Median :18.00 Median :3.576 Median :0.8318 Median :3.639
## Mean :12.67 Mean :2.866 Mean :0.6667 Mean :3.639
## 3rd Qu.:18.50 3rd Qu.:3.938 3rd Qu.:0.9159 3rd Qu.:3.639
## Max. :19.00 Max. :4.299 Max. :1.0000 Max. :3.639
## NA's :2
## Pr(>F)
## Min. :0.0062
## 1st Qu.:0.0062
## Median :0.0062
## Mean :0.0062
## 3rd Qu.:0.0062
## Max. :0.0062
## NA's :2
# d) Plot a NMDS ordination to visually confirm your results in c).
# ggplot2 extension to plot ordination:
# https://github.com/gavinsimpson/ggvegan
# install.packages("remotes")
# remotes::install_github("gavinsimpson/ggvegan")
# library(ggvegan)
# library(ggfortify)
# autoplot(dune_perm2)
# That didn't work, so...
# d) Plot a NMDS ordination to visually confirm your results in c).
# Perform NMDS ordination
dune_nmds <- metaMDS(dune_bio, distance = "bray")## Run 0 stress 0.1192678
## Run 1 stress 0.1808911
## Run 2 stress 0.1183186
## ... New best solution
## ... Procrustes: rmse 0.02026741 max resid 0.06494964
## Run 3 stress 0.1183186
## ... New best solution
## ... Procrustes: rmse 2.758271e-05 max resid 8.440603e-05
## ... Similar to previous best
## Run 4 stress 0.1192678
## Run 5 stress 0.1192679
## Run 6 stress 0.1192678
## Run 7 stress 0.1192678
## Run 8 stress 0.1192678
## Run 9 stress 0.1992852
## Run 10 stress 0.1183186
## ... New best solution
## ... Procrustes: rmse 1.053943e-05 max resid 3.46552e-05
## ... Similar to previous best
## Run 11 stress 0.1183186
## ... Procrustes: rmse 1.283445e-05 max resid 4.135828e-05
## ... Similar to previous best
## Run 12 stress 0.1183186
## ... Procrustes: rmse 1.859572e-05 max resid 5.651301e-05
## ... Similar to previous best
## Run 13 stress 0.1192678
## Run 14 stress 0.1183186
## ... Procrustes: rmse 5.023421e-06 max resid 1.483047e-05
## ... Similar to previous best
## Run 15 stress 0.1183186
## ... Procrustes: rmse 1.693054e-05 max resid 4.874075e-05
## ... Similar to previous best
## Run 16 stress 0.1192679
## Run 17 stress 0.1939202
## Run 18 stress 0.2953089
## Run 19 stress 0.1183186
## ... Procrustes: rmse 4.233804e-06 max resid 1.053956e-05
## ... Similar to previous best
## Run 20 stress 0.1808911
## *** Best solution repeated 6 times
# Add environmental variables to the ordination plot
ordiplot(dune_nmds, display = "sites", type = "n")
points(dune_nmds, display = "sites", col = as.numeric(dune_env$Use), pch = 16)
ordiellipse(dune_nmds, dune_env$A1, kind = "se", conf = 0.95, label = TRUE)## Warning in chol.default(cov, pivot = TRUE): the matrix is either rank-deficient
## or indefinite
## Warning in chol.default(cov, pivot = TRUE): the matrix is either rank-deficient
## or indefinite
## Warning in chol.default(cov, pivot = TRUE): the matrix is either rank-deficient
## or indefinite
## Warning in chol.default(cov, pivot = TRUE): the matrix is either rank-deficient
## or indefinite
# Add title and legend to the plot
title(main = "NMDS Ordination with A1 as Explanatory Variable")
legend("bottomright", legend = levels(dune_env$Use), col = 1:length(levels(dune_env$Use)), pch = 16, title = "Use")
sessionInfo()## R version 4.2.3 (2023-03-15 ucrt)
## Platform: x86_64-w64-mingw32/x64 (64-bit)
## Running under: Windows 10 x64 (build 22621)
##
## Matrix products: default
##
## locale:
## [1] LC_COLLATE=English_United States.utf8
## [2] LC_CTYPE=English_United States.utf8
## [3] LC_MONETARY=English_United States.utf8
## [4] LC_NUMERIC=C
## [5] LC_TIME=English_United States.utf8
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] ggvegan_0.1-0 fpc_2.2-10 vegan_2.6-4 lattice_0.21-8
## [5] permute_0.9-7 rgbif_3.7.7 viridis_0.6.2 viridisLite_0.4.1
## [9] maps_3.4.1 forecast_8.21 TTR_0.24.3 DHARMa_0.4.6
## [13] lme4_1.1-32 Matrix_1.5-4 MASS_7.3-58.3 ggeffects_1.2.1
## [17] car_3.1-2 carData_3.0-5 gvlma_1.0.0.3 see_0.7.5
## [21] performance_0.10.3 cowplot_1.1.1 corrgram_1.14 reshape2_1.4.4
## [25] readxl_1.4.2 lubridate_1.9.2 forcats_1.0.0 stringr_1.5.0
## [29] dplyr_1.1.1 purrr_1.0.1 readr_2.1.4 tidyr_1.3.0
## [33] tibble_3.2.1 ggplot2_3.4.2 tidyverse_2.0.0 nycflights13_1.0.2
## [37] ISwR_2.0-8
##
## loaded via a namespace (and not attached):
## [1] plyr_1.8.8 lazyeval_0.2.2 splines_4.2.3
## [4] gap.datasets_0.0.5 urltools_1.7.3 digest_0.6.31
## [7] foreach_1.5.2 htmltools_0.5.5 fansi_1.0.3
## [10] magrittr_2.0.3 cluster_2.1.4 doParallel_1.0.17
## [13] tzdb_0.3.0 xts_0.13.0 timechange_0.2.0
## [16] tseries_0.10-53 colorspace_2.0-3 ggrepel_0.9.3
## [19] haven_2.5.2 xfun_0.38 crayon_1.5.2
## [22] jsonlite_1.8.4 zoo_1.8-11 iterators_1.0.14
## [25] glue_1.6.2 gtable_0.3.3 kernlab_0.9-32
## [28] DEoptimR_1.0-12 prabclus_2.3-2 quantmod_0.4.22
## [31] abind_1.4-5 scales_1.2.1 oai_0.4.0
## [34] Rcpp_1.0.10 xtable_1.8-4 mclust_6.0.0
## [37] stats4_4.2.3 datawizard_0.7.1 httr_1.4.5
## [40] ellipsis_0.3.2 modeltools_0.2-23 pkgconfig_2.0.3
## [43] flexmix_2.3-19 farver_2.1.1 nnet_7.3-18
## [46] qgam_1.3.4 sass_0.4.5 utf8_1.2.2
## [49] crul_1.3 tidyselect_1.2.0 labeling_0.4.2
## [52] rlang_1.1.0 later_1.3.0 munsell_0.5.0
## [55] cellranger_1.1.0 tools_4.2.3 cachem_1.0.7
## [58] cli_3.6.0 generics_0.1.3 sjlabelled_1.2.0
## [61] evaluate_0.20 fastmap_1.1.1 yaml_2.3.7
## [64] knitr_1.42 robustbase_0.95-1 nlme_3.1-162
## [67] whisker_0.4.1 mime_0.12 xml2_1.3.3
## [70] gap_1.5-1 compiler_4.2.3 rstudioapi_0.14
## [73] curl_5.0.0 bslib_0.4.2 stringi_1.7.12
## [76] highr_0.10 nloptr_2.0.3 urca_1.3-3
## [79] vctrs_0.6.1 pillar_1.9.0 lifecycle_1.0.3
## [82] triebeard_0.4.1 lmtest_0.9-40 jquerylib_0.1.4
## [85] data.table_1.14.8 insight_0.19.1 httpuv_1.6.9
## [88] patchwork_1.1.2 R6_2.5.1 promises_1.2.0.1
## [91] gridExtra_2.3 codetools_0.2-19 boot_1.3-28.1
## [94] withr_2.5.0 httpcode_0.3.0 fracdiff_1.5-2
## [97] diptest_0.76-0 mgcv_1.8-42 bayestestR_0.13.1
## [100] parallel_4.2.3 hms_1.1.3 quadprog_1.5-8
## [103] grid_4.2.3 timeDate_4022.108 class_7.3-21
## [106] minqa_1.2.5 rmarkdown_2.21 shiny_1.7.4