ENVS567_Trahanhmwk

First, Set up working Directory.

#Get Working Directory (aka what directory are you currently working from)
# getwd()
#Set Working Directory
setwd("C:/Users/ual-laptop/Desktop/ENVS 567_R")

The following are the necessary Packages and libraries used for this homework (some will be included again in the problem’s code)

# install.packages("tidyverse")
# install.packages("ISwR")
# install.packages("readxl")
# install.packages("ggplot2")
# install.packages("reshape2")
# install.packages("performance")
# install.packages("lme4")
# install.packages("see")
# install.packages("lmerTest")
# install.packages("car")
# install.packages("ggeffects")
# install.packages("GGally")
# install.packages("MASS")
# install.packages("relaimpo")
# install.packages("sjPlot")
# install.packages("dplyr")
# install.packages("nycflights13")
# install.packages('plyr', repos = "http://cran.us.r-project.org")
# install.packages("gapminder")
# install.packages("Lahman")
# install.packages("ggord")
# install.packages("patchwork")
# install.packages("qqplotr")
# install.packages("ggplot2movies")
# installed.packages("Rtools")
# install.packages("rgbif")
# # Load libraries
# library(tidyverse)
# library(ISwR)
# library(readxl)
# library(ggplot2)
# library(reshape2)
# library(performance)
# library(lme4)
# library(see)
# library(lmerTest)
# library(car)
# library(ggeffects)
# library(GGally)
# library(MASS)
# library(relaimpo)
# library(sjPlot)
# library(dplyr)
# library(nycflights13)
# library(plyr)
# library(gapminder)
# library(Lahman)
# library(performance)
# library(see)
# library(qqplotr)
# library(ggrepel)
# library(ggplot2movies)
# library(rgbif)

#1 R Language

##1.1 - Install the ISwR R package. #Write the built-in dataset thuesen to a tab-separated text file. #View it with a text editor. #Change the NA to . (period), and #read the changed file back into R.

thuesen <- read.table(file = "thuesen.txt", header = FALSE, sep = "\t", na.strings = NA)
print(thuesen)

##               V1
## 1              x
## 2 thuesen_tabsep

thuesen1 <- thuesen
thuesen1[is.na(thuesen1)]<-"."
print(thuesen1)

##               V1
## 1              x
## 2 thuesen_tabsep

##1.2 - The exponential growth of a population is described by this mathematical function: Nt=No*e^(rt); where Nt is the population size at time t, No is the initial population size, and r is the rate of growth or reproductive rate. #Write an exponential growth function in R that also generates a plot with time on the x axis and Nt on the y axis. #Using that function create plots for 20 days assuming an initial population size of 10 individuals under three growth rate scenarios (0.5, 0.8, -0.1).

exp.growth <- function(No, r, t){
  No <- No
  r <- r
  t <- 0:t
  Nt <- No*exp(r*t)
  plot(x = t, y = Nt, xlab = "Time (days)", ylab = "Polulation Size", type = "l")}
exp.growth(No = 10, r = 0.5, t = 20)

exp.growth(No = 10, r=0.8, t = 20)

exp.growth(No = 10, r=-0.1, t = 20)

##1.3 - Under highly favorable conditions, populations grow exponentially. However resources will eventually limit population growth and exponential growth cannot continue indefinitely. This phenomenon is described by the logistic growth function: Nt=(KNo)/((No)+(K-No)exp(-rt)); where K is the carrying capacity. #Write a logistic growth function in R that also generates a plot with time on the x axis and Nt on the y axis. Using that function create plots for 20 days assuming an initial population size of 10 individuals and a carrying capacity of 1,000 individuals under three growth rate scenarios (0.5, 0.8, 0.4).

logis.growth <- function(No, r, t, K){
  No <- No
  r <- r
  t <- 0:t
  K <- K
  Nt.l <- (K*No)/(No+(K-No)*exp(-r*t))
  plot(x = t, y = Nt.l, xlab = "Time (days)", ylab = "Polulation Size", main = "Logistic Growth Graph", type = "l")}
logis.growth(No = 10, r = 0.5, t = 20, K=1000)

logis.growth(No = 10, r=0.8, t = 20, K=1000)

logis.growth(No = 10, r=0.4, t = 20, K=1000)

##1.4 - Write a function (sum_n) that for any given value, say n, computes the sum of the integers from 1 to n (inclusive). #Use the function to determine the sum of integers from 1 to 5,000.

sum_n <- function(n){
  n <- 1:n
  x <- n-(n-1)
  sum_n_eq <- sum(c(from = 1, to = n, by = 1))}
sum_n(n = 5000)
print(sum_n(n = 5000))

## [1] 12502502

##1.5 - Write a function (sqrt_round) #that takes a number x as input, #takes the square root, rounds it to the nearest whole number and then returns the result.

sqrt_round <- function(x){
  x <- x
  round(sqrt(x))}
print(sqrt_round(x=6.5))

## [1] 3

##1.6 - Install the R package ‘nycflights13’, and load the ‘weather’ data. #a) Explore the columns names and the top part of the dataset to get a sense of the data #b) Make a subset of the data from just the first month (1) and then save that subset as ‘weather1’ #c) Using ggplot2, make a beautiful histogram of the variable ‘temp’ #d) Using ggplot2, make a beautiful line plot of ‘temp’ as a function of ‘time_hour’ #e) Using ggplot2, make a beautiful boxplot of ‘temp’ as a function of ‘origin’

# install.packages("nycflights13")
library(nycflights13)
data(weather)
head(weather) #a

## # A tibble: 6 x 15
##   origin  year month   day  hour  temp  dewp humid wind_dir wind_speed wind_gust
##   <chr>  <int> <int> <int> <int> <dbl> <dbl> <dbl>    <dbl>      <dbl>     <dbl>
## 1 EWR     2013     1     1     1  39.0  26.1  59.4      270      10.4         NA
## 2 EWR     2013     1     1     2  39.0  27.0  61.6      250       8.06        NA
## 3 EWR     2013     1     1     3  39.0  28.0  64.4      240      11.5         NA
## 4 EWR     2013     1     1     4  39.9  28.0  62.2      250      12.7         NA
## 5 EWR     2013     1     1     5  39.0  28.0  64.4      260      12.7         NA
## 6 EWR     2013     1     1     6  37.9  28.0  67.2      240      11.5         NA
## # ... with 4 more variables: precip <dbl>, pressure <dbl>, visib <dbl>,
## #   time_hour <dttm>

weather1 <- subset(weather, month == 1) #b
library(ggplot2)
ggplot(data = weather1, aes(x = temp)) + geom_histogram(bins = 30)

ggplot() + geom_histogram(data = weather1, aes(x = temp, fill = ..x..), binwidth = 1) + scale_colour_gradient2() + xlab("Temperature") + ylab("Count")

ggplot(weather1, aes(x=time_hour, y=temp))+geom_line(colour = "red")

ggplot(weather1, aes(x=origin, y=temp))+geom_boxplot()

ggplot(weather1, aes(x=origin, y=temp))+geom_boxplot(color = "blue")

ggplot(weather1, aes(x=origin, y=temp, fill="Group")) + geom_boxplot(alpha=0.6)

##1.7 - Write content in markdown syntax to replicate, as much as possible, the format of the following sample text.

Big Title

some text using Markdown syntax

Some text in code format: computing with data

The main data objects in R are:

vectors
arrays
lists

The R website is: https://www.r-project.org

# 2 Fundamentals of statistics

##2.1 - A researcher videotaped the glides of 8 tree snakes leaping from a 10-m tower. Undulation rates of the snakes measured in hertz (cycles per second) were as follows: 0.9, 1.4, 1.2, 1.2, 1.3, 2.0, 1.4, 1.6. #a) Draw a histogram of the undulation rate #b) Calculate the sample mean #c) Calculate the range #d) Calculate the standard deviation #e) Write a function to express the standard deviation as a percentage of the mean (that is, the coefficient of variation) and calculate it.

snake_rates <- c(0.9, 1.4, 1.2, 1.2, 1.3, 2.0, 1.4, 1.6)
hist(snake_rates)

snake_mean <- mean(snake_rates)
range(snake_rates)

## [1] 0.9 2.0

snake_sd <- sd(snake_rates)
Coef_v <- function(snake_mean, snake_sd){
  snake_mean <- snake_mean
  snake_sd <- snake_sd
}
(snake_sd/snake_mean)*100 #Coefficient of variation (%) is the ratio of the sd to the mean

## [1] 23.56633

snake_mean #1.375 mean

## [1] 1.375

cv <- function(snake_rates){
  paste("mean = ", mean(snake_rates), ", cv = ",round(sd(snake_rates)/mean(snake_rates)*100), "%", sep="")}
cv(snake_rates)

## [1] "mean = 1.375, cv = 24%"

##2.2 - Blood pressure was measured (in units of mm Hg). Here are the measurements: 112, 128, 108, 129, 125, 153, 155, 132, 137. #a) How many individuals are in the sample? #b) What is the mean of this sample? #c) What is the variance? #d) What is the standard deviation? #e) What is the coefficient of variation?

blood_pressure <- c(112, 128, 108, 129, 125, 153, 155, 132, 137) #Blood pressure measurements (mm Hg)
length(blood_pressure) #amount of people in sample

## [1] 9

mean(blood_pressure) #mean of sample

## [1] 131

var(blood_pressure) #variance

## [1] 254.5

sd(blood_pressure) #standard deviation

## [1] 15.95306

paste(round(sd(blood_pressure)/mean(blood_pressure)*100), "%", sep="") #Coefficient of variation is the ratio of sd to the mean 131

## [1] "12%"

cv(blood_pressure)

## [1] "mean = 131, cv = 12%"

##2.3 - The data in the file DesertBirdAbundance.csv are from a survey of the breeding birds of Organ Pipe Cactus National Monument in southern Arizona. #a) Draw a histogram of the abundance data. #b) Calculate the median and the mean of the bird abundance data. #c) In this particular case, which do you think is the best measure of center, the mean or the median? #d) Calculate the range, standard deviation, variance and coefficient of variation of the bird abundance data.

dba_data <- readr::read_csv(file = "DesertBirdAbundance.csv")

## Rows: 43 Columns: 2
## -- Column specification --------------------------------------------------------
## Delimiter: ","
## chr (1): species
## dbl (1): abundance
## 
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.

hist(dba_data$abundance, xlab = "Desert Bird Abundance", main = "Desert Bird Abundance Histogram")

mean(dba_data$abundance)

## [1] 74.76744

median(dba_data$abundance) #better measure of center

## [1] 18

range(dba_data$abundance)

## [1]   1 625

sd(dba_data$abundance)

## [1] 121.9473

var(dba_data$abundance)

## [1] 14871.14

paste("mean = ", mean(dba_data$abundance), ", cv = ",round(sd(dba_data$abundance)/mean(dba_data$abundance)*100), "%", sep="") #coefficient of variation of bird abundance data

## [1] "mean = 74.7674418604651, cv = 163%"

cv(dba_data$abundance)

## [1] "mean = 74.7674418604651, cv = 163%"

##2.4 - Calculate the probability of each of the following events: #a) A standard normally distributed variable is larger than 3 #b) A normally distributed variable with mean 35 and standard deviation 6 is larger than 42 #c) Getting 10 out of 10 successes in a binomial distribution with probability 0.8 #d) X > 6.5 in a Chi-squared distribution with 2 degrees of freedom

1-pnorm(3) #since 3 is > pnorm(q, mean = 0, sd = 1)

## [1] 0.001349898

1-pnorm(mean = 35, sd = 6, q = 42) #q= a normally distributed variable larger than "q"

## [1] 0.1216725

dbinom(x=10, size = 10, prob = 0.8) #x=possible outcomes; size=number of trials; prob=the probability of success

## [1] 0.1073742

1-pchisq(q=6.5, df=2) #Chi-sqrd distribution with 2 degrees of freedom x>6.5

## [1] 0.03877421

##2.5 - Demonstrate graphically the central limit theorem (by sampling and calculating the mean) using a Binomial distribution with 10 trials (size=10) and 0.9 probability of success (prob=0.9) and a sample size of 5.

c.l.thm_ten <- numeric(10) # sampling 10
for (i in 1:10) {
  c.l.thm_ten[i]<-mean(rbinom(n=5, size=10, prob=0.9)) # calculating mean using binom dist
}
hist(c.l.thm_ten, main = "Histogram of the Central Limit Theorem", xlab = "C.L.T. of sample size 10") # represent in graph

c.l.thm_hunnid<-numeric(100) # sampling 100
for (i in 1:100){
  c.l.thm_hunnid[i]<-mean(rbinom(n=5, size=10, prob=0.9))
}
hist(c.l.thm_hunnid)

c.l.thm_thous<-numeric(1000) # sampling 1,000
for (i in 1:1000){
  c.l.thm_thous[i]<-mean(rbinom(n=5, size=10, prob=0.9))
}
hist(c.l.thm_thous)

##2.6 - Imagine height is genetically determined by the combined (that is, the sum) effect of several genes (polygenic trait). Assume that each gene has an effect on height as a uniform distribution with min=1 and max=3. Simulate stochastic model of height for 1,000 random people based on 1 gene, 2 genes, and 5 genes. As we increase the number of genes, what is the resulting height distribution?

one_gene <- runif(1000, min = 1, max = 3)
hist(one_gene, main = "One Gene", xlab = "Height", breaks = 16) #Histogram not distributed

two_gene <- runif(1000, min = 1, max = 3)+runif(1000, min = 1, max = 3)
hist(two_gene, main = "Two Gene", xlab = "Height", breaks = 16)

five_gene <- runif(1000, min = 1, max = 3)+runif(1000, min = 1, max = 3)+runif(1000, min = 1, max = 3)+runif(1000, min = 1, max = 3)+runif(1000, min = 1, max = 3)
hist(five_gene, main = "Five Gene", xlab = "Height", breaks = 16) #Becoming more normally distributed and symmetric shaped

##2.7 - Do the same as in the previous problem but now assuming that the combined effect of the genes is multiplicative (not the sum). As we increase the number of genes, what is the resulting height distribution?

one_gene <- runif(1000, min = 1, max = 3)
hist(one_gene, main = "One Gene", xlab = "Height", breaks = 16)

two_gene <- runif(1000, min = 1, max = 3)*runif(1000, min = 1, max = 3)
hist(two_gene, main = "Two Gene", xlab = "Height", breaks = 16)

five_gene <- runif(1000, min = 1, max = 3)*runif(1000, min = 1, max = 3)*runif(1000, min = 1, max = 3)*runif(1000, min = 1, max = 3)*runif(1000, min = 1, max = 3)
hist(five_gene, main = "Five Gene", xlab = "Height", breaks = 16) #becoming more skewed right

#3 Statistical tests

##3.1 - Normal human body temperature is 98.6 F. Researchers obtained body-temperature measurements on randomly chosen healthy people: see code #a) Make a histogram of the data #b) Make a normal quantile plot #c) Perform a Shapiro-Wilk test to test for normality #d) Are the data normally distributed? #e) Are these measurements consistent with a population mean of 98.6 F?

rchp <- c(98.4, 98.6, 97.8, 98.8, 97.9, 99.0, 98.2, 98.8, 98.8, 99.0, 98.0, 99.2, 99.5, 99.4, 98.4, 99.1, 98.4, 97.6, 97.4, 97.5, 97.5, 98.8, 98.6, 100.0, 98.4)
hist(rchp)

qqnorm(rchp) #Normal quantile plot 
qqnorm(rchp)
qqline(rchp) #run in console to get plot with line

shapiro.test(rchp) #We fail to reject the null hypothesis

## 
##  Shapiro-Wilk normality test
## 
## data:  rchp
## W = 0.97216, p-value = 0.7001

t.test(rchp, mu = 98.6) #Measurements are not consistent with the population mean of 98.6

## 
##  One Sample t-test
## 
## data:  rchp
## t = -0.56065, df = 24, p-value = 0.5802
## alternative hypothesis: true mean is not equal to 98.6
## 95 percent confidence interval:
##  98.24422 98.80378
## sample estimates:
## mean of x 
##    98.524

##3.2 - The brown recluse spider often lives in houses throughout central North America. A diet-preference study gave each of 41 spiders a choice between two crickets, one live and one dead. 31 of the 41 spiders chose the dead cricket over the live one. Does this represent evidence for a diet preference?

#of the 41 brown recluse spiders, 31 spiders chose to eat dead crickets instead of live crickets
barplot(c(31,10), xlab="Brown Recluse Diet (dead vs live cricket)")

binom.test(x=31, n=41, p=0.5) #binom test is assessing the probability @ p=50% of a binom distribution

## 
##  Exact binomial test
## 
## data:  31 and 41
## number of successes = 31, number of trials = 41, p-value = 0.00145
## alternative hypothesis: true probability of success is not equal to 0.5
## 95 percent confidence interval:
##  0.5969538 0.8763675
## sample estimates:
## probability of success 
##              0.7560976

#There is evidence for a diet preference according to a significant binomial test with p-value=0.00145
sum(dbinom(x=31, size=41, p=0.5))*2 #*2 as a 2-tail test which is the p-value from the binom.test()

## [1] 0.001019634

##3.3 - Ten epileptic patients participated in a study of a new anticonvulsant drug. During the first 8-week period, half the patients received a placebo and half were given the drug, and the number of seizures were recorded. Following this, the same patients were given the opposite treatment and the number of seizures were recorded. Assuming that the distribution of the difference between the placebo and drug meets the assumption of normality, perform an appropriate test to determine whether there were differences in the number of epileptic seizures with and without the drug. #a) Make a boxplot of the data #b) Test the difference.

epil.study <- data.frame(c(1:10), c(37, 52, 68,4,29,32,19,52,19,12), c(5,23,40,3,38,19,9,24,17,14))
names(epil.study) <- c("Patient_No.", "Placebo (amount of seizures)", "Drug (amount of seizures)")
head(epil.study)

##   Patient_No. Placebo (amount of seizures) Drug (amount of seizures)
## 1           1                           37                         5
## 2           2                           52                        23
## 3           3                           68                        40
## 4           4                            4                         3
## 5           5                           29                        38
## 6           6                           32                        19

epil.study.long <- tidyr::pivot_longer(data = epil.study, cols = "Placebo (amount of seizures)":"Drug (amount of seizures)", names_to = "Treatment", values_to = "Seizures") #transforms the 2 columns of trials into 1 column
epil.study.long

## # A tibble: 20 x 3
##    Patient_No. Treatment                    Seizures
##          <int> <chr>                           <dbl>
##  1           1 Placebo (amount of seizures)       37
##  2           1 Drug (amount of seizures)           5
##  3           2 Placebo (amount of seizures)       52
##  4           2 Drug (amount of seizures)          23
##  5           3 Placebo (amount of seizures)       68
##  6           3 Drug (amount of seizures)          40
##  7           4 Placebo (amount of seizures)        4
##  8           4 Drug (amount of seizures)           3
##  9           5 Placebo (amount of seizures)       29
## 10           5 Drug (amount of seizures)          38
## 11           6 Placebo (amount of seizures)       32
## 12           6 Drug (amount of seizures)          19
## 13           7 Placebo (amount of seizures)       19
## 14           7 Drug (amount of seizures)           9
## 15           8 Placebo (amount of seizures)       52
## 16           8 Drug (amount of seizures)          24
## 17           9 Placebo (amount of seizures)       19
## 18           9 Drug (amount of seizures)          17
## 19          10 Placebo (amount of seizures)       12
## 20          10 Drug (amount of seizures)          14

boxplot(epil.study.long$Seizures ~ epil.study.long$Treatment + epil.study.long$Patient_No.)

boxplot(epil.study.long$Seizures ~ epil.study.long$Treatment)

hist(epil.study.long$Seizures) #maybe normal

shapiro.test(epil.study.long$Seizures) #normal

## 
##  Shapiro-Wilk normality test
## 
## data:  epil.study.long$Seizures
## W = 0.93723, p-value = 0.2124

t.test(epil.study$"Placebo (amount of seizures)", epil.study$"Drug (amount of seizures)", paired = TRUE)

## 
##  Paired t-test
## 
## data:  epil.study$"Placebo (amount of seizures)" and epil.study$"Drug (amount of seizures)"
## t = 2.7661, df = 9, p-value = 0.02189
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##   2.404666 23.995334
## sample estimates:
## mean of the differences 
##                    13.2

#the number of seizures after patients received the drug versus after receiving the placebo is significant given the p-value of 0.02189

##3.4 - A bee biologist is analyzing whether there was an association between bee colony number and type of forest habitat. We expect that there is no habitat preference for bee colony number. Is this true based on this data? #a) Make a barplot of the data #b) Test the hypothesis of no-association

bee <- (c("Oak"=33, "Hickory"=30, "Maple"=29, "Red Cedar"=4, "Poplar"=4))
barplot(bee)

chisq.test(bee)

## 
##  Chi-squared test for given probabilities
## 
## data:  bee
## X-squared = 43.1, df = 4, p-value = 9.865e-09

#Accept hypothesis of no association between colony and habitat of the 5 tree types

##3.5 - Perform 10 one-sample t-tests for mu=0 on simulated standard normally distributed data of 25 observations each and get the P-value. Repeat the experiment, but instead simulate samples of 25 observations from a t-distribution with 2 degrees of freedom. Find a way to automate the first experiment to do it a 1,000 times and applying a false discovery rate correction to the P-values (check ?replicate).

replicate(10, t.test(rnorm(25), mu=0)$p.value) #repeating the t.test 10x

##  [1] 0.2834226 0.8126056 0.6469057 0.6990219 0.2237562 0.7454398 0.7293084
##  [8] 0.2133691 0.2897533 0.3734600

replicate(10, t.test(rt(25, df=2), mu=0)$p.value)

##  [1] 0.8300742 0.5970417 0.7391379 0.9025699 0.7132827 0.1968971 0.4098774
##  [8] 0.2221683 0.6354194 0.7154488

p.adjust(replicate(1000, t.test(rnorm(25), mu=0)$p.value), method="fdr")

##    [1] 0.99411174 0.98727289 0.99411174 0.92270050 0.92270050 0.99411174
##    [7] 0.86497521 0.86980590 0.92270050 0.99411174 0.90638528 0.86497521
##   [13] 0.99411174 0.99411174 0.99411174 0.86980590 0.98654661 0.99411174
##   [19] 0.99411174 0.90638528 0.92270050 0.99411174 0.92270050 0.99411174
##   [25] 0.99411174 0.99411174 0.92270050 0.98727289 0.92270050 0.99411174
##   [31] 0.92270050 0.92270050 0.99411174 0.99411174 0.86980590 0.99411174
##   [37] 0.99411174 0.99411174 0.98041106 0.86497521 0.99411174 0.92270050
##   [43] 0.98727289 0.99411174 0.98654661 0.92270050 0.99411174 0.96269220
##   [49] 0.99411174 0.99411174 0.99411174 0.99411174 0.99411174 0.99411174
##   [55] 0.99411174 0.99411174 0.99411174 0.90638528 0.99411174 0.99411174
##   [61] 0.86980590 0.92270050 0.86497521 0.98105932 0.92270050 0.92270050
##   [67] 0.92270050 0.99411174 0.92270050 0.99411174 0.99411174 0.92270050
##   [73] 0.99411174 0.90638528 0.92270050 0.92151555 0.99411174 0.92270050
##   [79] 0.86980590 0.99411174 0.99411174 0.99411174 0.99411174 0.98654661
##   [85] 0.99411174 0.99411174 0.91651405 0.99411174 0.99411174 0.99411174
##   [91] 0.99411174 0.99411174 0.99411174 0.99411174 0.99411174 0.99411174
##   [97] 0.86980590 0.99411174 0.92270050 0.99411174 0.91651405 0.91651405
##  [103] 0.99411174 0.92270050 0.98727289 0.99411174 0.86497521 0.99411174
##  [109] 0.99411174 0.99411174 0.99411174 0.99411174 0.99411174 0.92270050
##  [115] 0.92270050 0.92270050 0.99411174 0.99411174 0.86980590 0.99411174
##  [121] 0.99411174 0.99411174 0.92270050 0.99411174 0.99411174 0.99411174
##  [127] 0.99411174 0.87852503 0.99411174 0.99411174 0.86497521 0.86497521
##  [133] 0.92270050 0.92270050 0.99411174 0.92270050 0.86497521 0.99411174
##  [139] 0.91651405 0.99411174 0.99411174 0.92270050 0.99411174 0.98727289
##  [145] 0.99411174 0.99411174 0.99411174 0.99411174 0.99411174 0.99411174
##  [151] 0.86497521 0.99411174 0.99411174 0.99411174 0.99411174 0.99411174
##  [157] 0.99411174 0.99411174 0.91651405 0.99411174 0.99411174 0.92270050
##  [163] 0.92151555 0.99411174 0.99411174 0.86497521 0.92270050 0.99411174
##  [169] 0.92270050 0.99411174 0.99411174 0.99411174 0.99411174 0.99411174
##  [175] 0.86497521 0.92270050 0.99411174 0.99411174 0.91651405 0.92270050
##  [181] 0.99411174 0.99411174 0.99411174 0.86497521 0.99411174 0.99411174
##  [187] 0.99411174 0.86980590 0.99411174 0.99411174 0.92270050 0.99411174
##  [193] 0.98654661 0.99411174 0.92270050 0.99411174 0.90638528 0.99411174
##  [199] 0.92270050 0.99411174 0.92270050 0.92270050 0.99411174 0.91651405
##  [205] 0.86497521 0.99411174 0.99411174 0.92270050 0.86497521 0.99411174
##  [211] 0.92270050 0.98727289 0.99411174 0.99411174 0.86980590 0.99411174
##  [217] 0.99411174 0.99411174 0.99411174 0.98654661 0.99411174 0.99411174
##  [223] 0.92270050 0.99411174 0.86980590 0.92270050 0.99411174 0.98727289
##  [229] 0.92270050 0.92151555 0.92270050 0.99411174 0.99411174 0.86497521
##  [235] 0.99411174 0.98727289 0.99411174 0.99411174 0.98654661 0.98654661
##  [241] 0.99411174 0.99411174 0.92270050 0.97226266 0.86497521 0.86497521
##  [247] 0.99411174 0.99411174 0.99411174 0.99411174 0.98654661 0.90638528
##  [253] 0.92270050 0.99411174 0.99411174 0.86980590 0.99411174 0.99411174
##  [259] 0.99411174 0.99411174 0.90638528 0.99411174 0.99411174 0.99411174
##  [265] 0.86980590 0.99411174 0.91651405 0.90638528 0.99411174 0.99411174
##  [271] 0.92270050 0.95226833 0.99411174 0.99411174 0.99411174 0.92270050
##  [277] 0.99411174 0.99411174 0.99411174 0.92270050 0.98654661 0.99411174
##  [283] 0.99411174 0.92270050 0.94321539 0.99411174 0.92270050 0.86497521
##  [289] 0.92270050 0.99411174 0.99411174 0.92270050 0.99411174 0.99411174
##  [295] 0.99411174 0.99411174 0.86980590 0.99411174 0.91651405 0.94220154
##  [301] 0.99411174 0.92270050 0.92270050 0.95156308 0.99411174 0.99411174
##  [307] 0.86497521 0.92270050 0.99411174 0.99411174 0.91651405 0.92270050
##  [313] 0.99411174 0.99411174 0.92270050 0.86980590 0.92151555 0.99411174
##  [319] 0.99411174 0.99411174 0.98654661 0.99411174 0.99411174 0.86497521
##  [325] 0.99411174 0.99411174 0.98654661 0.99411174 0.99411174 0.92151555
##  [331] 0.98654661 0.92270050 0.98654661 0.98727289 0.91651405 0.99411174
##  [337] 0.99411174 0.99411174 0.99411174 0.90638528 0.99411174 0.86980590
##  [343] 0.99411174 0.99411174 0.03750134 0.92270050 0.92270050 0.92270050
##  [349] 0.99411174 0.86497521 0.92270050 0.99411174 0.99411174 0.99411174
##  [355] 0.92270050 0.98654661 0.99411174 0.99411174 0.99411174 0.98727289
##  [361] 0.86980590 0.25998618 0.99411174 0.87852503 0.99411174 0.99411174
##  [367] 0.99411174 0.99411174 0.99411174 0.99411174 0.99411174 0.99411174
##  [373] 0.99411174 0.90638528 0.91651405 0.98727289 0.99411174 0.92270050
##  [379] 0.99411174 0.86497521 0.99411174 0.99411174 0.99411174 0.99411174
##  [385] 0.99411174 0.98727289 0.86980590 0.99411174 0.99411174 0.99411174
##  [391] 0.86497521 0.92270050 0.99411174 0.99411174 0.86497521 0.92699079
##  [397] 0.99411174 0.91651405 0.99411174 0.92270050 0.99411174 0.98654661
##  [403] 0.99411174 0.99411174 0.99411174 0.92270050 0.90638528 0.99411174
##  [409] 0.99411174 0.99411174 0.99411174 0.92270050 0.86497521 0.99411174
##  [415] 0.86980590 0.99411174 0.86497521 0.98654661 0.91651405 0.99411174
##  [421] 0.99411174 0.99411174 0.86980590 0.99411174 0.92270050 0.91651405
##  [427] 0.92151555 0.98654661 0.99411174 0.99411174 0.45880383 0.92270050
##  [433] 0.92270050 0.99411174 0.99411174 0.99411174 0.99411174 0.93551596
##  [439] 0.99411174 0.99411174 0.99411174 0.99411174 0.99411174 0.99411174
##  [445] 0.99411174 0.92270050 0.98654661 0.99411174 0.99411174 0.92270050
##  [451] 0.99411174 0.92270050 0.99411174 0.86980590 0.90638528 0.99411174
##  [457] 0.99411174 0.99411174 0.99411174 0.99411174 0.99411174 0.99411174
##  [463] 0.91651405 0.99411174 0.98971095 0.99411174 0.99411174 0.99411174
##  [469] 0.98727289 0.99411174 0.86980590 0.95996837 0.92270050 0.99411174
##  [475] 0.99411174 0.99411174 0.90638528 0.92270050 0.92270050 0.99411174
##  [481] 0.98654661 0.99411174 0.99411174 0.99411174 0.86497521 0.99411174
##  [487] 0.98654661 0.99411174 0.99411174 0.99411174 0.99411174 0.99411174
##  [493] 0.92270050 0.99411174 0.99411174 0.99411174 0.92270050 0.99411174
##  [499] 0.92270050 0.87852503 0.99411174 0.99411174 0.99411174 0.99411174
##  [505] 0.99411174 0.99411174 0.99411174 0.92270050 0.99411174 0.92491868
##  [511] 0.99411174 0.99411174 0.99411174 0.99411174 0.98654661 0.99411174
##  [517] 0.99411174 0.90638528 0.86497521 0.86497521 0.99411174 0.99411174
##  [523] 0.86980590 0.99411174 0.92270050 0.86497521 0.92270050 0.99411174
##  [529] 0.92270050 0.99411174 0.99411174 0.98654661 0.99411174 0.92270050
##  [535] 0.99411174 0.99411174 0.99411174 0.99411174 0.99411174 0.98727289
##  [541] 0.92270050 0.98654661 0.86980590 0.99411174 0.98654661 0.86497521
##  [547] 0.91651405 0.86980590 0.99411174 0.99411174 0.92270050 0.92270050
##  [553] 0.99411174 0.99411174 0.92270050 0.99411174 0.99411174 0.92270050
##  [559] 0.92270050 0.92270050 0.99411174 0.92270050 0.92270050 0.86980590
##  [565] 0.92270050 0.92270050 0.86497521 0.99411174 0.99411174 0.99411174
##  [571] 0.99411174 0.99411174 0.86497521 0.99411174 0.99411174 0.98727289
##  [577] 0.86497521 0.99411174 0.99411174 0.99411174 0.99411174 0.91651405
##  [583] 0.99411174 0.98654661 0.99411174 0.93296569 0.99411174 0.99411174
##  [589] 0.92270050 0.99411174 0.99411174 0.99411174 0.86980590 0.86497521
##  [595] 0.99411174 0.99411174 0.90638528 0.99411174 0.99411174 0.92270050
##  [601] 0.99411174 0.99411174 0.99411174 0.92270050 0.91651405 0.98654661
##  [607] 0.99411174 0.92270050 0.99411174 0.99411174 0.92270050 0.86497521
##  [613] 0.92270050 0.99411174 0.99411174 0.91651405 0.98654661 0.92270050
##  [619] 0.92151555 0.99411174 0.99411174 0.92270050 0.99411174 0.99411174
##  [625] 0.86497521 0.92270050 0.99411174 0.99411174 0.92270050 0.99411174
##  [631] 0.99411174 0.99411174 0.86497521 0.92270050 0.95226833 0.91289956
##  [637] 0.99411174 0.99411174 0.92270050 0.86980590 0.99411174 0.99411174
##  [643] 0.99411174 0.99411174 0.92270050 0.99411174 0.99411174 0.86980590
##  [649] 0.99411174 0.86980590 0.98654661 0.92270050 0.93972950 0.99411174
##  [655] 0.92270050 0.90638528 0.98654661 0.99411174 0.86980590 0.92270050
##  [661] 0.99411174 0.99411174 0.92270050 0.92270050 0.99411174 0.86980590
##  [667] 0.99411174 0.99411174 0.99411174 0.99411174 0.92270050 0.99411174
##  [673] 0.86497521 0.98727289 0.92270050 0.99411174 0.99411174 0.98654661
##  [679] 0.99411174 0.90638528 0.99411174 0.99411174 0.92270050 0.99411174
##  [685] 0.99411174 0.99411174 0.99411174 0.99411174 0.99411174 0.99411174
##  [691] 0.99411174 0.86980590 0.86497521 0.99411174 0.92270050 0.99411174
##  [697] 0.86980590 0.99411174 0.98654661 0.86980590 0.92270050 0.99411174
##  [703] 0.97112313 0.99411174 0.99411174 0.86497521 0.86980590 0.99411174
##  [709] 0.99411174 0.86497521 0.90638528 0.99411174 0.91105273 0.86497521
##  [715] 0.99411174 0.86980590 0.92270050 0.99411174 0.99411174 0.99411174
##  [721] 0.86980590 0.86497521 0.92270050 0.99411174 0.98654661 0.92270050
##  [727] 0.99411174 0.92270050 0.86497521 0.99411174 0.92270050 0.99411174
##  [733] 0.99411174 0.92270050 0.99411174 0.92270050 0.86980590 0.99411174
##  [739] 0.86497521 0.91651405 0.92270050 0.86497521 0.99411174 0.99411174
##  [745] 0.99411174 0.98654661 0.90638528 0.99411174 0.99411174 0.99411174
##  [751] 0.92270050 0.99411174 0.92270050 0.99411174 0.99411174 0.90638528
##  [757] 0.99411174 0.99411174 0.86980590 0.90638528 0.99411174 0.98654661
##  [763] 0.99411174 0.99411174 0.86497521 0.99411174 0.99411174 0.99411174
##  [769] 0.98727289 0.99411174 0.99411174 0.92270050 0.92270050 0.99411174
##  [775] 0.99411174 0.99411174 0.99411174 0.86980590 0.99411174 0.86980590
##  [781] 0.99411174 0.86980590 0.98727289 0.92270050 0.99411174 0.99411174
##  [787] 0.99411174 0.99411174 0.99411174 0.99411174 0.99411174 0.99411174
##  [793] 0.99411174 0.99411174 0.86980590 0.86497521 0.99411174 0.99411174
##  [799] 0.99411174 0.92270050 0.99411174 0.91651405 0.92270050 0.99411174
##  [805] 0.99411174 0.92270050 0.86980590 0.99411174 0.86497521 0.98654661
##  [811] 0.98654661 0.91651405 0.99411174 0.99411174 0.99411174 0.98727289
##  [817] 0.99411174 0.99411174 0.99411174 0.92270050 0.98727289 0.92270050
##  [823] 0.94752030 0.99411174 0.99411174 0.98727289 0.99411174 0.99411174
##  [829] 0.99411174 0.86980590 0.99411174 0.86497521 0.99411174 0.92270050
##  [835] 0.99411174 0.86980590 0.92270050 0.99411174 0.99411174 0.86980590
##  [841] 0.86497521 0.99411174 0.86497521 0.99411174 0.99411174 0.99411174
##  [847] 0.99411174 0.99411174 0.99411174 0.92270050 0.86980590 0.92270050
##  [853] 0.99411174 0.99411174 0.86497521 0.99411174 0.98727289 0.92270050
##  [859] 0.99411174 0.99411174 0.93551596 0.99411174 0.99411174 0.92270050
##  [865] 0.99411174 0.86980590 0.99411174 0.99411174 0.92270050 0.99411174
##  [871] 0.99411174 0.99411174 0.99411174 0.92270050 0.92270050 0.99411174
##  [877] 0.99411174 0.99411174 0.99411174 0.92270050 0.99411174 0.99411174
##  [883] 0.99411174 0.92270050 0.91651405 0.99411174 0.99411174 0.99411174
##  [889] 0.86980590 0.90638528 0.99411174 0.99411174 0.99411174 0.99411174
##  [895] 0.99411174 0.92270050 0.99411174 0.99411174 0.86980590 0.99411174
##  [901] 0.99411174 0.92270050 0.99411174 0.90638528 0.98727289 0.92151555
##  [907] 0.86980590 0.99411174 0.98654661 0.86980590 0.99411174 0.99411174
##  [913] 0.98727289 0.99411174 0.99411174 0.99411174 0.99411174 0.99411174
##  [919] 0.99411174 0.99411174 0.99411174 0.99411174 0.99411174 0.90638528
##  [925] 0.99411174 0.86497521 0.99411174 0.99411174 0.99411174 0.86497521
##  [931] 0.92270050 0.86980590 0.90638528 0.92270050 0.86980590 0.99411174
##  [937] 0.93551596 0.99411174 0.99411174 0.98654661 0.99411174 0.99411174
##  [943] 0.99411174 0.99411174 0.94907325 0.99411174 0.99411174 0.92270050
##  [949] 0.92270050 0.98654661 0.99411174 0.99411174 0.99411174 0.99411174
##  [955] 0.86497521 0.99411174 0.99411174 0.92270050 0.92151555 0.99411174
##  [961] 0.98727289 0.99411174 0.98654661 0.99411174 0.99411174 0.99411174
##  [967] 0.99411174 0.92270050 0.92270050 0.98654661 0.99411174 0.97249925
##  [973] 0.99411174 0.99411174 0.99411174 0.99411174 0.95996837 0.99411174
##  [979] 0.99411174 0.99411174 0.92270050 0.86497521 0.92270050 0.86497521
##  [985] 0.86497521 0.99411174 0.99411174 0.92270050 0.99411174 0.99411174
##  [991] 0.99411174 0.99411174 0.91651405 0.99411174 0.90638528 0.99411174
##  [997] 0.91651405 0.98727289 0.99411174 0.86980590

#4 ANOVA

##4.1 - You are hired by the US Department of Agriculture to develop effective control practices for invasive plants (data_ANOVA.xlsx). In particular, you are asked to investigate pesticide controls for kudzu which is an invasive vine that grows in thick mats that smother underlying plants. Two of the most widely used pesticides for kudzu are glyphosate and triclopyr. To determine the effectiveness of a single application of pesticide, you conduct an experiment in 18 plots that each had 50% kudzu cover. In mid-summer, you applied equal amounts of 2% glyphosate to 6 plots, 2% triclopyr to 6 plots, and water without pesticides to 6 plots. Then you returned in autumn and measured the percent cover of kudzu in the plots. #a) Transform the data from wide to long format (Google how to do it - Hint: function melt from reshape2 package) #b) Make a boxplot of the data #c) Perform an ANOVA #d) What is the variation explained (R2)? #e) Perform a post hoc test

USDA_inv <- readxl::read_xlsx(path = "data_ANOVA.xlsx", col_names = T)
USDA_inv

## # A tibble: 6 x 3
##   Glyphosate Triclopyr Control
##        <dbl>     <dbl>   <dbl>
## 1         20        17      34
## 2         17        14      31
## 3         24        12      29
## 4         18        16      32
## 5         16        18      27
## 6         22        13      30

Um <- reshape2::melt(USDA_inv)

## No id variables; using all as measure variables

Um

##      variable value
## 1  Glyphosate    20
## 2  Glyphosate    17
## 3  Glyphosate    24
## 4  Glyphosate    18
## 5  Glyphosate    16
## 6  Glyphosate    22
## 7   Triclopyr    17
## 8   Triclopyr    14
## 9   Triclopyr    12
## 10  Triclopyr    16
## 11  Triclopyr    18
## 12  Triclopyr    13
## 13    Control    34
## 14    Control    31
## 15    Control    29
## 16    Control    32
## 17    Control    27
## 18    Control    30

boxplot(value~variable, data=Um, main="Treatment Effect on Kudzu")

anova(lm(value~variable, data = Um))

## Analysis of Variance Table
## 
## Response: value
##           Df Sum Sq Mean Sq F value    Pr(>F)    
## variable   2    763   381.5    54.5 1.318e-07 ***
## Residuals 15    105     7.0                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

summary(lm(value~variable, data= Um)) #r^2 87.9%

## 
## Call:
## lm(formula = value ~ variable, data = Um)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -3.500 -1.875  0.000  1.875  4.500 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)         19.500      1.080  18.053 1.38e-11 ***
## variableTriclopyr   -4.500      1.528  -2.946     0.01 *  
## variableControl     11.000      1.528   7.201 3.07e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.646 on 15 degrees of freedom
## Multiple R-squared:  0.879,  Adjusted R-squared:  0.8629 
## F-statistic:  54.5 on 2 and 15 DF,  p-value: 1.318e-07

TukeyHSD(aov(value~variable, data= Um)) #Amount of Kudzu from most to least: control, gly, tri

##   Tukey multiple comparisons of means
##     95% family-wise confidence level
## 
## Fit: aov(formula = value ~ variable, data = Um)
## 
## $variable
##                      diff       lwr        upr     p adj
## Triclopyr-Glyphosate -4.5 -8.467701 -0.5322987 0.0255594
## Control-Glyphosate   11.0  7.032299 14.9677013 0.0000087
## Control-Triclopyr    15.5 11.532299 19.4677013 0.0000001

##4.2 - Data in growth.txt come from a farm-scale trial of animal diets. There are two factors: diet and supplement. Diet is a factor with three levels: barley, oats and wheat. Supplement is a factor with four levels: agrimore, control, supergain and supersupp. The response variable is weight gain after 6 weeks. #a) Inspect the data using boxplots #b) Perform a two-way ANOVA #c) Assess graphically if there exists an interaction between both factors #d) Given what you learnt about the interaction, what would be a better model? #e) What is the variation explained (R2) of this new model? #f) Perform a Tukey HSD test of this new model

farm_diets <- read.table("growth.txt", header = 1)
#a) inspect data w/ boxplots
par(mfrow = c(1,2))
boxplot(gain~diet, data = farm_diets); boxplot(gain~supplement, data = farm_diets)

par(mfrow = c(1,1))
#b) Perform a two-way ANOVA 
summary(lm(gain~diet+supplement, data = farm_diets))

## 
## Call:
## lm(formula = gain ~ diet + supplement, data = farm_diets)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.30792 -0.85929 -0.07713  0.92052  2.90615 
## 
## Coefficients:
##                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          26.1230     0.4408  59.258  < 2e-16 ***
## dietoats             -3.0928     0.4408  -7.016 1.38e-08 ***
## dietwheat            -5.9903     0.4408 -13.589  < 2e-16 ***
## supplementcontrol    -2.6967     0.5090  -5.298 4.03e-06 ***
## supplementsupergain  -3.3815     0.5090  -6.643 4.72e-08 ***
## supplementsupersupp  -0.7274     0.5090  -1.429     0.16    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.247 on 42 degrees of freedom
## Multiple R-squared:  0.8531, Adjusted R-squared:  0.8356 
## F-statistic: 48.76 on 5 and 42 DF,  p-value: < 2.2e-16

#c) Assess graphically if there exists an interaction between both factors 
#d) Given what you learnt about the interaction, what would be a better model? 
interaction.plot(farm_diets$diet, farm_diets$supplement, farm_diets$gain)

interaction.plot(farm_diets$supplement, farm_diets$diet, farm_diets$gain)

#e) What is the variation explained (R2) of this new model? #no apparent interaction based on plots
anova(lm(gain~diet+supplement, data=farm_diets))

## Analysis of Variance Table
## 
## Response: gain
##            Df  Sum Sq Mean Sq F value    Pr(>F)    
## diet        2 287.171 143.586  92.358 4.199e-16 ***
## supplement  3  91.881  30.627  19.700 3.980e-08 ***
## Residuals  42  65.296   1.555                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

summary(lm(gain~diet+supplement, data=farm_diets))

## 
## Call:
## lm(formula = gain ~ diet + supplement, data = farm_diets)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.30792 -0.85929 -0.07713  0.92052  2.90615 
## 
## Coefficients:
##                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          26.1230     0.4408  59.258  < 2e-16 ***
## dietoats             -3.0928     0.4408  -7.016 1.38e-08 ***
## dietwheat            -5.9903     0.4408 -13.589  < 2e-16 ***
## supplementcontrol    -2.6967     0.5090  -5.298 4.03e-06 ***
## supplementsupergain  -3.3815     0.5090  -6.643 4.72e-08 ***
## supplementsupersupp  -0.7274     0.5090  -1.429     0.16    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.247 on 42 degrees of freedom
## Multiple R-squared:  0.8531, Adjusted R-squared:  0.8356 
## F-statistic: 48.76 on 5 and 42 DF,  p-value: < 2.2e-16

#f) Perform a Tukey HSD test of this new model
TukeyHSD(aov(gain~diet+supplement, data=farm_diets))

##   Tukey multiple comparisons of means
##     95% family-wise confidence level
## 
## Fit: aov(formula = gain ~ diet + supplement, data = farm_diets)
## 
## $diet
##                   diff       lwr       upr p adj
## oats-barley  -3.092817 -4.163817 -2.021817 0e+00
## wheat-barley -5.990298 -7.061298 -4.919297 0e+00
## wheat-oats   -2.897481 -3.968481 -1.826481 2e-07
## 
## $supplement
##                           diff        lwr        upr     p adj
## control-agrimore    -2.6967005 -4.0583332 -1.3350677 0.0000234
## supergain-agrimore  -3.3814586 -4.7430914 -2.0198259 0.0000003
## supersupp-agrimore  -0.7273521 -2.0889849  0.6342806 0.4888738
## supergain-control   -0.6847581 -2.0463909  0.6768746 0.5400389
## supersupp-control    1.9693484  0.6077156  3.3309811 0.0020484
## supersupp-supergain  2.6541065  1.2924737  4.0157392 0.0000307

#5 Correlation

##5.1. In a study of hyena laughter, a researcher investigated whether sound spectral properties of hyenas’ giggles are associated with age. #Age(years): 2,2,2,6,9,10,13,10,14,14,12,7,11,11,14,20 #Frequency (Hz): 840,670,580,470,540,660,510,520,500,480,400,650,460,500,580,500 #a) Inspect the data using a scatterplot #b) Test the linear association between both variables #c) Assume that the data are not normally distributed, test the linear association using a non-parametric correlation coefficient

age<-c(2,2,2,6,9,10,13,10,14,14,12,7,11,11,14,20)
freq<-c(840,670,580,470,540,660,510,520,500,480,400,650,460,500,580,500)
plot(age, freq)

cor.test(age, freq, method="pearson")

## 
##  Pearson's product-moment correlation
## 
## data:  age and freq
## t = -2.8194, df = 14, p-value = 0.01365
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.8453293 -0.1511968
## sample estimates:
##       cor 
## -0.601798

cor.test(freq, age, method="pearson")

## 
##  Pearson's product-moment correlation
## 
## data:  freq and age
## t = -2.8194, df = 14, p-value = 0.01365
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.8453293 -0.1511968
## sample estimates:
##       cor 
## -0.601798

#6 - Regression

##6.1. Testosterone is known to predict aggressive behavior # and is involved in face shape during puberty. Does face # shape predict aggression? Researchers measured the face # width-to-height ratio of 21 university hockey players with # the average number of penalty minutes awarded per game for # aggressive infractions. The data are available in face.txt. # #a) How steeply does the number of penalty minutes increase # per unit increase in face ratio? Calculate the estimate of # the intercept. Write the result in the form of an equation # for the line. #b) How many degrees of freedom does this # analysis have? #c) What is the t-statistic? #d) What is your # final conclusion about the slope? #e) What is the variation # explained, R2? #f) Make a scatterplot of the data and include # a linear regression line #g) Check the model assumptions

testosterone<-read.table("face.txt", sep="\t", header=T)
testo_model <- lm(Penalty~Face, data=testosterone)
summary(testo_model)

## 
## Call:
## lm(formula = Penalty ~ Face, data = testosterone)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -0.9338 -0.3883 -0.1260  0.3381  1.0711 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)  
## (Intercept)   -4.505      2.089  -2.157   0.0440 *
## Face           3.189      1.150   2.774   0.0121 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.6129 on 19 degrees of freedom
## Multiple R-squared:  0.2882, Adjusted R-squared:  0.2508 
## F-statistic: 7.694 on 1 and 19 DF,  p-value: 0.0121

# a) Summary() says that the number of penalty minutes increase per unit increase in face shape is 3.189; the Y-intercept is -4.505; thus, the equation for the line is y = 3.189(x) - 4.505
# b) This analysis has 19 degrees of freedom
# c) The t-statistic is -2.157 on the y-intercept side and 2.774 on the other side of the graph
# d) the slope is a gradual positive slope
# e) What is the variation explained, R2? multiple R-squared = 0.2882 which is in penalty minuets and explained by face shape.
# f-g) 

plot(testosterone$Face, testosterone$Penalty) + abline(testo_model, col="red", lwd=3)

## integer(0)

performance::check_model(testo_model, k=2)

##6.2. Respiratory rate (Y) is expected to depend on body mass (X) #by the power law, Y=aX??, where ?? is the scaling exponent. #The data are available in respiratoryrate_bodymass.txt. #a) Make a scatterplot of the raw data #b) Make a scatterplot of the linearized relationship. Which transformation did you use? linear model and log transformed lm #c) Use linear regression to estimate B #d) Carry out a formal test of the null hypothesis of ??=0 #e) What is the variation explained, R2? #f) Check the model assumptions

resp_rate <- read.table("respiratoryrate_bodymass.txt", sep="\t", header=T)
plot(resp_rate$BodyMass, resp_rate$Respiration) #a

plot(resp_rate$BodyMass, resp_rate$Respiration, log = "xy") #b) linear model & log transformed linear model

lm(log(Respiration) ~ log(BodyMass), data = resp_rate) #b

## 
## Call:
## lm(formula = log(Respiration) ~ log(BodyMass), data = resp_rate)
## 
## Coefficients:
##   (Intercept)  log(BodyMass)  
##        1.3401         0.8574

plot(lm(log(Respiration) ~ log(BodyMass), data = resp_rate)) #b

summary(lm(log(Respiration) ~ log(BodyMass), data = resp_rate)) #c) linear regression to estimate B. B = 0.99

## 
## Call:
## lm(formula = log(Respiration) ~ log(BodyMass), data = resp_rate)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.01062 -0.08150  0.00831  0.30669  0.43948 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     1.3401     1.2011   1.116 0.296938    
## log(BodyMass)   0.8574     0.1570   5.461 0.000601 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.4449 on 8 degrees of freedom
## Multiple R-squared:  0.7885, Adjusted R-squared:  0.762 
## F-statistic: 29.82 on 1 and 8 DF,  p-value: 0.000601

t.test(x=(log(resp_rate$BodyMass)), y= (log(resp_rate$Respiration)), data = resp_rate) #2d) formal test of null hypothesis of B=0

## 
##  Welch Two Sample t-test
## 
## data:  (log(resp_rate$BodyMass)) and (log(resp_rate$Respiration))
## t = -0.61922, df = 17.978, p-value = 0.5435
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -1.1293946  0.6152354
## sample estimates:
## mean of x mean of y 
##  7.596998  7.854078

# e) variation explained, R2? From summary(), multiple R-squared = 0.7885 or 78.85% of the data can be explained by the graph.
# f) Check the model assumptions
performance::check_model(lm(log(Respiration) ~ log(BodyMass), data = resp_rate))

*********************** #7 - Advanced Regression —

##7.1.Use the ozone.txt data to model the ozone concentration as a linear # function of wind speed, air temperature and the intensity of solar radiation. # Assume that the requirements to perform a linear regression are met. # a) Make a multiple panel bivariate scatterplot # b) Perform a multiple linear regression # c) What is the variation explained, R2? # d) Assess the collinearity of the explanatory variables using the variance # inflation factor # e) Check the model assumptions

ozone <- read.table("ozone.txt", sep="\t", header=T)

pairs(ozone) # a) Make a multiple panel bivariate scatterplot

ozoneconcentration <- lm (ozone ~ wind+temp+rad, data=ozone) # b) multiple linear regression

summary(ozoneconcentration) # c) variation explained, R2? From summary(), multiple R-squared = 0.6062 or 60.62%; Adjusted R-squared = 0.5952 or 59.52%

## 
## Call:
## lm(formula = ozone ~ wind + temp + rad, data = ozone)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -40.485 -14.210  -3.556  10.124  95.600 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -64.23208   23.04204  -2.788  0.00628 ** 
## wind         -3.33760    0.65384  -5.105 1.45e-06 ***
## temp          1.65121    0.25341   6.516 2.43e-09 ***
## rad           0.05980    0.02318   2.580  0.01124 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 21.17 on 107 degrees of freedom
## Multiple R-squared:  0.6062, Adjusted R-squared:  0.5952 
## F-statistic: 54.91 on 3 and 107 DF,  p-value: < 2.2e-16

# d) Assess the collinearity of the explanatory variables using the variance inflation factor
croc<-relaimpo::calc.relimp(ozoneconcentration)
plot(croc)

ozoneconcentration.two <- lm (ozone ~ wind+temp, data=ozone)
car::vif(ozoneconcentration.two)

##     wind     temp 
## 1.328293 1.328293

car::vif(ozoneconcentration)

##     wind     temp      rad 
## 1.328979 1.431201 1.095241

performance::check_model(ozoneconcentration.two)# e) Check the model assumptions

performance::check_model(ozoneconcentration) # e) Check the model assumptions

##7.2. Use the diminish.txt data (xv is explanatory, yv is response variable) to: # a) Perform a simple linear regression b) Perform a polynomial (second-degree) regression # c) Compare both models with Akaike’s Information Criterion (AIC). Which model is better? # d) Make a scatterplot of the data and include both regression lines

diminish <- read.table("diminish.txt", sep="\t", header=T)

diminish.lm <- lm(yv ~ xv, data=diminish) # a) simple linear regression
summary(diminish.lm)

## 
## Call:
## lm(formula = yv ~ xv, data = diminish)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.3584 -2.1206 -0.3218  1.8763  4.1782 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 27.61438    1.17315   23.54 7.66e-14 ***
## xv           0.22683    0.02168   10.46 1.45e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.386 on 16 degrees of freedom
## Multiple R-squared:  0.8725, Adjusted R-squared:  0.8646 
## F-statistic: 109.5 on 1 and 16 DF,  p-value: 1.455e-08

diminish.poly <- lm(yv ~ xv + I(xv^2), data=diminish) # b) polynomial (second-degree) regression
summary(diminish.poly)

## 
## Call:
## lm(formula = yv ~ xv + I(xv^2), data = diminish)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -4.0490 -1.2890  0.6018  1.1829  2.9610 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 24.6323529  1.6916139  14.561 2.95e-10 ***
## xv           0.4057534  0.0819860   4.949 0.000175 ***
## I(xv^2)     -0.0018834  0.0008386  -2.246 0.040203 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.131 on 15 degrees of freedom
## Multiple R-squared:  0.9046, Adjusted R-squared:  0.8919 
## F-statistic: 71.12 on 2 and 15 DF,  p-value: 2.222e-08

AIC(diminish.lm, diminish.poly) # c) Compare models AIC(); polynomial model is better because it is lower by ~ 3 points

##               df      AIC
## diminish.lm    3 86.26193
## diminish.poly  4 83.04403

plot(diminish$xv, diminish$yv, xlab="xv", ylab="yv") + 
  abline(diminish.lm, col="red", lwd=3) + 
  lines(diminish$xv, predict(diminish.poly), col="blue", lwd=3) # d) scatterplot of data w/ both regression lines

## integer(0)

##7.3. Stork data - logistic regression. The data in stork.txt display the # stress-induced corticosterone levels circulating in the blood of European # white storks and their survival over the subsequent five years of study. # #a) Make a scatterplot of the data #b) Which type of regression model is # suitable for these data? # c) Perform an appropriate regression to predict survival from corticosterone # d) What is the pseudo-R2 of the model? # e) What is the p-value of the model? # f) Include the predicted curve in the scatterplot # g) Check the model assumptions

my_stork <- read.table('stork.txt', header = 1)
plot(my_stork) # a) scatterplot # b) needs a logistic or binomal regression

m_s_glm <- glm(Survival ~ Corticosterone, data = my_stork, family = binomial(logit)) # c) predict survival from corticosterone
summary(m_s_glm)

## 
## Call:
## glm(formula = Survival ~ Corticosterone, family = binomial(logit), 
##     data = my_stork)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.4316  -0.8724  -0.6703   1.2125   1.8211  
## 
## Coefficients:
##                Estimate Std. Error z value Pr(>|z|)  
## (Intercept)     2.70304    1.74725   1.547   0.1219  
## Corticosterone -0.07980    0.04368  -1.827   0.0677 .
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 45.234  on 33  degrees of freedom
## Residual deviance: 41.396  on 32  degrees of freedom
## AIC: 45.396
## 
## Number of Fisher Scoring iterations: 4

1-(m_s_glm$deviance/m_s_glm$null.deviance) # d) pseudo-R-squared = 0.084852

## [1] 0.084852

anova(m_s_glm, test='Chisq') # e) p-value of the model = 0.0501

## Analysis of Deviance Table
## 
## Model: binomial, link: logit
## 
## Response: Survival
## 
## Terms added sequentially (first to last)
## 
## 
##                Df Deviance Resid. Df Resid. Dev Pr(>Chi)  
## NULL                              33     45.234           
## Corticosterone  1   3.8382        32     41.396   0.0501 .
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

ggplot(aes(x=Corticosterone, y=Survival), data=m_s_glm)+
  geom_point()+
  geom_smooth(method=glm, method.args = list(family = "binomial"), se=T) # f) fancier way to predicted curve

## `geom_smooth()` using formula 'y ~ x'

performance::check_model(m_s_glm) # g) check model

##7.4. The clusters.txt dataset contains the response variable Cancers (the # number of reported cancer cases per year per clinic) and the explanatory # variable Distance (the distance from a nuclear plant to the clinic in km). # a) Make a scatterplot of the data b) Which regression is the more appropriate # for these data? (Don’t take overdispersion into account for now) # c) Given your choice, is the trend significant? # d) What is the pseudo-R2 of the model? # e) What is the p-value of the model? # f) Include the predicted relationship from the model in the scatterplot # g) Do you think there might be some evidence of overdispersion? # h) Perform a new generalized linear model with a distribution that better accounts # for overdispersion # i) Check this last model assumptions

clusters <- read.table('clusters.txt', sep="\t", header = T)

plot(clusters) # a) scatterplot

hist(clusters$Cancers) # b) poisson regression needed for count data

clusters.glm <- glm(Cancers ~ Distance, poisson, clusters)

summary(clusters.glm) # c) negative trend is not significant p=0.0941 is 0.1 sig

## 
## Call:
## glm(formula = Cancers ~ Distance, family = poisson, data = clusters)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.5504  -1.3491  -1.1553   0.3877   3.1304  
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)  
## (Intercept)  0.186865   0.188728   0.990   0.3221  
## Distance    -0.006138   0.003667  -1.674   0.0941 .
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for poisson family taken to be 1)
## 
##     Null deviance: 149.48  on 93  degrees of freedom
## Residual deviance: 146.64  on 92  degrees of freedom
## AIC: 262.41
## 
## Number of Fisher Scoring iterations: 5

1-(clusters.glm$deviance/clusters.glm$null.deviance) # d) PseudoR^2 = 0.019

## [1] 0.01900423

anova(clusters.glm, test='Chisq') # e) p-value of model is 0.0919

## Analysis of Deviance Table
## 
## Model: poisson, link: log
## 
## Response: Cancers
## 
## Terms added sequentially (first to last)
## 
## 
##          Df Deviance Resid. Df Resid. Dev Pr(>Chi)  
## NULL                        93     149.48           
## Distance  1   2.8408        92     146.64   0.0919 .
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

ggplot(aes(x=Distance, y=Cancers), data=clusters.glm)+
  geom_point()+
  geom_smooth(method=glm, method.args = list(family = "poisson"), se=T) # f)

## `geom_smooth()` using formula 'y ~ x'

performance::check_overdispersion(clusters.glm) # g) over-dispersion is detected & in the plot, the predicted relationship misses a lot of the data points above 2 and below 1 cancers

## # Overdispersion test
## 
##        dispersion ratio =   1.555
##   Pearson's Chi-Squared = 143.071
##                 p-value =   0.001

## Overdispersion detected.

clusters.glm.nb <- MASS::glm.nb(Cancers ~ Distance, data = clusters) # h) glm that includes a negative binomial estimation to account for over dispersion
summary(clusters.glm.nb) # i) model assumptions summary says that the dispersion parameter for the nb is 1.3595 family taken to be 1

## 
## Call:
## MASS::glm.nb(formula = Cancers ~ Distance, data = clusters, init.theta = 1.359466981, 
##     link = log)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.3103  -1.1805  -1.0442   0.3065   1.9582  
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)
## (Intercept)  0.182490   0.252434   0.723    0.470
## Distance    -0.006041   0.004727  -1.278    0.201
## 
## (Dispersion parameter for Negative Binomial(1.3595) family taken to be 1)
## 
##     Null deviance: 96.647  on 93  degrees of freedom
## Residual deviance: 94.973  on 92  degrees of freedom
## AIC: 253.19
## 
## Number of Fisher Scoring iterations: 1
## 
## 
##               Theta:  1.359 
##           Std. Err.:  0.612 
## 
##  2 x log-likelihood:  -247.191

performance::check_model(clusters.glm.nb)

##7.5. Use the jaws.txt data to: # a) Make a scatterplot of the data (age explanatory, bone response) # b) Perform a non-linear regression assuming an asymptotic exponential relationship eq. 1: y=a(1-e^(-cx)) # c) Perform a non-linear regression assuming a Michaelis-Menten model eq. 2: y=(ax)/(1+bx) # d) Estimate the percentage of variation explained by both models # (comparing them with a null model with only a constant) # e) Compare both models with Akaike’s Information Criterion (AIC). Which model is better? # f) Make a scatterplot of the data and include both regression lines

jaws <- read.table('jaws.txt', sep="\t", header = T)

GGally::ggpairs(jaws) # a) scatterplot (located in the bottom left) was made by ggpairs

## Registered S3 method overwritten by 'GGally':
##   method from   
##   +.gg   ggplot2

jaws.nls <- nls(bone~a*(1 - exp(-c*age)), start=list(a=100, c = 0.1), jaws) # b) non linear regression with assumed eq 1 

jaws.nls.m <- nls(bone~(a*age)/(1+b*age), start=list(a=100, b = 0.1), jaws) # c) non linear regression with assumed eq 2

# d) to find the estimated % of variation explained, calculate by: (100*(tss-rss)/tss) where tss= total sum of squares and rss= residual sum of squares
summary(nls(bone~a*(1 - exp(-c*age)), start=list(a=100, c = 0.1), jaws)) # eq 1, residual standard error=13.1 on 52 degrees of freedom

## 
## Formula: bone ~ a * (1 - exp(-c * age))
## 
## Parameters:
##    Estimate Std. Error t value Pr(>|t|)    
## a 115.58059    2.84365  40.645  < 2e-16 ***
## c   0.11882    0.01233   9.635 3.69e-13 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 13.1 on 52 degrees of freedom
## 
## Number of iterations to convergence: 4 
## Achieved convergence tolerance: 4.025e-06

rss1 <- 13.1^2*52 # rss = rse^2 * degrees of freedom for eq 1
summary(nls(bone~(a*age)/(1+b*age), start=list(a=100, b = 0.1), jaws)) # eq 2, residual standard error=13.77 on 52 degrees of freedom

## 
## Formula: bone ~ (a * age)/(1 + b * age)
## 
## Parameters:
##   Estimate Std. Error t value Pr(>|t|)    
## a 18.72524    2.52585   7.413 1.09e-09 ***
## b  0.13596    0.02339   5.814 3.79e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 13.77 on 52 degrees of freedom
## 
## Number of iterations to convergence: 6 
## Achieved convergence tolerance: 7.785e-06

rss2 <- 13.77^2*52 # rss = rse^2 * degrees of freedom for eq 2
null<-lm(bone~1, data=jaws) # to find tss, fit a null model 
summary(null) # the residual standard error is 33.37 on 53 degrees of freedom

## 
## Call:
## lm(formula = bone ~ 1, data = jaws)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -93.979  -8.844   9.872  21.775  48.021 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   93.979      4.541    20.7   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 33.37 on 53 degrees of freedom

tss <- 33.37^2*53 # tss = null rse^2 * degrees of freedom
100*(tss-rss1)/tss # d) calculate the estimated % of variation explained for eq 1 = 84.8798

## [1] 84.8798

100*(tss-rss2)/tss # d) calculate the estimated % of variation explained for eq 2 = 83.2936

## [1] 83.2936

AIC(nls(bone~a*(1 - exp(-c*age)), start=list(a=100, c = 0.1), jaws)) # e) AIC for eq 1 = 435.0823 (eq 1 is better because the 435 value is smaller than 440 value)

## [1] 435.0823

AIC(nls(bone~(a*age)/(1+b*age), start=list(a=100, b = 0.1), jaws)) # e) AIC for eq 2 = 440.4066

## [1] 440.4066

plot(jaws$age, jaws$bone) # f) scatterplot of data

plot(jaws$age, jaws$bone) +
  lines(seq(0, 50, 0.1), predict(jaws.nls, list(age=seq(0, 50, 0.1)),
      type="response"), col="red", lwd=3) # f) scatterplot regression line 1

## integer(0)

plot(jaws$age, jaws$bone)+
  lines(seq(0, 50, 0.1), predict(jaws.nls.m, list(age=seq(0, 50, 0.1)),
    type="response"), col="tan", lwd=3) # f) scatterplot regression line 2

## integer(0)

plot(jaws$age, jaws$bone)+lines(seq(0, 50, 0.1), predict(jaws.nls, list(age=seq(0, 50, 0.1)), type="response"), col="red", lwd=3)+
  lines(seq(0, 50, 0.1), predict(jaws.nls.m, list(age=seq(0, 50, 0.1)),
        type="response"), col="tan", lwd=3)  # plot with both lines

## integer(0)

##7.6. In a recent paper we read: “Linear mixed effects modelling fit by restricted maximum likelihood was used to explain the variations in growth. The linear mixed effects model was generated using the lmer function in the R package lme4, with turbidity, temperature, tide, and wave action set as fixed factors and site and date set as random effects”. Write down the R code to recreate their model.

# using rats data
rats<-read.table("rats.txt", sep="\t", header=T)
rats$Treatment<-factor(rats$Treatment) #transform Treatment type into a factor
rats$Rat<-factor(rats$Rat) #transform which rat under the knife into a factor
rats$Liver<-factor(rats$Liver) #transform liver part into a factor
rats$Rat_wTreatment<-rats$Treatment:rats$Rat # the treatments varied, so the unique factor is created 
rats$Liver_wRat<-rats$Treatment:rats$Rat:rats$Liver # 2 rats had their liver cut in threes, so the random  effect is a unique factors of each liver piece 

rats.lme<-lme4::lmer(Glycogen~Treatment+(1|Rat_wTreatment)+(1|Liver_wRat), data=rats)
summary(rats.lme)

## Linear mixed model fit by REML ['lmerMod']
## Formula: Glycogen ~ Treatment + (1 | Rat_wTreatment) + (1 | Liver_wRat)
##    Data: rats
## 
## REML criterion at convergence: 219.6
## 
## Scaled residuals: 
##      Min       1Q   Median       3Q      Max 
## -1.48212 -0.47263  0.03062  0.42934  1.82935 
## 
## Random effects:
##  Groups         Name        Variance Std.Dev.
##  Liver_wRat     (Intercept) 14.17    3.764   
##  Rat_wTreatment (Intercept) 36.06    6.005   
##  Residual                   21.17    4.601   
## Number of obs: 36, groups:  Liver_wRat, 18; Rat_wTreatment, 6
## 
## Fixed effects:
##             Estimate Std. Error t value
## (Intercept)  140.500      4.707  29.848
## Treatment2    10.500      6.657   1.577
## Treatment3    -5.333      6.657  -0.801
## 
## Correlation of Fixed Effects:
##            (Intr) Trtmn2
## Treatment2 -0.707       
## Treatment3 -0.707  0.500

anova(rats.lme)

## Analysis of Variance Table
##           npar Sum Sq Mean Sq F value
## Treatment    2 123.99  61.996   2.929

##7.7. Researchers at the University of Arizona want to assess the germination rate of saguaros using a factorial design, with 3 levels of soil type (remnant, cultivated and restored) and 2 levels of sterilization (yes or no). The same experimental design was deployed in 4 different greenhouses. Each of the unique treatments was replicated in 5 pots. 6 seeds were planted in each pot. a) How many fixed treatments (unique combinations) exist? b) What is the total number of pots? c) What is the total number of plants measured? d) Write the R code that you would use to analyze these data

3*2 # a) with 3 levels of soil type and 2 levels of sterilization = 6 unique combinations

## [1] 6

6*5*4 # b) each of the 6 treatment combos had 5 pots and 4 greenhouses = 120 pots

## [1] 120

6*120 # c) 6 unique combinations in 120 pots = 720 plants

## [1] 720

# glmer(germination ~ soil * sterilization + (1|Greenhouse) + (1|Greenhouse:Pot), family=binomial, data = problem) # d) R code to analyze data (i.e., AB notes: Notice that there are 2 fixed treatments (with interaction) and 2 nested random effects. The model should be a logistic GLMM because germination is a proportion (between 0 and 1).)

##7.8. Check these hypothetical data from 4 subjects relating number of previous performances to negative affect. [see graph]

What does the thick dashed black line represent?
- The thick black dashed line is a simple linear regression trend line increasing performances, increases negative affect
What is depicting the solid black line?
- the solid black line is the average of the four dashed trendlines representing the random variables
What do the 4 thin dashed lines represent?
- The four thin black dashed lines show the random-intercept levels for a mixed effects model

##7.9. Load dragons.RData # a) Perform a simple linear regression with testScore as response and bodyLength as explanatory # b) Plot the data with ggplot2 and add a linear regression line with confidence # intervals. Hint: geom_smooth() # c) We collected multiple samples from 8 mountain ranges. Generate a # boxplot using (or not) ggplot2 to explore this new explanatory variable # d) Now repeat the scatterplot in b) but coloring by mountain range and without # linear regression line # e) Instead of coloring, use facet_wrap() to separate by mountain range # f) Perform a new linear model adding mountain range and assuming that it is fixed effects # g) Perform the same linear model as before but now assuming mountain range is random effects # h) How much of the variation in test scores is explained by the random effect (mountain range) # i) Estimate the R2 (conditional and marginal) of the mixed-effects model # j) Check this last model assumptions # k) Include the fitted lines for each mountain range (plot from e). [Hint: predict function within geom_line layer]

load("dragons.RData")
drag <- lm(testScore ~ bodyLength, data = dragons)# a) simple linear regression; response=testScore, explanatory=bodyLength  
summary(drag)

## 
## Call:
## lm(formula = testScore ~ bodyLength, data = dragons)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -56.962 -16.411  -0.783  15.193  55.200 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -61.31783   12.06694  -5.081 5.38e-07 ***
## bodyLength    0.55487    0.05975   9.287  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 21.2 on 478 degrees of freedom
## Multiple R-squared:  0.1529, Adjusted R-squared:  0.1511 
## F-statistic: 86.25 on 1 and 478 DF,  p-value: < 2.2e-16

ggplot(aes(y=testScore, x=bodyLength), data=dragons)+geom_point()+  
  geom_smooth(method="lm", se=T)+  theme_bw() # b) plot with linear regression line & confidence interval

## `geom_smooth()` using formula 'y ~ x'

ggplot(data = dragons, mapping = aes(y = testScore, x = mountainRange, fill = mountainRange))+
  geom_boxplot()+labs(title = "Dragon Test Score and Mountain Range")+
  theme_bw()+theme(panel.grid = element_blank(),legend.position = "none") # c) boxplot to explore new explanatory variable

ggplot(aes(y=testScore, x=bodyLength, color = mountainRange), data=dragons)+
  geom_point()+theme_bw()+labs(title = "Dragon Test Score and Body Length by Mountain Range",
       color = "Mountain Range")+theme(panel.grid = element_blank(),
        legend.position = "bottom") # d) scatterplot in b with the samples from each mountain range colored

ggplot(aes(y=testScore, x=bodyLength, color = mountainRange), data=dragons)+
  geom_point()+theme_bw()+labs(title = "Dragon Test Score and Body Length by Mountain Range",
       color = "Mountain Range")+theme(panel.grid = element_blank(),
        legend.position = "none")+facet_wrap(.~mountainRange) # e) scatterplots by each mountainrange unsing facet_wrap()

mr <- lm(testScore ~ bodyLength + mountainRange, data = dragons)
summary(mr) # f) perform new linear model assuming moountain ranges as a fixed effect

## 
## Call:
## lm(formula = testScore ~ bodyLength + mountainRange, data = dragons)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -52.263  -9.926   0.361   9.994  44.488 
## 
## Coefficients:
##                       Estimate Std. Error t value Pr(>|t|)    
## (Intercept)           20.83051   14.47218   1.439  0.15072    
## bodyLength             0.01267    0.07974   0.159  0.87379    
## mountainRangeCentral  36.58277    3.59929  10.164  < 2e-16 ***
## mountainRangeEmmental 16.20923    3.69665   4.385 1.43e-05 ***
## mountainRangeJulian   45.11469    4.19012  10.767  < 2e-16 ***
## mountainRangeLigurian 17.74779    3.67363   4.831 1.84e-06 ***
## mountainRangeMaritime 49.88133    3.13924  15.890  < 2e-16 ***
## mountainRangeSarntal  41.97841    3.19717  13.130  < 2e-16 ***
## mountainRangeSouthern  8.51961    2.73128   3.119  0.00192 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 14.96 on 471 degrees of freedom
## Multiple R-squared:  0.5843, Adjusted R-squared:  0.5773 
## F-statistic: 82.76 on 8 and 471 DF,  p-value: < 2.2e-16

mr2 <- lme4::lmer(testScore ~ bodyLength + (1|mountainRange), data = dragons)
summary(mr2) # g) perform new linear model assuming moountain ranges as a random effect

## Linear mixed model fit by REML ['lmerMod']
## Formula: testScore ~ bodyLength + (1 | mountainRange)
##    Data: dragons
## 
## REML criterion at convergence: 3991.2
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -3.4815 -0.6513  0.0066  0.6685  2.9583 
## 
## Random effects:
##  Groups        Name        Variance Std.Dev.
##  mountainRange (Intercept) 339.7    18.43   
##  Residual                  223.8    14.96   
## Number of obs: 480, groups:  mountainRange, 8
## 
## Fixed effects:
##             Estimate Std. Error t value
## (Intercept) 43.70938   17.13489   2.551
## bodyLength   0.03316    0.07865   0.422
## 
## Correlation of Fixed Effects:
##            (Intr)
## bodyLength -0.924

# h) How much of the variation in test scores is explained by the random effect (mountain range) 
100*(339.7/(339.7+223.8)) # 100*((random effects mountain range intercept variance)/((random effects mountain range intercept variance)+(random effects residual intercept variance))) = 60.28% of variation is explained with mountainRange

## [1] 60.28394

performance::r2(mr2) # i) the mixed-effects model R2 (conditional)= 0.603, R2 (marginal)= 5.128e-4

## # R2 for Mixed Models
## 
##   Conditional R2: 0.603
##      Marginal R2: 0.001

performance::check_model(mr2) # j) Check this last model assumptions

predict <- data.frame(dragons, "predicted"=c(predict(mr2))) # set up the predicted function data frame

ggplot(aes(y=testScore, x=bodyLength, color = mountainRange), data=dragons)+
  geom_point()+
  geom_line(data = predict, aes(x=bodyLength, y=predicted), color = "black")+
  theme_bw()+
  labs(title = "Dragon Test Score and Body Length by Mountain Range",
       color = "Mountain Range")+
  theme(panel.grid = element_blank(),
        legend.position = "none")+
  facet_wrap(.~mountainRange) # k) fitted lines for each mountain range

##7.10. Data in Estuaries.csv correspond to counts of invertebrates at 3-4 sites in each of 7 (randomly chosen) estuaries. a) Fit a linear mixed model with Total as response and Modification as explanatory, controlling for Estuary b) Estimate the R2 (conditional and marginal) of this model c) Plot the data with ggplot2 in a way that helps you understand the different effects d) Include the variable Site as a random effect. Do you think this corresponds to a crossed or a nested design? # e) What are the R2 (conditional and marginal) of the model including Site # f) Check the model assumptions g) Plot the data trying to include Site # h) Transform the variable Hydroid to presence/absence data # i) Fit a generalized linear mixed model (GLMM) with this transformed variable as # Response and the same fixed and random effects as in d). [Hint: function glmer] # j) Check the model assumptions

estuaries <- read.csv("Estuaries.csv", header = T)

estu.lmer<-lme4::lmer(Total ~ Modification + (1|Estuary), data=estuaries) # a) linear mixed model, response=Total, explanatory=Modification, controlling for Estuary
summary(estu.lmer)

## Linear mixed model fit by REML ['lmerMod']
## Formula: Total ~ Modification + (1 | Estuary)
##    Data: estuaries
## 
## REML criterion at convergence: 394.7
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -2.3859 -0.7142  0.2766  0.5240  2.0456 
## 
## Random effects:
##  Groups   Name        Variance Std.Dev.
##  Estuary  (Intercept) 55.12    7.424   
##  Residual             86.07    9.277   
## Number of obs: 54, groups:  Estuary, 7
## 
## Fixed effects:
##                      Estimate Std. Error t value
## (Intercept)            40.973      4.727   8.668
## ModificationPristine  -14.473      6.230  -2.323
## 
## Correlation of Fixed Effects:
##             (Intr)
## MdfctnPrstn -0.759

performance::r2(estu.lmer) # b) Conditional R2: 0.553, Marginal R2: 0.267

## # R2 for Mixed Models
## 
##   Conditional R2: 0.553
##      Marginal R2: 0.267

ggplot2::ggplot(estuaries, aes(x = Modification, y = Total))+geom_boxplot()+geom_jitter()+
  facet_wrap(~Estuary) # c) plot understanding the different effects

estu.lmer.site<-lme4::lmer(Total ~ Modification + (1|Estuary/Site), 
    data=estuaries) # d) variable site as random effect; corresponds as nested design
summary(estu.lmer.site)

## Linear mixed model fit by REML ['lmerMod']
## Formula: Total ~ Modification + (1 | Estuary/Site)
##    Data: estuaries
## 
## REML criterion at convergence: 386.6
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -1.8686 -0.6687  0.1504  0.6505  1.9816 
## 
## Random effects:
##  Groups       Name        Variance Std.Dev.
##  Site:Estuary (Intercept) 49.85    7.061   
##  Estuary      (Intercept) 47.59    6.899   
##  Residual                 43.65    6.607   
## Number of obs: 54, groups:  Site:Estuary, 27; Estuary, 7
## 
## Fixed effects:
##                      Estimate Std. Error t value
## (Intercept)            41.053      4.739   8.662
## ModificationPristine  -14.553      6.232  -2.335
## 
## Correlation of Fixed Effects:
##             (Intr)
## MdfctnPrstn -0.760

performance::r2(estu.lmer.site) # e) Conditional R2: 0.774, Marginal R2: 0.270 for mixed models

## # R2 for Mixed Models
## 
##   Conditional R2: 0.774
##      Marginal R2: 0.270

performance::check_model(estu.lmer.site) # f) check model assumptions

ggplot2::ggplot(estuaries, aes(x = Modification, y = Total))+geom_boxplot()+
  geom_jitter(aes(color=as.factor(Site)))+
  facet_wrap(~Estuary) # g) plot data including the site

estuaries$HydroidPres <- estuaries$Hydroid > 0 # h) transform Hydroid variable to presence/absence data
hydro.glmm.site<-lme4::glmer(HydroidPres ~ Modification + 
  (1|Estuary/Site), family=binomial, data=estuaries) # i) fit a glmm with transformed variable
summary(hydro.glmm.site)

## Generalized linear mixed model fit by maximum likelihood (Laplace
##   Approximation) [glmerMod]
##  Family: binomial  ( logit )
## Formula: HydroidPres ~ Modification + (1 | Estuary/Site)
##    Data: estuaries
## 
##      AIC      BIC   logLik deviance df.resid 
##     57.1     65.0    -24.5     49.1       50 
## 
## Scaled residuals: 
##      Min       1Q   Median       3Q      Max 
## -1.06063 -0.25028 -0.05443  0.25475  0.98719 
## 
## Random effects:
##  Groups       Name        Variance Std.Dev.
##  Site:Estuary (Intercept) 14.45    3.802   
##  Estuary      (Intercept)  1.13    1.063   
## Number of obs: 54, groups:  Site:Estuary, 27; Estuary, 7
## 
## Fixed effects:
##                      Estimate Std. Error z value Pr(>|z|)  
## (Intercept)            -5.710      2.419  -2.361   0.0182 *
## ModificationPristine    6.534      3.140   2.081   0.0375 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##             (Intr)
## MdfctnPrstn -0.888

performance::r2(hydro.glmm.site)

## # R2 for Mixed Models
## 
##   Conditional R2: 0.888
##      Marginal R2: 0.358

performance::check_model(hydro.glmm.site) # j) check model assumptions

#8 - Time series and spatial analysis

##8.1. Write a function to calculate the 3-point moving average for an input vector. The formula is: y_i=((y_i-1)+(y_i)+(y_i+1))/3

pntavg<- function (x) {
y<-numeric(length(x)-2)
for (i in 2:(length(x)-1)) {
y[i]<-(x[i-1]+x[i]+x[i+1])/3
}
y
}

##8.2. Read the temp.txt file. The data correspond to monthly average temperatures. # a) Plot the time series data. [Hint: first you need to create a monthly time series object] # b) Calculate the 5-point moving average and plot it together with the time series # c) Decompose the time series into seasonal, trend and residual error components # d) Generate a temporal correlogram to assess the autocorrelation of the time series # e) Generate a new correlogram but removing the trend and seasonal variation # f) Generate a partial correlogram plot # g) Find the best ARIMA model using the forecast package # h) Estimate future values using the previous ARIMA model and plot the results

temp <- read.table("temp.txt", sep="\t", header=T) 
temp.ts <- ts(temp$temps, frequency = 12) # a) create monthly time series of temp
plot(temp.ts) # a) plot time series

library(TTR)
temp.ma <- smooth::sma(temp.ts, n = 5) # b) calc 5 pnt moving average
plot(temp.ma) # b) Calculate the 5-point moving average and plot it together with the time series

decom.temp <- decompose(temp.ts) # c) decompose ts into seasonal, trend, and residual (random & observed) error components
plot(decom.temp) # c) plotted decomposed temp data

acf(temp.ts) # d) using the "Auto-/Cross- Covariance & Correlation Function Estimation", generate a temporal correlogram to assess the autocorrelation of the time series

acf(decom.temp$random[!is.na(decom.temp$random)]) # e) Generate a new correlogram but removing the trend and seasonal variation

pacf(temp.ts) # f) pacf() creates partial autocorrelations

library(forecast)

## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo

temp.fit<-auto.arima(temp.ts)
summary(temp.fit)

## Series: temp.ts 
## ARIMA(0,0,1)(2,1,0)[12] 
## 
## Coefficients:
##          ma1     sar1     sar2
##       0.2456  -0.5212  -0.3233
## s.e.  0.0774   0.0823   0.0827
## 
## sigma^2 = 3.772:  log likelihood = -300.76
## AIC=609.52   AICc=609.81   BIC=621.4
## 
## Training set error measures:
##                     ME     RMSE      MAE       MPE     MAPE      MASE
## Training set 0.2384692 1.846384 1.498262 0.2682136 11.90101 0.8327783
##                     ACF1
## Training set 0.005413047

checkresiduals(temp.fit) # g) best ARIMA model

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(0,0,1)(2,1,0)[12]
## Q* = 38.688, df = 21, p-value = 0.01069
## 
## Model df: 3.   Total lags used: 24

forecast<-forecast(temp.fit) # h) Estimate future values using the previous ARIMA model 
plot(forecast) # h) plot the results

##8.3. Read the bac_dust.txt dataset. The data corresponds to proteobacterial # abundance and bacterial species in dust samples collected around the globe. # a) Make a map of the sampling points using the R libraries maps and ggplot2. # Color the points based on the continent they were collected. # b) Make a map of the sampling points using the R libraries maps and ggplot2. # Size the points based on bacterial richness. # c) Calculate Moran’s I autocorrelation for the variable Richness and Proteobacteria # d) Make a simple linear regression of Richness as a function of Proteobacteria. # Are the residuals spatially autocorrelated? # e) Perform a generalized linear mixed model (GLMM) using the coordinates # as random effect and using the MatÃ©rn covariance function (R package spaMM). # f) Report the nu and the rho values for this spatial regression model. # g) What can you say about the estimated correlation of this particular # spatial regression model across increasing distances between pairs of locations.

bac<-read.table("bac_dust.txt", header = T, sep = "\t")
world_map <- map_data("world")
ggplot(world_map)+
  geom_polygon(aes(x = long, y = lat, group = group), fill = "lightgray", colour = "black", size = 0.1)+
  geom_point(data = bac, aes(x = Longitude, y = Latitude, color = Continent), size = 2)+
  scale_color_manual(values = c("SouthAmerica" = 'chartreuse4', "NorthAmerica" = 'chartreuse1', "Africa" = 'goldenrod1', "Oceania" = 'darkorchid3', "Europe" = 'firebrick4', "Asia" = 'cornflowerblue'))+
  theme_bw(base_size = 15)+
  theme(legend.title = element_blank(), axis.title.x = element_blank(), 
        axis.text.x = element_blank(), axis.ticks.x = element_blank(), 
        axis.title.y = element_blank(), axis.text.y = element_blank(), 
        axis.ticks.y = element_blank()) # a) map of sampling, points per continent

ggplot(world_map)+
  geom_polygon(aes(x = long, y = lat, group = group), fill = "lightgray", colour = "black", size = 0.1)+
  geom_point(data = bac, aes(x = Longitude, y = Latitude, color = Continent, size=Richness), alpha=0.6)+
  scale_color_manual(values = c("SouthAmerica" = 'chartreuse4', "NorthAmerica" = 'chartreuse1', "Africa" = 'goldenrod1', "Oceania" = 'darkorchid3', "Europe" = 'firebrick4', "Asia" = 'cornflowerblue'))+
  theme_bw(base_size = 15)+
  theme(legend.title = element_blank(), axis.title.x = element_blank(), 
        axis.text.x = element_blank(), axis.ticks.x = element_blank(), 
        axis.title.y = element_blank(), axis.text.y = element_blank(), 
        axis.ticks.y = element_blank()) # b) same as a) but Size the points based on bacterial richness

bac.dists <- as.matrix(dist(cbind(bac$Latitude, 
  bac$Longitude))) 
bac.dists.inv <- 1/bac.dists
diag(bac.dists.inv) <- 0
ape::Moran.I(bac$Richness, bac.dists.inv) # c) Calc Moran's I autocorr for variable Richness

## $observed
## [1] 0.1159867
## 
## $expected
## [1] -0.003225806
## 
## $sd
## [1] 0.03860909
## 
## $p.value
## [1] 0.002017263

ape::Moran.I(bac$Proteobacteria, bac.dists.inv) # c) Calc Moran's I autocorr for variable Proteobacteria

## $observed
## [1] 0.216501
## 
## $expected
## [1] -0.003225806
## 
## $sd
## [1] 0.03858986
## 
## $p.value
## [1] 1.241692e-08

anova(lm(Richness~Proteobacteria, data=bac)) # d) simple lg of Richness as a function of Proteobacteria.

## Analysis of Variance Table
## 
## Response: Richness
##                 Df   Sum Sq Mean Sq F value Pr(>F)
## Proteobacteria   1    38901   38901  0.5349 0.4651
## Residuals      309 22471204   72722

bac$resid <-resid(lm(Richness~Proteobacteria, data=bac))
ape::Moran.I(bac$resid, bac.dists.inv) # d) Are the residuals spatially autocorrelated? not significant

## $observed
## [1] 0.1098749
## 
## $expected
## [1] -0.003225806
## 
## $sd
## [1] 0.03861023
## 
## $p.value
## [1] 0.003397342

spaMM::fitme(Richness~Proteobacteria+Matern(1|Latitude+Longitude), 
  data = bac) # e) GLMM, random effect=coordinates, MatÃ©rn covariance function

## Registered S3 methods overwritten by 'registry':
##   method               from 
##   print.registry_field proxy
##   print.registry_entry proxy

## If the 'RSpectra' package were installed, an extreme eigenvalue computation could be faster.

## formula: Richness ~ Proteobacteria + Matern(1 | Latitude + Longitude)
## ML: Estimation of corrPars, lambda and phi by ML.
##     Estimation of fixed effects by ML.
## Estimation of lambda and phi by 'outer' ML, maximizing p_v.
## family: gaussian( link = identity ) 
##  ------------ Fixed effects (beta) ------------
##                Estimate Cond. SE t-value
## (Intercept)      411.58    48.47  8.4910
## Proteobacteria    37.66   112.90  0.3336
##  --------------- Random effects ---------------
## Family: gaussian( link = identity ) 
##                    --- Correlation parameters:
##      1.nu     1.rho 
## 0.2041722 0.3145336 
##            --- Variance parameters ('lambda'):
## lambda = var(u) for u ~ Gaussian; 
##    Latitude .  :  19190  
## # of obs: 311; # of groups: Latitude ., 311 
##  -------------- Residual variance  ------------
## phi estimate was 54713 
##  ------------- Likelihood values  -------------
##                         logLik
## p_v(h) (marginal L): -2170.262

# f) for this spatial regression model, nu = 0.2042, rho = 0.3145


dist<-dist(bac[,c("Longitude", "Latitude")])
matern<-spaMM::MaternCorr(dist, nu=0.2041723, rho=0.3145336)
plot(as.numeric(dist), as.numeric(matern), xlab="Distance between pairs of locations", ylab="Estimated correlation")

# g) the estimated correlation of this particular spatial regression model across increasing distances between pairs of locations shows that within the groups, there is high correlation but not when comparing the groups to eachother

##8.4. Download the map of Spain. # a) Make a basic map of Spain # b) Get information on the population of Spanish cities and make a map of Spain # including the locations and population of Spanish cities. c) Visit Spain and enjoy

library(maps)

## 
## Attaching package: 'maps'

## The following object is masked _by_ '.GlobalEnv':
## 
##     ozone

library(ggplot2)
library(viridis)

## Loading required package: viridisLite

## 
## Attaching package: 'viridis'

## The following object is masked from 'package:maps':
## 
##     unemp

spain<-map_data("world", region="spain")

ggplot(spain)+
  geom_polygon(aes(x = long, y = lat, group=group), fill='lightgray', color="black", size=0.1)+
  scale_size_continuous(range=c(1,12))# a) Make a basic map of Spain

cities<-get('world.cities')
head(cities)

##                 name country.etc   pop   lat  long capital
## 1 'Abasan al-Jadidah   Palestine  5629 31.31 34.34       0
## 2 'Abasan al-Kabirah   Palestine 18999 31.32 34.35       0
## 3       'Abdul Hakim    Pakistan 47788 30.55 72.11       0
## 4 'Abdullah-as-Salam      Kuwait 21817 29.36 47.98       0
## 5              'Abud   Palestine  2456 32.03 35.07       0
## 6            'Abwein   Palestine  3434 32.03 35.20       0

cities.spain<-cities[cities$country.etc == 'Spain',]
head(cities.spain)

##                     name country.etc    pop   lat  long capital
## 128             A Coruna       Spain 243088 43.33 -8.42       0
## 129            A Estrada       Spain  21997 42.70 -8.50       0
## 130            A Laracha       Spain  10856 43.25 -8.59       0
## 131 A Pobra do Caraminal       Spain   9955 42.61 -8.94       0
## 183      Abanto Zierbena       Spain   9505 43.32 -3.08       0
## 186               Abaran       Spain  12890 38.20 -1.40       0

ggplot(spain)+
  geom_polygon(aes(x = long, y = lat, group=group), fill='lightgray', color="black", size=0.1)+
  geom_point(data=cities.spain, aes(x=long, y=lat, size=pop, color="red"), alpha=0.5)+
  scale_size_continuous(range=c(1,12))+
  theme_void()

##8.5. Download GBIF georeferenced occurrence data of the Pyrenean desman # (Galemys pyrenaicus) and make a map of its geographical distribution # (Hint: it is only present in Spain, Andorra, Portugal and France).

rgbif::occ_count(georeferenced = T)

## [1] 2060618296

gp_gbif<-rgbif::occ_search(scientificName = "Galemys pyrenaicus", hasCoordinate = T)

gp<-as.data.frame(gp_gbif$data[,c("decimalLatitude", "decimalLongitude")])

world <- map_data("world")

eu<-map_data("world", region=c("Spain", "Portugal", "France", "Andorra"))
ggplot(eu, aes(x=long, y=lat))+
  geom_polygon(aes(group=group, fill=region))+
  geom_point(data=gp, aes(x=decimalLongitude, y=decimalLatitude), color="black", alpha=0.2, size=2)+
  theme_bw()

#9 - Diversity analysis

##9.1. Load the dune_bio.txt dataset. Species should be in columns and sites in rows. # a) Calculate the total number of individuals of all species # b) Calculate the total number of individuals for each species # c) Calculate the average number of individuals for each species # d) Calculate the total number of individuals for each site # e) Calculate the average number of individuals for each site # f) Write a function to report the median number of individuals for each species and the median number of individuals for each site # g) Use the vegan function decostand() to transform the dataset to relative abundances. How would you confirm the transformation worked? # h) Use the vegan function decostand() to standardize the dataset into the range 0 to 1 # i) Use the vegan function decostand() to standardize the dataset to mean=0 and variance=1

db<-read.table("dune_bio.txt", sep="\t", header=T, row.names=1)
sum(db)

## [1] 685

colSums(db)

## Belper Empnig Junbuf Junart Airpra Elepal Rumace Viclat Brarut Ranfla Cirarv 
##     13      2     13     18      5     25     18      4     49     14      2 
## Hyprad Leoaut Potpal Poapra Calcus Tripra Trirep Antodo Salrep Achmil Poatri 
##      9     54      4     48     10      9     47     21     11     16     63 
## Chealb Elyrep Sagpro Plalan Agrsto Lolper Alogen Brohor 
##      1     26     20     26     48     58     36     15

colMeans(db)

## Belper Empnig Junbuf Junart Airpra Elepal Rumace Viclat Brarut Ranfla Cirarv 
##   0.65   0.10   0.65   0.90   0.25   1.25   0.90   0.20   2.45   0.70   0.10 
## Hyprad Leoaut Potpal Poapra Calcus Tripra Trirep Antodo Salrep Achmil Poatri 
##   0.45   2.70   0.20   2.40   0.50   0.45   2.35   1.05   0.55   0.80   3.15 
## Chealb Elyrep Sagpro Plalan Agrsto Lolper Alogen Brohor 
##   0.05   1.30   1.00   1.30   2.40   2.90   1.80   0.75

rowSums(db)

##  2 13  4 16  6  1  8  5 17 15 10 11  9 18  3 20 14 19 12  7 
## 42 33 45 33 48 18 40 43 15 23 43 32 42 27 40 31 24 31 35 40

rowMeans(db)

##         2        13         4        16         6         1         8         5 
## 1.4000000 1.1000000 1.5000000 1.1000000 1.6000000 0.6000000 1.3333333 1.4333333 
##        17        15        10        11         9        18         3        20 
## 0.5000000 0.7666667 1.4333333 1.0666667 1.4000000 0.9000000 1.3333333 1.0333333 
##        14        19        12         7 
## 0.8000000 1.0333333 1.1666667 1.3333333

median_sps_sites<-function(db){
  cat("The median number of individuals per sp is", apply(db, 2, median), "and for sites is", apply(db, 1, median))
}
dune.bio.relabu<-vegan::decostand(db, method = "total")
rowSums(dune.bio.relabu) # g) Use the vegan function decostand() to transform the dataset to relative abundances. How would you confirm the transformation worked? summary()

##  2 13  4 16  6  1  8  5 17 15 10 11  9 18  3 20 14 19 12  7 
##  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1

dune.bio.range<-vegan::decostand(db, method = "range")
dune.bio.stand<-vegan::decostand(db, method = "standardize")
summary(dune.bio.relabu)

##      Belper            Empnig             Junbuf            Junart       
##  Min.   :0.00000   Min.   :0.000000   Min.   :0.00000   Min.   :0.00000  
##  1st Qu.:0.00000   1st Qu.:0.000000   1st Qu.:0.00000   1st Qu.:0.00000  
##  Median :0.00000   Median :0.000000   Median :0.00000   Median :0.00000  
##  Mean   :0.01665   Mean   :0.003226   Mean   :0.01752   Mean   :0.02728  
##  3rd Qu.:0.04496   3rd Qu.:0.000000   3rd Qu.:0.00000   3rd Qu.:0.02273  
##  Max.   :0.07407   Max.   :0.064516   Max.   :0.11429   Max.   :0.13043  
##      Airpra            Elepal            Rumace            Viclat       
##  Min.   :0.00000   Min.   :0.00000   Min.   :0.00000   Min.   :0.00000  
##  1st Qu.:0.00000   1st Qu.:0.00000   1st Qu.:0.00000   1st Qu.:0.00000  
##  Median :0.00000   Median :0.00000   Median :0.00000   Median :0.00000  
##  Mean   :0.01151   Mean   :0.04278   Mean   :0.02105   Mean   :0.00614  
##  3rd Qu.:0.00000   3rd Qu.:0.02500   3rd Qu.:0.01190   3rd Qu.:0.00000  
##  Max.   :0.13333   Max.   :0.24242   Max.   :0.12500   Max.   :0.06250  
##      Brarut            Ranfla            Cirarv             Hyprad       
##  Min.   :0.00000   Min.   :0.00000   Min.   :0.000000   Min.   :0.00000  
##  1st Qu.:0.03333   1st Qu.:0.00000   1st Qu.:0.000000   1st Qu.:0.00000  
##  Median :0.05000   Median :0.00000   Median :0.000000   Median :0.00000  
##  Mean   :0.07213   Mean   :0.02353   Mean   :0.002222   Mean   :0.01786  
##  3rd Qu.:0.12216   3rd Qu.:0.05265   3rd Qu.:0.000000   3rd Qu.:0.00000  
##  Max.   :0.22222   Max.   :0.12903   Max.   :0.044444   Max.   :0.16129  
##      Leoaut            Potpal             Poapra            Calcus       
##  Min.   :0.00000   Min.   :0.000000   Min.   :0.00000   Min.   :0.00000  
##  1st Qu.:0.05536   1st Qu.:0.000000   1st Qu.:0.00000   1st Qu.:0.00000  
##  Median :0.06977   Median :0.000000   Median :0.07778   Median :0.00000  
##  Mean   :0.08170   Mean   :0.008514   Mean   :0.06960   Mean   :0.01772  
##  3rd Qu.:0.09498   3rd Qu.:0.000000   3rd Qu.:0.10000   3rd Qu.:0.00000  
##  Max.   :0.19355   Max.   :0.086957   Max.   :0.22222   Max.   :0.16667  
##      Tripra            Trirep            Antodo            Salrep       
##  Min.   :0.00000   Min.   :0.00000   Min.   :0.00000   Min.   :0.00000  
##  1st Qu.:0.00000   1st Qu.:0.03816   1st Qu.:0.00000   1st Qu.:0.00000  
##  Median :0.00000   Median :0.05530   Median :0.00000   Median :0.00000  
##  Mean   :0.01003   Mean   :0.06625   Mean   :0.03471   Mean   :0.01846  
##  3rd Qu.:0.00000   3rd Qu.:0.08772   3rd Qu.:0.05312   3rd Qu.:0.00000  
##  Max.   :0.10417   Max.   :0.25000   Max.   :0.26667   Max.   :0.16129  
##      Achmil            Poatri            Chealb             Elyrep       
##  Min.   :0.00000   Min.   :0.00000   Min.   :0.000000   Min.   :0.00000  
##  1st Qu.:0.00000   1st Qu.:0.00000   1st Qu.:0.000000   1st Qu.:0.00000  
##  Median :0.00000   Median :0.09651   Median :0.000000   Median :0.00000  
##  Mean   :0.02458   Mean   :0.08232   Mean   :0.001515   Mean   :0.03711  
##  3rd Qu.:0.04738   3rd Qu.:0.12054   3rd Qu.:0.000000   3rd Qu.:0.08992  
##  Max.   :0.13333   Max.   :0.27273   Max.   :0.030303   Max.   :0.22222  
##      Sagpro            Plalan            Agrsto            Lolper       
##  Min.   :0.00000   Min.   :0.00000   Min.   :0.00000   Min.   :0.00000  
##  1st Qu.:0.00000   1st Qu.:0.00000   1st Qu.:0.00000   1st Qu.:0.00000  
##  Median :0.00000   Median :0.00000   Median :0.03571   Median :0.06085  
##  Mean   :0.02714   Mean   :0.03767   Mean   :0.07145   Mean   :0.08353  
##  3rd Qu.:0.05265   3rd Qu.:0.09635   3rd Qu.:0.15396   3rd Qu.:0.12863  
##  Max.   :0.11429   Max.   :0.13333   Max.   :0.21212   Max.   :0.38889  
##      Alogen            Brohor       
##  Min.   :0.00000   Min.   :0.00000  
##  1st Qu.:0.00000   1st Qu.:0.00000  
##  Median :0.00000   Median :0.00000  
##  Mean   :0.04824   Mean   :0.01757  
##  3rd Qu.:0.08387   3rd Qu.:0.01163  
##  Max.   :0.22857   Max.   :0.09524

rowSums(dune.bio.relabu) # i) Use the vegan function decostand() to standardize the dataset to mean=0 and variance=1

##  2 13  4 16  6  1  8  5 17 15 10 11  9 18  3 20 14 19 12  7 
##  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1

##9.2. Write a function to calculate the observed richness of a vector representing the abundances of different species in a community.

richnessfunct <- function(abun){
  dat <- abun
  relab <- decostand(dat, method = "total")
  specnumber(relab)
}

##9.3. Write a function to calculate the Shannon-Wiener diversity index of a vector representing the abundances of different species in a community.

swdiversityfunct <- function(abun){
  dat <- abun
  relab <- decostand(dat, method = "total")
  diversity(relab, index="shannon")
}

##9.4. Write a function that calculates both the observed richness and the # Shannon-Wiener diversity index for each community in a matrix, and writes an # output table with the results. You can use the vegan built-in functions. # Test your function with the dune_bio.txt dataset.

richnessswfunct <- function(abun){
  dat <- abun
  relab <- decostand(dat, method = "total")
  rich <- specnumber(relab)
  swd <- diversity(relab, index="shannon")
}

##9.5. Make a rank-abundance curve using the second site (13) of the dune_bio.txt dataset. Fit a lognormal model to the data.

dunecurve <- read.table("dune_bio.txt", sep="\t", header=T)
plot(vegan::rad.lognormal(dunecurve), lty=2, lwd=2) # fit a log normal model to the data

vegan::rad.lognormal(dunecurve)

## 
## RAD model: Log-Normal 
## Family: poisson 
## No. of species:  217 
## Total abundance: 895 
## 
##      log.mu   log.sigma    Deviance         AIC         BIC 
##   1.2207285   0.6292969  20.2518355 702.2895203 709.0493150

plot(vegan::radfit(dunecurve)) # fit lognormal with radfit()

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: algorithm did not converge

##9.6. Create a Euclidean distance matrix after standardization using the first # (A1), second (Moisture) and fifth (Manure) variables of dune_env.txt. # Then create a Bray-Curtis distance matrix using dune_bio.txt. # Perform a Mantel test on both distance matrices and plot the relationship.

duneenv <- read.table("dune_env.txt", sep="\t", header=T)
dunebio <- read.table("dune_bio.txt", sep="\t", header=T)

dune.sd <- vegan::decostand(duneenv[,c(2,3,6)], method="standardize") # standardize the 1st, 2nd,and 5th
dist.euc <- vegan::vegdist(dune.sd, method="euclidean")

dist.bray <- vegan::vegdist(dunebio[,-1], method="bray") # create a Bray-Curtis distance matrix using dune_bio.txt

vegan::mantel(dist.euc, dist.bray) # perform a mantel test on euc and bray dist matrices

## 
## Mantel statistic based on Pearson's product-moment correlation 
## 
## Call:
## vegan::mantel(xdis = dist.euc, ydis = dist.bray) 
## 
## Mantel statistic r: 0.5496 
##       Significance: 0.001 
## 
## Upper quantiles of permutations (null model):
##   90%   95% 97.5%   99% 
## 0.146 0.191 0.224 0.254 
## Permutation: free
## Number of permutations: 999

plot(dist.euc, dist.bray) # plot the relationship

#10 - Cluster analysis

##10.1. Make dendrograms of the results of hierarchical clustering using # the single, average and complete methods. Use the dune_bio.txt dataset # after creating a Bray-Curtis distance matrix.

dune.bio<-read.table("dune_bio.txt", sep="\t", header=T, row.names=1)

dune.bio.bray<-vegan::vegdist(dune.bio, method = "bray") # Hierarchical clustering

dunebiosing<-stats::hclust(dune.bio.bray, method="single")
plot(dunebiosing) #dendrogram (tree) visualization, single

dunebioavg<-hclust(dune.bio.bray, method="average")
plot(dunebioavg)  #dendrogram (tree) visualization, average

dunebiocomp<-hclust(dune.bio.bray, method="complete")
plot(dunebiocomp) #dendrogram (tree) visualization, complete

##10.2. Perform k-means partitions from 2 to 6 on the dune_bio.txt dataset and select the best partition using the Calinski criterion. Visualize the results.

dunebio.k<-vegan::cascadeKM(dune.bio, inf.gr=2, sup.gr=6) # Non-hierarchical clustering (K-means) with inf.gr= minimum number of groups, sup.gr= maximum number of groups

dunebio.k$results # calinski maximum value is the best partitioning which is 5.793253 @ 3 groups

##             2 groups   3 groups   4 groups   5 groups   6 groups
## SSE      1218.500000 950.516667 777.333333 676.500000 593.750000
## calinski    5.611243   5.793253   5.633047   5.110033   4.737482

dunebio.k3grps<-vegan::cascadeKM(dune.bio, inf.gr=2, sup.gr=8)

plot(dunebio.k3grps, sortg=T) # visualize

#11 - Ordination

##11.1. Load the varechem dataset within the R package vegan. This data frame # collects soil characteristics. Given the nature of this dataset perform a PCA # or a CA ordination analysis. # a) How much variation is explained by the two first axes? # b) Make a screeplot of the results # c) Plot the ordination results of the sites # d) Plot both the sites scores and the soil characteristics scores focusing on the soil variables

library(vegan)

## Loading required package: permute

## Loading required package: lattice

## This is vegan 2.6-2

data(varechem)
varechem_pca<-vegan::rda(varechem, scale=T) # vegan function
summary(varechem_pca) # a) variation explained by the two first axes (or proportion explained) 0.3708 + 0.2281 = 0.5989

## 
## Call:
## rda(X = varechem, scale = T) 
## 
## Partitioning of correlations:
##               Inertia Proportion
## Total              14          1
## Unconstrained      14          1
## 
## Eigenvalues, and their contribution to the correlations 
## 
## Importance of components:
##                          PC1    PC2    PC3     PC4     PC5     PC6     PC7
## Eigenvalue            5.1916 3.1928 1.6855 1.06899 0.81599 0.70581 0.43641
## Proportion Explained  0.3708 0.2281 0.1204 0.07636 0.05829 0.05042 0.03117
## Cumulative Proportion 0.3708 0.5989 0.7193 0.79564 0.85392 0.90434 0.93551
##                           PC8    PC9    PC10    PC11    PC12     PC13     PC14
## Eigenvalue            0.36884 0.1707 0.14953 0.08526 0.06986 0.035057 0.023615
## Proportion Explained  0.02635 0.0122 0.01068 0.00609 0.00499 0.002504 0.001687
## Cumulative Proportion 0.96185 0.9740 0.98473 0.99082 0.99581 0.998313 1.000000
## 
## Scaling 2 for species and site scores
## * Species are scaled proportional to eigenvalues
## * Sites are unscaled: weighted dispersion equal on all dimensions
## * General scaling constant of scores:  4.236078 
## 
## 
## Species scores
## 
##              PC1     PC2      PC3      PC4       PC5      PC6
## N         0.2441  0.2516 -0.23818  0.92846 -0.359554  0.30319
## P        -0.9181 -0.4194  0.13356  0.09649  0.224909  0.07938
## K        -0.9369 -0.3272 -0.06261  0.25051  0.180899 -0.28683
## Ca       -0.9280 -0.1711  0.47982 -0.02138 -0.241479 -0.09536
## Mg       -0.9087 -0.1978  0.12446 -0.04430 -0.497428 -0.10781
## S        -0.8576 -0.6002 -0.31620 -0.01652  0.071629 -0.08779
## Al        0.2980 -0.9720 -0.39009  0.09092  0.005386 -0.19467
## Fe        0.5236 -0.7748 -0.23063  0.37187 -0.023377 -0.32292
## Mn       -0.7418  0.3908  0.13114  0.44346  0.501064  0.06776
## Zn       -0.8926 -0.3277  0.06142 -0.06121 -0.153677  0.51653
## Mo       -0.1072 -0.4625 -0.85720 -0.27258 -0.030063  0.39721
## Baresoil -0.4218  0.6387 -0.40089 -0.07389 -0.416610 -0.31060
## Humdepth -0.6070  0.6825 -0.41674  0.02975 -0.047738 -0.15923
## pH        0.4286 -0.6517  0.66384  0.07599 -0.265374  0.05070
## 
## 
## Site scores (weighted sums of species scores)
## 
##         PC1      PC2       PC3      PC4       PC5      PC6
## 18  0.05126  0.84085 -0.190142 -0.40431  0.101518 -1.01997
## 15  0.20452  0.37397  0.002978 -1.06030  1.175734 -0.92257
## 24 -1.53647 -1.25830 -0.012170 -0.86690 -1.854372  1.74375
## 27 -1.25643  0.51345  0.743928  0.63835  1.097881  0.15631
## 23 -0.71203  0.86247 -0.203389 -0.05971 -0.786196 -0.80080
## 19 -0.76179  0.73785 -0.761645 -0.31728 -1.303297 -0.62264
## 22 -0.30340  0.86304  0.188484  0.54758 -0.078064  0.24567
## 16  0.62797  0.76436 -0.124301 -0.26047  0.009044 -0.02807
## 28 -1.13923  0.44909  0.229104  1.61582  0.842871  0.60852
## 13 -0.62483 -0.85407 -1.259755  1.38195 -0.157593 -1.04794
## 14 -0.04206  0.60916 -0.719743 -0.73109 -0.230147  0.25505
## 20 -0.93747  0.20753 -0.689713  0.62198 -0.282188  0.33232
## 25 -0.34451  0.90128  0.710714  0.64127  1.214897  0.48173
## 7   1.50502 -0.22114 -0.495604  0.87068 -0.769266 -0.25733
## 5   1.40889  0.60035  0.911463  0.71133 -0.488111  2.45195
## 6   1.30776  0.03604 -0.243884 -0.87792  0.543259  0.46876
## 3   1.64967 -0.82755 -0.343406  1.40028 -0.546374 -0.19580
## 4  -0.26711 -1.65198 -1.744251 -0.60763  0.947492  0.46203
## 2   0.59653 -1.21610 -0.200030  0.72815  0.799208 -0.76013
## 9   0.03271 -0.69445 -0.350028 -1.35247  0.717140  0.82965
## 12  0.47944  0.26377  0.184677 -1.27744  0.573929  0.07253
## 10 -0.32993 -0.95483  1.670469 -0.51343  0.819138 -0.48682
## 11 -0.09921 -1.52318  2.493913 -0.09044 -1.092296 -1.10654
## 21  0.49069  1.17838  0.202333 -0.73801 -1.254205 -0.85966

stats::screeplot(varechem_pca) # b) screeplot of the results

plot(varechem_pca, display="sites", xlab="PC1 (37%)", ylab="PC2 (23%)") # c) Plot the ordination results of the sites

stats::biplot(varechem_pca, scaling=1, xlab="PC1 (37%)", ylab="PC2 (23%)") # d) Plot both the sites scores and the soil characteristics scores focusing on the soil variables

##11.2. The dune_bio.txt dataset corresponds to species abundances. # Given the nature of this dataset perform a PCA or a CA ordination analysis. # a) How much variation is explained by the two first axes? # b) Make a screeplot of the results # c) Plot the ordination results of the sites

dunebio <- read.table("dune_bio.txt", sep="\t", header=T, row.names=1)

dunebio.ca<-vegan::cca(dunebio)
dunebio.ca

## Call: cca(X = dunebio)
## 
##               Inertia Rank
## Total           2.115     
## Unconstrained   2.115   19
## Inertia is scaled Chi-square 
## 
## Eigenvalues for unconstrained axes:
##    CA1    CA2    CA3    CA4    CA5    CA6    CA7    CA8 
## 0.5360 0.4001 0.2598 0.1760 0.1448 0.1079 0.0925 0.0809 
## (Showing 8 of 19 unconstrained eigenvalues)

summary(dunebio.ca) # a) variation explained by the two first axes (or proportion explained) 0.2534 + 0.1892 = 0.4426

## 
## Call:
## cca(X = dunebio) 
## 
## Partitioning of scaled Chi-square:
##               Inertia Proportion
## Total           2.115          1
## Unconstrained   2.115          1
## 
## Eigenvalues, and their contribution to the scaled Chi-square 
## 
## Importance of components:
##                          CA1    CA2    CA3     CA4     CA5     CA6     CA7
## Eigenvalue            0.5360 0.4001 0.2598 0.17598 0.14476 0.10791 0.09247
## Proportion Explained  0.2534 0.1892 0.1228 0.08319 0.06844 0.05102 0.04372
## Cumulative Proportion 0.2534 0.4426 0.5654 0.64858 0.71702 0.76804 0.81175
##                           CA8     CA9    CA10    CA11    CA12    CA13     CA14
## Eigenvalue            0.08091 0.07332 0.05630 0.04826 0.04125 0.03523 0.020529
## Proportion Explained  0.03825 0.03466 0.02661 0.02282 0.01950 0.01665 0.009705
## Cumulative Proportion 0.85000 0.88467 0.91128 0.93410 0.95360 0.97025 0.979955
##                           CA15     CA16     CA17     CA18     CA19
## Eigenvalue            0.014911 0.009074 0.007938 0.007002 0.003477
## Proportion Explained  0.007049 0.004290 0.003753 0.003310 0.001644
## Cumulative Proportion 0.987004 0.991293 0.995046 0.998356 1.000000
## 
## Scaling 2 for species and site scores
## * Species are scaled proportional to eigenvalues
## * Sites are unscaled: weighted dispersion equal on all dimensions
## 
## 
## Species scores
## 
##             CA1      CA2      CA3       CA4       CA5      CA6
## Belper -0.50018 -0.35503 -0.15239 -0.704153 -0.058546 -0.07308
## Empnig -0.69027  3.26420  1.95716 -0.176936 -0.073518  0.16083
## Junbuf  0.08157 -0.68074  1.00545  1.078390  0.268360 -0.24168
## Junart  1.27580  0.09963 -0.09320  0.005536  0.289410  0.78247
## Airpra -1.00434  3.06749  1.33773  0.194305 -1.081813  0.53699
## Elepal  1.76383  0.34562 -0.57336 -0.002976 -0.332396  0.14688
## Rumace -0.65289 -0.25525 -0.59728  1.160164  0.255849  0.32730
## Viclat -0.61893  0.37140 -0.46057 -1.000375  1.162652 -1.44971
## Brarut  0.18222  0.26477 -0.16606  0.064009  0.576334 -0.07741
## Ranfla  1.55886  0.30700 -0.29765  0.046974 -0.008747  0.14744
## Cirarv -0.05647 -0.76398  0.91793 -1.175919 -0.384024  0.13985
## Hyprad -0.85408  2.52821  1.13951 -0.175115 -0.311874 -0.11177
## Leoaut -0.19566  0.38884  0.03975 -0.130392  0.141232 -0.23717
## Potpal  1.91690  0.52150 -1.18215 -0.021738 -1.359988 -1.31207
## Poapra -0.38919 -0.32999 -0.02015 -0.358371  0.079296  0.05165
## Calcus  1.95199  0.56743 -0.85948 -0.098969 -0.556737 -0.23282
## Tripra -0.88116 -0.09792 -1.18172  1.282429  0.325706  0.33388
## Trirep -0.07666 -0.02032 -0.20594  0.026462 -0.186748 -0.53957
## Antodo -0.96676  1.08361 -0.17188  0.459788 -0.607533  0.30425
## Salrep  0.61035  1.54868  0.04970 -0.607136  1.429729  0.55183
## Achmil -0.90859  0.08461 -0.58636 -0.008919 -0.660183  0.18877
## Poatri -0.18185 -0.53997  0.23388  0.178834 -0.155342  0.07584
## Chealb  0.42445 -0.84402  1.59029  1.248755 -0.207480 -0.87566
## Elyrep -0.37074 -0.74148  0.26238 -0.566308 -0.270122  0.72624
## Sagpro  0.00364  0.01719  1.11570  0.066981  0.186654 -0.32463
## Plalan -0.84058  0.24886 -0.78066  0.371149  0.271377 -0.11989
## Agrsto  0.93378 -0.20651  0.28165  0.024293 -0.139326  0.02256
## Lolper -0.50272 -0.35955 -0.21821 -0.474727  0.101494  0.01594
## Alogen  0.40088 -0.61839  0.85013  0.346740  0.016509 -0.10169
## Brohor -0.65762 -0.40634 -0.30685 -0.496751 -0.561358 -0.07004
## 
## 
## Site scores (weighted averages of species scores)
## 
##         CA1        CA2      CA3      CA4      CA5      CA6
## 2  -0.63268 -0.6958357 -0.09708 -1.18695 -0.97686 -0.06575
## 13  0.42445 -0.8440195  1.59029  1.24876 -0.20748 -0.87566
## 4  -0.05647 -0.7639784  0.91793 -1.17592 -0.38402  0.13985
## 16  2.00229  0.1090627 -0.33414  0.33760 -0.50097  0.76159
## 6  -0.85633 -0.0005408 -1.39735  1.59909  0.65494  0.19386
## 1  -0.81167 -1.0826714 -0.14479 -2.10665 -0.39287  1.83462
## 8   0.76268 -0.2968459  0.35648 -0.10772  0.17507  0.36444
## 5  -0.95293 -0.1846015 -0.95609  0.86853 -0.34552  0.98333
## 17 -1.47545  2.7724102  0.40859  0.75117 -2.59425  1.10122
## 15  1.91384  0.5079036 -0.96567  0.04227 -0.50681 -0.19370
## 10 -0.87885 -0.0353136 -0.82987 -0.68053 -0.75438 -0.81070
## 11 -0.64223  0.4440332 -0.17371 -1.09684  1.37462 -2.00626
## 9   0.09693 -0.7864314  0.86492  0.40090  0.28704  1.02783
## 18 -0.31241  0.6328355 -0.66501 -1.12728  2.65575 -0.97565
## 3  -0.10148 -0.9128732  0.68815 -0.68137 -0.08709  0.28678
## 20  1.94438  1.0688809 -0.66595 -0.55317  1.59606  1.70292
## 14  1.91996  0.5351062 -1.39863 -0.08575 -2.21317 -2.43044
## 19 -0.69027  3.2642026  1.95716 -0.17694 -0.07352  0.16083
## 12  0.28557 -0.6656161  1.64423  1.71496  0.65381 -1.17376
## 7  -0.87149 -0.2547040 -0.86830  0.90468  0.17385  0.03446

sum(0.2534, 0.1892) # 44.3% variation

## [1] 0.4426

stats::screeplot(dunebio.ca) # b) screeplot

plot(dunebio.ca, display="sites", xlab="CA1 (25%)", ylab="CA2 (19%)") # c) ordination results plotted

##11.3. Perform a nonmetric multidimensional scaling (NMDS) on the dune_bio.txt # dataset after calculating Bray-Curtis distances among sites. # NMDS: ordination from any distance matrix, not an eigenvector-based method # Represent the set of objects along a predetermined number of axes while # preserving the ordering relationships among them # a) What is the stress? #Stress is the spread of points from the # monotone regression step line (stress < 0.1 is extremely good, # stress < 0.2 is very good) # b) Make a Shepard plot of the NMDS results # c) Plot the ordination results of the sites # d) [Advanced - ggplot2] Plot the ordination results using # ggplot2. Use the dune_env.txt dataset to make the size of # the points proportional to the site richness (number of # species) and the color to represent the variable Management.

#Install Packages & Load Libraries: 
# install.packages("vegan")
# library(vegan)
# install.packages("tidyverse")
# library(tidyverse)

dunebio <- read.table("dune_bio.txt", sep="\t", header=T, row.names=1)
dune_bio_nmds <- vegan::metaMDS(dunebio, distance="bray", k=2) #function calcs the Bray-Curtis and dimension amount

## Run 0 stress 0.1192678 
## Run 1 stress 0.1183186 
## ... New best solution
## ... Procrustes: rmse 0.02027176  max resid 0.06496899 
## Run 2 stress 0.1886532 
## Run 3 stress 0.2075713 
## Run 4 stress 0.1192678 
## Run 5 stress 0.1192679 
## Run 6 stress 0.1886532 
## Run 7 stress 0.1808912 
## Run 8 stress 0.1183186 
## ... Procrustes: rmse 3.869369e-05  max resid 0.0001221982 
## ... Similar to previous best
## Run 9 stress 0.1886532 
## Run 10 stress 0.1192678 
## Run 11 stress 0.1808911 
## Run 12 stress 0.1192678 
## Run 13 stress 0.1183186 
## ... Procrustes: rmse 9.605114e-06  max resid 3.020403e-05 
## ... Similar to previous best
## Run 14 stress 0.1183186 
## ... New best solution
## ... Procrustes: rmse 9.305398e-06  max resid 3.015801e-05 
## ... Similar to previous best
## Run 15 stress 0.1192679 
## Run 16 stress 0.1192678 
## Run 17 stress 0.2075713 
## Run 18 stress 0.1183186 
## ... New best solution
## ... Procrustes: rmse 5.810304e-06  max resid 1.67842e-05 
## ... Similar to previous best
## Run 19 stress 0.1192678 
## Run 20 stress 0.1808911 
## *** Solution reached

dune_bio_nmds$stress #a) stress=0.1183186

## [1] 0.1183186

vegan::stressplot(dune_bio_nmds) # b) Make a Shepard plot of the NMDS results, Sphepard plot

plot(dune_bio_nmds, type="t", display="sites") # c) Plot the ordination results of the sites

# d) [Advanced - ggplot2] Plot the ordination results using 
# ggplot2. Use the dune_env.txt dataset to make the size of 
# the points proportional to the site richness (number of 
# species) and the color to represent the variable Management.

#Enable the r-universe repo
# {options(repos = c(fawda123 = 'https://fawda123.r-universe.dev',
#   CRAN = 'https://cloud.r-project.org'))}
# install.packages('ggord')
dune_env <- read.table("dune_env.txt", sep="\t", header=T, row.names=1)
dune_env$Richness <- vegan::specnumber(x=dunebio) #Define as Species Richness 
point_data <- as.data.frame(dune_bio_nmds$points) #Define bio nmds points as a data frame
View(point_data)
dune_env$MDS1 <- point_data$MDS1 #Attaching MDS1 of dune_bio proportional to dune_bio
dune_env$MDS2 <- point_data$MDS2 #Make MDS2 of dune_bio proportional to dune_bio

ggplot(data = dune_env, aes(x=MDS1, y=MDS2))+
  geom_point(aes(fill=`Management`, size=dune_env$Richness), shape = 21) #d

## Warning: Use of `dune_env$Richness` is discouraged. Use `Richness` instead.

ggplot(data=dunebio, mapping=aes(x=MDS1, y=MDS2))+
  geom_point(data=dune_env, mapping=aes(fill=Management, size=dune_env$Richness), shape=24)

## Warning: Use of `dune_env$Richness` is discouraged. Use `Richness` instead.

##11.4. Perform a constrained correspondence analysis (CCA) on the dune_bio.txt # dataset using the first (A1), second (Moisture) and fifth (Manure) variables of # dune_env.txt as explanatory. # a) What is the variation explained by the two first constrained axes? # b) What is the adjusted R2 of the model? # c) Use a permutation test to assess the overall statistical significance of the model # d) Use a permutation test to assess the marginal statistical significance of # each explanatory variable of the model. Which variables are significant? # e) Make a triplot of the ordination results focusing on the sites

dune_bio<-read.table("dune_bio.txt", sep="\t", header=T)
dune_env<-read.table("dune_env.txt", sep="\t", header=T)
dune_cca<-vegan::cca(dune_bio ~ A1 + Moisture + Manure, data = dune_env)

summary(dune_cca) # a) variation explained by the two first constrained axes (or Proportion Explained)  0.2135 + 0.1236 = 0.3371 (33.71%)

## 
## Call:
## cca(formula = dune_bio ~ A1 + Moisture + Manure, data = dune_env) 
## 
## Partitioning of scaled Chi-square:
##               Inertia Proportion
## Total          1.6252     1.0000
## Constrained    0.6221     0.3828
## Unconstrained  1.0031     0.6172
## 
## Eigenvalues, and their contribution to the scaled Chi-square 
## 
## Importance of components:
##                         CCA1   CCA2    CCA3    CA1     CA2    CA3     CA4
## Eigenvalue            0.3469 0.2010 0.07428 0.1879 0.15906 0.1401 0.11071
## Proportion Explained  0.2135 0.1236 0.04571 0.1156 0.09787 0.0862 0.06812
## Cumulative Proportion 0.2135 0.3371 0.38281 0.4985 0.59632 0.6825 0.75065
##                           CA5     CA6     CA7     CA8     CA9    CA10    CA11
## Eigenvalue            0.07916 0.07293 0.05730 0.04718 0.04572 0.03865 0.01710
## Proportion Explained  0.04871 0.04487 0.03526 0.02903 0.02813 0.02378 0.01052
## Cumulative Proportion 0.79936 0.84423 0.87949 0.90852 0.93666 0.96044 0.97096
##                          CA12     CA13     CA14     CA15     CA16
## Eigenvalue            0.01500 0.013210 0.008019 0.007311 0.003656
## Proportion Explained  0.00923 0.008128 0.004934 0.004499 0.002249
## Cumulative Proportion 0.98019 0.988318 0.993252 0.997751 1.000000
## 
## Accumulated constrained eigenvalues
## Importance of components:
##                         CCA1   CCA2    CCA3
## Eigenvalue            0.3469 0.2010 0.07428
## Proportion Explained  0.5576 0.3230 0.11939
## Cumulative Proportion 0.5576 0.8806 1.00000
## 
## Scaling 2 for species and site scores
## * Species are scaled proportional to eigenvalues
## * Sites are unscaled: weighted dispersion equal on all dimensions
## 
## 
## Species scores
## 
##            CCA1      CCA2     CCA3       CA1      CA2      CA3
## Sites  -0.36612 -0.272851  0.03468 -0.026780 -0.05701 -0.17817
## Belper  0.83300 -0.024552  0.26178  0.249493  0.68038 -0.05697
## Empnig -1.19645 -1.015213 -1.29464 -2.381277  0.35802 -1.57136
## Junbuf -0.33355  0.392914 -0.33779 -0.612703  0.28445  1.45139
## Junart -0.98487  0.158124 -0.32922  0.777789 -0.26320  0.26276
## Airpra -0.62178 -1.218943 -0.79362 -2.003397  0.14075 -1.48745
## Elepal -1.18987  0.411338  0.36981  0.894116 -0.56011 -0.49119
## Rumace  0.68758 -0.009132  0.40907 -0.563922 -1.07097  0.83356
## Viclat  0.74779 -1.150745  0.06289  0.485533  0.45456 -0.40237
## Brarut -0.01433 -0.185135  0.11098  0.103773 -0.23756 -0.05516
## Ranfla -1.17130  0.226331  0.04852  0.751432 -0.38610 -0.11623
## Cirarv  0.88097  1.399633 -0.03436  0.096612  1.33775 -1.35317
## Hyprad -0.40538 -1.134432 -0.72287 -1.557435  0.27779 -1.35848
## Leoaut  0.11313 -0.315311  0.00254 -0.144195  0.13654 -0.09109
## Potpal -1.68519 -0.199852  2.32945  0.558279  0.23074 -0.25627
## Poapra  0.60710  0.166498 -0.15189  0.164077  0.25719  0.05963
## Calcus -1.25179 -0.024437  0.18648  1.213098 -0.60626 -0.42266
## Tripra  1.06049 -0.045435  0.48790 -0.536955 -1.77008  0.52948
## Trirep  0.09018 -0.173311  0.19104 -0.006099  0.10233  0.27931
## Antodo  0.34515 -0.648581 -0.02741 -0.934434 -0.54225 -0.27585
## Salrep -0.67734 -1.194890 -0.81458  0.470121 -0.14927 -0.49681
## Achmil  0.84311 -0.390114  0.04982 -0.120906 -0.18623  0.15482
## Poatri  0.32724  0.523661 -0.06588 -0.149603  0.14705  0.39989
## Chealb -0.87273  1.439561 -0.12589 -1.166287  0.34406  0.92844
## Elyrep  0.67402  0.439441 -0.05561  0.212329  0.81665  0.34163
## Sagpro -0.18977  0.413840 -0.38749 -0.580719  0.62725 -0.03569
## Plalan  0.87667 -0.503952  0.32002 -0.243956 -0.74970  0.07749
## Agrsto -0.53992  0.678450  0.04437  0.303519  0.14484 -0.11011
## Lolper  0.82254  0.171363 -0.12671  0.244396  0.14938 -0.08128
## Alogen -0.20392  0.933599 -0.24562 -0.149106  0.35350  0.39106
## Brohor  0.88060  0.005436  0.02269  0.126893  0.42629  0.07325
## 
## 
## Site scores (weighted averages of species scores)
## 
##           CCA1     CCA2     CCA3      CA1     CA2      CA3
## row1   1.30733  0.52050 -0.10807  0.67156  1.5229  0.54566
## row2  -0.53317  1.35011 -0.84353 -1.16629  0.3441  0.92844
## row3   0.59360  1.60088 -0.76573  0.09661  1.3378 -1.35317
## row4  -1.67234  0.87023  0.77543  1.01571 -1.3732 -1.20479
## row5   1.23344 -0.59583  1.71239 -0.52391 -2.2494  0.64215
## row6   1.82267  1.04962 -1.25008  1.04448  1.0312 -0.69674
## row7  -0.60118  1.09273 -0.57638  0.44127 -0.4551 -0.10654
## row8   1.35816 -0.36499  1.34351 -0.85365 -0.8100  0.51841
## row9  -0.23644 -2.20690 -0.82467 -1.43658 -0.1851 -1.36159
## row10 -1.66957 -0.04445  2.46397  0.29039  0.3173 -0.32879
## row11  1.04331 -0.76431  0.46182  0.28509  0.2433  0.35529
## row12  0.52779 -1.06784 -0.38141  0.38132  0.5402 -0.82315
## row13  0.02359  0.98908 -0.96098  0.15283  0.9026  2.39822
## row14  0.03511 -1.48173  0.02349  0.89441  0.4946 -0.31849
## row15  0.66962  1.59633 -0.76475  0.19191  0.8725 -0.39087
## row16 -1.71708 -0.61475 -0.47319  1.92639 -0.8400  0.04092
## row17 -1.66869 -0.15401  3.11596  0.82617  0.1442 -0.18375
## row18 -0.78671 -2.41592 -2.93782 -2.38128  0.3580 -1.57136
## row19 -0.43877  1.09271 -0.77217 -1.14296  0.5297  1.49304
## row20  1.16321 -0.33658  0.68024 -0.25287 -1.5318  0.25885
## 
## 
## Site constraints (linear combinations of constraining variables)
## 
##          CCA1     CCA2       CCA3      CA1     CA2      CA3
## row1   1.0905 -0.31742  0.0006521  0.67156  1.5229  0.54566
## row2  -0.8727  1.43956 -0.1258921 -1.16629  0.3441  0.92844
## row3   0.8810  1.39963 -0.0343648  0.09661  1.3378 -1.35317
## row4  -0.8508  1.40305 -0.2881650  1.01571 -1.3732 -1.20479
## row5   1.0322 -0.22006  0.4333799 -0.52391 -2.2494  0.64215
## row6   1.4693  1.04731 -0.4282122  1.04448  1.0312 -0.69674
## row7  -0.7414  1.22051 -1.0995296  0.44127 -0.4551 -0.10654
## row8   0.8863  0.02333  1.5151993 -0.85365 -0.8100  0.51841
## row9   0.2402 -1.52454 -0.0420918 -1.43658 -0.1851 -1.36159
## row10 -1.7654 -0.06599  2.9244526  0.29039  0.3173 -0.32879
## row11  0.4551 -0.88477 -0.4458423  0.28509  0.2433  0.35529
## row12  0.9267 -1.04237  0.0257658  0.38132  0.5402 -0.82315
## row13 -0.5464 -0.47220 -0.9563308  0.15283  0.9026  2.39822
## row14  0.6826 -1.63347  0.6458803  0.89441  0.4946 -0.31849
## row15  0.8737  1.41180  0.0197262  0.19191  0.8725 -0.39087
## row16 -1.1819 -1.03955 -1.4028253  1.92639 -0.8400  0.04092
## row17 -1.6050 -0.33372  1.7344512  0.82617  0.1442 -0.18375
## row18 -1.1965 -1.01521 -1.2946433 -2.38128  0.3580 -1.57136
## row19 -0.5358  0.50832  0.1544659 -1.14296  0.5297  1.49304
## row20  1.3054  0.32236 -0.4030985 -0.25287 -1.5318  0.25885
## 
## 
## Biplot scores for constraining variables
## 
##             CCA1   CCA2    CCA3 CA1 CA2 CA3
## A1       -0.5541 0.1536  0.8182   0   0   0
## Moisture -0.9613 0.2111 -0.1771   0   0   0
## Manure    0.4376 0.8881 -0.1408   0   0   0

0.2135 + 0.1236

## [1] 0.3371

vegan::RsquareAdj(dune_cca)$adj.r.squared # b) What is the adjusted R2 of the model? 0.267

## [1] 0.2680708

anova(dune_cca) # c) permutation test to assess the overall statistical significance of the model, 999 permutations

## Permutation test for cca under reduced model
## Permutation: free
## Number of permutations: 999
## 
## Model: cca(formula = dune_bio ~ A1 + Moisture + Manure, data = dune_env)
##          Df ChiSquare     F Pr(>F)    
## Model     3   0.62215 3.308  0.001 ***
## Residual 16   1.00307                 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

anova(dune_cca, by="margin") # d) permutation test to assess the marginal statistical significance of each explanatory variable of the model. Which variables are significant? Moisture and manure

## Permutation test for cca under reduced model
## Marginal effects of terms
## Permutation: free
## Number of permutations: 999
## 
## Model: cca(formula = dune_bio ~ A1 + Moisture + Manure, data = dune_env)
##          Df ChiSquare      F Pr(>F)    
## A1        1   0.08492 1.3546  0.168    
## Moisture  1   0.24521 3.9113  0.001 ***
## Manure    1   0.20790 3.3162  0.002 ** 
## Residual 16   1.00307                  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

plot(dune_cca, scaling=2, xlab="CCA1 (21%)", ylab="CCA2 (12%)") # e) Make a triplot of the ordination results focusing on the sites

##11.5. Perform a permutational multivariate analysis of variance (PERMANOVA) # with 9,999 permutations using the dune_bio.txt dataset after calculating Bray- # Curtis distances and the first (A1), second (Moisture) and fifth (Manure) # variables of dune_env.txt as explanatory. # a) Which explanatory variables are significant? # b) What is the explanatory power (R2) of the significant variables? # c) Perform a new PERMANOVA analysis with 9,999 permutations with A1 as the # explanatory variable but constrain the permutations within the Use variable.

dune_bio<-read.table("dune_bio.txt", sep="\t", header=T)
dune_env<-read.table("dune_env.txt", sep="\t", header=T)
vegan::adonis2(vegdist(dune_bio, method="bray") ~ A1 + Moisture + Manure, data=dune_env, permutations=9999) # a) A1, Moisture, and Manure are significant

## Permutation test for adonis under reduced model
## Terms added sequentially (first to last)
## Permutation: free
## Number of permutations: 9999
## 
## vegan::adonis2(formula = vegdist(dune_bio, method = "bray") ~ A1 + Moisture + Manure, data = dune_env, permutations = 9999)
##          Df SumOfSqs      R2      F Pr(>F)    
## A1        1   0.5107 0.15417 5.8578  2e-04 ***
## Moisture  1   0.7196 0.21722 8.2537  1e-04 ***
## Manure    1   0.6874 0.20752 7.8849  3e-04 ***
## Residual 16   1.3949 0.42109                  
## Total    19   3.3126 1.00000                  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

# b)  A1 R2=0.15417, Moisture R2=0.21722, Manure R2=0.20752

vegan::adonis2(vegdist(dune_bio, method="bray") ~ Use/A1, data=dune_env, permutations=9999, strata=dune_env$Use)

## Permutation test for adonis under reduced model
## Terms added sequentially (first to last)
## Blocks:  strata 
## Permutation: free
## Number of permutations: 9999
## 
## vegan::adonis2(formula = vegdist(dune_bio, method = "bray") ~ Use/A1, data = dune_env, permutations = 9999, strata = dune_env$Use)
##          Df SumOfSqs      R2      F Pr(>F)    
## Use       2   0.4904 0.14805 1.9394  4e-04 ***
## Use:A1    3   1.0520 0.31757 2.7733  4e-04 ***
## Residual 14   1.7702 0.53438                  
## Total    19   3.3126 1.00000                  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1