Why simulate data?
Make sure to get data in a new code chunk, otherwise the pathname will not work.
# set seed to replicate simulation.
# alternatively, R will use the current time as the seed.
set.seed(10)
# get exemplar data
test1 <- read.csv("data/plant.growth.rate.csv",header=T)
test2 <- read.csv("data/Daphniagrowth.csv",header=T)
Simulate data for a simple linear regression. This is based on the example in Beckerman et al. (2017), which models plant growth rate as a function of soil moisture content.
# Set coefficients (see page 116)
alpha = 19.348
beta1 = 12.750
# set sample size
n = 50
# Set residual standard error (see page 116)
# this is the positive square root of the MSE
rse = 4.019
# get max and min values for soil.moisture
min=min(test1$soil.moisture.content)
max=max(test1$soil.moisture.content)
# randomize values of soil moisture across known range
data <- data.frame(soil.moisture=runif(n,min=min,max=max))
# generate response values
data1 <- data %>%
mutate(e = rnorm(n(),mean=0,sd=rse),
growth.rate =alpha+beta1*soil.moisture+e)
# used MSE as an estimate of error variance
# view data
head(data1)
## soil.moisture e growth.rate
## 1 1.1347617 -1.5017458 32.31447
## 2 0.7846470 -2.7632853 26.58896
## 3 0.9942158 -3.5052063 28.51904
## 4 1.4585608 -0.4089775 37.53567
## 5 0.3980350 -1.0199440 23.40300
## 6 0.6427731 -7.4501829 20.09317
# visualize
ggplot(data=data1,aes(x=soil.moisture,y=growth.rate))+
geom_point()+
geom_smooth(method=lm)
## `geom_smooth()` using formula = 'y ~ x'
# Fit lm
model = lm(growth.rate ~ soil.moisture, data=data1)
autoplot(model)
anova(model)
## Analysis of Variance Table
##
## Response: growth.rate
## Df Sum Sq Mean Sq F value Pr(>F)
## soil.moisture 1 1777.10 1777.10 134.46 1.599e-15 ***
## Residuals 48 634.38 13.22
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summary(model)
##
## Call:
## lm(formula = growth.rate ~ soil.moisture, data = data1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -6.5512 -2.6107 -0.5761 2.7677 9.2327
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 18.275 1.286 14.21 < 2e-16 ***
## soil.moisture 13.021 1.123 11.60 1.6e-15 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.635 on 48 degrees of freedom
## Multiple R-squared: 0.7369, Adjusted R-squared: 0.7315
## F-statistic: 134.5 on 1 and 48 DF, p-value: 1.599e-15
# compare to summary table (page 116) and ANOVA table (page 115)
Simulate data for a one-way ANOVA. This is based on the example in Beckerman et al. (2017), which models Daphnia growth rate as a function of parasite treatment.
This is a bit trickier because we have to deal with dummy variable coding.
# Set coefficients (see page 125)
alpha = 1.21391
beta1 = -0.41275
beta2 = -0.13755
beta3 = -0.73171
# set sample size
n = 10
# Set residual standard error (see page 123)
# this is the positive square root of the MSE
rse = 0.1799
# make sure parasite is a factor
test2$parasite=as.factor(test2$parasite)
# get dummy variable coding for each row
mod1 <- lm(data=test2,growth.rate~parasite)
dummy <- data.frame(model.matrix(mod1)) # this is a shortcut to get the proper coding
colnames(dummy) <- c("control","Metsch","Panspo","Pasteu") # rename variables
# add parasite variable
dummy$parasite = test2$parasite
# generate data
data <- dummy %>%
rowwise() %>%
mutate(e = rnorm(1,mean=0,sd=rse),
growth.rate =alpha*control+beta1*Metsch+beta2*Panspo+beta3*Pasteu+e) %>%
ungroup() %>%
select(-e)
head(data)
## # A tibble: 6 × 6
## control Metsch Panspo Pasteu parasite growth.rate
## <dbl> <dbl> <dbl> <dbl> <fct> <dbl>
## 1 1 0 0 0 control 1.42
## 2 1 0 0 0 control 0.948
## 3 1 0 0 0 control 1.14
## 4 1 0 0 0 control 1.02
## 5 1 0 0 0 control 1.49
## 6 1 0 0 0 control 1.32
# plot
ggplot(data=data, aes(x=parasite,y=growth.rate)) +
geom_boxplot()+
geom_point()+
xlab("Parasite")+
ylab("Growth rate")+
theme_classic()+
coord_flip()
# Fit an ANOVA and compare to real results
model = lm(growth.rate ~ parasite, data=data)
autoplot(model)
anova(model)
## Analysis of Variance Table
##
## Response: growth.rate
## Df Sum Sq Mean Sq F value Pr(>F)
## parasite 3 2.9242 0.97475 30.917 4.572e-10 ***
## Residuals 36 1.1350 0.03153
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summary(model)
##
## Call:
## lm(formula = growth.rate ~ parasite, data = data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.32914 -0.11328 0.04844 0.14007 0.34562
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.24941 0.05615 22.251 < 2e-16 ***
## parasiteMetschnikowia bicuspidata -0.39440 0.07941 -4.967 1.66e-05 ***
## parasitePansporella perplexa -0.21015 0.07941 -2.646 0.012 *
## parasitePasteuria ramosa -0.73635 0.07941 -9.273 4.48e-11 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1776 on 36 degrees of freedom
## Multiple R-squared: 0.7204, Adjusted R-squared: 0.6971
## F-statistic: 30.92 on 3 and 36 DF, p-value: 4.572e-10