Why simulate data?

Get data

Make sure to get data in a new code chunk, otherwise the pathname will not work.

# set seed to replicate simulation.
# alternatively, R will use the current time as the seed.
set.seed(10)


# get exemplar data
test1 <- read.csv("data/plant.growth.rate.csv",header=T)
test2 <- read.csv("data/Daphniagrowth.csv",header=T)

Simple linear regression

Simulate data for a simple linear regression. This is based on the example in Beckerman et al. (2017), which models plant growth rate as a function of soil moisture content.

# Set coefficients (see page 116)
alpha = 19.348
beta1 = 12.750

# set sample size
n = 50

# Set residual standard error (see page 116)
# this is the positive square root of the MSE
rse = 4.019

# get max and min values for soil.moisture
min=min(test1$soil.moisture.content)
max=max(test1$soil.moisture.content)

# randomize values of soil moisture across known range
data <- data.frame(soil.moisture=runif(n,min=min,max=max))

# generate response values
data1 <- data %>%
  mutate(e = rnorm(n(),mean=0,sd=rse),
         growth.rate =alpha+beta1*soil.moisture+e) 
  # used MSE as an estimate of error variance

# view data
head(data1)
##   soil.moisture          e growth.rate
## 1     1.1347617 -1.5017458    32.31447
## 2     0.7846470 -2.7632853    26.58896
## 3     0.9942158 -3.5052063    28.51904
## 4     1.4585608 -0.4089775    37.53567
## 5     0.3980350 -1.0199440    23.40300
## 6     0.6427731 -7.4501829    20.09317
# visualize
ggplot(data=data1,aes(x=soil.moisture,y=growth.rate))+
  geom_point()+
  geom_smooth(method=lm)
## `geom_smooth()` using formula = 'y ~ x'

# Fit lm 
model = lm(growth.rate ~ soil.moisture, data=data1)
autoplot(model)

anova(model)
## Analysis of Variance Table
## 
## Response: growth.rate
##               Df  Sum Sq Mean Sq F value    Pr(>F)    
## soil.moisture  1 1777.10 1777.10  134.46 1.599e-15 ***
## Residuals     48  634.38   13.22                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summary(model)
## 
## Call:
## lm(formula = growth.rate ~ soil.moisture, data = data1)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -6.5512 -2.6107 -0.5761  2.7677  9.2327 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     18.275      1.286   14.21  < 2e-16 ***
## soil.moisture   13.021      1.123   11.60  1.6e-15 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.635 on 48 degrees of freedom
## Multiple R-squared:  0.7369, Adjusted R-squared:  0.7315 
## F-statistic: 134.5 on 1 and 48 DF,  p-value: 1.599e-15
# compare to summary table (page 116) and ANOVA table (page 115)

One-way ANOVA

Simulate data for a one-way ANOVA. This is based on the example in Beckerman et al. (2017), which models Daphnia growth rate as a function of parasite treatment.

This is a bit trickier because we have to deal with dummy variable coding.

# Set coefficients (see page 125)
alpha = 1.21391
beta1 = -0.41275
beta2 = -0.13755
beta3 = -0.73171

# set sample size
n = 10

# Set residual standard error (see page 123)
# this is the positive square root of the MSE
rse = 0.1799

# make sure parasite is a factor
test2$parasite=as.factor(test2$parasite)

# get dummy variable coding for each row
mod1 <- lm(data=test2,growth.rate~parasite)
dummy <- data.frame(model.matrix(mod1))  # this is a shortcut to get the proper coding
colnames(dummy) <- c("control","Metsch","Panspo","Pasteu") # rename variables

# add parasite variable
dummy$parasite = test2$parasite

# generate data
data <- dummy %>%
  rowwise() %>%
  mutate(e = rnorm(1,mean=0,sd=rse),
         growth.rate =alpha*control+beta1*Metsch+beta2*Panspo+beta3*Pasteu+e) %>%
  ungroup() %>%
  select(-e)

head(data)
## # A tibble: 6 × 6
##   control Metsch Panspo Pasteu parasite growth.rate
##     <dbl>  <dbl>  <dbl>  <dbl> <fct>          <dbl>
## 1       1      0      0      0 control        1.42 
## 2       1      0      0      0 control        0.948
## 3       1      0      0      0 control        1.14 
## 4       1      0      0      0 control        1.02 
## 5       1      0      0      0 control        1.49 
## 6       1      0      0      0 control        1.32
# plot

ggplot(data=data, aes(x=parasite,y=growth.rate)) +
  geom_boxplot()+
  geom_point()+
  xlab("Parasite")+
  ylab("Growth rate")+
  theme_classic()+
  coord_flip()

# Fit an ANOVA and compare to real results
model = lm(growth.rate ~ parasite, data=data)
autoplot(model)

anova(model)
## Analysis of Variance Table
## 
## Response: growth.rate
##           Df Sum Sq Mean Sq F value    Pr(>F)    
## parasite   3 2.9242 0.97475  30.917 4.572e-10 ***
## Residuals 36 1.1350 0.03153                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summary(model)
## 
## Call:
## lm(formula = growth.rate ~ parasite, data = data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.32914 -0.11328  0.04844  0.14007  0.34562 
## 
## Coefficients:
##                                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                        1.24941    0.05615  22.251  < 2e-16 ***
## parasiteMetschnikowia bicuspidata -0.39440    0.07941  -4.967 1.66e-05 ***
## parasitePansporella perplexa      -0.21015    0.07941  -2.646    0.012 *  
## parasitePasteuria ramosa          -0.73635    0.07941  -9.273 4.48e-11 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1776 on 36 degrees of freedom
## Multiple R-squared:  0.7204, Adjusted R-squared:  0.6971 
## F-statistic: 30.92 on 3 and 36 DF,  p-value: 4.572e-10