在动物急性毒性试验中,使受试动物半数死亡的毒物浓度,用LC50表示;
使受试动物半数死亡的毒物剂量,称为半数致死量,用LD50表示;
半最大效应浓度(concentration for 50% of maximal effect,EC50)是指能引起50%最大效应的浓度。

rm(list = ls())
if (!require(pacman)) install.packages("pacman")
## Loading required package: pacman
pacman::p_load(tidyverse)

数据读取与转换

# Acute and chronic copper toxicity data used in the multiple llinear regression (MLR) analyses
df_EC = read.csv(file = "./data/EC50OR20S.csv") # data of EC50s or EC20s
df_EC = df_EC %>% 
  mutate(l_DOC = log(DOC),l_Hardness = log(Hardness),l_EC = log(ObservedEC50sOREC20s))# 取以e为底的对数变换
df_EC$Species = as.factor(df_EC$Species)
df_EC_50s = df_EC %>% filter(AcuteORChronic == "Acute") # data of EC50s for Acute
df_EC_20s = df_EC %>% filter(AcuteORChronic == "Chronic") # data of EC20s for Chronic
# Acute copper toxicity data used for species sensitivity distribution (SSD) development
df_LC50 = read.csv(file = "./data/Standardizedlc50.csv") # 
unique(df_EC_50s$Species)
colnames(df_EC)

通过绘图探索数据

pacman::p_load(ggplot2)
fig_DOC = df_EC_50s %>%
  ggplot(aes(x = DOC,y = ObservedEC50sOREC20s))+
  geom_point(aes(colour = Species,shape = Species))+
  labs(x = "DOC (mg/L)",
       y = "Observed EC50 (µg/L)")+
  theme_bw()+
  theme(legend.position = c(0.6,1),
        legend.justification = c(0,1),
        legend.background = element_rect(fill = NA),
        legend.text = element_text(face = "italic"),
        panel.grid.minor = element_blank())
fig_pH = df_EC_50s %>%
  ggplot(aes(x = pH,y = ObservedEC50sOREC20s))+
  geom_point(aes(colour = Species,shape = Species))+
  labs(x = "pH",
       y = "Observed EC50 (µg/L)")+
  theme_bw()+
  theme(legend.position = c(0.71,1),
        legend.justification = c(0,1),
        legend.background = element_rect(fill = NA),
        legend.text = element_text(face = "italic"),
        panel.grid.minor = element_blank())

fig_Hard = df_EC_50s %>%
  ggplot(aes(x = Hardness,y = ObservedEC50sOREC20s))+
  geom_point(aes(colour = Species,shape = Species))+
  labs(x = "Hardness (mg/L)",
       y = "Observed EC50 (µg/L)")+
  theme_bw()+
  theme(legend.position = c(0.71,1),
        legend.justification = c(0,1),
        legend.background = element_rect(fill = NA),
        legend.text = element_text(face = "italic"),
        panel.grid.minor = element_blank())

# pacman::p_load(patchwork)
# 
# fig_DOC/fig_pH/fig_Hard
fig_DOC

fig_pH

fig_Hard

pacman::p_load(ggplot2)
fig_lDOC = df_EC_50s %>%
  ggplot(aes(x = l_DOC,y = l_EC))+
  geom_point(aes(colour = Species,shape = Species))+
  geom_smooth(method = "lm",formula = y~x)+
  labs(x = "ln(DOC (mg/L))",
       y = "ln(EC50 (µg/L))")+
  theme_bw()+
  theme(legend.position = c(0.72,0.4),
        legend.justification = c(0,1),
        legend.background = element_rect(fill = NA),
        legend.text = element_text(face = "italic"),
        panel.grid.minor = element_blank())

fig_lHard = df_EC_50s %>%
  ggplot(aes(x = l_Hardness,y = l_EC))+
  geom_point(aes(colour = Species,shape = Species))+
  geom_smooth(method = "lm",formula = y~x)+
  labs(x = "ln(Hardness (mg/L))",
       y = "ln(EC50 (µg/L))")+
  theme_bw()+
  theme(legend.position = c(0.72,0.4),
        legend.justification = c(0,1),
        legend.background = element_rect(fill = NA),
        legend.text = element_text(face = "italic"),
        panel.grid.minor = element_blank())

fig_lpH = df_EC_50s %>%
  ggplot(aes(x = pH,y = l_EC))+
  geom_point(aes(colour = Species,shape = Species))+
  geom_smooth(method = "lm",formula = y~x)+
  labs(x = "pH",
       y = "ln(EC50 (µg/L))")+
  theme_bw()+
  theme(legend.position = c(0.72,0.4),
        legend.justification = c(0,1),
        legend.background = element_rect(fill = NA),
        legend.text = element_text(face = "italic"),
        panel.grid.minor = element_blank())
fig_lDOC

fig_lpH

fig_lHard

经过ln变换后的数据呈现出线性关系

建立三个一元线性回归模型

model.doc = lm(l_EC~l_DOC,data = df_EC_50s)
summary(model.doc)
## 
## Call:
## lm(formula = l_EC ~ l_DOC, data = df_EC_50s)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.9954 -0.9321 -0.0638  0.8246  3.9686 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  3.57297    0.05515   64.78   <2e-16 ***
## l_DOC        0.52999    0.03745   14.15   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.297 on 686 degrees of freedom
## Multiple R-squared:  0.226,  Adjusted R-squared:  0.2248 
## F-statistic: 200.3 on 1 and 686 DF,  p-value: < 2.2e-16

在模型l_EC~l_DOC,data = df_EC_50s中: * l_DOC对l_EC有显著的影响(p < 2.2e-16); * R2 = 0.226,l_DOC能解释22.6%的l_EC的变异;

同理对Hardness 和 pH建立一元线性回归模型

model.pH = lm(l_EC~pH,data = df_EC_50s)
summary(model.pH)

pH解释力度较小(7.97%)

model.hard = lm(l_EC~l_Hardness,data = df_EC_50s)
summary(model.hard)

硬度对l_EC的解释力度也较小为15.86%

plot(model.doc,which = 1)

plot(model.hard,which = 1)

plot(model.pH,which = 1)

残差图显示三个模型均有一定的非线性趋势,存在模型未解释的部分

残差正态性检验

plot(model.doc,which = 2)

plot(model.hard,which = 2)

plot(model.pH,which = 2)

大部分数据分布在对角线上,正态性良好,存在少数异常点

用W检验分别检验三个模型残差的正态性

shapiro.test(model.doc$residuals)
## 
##  Shapiro-Wilk normality test
## 
## data:  model.doc$residuals
## W = 0.99238, p-value = 0.001356
shapiro.test(model.hard$residuals)
## 
##  Shapiro-Wilk normality test
## 
## data:  model.hard$residuals
## W = 0.99522, p-value = 0.03118
shapiro.test(model.pH$residuals)
## 
##  Shapiro-Wilk normality test
## 
## data:  model.pH$residuals
## W = 0.99217, p-value = 0.001096

W检验表明残差并不是正态分布的

检验等方差性

plot(model.doc,which = 3)

plot(model.hard,which = 3)

plot(model.pH,which = 3)

残差随着预测值的增大有增大的趋势,存在一定程度的异方差性

上述的一元线性模型的解释力度不足,最大解释力度的模型为以DOC为自变量的一元线性回归,其也只解释了22.6%的EC的变异
并且上述模型在残差正态性,等方差性的基本假设条件不完全符合
因此不直接采用上述模型,寻找解释力度更高的模型,如多元的线性回归模型或者是加入高次项进行多项式回归

构建多元线性回归模型

model.multi_1<- lm(l_EC ~ l_DOC + l_Hardness + pH,data = df_EC_50s) # 不考虑交互效应
summary(model.multi_1)
## 
## Call:
## lm(formula = l_EC ~ l_DOC + l_Hardness + pH, data = df_EC_50s)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.8755 -0.7434 -0.1010  0.6566  3.4913 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -5.57046    0.48569 -11.469   <2e-16 ***
## l_DOC        0.66653    0.03183  20.944   <2e-16 ***
## l_Hardness   0.48068    0.04815   9.983   <2e-16 ***
## pH           0.93358    0.06356  14.688   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.03 on 684 degrees of freedom
## Multiple R-squared:  0.5138, Adjusted R-squared:  0.5117 
## F-statistic:   241 on 3 and 684 DF,  p-value: < 2.2e-16
model.multi_2<- lm(l_EC ~ (l_DOC + l_Hardness + pH)^2,data = df_EC_50s) # 考虑最多两项之间的交互效应
summary(model.multi_2)
## 
## Call:
## lm(formula = l_EC ~ (l_DOC + l_Hardness + pH)^2, data = df_EC_50s)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.9931 -0.6864 -0.1197  0.5994  2.9288 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      -6.33560    2.12560  -2.981 0.002979 ** 
## l_DOC             2.53467    0.36170   7.008 5.84e-12 ***
## l_Hardness        0.50983    0.52844   0.965 0.334999    
## pH                0.95869    0.27923   3.433 0.000632 ***
## l_DOC:l_Hardness -0.27419    0.03968  -6.911 1.11e-11 ***
## l_DOC:pH         -0.09759    0.04824  -2.023 0.043462 *  
## l_Hardness:pH     0.01420    0.06854   0.207 0.835939    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.9804 on 681 degrees of freedom
## Multiple R-squared:  0.5612, Adjusted R-squared:  0.5573 
## F-statistic: 145.1 on 6 and 681 DF,  p-value: < 2.2e-16
model.multi_3=lm(l_EC ~ Species + l_DOC + l_Hardness + pH,data = df_EC_50s) #加入物种差异的多元线性回归模型
summary(model.multi_3)
## 
## Call:
## lm(formula = l_EC ~ Species + l_DOC + l_Hardness + pH, data = df_EC_50s)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.22409 -0.38834  0.00511  0.38433  2.37967 
## 
## Coefficients:
##                            Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                -7.31613    0.33237 -22.012  < 2e-16 ***
## SpeciesDaphnia magna        0.66090    0.08074   8.186 1.33e-15 ***
## SpeciesDaphnia obtusa       0.89482    0.11899   7.520 1.74e-13 ***
## SpeciesDaphnia pulex        0.38691    0.13534   2.859  0.00438 ** 
## SpeciesPimephales promelas  2.25685    0.08634  26.140  < 2e-16 ***
## l_DOC                       0.78406    0.02100  37.342  < 2e-16 ***
## l_Hardness                  0.57920    0.03150  18.388  < 2e-16 ***
## pH                          0.96103    0.04130  23.270  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.6507 on 680 degrees of freedom
## Multiple R-squared:  0.807,  Adjusted R-squared:  0.805 
## F-statistic: 406.1 on 7 and 680 DF,  p-value: < 2.2e-16
#考虑物种间差异并且考虑物种和不同变量之间的交互效应
options(contrasts=c("contr.sum", "contr.poly")) 
# 设定无序因子的默认对比方法为contr.sum,
# 有序因子的默认对比方法为contr.poly
# contr.sum:对照变量系数之和限制为0。也称作偏差找对,对各水平的均值与所有水平的均值进行比较
# 因子变量的最后一个水平的系数等于前面所有水平系数的和的相反数,因此求和等于0
# 在这样的设置下运行的模型的结果表示的是在每个Level上的效用的大小(这里的效用是比较与总体的均值的偏差,主要用于比较各个Level的影响程度
#contr.poly:基于正交多项式的对照,用于趋势分析(线性、二次、三次等)和等距水平的有序因子

model.multi_4=lm(l_EC ~ Species + l_DOC + l_Hardness + pH+Species:l_DOC + Species:l_Hardness + Species:pH,data = df_EC_50s) 
summary(model.multi_4)
## 
## Call:
## lm(formula = l_EC ~ Species + l_DOC + l_Hardness + pH + Species:l_DOC + 
##     Species:l_Hardness + Species:pH, data = df_EC_50s)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.93064 -0.28801  0.01755  0.33018  2.08860 
## 
## Coefficients:
##                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)         -4.92771    0.89884  -5.482 5.96e-08 ***
## Species1             1.06187    1.13716   0.934  0.35075    
## Species2            -2.11700    0.94436  -2.242  0.02531 *  
## Species3             2.38576    2.15936   1.105  0.26962    
## Species4            -0.19381    2.88673  -0.067  0.94649    
## l_DOC                0.79195    0.02959  26.764  < 2e-16 ***
## l_Hardness           0.43264    0.05655   7.651 6.99e-14 ***
## pH                   0.84414    0.13284   6.354 3.87e-10 ***
## Species1:l_DOC      -0.16645    0.05137  -3.240  0.00125 ** 
## Species2:l_DOC       0.14887    0.03713   4.010 6.77e-05 ***
## Species3:l_DOC       0.05240    0.06222   0.842  0.39998    
## Species4:l_DOC       0.08986    0.09015   0.997  0.31923    
## Species1:l_Hardness -0.42055    0.09393  -4.477 8.89e-06 ***
## Species2:l_Hardness  0.06766    0.06459   1.048  0.29522    
## Species3:l_Hardness -0.20382    0.12160  -1.676  0.09419 .  
## Species4:l_Hardness -0.04535    0.17524  -0.259  0.79587    
## Species1:pH          0.02104    0.16182   0.130  0.89657    
## Species2:pH          0.19250    0.13754   1.400  0.16209    
## Species3:pH         -0.21233    0.31515  -0.674  0.50070    
## Species4:pH         -0.01773    0.43277  -0.041  0.96733    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.5706 on 668 degrees of freedom
## Multiple R-squared:  0.8542, Adjusted R-squared:   0.85 
## F-statistic: 205.9 on 19 and 668 DF,  p-value: < 2.2e-16
# 在线性回归分析的结果中可以发现多出了species1-4共4个变量,这是因为在做线性模型遇到因子类型的变量的时候lm函数会自动创建因子的level数-1个变量进行编码

# 在上面model.multi_4的基础上添加三个环境因子之间的最多两个因子之间的相互作用项
model.multi_5=lm(l_EC ~ (Species + l_DOC + l_Hardness + pH)^2,data = df_EC_50s) 
summary(model.multi_5)
## 
## Call:
## lm(formula = l_EC ~ (Species + l_DOC + l_Hardness + pH)^2, data = df_EC_50s)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.93553 -0.29281  0.00119  0.30703  1.80985 
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)         -2.379913   1.605845  -1.482 0.138805    
## Species1             1.382993   1.113091   1.242 0.214497    
## Species2            -2.167604   0.907085  -2.390 0.017143 *  
## Species3             1.963019   2.072637   0.947 0.343926    
## Species4             0.516868   2.764292   0.187 0.851733    
## l_DOC                2.139495   0.210681  10.155  < 2e-16 ***
## l_Hardness          -0.484661   0.342249  -1.416 0.157213    
## pH                   0.493240   0.217336   2.269 0.023559 *  
## Species1:l_DOC      -0.096271   0.050839  -1.894 0.058708 .  
## Species2:l_DOC       0.172115   0.036369   4.732 2.71e-06 ***
## Species3:l_DOC       0.034788   0.059861   0.581 0.561338    
## Species4:l_DOC       0.009429   0.087813   0.107 0.914521    
## Species1:l_Hardness -0.282216   0.091649  -3.079 0.002160 ** 
## Species2:l_Hardness  0.110996   0.062373   1.780 0.075609 .  
## Species3:l_Hardness -0.285853   0.117231  -2.438 0.015014 *  
## Species4:l_Hardness -0.116431   0.167947  -0.693 0.488388    
## Species1:pH         -0.092678   0.158496  -0.585 0.558926    
## Species2:pH          0.187390   0.132107   1.418 0.156522    
## Species3:pH         -0.120296   0.302752  -0.397 0.691243    
## Species4:pH         -0.080633   0.414236  -0.195 0.845723    
## l_DOC:l_Hardness    -0.087795   0.024536  -3.578 0.000371 ***
## l_DOC:pH            -0.131802   0.028155  -4.681 3.46e-06 ***
## l_Hardness:pH        0.121011   0.043611   2.775 0.005679 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.5456 on 665 degrees of freedom
## Multiple R-squared:  0.8673, Adjusted R-squared:  0.8629 
## F-statistic: 197.5 on 22 and 665 DF,  p-value: < 2.2e-16

上面的模型model.multi_4的整体的F检验的p-value: < 2.2e-16 R2=0.8542,已经有85%的变异被解释 model.multi_5增加了3个变量只能多解释大约1%因此交互效应的加入可能是不需要的

library(MASS)
## 
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
## 
##     select
AIC_model.multi_4 = stepAIC(model.multi_4, direction = c("both"), k = 2) #基于赤池信息准则
## Start:  AIC=-752.26
## l_EC ~ Species + l_DOC + l_Hardness + pH + Species:l_DOC + Species:l_Hardness + 
##     Species:pH
## 
##                      Df Sum of Sq    RSS     AIC
## - Species:pH          4     1.878 219.39 -754.35
## <none>                            217.51 -752.26
## - Species:l_DOC       4    16.938 234.45 -708.67
## - Species:l_Hardness  4    32.650 250.16 -664.04
## 
## Step:  AIC=-754.35
## l_EC ~ Species + l_DOC + l_Hardness + pH + Species:l_DOC + Species:l_Hardness
## 
##                      Df Sum of Sq    RSS     AIC
## <none>                            219.39 -754.35
## + Species:pH          4     1.878 217.51 -752.26
## - Species:l_DOC       4    15.796 235.18 -714.51
## - Species:l_Hardness  4    42.958 262.35 -639.32
## - pH                  1   222.758 442.15 -274.20
BIC_model.multi_4 = stepAIC(model.multi_4, direction = c("both"), k = log(length(df_EC_50s$Species)))# 基于贝叶斯信息准则进行逐步回归
## Start:  AIC=-661.59
## l_EC ~ Species + l_DOC + l_Hardness + pH + Species:l_DOC + Species:l_Hardness + 
##     Species:pH
## 
##                      Df Sum of Sq    RSS     AIC
## - Species:pH          4     1.878 219.39 -681.81
## <none>                            217.51 -661.59
## - Species:l_DOC       4    16.938 234.45 -636.13
## - Species:l_Hardness  4    32.650 250.16 -591.50
## 
## Step:  AIC=-681.81
## l_EC ~ Species + l_DOC + l_Hardness + pH + Species:l_DOC + Species:l_Hardness
## 
##                      Df Sum of Sq    RSS     AIC
## <none>                            219.39 -681.81
## + Species:pH          4     1.878 217.51 -661.59
## - Species:l_DOC       4    15.796 235.18 -660.11
## - Species:l_Hardness  4    42.958 262.35 -584.91
## - pH                  1   222.758 442.15 -206.19
df_EC_50s$fitbic<-BIC_model.multi_4$fitted.values #储存模型拟合的结果    
df_EC_50s$residbic<-BIC_model.multi_4$residuals  
library(car)   
## Loading required package: carData
## 
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
## 
##     recode
## The following object is masked from 'package:purrr':
## 
##     some
Anova(BIC_model.multi_4, type="III") #对模型进行方差分析,显示显著的系数
## Anova Table (Type III tests)
## 
## Response: l_EC
##                     Sum Sq  Df F value    Pr(>F)    
## (Intercept)        131.252   1 402.035 < 2.2e-16 ***
## Species             13.946   4  10.679 2.142e-08 ***
## l_DOC              289.160   1 885.717 < 2.2e-16 ***
## l_Hardness          36.830   1 112.814 < 2.2e-16 ***
## pH                 222.758   1 682.324 < 2.2e-16 ***
## Species:l_DOC       15.796   4  12.096 1.685e-09 ***
## Species:l_Hardness  42.958   4  32.896 < 2.2e-16 ***
## Residuals          219.388 672                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
ncvTest(BIC_model.multi_4) #car包的一个函数,对模型的等方差性进行检验
## Non-constant Variance Score Test 
## Variance formula: ~ fitted.values 
## Chisquare = 1.545188, Df = 1, p = 0.21385
# ncvTest():生成一个计分检验。若检验显著,则拒绝零假设。
# 零假设:误差方差不变
# 备择假设:误差方差随着拟合值水平变化而变化
vif(BIC_model.multi_4) # 计算方差膨胀系数
##                            GVIF Df GVIF^(1/(2*Df))
## Species            3.986506e+05  4        5.012727
## l_DOC              2.642600e+00  1        1.625608
## l_Hardness         1.998445e+00  1        1.413664
## pH                 1.326044e+00  1        1.151540
## Species:l_DOC      6.778907e+00  4        1.270267
## Species:l_Hardness 4.361559e+05  4        5.069384
# car包中的vif函数计算vif值,一般原则下,如果(vif)^(1/2) 
# >2就表明村庄多重共线问题(结果为TRUE)。
# 因为多重共线性的存在意味着在存在其他变量的情况下该变量提供的有关响应的信息是多余的 
shapiro.test(BIC_model.multi_4$residuals)# 残差正态性检验
## 
##  Shapiro-Wilk normality test
## 
## data:  BIC_model.multi_4$residuals
## W = 0.98409, p-value = 8.209e-07