# load the data for ENA indices
setwd("~/Dropbox/Xu ly so lieu")
load("/Users/LDA/Dropbox/Xu ly so lieu/ENA.rda")

shapiro.test(ENA$TE)

## 
##  Shapiro-Wilk normality test
## 
## data:  ENA$TE
## W = 0.9658, p-value = 0.000686

p1 <- ggplot(data=ENA, aes(x=Ecosystem.type, y=TE, colour=Ecosystem.type))+
  geom_boxplot()
p1

#system.time(data <- read.xlsx("data for analysis.xlsx",1))
load("data for analysis.rda")
df <- data
#str(df)

1. Model with all species in all ecosystems.

1.1 Correlation of log10(SPPR) vs predictors

df_cor <- df[,which(colnames(df) %in% c("SPPR", "TL", "TE", "PB", "QB", "PQ","NPP"))]
pairs(df_cor, lower.panel=panel.smooth,upper.panel = panel.cor)

log10(SPPR) is possitvely correlated with Trophic level(TL) and negatively correlated with growth efficiency (PQ), but at lower level. Therefore we can test the regression models with predictors are TL, PQ and Ecosystem.type

1.2. Regression model with TL is only predictor

model <- lm(data=df, SPPR~TL)
summary(model)

## 
## Call:
## lm(formula = SPPR ~ TL, data = df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.6890 -0.4339 -0.0769  0.3502  4.2363 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -1.34010    0.05403  -24.80   <2e-16 ***
## TL           1.15510    0.01644   70.27   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.6813 on 3304 degrees of freedom
## Multiple R-squared:  0.5991, Adjusted R-squared:  0.599 
## F-statistic:  4938 on 1 and 3304 DF,  p-value: < 2.2e-16

Cross-validation

cross.val <- shrinkage(model, 1, nrow(df))

## Original R-square = 0.5991309 
## 3306 Fold Cross-Validated R-square = 0.5986087 
## Change = 0.0005221947

cross.val

## NULL

Model diagnostic

autoplot(model, data = df,colour = "TYPE", label.size = 3)

2. Consider for fish only

# test model for fish
df_fish <- subset(df, TYPE =="F")
idx <- which(df_fish$landing!=0 | df_fish$discard!=0)  # get only harvested species
df_catch <- df_fish[idx,]

2.1. Model for all fish species

2.1.1. Correlation of log10(SPPR) vs predictors

df_fish_cor <- df_fish[,which(colnames(df_fish) %in% c("SPPR", "TL", "TE", "PB", "QB", "PQ","NPP"))]
pairs(df_fish_cor, lower.panel=panel.smooth,upper.panel = panel.cor)

2.1.2 Regression with TL as only predictor

## model for all fish species
model1 <- lm(data=df_fish, SPPR~TL)
summary(model1)

## 
## Call:
## lm(formula = SPPR ~ TL, data = df_fish)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.6700 -0.4822 -0.0499  0.3540  4.1623 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -0.98295    0.09239  -10.64   <2e-16 ***
## TL           1.06508    0.02599   40.97   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.6984 on 2259 degrees of freedom
## Multiple R-squared:  0.4263, Adjusted R-squared:  0.4261 
## F-statistic:  1679 on 1 and 2259 DF,  p-value: < 2.2e-16

Cross-validation

cross.val <- shrinkage(model, 1, nrow(df_fish))

## Original R-square = 0.5991309 
## 2261 Fold Cross-Validated R-square = 0.5985687 
## Change = 0.000562265

Model diagnostic

autoplot(model1, data = df_fish,
         colour = 'Ecosystem.type', label.size = 1)

df_s1 <- data.frame(fitted.values = model1$fitted.values, residuals=model1$residuals, observed.values=df_fish$SPPR, standardized.residuals=model1$residuals/sd(model1$residuals), Ecosystem.type=df_fish$Ecosystem.type)
p1 <- ggplot(data=df_s1, aes(y=residuals, x=fitted.values))+
  geom_point(aes(colour=Ecosystem.type))+
  geom_hline(yintercept=0, colour="red")+
  scale_y_continuous(limits = c(-2, 2), breaks=c(-1, -0.5, -0.25, 0, 0.25, 0.5,1))

p2 <- ggplot(data=df_s1, aes(x=fitted.values, y=observed.values))+
  geom_point(aes(colour=Ecosystem.type))+
  geom_abline(slope=1, intercept=0, colour="red")
  
p3 <- ggplot(data=df_s1, aes(x=fitted.values, y=standardized.residuals))+
  geom_point(aes(colour=Ecosystem.type))+
  geom_hline(yintercept = 0, colour="red")

vplayout <- function(x, y)
  viewport(layout.pos.row = x, layout.pos.col = y)
pushViewport(viewport(layout=grid.layout(3,1)))
print(p1+theme(plot.margin=unit(c(1,1,1,1), "mm")), vp=vplayout(1,1))

## Warning: Removed 34 rows containing missing values (geom_point).

print(p2+theme(plot.margin=unit(c(1,1,1,1), "mm")), vp=vplayout(2,1))
print(p3+theme(plot.margin=unit(c(1,1,1,1), "mm")), vp=vplayout(3,1))

2.2 Model for harvested fish species

2.1.1 Correlation of log10(SPPR) vs predictors

df_catch_cor <- df_catch[,which(colnames(df_catch) %in% c("SPPR", "TL", "TE", "PB", "QB", "PQ","NPP"))]
pairs(df_catch_cor, lower.panel=panel.smooth,upper.panel = panel.cor)

## model for harvested fish species
model2 <- lm(data=df_catch, SPPR~TL)
summary(model2)

## 
## Call:
## lm(formula = SPPR ~ TL, data = df_catch)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.6686 -0.4578 -0.0394  0.3555  3.9703 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -0.97144    0.10301   -9.43   <2e-16 ***
## TL           1.06199    0.02859   37.14   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.6662 on 1736 degrees of freedom
## Multiple R-squared:  0.4428, Adjusted R-squared:  0.4424 
## F-statistic:  1379 on 1 and 1736 DF,  p-value: < 2.2e-16

autoplot(model1, label.size = 3)

df_s2 <- data.frame(fitted.values = model2$fitted.values, residuals=model2$residuals, observed.values=df_catch$SPPR, standardized.residuals=model2$residuals/sd(model2$residuals), Ecosystem.type=df_catch$Ecosystem.type)

p4 <- ggplot(data=df_s2, aes(y=residuals, x=fitted.values))+
  geom_point(aes(colour=Ecosystem.type))+
  geom_hline(yintercept=0, colour="red")+
  scale_y_continuous(limits = c(-2, 2), breaks=c(-1, -0.5, -0.25, 0, 0.25, 0.5,1))

p5 <- ggplot(data=df_s2, aes(x=fitted.values, y=observed.values))+
  geom_point(aes(colour=Ecosystem.type))+
  geom_abline(slope=1, intercept=0, colour="red")

p6 <- ggplot(data=df_s2, aes(x=fitted.values, y=standardized.residuals))+
  geom_point(aes(colour=Ecosystem.type))+
  geom_hline(yintercept = 0, colour="red")


vplayout <- function(x, y)
  viewport(layout.pos.row = x, layout.pos.col = y)
pushViewport(viewport(layout=grid.layout(3,1)))
print(p4+theme(plot.margin=unit(c(1,1,1,1), "mm")), vp=vplayout(1,1))

## Warning: Removed 21 rows containing missing values (geom_point).

print(p5+theme(plot.margin=unit(c(1,1,1,1), "mm")), vp=vplayout(2,1))
print(p6+theme(plot.margin=unit(c(1,1,1,1), "mm")), vp=vplayout(3,1))

2.2.2. Model with only TL for different ecosystem type

df_id <- df_catch
df_fit = df_id %>% group_by(Ecosystem.type) %>%
  do(fit2 = lm(SPPR ~ TL , data = .))
df_fitCoef = tidy(df_fit, fit2)
datatable(df_fitCoef)

3. Model for harvested fish with TL and PQ as predictors

3.1. Model for all ecosystem with TL and PQ as predictors

model3 <- lm(data=df_catch, SPPR~TL+PQ)
summary(model3)

## 
## Call:
## lm(formula = SPPR ~ TL + PQ, data = df_catch)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.8304 -0.4259 -0.0800  0.3262  3.7512 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -0.67718    0.09982  -6.784  1.6e-11 ***
## TL           1.08053    0.02713  39.834  < 2e-16 ***
## PQ          -2.08328    0.14783 -14.092  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.6312 on 1735 degrees of freedom
## Multiple R-squared:    0.5,  Adjusted R-squared:  0.4994 
## F-statistic: 867.5 on 2 and 1735 DF,  p-value: < 2.2e-16

autoplot(model1, label.size = 3)

df_s3 <- data.frame(fitted.values = model3$fitted.values, residuals=model3$residuals, observed.values=df_catch$SPPR, standardized.residuals=model3$residuals/sd(model3$residuals), Ecosystem.type=df_catch$Ecosystem.type)

p7 <- ggplot(data=df_s3, aes(y=residuals, x=fitted.values))+
  geom_point(aes(colour=Ecosystem.type))+
  geom_hline(yintercept=0, colour="red")+
  scale_y_continuous(limits = c(-2, 2), breaks=c(-1, -0.5, -0.25, 0, 0.25, 0.5,1))

p8 <- ggplot(data=df_s3, aes(x=fitted.values, y=observed.values))+
  geom_point(aes(colour=Ecosystem.type))+
  geom_abline(slope=1, intercept=0, colour="red")

p9 <- ggplot(data=df_s3, aes(x=fitted.values, y=standardized.residuals))+
  geom_point(aes(colour=Ecosystem.type))+
  geom_hline(yintercept = 0, colour="red")


vplayout <- function(x, y)
  viewport(layout.pos.row = x, layout.pos.col = y)
pushViewport(viewport(layout=grid.layout(3,1)))
print(p7+theme(plot.margin=unit(c(1,1,1,1), "mm")), vp=vplayout(1,1))

## Warning: Removed 20 rows containing missing values (geom_point).

print(p8+theme(plot.margin=unit(c(1,1,1,1), "mm")), vp=vplayout(2,1))
print(p9+theme(plot.margin=unit(c(1,1,1,1), "mm")), vp=vplayout(3,1))

3.2. Models with TL, PQ as predictors for different ecosystem type

2.2.2. Model with only TL for different ecosystem type

df_id <- df_catch
df_fit = df_id %>% group_by(Ecosystem.type) %>%
  do(fit2 = lm(SPPR ~ TL+PQ , data = .))
df_fitCoef = tidy(df_fit, fit2)
datatable(df_fitCoef)

4. Model with Ecosystem.type as a predictor

model4 <- lm(data=df_catch, SPPR~TL*Ecosystem.type)
k<-summary(model4)
k

## 
## Call:
## lm(formula = SPPR ~ TL * Ecosystem.type, data = df_catch)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.3449 -0.4522 -0.0312  0.3386  3.6698 
## 
## Coefficients:
##                                    Estimate Std. Error t value Pr(>|t|)
## (Intercept)                         -0.1951     0.3745  -0.521   0.6025
## TL                                   0.7988     0.1083   7.374 2.55e-13
## Ecosystem.typeChanel/Strait         -0.9459     0.6078  -1.556   0.1198
## Ecosystem.typeCoastal lagoon        -0.1616     0.7600  -0.213   0.8316
## Ecosystem.typeContinental shelf     -0.7964     0.4008  -1.987   0.0471
## Ecosystem.typeCoral reef            -0.8602     0.9105  -0.945   0.3449
## Ecosystem.typeEstuary               -0.7731     1.0417  -0.742   0.4581
## Ecosystem.typeOpen ocean            -1.8056     0.4599  -3.926 8.98e-05
## Ecosystem.typeUpwelling             -0.3009     0.4610  -0.653   0.5140
## TL:Ecosystem.typeChanel/Strait       0.2597     0.1727   1.504   0.1328
## TL:Ecosystem.typeCoastal lagoon      0.2133     0.2389   0.893   0.3721
## TL:Ecosystem.typeContinental shelf   0.2655     0.1153   2.303   0.0214
## TL:Ecosystem.typeCoral reef          0.4639     0.2704   1.716   0.0864
## TL:Ecosystem.typeEstuary             0.2028     0.3021   0.671   0.5022
## TL:Ecosystem.typeOpen ocean          0.5550     0.1292   4.297 1.83e-05
## TL:Ecosystem.typeUpwelling           0.1377     0.1326   1.038   0.2995
##                                       
## (Intercept)                           
## TL                                 ***
## Ecosystem.typeChanel/Strait           
## Ecosystem.typeCoastal lagoon          
## Ecosystem.typeContinental shelf    *  
## Ecosystem.typeCoral reef              
## Ecosystem.typeEstuary                 
## Ecosystem.typeOpen ocean           ***
## Ecosystem.typeUpwelling               
## TL:Ecosystem.typeChanel/Strait        
## TL:Ecosystem.typeCoastal lagoon       
## TL:Ecosystem.typeContinental shelf *  
## TL:Ecosystem.typeCoral reef        .  
## TL:Ecosystem.typeEstuary              
## TL:Ecosystem.typeOpen ocean        ***
## TL:Ecosystem.typeUpwelling            
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.653 on 1722 degrees of freedom
## Multiple R-squared:  0.4689, Adjusted R-squared:  0.4643 
## F-statistic: 101.3 on 15 and 1722 DF,  p-value: < 2.2e-16

m<-data.frame(k$coefficients)
datatable(round(m, digits=4))

Inferring new equation to calculate the SPPR of harvested fish species in marine ecosystem using multiple linear regression

LDA

Nov 24, 2016

1. Model with all species in all ecosystems.

1.1 Correlation of log10(SPPR) vs predictors

1.2. Regression model with TL is only predictor

2. Consider for fish only

2.1. Model for all fish species

2.1.1. Correlation of log10(SPPR) vs predictors

2.1.2 Regression with TL as only predictor

2.2 Model for harvested fish species

2.1.1 Correlation of log10(SPPR) vs predictors

2.2.2. Model with only TL for different ecosystem type

3. Model for harvested fish with TL and PQ as predictors

3.1. Model for all ecosystem with TL and PQ as predictors

3.2. Models with TL, PQ as predictors for different ecosystem type

2.2.2. Model with only TL for different ecosystem type

4. Model with Ecosystem.type as a predictor