# load the data for ENA indices
setwd("~/Dropbox/Xu ly so lieu")
load("/Users/LDA/Dropbox/Xu ly so lieu/ENA.rda")
shapiro.test(ENA$TE)
##
## Shapiro-Wilk normality test
##
## data: ENA$TE
## W = 0.9658, p-value = 0.000686
p1 <- ggplot(data=ENA, aes(x=Ecosystem.type, y=TE, colour=Ecosystem.type))+
geom_boxplot()
p1
#system.time(data <- read.xlsx("data for analysis.xlsx",1))
load("data for analysis.rda")
df <- data
#str(df)
df_cor <- df[,which(colnames(df) %in% c("SPPR", "TL", "TE", "PB", "QB", "PQ","NPP"))]
pairs(df_cor, lower.panel=panel.smooth,upper.panel = panel.cor)
log10(SPPR) is possitvely correlated with Trophic level(TL) and negatively correlated with growth efficiency (PQ), but at lower level. Therefore we can test the regression models with predictors are TL, PQ and Ecosystem.type
model <- lm(data=df, SPPR~TL)
summary(model)
##
## Call:
## lm(formula = SPPR ~ TL, data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.6890 -0.4339 -0.0769 0.3502 4.2363
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.34010 0.05403 -24.80 <2e-16 ***
## TL 1.15510 0.01644 70.27 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.6813 on 3304 degrees of freedom
## Multiple R-squared: 0.5991, Adjusted R-squared: 0.599
## F-statistic: 4938 on 1 and 3304 DF, p-value: < 2.2e-16
Cross-validation
cross.val <- shrinkage(model, 1, nrow(df))
## Original R-square = 0.5991309
## 3306 Fold Cross-Validated R-square = 0.5986087
## Change = 0.0005221947
cross.val
## NULL
Model diagnostic
autoplot(model, data = df,colour = "TYPE", label.size = 3)
# test model for fish
df_fish <- subset(df, TYPE =="F")
idx <- which(df_fish$landing!=0 | df_fish$discard!=0) # get only harvested species
df_catch <- df_fish[idx,]
df_fish_cor <- df_fish[,which(colnames(df_fish) %in% c("SPPR", "TL", "TE", "PB", "QB", "PQ","NPP"))]
pairs(df_fish_cor, lower.panel=panel.smooth,upper.panel = panel.cor)
log10(SPPR) is possitvely correlated with Trophic level(TL) and negatively correlated with growth efficiency (PQ), but at lower level. Therefore we can test the regression models with predictors are TL, PQ and Ecosystem.type
## model for all fish species
model1 <- lm(data=df_fish, SPPR~TL)
summary(model1)
##
## Call:
## lm(formula = SPPR ~ TL, data = df_fish)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.6700 -0.4822 -0.0499 0.3540 4.1623
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.98295 0.09239 -10.64 <2e-16 ***
## TL 1.06508 0.02599 40.97 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.6984 on 2259 degrees of freedom
## Multiple R-squared: 0.4263, Adjusted R-squared: 0.4261
## F-statistic: 1679 on 1 and 2259 DF, p-value: < 2.2e-16
Cross-validation
cross.val <- shrinkage(model, 1, nrow(df_fish))
## Original R-square = 0.5991309
## 2261 Fold Cross-Validated R-square = 0.5985687
## Change = 0.000562265
Model diagnostic
autoplot(model1, data = df_fish,
colour = 'Ecosystem.type', label.size = 1)
df_s1 <- data.frame(fitted.values = model1$fitted.values, residuals=model1$residuals, observed.values=df_fish$SPPR, standardized.residuals=model1$residuals/sd(model1$residuals), Ecosystem.type=df_fish$Ecosystem.type)
p1 <- ggplot(data=df_s1, aes(y=residuals, x=fitted.values))+
geom_point(aes(colour=Ecosystem.type))+
geom_hline(yintercept=0, colour="red")+
scale_y_continuous(limits = c(-2, 2), breaks=c(-1, -0.5, -0.25, 0, 0.25, 0.5,1))
p2 <- ggplot(data=df_s1, aes(x=fitted.values, y=observed.values))+
geom_point(aes(colour=Ecosystem.type))+
geom_abline(slope=1, intercept=0, colour="red")
p3 <- ggplot(data=df_s1, aes(x=fitted.values, y=standardized.residuals))+
geom_point(aes(colour=Ecosystem.type))+
geom_hline(yintercept = 0, colour="red")
vplayout <- function(x, y)
viewport(layout.pos.row = x, layout.pos.col = y)
pushViewport(viewport(layout=grid.layout(3,1)))
print(p1+theme(plot.margin=unit(c(1,1,1,1), "mm")), vp=vplayout(1,1))
## Warning: Removed 34 rows containing missing values (geom_point).
print(p2+theme(plot.margin=unit(c(1,1,1,1), "mm")), vp=vplayout(2,1))
print(p3+theme(plot.margin=unit(c(1,1,1,1), "mm")), vp=vplayout(3,1))
df_catch_cor <- df_catch[,which(colnames(df_catch) %in% c("SPPR", "TL", "TE", "PB", "QB", "PQ","NPP"))]
pairs(df_catch_cor, lower.panel=panel.smooth,upper.panel = panel.cor)
## model for harvested fish species
model2 <- lm(data=df_catch, SPPR~TL)
summary(model2)
##
## Call:
## lm(formula = SPPR ~ TL, data = df_catch)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.6686 -0.4578 -0.0394 0.3555 3.9703
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.97144 0.10301 -9.43 <2e-16 ***
## TL 1.06199 0.02859 37.14 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.6662 on 1736 degrees of freedom
## Multiple R-squared: 0.4428, Adjusted R-squared: 0.4424
## F-statistic: 1379 on 1 and 1736 DF, p-value: < 2.2e-16
autoplot(model1, label.size = 3)
df_s2 <- data.frame(fitted.values = model2$fitted.values, residuals=model2$residuals, observed.values=df_catch$SPPR, standardized.residuals=model2$residuals/sd(model2$residuals), Ecosystem.type=df_catch$Ecosystem.type)
p4 <- ggplot(data=df_s2, aes(y=residuals, x=fitted.values))+
geom_point(aes(colour=Ecosystem.type))+
geom_hline(yintercept=0, colour="red")+
scale_y_continuous(limits = c(-2, 2), breaks=c(-1, -0.5, -0.25, 0, 0.25, 0.5,1))
p5 <- ggplot(data=df_s2, aes(x=fitted.values, y=observed.values))+
geom_point(aes(colour=Ecosystem.type))+
geom_abline(slope=1, intercept=0, colour="red")
p6 <- ggplot(data=df_s2, aes(x=fitted.values, y=standardized.residuals))+
geom_point(aes(colour=Ecosystem.type))+
geom_hline(yintercept = 0, colour="red")
vplayout <- function(x, y)
viewport(layout.pos.row = x, layout.pos.col = y)
pushViewport(viewport(layout=grid.layout(3,1)))
print(p4+theme(plot.margin=unit(c(1,1,1,1), "mm")), vp=vplayout(1,1))
## Warning: Removed 21 rows containing missing values (geom_point).
print(p5+theme(plot.margin=unit(c(1,1,1,1), "mm")), vp=vplayout(2,1))
print(p6+theme(plot.margin=unit(c(1,1,1,1), "mm")), vp=vplayout(3,1))
df_id <- df_catch
df_fit = df_id %>% group_by(Ecosystem.type) %>%
do(fit2 = lm(SPPR ~ TL , data = .))
df_fitCoef = tidy(df_fit, fit2)
datatable(df_fitCoef)
model3 <- lm(data=df_catch, SPPR~TL+PQ)
summary(model3)
##
## Call:
## lm(formula = SPPR ~ TL + PQ, data = df_catch)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.8304 -0.4259 -0.0800 0.3262 3.7512
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.67718 0.09982 -6.784 1.6e-11 ***
## TL 1.08053 0.02713 39.834 < 2e-16 ***
## PQ -2.08328 0.14783 -14.092 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.6312 on 1735 degrees of freedom
## Multiple R-squared: 0.5, Adjusted R-squared: 0.4994
## F-statistic: 867.5 on 2 and 1735 DF, p-value: < 2.2e-16
autoplot(model1, label.size = 3)
df_s3 <- data.frame(fitted.values = model3$fitted.values, residuals=model3$residuals, observed.values=df_catch$SPPR, standardized.residuals=model3$residuals/sd(model3$residuals), Ecosystem.type=df_catch$Ecosystem.type)
p7 <- ggplot(data=df_s3, aes(y=residuals, x=fitted.values))+
geom_point(aes(colour=Ecosystem.type))+
geom_hline(yintercept=0, colour="red")+
scale_y_continuous(limits = c(-2, 2), breaks=c(-1, -0.5, -0.25, 0, 0.25, 0.5,1))
p8 <- ggplot(data=df_s3, aes(x=fitted.values, y=observed.values))+
geom_point(aes(colour=Ecosystem.type))+
geom_abline(slope=1, intercept=0, colour="red")
p9 <- ggplot(data=df_s3, aes(x=fitted.values, y=standardized.residuals))+
geom_point(aes(colour=Ecosystem.type))+
geom_hline(yintercept = 0, colour="red")
vplayout <- function(x, y)
viewport(layout.pos.row = x, layout.pos.col = y)
pushViewport(viewport(layout=grid.layout(3,1)))
print(p7+theme(plot.margin=unit(c(1,1,1,1), "mm")), vp=vplayout(1,1))
## Warning: Removed 20 rows containing missing values (geom_point).
print(p8+theme(plot.margin=unit(c(1,1,1,1), "mm")), vp=vplayout(2,1))
print(p9+theme(plot.margin=unit(c(1,1,1,1), "mm")), vp=vplayout(3,1))
df_id <- df_catch
df_fit = df_id %>% group_by(Ecosystem.type) %>%
do(fit2 = lm(SPPR ~ TL+PQ , data = .))
df_fitCoef = tidy(df_fit, fit2)
datatable(df_fitCoef)
model4 <- lm(data=df_catch, SPPR~TL*Ecosystem.type)
k<-summary(model4)
k
##
## Call:
## lm(formula = SPPR ~ TL * Ecosystem.type, data = df_catch)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.3449 -0.4522 -0.0312 0.3386 3.6698
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.1951 0.3745 -0.521 0.6025
## TL 0.7988 0.1083 7.374 2.55e-13
## Ecosystem.typeChanel/Strait -0.9459 0.6078 -1.556 0.1198
## Ecosystem.typeCoastal lagoon -0.1616 0.7600 -0.213 0.8316
## Ecosystem.typeContinental shelf -0.7964 0.4008 -1.987 0.0471
## Ecosystem.typeCoral reef -0.8602 0.9105 -0.945 0.3449
## Ecosystem.typeEstuary -0.7731 1.0417 -0.742 0.4581
## Ecosystem.typeOpen ocean -1.8056 0.4599 -3.926 8.98e-05
## Ecosystem.typeUpwelling -0.3009 0.4610 -0.653 0.5140
## TL:Ecosystem.typeChanel/Strait 0.2597 0.1727 1.504 0.1328
## TL:Ecosystem.typeCoastal lagoon 0.2133 0.2389 0.893 0.3721
## TL:Ecosystem.typeContinental shelf 0.2655 0.1153 2.303 0.0214
## TL:Ecosystem.typeCoral reef 0.4639 0.2704 1.716 0.0864
## TL:Ecosystem.typeEstuary 0.2028 0.3021 0.671 0.5022
## TL:Ecosystem.typeOpen ocean 0.5550 0.1292 4.297 1.83e-05
## TL:Ecosystem.typeUpwelling 0.1377 0.1326 1.038 0.2995
##
## (Intercept)
## TL ***
## Ecosystem.typeChanel/Strait
## Ecosystem.typeCoastal lagoon
## Ecosystem.typeContinental shelf *
## Ecosystem.typeCoral reef
## Ecosystem.typeEstuary
## Ecosystem.typeOpen ocean ***
## Ecosystem.typeUpwelling
## TL:Ecosystem.typeChanel/Strait
## TL:Ecosystem.typeCoastal lagoon
## TL:Ecosystem.typeContinental shelf *
## TL:Ecosystem.typeCoral reef .
## TL:Ecosystem.typeEstuary
## TL:Ecosystem.typeOpen ocean ***
## TL:Ecosystem.typeUpwelling
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.653 on 1722 degrees of freedom
## Multiple R-squared: 0.4689, Adjusted R-squared: 0.4643
## F-statistic: 101.3 on 15 and 1722 DF, p-value: < 2.2e-16
m<-data.frame(k$coefficients)
datatable(round(m, digits=4))