Set working directory, load data and libraries

setwd("/Users/israel/Desktop/Bjork")
tree<- read.csv ("bjork_trees.csv")
library (tidyverse)
library (MASS)
library (plotly)
library (corrgram)
str (tree)
## 'data.frame':    632 obs. of  7 variables:
##  $ Plot        : Factor w/ 26 levels "24","25","26",..: 1 3 3 4 4 4 4 6 6 6 ...
##  $ Tag         : int  552 593 596 651 665 624 664 778 724 748 ...
##  $ Common.Name : Factor w/ 19 levels "American Beech",..: 14 8 8 14 14 8 8 8 8 8 ...
##  $ DBH..cm.    : num  9.4 29.2 17 58.2 23 19.2 18.7 34.9 23.6 22.5 ...
##  $ Biomass..kg.: num  49 379 106 2530 253 ...
##  $ Abundance   : int  66 13 13 40 40 40 40 65 65 65 ...
##  $ Tree_Type   : Factor w/ 2 levels "Deciduous","Evergreen": 1 1 1 1 1 1 1 1 1 1 ...
head (tree)
##   Plot Tag Common.Name DBH..cm. Biomass..kg. Abundance Tree_Type
## 1   24 552 Sugar Maple      9.4     48.98005        66 Deciduous
## 2   26 593 Paper Birch     29.2    379.20797        13 Deciduous
## 3   26 596 Paper Birch     17.0    105.78726        13 Deciduous
## 4   27 651 Sugar Maple     58.2   2530.40060        40 Deciduous
## 5   27 665 Sugar Maple     23.0    253.13136        40 Deciduous
## 6   27 624 Paper Birch     19.2    140.98238        40 Deciduous

Working with plotly to make sexy graphs

Pet-Butt Method:

Plot it

  • Plot the raw data and see if a general trend exists.
ggplotly(ggplot (tree, aes(x=DBH..cm., y=Biomass..kg.)) + geom_point() + 
  stat_smooth() + theme_bw() + ylab("Biomass (Dry Kg.)") +
  xlab("Diameter at Breast Height (DBH cm)"))
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
# This suggests a non-linear relationship in the data, maybe a quadratic fit? Why is this? 

Explore it:

  • Can we add additional information to this model to improve the association?
# Sorting the relationships by tree type seems to help a lot! 
ggplotly(ggplot (tree, aes(x=DBH..cm., y=Biomass..kg., color=Tree_Type)) + geom_point() + 
  stat_smooth() + theme_bw() + ylab("Biomass (Dry Kg.)") +
  xlab("Diameter at Breast Height (DBH cm)")) 
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
# Sorting the relationships by common name also seems to help, but the graph gets messier. 
ggplotly(ggplot (tree, aes(x=DBH..cm., y=Biomass..kg., color=Common.Name)) + geom_point() + 
  stat_smooth(method="lm", formula= y~x+ I(x^2)) + theme_bw())
## Warning in predict.lm(model, newdata = new_data_frame(list(x = xseq)),
## se.fit = se, : prediction from a rank-deficient fit may be misleading
## Warning in qt((1 - level)/2, df): NaNs produced
## Warning in predict.lm(model, newdata = new_data_frame(list(x = xseq)),
## se.fit = se, : prediction from a rank-deficient fit may be misleading
## Warning in qt((1 - level)/2, df): NaNs produced

## Warning in qt((1 - level)/2, df): NaNs produced
## Warning in predict.lm(model, newdata = new_data_frame(list(x = xseq)),
## se.fit = se, : prediction from a rank-deficient fit may be misleading
## Warning in qt((1 - level)/2, df): NaNs produced

Transform it:

  • Should we attempt to linearize this model? If the relationship appears to be quadratic, then a square-root transformation of the response is a good place to start: if Biomass ≈ b·DBH², then sqrt(Biomass) ≈ sqrt(b)·DBH, which is linear in DBH.
# All the data transformed
tree$Biomass.sqrt = sqrt (tree$Biomass..kg.)
ggplotly(ggplot (tree, aes(x=DBH..cm., y=Biomass.sqrt)) + geom_point() + 
  stat_smooth() + theme_bw() + ylab("sqrt(Biomass) (Dry Kg.)") +
  xlab("Diameter at Breast Height (DBH cm)")) 
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
# Transformed data sorted by tree type 
ggplotly(ggplot (tree, aes(x=DBH..cm., y=Biomass.sqrt, color=Tree_Type)) + geom_point() + 
  stat_smooth() + theme_bw() + ylab("sqrt(Biomass) (Dry Kg.)") +
  xlab("Diameter at Breast Height (DBH cm)")) 
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
# Looks like a solid transformation! We can formally check normality with a Shapiro-Wilk test.
shapiro.test(tree$Biomass..kg.) # Untransformed data
## 
##  Shapiro-Wilk normality test
## 
## data:  tree$Biomass..kg.
## W = 0.54462, p-value < 2.2e-16
shapiro.test(tree$Biomass.sqrt) # Transformed data 
## 
##  Shapiro-Wilk normality test
## 
## data:  tree$Biomass.sqrt
## W = 0.85839, p-value < 2.2e-16
# Neither version of the variable is normally distributed (both p-values are well below 0.05), although the square-root transformation does improve the W statistic.
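For a visual complement to the Shapiro-Wilk tests, here is a quick Q-Q plot sketch (not part of the original analysis; it just reuses the tree data frame and requires a reasonably recent ggplot2 for stat_qq_line()):

# Q-Q plots of raw and sqrt-transformed biomass; points hugging the reference
# line indicate approximate normality (illustrative sketch only).
ggplot(tree, aes(sample = Biomass..kg.)) +
  stat_qq() + stat_qq_line() + theme_bw() +
  ggtitle("Q-Q plot: raw biomass")
ggplot(tree, aes(sample = Biomass.sqrt)) +
  stat_qq() + stat_qq_line() + theme_bw() +
  ggtitle("Q-Q plot: sqrt-transformed biomass")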

Build the model

# Quadratic model on the raw biomass
lm1<- lm (Biomass..kg.~ DBH..cm. + I(DBH..cm.^2), data=tree);summary (lm1)
## 
## Call:
## lm(formula = Biomass..kg. ~ DBH..cm. + I(DBH..cm.^2), data = tree)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1010.48   -59.25   -23.80    64.21  1433.80 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   128.22025   34.80261   3.684 0.000249 ***
## DBH..cm.      -12.73591    2.64403  -4.817 1.83e-06 ***
## I(DBH..cm.^2)   0.62513    0.04464  14.003  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 180.5 on 629 degrees of freedom
## Multiple R-squared:  0.6561, Adjusted R-squared:  0.655 
## F-statistic:   600 on 2 and 629 DF,  p-value: < 2.2e-16
# Linear model on the square-root-transformed biomass 
lm2<- lm (Biomass.sqrt~ DBH..cm., data=tree); summary (lm2)
## 
## Call:
## lm(formula = Biomass.sqrt ~ DBH..cm., data = tree)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -10.5276  -3.0005  -0.6382   2.8216  19.2368 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -0.74590    0.38297  -1.948   0.0519 .  
## DBH..cm.     0.60912    0.01575  38.673   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.936 on 630 degrees of freedom
## Multiple R-squared:  0.7036, Adjusted R-squared:  0.7031 
## F-statistic:  1496 on 1 and 630 DF,  p-value: < 2.2e-16
# The linear fit on the transformed response appears to explain more of the variation in y than the quadratic fit on the raw data (adjusted R^2 of 0.703 vs. 0.655), but both models are highly significant. 

# Let's consider the residuals of the two models 
ggplotly(ggplot(lm1) + geom_point(aes(x=.fitted, y=.resid)))
ggplotly(ggplot(lm2) + geom_point(aes(x=.fitted, y=.resid)))
# These residual plots suggest a remaining non-linear pattern: the points are not consistently scattered around zero. For this reason the quadratic model may be more appropriate. 
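To make any remaining curvature easier to see, here is a small sketch (using the same fortify-style ggplot call on the model object as above) that adds a zero reference line and a smoother to the residual plot for lm2:

ggplotly(ggplot(lm2, aes(x=.fitted, y=.resid)) + geom_point() +
  geom_hline(yintercept = 0, linetype = "dashed") + # residuals should scatter evenly around this line
  stat_smooth(se = FALSE) + theme_bw() +
  xlab("Fitted values") + ylab("Residuals"))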

Learn more about residuals here

Understand the model

  • For the untransformed data, 65.5% of the variation in biomass can be explained by tree DBH (lm1).
  • For the transformed data, 70.3% of the variation in sqrt(biomass) can be explained by tree DBH (lm2).
  • Both models are significant, so statistically either solution is appropriate. The trick is all in how you tell the story! (The sketch just below pulls these R^2 values straight from the fitted models.)
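As a quick sketch, the adjusted R^2 values quoted above can be extracted directly from the fitted model objects rather than read off the printed summaries:

summary (lm1)$adj.r.squared # ~0.655 for the quadratic model on raw biomass
summary (lm2)$adj.r.squared # ~0.703 for the linear model on sqrt-transformed biomass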

Translate it to Biological Significance

  • We support the HA that a non-zero relationship exists between DBH and biomass.
  • The quadratic model may be more reflective of how a tree gains biomass in nature.
  • Think about how a tree grows! The trend may be linear-ish in the early years, but as older trees get bigger they have more surface area over which to add wood (and carbon), so biomass accelerates with diameter. This reflects a naturally accelerating, faster-than-linear growth function.
  • Remember that the models likely improve if we take additional considerations into account. For example, whether the tree is evergreen or deciduous.

Further exploration of the dataset

Subsetting the analysis by tree type yields more explanatory power!

  • Check out those sexy R^2 values! Much improved models.
ggplotly(ggplot (tree, aes(x=DBH..cm., y=Biomass..kg., color=Tree_Type)) + geom_point() + 
  stat_smooth(method="lm", formula= y~x+ I(x^2)) + theme_bw() + ylab("Biomass (Dry Kg.)") +
  xlab("Diameter at Breast Height (DBH cm)")) 
# Now we build two models: one for deciduous trees and one for evergreen trees
dec.tree<- filter (tree, Tree_Type=="Deciduous")
dec.lm<- lm (Biomass..kg.~ DBH..cm. + I(DBH..cm.^2), data=dec.tree); summary (dec.lm)
## 
## Call:
## lm(formula = Biomass..kg. ~ DBH..cm. + I(DBH..cm.^2), data = dec.tree)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -402.64  -27.35    4.24   20.97  422.98 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   117.3580    22.6753   5.176 4.58e-07 ***
## DBH..cm.      -12.6886     1.6105  -7.879 9.38e-14 ***
## I(DBH..cm.^2)   0.8604     0.0253  34.005  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 68.22 on 257 degrees of freedom
## Multiple R-squared:  0.9736, Adjusted R-squared:  0.9734 
## F-statistic:  4739 on 2 and 257 DF,  p-value: < 2.2e-16
evg.tree<- filter (tree, Tree_Type=="Evergreen")
evg.lm<- lm (Biomass..kg.~ DBH..cm. + I(DBH..cm.^2), data=evg.tree); summary (evg.lm) 
## 
## Call:
## lm(formula = Biomass..kg. ~ DBH..cm. + I(DBH..cm.^2), data = evg.tree)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -123.474   -8.774   -7.634    7.646  244.398 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   12.380805   6.044033   2.048   0.0412 *  
## DBH..cm.      -1.082821   0.485214  -2.232   0.0262 *  
## I(DBH..cm.^2)  0.220016   0.008611  25.551   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 24.78 on 369 degrees of freedom
## Multiple R-squared:  0.9558, Adjusted R-squared:  0.9556 
## F-statistic:  3991 on 2 and 369 DF,  p-value: < 2.2e-16

Creating dummy variables for multiple linear regression

library (fastDummies)
?dummy_cols

# Let's make this as simple as possible: select only the variables we care about 
tree.sub<- dplyr::select(tree, c("DBH..cm.", "Biomass..kg.", "Tree_Type")); summary (tree.sub) 
##     DBH..cm.      Biomass..kg.          Tree_Type  
##  Min.   : 5.00   Min.   :   4.791   Deciduous:260  
##  1st Qu.:14.50   1st Qu.:  63.339   Evergreen:372  
##  Median :20.10   Median : 124.536                  
##  Mean   :22.19   Mean   : 215.193                  
##  3rd Qu.:27.52   3rd Qu.: 252.222                  
##  Max.   :66.40   Max.   :3430.598
tree.dummy<- dummy_cols (tree.sub); head (tree.dummy)
##   DBH..cm. Biomass..kg. Tree_Type Tree_Type_Deciduous Tree_Type_Evergreen
## 1      9.4     48.98005 Deciduous                   1                   0
## 2     29.2    379.20797 Deciduous                   1                   0
## 3     17.0    105.78726 Deciduous                   1                   0
## 4     58.2   2530.40060 Deciduous                   1                   0
## 5     23.0    253.13136 Deciduous                   1                   0
## 6     19.2    140.98238 Deciduous                   1                   0
# Now that tree type is encoded as numeric dummy variables, we can build it into a multiple linear regression! 
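For reference, here is a minimal sketch of the same encoding done by hand (assuming the tree.sub data frame built above); this is essentially what dummy_cols() produces for a two-level factor:

tree.manual<- tree.sub
tree.manual$Tree_Type_Deciduous<- as.integer(tree.manual$Tree_Type == "Deciduous") # 1 if deciduous, else 0
tree.manual$Tree_Type_Evergreen<- as.integer(tree.manual$Tree_Type == "Evergreen") # 1 if evergreen, else 0
head (tree.manual) # should match the dummy_cols() output above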

Multiple linear regression example

corrgram (tree.dummy) # Notice that by creating the dummy columns we introduce multicollinearity (the no-multicollinearity assumption is violated). Let's ignore this for now, but know that an alternative is to build a glm instead of an lm. 
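A quick numeric check of that collinearity (a sketch using the dummy columns created above): with only two mutually exclusive tree types, the two dummy columns are perfect mirror images of each other.

cor (tree.dummy$Tree_Type_Deciduous, tree.dummy$Tree_Type_Evergreen) # exactly -1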

# We build in interaction terms here because the DBH-biomass relationship may differ between evergreen and deciduous trees (i.e., each tree type gets its own slope). 
multi.lm<- lm (Biomass..kg. ~ DBH..cm.: Tree_Type_Deciduous + DBH..cm.:Tree_Type_Evergreen, data=tree.dummy)
summary (multi.lm) # This is our global model
## 
## Call:
## lm(formula = Biomass..kg. ~ DBH..cm.:Tree_Type_Deciduous + DBH..cm.:Tree_Type_Evergreen, 
##     data = tree.dummy)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -208.00  -70.09  -18.53   61.35 1752.70 
## 
## Coefficients:
##                               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                  -283.1092    13.3968  -21.13   <2e-16 ***
## DBH..cm.:Tree_Type_Deciduous   29.8026     0.6034   49.39   <2e-16 ***
## DBH..cm.:Tree_Type_Evergreen   17.0220     0.5890   28.90   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 137.6 on 629 degrees of freedom
## Multiple R-squared:    0.8,  Adjusted R-squared:  0.7993 
## F-statistic:  1258 on 2 and 629 DF,  p-value: < 2.2e-16
stepAIC (multi.lm)
## Start:  AIC=6227.73
## Biomass..kg. ~ DBH..cm.:Tree_Type_Deciduous + DBH..cm.:Tree_Type_Evergreen
## 
##                                Df Sum of Sq      RSS    AIC
## <none>                                      11916038 6227.7
## - DBH..cm.:Tree_Type_Evergreen  1  15824840 27740878 6759.8
## - DBH..cm.:Tree_Type_Deciduous  1  46218989 58135027 7227.4
## 
## Call:
## lm(formula = Biomass..kg. ~ DBH..cm.:Tree_Type_Deciduous + DBH..cm.:Tree_Type_Evergreen, 
##     data = tree.dummy)
## 
## Coefficients:
##                  (Intercept)  DBH..cm.:Tree_Type_Deciduous  
##                      -283.11                         29.80  
## DBH..cm.:Tree_Type_Evergreen  
##                        17.02
  • Also notice that our adjusted R^2 here, including the dummy variables, is only 0.79, not as good as treating the two tree types as separate models.

  • We can check if a simpler model exists. In this case that would mean that either the evergreen or the deciduous term might not be contributing to the explained variance in y.

  • Because removing either interaction term would increase our AIC, the global model is also our simplest and best-fitting model (see the sketch just below for the same comparison done by hand with AIC()).
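Here is a minimal sketch of the comparison stepAIC() is making, fitting each reduced model by hand and comparing with AIC(). The reduced-model names are mine; note that AIC() and stepAIC()'s extractAIC() report values on different scales, but the ranking of models is the same.

# Drop one interaction at a time and compare to the global model; lower AIC is better.
reduced.dec<- lm (Biomass..kg. ~ DBH..cm.:Tree_Type_Deciduous, data=tree.dummy)
reduced.evg<- lm (Biomass..kg. ~ DBH..cm.:Tree_Type_Evergreen, data=tree.dummy)
AIC (multi.lm, reduced.dec, reduced.evg) # the global model should come out lowest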

Combining curvilinear relationships with multiple regressions

ggplotly(ggplot (tree, aes(x=DBH..cm., y=Biomass..kg., color=Tree_Type)) + geom_point() + 
  stat_smooth(method="lm", formula= y~x+ I(x^2)) + theme_bw() + ylab("Biomass (Dry Kg.)") +
  xlab("Diameter at Breast Height (DBH cm)")) 
multi.lm.quad<- lm (Biomass..kg. ~  I(DBH..cm.^2): Tree_Type_Deciduous +  I(DBH..cm.^2):Tree_Type_Evergreen, data=tree.dummy)
stepAIC (multi.lm.quad)
## Start:  AIC=5080.46
## Biomass..kg. ~ I(DBH..cm.^2):Tree_Type_Deciduous + I(DBH..cm.^2):Tree_Type_Evergreen
## 
##                                     Df Sum of Sq      RSS    AIC
## <none>                                            1939817 5080.5
## - I(DBH..cm.^2):Tree_Type_Evergreen  1   7428059  9367876 6073.7
## - I(DBH..cm.^2):Tree_Type_Deciduous  1  57574978 59514795 7242.2
## 
## Call:
## lm(formula = Biomass..kg. ~ I(DBH..cm.^2):Tree_Type_Deciduous + 
##     I(DBH..cm.^2):Tree_Type_Evergreen, data = tree.dummy)
## 
## Coefficients:
##                       (Intercept)  I(DBH..cm.^2):Tree_Type_Deciduous  
##                          -22.8166                             0.6425  
## I(DBH..cm.^2):Tree_Type_Evergreen  
##                            0.2211
summary (multi.lm.quad)
## 
## Call:
## lm(formula = Biomass..kg. ~ I(DBH..cm.^2):Tree_Type_Deciduous + 
##     I(DBH..cm.^2):Tree_Type_Evergreen, data = tree.dummy)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -192.09  -12.68    4.57   16.40  671.82 
## 
## Coefficients:
##                                     Estimate Std. Error t value Pr(>|t|)
## (Intercept)                       -22.816600   3.132792  -7.283 9.79e-13
## I(DBH..cm.^2):Tree_Type_Deciduous   0.642455   0.004702 136.635  < 2e-16
## I(DBH..cm.^2):Tree_Type_Evergreen   0.221079   0.004505  49.078  < 2e-16
##                                      
## (Intercept)                       ***
## I(DBH..cm.^2):Tree_Type_Deciduous ***
## I(DBH..cm.^2):Tree_Type_Evergreen ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 55.53 on 629 degrees of freedom
## Multiple R-squared:  0.9674, Adjusted R-squared:  0.9673 
## F-statistic:  9343 on 2 and 629 DF,  p-value: < 2.2e-16
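To close the loop, a hypothetical prediction sketch with the final quadratic model (the 30 cm DBH value is chosen purely for illustration): plugging the printed coefficients in by hand gives roughly 555 kg of dry biomass for a deciduous tree and 176 kg for an evergreen of the same diameter.

# Predicted dry biomass (kg) for a 30 cm DBH tree of each type (illustrative only).
new.trees<- data.frame(DBH..cm. = c(30, 30),
                       Tree_Type_Deciduous = c(1, 0),
                       Tree_Type_Evergreen = c(0, 1))
predict (multi.lm.quad, newdata = new.trees) # approx. 555 kg (deciduous), 176 kg (evergreen)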