setwd("/Users/israel/Desktop/Bjork")
tree<- read.csv ("bjork_trees.csv")
library (tidyverse)
library (MASS)
library (plotly)
library (corrgram)
str (tree)
## 'data.frame': 632 obs. of 7 variables:
## $ Plot : Factor w/ 26 levels "24","25","26",..: 1 3 3 4 4 4 4 6 6 6 ...
## $ Tag : int 552 593 596 651 665 624 664 778 724 748 ...
## $ Common.Name : Factor w/ 19 levels "American Beech",..: 14 8 8 14 14 8 8 8 8 8 ...
## $ DBH..cm. : num 9.4 29.2 17 58.2 23 19.2 18.7 34.9 23.6 22.5 ...
## $ Biomass..kg.: num 49 379 106 2530 253 ...
## $ Abundance : int 66 13 13 40 40 40 40 65 65 65 ...
## $ Tree_Type : Factor w/ 2 levels "Deciduous","Evergreen": 1 1 1 1 1 1 1 1 1 1 ...
head (tree)
## Plot Tag Common.Name DBH..cm. Biomass..kg. Abundance Tree_Type
## 1 24 552 Sugar Maple 9.4 48.98005 66 Deciduous
## 2 26 593 Paper Birch 29.2 379.20797 13 Deciduous
## 3 26 596 Paper Birch 17.0 105.78726 13 Deciduous
## 4 27 651 Sugar Maple 58.2 2530.40060 40 Deciduous
## 5 27 665 Sugar Maple 23.0 253.13136 40 Deciduous
## 6 27 624 Paper Birch 19.2 140.98238 40 Deciduous
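# A quick sketch (not in the original analysis): see how the observations break
# down by type and species, using dplyr's count() (loaded with the tidyverse).
count (tree, Tree_Type)
count (tree, Common.Name, sort=TRUE)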
# Plot the raw data and see whether a general trend exists.
ggplotly(ggplot (tree, aes(x=DBH..cm., y=Biomass..kg.)) + geom_point() +
stat_smooth() + theme_bw() + ylab("Biomass (Dry Kg.)") +
xlab("Diameter at Breast Height (DBH cm)"))
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
# This suggests a non-linear, perhaps quadratic, fit to the data. Why is this?
# Splitting the relationship by tree type seems to help a lot!
ggplotly(ggplot (tree, aes(x=DBH..cm., y=Biomass..kg., color=Tree_Type)) + geom_point() +
stat_smooth() + theme_bw() + ylab("Biomass (Dry Kg.)") +
xlab("Diameter at Breast Height (DBH cm)"))
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
# Splitting the relationship by common name also helps, but the graph gets messier.
ggplotly(ggplot (tree, aes(x=DBH..cm., y=Biomass..kg., color=Common.Name)) + geom_point() +
stat_smooth(method="lm", formula= y~x+ I(x^2)) + theme_bw())
## Warning in predict.lm(model, newdata = new_data_frame(list(x = xseq)),
## se.fit = se, : prediction from a rank-deficient fit may be misleading
## Warning in qt((1 - level)/2, df): NaNs produced
## Warning in predict.lm(model, newdata = new_data_frame(list(x = xseq)),
## se.fit = se, : prediction from a rank-deficient fit may be misleading
## Warning in qt((1 - level)/2, df): NaNs produced
## Warning in qt((1 - level)/2, df): NaNs produced
## Warning in predict.lm(model, newdata = new_data_frame(list(x = xseq)),
## se.fit = se, : prediction from a rank-deficient fit may be misleading
## Warning in qt((1 - level)/2, df): NaNs produced
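# The rank-deficient warnings likely arise because some species have too few
# trees to estimate a quadratic curve (fewer points than parameters). As a
# sketch (not part of the original analysis), we could keep only species with
# at least 10 observations before fitting per-species curves:
tree.common<- filter (group_by(tree, Common.Name), n() >= 10)
ggplotly(ggplot (tree.common, aes(x=DBH..cm., y=Biomass..kg., color=Common.Name)) +
geom_point() + stat_smooth(method="lm", formula= y~x+ I(x^2)) + theme_bw())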
# Square-root transform the biomass across the whole dataset
tree$Biomass.sqrt<- sqrt (tree$Biomass..kg.)
ggplotly(ggplot (tree, aes(x=DBH..cm., y=Biomass.sqrt)) + geom_point() +
stat_smooth() + theme_bw() + ylab("sqrt(Biomass) (Dry Kg.^0.5)") +
xlab("Diameter at Breast Height (DBH cm)"))
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
# Transformed data grouped by tree type
ggplotly(ggplot (tree, aes(x=DBH..cm., y=Biomass.sqrt, color=Tree_Type)) + geom_point() +
stat_smooth() + theme_bw() + ylab("sqrt(Biomass) (Dry Kg.^0.5)") +
xlab("Diameter at Breast Height (DBH cm)"))
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
# Looks like a solid transformation! We can formally check normality with a Shapiro-Wilk test.
shapiro.test(tree$Biomass..kg.) # Untransformed data
##
## Shapiro-Wilk normality test
##
## data: tree$Biomass..kg.
## W = 0.54462, p-value < 2.2e-16
shapiro.test(tree$Biomass.sqrt) # Transformed data
##
## Shapiro-Wilk normality test
##
## data: tree$Biomass.sqrt
## W = 0.85839, p-value < 2.2e-16
# Neither version of the variable is normally distributed (both p-values are far below 0.05), so the transformation does not fully fix normality.
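# As an extra check (a sketch, not pursued further here), Q-Q plots and a log
# transform give another view of the distribution:
qqnorm (tree$Biomass.sqrt); qqline (tree$Biomass.sqrt)
shapiro.test(log(tree$Biomass..kg.)) # log is another common transform for biomass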
# Quadratic model on the raw biomass
lm1<- lm (Biomass..kg.~ DBH..cm. + I(DBH..cm.^2), data=tree);summary (lm1)
##
## Call:
## lm(formula = Biomass..kg. ~ DBH..cm. + I(DBH..cm.^2), data = tree)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1010.48 -59.25 -23.80 64.21 1433.80
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 128.22025 34.80261 3.684 0.000249 ***
## DBH..cm. -12.73591 2.64403 -4.817 1.83e-06 ***
## I(DBH..cm.^2) 0.62513 0.04464 14.003 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 180.5 on 629 degrees of freedom
## Multiple R-squared: 0.6561, Adjusted R-squared: 0.655
## F-statistic: 600 on 2 and 629 DF, p-value: < 2.2e-16
# Linear model on the sqrt-transformed biomass
lm2<- lm (Biomass.sqrt~ DBH..cm., data=tree); summary (lm2)
##
## Call:
## lm(formula = Biomass.sqrt ~ DBH..cm., data = tree)
##
## Residuals:
## Min 1Q Median 3Q Max
## -10.5276 -3.0005 -0.6382 2.8216 19.2368
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.74590 0.38297 -1.948 0.0519 .
## DBH..cm. 0.60912 0.01575 38.673 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.936 on 630 degrees of freedom
## Multiple R-squared: 0.7036, Adjusted R-squared: 0.7031
## F-statistic: 1496 on 1 and 630 DF, p-value: < 2.2e-16
# The linear fit on the sqrt-transformed biomass explains more of the variation in y (adj. R^2 = 0.703) than the quadratic fit on raw biomass (adj. R^2 = 0.655), and both models are highly significant.
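# Note the two R-squared values are not strictly comparable because the models
# have different responses (raw vs. sqrt biomass). A sketch of a common-scale
# comparison: back-transform lm2's predictions and compute RMSE on raw biomass.
pred.quad<- predict (lm1) # raw-scale predictions from the quadratic model
pred.sqrt<- predict (lm2)^2 # back-transformed from the sqrt scale
sqrt(mean((tree$Biomass..kg. - pred.quad)^2)) # RMSE of the quadratic model
sqrt(mean((tree$Biomass..kg. - pred.sqrt)^2)) # RMSE of the back-transformed linear model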
# Let's consider the residuals of the two models.
ggplotly(ggplot(lm1) + geom_point(aes(x=.fitted, y=.resid)))
ggplotly(ggplot(lm2) + geom_point(aes(x=.fitted, y=.resid)))
# These residuals are not consistently scattered around zero, so the points likely follow a non-linear association. For this reason the quadratic model may be more appropriate.
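# Base R can draw the standard diagnostic panels in one call (a sketch):
# residuals vs. fitted, Q-Q, scale-location, and residuals vs. leverage.
par(mfrow=c(2,2)); plot (lm1); par(mfrow=c(1,1))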
ggplotly(ggplot (tree, aes(x=DBH..cm., y=Biomass..kg., color=Tree_Type)) + geom_point() +
stat_smooth(method="lm", formula= y~x+ I(x^2)) + theme_bw() + ylab("Biomass (Dry Kg.)") +
xlab("Diameter at Breast Height (DBH cm)"))
# Now we build two models: one for deciduous trees and one for evergreen trees.
dec.tree<- filter (tree, Tree_Type=="Deciduous")
dec.lm<- lm (Biomass..kg.~ DBH..cm. + I(DBH..cm.^2), data=dec.tree); summary (dec.lm)
##
## Call:
## lm(formula = Biomass..kg. ~ DBH..cm. + I(DBH..cm.^2), data = dec.tree)
##
## Residuals:
## Min 1Q Median 3Q Max
## -402.64 -27.35 4.24 20.97 422.98
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 117.3580 22.6753 5.176 4.58e-07 ***
## DBH..cm. -12.6886 1.6105 -7.879 9.38e-14 ***
## I(DBH..cm.^2) 0.8604 0.0253 34.005 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 68.22 on 257 degrees of freedom
## Multiple R-squared: 0.9736, Adjusted R-squared: 0.9734
## F-statistic: 4739 on 2 and 257 DF, p-value: < 2.2e-16
evg.tree<- filter (tree, Tree_Type=="Evergreen")
evg.lm<- lm (Biomass..kg.~ DBH..cm. + I(DBH..cm.^2), data=evg.tree); summary (evg.lm)
##
## Call:
## lm(formula = Biomass..kg. ~ DBH..cm. + I(DBH..cm.^2), data = evg.tree)
##
## Residuals:
## Min 1Q Median 3Q Max
## -123.474 -8.774 -7.634 7.646 244.398
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 12.380805 6.044033 2.048 0.0412 *
## DBH..cm. -1.082821 0.485214 -2.232 0.0262 *
## I(DBH..cm.^2) 0.220016 0.008611 25.551 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 24.78 on 369 degrees of freedom
## Multiple R-squared: 0.9558, Adjusted R-squared: 0.9556
## F-statistic: 3991 on 2 and 369 DF, p-value: < 2.2e-16
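# With the two models in hand we can predict biomass for a new tree of a given
# DBH (a sketch; 30 cm is just an illustrative value):
predict (dec.lm, newdata=data.frame(DBH..cm.=30), interval="prediction")
predict (evg.lm, newdata=data.frame(DBH..cm.=30), interval="prediction")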
# Dummy variables: the dummy_cols function in the fastDummies package
library (fastDummies)
?dummy_cols
# Let's make this as simple as possible: select only the variables we care about.
tree.sub<- dplyr::select(tree, c("DBH..cm.", "Biomass..kg.", "Tree_Type")); summary (tree.sub)
## DBH..cm. Biomass..kg. Tree_Type
## Min. : 5.00 Min. : 4.791 Deciduous:260
## 1st Qu.:14.50 1st Qu.: 63.339 Evergreen:372
## Median :20.10 Median : 124.536
## Mean :22.19 Mean : 215.193
## 3rd Qu.:27.52 3rd Qu.: 252.222
## Max. :66.40 Max. :3430.598
tree.dummy<- dummy_cols (tree.sub); head (tree.dummy)
## DBH..cm. Biomass..kg. Tree_Type Tree_Type_Deciduous Tree_Type_Evergreen
## 1 9.4 48.98005 Deciduous 1 0
## 2 29.2 379.20797 Deciduous 1 0
## 3 17.0 105.78726 Deciduous 1 0
## 4 58.2 2530.40060 Deciduous 1 0
## 5 23.0 253.13136 Deciduous 1 0
## 6 19.2 140.98238 Deciduous 1 0
# Now that tree type is coded as numeric dummy variables we can build it into a multiple linear regression!
corrgram (tree.dummy) # Notice that by creating both dummy columns we introduce perfect multicollinearity (they always sum to 1). Let's ignore this for now, but know that an alternative is to build a glm instead of an lm.
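# To see the collinearity concretely (a sketch): the two dummy columns always
# sum to 1, so each is fully determined by the other.
all(tree.dummy$Tree_Type_Deciduous + tree.dummy$Tree_Type_Evergreen == 1)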
# We can build in interactions here because the biomass-DBH relationship may differ between evergreen and deciduous trees.
multi.lm<- lm (Biomass..kg. ~ DBH..cm.: Tree_Type_Deciduous + DBH..cm.:Tree_Type_Evergreen, data=tree.dummy)
summary (multi.lm) # This is our global model
##
## Call:
## lm(formula = Biomass..kg. ~ DBH..cm.:Tree_Type_Deciduous + DBH..cm.:Tree_Type_Evergreen,
## data = tree.dummy)
##
## Residuals:
## Min 1Q Median 3Q Max
## -208.00 -70.09 -18.53 61.35 1752.70
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -283.1092 13.3968 -21.13 <2e-16 ***
## DBH..cm.:Tree_Type_Deciduous 29.8026 0.6034 49.39 <2e-16 ***
## DBH..cm.:Tree_Type_Evergreen 17.0220 0.5890 28.90 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 137.6 on 629 degrees of freedom
## Multiple R-squared: 0.8, Adjusted R-squared: 0.7993
## F-statistic: 1258 on 2 and 629 DF, p-value: < 2.2e-16
stepAIC (multi.lm)
## Start: AIC=6227.73
## Biomass..kg. ~ DBH..cm.:Tree_Type_Deciduous + DBH..cm.:Tree_Type_Evergreen
##
## Df Sum of Sq RSS AIC
## <none> 11916038 6227.7
## - DBH..cm.:Tree_Type_Evergreen 1 15824840 27740878 6759.8
## - DBH..cm.:Tree_Type_Deciduous 1 46218989 58135027 7227.4
##
## Call:
## lm(formula = Biomass..kg. ~ DBH..cm.:Tree_Type_Deciduous + DBH..cm.:Tree_Type_Evergreen,
## data = tree.dummy)
##
## Coefficients:
## (Intercept) DBH..cm.:Tree_Type_Deciduous
## -283.11 29.80
## DBH..cm.:Tree_Type_Evergreen
## 17.02
Also notice that our adjusted R-squared here, including the dummy variables, is only about 0.80, not as good as treating the two tree types separately.
We can check whether a simpler model exists. In this case that would mean either the evergreens or the deciduous trees are not contributing to the explained variance in y.
Because removing either interaction would increase our AIC, our global model is also our simplest and best-fitting model.
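# For reference (a sketch, not the approach used above): R can fit the same
# per-type slopes without manual dummies by interacting DBH with the factor.
multi.lm.factor<- lm (Biomass..kg. ~ DBH..cm.:Tree_Type, data=tree); summary (multi.lm.factor)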
ggplotly(ggplot (tree, aes(x=DBH..cm., y=Biomass..kg., color=Tree_Type)) + geom_point() +
stat_smooth(method="lm", formula= y~x+ I(x^2)) + theme_bw() + ylab("Biomass (Dry Kg.)") +
xlab("Diameter at Breast Height (DBH cm)"))
multi.lm.quad<- lm (Biomass..kg. ~ I(DBH..cm.^2): Tree_Type_Deciduous + I(DBH..cm.^2):Tree_Type_Evergreen, data=tree.dummy)
stepAIC (multi.lm.quad)
## Start: AIC=5080.46
## Biomass..kg. ~ I(DBH..cm.^2):Tree_Type_Deciduous + I(DBH..cm.^2):Tree_Type_Evergreen
##
## Df Sum of Sq RSS AIC
## <none> 1939817 5080.5
## - I(DBH..cm.^2):Tree_Type_Evergreen 1 7428059 9367876 6073.7
## - I(DBH..cm.^2):Tree_Type_Deciduous 1 57574978 59514795 7242.2
##
## Call:
## lm(formula = Biomass..kg. ~ I(DBH..cm.^2):Tree_Type_Deciduous +
## I(DBH..cm.^2):Tree_Type_Evergreen, data = tree.dummy)
##
## Coefficients:
## (Intercept) I(DBH..cm.^2):Tree_Type_Deciduous
## -22.8166 0.6425
## I(DBH..cm.^2):Tree_Type_Evergreen
## 0.2211
summary (multi.lm.quad)
##
## Call:
## lm(formula = Biomass..kg. ~ I(DBH..cm.^2):Tree_Type_Deciduous +
## I(DBH..cm.^2):Tree_Type_Evergreen, data = tree.dummy)
##
## Residuals:
## Min 1Q Median 3Q Max
## -192.09 -12.68 4.57 16.40 671.82
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -22.816600 3.132792 -7.283 9.79e-13
## I(DBH..cm.^2):Tree_Type_Deciduous 0.642455 0.004702 136.635 < 2e-16
## I(DBH..cm.^2):Tree_Type_Evergreen 0.221079 0.004505 49.078 < 2e-16
##
## (Intercept) ***
## I(DBH..cm.^2):Tree_Type_Deciduous ***
## I(DBH..cm.^2):Tree_Type_Evergreen ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 55.53 on 629 degrees of freedom
## Multiple R-squared: 0.9674, Adjusted R-squared: 0.9673
## F-statistic: 9343 on 2 and 629 DF, p-value: < 2.2e-16
The previous model only accounted for the linear relationship between the variables; here we have built in the quadratic term to explain more of the variation in y. Boo-yah!
We can again check whether a simpler model exists, meaning that either the evergreens or the deciduous trees might not be contributing to the explained variance in y.
Because removing either interaction would increase our AIC, this global model is also our simplest and best-fitting model.
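# Finally, a direct AIC comparison of the linear- and quadratic-interaction
# models (a sketch); the lower AIC of the quadratic model matches the stepAIC
# starting values above (6227.7 vs. 5080.5).
AIC (multi.lm, multi.lm.quad)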