# Print the built-in `trees` data set (columns: Girth, Height, Volume).
datasets::trees
## Girth Height Volume
## 1 8.3 70 10.3
## 2 8.6 65 10.3
## 3 8.8 63 10.2
## 4 10.5 72 16.4
## 5 10.7 81 18.8
## 6 10.8 83 19.7
## 7 11.0 66 15.6
## 8 11.0 75 18.2
## 9 11.1 80 22.6
## 10 11.2 75 19.9
## 11 11.3 79 24.2
## 12 11.4 76 21.0
## 13 11.4 76 21.4
## 14 11.7 69 21.3
## 15 12.0 75 19.1
## 16 12.9 74 22.2
## 17 12.9 85 33.8
## 18 13.3 86 27.4
## 19 13.7 71 25.7
## 20 13.8 64 24.9
## 21 14.0 78 34.5
## 22 14.2 80 31.7
## 23 14.5 74 36.3
## 24 16.0 72 38.3
## 25 16.3 77 42.6
## 26 17.3 81 55.4
## 27 17.5 82 55.7
## 28 17.9 80 58.3
## 29 18.0 80 51.5
## 30 18.0 80 51.0
## 31 20.6 87 77.0
# Bind the built-in `trees` data set to a short working name.
df <- trees

# Scatter plot of the two variables used in the regression below.
plot(df$Height, df$Volume)

# Pairwise correlations between the measurements.
cor(df$Height, df$Girth)
## [1] 0.5192801
cor(df$Volume, df$Girth)
## [1] 0.9671194
# Correlation for the pair that is actually plotted and modelled
# (Volume vs Height); its square equals the R-squared of Volume ~ Height.
cor(df$Volume, df$Height)
## [1] 0.5982497
#Correlation is measured from -1 to 1 and shows how two variables are linearly related.
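#The correlation between Volume and Height is moderate, not strong, and positive (r is about 0.60, the square root of the R-squared of 0.3579 reported in the regression below). Of the correlations computed above, Height and Girth are also moderately correlated (0.52), while Volume and Girth are very strongly correlated (0.97).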
#Regression Analysis
# Simple linear regression of Volume on Height.
# Uses `<-` for assignment, consistent with the rest of the script.
tree_model <- lm(Volume ~ Height, data = df)
# Print the residual summary, coefficient table and R-squared for the fit.
summary(tree_model)
##
## Call:
## lm(formula = Volume ~ Height, data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -21.274 -9.894 -2.894 12.068 29.852
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -87.1236 29.2731 -2.976 0.005835 **
## Height 1.5433 0.3839 4.021 0.000378 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 13.4 on 29 degrees of freedom
## Multiple R-squared: 0.3579, Adjusted R-squared: 0.3358
## F-statistic: 16.16 on 1 and 29 DF, p-value: 0.0003784
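#Fitted model: Volume = -87.1236 + 1.5433 * Height
# - A 1-unit increase in Height is associated with an average increase of 1.5433 units in Volume.
# - When Height is zero, the predicted Volume is -87.1236; this is an extrapolation far outside the observed heights (63 to 87), so the intercept has no physical meaning.
# - The model explains 35.79% of the variance in Volume (multiple R-squared 0.3579; adjusted R-squared 0.3358).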
#Linear Model
library(ggplot2)

# Scatter plot of Volume against Height with a fitted "lm" smoother drawn in
# blue, on a white panel with explicit x and y axis lines.
ggplot(df, aes(Height, Volume)) +
  geom_point() +
  geom_smooth(method = "lm", colour = "dodgerblue3") +
  theme(
    panel.background = element_rect(fill = "white"),
    axis.line.x = element_line(),
    axis.line.y = element_line()
  )
## `geom_smooth()` using formula = 'y ~ x'

# Draw the four default diagnostic plots for the Volume ~ Height fit
# (the `which` values below are plot.lm's defaults, written out explicitly).
plot(tree_model, which = c(1, 2, 3, 5))




#Residuals vs Fitted - The residuals should be evenly distributed above and below the zero line, with no clear pattern. There may be a few more points below the line, but the spread is even enough.
#Normal Q-Q - If the residuals are normally distributed, the points fall close to the straight reference line, and about 95% of them lie within 2 standard deviations of the mean. Most of the data is within this range, with only 2 outliers outside of it.
#Scale-Location - Shows whether the residuals are spread equally across the fitted values, in order to check homoscedasticity (equal variance of the residuals). The line is relatively flat, with a few larger residuals causing it to rise in places.
#Residuals vs Leverage - Identifies influential data points, i.e. observations that have a large effect on the fitted linear model.
# NOTE(review): df1 is an unmodified copy of df, so tree_model1 is the same
# Volume ~ Height fit as tree_model and the plots below repeat the earlier
# diagnostics. If a transformed or filtered data set was intended, apply that
# change to df1 before fitting.
df1 <- df
tree_model1 <- lm(Volume ~ Height, data = df1)
plot(tree_model1)




#The data does show a linear trend, by looking at the “Linear Model Fitted to Data” graph
#The residuals in the Normal Q-Q graph seem to follow a normal distribution. There are some outliers, but the points mostly follow a straight line
#The variance of the residuals around the least squares regression line does seem to be roughly constant
#Perhaps the Gauss-Markov assumptions do not fully hold; applying a log transformation to the variables (e.g. modelling log(Volume)) might have given a clearer linear relationship