# Display the full built-in `trees` data set (Girth, Height, Volume).
print(datasets::trees)
##    Girth Height Volume
## 1    8.3     70   10.3
## 2    8.6     65   10.3
## 3    8.8     63   10.2
## 4   10.5     72   16.4
## 5   10.7     81   18.8
## 6   10.8     83   19.7
## 7   11.0     66   15.6
## 8   11.0     75   18.2
## 9   11.1     80   22.6
## 10  11.2     75   19.9
## 11  11.3     79   24.2
## 12  11.4     76   21.0
## 13  11.4     76   21.4
## 14  11.7     69   21.3
## 15  12.0     75   19.1
## 16  12.9     74   22.2
## 17  12.9     85   33.8
## 18  13.3     86   27.4
## 19  13.7     71   25.7
## 20  13.8     64   24.9
## 21  14.0     78   34.5
## 22  14.2     80   31.7
## 23  14.5     74   36.3
## 24  16.0     72   38.3
## 25  16.3     77   42.6
## 26  17.3     81   55.4
## 27  17.5     82   55.7
## 28  17.9     80   58.3
## 29  18.0     80   51.5
## 30  18.0     80   51.0
## 31  20.6     87   77.0
# Work on a copy of the built-in `trees` data.
df <- trees
# Scatterplot of Volume (y) against Height (x).
plot(x = df$Height, y = df$Volume)

# Pairwise correlations of Girth with Height and with Volume.
cor(df[["Height"]], df[["Girth"]])
## [1] 0.5192801
cor(df[["Volume"]], df[["Girth"]])
## [1] 0.9671194
#Correlation is measured from -1 to 1 and shows how two variables are linearly related.
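#The Height-Girth correlation (0.52) is moderate and positive; the Volume-Girth correlation (0.97) is very strong and positive.
#The Volume-Height correlation is not computed above; from the regression's R-squared (0.3579) it is about 0.60: moderate, not strong, and positive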

#Regression Analysis

# Fit a simple linear regression of Volume on Height and print its summary
# (residual quantiles, coefficient table, R-squared and F-statistic).
tree_model <- lm(Volume ~ Height, data = df)
summary(tree_model)
## 
## Call:
## lm(formula = Volume ~ Height, data = df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -21.274  -9.894  -2.894  12.068  29.852 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -87.1236    29.2731  -2.976 0.005835 ** 
## Height        1.5433     0.3839   4.021 0.000378 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 13.4 on 29 degrees of freedom
## Multiple R-squared:  0.3579, Adjusted R-squared:  0.3358 
## F-statistic: 16.16 on 1 and 29 DF,  p-value: 0.0003784
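#Volume = -87.1236 + 1.5433 * Height - A 1 unit increase in height is associated with a 1.5433 unit increase in Volume. - When height is zero, the predicted volume is -87.1236 (an extrapolation with no physical meaning, since volume cannot be negative). - The model explained 35.79% of the variance in volume (multiple R-squared; 33.58% adjusted).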

#Linear Model

library(ggplot2)

# Scatterplot of Volume against Height with a fitted linear-model smoother,
# drawn on a white panel with both axis lines shown.
ggplot(df, mapping = aes(x = Height, y = Volume)) +
  geom_point() +
  stat_smooth(method = "lm", colour = "dodgerblue3") +
  theme(
    panel.background = element_rect(fill = "white"),
    axis.line.x = element_line(),
    axis.line.y = element_line()
  )
## `geom_smooth()` using formula = 'y ~ x'

# Draw the default lm diagnostic panels: Residuals vs Fitted, Normal Q-Q,
# Scale-Location and Residuals vs Leverage.
plot(tree_model, which = c(1L, 2L, 3L, 5L))

#Residuals vs Fitted - The residuals should be evenly distributed above and below the line. It looks like there may be a few more below the line, but they are distributed evenly enough.
#Normal Q-Q - If the residuals are normally distributed, the points will follow the straight reference line, and about 95% of them will lie within 2 standard deviations of the mean. Most of the data is within this range, with only 2 outliers outside of the range.
#Scale-Location - Shows whether the residuals are spread equally across the range of fitted values, in order to check homoscedasticity (equal variance of the residuals). The line is relatively straight, with a few residuals causing small elevations.
#Residuals vs Leverage - Shows influential data points, i.e. points that have a big effect on the linear model.

# Refit the Volume ~ Height regression on a copy of the data and redraw the
# diagnostic plots.
# NOTE(review): df1 is an unmodified copy of df, so this is the same fit as
# tree_model; presumably a transformation of the data was intended -- confirm.
df1 <- df
tree_model1 <- lm(Volume ~ Height, data = df1)
plot(tree_model1)

#The data does show a linear trend, by looking at the “Linear Model Fitted to Data” graph
#The residuals in the Normal Q-Q graph seem to be following a normal distribution. There are some outliers, but the points mostly follow a straight line
#The spread of the residuals around the least squares regression line does seem to be constant

#Perhaps the Gauss-Markov assumptions do not fully hold; using a log transformation may have given more clarity