# Session reset: empty the workspace, reclaim memory, clear the console, close plots.
rm(list = ls()) # Remove every object that ls() lists in the current environment (objects, not files)
gc() # Run garbage collection; the memory-usage table it returns is auto-printed below
## used (Mb) gc trigger (Mb) limit (Mb) max used (Mb)
## Ncells 526513 28.2 1169612 62.5 NA 669420 35.8
## Vcells 970825 7.5 8388608 64.0 16384 1851931 14.2
cat("\f") # Emit a form-feed character; clears the console in front ends that honour it (e.g. RStudio)
graphics.off() # Close all open graphics devices
# Preview the first rows of the Orange data set (head() shows six rows by default).
head(Orange)
## Tree age circumference
## 1 1 118 30
## 2 1 484 58
## 3 1 664 87
## 4 1 1004 115
## 5 1 1231 120
## 6 1 1372 142
# Column-wise summary: level counts for Tree, five-number summary plus mean for age and circumference.
summary(Orange)
## Tree age circumference
## 3:7 Min. : 118.0 Min. : 30.0
## 1:7 1st Qu.: 484.0 1st Qu.: 65.5
## 5:7 Median :1004.0 Median :115.0
## 2:7 Mean : 922.1 Mean :115.9
## 4:7 3rd Qu.:1372.0 3rd Qu.:161.5
## Max. :1582.0 Max. :214.0
Dependent Variable (Y): circumference of the orange tree trunk (mm)
Independent Variable (X): age of the orange tree (days)
\(Y_i = \beta_0 + \beta_1 * X_i + \epsilon_i\)
\(Y_i\) = circumference of the \(i\)th orange tree
\(X_i\) = Age of the \(i\)th orange tree
\(\beta_0\) = Intercept (constant) term
\(\beta_1\) = Slope coefficient representing the change in circumference (mm) per one-day change in age
\(\epsilon_i\) = Error term representing unexplained variation
library(ggplot2)

# Work on a copy of the built-in Orange data set.
# The name `data` is kept because later chunks refer to it.
data <- Orange

# Pearson correlation between trunk circumference and tree age,
# using only rows where both values are present.
cor(data[["circumference"]], data[["age"]], use = "complete.obs")
## [1] 0.9135189

# Scatter plot of circumference against age (red diamonds) with a
# dashed least-squares line and its confidence band.
ggplot(data = data, mapping = aes(x = age, y = circumference)) +
  geom_point(colour = "red", shape = 18, size = 2) +
  stat_smooth(method = lm, linetype = "dashed") +
  labs(x = "Age (days)", y = "Circumference (mm)")
## `geom_smooth()` using formula = 'y ~ x'
# Fit a simple linear regression of circumference (response) on age (predictor).
# NOTE: summary() echoes this call verbatim in its "Call:" line, so the call is left as written.
lm_model <- lm(circumference ~ age,
data = data)
# Summary of the linear regression model: residual quantiles, coefficient table
# (estimate, std. error, t value, p-value), residual standard error, R-squared and F-statistic.
summary(lm_model)
##
## Call:
## lm(formula = circumference ~ age, data = data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -46.310 -14.946 -0.076 19.697 45.111
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 17.399650 8.622660 2.018 0.0518 .
## age 0.106770 0.008277 12.900 1.93e-14 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 23.74 on 33 degrees of freedom
## Multiple R-squared: 0.8345, Adjusted R-squared: 0.8295
## F-statistic: 166.4 on 1 and 33 DF, p-value: 1.931e-14
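Intercept \(\beta_0\) = Estimated circumference of the orange tree trunk at an age of 0 days (about 17.4 mm). This value has little practical meaning here: a tree has no trunk circumference at 0 days old, and an age of 0 lies outside the observed range of the data (118 to 1582 days), so it is an extrapolation.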
Slope \(\beta_1\) = The change in circumference for each additional day of age. Here the estimated coefficient is 0.107, meaning that each additional day of age is associated with an estimated increase in circumference of about 0.107 mm.
\(\beta_1\) = \(\frac{cov(X,Y)}{Var(X)}\)
\(cov(X,Y)\) = Covariance between X (age) and Y (circumference)
\(Var(X)\) = Variance of X (age)
# Sample covariance between age (X) and circumference (Y).
Covariance <- cov(data[["age"]], data[["circumference"]])
print(Covariance)
## [1] 25831.02

# Sample variance of age (X).
Variance <- var(data[["age"]])
print(Variance)
## [1] 241930.7

# Slope estimate: cov(X, Y) / Var(X); should match the `age` coefficient from lm().
Bi <- Covariance / Variance
print(Bi)
## [1] 0.1067703
\(\beta_0\) = \(\bar{Y} - \beta_1 \bar{X}\)
\(\bar{Y}\) = Mean circumference
\(\bar{X}\) = Mean age
# Sample mean of the response (circumference).
Y <- mean(data[["circumference"]])
print(Y)
## [1] 115.8571

# Sample mean of the predictor (age).
X <- mean(data[["age"]])
print(X)
## [1] 922.1429

# Intercept estimate: the fitted line passes through the point of means,
# so the intercept is mean(Y) minus slope times mean(X).
B0 <- Y - (Bi * X)
print(B0)
## [1] 17.39965