# Regression Line with Error Distances
# Nguyen Chi Dung
#==========================
# Example 1
#==========================
library(tidyverse)
# Fit a simple linear regression of Volume on Height using the `trees` data;
# the pipe hands the data frame to lm() through the `.` placeholder.
ols <- trees %>%
  lm(formula = Volume ~ Height, data = .)

# Scatter plot with the fitted line. Each observation also gets a vertical
# segment from its observed Volume to its fitted value (yhat). The segment
# layer inherits x and y from the plot-level mapping, so only the end points
# are mapped explicitly.
trees %>%
  mutate(yhat = fitted(ols)) %>%
  ggplot(aes(x = Height, y = Volume)) +
  geom_segment(aes(xend = Height, yend = yhat), color = "yellow") +
  geom_point(color = "red") +
  geom_smooth(method = "lm", size = 1.3, fill = "blue", alpha = 0.15) +
  labs(title = "Regression Line with Error Distances") +
  theme_minimal()

#==================================================================
# Example 2: An Introduction to Statistical Learning (chapter 3)
#==================================================================
# Import Data:
# The CSV's first column is an unnamed row index. readr auto-names such a
# column `X1` in 1.x but `...1` in 2.x, so dropping it by name is
# version-dependent. Selecting the four real columns by name works on both.
# NOTE(review): this host may no longer serve the file; if the download
# fails, point the URL at a current copy of Advertising.csv. TODO confirm.
advertising <- read_csv("http://www-bcf.usc.edu/~gareth/ISL/Advertising.csv") %>%
  select(TV, radio, newspaper, sales)
# Glimpse data:
advertising %>% head()
## # A tibble: 6 x 4
## TV radio newspaper sales
## <dbl> <dbl> <dbl> <dbl>
## 1 230 37.8 69.2 22.1
## 2 44.5 39.3 45.1 10.4
## 3 17.2 45.9 69.3 9.30
## 4 152 41.3 58.5 18.5
## 5 181 10.8 58.4 12.9
## 6 8.70 48.9 75.0 7.20
# Split Data:
# Seed the RNG so the random sampling below is reproducible.
set.seed(123)
# Training set: within each distinct value of `sales`, sample a 0.6 fraction
# of the rows, then remove the grouping.
# NOTE(review): `sales` is numeric, so grouping on it presumably yields many
# very small groups and the realised training share need not equal 60%.
# Check nrow(train) / nrow(advertising) and confirm this is intended.
train <- advertising %>%
group_by(sales) %>%
sample_frac(0.6) %>%
ungroup()
# Test set: the rows of `advertising` that do not appear in `train`.
# NOTE(review): dplyr set operations also de-duplicate rows; confirm that
# `advertising` contains no fully duplicated rows.
test <- dplyr::setdiff(advertising, train)
# Perform OLS Model1:
# Regress sales on TV using the training rows only. The data frame is passed
# through the `.` placeholder, so the stored call reads `data = .`.
model1 <- train %>%
  lm(formula = sales ~ TV, data = .)
summary(model1)
##
## Call:
## lm(formula = sales ~ TV, data = .)
##
## Residuals:
## Min 1Q Median 3Q Max
## -8.9799 -1.8605 -0.0552 2.1877 6.8657
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.747276 0.562254 12.00 <2e-16 ***
## TV 0.050714 0.003263 15.54 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.355 on 138 degrees of freedom
## Multiple R-squared: 0.6364, Adjusted R-squared: 0.6337
## F-statistic: 241.5 on 1 and 138 DF, p-value: < 2.2e-16
# Regression Line:
# Training data with the least-squares line and one vertical segment per
# observation, drawn from the observed sales to the value fitted by model1.
# The segment layer inherits x and y from the plot-level mapping.
train %>%
  mutate(yhat = fitted(model1)) %>%
  ggplot(aes(x = TV, y = sales)) +
  geom_segment(aes(xend = TV, yend = yhat), color = "yellow") +
  geom_point(color = "red") +
  geom_smooth(method = "lm", size = 1.3, fill = "blue", alpha = 0.15) +
  labs(
    title = "Regression Line with Error Distances",
    subtitle = "Note: Train Data",
    caption = "Source: Advertising Data from James et al. (2014)"
  ) +
  theme_minimal()

# Same training-data plot as above, plus a second smoother (purple, no
# standard-error band). No `method` is given for it, so ggplot2 picks its
# default smoothing method; this lets the curve be compared with the line.
train %>%
  mutate(yhat = fitted(model1)) %>%
  ggplot(aes(x = TV, y = sales)) +
  geom_segment(aes(xend = TV, yend = yhat), color = "yellow") +
  geom_point(color = "red") +
  geom_smooth(method = "lm", size = 1.3, fill = "blue", alpha = 0.15) +
  geom_smooth(color = "purple", se = FALSE) +
  labs(
    title = "Regression Line with Error Distances",
    subtitle = "Note: Train Data",
    caption = "Source: Advertising Data from James et al. (2014)"
  ) +
  theme_minimal()

# Write some functions for assessing the model:
# Mean squared error between observed and predicted values.
#
# Args:
#   actual:    numeric vector of observed values.
#   predicted: numeric vector of predictions; combined element-wise with
#              `actual` under R's usual recycling rules.
# Returns:
#   A single numeric value: the mean of the squared differences.
my_mse <- function(actual, predicted) {
  # Plain base R instead of piping into return(): the function no longer
  # needs magrittr attached, and the value is simply the last expression.
  mean((actual - predicted)^2)
}
# Squared correlation between observed and predicted values.
#
# Args:
#   actual:    numeric vector of observed values.
#   predicted: numeric vector of predictions, same length as `actual`.
# Returns:
#   cor(actual, predicted) squared, as a single numeric value.
my_r2 <- function(actual, predicted) {
  cor(actual, predicted)^2
}
#-----------------------
# Assess Model
#-----------------------
# For train data:
# Compare the observed training sales with the values fitted by model1.
my_r2(train[["sales"]], fitted(model1))
## [1] 0.6363643
my_mse(train[["sales"]], fitted(model1))
## [1] 11.09739
# For test data:
# Compare the observed sales in `test` with model1's predictions for those rows.
my_r2(test[["sales"]], predict(model1, newdata = test))
## [1] 0.5573597
my_mse(test[["sales"]], predict(model1, newdata = test))
## [1] 9.505445