Regression Line with Error Distances

Nguyen Chi Dung

#==========================
#        Example 1
#==========================

library(tidyverse)

# Fit a simple linear regression of Volume on Height for the `trees` data.
# The data frame is piped in as `.`, so the stored call reads `data = .`.
ols <- trees %>% lm(Volume ~ Height, data = .)

# Attach the fitted values so each residual can be drawn as a segment.
trees_with_fit <- trees %>% 
  mutate(yhat = fitted(ols))

ggplot(trees_with_fit, aes(x = Height, y = Volume)) + 
  # Vertical segment from each observation to its fitted value;
  # x and y are inherited from the plot-level mapping.
  geom_segment(aes(xend = Height, yend = yhat), color = "yellow") + 
  geom_point(color = "red") + 
  geom_smooth(method = "lm", size = 1.3, fill = "blue", alpha = 0.15) + 
  labs(title = "Regression Line with Error Distances") + 
  theme_minimal()

#==================================================================
#  Example 2: An Introduction to Statistical Learning (chapter 3)   
#==================================================================

# Import data.
# The first column of the CSV appears to be an unnamed row index. Drop it by
# position rather than by name: the name readr assigns to an unnamed column
# depends on the readr version (`X1` in older releases, `...1` in newer ones),
# so `select(-X1)` fails on current installations.
# NOTE(review): confirm that this URL is still being served.
advertising <- read_csv("http://www-bcf.usc.edu/~gareth/ISL/Advertising.csv") %>%
  select(-1)

# Preview the first rows:
head(advertising)
## # A tibble: 6 x 4
##       TV radio newspaper sales
##    <dbl> <dbl>     <dbl> <dbl>
## 1 230     37.8      69.2 22.1 
## 2  44.5   39.3      45.1 10.4 
## 3  17.2   45.9      69.3  9.30
## 4 152     41.3      58.5 18.5 
## 5 181     10.8      58.4 12.9 
## 6   8.70  48.9      75.0  7.20
# Split data: 60% of the rows for training, the remaining rows for testing.
set.seed(123)

# Sample row positions directly. Grouping by the continuous response `sales`
# before sampling rounds the fraction inside every (mostly tiny) group, so the
# resulting split is not 60/40, and recovering the test set with a set
# difference silently drops duplicated rows.
# NOTE: the printed results recorded as `##` comments later in this file were
# produced with the earlier grouped split and need to be regenerated.
n_obs <- nrow(advertising)
train_rows <- sort(sample.int(n_obs, size = round(0.6 * n_obs)))

train <- advertising %>% slice(train_rows)
test <- advertising %>% slice(-train_rows)

# Model 1: OLS regression of sales on TV, fitted on the training rows only.
# The data frame is piped in as `.`, so the stored call reads `data = .`.
model1 <- train %>% lm(sales ~ TV, data = .)
summary(model1)
## 
## Call:
## lm(formula = sales ~ TV, data = .)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -8.9799 -1.8605 -0.0552  2.1877  6.8657 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 6.747276   0.562254   12.00   <2e-16 ***
## TV          0.050714   0.003263   15.54   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.355 on 138 degrees of freedom
## Multiple R-squared:  0.6364, Adjusted R-squared:  0.6337 
## F-statistic: 241.5 on 1 and 138 DF,  p-value: < 2.2e-16
# Regression line on the training data, with residual segments.
train_with_fit <- train %>% 
  mutate(yhat = fitted(model1))

# Build the shared plot once; both figures below start from it.
residual_plot <- ggplot(train_with_fit, aes(x = TV, y = sales)) + 
  # Vertical segment from each observation to its fitted value;
  # x and y are inherited from the plot-level mapping.
  geom_segment(aes(xend = TV, yend = yhat), color = "yellow") + 
  geom_point(color = "red") + 
  geom_smooth(method = "lm", size = 1.3, fill = "blue", alpha = 0.15) + 
  labs(
    title = "Regression Line with Error Distances", 
    subtitle = "Note: Train Data",
    caption = "Source: Advertising Data from James et al. (2014)"
  ) + 
  theme_minimal()

# Figure 1: linear fit only.
residual_plot

# Figure 2: the same plot with an additional smoother (geom_smooth()'s
# default method, no confidence band) drawn on top for comparison.
residual_plot + 
  geom_smooth(color = "purple", se = FALSE)

# Write some functions for assessing the model:


# Mean squared error between observed and predicted values.
#
# Args:
#   actual:    numeric vector of observed values.
#   predicted: numeric vector of predictions, same length as `actual`.
#
# Returns:
#   A single number: the mean of the squared differences.
#
# Errors if either input is not numeric or if the lengths differ.
my_mse <- function(actual, predicted) {
  # Without the length check R would silently recycle the shorter vector
  # and return a meaningless value.
  stopifnot(
    is.numeric(actual),
    is.numeric(predicted),
    length(actual) == length(predicted)
  )
  mean((actual - predicted)^2)
}

# Squared Pearson correlation between observed and predicted values.
#
# Args:
#   actual:    numeric vector of observed values.
#   predicted: numeric vector of predictions.
#
# Returns:
#   cor(actual, predicted) squared.
my_r2 <- function(actual, predicted) {
  cor(actual, predicted)^2
}

#-----------------------
#    Assess Model
#-----------------------

# Training-set performance:
my_r2(train$sales, fitted(model1))
## [1] 0.6363643
my_mse(train$sales, fitted(model1))
## [1] 11.09739
# Test-set performance (predictions computed once and reused):
test_pred <- predict(model1, newdata = test)
my_r2(test$sales, test_pred)
## [1] 0.5573597
my_mse(test$sales, test_pred)
## [1] 9.505445