Correlation & Regression Analysis

Lets Import the useful libraries & the dataset

library(tidymodels)

## ── Attaching packages ────────────────────────────────────── tidymodels 0.1.3 ──

## ✓ broom        0.7.6      ✓ recipes      0.1.16
## ✓ dials        0.0.9      ✓ rsample      0.1.0 
## ✓ dplyr        1.0.6      ✓ tibble       3.1.2 
## ✓ ggplot2      3.3.3      ✓ tidyr        1.1.3 
## ✓ infer        0.5.4      ✓ tune         0.1.5 
## ✓ modeldata    0.1.0      ✓ workflows    0.2.2 
## ✓ parsnip      0.1.5      ✓ workflowsets 0.0.2 
## ✓ purrr        0.3.4      ✓ yardstick    0.0.8

## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## x purrr::discard() masks scales::discard()
## x dplyr::filter()  masks stats::filter()
## x dplyr::lag()     masks stats::lag()
## x recipes::step()  masks stats::step()
## • Use tidymodels_prefer() to resolve common conflicts.

library(tidyverse)

## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──

## ✓ readr   1.4.0     ✓ forcats 0.5.1
## ✓ stringr 1.4.0

## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x readr::col_factor() masks scales::col_factor()
## x purrr::discard()    masks scales::discard()
## x dplyr::filter()     masks stats::filter()
## x stringr::fixed()    masks recipes::fixed()
## x dplyr::lag()        masks stats::lag()
## x readr::spec()       masks yardstick::spec()

data("Orange")

orange <- as_tibble(Orange)
glimpse(orange)

## Rows: 35
## Columns: 3
## $ Tree          <ord> 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3,…
## $ age           <dbl> 118, 484, 664, 1004, 1231, 1372, 1582, 118, 484, 664, 10…
## $ circumference <dbl> 30, 58, 87, 115, 120, 142, 145, 33, 69, 111, 156, 172, 2…

cor(orange$age,orange$circumference)

## [1] 0.9135189

ggplot(orange, aes(age,circumference, color=Tree)) +
  geom_line() +theme_bw()

Test for correlations individually within each tree

orange %>% group_by(Tree) %>% summarize(correlation = cor(age,circumference))

instead of simply estimating a correlation, we want to perform a hypothesis test

ct <- cor.test(orange$age,orange$circumference)
ct

## 
##  Pearson's product-moment correlation
## 
## data:  orange$age and orange$circumference
## t = 12.9, df = 33, p-value = 1.931e-14
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.8342364 0.9557955
## sample estimates:
##       cor 
## 0.9135189

tidy(ct)

nested <- orange %>% nest(data=c(age,circumference))
nested

nested %>% mutate(test = map(data, ~ cor.test(.x$age, .x$circumference)),tidied = map(test,tidy))

orange %>% 
  nest(data = c(age, circumference)) %>% 
  mutate(
    test = map(data, ~ cor.test(.x$age, .x$circumference)), # S3 list-col
    tidied = map(test, tidy)
  ) %>% 
  unnest(cols = tidied) %>% 
  select(-data, -test)

Regression Model

lm_fit <- lm(age ~ circumference, data = orange)
summary(lm_fit)

## 
## Call:
## lm(formula = age ~ circumference, data = orange)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -317.88 -140.90  -17.20   96.54  471.16 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    16.6036    78.1406   0.212    0.833    
## circumference   7.8160     0.6059  12.900 1.93e-14 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 203.1 on 33 degrees of freedom
## Multiple R-squared:  0.8345, Adjusted R-squared:  0.8295 
## F-statistic: 166.4 on 1 and 33 DF,  p-value: 1.931e-14

tidy(lm_fit)

orange %>%
  nest(data = c(-Tree)) %>% 
  mutate(
    fit = map(data, ~ lm(age ~ circumference, data = .x)),
    tidied = map(fit, tidy)
  ) %>% 
  unnest(tidied) %>% 
  select(-data, -fit)

mtcars <- as_tibble(mtcars)
mtcars

mtcars %>%
  nest(data = c(-am)) %>% 
  mutate(
    fit = map(data, ~ lm(wt ~ mpg + qsec + gear, data = .x)),  # S3 list-col
    tidied = map(fit, tidy)
  ) %>% 
  unnest(tidied) %>% 
  select(-data, -fit)

 regressions <-  mtcars %>%
  nest(data = c(-am)) %>% 
  mutate(
    fit = map(data, ~ lm(wt ~ mpg + qsec + gear, data = .x)),
    tidied = map(fit, tidy),
    glanced = map(fit, glance),
    augmented = map(fit, augment)
  )

regressions %>% 
  select(tidied) %>% 
  unnest(tidied)

regressions %>% 
  select(glanced) %>% 
  unnest(glanced)

regressions %>% 
  select(augmented) %>% 
  unnest(augmented)