Modelling with modelr

Memuat Package yang digunakan

library(tidyverse) 
library(modelr) 
options(na.action = na.warn)
library(nycflights13) 
library(lubridate)
library(splines)
library(MASS)
library(gapminder)

Model Building

Dataset yang digunakan

gapminder
## # A tibble: 1,704 x 6
##    country     continent  year lifeExp      pop gdpPercap
##    <fct>       <fct>     <int>   <dbl>    <int>     <dbl>
##  1 Afghanistan Asia       1952    28.8  8425333      779.
##  2 Afghanistan Asia       1957    30.3  9240934      821.
##  3 Afghanistan Asia       1962    32.0 10267083      853.
##  4 Afghanistan Asia       1967    34.0 11537966      836.
##  5 Afghanistan Asia       1972    36.1 13079460      740.
##  6 Afghanistan Asia       1977    38.4 14880372      786.
##  7 Afghanistan Asia       1982    39.9 12881816      978.
##  8 Afghanistan Asia       1987    40.8 13867957      852.
##  9 Afghanistan Asia       1992    41.7 16317921      649.
## 10 Afghanistan Asia       1997    41.8 22227415      635.
## # ... with 1,694 more rows

Masalah

Bagaimana harapan hidup setiap negara berubah seiring waktu ?

gapminder %>% 
  ggplot(aes(year,
             lifeExp,
             group = country)
         ) +
  geom_line(alpha = 1/4)

Pada grafik diatas terlihat seiring waktu, angka harapan hidup kebanyakan negara meningkat. Namun terdapat juga beberapa negara yang tidak mengikuti pola tersebut.


Pada satu negara

Visalisasi data.

nz <- filter(gapminder, 
             country == "Indonesia") 

nz %>% 
  ggplot(aes(year, lifeExp)) +
  geom_line() +
  ggtitle("Full data = ")

Terlihat pola data linear.

Model dan visualisasi data berdasarkan model.

nz_mod <- lm(lifeExp ~ year,
             data = nz) 

nz %>%
  add_predictions(nz_mod) %>% 
  ggplot(aes(year, pred)) + 
  geom_line() +
  ggtitle("Linear trend + ")

Residual.

nz %>%  
  add_residuals(nz_mod) %>% 
  ggplot(aes(year, resid)) +
  geom_hline(yintercept = 0,
             color = "white",
             size = 3) + 
  geom_line() + 
  ggtitle("Remaining pattern")


Pada Setiap Negara

Untuk setiap negara dapat menggunakan code sebelumnya di copy-paste pada setiap negara. Tapi banyak jadi gak mungkin menggunakan copy-paste.

Berikut cara yang lebih baik.

  1. Buat nested data frame untuk setiap negara.
by_country <- gapminder %>% 
  group_by(country, continent) %>%
  nest()
by_country 
## # A tibble: 142 x 3
## # Groups:   country, continent [142]
##    country     continent data             
##    <fct>       <fct>     <list>           
##  1 Afghanistan Asia      <tibble [12 x 4]>
##  2 Albania     Europe    <tibble [12 x 4]>
##  3 Algeria     Africa    <tibble [12 x 4]>
##  4 Angola      Africa    <tibble [12 x 4]>
##  5 Argentina   Americas  <tibble [12 x 4]>
##  6 Australia   Oceania   <tibble [12 x 4]>
##  7 Austria     Europe    <tibble [12 x 4]>
##  8 Bahrain     Asia      <tibble [12 x 4]>
##  9 Bangladesh  Asia      <tibble [12 x 4]>
## 10 Belgium     Europe    <tibble [12 x 4]>
## # ... with 132 more rows
by_country$data[[1]] # Contoh untuk negara afghanistan (baris 1)
## # A tibble: 12 x 4
##     year lifeExp      pop gdpPercap
##    <int>   <dbl>    <int>     <dbl>
##  1  1952    28.8  8425333      779.
##  2  1957    30.3  9240934      821.
##  3  1962    32.0 10267083      853.
##  4  1967    34.0 11537966      836.
##  5  1972    36.1 13079460      740.
##  6  1977    38.4 14880372      786.
##  7  1982    39.9 12881816      978.
##  8  1987    40.8 13867957      852.
##  9  1992    41.7 16317921      649.
## 10  1997    41.8 22227415      635.
## 11  2002    42.1 25268405      727.
## 12  2007    43.8 31889923      975.
  1. Model menggunakan perintah map pada purr
country_model <- function(df) {
  lm(lifeExp ~ year, 
     data = df) 
}

country_model(by_country$data[[1]] ) # Negara Afganistan
## 
## Call:
## lm(formula = lifeExp ~ year, data = df)
## 
## Coefficients:
## (Intercept)         year  
##   -507.5343       0.2753
models <- map(by_country$data,
              country_model)  # Model dari seluruh negara
  1. Tambahkan model pada nested dataframe pada bagian (1).
by_country <- by_country %>% 
  mutate(model = map(data, 
                     country_model)
         )
by_country 
## # A tibble: 142 x 4
## # Groups:   country, continent [142]
##    country     continent data              model 
##    <fct>       <fct>     <list>            <list>
##  1 Afghanistan Asia      <tibble [12 x 4]> <lm>  
##  2 Albania     Europe    <tibble [12 x 4]> <lm>  
##  3 Algeria     Africa    <tibble [12 x 4]> <lm>  
##  4 Angola      Africa    <tibble [12 x 4]> <lm>  
##  5 Argentina   Americas  <tibble [12 x 4]> <lm>  
##  6 Australia   Oceania   <tibble [12 x 4]> <lm>  
##  7 Austria     Europe    <tibble [12 x 4]> <lm>  
##  8 Bahrain     Asia      <tibble [12 x 4]> <lm>  
##  9 Bangladesh  Asia      <tibble [12 x 4]> <lm>  
## 10 Belgium     Europe    <tibble [12 x 4]> <lm>  
## # ... with 132 more rows
  1. Menambahkan residual pada masing-masing model.
by_country <- by_country %>%  
  mutate( resids = map2(data,
                        model,
                        add_residuals)
          ) 
by_country 
## # A tibble: 142 x 5
## # Groups:   country, continent [142]
##    country     continent data              model  resids           
##    <fct>       <fct>     <list>            <list> <list>           
##  1 Afghanistan Asia      <tibble [12 x 4]> <lm>   <tibble [12 x 5]>
##  2 Albania     Europe    <tibble [12 x 4]> <lm>   <tibble [12 x 5]>
##  3 Algeria     Africa    <tibble [12 x 4]> <lm>   <tibble [12 x 5]>
##  4 Angola      Africa    <tibble [12 x 4]> <lm>   <tibble [12 x 5]>
##  5 Argentina   Americas  <tibble [12 x 4]> <lm>   <tibble [12 x 5]>
##  6 Australia   Oceania   <tibble [12 x 4]> <lm>   <tibble [12 x 5]>
##  7 Austria     Europe    <tibble [12 x 4]> <lm>   <tibble [12 x 5]>
##  8 Bahrain     Asia      <tibble [12 x 4]> <lm>   <tibble [12 x 5]>
##  9 Bangladesh  Asia      <tibble [12 x 4]> <lm>   <tibble [12 x 5]>
## 10 Belgium     Europe    <tibble [12 x 4]> <lm>   <tibble [12 x 5]>
## # ... with 132 more rows
  1. Kembalikkan nested dataframe sebelumnya menjadi dataframe biasa.
resids <- unnest(by_country, 
                 resids) 
resids
## # A tibble: 1,704 x 9
## # Groups:   country, continent [142]
##    country   continent data        model   year lifeExp    pop gdpPercap   resid
##    <fct>     <fct>     <list>      <list> <int>   <dbl>  <int>     <dbl>   <dbl>
##  1 Afghanis~ Asia      <tibble [1~ <lm>    1952    28.8 8.43e6      779. -1.11  
##  2 Afghanis~ Asia      <tibble [1~ <lm>    1957    30.3 9.24e6      821. -0.952 
##  3 Afghanis~ Asia      <tibble [1~ <lm>    1962    32.0 1.03e7      853. -0.664 
##  4 Afghanis~ Asia      <tibble [1~ <lm>    1967    34.0 1.15e7      836. -0.0172
##  5 Afghanis~ Asia      <tibble [1~ <lm>    1972    36.1 1.31e7      740.  0.674 
##  6 Afghanis~ Asia      <tibble [1~ <lm>    1977    38.4 1.49e7      786.  1.65  
##  7 Afghanis~ Asia      <tibble [1~ <lm>    1982    39.9 1.29e7      978.  1.69  
##  8 Afghanis~ Asia      <tibble [1~ <lm>    1987    40.8 1.39e7      852.  1.28  
##  9 Afghanis~ Asia      <tibble [1~ <lm>    1992    41.7 1.63e7      649.  0.754 
## 10 Afghanis~ Asia      <tibble [1~ <lm>    1997    41.8 2.22e7      635. -0.534 
## # ... with 1,694 more rows
  1. Plot residual
resids %>%
  ggplot(aes(year, 
             resid))+
  geom_line(aes(group = country),
            alpha = 1 / 3) + 
  geom_smooth(se = FALSE) 
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

resids %>% 
  ggplot(aes(year, 
             resid, 
             group = country)) +
  geom_line(alpha = 1 / 3) + 
  facet_wrap(~continent)

Pada grafik diatas terlihat sesuatu yang mearik pada benua afrika. Terlihat model masih kurang baik/pas. Terlihat masih banyak residual yang besar.

Tapi asia juga kaya sama, kaya terdapat residual yang besar.


MODEL QUALITY

Menambahkan model quality pada dataframe.

glance <- by_country %>% 
  mutate(glance = map(model,
                      broom::glance)) %>%
  unnest(glance, .drop = TRUE) 
## Warning: The `.drop` argument of `unnest()` is deprecated as of tidyr 1.0.0.
## All list-columns are now preserved.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_warnings()` to see where this warning was generated.
glance
## # A tibble: 142 x 17
## # Groups:   country, continent [142]
##    country continent data  model resids r.squared adj.r.squared sigma statistic
##    <fct>   <fct>     <lis> <lis> <list>     <dbl>         <dbl> <dbl>     <dbl>
##  1 Afghan~ Asia      <tib~ <lm>  <tibb~     0.948         0.942 1.22      181. 
##  2 Albania Europe    <tib~ <lm>  <tibb~     0.911         0.902 1.98      102. 
##  3 Algeria Africa    <tib~ <lm>  <tibb~     0.985         0.984 1.32      662. 
##  4 Angola  Africa    <tib~ <lm>  <tibb~     0.888         0.877 1.41       79.1
##  5 Argent~ Americas  <tib~ <lm>  <tibb~     0.996         0.995 0.292    2246. 
##  6 Austra~ Oceania   <tib~ <lm>  <tibb~     0.980         0.978 0.621     481. 
##  7 Austria Europe    <tib~ <lm>  <tibb~     0.992         0.991 0.407    1261. 
##  8 Bahrain Asia      <tib~ <lm>  <tibb~     0.967         0.963 1.64      291. 
##  9 Bangla~ Asia      <tib~ <lm>  <tibb~     0.989         0.988 0.977     930. 
## 10 Belgium Europe    <tib~ <lm>  <tibb~     0.995         0.994 0.293    1822. 
## # ... with 132 more rows, and 8 more variables: p.value <dbl>, df <dbl>,
## #   logLik <dbl>, AIC <dbl>, BIC <dbl>, deviance <dbl>, df.residual <int>,
## #   nobs <int>

Pada dataframe diatas kita dapat melihat data yang tidak pas dengan model (R squarednya kecil).

glance %>%
  arrange(r.squared)
## # A tibble: 142 x 17
## # Groups:   country, continent [142]
##    country continent data  model resids r.squared adj.r.squared sigma statistic
##    <fct>   <fct>     <lis> <lis> <list>     <dbl>         <dbl> <dbl>     <dbl>
##  1 Rwanda  Africa    <tib~ <lm>  <tibb~    0.0172      -0.0811   6.56     0.175
##  2 Botswa~ Africa    <tib~ <lm>  <tibb~    0.0340      -0.0626   6.11     0.352
##  3 Zimbab~ Africa    <tib~ <lm>  <tibb~    0.0562      -0.0381   7.21     0.596
##  4 Zambia  Africa    <tib~ <lm>  <tibb~    0.0598      -0.0342   4.53     0.636
##  5 Swazil~ Africa    <tib~ <lm>  <tibb~    0.0682      -0.0250   6.64     0.732
##  6 Lesotho Africa    <tib~ <lm>  <tibb~    0.0849      -0.00666  5.93     0.927
##  7 Cote d~ Africa    <tib~ <lm>  <tibb~    0.283        0.212    3.93     3.95 
##  8 South ~ Africa    <tib~ <lm>  <tibb~    0.312        0.244    4.74     4.54 
##  9 Uganda  Africa    <tib~ <lm>  <tibb~    0.342        0.276    3.19     5.20 
## 10 Congo,~ Africa    <tib~ <lm>  <tibb~    0.348        0.283    2.43     5.34 
## # ... with 132 more rows, and 8 more variables: p.value <dbl>, df <dbl>,
## #   logLik <dbl>, AIC <dbl>, BIC <dbl>, deviance <dbl>, df.residual <int>,
## #   nobs <int>

Visualisasi dari R squared.

glance %>%  
  ggplot(aes(continent,
             r.squared)) +  
  geom_jitter(width = 0.5)

Berikut negara (negara afrika) dengan r squared kurang dari 0.25

bad_fit <- filter(glance,
                  r.squared < 0.25)
gapminder %>%  
  semi_join(bad_fit, 
            by = "country") %>%  
  ggplot(aes(year,
             lifeExp,
             color = country)) +
  geom_line()

Menurut buku, angka harapan hidup diatas mengalami penurunan karena terjadi epidemic HIV/AIDS dan genosida Rwanda.

Tambahan negara dengan P-value kurang dari 0.05

bad_fit <- filter(glance,
                  p.value > 0.05)
gapminder %>%  
  semi_join(bad_fit, 
            by = "country") %>%  
  ggplot(aes(year,
             lifeExp,
             color = country)) +
  geom_line()


TAMBAHAN

LIST - COLUMNS

data.frame(x = list(1:3, 3:5)) 
##   x.1.3 x.3.5
## 1     1     3
## 2     2     4
## 3     3     5
data.frame( 
  x = I(list(1:3, 3:5)), 
  y = c("1, 2", "3, 4, 5") ) 
##         x       y
## 1 1, 2, 3    1, 2
## 2 3, 4, 5 3, 4, 5
tibble( 
  x = list(1:3, 3:5),  
  y = c("1, 2", "3, 4, 5") )
## # A tibble: 2 x 2
##   x         y      
##   <list>    <chr>  
## 1 <int [3]> 1, 2   
## 2 <int [3]> 3, 4, 5
tribble(  
  ~x, ~y, 
  1:3, 
  "1, 2", 
  3:5,
  "3, 4, 5" )
## # A tibble: 2 x 2
##   x         y      
##   <list>    <chr>  
## 1 <int [3]> 1, 2   
## 2 <int [3]> 3, 4, 5

CREATING LIST - COLUMN

  1. With Nesting
gapminder %>%  
  group_by(country,
           continent) %>%
  nest()
## # A tibble: 142 x 3
## # Groups:   country, continent [142]
##    country     continent data             
##    <fct>       <fct>     <list>           
##  1 Afghanistan Asia      <tibble [12 x 4]>
##  2 Albania     Europe    <tibble [12 x 4]>
##  3 Algeria     Africa    <tibble [12 x 4]>
##  4 Angola      Africa    <tibble [12 x 4]>
##  5 Argentina   Americas  <tibble [12 x 4]>
##  6 Australia   Oceania   <tibble [12 x 4]>
##  7 Austria     Europe    <tibble [12 x 4]>
##  8 Bahrain     Asia      <tibble [12 x 4]>
##  9 Bangladesh  Asia      <tibble [12 x 4]>
## 10 Belgium     Europe    <tibble [12 x 4]>
## # ... with 132 more rows
gapminder %>%
  nest(year:gdpPercap)
## # A tibble: 142 x 3
##    country     continent data             
##    <fct>       <fct>     <list>           
##  1 Afghanistan Asia      <tibble [12 x 4]>
##  2 Albania     Europe    <tibble [12 x 4]>
##  3 Algeria     Africa    <tibble [12 x 4]>
##  4 Angola      Africa    <tibble [12 x 4]>
##  5 Argentina   Americas  <tibble [12 x 4]>
##  6 Australia   Oceania   <tibble [12 x 4]>
##  7 Austria     Europe    <tibble [12 x 4]>
##  8 Bahrain     Asia      <tibble [12 x 4]>
##  9 Bangladesh  Asia      <tibble [12 x 4]>
## 10 Belgium     Europe    <tibble [12 x 4]>
## # ... with 132 more rows
  1. From Vectorized Functions
df <- tribble(  ~x1, 
                "a,b,c", 
                "d,e,f,g" )
df
## # A tibble: 2 x 1
##   x1     
##   <chr>  
## 1 a,b,c  
## 2 d,e,f,g
df %>% 
  mutate(x2 = stringr::str_split(x1, ",")) 
## # A tibble: 2 x 2
##   x1      x2       
##   <chr>   <list>   
## 1 a,b,c   <chr [3]>
## 2 d,e,f,g <chr [4]>
sim <- tribble(  ~f, ~params, 
                 "runif", list(min = -1,max = -1),
                 "rnorm", list(sd = 5), 
                 "rpois", list(lambda = 10) )
sim
## # A tibble: 3 x 2
##   f     params          
##   <chr> <list>          
## 1 runif <named list [2]>
## 2 rnorm <named list [1]>
## 3 rpois <named list [1]>
sim %>% 
  mutate(sims = invoke_map(f, 
                           params,
                           n = 10)) 
## # A tibble: 3 x 3
##   f     params           sims      
##   <chr> <list>           <list>    
## 1 runif <named list [2]> <dbl [10]>
## 2 rnorm <named list [1]> <dbl [10]>
## 3 rpois <named list [1]> <int [10]>
  1. From Mutivalued Summaries
mtcars %>% 
  group_by(cyl) %>%
  summarize(q = quantile(mpg)) 
## # A tibble: 15 x 2
## # Groups:   cyl [3]
##      cyl     q
##    <dbl> <dbl>
##  1     4  21.4
##  2     4  22.8
##  3     4  26  
##  4     4  30.4
##  5     4  33.9
##  6     6  17.8
##  7     6  18.6
##  8     6  19.7
##  9     6  21  
## 10     6  21.4
## 11     8  10.4
## 12     8  14.4
## 13     8  15.2
## 14     8  16.2
## 15     8  19.2
mtcars %>%  
  group_by(cyl) %>% 
  summarize(q = list(quantile(mpg))) 
## # A tibble: 3 x 2
##     cyl q        
##   <dbl> <list>   
## 1     4 <dbl [5]>
## 2     6 <dbl [5]>
## 3     8 <dbl [5]>
  1. From a Named List
x <- list(  a = 1:5, 
            b = 3:4, 
            c = 5:6 )
x
## $a
## [1] 1 2 3 4 5
## 
## $b
## [1] 3 4
## 
## $c
## [1] 5 6
df <- enframe(x)
df 
## # A tibble: 3 x 2
##   name  value    
##   <chr> <list>   
## 1 a     <int [5]>
## 2 b     <int [2]>
## 3 c     <int [2]>

SIMPLIFYING LIST - COLUMNS

  1. List To Vector
df <- tribble(  ~x,  
                letters[1:5],
                1:3,  
                runif(5) 
                )
df
## # A tibble: 3 x 1
##   x        
##   <list>   
## 1 <chr [5]>
## 2 <int [3]>
## 3 <dbl [5]>
df %>% 
  mutate(  type = map_chr(x,
                          typeof),
           length = map_int(x,
                            length)
           )
## # A tibble: 3 x 3
##   x         type      length
##   <list>    <chr>      <int>
## 1 <chr [5]> character      5
## 2 <int [3]> integer        3
## 3 <dbl [5]> double         5
df <- tribble(  ~x, 
                list(a = 1, b = 2),
                list(a = 2, c = 4) ) 
df
## # A tibble: 2 x 1
##   x               
##   <list>          
## 1 <named list [2]>
## 2 <named list [2]>
df %>% 
  mutate(  a = map_dbl(x,
                       "a"),
           b = map_dbl(x, 
                       "b", 
                       .null = NA_real_),
           c = map_dbl(x, 
                        "c",
                        .null=NA_real_)
           )
## # A tibble: 2 x 4
##   x                    a     b     c
##   <list>           <dbl> <dbl> <dbl>
## 1 <named list [2]>     1     2    NA
## 2 <named list [2]>     2    NA     4
  1. Unnesting
tibble(x = 1:2,
       y = list(1:4, 1)) %>% 
  unnest(y) 
## # A tibble: 5 x 2
##       x     y
##   <int> <dbl>
## 1     1     1
## 2     1     2
## 3     1     3
## 4     1     4
## 5     2     1
df1 <- tribble(  ~x, ~y, ~z,  
                 1, c("a", "b"), 1:2, 
                 2, "c", 3 )
df1 
## # A tibble: 2 x 3
##       x y         z        
##   <dbl> <list>    <list>   
## 1     1 <chr [2]> <int [2]>
## 2     2 <chr [1]> <dbl [1]>
df1 %>% 
  unnest(y, z) 
## # A tibble: 3 x 3
##       x y         z
##   <dbl> <chr> <dbl>
## 1     1 a         1
## 2     1 b         2
## 3     2 c         3
df2 <- tribble(  ~x, ~y, ~z,
                 1, "a", 1:2,
                 2, c("b", "c"), 3 )
df2 
## # A tibble: 2 x 3
##       x y         z        
##   <dbl> <list>    <list>   
## 1     1 <chr [1]> <int [2]>
## 2     2 <chr [2]> <dbl [1]>
df2 %>% 
  unnest(y, z) 
## # A tibble: 4 x 3
##       x y         z
##   <dbl> <chr> <dbl>
## 1     1 a         1
## 2     1 a         2
## 3     2 b         3
## 4     2 c         3

Tambahan

  1. broom::glance(model) returns a row for each model. Each column gives a model summary: either a measure of model quality, or complexity, or a combination of the two.

  2. broom::tidy(model) returns a row for each coefficient in the model. Each column gives information about the estimate or its variability.

  3. broom::augment(model, data) returns a row for each row in data, adding extra values like residuals, and influence statistics.


TAMAT