Memuat Package yang digunakan
library(tidyverse)
library(modelr)
options(na.action = na.warn)
library(nycflights13)
library(lubridate)
library(splines)
library(MASS)
library(gapminder)
Dataset yang digunakan
gapminder
## # A tibble: 1,704 x 6
## country continent year lifeExp pop gdpPercap
## <fct> <fct> <int> <dbl> <int> <dbl>
## 1 Afghanistan Asia 1952 28.8 8425333 779.
## 2 Afghanistan Asia 1957 30.3 9240934 821.
## 3 Afghanistan Asia 1962 32.0 10267083 853.
## 4 Afghanistan Asia 1967 34.0 11537966 836.
## 5 Afghanistan Asia 1972 36.1 13079460 740.
## 6 Afghanistan Asia 1977 38.4 14880372 786.
## 7 Afghanistan Asia 1982 39.9 12881816 978.
## 8 Afghanistan Asia 1987 40.8 13867957 852.
## 9 Afghanistan Asia 1992 41.7 16317921 649.
## 10 Afghanistan Asia 1997 41.8 22227415 635.
## # ... with 1,694 more rows
Masalah
Bagaimana harapan hidup setiap negara berubah seiring waktu ?
gapminder %>%
ggplot(aes(year,
lifeExp,
group = country)
) +
geom_line(alpha = 1/4)
Pada grafik diatas terlihat seiring waktu, angka harapan hidup kebanyakan negara meningkat. Namun terdapat juga beberapa negara yang tidak mengikuti pola tersebut.
Pada satu negara
Visalisasi data.
nz <- filter(gapminder,
country == "Indonesia")
nz %>%
ggplot(aes(year, lifeExp)) +
geom_line() +
ggtitle("Full data = ")
Terlihat pola data linear.
Model dan visualisasi data berdasarkan model.
nz_mod <- lm(lifeExp ~ year,
data = nz)
nz %>%
add_predictions(nz_mod) %>%
ggplot(aes(year, pred)) +
geom_line() +
ggtitle("Linear trend + ")
Residual.
nz %>%
add_residuals(nz_mod) %>%
ggplot(aes(year, resid)) +
geom_hline(yintercept = 0,
color = "white",
size = 3) +
geom_line() +
ggtitle("Remaining pattern")
Pada Setiap Negara
Untuk setiap negara dapat menggunakan code sebelumnya di copy-paste pada setiap negara. Tapi banyak jadi gak mungkin menggunakan copy-paste.
Berikut cara yang lebih baik.
by_country <- gapminder %>%
group_by(country, continent) %>%
nest()
by_country
## # A tibble: 142 x 3
## # Groups: country, continent [142]
## country continent data
## <fct> <fct> <list>
## 1 Afghanistan Asia <tibble [12 x 4]>
## 2 Albania Europe <tibble [12 x 4]>
## 3 Algeria Africa <tibble [12 x 4]>
## 4 Angola Africa <tibble [12 x 4]>
## 5 Argentina Americas <tibble [12 x 4]>
## 6 Australia Oceania <tibble [12 x 4]>
## 7 Austria Europe <tibble [12 x 4]>
## 8 Bahrain Asia <tibble [12 x 4]>
## 9 Bangladesh Asia <tibble [12 x 4]>
## 10 Belgium Europe <tibble [12 x 4]>
## # ... with 132 more rows
by_country$data[[1]] # Contoh untuk negara afghanistan (baris 1)
## # A tibble: 12 x 4
## year lifeExp pop gdpPercap
## <int> <dbl> <int> <dbl>
## 1 1952 28.8 8425333 779.
## 2 1957 30.3 9240934 821.
## 3 1962 32.0 10267083 853.
## 4 1967 34.0 11537966 836.
## 5 1972 36.1 13079460 740.
## 6 1977 38.4 14880372 786.
## 7 1982 39.9 12881816 978.
## 8 1987 40.8 13867957 852.
## 9 1992 41.7 16317921 649.
## 10 1997 41.8 22227415 635.
## 11 2002 42.1 25268405 727.
## 12 2007 43.8 31889923 975.
country_model <- function(df) {
lm(lifeExp ~ year,
data = df)
}
country_model(by_country$data[[1]] ) # Negara Afganistan
##
## Call:
## lm(formula = lifeExp ~ year, data = df)
##
## Coefficients:
## (Intercept) year
## -507.5343 0.2753
models <- map(by_country$data,
country_model) # Model dari seluruh negara
by_country <- by_country %>%
mutate(model = map(data,
country_model)
)
by_country
## # A tibble: 142 x 4
## # Groups: country, continent [142]
## country continent data model
## <fct> <fct> <list> <list>
## 1 Afghanistan Asia <tibble [12 x 4]> <lm>
## 2 Albania Europe <tibble [12 x 4]> <lm>
## 3 Algeria Africa <tibble [12 x 4]> <lm>
## 4 Angola Africa <tibble [12 x 4]> <lm>
## 5 Argentina Americas <tibble [12 x 4]> <lm>
## 6 Australia Oceania <tibble [12 x 4]> <lm>
## 7 Austria Europe <tibble [12 x 4]> <lm>
## 8 Bahrain Asia <tibble [12 x 4]> <lm>
## 9 Bangladesh Asia <tibble [12 x 4]> <lm>
## 10 Belgium Europe <tibble [12 x 4]> <lm>
## # ... with 132 more rows
by_country <- by_country %>%
mutate( resids = map2(data,
model,
add_residuals)
)
by_country
## # A tibble: 142 x 5
## # Groups: country, continent [142]
## country continent data model resids
## <fct> <fct> <list> <list> <list>
## 1 Afghanistan Asia <tibble [12 x 4]> <lm> <tibble [12 x 5]>
## 2 Albania Europe <tibble [12 x 4]> <lm> <tibble [12 x 5]>
## 3 Algeria Africa <tibble [12 x 4]> <lm> <tibble [12 x 5]>
## 4 Angola Africa <tibble [12 x 4]> <lm> <tibble [12 x 5]>
## 5 Argentina Americas <tibble [12 x 4]> <lm> <tibble [12 x 5]>
## 6 Australia Oceania <tibble [12 x 4]> <lm> <tibble [12 x 5]>
## 7 Austria Europe <tibble [12 x 4]> <lm> <tibble [12 x 5]>
## 8 Bahrain Asia <tibble [12 x 4]> <lm> <tibble [12 x 5]>
## 9 Bangladesh Asia <tibble [12 x 4]> <lm> <tibble [12 x 5]>
## 10 Belgium Europe <tibble [12 x 4]> <lm> <tibble [12 x 5]>
## # ... with 132 more rows
resids <- unnest(by_country,
resids)
resids
## # A tibble: 1,704 x 9
## # Groups: country, continent [142]
## country continent data model year lifeExp pop gdpPercap resid
## <fct> <fct> <list> <list> <int> <dbl> <int> <dbl> <dbl>
## 1 Afghanis~ Asia <tibble [1~ <lm> 1952 28.8 8.43e6 779. -1.11
## 2 Afghanis~ Asia <tibble [1~ <lm> 1957 30.3 9.24e6 821. -0.952
## 3 Afghanis~ Asia <tibble [1~ <lm> 1962 32.0 1.03e7 853. -0.664
## 4 Afghanis~ Asia <tibble [1~ <lm> 1967 34.0 1.15e7 836. -0.0172
## 5 Afghanis~ Asia <tibble [1~ <lm> 1972 36.1 1.31e7 740. 0.674
## 6 Afghanis~ Asia <tibble [1~ <lm> 1977 38.4 1.49e7 786. 1.65
## 7 Afghanis~ Asia <tibble [1~ <lm> 1982 39.9 1.29e7 978. 1.69
## 8 Afghanis~ Asia <tibble [1~ <lm> 1987 40.8 1.39e7 852. 1.28
## 9 Afghanis~ Asia <tibble [1~ <lm> 1992 41.7 1.63e7 649. 0.754
## 10 Afghanis~ Asia <tibble [1~ <lm> 1997 41.8 2.22e7 635. -0.534
## # ... with 1,694 more rows
resids %>%
ggplot(aes(year,
resid))+
geom_line(aes(group = country),
alpha = 1 / 3) +
geom_smooth(se = FALSE)
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
resids %>%
ggplot(aes(year,
resid,
group = country)) +
geom_line(alpha = 1 / 3) +
facet_wrap(~continent)
Pada grafik diatas terlihat sesuatu yang mearik pada benua afrika. Terlihat model masih kurang baik/pas. Terlihat masih banyak residual yang besar.
Tapi asia juga kaya sama, kaya terdapat residual yang besar.
MODEL QUALITY
Menambahkan model quality pada dataframe.
glance <- by_country %>%
mutate(glance = map(model,
broom::glance)) %>%
unnest(glance, .drop = TRUE)
## Warning: The `.drop` argument of `unnest()` is deprecated as of tidyr 1.0.0.
## All list-columns are now preserved.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_warnings()` to see where this warning was generated.
glance
## # A tibble: 142 x 17
## # Groups: country, continent [142]
## country continent data model resids r.squared adj.r.squared sigma statistic
## <fct> <fct> <lis> <lis> <list> <dbl> <dbl> <dbl> <dbl>
## 1 Afghan~ Asia <tib~ <lm> <tibb~ 0.948 0.942 1.22 181.
## 2 Albania Europe <tib~ <lm> <tibb~ 0.911 0.902 1.98 102.
## 3 Algeria Africa <tib~ <lm> <tibb~ 0.985 0.984 1.32 662.
## 4 Angola Africa <tib~ <lm> <tibb~ 0.888 0.877 1.41 79.1
## 5 Argent~ Americas <tib~ <lm> <tibb~ 0.996 0.995 0.292 2246.
## 6 Austra~ Oceania <tib~ <lm> <tibb~ 0.980 0.978 0.621 481.
## 7 Austria Europe <tib~ <lm> <tibb~ 0.992 0.991 0.407 1261.
## 8 Bahrain Asia <tib~ <lm> <tibb~ 0.967 0.963 1.64 291.
## 9 Bangla~ Asia <tib~ <lm> <tibb~ 0.989 0.988 0.977 930.
## 10 Belgium Europe <tib~ <lm> <tibb~ 0.995 0.994 0.293 1822.
## # ... with 132 more rows, and 8 more variables: p.value <dbl>, df <dbl>,
## # logLik <dbl>, AIC <dbl>, BIC <dbl>, deviance <dbl>, df.residual <int>,
## # nobs <int>
Pada dataframe diatas kita dapat melihat data yang tidak pas dengan model (R squarednya kecil).
glance %>%
arrange(r.squared)
## # A tibble: 142 x 17
## # Groups: country, continent [142]
## country continent data model resids r.squared adj.r.squared sigma statistic
## <fct> <fct> <lis> <lis> <list> <dbl> <dbl> <dbl> <dbl>
## 1 Rwanda Africa <tib~ <lm> <tibb~ 0.0172 -0.0811 6.56 0.175
## 2 Botswa~ Africa <tib~ <lm> <tibb~ 0.0340 -0.0626 6.11 0.352
## 3 Zimbab~ Africa <tib~ <lm> <tibb~ 0.0562 -0.0381 7.21 0.596
## 4 Zambia Africa <tib~ <lm> <tibb~ 0.0598 -0.0342 4.53 0.636
## 5 Swazil~ Africa <tib~ <lm> <tibb~ 0.0682 -0.0250 6.64 0.732
## 6 Lesotho Africa <tib~ <lm> <tibb~ 0.0849 -0.00666 5.93 0.927
## 7 Cote d~ Africa <tib~ <lm> <tibb~ 0.283 0.212 3.93 3.95
## 8 South ~ Africa <tib~ <lm> <tibb~ 0.312 0.244 4.74 4.54
## 9 Uganda Africa <tib~ <lm> <tibb~ 0.342 0.276 3.19 5.20
## 10 Congo,~ Africa <tib~ <lm> <tibb~ 0.348 0.283 2.43 5.34
## # ... with 132 more rows, and 8 more variables: p.value <dbl>, df <dbl>,
## # logLik <dbl>, AIC <dbl>, BIC <dbl>, deviance <dbl>, df.residual <int>,
## # nobs <int>
Visualisasi dari R squared.
glance %>%
ggplot(aes(continent,
r.squared)) +
geom_jitter(width = 0.5)
Berikut negara (negara afrika) dengan r squared kurang dari 0.25
bad_fit <- filter(glance,
r.squared < 0.25)
gapminder %>%
semi_join(bad_fit,
by = "country") %>%
ggplot(aes(year,
lifeExp,
color = country)) +
geom_line()
Menurut buku, angka harapan hidup diatas mengalami penurunan karena terjadi epidemic HIV/AIDS dan genosida Rwanda.
Tambahan negara dengan P-value kurang dari 0.05
bad_fit <- filter(glance,
p.value > 0.05)
gapminder %>%
semi_join(bad_fit,
by = "country") %>%
ggplot(aes(year,
lifeExp,
color = country)) +
geom_line()
TAMBAHAN
LIST - COLUMNS
data.frame(x = list(1:3, 3:5))
## x.1.3 x.3.5
## 1 1 3
## 2 2 4
## 3 3 5
data.frame(
x = I(list(1:3, 3:5)),
y = c("1, 2", "3, 4, 5") )
## x y
## 1 1, 2, 3 1, 2
## 2 3, 4, 5 3, 4, 5
tibble(
x = list(1:3, 3:5),
y = c("1, 2", "3, 4, 5") )
## # A tibble: 2 x 2
## x y
## <list> <chr>
## 1 <int [3]> 1, 2
## 2 <int [3]> 3, 4, 5
tribble(
~x, ~y,
1:3,
"1, 2",
3:5,
"3, 4, 5" )
## # A tibble: 2 x 2
## x y
## <list> <chr>
## 1 <int [3]> 1, 2
## 2 <int [3]> 3, 4, 5
CREATING LIST - COLUMN
gapminder %>%
group_by(country,
continent) %>%
nest()
## # A tibble: 142 x 3
## # Groups: country, continent [142]
## country continent data
## <fct> <fct> <list>
## 1 Afghanistan Asia <tibble [12 x 4]>
## 2 Albania Europe <tibble [12 x 4]>
## 3 Algeria Africa <tibble [12 x 4]>
## 4 Angola Africa <tibble [12 x 4]>
## 5 Argentina Americas <tibble [12 x 4]>
## 6 Australia Oceania <tibble [12 x 4]>
## 7 Austria Europe <tibble [12 x 4]>
## 8 Bahrain Asia <tibble [12 x 4]>
## 9 Bangladesh Asia <tibble [12 x 4]>
## 10 Belgium Europe <tibble [12 x 4]>
## # ... with 132 more rows
gapminder %>%
nest(year:gdpPercap)
## # A tibble: 142 x 3
## country continent data
## <fct> <fct> <list>
## 1 Afghanistan Asia <tibble [12 x 4]>
## 2 Albania Europe <tibble [12 x 4]>
## 3 Algeria Africa <tibble [12 x 4]>
## 4 Angola Africa <tibble [12 x 4]>
## 5 Argentina Americas <tibble [12 x 4]>
## 6 Australia Oceania <tibble [12 x 4]>
## 7 Austria Europe <tibble [12 x 4]>
## 8 Bahrain Asia <tibble [12 x 4]>
## 9 Bangladesh Asia <tibble [12 x 4]>
## 10 Belgium Europe <tibble [12 x 4]>
## # ... with 132 more rows
df <- tribble( ~x1,
"a,b,c",
"d,e,f,g" )
df
## # A tibble: 2 x 1
## x1
## <chr>
## 1 a,b,c
## 2 d,e,f,g
df %>%
mutate(x2 = stringr::str_split(x1, ","))
## # A tibble: 2 x 2
## x1 x2
## <chr> <list>
## 1 a,b,c <chr [3]>
## 2 d,e,f,g <chr [4]>
sim <- tribble( ~f, ~params,
"runif", list(min = -1,max = -1),
"rnorm", list(sd = 5),
"rpois", list(lambda = 10) )
sim
## # A tibble: 3 x 2
## f params
## <chr> <list>
## 1 runif <named list [2]>
## 2 rnorm <named list [1]>
## 3 rpois <named list [1]>
sim %>%
mutate(sims = invoke_map(f,
params,
n = 10))
## # A tibble: 3 x 3
## f params sims
## <chr> <list> <list>
## 1 runif <named list [2]> <dbl [10]>
## 2 rnorm <named list [1]> <dbl [10]>
## 3 rpois <named list [1]> <int [10]>
mtcars %>%
group_by(cyl) %>%
summarize(q = quantile(mpg))
## # A tibble: 15 x 2
## # Groups: cyl [3]
## cyl q
## <dbl> <dbl>
## 1 4 21.4
## 2 4 22.8
## 3 4 26
## 4 4 30.4
## 5 4 33.9
## 6 6 17.8
## 7 6 18.6
## 8 6 19.7
## 9 6 21
## 10 6 21.4
## 11 8 10.4
## 12 8 14.4
## 13 8 15.2
## 14 8 16.2
## 15 8 19.2
mtcars %>%
group_by(cyl) %>%
summarize(q = list(quantile(mpg)))
## # A tibble: 3 x 2
## cyl q
## <dbl> <list>
## 1 4 <dbl [5]>
## 2 6 <dbl [5]>
## 3 8 <dbl [5]>
x <- list( a = 1:5,
b = 3:4,
c = 5:6 )
x
## $a
## [1] 1 2 3 4 5
##
## $b
## [1] 3 4
##
## $c
## [1] 5 6
df <- enframe(x)
df
## # A tibble: 3 x 2
## name value
## <chr> <list>
## 1 a <int [5]>
## 2 b <int [2]>
## 3 c <int [2]>
SIMPLIFYING LIST - COLUMNS
df <- tribble( ~x,
letters[1:5],
1:3,
runif(5)
)
df
## # A tibble: 3 x 1
## x
## <list>
## 1 <chr [5]>
## 2 <int [3]>
## 3 <dbl [5]>
df %>%
mutate( type = map_chr(x,
typeof),
length = map_int(x,
length)
)
## # A tibble: 3 x 3
## x type length
## <list> <chr> <int>
## 1 <chr [5]> character 5
## 2 <int [3]> integer 3
## 3 <dbl [5]> double 5
df <- tribble( ~x,
list(a = 1, b = 2),
list(a = 2, c = 4) )
df
## # A tibble: 2 x 1
## x
## <list>
## 1 <named list [2]>
## 2 <named list [2]>
df %>%
mutate( a = map_dbl(x,
"a"),
b = map_dbl(x,
"b",
.null = NA_real_),
c = map_dbl(x,
"c",
.null=NA_real_)
)
## # A tibble: 2 x 4
## x a b c
## <list> <dbl> <dbl> <dbl>
## 1 <named list [2]> 1 2 NA
## 2 <named list [2]> 2 NA 4
tibble(x = 1:2,
y = list(1:4, 1)) %>%
unnest(y)
## # A tibble: 5 x 2
## x y
## <int> <dbl>
## 1 1 1
## 2 1 2
## 3 1 3
## 4 1 4
## 5 2 1
df1 <- tribble( ~x, ~y, ~z,
1, c("a", "b"), 1:2,
2, "c", 3 )
df1
## # A tibble: 2 x 3
## x y z
## <dbl> <list> <list>
## 1 1 <chr [2]> <int [2]>
## 2 2 <chr [1]> <dbl [1]>
df1 %>%
unnest(y, z)
## # A tibble: 3 x 3
## x y z
## <dbl> <chr> <dbl>
## 1 1 a 1
## 2 1 b 2
## 3 2 c 3
df2 <- tribble( ~x, ~y, ~z,
1, "a", 1:2,
2, c("b", "c"), 3 )
df2
## # A tibble: 2 x 3
## x y z
## <dbl> <list> <list>
## 1 1 <chr [1]> <int [2]>
## 2 2 <chr [2]> <dbl [1]>
df2 %>%
unnest(y, z)
## # A tibble: 4 x 3
## x y z
## <dbl> <chr> <dbl>
## 1 1 a 1
## 2 1 a 2
## 3 2 b 3
## 4 2 c 3
Tambahan
broom::glance(model) returns a row for each model. Each column gives a model summary: either a measure of model quality, or complexity, or a combination of the two.
broom::tidy(model) returns a row for each coefficient in the model. Each column gives information about the estimate or its variability.
broom::augment(model, data) returns a row for each row in data, adding extra values like residuals, and influence statistics.