library(tidyverse)
## Warning: package 'tidyverse' was built under R version 3.4.4
## -- Attaching packages ---------------------------------------------- tidyverse 1.2.1 --
## √ ggplot2 3.1.0 √ purrr 0.2.5
## √ tibble 1.4.2 √ dplyr 0.7.8
## √ tidyr 0.8.1 √ stringr 1.3.1
## √ readr 1.1.1 √ forcats 0.3.0
## Warning: package 'ggplot2' was built under R version 3.4.4
## Warning: package 'tibble' was built under R version 3.4.4
## Warning: package 'tidyr' was built under R version 3.4.4
## Warning: package 'readr' was built under R version 3.4.4
## Warning: package 'purrr' was built under R version 3.4.4
## Warning: package 'dplyr' was built under R version 3.4.4
## Warning: package 'stringr' was built under R version 3.4.4
## Warning: package 'forcats' was built under R version 3.4.4
## -- Conflicts ------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(stats)
# reproducible results
set.seed(47)
# shuffle / randomise rows
mtcars2 <- mtcars %>% sample_frac(1)
# split train / test
mtcars_train <- mtcars2[1:20, ] %>%
rownames_to_column() %>%
as.tibble()
mtcars_test <- mtcars2[21:32, ] %>%
rownames_to_column() %>%
as.tibble()
mtcars_test
## # A tibble: 12 x 12
## rowname mpg cyl disp hp drat wt qsec vs am gear
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Datsun~ 22.8 4 108 93 3.85 2.32 18.6 1 1 4
## 2 Merc 4~ 17.3 8 276. 180 3.07 3.73 17.6 0 0 3
## 3 Valiant 18.1 6 225 105 2.76 3.46 20.2 1 0 3
## 4 Mazda ~ 21 6 160 110 3.9 2.88 17.0 0 1 4
## 5 Fiat 1~ 32.4 4 78.7 66 4.08 2.2 19.5 1 1 4
## 6 Porsch~ 26 4 120. 91 4.43 2.14 16.7 0 1 5
## 7 Ford P~ 15.8 8 351 264 4.22 3.17 14.5 0 1 5
## 8 Merc 2~ 24.4 4 147. 62 3.69 3.19 20 1 0 4
## 9 Fiat X~ 27.3 4 79 66 4.08 1.94 18.9 1 1 4
## 10 Toyota~ 33.9 4 71.1 65 4.22 1.84 19.9 1 1 4
## 11 Merc 2~ 22.8 4 141. 95 3.92 3.15 22.9 1 0 4
## 12 Hornet~ 18.7 8 360 175 3.15 3.44 17.0 0 0 3
## # ... with 1 more variable: carb <dbl>
mtcars_model_spline <- function(df) {
with(df, smooth.spline(x = mpg, y = disp, df = 3))
}
# for each cyl group create subsets and fit the models of interest using map
dt_models <- as.tibble(mtcars_train) %>%
mutate(cyl = if_else(cyl < 7, 4, cyl)) %>%
nest(-cyl) %>% # print() %>%
rename(data_train = data) %>%
mutate(
# model1 = map(data_train, ~approxfun(x = .$disp, y = .$mpg)),
model2 = map(data_train, ~lm(disp ~ mpg, data = .)),
model3 = map(data_train, mtcars_model_spline),
model31 = map(data_train, ~smooth.spline(x = .$mpg, y = .$disp, df = 3))
)
## Warning: package 'bindrcpp' was built under R version 3.4.4
# rename(data_train = data) %>%
# print()
mtcars_test
## # A tibble: 12 x 12
## rowname mpg cyl disp hp drat wt qsec vs am gear
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Datsun~ 22.8 4 108 93 3.85 2.32 18.6 1 1 4
## 2 Merc 4~ 17.3 8 276. 180 3.07 3.73 17.6 0 0 3
## 3 Valiant 18.1 6 225 105 2.76 3.46 20.2 1 0 3
## 4 Mazda ~ 21 6 160 110 3.9 2.88 17.0 0 1 4
## 5 Fiat 1~ 32.4 4 78.7 66 4.08 2.2 19.5 1 1 4
## 6 Porsch~ 26 4 120. 91 4.43 2.14 16.7 0 1 5
## 7 Ford P~ 15.8 8 351 264 4.22 3.17 14.5 0 1 5
## 8 Merc 2~ 24.4 4 147. 62 3.69 3.19 20 1 0 4
## 9 Fiat X~ 27.3 4 79 66 4.08 1.94 18.9 1 1 4
## 10 Toyota~ 33.9 4 71.1 65 4.22 1.84 19.9 1 1 4
## 11 Merc 2~ 22.8 4 141. 95 3.92 3.15 22.9 1 0 4
## 12 Hornet~ 18.7 8 360 175 3.15 3.44 17.0 0 0 3
## # ... with 1 more variable: carb <dbl>
# join test data to be able to predict them
dt_test_data <- mtcars_test %>%
nest(-cyl) %>%
inner_join(dt_models, by = "cyl") %>%
rename(data_test = data) %>%
mutate(
pred.rt1 = map2(data_train, data_test, function(train, test) {
approx(x = train$mpg, y = train$hp, xout = test$mpg)$y
}),
pred.rt2 = map2(model2, data_test, function(px, py) {
predict(px, py)
}),
pred.rt21 = map2(model2, data_test, ~predict(.x, .y)),
pred.rt3 = map2(model3, data_test, function(px, py) {
predict(px, py$mpg)$y
}),
pred.rt31 = map2(model31, data_test,~predict(.x, .y$mpg)$y)
) %>%
#select(data_train) %>%
#unnest() %>%
print()
## # A tibble: 2 x 11
## cyl data_test data_train model2 model3 model31 pred.rt1 pred.rt2
## <dbl> <list> <list> <list> <list> <list> <list> <list>
## 1 4 <tibble ~ <tibble [~ <S3: ~ <S3: ~ <S3: s~ <dbl [7~ <dbl [7~
## 2 8 <tibble ~ <tibble [~ <S3: ~ <S3: ~ <S3: s~ <dbl [3~ <dbl [3~
## # ... with 3 more variables: pred.rt21 <list>, pred.rt3 <list>,
## # pred.rt31 <list>
dt_test_data$pred.rt2
## [[1]]
## 1 2 3 4 5 6 7
## 143.67034 75.40274 120.91448 132.29241 111.66991 64.73593 143.67034
##
## [[2]]
## 1 2 3
## 316.2009 339.5750 294.3851
dt_test_data$pred.rt21
## [[1]]
## 1 2 3 4 5 6 7
## 143.67034 75.40274 120.91448 132.29241 111.66991 64.73593 143.67034
##
## [[2]]
## 1 2 3
## 316.2009 339.5750 294.3851
dt_test_data$pred.rt3
## [[1]]
## [1] 156.03740 63.42927 132.80349 146.11966 120.10918 46.44018 156.03740
##
## [[2]]
## [1] 327.0194 321.6170 351.1309
dt_test_data$pred.rt31
## [[1]]
## [1] 156.03740 63.42927 132.80349 146.11966 120.10918 46.44018 156.03740
##
## [[2]]
## [1] 327.0194 321.6170 351.1309
# predict test data using map2
pred.rt1 <- map2(dt_test_data$data_train, dt_test_data$data_test, function(train, test) {
approx(x = train$mpg, y = train$hp, xout = test$mpg)$y
}) %>% print()
## [[1]]
## [1] 94.88202 NA 89.66854 92.27528 87.55056 NA 94.88202
##
## [[2]]
## [1] 178.3929 160.0000 175.8929
pred.rt2 <- map2(dt_test_data$model2, dt_test_data$data_test, function(x, y) {
predict(x, y)
}) %>% print()
## [[1]]
## 1 2 3 4 5 6 7
## 143.67034 75.40274 120.91448 132.29241 111.66991 64.73593 143.67034
##
## [[2]]
## 1 2 3
## 316.2009 339.5750 294.3851
pred.rt3 <- map2( dt_test_data$model3, dt_test_data$data_test, function(x, y) {
predict(x, y$mpg)$y
}
) %>% print()
## [[1]]
## [1] 156.03740 63.42927 132.80349 146.11966 120.10918 46.44018 156.03740
##
## [[2]]
## [1] 327.0194 321.6170 351.1309