HW2. Random Forest in text

library(ggplot2)
library(textrecipes)

## Загрузка требуемого пакета: recipes

## Загрузка требуемого пакета: dplyr

## 
## Присоединяю пакет: 'dplyr'

## Следующие объекты скрыты от 'package:stats':
## 
##     filter, lag

## Следующие объекты скрыты от 'package:base':
## 
##     intersect, setdiff, setequal, union

## 
## Присоединяю пакет: 'recipes'

## Следующий объект скрыт от 'package:stats':
## 
##     step

library(LiblineaR)
library(ranger)
library(spacyr)
library(rlang)
library(text2vec)
library(dplyr)
library(generics)

## 
## Присоединяю пакет: 'generics'

## Следующий объект скрыт от 'package:text2vec':
## 
##     fit

## Следующий объект скрыт от 'package:dplyr':
## 
##     explain

## Следующие объекты скрыты от 'package:base':
## 
##     as.difftime, as.factor, as.ordered, intersect, is.element, setdiff,
##     setequal, union

library(rsample)
library(forecast)

## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo

library(parsnip)

## 
## Присоединяю пакет: 'parsnip'

## Следующий объект скрыт от 'package:text2vec':
## 
##     fit

## Следующий объект скрыт от 'package:spacyr':
## 
##     get_dependency

library(workflows)
library(readr)
library(tune)
library(tidytext)
library(stringr)

## 
## Присоединяю пакет: 'stringr'

## Следующий объект скрыт от 'package:recipes':
## 
##     fixed

library(tidymodels)

## ── Attaching packages ────────────────────────────────────── tidymodels 1.0.0 ──

## ✔ broom        1.0.4     ✔ tibble       3.2.0
## ✔ dials        1.1.0     ✔ tidyr        1.3.0
## ✔ infer        1.0.4     ✔ workflowsets 1.0.0
## ✔ modeldata    1.1.0     ✔ yardstick    1.1.0
## ✔ purrr        1.0.1

## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ purrr::%@%()                 masks rlang::%@%()
## ✖ yardstick::accuracy()        masks forecast::accuracy(), generics::accuracy()
## ✖ infer::calculate()           masks generics::calculate()
## ✖ purrr::discard()             masks scales::discard()
## ✖ dplyr::filter()              masks stats::filter()
## ✖ infer::fit()                 masks parsnip::fit(), generics::fit(), text2vec::fit()
## ✖ stringr::fixed()             masks recipes::fixed()
## ✖ purrr::flatten()             masks rlang::flatten()
## ✖ purrr::flatten_chr()         masks rlang::flatten_chr()
## ✖ purrr::flatten_dbl()         masks rlang::flatten_dbl()
## ✖ purrr::flatten_int()         masks rlang::flatten_int()
## ✖ purrr::flatten_lgl()         masks rlang::flatten_lgl()
## ✖ purrr::flatten_raw()         masks rlang::flatten_raw()
## ✖ infer::generate()            masks generics::generate()
## ✖ parsnip::get_dependency()    masks spacyr::get_dependency()
## ✖ infer::hypothesize()         masks generics::hypothesize()
## ✖ purrr::invoke()              masks rlang::invoke()
## ✖ dplyr::lag()                 masks stats::lag()
## ✖ dials::prune()               masks generics::prune()
## ✖ workflowsets::rank_results() masks generics::rank_results()
## ✖ yardstick::spec()            masks readr::spec()
## ✖ infer::specify()             masks generics::specify()
## ✖ purrr::splice()              masks rlang::splice()
## ✖ recipes::step()              masks stats::step()
## ✖ infer::visualize()           masks generics::visualize()
## • Use tidymodels_prefer() to resolve common conflicts.

library(tidyr)
library(scales)
library(gt)
theme_set(theme_classic())

Введение - скачивание данных

Задание заключалось в прогнозе баллов за дегустацию посредством использования текста. Так как в Kaggle данные не валидные (некоторые строчки разнесены по столбцам, некоторые нет), было принято решение скачать аналогичные данные с github. Для начала была проведена первичная аналитика: удалены NA, стоп-слова, а также выделены наиболее встречаемые слова в отзывах.

winemag <- readr::read_csv("https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2019/2019-05-28/winemag-data-130k-v2.csv")

## New names:
## Rows: 129971 Columns: 14
## ── Column specification
## ──────────────────────────────────────────────────────── Delimiter: "," chr
## (11): country, description, designation, province, region_1, region_2, t... dbl
## (3): ...1, points, price
## ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
## Specify the column types or set `show_col_types = FALSE` to quiet this message.
## • `` -> `...1`

data1 <- as.data.frame(winemag)
str(data1)

## 'data.frame':    129971 obs. of  14 variables:
##  $ ...1                 : num  0 1 2 3 4 5 6 7 8 9 ...
##  $ country              : chr  "Italy" "Portugal" "US" "US" ...
##  $ description          : chr  "Aromas include tropical fruit, broom, brimstone and dried herb. The palate isn't overly expressive, offering un"| __truncated__ "This is ripe and fruity, a wine that is smooth while still structured. Firm tannins are filled out with juicy r"| __truncated__ "Tart and snappy, the flavors of lime flesh and rind dominate. Some green pineapple pokes through, with crisp ac"| __truncated__ "Pineapple rind, lemon pith and orange blossom start off the aromas. The palate is a bit more opulent, with note"| __truncated__ ...
##  $ designation          : chr  "Vulkà Bianco" "Avidagos" NA "Reserve Late Harvest" ...
##  $ points               : num  87 87 87 87 87 87 87 87 87 87 ...
##  $ price                : num  NA 15 14 13 65 15 16 24 12 27 ...
##  $ province             : chr  "Sicily & Sardinia" "Douro" "Oregon" "Michigan" ...
##  $ region_1             : chr  "Etna" NA "Willamette Valley" "Lake Michigan Shore" ...
##  $ region_2             : chr  NA NA "Willamette Valley" NA ...
##  $ taster_name          : chr  "Kerin O’Keefe" "Roger Voss" "Paul Gregutt" "Alexander Peartree" ...
##  $ taster_twitter_handle: chr  "@kerinokeefe" "@vossroger" "@paulgwine " NA ...
##  $ title                : chr  "Nicosia 2013 Vulkà Bianco  (Etna)" "Quinta dos Avidagos 2011 Avidagos Red (Douro)" "Rainstorm 2013 Pinot Gris (Willamette Valley)" "St. Julian 2013 Reserve Late Harvest Riesling (Lake Michigan Shore)" ...
##  $ variety              : chr  "White Blend" "Portuguese Red" "Pinot Gris" "Riesling" ...
##  $ winery               : chr  "Nicosia" "Quinta dos Avidagos" "Rainstorm" "St. Julian" ...

#удаляем ненужные столбцы - taster_twitter_handle, taster_name, region_2 
names(data1)

##  [1] "...1"                  "country"               "description"          
##  [4] "designation"           "points"                "price"                
##  [7] "province"              "region_1"              "region_2"             
## [10] "taster_name"           "taster_twitter_handle" "title"                
## [13] "variety"               "winery"

data2 <- data1[,-c(10,11,9)]

#почистим NA
data3 <- na.omit(data2)

data3_desc<- as.data.frame(data3[,3])
colnames(data3_desc) <- c("Description")
data4 <- unnest_tokens(data3_desc, output = word, input = Description)
#удалим стоп-слова
data(stop_words)
head(stop_words)

## # A tibble: 6 × 2
##   word      lexicon
##   <chr>     <chr>  
## 1 a         SMART  
## 2 a's       SMART  
## 3 able      SMART  
## 4 about     SMART  
## 5 above     SMART  
## 6 according SMART

dim(stop_words)

## [1] 1149    2

data5 <- data4 %>% anti_join(stop_words)

## Joining with `by = join_by(word)`

#размерность уменьшилась в два раза: удалили много стоп-слов
dim(data4); dim(data5)

## [1] 2965384       1

## [1] 1657794       1

#наиболее часто встречающиеся слова
head(data5 %>% count(word, sort = TRUE))

##      word     n
## 1    wine 39673
## 2 flavors 34219
## 3   fruit 26902
## 4  aromas 22920
## 5  palate 21588
## 6  finish 18966

Визуализация часто встречаемых слов

В ходе визуализации были оставлены слова, которые встречаются более 15 тыс. раз. Наиболее часто встречаемое слово - wine (39473 раз).

#визуализация частот
#оставим только те, которые больше 15 000
data6 <- data5 %>%
  count(word, sort = TRUE) %>% filter(n > 15000) %>%
  mutate(word = reorder(word, n))

ggplot(data6, aes(word, n)) +
  geom_col(fill = "slateblue") +
  xlab(NULL) +
  coord_flip() + ylab("Количество") + geom_label(aes(label = n))

Построение случайного леса

Для построения случайного леса я брала только переменную с описанием (description) и первые 10 000 наблюдений (иначе R не компилируется). Стоит отметить, что тестовая и обучающая выборка получаются случайно, поэтому даже если я беру первые 10 000 наблюдений, рандомизация сохраняется. Для построения случайного леса я брала 100 деревьев. В идеале после данного построения было применить 10-кратную кроссвалидацию, но её не предоставилось возможным сделать в силу ограничений мощности персонального компьютера. Загрузка одного раза составляла около 30 минут. Запуск единого текущего кода - 1.5 часа.

#разделение на выборку обучения и тестовую, выбрав столбцы баллы за дегустацию (points), description, price, province, winery
set.seed(1234)
names(data3)

##  [1] "...1"        "country"     "description" "designation" "points"     
##  [6] "price"       "province"    "region_1"    "title"       "variety"    
## [11] "winery"

wine_split <- data3 %>% select(description, points, price, province, winery) %>%
  mutate(price = as.numeric(price), province = as.character(province), winery = as.character(province), 
         points = as.numeric(points),
         text = str_remove_all(description, "'")) %>%
  initial_split()

wine_train <- training(wine_split)
wine_test <- testing(wine_split)

#Мой R не работает с 50 000 наблюдениями, поэтому я решила оставить около 10 000 наблюдений (выбранных случайно)
wine_train1 <- wine_train[c(1:10000),]
wine_test1 <- wine_test[c(1:10000),]

#разделим на токены, выберем 100 наиболее часто встречающихся
#вычислим tf-idf и нормализуем эту величину
wine_rec <- recipe(points ~ description, data = wine_train1) %>%
  step_tokenize(description) %>%
  step_tokenfilter(description, max_tokens = 100) %>%
  step_tfidf(description) %>%
  step_normalize(all_predictors())
wine_rec

##

## ── Recipe ──────────────────────────────────────────────────────────────────────

##

## ── Inputs

## Number of variables by role

## outcome:   1
## predictor: 1

##

## ── Operations

## • Tokenization for: description

## • Text filtering for: description

## • Term frequency-inverse document frequency with: description

## • Centering and scaling for: all_predictors()

#пустая модель
wine_wf <- workflow() %>%
  add_recipe(wine_rec)
wine_wf

## ══ Workflow ════════════════════════════════════════════════════════════════════
## Preprocessor: Recipe
## Model: None
## 
## ── Preprocessor ────────────────────────────────────────────────────────────────
## 4 Recipe Steps
## 
## • step_tokenize()
## • step_tokenfilter()
## • step_tfidf()
## • step_normalize()

#случайный лес
rf_spec <- rand_forest(trees = 100) %>%
  set_engine("randomForest") %>%
  set_mode("regression")

rf_fit <- wine_wf %>%
  add_model(rf_spec) %>%
  generics::fit(data = wine_train1)

rf_fit$fit$fit$fit$rsq

##   [1] -0.29604146 -0.25964344 -0.18526457 -0.14367786 -0.07820201 -0.01738885
##   [7]  0.02943796  0.06497824  0.09602445  0.12654053  0.15025743  0.17044283
##  [13]  0.18267420  0.19905716  0.20882486  0.22210910  0.22927340  0.23746651
##  [19]  0.24634328  0.25272965  0.25882760  0.26502692  0.26800888  0.27064988
##  [25]  0.27390334  0.27701029  0.28063258  0.28363188  0.28480842  0.28781709
##  [31]  0.28899115  0.29044829  0.29226030  0.29238963  0.29373998  0.29448692
##  [37]  0.29547649  0.29641531  0.29788693  0.29915796  0.30060302  0.30291823
##  [43]  0.30437736  0.30511434  0.30547573  0.30727923  0.30843238  0.30905378
##  [49]  0.31003598  0.31015392  0.31093955  0.31176442  0.31188862  0.31256661
##  [55]  0.31259488  0.31291729  0.31344260  0.31372023  0.31453325  0.31552403
##  [61]  0.31546026  0.31698645  0.31777748  0.31812931  0.31822726  0.31876766
##  [67]  0.31997585  0.32108710  0.32144842  0.32148856  0.32188090  0.32263943
##  [73]  0.32344263  0.32356423  0.32290780  0.32267760  0.32272001  0.32324138
##  [79]  0.32290342  0.32311213  0.32306972  0.32315412  0.32387173  0.32389752
##  [85]  0.32396758  0.32426002  0.32438140  0.32467243  0.32513987  0.32563160
##  [91]  0.32608463  0.32694935  0.32718324  0.32761109  0.32783591  0.32825034
##  [97]  0.32866394  0.32881547  0.32916483  0.32899904

data_check <- data.frame(truth = wine_train1$points, predictions = rf_fit$fit$fit$fit$predicted)
data_check$criteria <- ifelse(abs(data_check$truth - data_check$predictions)<1, "close prediction", "some differences")
data_check %>%
  ggplot(aes(x = truth, y = predictions, color = criteria)) +
  geom_abline(lty = 2, color = "gray80", size = 1.5) +
  geom_point(alpha = 0.3) +
  labs(
    color = NULL,
    title = "Predicted and true points for wines"
  ) + scale_color_manual(values = c("darkblue", "lightblue"))

## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

По итогу с ограничением 100 токенов получилось, что качество модели с точки зрения коэффициента детерминациии составляет 46.4%. При этом если посмотреть на график, то видно, что маленькое количество наблюдений предсказано корректно (мера разброса между правдой и предсказанным < 1 по модулю), они выделены синим (close prediction). Чтобы повысить качество модели я решила поудалять стоп-слова и посмотреть на качество модели без использования стоп-слов.

Построение случайного леса с удалением стоп-слов

#с удалением стоп-слов
stopword_rec <- function(stopword_name) {
  recipe(points ~ description, data = wine_train1) %>%
    step_tokenize(description) %>%
    step_stopwords(description, stopword_source = stopword_name) %>%
    step_tokenfilter(description, max_tokens = 1e3) %>%
    step_tfidf(description) %>%
    step_normalize(all_predictors())
}

#пустая модель
wine_wf1 <- workflow() %>%
  add_recipe(stopword_rec("smart"))
wine_wf1

## ══ Workflow ════════════════════════════════════════════════════════════════════
## Preprocessor: Recipe
## Model: None
## 
## ── Preprocessor ────────────────────────────────────────────────────────────────
## 5 Recipe Steps
## 
## • step_tokenize()
## • step_stopwords()
## • step_tokenfilter()
## • step_tfidf()
## • step_normalize()

rf_fit1 <- wine_wf1 %>%
  add_model(rf_spec) %>%
  generics::fit(data = wine_train1)

rf_fit1$fit$fit$fit$rsq

##   [1] 0.03052403 0.06769498 0.12468879 0.16645922 0.21064181 0.24077840
##   [7] 0.26701662 0.28778097 0.30637843 0.32708543 0.34526777 0.36333486
##  [13] 0.37447255 0.38901486 0.39792468 0.40645647 0.41410244 0.42261091
##  [19] 0.42665875 0.43272733 0.43510859 0.43836759 0.44119165 0.44438112
##  [25] 0.44604291 0.44835478 0.44995056 0.45222625 0.45483778 0.45743592
##  [31] 0.45903797 0.46016915 0.46112128 0.46245705 0.46367070 0.46540836
##  [37] 0.46623680 0.46743235 0.46837619 0.46904578 0.47035709 0.47194995
##  [43] 0.47245217 0.47413584 0.47516294 0.47595126 0.47704202 0.47779976
##  [49] 0.47861292 0.47878024 0.47919327 0.48025133 0.48076750 0.48138025
##  [55] 0.48180352 0.48260802 0.48313451 0.48374706 0.48384174 0.48482043
##  [61] 0.48503908 0.48538557 0.48557165 0.48613416 0.48622234 0.48605950
##  [67] 0.48595158 0.48592168 0.48633306 0.48672626 0.48688806 0.48741032
##  [73] 0.48800393 0.48819080 0.48863083 0.48912869 0.48888636 0.48932762
##  [79] 0.48940345 0.48964681 0.48955607 0.48984874 0.49009022 0.49034102
##  [85] 0.49020346 0.49024404 0.49039835 0.49075142 0.49043709 0.49023985
##  [91] 0.49044288 0.49082222 0.49069799 0.49089034 0.49055839 0.49083970
##  [97] 0.49102577 0.49091715 0.49076109 0.49066939

data_check1 <- data.frame(truth = wine_train1$points, predictions = rf_fit1$fit$fit$fit$predicted)
data_check1$criteria <- ifelse(abs(data_check1$truth - data_check1$predictions)<1, "close prediction", "some differences")
data_check1 %>%
  ggplot(aes(x = truth, y = predictions, color = criteria)) +
  geom_abline(lty = 2, color = "gray80", size = 1.5) +
  geom_point(alpha = 0.3) +
  labs(
    color = NULL,
    title = "Predicted and true points for wines"
  ) + scale_color_manual(values = c("darkblue", "lightblue"))

rmse(data = data_check1, truth = data_check1$truth, estimate = data_check1$predictions)

## # A tibble: 1 × 3
##   .metric .estimator .estimate
##   <chr>   <chr>          <dbl>
## 1 rmse    standard        2.19

В конечном итоге выяснилось, что лучшей моделью является модель с удалением стоп-слов. Коэффициент детерминации вырос до 51%. RMSE - 2.13.

Просмотр качества модели на тестовой выборке

#посмотрим качество на выборке тестовой 
a1 <- predict(rf_fit, new_data = wine_test1[,c(1,2)])
data_check2 <- data.frame(truth = wine_train1$points, predictions = a1$.pred)
data_check2$criteria <- ifelse(abs(data_check2$truth - data_check2$predictions)<1, "close prediction", "some differences")
data_check2 %>%
  ggplot(aes(x = truth, y = predictions, color = criteria)) +
  geom_abline(lty = 2, color = "gray80", size = 1.5) +
  geom_point(alpha = 0.3) +
  labs(
    color = NULL,
    title = "Predicted and true points for wines"
  ) + scale_color_manual(values = c("darkblue", "lightblue"))

#качество на тестовой выборке не очень

rmse(data = data_check2, truth = data_check2$truth, estimate = data_check2$predictions)

## # A tibble: 1 × 3
##   .metric .estimator .estimate
##   <chr>   <chr>          <dbl>
## 1 rmse    standard        3.44