library(tidyverse)
library(tidymodels)
library(textrecipes)
library(stopwords)
# Hackathon ----

# Load the TripAdvisor hotel reviews; the code below models the `Rating`
# column from the `Review` text column of this file.
hotels <- read_csv("./tripadvisor_hotel_reviews.csv")

# Work on a reproducible 30% random subsample of the rows.
set.seed(1)
hotels <- sample_frac(hotels, 0.3)

# Reproducible train/test split using the default initial_split() settings.
set.seed(1)
split <- hotels |>
  initial_split()
train <- training(split)
test <- testing(split)
# Linear support vector machine specification, fitted through the
# "LiblineaR" engine in regression mode.
svm_spec <- svm_linear() |>
  set_engine("LiblineaR") |>
  set_mode("regression")
# Add the model to a workflow.
svm_wflow <- workflow() |>
  add_model(svm_spec)
svm_wflow

# Write the data preprocessing recipe: tokenize the review text, cap the
# vocabulary with max_tokens = 1000, and convert the tokens to tf-idf features.
rec <- recipe(Rating ~ Review, data = train) |>
  step_tokenize(Review) |>
  step_tokenfilter(Review, max_tokens = 1000) |>
  step_tfidf(Review)

# Add the preprocessor to the workflow.
svm_wflow <- svm_wflow |>
  add_recipe(rec)
svm_wflow
# Fit the model on the training data.
svm_fit <- svm_wflow |>
  fit(data = train)
# Evaluate the model on the training data (in-sample error, not a
# generalization estimate).
pred_data <- tibble(
  truth = train$Rating,
  estimate = predict(svm_fit, train)$.pred
)
pred_data

# Metric set with RMSE, R-squared and MAE.
# NOTE(review): while this object exists, calling `metrics()` resolves to it
# rather than to yardstick's function of the same name; the name is kept
# because later code in this script calls `metrics()`.
metrics <- metric_set(rmse, rsq, mae)
metrics_svm <- metrics(pred_data, truth = truth, estimate = estimate)

# Recorded output from a previous run:
#   1 rmse standard 1.1387260
#   2 rsq  standard 0.4952173
#   3 mae  standard 0.9003342
# Replace the single training sample with resamples: 10-fold
# cross-validation folds built from the training set.
set.seed(1)
folds <- vfold_cv(train, v = 10)
# Null model: a baseline to compare the real models against.
null_reg <- null_model() |>
  set_engine("parsnip") |>
  set_mode("regression")

# Same preprocessing recipe as the SVM workflow, so only the model differs.
null_wflow <- workflow() |>
  add_model(null_reg) |>
  add_recipe(rec)

# Fit the baseline on every fold, keeping the out-of-fold predictions.
null_rs <- fit_resamples(
  null_wflow,
  folds,
  control = control_resamples(save_pred = TRUE)
)
collect_metrics(null_rs)

# Recorded output from a previous run:
#   1 rmse standard 1.237592 10 0.0111641 Preprocessor1_Model1
#   2 rsq  standard NaN       0 NA        Preprocessor1_Model1
# Install spacyr only when it is missing, so that re-running the script does
# not reinstall the package on every execution.
if (!requireNamespace("spacyr", quietly = TRUE)) {
  install.packages("spacyr")
}

# Set up spaCy for spacyr; the lemmatization recipe in this script tokenizes
# with engine = "spacyr".
# NOTE(review): this is a one-time setup step - consider skipping it once
# spaCy is already installed on the machine.
spacyr::spacy_install()
# Recipe variant with lemmatization: tokenize with the "spacyr" engine,
# replace tokens by their lemmas, then apply the same token filter and tf-idf
# weighting as the baseline recipe.
rec_lemma <- recipe(Rating ~ Review, data = train) |>
  step_tokenize(Review, engine = "spacyr") |>
  step_lemma(Review) |>
  step_tokenfilter(Review, max_tokens = 1000) |>
  step_tfidf(Review)

# Same SVM specification, combined with the lemmatizing recipe.
svm_wflow_lemma <- workflow() |>
  add_model(svm_spec) |>
  add_recipe(rec_lemma)

# Fit on the training data.
svm_fit_lemma <- svm_wflow_lemma |>
  fit(data = train)

# Evaluate on the training data (in-sample error).
pred_data_lemma <- tibble(
  truth = train$Rating,
  estimate = predict(svm_fit_lemma, train)$.pred
)
pred_data_lemma

# Use the metric set defined in this block, so the block does not depend on a
# separately defined global `metrics` object; the metrics are the same three
# (RMSE, R-squared, MAE).
metrics_lemma <- metric_set(rmse, rsq, mae)
metrics_svm_lemma <- metrics_lemma(
  pred_data_lemma,
  truth = truth,
  estimate = estimate
)

# Recorded output from a previous run:
#   1 rmse standard 1.1569035
#   2 rsq  standard 0.4700319
#   3 mae  standard 0.9135093
#' Build a preprocessing recipe that removes stop words
#'
#' Tokenizes `Review`, removes stop words from the chosen source, caps the
#' vocabulary and converts the remaining tokens to tf-idf features.
#'
#' @param stopwords_name Stop-word source, passed to `step_stopwords()` as
#'   `stopword_source` (this script uses "snowball", "smart" and
#'   "stopwords-iso").
#' @param data Data frame used to initialise the recipe; must contain the
#'   `Rating` and `Review` columns. Defaults to the global `train`, which is
#'   what the original one-argument version always used.
#' @param max_tokens Value passed to `step_tokenfilter()` as `max_tokens`.
#' @return An untrained recipe for `Rating ~ Review`.
stopwords_rec <- function(stopwords_name, data = train, max_tokens = 1000) {
  recipe(Rating ~ Review, data = data) |>
    step_tokenize(Review) |>
    step_stopwords(Review, stopword_source = stopwords_name) |>
    step_tokenfilter(Review, max_tokens = max_tokens) |>
    step_tfidf(Review)
}

# Reset the RNG seed before the resampling code below.
set.seed(1)
# Workflow holding only the SVM model; each stop-word recipe is attached to
# it separately at resampling time.
svm_wflow_stop <- workflow() |>
  add_model(svm_spec)

# Cross-validated performance with the "snowball" stop-word list.
snowball_rs <- fit_resamples(
  svm_wflow_stop |> add_recipe(stopwords_rec("snowball")),
  folds
)
metrics_snowball <- collect_metrics(snowball_rs)

# Recorded output from a previous run:
#   1 rmse standard 1.1471342 10 0.009814347 Preprocessor1_Model1
#   2 rsq  standard 0.4621366 10 0.008786747 Preprocessor1_Model1
# Cross-validated performance with the "smart" stop-word list.
set.seed(1)
smart_rs <- fit_resamples(
  svm_wflow_stop |> add_recipe(stopwords_rec("smart")),
  folds
)
metrics_smart_rs <- collect_metrics(smart_rs)

# Recorded output from a previous run:
#   1 rmse standard 1.1404006 10 0.009630983 Preprocessor1_Model1
#   2 rsq  standard 0.4668183 10 0.009518076 Preprocessor1_Model1
# Cross-validated performance with the "stopwords-iso" stop-word list.
set.seed(1)
stopwords_iso_rs <- fit_resamples(
  svm_wflow_stop |> add_recipe(stopwords_rec("stopwords-iso")),
  folds
)
metrics_stopwords_iso_rs <- collect_metrics(stopwords_iso_rs)

# Recorded output from a previous run:
#   1 rmse standard 1.1324068 10 0.009622164 Preprocessor1_Model1
#   2 rsq  standard 0.4546836 10 0.009070150 Preprocessor1_Model1
# Boosted-tree specification (xgboost engine, regression mode) with
# mtry = 50 and trees = 500.
xgb_spec <- boost_tree(mtry = 50, trees = 500) |>
  set_engine("xgboost") |>
  set_mode("regression")

# Combine the boosted-tree model with the baseline tf-idf recipe.
xgb_wflow <- workflow() |>
  add_model(xgb_spec) |>
  add_recipe(rec)
xgb_wflow

# Fit on every cross-validation fold, keeping the out-of-fold predictions.
xgb_rs <- fit_resamples(
  xgb_wflow,
  folds,
  control = control_resamples(save_pred = TRUE)
)

# The metrics recorded below are resampling summaries, but the script had no
# call producing them; collect them explicitly.
collect_metrics(xgb_rs)

# Recorded output from a previous run:
#   1 rmse standard 0.8439222 10 0.010151510 Preprocessor1_Model1
#   2 rsq  standard 0.5370892 10 0.009386127 Preprocessor1_Model1
# TRAIN THE FINAL MODEL ON ALL OF THE TRAINING DATA
# last_fit() receives the initial split object together with the xgboost
# workflow.
final_fit <- last_fit(xgb_wflow, split)

# Metrics on the test data.
test_metrics <- collect_metrics(final_fit)

# Recorded output from a previous run:
#   1 rmse standard 0.8570846 Preprocessor1_Model1
#   2 rsq  standard 0.5139780 Preprocessor1_Model1