library(tidyverse)
library(tidymodels)
library(textrecipes)
library(stopwords)

# Hackathon ----

# Load the hotel reviews; the code below uses the `Review` (text) and
# `Rating` (outcome) columns.
hotels <- read_csv("./tripadvisor_hotel_reviews.csv")

# Keep a reproducible 30% random subsample of the rows.
set.seed(1)
hotels <- sample_frac(hotels, 0.3)

# Reproducible train/test split using the initial_split() defaults.
set.seed(1)
split <- hotels |>
  initial_split()
train <- training(split)
test <- testing(split)

# Linear SVM on the LiblineaR engine, used in regression mode.
svm_spec <- svm_linear() |>
  set_engine("LiblineaR") |>
  set_mode("regression")
# Start a workflow and register the SVM specification as its model.
svm_wflow <- add_model(workflow(), svm_spec)
svm_wflow
# Preprocessing recipe for the review text: tokenize, filter the tokens
# with max_tokens = 1000, then compute tf-idf features.
rec <- recipe(Rating ~ Review, data = train)
rec <- step_tokenize(rec, Review)
rec <- step_tokenfilter(rec, Review, max_tokens = 1000)
rec <- step_tfidf(rec, Review)
# Attach the preprocessing recipe to the workflow.
svm_wflow <- add_recipe(svm_wflow, rec)
svm_wflow
# Fit the workflow (recipe + SVM) on the training data.
svm_fit <- svm_wflow |>
  fit(data = train)

# Evaluate the model on the training data itself, i.e. these are
# in-sample errors.
pred_data <- tibble(
  truth = train$Rating,
  estimate = predict(svm_fit, train)$.pred
)
pred_data

# Metric set: RMSE, R-squared and MAE.
metrics <- metric_set(rmse, rsq, mae)
metrics_svm <- metrics(pred_data, truth = truth, estimate = estimate)
# Output recorded in the original script (it had been pasted onto the
# code line, which made the line unparseable):
#   1 rmse standard 1.1387260
#   2 rsq  standard 0.4952173
#   3 mae  standard 0.9003342
# Resampling scheme: 10-fold cross-validation over the training set.
set.seed(1)
folds <- train |>
  vfold_cv(v = 10)
# Null model: a baseline to compare the other models against.
null_reg <- null_model() |>
  set_engine("parsnip") |>
  set_mode("regression")

null_wflow <- workflow() |>
  add_model(null_reg) |>
  add_recipe(rec)

# Resample the baseline on the CV folds, keeping the predictions.
null_rs <- fit_resamples(
  null_wflow,
  folds,
  control = control_resamples(save_pred = TRUE)
)

collect_metrics(null_rs)
# Output recorded in the original script (it had been pasted onto the
# code line, which made the line unparseable):
#   1 rmse standard 1.237592 10 0.0111641 Preprocessor1_Model1
#   2 rsq  standard NaN       0 NA        Preprocessor1_Model1
# Install spacyr only when it is missing, so re-running the script does
# not reinstall the package every time.
if (!requireNamespace("spacyr", quietly = TRUE)) {
  install.packages("spacyr")
}
# NOTE(review): spacy_install() is still called on every run, as in the
# original script; confirm whether the installed spacyr version skips an
# existing spaCy installation, otherwise treat this as a one-time step.
spacyr::spacy_install()
# Recipe variant with lemmatization: tokenize with the spacyr engine,
# lemmatize, filter the tokens with max_tokens = 1000, then compute
# tf-idf features.
rec_lemma <- recipe(Rating ~ Review, data = train)
rec_lemma <- step_tokenize(rec_lemma, Review, engine = "spacyr")
rec_lemma <- step_lemma(rec_lemma, Review)
rec_lemma <- step_tokenfilter(rec_lemma, Review, max_tokens = 1000)
rec_lemma <- step_tfidf(rec_lemma, Review)

# Same SVM specification, combined with the lemmatizing recipe.
svm_wflow_lemma <- add_recipe(add_model(workflow(), svm_spec), rec_lemma)
# Fit the lemmatizing workflow on the training data.
svm_fit_lemma <- svm_wflow_lemma |>
  fit(data = train)

# Evaluate on the training data itself, i.e. these are in-sample errors.
pred_data_lemma <- tibble(
  truth = train$Rating,
  estimate = predict(svm_fit_lemma, train)$.pred
)
pred_data_lemma

# Metric set: RMSE, R-squared and MAE. The original defined this set but
# then called a different object; it is now the one actually used.
metrics_lemma <- metric_set(rmse, rsq, mae)
metrics_svm_lemma <- metrics_lemma(
  pred_data_lemma,
  truth = truth,
  estimate = estimate
)
# Output recorded in the original script (it had been pasted onto the
# code line, which made the line unparseable):
#   1 rmse standard 1.1569035
#   2 rsq  standard 0.4700319
#   3 mae  standard 0.9135093
#' Build a tf-idf recipe that removes stop words from a chosen source
#'
#' The recipe tokenizes `Review`, removes stop words, filters the tokens
#' with `max_tokens = 1000`, and computes tf-idf features.
#'
#' @param stopwords_name Stop word source, passed to `step_stopwords()`
#'   as `stopword_source`.
#' @param data Data frame the recipe is defined on. Defaults to the
#'   script-level `train`, which the original version hard-coded.
#' @return A recipe object (not yet prepped).
stopwords_rec <- function(stopwords_name, data = train) {
  recipe(Rating ~ Review, data = data) |>
    step_tokenize(Review) |>
    step_stopwords(Review, stopword_source = stopwords_name) |>
    step_tokenfilter(Review, max_tokens = 1000) |>
    step_tfidf(Review)
}
set.seed(1)
# Workflow holding only the SVM model; each stop word experiment adds
# its own recipe to it.
svm_wflow_stop <- workflow() |>
  add_model(svm_spec)

# Resample the SVM with the "snowball" stop word list on the CV folds.
snowball_rs <- fit_resamples(
  svm_wflow_stop |>
    add_recipe(stopwords_rec("snowball")),
  folds
)

metrics_snowball <- collect_metrics(snowball_rs)
# Output recorded in the original script (it had been pasted onto the
# code line, which made the line unparseable):
#   1 rmse standard 1.1471342 10 0.009814347 Preprocessor1_Model1
#   2 rsq  standard 0.4621366 10 0.008786747 Preprocessor1_Model1
# Resample the SVM with the "smart" stop word list on the CV folds.
set.seed(1)
smart_rs <- fit_resamples(
  svm_wflow_stop |> add_recipe(stopwords_rec("smart")),
  folds
)

metrics_smart_rs <- collect_metrics(smart_rs)
# Output recorded in the original script (it had been pasted onto the
# code line, which made the line unparseable):
#   1 rmse standard 1.1404006 10 0.009630983 Preprocessor1_Model1
#   2 rsq  standard 0.4668183 10 0.009518076 Preprocessor1_Model1
# Resample the SVM with the "stopwords-iso" stop word list on the CV folds.
set.seed(1)
stopwords_iso_rs <- fit_resamples(
  svm_wflow_stop |> add_recipe(stopwords_rec("stopwords-iso")),
  folds
)

metrics_stopwords_iso_rs <- collect_metrics(stopwords_iso_rs)
# Output recorded in the original script (it had been pasted onto the
# code line, which made the line unparseable):
#   1 rmse standard 1.1324068 10 0.009622164 Preprocessor1_Model1
#   2 rsq  standard 0.4546836 10 0.009070150 Preprocessor1_Model1
# Boosted trees on the xgboost engine in regression mode, with
# mtry = 50 and trees = 500.
xgb_spec <- boost_tree(mtry = 50, trees = 500) |>
  set_engine("xgboost") |>
  set_mode("regression")

# Reuse the plain tf-idf recipe `rec` for this model.
xgb_wflow <- workflow() |>
  add_model(xgb_spec) |>
  add_recipe(rec)
xgb_wflow

# Resample on the CV folds, keeping the predictions.
xgb_rs <- fit_resamples(
  xgb_wflow,
  folds,
  control = control_resamples(save_pred = TRUE)
)

# The original script had metrics output pasted after the closing
# parenthesis above but no call producing it; collect the metrics
# explicitly.
collect_metrics(xgb_rs)
# Output recorded in the original script:
#   1 rmse standard 0.8439222 10 0.010151510 Preprocessor1_Model1
#   2 rsq  standard 0.5370892 10 0.009386127 Preprocessor1_Model1
# Train the final model on all of the training data.
final_fit <- last_fit(xgb_wflow, split)

# Metrics on the test data.
test_metrics <- collect_metrics(final_fit)
# Output recorded in the original script (it had been pasted onto the
# code line, which made the line unparseable):
#   1 rmse standard 0.8570846 Preprocessor1_Model1
#   2 rsq  standard 0.5139780 Preprocessor1_Model1