1 Введение

На этой странице Вы найдете объяснение линейной регрессии на примере данных о такси Нью-Йорка.

2 Библиотеки

Загружаем библиотеки.

library(data.table) # библиотека для быстрого чтения данных с csv файла
library(lubridate) # библиотека для работы с данными форматаDate
library(dplyr) # библиотека для data engineering
library(ggplot2) # библиотека для построений графиков
library(caret) # библиотека для создания выборок для обучения и теста
library(xgboost) # библиотека для XGBoost

3 Загрузка данных

Читаем данные.

# Read the training set, the test set and the sample submission
df <- fread(input = "../L9Data/train.csv") # fast CSV reader
test_final <- fread(input = "../L9Data/test.csv")
subm <- fread(input = "../L9Data/sample_submission.csv")
# Column names of the training data frame
names(df)
 [1] "id"                 "vendor_id"          "pickup_datetime"   
 [4] "dropoff_datetime"   "passenger_count"    "pickup_longitude"  
 [7] "pickup_latitude"    "dropoff_longitude"  "dropoff_latitude"  
[10] "store_and_fwd_flag" "trip_duration"     
# Column names of the test file
colnames(test_final) 
[1] "id"                 "vendor_id"          "pickup_datetime"   
[4] "passenger_count"    "pickup_longitude"   "pickup_latitude"   
[7] "dropoff_longitude"  "dropoff_latitude"   "store_and_fwd_flag"

4 Работа с датами

Обзор формата данных. Здесь показано, как можно конвертировать дату, записанную в формате “character”, в дату формата “Date”.

# A date written as text is an ordinary character string
str("2014-01-01") 
 chr "2014-01-01"
class("2014-01-01")
[1] "character"
# Without an explicit format as.Date() reads "01/01/2011" as year/month/day
# and yields year 1. State the layout so the string is parsed as 1 January 2011
# (day/month vs month/day is ambiguous here; both give the same date).
str(as.Date("01/01/2011", format = "%d/%m/%Y"))
 Date[1:1], format: "2011-01-01"
# ISO-formatted text needs no format; the time-of-day part is dropped by as.Date()
str(as.Date("2014-01-01 17:01:01"))
 Date[1:1], format: "2014-01-01"

Для работы с датами следует воспользоваться библиотекой lubridate. Рассмотрим следующую расшифровку: ymd - year month day; mdy - month day year; dmy - day month year. ymd_hms(df$dropoff_datetime) # возвращает дату в формате год-месяц-день_часы-минуты-секунды

Посмотрим на формат данных

# Parse the drop-off timestamps with lubridate and inspect the resulting type
str(ymd_hms(df$dropoff_datetime))
##  POSIXct[1:1458644], format: "2016-03-14 17:32:30" "2016-06-12 00:54:38" ...

table((df$dropoff_datetime-df$pickup_datetime) == df$trip_duration)

5 Задача

Условие задачи: Определить время поездки на такси. Зависимая переменная (ЗП) - trip_duration

str(df) # Inspect the column types
Classes 'data.table' and 'data.frame':  1458644 obs. of  11 variables:
 $ id                : chr  "id2875421" "id2377394" "id3858529" "id3504673" ...
 $ vendor_id         : int  2 1 2 2 2 2 1 2 1 2 ...
 $ pickup_datetime   : chr  "2016-03-14 17:24:55" "2016-06-12 00:43:35" "2016-01-19 11:35:24" "2016-04-06 19:32:31" ...
 $ dropoff_datetime  : chr  "2016-03-14 17:32:30" "2016-06-12 00:54:38" "2016-01-19 12:10:48" "2016-04-06 19:39:40" ...
 $ passenger_count   : int  1 1 1 1 1 6 4 1 1 1 ...
 $ pickup_longitude  : num  -74 -74 -74 -74 -74 ...
 $ pickup_latitude   : num  40.8 40.7 40.8 40.7 40.8 ...
 $ dropoff_longitude : num  -74 -74 -74 -74 -74 ...
 $ dropoff_latitude  : num  40.8 40.7 40.7 40.7 40.8 ...
 $ store_and_fwd_flag: chr  "N" "N" "N" "N" ...
 $ trip_duration     : int  455 663 2124 429 435 443 341 1551 255 1225 ...
 - attr(*, ".internal.selfref")=<externalptr> 

6 Очистка данных

ymd_hms() # parses date-time text; called without input it returns an empty POSIXct
POSIXct of length 0
# Convert both timestamp columns from character to POSIXct
df$pickup_datetime <- lubridate::ymd_hms(df$pickup_datetime)
df$dropoff_datetime <- lubridate::ymd_hms(df$dropoff_datetime)
str(df) # confirm the new column types
Classes 'data.table' and 'data.frame':  1458644 obs. of  11 variables:
 $ id                : chr  "id2875421" "id2377394" "id3858529" "id3504673" ...
 $ vendor_id         : int  2 1 2 2 2 2 1 2 1 2 ...
 $ pickup_datetime   : POSIXct, format: "2016-03-14 17:24:55" "2016-06-12 00:43:35" ...
 $ dropoff_datetime  : POSIXct, format: "2016-03-14 17:32:30" "2016-06-12 00:54:38" ...
 $ passenger_count   : int  1 1 1 1 1 6 4 1 1 1 ...
 $ pickup_longitude  : num  -74 -74 -74 -74 -74 ...
 $ pickup_latitude   : num  40.8 40.7 40.8 40.7 40.8 ...
 $ dropoff_longitude : num  -74 -74 -74 -74 -74 ...
 $ dropoff_latitude  : num  40.8 40.7 40.7 40.7 40.8 ...
 $ store_and_fwd_flag: chr  "N" "N" "N" "N" ...
 $ trip_duration     : int  455 663 2124 429 435 443 341 1551 255 1225 ...
 - attr(*, ".internal.selfref")=<externalptr> 
# Drop the identifier column: it is not meaningful for the analysis
df$id <- NULL
# Store the flag as a factor with the two levels "N" and "Y"
df$store_and_fwd_flag <- factor(df$store_and_fwd_flag)
str(df$store_and_fwd_flag)
 Factor w/ 2 levels "N","Y": 1 1 1 1 1 1 1 1 1 1 ...
sum(is.na(df)) # Count the NA values in the whole table
[1] 0
# Box plot of the dependent variable to inspect its distribution
boxplot(df$trip_duration)
# Collect the outlier values with boxplot.stats(), which applies the same
# whisker rule as boxplot() without drawing the plot a second time
outliers <- boxplot.stats(df$trip_duration)$out

# Keep only the rows whose trip_duration is not an outlier. A logical mask is
# used because `df[-which(cond), ]` returns zero rows when nothing matches.
x <- df[!(df$trip_duration %in% outliers), ]
nrow(x) / nrow(df) # share of the original rows that is kept
[1] 0.9491171
boxplot(x$trip_duration) # Box plot of the dependent variable in the sample without outliers

df <- x # Replace the working data set with the filtered one

6.1 Пункт крутых идей

Можно долготу и широту сразу преобразовать в расстояние.

Расчленяем данные дат и создаем соответствующие им новые переменные.

# Derive calendar features from the pickup timestamp
df$week <- lubridate::wday(df$pickup_datetime) # day of the week
df$hour <- lubridate::hour(df$pickup_datetime)
df$month <- lubridate::month(df$pickup_datetime)
df$year <- lubridate::year(df$pickup_datetime)
df$day <- lubridate::day(df$pickup_datetime) # day of the month
# Box plots of the trip duration, one per day of the week
ggplot(df, aes(y = trip_duration, group = week, fill = as.factor(week))) +
  geom_boxplot()

# Drop the raw timestamp columns, which are no longer needed
df$pickup_datetime <- NULL
df$dropoff_datetime <- NULL

# R's trigonometric functions expect radians, while latitude and longitude are
# stored in degrees (values around 40.8 and -74), so convert before use.
# NOTE: the outputs recorded further down in this document were produced with
# the unconverted degree values and will change when the code is re-run.
deg_to_rad <- pi / 180

# Unit-sphere Cartesian coordinates x, y, z of the pickup point
df$x <- cos(df$pickup_latitude * deg_to_rad) *
  cos(df$pickup_longitude * deg_to_rad)
df$y <- cos(df$pickup_latitude * deg_to_rad) *
  sin(df$pickup_longitude * deg_to_rad)
df$z <- sin(df$pickup_latitude * deg_to_rad)

# Unit-sphere Cartesian coordinates x_, y_, z_ of the drop-off point
df$x_ <- cos(df$dropoff_latitude * deg_to_rad) *
  cos(df$dropoff_longitude * deg_to_rad)
df$y_ <- cos(df$dropoff_latitude * deg_to_rad) *
  sin(df$dropoff_longitude * deg_to_rad)
df$z_ <- sin(df$dropoff_latitude * deg_to_rad)

# Drop the raw coordinate columns now that x, y, z and x_, y_, z_ replace them.
# attach(df) is not used: no later code in this document refers to the columns
# by bare name, and an attached copy does not follow later changes to df.
df <- select(
  df,
  -c(pickup_latitude, pickup_longitude, dropoff_latitude, dropoff_longitude)
)

7 Машинное обучение

Разделяем данные для обучения (train) и тестирования (test) в соотношении 80% для обучения и 20% для теста.

# Fix the RNG seed so that the random 80/20 split is reproducible
set.seed(42)
# list = FALSE is spelled out: `F` is an ordinary variable that can be reassigned
index <- createDataPartition(df$trip_duration, p = 0.8, list = FALSE)
tr <- df[index, ]  # training sample
ts <- df[-index, ] # test sample

7.1 Модель №1

# Fit a linear model of the dependent variable on all remaining columns
fit <- lm(formula = trip_duration ~ ., data = tr)
summary(fit) # inspect the coefficients and the goodness of fit

Call:
lm(formula = trip_duration ~ ., data = tr)

Residuals:
     Min       1Q   Median       3Q      Max 
-12419.1   -326.6    -84.6    256.3   2800.8 

Coefficients: (1 not defined because of singularities)
                      Estimate Std. Error t value Pr(>|t|)    
(Intercept)          2.199e+03  8.268e+01  26.602   <2e-16 ***
vendor_id            1.479e+00  8.657e-01   1.708   0.0877 .  
passenger_count      3.900e+00  3.281e-01  11.884   <2e-16 ***
store_and_fwd_flagY  5.794e+01  5.789e+00  10.009   <2e-16 ***
week                 7.069e+00  2.061e-01  34.299   <2e-16 ***
hour                 2.765e+00  6.401e-02  43.197   <2e-16 ***
month                1.081e+01  2.454e-01  44.065   <2e-16 ***
year                        NA         NA      NA       NA    
day                  5.082e-01  4.731e-02  10.741   <2e-16 ***
x                    9.865e+02  1.903e+01  51.826   <2e-16 ***
y                   -5.219e+03  1.391e+02 -37.517   <2e-16 ***
z                    1.991e+03  2.665e+01  74.731   <2e-16 ***
x_                   2.699e+03  1.939e+01 139.232   <2e-16 ***
y_                   6.548e+03  1.293e+02  50.643   <2e-16 ***
z_                   9.677e+02  2.283e+01  42.385   <2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 433.7 on 1107527 degrees of freedom
Multiple R-squared:  0.06384,   Adjusted R-squared:  0.06383 
F-statistic:  5810 on 13 and 1107527 DF,  p-value: < 2.2e-16
plot(fit) # Diagnostic plots for the fitted linear model

head(tr) # Look at the first rows of the training sample
   vendor_id passenger_count store_and_fwd_flag trip_duration week hour
1:         1               1                  N           663    1    0
2:         2               1                  N           429    4   19
3:         2               1                  N          1551    7    7
4:         1               1                  N           255    6   23
5:         2               1                  N          1225    5   21
6:         2               1                  N          1274    3   22
   month year day          x          y          z          x_         y_
1:     6 2016  12 -0.1515976 -0.9831692 0.10196345 -0.17017986 -0.9793288
2:     4 2016   6 -0.1802777 -0.9762141 0.12044069 -0.18214270 -0.9741555
3:     5 2016  21 -0.1412436 -0.9890444 0.04291223 -0.09459509 -0.9922914
4:     5 2016  27 -0.1703111 -0.9800838 0.10212663 -0.15678110 -0.9817457
5:     3 2016  10 -0.1523078 -0.9836385 0.09621643 -0.14487211 -0.9881509
6:     5 2016  10 -0.1541443 -0.9850599 0.07678911 -0.17290369 -0.9790540
           z_
1: 0.10933391
2: 0.13358552
3: 0.08005964
4: 0.10768052
5: 0.05069329
6: 0.10750606
prognoz <- predict(fit,ts) # Predict the dependent variable for the test sample with the fitted model
Warning in predict.lm(fit, ts): prediction from a rank-deficient fit may be
misleading

7.1.1 Анализ эффективности модели

# Root-mean-square error of the predictions against the observed durations
RMSE(prognoz,ts$trip_duration)
[1] 433.842
# Distribution of the model error (prediction minus observed value)
boxplot(prognoz - ts$trip_duration)

qplot(prognoz - ts$trip_duration)

# Summarise the predictions themselves. ts has no `pred` column at this point,
# so summary(ts$pred) summarised NULL; the output printed below was recorded
# before this fix.
summary(prognoz)
Length  Class   Mode 
     0   NULL   NULL 
# Compare with the distribution of the observed dependent variable
summary(ts$trip_duration)
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
    1.0   384.0   632.0   731.3   991.0  2092.0 

7.2 Модель №2

Строим альтернативную модель, чтобы выбрать наиболее эффективную. Альтернативная модель строится на основе зависимости логарифма ЗП от всех остальных переменных.

# Training set with a log-transformed dependent variable
tr_log <- tr
tr_log$trip_duration <- log(tr_log$trip_duration)
# Linear model of log(trip_duration) on all remaining columns
fit_log <- lm(formula = trip_duration ~ ., data = tr_log)
summary(fit_log)

Call:
lm(formula = trip_duration ~ ., data = tr_log)

Residuals:
     Min       1Q   Median       3Q      Max 
-12.9034  -0.4126   0.0763   0.5065   6.5977 

Coefficients: (1 not defined because of singularities)
                      Estimate Std. Error t value Pr(>|t|)    
(Intercept)          5.875e+00  1.350e-01  43.526  < 2e-16 ***
vendor_id            9.234e-03  1.413e-03   6.534 6.42e-11 ***
passenger_count      7.909e-03  5.357e-04  14.764  < 2e-16 ***
store_and_fwd_flagY  4.391e-02  9.450e-03   4.646 3.38e-06 ***
week                 1.035e-02  3.365e-04  30.776  < 2e-16 ***
hour                 5.064e-03  1.045e-04  48.467  < 2e-16 ***
month                1.399e-02  4.006e-04  34.924  < 2e-16 ***
year                        NA         NA      NA       NA    
day                  7.473e-04  7.723e-05   9.676  < 2e-16 ***
x                    2.225e-01  3.107e-02   7.159 8.11e-13 ***
y                   -7.273e+00  2.271e-01 -32.026  < 2e-16 ***
z                    2.764e+00  4.350e-02  63.529  < 2e-16 ***
x_                   2.881e+00  3.165e-02  91.041  < 2e-16 ***
y_                   6.852e+00  2.111e-01  32.461  < 2e-16 ***
z_                   1.183e+00  3.727e-02  31.742  < 2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 0.708 on 1107527 degrees of freedom
Multiple R-squared:  0.02936,   Adjusted R-squared:  0.02935 
F-statistic:  2577 on 13 and 1107527 DF,  p-value: < 2.2e-16
# Back-transform the log-scale predictions with exp() to obtain durations
prognoz <- exp(predict(fit_log,ts))
Warning in predict.lm(fit_log, ts): prediction from a rank-deficient fit
may be misleading
ts$pred <- prognoz
# Inspect the distribution of the predictions
summary(ts$pred)
    Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
       7      554      582      701      615 29024200 
# Drop outlying predictions (this would not be possible on a real test set)
# NOTE(review): the cut-off 3.368e+09 is above the largest prediction shown in
# the summaries (29024200), so this filter appears to remove nothing - confirm.
ts <- ts[ts$pred<3.368e+09,]

# Compare the distributions of the prediction and of the dependent variable
summary(ts$pred)
    Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
       7      554      582      701      615 29024200 
# Observed trip durations, for comparison with the predictions above
summary(ts$trip_duration)
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
    1.0   384.0   632.0   731.3   991.0  2092.0 

7.2.1 Анализ эффективности модели

Анализируем модель по результатам распределения ошибки модели.

# RMSE of the back-transformed predictions against the observed durations
RMSE(ts$pred,ts$trip_duration)
[1] 55157.72
# Distribution of the prediction error (prediction minus observed value)
boxplot(ts$pred-ts$trip_duration)

qplot(ts$pred-ts$trip_duration)

summary(ts$pred-ts$trip_duration)
    Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
   -1688     -394      -46      -31      196 29022795 

7.3 Модель 3: XGBoost

Создаем числовую матрицу, где будут значения наших предикторов, и отдельный вектор для зависимой переменной. Далее сохраняем матрицу и вектор в объектах dtrain и ctest для тренировочной и тестовой выборки соответственно.

# Remove the prediction column left over from the previous model
ts$pred <- NULL
# Predictor matrices: every column except the dependent variable
tr_matrix <- data.matrix(select(tr, -trip_duration))
ts_matrix <- data.matrix(select(ts, -trip_duration))

# Vectors of the dependent variable
train_target <- tr$trip_duration
test_target <- ts$trip_duration

# xgboost containers that hold the predictors together with their labels
dtrain <- xgb.DMatrix(data = tr_matrix, label = train_target)
ctest <- xgb.DMatrix(data = ts_matrix, label = test_target)

Создаем watchlist для просмотра промежуточных результатов.

# Data sets whose metric is reported after every boosting round
watchlist <- list(train = dtrain, test = ctest)

Cтроим модель со следующими параметрами:

# Train gradient-boosted trees. Booster hyper-parameters are collected in
# `params` instead of being passed through `...`, and `maximize` uses FALSE
# rather than the reassignable shorthand `F`.
bst <- xgb.train(
  params = list(
    booster = "gbtree",
    # NOTE(review): newer xgboost releases name this objective
    # "reg:squarederror"; confirm against the installed version before renaming.
    objective = "reg:linear",
    eval_metric = "rmse",
    max_depth = 10,
    subsample = 0.7,
    colsample_bytree = 0.7,
    lambda = 0.01,
    alpha = 0.0001
  ),
  data = dtrain,
  nrounds = 30000,
  watchlist = watchlist,
  # RMSE is an error, so smaller is better; training stops once the test RMSE
  # has not improved for 10 consecutive rounds
  maximize = FALSE,
  early_stopping_rounds = 10
)
[1] train-rmse:648.846191   test-rmse:648.991577 
Multiple eval metrics are present. Will use test_rmse for early stopping.
Will train until test_rmse hasn't improved in 10 rounds.

[2] train-rmse:510.795135   test-rmse:511.477814 
[3] train-rmse:419.720764   test-rmse:421.043243 
[4] train-rmse:368.974274   test-rmse:370.904144 
[5] train-rmse:335.429901   test-rmse:337.635223 
[6] train-rmse:315.083679   test-rmse:317.761658 
[7] train-rmse:299.923187   test-rmse:303.028046 
[8] train-rmse:290.015869   test-rmse:293.650391 
[9] train-rmse:278.564789   test-rmse:282.470581 
[10]    train-rmse:273.517792   test-rmse:277.904388 
[11]    train-rmse:268.637421   test-rmse:273.369995 
[12]    train-rmse:262.466827   test-rmse:267.617920 
[13]    train-rmse:251.234955   test-rmse:256.794495 
[14]    train-rmse:248.161423   test-rmse:254.101349 
[15]    train-rmse:246.309052   test-rmse:252.572174 
[16]    train-rmse:240.802231   test-rmse:247.456406 
[17]    train-rmse:238.937027   test-rmse:246.033478 
[18]    train-rmse:237.901657   test-rmse:245.347702 
[19]    train-rmse:236.876312   test-rmse:244.565994 
[20]    train-rmse:235.539413   test-rmse:243.581528 
[21]    train-rmse:234.400269   test-rmse:242.763763 
[22]    train-rmse:232.871841   test-rmse:241.503098 
[23]    train-rmse:226.248779   test-rmse:235.161133 
[24]    train-rmse:225.527283   test-rmse:234.676910 
[25]    train-rmse:224.286819   test-rmse:233.755859 
[26]    train-rmse:223.803345   test-rmse:233.443604 
[27]    train-rmse:218.863251   test-rmse:228.860992 
[28]    train-rmse:218.538589   test-rmse:228.645599 
[29]    train-rmse:217.906219   test-rmse:228.249237 
[30]    train-rmse:216.568207   test-rmse:227.265076 
[31]    train-rmse:215.297516   test-rmse:226.160889 
[32]    train-rmse:214.545517   test-rmse:225.613251 
[33]    train-rmse:213.554153   test-rmse:224.887695 
[34]    train-rmse:212.275131   test-rmse:223.876526 
[35]    train-rmse:210.652161   test-rmse:222.581512 
[36]    train-rmse:209.994537   test-rmse:222.235413 
[37]    train-rmse:209.440536   test-rmse:221.954834 
[38]    train-rmse:208.934372   test-rmse:221.710892 
[39]    train-rmse:207.109543   test-rmse:220.239319 
[40]    train-rmse:206.294022   test-rmse:219.669128 
[41]    train-rmse:205.791931   test-rmse:219.386353 
[42]    train-rmse:204.528549   test-rmse:218.466415 
[43]    train-rmse:204.358536   test-rmse:218.423752 
[44]    train-rmse:203.836914   test-rmse:218.073959 
[45]    train-rmse:203.776245   test-rmse:218.062836 
[46]    train-rmse:203.200577   test-rmse:217.630157 
[47]    train-rmse:202.729477   test-rmse:217.405060 
[48]    train-rmse:202.415009   test-rmse:217.273468 
[49]    train-rmse:201.966400   test-rmse:217.070969 
[50]    train-rmse:201.780548   test-rmse:216.986160 
[51]    train-rmse:201.267746   test-rmse:216.632309 
[52]    train-rmse:200.880234   test-rmse:216.497711 
[53]    train-rmse:200.710785   test-rmse:216.443451 
[54]    train-rmse:199.607574   test-rmse:215.645554 
[55]    train-rmse:198.969040   test-rmse:215.178711 
[56]    train-rmse:198.708344   test-rmse:215.118958 
[57]    train-rmse:198.564697   test-rmse:215.086319 
[58]    train-rmse:198.472610   test-rmse:215.076416 
[59]    train-rmse:198.304596   test-rmse:215.063568 
[60]    train-rmse:198.227631   test-rmse:215.076416 
[61]    train-rmse:197.997971   test-rmse:215.041290 
[62]    train-rmse:197.770752   test-rmse:214.999756 
[63]    train-rmse:197.574585   test-rmse:214.964172 
[64]    train-rmse:197.189682   test-rmse:214.749619 
[65]    train-rmse:196.504532   test-rmse:214.304794 
[66]    train-rmse:196.218689   test-rmse:214.178986 
[67]    train-rmse:196.138138   test-rmse:214.166168 
[68]    train-rmse:196.046616   test-rmse:214.130096 
[69]    train-rmse:195.959412   test-rmse:214.122147 
[70]    train-rmse:195.649796   test-rmse:214.005066 
[71]    train-rmse:195.461136   test-rmse:213.989594 
[72]    train-rmse:195.027969   test-rmse:213.819534 
[73]    train-rmse:194.922897   test-rmse:213.828552 
[74]    train-rmse:194.658691   test-rmse:213.725479 
[75]    train-rmse:194.449829   test-rmse:213.670944 
[76]    train-rmse:194.128647   test-rmse:213.572525 
[77]    train-rmse:194.061523   test-rmse:213.566467 
[78]    train-rmse:193.774338   test-rmse:213.515778 
[79]    train-rmse:193.666306   test-rmse:213.515305 
[80]    train-rmse:193.239639   test-rmse:213.289642 
[81]    train-rmse:193.100266   test-rmse:213.258392 
[82]    train-rmse:192.562088   test-rmse:213.036301 
[83]    train-rmse:192.282623   test-rmse:212.884796 
[84]    train-rmse:192.193878   test-rmse:212.874939 
[85]    train-rmse:192.001740   test-rmse:212.836304 
[86]    train-rmse:191.804245   test-rmse:212.805344 
[87]    train-rmse:191.703735   test-rmse:212.794907 
[88]    train-rmse:191.584915   test-rmse:212.769516 
[89]    train-rmse:191.513000   test-rmse:212.766632 
[90]    train-rmse:191.428543   test-rmse:212.766266 
[91]    train-rmse:191.321106   test-rmse:212.753922 
[92]    train-rmse:191.226486   test-rmse:212.757034 
[93]    train-rmse:191.091644   test-rmse:212.755325 
[94]    train-rmse:191.009811   test-rmse:212.750381 
[95]    train-rmse:190.794464   test-rmse:212.721161 
[96]    train-rmse:190.698059   test-rmse:212.687027 
[97]    train-rmse:190.303650   test-rmse:212.516022 
[98]    train-rmse:190.217896   test-rmse:212.520050 
[99]    train-rmse:190.082001   test-rmse:212.506729 
[100]   train-rmse:189.867874   test-rmse:212.449020 
[101]   train-rmse:189.788574   test-rmse:212.435669 
[102]   train-rmse:189.777267   test-rmse:212.437973 
[103]   train-rmse:189.658661   test-rmse:212.425552 
[104]   train-rmse:189.165955   test-rmse:212.188904 
[105]   train-rmse:188.956879   test-rmse:212.133057 
[106]   train-rmse:188.754852   test-rmse:212.119583 
[107]   train-rmse:188.588058   test-rmse:212.112686 
[108]   train-rmse:188.394211   test-rmse:212.051544 
[109]   train-rmse:188.300430   test-rmse:212.030228 
[110]   train-rmse:188.241013   test-rmse:211.997345 
[111]   train-rmse:188.117020   test-rmse:211.964783 
[112]   train-rmse:188.053986   test-rmse:211.963715 
[113]   train-rmse:187.872864   test-rmse:211.934814 
[114]   train-rmse:187.802032   test-rmse:211.928970 
[115]   train-rmse:187.766785   test-rmse:211.925919 
[116]   train-rmse:187.631378   test-rmse:211.935730 
[117]   train-rmse:187.527481   test-rmse:211.946915 
[118]   train-rmse:187.462189   test-rmse:211.937546 
[119]   train-rmse:187.399841   test-rmse:211.937439 
[120]   train-rmse:187.155899   test-rmse:211.897537 
[121]   train-rmse:186.993393   test-rmse:211.874283 
[122]   train-rmse:186.831314   test-rmse:211.852509 
[123]   train-rmse:186.751572   test-rmse:211.838379 
[124]   train-rmse:186.704926   test-rmse:211.846573 
[125]   train-rmse:186.620865   test-rmse:211.839630 
[126]   train-rmse:186.582382   test-rmse:211.837723 
[127]   train-rmse:186.431427   test-rmse:211.831619 
[128]   train-rmse:186.108932   test-rmse:211.760727 
[129]   train-rmse:185.900757   test-rmse:211.696503 
[130]   train-rmse:185.781448   test-rmse:211.718796 
[131]   train-rmse:185.759018   test-rmse:211.722672 
[132]   train-rmse:185.693024   test-rmse:211.732452 
[133]   train-rmse:185.625931   test-rmse:211.732101 
[134]   train-rmse:185.393265   test-rmse:211.693909 
[135]   train-rmse:185.281357   test-rmse:211.677307 
[136]   train-rmse:185.125107   test-rmse:211.681335 
[137]   train-rmse:184.875061   test-rmse:211.558487 
[138]   train-rmse:184.800613   test-rmse:211.542831 
[139]   train-rmse:184.757370   test-rmse:211.553925 
[140]   train-rmse:184.707458   test-rmse:211.545334 
[141]   train-rmse:184.654068   test-rmse:211.552536 
[142]   train-rmse:184.415497   test-rmse:211.495163 
[143]   train-rmse:184.371582   test-rmse:211.497787 
[144]   train-rmse:184.294785   test-rmse:211.492081 
[145]   train-rmse:184.084885   test-rmse:211.475815 
[146]   train-rmse:184.052795   test-rmse:211.480988 
[147]   train-rmse:183.981293   test-rmse:211.495651 
[148]   train-rmse:183.842880   test-rmse:211.480301 
[149]   train-rmse:183.806656   test-rmse:211.480453 
[150]   train-rmse:183.645233   test-rmse:211.444977 
[151]   train-rmse:183.572006   test-rmse:211.447418 
[152]   train-rmse:183.509079   test-rmse:211.462723 
[153]   train-rmse:183.460831   test-rmse:211.461517 
[154]   train-rmse:183.424927   test-rmse:211.466415 
[155]   train-rmse:183.335312   test-rmse:211.484100 
[156]   train-rmse:183.261749   test-rmse:211.501465 
[157]   train-rmse:183.082642   test-rmse:211.465622 
[158]   train-rmse:182.969849   test-rmse:211.456848 
[159]   train-rmse:182.894669   test-rmse:211.429001 
[160]   train-rmse:182.752518   test-rmse:211.447342 
[161]   train-rmse:182.736801   test-rmse:211.444962 
[162]   train-rmse:182.670441   test-rmse:211.446930 
[163]   train-rmse:182.601288   test-rmse:211.436417 
[164]   train-rmse:182.537079   test-rmse:211.452744 
[165]   train-rmse:182.450256   test-rmse:211.454269 
[166]   train-rmse:181.948013   test-rmse:211.230042 
[167]   train-rmse:181.851334   test-rmse:211.228683 
[168]   train-rmse:181.714874   test-rmse:211.252380 
[169]   train-rmse:181.479080   test-rmse:211.199097 
[170]   train-rmse:181.436905   test-rmse:211.207413 
[171]   train-rmse:181.226089   test-rmse:211.150940 
[172]   train-rmse:181.177063   test-rmse:211.160278 
[173]   train-rmse:181.128250   test-rmse:211.170380 
[174]   train-rmse:181.004059   test-rmse:211.133286 
[175]   train-rmse:180.811462   test-rmse:211.102875 
[176]   train-rmse:180.690338   test-rmse:211.124939 
[177]   train-rmse:180.643524   test-rmse:211.124252 
[178]   train-rmse:180.589264   test-rmse:211.131668 
[179]   train-rmse:180.508606   test-rmse:211.145996 
[180]   train-rmse:180.263062   test-rmse:211.060791 
[181]   train-rmse:180.134293   test-rmse:211.067337 
[182]   train-rmse:180.092087   test-rmse:211.066727 
[183]   train-rmse:180.035538   test-rmse:211.079269 
[184]   train-rmse:179.971222   test-rmse:211.079071 
[185]   train-rmse:179.776581   test-rmse:211.026993 
[186]   train-rmse:179.743225   test-rmse:211.025681 
[187]   train-rmse:179.697968   test-rmse:211.023941 
[188]   train-rmse:179.640594   test-rmse:211.012466 
[189]   train-rmse:179.529770   test-rmse:211.024506 
[190]   train-rmse:179.466217   test-rmse:211.038116 
[191]   train-rmse:179.414932   test-rmse:211.049545 
[192]   train-rmse:179.316452   test-rmse:211.048798 
[193]   train-rmse:179.195618   test-rmse:211.051208 
[194]   train-rmse:179.136230   test-rmse:211.069626 
[195]   train-rmse:179.060211   test-rmse:211.076904 
[196]   train-rmse:178.910172   test-rmse:211.085419 
[197]   train-rmse:178.882126   test-rmse:211.096268 
[198]   train-rmse:178.845673   test-rmse:211.097794 
Stopping. Best iteration:
[188]   train-rmse:179.640594   test-rmse:211.012466

7.3.1 Анализ эффективности модели

Анализируем модель по результатам распределения ошибки модели.

# Predict on the test DMatrix and compute the RMSE against the observed durations
pred <- predict(bst,ctest)
RMSE(pred, ts$trip_duration)
[1] 211.0125

8 Вывод

Мы видим, что регрессионная модель, построенная с помощью XGBoost, выдала нам сравнительно лучший результат: RMSE около 211 против около 434 у линейной регрессии.

 

A work by YOUR NAME

YOUREMAIL@gmail.com