На этой странице Вы найдете объяснение линейной регрессии на примере данных о такси Нью-Йорка.
Загружаем библиотеки.
library(data.table) # библиотека для быстрого чтения данных с csv файла
library(lubridate) # библиотека для работы с данными форматаDate
library(dplyr) # библиотека для data engineering
library(ggplot2) # библиотека для построений графиков
library(caret) # библиотека для создания выборок для обучения и теста
library(xgboost) # библиотека для XGBoost
Читаем данные.
df <- fread("../L9Data/train.csv") # fast read
test_final <- fread("../L9Data/test.csv")
subm <- fread("../L9Data/sample_submission.csv")
# Обзор наименование переменных в искомом дата фрейме
colnames(df)
[1] "id" "vendor_id" "pickup_datetime"
[4] "dropoff_datetime" "passenger_count" "pickup_longitude"
[7] "pickup_latitude" "dropoff_longitude" "dropoff_latitude"
[10] "store_and_fwd_flag" "trip_duration"
# Обзор наименование переменных в тест файле
colnames(test_final)
[1] "id" "vendor_id" "pickup_datetime"
[4] "passenger_count" "pickup_longitude" "pickup_latitude"
[7] "dropoff_longitude" "dropoff_latitude" "store_and_fwd_flag"
Обзор формата данных. Здесь показывается как можно конвертировать дату, записанной в формате “character” в дату формата “Date”.
str("2014-01-01")
chr "2014-01-01"
class("2014-01-01")
[1] "character"
str(as.Date("01/01/2011"))
Date[1:1], format: "1-01-20"
str(as.Date("2014-01-01 17:01:01"))
Date[1:1], format: "2014-01-01"
Для работы с данными следует воспользоваться библотекой lubridate. Рассматриваем сдуеющую расшифровку ymd - year month day mdy - month day year dmy - day month year ymd_hms(df$dropoff_datetime) # возвращает дату в формате год-месц-день_часы-минуты-секунды
Посмотрим на формат данных
str(ymd_hms(df$dropoff_datetime))
## POSIXct[1:1458644], format: "2016-03-14 17:32:30" "2016-06-12 00:54:38" ...
table((df\(dropoff_datetime-df\)pickup_datetime) == df$trip_duration)
Условие задачи: Определить время поездки на такси. Зависимая переменная (ЗП) - trip_duration
str(df) # Смотрим на тип данных
Classes 'data.table' and 'data.frame': 1458644 obs. of 11 variables:
$ id : chr "id2875421" "id2377394" "id3858529" "id3504673" ...
$ vendor_id : int 2 1 2 2 2 2 1 2 1 2 ...
$ pickup_datetime : chr "2016-03-14 17:24:55" "2016-06-12 00:43:35" "2016-01-19 11:35:24" "2016-04-06 19:32:31" ...
$ dropoff_datetime : chr "2016-03-14 17:32:30" "2016-06-12 00:54:38" "2016-01-19 12:10:48" "2016-04-06 19:39:40" ...
$ passenger_count : int 1 1 1 1 1 6 4 1 1 1 ...
$ pickup_longitude : num -74 -74 -74 -74 -74 ...
$ pickup_latitude : num 40.8 40.7 40.8 40.7 40.8 ...
$ dropoff_longitude : num -74 -74 -74 -74 -74 ...
$ dropoff_latitude : num 40.8 40.7 40.7 40.7 40.8 ...
$ store_and_fwd_flag: chr "N" "N" "N" "N" ...
$ trip_duration : int 455 663 2124 429 435 443 341 1551 255 1225 ...
- attr(*, ".internal.selfref")=<externalptr>
ymd_hms() # преобразует данные
POSIXct of length 0
df$pickup_datetime <- ymd_hms(df$pickup_datetime)
df$dropoff_datetime <- ymd_hms(df$dropoff_datetime)
str(df) # Проверяем формат данных
Classes 'data.table' and 'data.frame': 1458644 obs. of 11 variables:
$ id : chr "id2875421" "id2377394" "id3858529" "id3504673" ...
$ vendor_id : int 2 1 2 2 2 2 1 2 1 2 ...
$ pickup_datetime : POSIXct, format: "2016-03-14 17:24:55" "2016-06-12 00:43:35" ...
$ dropoff_datetime : POSIXct, format: "2016-03-14 17:32:30" "2016-06-12 00:54:38" ...
$ passenger_count : int 1 1 1 1 1 6 4 1 1 1 ...
$ pickup_longitude : num -74 -74 -74 -74 -74 ...
$ pickup_latitude : num 40.8 40.7 40.8 40.7 40.8 ...
$ dropoff_longitude : num -74 -74 -74 -74 -74 ...
$ dropoff_latitude : num 40.8 40.7 40.7 40.7 40.8 ...
$ store_and_fwd_flag: chr "N" "N" "N" "N" ...
$ trip_duration : int 455 663 2124 429 435 443 341 1551 255 1225 ...
- attr(*, ".internal.selfref")=<externalptr>
df$id <- NULL # Удаляем переменную не значимую для анализа
df$store_and_fwd_flag <- as.factor(df$store_and_fwd_flag) # Преобразуем данные в факторные данные с двумя уровнями: "N" и "Y"
str(df$store_and_fwd_flag)
Factor w/ 2 levels "N","Y": 1 1 1 1 1 1 1 1 1 1 ...
sum(is.na(df)) # Подсчитываем колличество NA значений
[1] 0
boxplot(df$trip_duration) # Строим ящик с усами для того, чтобы увидеть какое распределение зависимой переменной
outliers <- boxplot(df$trip_duration)$out # записываем все выбросы для дальнейшей работы
x = (df[-(which(df$trip_duration %in% outliers)),]) # Удаляем данные из дата фрейма, чьи значения в переменной trip_duration относящийеся к выбросам
nrow(x)/nrow(df) #проверяем соотносимость оставшихся данных от всех изначальных
[1] 0.9491171
boxplot(x$trip_duration) # Строим ящик с усами для того, чтобы увидеть какое распределение данных зависимой переменной в выборке без выбросов
df <- x # Обновляем нашу выборку
Можно долготу и широту сразу преобразовать в расстояние.
Расчленяем данные дат и создаем соответвующие им новые переменные.
df$week <- (lubridate::wday(df$pickup_datetime))
df$hour <- (lubridate::hour(df$pickup_datetime))
df$month <- (lubridate::month(df$pickup_datetime))
df$year <- (lubridate::year(df$pickup_datetime))
df$day <- (lubridate::day(df$pickup_datetime))
# Строим график длительности поездки по дням недели
ggplot(df,aes(y=trip_duration,group=week,fill=as.factor(week)))+
geom_boxplot()
# Убираем ненужные пременные
df$pickup_datetime <- NULL
df$dropoff_datetime <- NULL
# Высчитываем координаты посадки x, y, z относительно ядра Земли по формулам
df$x <- cos(df$pickup_latitude)*cos(df$pickup_longitude)
df$y <- cos(df$pickup_latitude)*sin(df$pickup_longitude)
df$z <- sin(df$pickup_latitude)
# Высчитываем координаты конечного пункта поездки x_, y_, z_ относительно ядра Земли по формулам
df$x_ <- cos(df$dropoff_latitude)*cos(df$dropoff_longitude)
df$y_ <- cos(df$dropoff_latitude)*sin(df$dropoff_longitude)
df$z_ <- sin(df$dropoff_latitude)
attach(df) # Сохраняем название переменных дата фрейма в памяти для упрощенного вызова
# Избавляемся от ненужных перемен
df <- select(df,-c(pickup_latitude,pickup_longitude,dropoff_latitude,dropoff_longitude))
Разделяем данные для обучения (train) и тестирования (test) в соотношении 80% для обучения и 20% для теста.
index <- createDataPartition(df$trip_duration,p=0.8,list = F)
tr <- df[index,]
ts <- df[-index,]
# строим можель линейной зависимости между ЗП и всеми остальными переменными
fit <- lm(trip_duration~.,tr)
summary(fit) # анализируем полученную модель ( 0.06457)
Call:
lm(formula = trip_duration ~ ., data = tr)
Residuals:
Min 1Q Median 3Q Max
-12419.1 -326.6 -84.6 256.3 2800.8
Coefficients: (1 not defined because of singularities)
Estimate Std. Error t value Pr(>|t|)
(Intercept) 2.199e+03 8.268e+01 26.602 <2e-16 ***
vendor_id 1.479e+00 8.657e-01 1.708 0.0877 .
passenger_count 3.900e+00 3.281e-01 11.884 <2e-16 ***
store_and_fwd_flagY 5.794e+01 5.789e+00 10.009 <2e-16 ***
week 7.069e+00 2.061e-01 34.299 <2e-16 ***
hour 2.765e+00 6.401e-02 43.197 <2e-16 ***
month 1.081e+01 2.454e-01 44.065 <2e-16 ***
year NA NA NA NA
day 5.082e-01 4.731e-02 10.741 <2e-16 ***
x 9.865e+02 1.903e+01 51.826 <2e-16 ***
y -5.219e+03 1.391e+02 -37.517 <2e-16 ***
z 1.991e+03 2.665e+01 74.731 <2e-16 ***
x_ 2.699e+03 1.939e+01 139.232 <2e-16 ***
y_ 6.548e+03 1.293e+02 50.643 <2e-16 ***
z_ 9.677e+02 2.283e+01 42.385 <2e-16 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 433.7 on 1107527 degrees of freedom
Multiple R-squared: 0.06384, Adjusted R-squared: 0.06383
F-statistic: 5810 on 13 and 1107527 DF, p-value: < 2.2e-16
plot(fit)
head(tr) # Посмотрим как выглядит выборка
vendor_id passenger_count store_and_fwd_flag trip_duration week hour
1: 1 1 N 663 1 0
2: 2 1 N 429 4 19
3: 2 1 N 1551 7 7
4: 1 1 N 255 6 23
5: 2 1 N 1225 5 21
6: 2 1 N 1274 3 22
month year day x y z x_ y_
1: 6 2016 12 -0.1515976 -0.9831692 0.10196345 -0.17017986 -0.9793288
2: 4 2016 6 -0.1802777 -0.9762141 0.12044069 -0.18214270 -0.9741555
3: 5 2016 21 -0.1412436 -0.9890444 0.04291223 -0.09459509 -0.9922914
4: 5 2016 27 -0.1703111 -0.9800838 0.10212663 -0.15678110 -0.9817457
5: 3 2016 10 -0.1523078 -0.9836385 0.09621643 -0.14487211 -0.9881509
6: 5 2016 10 -0.1541443 -0.9850599 0.07678911 -0.17290369 -0.9790540
z_
1: 0.10933391
2: 0.13358552
3: 0.08005964
4: 0.10768052
5: 0.05069329
6: 0.10750606
prognoz <- predict(fit,ts) # Прогнозируем значения ЗП по построенной модели
Warning in predict.lm(fit, ts): prediction from a rank-deficient fit may be
misleading
RMSE(prognoz,ts$trip_duration)
[1] 433.842
# Анализируем распределение ошибки модели
boxplot(prognoz-ts$trip_duration)
qplot(prognoz-ts$trip_duration)
summary(ts$pred)
Length Class Mode
0 NULL NULL
#Сравниваем распределение значений ЗП
summary(ts$trip_duration)
Min. 1st Qu. Median Mean 3rd Qu. Max.
1.0 384.0 632.0 731.3 991.0 2092.0
Cтроим альтернативную модель для того, чтобы выбрать наиболее эффективную модель. Альтернативная модель строится на основе зависимости ЗП взятой в логарифм от всех остальных переменных.
# Logged dataset
tr_log <- tr
tr_log$trip_duration <- log(tr_log$trip_duration)
fit_log <- lm(trip_duration~.,tr_log)
summary(fit_log)
Call:
lm(formula = trip_duration ~ ., data = tr_log)
Residuals:
Min 1Q Median 3Q Max
-12.9034 -0.4126 0.0763 0.5065 6.5977
Coefficients: (1 not defined because of singularities)
Estimate Std. Error t value Pr(>|t|)
(Intercept) 5.875e+00 1.350e-01 43.526 < 2e-16 ***
vendor_id 9.234e-03 1.413e-03 6.534 6.42e-11 ***
passenger_count 7.909e-03 5.357e-04 14.764 < 2e-16 ***
store_and_fwd_flagY 4.391e-02 9.450e-03 4.646 3.38e-06 ***
week 1.035e-02 3.365e-04 30.776 < 2e-16 ***
hour 5.064e-03 1.045e-04 48.467 < 2e-16 ***
month 1.399e-02 4.006e-04 34.924 < 2e-16 ***
year NA NA NA NA
day 7.473e-04 7.723e-05 9.676 < 2e-16 ***
x 2.225e-01 3.107e-02 7.159 8.11e-13 ***
y -7.273e+00 2.271e-01 -32.026 < 2e-16 ***
z 2.764e+00 4.350e-02 63.529 < 2e-16 ***
x_ 2.881e+00 3.165e-02 91.041 < 2e-16 ***
y_ 6.852e+00 2.111e-01 32.461 < 2e-16 ***
z_ 1.183e+00 3.727e-02 31.742 < 2e-16 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 0.708 on 1107527 degrees of freedom
Multiple R-squared: 0.02936, Adjusted R-squared: 0.02935
F-statistic: 2577 on 13 and 1107527 DF, p-value: < 2.2e-16
# Высчитываем прогнозируемые данные беря экспоненту прогноза
prognoz <- exp(predict(fit_log,ts))
Warning in predict.lm(fit_log, ts): prediction from a rank-deficient fit
may be misleading
ts$pred <- prognoz
# Смотрим распределение данных
summary(ts$pred)
Min. 1st Qu. Median Mean 3rd Qu. Max.
7 554 582 701 615 29024200
# избавляемся от выбросов (в реальном тесте так сделать не получиться)
ts <- ts[ts$pred<3.368e+09,]
# сравниваем распределение ЗП и прогноза
summary(ts$pred)
Min. 1st Qu. Median Mean 3rd Qu. Max.
7 554 582 701 615 29024200
summary(ts$trip_duration)
Min. 1st Qu. Median Mean 3rd Qu. Max.
1.0 384.0 632.0 731.3 991.0 2092.0
Анализиируем модель по результатом рапределения ошибки модели.
RMSE(ts$pred,ts$trip_duration)
[1] 55157.72
boxplot(ts$pred-ts$trip_duration)
qplot(ts$pred-ts$trip_duration)
summary(ts$pred-ts$trip_duration)
Min. 1st Qu. Median Mean 3rd Qu. Max.
-1688 -394 -46 -31 196 29022795
Создаем sparse matrix, где будут значения наших предикторов, и отдельный вектор для зависимой прееменной. Далее сохраняем матрицу и вектор в объекте dtrain и ctest для тренировочной и тестовой выборки соответвенно.
ts$pred <- NULL
tr_matrix <- data.matrix(select(tr,-c(trip_duration)))
ts_matrix <- data.matrix(select(ts,-c(trip_duration)))
train_target <- tr$trip_duration
test_target <- ts$trip_duration
dtrain <- xgb.DMatrix(data=tr_matrix,label=train_target)
ctest <- xgb.DMatrix(data=ts_matrix,label=test_target)
Создаем watchlist для просмотра промежуточных результатов.
watchlist = list(train=dtrain,test=ctest)
Cтроим модель со следующими параметрами:
bst <- xgb.train(data=dtrain,
booster="gbtree",
watchlist = watchlist,
nrounds = 30000,
objective = "reg:linear",
eval_metric="rmse",
maximize = F,
early_stopping_rounds = 10,
max_depth=10,
subsample=0.7,
colsample_bytree=0.7,
lambda=0.01,
alpha=0.0001
)
[1] train-rmse:648.846191 test-rmse:648.991577
Multiple eval metrics are present. Will use test_rmse for early stopping.
Will train until test_rmse hasn't improved in 10 rounds.
[2] train-rmse:510.795135 test-rmse:511.477814
[3] train-rmse:419.720764 test-rmse:421.043243
[4] train-rmse:368.974274 test-rmse:370.904144
[5] train-rmse:335.429901 test-rmse:337.635223
[6] train-rmse:315.083679 test-rmse:317.761658
[7] train-rmse:299.923187 test-rmse:303.028046
[8] train-rmse:290.015869 test-rmse:293.650391
[9] train-rmse:278.564789 test-rmse:282.470581
[10] train-rmse:273.517792 test-rmse:277.904388
[11] train-rmse:268.637421 test-rmse:273.369995
[12] train-rmse:262.466827 test-rmse:267.617920
[13] train-rmse:251.234955 test-rmse:256.794495
[14] train-rmse:248.161423 test-rmse:254.101349
[15] train-rmse:246.309052 test-rmse:252.572174
[16] train-rmse:240.802231 test-rmse:247.456406
[17] train-rmse:238.937027 test-rmse:246.033478
[18] train-rmse:237.901657 test-rmse:245.347702
[19] train-rmse:236.876312 test-rmse:244.565994
[20] train-rmse:235.539413 test-rmse:243.581528
[21] train-rmse:234.400269 test-rmse:242.763763
[22] train-rmse:232.871841 test-rmse:241.503098
[23] train-rmse:226.248779 test-rmse:235.161133
[24] train-rmse:225.527283 test-rmse:234.676910
[25] train-rmse:224.286819 test-rmse:233.755859
[26] train-rmse:223.803345 test-rmse:233.443604
[27] train-rmse:218.863251 test-rmse:228.860992
[28] train-rmse:218.538589 test-rmse:228.645599
[29] train-rmse:217.906219 test-rmse:228.249237
[30] train-rmse:216.568207 test-rmse:227.265076
[31] train-rmse:215.297516 test-rmse:226.160889
[32] train-rmse:214.545517 test-rmse:225.613251
[33] train-rmse:213.554153 test-rmse:224.887695
[34] train-rmse:212.275131 test-rmse:223.876526
[35] train-rmse:210.652161 test-rmse:222.581512
[36] train-rmse:209.994537 test-rmse:222.235413
[37] train-rmse:209.440536 test-rmse:221.954834
[38] train-rmse:208.934372 test-rmse:221.710892
[39] train-rmse:207.109543 test-rmse:220.239319
[40] train-rmse:206.294022 test-rmse:219.669128
[41] train-rmse:205.791931 test-rmse:219.386353
[42] train-rmse:204.528549 test-rmse:218.466415
[43] train-rmse:204.358536 test-rmse:218.423752
[44] train-rmse:203.836914 test-rmse:218.073959
[45] train-rmse:203.776245 test-rmse:218.062836
[46] train-rmse:203.200577 test-rmse:217.630157
[47] train-rmse:202.729477 test-rmse:217.405060
[48] train-rmse:202.415009 test-rmse:217.273468
[49] train-rmse:201.966400 test-rmse:217.070969
[50] train-rmse:201.780548 test-rmse:216.986160
[51] train-rmse:201.267746 test-rmse:216.632309
[52] train-rmse:200.880234 test-rmse:216.497711
[53] train-rmse:200.710785 test-rmse:216.443451
[54] train-rmse:199.607574 test-rmse:215.645554
[55] train-rmse:198.969040 test-rmse:215.178711
[56] train-rmse:198.708344 test-rmse:215.118958
[57] train-rmse:198.564697 test-rmse:215.086319
[58] train-rmse:198.472610 test-rmse:215.076416
[59] train-rmse:198.304596 test-rmse:215.063568
[60] train-rmse:198.227631 test-rmse:215.076416
[61] train-rmse:197.997971 test-rmse:215.041290
[62] train-rmse:197.770752 test-rmse:214.999756
[63] train-rmse:197.574585 test-rmse:214.964172
[64] train-rmse:197.189682 test-rmse:214.749619
[65] train-rmse:196.504532 test-rmse:214.304794
[66] train-rmse:196.218689 test-rmse:214.178986
[67] train-rmse:196.138138 test-rmse:214.166168
[68] train-rmse:196.046616 test-rmse:214.130096
[69] train-rmse:195.959412 test-rmse:214.122147
[70] train-rmse:195.649796 test-rmse:214.005066
[71] train-rmse:195.461136 test-rmse:213.989594
[72] train-rmse:195.027969 test-rmse:213.819534
[73] train-rmse:194.922897 test-rmse:213.828552
[74] train-rmse:194.658691 test-rmse:213.725479
[75] train-rmse:194.449829 test-rmse:213.670944
[76] train-rmse:194.128647 test-rmse:213.572525
[77] train-rmse:194.061523 test-rmse:213.566467
[78] train-rmse:193.774338 test-rmse:213.515778
[79] train-rmse:193.666306 test-rmse:213.515305
[80] train-rmse:193.239639 test-rmse:213.289642
[81] train-rmse:193.100266 test-rmse:213.258392
[82] train-rmse:192.562088 test-rmse:213.036301
[83] train-rmse:192.282623 test-rmse:212.884796
[84] train-rmse:192.193878 test-rmse:212.874939
[85] train-rmse:192.001740 test-rmse:212.836304
[86] train-rmse:191.804245 test-rmse:212.805344
[87] train-rmse:191.703735 test-rmse:212.794907
[88] train-rmse:191.584915 test-rmse:212.769516
[89] train-rmse:191.513000 test-rmse:212.766632
[90] train-rmse:191.428543 test-rmse:212.766266
[91] train-rmse:191.321106 test-rmse:212.753922
[92] train-rmse:191.226486 test-rmse:212.757034
[93] train-rmse:191.091644 test-rmse:212.755325
[94] train-rmse:191.009811 test-rmse:212.750381
[95] train-rmse:190.794464 test-rmse:212.721161
[96] train-rmse:190.698059 test-rmse:212.687027
[97] train-rmse:190.303650 test-rmse:212.516022
[98] train-rmse:190.217896 test-rmse:212.520050
[99] train-rmse:190.082001 test-rmse:212.506729
[100] train-rmse:189.867874 test-rmse:212.449020
[101] train-rmse:189.788574 test-rmse:212.435669
[102] train-rmse:189.777267 test-rmse:212.437973
[103] train-rmse:189.658661 test-rmse:212.425552
[104] train-rmse:189.165955 test-rmse:212.188904
[105] train-rmse:188.956879 test-rmse:212.133057
[106] train-rmse:188.754852 test-rmse:212.119583
[107] train-rmse:188.588058 test-rmse:212.112686
[108] train-rmse:188.394211 test-rmse:212.051544
[109] train-rmse:188.300430 test-rmse:212.030228
[110] train-rmse:188.241013 test-rmse:211.997345
[111] train-rmse:188.117020 test-rmse:211.964783
[112] train-rmse:188.053986 test-rmse:211.963715
[113] train-rmse:187.872864 test-rmse:211.934814
[114] train-rmse:187.802032 test-rmse:211.928970
[115] train-rmse:187.766785 test-rmse:211.925919
[116] train-rmse:187.631378 test-rmse:211.935730
[117] train-rmse:187.527481 test-rmse:211.946915
[118] train-rmse:187.462189 test-rmse:211.937546
[119] train-rmse:187.399841 test-rmse:211.937439
[120] train-rmse:187.155899 test-rmse:211.897537
[121] train-rmse:186.993393 test-rmse:211.874283
[122] train-rmse:186.831314 test-rmse:211.852509
[123] train-rmse:186.751572 test-rmse:211.838379
[124] train-rmse:186.704926 test-rmse:211.846573
[125] train-rmse:186.620865 test-rmse:211.839630
[126] train-rmse:186.582382 test-rmse:211.837723
[127] train-rmse:186.431427 test-rmse:211.831619
[128] train-rmse:186.108932 test-rmse:211.760727
[129] train-rmse:185.900757 test-rmse:211.696503
[130] train-rmse:185.781448 test-rmse:211.718796
[131] train-rmse:185.759018 test-rmse:211.722672
[132] train-rmse:185.693024 test-rmse:211.732452
[133] train-rmse:185.625931 test-rmse:211.732101
[134] train-rmse:185.393265 test-rmse:211.693909
[135] train-rmse:185.281357 test-rmse:211.677307
[136] train-rmse:185.125107 test-rmse:211.681335
[137] train-rmse:184.875061 test-rmse:211.558487
[138] train-rmse:184.800613 test-rmse:211.542831
[139] train-rmse:184.757370 test-rmse:211.553925
[140] train-rmse:184.707458 test-rmse:211.545334
[141] train-rmse:184.654068 test-rmse:211.552536
[142] train-rmse:184.415497 test-rmse:211.495163
[143] train-rmse:184.371582 test-rmse:211.497787
[144] train-rmse:184.294785 test-rmse:211.492081
[145] train-rmse:184.084885 test-rmse:211.475815
[146] train-rmse:184.052795 test-rmse:211.480988
[147] train-rmse:183.981293 test-rmse:211.495651
[148] train-rmse:183.842880 test-rmse:211.480301
[149] train-rmse:183.806656 test-rmse:211.480453
[150] train-rmse:183.645233 test-rmse:211.444977
[151] train-rmse:183.572006 test-rmse:211.447418
[152] train-rmse:183.509079 test-rmse:211.462723
[153] train-rmse:183.460831 test-rmse:211.461517
[154] train-rmse:183.424927 test-rmse:211.466415
[155] train-rmse:183.335312 test-rmse:211.484100
[156] train-rmse:183.261749 test-rmse:211.501465
[157] train-rmse:183.082642 test-rmse:211.465622
[158] train-rmse:182.969849 test-rmse:211.456848
[159] train-rmse:182.894669 test-rmse:211.429001
[160] train-rmse:182.752518 test-rmse:211.447342
[161] train-rmse:182.736801 test-rmse:211.444962
[162] train-rmse:182.670441 test-rmse:211.446930
[163] train-rmse:182.601288 test-rmse:211.436417
[164] train-rmse:182.537079 test-rmse:211.452744
[165] train-rmse:182.450256 test-rmse:211.454269
[166] train-rmse:181.948013 test-rmse:211.230042
[167] train-rmse:181.851334 test-rmse:211.228683
[168] train-rmse:181.714874 test-rmse:211.252380
[169] train-rmse:181.479080 test-rmse:211.199097
[170] train-rmse:181.436905 test-rmse:211.207413
[171] train-rmse:181.226089 test-rmse:211.150940
[172] train-rmse:181.177063 test-rmse:211.160278
[173] train-rmse:181.128250 test-rmse:211.170380
[174] train-rmse:181.004059 test-rmse:211.133286
[175] train-rmse:180.811462 test-rmse:211.102875
[176] train-rmse:180.690338 test-rmse:211.124939
[177] train-rmse:180.643524 test-rmse:211.124252
[178] train-rmse:180.589264 test-rmse:211.131668
[179] train-rmse:180.508606 test-rmse:211.145996
[180] train-rmse:180.263062 test-rmse:211.060791
[181] train-rmse:180.134293 test-rmse:211.067337
[182] train-rmse:180.092087 test-rmse:211.066727
[183] train-rmse:180.035538 test-rmse:211.079269
[184] train-rmse:179.971222 test-rmse:211.079071
[185] train-rmse:179.776581 test-rmse:211.026993
[186] train-rmse:179.743225 test-rmse:211.025681
[187] train-rmse:179.697968 test-rmse:211.023941
[188] train-rmse:179.640594 test-rmse:211.012466
[189] train-rmse:179.529770 test-rmse:211.024506
[190] train-rmse:179.466217 test-rmse:211.038116
[191] train-rmse:179.414932 test-rmse:211.049545
[192] train-rmse:179.316452 test-rmse:211.048798
[193] train-rmse:179.195618 test-rmse:211.051208
[194] train-rmse:179.136230 test-rmse:211.069626
[195] train-rmse:179.060211 test-rmse:211.076904
[196] train-rmse:178.910172 test-rmse:211.085419
[197] train-rmse:178.882126 test-rmse:211.096268
[198] train-rmse:178.845673 test-rmse:211.097794
Stopping. Best iteration:
[188] train-rmse:179.640594 test-rmse:211.012466
Анализиируем модель по результатом рапределения ошибки модели.
pred <- predict(bst,ctest)
RMSE(pred, ts$trip_duration)
[1] 211.0125
Мы видим, что модель линейной регрессии построенной на xgboost выдала нам сравнительно лучший результат.
A work by YOUR NAME
YOUREMAIL@gmail.com