## # A tibble: 16 × 8
## variable type na na_pct unique min mean max
## <chr> <chr> <int> <dbl> <int> <dbl> <dbl> <dbl>
## 1 PassengerId chr 0 0 8693 NA NA NA
## 2 HomePlanet chr 0 0 3 NA NA NA
## 3 CryoSleep lgl 0 0 2 0 0.35 1
## 4 Destination chr 0 0 4 NA NA NA
## 5 Age dbl 0 0 90 0 28.8 79
## 6 VIP lgl 0 0 2 0 0.02 1
## 7 RoomService dbl 0 0 1273 0 220. 14327
## 8 FoodCourt dbl 0 0 1507 0 448. 29813
## 9 ShoppingMall dbl 0 0 1115 0 170. 23492
## 10 Spa dbl 0 0 1327 0 305. 22408
## 11 VRDeck dbl 0 0 1306 0 298. 24133
## 12 Transported lgl 0 0 2 0 0.5 1
## 13 withgroup dbl 0 0 2 0 0.45 1
## 14 deck chr 0 0 8 NA NA NA
## 15 side chr 0 0 2 NA NA NA
## 16 expense dbl 0 0 2336 0 1441. 35987
## # A tibble: 15 × 8
## variable type na na_pct unique min mean max
## <chr> <chr> <int> <dbl> <int> <dbl> <dbl> <dbl>
## 1 PassengerId chr 0 0 4277 NA NA NA
## 2 HomePlanet chr 0 0 3 NA NA NA
## 3 CryoSleep lgl 0 0 2 0 0.36 1
## 4 Destination chr 0 0 4 NA NA NA
## 5 Age dbl 0 0 89 0 28.7 79
## 6 VIP lgl 0 0 2 0 0.02 1
## 7 RoomService dbl 0 0 842 0 215. 11567
## 8 FoodCourt dbl 0 0 902 0 429. 25273
## 9 ShoppingMall dbl 0 0 715 0 173. 8292
## 10 Spa dbl 0 0 833 0 296. 19844
## 11 VRDeck dbl 0 0 796 0 305. 22272
## 12 withgroup dbl 0 0 2 0 0.45 1
## 13 deck chr 0 0 8 NA NA NA
## 14 side chr 0 0 2 NA NA NA
## 15 expense dbl 0 0 1437 0 1418. 33666
train_c$HomePlanet <- as.factor(train_c$HomePlanet)
train_c$Destination <- as.factor(train_c$Destination)
train_c$deck <- as.factor(train_c$deck)
train_c$side <- as.factor(train_c$side)
test_c$HomePlanet <- as.factor(test_c$HomePlanet)
test_c$Destination <- as.factor(test_c$Destination)
test_c$deck <- as.factor(test_c$deck)
test_c$side <- as.factor(test_c$side)
## | | | 0% | |. | 2% | |.. | 5% [global_options] | |... | 7% | |.... | 10% [introduce] | |.... | 12% | |..... | 14% [plot_intro] | |...... | 17% | |....... | 19% [data_structure] | |........ | 21% | |......... | 24% [missing_profile] | |.......... | 26% | |........... | 29% [univariate_distribution_header] | |........... | 31% | |............ | 33% [plot_histogram] | |............. | 36% | |.............. | 38% [plot_density] | |............... | 40% | |................ | 43% [plot_frequency_bar] | |................. | 45% | |.................. | 48% [plot_response_bar] | |.................. | 50% | |................... | 52% [plot_with_bar] | |.................... | 55% | |..................... | 57% [plot_normal_qq] | |...................... | 60% | |....................... | 62% [plot_response_qq] | |........................ | 64% | |......................... | 67% [plot_by_qq] | |.......................... | 69% | |.......................... | 71% [correlation_analysis] | |........................... | 74% | |............................ | 76% [principal_component_analysis] | |............................. | 79% | |.............................. | 81% [bivariate_distribution_header] | |............................... | 83% | |................................ | 86% [plot_response_boxplot] | |................................. | 88% | |................................. | 90% [plot_by_boxplot] | |.................................. | 93% | |................................... | 95% [plot_response_scatterplot] | |.................................... | 98% | |.....................................| 100% [plot_by_scatterplot]
## "C:/Program Files/RStudio/resources/app/bin/quarto/bin/tools/pandoc" +RTS -K512m -RTS "C:\Users\User\OneDrive\AD0F~1\EKONOM~2\REPORT~1.MD" --to html4 --from markdown+autolink_bare_uris+tex_math_single_backslash --output pandoc41cc56aa574d.html --lua-filter "C:\Users\User\AppData\Local\R\win-library\4.3\rmarkdown\rmarkdown\lua\pagebreak.lua" --lua-filter "C:\Users\User\AppData\Local\R\win-library\4.3\rmarkdown\rmarkdown\lua\latex-div.lua" --embed-resources --standalone --variable bs3=TRUE --section-divs --table-of-contents --toc-depth 6 --template "C:\Users\User\AppData\Local\R\win-library\4.3\rmarkdown\rmd\h\default.html" --no-highlight --variable highlightjs=1 --variable theme=yeti --mathjax --variable "mathjax-url=https://mathjax.rstudio.com/latest/MathJax.js?config=TeX-AMS-MML_HTMLorMML" --include-in-header "C:\Users\User\AppData\Local\Temp\RtmpG682s0\rmarkdown-str41cc6fa665cc.html"
##
## Call:
## lm(formula = Transported ~ ., data = train_c[, 2:16])
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.42802 -0.30952 -0.03188 0.28870 1.79172
##
## Coefficients: (1 not defined because of singularities)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.147e-01 4.000e-02 7.866 4.09e-15 ***
## HomePlanetEuropa 2.150e-01 2.883e-02 7.457 9.72e-14 ***
## HomePlanetMars 9.617e-02 1.475e-02 6.522 7.33e-11 ***
## CryoSleepTRUE 3.811e-01 1.152e-02 33.082 < 2e-16 ***
## DestinationPSO J318.5-22 -4.449e-02 1.803e-02 -2.468 0.013598 *
## DestinationTRAPPIST-1e -5.140e-02 1.133e-02 -4.538 5.76e-06 ***
## DestinationTRAPPIST-le 7.513e-03 3.142e-02 0.239 0.811033
## Age -2.182e-03 3.201e-04 -6.815 1.00e-11 ***
## VIPTRUE -3.757e-02 2.974e-02 -1.263 0.206555
## RoomService -1.176e-04 7.055e-06 -16.674 < 2e-16 ***
## FoodCourt 4.281e-05 3.054e-06 14.017 < 2e-16 ***
## ShoppingMall 7.969e-05 7.457e-06 10.686 < 2e-16 ***
## Spa -8.675e-05 4.109e-06 -21.110 < 2e-16 ***
## VRDeck -8.280e-05 4.115e-06 -20.120 < 2e-16 ***
## withgroup 1.958e-02 9.293e-03 2.107 0.035134 *
## deckB 1.107e-01 2.874e-02 3.852 0.000118 ***
## deckC 1.477e-01 2.900e-02 5.094 3.58e-07 ***
## deckD 5.090e-02 3.478e-02 1.463 0.143372
## deckE 8.494e-03 3.622e-02 0.235 0.814567
## deckF 1.031e-01 3.709e-02 2.778 0.005475 **
## deckG 5.791e-02 3.866e-02 1.498 0.134218
## deckT 6.610e-02 1.815e-01 0.364 0.715784
## sideS 8.542e-02 8.629e-03 9.899 < 2e-16 ***
## expense NA NA NA NA
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.4012 on 8670 degrees of freedom
## Multiple R-squared: 0.3578, Adjusted R-squared: 0.3561
## F-statistic: 219.5 on 22 and 8670 DF, p-value: < 2.2e-16
library(caTools)
set.seed(123)
split = sample.split(train_c$Transported, SplitRatio = 0.75)
train_train = subset(train_c, split == TRUE)
train_test = subset(train_c, split == FALSE)
##
## Call:
## lm(formula = Transported ~ ., data = train_train[, -c(1)])
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.50155 -0.31213 -0.03018 0.29171 1.76723
##
## Coefficients: (1 not defined because of singularities)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.482e-01 4.626e-02 7.527 5.89e-14 ***
## HomePlanetEuropa 1.971e-01 3.358e-02 5.871 4.55e-09 ***
## HomePlanetMars 8.600e-02 1.699e-02 5.061 4.29e-07 ***
## CryoSleepTRUE 3.772e-01 1.328e-02 28.402 < 2e-16 ***
## DestinationPSO J318.5-22 -6.107e-02 2.077e-02 -2.940 0.00330 **
## DestinationTRAPPIST-1e -5.426e-02 1.309e-02 -4.144 3.46e-05 ***
## DestinationTRAPPIST-le -9.053e-03 3.649e-02 -0.248 0.80407
## Age -2.339e-03 3.725e-04 -6.279 3.63e-10 ***
## VIPTRUE -1.616e-02 3.384e-02 -0.478 0.63290
## RoomService -1.175e-04 7.790e-06 -15.088 < 2e-16 ***
## FoodCourt 3.905e-05 3.467e-06 11.262 < 2e-16 ***
## ShoppingMall 8.282e-05 8.415e-06 9.842 < 2e-16 ***
## Spa -8.543e-05 4.704e-06 -18.160 < 2e-16 ***
## VRDeck -8.216e-05 4.777e-06 -17.197 < 2e-16 ***
## withgroup 1.601e-02 1.076e-02 1.488 0.13688
## deckB 1.049e-01 3.290e-02 3.188 0.00144 **
## deckC 1.441e-01 3.318e-02 4.343 1.43e-05 ***
## deckD 3.223e-02 3.974e-02 0.811 0.41744
## deckE -2.463e-02 4.213e-02 -0.585 0.55884
## deckF 8.346e-02 4.290e-02 1.945 0.05176 .
## deckG 4.007e-02 4.472e-02 0.896 0.37029
## deckT 5.680e-02 1.822e-01 0.312 0.75526
## sideS 8.895e-02 9.984e-03 8.909 < 2e-16 ***
## expense NA NA NA NA
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.4013 on 6497 degrees of freedom
## Multiple R-squared: 0.358, Adjusted R-squared: 0.3558
## F-statistic: 164.7 on 22 and 6497 DF, p-value: < 2.2e-16
## reg_transported_tahmin
## transported_gercek 0 1
## 0 902 177
## 1 323 771
## [1] 0.7699034
## reg_transported_tahmin
## transported_gercek 0 1
## 0 2682 554
## 1 946 2338
## [1] 0.7699387
modellog = lm(Transported ~ 1 + HomePlanet + Destination + deck + side + CryoSleep + Age + RoomService + FoodCourt + ShoppingMall + Spa + VRDeck, data = train_c_log)
## 1 2 3 4 5 6
## TRUE FALSE TRUE TRUE TRUE FALSE
model = lm(y ~ x1 + x2 + x3, data = df)
##
## Call:
## glm(formula = Transported ~ ., family = binomial, data = train_train[,
## -c(1)])
##
## Coefficients: (1 not defined because of singularities)
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -4.607e-01 3.588e-01 -1.284 0.19917
## HomePlanetEuropa 1.556e+00 2.674e-01 5.819 5.91e-09 ***
## HomePlanetMars 5.323e-01 1.103e-01 4.827 1.39e-06 ***
## CryoSleepTRUE 1.285e+00 9.185e-02 13.987 < 2e-16 ***
## DestinationPSO J318.5-22 -5.125e-01 1.297e-01 -3.950 7.80e-05 ***
## DestinationTRAPPIST-1e -4.904e-01 9.441e-02 -5.194 2.05e-07 ***
## DestinationTRAPPIST-le -1.471e-01 2.436e-01 -0.604 0.54580
## Age -8.481e-03 2.461e-03 -3.446 0.00057 ***
## VIPTRUE -1.650e-01 2.916e-01 -0.566 0.57149
## RoomService -1.730e-03 1.136e-04 -15.225 < 2e-16 ***
## FoodCourt 4.581e-04 4.460e-05 10.271 < 2e-16 ***
## ShoppingMall 5.289e-04 7.559e-05 6.997 2.61e-12 ***
## Spa -1.969e-03 1.164e-04 -16.913 < 2e-16 ***
## VRDeck -1.902e-03 1.161e-04 -16.392 < 2e-16 ***
## withgroup 1.206e-01 7.192e-02 1.677 0.09346 .
## deckB 1.153e+00 2.912e-01 3.961 7.47e-05 ***
## deckC 2.481e+00 3.325e-01 7.463 8.48e-14 ***
## deckD 6.978e-01 3.262e-01 2.139 0.03242 *
## deckE 1.029e-01 3.378e-01 0.305 0.76065
## deckF 7.739e-01 3.420e-01 2.263 0.02363 *
## deckG 3.766e-01 3.511e-01 1.073 0.28345
## deckT -1.184e-01 1.866e+00 -0.063 0.94942
## sideS 5.988e-01 6.663e-02 8.987 < 2e-16 ***
## expense NA NA NA NA
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 9038.3 on 6519 degrees of freedom
## Residual deviance: 5600.2 on 6497 degrees of freedom
## AIC: 5646.2
##
## Number of Fisher Scoring iterations: 7
## 1 2 3 4 5 6
## -1.1175750 -11.0810548 0.9556805 -1.6375160 -0.4294310 -1.6887335
## 1 2 3 4 5 6
## 0 0 1 0 0 0
result$Transported = as.factor(result$Transported)
result$logistic_transported_tahmin = as.factor(result$logistic_transported_tahmin)
## # A tibble: 1 × 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 accuracy binary 0.786
## Truth
## Prediction 0 1
## 0 918 305
## 1 161 789
## logistic_transported_tahmin
## transported_gercek 0 1
## 0 918 161
## 1 305 789
## [1] 0.7855499
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 918 161
## 1 305 789
##
## Accuracy : 0.7855
## 95% CI : (0.7677, 0.8026)
## No Information Rate : 0.5628
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.5715
##
## Mcnemar's Test P-Value : 3.488e-11
##
## Sensitivity : 0.7506
## Specificity : 0.8305
## Pos Pred Value : 0.8508
## Neg Pred Value : 0.7212
## Prevalence : 0.5628
## Detection Rate : 0.4225
## Detection Prevalence : 0.4965
## Balanced Accuracy : 0.7906
##
## 'Positive' Class : 0
##
## [1] 0.7855499
##
## Naive Bayes Classifier for Discrete Predictors
##
## Call:
## naiveBayes.default(x = X, y = Y, laplace = laplace)
##
## A-priori probabilities:
## Y
## FALSE TRUE
## 0.496319 0.503681
##
## Conditional probabilities:
## HomePlanet
## Y Earth Europa Mars
## FALSE 0.6236094 0.1739802 0.2024104
## TRUE 0.4631547 0.3279537 0.2088916
##
## CryoSleep
## Y FALSE TRUE
## FALSE 0.8702101 0.1297899
## TRUE 0.4336175 0.5663825
##
## Destination
## Y 55 Cancri e PSO J318.5-22 TRAPPIST-1e TRAPPIST-le
## FALSE 0.16069221 0.09363412 0.72404203 0.02163164
## TRUE 0.24969549 0.09043849 0.64007308 0.01979294
##
## Age
## Y [,1] [,2]
## FALSE 30.01092 13.45222
## TRUE 27.85527 14.87646
##
## VIP
## Y FALSE TRUE
## FALSE 0.97126082 0.02873918
## TRUE 0.98142509 0.01857491
##
## RoomService
## Y [,1] [,2]
## FALSE 402.20365 916.6547
## TRUE 56.93484 246.8105
##
## FoodCourt
## Y [,1] [,2]
## FALSE 396.2923 1258.123
## TRUE 514.8018 1918.620
##
## ShoppingMall
## Y [,1] [,2]
## FALSE 160.2250 432.1103
## TRUE 182.7135 748.5867
##
## Spa
## Y [,1] [,2]
## FALSE 561.16193 1554.2636
## TRUE 61.11571 264.0556
##
## VRDeck
## Y [,1] [,2]
## FALSE 533.98733 1542.8703
## TRUE 69.07186 295.0542
##
## withgroup
## Y [,1] [,2]
## FALSE 0.3952410 0.4889780
## TRUE 0.4960414 0.5000605
##
## deck
## Y A B C D E
## FALSE 0.0296662546 0.0540791100 0.0574783684 0.0661310260 0.1344252163
## TRUE 0.0310596833 0.1385505481 0.1199756395 0.0490255786 0.0676004872
## deck
## Y F G T
## FALSE 0.3637206428 0.2932632880 0.0012360939
## TRUE 0.2828867235 0.3105968331 0.0003045067
##
## side
## Y P S
## FALSE 0.5571693 0.4428307
## TRUE 0.4448843 0.5551157
##
## expense
## Y [,1] [,2]
## FALSE 2053.8702 3224.219
## TRUE 884.6376 2307.962
## FALSE. TRUE.
## 1 0.22238032 7.776197e-01
## 2 1.00000000 8.663595e-135
## 3 0.02729931 9.727007e-01
## 4 0.59276719 4.072328e-01
## 5 0.02410104 9.758990e-01
## 6 0.38468138 6.153186e-01
## [1] 1 0 1 0 1 1
## Transported
## [1,] 1
## [2,] 0
## [3,] 1
## [4,] 0
## [5,] 0
## [6,] 0
## Transported_pred_nb
## Transported_train_test 0 1
## 0 480 599
## 1 76 1018
## [1] 0.6893695
Transported = as.character(Transported_pred_nb)
PassengerId = test_c$PassengerId
Transported = as.vector(Transported)
sample_submission = cbind(PassengerId, Transported)
sample_submission = as.data.frame(sample_submission)
sample_submission$Transported = str_to_title(sample_submission$Transported)
write.csv(sample_submission, "sub_nb.csv", row.names = FALSE, quote = FALSE)
fit_svm = svm(Transported ~ ., data = train_train[, -1],
type= 'C-classification',
kernel = 'linear')
preds = predict(fit_svm, newdata = train_test[, -c(1, 12)], type = "raw") %>%
data.frame()
## .
## 1 FALSE
## 2 FALSE
## 3 TRUE
## 4 FALSE
## 5 FALSE
## 6 FALSE
Transported_pred_svm = ifelse(preds$. == TRUE, 1, 0)
cm = table(Transported_train_test, Transported_pred_svm)
cm
## Transported_pred_svm
## Transported_train_test 0 1
## 0 836 243
## 1 180 914
## [1] 0.8053382
fit_svm = svm(Transported ~ ., data = train_c[, -1],
type= 'C-classification',
kernel = 'linear')
preds = predict(fit_svm, newdata = test_c, type ="raw")%>%
data.frame()
## .
## 1 TRUE
## 2 FALSE
## 3 TRUE
## 4 TRUE
## 5 TRUE
## 6 TRUE
Transported = as.character(Transported_pred_svm)
PassengerId = test_c$PassengerId
Transported = as.vector(Transported)
sample_submission = cbind(PassengerId, Transported)
sample_submission = as.data.frame(sample_submission)
sample_submission$Transported = str_to_title(sample_submission$Transported)
write.csv(sample_submission, "sub_svm.csv", row.names = FALSE, quote = FALSE)
fit_svm = svm(Transported ~ ., data = train_train[, -1],
type= 'C-classification',
kernel = 'radial')
preds = predict(fit_svm, newdata = train_test[, -c(1, 12)], type ="raw")%>%
data.frame()
Transported_pred_svm = ifelse(preds$. == TRUE, 1, 0)
cm = table(Transported_train_test, Transported_pred_svm)
cm
## Transported_pred_svm
## Transported_train_test 0 1
## 0 849 230
## 1 202 892
## [1] 0.8011965
fit_svm = svm(Transported ~ ., data = train_c[, -1],
type= 'C-classification',
kernel = 'radial')
preds = predict(fit_svm, newdata = test_c, type ="raw")%>%
data.frame()
Transported = as.character(Transported_pred_svm)
PassengerId = test_c$PassengerId
Transported = as.vector(Transported)
sample_submission = cbind(PassengerId, Transported)
sample_submission = as.data.frame(sample_submission)
sample_submission$Transported = str_to_title(sample_submission$Transported)
write.csv(sample_submission, "sub_svm_radial.csv", row.names = FALSE, quote = FALSE)
P = ggplot(train_train, aes(x=HomePlanet, y=deck, color=factor(Transported))) +
geom_point(aes(shape=factor(Transported)), size=3) +
scale_color_viridis_d() + labs(title = "", x="HomePlanet", y="deck") + theme_minimal() + theme(legend.position ="top")
P
## Call:
## rpart::rpart(formula = Transported ~ ., data = train_train[,
## -1])
## n= 6520
##
## CP nsplit rel error xerror xstd
## 1 0.23321359 0 1.0000000 1.0001811 0.0001977604
## 2 0.04905507 1 0.7667864 0.7671015 0.0103212807
## 3 0.02807107 2 0.7177313 0.7180892 0.0090936774
## 4 0.02784643 3 0.6896603 0.7039381 0.0095638265
## 5 0.01770479 4 0.6618138 0.6661699 0.0097007659
## 6 0.01381587 5 0.6441090 0.6496207 0.0100644975
## 7 0.01194576 6 0.6302932 0.6389139 0.0102048399
## 8 0.01164524 7 0.6183474 0.6310762 0.0102269497
## 9 0.01000000 8 0.6067022 0.6242632 0.0102752753
##
## Variable importance
## expense CryoSleep FoodCourt Spa VRDeck RoomService
## 23 17 13 12 11 9
## deck HomePlanet ShoppingMall Age Destination
## 5 5 2 1 1
##
## Node number 1: 6520 observations, complexity param=0.2332136
## mean=0.503681, MSE=0.2499865
## left son=2 (3795 obs) right son=3 (2725 obs)
## Primary splits:
## expense < 0.5 to the right, improve=0.2332136, (0 missing)
## CryoSleep < 0.5 to the left, improve=0.2095384, (0 missing)
## RoomService < 0.5 to the right, improve=0.1254121, (0 missing)
## Spa < 0.5 to the right, improve=0.1142180, (0 missing)
## VRDeck < 0.5 to the right, improve=0.1084074, (0 missing)
## Surrogate splits:
## CryoSleep < 0.5 to the left, agree=0.932, adj=0.837, (0 split)
## Spa < 0.5 to the right, agree=0.784, adj=0.484, (0 split)
## FoodCourt < 0.5 to the right, agree=0.770, adj=0.451, (0 split)
## VRDeck < 0.5 to the right, agree=0.768, adj=0.444, (0 split)
## RoomService < 0.5 to the right, agree=0.757, adj=0.419, (0 split)
##
## Node number 2: 3795 observations, complexity param=0.02807107
## mean=0.2990777, MSE=0.2096302
## left son=4 (3243 obs) right son=5 (552 obs)
## Primary splits:
## FoodCourt < 1331 to the left, improve=0.05751186, (0 missing)
## ShoppingMall < 627.5 to the left, improve=0.04530804, (0 missing)
## RoomService < 365.5 to the right, improve=0.04440387, (0 missing)
## Spa < 257.5 to the right, improve=0.03347453, (0 missing)
## VRDeck < 721 to the right, improve=0.02290457, (0 missing)
## Surrogate splits:
## expense < 5981 to the left, agree=0.885, adj=0.210, (0 split)
## deck splits as RRRLLLLL, agree=0.884, adj=0.203, (0 split)
## HomePlanet splits as LRL, agree=0.878, adj=0.159, (0 split)
## Spa < 8955.5 to the left, agree=0.856, adj=0.009, (0 split)
## VRDeck < 11692 to the left, agree=0.856, adj=0.009, (0 split)
##
## Node number 3: 2725 observations, complexity param=0.04905507
## mean=0.7886239, MSE=0.1666963
## left son=6 (1448 obs) right son=7 (1277 obs)
## Primary splits:
## deck splits as RRRRLRL-, improve=0.17601740, (0 missing)
## HomePlanet splits as LRR, improve=0.12704810, (0 missing)
## Destination splits as RLLR, improve=0.02774089, (0 missing)
## CryoSleep < 0.5 to the left, improve=0.02268236, (0 missing)
## side splits as LR, improve=0.01498707, (0 missing)
## Surrogate splits:
## HomePlanet splits as LRR, agree=0.934, adj=0.860, (0 split)
## Age < 24.5 to the left, agree=0.626, adj=0.201, (0 split)
## Destination splits as RLLR, agree=0.596, adj=0.137, (0 split)
## withgroup < 0.5 to the left, agree=0.583, adj=0.110, (0 split)
## VIP < 0.5 to the left, agree=0.538, adj=0.014, (0 split)
##
## Node number 4: 3243 observations, complexity param=0.02784643
## mean=0.2537774, MSE=0.1893744
## left son=8 (2577 obs) right son=9 (666 obs)
## Primary splits:
## ShoppingMall < 541.5 to the left, improve=0.07390355, (0 missing)
## RoomService < 365.5 to the right, improve=0.03464407, (0 missing)
## Spa < 240.5 to the right, improve=0.03327259, (0 missing)
## VRDeck < 114 to the right, improve=0.02784287, (0 missing)
## expense < 2867.5 to the right, improve=0.01811461, (0 missing)
## Surrogate splits:
## expense < 18644 to the left, agree=0.795, adj=0.003, (0 split)
##
## Node number 5: 552 observations, complexity param=0.01770479
## mean=0.5652174, MSE=0.2457467
## left son=10 (123 obs) right son=11 (429 obs)
## Primary splits:
## Spa < 1372.5 to the right, improve=0.21272970, (0 missing)
## VRDeck < 1063.5 to the right, improve=0.17089500, (0 missing)
## expense < 5395 to the right, improve=0.06611166, (0 missing)
## FoodCourt < 2513 to the left, improve=0.02780045, (0 missing)
## deck splits as LLRLLRRL, improve=0.02705865, (0 missing)
## Surrogate splits:
## expense < 12647 to the right, agree=0.790, adj=0.057, (0 split)
## Age < 13.5 to the left, agree=0.779, adj=0.008, (0 split)
## RoomService < 3895.5 to the right, agree=0.779, adj=0.008, (0 split)
##
## Node number 6: 1448 observations
## mean=0.6277624, MSE=0.2336768
##
## Node number 7: 1277 observations
## mean=0.9710258, MSE=0.02813466
##
## Node number 8: 2577 observations, complexity param=0.01194576
## mean=0.193636, MSE=0.1561411
## left son=16 (2067 obs) right son=17 (510 obs)
## Primary splits:
## FoodCourt < 456.5 to the left, improve=0.04838893, (0 missing)
## expense < 1447.5 to the right, improve=0.04016842, (0 missing)
## HomePlanet splits as RLL, improve=0.02539308, (0 missing)
## Spa < 537.5 to the right, improve=0.01895890, (0 missing)
## RoomService < 400.5 to the right, improve=0.01706521, (0 missing)
## Surrogate splits:
## expense < 12373 to the left, agree=0.804, adj=0.008, (0 split)
## Spa < 13650 to the left, agree=0.803, adj=0.006, (0 split)
## VRDeck < 10123.5 to the left, agree=0.802, adj=0.002, (0 split)
## deck splits as LLLLLLLR, agree=0.802, adj=0.002, (0 split)
##
## Node number 9: 666 observations
## mean=0.4864865, MSE=0.2498174
##
## Node number 10: 123 observations
## mean=0.1382114, MSE=0.119109
##
## Node number 11: 429 observations, complexity param=0.01381587
## mean=0.6876457, MSE=0.2147891
## left son=22 (143 obs) right son=23 (286 obs)
## Primary splits:
## VRDeck < 611 to the right, improve=0.24438400, (0 missing)
## Spa < 225 to the right, improve=0.05300377, (0 missing)
## FoodCourt < 3119.5 to the left, improve=0.05168044, (0 missing)
## side splits as LR, improve=0.04004566, (0 missing)
## RoomService < 1719.5 to the right, improve=0.03930897, (0 missing)
## Surrogate splits:
## expense < 6032 to the right, agree=0.702, adj=0.105, (0 split)
## Age < 53.5 to the right, agree=0.674, adj=0.021, (0 split)
## FoodCourt < 12128.5 to the right, agree=0.671, adj=0.014, (0 split)
##
## Node number 16: 2067 observations
## mean=0.1504596, MSE=0.1278215
##
## Node number 17: 510 observations, complexity param=0.01164524
## mean=0.3686275, MSE=0.2327413
## left son=34 (204 obs) right son=35 (306 obs)
## Primary splits:
## expense < 1447.5 to the right, improve=0.15990760, (0 missing)
## VRDeck < 86.5 to the right, improve=0.10060710, (0 missing)
## HomePlanet splits as RLL, improve=0.07952538, (0 missing)
## Spa < 500 to the right, improve=0.07353369, (0 missing)
## deck splits as LLLLRRRL, improve=0.05075444, (0 missing)
## Surrogate splits:
## HomePlanet splits as RLL, agree=0.873, adj=0.681, (0 split)
## deck splits as LLLLRRRL, agree=0.839, adj=0.598, (0 split)
## VRDeck < 213.5 to the right, agree=0.818, adj=0.544, (0 split)
## Spa < 219.5 to the right, agree=0.792, adj=0.480, (0 split)
## FoodCourt < 907 to the right, agree=0.722, adj=0.304, (0 split)
##
## Node number 22: 143 observations
## mean=0.3636364, MSE=0.231405
##
## Node number 23: 286 observations
## mean=0.8496503, MSE=0.1277446
##
## Node number 34: 204 observations
## mean=0.1323529, MSE=0.1148356
##
## Node number 35: 306 observations
## mean=0.5261438, MSE=0.2493165
## .
## 1 0.1504596
## 2 0.1382114
## 3 0.8496503
## 4 0.1504596
## 5 0.4864865
## 6 0.1504596
Transported_pred_tree = ifelse(preds$. >0.5, 1, 0)
cm = table(Transported_train_test, Transported_pred_tree)
cm
## Transported_pred_tree
## Transported_train_test 0 1
## 0 807 272
## 1 237 857
## [1] 0.7657616
fit_tree = rpart::rpart(Transported ~ ., data = train_c[, -1])
preds= predict(fit_tree, newdata = test_c) %>%
data.frame()
Transported = as.character(Transported_pred_tree)
PassengerId = test_c$PassengerId
Transported = as.vector(Transported)
sample_submission = cbind(PassengerId, Transported)
sample_submission = as.data.frame(sample_submission)
sample_submission$Transported = str_to_title(sample_submission$Transported)
write.csv(sample_submission, "sub_tree.csv", row.names = FALSE, quote = FALSE)
## IncNodePurity
## HomePlanet 48.125593
## CryoSleep 107.989293
## Destination 30.575915
## Age 91.836683
## VIP 2.516955
## RoomService 125.936343
## FoodCourt 113.623386
## ShoppingMall 96.731658
## Spa 131.332461
## VRDeck 122.410460
## withgroup 12.925079
## deck 90.196804
## side 22.439063
## expense 238.242509
## .
## 1 0.1050685
## 2 0.1614667
## 3 0.8062586
## 4 0.1920834
## 5 0.6907569
## 6 0.1659998
## Transported_pred_forest
## Transported_train_test 0 1
## 0 823 256
## 1 175 919
## [1] 0.8016567
fit_forest = randomForest(Transported ~ ., data = train_c[, -1])
preds = predict(fit_forest, newdata = test_c) %>%
data.frame()
Transported = as.character(Transported_pred_forest)
PassengerId = test_c$PassengerId
Transported = as.vector(Transported)
sample_submission = cbind(PassengerId, Transported)
sample_submission = as.data.frame(sample_submission)
sample_submission$Transported = str_to_title(sample_submission$Transported)
write.csv(sample_submission, "sub_forest.csv", row.names = FALSE, quote = FALSE)