library(rmarkdown)
library(explore)
library(tidyverse)
library(readr)
test_c <- read_csv("test_c.csv")
library(readr)
train_c <- read_csv("train_c.csv")
describe_all(train_c)
## # A tibble: 16 × 8
## variable type na na_pct unique min mean max
## <chr> <chr> <int> <dbl> <int> <dbl> <dbl> <dbl>
## 1 PassengerId chr 0 0 8693 NA NA NA
## 2 HomePlanet chr 0 0 3 NA NA NA
## 3 CryoSleep lgl 0 0 2 0 0.35 1
## 4 Destination chr 0 0 4 NA NA NA
## 5 Age dbl 0 0 90 0 28.8 79
## 6 VIP lgl 0 0 2 0 0.02 1
## 7 RoomService dbl 0 0 1273 0 220. 14327
## 8 FoodCourt dbl 0 0 1507 0 448. 29813
## 9 ShoppingMall dbl 0 0 1115 0 170. 23492
## 10 Spa dbl 0 0 1327 0 305. 22408
## 11 VRDeck dbl 0 0 1306 0 298. 24133
## 12 Transported lgl 0 0 2 0 0.5 1
## 13 withgroup dbl 0 0 2 0 0.45 1
## 14 deck chr 0 0 8 NA NA NA
## 15 side chr 0 0 2 NA NA NA
## 16 expense dbl 0 0 2336 0 1441. 35987
describe_all(test_c)
## # A tibble: 15 × 8
## variable type na na_pct unique min mean max
## <chr> <chr> <int> <dbl> <int> <dbl> <dbl> <dbl>
## 1 PassengerId chr 0 0 4277 NA NA NA
## 2 HomePlanet chr 0 0 3 NA NA NA
## 3 CryoSleep lgl 0 0 2 0 0.36 1
## 4 Destination chr 0 0 4 NA NA NA
## 5 Age dbl 0 0 89 0 28.7 79
## 6 VIP lgl 0 0 2 0 0.02 1
## 7 RoomService dbl 0 0 842 0 215. 11567
## 8 FoodCourt dbl 0 0 902 0 429. 25273
## 9 ShoppingMall dbl 0 0 715 0 173. 8292
## 10 Spa dbl 0 0 833 0 296. 19844
## 11 VRDeck dbl 0 0 796 0 305. 22272
## 12 withgroup dbl 0 0 2 0 0.45 1
## 13 deck chr 0 0 8 NA NA NA
## 14 side chr 0 0 2 NA NA NA
## 15 expense dbl 0 0 1437 0 1418. 33666
train_c$HomePlanet <- as.factor(train_c$HomePlanet)
train_c$Destination <- as.factor(train_c$Destination)
train_c$deck <- as.factor(train_c$deck)
train_c$side <- as.factor(train_c$side)
test_c$HomePlanet <- as.factor(test_c$HomePlanet)
test_c$Destination <- as.factor(test_c$Destination)
test_c$deck <- as.factor(test_c$deck)
test_c$side <- as.factor(test_c$side)
library(DataExplorer)
create_report(train_c)
##
|
| | 0%
|
|. | 2%
|
|.. | 5% [global_options]
|
|... | 7%
|
|.... | 10% [introduce]
|
|.... | 12%
|
|..... | 14% [plot_intro]
|
|...... | 17%
|
|....... | 19% [data_structure]
|
|........ | 21%
|
|......... | 24% [missing_profile]
|
|.......... | 26%
|
|........... | 29% [univariate_distribution_header]
|
|........... | 31%
|
|............ | 33% [plot_histogram]
|
|............. | 36%
|
|.............. | 38% [plot_density]
|
|............... | 40%
|
|................ | 43% [plot_frequency_bar]
|
|................. | 45%
|
|.................. | 48% [plot_response_bar]
|
|.................. | 50%
|
|................... | 52% [plot_with_bar]
|
|.................... | 55%
|
|..................... | 57% [plot_normal_qq]
|
|...................... | 60%
|
|....................... | 62% [plot_response_qq]
|
|........................ | 64%
|
|......................... | 67% [plot_by_qq]
|
|.......................... | 69%
|
|.......................... | 71% [correlation_analysis]
|
|........................... | 74%
|
|............................ | 76% [principal_component_analysis]
|
|............................. | 79%
|
|.............................. | 81% [bivariate_distribution_header]
|
|............................... | 83%
|
|................................ | 86% [plot_response_boxplot]
|
|................................. | 88%
|
|................................. | 90% [plot_by_boxplot]
|
|.................................. | 93%
|
|................................... | 95% [plot_response_scatterplot]
|
|.................................... | 98%
|
|.....................................| 100% [plot_by_scatterplot]
## "C:/Program Files/RStudio/resources/app/bin/quarto/bin/tools/pandoc" +RTS -K512m -RTS "C:\RSTUDI~1\FINALE~1\REPORT~1.MD" --to html4 --from markdown+autolink_bare_uris+tex_math_single_backslash --output pandoc37007ee532e2.html --lua-filter "C:\Users\MRK~1\AppData\Local\R\WIN-LI~1\4.3\RMARKD~1\RMARKD~1\lua\PAGEBR~1.LUA" --lua-filter "C:\Users\MRK~1\AppData\Local\R\WIN-LI~1\4.3\RMARKD~1\RMARKD~1\lua\LATEX-~1.LUA" --embed-resources --standalone --variable bs3=TRUE --section-divs --table-of-contents --toc-depth 6 --template "C:\Users\MRK~1\AppData\Local\R\WIN-LI~1\4.3\RMARKD~1\rmd\h\DEFAUL~1.HTM" --no-highlight --variable highlightjs=1 --variable theme=yeti --mathjax --variable "mathjax-url=https://mathjax.rstudio.com/latest/MathJax.js?config=TeX-AMS-MML_HTMLorMML" --include-in-header "C:\Users\MRK~1\AppData\Local\Temp\RtmpUJyHae\rmarkdown-str37003d549af.html"
model = lm(Transported ~ ., data = train_c[, 2:16])
summary(model)
##
## Call:
## lm(formula = Transported ~ ., data = train_c[, 2:16])
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.42802 -0.30952 -0.03188 0.28870 1.79172
##
## Coefficients: (1 not defined because of singularities)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.147e-01 4.000e-02 7.866 4.09e-15 ***
## HomePlanetEuropa 2.150e-01 2.883e-02 7.457 9.72e-14 ***
## HomePlanetMars 9.617e-02 1.475e-02 6.522 7.33e-11 ***
## CryoSleepTRUE 3.811e-01 1.152e-02 33.082 < 2e-16 ***
## DestinationPSO J318.5-22 -4.449e-02 1.803e-02 -2.468 0.013598 *
## DestinationTRAPPIST-1e -5.140e-02 1.133e-02 -4.538 5.76e-06 ***
## DestinationTRAPPIST-le 7.513e-03 3.142e-02 0.239 0.811033
## Age -2.182e-03 3.201e-04 -6.815 1.00e-11 ***
## VIPTRUE -3.757e-02 2.974e-02 -1.263 0.206555
## RoomService -1.176e-04 7.055e-06 -16.674 < 2e-16 ***
## FoodCourt 4.281e-05 3.054e-06 14.017 < 2e-16 ***
## ShoppingMall 7.969e-05 7.457e-06 10.686 < 2e-16 ***
## Spa -8.675e-05 4.109e-06 -21.110 < 2e-16 ***
## VRDeck -8.280e-05 4.115e-06 -20.120 < 2e-16 ***
## withgroup 1.958e-02 9.293e-03 2.107 0.035134 *
## deckB 1.107e-01 2.874e-02 3.852 0.000118 ***
## deckC 1.477e-01 2.900e-02 5.094 3.58e-07 ***
## deckD 5.090e-02 3.478e-02 1.463 0.143372
## deckE 8.494e-03 3.622e-02 0.235 0.814567
## deckF 1.031e-01 3.709e-02 2.778 0.005475 **
## deckG 5.791e-02 3.866e-02 1.498 0.134218
## deckT 6.610e-02 1.815e-01 0.364 0.715784
## sideS 8.542e-02 8.629e-03 9.899 < 2e-16 ***
## expense NA NA NA NA
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.4012 on 8670 degrees of freedom
## Multiple R-squared: 0.3578, Adjusted R-squared: 0.3561
## F-statistic: 219.5 on 22 and 8670 DF, p-value: < 2.2e-16
library(caTools)
## Warning: le package 'caTools' a été compilé avec la version R 4.3.3
set.seed(123)
split = sample.split(train_c$Transported, SplitRatio = 0.75)
train_train = subset(train_c, split == TRUE)
train_test = subset(train_c, split == FALSE)
regresyon = lm(Transported ~ ., data = train_train[, -c(1)])
summary(regresyon)
##
## Call:
## lm(formula = Transported ~ ., data = train_train[, -c(1)])
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.50155 -0.31213 -0.03018 0.29171 1.76723
##
## Coefficients: (1 not defined because of singularities)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.482e-01 4.626e-02 7.527 5.89e-14 ***
## HomePlanetEuropa 1.971e-01 3.358e-02 5.871 4.55e-09 ***
## HomePlanetMars 8.600e-02 1.699e-02 5.061 4.29e-07 ***
## CryoSleepTRUE 3.772e-01 1.328e-02 28.402 < 2e-16 ***
## DestinationPSO J318.5-22 -6.107e-02 2.077e-02 -2.940 0.00330 **
## DestinationTRAPPIST-1e -5.426e-02 1.309e-02 -4.144 3.46e-05 ***
## DestinationTRAPPIST-le -9.053e-03 3.649e-02 -0.248 0.80407
## Age -2.339e-03 3.725e-04 -6.279 3.63e-10 ***
## VIPTRUE -1.616e-02 3.384e-02 -0.478 0.63290
## RoomService -1.175e-04 7.790e-06 -15.088 < 2e-16 ***
## FoodCourt 3.905e-05 3.467e-06 11.262 < 2e-16 ***
## ShoppingMall 8.282e-05 8.415e-06 9.842 < 2e-16 ***
## Spa -8.543e-05 4.704e-06 -18.160 < 2e-16 ***
## VRDeck -8.216e-05 4.777e-06 -17.197 < 2e-16 ***
## withgroup 1.601e-02 1.076e-02 1.488 0.13688
## deckB 1.049e-01 3.290e-02 3.188 0.00144 **
## deckC 1.441e-01 3.318e-02 4.343 1.43e-05 ***
## deckD 3.223e-02 3.974e-02 0.811 0.41744
## deckE -2.463e-02 4.213e-02 -0.585 0.55884
## deckF 8.346e-02 4.290e-02 1.945 0.05176 .
## deckG 4.007e-02 4.472e-02 0.896 0.37029
## deckT 5.680e-02 1.822e-01 0.312 0.75526
## sideS 8.895e-02 9.984e-03 8.909 < 2e-16 ***
## expense NA NA NA NA
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.4013 on 6497 degrees of freedom
## Multiple R-squared: 0.358, Adjusted R-squared: 0.3558
## F-statistic: 164.7 on 22 and 6497 DF, p-value: < 2.2e-16
reg_tahmin = predict(regresyon, newdata = train_test[, -c(1,12)])
reg_transported_tahmin = ifelse(reg_tahmin > 0.5, 1, 0)
transported_gercek = ifelse(train_test[12] == TRUE, 1, 0)
cm = table(transported_gercek, reg_transported_tahmin)
cm
## reg_transported_tahmin
## transported_gercek 0 1
## 0 902 177
## 1 323 771
(902 + 771)/(902 + 771 + 323 + 177)
## [1] 0.7699034
reg_tahmin = predict(regresyon, newdata = train_train[, -c(1,12)])
reg_transported_tahmin = ifelse(reg_tahmin > 0.5, 1, 0)
transported_gercek = ifelse(train_train[12] == TRUE, 1, 0)
cm = table(transported_gercek, reg_transported_tahmin)
cm
## reg_transported_tahmin
## transported_gercek 0 1
## 0 2682 554
## 1 946 2338
(2684 + 2338)/(2682 + 2338 + 554 + 946)
## [1] 0.7702454
reg_tahmin_bd = predict(model, newdata = test_c[, -c(1)])
reg_transported_test_c_tahmin = ifelse(reg_tahmin_bd > 0.5, TRUE, FALSE)
transported = as.character(reg_transported_test_c_tahmin)
PassengerId = test_c$PassengerId
Transported=as.vector(transported)
submission_regresyon = cbind(PassengerId, Transported)
submission_regresyon = as.data.frame(submission_regresyon)
library(stringr)
submission_regresyon$transported <- str_to_title(submission_regresyon$Transported)
write.csv(submission_regresyon, "siniftahmini.csv", row.names = FALSE, quote = FALSE)
library(glmulti)
train_c_log = train_c %>% mutate_at(c(5, 7:11, 16), ~ log(1 + .))
test_c_log = test_c %>% mutate_at(c(5, 7:11, 15), ~ log(1 + .))
modellog = lm(Transported ~ 1 + HomePlanet + Destination + deck + side + CryoSleep + Age + RoomService + FoodCourt + ShoppingMall + Spa + VRDeck, data = train_c_log)
reg_tahmin_log = predict(modellog, newdata = test_c_log[, -c(1)])
reg_transported_test_c_tahmin_log = ifelse(reg_tahmin_log > 0.5, TRUE, FALSE)
head(reg_transported_test_c_tahmin_log)
## 1 2 3 4 5 6
## TRUE FALSE TRUE TRUE TRUE FALSE
Transported = as.character(reg_transported_test_c_tahmin_log)
PassengerId = test_c_log$PassengerId
Transported = as.vector(Transported)
submission_regresyon_log = cbind(PassengerId, Transported)
submission_regresyon_log =as.data.frame(submission_regresyon_log)
submission_regresyon_log$Transported = str_to_title(submission_regresyon_log$Transported)
write.csv(submission_regresyon_log,"submissyon_regresyon_log.csv", row.names = FALSE,quote = FALSE)
model = lm(y ~ x1 + x2 + x3, data = df)
logistic = glm(formula = Transported ~ .,
family = binomial,
data = train_train[, -c(1)])
## Warning: glm.fit: des probabilités ont été ajustées numériquement à 0 ou 1
summary(logistic)
##
## Call:
## glm(formula = Transported ~ ., family = binomial, data = train_train[,
## -c(1)])
##
## Coefficients: (1 not defined because of singularities)
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -4.607e-01 3.588e-01 -1.284 0.19917
## HomePlanetEuropa 1.556e+00 2.674e-01 5.819 5.91e-09 ***
## HomePlanetMars 5.323e-01 1.103e-01 4.827 1.39e-06 ***
## CryoSleepTRUE 1.285e+00 9.185e-02 13.987 < 2e-16 ***
## DestinationPSO J318.5-22 -5.125e-01 1.297e-01 -3.950 7.80e-05 ***
## DestinationTRAPPIST-1e -4.904e-01 9.441e-02 -5.194 2.05e-07 ***
## DestinationTRAPPIST-le -1.471e-01 2.436e-01 -0.604 0.54580
## Age -8.481e-03 2.461e-03 -3.446 0.00057 ***
## VIPTRUE -1.650e-01 2.916e-01 -0.566 0.57149
## RoomService -1.730e-03 1.136e-04 -15.225 < 2e-16 ***
## FoodCourt 4.581e-04 4.460e-05 10.271 < 2e-16 ***
## ShoppingMall 5.289e-04 7.559e-05 6.997 2.61e-12 ***
## Spa -1.969e-03 1.164e-04 -16.913 < 2e-16 ***
## VRDeck -1.902e-03 1.161e-04 -16.392 < 2e-16 ***
## withgroup 1.206e-01 7.192e-02 1.677 0.09346 .
## deckB 1.153e+00 2.912e-01 3.961 7.47e-05 ***
## deckC 2.481e+00 3.325e-01 7.463 8.48e-14 ***
## deckD 6.978e-01 3.262e-01 2.139 0.03242 *
## deckE 1.029e-01 3.378e-01 0.305 0.76065
## deckF 7.739e-01 3.420e-01 2.263 0.02363 *
## deckG 3.766e-01 3.511e-01 1.073 0.28345
## deckT -1.184e-01 1.866e+00 -0.063 0.94942
## sideS 5.988e-01 6.663e-02 8.987 < 2e-16 ***
## expense NA NA NA NA
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 9038.3 on 6519 degrees of freedom
## Residual deviance: 5600.2 on 6497 degrees of freedom
## AIC: 5646.2
##
## Number of Fisher Scoring iterations: 7
logistic_tahmin = predict(logistic, newdata = train_test[, -c(1,12)])
head(logistic_tahmin)
## 1 2 3 4 5 6
## -1.1175750 -11.0810548 0.9556805 -1.6375160 -0.4294310 -1.6887335
logistic_transported_tahmin = ifelse(logistic_tahmin > 0.5, 1, 0)
head(logistic_transported_tahmin)
## 1 2 3 4 5 6
## 0 0 1 0 0 0
transported_gercek = ifelse(train_test[12] == TRUE, 1, 0)
library(tidymodels)
result = data.frame(cbind(transported_gercek, logistic_transported_tahmin))
result$Transported = as.factor(result$Transported)
result$logistic_transported_tahmin = as.factor(result$logistic_transported_tahmin)
accuracy(result, truth = Transported, estimate = logistic_transported_tahmin)
## # A tibble: 1 × 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 accuracy binary 0.786
conf_mat(result, truth = Transported, estimate = logistic_transported_tahmin)
## Truth
## Prediction 0 1
## 0 918 305
## 1 161 789
library(caret)
cm = table(transported_gercek, logistic_transported_tahmin)
cm
## logistic_transported_tahmin
## transported_gercek 0 1
## 0 918 161
## 1 305 789
(918 +789)/(918 +161 + 305 +789)
## [1] 0.7855499
confusionMatrix(as.factor(transported_gercek), as.factor(logistic_transported_tahmin))
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 918 161
## 1 305 789
##
## Accuracy : 0.7855
## 95% CI : (0.7677, 0.8026)
## No Information Rate : 0.5628
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.5715
##
## Mcnemar's Test P-Value : 3.488e-11
##
## Sensitivity : 0.7506
## Specificity : 0.8305
## Pos Pred Value : 0.8508
## Neg Pred Value : 0.7212
## Prevalence : 0.5628
## Detection Rate : 0.4225
## Detection Prevalence : 0.4965
## Balanced Accuracy : 0.7906
##
## 'Positive' Class : 0
##
(918+789)/(918+789+161+305)
## [1] 0.7855499
logistic_bd = glm(formula = Transported ~ .,
family = binomial,
data = train_c[, -c(1)])
logistic_tahmin_bd = predict(logistic_bd, newdata = test_c[, -c(1)])
logistic_transported_test_c_tahmin = ifelse(logistic_tahmin_bd >0.5, TRUE, FALSE )
Transported = as.character(logistic_transported_test_c_tahmin)
PassengerId = test_c$PassengerId
Transported=as.vector(Transported)
submission_logistic = cbind(PassengerId, Transported)
submission_logistic = as.data.frame(submission_logistic)
submission_logistic$Transported = str_to_title(submission_logistic$Transported)
write.csv(submission_logistic, "submission_logistic.csv", row.names =FALSE, quote=FALSE)
logistic_bd = glm(formula = Transported ~ .,
family = binomial,
data = train_c_log[, -c(1)])
logistic_tahmin_bd = predict(logistic_bd, newdata = test_c_log[, -c(1)])
logistic_transported_test_c_tahmin = ifelse(logistic_tahmin_bd >0.5, TRUE, FALSE )
Transported = as.character(logistic_transported_test_c_tahmin)
PassengerId = test_c$PassengerId
Transported=as.vector(Transported)
submission_llogistic = cbind(PassengerId, Transported)
submission_llogistic = as.data.frame(submission_logistic)
submission_llogistic$Transported = str_to_title(submission_logistic$Transported)
write.csv(submission_logistic, "submission_llogistic.csv", row.names =FALSE, quote=FALSE)
library(e1071)
fit_nb = naiveBayes(Transported ~ ., data =train_train[, -1])
fit_nb
##
## Naive Bayes Classifier for Discrete Predictors
##
## Call:
## naiveBayes.default(x = X, y = Y, laplace = laplace)
##
## A-priori probabilities:
## Y
## FALSE TRUE
## 0.496319 0.503681
##
## Conditional probabilities:
## HomePlanet
## Y Earth Europa Mars
## FALSE 0.6236094 0.1739802 0.2024104
## TRUE 0.4631547 0.3279537 0.2088916
##
## CryoSleep
## Y FALSE TRUE
## FALSE 0.8702101 0.1297899
## TRUE 0.4336175 0.5663825
##
## Destination
## Y 55 Cancri e PSO J318.5-22 TRAPPIST-1e TRAPPIST-le
## FALSE 0.16069221 0.09363412 0.72404203 0.02163164
## TRUE 0.24969549 0.09043849 0.64007308 0.01979294
##
## Age
## Y [,1] [,2]
## FALSE 30.01092 13.45222
## TRUE 27.85527 14.87646
##
## VIP
## Y FALSE TRUE
## FALSE 0.97126082 0.02873918
## TRUE 0.98142509 0.01857491
##
## RoomService
## Y [,1] [,2]
## FALSE 402.20365 916.6547
## TRUE 56.93484 246.8105
##
## FoodCourt
## Y [,1] [,2]
## FALSE 396.2923 1258.123
## TRUE 514.8018 1918.620
##
## ShoppingMall
## Y [,1] [,2]
## FALSE 160.2250 432.1103
## TRUE 182.7135 748.5867
##
## Spa
## Y [,1] [,2]
## FALSE 561.16193 1554.2636
## TRUE 61.11571 264.0556
##
## VRDeck
## Y [,1] [,2]
## FALSE 533.98733 1542.8703
## TRUE 69.07186 295.0542
##
## withgroup
## Y [,1] [,2]
## FALSE 0.3952410 0.4889780
## TRUE 0.4960414 0.5000605
##
## deck
## Y A B C D E
## FALSE 0.0296662546 0.0540791100 0.0574783684 0.0661310260 0.1344252163
## TRUE 0.0310596833 0.1385505481 0.1199756395 0.0490255786 0.0676004872
## deck
## Y F G T
## FALSE 0.3637206428 0.2932632880 0.0012360939
## TRUE 0.2828867235 0.3105968331 0.0003045067
##
## side
## Y P S
## FALSE 0.5571693 0.4428307
## TRUE 0.4448843 0.5551157
##
## expense
## Y [,1] [,2]
## FALSE 2053.8702 3224.219
## TRUE 884.6376 2307.962
pred_nb = predict(fit_nb, newdata = train_test[, -c(1, 12)], type = "raw") %>%
data.frame()
head(pred_nb)
## FALSE. TRUE.
## 1 0.22238032 7.776197e-01
## 2 1.00000000 8.663595e-135
## 3 0.02729931 9.727007e-01
## 4 0.59276719 4.072328e-01
## 5 0.02410104 9.758990e-01
## 6 0.38468138 6.153186e-01
Transported_pred_nb = ifelse(pred_nb$TRUE. > 0.5, 1, 0)
head(Transported_pred_nb)
## [1] 1 0 1 0 1 1
Transported_train_test = ifelse(train_test[12] == TRUE, 1, 0)
head(Transported_train_test)
## Transported
## [1,] 1
## [2,] 0
## [3,] 1
## [4,] 0
## [5,] 0
## [6,] 0
cm = table(Transported_train_test, Transported_pred_nb)
cm
## Transported_pred_nb
## Transported_train_test 0 1
## 0 480 599
## 1 76 1018
(480 + 1018)/(480 +599 +76 +1018)
## [1] 0.6893695
nb = naiveBayes(Transported ~ ., data = train_c[, -1])
pred_nb = predict(nb, newdata = test_c, type = "raw") %>% data.frame()
Transported_pred_nb = ifelse(pred_nb$TRUE. > 0.5, TRUE, FALSE)
Transported = as.character(Transported_pred_nb)
PassengerId = test_c$PassengerId
Transported = as.vector(Transported)
sample_submission = cbind(PassengerId, Transported)
sample_submission = as.data.frame(sample_submission)
sample_submission$Transported = str_to_title(sample_submission$Transported)
write.csv(sample_submission, "sub_nb.csv", row.names = FALSE, quote = FALSE)
fit_svm = svm(Transported ~ ., data = train_train[, -1],
type= 'C-classification',
kernel = 'linear')
preds = predict(fit_svm, newdata = train_test[, -c(1, 12)], type = "raw") %>%
data.frame()
head(preds)
## .
## 1 FALSE
## 2 FALSE
## 3 TRUE
## 4 FALSE
## 5 FALSE
## 6 FALSE
Transported_pred_svm = ifelse(preds$. == TRUE, 1, 0)
cm = table(Transported_train_test, Transported_pred_svm)
cm
## Transported_pred_svm
## Transported_train_test 0 1
## 0 836 243
## 1 180 914
(836 + 914)/(836 +243 +180 + 914)
## [1] 0.8053382
fit_svm = svm(Transported ~ ., data = train_c[, -1],
type= 'C-classification',
kernel = 'linear')
preds = predict(fit_svm, newdata = test_c, type ="raw")%>%
data.frame()
head(preds)
## .
## 1 TRUE
## 2 FALSE
## 3 TRUE
## 4 TRUE
## 5 TRUE
## 6 TRUE
Transported_pred_svm = ifelse(preds$. == TRUE, TRUE, FALSE)
Transported = as.character(Transported_pred_svm)
PassengerId = test_c$PassengerId
Transported = as.vector(Transported)
sample_submission = cbind(PassengerId, Transported)
sample_submission = as.data.frame(sample_submission)
sample_submission$Transported = str_to_title(sample_submission$Transported)
write.csv(sample_submission, "sub_svm.csv", row.names = FALSE, quote = FALSE)
fit_svm = svm(Transported ~ ., data = train_train[, -1],
type= 'C-classification',
kernel = 'radial')
preds = predict(fit_svm, newdata = train_test[, -c(1, 12)], type ="raw")%>%
data.frame()
Transported_pred_svm = ifelse(preds$. == TRUE, 1, 0)
cm = table(Transported_train_test, Transported_pred_svm)
cm
## Transported_pred_svm
## Transported_train_test 0 1
## 0 849 230
## 1 202 892
(849 +892)/(892 +849 +202 +230)
## [1] 0.8011965
fit_svm = svm(Transported ~ ., data = train_c[, -1],
type= 'C-classification',
kernel = 'radial')
preds = predict(fit_svm, newdata = test_c, type ="raw")%>%
data.frame()
Transported_pred_svm = ifelse(preds$. == TRUE, TRUE, FALSE)
Transported = as.character(Transported_pred_svm)
PassengerId = test_c$PassengerId
Transported = as.vector(Transported)
sample_submission = cbind(PassengerId, Transported)
sample_submission = as.data.frame(sample_submission)
sample_submission$Transported = str_to_title(sample_submission$Transported)
write.csv(sample_submission, "sub_svm_radial.csv", row.names = FALSE, quote = FALSE)
P = ggplot(train_train, aes(x=HomePlanet, y=deck, color=factor(Transported))) +
geom_point(aes(shape=factor(Transported)), size=3) +
scale_color_viridis_d() + labs(title = "", x="HomePlanet", y="deck") + theme_minimal() + theme(legend.position ="top")
P
library(rpart)
library(rpart.plot)
library(randomForest)
library(caret)
fit_tree = rpart::rpart(Transported ~ ., data = train_train[, -1])
summary(fit_tree)
## Call:
## rpart::rpart(formula = Transported ~ ., data = train_train[,
## -1])
## n= 6520
##
## CP nsplit rel error xerror xstd
## 1 0.23321359 0 1.0000000 1.0001811 0.0001977604
## 2 0.04905507 1 0.7667864 0.7671015 0.0103212807
## 3 0.02807107 2 0.7177313 0.7180892 0.0090936774
## 4 0.02784643 3 0.6896603 0.7039381 0.0095638265
## 5 0.01770479 4 0.6618138 0.6661699 0.0097007659
## 6 0.01381587 5 0.6441090 0.6496207 0.0100644975
## 7 0.01194576 6 0.6302932 0.6389139 0.0102048399
## 8 0.01164524 7 0.6183474 0.6310762 0.0102269497
## 9 0.01000000 8 0.6067022 0.6242632 0.0102752753
##
## Variable importance
## expense CryoSleep FoodCourt Spa VRDeck RoomService
## 23 17 13 12 11 9
## deck HomePlanet ShoppingMall Age Destination
## 5 5 2 1 1
##
## Node number 1: 6520 observations, complexity param=0.2332136
## mean=0.503681, MSE=0.2499865
## left son=2 (3795 obs) right son=3 (2725 obs)
## Primary splits:
## expense < 0.5 to the right, improve=0.2332136, (0 missing)
## CryoSleep < 0.5 to the left, improve=0.2095384, (0 missing)
## RoomService < 0.5 to the right, improve=0.1254121, (0 missing)
## Spa < 0.5 to the right, improve=0.1142180, (0 missing)
## VRDeck < 0.5 to the right, improve=0.1084074, (0 missing)
## Surrogate splits:
## CryoSleep < 0.5 to the left, agree=0.932, adj=0.837, (0 split)
## Spa < 0.5 to the right, agree=0.784, adj=0.484, (0 split)
## FoodCourt < 0.5 to the right, agree=0.770, adj=0.451, (0 split)
## VRDeck < 0.5 to the right, agree=0.768, adj=0.444, (0 split)
## RoomService < 0.5 to the right, agree=0.757, adj=0.419, (0 split)
##
## Node number 2: 3795 observations, complexity param=0.02807107
## mean=0.2990777, MSE=0.2096302
## left son=4 (3243 obs) right son=5 (552 obs)
## Primary splits:
## FoodCourt < 1331 to the left, improve=0.05751186, (0 missing)
## ShoppingMall < 627.5 to the left, improve=0.04530804, (0 missing)
## RoomService < 365.5 to the right, improve=0.04440387, (0 missing)
## Spa < 257.5 to the right, improve=0.03347453, (0 missing)
## VRDeck < 721 to the right, improve=0.02290457, (0 missing)
## Surrogate splits:
## expense < 5981 to the left, agree=0.885, adj=0.210, (0 split)
## deck splits as RRRLLLLL, agree=0.884, adj=0.203, (0 split)
## HomePlanet splits as LRL, agree=0.878, adj=0.159, (0 split)
## Spa < 8955.5 to the left, agree=0.856, adj=0.009, (0 split)
## VRDeck < 11692 to the left, agree=0.856, adj=0.009, (0 split)
##
## Node number 3: 2725 observations, complexity param=0.04905507
## mean=0.7886239, MSE=0.1666963
## left son=6 (1448 obs) right son=7 (1277 obs)
## Primary splits:
## deck splits as RRRRLRL-, improve=0.17601740, (0 missing)
## HomePlanet splits as LRR, improve=0.12704810, (0 missing)
## Destination splits as RLLR, improve=0.02774089, (0 missing)
## CryoSleep < 0.5 to the left, improve=0.02268236, (0 missing)
## side splits as LR, improve=0.01498707, (0 missing)
## Surrogate splits:
## HomePlanet splits as LRR, agree=0.934, adj=0.860, (0 split)
## Age < 24.5 to the left, agree=0.626, adj=0.201, (0 split)
## Destination splits as RLLR, agree=0.596, adj=0.137, (0 split)
## withgroup < 0.5 to the left, agree=0.583, adj=0.110, (0 split)
## VIP < 0.5 to the left, agree=0.538, adj=0.014, (0 split)
##
## Node number 4: 3243 observations, complexity param=0.02784643
## mean=0.2537774, MSE=0.1893744
## left son=8 (2577 obs) right son=9 (666 obs)
## Primary splits:
## ShoppingMall < 541.5 to the left, improve=0.07390355, (0 missing)
## RoomService < 365.5 to the right, improve=0.03464407, (0 missing)
## Spa < 240.5 to the right, improve=0.03327259, (0 missing)
## VRDeck < 114 to the right, improve=0.02784287, (0 missing)
## expense < 2867.5 to the right, improve=0.01811461, (0 missing)
## Surrogate splits:
## expense < 18644 to the left, agree=0.795, adj=0.003, (0 split)
##
## Node number 5: 552 observations, complexity param=0.01770479
## mean=0.5652174, MSE=0.2457467
## left son=10 (123 obs) right son=11 (429 obs)
## Primary splits:
## Spa < 1372.5 to the right, improve=0.21272970, (0 missing)
## VRDeck < 1063.5 to the right, improve=0.17089500, (0 missing)
## expense < 5395 to the right, improve=0.06611166, (0 missing)
## FoodCourt < 2513 to the left, improve=0.02780045, (0 missing)
## deck splits as LLRLLRRL, improve=0.02705865, (0 missing)
## Surrogate splits:
## expense < 12647 to the right, agree=0.790, adj=0.057, (0 split)
## Age < 13.5 to the left, agree=0.779, adj=0.008, (0 split)
## RoomService < 3895.5 to the right, agree=0.779, adj=0.008, (0 split)
##
## Node number 6: 1448 observations
## mean=0.6277624, MSE=0.2336768
##
## Node number 7: 1277 observations
## mean=0.9710258, MSE=0.02813466
##
## Node number 8: 2577 observations, complexity param=0.01194576
## mean=0.193636, MSE=0.1561411
## left son=16 (2067 obs) right son=17 (510 obs)
## Primary splits:
## FoodCourt < 456.5 to the left, improve=0.04838893, (0 missing)
## expense < 1447.5 to the right, improve=0.04016842, (0 missing)
## HomePlanet splits as RLL, improve=0.02539308, (0 missing)
## Spa < 537.5 to the right, improve=0.01895890, (0 missing)
## RoomService < 400.5 to the right, improve=0.01706521, (0 missing)
## Surrogate splits:
## expense < 12373 to the left, agree=0.804, adj=0.008, (0 split)
## Spa < 13650 to the left, agree=0.803, adj=0.006, (0 split)
## VRDeck < 10123.5 to the left, agree=0.802, adj=0.002, (0 split)
## deck splits as LLLLLLLR, agree=0.802, adj=0.002, (0 split)
##
## Node number 9: 666 observations
## mean=0.4864865, MSE=0.2498174
##
## Node number 10: 123 observations
## mean=0.1382114, MSE=0.119109
##
## Node number 11: 429 observations, complexity param=0.01381587
## mean=0.6876457, MSE=0.2147891
## left son=22 (143 obs) right son=23 (286 obs)
## Primary splits:
## VRDeck < 611 to the right, improve=0.24438400, (0 missing)
## Spa < 225 to the right, improve=0.05300377, (0 missing)
## FoodCourt < 3119.5 to the left, improve=0.05168044, (0 missing)
## side splits as LR, improve=0.04004566, (0 missing)
## RoomService < 1719.5 to the right, improve=0.03930897, (0 missing)
## Surrogate splits:
## expense < 6032 to the right, agree=0.702, adj=0.105, (0 split)
## Age < 53.5 to the right, agree=0.674, adj=0.021, (0 split)
## FoodCourt < 12128.5 to the right, agree=0.671, adj=0.014, (0 split)
##
## Node number 16: 2067 observations
## mean=0.1504596, MSE=0.1278215
##
## Node number 17: 510 observations, complexity param=0.01164524
## mean=0.3686275, MSE=0.2327413
## left son=34 (204 obs) right son=35 (306 obs)
## Primary splits:
## expense < 1447.5 to the right, improve=0.15990760, (0 missing)
## VRDeck < 86.5 to the right, improve=0.10060710, (0 missing)
## HomePlanet splits as RLL, improve=0.07952538, (0 missing)
## Spa < 500 to the right, improve=0.07353369, (0 missing)
## deck splits as LLLLRRRL, improve=0.05075444, (0 missing)
## Surrogate splits:
## HomePlanet splits as RLL, agree=0.873, adj=0.681, (0 split)
## deck splits as LLLLRRRL, agree=0.839, adj=0.598, (0 split)
## VRDeck < 213.5 to the right, agree=0.818, adj=0.544, (0 split)
## Spa < 219.5 to the right, agree=0.792, adj=0.480, (0 split)
## FoodCourt < 907 to the right, agree=0.722, adj=0.304, (0 split)
##
## Node number 22: 143 observations
## mean=0.3636364, MSE=0.231405
##
## Node number 23: 286 observations
## mean=0.8496503, MSE=0.1277446
##
## Node number 34: 204 observations
## mean=0.1323529, MSE=0.1148356
##
## Node number 35: 306 observations
## mean=0.5261438, MSE=0.2493165
rpart.plot(fit_tree)
preds= predict(fit_tree, newdata = train_test[, -c(1,12)]) %>%
data.frame()
head(preds)
## .
## 1 0.1504596
## 2 0.1382114
## 3 0.8496503
## 4 0.1504596
## 5 0.4864865
## 6 0.1504596
Transported_pred_tree = ifelse(preds$. >0.5, 1, 0)
cm = table(Transported_train_test, Transported_pred_tree)
cm
## Transported_pred_tree
## Transported_train_test 0 1
## 0 807 272
## 1 237 857
(807 +857)/(807 +857 +272 +237)
## [1] 0.7657616
fit_tree = rpart::rpart(Transported ~ ., data = train_c[, -1])
preds= predict(fit_tree, newdata = test_c) %>%
data.frame()
Transported_pred_tree = ifelse(preds$. > 0.5, TRUE, FALSE)
Transported = as.character(Transported_pred_tree)
PassengerId = test_c$PassengerId
Transported = as.vector(Transported)
sample_submission = cbind(PassengerId, Transported)
sample_submission = as.data.frame(sample_submission)
sample_submission$Transported = str_to_title(sample_submission$Transported)
write.csv(sample_submission, "sub_tree.csv", row.names = FALSE, quote = FALSE)
fit_forest = randomForest(Transported ~ ., data = train_train[, -1])
fit_forest$importance
## IncNodePurity
## HomePlanet 48.125593
## CryoSleep 107.989293
## Destination 30.575915
## Age 91.836683
## VIP 2.516955
## RoomService 125.936343
## FoodCourt 113.623386
## ShoppingMall 96.731658
## Spa 131.332461
## VRDeck 122.410460
## withgroup 12.925079
## deck 90.196804
## side 22.439063
## expense 238.242509
varImpPlot(fit_forest)
preds= predict(fit_forest, newdata = train_test[, -c(1,12)]) %>%
data.frame()
head(preds)
## .
## 1 0.1050685
## 2 0.1614667
## 3 0.8062586
## 4 0.1920834
## 5 0.6907569
## 6 0.1659998
Transported_pred_forest = ifelse(preds$. >0.5, 1, 0)
cm = table(Transported_train_test, Transported_pred_forest )
cm
## Transported_pred_forest
## Transported_train_test 0 1
## 0 823 256
## 1 175 919
(823+919)/(823+256+175+919)
## [1] 0.8016567
fit_forest = randomForest(Transported ~ ., data = train_c[, -1])
preds = predict(fit_forest, newdata = test_c) %>%
data.frame()
Transported_pred_forest = ifelse(preds$. > 0.5, TRUE, FALSE)
Transported = as.character(Transported_pred_forest)
PassengerId = test_c$PassengerId
Transported = as.vector(Transported)
sample_submission = cbind(PassengerId, Transported)
sample_submission = as.data.frame(sample_submission)
sample_submission$Transported = str_to_title(sample_submission$Transported)
write.csv(sample_submission, "sub_forest.csv", row.names = FALSE, quote = FALSE)