EKONOMETRI FINAL PROJESI

MUHANNA EL FATTUH

2024-06-11


library(rmarkdown)
library(explore)
library(tidyverse)
library(readr)
test_c <- read_csv("test_c.csv")
library(readr)
train_c <- read_csv("train_c.csv")
describe_all(train_c)
## # A tibble: 16 × 8
##    variable     type     na na_pct unique   min    mean   max
##    <chr>        <chr> <int>  <dbl>  <int> <dbl>   <dbl> <dbl>
##  1 PassengerId  chr       0      0   8693    NA   NA       NA
##  2 HomePlanet   chr       0      0      3    NA   NA       NA
##  3 CryoSleep    lgl       0      0      2     0    0.35     1
##  4 Destination  chr       0      0      4    NA   NA       NA
##  5 Age          dbl       0      0     90     0   28.8     79
##  6 VIP          lgl       0      0      2     0    0.02     1
##  7 RoomService  dbl       0      0   1273     0  220.   14327
##  8 FoodCourt    dbl       0      0   1507     0  448.   29813
##  9 ShoppingMall dbl       0      0   1115     0  170.   23492
## 10 Spa          dbl       0      0   1327     0  305.   22408
## 11 VRDeck       dbl       0      0   1306     0  298.   24133
## 12 Transported  lgl       0      0      2     0    0.5      1
## 13 withgroup    dbl       0      0      2     0    0.45     1
## 14 deck         chr       0      0      8    NA   NA       NA
## 15 side         chr       0      0      2    NA   NA       NA
## 16 expense      dbl       0      0   2336     0 1441.   35987
describe_all(test_c)
## # A tibble: 15 × 8
##    variable     type     na na_pct unique   min    mean   max
##    <chr>        <chr> <int>  <dbl>  <int> <dbl>   <dbl> <dbl>
##  1 PassengerId  chr       0      0   4277    NA   NA       NA
##  2 HomePlanet   chr       0      0      3    NA   NA       NA
##  3 CryoSleep    lgl       0      0      2     0    0.36     1
##  4 Destination  chr       0      0      4    NA   NA       NA
##  5 Age          dbl       0      0     89     0   28.7     79
##  6 VIP          lgl       0      0      2     0    0.02     1
##  7 RoomService  dbl       0      0    842     0  215.   11567
##  8 FoodCourt    dbl       0      0    902     0  429.   25273
##  9 ShoppingMall dbl       0      0    715     0  173.    8292
## 10 Spa          dbl       0      0    833     0  296.   19844
## 11 VRDeck       dbl       0      0    796     0  305.   22272
## 12 withgroup    dbl       0      0      2     0    0.45     1
## 13 deck         chr       0      0      8    NA   NA       NA
## 14 side         chr       0      0      2    NA   NA       NA
## 15 expense      dbl       0      0   1437     0 1418.   33666
train_c$HomePlanet <- as.factor(train_c$HomePlanet)
train_c$Destination <- as.factor(train_c$Destination)
train_c$deck <- as.factor(train_c$deck)
train_c$side <- as.factor(train_c$side)
test_c$HomePlanet <- as.factor(test_c$HomePlanet)
test_c$Destination <- as.factor(test_c$Destination)
test_c$deck <- as.factor(test_c$deck)
test_c$side <- as.factor(test_c$side)
library(DataExplorer)
create_report(train_c)
##   |                                             |                                     |   0%  |                                             |.                                    |   2%                                   |                                             |..                                   |   5% [global_options]                  |                                             |...                                  |   7%                                   |                                             |....                                 |  10% [introduce]                       |                                             |....                                 |  12%                                   |                                             |.....                                |  14% [plot_intro]                      |                                             |......                               |  17%                                   |                                             |.......                              |  19% [data_structure]                  |                                             |........                             |  21%                                   |                                             |.........                            |  24% [missing_profile]                 |                                             |..........                           |  26%                                   |                                             |...........                          |  29% [univariate_distribution_header]  |                                             |...........                          |  31%                                   |                                             |............                         |  33% [plot_histogram]                  |                                             |.............                        |  36%                                   |                                             |..............                       |  38% [plot_density]                    |                                             |...............                      |  40%                                   |                                             |................                     |  43% [plot_frequency_bar]              |                                             |.................                    |  45%                                   |                                             |..................                   |  48% [plot_response_bar]               |                                             |..................                   |  50%                                   |                                             |...................                  |  52% [plot_with_bar]                   |                                             |....................                 |  55%                                   |                                             |.....................                |  57% [plot_normal_qq]                  |                                             |......................               |  60%                                   |                                             |.......................              |  62% [plot_response_qq]                |                                             |........................             |  64%                                   |                                             |.........................            |  67% [plot_by_qq]                      |                                             |..........................           |  69%                                   |                                             |..........................           |  71% [correlation_analysis]            |                                             |...........................          |  74%                                   |                                             |............................         |  76% [principal_component_analysis]    |                                             |.............................        |  79%                                   |                                             |..............................       |  81% [bivariate_distribution_header]   |                                             |...............................      |  83%                                   |                                             |................................     |  86% [plot_response_boxplot]           |                                             |.................................    |  88%                                   |                                             |.................................    |  90% [plot_by_boxplot]                 |                                             |..................................   |  93%                                   |                                             |...................................  |  95% [plot_response_scatterplot]       |                                             |.................................... |  98%                                   |                                             |.....................................| 100% [plot_by_scatterplot]                                                                                                                                      
## "C:/Program Files/RStudio/resources/app/bin/quarto/bin/tools/pandoc" +RTS -K512m -RTS "C:\Users\User\OneDrive\AD0F~1\EKONOM~2\REPORT~1.MD" --to html4 --from markdown+autolink_bare_uris+tex_math_single_backslash --output pandoc41cc56aa574d.html --lua-filter "C:\Users\User\AppData\Local\R\win-library\4.3\rmarkdown\rmarkdown\lua\pagebreak.lua" --lua-filter "C:\Users\User\AppData\Local\R\win-library\4.3\rmarkdown\rmarkdown\lua\latex-div.lua" --embed-resources --standalone --variable bs3=TRUE --section-divs --table-of-contents --toc-depth 6 --template "C:\Users\User\AppData\Local\R\win-library\4.3\rmarkdown\rmd\h\default.html" --no-highlight --variable highlightjs=1 --variable theme=yeti --mathjax --variable "mathjax-url=https://mathjax.rstudio.com/latest/MathJax.js?config=TeX-AMS-MML_HTMLorMML" --include-in-header "C:\Users\User\AppData\Local\Temp\RtmpG682s0\rmarkdown-str41cc6fa665cc.html"
model = lm(Transported ~ ., data = train_c[, 2:16])
summary(model)
## 
## Call:
## lm(formula = Transported ~ ., data = train_c[, 2:16])
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.42802 -0.30952 -0.03188  0.28870  1.79172 
## 
## Coefficients: (1 not defined because of singularities)
##                            Estimate Std. Error t value Pr(>|t|)    
## (Intercept)               3.147e-01  4.000e-02   7.866 4.09e-15 ***
## HomePlanetEuropa          2.150e-01  2.883e-02   7.457 9.72e-14 ***
## HomePlanetMars            9.617e-02  1.475e-02   6.522 7.33e-11 ***
## CryoSleepTRUE             3.811e-01  1.152e-02  33.082  < 2e-16 ***
## DestinationPSO J318.5-22 -4.449e-02  1.803e-02  -2.468 0.013598 *  
## DestinationTRAPPIST-1e   -5.140e-02  1.133e-02  -4.538 5.76e-06 ***
## DestinationTRAPPIST-le    7.513e-03  3.142e-02   0.239 0.811033    
## Age                      -2.182e-03  3.201e-04  -6.815 1.00e-11 ***
## VIPTRUE                  -3.757e-02  2.974e-02  -1.263 0.206555    
## RoomService              -1.176e-04  7.055e-06 -16.674  < 2e-16 ***
## FoodCourt                 4.281e-05  3.054e-06  14.017  < 2e-16 ***
## ShoppingMall              7.969e-05  7.457e-06  10.686  < 2e-16 ***
## Spa                      -8.675e-05  4.109e-06 -21.110  < 2e-16 ***
## VRDeck                   -8.280e-05  4.115e-06 -20.120  < 2e-16 ***
## withgroup                 1.958e-02  9.293e-03   2.107 0.035134 *  
## deckB                     1.107e-01  2.874e-02   3.852 0.000118 ***
## deckC                     1.477e-01  2.900e-02   5.094 3.58e-07 ***
## deckD                     5.090e-02  3.478e-02   1.463 0.143372    
## deckE                     8.494e-03  3.622e-02   0.235 0.814567    
## deckF                     1.031e-01  3.709e-02   2.778 0.005475 ** 
## deckG                     5.791e-02  3.866e-02   1.498 0.134218    
## deckT                     6.610e-02  1.815e-01   0.364 0.715784    
## sideS                     8.542e-02  8.629e-03   9.899  < 2e-16 ***
## expense                          NA         NA      NA       NA    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.4012 on 8670 degrees of freedom
## Multiple R-squared:  0.3578, Adjusted R-squared:  0.3561 
## F-statistic: 219.5 on 22 and 8670 DF,  p-value: < 2.2e-16
library(caTools)
set.seed(123)
split = sample.split(train_c$Transported, SplitRatio = 0.75)
train_train = subset(train_c, split == TRUE)
train_test = subset(train_c, split == FALSE)
regresyon = lm(Transported ~ ., data = train_train[, -c(1)])
summary(regresyon)
## 
## Call:
## lm(formula = Transported ~ ., data = train_train[, -c(1)])
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.50155 -0.31213 -0.03018  0.29171  1.76723 
## 
## Coefficients: (1 not defined because of singularities)
##                            Estimate Std. Error t value Pr(>|t|)    
## (Intercept)               3.482e-01  4.626e-02   7.527 5.89e-14 ***
## HomePlanetEuropa          1.971e-01  3.358e-02   5.871 4.55e-09 ***
## HomePlanetMars            8.600e-02  1.699e-02   5.061 4.29e-07 ***
## CryoSleepTRUE             3.772e-01  1.328e-02  28.402  < 2e-16 ***
## DestinationPSO J318.5-22 -6.107e-02  2.077e-02  -2.940  0.00330 ** 
## DestinationTRAPPIST-1e   -5.426e-02  1.309e-02  -4.144 3.46e-05 ***
## DestinationTRAPPIST-le   -9.053e-03  3.649e-02  -0.248  0.80407    
## Age                      -2.339e-03  3.725e-04  -6.279 3.63e-10 ***
## VIPTRUE                  -1.616e-02  3.384e-02  -0.478  0.63290    
## RoomService              -1.175e-04  7.790e-06 -15.088  < 2e-16 ***
## FoodCourt                 3.905e-05  3.467e-06  11.262  < 2e-16 ***
## ShoppingMall              8.282e-05  8.415e-06   9.842  < 2e-16 ***
## Spa                      -8.543e-05  4.704e-06 -18.160  < 2e-16 ***
## VRDeck                   -8.216e-05  4.777e-06 -17.197  < 2e-16 ***
## withgroup                 1.601e-02  1.076e-02   1.488  0.13688    
## deckB                     1.049e-01  3.290e-02   3.188  0.00144 ** 
## deckC                     1.441e-01  3.318e-02   4.343 1.43e-05 ***
## deckD                     3.223e-02  3.974e-02   0.811  0.41744    
## deckE                    -2.463e-02  4.213e-02  -0.585  0.55884    
## deckF                     8.346e-02  4.290e-02   1.945  0.05176 .  
## deckG                     4.007e-02  4.472e-02   0.896  0.37029    
## deckT                     5.680e-02  1.822e-01   0.312  0.75526    
## sideS                     8.895e-02  9.984e-03   8.909  < 2e-16 ***
## expense                          NA         NA      NA       NA    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.4013 on 6497 degrees of freedom
## Multiple R-squared:  0.358,  Adjusted R-squared:  0.3558 
## F-statistic: 164.7 on 22 and 6497 DF,  p-value: < 2.2e-16
reg_tahmin = predict(regresyon, newdata = train_test[, -c(1,12)])
reg_transported_tahmin = ifelse(reg_tahmin > 0.5, 1, 0)
transported_gercek = ifelse(train_test[12] == TRUE, 1, 0)
cm = table(transported_gercek, reg_transported_tahmin)
cm
##                   reg_transported_tahmin
## transported_gercek   0   1
##                  0 902 177
##                  1 323 771
(902 + 771)/(902 + 771 + 323 + 177)
## [1] 0.7699034
reg_tahmin = predict(regresyon, newdata = train_train[, -c(1,12)])
reg_transported_tahmin = ifelse(reg_tahmin > 0.5, 1, 0)
transported_gercek = ifelse(train_train[12] == TRUE, 1, 0)
cm = table(transported_gercek, reg_transported_tahmin)
cm
##                   reg_transported_tahmin
## transported_gercek    0    1
##                  0 2682  554
##                  1  946 2338
(2682 + 2338)/(2682 + 2338 + 554 + 946)
## [1] 0.7699387
reg_tahmin_bd = predict(model, newdata = test_c[, -c(1)])
reg_transported_test_c_tahmin = ifelse(reg_tahmin_bd > 0.5, TRUE, FALSE)
transported = as.character(reg_transported_test_c_tahmin)
PassengerId = test_c$PassengerId
Transported=as.vector(transported)
submission_regresyon = cbind(PassengerId, Transported)
submission_regresyon = as.data.frame(submission_regresyon)
library(stringr)
submission_regresyon$transported <- str_to_title(submission_regresyon$Transported)
write.csv(submission_regresyon, "siniftahmini.csv", row.names = FALSE, quote = FALSE)

train_c_log = train_c %>% mutate_at(c(5, 7:11, 16), ~ log(1 + .))
test_c_log = test_c %>% mutate_at(c(5, 7:11, 15), ~ log(1 + .))
modellog = lm(Transported ~ 1 + HomePlanet + Destination + deck + side + CryoSleep + Age + RoomService + FoodCourt + ShoppingMall + Spa +  VRDeck, data = train_c_log)
reg_tahmin_log = predict(modellog, newdata = test_c_log[, -c(1)])
reg_transported_test_c_tahmin_log = ifelse(reg_tahmin_log > 0.5, TRUE, FALSE)
head(reg_transported_test_c_tahmin_log)
##     1     2     3     4     5     6 
##  TRUE FALSE  TRUE  TRUE  TRUE FALSE
Transported = as.character(reg_transported_test_c_tahmin_log)
PassengerId = test_c_log$PassengerId
Transported = as.vector(Transported)
submission_regresyon_log = cbind(PassengerId, Transported)
submission_regresyon_log =as.data.frame(submission_regresyon_log)
submission_regresyon_log$Transported = str_to_title(submission_regresyon_log$Transported)
write.csv(submission_regresyon_log,"submissyon_regresyon_log.csv", row.names = FALSE,quote = FALSE)

model = lm(y ~ x1 + x2 + x3, data = df)

logistic = glm(formula = Transported ~ .,
              family = binomial,
              data = train_train[, -c(1)])
summary(logistic)
## 
## Call:
## glm(formula = Transported ~ ., family = binomial, data = train_train[, 
##     -c(1)])
## 
## Coefficients: (1 not defined because of singularities)
##                            Estimate Std. Error z value Pr(>|z|)    
## (Intercept)              -4.607e-01  3.588e-01  -1.284  0.19917    
## HomePlanetEuropa          1.556e+00  2.674e-01   5.819 5.91e-09 ***
## HomePlanetMars            5.323e-01  1.103e-01   4.827 1.39e-06 ***
## CryoSleepTRUE             1.285e+00  9.185e-02  13.987  < 2e-16 ***
## DestinationPSO J318.5-22 -5.125e-01  1.297e-01  -3.950 7.80e-05 ***
## DestinationTRAPPIST-1e   -4.904e-01  9.441e-02  -5.194 2.05e-07 ***
## DestinationTRAPPIST-le   -1.471e-01  2.436e-01  -0.604  0.54580    
## Age                      -8.481e-03  2.461e-03  -3.446  0.00057 ***
## VIPTRUE                  -1.650e-01  2.916e-01  -0.566  0.57149    
## RoomService              -1.730e-03  1.136e-04 -15.225  < 2e-16 ***
## FoodCourt                 4.581e-04  4.460e-05  10.271  < 2e-16 ***
## ShoppingMall              5.289e-04  7.559e-05   6.997 2.61e-12 ***
## Spa                      -1.969e-03  1.164e-04 -16.913  < 2e-16 ***
## VRDeck                   -1.902e-03  1.161e-04 -16.392  < 2e-16 ***
## withgroup                 1.206e-01  7.192e-02   1.677  0.09346 .  
## deckB                     1.153e+00  2.912e-01   3.961 7.47e-05 ***
## deckC                     2.481e+00  3.325e-01   7.463 8.48e-14 ***
## deckD                     6.978e-01  3.262e-01   2.139  0.03242 *  
## deckE                     1.029e-01  3.378e-01   0.305  0.76065    
## deckF                     7.739e-01  3.420e-01   2.263  0.02363 *  
## deckG                     3.766e-01  3.511e-01   1.073  0.28345    
## deckT                    -1.184e-01  1.866e+00  -0.063  0.94942    
## sideS                     5.988e-01  6.663e-02   8.987  < 2e-16 ***
## expense                          NA         NA      NA       NA    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 9038.3  on 6519  degrees of freedom
## Residual deviance: 5600.2  on 6497  degrees of freedom
## AIC: 5646.2
## 
## Number of Fisher Scoring iterations: 7
logistic_tahmin = predict(logistic, newdata = train_test[, -c(1,12)])
head(logistic_tahmin)
##           1           2           3           4           5           6 
##  -1.1175750 -11.0810548   0.9556805  -1.6375160  -0.4294310  -1.6887335
logistic_transported_tahmin = ifelse(logistic_tahmin > 0.5, 1, 0)
head(logistic_transported_tahmin)
## 1 2 3 4 5 6 
## 0 0 1 0 0 0
transported_gercek = ifelse(train_test[12] == TRUE, 1, 0)
library(tidymodels)
result = data.frame(cbind(transported_gercek, logistic_transported_tahmin))
result$Transported = as.factor(result$Transported)
result$logistic_transported_tahmin = as.factor(result$logistic_transported_tahmin)
accuracy(result, truth = Transported, estimate = logistic_transported_tahmin)
## # A tibble: 1 × 3
##   .metric  .estimator .estimate
##   <chr>    <chr>          <dbl>
## 1 accuracy binary         0.786
conf_mat(result, truth = Transported, estimate = logistic_transported_tahmin)
##           Truth
## Prediction   0   1
##          0 918 305
##          1 161 789
library(caret)
cm = table(transported_gercek, logistic_transported_tahmin)
cm
##                   logistic_transported_tahmin
## transported_gercek   0   1
##                  0 918 161
##                  1 305 789
(918 +789)/(918 +161 + 305 +789)
## [1] 0.7855499
confusionMatrix(as.factor(transported_gercek), as.factor(logistic_transported_tahmin))
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 918 161
##          1 305 789
##                                           
##                Accuracy : 0.7855          
##                  95% CI : (0.7677, 0.8026)
##     No Information Rate : 0.5628          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.5715          
##                                           
##  Mcnemar's Test P-Value : 3.488e-11       
##                                           
##             Sensitivity : 0.7506          
##             Specificity : 0.8305          
##          Pos Pred Value : 0.8508          
##          Neg Pred Value : 0.7212          
##              Prevalence : 0.5628          
##          Detection Rate : 0.4225          
##    Detection Prevalence : 0.4965          
##       Balanced Accuracy : 0.7906          
##                                           
##        'Positive' Class : 0               
## 
(918+789)/(918+789+161+305)
## [1] 0.7855499
logistic_bd = glm(formula = Transported ~ .,
                  family = binomial,
                  data = train_c[, -c(1)])
logistic_tahmin_bd = predict(logistic_bd, newdata = test_c[, -c(1)])
logistic_transported_test_c_tahmin = ifelse(logistic_tahmin_bd >0.5, TRUE, FALSE )
Transported = as.character(logistic_transported_test_c_tahmin)
PassengerId = test_c$PassengerId
Transported=as.vector(Transported)
submission_logistic = cbind(PassengerId, Transported)
submission_logistic = as.data.frame(submission_logistic)
submission_logistic$Transported = str_to_title(submission_logistic$Transported)
write.csv(submission_logistic, "submission_logistic.csv", row.names =FALSE, quote=FALSE)

logistic_bd = glm(formula = Transported ~ .,
                  family = binomial,
                  data = train_c_log[, -c(1)])
logistic_tahmin_bd = predict(logistic_bd, newdata = test_c_log[, -c(1)])
logistic_transported_test_c_tahmin = ifelse(logistic_tahmin_bd >0.5, TRUE, FALSE )
Transported = as.character(logistic_transported_test_c_tahmin)
PassengerId = test_c$PassengerId
Transported=as.vector(Transported)
submission_llogistic = cbind(PassengerId, Transported)
submission_llogistic = as.data.frame(submission_logistic)
submission_llogistic$Transported = str_to_title(submission_logistic$Transported)
write.csv(submission_logistic, "submission_llogistic.csv", row.names =FALSE, quote=FALSE)

Naive Bayes

library(e1071)
fit_nb = naiveBayes(Transported ~ ., data =train_train[, -1])
fit_nb
## 
## Naive Bayes Classifier for Discrete Predictors
## 
## Call:
## naiveBayes.default(x = X, y = Y, laplace = laplace)
## 
## A-priori probabilities:
## Y
##    FALSE     TRUE 
## 0.496319 0.503681 
## 
## Conditional probabilities:
##        HomePlanet
## Y           Earth    Europa      Mars
##   FALSE 0.6236094 0.1739802 0.2024104
##   TRUE  0.4631547 0.3279537 0.2088916
## 
##        CryoSleep
## Y           FALSE      TRUE
##   FALSE 0.8702101 0.1297899
##   TRUE  0.4336175 0.5663825
## 
##        Destination
## Y       55 Cancri e PSO J318.5-22 TRAPPIST-1e TRAPPIST-le
##   FALSE  0.16069221    0.09363412  0.72404203  0.02163164
##   TRUE   0.24969549    0.09043849  0.64007308  0.01979294
## 
##        Age
## Y           [,1]     [,2]
##   FALSE 30.01092 13.45222
##   TRUE  27.85527 14.87646
## 
##        VIP
## Y            FALSE       TRUE
##   FALSE 0.97126082 0.02873918
##   TRUE  0.98142509 0.01857491
## 
##        RoomService
## Y            [,1]     [,2]
##   FALSE 402.20365 916.6547
##   TRUE   56.93484 246.8105
## 
##        FoodCourt
## Y           [,1]     [,2]
##   FALSE 396.2923 1258.123
##   TRUE  514.8018 1918.620
## 
##        ShoppingMall
## Y           [,1]     [,2]
##   FALSE 160.2250 432.1103
##   TRUE  182.7135 748.5867
## 
##        Spa
## Y            [,1]      [,2]
##   FALSE 561.16193 1554.2636
##   TRUE   61.11571  264.0556
## 
##        VRDeck
## Y            [,1]      [,2]
##   FALSE 533.98733 1542.8703
##   TRUE   69.07186  295.0542
## 
##        withgroup
## Y            [,1]      [,2]
##   FALSE 0.3952410 0.4889780
##   TRUE  0.4960414 0.5000605
## 
##        deck
## Y                  A            B            C            D            E
##   FALSE 0.0296662546 0.0540791100 0.0574783684 0.0661310260 0.1344252163
##   TRUE  0.0310596833 0.1385505481 0.1199756395 0.0490255786 0.0676004872
##        deck
## Y                  F            G            T
##   FALSE 0.3637206428 0.2932632880 0.0012360939
##   TRUE  0.2828867235 0.3105968331 0.0003045067
## 
##        side
## Y               P         S
##   FALSE 0.5571693 0.4428307
##   TRUE  0.4448843 0.5551157
## 
##        expense
## Y            [,1]     [,2]
##   FALSE 2053.8702 3224.219
##   TRUE   884.6376 2307.962
pred_nb = predict(fit_nb, newdata = train_test[, -c(1, 12)], type = "raw") %>%
  data.frame()
head(pred_nb)
##       FALSE.         TRUE.
## 1 0.22238032  7.776197e-01
## 2 1.00000000 8.663595e-135
## 3 0.02729931  9.727007e-01
## 4 0.59276719  4.072328e-01
## 5 0.02410104  9.758990e-01
## 6 0.38468138  6.153186e-01
Transported_pred_nb = ifelse(pred_nb$TRUE. > 0.5, 1, 0)
head(Transported_pred_nb)
## [1] 1 0 1 0 1 1
Transported_train_test = ifelse(train_test[12] == TRUE, 1, 0)
head(Transported_train_test)
##      Transported
## [1,]           1
## [2,]           0
## [3,]           1
## [4,]           0
## [5,]           0
## [6,]           0
cm = table(Transported_train_test, Transported_pred_nb)
cm
##                       Transported_pred_nb
## Transported_train_test    0    1
##                      0  480  599
##                      1   76 1018
(480 + 1018)/(480 +599 +76 +1018)
## [1] 0.6893695
nb = naiveBayes(Transported ~ ., data = train_c[, -1])
pred_nb = predict(nb, newdata = test_c, type = "raw") %>% data.frame()
Transported_pred_nb = ifelse(pred_nb$TRUE. > 0.5, TRUE, FALSE)
Transported = as.character(Transported_pred_nb)
PassengerId = test_c$PassengerId
Transported = as.vector(Transported)
sample_submission = cbind(PassengerId, Transported)
sample_submission = as.data.frame(sample_submission)
sample_submission$Transported = str_to_title(sample_submission$Transported)
write.csv(sample_submission, "sub_nb.csv", row.names = FALSE, quote = FALSE)

SVM

fit_svm = svm(Transported ~ ., data = train_train[, -1], 
              type= 'C-classification',
              kernel = 'linear')

preds = predict(fit_svm, newdata = train_test[, -c(1, 12)], type = "raw") %>%
  data.frame()
head(preds)
##       .
## 1 FALSE
## 2 FALSE
## 3  TRUE
## 4 FALSE
## 5 FALSE
## 6 FALSE
Transported_pred_svm = ifelse(preds$. == TRUE, 1, 0)
cm = table(Transported_train_test, Transported_pred_svm)
cm
##                       Transported_pred_svm
## Transported_train_test   0   1
##                      0 836 243
##                      1 180 914
(836 + 914)/(836 +243 +180 + 914)
## [1] 0.8053382
fit_svm = svm(Transported ~ ., data = train_c[, -1],
              type= 'C-classification',
              kernel = 'linear')

preds = predict(fit_svm, newdata = test_c, type ="raw")%>%
  data.frame()
head(preds)
##       .
## 1  TRUE
## 2 FALSE
## 3  TRUE
## 4  TRUE
## 5  TRUE
## 6  TRUE
Transported_pred_svm = ifelse(preds$. == TRUE, TRUE, FALSE)
Transported = as.character(Transported_pred_svm)
PassengerId = test_c$PassengerId
Transported = as.vector(Transported)
sample_submission = cbind(PassengerId, Transported)
sample_submission = as.data.frame(sample_submission)
sample_submission$Transported = str_to_title(sample_submission$Transported)
write.csv(sample_submission, "sub_svm.csv", row.names = FALSE, quote = FALSE)

SVM RADIAL

fit_svm = svm(Transported ~ ., data = train_train[, -1],
              type= 'C-classification',
              kernel = 'radial')

preds = predict(fit_svm, newdata = train_test[, -c(1, 12)], type ="raw")%>%
  data.frame()
Transported_pred_svm = ifelse(preds$. == TRUE, 1, 0)
cm = table(Transported_train_test, Transported_pred_svm)
cm
##                       Transported_pred_svm
## Transported_train_test   0   1
##                      0 849 230
##                      1 202 892
(849 +892)/(892 +849 +202 +230)
## [1] 0.8011965
fit_svm = svm(Transported ~ ., data = train_c[, -1],
              type= 'C-classification',
              kernel = 'radial')

preds = predict(fit_svm, newdata = test_c, type ="raw")%>%
  data.frame()
Transported_pred_svm = ifelse(preds$. == TRUE, TRUE, FALSE)
Transported = as.character(Transported_pred_svm)
PassengerId = test_c$PassengerId
Transported = as.vector(Transported)
sample_submission = cbind(PassengerId, Transported)
sample_submission = as.data.frame(sample_submission)
sample_submission$Transported = str_to_title(sample_submission$Transported)
write.csv(sample_submission, "sub_svm_radial.csv", row.names = FALSE, quote = FALSE)

P = ggplot(train_train, aes(x=HomePlanet, y=deck, color=factor(Transported))) + 
  geom_point(aes(shape=factor(Transported)), size=3) +
  scale_color_viridis_d() + labs(title = "", x="HomePlanet", y="deck") + theme_minimal() + theme(legend.position ="top")
P

library(rpart)
library(rpart.plot)
library(randomForest)
library(caret)
fit_tree = rpart::rpart(Transported ~ ., data = train_train[, -1])
summary(fit_tree)
## Call:
## rpart::rpart(formula = Transported ~ ., data = train_train[, 
##     -1])
##   n= 6520 
## 
##           CP nsplit rel error    xerror         xstd
## 1 0.23321359      0 1.0000000 1.0001811 0.0001977604
## 2 0.04905507      1 0.7667864 0.7671015 0.0103212807
## 3 0.02807107      2 0.7177313 0.7180892 0.0090936774
## 4 0.02784643      3 0.6896603 0.7039381 0.0095638265
## 5 0.01770479      4 0.6618138 0.6661699 0.0097007659
## 6 0.01381587      5 0.6441090 0.6496207 0.0100644975
## 7 0.01194576      6 0.6302932 0.6389139 0.0102048399
## 8 0.01164524      7 0.6183474 0.6310762 0.0102269497
## 9 0.01000000      8 0.6067022 0.6242632 0.0102752753
## 
## Variable importance
##      expense    CryoSleep    FoodCourt          Spa       VRDeck  RoomService 
##           23           17           13           12           11            9 
##         deck   HomePlanet ShoppingMall          Age  Destination 
##            5            5            2            1            1 
## 
## Node number 1: 6520 observations,    complexity param=0.2332136
##   mean=0.503681, MSE=0.2499865 
##   left son=2 (3795 obs) right son=3 (2725 obs)
##   Primary splits:
##       expense     < 0.5     to the right, improve=0.2332136, (0 missing)
##       CryoSleep   < 0.5     to the left,  improve=0.2095384, (0 missing)
##       RoomService < 0.5     to the right, improve=0.1254121, (0 missing)
##       Spa         < 0.5     to the right, improve=0.1142180, (0 missing)
##       VRDeck      < 0.5     to the right, improve=0.1084074, (0 missing)
##   Surrogate splits:
##       CryoSleep   < 0.5     to the left,  agree=0.932, adj=0.837, (0 split)
##       Spa         < 0.5     to the right, agree=0.784, adj=0.484, (0 split)
##       FoodCourt   < 0.5     to the right, agree=0.770, adj=0.451, (0 split)
##       VRDeck      < 0.5     to the right, agree=0.768, adj=0.444, (0 split)
##       RoomService < 0.5     to the right, agree=0.757, adj=0.419, (0 split)
## 
## Node number 2: 3795 observations,    complexity param=0.02807107
##   mean=0.2990777, MSE=0.2096302 
##   left son=4 (3243 obs) right son=5 (552 obs)
##   Primary splits:
##       FoodCourt    < 1331    to the left,  improve=0.05751186, (0 missing)
##       ShoppingMall < 627.5   to the left,  improve=0.04530804, (0 missing)
##       RoomService  < 365.5   to the right, improve=0.04440387, (0 missing)
##       Spa          < 257.5   to the right, improve=0.03347453, (0 missing)
##       VRDeck       < 721     to the right, improve=0.02290457, (0 missing)
##   Surrogate splits:
##       expense    < 5981    to the left,  agree=0.885, adj=0.210, (0 split)
##       deck       splits as  RRRLLLLL,    agree=0.884, adj=0.203, (0 split)
##       HomePlanet splits as  LRL,         agree=0.878, adj=0.159, (0 split)
##       Spa        < 8955.5  to the left,  agree=0.856, adj=0.009, (0 split)
##       VRDeck     < 11692   to the left,  agree=0.856, adj=0.009, (0 split)
## 
## Node number 3: 2725 observations,    complexity param=0.04905507
##   mean=0.7886239, MSE=0.1666963 
##   left son=6 (1448 obs) right son=7 (1277 obs)
##   Primary splits:
##       deck        splits as  RRRRLRL-,    improve=0.17601740, (0 missing)
##       HomePlanet  splits as  LRR,         improve=0.12704810, (0 missing)
##       Destination splits as  RLLR,        improve=0.02774089, (0 missing)
##       CryoSleep   < 0.5     to the left,  improve=0.02268236, (0 missing)
##       side        splits as  LR,          improve=0.01498707, (0 missing)
##   Surrogate splits:
##       HomePlanet  splits as  LRR,         agree=0.934, adj=0.860, (0 split)
##       Age         < 24.5    to the left,  agree=0.626, adj=0.201, (0 split)
##       Destination splits as  RLLR,        agree=0.596, adj=0.137, (0 split)
##       withgroup   < 0.5     to the left,  agree=0.583, adj=0.110, (0 split)
##       VIP         < 0.5     to the left,  agree=0.538, adj=0.014, (0 split)
## 
## Node number 4: 3243 observations,    complexity param=0.02784643
##   mean=0.2537774, MSE=0.1893744 
##   left son=8 (2577 obs) right son=9 (666 obs)
##   Primary splits:
##       ShoppingMall < 541.5   to the left,  improve=0.07390355, (0 missing)
##       RoomService  < 365.5   to the right, improve=0.03464407, (0 missing)
##       Spa          < 240.5   to the right, improve=0.03327259, (0 missing)
##       VRDeck       < 114     to the right, improve=0.02784287, (0 missing)
##       expense      < 2867.5  to the right, improve=0.01811461, (0 missing)
##   Surrogate splits:
##       expense < 18644   to the left,  agree=0.795, adj=0.003, (0 split)
## 
## Node number 5: 552 observations,    complexity param=0.01770479
##   mean=0.5652174, MSE=0.2457467 
##   left son=10 (123 obs) right son=11 (429 obs)
##   Primary splits:
##       Spa       < 1372.5  to the right, improve=0.21272970, (0 missing)
##       VRDeck    < 1063.5  to the right, improve=0.17089500, (0 missing)
##       expense   < 5395    to the right, improve=0.06611166, (0 missing)
##       FoodCourt < 2513    to the left,  improve=0.02780045, (0 missing)
##       deck      splits as  LLRLLRRL,    improve=0.02705865, (0 missing)
##   Surrogate splits:
##       expense     < 12647   to the right, agree=0.790, adj=0.057, (0 split)
##       Age         < 13.5    to the left,  agree=0.779, adj=0.008, (0 split)
##       RoomService < 3895.5  to the right, agree=0.779, adj=0.008, (0 split)
## 
## Node number 6: 1448 observations
##   mean=0.6277624, MSE=0.2336768 
## 
## Node number 7: 1277 observations
##   mean=0.9710258, MSE=0.02813466 
## 
## Node number 8: 2577 observations,    complexity param=0.01194576
##   mean=0.193636, MSE=0.1561411 
##   left son=16 (2067 obs) right son=17 (510 obs)
##   Primary splits:
##       FoodCourt   < 456.5   to the left,  improve=0.04838893, (0 missing)
##       expense     < 1447.5  to the right, improve=0.04016842, (0 missing)
##       HomePlanet  splits as  RLL,         improve=0.02539308, (0 missing)
##       Spa         < 537.5   to the right, improve=0.01895890, (0 missing)
##       RoomService < 400.5   to the right, improve=0.01706521, (0 missing)
##   Surrogate splits:
##       expense < 12373   to the left,  agree=0.804, adj=0.008, (0 split)
##       Spa     < 13650   to the left,  agree=0.803, adj=0.006, (0 split)
##       VRDeck  < 10123.5 to the left,  agree=0.802, adj=0.002, (0 split)
##       deck    splits as  LLLLLLLR,    agree=0.802, adj=0.002, (0 split)
## 
## Node number 9: 666 observations
##   mean=0.4864865, MSE=0.2498174 
## 
## Node number 10: 123 observations
##   mean=0.1382114, MSE=0.119109 
## 
## Node number 11: 429 observations,    complexity param=0.01381587
##   mean=0.6876457, MSE=0.2147891 
##   left son=22 (143 obs) right son=23 (286 obs)
##   Primary splits:
##       VRDeck      < 611     to the right, improve=0.24438400, (0 missing)
##       Spa         < 225     to the right, improve=0.05300377, (0 missing)
##       FoodCourt   < 3119.5  to the left,  improve=0.05168044, (0 missing)
##       side        splits as  LR,          improve=0.04004566, (0 missing)
##       RoomService < 1719.5  to the right, improve=0.03930897, (0 missing)
##   Surrogate splits:
##       expense   < 6032    to the right, agree=0.702, adj=0.105, (0 split)
##       Age       < 53.5    to the right, agree=0.674, adj=0.021, (0 split)
##       FoodCourt < 12128.5 to the right, agree=0.671, adj=0.014, (0 split)
## 
## Node number 16: 2067 observations
##   mean=0.1504596, MSE=0.1278215 
## 
## Node number 17: 510 observations,    complexity param=0.01164524
##   mean=0.3686275, MSE=0.2327413 
##   left son=34 (204 obs) right son=35 (306 obs)
##   Primary splits:
##       expense    < 1447.5  to the right, improve=0.15990760, (0 missing)
##       VRDeck     < 86.5    to the right, improve=0.10060710, (0 missing)
##       HomePlanet splits as  RLL,         improve=0.07952538, (0 missing)
##       Spa        < 500     to the right, improve=0.07353369, (0 missing)
##       deck       splits as  LLLLRRRL,    improve=0.05075444, (0 missing)
##   Surrogate splits:
##       HomePlanet splits as  RLL,         agree=0.873, adj=0.681, (0 split)
##       deck       splits as  LLLLRRRL,    agree=0.839, adj=0.598, (0 split)
##       VRDeck     < 213.5   to the right, agree=0.818, adj=0.544, (0 split)
##       Spa        < 219.5   to the right, agree=0.792, adj=0.480, (0 split)
##       FoodCourt  < 907     to the right, agree=0.722, adj=0.304, (0 split)
## 
## Node number 22: 143 observations
##   mean=0.3636364, MSE=0.231405 
## 
## Node number 23: 286 observations
##   mean=0.8496503, MSE=0.1277446 
## 
## Node number 34: 204 observations
##   mean=0.1323529, MSE=0.1148356 
## 
## Node number 35: 306 observations
##   mean=0.5261438, MSE=0.2493165
rpart.plot(fit_tree)

preds= predict(fit_tree, newdata = train_test[, -c(1,12)]) %>%
  data.frame()
head(preds)
##           .
## 1 0.1504596
## 2 0.1382114
## 3 0.8496503
## 4 0.1504596
## 5 0.4864865
## 6 0.1504596
Transported_pred_tree = ifelse(preds$. >0.5, 1, 0)
cm = table(Transported_train_test, Transported_pred_tree)
cm
##                       Transported_pred_tree
## Transported_train_test   0   1
##                      0 807 272
##                      1 237 857
(807 +857)/(807 +857 +272 +237)
## [1] 0.7657616
fit_tree = rpart::rpart(Transported ~ ., data = train_c[, -1])

preds= predict(fit_tree, newdata = test_c) %>%
  data.frame()
Transported_pred_tree = ifelse(preds$. > 0.5, TRUE, FALSE)
Transported = as.character(Transported_pred_tree)
PassengerId = test_c$PassengerId
Transported = as.vector(Transported)
sample_submission = cbind(PassengerId, Transported)
sample_submission = as.data.frame(sample_submission)
sample_submission$Transported = str_to_title(sample_submission$Transported)
write.csv(sample_submission, "sub_tree.csv", row.names = FALSE, quote = FALSE)

RANDOM FOREST

fit_forest = randomForest(Transported ~ ., data = train_train[, -1])
fit_forest$importance
##              IncNodePurity
## HomePlanet       48.125593
## CryoSleep       107.989293
## Destination      30.575915
## Age              91.836683
## VIP               2.516955
## RoomService     125.936343
## FoodCourt       113.623386
## ShoppingMall     96.731658
## Spa             131.332461
## VRDeck          122.410460
## withgroup        12.925079
## deck             90.196804
## side             22.439063
## expense         238.242509
varImpPlot(fit_forest)

preds= predict(fit_forest, newdata = train_test[, -c(1,12)]) %>%
  data.frame()
head(preds)
##           .
## 1 0.1050685
## 2 0.1614667
## 3 0.8062586
## 4 0.1920834
## 5 0.6907569
## 6 0.1659998
Transported_pred_forest = ifelse(preds$. >0.5, 1, 0)
cm = table(Transported_train_test, Transported_pred_forest  )
cm
##                       Transported_pred_forest
## Transported_train_test   0   1
##                      0 823 256
##                      1 175 919
(823+919)/(823+256+175+919)
## [1] 0.8016567
fit_forest = randomForest(Transported ~ ., data = train_c[, -1])

preds = predict(fit_forest, newdata = test_c) %>%
  data.frame()
Transported_pred_forest = ifelse(preds$. > 0.5, TRUE, FALSE)
Transported = as.character(Transported_pred_forest)
PassengerId = test_c$PassengerId
Transported = as.vector(Transported)
sample_submission = cbind(PassengerId, Transported)
sample_submission = as.data.frame(sample_submission)
sample_submission$Transported = str_to_title(sample_submission$Transported)
write.csv(sample_submission, "sub_forest.csv", row.names = FALSE, quote = FALSE)