FINAL

library(readr)
test <- read_csv("test.csv")
## Rows: 4277 Columns: 13
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (5): PassengerId, HomePlanet, Cabin, Destination, Name
## dbl (6): Age, RoomService, FoodCourt, ShoppingMall, Spa, VRDeck
## lgl (2): CryoSleep, VIP
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
train <- read_csv("train.csv")
## Rows: 8693 Columns: 14
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (5): PassengerId, HomePlanet, Cabin, Destination, Name
## dbl (6): Age, RoomService, FoodCourt, ShoppingMall, Spa, VRDeck
## lgl (3): CryoSleep, VIP, Transported
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
library(rmarkdown)
paged_table(test)
paged_table(train)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ purrr     1.0.2
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(explore)
test %>% describe_all()
## # A tibble: 13 × 8
##    variable     type     na na_pct unique   min   mean   max
##    <chr>        <chr> <int>  <dbl>  <int> <dbl>  <dbl> <dbl>
##  1 PassengerId  chr       0    0     4277    NA  NA       NA
##  2 HomePlanet   chr      87    2        4    NA  NA       NA
##  3 CryoSleep    lgl      93    2.2      3     0   0.37     1
##  4 Cabin        chr     100    2.3   3266    NA  NA       NA
##  5 Destination  chr      92    2.2      4    NA  NA       NA
##  6 Age          dbl      91    2.1     80     0  28.7     79
##  7 VIP          lgl      93    2.2      3     0   0.02     1
##  8 RoomService  dbl      82    1.9    843     0 219.   11567
##  9 FoodCourt    dbl     106    2.5    903     0 439.   25273
## 10 ShoppingMall dbl      98    2.3    716     0 177.    8292
## 11 Spa          dbl     101    2.4    834     0 303.   19844
## 12 VRDeck       dbl      80    1.9    797     0 311.   22272
## 13 Name         chr      94    2.2   4177    NA  NA       NA
train %>% describe_all()
## # A tibble: 14 × 8
##    variable     type     na na_pct unique   min   mean   max
##    <chr>        <chr> <int>  <dbl>  <int> <dbl>  <dbl> <dbl>
##  1 PassengerId  chr       0    0     8693    NA  NA       NA
##  2 HomePlanet   chr     201    2.3      4    NA  NA       NA
##  3 CryoSleep    lgl     217    2.5      3     0   0.36     1
##  4 Cabin        chr     199    2.3   6561    NA  NA       NA
##  5 Destination  chr     182    2.1      4    NA  NA       NA
##  6 Age          dbl     179    2.1     81     0  28.8     79
##  7 VIP          lgl     203    2.3      3     0   0.02     1
##  8 RoomService  dbl     181    2.1   1274     0 225.   14327
##  9 FoodCourt    dbl     183    2.1   1508     0 458.   29813
## 10 ShoppingMall dbl     208    2.4   1116     0 174.   23492
## 11 Spa          dbl     183    2.1   1328     0 311.   22408
## 12 VRDeck       dbl     188    2.2   1307     0 305.   24133
## 13 Name         chr     200    2.3   8474    NA  NA       NA
## 14 Transported  lgl       0    0        2     0   0.5      1
test[c('group', 'pp')] <- str_split_fixed(test$PassengerId,'-', 2)
train[c('group', 'pp')] <- str_split_fixed(train$PassengerId,'-', 2)
head(train[,c("PassengerId", "group", "pp")])
## # A tibble: 6 × 3
##   PassengerId group   pp   
##   <chr>       <chr>   <chr>
## 1 0001_01     0001_01 ""   
## 2 0002_01     0002_01 ""   
## 3 0003_01     0003_01 ""   
## 4 0003_02     0003_02 ""   
## 5 0004_01     0004_01 ""   
## 6 0005_01     0005_01 ""
test$withgroup <- ifelse(duplicated(test$group) | duplicated(test$group, fromLast = TRUE), 1,0)
train$withgroup <- ifelse(duplicated(train$group) | duplicated(train$group, fromLast = TRUE), 1,0)
head(test[,c("PassengerId", "group", "pp", "withgroup")])
## # A tibble: 6 × 4
##   PassengerId group   pp    withgroup
##   <chr>       <chr>   <chr>     <dbl>
## 1 0013_01     0013_01 ""            0
## 2 0018_01     0018_01 ""            0
## 3 0019_01     0019_01 ""            0
## 4 0021_01     0021_01 ""            0
## 5 0023_01     0023_01 ""            0
## 6 0027_01     0027_01 ""            0
head(train[,c("PassengerId", "group", "pp", "withgroup")])
## # A tibble: 6 × 4
##   PassengerId group   pp    withgroup
##   <chr>       <chr>   <chr>     <dbl>
## 1 0001_01     0001_01 ""            0
## 2 0002_01     0002_01 ""            0
## 3 0003_01     0003_01 ""            0
## 4 0003_02     0003_02 ""            0
## 5 0004_01     0004_01 ""            0
## 6 0005_01     0005_01 ""            0
train[c('deck', 'num', 'side')] <- str_split_fixed(train$Cabin, '/', 3)
test[c('deck', 'num', 'side')] <- str_split_fixed(test$Cabin, '/', 3)
test <-test %>% mutate_if(is.character,as.factor)
train <-train %>% mutate_if(is.character,as.factor)
summary(test)
##   PassengerId    HomePlanet   CryoSleep           Cabin     
##  0013_01:   1   Earth :2263   Mode :logical   G/160/P:   8  
##  0018_01:   1   Europa:1002   FALSE:2640      B/31/P :   7  
##  0019_01:   1   Mars  : 925   TRUE :1544      D/273/S:   7  
##  0021_01:   1   NA's  :  87   NA's :93        E/228/S:   7  
##  0023_01:   1                                 G/748/S:   7  
##  0027_01:   1                                 (Other):4141  
##  (Other):4271                                 NA's   : 100  
##         Destination        Age           VIP           RoomService     
##  55 Cancri e  : 841   Min.   : 0.00   Mode :logical   Min.   :    0.0  
##  PSO J318.5-22: 388   1st Qu.:19.00   FALSE:4110      1st Qu.:    0.0  
##  TRAPPIST-1e  :2956   Median :26.00   TRUE :74        Median :    0.0  
##  NA's         :  92   Mean   :28.66   NA's :93        Mean   :  219.3  
##                       3rd Qu.:37.00                   3rd Qu.:   53.0  
##                       Max.   :79.00                   Max.   :11567.0  
##                       NA's   :91                      NA's   :82       
##    FoodCourt        ShoppingMall         Spa              VRDeck       
##  Min.   :    0.0   Min.   :   0.0   Min.   :    0.0   Min.   :    0.0  
##  1st Qu.:    0.0   1st Qu.:   0.0   1st Qu.:    0.0   1st Qu.:    0.0  
##  Median :    0.0   Median :   0.0   Median :    0.0   Median :    0.0  
##  Mean   :  439.5   Mean   : 177.3   Mean   :  303.1   Mean   :  310.7  
##  3rd Qu.:   78.0   3rd Qu.:  33.0   3rd Qu.:   50.0   3rd Qu.:   36.0  
##  Max.   :25273.0   Max.   :8292.0   Max.   :19844.0   Max.   :22272.0  
##  NA's   :106       NA's   :98       NA's   :101       NA's   :80       
##                Name          group      pp        withgroup      deck     
##  Berta Barnolderg:   2   0013_01:   1   :4277   Min.   :0   F      :1445  
##  Chrey Colte     :   2   0018_01:   1           1st Qu.:0   G      :1222  
##  Cints Erle      :   2   0019_01:   1           Median :0   E      : 447  
##  Cocors Cola     :   2   0021_01:   1           Mean   :0   B      : 362  
##  Con Pashe       :   2   0023_01:   1           3rd Qu.:0   C      : 355  
##  (Other)         :4173   0027_01:   1           Max.   :0   (Other): 346  
##  NA's            :  94   (Other):4271                       NA's   : 100  
##       num       side    
##         : 100    : 100  
##  4      :  21   P:2084  
##  31     :  18   S:2093  
##  197    :  16           
##  294    :  16           
##  228    :  14           
##  (Other):4092
summary(train)
##   PassengerId    HomePlanet   CryoSleep           Cabin     
##  0001_01:   1   Earth :4602   Mode :logical   G/734/S:   8  
##  0002_01:   1   Europa:2131   FALSE:5439      B/11/S :   7  
##  0003_01:   1   Mars  :1759   TRUE :3037      B/201/P:   7  
##  0003_02:   1   NA's  : 201   NA's :217       B/82/S :   7  
##  0004_01:   1                                 C/137/S:   7  
##  0005_01:   1                                 (Other):8458  
##  (Other):8687                                 NA's   : 199  
##         Destination        Age           VIP           RoomService     
##  55 Cancri e  :1800   Min.   : 0.00   Mode :logical   Min.   :    0.0  
##  PSO J318.5-22: 796   1st Qu.:19.00   FALSE:8291      1st Qu.:    0.0  
##  TRAPPIST-1e  :5915   Median :27.00   TRUE :199       Median :    0.0  
##  NA's         : 182   Mean   :28.83   NA's :203       Mean   :  224.7  
##                       3rd Qu.:38.00                   3rd Qu.:   47.0  
##                       Max.   :79.00                   Max.   :14327.0  
##                       NA's   :179                     NA's   :181      
##    FoodCourt        ShoppingMall          Spa              VRDeck       
##  Min.   :    0.0   Min.   :    0.0   Min.   :    0.0   Min.   :    0.0  
##  1st Qu.:    0.0   1st Qu.:    0.0   1st Qu.:    0.0   1st Qu.:    0.0  
##  Median :    0.0   Median :    0.0   Median :    0.0   Median :    0.0  
##  Mean   :  458.1   Mean   :  173.7   Mean   :  311.1   Mean   :  304.9  
##  3rd Qu.:   76.0   3rd Qu.:   27.0   3rd Qu.:   59.0   3rd Qu.:   46.0  
##  Max.   :29813.0   Max.   :23492.0   Max.   :22408.0   Max.   :24133.0  
##  NA's   :183       NA's   :208       NA's   :183       NA's   :188      
##                  Name      Transported         group      pp        withgroup
##  Alraium Disivering:   2   Mode :logical   0001_01:   1   :8693   Min.   :0  
##  Ankalik Nateansive:   2   FALSE:4315      0002_01:   1           1st Qu.:0  
##  Anton Woody       :   2   TRUE :4378      0003_01:   1           Median :0  
##  Apix Wala         :   2                   0003_02:   1           Mean   :0  
##  Asch Stradick     :   2                   0004_01:   1           3rd Qu.:0  
##  (Other)           :8483                   0005_01:   1           Max.   :0  
##  NA's              : 200                   (Other):8687                      
##       deck           num       side    
##  F      :2794          : 199    : 199  
##  G      :2559   82     :  28   P:4206  
##  E      : 876   19     :  22   S:4288  
##  B      : 779   86     :  22           
##  C      : 747   176    :  21           
##  (Other): 739   56     :  21           
##  NA's   : 199   (Other):8380
test [test == ' '] <- NA
train [train == ' '] <- NA
train$num <- droplevels(train$num)
test$num <- droplevels(test$num)
train$side <- droplevels(train$side)
test$side <- droplevels(test$side)
summary(test)
##   PassengerId    HomePlanet   CryoSleep           Cabin     
##  0013_01:   1   Earth :2263   Mode :logical   G/160/P:   8  
##  0018_01:   1   Europa:1002   FALSE:2640      B/31/P :   7  
##  0019_01:   1   Mars  : 925   TRUE :1544      D/273/S:   7  
##  0021_01:   1   NA's  :  87   NA's :93        E/228/S:   7  
##  0023_01:   1                                 G/748/S:   7  
##  0027_01:   1                                 (Other):4141  
##  (Other):4271                                 NA's   : 100  
##         Destination        Age           VIP           RoomService     
##  55 Cancri e  : 841   Min.   : 0.00   Mode :logical   Min.   :    0.0  
##  PSO J318.5-22: 388   1st Qu.:19.00   FALSE:4110      1st Qu.:    0.0  
##  TRAPPIST-1e  :2956   Median :26.00   TRUE :74        Median :    0.0  
##  NA's         :  92   Mean   :28.66   NA's :93        Mean   :  219.3  
##                       3rd Qu.:37.00                   3rd Qu.:   53.0  
##                       Max.   :79.00                   Max.   :11567.0  
##                       NA's   :91                      NA's   :82       
##    FoodCourt        ShoppingMall         Spa              VRDeck       
##  Min.   :    0.0   Min.   :   0.0   Min.   :    0.0   Min.   :    0.0  
##  1st Qu.:    0.0   1st Qu.:   0.0   1st Qu.:    0.0   1st Qu.:    0.0  
##  Median :    0.0   Median :   0.0   Median :    0.0   Median :    0.0  
##  Mean   :  439.5   Mean   : 177.3   Mean   :  303.1   Mean   :  310.7  
##  3rd Qu.:   78.0   3rd Qu.:  33.0   3rd Qu.:   50.0   3rd Qu.:   36.0  
##  Max.   :25273.0   Max.   :8292.0   Max.   :19844.0   Max.   :22272.0  
##  NA's   :106       NA's   :98       NA's   :101       NA's   :80       
##                Name          group      pp        withgroup      deck     
##  Berta Barnolderg:   2   0013_01:   1   :4277   Min.   :0   F      :1445  
##  Chrey Colte     :   2   0018_01:   1           1st Qu.:0   G      :1222  
##  Cints Erle      :   2   0019_01:   1           Median :0   E      : 447  
##  Cocors Cola     :   2   0021_01:   1           Mean   :0   B      : 362  
##  Con Pashe       :   2   0023_01:   1           3rd Qu.:0   C      : 355  
##  (Other)         :4173   0027_01:   1           Max.   :0   (Other): 346  
##  NA's            :  94   (Other):4271                       NA's   : 100  
##       num       side    
##         : 100    : 100  
##  4      :  21   P:2084  
##  31     :  18   S:2093  
##  197    :  16           
##  294    :  16           
##  228    :  14           
##  (Other):4092
summary(train)
##   PassengerId    HomePlanet   CryoSleep           Cabin     
##  0001_01:   1   Earth :4602   Mode :logical   G/734/S:   8  
##  0002_01:   1   Europa:2131   FALSE:5439      B/11/S :   7  
##  0003_01:   1   Mars  :1759   TRUE :3037      B/201/P:   7  
##  0003_02:   1   NA's  : 201   NA's :217       B/82/S :   7  
##  0004_01:   1                                 C/137/S:   7  
##  0005_01:   1                                 (Other):8458  
##  (Other):8687                                 NA's   : 199  
##         Destination        Age           VIP           RoomService     
##  55 Cancri e  :1800   Min.   : 0.00   Mode :logical   Min.   :    0.0  
##  PSO J318.5-22: 796   1st Qu.:19.00   FALSE:8291      1st Qu.:    0.0  
##  TRAPPIST-1e  :5915   Median :27.00   TRUE :199       Median :    0.0  
##  NA's         : 182   Mean   :28.83   NA's :203       Mean   :  224.7  
##                       3rd Qu.:38.00                   3rd Qu.:   47.0  
##                       Max.   :79.00                   Max.   :14327.0  
##                       NA's   :179                     NA's   :181      
##    FoodCourt        ShoppingMall          Spa              VRDeck       
##  Min.   :    0.0   Min.   :    0.0   Min.   :    0.0   Min.   :    0.0  
##  1st Qu.:    0.0   1st Qu.:    0.0   1st Qu.:    0.0   1st Qu.:    0.0  
##  Median :    0.0   Median :    0.0   Median :    0.0   Median :    0.0  
##  Mean   :  458.1   Mean   :  173.7   Mean   :  311.1   Mean   :  304.9  
##  3rd Qu.:   76.0   3rd Qu.:   27.0   3rd Qu.:   59.0   3rd Qu.:   46.0  
##  Max.   :29813.0   Max.   :23492.0   Max.   :22408.0   Max.   :24133.0  
##  NA's   :183       NA's   :208       NA's   :183       NA's   :188      
##                  Name      Transported         group      pp        withgroup
##  Alraium Disivering:   2   Mode :logical   0001_01:   1   :8693   Min.   :0  
##  Ankalik Nateansive:   2   FALSE:4315      0002_01:   1           1st Qu.:0  
##  Anton Woody       :   2   TRUE :4378      0003_01:   1           Median :0  
##  Apix Wala         :   2                   0003_02:   1           Mean   :0  
##  Asch Stradick     :   2                   0004_01:   1           3rd Qu.:0  
##  (Other)           :8483                   0005_01:   1           Max.   :0  
##  NA's              : 200                   (Other):8687                      
##       deck           num       side    
##  F      :2794          : 199    : 199  
##  G      :2559   82     :  28   P:4206  
##  E      : 876   19     :  22   S:4288  
##  B      : 779   86     :  22           
##  C      : 747   176    :  21           
##  (Other): 739   56     :  21           
##  NA's   : 199   (Other):8380
library(zoo)
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
test <- test %>% 
  group_by(group) %>% 
  mutate(HomePlanet = na.locf(HomePlanet, na.rm = FALSE))
train <- train %>% 
  group_by(group) %>% 
  mutate(HomePlanet = na.locf(HomePlanet, na.rm = FALSE))
most_frequent_hp <- train %>%
 filter(!is.na(HomePlanet)) %>%
 group_by(Destination, HomePlanet) %>%
 summarize(count = n()) %>%
 arrange(Destination, desc(count)) %>%
 slice(1) %>%
 ungroup()
## `summarise()` has grouped output by 'Destination'. You can override using the
## `.groups` argument.
most_frequent_hp
## # A tibble: 4 × 3
##   Destination   HomePlanet count
##   <fct>         <fct>      <int>
## 1 55 Cancri e   Europa       886
## 2 PSO J318.5-22 Earth        712
## 3 TRAPPIST-1e   Earth       3101
## 4 <NA>          Earth         99
train$HomePlanet <- as.character(train$HomePlanet)
train$Destination <- as.character(train$Destination)
train <- train %>%
  mutate(HomePlanet= ifelse(is.na(HomePlanet) & Destination == "55 Cancri e","Europa",
                            ifelse(is.na(HomePlanet), "Earth", HomePlanet)))
test$HomePlanet <- as.character(test$HomePlanet)
test$Destination <- as.character(test$Destination)
test <- test %>%
  mutate(HomePlanet= ifelse(is.na(HomePlanet) & Destination == "55 Cancri e","Europa",
                            ifelse(is.na(HomePlanet), "Earth", HomePlanet)))
train %>% describe_all()
## # A tibble: 20 × 8
##    variable     type     na na_pct unique   min   mean   max
##    <chr>        <chr> <int>  <dbl>  <int> <dbl>  <dbl> <dbl>
##  1 PassengerId  fct       0    0     8693    NA  NA       NA
##  2 HomePlanet   chr       4    0        4    NA  NA       NA
##  3 CryoSleep    lgl     217    2.5      3     0   0.36     1
##  4 Cabin        fct     199    2.3   6561    NA  NA       NA
##  5 Destination  chr     182    2.1      4    NA  NA       NA
##  6 Age          dbl     179    2.1     81     0  28.8     79
##  7 VIP          lgl     203    2.3      3     0   0.02     1
##  8 RoomService  dbl     181    2.1   1274     0 225.   14327
##  9 FoodCourt    dbl     183    2.1   1508     0 458.   29813
## 10 ShoppingMall dbl     208    2.4   1116     0 174.   23492
## 11 Spa          dbl     183    2.1   1328     0 311.   22408
## 12 VRDeck       dbl     188    2.2   1307     0 305.   24133
## 13 Name         fct     200    2.3   8474    NA  NA       NA
## 14 Transported  lgl       0    0        2     0   0.5      1
## 15 group        fct       0    0     8693    NA  NA       NA
## 16 pp           fct       0    0        1    NA  NA       NA
## 17 withgroup    dbl       0    0        1     0   0        0
## 18 deck         fct     199    2.3      9    NA  NA       NA
## 19 num          fct       0    0     1818    NA  NA       NA
## 20 side         fct       0    0        3    NA  NA       NA
test %>% describe_all()
## # A tibble: 19 × 8
##    variable     type     na na_pct unique   min   mean   max
##    <chr>        <chr> <int>  <dbl>  <int> <dbl>  <dbl> <dbl>
##  1 PassengerId  fct       0    0     4277    NA  NA       NA
##  2 HomePlanet   chr       2    0        4    NA  NA       NA
##  3 CryoSleep    lgl      93    2.2      3     0   0.37     1
##  4 Cabin        fct     100    2.3   3266    NA  NA       NA
##  5 Destination  chr      92    2.2      4    NA  NA       NA
##  6 Age          dbl      91    2.1     80     0  28.7     79
##  7 VIP          lgl      93    2.2      3     0   0.02     1
##  8 RoomService  dbl      82    1.9    843     0 219.   11567
##  9 FoodCourt    dbl     106    2.5    903     0 439.   25273
## 10 ShoppingMall dbl      98    2.3    716     0 177.    8292
## 11 Spa          dbl     101    2.4    834     0 303.   19844
## 12 VRDeck       dbl      80    1.9    797     0 311.   22272
## 13 Name         fct      94    2.2   4177    NA  NA       NA
## 14 group        fct       0    0     4277    NA  NA       NA
## 15 pp           fct       0    0        1    NA  NA       NA
## 16 withgroup    dbl       0    0        1     0   0        0
## 17 deck         fct     100    2.3      9    NA  NA       NA
## 18 num          fct       0    0     1506    NA  NA       NA
## 19 side         fct       0    0        3    NA  NA       NA
train <- transform(train, HomePlanet = replace(HomePlanet, is.na(HomePlanet),
"Earth"))
train <- transform(train, HomePlanet = replace(HomePlanet, is.na(HomePlanet), "Earth"))
most_frequent_destinations <- train %>%
 filter(!is.na(Destination)) %>%
 group_by(HomePlanet, Destination ) %>%
 summarize(count = n()) %>%
 arrange(HomePlanet, desc(count)) %>%
 slice(1) %>%
 ungroup()
## `summarise()` has grouped output by 'HomePlanet'. You can override using the
## `.groups` argument.
most_frequent_destinations
## # A tibble: 3 × 3
##   HomePlanet Destination count
##   <chr>      <chr>       <int>
## 1 Earth      TRAPPIST-1e  3251
## 2 Europa     TRAPPIST-1e  1189
## 3 Mars       TRAPPIST-1e  1475
test <- transform(test, Destination = replace(Destination, is.na(Destination), "TRAPPIST-1e"))
train <- transform(train, Destination = replace(Destination, is.na(Destination), "TRAPPIST-1e"))
test$HomePlanet <- as.factor(test$HomePlanet)
test$Destination <- as.factor(test$Destination)
train$HomePlanet <- as.factor(train$HomePlanet)
train$Destination <- as.factor(train$Destination)
summary(train)
##   PassengerId    HomePlanet   CryoSleep           Cabin     
##  0001_01:   1   Earth :4772   Mode :logical   G/734/S:   8  
##  0002_01:   1   Europa:2162   FALSE:5439      B/11/S :   7  
##  0003_01:   1   Mars  :1759   TRUE :3037      B/201/P:   7  
##  0003_02:   1                 NA's :217       B/82/S :   7  
##  0004_01:   1                                 C/137/S:   7  
##  0005_01:   1                                 (Other):8458  
##  (Other):8687                                 NA's   : 199  
##         Destination        Age           VIP           RoomService     
##  55 Cancri e  :1800   Min.   : 0.00   Mode :logical   Min.   :    0.0  
##  PSO J318.5-22: 796   1st Qu.:19.00   FALSE:8291      1st Qu.:    0.0  
##  TRAPPIST-1e  :6097   Median :27.00   TRUE :199       Median :    0.0  
##                       Mean   :28.83   NA's :203       Mean   :  224.7  
##                       3rd Qu.:38.00                   3rd Qu.:   47.0  
##                       Max.   :79.00                   Max.   :14327.0  
##                       NA's   :179                     NA's   :181      
##    FoodCourt        ShoppingMall          Spa              VRDeck       
##  Min.   :    0.0   Min.   :    0.0   Min.   :    0.0   Min.   :    0.0  
##  1st Qu.:    0.0   1st Qu.:    0.0   1st Qu.:    0.0   1st Qu.:    0.0  
##  Median :    0.0   Median :    0.0   Median :    0.0   Median :    0.0  
##  Mean   :  458.1   Mean   :  173.7   Mean   :  311.1   Mean   :  304.9  
##  3rd Qu.:   76.0   3rd Qu.:   27.0   3rd Qu.:   59.0   3rd Qu.:   46.0  
##  Max.   :29813.0   Max.   :23492.0   Max.   :22408.0   Max.   :24133.0  
##  NA's   :183       NA's   :208       NA's   :183       NA's   :188      
##                  Name      Transported         group      pp        withgroup
##  Alraium Disivering:   2   Mode :logical   0001_01:   1   :8693   Min.   :0  
##  Ankalik Nateansive:   2   FALSE:4315      0002_01:   1           1st Qu.:0  
##  Anton Woody       :   2   TRUE :4378      0003_01:   1           Median :0  
##  Apix Wala         :   2                   0003_02:   1           Mean   :0  
##  Asch Stradick     :   2                   0004_01:   1           3rd Qu.:0  
##  (Other)           :8483                   0005_01:   1           Max.   :0  
##  NA's              : 200                   (Other):8687                      
##       deck           num       side    
##  F      :2794          : 199    : 199  
##  G      :2559   82     :  28   P:4206  
##  E      : 876   19     :  22   S:4288  
##  B      : 779   86     :  22           
##  C      : 747   176    :  21           
##  (Other): 739   56     :  21           
##  NA's   : 199   (Other):8380
train <- train %>%
mutate(RoomService = coalesce(RoomService, 0),
 FoodCourt = coalesce(FoodCourt, 0),
ShoppingMall = coalesce(ShoppingMall, 0),
Spa = coalesce(Spa, 0),
VRDeck = coalesce(VRDeck, 0))
test <- test %>%
mutate(RoomService = coalesce(RoomService, 0),
 FoodCourt = coalesce(FoodCourt, 0),
ShoppingMall = coalesce(ShoppingMall, 0),
Spa = coalesce(Spa, 0),
VRDeck = coalesce(VRDeck, 0))
test <- test %>%
group_by(HomePlanet, Destination) %>%
mutate_at(vars(Age), ~replace_na(., mean(., na.rm = TRUE)))
train <- train %>%
group_by(HomePlanet, Destination) %>%
mutate_at(vars(Age), ~replace_na(., mean(., na.rm = TRUE)))
train$expense <- train$RoomService + train$FoodCourt + train$ShoppingMall + train$Spa + train$VRDeck
test$expense <- test$RoomService + test$FoodCourt + test$ShoppingMall + test$Spa + test$VRDeck
train <- transform(train, CryoSleep = replace(CryoSleep, expense>0 & Age>12, "FALSE"))
test <- transform(test, CryoSleep = replace(CryoSleep, expense>0 & Age>12, "FALSE"))
describe_all(train)
## # A tibble: 21 × 8
##    variable     type     na na_pct unique   min   mean   max
##    <chr>        <chr> <int>  <dbl>  <int> <dbl>  <dbl> <dbl>
##  1 PassengerId  fct       0    0     8693    NA  NA       NA
##  2 HomePlanet   fct       0    0        3    NA  NA       NA
##  3 CryoSleep    chr      98    1.1      3    NA  NA       NA
##  4 Cabin        fct     199    2.3   6561    NA  NA       NA
##  5 Destination  fct       0    0        3    NA  NA       NA
##  6 Age          dbl       0    0       88     0  28.8     79
##  7 VIP          lgl     203    2.3      3     0   0.02     1
##  8 RoomService  dbl       0    0     1273     0 220.   14327
##  9 FoodCourt    dbl       0    0     1507     0 448.   29813
## 10 ShoppingMall dbl       0    0     1115     0 170.   23492
## # ℹ 11 more rows
train <- transform(train, CryoSleep = replace(CryoSleep, is.na(CryoSleep) & expense-0 & Age>12,
"TRUE"))
test <- transform(train, CryoSleep = replace(CryoSleep, is.na(CryoSleep) & expense-0 & Age>12,
"TRUE"))
describe_all(train)
## # A tibble: 21 × 8
##    variable     type     na na_pct unique   min   mean   max
##    <chr>        <chr> <int>  <dbl>  <int> <dbl>  <dbl> <dbl>
##  1 PassengerId  fct       0    0     8693    NA  NA       NA
##  2 HomePlanet   fct       0    0        3    NA  NA       NA
##  3 CryoSleep    chr      98    1.1      3    NA  NA       NA
##  4 Cabin        fct     199    2.3   6561    NA  NA       NA
##  5 Destination  fct       0    0        3    NA  NA       NA
##  6 Age          dbl       0    0       88     0  28.8     79
##  7 VIP          lgl     203    2.3      3     0   0.02     1
##  8 RoomService  dbl       0    0     1273     0 220.   14327
##  9 FoodCourt    dbl       0    0     1507     0 448.   29813
## 10 ShoppingMall dbl       0    0     1115     0 170.   23492
## # ℹ 11 more rows
train <- transform(train, CryoSleep = replace(CryoSleep, is.na(CryoSleep),  "FALSE"))
test <- transform(train, CryoSleep = replace(CryoSleep, is.na(CryoSleep),  "FALSE"))
describe_all(train)
## # A tibble: 21 × 8
##    variable     type     na na_pct unique   min   mean   max
##    <chr>        <chr> <int>  <dbl>  <int> <dbl>  <dbl> <dbl>
##  1 PassengerId  fct       0    0     8693    NA  NA       NA
##  2 HomePlanet   fct       0    0        3    NA  NA       NA
##  3 CryoSleep    chr       0    0        2    NA  NA       NA
##  4 Cabin        fct     199    2.3   6561    NA  NA       NA
##  5 Destination  fct       0    0        3    NA  NA       NA
##  6 Age          dbl       0    0       88     0  28.8     79
##  7 VIP          lgl     203    2.3      3     0   0.02     1
##  8 RoomService  dbl       0    0     1273     0 220.   14327
##  9 FoodCourt    dbl       0    0     1507     0 448.   29813
## 10 ShoppingMall dbl       0    0     1115     0 170.   23492
## # ℹ 11 more rows
train$cryosleep <- as.factor(train$CryoSleep)
test$cryosleep <- as.factor(test$CryoSleep)
summary(train)
##   PassengerId    HomePlanet    CryoSleep             Cabin     
##  0001_01:   1   Earth :4772   Length:8693        G/734/S:   8  
##  0002_01:   1   Europa:2162   Class :character   B/11/S :   7  
##  0003_01:   1   Mars  :1759   Mode  :character   B/201/P:   7  
##  0003_02:   1                                    B/82/S :   7  
##  0004_01:   1                                    C/137/S:   7  
##  0005_01:   1                                    (Other):8458  
##  (Other):8687                                    NA's   : 199  
##         Destination        Age           VIP           RoomService   
##  55 Cancri e  :1800   Min.   : 0.00   Mode :logical   Min.   :    0  
##  PSO J318.5-22: 796   1st Qu.:20.00   FALSE:8291      1st Qu.:    0  
##  TRAPPIST-1e  :6097   Median :27.00   TRUE :199       Median :    0  
##                       Mean   :28.83   NA's :203       Mean   :  220  
##                       3rd Qu.:37.00                   3rd Qu.:   41  
##                       Max.   :79.00                   Max.   :14327  
##                                                                      
##    FoodCourt        ShoppingMall          Spa              VRDeck       
##  Min.   :    0.0   Min.   :    0.0   Min.   :    0.0   Min.   :    0.0  
##  1st Qu.:    0.0   1st Qu.:    0.0   1st Qu.:    0.0   1st Qu.:    0.0  
##  Median :    0.0   Median :    0.0   Median :    0.0   Median :    0.0  
##  Mean   :  448.4   Mean   :  169.6   Mean   :  304.6   Mean   :  298.3  
##  3rd Qu.:   61.0   3rd Qu.:   22.0   3rd Qu.:   53.0   3rd Qu.:   40.0  
##  Max.   :29813.0   Max.   :23492.0   Max.   :22408.0   Max.   :24133.0  
##                                                                         
##                  Name      Transported         group      pp        withgroup
##  Alraium Disivering:   2   Mode :logical   0001_01:   1   :8693   Min.   :0  
##  Ankalik Nateansive:   2   FALSE:4315      0002_01:   1           1st Qu.:0  
##  Anton Woody       :   2   TRUE :4378      0003_01:   1           Median :0  
##  Apix Wala         :   2                   0003_02:   1           Mean   :0  
##  Asch Stradick     :   2                   0004_01:   1           3rd Qu.:0  
##  (Other)           :8483                   0005_01:   1           Max.   :0  
##  NA's              : 200                   (Other):8687                      
##       deck           num       side        expense      cryosleep   
##  F      :2794          : 199    : 199   Min.   :    0   FALSE:5656  
##  G      :2559   82     :  28   P:4206   1st Qu.:    0   TRUE :3037  
##  E      : 876   19     :  22   S:4288   Median :  716               
##  B      : 779   86     :  22            Mean   : 1441               
##  C      : 747   176    :  21            3rd Qu.: 1441               
##  (Other): 739   56     :  21            Max.   :35987               
##  NA's   : 199   (Other):8380
train <- transform(train, VIP = replace(VIP, is.na(VIP), FALSE))
test <- transform(train, VIP = replace(VIP, is.na(VIP), FALSE))
train %>% describe_all()
## # A tibble: 22 × 8
##    variable     type     na na_pct unique   min   mean   max
##    <chr>        <chr> <int>  <dbl>  <int> <dbl>  <dbl> <dbl>
##  1 PassengerId  fct       0    0     8693    NA  NA       NA
##  2 HomePlanet   fct       0    0        3    NA  NA       NA
##  3 CryoSleep    chr       0    0        2    NA  NA       NA
##  4 Cabin        fct     199    2.3   6561    NA  NA       NA
##  5 Destination  fct       0    0        3    NA  NA       NA
##  6 Age          dbl       0    0       88     0  28.8     79
##  7 VIP          lgl       0    0        2     0   0.02     1
##  8 RoomService  dbl       0    0     1273     0 220.   14327
##  9 FoodCourt    dbl       0    0     1507     0 448.   29813
## 10 ShoppingMall dbl       0    0     1115     0 170.   23492
## # ℹ 12 more rows
train <- train %>%
  group_by(group) %>%
mutate(deck = na.locf(deck, na.rm = FALSE))
test <- test %>%
  group_by(group) %>%
mutate(deck = na.locf(deck, na.rm = FALSE))
summary(train)
##   PassengerId    HomePlanet    CryoSleep             Cabin     
##  0001_01:   1   Earth :4772   Length:8693        G/734/S:   8  
##  0002_01:   1   Europa:2162   Class :character   B/11/S :   7  
##  0003_01:   1   Mars  :1759   Mode  :character   B/201/P:   7  
##  0003_02:   1                                    B/82/S :   7  
##  0004_01:   1                                    C/137/S:   7  
##  0005_01:   1                                    (Other):8458  
##  (Other):8687                                    NA's   : 199  
##         Destination        Age           VIP           RoomService   
##  55 Cancri e  :1800   Min.   : 0.00   Mode :logical   Min.   :    0  
##  PSO J318.5-22: 796   1st Qu.:20.00   FALSE:8494      1st Qu.:    0  
##  TRAPPIST-1e  :6097   Median :27.00   TRUE :199       Median :    0  
##                       Mean   :28.83                   Mean   :  220  
##                       3rd Qu.:37.00                   3rd Qu.:   41  
##                       Max.   :79.00                   Max.   :14327  
##                                                                      
##    FoodCourt        ShoppingMall          Spa              VRDeck       
##  Min.   :    0.0   Min.   :    0.0   Min.   :    0.0   Min.   :    0.0  
##  1st Qu.:    0.0   1st Qu.:    0.0   1st Qu.:    0.0   1st Qu.:    0.0  
##  Median :    0.0   Median :    0.0   Median :    0.0   Median :    0.0  
##  Mean   :  448.4   Mean   :  169.6   Mean   :  304.6   Mean   :  298.3  
##  3rd Qu.:   61.0   3rd Qu.:   22.0   3rd Qu.:   53.0   3rd Qu.:   40.0  
##  Max.   :29813.0   Max.   :23492.0   Max.   :22408.0   Max.   :24133.0  
##                                                                         
##                  Name      Transported         group      pp        withgroup
##  Alraium Disivering:   2   Mode :logical   0001_01:   1   :8693   Min.   :0  
##  Ankalik Nateansive:   2   FALSE:4315      0002_01:   1           1st Qu.:0  
##  Anton Woody       :   2   TRUE :4378      0003_01:   1           Median :0  
##  Apix Wala         :   2                   0003_02:   1           Mean   :0  
##  Asch Stradick     :   2                   0004_01:   1           3rd Qu.:0  
##  (Other)           :8483                   0005_01:   1           Max.   :0  
##  NA's              : 200                   (Other):8687                      
##       deck           num       side        expense      cryosleep   
##  F      :2794          : 199    : 199   Min.   :    0   FALSE:5656  
##  G      :2559   82     :  28   P:4206   1st Qu.:    0   TRUE :3037  
##  E      : 876   19     :  22   S:4288   Median :  716               
##  B      : 779   86     :  22            Mean   : 1441               
##  C      : 747   176    :  21            3rd Qu.: 1441               
##  (Other): 739   56     :  21            Max.   :35987               
##  NA's   : 199   (Other):8380
most_frequent_deck <- train %>%
filter(!is.na(deck)) %>%
group_by (HomePlanet, deck) %>%
summarize(count = n()) %>%
arrange(HomePlanet, desc(count)) %>%
slice (1) %>%
ungroup()
## `summarise()` has grouped output by 'HomePlanet'. You can override using the
## `.groups` argument.
most_frequent_deck
## # A tibble: 3 × 3
##   HomePlanet deck  count
##   <fct>      <fct> <int>
## 1 Earth      G      2553
## 2 Europa     B       771
## 3 Mars       F      1110
train$HomePlanet <- as.character(train$HomePlanet)
train$deck <- as.character(train$deck)
train <- train %>% 
   mutate(deck = ifelse(is.na(deck) & HomePlanet ==  "Earth", "G",
                       ifelse(is.na(deck) & HomePlanet == "Europa", "B",
                              ifelse(is.na(deck) & HomePlanet == "Mars", "F", deck))))
test$HomePlanet <- as.character(test$HomePlanet)
test$deck <- as.character(test$deck)
test <- test %>% 
   mutate(deck = ifelse(is.na(deck) & HomePlanet ==  "Earth", "G",
                       ifelse(is.na(deck) & HomePlanet == "Europa", "B",
                              ifelse(is.na(deck) & HomePlanet == "Mars", "F", deck))))
most_frequent_side <- train %>%
filter(!is.na(deck)) %>%
group_by (HomePlanet, deck) %>%
summarize(count = n()) %>%
arrange(HomePlanet, desc(count)) %>%
slice (1) %>%
ungroup()
## `summarise()` has grouped output by 'HomePlanet'. You can override using the
## `.groups` argument.
most_frequent_side
## # A tibble: 3 × 3
##   HomePlanet deck  count
##   <chr>      <chr> <int>
## 1 Earth      G      2652
## 2 Europa     B       834
## 3 Mars       F      1147
train$side <- as.character(train$side)
train <- train %>% 
   mutate(deck = ifelse(is.na(side) & HomePlanet ==  "Earth", "G",
                       ifelse(is.na(side) & HomePlanet == "Europa", "B",
                              ifelse(is.na(side) & HomePlanet == "Mars", "F", deck))))
test$side <- as.character(test$side)
test <- test %>% 
   mutate(deck = ifelse(is.na(side) & HomePlanet ==  "Earth", "G",
                       ifelse(is.na(side) & HomePlanet == "Europa", "B",
                              ifelse(is.na(side) & HomePlanet == "Mars", "F", deck))))
train %>% describe_all()
## # A tibble: 22 × 8
##    variable     type     na na_pct unique   min   mean   max
##    <chr>        <chr> <int>  <dbl>  <int> <dbl>  <dbl> <dbl>
##  1 PassengerId  fct       0    0     8693    NA  NA       NA
##  2 HomePlanet   chr       0    0        3    NA  NA       NA
##  3 CryoSleep    chr       0    0        2    NA  NA       NA
##  4 Cabin        fct     199    2.3   6561    NA  NA       NA
##  5 Destination  fct       0    0        3    NA  NA       NA
##  6 Age          dbl       0    0       88     0  28.8     79
##  7 VIP          lgl       0    0        2     0   0.02     1
##  8 RoomService  dbl       0    0     1273     0 220.   14327
##  9 FoodCourt    dbl       0    0     1507     0 448.   29813
## 10 ShoppingMall dbl       0    0     1115     0 170.   23492
## # ℹ 12 more rows
train <- train %>% select(-c("Cabin", "Name", "group", "pp", "num"))
## Adding missing grouping variables: `group`
test <- test %>% select(-c("Cabin", "Name", "group", "pp", "num"))
## Adding missing grouping variables: `group`
describe_all(train)
## # A tibble: 18 × 8
##    variable     type     na na_pct unique   min    mean   max
##    <chr>        <chr> <int>  <dbl>  <int> <dbl>   <dbl> <dbl>
##  1 group        fct       0      0   8693    NA   NA       NA
##  2 PassengerId  fct       0      0   8693    NA   NA       NA
##  3 HomePlanet   chr       0      0      3    NA   NA       NA
##  4 CryoSleep    chr       0      0      2    NA   NA       NA
##  5 Destination  fct       0      0      3    NA   NA       NA
##  6 Age          dbl       0      0     88     0   28.8     79
##  7 VIP          lgl       0      0      2     0    0.02     1
##  8 RoomService  dbl       0      0   1273     0  220.   14327
##  9 FoodCourt    dbl       0      0   1507     0  448.   29813
## 10 ShoppingMall dbl       0      0   1115     0  170.   23492
## 11 Spa          dbl       0      0   1327     0  305.   22408
## 12 VRDeck       dbl       0      0   1306     0  298.   24133
## 13 Transported  lgl       0      0      2     0    0.5      1
## 14 withgroup    dbl       0      0      1     0    0        0
## 15 deck         chr       0      0      8    NA   NA       NA
## 16 side         chr       0      0      3    NA   NA       NA
## 17 expense      dbl       0      0   2336     0 1441.   35987
## 18 cryosleep    fct       0      0      2    NA   NA       NA
train <- train %>% mutate_if(is.character,as.factor)
## `mutate_if()` ignored the following grouping variables:
## • Column `group`
test <- test %>% mutate_if(is.character,as.factor)
## `mutate_if()` ignored the following grouping variables:
## • Column `group`
train$PassengerId <- as.character(train$PassengerId)
test$PassengerId <- as.character(test$PassengerId)
summary(train)
##      group      PassengerId         HomePlanet   CryoSleep   
##  0001_01:   1   Length:8693        Europa:2162   FALSE:5656  
##  0002_01:   1   Class :character   Earth :4772   TRUE :3037  
##  0003_01:   1   Mode  :character   Mars  :1759               
##  0003_02:   1                                                
##  0004_01:   1                                                
##  0005_01:   1                                                
##  (Other):8687                                                
##         Destination        Age           VIP           RoomService   
##  55 Cancri e  :1800   Min.   : 0.00   Mode :logical   Min.   :    0  
##  PSO J318.5-22: 796   1st Qu.:20.00   FALSE:8494      1st Qu.:    0  
##  TRAPPIST-1e  :6097   Median :27.00   TRUE :199       Median :    0  
##                       Mean   :28.83                   Mean   :  220  
##                       3rd Qu.:37.00                   3rd Qu.:   41  
##                       Max.   :79.00                   Max.   :14327  
##                                                                      
##    FoodCourt        ShoppingMall          Spa              VRDeck       
##  Min.   :    0.0   Min.   :    0.0   Min.   :    0.0   Min.   :    0.0  
##  1st Qu.:    0.0   1st Qu.:    0.0   1st Qu.:    0.0   1st Qu.:    0.0  
##  Median :    0.0   Median :    0.0   Median :    0.0   Median :    0.0  
##  Mean   :  448.4   Mean   :  169.6   Mean   :  304.6   Mean   :  298.3  
##  3rd Qu.:   61.0   3rd Qu.:   22.0   3rd Qu.:   53.0   3rd Qu.:   40.0  
##  Max.   :29813.0   Max.   :23492.0   Max.   :22408.0   Max.   :24133.0  
##                                                                         
##  Transported       withgroup      deck      side        expense     
##  Mode :logical   Min.   :0   F      :2831   P:4206   Min.   :    0  
##  FALSE:4315      1st Qu.:0   G      :2658   S:4288   1st Qu.:    0  
##  TRUE :4378      Median :0   E      : 876    : 199   Median :  716  
##                  Mean   :0   B      : 842            Mean   : 1441  
##                  3rd Qu.:0   C      : 747            3rd Qu.: 1441  
##                  Max.   :0   D      : 478            Max.   :35987  
##                              (Other): 261                           
##  cryosleep   
##  FALSE:5656  
##  TRUE :3037  
##              
##              
##              
##              
## 
train$group <- NULL 
test$group <- NULL 
write.csv(train, "train_c.csv", row.names=FALSE)
write.csv(test, "test_c.csv", row.names=FALSE)
train <- read.csv("train_c.csv")
test <- read.csv("test_c.csv")
paged_table(train)
test$HomePlanet <- as.factor(test$HomePlanet)
test$Destination <- as.factor(test$Destination)
test$deck <- as.factor(test$deck)
test$side <- as.factor(test$side)
train$HomePlanet <- as.factor(train$HomePlanet)
train$Destination <- as.factor(train$Destination)
train$deck <- as.factor(train$deck)
train$side <- as.factor(train$side)
D <- train[,2:16] %>% mutate(across(everything(), ~as.integer(.)))
kor <- cor(D)
## Warning in cor(D): the standard deviation is zero
library(corrplot)
## corrplot 0.92 loaded
corrplot.mixed(kor)

library(DataExplorer)
create_report(train)
model <- lm(Transported ~ ., data = train[, 2:16]) 
summary(model)
## 
## Call:
## lm(formula = Transported ~ ., data = train[, 2:16])
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.40936 -0.31038 -0.03383  0.29023  1.78012 
## 
## Coefficients: (2 not defined because of singularities)
##                            Estimate Std. Error t value Pr(>|t|)    
## (Intercept)               3.562e-01  4.909e-02   7.256 4.33e-13 ***
## HomePlanetEuropa          2.057e-01  2.771e-02   7.422 1.26e-13 ***
## HomePlanetMars            9.750e-02  1.456e-02   6.698 2.25e-11 ***
## CryoSleepTRUE             3.817e-01  1.152e-02  33.134  < 2e-16 ***
## DestinationPSO J318.5-22 -4.337e-02  1.807e-02  -2.399   0.0164 *  
## DestinationTRAPPIST-1e   -4.628e-02  1.133e-02  -4.085 4.45e-05 ***
## Age                      -2.306e-03  3.141e-04  -7.341 2.31e-13 ***
## VIPTRUE                  -3.816e-02  2.976e-02  -1.282   0.1998    
## RoomService              -1.182e-04  7.056e-06 -16.757  < 2e-16 ***
## FoodCourt                 4.283e-05  3.057e-06  14.009  < 2e-16 ***
## ShoppingMall              7.870e-05  7.460e-06  10.550  < 2e-16 ***
## Spa                      -8.640e-05  4.116e-06 -20.993  < 2e-16 ***
## VRDeck                   -8.271e-05  4.117e-06 -20.090  < 2e-16 ***
## withgroup                        NA         NA      NA       NA    
## deckB                     1.215e-01  2.898e-02   4.191 2.80e-05 ***
## deckC                     1.518e-01  2.928e-02   5.183 2.23e-07 ***
## deckD                     4.223e-02  3.477e-02   1.214   0.2246    
## deckE                    -2.038e-03  3.582e-02  -0.057   0.9546    
## deckF                     9.154e-02  3.656e-02   2.504   0.0123 *  
## deckG                     4.482e-02  3.810e-02   1.176   0.2395    
## deckT                     5.289e-02  1.816e-01   0.291   0.7709    
## sideP                    -2.210e-02  2.955e-02  -0.748   0.4546    
## sideS                     6.413e-02  2.952e-02   2.172   0.0299 *  
## expense                          NA         NA      NA       NA    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.4015 on 8671 degrees of freedom
## Multiple R-squared:  0.3567, Adjusted R-squared:  0.3552 
## F-statistic:   229 on 21 and 8671 DF,  p-value: < 2.2e-16
library(caTools)
set.seed(123)
split = sample.split(train$Transported, SplitRatio = 0.75)
train_train = subset(train, split == TRUE)
train_test = subset(train, split == FALSE)
regresyon <- lm(Transported ~ ., data = train_train[, -c(1)])
summary(regresyon)
## 
## Call:
## lm(formula = Transported ~ ., data = train_train[, -c(1)])
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.48713 -0.31346 -0.03176  0.29206  1.75532 
## 
## Coefficients: (3 not defined because of singularities)
##                            Estimate Std. Error t value Pr(>|t|)    
## (Intercept)               3.751e-01  5.686e-02   6.596 4.56e-11 ***
## HomePlanetEuropa          1.886e-01  3.217e-02   5.861 4.82e-09 ***
## HomePlanetMars            8.668e-02  1.675e-02   5.174 2.36e-07 ***
## CryoSleepTRUE             3.776e-01  1.328e-02  28.445  < 2e-16 ***
## DestinationPSO J318.5-22 -5.948e-02  2.081e-02  -2.858 0.004279 ** 
## DestinationTRAPPIST-1e   -4.993e-02  1.308e-02  -3.816 0.000137 ***
## Age                      -2.442e-03  3.650e-04  -6.690 2.42e-11 ***
## VIPTRUE                  -1.618e-02  3.385e-02  -0.478 0.632715    
## RoomService              -1.178e-04  7.789e-06 -15.118  < 2e-16 ***
## FoodCourt                 3.890e-05  3.468e-06  11.216  < 2e-16 ***
## ShoppingMall              8.188e-05  8.413e-06   9.733  < 2e-16 ***
## Spa                      -8.526e-05  4.711e-06 -18.099  < 2e-16 ***
## VRDeck                   -8.201e-05  4.776e-06 -17.170  < 2e-16 ***
## withgroup                        NA         NA      NA       NA    
## deckB                     1.155e-01  3.313e-02   3.485 0.000495 ***
## deckC                     1.449e-01  3.345e-02   4.333 1.49e-05 ***
## deckD                     2.188e-02  3.961e-02   0.552 0.580706    
## deckE                    -3.674e-02  4.147e-02  -0.886 0.375642    
## deckF                     7.079e-02  4.208e-02   1.682 0.092551 .  
## deckG                     2.552e-02  4.388e-02   0.582 0.560908    
## deckT                     4.365e-02  1.822e-01   0.240 0.810637    
## sideP                    -7.773e-03  3.454e-02  -0.225 0.821954    
## sideS                     8.234e-02  3.451e-02   2.386 0.017047 *  
## expense                          NA         NA      NA       NA    
## cryosleepTRUE                    NA         NA      NA       NA    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.4014 on 6498 degrees of freedom
## Multiple R-squared:  0.3577, Adjusted R-squared:  0.3556 
## F-statistic: 172.3 on 21 and 6498 DF,  p-value: < 2.2e-16
reg_tahmin = predict(regresyon, newdata = train_test[, -c(1,12)])
reg_transported_tahmin <- ifelse(reg_tahmin > 0.5, 1, 0)
transported_gercek <- ifelse(train_test[12] == TRUE, 1,0)
cm = table(transported_gercek, reg_transported_tahmin)
cm
##                   reg_transported_tahmin
## transported_gercek   0   1
##                  0 905 174
##                  1 325 769
(823 + 917)/(823 + 256 + 177 + 917)
## [1] 0.8007363
reg_tahmin = predict(regresyon, newdata = train_train[, -c(1,12)])
reg_transported_tahmin <- ifelse(reg_tahmin > 0.5, 1, 0)
transported_gercek <- ifelse(train_train[12] == TRUE, 1,0)
cm = table(transported_gercek, reg_transported_tahmin)
cm
##                   reg_transported_tahmin
## transported_gercek    0    1
##                  0 2680  556
##                  1  947 2337
( 2680 + 2337)/(2680 + 556 +  947  + 2337)
## [1] 0.7694785
reg_tahmin_bd = predict(model, newdata = test[, -c(1)])
reg_transported_test_tahmin <- ifelse(reg_tahmin_bd > 0.5, TRUE, FALSE)
Transported <- as.character(reg_transported_test_tahmin)
PassengerId <- test$PassengerId
Transported<-as.vector(Transported)
submission_regrasyon <- cbind(PassengerId, Transported)
submission_regrasyon <- as.data.frame(submission_regrasyon)
library(stringr)
submission_regrasyon$Transported <- str_to_title(submission_regrasyon$Transported)
write.csv(submission_regrasyon, "siniftahmini.csv", row.names = FALSE, quote = FALSE)
library(glmulti)
## Loading required package: rJava
## Loading required package: leaps
regresyon_opt <- glmulti(Transported ~ HomePlanet + Cryosleep + Destination + Age + VIP + RoomService + FoodCourt + ShoppingMall + Spa + VRDeck + withgroup + deck + side, + level = 1, crit = bic, data =train)
modelglmulti <- lm(Transported ~ 1 + HomePlanet +  Destination + deck + side + Cryosleep + Age + RoomService + FoodCourt + ShoppingMall + Spa + VRDeck, data = train )
reg_tahmin_glmulti = predict(modelglmulti, newdata = test[, -c(1)])
reg_transported_test_tahmin_glmulti <- ifelse(reg_tahmin_glmulti > 0.5, TRUE, FALSE)
Transported <- as.character(reg_transported_test_tahmin_glmulti)
PassengerId <- test$PassengerId
Transported<-as.vector(Transported)
submission_regrasyon_glmulti <- cbind(PassengerId, Transported)
submission_regrasyon_glmulti <- as.data.frame(submission_regrasyon_glmulti)
submission_regrasyon_glmulti$Transported <- str_to_title(submission_regrasyon_glmulti$Transpored)
write.csv(submission_regrasyon_glmulti, "submission_regrasyon_glmulti.csv", row.names =FALSE, quote=FALSE)
train_log <- train %>% 
  mutate_at(c(5, 7:11, 16), ~log(1 + .))
test_log <- test %>% 
  mutate_at(c(5, 7:11, 16), ~log(1 + .))
modellog <- lm(Transported ~ 1 + HomePlanet + Destination + deck + side + cryosleep +Age + RoomService + FoodCourt + ShoppingMall + Spa + VRDeck, data = train_log)
reg_tahmin_log = predict(modellog, newdata = test_log[, -c(1)])
reg_transported_test_tahmin_log <- ifelse(reg_tahmin_log > 0.5, TRUE, FALSE)
Transported <- as.character(reg_transported_test_tahmin_log)
PassengerId <- test$PassengerId
Transported<-as.vector(Transported)
submission_regrasyon_log <- cbind(PassengerId, Transported)
submission_regrasyon_log <- as.data.frame(submission_regrasyon_log)
submission_regrasyon_log$Transported <- str_to_title(submission_regrasyon_log$Transported)
write.csv(submission_regrasyon_log, "submission_regrasyon_log_csv", row.names =FALSE, quote=FALSE)
logistic = glm(formula = Transported ~ ., 
               family = binomial,
               data = train_train[, -c(1)])
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
summary(logistic)
## 
## Call:
## glm(formula = Transported ~ ., family = binomial, data = train_train[, 
##     -c(1)])
## 
## Coefficients: (3 not defined because of singularities)
##                            Estimate Std. Error z value Pr(>|z|)    
## (Intercept)              -6.785e-02  4.240e-01  -0.160 0.872875    
## HomePlanetEuropa          1.386e+00  2.512e-01   5.519 3.41e-08 ***
## HomePlanetMars            5.227e-01  1.083e-01   4.824 1.41e-06 ***
## CryoSleepTRUE             1.294e+00  9.154e-02  14.139  < 2e-16 ***
## DestinationPSO J318.5-22 -5.017e-01  1.297e-01  -3.869 0.000109 ***
## DestinationTRAPPIST-1e   -4.540e-01  9.401e-02  -4.829 1.37e-06 ***
## Age                      -9.359e-03  2.397e-03  -3.905 9.42e-05 ***
## VIPTRUE                  -1.870e-01  2.935e-01  -0.637 0.524085    
## RoomService              -1.731e-03  1.135e-04 -15.246  < 2e-16 ***
## FoodCourt                 4.557e-04  4.444e-05  10.255  < 2e-16 ***
## ShoppingMall              5.211e-04  7.524e-05   6.927 4.30e-12 ***
## Spa                      -1.951e-03  1.161e-04 -16.810  < 2e-16 ***
## VRDeck                   -1.879e-03  1.157e-04 -16.243  < 2e-16 ***
## withgroup                        NA         NA      NA       NA    
## deckB                     1.260e+00  2.916e-01   4.320 1.56e-05 ***
## deckC                     2.398e+00  3.301e-01   7.266 3.71e-13 ***
## deckD                     5.535e-01  3.191e-01   1.735 0.082774 .  
## deckE                    -7.900e-02  3.255e-01  -0.243 0.808213    
## deckF                     5.733e-01  3.287e-01   1.744 0.081110 .  
## deckG                     1.611e-01  3.378e-01   0.477 0.633416    
## deckT                    -2.762e-01  1.816e+00  -0.152 0.879136    
## sideP                    -1.471e-01  2.452e-01  -0.600 0.548453    
## sideS                     4.591e-01  2.455e-01   1.870 0.061496 .  
## expense                          NA         NA      NA       NA    
## cryosleepTRUE                    NA         NA      NA       NA    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 9038.3  on 6519  degrees of freedom
## Residual deviance: 5612.7  on 6498  degrees of freedom
## AIC: 5656.7
## 
## Number of Fisher Scoring iterations: 7
logistic_tahmin = predict(logistic, newdata = train_test[, -c(1,12)])
head(logistic_tahmin)
##           2           3           7          14          16          21 
##  -1.0391743 -11.0422312   0.8975435  -1.6060140  -0.2496921  -1.7609533
logistic_transported_tahmin <- ifelse(logistic_tahmin > 0.5, 1, 0)
transported_gercek <- ifelse(train_test[12] == TRUE,1,0)
library(tidymodels)
## ── Attaching packages ────────────────────────────────────── tidymodels 1.2.0 ──
## ✔ broom        1.0.5      ✔ rsample      1.2.1 
## ✔ dials        1.2.1      ✔ tune         1.2.1 
## ✔ infer        1.0.7      ✔ workflows    1.1.4 
## ✔ modeldata    1.3.0      ✔ workflowsets 1.1.0 
## ✔ parsnip      1.2.1      ✔ yardstick    1.3.1 
## ✔ recipes      1.0.10
## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ scales::discard() masks purrr::discard()
## ✖ dplyr::filter()   masks stats::filter()
## ✖ recipes::fixed()  masks stringr::fixed()
## ✖ dplyr::lag()      masks stats::lag()
## ✖ yardstick::spec() masks readr::spec()
## ✖ recipes::step()   masks stats::step()
## • Dig deeper into tidy modeling with R at https://www.tmwr.org
result = data.frame(cbind(transported_gercek, logistic_transported_tahmin))
result$Transported <- as.factor(result$Transported)
result$logistic_transported_tahmin <- as.factor(result$logistic_transported_tahmin)
accuracy(result,truth = Transported, estimate = logistic_transported_tahmin)
## # A tibble: 1 × 3
##   .metric  .estimator .estimate
##   <chr>    <chr>          <dbl>
## 1 accuracy binary         0.785
conf_mat(result, truth = Transported, estimate = logistic_transported_tahmin)
##           Truth
## Prediction   0   1
##          0 921 310
##          1 158 784
library(caret)
## Loading required package: lattice
## 
## Attaching package: 'caret'
## The following objects are masked from 'package:yardstick':
## 
##     precision, recall, sensitivity, specificity
## The following object is masked from 'package:purrr':
## 
##     lift
cm = table(transported_gercek, logistic_transported_tahmin)
cm
##                   logistic_transported_tahmin
## transported_gercek   0   1
##                  0 921 158
##                  1 310 784
(921 + 784)/ (921 + 158 + 310 + 784)
## [1] 0.7846295
confusionMatrix(as.factor(transported_gercek), as.factor(logistic_transported_tahmin))
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 921 158
##          1 310 784
##                                           
##                Accuracy : 0.7846          
##                  95% CI : (0.7667, 0.8018)
##     No Information Rate : 0.5665          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.5697          
##                                           
##  Mcnemar's Test P-Value : 2.952e-12       
##                                           
##             Sensitivity : 0.7482          
##             Specificity : 0.8323          
##          Pos Pred Value : 0.8536          
##          Neg Pred Value : 0.7166          
##              Prevalence : 0.5665          
##          Detection Rate : 0.4238          
##    Detection Prevalence : 0.4965          
##       Balanced Accuracy : 0.7902          
##                                           
##        'Positive' Class : 0               
## 
logistic_bd = glm(formula = Transported ~ ., 
                  family = binomial, 
                  data = train[, -c(1)])
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
logistic_tahmin_bd = predict(logistic_bd, newdata = test[, -c(1)])
logistic_transported_test_tahmin <- ifelse(logistic_tahmin_bd > 0.5, TRUE, FALSE)
Transported <- as.character(logistic_transported_test_tahmin)
PassengerId <- test$PassengerId
Transported<-as.vector(Transported)
submision_logistic <- cbind(PassengerId, Transported)
submision_logistic <- as.data.frame(submision_logistic)
submision_logistic$Transported <- 
  str_to_title(submision_logistic$Transported)
write.csv(submision_logistic, "submision_logistic.csv", row.names =FALSE, quote = FALSE)
library(e1071)
## 
## Attaching package: 'e1071'
## The following object is masked from 'package:tune':
## 
##     tune
## The following object is masked from 'package:rsample':
## 
##     permutations
## The following object is masked from 'package:parsnip':
## 
##     tune
fit_nb <- naiveBayes(Transported ~ ., data =train_train[, -1])
fit_nb
## 
## Naive Bayes Classifier for Discrete Predictors
## 
## Call:
## naiveBayes.default(x = X, y = Y, laplace = laplace)
## 
## A-priori probabilities:
## Y
##    FALSE     TRUE 
## 0.496319 0.503681 
## 
## Conditional probabilities:
##        HomePlanet
## Y           Earth    Europa      Mars
##   FALSE 0.6251545 0.1749073 0.1999382
##   TRUE  0.4649817 0.3285627 0.2064555
## 
##        CryoSleep
## Y           FALSE      TRUE
##   FALSE 0.8702101 0.1297899
##   TRUE  0.4336175 0.5663825
## 
##        Destination
## Y       55 Cancri e PSO J318.5-22 TRAPPIST-1e
##   FALSE  0.16069221    0.09363412  0.74567367
##   TRUE   0.24969549    0.09043849  0.65986602
## 
##        Age
## Y           [,1]     [,2]
##   FALSE 30.01152 13.45216
##   TRUE  27.84835 14.87502
## 
##        VIP
## Y            FALSE       TRUE
##   FALSE 0.97126082 0.02873918
##   TRUE  0.98142509 0.01857491
## 
##        RoomService
## Y            [,1]     [,2]
##   FALSE 402.20365 916.6547
##   TRUE   56.93484 246.8105
## 
##        FoodCourt
## Y           [,1]     [,2]
##   FALSE 396.2923 1258.123
##   TRUE  514.8018 1918.620
## 
##        ShoppingMall
## Y           [,1]     [,2]
##   FALSE 160.2250 432.1103
##   TRUE  182.7135 748.5867
## 
##        Spa
## Y            [,1]      [,2]
##   FALSE 561.16193 1554.2636
##   TRUE   61.11571  264.0556
## 
##        VRDeck
## Y            [,1]      [,2]
##   FALSE 533.98733 1542.8703
##   TRUE   69.07186  295.0542
## 
##        withgroup
## Y       [,1] [,2]
##   FALSE    0    0
##   TRUE     0    0
## 
##        deck
## Y                  A            B            C            D            E
##   FALSE 0.0296662546 0.0553152040 0.0568603214 0.0655129790 0.1338071693
##   TRUE  0.0301461632 0.1419001218 0.1178440926 0.0484165652 0.0672959805
##        deck
## Y                  F            G            T
##   FALSE 0.3634116193 0.2941903585 0.0012360939
##   TRUE  0.2834957369 0.3105968331 0.0003045067
## 
##        side
## Y                           P          S
##   FALSE 0.02348578 0.54079110 0.43572311
##   TRUE  0.02131547 0.43300853 0.54567600
## 
##        expense
## Y            [,1]     [,2]
##   FALSE 2053.8702 3224.219
##   TRUE   884.6376 2307.962
## 
##        cryosleep
## Y           FALSE      TRUE
##   FALSE 0.8702101 0.1297899
##   TRUE  0.4336175 0.5663825
pred_nb <- predict(fit_nb, newdata = train_test[, -c(1, 12)], type = "raw") %>% data.frame()
head(pred_nb)
##       FALSE.         TRUE.
## 1 0.32131320  6.786868e-01
## 2 1.00000000 3.298257e-135
## 3 0.06619047  9.338095e-01
## 4 0.70820285  2.917971e-01
## 5 0.03490270  9.650973e-01
## 6 0.61151399  3.884860e-01
Transported_pred_nb = ifelse(pred_nb$TRUE. > 0.5, 1, 0)
Transported_test_train <- ifelse(train_test[12] == TRUE, 1, 0)
head(Transported_test_train)
##    Transported
## 2            1
## 3            0
## 7            1
## 14           0
## 16           0
## 21           0
cm= table(Transported_test_train, Transported_pred_nb)
cm
##                       Transported_pred_nb
## Transported_test_train    0    1
##                      0  530  549
##                      1   86 1008
(530 + 549) / (530 + 549 + 86 + 1008)
## [1] 0.4965486
fit_svm <- svm(Transported ~ ., data = train_train[, -1], type = 'C-classification')
## Warning in svm.default(x, y, scale = scale, ..., na.action = na.action):
## Variable(s) 'withgroup' constant. Cannot scale data.
preds <- predict(fit_svm, newdata = train_test[, -c(1, 12)], type = "raw") %>% data.frame()
head(preds)
##        .
## 2  FALSE
## 3  FALSE
## 7  FALSE
## 14 FALSE
## 16 FALSE
## 21 FALSE
Transported_pred_svm = ifelse(preds$. == TRUE, 1, 0)
cm = table(Transported_test_train, Transported_pred_svm)
cm
##                       Transported_pred_svm
## Transported_test_train   0   1
##                      0 897 182
##                      1 378 716
(897 + 182) / (897 + 182 + 378 + 812)
## [1] 0.4755399
fit_svm <- svm(Transported ~ ., data = train[, -1], 
               type = 'C-classification')
## Warning in svm.default(x, y, scale = scale, ..., na.action = na.action):
## Variable(s) 'withgroup' constant. Cannot scale data.
preds <- predict(fit_svm, newdata = train_test[, -c(1, 12)], type = "raw") %>% data.frame()
Transported_pred_svm = ifelse(preds$. == TRUE, TRUE, FALSE)
Transported <- as.character(Transported_pred_svm)
PassengerId <- test$PassengerId
Transported <- as.vector(Transported)
sample_submision <- cbind(PassengerId, Transported)
## Warning in cbind(PassengerId, Transported): number of rows of result is not a
## multiple of vector length (arg 2)
sample_submision <- as.data.frame(sample_submision)
write.csv(sample_submision, "sub_svm_csv", row.names = FALSE, quote = FALSE)
fit_svm <- svm(Transported ~ ., data = train_train[, -1], 
      type= 'C-classification',
       kernel = 'radial' )
## Warning in svm.default(x, y, scale = scale, ..., na.action = na.action):
## Variable(s) 'withgroup' constant. Cannot scale data.
preds <- predict(fit_svm, newdata= train_test[, -c(1,12)], type= "raw") %>% data.frame()
Transported_pred_svm = ifelse(preds$.== TRUE, 1,0)
cm = table(Transported_test_train, Transported_pred_svm) 
cm 
##                       Transported_pred_svm
## Transported_test_train   0   1
##                      0 897 182
##                      1 378 716
(897 + 182) / (897 + 182 + 378 + 812)
## [1] 0.4755399
fit_svm <- svm(Transported ~ ., data = train[, -1], 
        type= 'C-classification',
       kernel = 'radial' )
## Warning in svm.default(x, y, scale = scale, ..., na.action = na.action):
## Variable(s) 'withgroup' constant. Cannot scale data.
preds <- predict(fit_svm, newdata= test, type= "raw") %>% data.frame()
Transported_pred_svm = ifelse(preds$.== TRUE,TRUE,FALSE)
Transported <- as.character(Transported_pred_svm)
PassengerId <- test$PassengerId
Transported <- as.vector(Transported)
sample_submission <- cbind(PassengerId,Transported)
sample_submission <- as.data.frame(sample_submission)
sample_submission$Transported <- str_to_title(sample_submission$Transported)
write.csv(sample_submission,"sub_svm_radial.csv",row.names = FALSE, quote = FALSE)
P <- ggplot(train_train,aes(x=HomePlanet, y=deck,color=factor(Transported))) +
  geom_point(aes(shape=factor(Transported)), size=3) +
  scale_color_viridis_d() +
  labs(title = "", x="HomePlanet", y="deck") +
  theme_minimal() +
  theme(legend.position = "top")
P

library(rpart)
## 
## Attaching package: 'rpart'
## The following object is masked from 'package:dials':
## 
##     prune
library(rpart.plot)
library(randomForest)
## randomForest 4.7-1.1
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## The following object is masked from 'package:dplyr':
## 
##     combine
## The following object is masked from 'package:ggplot2':
## 
##     margin
library(caret)
fit_tree <- rpart::rpart(Transported ~ ., data = train_train[, -1])
summary(fit_tree)
## Call:
## rpart::rpart(formula = Transported ~ ., data = train_train[, 
##     -1])
##   n= 6520 
## 
##           CP nsplit rel error    xerror         xstd
## 1 0.23321359      0 1.0000000 1.0001811 0.0001977604
## 2 0.04981598      1 0.7667864 0.7671015 0.0103212807
## 3 0.02807107      2 0.7169704 0.7173428 0.0090686704
## 4 0.02784643      3 0.6888994 0.7031918 0.0095398822
## 5 0.01770479      4 0.6610529 0.6654236 0.0096767137
## 6 0.01381587      5 0.6433481 0.6488743 0.0100411278
## 7 0.01194576      6 0.6295323 0.6381676 0.0101816720
## 8 0.01164524      7 0.6175865 0.6303299 0.0102037441
## 9 0.01000000      8 0.6059413 0.6235169 0.0102521030
## 
## Variable importance
##      expense    cryosleep    CryoSleep    FoodCourt          Spa       VRDeck 
##           21           16           16           12           11           10 
##         deck   HomePlanet ShoppingMall          Age  Destination 
##            5            5            2            1            1 
## 
## Node number 1: 6520 observations,    complexity param=0.2332136
##   mean=0.503681, MSE=0.2499865 
##   left son=2 (3795 obs) right son=3 (2725 obs)
##   Primary splits:
##       expense     < 0.5     to the right, improve=0.2332136, (0 missing)
##       CryoSleep   < 0.5     to the left,  improve=0.2095384, (0 missing)
##       cryosleep   < 0.5     to the left,  improve=0.2095384, (0 missing)
##       RoomService < 0.5     to the right, improve=0.1254121, (0 missing)
##       Spa         < 0.5     to the right, improve=0.1142180, (0 missing)
##   Surrogate splits:
##       CryoSleep < 0.5     to the left,  agree=0.932, adj=0.837, (0 split)
##       cryosleep < 0.5     to the left,  agree=0.932, adj=0.837, (0 split)
##       Spa       < 0.5     to the right, agree=0.784, adj=0.484, (0 split)
##       FoodCourt < 0.5     to the right, agree=0.770, adj=0.451, (0 split)
##       VRDeck    < 0.5     to the right, agree=0.768, adj=0.444, (0 split)
## 
## Node number 2: 3795 observations,    complexity param=0.02807107
##   mean=0.2990777, MSE=0.2096302 
##   left son=4 (3243 obs) right son=5 (552 obs)
##   Primary splits:
##       FoodCourt    < 1331    to the left,  improve=0.05751186, (0 missing)
##       ShoppingMall < 627.5   to the left,  improve=0.04530804, (0 missing)
##       RoomService  < 365.5   to the right, improve=0.04440387, (0 missing)
##       Spa          < 257.5   to the right, improve=0.03347453, (0 missing)
##       VRDeck       < 721     to the right, improve=0.02290457, (0 missing)
##   Surrogate splits:
##       expense    < 5981    to the left,  agree=0.885, adj=0.210, (0 split)
##       deck       splits as  RRRLLLLL,    agree=0.884, adj=0.201, (0 split)
##       HomePlanet splits as  LRL,         agree=0.878, adj=0.161, (0 split)
##       Spa        < 8955.5  to the left,  agree=0.856, adj=0.009, (0 split)
##       VRDeck     < 11692   to the left,  agree=0.856, adj=0.009, (0 split)
## 
## Node number 3: 2725 observations,    complexity param=0.04981598
##   mean=0.7886239, MSE=0.1666963 
##   left son=6 (1449 obs) right son=7 (1276 obs)
##   Primary splits:
##       deck        splits as  RRRRLRL-,    improve=0.17874760, (0 missing)
##       HomePlanet  splits as  LRR,         improve=0.12440710, (0 missing)
##       Destination splits as  RLL,         improve=0.02625136, (0 missing)
##       CryoSleep   < 0.5     to the left,  improve=0.02268236, (0 missing)
##       cryosleep   < 0.5     to the left,  improve=0.02268236, (0 missing)
##   Surrogate splits:
##       HomePlanet  splits as  LRR,         agree=0.933, adj=0.857, (0 split)
##       Age         < 24.5    to the left,  agree=0.625, adj=0.200, (0 split)
##       Destination splits as  RLL,         agree=0.591, adj=0.126, (0 split)
##       VIP         < 0.5     to the left,  agree=0.538, adj=0.014, (0 split)
##       side        splits as  RLL,         agree=0.533, adj=0.002, (0 split)
## 
## Node number 4: 3243 observations,    complexity param=0.02784643
##   mean=0.2537774, MSE=0.1893744 
##   left son=8 (2577 obs) right son=9 (666 obs)
##   Primary splits:
##       ShoppingMall < 541.5   to the left,  improve=0.07390355, (0 missing)
##       RoomService  < 365.5   to the right, improve=0.03464407, (0 missing)
##       Spa          < 240.5   to the right, improve=0.03327259, (0 missing)
##       VRDeck       < 114     to the right, improve=0.02784287, (0 missing)
##       expense      < 2867.5  to the right, improve=0.01811461, (0 missing)
##   Surrogate splits:
##       expense < 18644   to the left,  agree=0.795, adj=0.003, (0 split)
## 
## Node number 5: 552 observations,    complexity param=0.01770479
##   mean=0.5652174, MSE=0.2457467 
##   left son=10 (123 obs) right son=11 (429 obs)
##   Primary splits:
##       Spa     < 1372.5  to the right, improve=0.21272970, (0 missing)
##       VRDeck  < 1063.5  to the right, improve=0.17089500, (0 missing)
##       expense < 5395    to the right, improve=0.06611166, (0 missing)
##       deck    splits as  LLRLLRRL,    improve=0.02812225, (0 missing)
##       side    splits as  LLR,         improve=0.02807513, (0 missing)
##   Surrogate splits:
##       expense     < 12647   to the right, agree=0.790, adj=0.057, (0 split)
##       Age         < 13.5    to the left,  agree=0.779, adj=0.008, (0 split)
##       RoomService < 3895.5  to the right, agree=0.779, adj=0.008, (0 split)
## 
## Node number 6: 1449 observations
##   mean=0.6266391, MSE=0.2339625 
## 
## Node number 7: 1276 observations
##   mean=0.9725705, MSE=0.02667709 
## 
## Node number 8: 2577 observations,    complexity param=0.01194576
##   mean=0.193636, MSE=0.1561411 
##   left son=16 (2067 obs) right son=17 (510 obs)
##   Primary splits:
##       FoodCourt   < 456.5   to the left,  improve=0.04838893, (0 missing)
##       expense     < 1447.5  to the right, improve=0.04016842, (0 missing)
##       HomePlanet  splits as  RLL,         improve=0.02438006, (0 missing)
##       Spa         < 537.5   to the right, improve=0.01895890, (0 missing)
##       RoomService < 400.5   to the right, improve=0.01706521, (0 missing)
##   Surrogate splits:
##       expense < 12373   to the left,  agree=0.804, adj=0.008, (0 split)
##       Spa     < 13650   to the left,  agree=0.803, adj=0.006, (0 split)
##       VRDeck  < 10123.5 to the left,  agree=0.802, adj=0.002, (0 split)
##       deck    splits as  LLLLLLLR,    agree=0.802, adj=0.002, (0 split)
## 
## Node number 9: 666 observations
##   mean=0.4864865, MSE=0.2498174 
## 
## Node number 10: 123 observations
##   mean=0.1382114, MSE=0.119109 
## 
## Node number 11: 429 observations,    complexity param=0.01381587
##   mean=0.6876457, MSE=0.2147891 
##   left son=22 (143 obs) right son=23 (286 obs)
##   Primary splits:
##       VRDeck      < 611     to the right, improve=0.24438400, (0 missing)
##       Spa         < 225     to the right, improve=0.05300377, (0 missing)
##       FoodCourt   < 3119.5  to the left,  improve=0.05168044, (0 missing)
##       side        splits as  LLR,         improve=0.04323810, (0 missing)
##       RoomService < 1719.5  to the right, improve=0.03930897, (0 missing)
##   Surrogate splits:
##       expense   < 6032    to the right, agree=0.702, adj=0.105, (0 split)
##       Age       < 53.5    to the right, agree=0.674, adj=0.021, (0 split)
##       FoodCourt < 12128.5 to the right, agree=0.671, adj=0.014, (0 split)
## 
## Node number 16: 2067 observations
##   mean=0.1504596, MSE=0.1278215 
## 
## Node number 17: 510 observations,    complexity param=0.01164524
##   mean=0.3686275, MSE=0.2327413 
##   left son=34 (204 obs) right son=35 (306 obs)
##   Primary splits:
##       expense    < 1447.5  to the right, improve=0.15990760, (0 missing)
##       VRDeck     < 86.5    to the right, improve=0.10060710, (0 missing)
##       HomePlanet splits as  RLL,         improve=0.07491751, (0 missing)
##       Spa        < 500     to the right, improve=0.07353369, (0 missing)
##       deck       splits as  LLLLRRRL,    improve=0.05075444, (0 missing)
##   Surrogate splits:
##       HomePlanet splits as  RLL,         agree=0.867, adj=0.667, (0 split)
##       deck       splits as  LLLLRRRL,    agree=0.839, adj=0.598, (0 split)
##       VRDeck     < 213.5   to the right, agree=0.818, adj=0.544, (0 split)
##       Spa        < 219.5   to the right, agree=0.792, adj=0.480, (0 split)
##       FoodCourt  < 907     to the right, agree=0.722, adj=0.304, (0 split)
## 
## Node number 22: 143 observations
##   mean=0.3636364, MSE=0.231405 
## 
## Node number 23: 286 observations
##   mean=0.8496503, MSE=0.1277446 
## 
## Node number 34: 204 observations
##   mean=0.1323529, MSE=0.1148356 
## 
## Node number 35: 306 observations
##   mean=0.5261438, MSE=0.2493165
rpart.plot(fit_tree)

preds= predict(fit_tree,newdata = train_test[, -c(1,12)]) %>%
  data.frame()
head(preds)
##            .
## 2  0.1504596
## 3  0.1382114
## 7  0.8496503
## 14 0.1504596
## 16 0.4864865
## 21 0.1504596
Transported_pred_tree = ifelse(preds$. >0.5, 1, 0)
cm = table(Transported_test_train, Transported_pred_tree)
cm
##                       Transported_pred_tree
## Transported_test_train   0   1
##                      0 807 272
##                      1 237 857
(807 + 272) / (807 + 272 + 237 + 272)
## [1] 0.679471
fit_tree <- rpart(Transported ~ ., data = train[, -1])

preds <- predict(fit_tree, newdata = test) %>%
  data.frame()
Transported_pred_tree = ifelse(preds$. > 0.5, TRUE, FALSE)
Transported <- as.character(Transported_pred_tree)
PassengerId <- test$PassengerId
Transported <- as.vector(Transported)
sample_submission <- cbind(PassengerId, Transported)
sample_submission <- as.data.frame(sample_submission) 
sample_submission$Transported <- str_to_title(sample_submission$Transported)
write.csv(sample_submission, "sub_tree.csv", row.names = FALSE, quote = FALSE)
fit_forest <- randomForest(Transported ~ ., data = train_train[, -1])
## Warning in randomForest.default(m, y, ...): The response has five or fewer
## unique values.  Are you sure you want to do regression?
fit_forest$importance
##              IncNodePurity
## HomePlanet       39.895123
## CryoSleep        80.141134
## Destination      25.772116
## Age              94.120805
## VIP               2.593687
## RoomService     108.064499
## FoodCourt       116.320446
## ShoppingMall     93.053181
## Spa             115.939271
## VRDeck          105.156138
## withgroup         0.000000
## deck             97.745154
## side             26.504009
## expense         250.784767
## cryosleep        84.477854
varImpPlot(fit_forest)

preds = predict(fit_forest, newdata = train_test[, -c(1,12)]) %>%
  data.frame()
head(preds)
##             .
## 2  0.09394136
## 3  0.13646667
## 7  0.80923810
## 14 0.22278600
## 16 0.72546365
## 21 0.15576667
Transported_pred_forest = ifelse(preds$. >0.5, 1, 0)
cm = table(Transported_test_train, Transported_pred_forest)
cm
##                       Transported_pred_forest
## Transported_test_train   0   1
##                      0 823 256
##                      1 177 917
( 820 + 259 ) / ( 820 + 259 + 176 + 918)
## [1] 0.4965486
fit_forest <- randomForest(Transported ~ ., data = train[, -1])
## Warning in randomForest.default(m, y, ...): The response has five or fewer
## unique values.  Are you sure you want to do regression?
  preds <- predict(fit_forest, newdata = test) %>%
    data.frame()
Transported <- as.character(Transported_pred_tree)
PassengerId <- test$PassengerId
Transported <- as.vector(Transported)
sample_submission <- cbind(PassengerId, Transported)
sample_submission <- as.data.frame(sample_submission) 
sample_submission$Transported <- str_to_title(sample_submission$Transported)
write.csv(sample_submission, "sub_forest.csv", row.names = FALSE, quote = FALSE)