PROJET

2110701569

Spaceship Titanic

2912 yılına hoş geldiniz, veri bilimi becerilerinizin bir kozmik gizemi çözmek için gerekeceği bir çağdayız. Dört ışık yılı uzaklıktan bir ileti aldık ve durum iyi görünmüyor.

Uzay Gemisi Titanic, yaklaşık bir ay önce fırlatılan yıldızlararası bir yolcu gemisiydi. Gemi, yaklaşık 13.000 yolcusuyla, Güneş Sistemi’nden yakın yıldızların yörüngesindeki üç yeni yaşanabilir ötegezegene göçmen taşımak üzere ilk seferine çıktı.

İlk durağı olan kavurucu 55 Cancri E’ye giderken Alfa Centauri’nin çevresinden dolanan tedbirsiz Uzay Gemisi Titanic, bir toz bulutunun içine gizlenmiş bir uzay-zaman anomalisiyle çarpıştı. Ne yazık ki gemi, adını aldığı bin yıl önceki gemiyle benzer bir kaderi paylaştı. Gemi bütünlüğünü korudu, ancak yolcuların neredeyse yarısı alternatif bir boyuta taşındı.

# Read the training and test CSV files, render both as paged tables, and
# print a per-column overview of the training data.
library(readr)
train <- read_csv(file = "train.csv")
library(readr)
test <- read_csv(file = "test.csv")
library(rmarkdown)
paged_table(train)
paged_table(test)
library(tidyverse)
library(explore)
describe_all(train)
## # A tibble: 14 × 8
##    variable     type     na na_pct unique   min   mean   max
##    <chr>        <chr> <int>  <dbl>  <int> <dbl>  <dbl> <dbl>
##  1 PassengerId  chr       0    0     8693    NA  NA       NA
##  2 HomePlanet   chr     201    2.3      4    NA  NA       NA
##  3 CryoSleep    lgl     217    2.5      3     0   0.36     1
##  4 Cabin        chr     199    2.3   6561    NA  NA       NA
##  5 Destination  chr     182    2.1      4    NA  NA       NA
##  6 Age          dbl     179    2.1     81     0  28.8     79
##  7 VIP          lgl     203    2.3      3     0   0.02     1
##  8 RoomService  dbl     181    2.1   1274     0 225.   14327
##  9 FoodCourt    dbl     183    2.1   1508     0 458.   29813
## 10 ShoppingMall dbl     208    2.4   1116     0 174.   23492
## 11 Spa          dbl     183    2.1   1328     0 311.   22408
## 12 VRDeck       dbl     188    2.2   1307     0 305.   24133
## 13 Name         chr     200    2.3   8474    NA  NA       NA
## 14 Transported  lgl       0    0        2     0   0.5      1
test %>%describe_all()
## # A tibble: 13 × 8
##    variable     type     na na_pct unique   min   mean   max
##    <chr>        <chr> <int>  <dbl>  <int> <dbl>  <dbl> <dbl>
##  1 PassengerId  chr       0    0     4277    NA  NA       NA
##  2 HomePlanet   chr      87    2        4    NA  NA       NA
##  3 CryoSleep    lgl      93    2.2      3     0   0.37     1
##  4 Cabin        chr     100    2.3   3266    NA  NA       NA
##  5 Destination  chr      92    2.2      4    NA  NA       NA
##  6 Age          dbl      91    2.1     80     0  28.7     79
##  7 VIP          lgl      93    2.2      3     0   0.02     1
##  8 RoomService  dbl      82    1.9    843     0 219.   11567
##  9 FoodCourt    dbl     106    2.5    903     0 439.   25273
## 10 ShoppingMall dbl      98    2.3    716     0 177.    8292
## 11 Spa          dbl     101    2.4    834     0 303.   19844
## 12 VRDeck       dbl      80    1.9    797     0 311.   22272
## 13 Name         chr      94    2.2   4177    NA  NA       NA
head(train)
## # A tibble: 6 × 14
##   PassengerId HomePlanet CryoSleep Cabin Destination     Age VIP   RoomService
##   <chr>       <chr>      <lgl>     <chr> <chr>         <dbl> <lgl>       <dbl>
## 1 0001_01     Europa     FALSE     B/0/P TRAPPIST-1e      39 FALSE           0
## 2 0002_01     Earth      FALSE     F/0/S TRAPPIST-1e      24 FALSE         109
## 3 0003_01     Europa     FALSE     A/0/S TRAPPIST-1e      58 TRUE           43
## 4 0003_02     Europa     FALSE     A/0/S TRAPPIST-1e      33 FALSE           0
## 5 0004_01     Earth      FALSE     F/1/S TRAPPIST-1e      16 FALSE         303
## 6 0005_01     Earth      FALSE     F/0/P PSO J318.5-22    44 FALSE           0
## # ℹ 6 more variables: FoodCourt <dbl>, ShoppingMall <dbl>, Spa <dbl>,
## #   VRDeck <dbl>, Name <chr>, Transported <lgl>
head(test)
## # A tibble: 6 × 13
##   PassengerId HomePlanet CryoSleep Cabin Destination   Age VIP   RoomService
##   <chr>       <chr>      <lgl>     <chr> <chr>       <dbl> <lgl>       <dbl>
## 1 0013_01     Earth      TRUE      G/3/S TRAPPIST-1e    27 FALSE           0
## 2 0018_01     Earth      FALSE     F/4/S TRAPPIST-1e    19 FALSE           0
## 3 0019_01     Europa     TRUE      C/0/S 55 Cancri e    31 FALSE           0
## 4 0021_01     Europa     FALSE     C/1/S TRAPPIST-1e    38 FALSE           0
## 5 0023_01     Earth      FALSE     F/5/S TRAPPIST-1e    20 FALSE          10
## 6 0027_01     Earth      FALSE     F/7/P TRAPPIST-1e    31 FALSE           0
## # ℹ 5 more variables: FoodCourt <dbl>, ShoppingMall <dbl>, Spa <dbl>,
## #   VRDeck <dbl>, Name <chr>
unique(train$HomePlanet)
## [1] "Europa" "Earth"  "Mars"   NA
unique(test$HomePlanet)
## [1] "Earth"  "Europa" "Mars"   NA
unique(train$CryoSleep)
## [1] FALSE  TRUE    NA
unique(test$CryoSleep)
## [1]  TRUE FALSE    NA
unique(train$Destination)
## [1] "TRAPPIST-1e"   "PSO J318.5-22" "55 Cancri e"   NA
unique(test$Destination)
## [1] "TRAPPIST-1e"   "55 Cancri e"   "PSO J318.5-22" NA
# Turn HomePlanet, CryoSleep and Destination into factors in which a missing
# value is an explicit level of its own (addNA), for both data sets.
train[["HomePlanet"]] <- addNA(train[["HomePlanet"]])
train[["CryoSleep"]] <- addNA(train[["CryoSleep"]])
train[["Destination"]] <- addNA(train[["Destination"]])
test[["HomePlanet"]] <- addNA(test[["HomePlanet"]])
test[["CryoSleep"]] <- addNA(test[["CryoSleep"]])
test[["Destination"]] <- addNA(test[["Destination"]])
library(dplyr)
library(tidyr)
# Fill each missing Age with the mean Age of the passengers that share the
# same HomePlanet and Destination. Both grouping columns carry an explicit NA
# level at this point, so passengers with a missing planet or destination
# form groups of their own.
# ungroup() is applied afterwards so the grouping does not silently carry
# over into the later steps of the script.
train <- train %>%
  group_by(HomePlanet, Destination) %>%
  mutate(Age = replace_na(Age, mean(Age, na.rm = TRUE))) %>%
  ungroup()
test <- test %>%
  group_by(HomePlanet, Destination) %>%
  mutate(Age = replace_na(Age, mean(Age, na.rm = TRUE))) %>%
  ungroup()
# A missing VIP flag becomes an explicit factor level of its own.
train$VIP <- addNA(train$VIP)
test$VIP <- addNA(test$VIP)
# A missing amount in any of the five spending columns is replaced by 0.
train <- train %>%
  mutate(across(
    c(RoomService, FoodCourt, ShoppingMall, Spa, VRDeck),
    ~ coalesce(.x, 0)
  ))
test <- test %>%
  mutate(across(
    c(RoomService, FoodCourt, ShoppingMall, Spa, VRDeck),
    ~ coalesce(.x, 0)
  ))
# Split PassengerId at "_" into two new columns, Familynum and Familyrow.
train[c('Familynum','Familyrow')] <- str_split_fixed(train$PassengerId,'_',2)
test[c('Familynum','Familyrow')] <- str_split_fixed(test$PassengerId,'_',2)
# Reorder by position so the two new (last) columns come first. train has one
# more column than test (Transported), hence the different indices.
train<-train [,c(15,16,1:14)]
test<-test [,c(14,15,1:13)]
# Split Cabin at "/" into three new columns: Deck, Num and Side.
train[c('Deck','Num','Side')] <- str_split_fixed(train$Cabin,'/',3)
test[c('Deck','Num','Side')] <- str_split_fixed(test$Cabin,'/',3)
# Move the three new (last) columns to positions 7-9, directly after Cabin.
train<-train [,c(1:6,17,18,19,7:16)]
test<-test [,c(1:6,16,17,18,7:15)]
# Replace every empty string in the data with NA.
train[train==''] <- NA
test[test==''] <- NA
train %>%describe_all()
## # A tibble: 19 × 8
##    variable     type     na na_pct unique   min  mean   max
##    <chr>        <chr> <int>  <dbl>  <int> <dbl> <dbl> <dbl>
##  1 Familynum    chr       0    0     6217    NA  NA      NA
##  2 Familyrow    chr       0    0        8    NA  NA      NA
##  3 PassengerId  chr       0    0     8693    NA  NA      NA
##  4 HomePlanet   fct       0    0        4    NA  NA      NA
##  5 CryoSleep    fct       0    0        3    NA  NA      NA
##  6 Cabin        chr     199    2.3   6561    NA  NA      NA
##  7 Deck         chr     199    2.3      9    NA  NA      NA
##  8 Num          chr     199    2.3   1818    NA  NA      NA
##  9 Side         chr     199    2.3      3    NA  NA      NA
## 10 Destination  fct       0    0        4    NA  NA      NA
## 11 Age          dbl       0    0       91     0  28.8    79
## 12 VIP          fct       0    0        3    NA  NA      NA
## 13 RoomService  dbl       0    0     1273     0 220.  14327
## 14 FoodCourt    dbl       0    0     1507     0 448.  29813
## 15 ShoppingMall dbl       0    0     1115     0 170.  23492
## 16 Spa          dbl       0    0     1327     0 305.  22408
## 17 VRDeck       dbl       0    0     1306     0 298.  24133
## 18 Name         chr     200    2.3   8474    NA  NA      NA
## 19 Transported  lgl       0    0        2     0   0.5     1
test %>%describe_all()
## # A tibble: 18 × 8
##    variable     type     na na_pct unique   min  mean   max
##    <chr>        <chr> <int>  <dbl>  <int> <dbl> <dbl> <dbl>
##  1 Familynum    chr       0    0     3063    NA  NA      NA
##  2 Familyrow    chr       0    0        8    NA  NA      NA
##  3 PassengerId  chr       0    0     4277    NA  NA      NA
##  4 HomePlanet   fct       0    0        4    NA  NA      NA
##  5 CryoSleep    fct       0    0        3    NA  NA      NA
##  6 Cabin        chr     100    2.3   3266    NA  NA      NA
##  7 Deck         chr     100    2.3      9    NA  NA      NA
##  8 Num          chr     100    2.3   1506    NA  NA      NA
##  9 Side         chr     100    2.3      3    NA  NA      NA
## 10 Destination  fct       0    0        4    NA  NA      NA
## 11 Age          dbl       0    0       91     0  28.7    79
## 12 VIP          fct       0    0        3    NA  NA      NA
## 13 RoomService  dbl       0    0      842     0 215.  11567
## 14 FoodCourt    dbl       0    0      902     0 429.  25273
## 15 ShoppingMall dbl       0    0      715     0 173.   8292
## 16 Spa          dbl       0    0      833     0 296.  19844
## 17 VRDeck       dbl       0    0      796     0 305.  22272
## 18 Name         chr      94    2.2   4177    NA  NA      NA
# Family is 1 for every passenger whose group number (Familynum) occurs more
# than once, and 0 otherwise.
# duplicated() alone marks only the second and later occurrences, so it is
# combined with a pass from the end (fromLast = TRUE) to also mark the first
# member of each group. The argument name is case-sensitive: the previous
# spelling `fromlast` was silently ignored, which left the first member of
# every group flagged as 0.
# The recorded outputs further down were produced before this correction.
train$Family <- ifelse(duplicated(train$Familynum) | duplicated(train$Familynum, fromLast = TRUE), 1, 0)
test$Family <- ifelse(duplicated(test$Familynum) | duplicated(test$Familynum, fromLast = TRUE), 1, 0)
head(train[,c("PassengerId","Familynum","Familyrow", "Family")],30)
## # A tibble: 30 × 4
##    PassengerId Familynum Familyrow Family
##    <chr>       <chr>     <chr>      <dbl>
##  1 0001_01     0001      01             0
##  2 0002_01     0002      01             0
##  3 0003_01     0003      01             0
##  4 0003_02     0003      02             1
##  5 0004_01     0004      01             0
##  6 0005_01     0005      01             0
##  7 0006_01     0006      01             0
##  8 0006_02     0006      02             1
##  9 0007_01     0007      01             0
## 10 0008_01     0008      01             0
## # ℹ 20 more rows
head(test[,c("PassengerId","Familynum","Familyrow", "Family")],30)
## # A tibble: 30 × 4
##    PassengerId Familynum Familyrow Family
##    <chr>       <chr>     <chr>      <dbl>
##  1 0013_01     0013      01             0
##  2 0018_01     0018      01             0
##  3 0019_01     0019      01             0
##  4 0021_01     0021      01             0
##  5 0023_01     0023      01             0
##  6 0027_01     0027      01             0
##  7 0029_01     0029      01             0
##  8 0032_01     0032      01             0
##  9 0032_02     0032      02             1
## 10 0033_01     0033      01             0
## # ℹ 20 more rows
# Remove the columns that are not used any further, then make a missing Deck
# or Side an explicit factor level of its own.
train <- select(train, -c("Cabin", "Name", "Familynum", "Familyrow", "Num"))
test <- select(test, -c("Cabin", "Name", "Familynum", "Familyrow", "Num"))
train[["Deck"]] <- addNA(train[["Deck"]])
train[["Side"]] <- addNA(train[["Side"]])
test[["Deck"]] <- addNA(test[["Deck"]])
test[["Side"]] <- addNA(test[["Side"]])
# Count passengers per (Destination, HomePlanet) pair in train and keep, for
# each Destination, the pair with the highest count.
# NOTE(review): HomePlanet went through addNA() earlier, so is.na() is
# presumably FALSE for its explicit NA level and this filter removes nothing
# — confirm.
most_frequent_hp <- train %>%
  filter(!is.na(HomePlanet)) %>%
  group_by(Destination, HomePlanet) %>%
  summarize(count = n()) %>%
  arrange(Destination, desc(count)) %>%
  slice(1) %>%
  ungroup()
## `summarise()` has grouped output by 'Destination'. You can override using the
## `.groups` argument.
# Same table computed on test. This assignment replaces the train result
# from the statement before it, so only the test table is printed below.
most_frequent_hp <- test %>%
  filter(!is.na(HomePlanet)) %>%
  group_by(Destination, HomePlanet) %>%
  summarize(count = n()) %>%
  arrange(Destination, desc(count)) %>%
  slice(1) %>%
  ungroup()
## `summarise()` has grouped output by 'Destination'. You can override using the
## `.groups` argument.
most_frequent_hp
## # A tibble: 4 × 3
##   Destination   HomePlanet count
##   <fct>         <fct>      <int>
## 1 55 Cancri e   Europa       424
## 2 PSO J318.5-22 Earth        353
## 3 TRAPPIST-1e   Earth       1571
## 4 <NA>          Earth         45
# Convert the two factor columns back to character, so their explicit NA
# level becomes a real NA that is.na() can detect.
train$HomePlanet <- as.character(train$HomePlanet)
train$Destination <- as.character(train$Destination)
test$HomePlanet <- as.character(test$HomePlanet)
test$Destination <- as.character(test$Destination)
# Impute a missing HomePlanet: "Europa" for passengers bound for
# "55 Cancri e", "Earth" for everyone else.
# The destination is matched with its exact spelling (the previous lower-case
# "55 cancri e" never matched any row) and with %in%, which never returns NA;
# with `==`, a missing Destination made the condition NA and left HomePlanet
# missing.
# The recorded outputs further down were produced before this correction.
train <- train %>%
  mutate(HomePlanet = ifelse(
    is.na(HomePlanet) & Destination %in% "55 Cancri e", "Europa",
    ifelse(is.na(HomePlanet), "Earth", HomePlanet)
  ))
test <- test %>%
  mutate(HomePlanet = ifelse(
    is.na(HomePlanet) & Destination %in% "55 Cancri e", "Europa",
    ifelse(is.na(HomePlanet), "Earth", HomePlanet)
  ))
train %>% describe_all()
## # A tibble: 15 × 8
##    variable     type     na na_pct unique   min   mean   max
##    <chr>        <chr> <int>  <dbl>  <int> <dbl>  <dbl> <dbl>
##  1 PassengerId  chr       0    0     8693    NA  NA       NA
##  2 HomePlanet   chr       4    0        4    NA  NA       NA
##  3 CryoSleep    fct       0    0        3    NA  NA       NA
##  4 Deck         fct       0    0        9    NA  NA       NA
##  5 Side         fct       0    0        3    NA  NA       NA
##  6 Destination  chr     182    2.1      4    NA  NA       NA
##  7 Age          dbl       0    0       91     0  28.8     79
##  8 VIP          fct       0    0        3    NA  NA       NA
##  9 RoomService  dbl       0    0     1273     0 220.   14327
## 10 FoodCourt    dbl       0    0     1507     0 448.   29813
## 11 ShoppingMall dbl       0    0     1115     0 170.   23492
## 12 Spa          dbl       0    0     1327     0 305.   22408
## 13 VRDeck       dbl       0    0     1306     0 298.   24133
## 14 Transported  lgl       0    0        2     0   0.5      1
## 15 Family       dbl       0    0        2     0   0.28     1
test %>% describe_all()
## # A tibble: 14 × 8
##    variable     type     na na_pct unique   min   mean   max
##    <chr>        <chr> <int>  <dbl>  <int> <dbl>  <dbl> <dbl>
##  1 PassengerId  chr       0    0     4277    NA  NA       NA
##  2 HomePlanet   chr       2    0        4    NA  NA       NA
##  3 CryoSleep    fct       0    0        3    NA  NA       NA
##  4 Deck         fct       0    0        9    NA  NA       NA
##  5 Side         fct       0    0        3    NA  NA       NA
##  6 Destination  chr      92    2.2      4    NA  NA       NA
##  7 Age          dbl       0    0       91     0  28.7     79
##  8 VIP          fct       0    0        3    NA  NA       NA
##  9 RoomService  dbl       0    0      842     0 215.   11567
## 10 FoodCourt    dbl       0    0      902     0 429.   25273
## 11 ShoppingMall dbl       0    0      715     0 173.    8292
## 12 Spa          dbl       0    0      833     0 296.   19844
## 13 VRDeck       dbl       0    0      796     0 305.   22272
## 14 Family       dbl       0    0        2     0   0.28     1
# Any HomePlanet that is still missing is set to "Earth".
train <- transform(train, HomePlanet = coalesce(HomePlanet, "Earth"))
test <- transform(test, HomePlanet = coalesce(HomePlanet, "Earth"))
# For each HomePlanet, find the Destination that most of its passengers
# travel to (rows with a missing Destination are ignored).
# After summarize() the result is still grouped by HomePlanet, so slice(1)
# returns the first row of each HomePlanet. The rows therefore have to be
# ordered by descending count; ordering by Destination first, as before, made
# slice(1) return the alphabetically first Destination instead of the most
# frequent one.
# The recorded table below was produced before this correction.
most_frequent_destinations <- train %>%
  filter(!is.na(Destination)) %>%
  group_by(HomePlanet, Destination) %>%
  summarize(count = n()) %>%
  arrange(HomePlanet, desc(count)) %>%
  slice(1) %>%
  ungroup()
## `summarise()` has grouped output by 'HomePlanet'. You can override using the
## `.groups` argument.
most_frequent_destinations
## # A tibble: 3 × 3
##   HomePlanet Destination count
##   <chr>      <chr>       <int>
## 1 Earth      55 Cancri e   721
## 2 Europa     55 Cancri e   886
## 3 Mars       55 Cancri e   193
# Set every remaining missing Destination to "TRAPPIST-1e".
train<- transform (train, Destination = replace(Destination, is.na(Destination), "TRAPPIST-1e"))
test<- transform (test, Destination = replace(Destination, is.na(Destination), "TRAPPIST-1e"))
# Convert HomePlanet to a factor.
train$HomePlanet <- as.factor (train$HomePlanet)
# NOTE(review): this assigns to a new lower-case column `destination` and
# leaves the character column `Destination` in place — presumably a typo for
# `Destination`. Later code selects columns by position, so confirm those
# indices before renaming.
train $destination <- as.factor(train$Destination)
test$HomePlanet <- as.factor (test$HomePlanet)
# NOTE(review): same lower-case `destination` column as for train — confirm.
test$destination <- as.factor(test$Destination)
# Within each (HomePlanet, Destination) group, replace a missing Age by the
# group's mean Age. The result stays grouped by these two columns.
train <- train %>%
  group_by(HomePlanet, Destination) %>%
  mutate(Age = replace_na(Age, mean(Age, na.rm = TRUE)))
test <- test %>%
  group_by(HomePlanet, Destination) %>%
  mutate(Age = replace_na(Age, mean(Age, na.rm = TRUE)))
# expense is the sum of the five spending columns.
train$expense <- train$RoomService + train$FoodCourt + train$ShoppingMall + train$Spa + train$VRDeck
test$expense <- test$RoomService + test$FoodCourt + test$ShoppingMall + test$Spa + test$VRDeck
# Set a missing CryoSleep to "FALSE" for passengers older than 12 with a
# positive expense.
# CryoSleep is a factor whose missing values are an explicit NA level (from
# addNA); is.na() on such a factor is FALSE everywhere, so the previous
# condition never selected a row. Testing the character form finds them.
# The recorded summaries below were produced before this correction.
train <- transform(train, CryoSleep = replace(CryoSleep, is.na(as.character(CryoSleep)) & expense > 0 & Age > 12, "FALSE"))
test <- transform(test, CryoSleep = replace(CryoSleep, is.na(as.character(CryoSleep)) & expense > 0 & Age > 12, "FALSE"))
summary(train)
##  PassengerId         HomePlanet   CryoSleep         Deck      Side     
##  Length:8693        Earth :4803   FALSE:5439   F      :2794   P :4206  
##  Class :character   Europa:2131   TRUE :3037   G      :2559   S :4288  
##  Mode  :character   Mars  :1759   NA   : 217   E      : 876   NA: 199  
##                                                B      : 779            
##                                                C      : 747            
##                                                D      : 478            
##                                                (Other): 460            
##  Destination             Age           VIP        RoomService   
##  Length:8693        Min.   : 0.00   FALSE:8291   Min.   :    0  
##  Class :character   1st Qu.:20.00   TRUE : 199   1st Qu.:    0  
##  Mode  :character   Median :27.00   NA   : 203   Median :    0  
##                     Mean   :28.83                Mean   :  220  
##                     3rd Qu.:37.00                3rd Qu.:   41  
##                     Max.   :79.00                Max.   :14327  
##                                                                 
##    FoodCourt        ShoppingMall          Spa              VRDeck       
##  Min.   :    0.0   Min.   :    0.0   Min.   :    0.0   Min.   :    0.0  
##  1st Qu.:    0.0   1st Qu.:    0.0   1st Qu.:    0.0   1st Qu.:    0.0  
##  Median :    0.0   Median :    0.0   Median :    0.0   Median :    0.0  
##  Mean   :  448.4   Mean   :  169.6   Mean   :  304.6   Mean   :  298.3  
##  3rd Qu.:   61.0   3rd Qu.:   22.0   3rd Qu.:   53.0   3rd Qu.:   40.0  
##  Max.   :29813.0   Max.   :23492.0   Max.   :22408.0   Max.   :24133.0  
##                                                                         
##  Transported         Family              destination      expense     
##  Mode :logical   Min.   :0.0000   55 Cancri e  :1800   Min.   :    0  
##  FALSE:4315      1st Qu.:0.0000   PSO J318.5-22: 796   1st Qu.:    0  
##  TRUE :4378      Median :0.0000   TRAPPIST-1e  :6097   Median :  716  
##                  Mean   :0.2848                        Mean   : 1441  
##                  3rd Qu.:1.0000                        3rd Qu.: 1441  
##                  Max.   :1.0000                        Max.   :35987  
## 
summary(test)
##  PassengerId         HomePlanet   CryoSleep         Deck      Side     
##  Length:4277        Earth :2350   FALSE:2640   F      :1445   P :2084  
##  Class :character   Europa:1002   TRUE :1544   G      :1222   S :2093  
##  Mode  :character   Mars  : 925   NA   :  93   E      : 447   NA: 100  
##                                                B      : 362            
##                                                C      : 355            
##                                                D      : 242            
##                                                (Other): 204            
##  Destination             Age           VIP        RoomService     
##  Length:4277        Min.   : 0.00   FALSE:4110   Min.   :    0.0  
##  Class :character   1st Qu.:20.00   TRUE :  74   1st Qu.:    0.0  
##  Mode  :character   Median :26.22   NA   :  93   Median :    0.0  
##                     Mean   :28.66                Mean   :  215.1  
##                     3rd Qu.:37.00                3rd Qu.:   48.0  
##                     Max.   :79.00                Max.   :11567.0  
##                                                                   
##    FoodCourt        ShoppingMall         Spa              VRDeck       
##  Min.   :    0.0   Min.   :   0.0   Min.   :    0.0   Min.   :    0.0  
##  1st Qu.:    0.0   1st Qu.:   0.0   1st Qu.:    0.0   1st Qu.:    0.0  
##  Median :    0.0   Median :   0.0   Median :    0.0   Median :    0.0  
##  Mean   :  428.6   Mean   : 173.2   Mean   :  295.9   Mean   :  304.9  
##  3rd Qu.:   66.0   3rd Qu.:  27.0   3rd Qu.:   43.0   3rd Qu.:   31.0  
##  Max.   :25273.0   Max.   :8292.0   Max.   :19844.0   Max.   :22272.0  
##                                                                        
##      Family              destination      expense     
##  Min.   :0.0000   55 Cancri e  : 841   Min.   :    0  
##  1st Qu.:0.0000   PSO J318.5-22: 388   1st Qu.:    0  
##  Median :0.0000   TRAPPIST-1e  :3048   Median :  714  
##  Mean   :0.2838                        Mean   : 1418  
##  3rd Qu.:1.0000                        3rd Qu.: 1444  
##  Max.   :1.0000                        Max.   :33666  
## 
# Set a missing CryoSleep to "TRUE" for passengers older than 12 whose
# expense is exactly 0.
# CryoSleep is a factor whose missing values are an explicit NA level (from
# addNA); is.na() on such a factor is FALSE everywhere, so the previous
# condition never selected a row. Testing the character form finds them.
# The recorded outputs further down were produced before this correction.
train <- transform(train, CryoSleep = replace(CryoSleep, is.na(as.character(CryoSleep)) & expense == 0 & Age > 12, "TRUE"))
test <- transform(test, CryoSleep = replace(CryoSleep, is.na(as.character(CryoSleep)) & expense == 0 & Age > 12, "TRUE"))
# Make sure CryoSleep is stored as a factor in both data sets.
train$CryoSleep <- as.factor(train$CryoSleep)
test$CryoSleep <- as.factor(test$CryoSleep)
# Count passengers per (HomePlanet, Deck) pair in train and keep, for each
# HomePlanet, the Deck with the highest count.
# NOTE(review): Deck went through addNA() earlier, so is.na() is presumably
# FALSE for its explicit NA level and this filter removes nothing — confirm.
most_frequent_Deck <- train %>%
  filter(!is.na(Deck)) %>%
  group_by(HomePlanet,  Deck) %>%
  summarize (count = n()) %>%
  arrange(HomePlanet, desc(count)) %>%
  slice (1) %>%
  ungroup ()
## `summarise()` has grouped output by 'HomePlanet'. You can override using the
## `.groups` argument.
# Same table computed on test. This assignment replaces the train result
# from the statement before it, so only the test table is printed below.
most_frequent_Deck <- test %>%
  filter(!is.na(Deck)) %>%
  group_by(HomePlanet,  Deck) %>%
  summarize (count = n()) %>%
  arrange(HomePlanet, desc(count)) %>%
  slice (1) %>%
  ungroup ()
## `summarise()` has grouped output by 'HomePlanet'. You can override using the
## `.groups` argument.
most_frequent_Deck
## # A tibble: 3 × 3
##   HomePlanet Deck  count
##   <fct>      <fct> <int>
## 1 Earth      G      1222
## 2 Europa     B       358
## 3 Mars       F       603
# Work on plain character columns so a missing Deck is a real NA.
train$HomePlanet <- as.character(train$HomePlanet)
train$Deck <- as.character(train$Deck)
test$HomePlanet <- as.character(test$HomePlanet)
test$Deck <- as.character(test$Deck)
# A missing Deck is filled from the passenger's HomePlanet: Earth -> "G",
# Europa -> "B", Mars -> "F". Any other (or missing) HomePlanet leaves the
# Deck missing; a Deck that is already present is kept.
train <- train %>%
  mutate(Deck = coalesce(
    Deck,
    unname(c(Earth = "G", Europa = "B", Mars = "F")[HomePlanet])
  ))
test <- test %>%
  mutate(Deck = coalesce(
    Deck,
    unname(c(Earth = "G", Europa = "B", Mars = "F")[HomePlanet])
  ))
# Count passengers per (HomePlanet, Side) pair in train and keep, for each
# HomePlanet, the Side with the highest count.
# NOTE(review): Side went through addNA() earlier, so is.na() is presumably
# FALSE for its explicit NA level and this filter removes nothing — confirm.
most_frequent_Side <- train %>%
  filter(!is.na(Side)) %>%
  group_by(HomePlanet,  Side) %>%
  summarize (count = n()) %>%
  arrange(HomePlanet, desc(count)) %>%
  slice (1) %>%
  ungroup ()
## `summarise()` has grouped output by 'HomePlanet'. You can override using the
## `.groups` argument.
# Same table computed on test. This assignment replaces the train result
# from the statement before it, so only the test table is printed below.
most_frequent_Side <- test %>%
  filter(!is.na(Side)) %>%
  group_by(HomePlanet,  Side) %>%
  summarize (count = n()) %>%
  arrange(HomePlanet, desc(count)) %>%
  slice (1) %>%
  ungroup ()
## `summarise()` has grouped output by 'HomePlanet'. You can override using the
## `.groups` argument.
most_frequent_Side
## # A tibble: 3 × 3
##   HomePlanet Side  count
##   <chr>      <fct> <int>
## 1 Earth      P      1147
## 2 Europa     P       495
## 3 Mars       S       463
# Work on a plain character column so a missing Side is a real NA.
train$Side <- as.character(train$Side)
test$Side <- as.character(test$Side)
# A Side that is present is kept; a missing Side is filled from HomePlanet:
# Earth -> "P", Europa -> "S", Mars -> "P". Any other (or missing) HomePlanet
# leaves the Side missing.
# NOTE(review): the recorded most_frequent_Side table (computed on test)
# shows Europa -> P and Mars -> S, which differs from the values used here —
# confirm against the table computed on train.
train <- train %>%
  mutate(Side = case_when(
    !is.na(Side) ~ Side,
    HomePlanet == "Earth" ~ "P",
    HomePlanet == "Europa" ~ "S",
    HomePlanet == "Mars" ~ "P",
    TRUE ~ NA_character_
  ))
test <- test %>%
  mutate(Side = case_when(
    !is.na(Side) ~ Side,
    HomePlanet == "Earth" ~ "P",
    HomePlanet == "Europa" ~ "S",
    HomePlanet == "Mars" ~ "P",
    TRUE ~ NA_character_
  ))
train %>% describe_all()
## # A tibble: 17 × 8
##    variable     type     na na_pct unique   min    mean   max
##    <chr>        <chr> <int>  <dbl>  <int> <dbl>   <dbl> <dbl>
##  1 PassengerId  chr       0      0   8693    NA   NA       NA
##  2 HomePlanet   chr       0      0      3    NA   NA       NA
##  3 CryoSleep    fct       0      0      3    NA   NA       NA
##  4 Deck         chr       0      0      8    NA   NA       NA
##  5 Side         chr       0      0      2    NA   NA       NA
##  6 Destination  chr       0      0      3    NA   NA       NA
##  7 Age          dbl       0      0     91     0   28.8     79
##  8 VIP          fct       0      0      3    NA   NA       NA
##  9 RoomService  dbl       0      0   1273     0  220.   14327
## 10 FoodCourt    dbl       0      0   1507     0  448.   29813
## 11 ShoppingMall dbl       0      0   1115     0  170.   23492
## 12 Spa          dbl       0      0   1327     0  305.   22408
## 13 VRDeck       dbl       0      0   1306     0  298.   24133
## 14 Transported  lgl       0      0      2     0    0.5      1
## 15 Family       dbl       0      0      2     0    0.28     1
## 16 destination  fct       0      0      3    NA   NA       NA
## 17 expense      dbl       0      0   2336     0 1441.   35987
test %>% describe_all()
## # A tibble: 16 × 8
##    variable     type     na na_pct unique   min    mean   max
##    <chr>        <chr> <int>  <dbl>  <int> <dbl>   <dbl> <dbl>
##  1 PassengerId  chr       0      0   4277    NA   NA       NA
##  2 HomePlanet   chr       0      0      3    NA   NA       NA
##  3 CryoSleep    fct       0      0      3    NA   NA       NA
##  4 Deck         chr       0      0      8    NA   NA       NA
##  5 Side         chr       0      0      2    NA   NA       NA
##  6 Destination  chr       0      0      3    NA   NA       NA
##  7 Age          dbl       0      0     91     0   28.7     79
##  8 VIP          fct       0      0      3    NA   NA       NA
##  9 RoomService  dbl       0      0    842     0  215.   11567
## 10 FoodCourt    dbl       0      0    902     0  429.   25273
## 11 ShoppingMall dbl       0      0    715     0  173.    8292
## 12 Spa          dbl       0      0    833     0  296.   19844
## 13 VRDeck       dbl       0      0    796     0  305.   22272
## 14 Family       dbl       0      0      2     0    0.28     1
## 15 destination  fct       0      0      3    NA   NA       NA
## 16 expense      dbl       0      0   1437     0 1418.   33666
# Convert every character column to a factor.
train <- train %>% mutate(across(where(is.character), as.factor))
test <- test %>% mutate(across(where(is.character), as.factor))
# Export the cleaned data sets. The test data gets its own file: it was
# previously written to "train_c.csv" as well, overwriting the train export.
write.csv(train, "train_c.csv", row.names = FALSE)
write.csv(test, "test_c.csv", row.names = FALSE)
table(train$Transported) / length(train$Transported)
## 
##     FALSE      TRUE 
## 0.4963764 0.5036236
ggplot(train, aes(x = Age)) +
  geom_histogram(fill = "skyblue", color = "black")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Integer-coded copy of columns 2-16 of train and its correlation matrix.
D <- train[, 2:16] %>% mutate(across(everything(), ~ as.integer(.)))
kor <- cor(D)
# Modelling sets: every column except the first (PassengerId).
# The lower-case `destination` column is a factor copy of `Destination`;
# keeping both feeds the same predictor into the models twice, so it is
# dropped here by name. It sits after Transported, so the position of
# Transported within train_set (13) is unchanged.
train_set <- train[2:17] %>% select(-any_of("destination"))
test_set <- test[2:16] %>% select(-any_of("destination"))
library(caTools)
# Reproducible 75/25 split of train_set on the Transported column.
set.seed(123)
split <- sample.split(train_set$Transported, SplitRatio = 0.75)
training_set <- subset(train_set, split)
testing_set <- subset(train_set, !split)
# Logistic regression of Transported on all remaining columns.
logistic <- glm(Transported ~ ., family = binomial, data = training_set)
## Warning: glm.fit: des probabilités ont été ajustées numériquement à 0 ou 1
# Predicted probabilities for the held-out rows; column 13 (Transported) is
# left out of the new data.
prob_pred <- predict(logistic, newdata = testing_set[-13], type = "response")
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from rank-deficient fit; attr(*, "non-estim") has doubtful cases
# Classify with a 0.5 probability cut-off and code the true labels
# (column 13 of testing_set) as 0/1.
y_pred <- ifelse(prob_pred > 0.5, 1, 0)
y_true <- ifelse(testing_set[13] == TRUE, 1, 0)
# Confusion matrix: rows are the true labels, columns the predictions.
cm <- table(y_true, y_pred)
cm
##       y_pred
## y_true   0   1
##      0 821 258
##      1 189 905
# Accuracy = correctly classified rows (diagonal of cm) over all rows.
# It is computed from cm itself; the figures that were typed in by hand
# before did not match the confusion matrix shown above.
sum(diag(cm)) / sum(cm)
## [1] 0.7942936
# Refit the logistic regression on the complete training data and predict
# the test set.
logistic_son <- glm(formula = Transported ~ ., family = binomial, data = train_set)
## Warning: glm.fit: des probabilités ont été ajustées numériquement à 0 ou 1
prob_pred <- predict(logistic_son, type = "response", newdata = test_set)
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from rank-deficient fit; attr(*, "non-estim") has doubtful cases
y_pred <- ifelse(prob_pred > 0.5, TRUE, FALSE)
Transported <- as.vector(as.character(y_pred))
# PassengerId was converted to a factor together with the other character
# columns. cbind() discards the factor class and keeps only the integer
# codes, so the submission used to contain "1", "2", ... instead of the real
# IDs. Convert to character and build the data frame directly.
PassengerId <- as.character(test$PassengerId)
submission <- data.frame(PassengerId, Transported)
library(stringr)
write.csv(submission, "sub_logis.csv", row.names = FALSE, quote = FALSE)
# NOTE(review): "sub_nb.csv" receives the same logistic-regression
# predictions as "sub_logis.csv"; no other model is fitted in between —
# confirm whether a different model was intended here.
submission <- data.frame(PassengerId, Transported)
write.csv(submission, "sub_nb.csv", row.names = FALSE, quote = FALSE)
library(rpart)
library(rpart.plot)
library(randomForest)
library(caret)
# Store the target as a factor in all three modelling sets.
training_set[["Transported"]] <- as.factor(training_set[["Transported"]])
testing_set[["Transported"]] <- as.factor(testing_set[["Transported"]])
train_set[["Transported"]] <- as.factor(train_set[["Transported"]])
# Fit a tree for Transported on all other columns of the training split and
# print its full summary.
fit_tree <- rpart(formula = Transported ~ ., data = training_set)
summary(fit_tree)
## Call:
## rpart(formula = Transported ~ ., data = training_set)
##   n= 6520 
## 
##           CP nsplit rel error    xerror       xstd
## 1 0.47126082      0 1.0000000 1.0000000 0.01247595
## 2 0.02487639      1 0.5287392 0.5287392 0.01097792
## 3 0.01205192      3 0.4789864 0.4808405 0.01063625
## 4 0.01127936      4 0.4669345 0.4749691 0.01059132
## 5 0.01112485      6 0.4443758 0.4694067 0.01054812
## 6 0.01000000      7 0.4332509 0.4629172 0.01049692
## 
## Variable importance
##      expense    CryoSleep          Spa    FoodCourt       VRDeck  RoomService 
##           25           20           14           14           12           11 
## ShoppingMall         Deck   HomePlanet 
##            3            1            1 
## 
## Node number 1: 6520 observations,    complexity param=0.4712608
##   predicted class=TRUE   expected loss=0.496319  P(node) =1
##     class counts:  3236  3284
##    probabilities: 0.496 0.504 
##   left son=2 (3795 obs) right son=3 (2725 obs)
##   Primary splits:
##       expense     < 0.5     to the right, improve=760.2351, (0 missing)
##       CryoSleep   splits as  LRL,         improve=683.0583, (0 missing)
##       RoomService < 0.5     to the right, improve=408.8215, (0 missing)
##       Spa         < 0.5     to the right, improve=372.3306, (0 missing)
##       VRDeck      < 0.5     to the right, improve=353.3889, (0 missing)
##   Surrogate splits:
##       CryoSleep   splits as  LRL,         agree=0.932, adj=0.837, (0 split)
##       Spa         < 0.5     to the right, agree=0.784, adj=0.484, (0 split)
##       FoodCourt   < 0.5     to the right, agree=0.770, adj=0.451, (0 split)
##       VRDeck      < 0.5     to the right, agree=0.768, adj=0.444, (0 split)
##       RoomService < 0.5     to the right, agree=0.757, adj=0.419, (0 split)
## 
## Node number 2: 3795 observations,    complexity param=0.02487639
##   predicted class=FALSE  expected loss=0.2990777  P(node) =0.5820552
##     class counts:  2660  1135
##    probabilities: 0.701 0.299 
##   left son=4 (3243 obs) right son=5 (552 obs)
##   Primary splits:
##       FoodCourt    < 1331    to the left,  improve=91.50674, (0 missing)
##       ShoppingMall < 627.5   to the left,  improve=72.08933, (0 missing)
##       RoomService  < 365.5   to the right, improve=70.65071, (0 missing)
##       Spa          < 257.5   to the right, improve=53.26111, (0 missing)
##       VRDeck       < 721     to the right, improve=36.44331, (0 missing)
##   Surrogate splits:
##       expense    < 5981    to the left,  agree=0.885, adj=0.210, (0 split)
##       Deck       splits as  RRRLLLLL,    agree=0.884, adj=0.199, (0 split)
##       HomePlanet splits as  LRL,         agree=0.878, adj=0.159, (0 split)
##       Spa        < 8955.5  to the left,  agree=0.856, adj=0.009, (0 split)
##       VRDeck     < 11692   to the left,  agree=0.856, adj=0.009, (0 split)
## 
## Node number 3: 2725 observations
##   predicted class=TRUE   expected loss=0.2113761  P(node) =0.4179448
##     class counts:   576  2149
##    probabilities: 0.211 0.789 
## 
## Node number 4: 3243 observations,    complexity param=0.01127936
##   predicted class=FALSE  expected loss=0.2537774  P(node) =0.4973926
##     class counts:  2420   823
##    probabilities: 0.746 0.254 
##   left son=8 (2577 obs) right son=9 (666 obs)
##   Primary splits:
##       ShoppingMall < 541.5   to the left,  improve=90.77444, (0 missing)
##       RoomService  < 365.5   to the right, improve=42.55270, (0 missing)
##       Spa          < 240.5   to the right, improve=40.86813, (0 missing)
##       VRDeck       < 114     to the right, improve=34.19891, (0 missing)
##       expense      < 2867.5  to the right, improve=22.24986, (0 missing)
##   Surrogate splits:
##       expense < 18644   to the left,  agree=0.795, adj=0.003, (0 split)
## 
## Node number 5: 552 observations,    complexity param=0.02487639
##   predicted class=TRUE   expected loss=0.4347826  P(node) =0.08466258
##     class counts:   240   312
##    probabilities: 0.435 0.565 
##   left son=10 (123 obs) right son=11 (429 obs)
##   Primary splits:
##       Spa       < 1372.5  to the right, improve=57.714490, (0 missing)
##       VRDeck    < 1063.5  to the right, improve=46.364550, (0 missing)
##       expense   < 5395    to the right, improve=17.936380, (0 missing)
##       FoodCourt < 2513    to the left,  improve= 7.542383, (0 missing)
##       Deck      splits as  LLRLLRRL,    improve= 7.251224, (0 missing)
##   Surrogate splits:
##       expense     < 12647   to the right, agree=0.790, adj=0.057, (0 split)
##       Age         < 13.5    to the left,  agree=0.779, adj=0.008, (0 split)
##       RoomService < 3895.5  to the right, agree=0.779, adj=0.008, (0 split)
## 
## Node number 8: 2577 observations
##   predicted class=FALSE  expected loss=0.193636  P(node) =0.3952454
##     class counts:  2078   499
##    probabilities: 0.806 0.194 
## 
## Node number 9: 666 observations,    complexity param=0.01127936
##   predicted class=FALSE  expected loss=0.4864865  P(node) =0.1021472
##     class counts:   342   324
##    probabilities: 0.514 0.486 
##   left son=18 (157 obs) right son=19 (509 obs)
##   Primary splits:
##       RoomService  < 310     to the right, improve=31.364140, (0 missing)
##       Spa          < 200     to the right, improve=19.676920, (0 missing)
##       ShoppingMall < 1586.5  to the left,  improve=13.953460, (0 missing)
##       VRDeck       < 120.5   to the right, improve=13.338010, (0 missing)
##       HomePlanet   splits as  RLL,         improve= 6.864426, (0 missing)
##   Surrogate splits:
##       FoodCourt < 1102    to the right, agree=0.766, adj=0.006, (0 split)
## 
## Node number 10: 123 observations
##   predicted class=FALSE  expected loss=0.1382114  P(node) =0.01886503
##     class counts:   106    17
##    probabilities: 0.862 0.138 
## 
## Node number 11: 429 observations,    complexity param=0.01205192
##   predicted class=TRUE   expected loss=0.3123543  P(node) =0.06579755
##     class counts:   134   295
##    probabilities: 0.312 0.688 
##   left son=22 (143 obs) right son=23 (286 obs)
##   Primary splits:
##       VRDeck      < 611     to the right, improve=45.037300, (0 missing)
##       Spa         < 225     to the right, improve= 9.768013, (0 missing)
##       FoodCourt   < 3119.5  to the left,  improve= 9.524139, (0 missing)
##       RoomService < 1719.5  to the right, improve= 7.244213, (0 missing)
##       Side        splits as  LR,          improve= 6.625210, (0 missing)
##   Surrogate splits:
##       expense   < 6032    to the right, agree=0.702, adj=0.105, (0 split)
##       Age       < 53.5    to the right, agree=0.674, adj=0.021, (0 split)
##       FoodCourt < 12128.5 to the right, agree=0.671, adj=0.014, (0 split)
## 
## Node number 18: 157 observations
##   predicted class=FALSE  expected loss=0.2101911  P(node) =0.02407975
##     class counts:   124    33
##    probabilities: 0.790 0.210 
## 
## Node number 19: 509 observations,    complexity param=0.01112485
##   predicted class=TRUE   expected loss=0.4282908  P(node) =0.07806748
##     class counts:   218   291
##    probabilities: 0.428 0.572 
##   left son=38 (66 obs) right son=39 (443 obs)
##   Primary splits:
##       Spa          < 209     to the right, improve=17.99311, (0 missing)
##       VRDeck       < 33.5    to the right, improve=17.76420, (0 missing)
##       ShoppingMall < 1540.5  to the left,  improve=10.97812, (0 missing)
##       Deck         splits as  LRLRLRR-,    improve= 7.32794, (0 missing)
##       expense      < 4099.5  to the right, improve= 5.09474, (0 missing)
##   Surrogate splits:
##       expense      < 4161    to the right, agree=0.898, adj=0.212, (0 split)
##       Deck         splits as  LLLRRRR-,    agree=0.896, adj=0.197, (0 split)
##       HomePlanet   splits as  RLR,         agree=0.884, adj=0.106, (0 split)
##       ShoppingMall < 7126    to the right, agree=0.876, adj=0.045, (0 split)
##       VRDeck       < 1206.5  to the right, agree=0.876, adj=0.045, (0 split)
## 
## Node number 22: 143 observations
##   predicted class=FALSE  expected loss=0.3636364  P(node) =0.02193252
##     class counts:    91    52
##    probabilities: 0.636 0.364 
## 
## Node number 23: 286 observations
##   predicted class=TRUE   expected loss=0.1503497  P(node) =0.04386503
##     class counts:    43   243
##    probabilities: 0.150 0.850 
## 
## Node number 38: 66 observations
##   predicted class=FALSE  expected loss=0.2272727  P(node) =0.0101227
##     class counts:    51    15
##    probabilities: 0.773 0.227 
## 
## Node number 39: 443 observations
##   predicted class=TRUE   expected loss=0.3769752  P(node) =0.06794479
##     class counts:   167   276
##    probabilities: 0.377 0.623
# Draw the fitted classification tree.
rpart.plot(fit_tree)

# Predict class labels for the hold-out rows. Column 13 is dropped from
# newdata -- presumably the response column; TODO confirm against testing_set.
preds <- predict(fit_tree, newdata = testing_set[-13], type = "class")

# Recode the predicted labels as 1 (predicted "TRUE") / 0 (otherwise).
# The comparison is made against the label "TRUE" explicitly instead of
# relying on the logical TRUE being coerced to a string.
y_pred <- as.numeric(preds == "TRUE")

# Confusion matrix: rows are the observed labels, columns the predictions.
cm <- table(y_true, y_pred)
cm
##       y_pred
## y_true   0   1
##      0 798 281
##      1 196 898
# Accuracy = correctly classified / all classified, computed from the
# confusion matrix itself rather than from hand-copied counts, so it stays
# correct when the model or the train/test split changes.
# Assumes cm is square with matching row/column order (both classes were
# predicted at least once), as in the table printed above.
sum(diag(cm)) / sum(cm)
## [1] 0.7804878