R Markdown

This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.

When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:

Loading Packages

library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✔ ggplot2 3.3.6     ✔ purrr   0.3.4
## ✔ tibble  3.1.6     ✔ dplyr   1.0.9
## ✔ tidyr   1.2.0     ✔ stringr 1.4.0
## ✔ readr   2.1.2     ✔ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
library(ggplot2)
library(rpart)
library(rsample)
## Warning: package 'rsample' was built under R version 4.2.1
library(Metrics)
## Warning: package 'Metrics' was built under R version 4.2.1
library(ranger)
## Warning: package 'ranger' was built under R version 4.2.1
library(mice)
## Warning: package 'mice' was built under R version 4.2.1
## 
## Attaching package: 'mice'
## The following object is masked from 'package:stats':
## 
##     filter
## The following objects are masked from 'package:base':
## 
##     cbind, rbind
library(e1071)
## Warning: package 'e1071' was built under R version 4.2.1
## 
## Attaching package: 'e1071'
## The following object is masked from 'package:rsample':
## 
##     permutations
library(xgboost)
## Warning: package 'xgboost' was built under R version 4.2.1
## 
## Attaching package: 'xgboost'
## The following object is masked from 'package:dplyr':
## 
##     slice
# Silence all warnings for the rest of the session (a negative `warn` value
# makes R ignore them).
# NOTE(review): this also hides genuine warnings raised by every later step;
# consider removing it or suppressing only the specific calls that need it.
options(warn=-1)

Data Importing

# Load the training and test sets. Empty fields and the literal string "NA"
# are both read as missing; the same rule is applied to both files so that
# they are parsed consistently.
Train <- read.csv(
  "train.csv",
  na.strings = c("", "NA"), sep = ",", header = TRUE
)
Test <- read.csv(
  "test.csv",
  na.strings = c("", "NA"), sep = ",", header = TRUE
)
# Preview the first rows of the training data.
head(Train)
##   PassengerId HomePlanet CryoSleep Cabin   Destination Age   VIP RoomService
## 1     0001_01     Europa     False B/0/P   TRAPPIST-1e  39 False           0
## 2     0002_01      Earth     False F/0/S   TRAPPIST-1e  24 False         109
## 3     0003_01     Europa     False A/0/S   TRAPPIST-1e  58  True          43
## 4     0003_02     Europa     False A/0/S   TRAPPIST-1e  33 False           0
## 5     0004_01      Earth     False F/1/S   TRAPPIST-1e  16 False         303
## 6     0005_01      Earth     False F/0/P PSO J318.5-22  44 False           0
##   FoodCourt ShoppingMall  Spa VRDeck              Name Transported
## 1         0            0    0      0   Maham Ofracculy       False
## 2         9           25  549     44      Juanna Vines        True
## 3      3576            0 6715     49     Altark Susent       False
## 4      1283          371 3329    193      Solam Susent       False
## 5        70          151  565      2 Willy Santantines        True
## 6       483            0  291      0 Sandie Hinetthews        True
# Per-column summary of the training data (includes NA counts per column).
Train %>% summary()
##  PassengerId         HomePlanet         CryoSleep            Cabin          
##  Length:8693        Length:8693        Length:8693        Length:8693       
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##                                                                             
##  Destination             Age            VIP             RoomService     
##  Length:8693        Min.   : 0.00   Length:8693        Min.   :    0.0  
##  Class :character   1st Qu.:19.00   Class :character   1st Qu.:    0.0  
##  Mode  :character   Median :27.00   Mode  :character   Median :    0.0  
##                     Mean   :28.83                      Mean   :  224.7  
##                     3rd Qu.:38.00                      3rd Qu.:   47.0  
##                     Max.   :79.00                      Max.   :14327.0  
##                     NA's   :179                        NA's   :181      
##    FoodCourt        ShoppingMall          Spa              VRDeck       
##  Min.   :    0.0   Min.   :    0.0   Min.   :    0.0   Min.   :    0.0  
##  1st Qu.:    0.0   1st Qu.:    0.0   1st Qu.:    0.0   1st Qu.:    0.0  
##  Median :    0.0   Median :    0.0   Median :    0.0   Median :    0.0  
##  Mean   :  458.1   Mean   :  173.7   Mean   :  311.1   Mean   :  304.9  
##  3rd Qu.:   76.0   3rd Qu.:   27.0   3rd Qu.:   59.0   3rd Qu.:   46.0  
##  Max.   :29813.0   Max.   :23492.0   Max.   :22408.0   Max.   :24133.0  
##  NA's   :183       NA's   :208       NA's   :183       NA's   :188      
##      Name           Transported       
##  Length:8693        Length:8693       
##  Class :character   Class :character  
##  Mode  :character   Mode  :character  
##                                       
##                                       
##                                       
## 

Pre-processing & Cleaning

The Cabin variable encodes three pieces of information (deck, number, and side), so we split it into three separate columns

# Split Cabin (formatted as "deck/num/side") into its three components. The
# "/" delimiter is given explicitly instead of relying on separate()'s default
# of splitting on any run of non-alphanumeric characters.
Train <- separate(Train, Cabin, into = c("deck", "num", "side"), sep = "/")

Converting each column to the appropriate numeric or categorical (factor) data type

# The cabin number becomes numeric; every remaining character column becomes a
# factor, and all other columns are left as they are.
Train[["num"]] <- as.numeric(Train[["num"]])
Train[] <- lapply(Train, function(column) {
  if (is.character(column)) factor(column) else column
})

Recoding the categorical variables, dropping the variables that are not useful, and imputing the missing values (NA)

# Recode the True/False flags as 0/1 factors, drop the identifier columns, and
# impute the remaining missing values with mice's random-forest method.
# A fixed seed makes the otherwise random imputation reproducible; complete()
# with its default arguments returns the first of the imputed data sets.
Train <- Train %>%
  mutate(
    CryoSleep = factor(case_when(
      CryoSleep == "False" ~ 0,
      CryoSleep == "True" ~ 1
    )),
    Transported = factor(case_when(
      Transported == "False" ~ 0,
      Transported == "True" ~ 1
    )),
    VIP = factor(case_when(
      VIP == "False" ~ 0,
      VIP == "True" ~ 1
    ))
  ) %>%
  select(-c(PassengerId, Name)) %>%
  mice(method = "rf", seed = 123) %>%
  complete()
## 
##  iter imp variable
##   1   1  HomePlanet  CryoSleep  deck  num  side  Destination  Age  VIP  RoomService  FoodCourt  ShoppingMall  Spa  VRDeck
##   1   2  HomePlanet  CryoSleep  deck  num  side  Destination  Age  VIP  RoomService  FoodCourt  ShoppingMall  Spa  VRDeck
##   1   3  HomePlanet  CryoSleep  deck  num  side  Destination  Age  VIP  RoomService  FoodCourt  ShoppingMall  Spa  VRDeck
##   1   4  HomePlanet  CryoSleep  deck  num  side  Destination  Age  VIP  RoomService  FoodCourt  ShoppingMall  Spa  VRDeck
##   1   5  HomePlanet  CryoSleep  deck  num  side  Destination  Age  VIP  RoomService  FoodCourt  ShoppingMall  Spa  VRDeck
##   2   1  HomePlanet  CryoSleep  deck  num  side  Destination  Age  VIP  RoomService  FoodCourt  ShoppingMall  Spa  VRDeck
##   2   2  HomePlanet  CryoSleep  deck  num  side  Destination  Age  VIP  RoomService  FoodCourt  ShoppingMall  Spa  VRDeck
##   2   3  HomePlanet  CryoSleep  deck  num  side  Destination  Age  VIP  RoomService  FoodCourt  ShoppingMall  Spa  VRDeck
##   2   4  HomePlanet  CryoSleep  deck  num  side  Destination  Age  VIP  RoomService  FoodCourt  ShoppingMall  Spa  VRDeck
##   2   5  HomePlanet  CryoSleep  deck  num  side  Destination  Age  VIP  RoomService  FoodCourt  ShoppingMall  Spa  VRDeck
##   3   1  HomePlanet  CryoSleep  deck  num  side  Destination  Age  VIP  RoomService  FoodCourt  ShoppingMall  Spa  VRDeck
##   3   2  HomePlanet  CryoSleep  deck  num  side  Destination  Age  VIP  RoomService  FoodCourt  ShoppingMall  Spa  VRDeck
##   3   3  HomePlanet  CryoSleep  deck  num  side  Destination  Age  VIP  RoomService  FoodCourt  ShoppingMall  Spa  VRDeck
##   3   4  HomePlanet  CryoSleep  deck  num  side  Destination  Age  VIP  RoomService  FoodCourt  ShoppingMall  Spa  VRDeck
##   3   5  HomePlanet  CryoSleep  deck  num  side  Destination  Age  VIP  RoomService  FoodCourt  ShoppingMall  Spa  VRDeck
##   4   1  HomePlanet  CryoSleep  deck  num  side  Destination  Age  VIP  RoomService  FoodCourt  ShoppingMall  Spa  VRDeck
##   4   2  HomePlanet  CryoSleep  deck  num  side  Destination  Age  VIP  RoomService  FoodCourt  ShoppingMall  Spa  VRDeck
##   4   3  HomePlanet  CryoSleep  deck  num  side  Destination  Age  VIP  RoomService  FoodCourt  ShoppingMall  Spa  VRDeck
##   4   4  HomePlanet  CryoSleep  deck  num  side  Destination  Age  VIP  RoomService  FoodCourt  ShoppingMall  Spa  VRDeck
##   4   5  HomePlanet  CryoSleep  deck  num  side  Destination  Age  VIP  RoomService  FoodCourt  ShoppingMall  Spa  VRDeck
##   5   1  HomePlanet  CryoSleep  deck  num  side  Destination  Age  VIP  RoomService  FoodCourt  ShoppingMall  Spa  VRDeck
##   5   2  HomePlanet  CryoSleep  deck  num  side  Destination  Age  VIP  RoomService  FoodCourt  ShoppingMall  Spa  VRDeck
##   5   3  HomePlanet  CryoSleep  deck  num  side  Destination  Age  VIP  RoomService  FoodCourt  ShoppingMall  Spa  VRDeck
##   5   4  HomePlanet  CryoSleep  deck  num  side  Destination  Age  VIP  RoomService  FoodCourt  ShoppingMall  Spa  VRDeck
##   5   5  HomePlanet  CryoSleep  deck  num  side  Destination  Age  VIP  RoomService  FoodCourt  ShoppingMall  Spa  VRDeck
# Total number of missing cells left after imputation.
Train %>% is.na() %>% sum()
## [1] 0
# Structure (column types and factor levels) of the cleaned data.
Train %>% str()
## 'data.frame':    8693 obs. of  14 variables:
##  $ HomePlanet  : Factor w/ 3 levels "Earth","Europa",..: 2 1 2 2 1 1 1 1 1 2 ...
##  $ CryoSleep   : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 2 1 2 ...
##  $ deck        : Factor w/ 8 levels "A","B","C","D",..: 2 6 1 1 6 6 6 7 6 2 ...
##  $ num         : num  0 0 0 0 1 0 2 0 3 1 ...
##  $ side        : Factor w/ 2 levels "P","S": 1 2 2 2 2 1 2 2 2 1 ...
##  $ Destination : Factor w/ 3 levels "55 Cancri e",..: 3 3 3 3 3 2 3 3 3 1 ...
##  $ Age         : num  39 24 58 33 16 44 26 28 35 14 ...
##  $ VIP         : Factor w/ 2 levels "0","1": 1 1 2 1 1 1 1 1 1 1 ...
##  $ RoomService : num  0 109 43 0 303 0 42 0 0 0 ...
##  $ FoodCourt   : num  0 9 3576 1283 70 ...
##  $ ShoppingMall: num  0 25 0 371 151 0 3 0 17 0 ...
##  $ Spa         : num  0 549 6715 3329 565 ...
##  $ VRDeck      : num  0 44 49 193 2 0 0 0 0 0 ...
##  $ Transported : Factor w/ 2 levels "0","1": 1 2 1 1 2 2 2 2 2 2 ...

Data Exploration

# Age distribution by Transported status, coloured by VIP, restricted to
# passengers who were not in cryosleep. Boxes are notched and outliers are
# drawn as red open circles (shape 1).
TrainnoSleep <- Train %>% filter(CryoSleep == 0)
ggplot(TrainnoSleep, aes(x = Age, y = Transported, colour = VIP)) +
  geom_boxplot(
    notch = TRUE,
    outlier.size = 1,
    outlier.shape = 1,
    outlier.colour = "red"
  ) +
  coord_flip()