Untitled.knit

Amr Atrash

# packages ===================================================================
library(dplyr)

## Warning: package 'dplyr' was built under R version 4.2.3

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(tibble)

## Warning: package 'tibble' was built under R version 4.2.3

library(rpart)
library(rpart.plot)

## Warning: package 'rpart.plot' was built under R version 4.2.3

# data =======================================================================
# Read the train/test CSV files from the working directory and wrap each
# resulting data frame in a tibble.
train <- tibble::tibble(read.csv("train.csv"))
test <- tibble::tibble(read.csv("test.csv"))

# exploratory analysis =======================================================
# List the column names available in the training table.
names(train)

##  [1] "PassengerId"  "HomePlanet"   "CryoSleep"    "Cabin"        "Destination" 
##  [6] "Age"          "VIP"          "RoomService"  "FoodCourt"    "ShoppingMall"
## [11] "Spa"          "VRDeck"       "Name"         "Transported"

# Cross-tabulate HomePlanet against the outcome: one row per HomePlanet value,
# one count column per Transported value.
train %>%
  dplyr::count(Transported, HomePlanet) %>%
  tidyr::pivot_wider(
    id_cols = HomePlanet,
    names_from = Transported,
    values_from = n
  )

## # A tibble: 4 × 3
##   HomePlanet False  True
##   <chr>      <int> <int>
## 1 ""            98   103
## 2 "Earth"     2651  1951
## 3 "Europa"     727  1404
## 4 "Mars"       839   920

# Passengers from Europa were transported at a notably higher rate than the
# other planets (1404 of 2131, about 66%).

# Cross-tabulate CryoSleep against the outcome: one row per CryoSleep value,
# one count column per Transported value.
train %>%
  dplyr::count(Transported, CryoSleep) %>%
  tidyr::pivot_wider(
    id_cols = CryoSleep,
    names_from = Transported,
    values_from = n
  )

## # A tibble: 3 × 3
##   CryoSleep False  True
##   <chr>     <int> <int>
## 1 ""          111   106
## 2 "False"    3650  1789
## 3 "True"      554  2483

# Passengers in CryoSleep were far more likely to be transported
# (2483 of 3037, about 82%).

# Same CryoSleep-vs-outcome table, restricted to passengers whose HomePlanet
# is "Europa".
train %>%
  dplyr::filter(HomePlanet == "Europa") %>%
  dplyr::count(Transported, CryoSleep) %>%
  tidyr::pivot_wider(
    id_cols = CryoSleep,
    names_from = Transported,
    values_from = n
  )

## # A tibble: 3 × 3
##   CryoSleep False  True
##   <chr>     <int> <int>
## 1 ""           20    38
## 2 "False"     697   465
## 3 "True"       10   901

# Split Cabin into three pieces with regex removals, then cross-tabulate the
# last piece (`side`) against the outcome.
# NOTE(review): the regexes presume Cabin looks like "<deck>/<num>/<side>" -
# confirm against the raw data.
train %>%
  dplyr::select(Transported, Cabin) %>%
  dplyr::mutate(
    # everything before the first "/"
    deck = stringr::str_remove(Cabin, "\\/.*"),
    # strip the trailing "/<one char>", then everything up to the last "/"
    num = stringr::str_remove(stringr::str_remove(Cabin, "\\/.$"), ".*\\/"),
    # everything after the last "/"
    side = stringr::str_remove(Cabin, ".*/\\.*")
  ) %>%
  dplyr::count(Transported, side) %>%
  tidyr::pivot_wider(
    id_cols = side,
    names_from = Transported,
    values_from = n
  )

## # A tibble: 3 × 3
##   side  False  True
##   <chr> <int> <int>
## 1 ""       99   100
## 2 "P"    2308  1898
## 3 "S"    1908  2380

# no conclusions

# Cross-tabulate Destination against the outcome: one row per Destination,
# one count column per Transported value.
train %>%
  dplyr::count(Transported, Destination) %>%
  tidyr::pivot_wider(
    id_cols = Destination,
    names_from = Transported,
    values_from = n
  )

## # A tibble: 4 × 3
##   Destination     False  True
##   <chr>           <int> <int>
## 1 ""                 90    92
## 2 "55 Cancri e"     702  1098
## 3 "PSO J318.5-22"   395   401
## 4 "TRAPPIST-1e"    3128  2787

# no conclusions

# Cross-tabulate Age against the outcome: one row per distinct Age value,
# one count column per Transported value.
train %>%
  dplyr::count(Transported, Age) %>%
  tidyr::pivot_wider(
    id_cols = Age,
    names_from = Transported,
    values_from = n
  )

## # A tibble: 81 × 3
##      Age False  True
##    <dbl> <int> <int>
##  1     0    34   144
##  2     1    18    49
##  3     2    22    53
##  4     3    16    59
##  5     4    18    53
##  6     5    13    20
##  7     6    17    23
##  8     7    20    32
##  9     8    20    26
## 10     9    18    24
## # ℹ 71 more rows

# no conclusions

# Cross-tabulate VIP status against the outcome: one row per VIP value,
# one count column per Transported value.
train %>%
  dplyr::count(Transported, VIP) %>%
  tidyr::pivot_wider(
    id_cols = VIP,
    names_from = Transported,
    values_from = n
  )

## # A tibble: 3 × 3
##   VIP     False  True
##   <chr>   <int> <int>
## 1 ""         99   104
## 2 "False"  4093  4198
## 3 "True"    123    76

# no conclusions

# Cross-tabulate VRDeck spending against the outcome, largest amounts first.
# na.omit() removes every row that has an NA in any column, so an amount
# observed for only one Transported value (NA count in the other column) and
# a missing VRDeck amount are both dropped from the result.
train %>%
  dplyr::count(Transported, VRDeck) %>%
  tidyr::pivot_wider(
    id_cols = VRDeck,
    names_from = Transported,
    values_from = n
  ) %>%
  dplyr::arrange(dplyr::desc(VRDeck)) %>%
  na.omit()

## # A tibble: 296 × 3
##    VRDeck False  True
##     <dbl> <int> <int>
##  1   2577     1     1
##  2   2376     1     1
##  3   2260     1     1
##  4   1804     1     1
##  5   1674     2     1
##  6   1539     1     1
##  7   1514     1     1
##  8   1479     1     1
##  9   1460     1     1
## 10   1384     1     1
## # ℹ 286 more rows

# using luxury amenities decreases chances of being transported

# Count passengers per (last name, home planet) split by outcome, then list
# the Europa rows ordered by how many passengers were transported.
#
# Fix: `last_name` used to take token 1 of `Name`, i.e. it duplicated
# `first_name`, so the table was really grouped by first name. It now takes
# the text after the first space. Splitting is vectorised with
# str_split_fixed(), which pads missing pieces with "" so blank names still
# form their own "" group.
# NOTE(review): assumes `Name` is "<first> <last>" - confirm against the data.
# Any table printed by an earlier knit of this chunk predates the fix and
# must be regenerated.
train %>%
  dplyr::select(Transported, Name, HomePlanet) %>%
  dplyr::mutate(
    first_name = stringr::str_split_fixed(Name, " ", n = 2)[, 1],
    last_name = stringr::str_split_fixed(Name, " ", n = 2)[, 2]
  ) %>%
  dplyr::count(Transported, last_name, HomePlanet) %>%
  tidyr::pivot_wider(
    id_cols = c("last_name", "HomePlanet"),
    names_from = "Transported",
    values_from = "n"
  ) %>%
  dplyr::arrange(HomePlanet, dplyr::desc(True)) %>%
  dplyr::filter(HomePlanet == "Europa")

## # A tibble: 890 × 4
##    last_name HomePlanet False  True
##    <chr>     <chr>      <int> <int>
##  1 ""        Europa        17    29
##  2 "Minoton" Europa         1     7
##  3 "Betenar" Europa        NA     7
##  4 "Dyonon"  Europa        NA     7
##  5 "Okulas"  Europa        NA     6
##  6 "Terope"  Europa        NA     6
##  7 "Dabik"   Europa         2     5
##  8 "Mothab"  Europa         2     5
##  9 "Sinon"   Europa         1     5
## 10 "Acruxon" Europa        NA     5
## # ℹ 880 more rows

# no conclusions


# decision tree ==============================================================
# Build the modelling table from `train`:
#   - Transported becomes a factor whose first level is "True"
#   - Cabin is split by regex into deck, num (converted to numeric) and side
#   - luxury is the plain `+` sum of the five spending columns, so a missing
#     value in any of them makes luxury NA for that row
train_ok <- dplyr::mutate(
  train,
  Transported = factor(Transported, levels = c("True", "False")),
  deck = stringr::str_remove(Cabin, "\\/.*"),
  num = as.numeric(
    stringr::str_remove(stringr::str_remove(Cabin, "\\/.$"), ".*\\/")
  ),
  side = stringr::str_remove(Cabin, ".*/\\.*"),
  luxury = RoomService + FoodCourt + ShoppingMall + Spa + VRDeck
)
# Disabled step, kept for reference: dplyr::filter(CryoSleep != "")
# View(train_ok)
# Fit a classification tree for Transported using only the columns named in
# `vars` (the response plus CryoSleep and Age), then draw the fitted tree.
vars <- c("Transported", "CryoSleep", "Age")
m <- rpart::rpart(
  formula = Transported ~ .,
  data = dplyr::select(train_ok, dplyr::all_of(vars))
)
rpart.plot::rpart.plot(m)

# Predict classes for the same rows the tree was fitted on and compare them
# with the observed outcome, treating "True" as the positive class.
p <- predict(m, newdata = train_ok, type = "class")
caret::confusionMatrix(
  data = p,
  reference = train_ok[["Transported"]],
  positive = "True"
)

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction True False
##      True  2774   692
##      False 1604  3623
##                                           
##                Accuracy : 0.7359          
##                  95% CI : (0.7265, 0.7451)
##     No Information Rate : 0.5036          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.4725          
##                                           
##  Mcnemar's Test P-Value : < 2.2e-16       
##                                           
##             Sensitivity : 0.6336          
##             Specificity : 0.8396          
##          Pos Pred Value : 0.8003          
##          Neg Pred Value : 0.6931          
##              Prevalence : 0.5036          
##          Detection Rate : 0.3191          
##    Detection Prevalence : 0.3987          
##       Balanced Accuracy : 0.7366          
##                                           
##        'Positive' Class : True            
##

# Score the test set: take the per-class probabilities from the tree, attach
# PassengerId, and label a row "True" only when the "True" probability is
# strictly greater than the "False" probability. The result is printed, not
# stored or written to disk.
dplyr::transmute(
  dplyr::bind_cols(
    tibble::tibble(as.data.frame(predict(m, newdata = test, type = "prob"))),
    test[, "PassengerId"]
  ),
  PassengerId,
  Transported = ifelse(True > False, "True", "False")
)

## # A tibble: 4,277 × 2
##    PassengerId Transported
##    <chr>       <chr>      
##  1 0013_01     True       
##  2 0018_01     False      
##  3 0019_01     True       
##  4 0021_01     False      
##  5 0023_01     False      
##  6 0027_01     False      
##  7 0029_01     True       
##  8 0032_01     True       
##  9 0032_02     True       
## 10 0033_01     False      
## # ℹ 4,267 more rows

# Embed the image file "5.jpeg" in the knitted report.
knitr::include_graphics(path = "5.jpeg")

# Accuracy: the model's overall accuracy is 73.59%, i.e. it predicts the true
# outcome correctly about 73.59% of the time (measured on the training data).

# Sensitivity (True Positive Rate): the model correctly identifies 63.36% of
# the passengers who were actually transported.

# Specificity (True Negative Rate): the model correctly identifies 83.96% of
# the passengers who were not transported.
# Positive Predictive Value (Precision): a passenger the model predicts as
# transported was actually transported 80.03% of the time.


#
# Kaggle results.
# Embed the image file "4.jpeg" in the knitted report.
knitr::include_graphics(path = "4.jpeg")