RPUS

library(tidyverse)

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.6
## ✔ forcats   1.0.1     ✔ stringr   1.6.0
## ✔ ggplot2   4.0.1     ✔ tibble    3.3.0
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.2
## ✔ purrr     1.2.0     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(readr)
library(rmarkdown)

train <- read_csv("train.csv")

## Rows: 8693 Columns: 14
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (5): PassengerId, HomePlanet, Cabin, Destination, Name
## dbl (6): Age, RoomService, FoodCourt, ShoppingMall, Spa, VRDeck
## lgl (3): CryoSleep, VIP, Transported
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

test <- read_csv("test.csv")

## Rows: 4277 Columns: 13
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (5): PassengerId, HomePlanet, Cabin, Destination, Name
## dbl (6): Age, RoomService, FoodCourt, ShoppingMall, Spa, VRDeck
## lgl (2): CryoSleep, VIP
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

paged_table(train)

paged_table(test)

glimpse(train)

## Rows: 8,693
## Columns: 14
## $ PassengerId  <chr> "0001_01", "0002_01", "0003_01", "0003_02", "0004_01", "0…
## $ HomePlanet   <chr> "Europa", "Earth", "Europa", "Europa", "Earth", "Earth", …
## $ CryoSleep    <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, TRUE, FA…
## $ Cabin        <chr> "B/0/P", "F/0/S", "A/0/S", "A/0/S", "F/1/S", "F/0/P", "F/…
## $ Destination  <chr> "TRAPPIST-1e", "TRAPPIST-1e", "TRAPPIST-1e", "TRAPPIST-1e…
## $ Age          <dbl> 39, 24, 58, 33, 16, 44, 26, 28, 35, 14, 34, 45, 32, 48, 2…
## $ VIP          <lgl> FALSE, FALSE, TRUE, FALSE, FALSE, FALSE, FALSE, FALSE, FA…
## $ RoomService  <dbl> 0, 109, 43, 0, 303, 0, 42, 0, 0, 0, 0, 39, 73, 719, 8, 32…
## $ FoodCourt    <dbl> 0, 9, 3576, 1283, 70, 483, 1539, 0, 785, 0, 0, 7295, 0, 1…
## $ ShoppingMall <dbl> 0, 25, 0, 371, 151, 0, 3, 0, 17, 0, NA, 589, 1123, 65, 12…
## $ Spa          <dbl> 0, 549, 6715, 3329, 565, 291, 0, 0, 216, 0, 0, 110, 0, 0,…
## $ VRDeck       <dbl> 0, 44, 49, 193, 2, 0, 0, NA, 0, 0, 0, 124, 113, 24, 7, 0,…
## $ Name         <chr> "Maham Ofracculy", "Juanna Vines", "Altark Susent", "Sola…
## $ Transported  <lgl> FALSE, TRUE, FALSE, FALSE, TRUE, TRUE, TRUE, TRUE, TRUE, …

colSums(is.na(train))

##  PassengerId   HomePlanet    CryoSleep        Cabin  Destination          Age 
##            0          201          217          199          182          179 
##          VIP  RoomService    FoodCourt ShoppingMall          Spa       VRDeck 
##          203          181          183          208          183          188 
##         Name  Transported 
##          200            0

train %>%
  summarise(across(everything(), ~ n_distinct(.)))

## # A tibble: 1 × 14
##   PassengerId HomePlanet CryoSleep Cabin Destination   Age   VIP RoomService
##         <int>      <int>     <int> <int>       <int> <int> <int>       <int>
## 1        8693          4         3  6561           4    81     3        1274
## # ℹ 6 more variables: FoodCourt <int>, ShoppingMall <int>, Spa <int>,
## #   VRDeck <int>, Name <int>, Transported <int>

train <- train %>%
  mutate(across(c(HomePlanet, CryoSleep, Destination, VIP, Transported), as.factor))
test <- test %>%
  mutate(across(c(HomePlanet, CryoSleep, Destination, VIP), as.factor))

summary(train)

##  PassengerId         HomePlanet   CryoSleep       Cabin          
##  Length:8693        Earth :4602   FALSE:5439   Length:8693       
##  Class :character   Europa:2131   TRUE :3037   Class :character  
##  Mode  :character   Mars  :1759   NA's : 217   Mode  :character  
##                     NA's  : 201                                  
##                                                                  
##                                                                  
##                                                                  
##         Destination        Age           VIP        RoomService     
##  55 Cancri e  :1800   Min.   : 0.00   FALSE:8291   Min.   :    0.0  
##  PSO J318.5-22: 796   1st Qu.:19.00   TRUE : 199   1st Qu.:    0.0  
##  TRAPPIST-1e  :5915   Median :27.00   NA's : 203   Median :    0.0  
##  NA's         : 182   Mean   :28.83                Mean   :  224.7  
##                       3rd Qu.:38.00                3rd Qu.:   47.0  
##                       Max.   :79.00                Max.   :14327.0  
##                       NA's   :179                  NA's   :181      
##    FoodCourt        ShoppingMall          Spa              VRDeck       
##  Min.   :    0.0   Min.   :    0.0   Min.   :    0.0   Min.   :    0.0  
##  1st Qu.:    0.0   1st Qu.:    0.0   1st Qu.:    0.0   1st Qu.:    0.0  
##  Median :    0.0   Median :    0.0   Median :    0.0   Median :    0.0  
##  Mean   :  458.1   Mean   :  173.7   Mean   :  311.1   Mean   :  304.9  
##  3rd Qu.:   76.0   3rd Qu.:   27.0   3rd Qu.:   59.0   3rd Qu.:   46.0  
##  Max.   :29813.0   Max.   :23492.0   Max.   :22408.0   Max.   :24133.0  
##  NA's   :183       NA's   :208       NA's   :183       NA's   :188      
##      Name           Transported 
##  Length:8693        FALSE:4315  
##  Class :character   TRUE :4378  
##  Mode  :character               
##                                 
##                                 
##                                 
##

train <- train %>%
  separate(PassengerId, into = c("Grup_No", "Kisi_No"), sep = "_", remove = FALSE) %>%
  mutate(Grup_No = as.factor(Grup_No),
         Kisi_No = as.integer(Kisi_No))
test <- test %>%
  separate(PassengerId, into = c("Grup_No", "Kisi_No"), sep = "_", remove = FALSE) %>%
  mutate(Grup_No = as.factor(Grup_No),
         Kisi_No = as.integer(Kisi_No))

head(train %>% select(PassengerId, Grup_No, Kisi_No))

## # A tibble: 6 × 3
##   PassengerId Grup_No Kisi_No
##   <chr>       <fct>     <int>
## 1 0001_01     0001          1
## 2 0002_01     0002          1
## 3 0003_01     0003          1
## 4 0003_02     0003          2
## 5 0004_01     0004          1
## 6 0005_01     0005          1

train <- train %>%
  group_by(Grup_No) %>%
  mutate(Grup_Buyuklugu = n()) %>%
  ungroup()

test <- test %>%
  group_by(Grup_No) %>%
  mutate(Grup_Buyuklugu= n()) %>%
  ungroup()

head(train %>% select(PassengerId, Grup_Buyuklugu), 10)

## # A tibble: 10 × 2
##    PassengerId Grup_Buyuklugu
##    <chr>                <int>
##  1 0001_01                  1
##  2 0002_01                  1
##  3 0003_01                  2
##  4 0003_02                  2
##  5 0004_01                  1
##  6 0005_01                  1
##  7 0006_01                  2
##  8 0006_02                  2
##  9 0007_01                  1
## 10 0008_01                  3

train <- train %>%
  mutate(Yalniz_Seyahat = ifelse(Grup_Buyuklugu == 1, 1, 0))
test <- test %>%
  mutate(Yalniz_Seyahat = ifelse(Grup_Buyuklugu == 1, 1, 0))

table(train$Grup_Buyuklugu)

## 
##    1    2    3    4    5    6    7    8 
## 4805 1682 1020  412  265  174  231  104

train <- train %>%
  mutate(Grup_Buyuklugu_Kat = case_when(
    Grup_Buyuklugu %in% c(1, 2, 3) ~ as.character(Grup_Buyuklugu),
    Grup_Buyuklugu >= 4 ~ "4+"
  )) %>%
  mutate(Grup_Buyuklugu_Kat = factor(Grup_Buyuklugu_Kat))

table(train$Grup_Buyuklugu_Kat)

## 
##    1    2    3   4+ 
## 4805 1682 1020 1186

test <- test %>%
  mutate(Grup_Buyuklugu_Kat = case_when(
    Grup_Buyuklugu %in% c(1, 2, 3) ~ as.character(Grup_Buyuklugu),
    Grup_Buyuklugu >= 4 ~ "4+"
  )) %>%
  mutate(Grup_Buyuklugu_Kat = factor(Grup_Buyuklugu_Kat))

train <- train %>%
  separate(Cabin, into = c("Deck", "Numara", "Side"), sep = "/", remove = FALSE) %>%
  mutate(Deck = as.factor(Deck),
         Side = as.factor(Side),
         Numara = as.integer(Numara))
test <- test %>%
  separate(Cabin, into = c("Deck", "Numara", "Side"), sep = "/", remove = FALSE) %>%
  mutate(Deck = as.factor(Deck),
         Side = as.factor(Side),
         Numara = as.integer(Numara))

head(train %>% select(Cabin, Deck, Numara, Side))

## # A tibble: 6 × 4
##   Cabin Deck  Numara Side 
##   <chr> <fct>  <int> <fct>
## 1 B/0/P B          0 P    
## 2 F/0/S F          0 S    
## 3 A/0/S A          0 S    
## 4 A/0/S A          0 S    
## 5 F/1/S F          1 S    
## 6 F/0/P F          0 P

summary(train %>% select(Deck, Numara, Side))

##       Deck          Numara         Side     
##  F      :2794   Min.   :   0.0   P   :4206  
##  G      :2559   1st Qu.: 167.2   S   :4288  
##  E      : 876   Median : 427.0   NA's: 199  
##  B      : 779   Mean   : 600.4              
##  C      : 747   3rd Qu.: 999.0              
##  (Other): 739   Max.   :1894.0              
##  NA's   : 199   NA's   :199

table(train$Deck)

## 
##    A    B    C    D    E    F    G    T 
##  256  779  747  478  876 2794 2559    5

library(dplyr)
library(forcats)

train <- train %>%
  mutate(
    Deck = fct_collapse(
      Deck,
      Diger = c("A", "D", "T")
    )
  )

test <- test %>%
  mutate(
    Deck = fct_collapse(
      Deck,
      Diger = c("A", "D", "T")
    )
  )

summary(train$Deck)

## Diger     B     C     E     F     G  NA's 
##   739   779   747   876  2794  2559   199

train <- train %>%
  separate(Name, into = c("Ad", "Soyad"), sep = " ", remove = FALSE)
test <- test %>%
  separate(Name, into = c("Ad", "Soyad"), sep = " ", remove = FALSE)

head(train %>% select(Name, Soyad, Ad))

## # A tibble: 6 × 3
##   Name              Soyad       Ad    
##   <chr>             <chr>       <chr> 
## 1 Maham Ofracculy   Ofracculy   Maham 
## 2 Juanna Vines      Vines       Juanna
## 3 Altark Susent     Susent      Altark
## 4 Solam Susent      Susent      Solam 
## 5 Willy Santantines Santantines Willy 
## 6 Sandie Hinetthews Hinetthews  Sandie

Home_Planet_GrupNO_bilgi <- train %>%
  group_by(Grup_No) %>%
  summarize(
    grup_kac_kisi       = n(),
    kacinin_HP_bilgisi_bos       = sum(is.na(HomePlanet)),           
    kacinin_HP_bilgisi_dolu   = sum(!is.na(HomePlanet)),         
    Grup_kac_farkli_HP  = n_distinct(HomePlanet, na.rm = TRUE)
  ) %>%
  ungroup()

head(Home_Planet_GrupNO_bilgi)

## # A tibble: 6 × 5
##   Grup_No grup_kac_kisi kacinin_HP_bilgisi_bos kacinin_HP_bilgisi_dolu
##   <fct>           <int>                  <int>                   <int>
## 1 0001                1                      0                       1
## 2 0002                1                      0                       1
## 3 0003                2                      0                       2
## 4 0004                1                      0                       1
## 5 0005                1                      0                       1
## 6 0006                2                      0                       2
## # ℹ 1 more variable: Grup_kac_farkli_HP <int>

table(Home_Planet_GrupNO_bilgi$Grup_kac_farkli_HP)

## 
##    0    1 
##  110 6107

grup_0 <- Home_Planet_GrupNO_bilgi %>%
  filter(Grup_kac_farkli_HP == 0)

# Bu gruplarda kaç tane dolu HP var?
sum(grup_0$kacinin_HP_bilgisi_dolu)

## [1] 0

grup_1 <- Home_Planet_GrupNO_bilgi %>%
  filter(Grup_kac_farkli_HP == 1)

# Bu grupların içindeki HomePlanet değerlerini görelim
table(grup_1$Grup_kac_farkli_HP, useNA = "ifany")

## 
##    1 
## 6107

# Doldurmadan önce/sonra kaç NA vardı görmek istersen:
sum(is.na(train$HomePlanet))  # şu anki NA sayısı

## [1] 201

# 1) Her grup için referans HomePlanet bilgisini çıkaralım
hp_referans_train <- train %>%
  group_by(Grup_No) %>%
  summarise(
    Grup_HP = HomePlanet[!is.na(HomePlanet)][1],  # gruptaki ilk dolu HomePlanet
    .groups = "drop"
  )

hp_referans_test <- test %>%
  group_by(Grup_No) %>%
  summarise(
    Grup_HP = HomePlanet[!is.na(HomePlanet)][1],  # gruptaki ilk dolu HomePlanet
    .groups = "drop"
  )

head(hp_referans_train)

## # A tibble: 6 × 2
##   Grup_No Grup_HP
##   <fct>   <fct>  
## 1 0001    Europa 
## 2 0002    Earth  
## 3 0003    Europa 
## 4 0004    Earth  
## 5 0005    Earth  
## 6 0006    Earth

# 2) Orijinal train veri seti ile bu referans bilgiyi birleştirelim
train <- train %>%
  left_join(hp_referans_train, by = "Grup_No")
test <- test %>%
  left_join(hp_referans_test, by = "Grup_No")

head(train %>% select(PassengerId, HomePlanet, Grup_HP))

## # A tibble: 6 × 3
##   PassengerId HomePlanet Grup_HP
##   <chr>       <fct>      <fct>  
## 1 0001_01     Europa     Europa 
## 2 0002_01     Earth      Earth  
## 3 0003_01     Europa     Europa 
## 4 0003_02     Europa     Europa 
## 5 0004_01     Earth      Earth  
## 6 0005_01     Earth      Earth

# 3) Eksik HomePlanet değerlerini dolduralım
train <- train %>%
  mutate(HomePlanet = coalesce(HomePlanet, Grup_HP)) %>%
  select(-Grup_HP)
test <- test %>%
  mutate(HomePlanet = coalesce(HomePlanet, Grup_HP)) %>%
  select(-Grup_HP)  # artık gerek yok, silebiliriz

# Doldurduktan sonra kaç NA kaldı görmek istersen:
sum(is.na(train$HomePlanet))  # doldurduktan sonraki NA sayısı

## [1] 111

soyad_planet_analizi <- train %>%
  group_by(Soyad) %>%
  summarise(
    soyad_kac_kisi = n(),
    kac_farkli_HP = n_distinct(HomePlanet, na.rm = TRUE),
    hangi_HP = paste(unique(HomePlanet[!is.na(HomePlanet)]), collapse = ", "),
    .groups = "drop"
  )

head(soyad_planet_analizi)

## # A tibble: 6 × 4
##   Soyad      soyad_kac_kisi kac_farkli_HP hangi_HP
##   <chr>               <int>         <int> <chr>   
## 1 Acobson                 4             1 "Earth" 
## 2 Acobsond                3             1 "Earth" 
## 3 Adavisons               9             1 "Earth" 
## 4 Adkinson                3             1 "Earth" 
## 5 Admingried              4             1 "Europa"
## 6 Ageurante               1             0 ""

table(soyad_planet_analizi$kac_farkli_HP)

## 
##    0    1    3 
##    7 2210    1

hp_referans_soyad <- soyad_planet_analizi %>%
filter(kac_farkli_HP == 1) %>%
  select(Soyad, Soyad_HP = hangi_HP)

head(hp_referans_soyad)

## # A tibble: 6 × 2
##   Soyad      Soyad_HP
##   <chr>      <chr>   
## 1 Acobson    Earth   
## 2 Acobsond   Earth   
## 3 Adavisons  Earth   
## 4 Adkinson   Earth   
## 5 Admingried Europa  
## 6 Aginge     Europa

# Orijinal train veri seti ile bu referans bilgiyi birleştirelim
train <- train %>%
  left_join(hp_referans_soyad, by = "Soyad")
test <- test %>%
  left_join(hp_referans_soyad, by = "Soyad")

# Dolduralım
train <- train %>%
  mutate(HomePlanet = coalesce(HomePlanet, Soyad_HP)) %>%
  select(-Soyad_HP)
test <- test %>%
  mutate(HomePlanet = coalesce(HomePlanet, Soyad_HP)) %>%
  select(-Soyad_HP)  # Soyad_HP gerek yok, silebiliriz

# Doldurduktan sonra kaç NA kaldı görmek istersen:
sum(is.na(train$HomePlanet))

## [1] 12

sum(is.na(test$HomePlanet))

## [1] 7

Home_Planet_Deck_bilgi <- train %>%
  group_by(Deck) %>%
  summarize(
    grup_kac_kisi       = n(),
    kacinin_HP_bilgisi_bos       = sum(is.na(HomePlanet)),           
    kacinin_HP_bilgisi_dolu   = sum(!is.na(HomePlanet)),         
    Grup_kac_farkli_HP  = n_distinct(HomePlanet, na.rm = TRUE),
    hangi_HP = paste(unique(HomePlanet[!is.na(HomePlanet)]), collapse = ", "),
  ) %>%
  ungroup()

Home_Planet_Deck_bilgi

## # A tibble: 7 × 6
##   Deck  grup_kac_kisi kacinin_HP_bilgisi_bos kacinin_HP_bilgisi_dolu
##   <fct>         <int>                  <int>                   <int>
## 1 Diger           739                      2                     737
## 2 B               779                      0                     779
## 3 C               747                      1                     746
## 4 E               876                      1                     875
## 5 F              2794                      6                    2788
## 6 G              2559                      2                    2557
## 7 <NA>            199                      0                     199
## # ℹ 2 more variables: Grup_kac_farkli_HP <int>, hangi_HP <chr>

# Deck bazında dolduralım
hp_referans_deck <- Home_Planet_Deck_bilgi %>%
  filter(Grup_kac_farkli_HP == 1) %>%
  select(Deck, Deck_HP = hangi_HP)
head(hp_referans_deck)

## # A tibble: 3 × 2
##   Deck  Deck_HP
##   <fct> <chr>  
## 1 B     Europa 
## 2 C     Europa 
## 3 G     Earth

# Orijinal train veri seti ile bu referans bilgiyi birleştirelim
train <- train %>%
  left_join(hp_referans_deck, by = "Deck")
test <- test %>%
  left_join(hp_referans_deck, by = "Deck")

# Dolduralım
train <- train %>%
  mutate(HomePlanet = coalesce(HomePlanet, Deck_HP)) %>%
  select(-Deck_HP)
test <- test %>%
  mutate(HomePlanet = coalesce(HomePlanet, Deck_HP)) %>%
  select(-Deck_HP)  # artık gerek yok, silebiliriz

# Doldurduktan sonra kaç NA kaldı görmek istersen:
sum(is.na(train$HomePlanet))

## [1] 9

sum(is.na(test$HomePlanet))

## [1] 5

table(train$HomePlanet)

## 
##  Earth Europa   Mars 
##   4707   2173   1804

train <- train %>%
  mutate(HomePlanet = coalesce(HomePlanet, "Earth"))
test <- test %>%
  mutate(HomePlanet = coalesce(HomePlanet, "Earth"))

sum(is.na(train$HomePlanet))

## [1] 0

sum(is.na(test$HomePlanet))

## [1] 0

train$HomePlanet <- as.factor(train$HomePlanet)
test$HomePlanet <- as.factor(test$HomePlanet)

Dest_deck_bilgi <- train %>%
  group_by(Deck) %>%
  summarize(
    grup_kac_kisi       = n(),
    kacinin_D_bilgisi_bos       = sum(is.na(Destination)),           
    kacinin_D_bilgisi_dolu   = sum(!is.na(Destination)),         
    Grup_kac_farkli_D = n_distinct(Destination, na.rm = TRUE),
    hangi_D = paste(unique(Destination[!is.na(Destination)]), collapse = ", "),
  ) %>%
  ungroup() 

table(Dest_deck_bilgi$Grup_kac_farkli_D)

## 
## 3 
## 7

Dest_Ad_bilgi <- train %>%
  group_by(Ad) %>%
  summarize(
    grup_kac_kisi       = n(),
    kacinin_D_bilgisi_bos       = sum(is.na(Destination)),           
    kacinin_D_bilgisi_dolu   = sum(!is.na(Destination)),         
    Grup_kac_farkli_D = n_distinct(Destination, na.rm = TRUE),
    hangi_D = paste(unique(Destination[!is.na(Destination)]), collapse = ", "),
  ) %>%
  ungroup() %>%
  filter(grup_kac_kisi > 1) 

table(Dest_Ad_bilgi$Grup_kac_farkli_D)

## 
##    1    2    3 
##  770 1054  270

Dest_Soyad_bilgi <- train %>%
  group_by(Soyad) %>%
  summarize(
    grup_kac_kisi       = n(),
    kacinin_D_bilgisi_bos       = sum(is.na(Destination)),           
    kacinin_D_bilgisi_dolu   = sum(!is.na(Destination)),         
    Grup_kac_farkli_D = n_distinct(Destination, na.rm = TRUE),
    hangi_D = paste(unique(Destination[!is.na(Destination)]), collapse = ", "),
  ) %>%
  ungroup() %>%
  filter(grup_kac_kisi > 1) 

table(Dest_Soyad_bilgi$Grup_kac_farkli_D)

## 
##   1   2   3 
## 620 948 268

library(rpart)
library(rpart.plot)
dt_model <- rpart(
  Destination ~ HomePlanet + Deck + Side + Age + CryoSleep + VIP + RoomService + 
    FoodCourt + ShoppingMall + Spa + VRDeck + Grup_Buyuklugu_Kat+ Yalniz_Seyahat,
  data = train %>% filter(!is.na(Destination)),
  method = "class",
  cp = 0.00221
)
rpart.plot(
  dt_model
)

# 1) Sadece Destination NA olanları tahmin edelim
train$Destination_tree_pred <- predict(
  dt_model,
  newdata = train,
  type   = "class"
)

test$Destination_tree_pred <- predict(
  dt_model,
  newdata = test,
  type   = "class"
)

rpart.rules(
  dt_model,
  style = "tall",       # her kural alt alta, daha okunaklı
  cover = TRUE,         # her kuralın kaç gözlemi kapsadığını gösterir
  nn    = TRUE          # node numaralarını gösterir
)

##                                    55 C PSO  TRAP 
## [128]  Destination is 55 Cancri e [ .55  .02  .43] with cover 3% when
##            HomePlanet is Europa
##            CryoSleep is TRUE
##            Age is 15 to 62
##            Deck is C
##            Grup_Buyuklugu_Kat is 1 or 2 or 4+
## 
## [260]  Destination is 55 Cancri e [ .61  .00  .39] with cover 1% when
##            HomePlanet is Europa
##            CryoSleep is TRUE
##            Age is 36 to 47
##            Deck is Diger or B or E
## 
## [4184] Destination is 55 Cancri e [ .62  .00  .38] with cover 1% when
##            HomePlanet is Europa
##            CryoSleep is TRUE
##            Age is 15 to 27
##            Deck is Diger or B
##            VIP is FALSE
##            Grup_Buyuklugu_Kat is 1 or 3 or 4+
## 
## [84]   Destination is 55 Cancri e [ .62  .11  .28] with cover 1% when
##            HomePlanet is Europa
##            CryoSleep is FALSE
##            VIP is TRUE
##            FoodCourt is 16 to 9637
##            VRDeck >= 1110
## 
## [170]  Destination is 55 Cancri e [ .64  .00  .36] with cover 1% when
##            HomePlanet is Europa
##            CryoSleep is FALSE
##            VIP is FALSE
##            FoodCourt is 16 to 9637
##            VRDeck >= 1110
##            RoomService >= 70
## 
## [342]  Destination is 55 Cancri e [ .68  .00  .32] with cover 0% when
##            HomePlanet is Europa
##            CryoSleep is FALSE
##            VIP is FALSE
##            FoodCourt is 16 to 9637
##            VRDeck is 1110 to 1373
##            RoomService < 70
## 
## [20]   Destination is 55 Cancri e [ .71  .00  .29] with cover 1% when
##            HomePlanet is Europa
##            CryoSleep is FALSE
##            FoodCourt >= 9637
## 
## [522]  Destination is 55 Cancri e [ .78  .00  .22] with cover 0% when
##            HomePlanet is Europa
##            CryoSleep is TRUE
##            Age is 15 to 36
##            Deck is Diger or B or E
##            VIP is TRUE
## 
## [2093] Destination is TRAPPIST-1e [ .43  .01  .57] with cover 1% when
##            HomePlanet is Europa
##            CryoSleep is TRUE
##            Age is 27 to 36
##            Deck is Diger or B or E
##            VIP is FALSE
##            Grup_Buyuklugu_Kat is 1 or 3 or 4+
## 
## [1047] Destination is TRAPPIST-1e [ .42  .01  .57] with cover 2% when
##            HomePlanet is Europa
##            CryoSleep is TRUE
##            Age is 15 to 36
##            Deck is Diger or B or E
##            VIP is FALSE
##            Grup_Buyuklugu_Kat is 2
## 
## [129]  Destination is TRAPPIST-1e [ .36  .06  .58] with cover 0% when
##            HomePlanet is Europa
##            CryoSleep is TRUE
##            Age is 15 to 62
##            Deck is C
##            Grup_Buyuklugu_Kat is 3
## 
## [131]  Destination is TRAPPIST-1e [ .39  .00  .61] with cover 1% when
##            HomePlanet is Europa
##            CryoSleep is TRUE
##            Age is 47 to 62
##            Deck is Diger or B or E
## 
## [343]  Destination is TRAPPIST-1e [ .36  .00  .64] with cover 3% when
##            HomePlanet is Europa
##            CryoSleep is FALSE
##            VIP is FALSE
##            FoodCourt is 16 to 9637
##            VRDeck >= 1373
##            RoomService < 70
## 
## [33]   Destination is TRAPPIST-1e [ .36  .00  .64] with cover 0% when
##            HomePlanet is Europa
##            CryoSleep is TRUE
##            Age is 5 to 15
## 
## [43]   Destination is TRAPPIST-1e [ .33  .01  .66] with cover 7% when
##            HomePlanet is Europa
##            CryoSleep is FALSE
##            FoodCourt is 16 to 9637
##            VRDeck < 1110
## 
## [3]    Destination is TRAPPIST-1e [ .14  .12  .74] with cover 75% when
##            HomePlanet is Earth or Mars
## 
## [4185] Destination is TRAPPIST-1e [ .25  .00  .75] with cover 0% when
##            HomePlanet is Europa
##            CryoSleep is TRUE
##            Age is 15 to 27
##            Deck is E
##            VIP is FALSE
##            Grup_Buyuklugu_Kat is 1 or 3 or 4+
## 
## [17]   Destination is TRAPPIST-1e [ .23  .00  .77] with cover 0% when
##            HomePlanet is Europa
##            CryoSleep is TRUE
##            Age >= 62
## 
## [11]   Destination is TRAPPIST-1e [ .16  .00  .84] with cover 2% when
##            HomePlanet is Europa
##            CryoSleep is FALSE
##            FoodCourt < 16
## 
## [9]    Destination is TRAPPIST-1e [ .00  .00 1.00] with cover 0% when
##            HomePlanet is Europa
##            CryoSleep is TRUE
##            Age < 5

train <- train %>%
  mutate(
    Destination_tree_pred2 = case_when(
      # [128] 55 Cancri e, cover 3%
      HomePlanet == "Europa" &
        CryoSleep == TRUE &
        Age >= 15 & Age <= 62 &
        Deck == "C" &
        Grup_Buyuklugu_Kat %in% c("1", "2", "4+") ~ "55 Cancri e",
      
      # [260] 55 Cancri e, cover 1%
      HomePlanet == "Europa" &
        CryoSleep == TRUE &
        Age >= 36 & Age <= 47 &
        Deck %in% c("Diğer", "B", "E") ~ "55 Cancri e",
      
      # [4184] 55 Cancri e, cover 1%
      HomePlanet == "Europa" &
        CryoSleep == TRUE &
        Age >= 15 & Age <= 27 &
        Deck %in% c("Diğer", "B") &
        VIP == FALSE &
        Grup_Buyuklugu_Kat %in% c("1", "3", "4+") ~ "55 Cancri e",
      
      # [84] 55 Cancri e, cover 1%
      HomePlanet == "Europa" &
        CryoSleep == FALSE &
        VIP == TRUE &
        FoodCourt >= 16 & FoodCourt <= 9637 &
        VRDeck >= 1110 ~ "55 Cancri e",
      
      # [170] 55 Cancri e, cover 1%
      HomePlanet == "Europa" &
        CryoSleep == FALSE &
        VIP == FALSE &
        FoodCourt >= 16 & FoodCourt <= 9637 &
        VRDeck >= 1110 &
        RoomService >= 70 ~ "55 Cancri e",
      
      # [342] 55 Cancri e, cover 0%
      HomePlanet == "Europa" &
        CryoSleep == FALSE &
        VIP == FALSE &
        FoodCourt >= 16 & FoodCourt <= 9637 &
        VRDeck >= 1110 & VRDeck <= 1373 &
        RoomService < 70 ~ "55 Cancri e",
      
      # [20] 55 Cancri e, cover 1%
      HomePlanet == "Europa" &
        CryoSleep == FALSE &
        FoodCourt >= 9637 ~ "55 Cancri e",
      
      # [522] 55 Cancri e, cover 0%
      HomePlanet == "Europa" &
        CryoSleep == TRUE &
        Age >= 15 & Age <= 36 &
        Deck %in% c("Diğer", "B", "E") &
        VIP == TRUE ~ "55 Cancri e",
      
      # [2093] TRAPPIST-1e, cover 1%
      HomePlanet == "Europa" &
        CryoSleep == TRUE &
        Age >= 27 & Age <= 36 &
        Deck %in% c("Diğer", "B", "E") &
        VIP == FALSE &
        Grup_Buyuklugu_Kat %in% c("1", "3", "4+") ~ "TRAPPIST-1e",
      
      # [1047] TRAPPIST-1e, cover 2%
      HomePlanet == "Europa" &
        CryoSleep == TRUE &
        Age >= 15 & Age <= 36 &
        Deck %in% c("Diğer", "B", "E") &
        VIP == FALSE &
        Grup_Buyuklugu_Kat %in% c("2") ~ "TRAPPIST-1e",
      
      # [129] TRAPPIST-1e, cover 0%
      HomePlanet == "Europa" &
        CryoSleep == TRUE &
        Age >= 15 & Age <= 62 &
        Deck == "C" &
        Grup_Buyuklugu_Kat %in% c("3") ~ "TRAPPIST-1e",
      
      # [131] TRAPPIST-1e, cover 1%
      HomePlanet == "Europa" &
        CryoSleep == TRUE &
        Age >= 47 & Age <= 62 &
        Deck %in% c("Diğer", "B", "E") ~ "TRAPPIST-1e",
      
      # [343] TRAPPIST-1e, cover 3%
      HomePlanet == "Europa" &
        CryoSleep == FALSE &
        VIP == FALSE &
        FoodCourt >= 16 & FoodCourt <= 9637 &
        VRDeck >= 1373 &
        RoomService < 70 ~ "TRAPPIST-1e",
      
      # [33] TRAPPIST-1e, cover 0%
      HomePlanet == "Europa" &
        CryoSleep == TRUE &
        Age >= 5 & Age <= 15 ~ "TRAPPIST-1e",
      
      # [43] TRAPPIST-1e, cover 7%
      HomePlanet == "Europa" &
        CryoSleep == FALSE &
        FoodCourt >= 16 & FoodCourt <= 9637 &
        VRDeck < 1110 ~ "TRAPPIST-1e",
      
      # [3] TRAPPIST-1e, cover 75% (Earth & Mars)
      HomePlanet %in% c("Earth", "Mars") ~ "TRAPPIST-1e",
      
      # [4185] TRAPPIST-1e, cover 0%
      HomePlanet == "Europa" &
        CryoSleep == TRUE &
        Age >= 15 & Age <= 27 &
        Deck == "E" &
        VIP == FALSE &
        Grup_Buyuklugu_Kat %in% c("1", "3", "4+") ~ "TRAPPIST-1e",
      
      # [17] TRAPPIST-1e, cover 0%
      HomePlanet == "Europa" &
        CryoSleep == TRUE &
        Age >= 62 ~ "TRAPPIST-1e",
      
      # [11] TRAPPIST-1e, cover 2%
      HomePlanet == "Europa" &
        CryoSleep == FALSE &
        FoodCourt < 16 ~ "TRAPPIST-1e",
      
      # [9] TRAPPIST-1e, cover 0%
      HomePlanet == "Europa" &
        CryoSleep == TRUE &
        Age < 5 ~ "TRAPPIST-1e",
      
      # Hiçbir kurala girmeyen varsa (teorik olarak kalmamalı)
      TRUE ~ "TRAPPIST-1e"
    )
  )

train$Destination_tree_pred2 <- factor(
  train$Destination_tree_pred2,
  levels = levels(train$Destination)
)

mean(train$Destination_tree_pred == train$Destination_tree_pred2)

## [1] 0.9853905

# 2) Destination NA olan yerleri tree tahminiyle dolduralım
train <- train %>%
  mutate(
    Destination = ifelse(
      is.na(Destination),
      as.character(Destination_tree_pred),
      as.character(Destination)
    ),
    Destination = factor(Destination)
  )

test <- test %>%
  mutate(
    Destination = ifelse(
      is.na(Destination),
      as.character(Destination_tree_pred),
      as.character(Destination)
    ),
    Destination = factor(Destination)
  )

# Tahmin sütunlarını silelim
train <- train %>%
  select(-Destination_tree_pred, -Destination_tree_pred2)
test <- test %>%
  select(-Destination_tree_pred)

library(explore)
describe_all(train)

## # A tibble: 24 × 8
##    variable    type     na na_pct unique   min   mean   max
##    <chr>       <chr> <int>  <dbl>  <int> <dbl>  <dbl> <dbl>
##  1 PassengerId chr       0    0     8693    NA  NA       NA
##  2 Grup_No     fct       0    0     6217    NA  NA       NA
##  3 Kisi_No     int       0    0        8     1   1.52     8
##  4 HomePlanet  fct       0    0        3    NA  NA       NA
##  5 CryoSleep   fct     217    2.5      3    NA  NA       NA
##  6 Cabin       chr     199    2.3   6561    NA  NA       NA
##  7 Deck        fct     199    2.3      7    NA  NA       NA
##  8 Numara      int     199    2.3   1818     0 600.    1894
##  9 Side        fct     199    2.3      3    NA  NA       NA
## 10 Destination fct       0    0        3    NA  NA       NA
## # ℹ 14 more rows

summary(train$CryoSleep)

## FALSE  TRUE  NA's 
##  5439  3037   217

train %>%
  mutate(RS = ifelse(RoomService>0, "BZ", "Z")) %>%
  group_by(RS) %>%
  summarize(
    grup_kac_kisi       = n(),
    kacinin_CS_bilgisi_bos       = sum(is.na(CryoSleep)),           
    kacinin_CS_bilgisi_dolu   = sum(!is.na(CryoSleep)),         
    Grup_kac_farkli_CS  = n_distinct(CryoSleep, na.rm = TRUE)
  ) %>%
  ungroup()

## # A tibble: 3 × 5
##   RS    grup_kac_kisi kacinin_CS_bilgisi_bos kacinin_CS_bilgisi_dolu
##   <chr>         <int>                  <int>                   <int>
## 1 BZ             2935                     71                    2864
## 2 Z              5577                    144                    5433
## 3 <NA>            181                      2                     179
## # ℹ 1 more variable: Grup_kac_farkli_CS <int>

train <- train %>%
  mutate(CryoSleep = case_when(
    is.na(CryoSleep) & RoomService > 0 ~ "FALSE",
    TRUE ~ CryoSleep
  ))

test <- test %>%
  mutate(CryoSleep = case_when(
    is.na(CryoSleep) & RoomService > 0 ~ "FALSE",
    TRUE ~ CryoSleep
  ))

train %>%
  mutate(FC = ifelse(FoodCourt>0, "BZ", "Z")) %>%
  group_by(FC) %>%
  summarize(
    grup_kac_kisi       = n(),
    kacinin_CS_bilgisi_bos       = sum(is.na(CryoSleep)),           
    kacinin_CS_bilgisi_dolu   = sum(!is.na(CryoSleep)),         
    Grup_kac_farkli_CS  = n_distinct(CryoSleep, na.rm = TRUE)
  ) %>%
  ungroup()

## # A tibble: 3 × 5
##   FC    grup_kac_kisi kacinin_CS_bilgisi_bos kacinin_CS_bilgisi_dolu
##   <chr>         <int>                  <int>                   <int>
## 1 BZ             3054                     37                    3017
## 2 Z              5456                    108                    5348
## 3 <NA>            183                      1                     182
## # ℹ 1 more variable: Grup_kac_farkli_CS <int>

train <- train %>%
  mutate(CryoSleep = case_when(
    is.na(CryoSleep) & FoodCourt > 0 ~ "FALSE",
    TRUE ~ CryoSleep
  ))

test <- test %>%
  mutate(CryoSleep = case_when(
    is.na(CryoSleep) & FoodCourt > 0 ~ "FALSE",
    TRUE ~ CryoSleep
  ))

train %>%
  mutate(SM = ifelse(ShoppingMall>0, "BZ", "Z")) %>%
  group_by(SM) %>%
  summarize(
    grup_kac_kisi       = n(),
    kacinin_CS_bilgisi_bos       = sum(is.na(CryoSleep)),           
    kacinin_CS_bilgisi_dolu   = sum(!is.na(CryoSleep)),         
    Grup_kac_farkli_CS  = n_distinct(CryoSleep, na.rm = TRUE)
  ) %>%
  ungroup()

## # A tibble: 3 × 5
##   SM    grup_kac_kisi kacinin_CS_bilgisi_bos kacinin_CS_bilgisi_dolu
##   <chr>         <int>                  <int>                   <int>
## 1 BZ             2898                      7                    2891
## 2 Z              5587                     98                    5489
## 3 <NA>            208                      4                     204
## # ℹ 1 more variable: Grup_kac_farkli_CS <int>

train <- train %>%
  mutate(CryoSleep = case_when(
    is.na(CryoSleep) & ShoppingMall > 0 ~ "FALSE",
    TRUE ~ CryoSleep
  ))

test <- test %>%
  mutate(CryoSleep = case_when(
    is.na(CryoSleep) & ShoppingMall > 0 ~ "FALSE",
    TRUE ~ CryoSleep
  ))

train %>%
  mutate(SP = ifelse(Spa>0, "BZ", "Z")) %>%
  group_by(SP) %>%
  summarize(
    grup_kac_kisi       = n(),
    kacinin_CS_bilgisi_bos       = sum(is.na(CryoSleep)),           
    kacinin_CS_bilgisi_dolu   = sum(!is.na(CryoSleep)),         
    Grup_kac_farkli_CS  = n_distinct(CryoSleep, na.rm = TRUE)
  ) %>%
  ungroup()

## # A tibble: 3 × 5
##   SP    grup_kac_kisi kacinin_CS_bilgisi_bos kacinin_CS_bilgisi_dolu
##   <chr>         <int>                  <int>                   <int>
## 1 BZ             3186                      4                    3182
## 2 Z              5324                     96                    5228
## 3 <NA>            183                      2                     181
## # ℹ 1 more variable: Grup_kac_farkli_CS <int>

train <- train %>%
  mutate(CryoSleep = case_when(
    is.na(CryoSleep) & Spa > 0 ~ "FALSE",
    TRUE ~ CryoSleep
  ))

test <- test %>%
  mutate(CryoSleep = case_when(
    is.na(CryoSleep) & Spa > 0 ~ "FALSE",
    TRUE ~ CryoSleep
  ))

train %>%
  mutate(VD = ifelse(VRDeck>0, "BZ", "Z")) %>%
  group_by(VD) %>%
  summarize(
    grup_kac_kisi       = n(),
    kacinin_CS_bilgisi_bos       = sum(is.na(CryoSleep)),           
    kacinin_CS_bilgisi_dolu   = sum(!is.na(CryoSleep)),         
    Grup_kac_farkli_CS  = n_distinct(CryoSleep, na.rm = TRUE)
  ) %>%
  ungroup()

## # A tibble: 3 × 5
##   VD    grup_kac_kisi kacinin_CS_bilgisi_bos kacinin_CS_bilgisi_dolu
##   <chr>         <int>                  <int>                   <int>
## 1 BZ             3010                      0                    3010
## 2 Z              5495                     93                    5402
## 3 <NA>            188                      5                     183
## # ℹ 1 more variable: Grup_kac_farkli_CS <int>

train <- train %>%
 mutate(CryoSleep = factor(CryoSleep))
test <- test %>% 
 mutate(CryoSleep = factor(CryoSleep))

cs_model <- rpart(
  CryoSleep ~ HomePlanet + Deck + Side + Age + Destination + VIP + RoomService + 
    FoodCourt + ShoppingMall + Spa + VRDeck + Grup_Buyuklugu_Kat+ Yalniz_Seyahat,
  data = train %>% filter(!is.na(Destination)),
  method = "class"
)

rpart.plot(
  cs_model
)

train$CryoSleep_tree_pred <- predict(
  cs_model,
  newdata = train,
  type   = "class"
)

test$CryoSleep_tree_pred <- predict(
  cs_model,
  newdata = test,
  type   = "class"
)

train <- train %>%
  mutate(
    CryoSleep = ifelse(
      is.na(CryoSleep),
      as.character(CryoSleep_tree_pred),
      as.character(CryoSleep)
    ),
    CryoSleep = factor(CryoSleep)
  )

test <- test %>%
  mutate(
    CryoSleep = ifelse(
      is.na(CryoSleep),
      as.character(CryoSleep_tree_pred),
      as.character(CryoSleep)
    ),
    CryoSleep = factor(CryoSleep)
  )

train <- train %>%
  select(-CryoSleep_tree_pred)
test <- test %>%
  select(-CryoSleep_tree_pred)

describe_all(train)

## # A tibble: 24 × 8
##    variable    type     na na_pct unique   min   mean   max
##    <chr>       <chr> <int>  <dbl>  <int> <dbl>  <dbl> <dbl>
##  1 PassengerId chr       0    0     8693    NA  NA       NA
##  2 Grup_No     fct       0    0     6217    NA  NA       NA
##  3 Kisi_No     int       0    0        8     1   1.52     8
##  4 HomePlanet  fct       0    0        3    NA  NA       NA
##  5 CryoSleep   fct       0    0        2    NA  NA       NA
##  6 Cabin       chr     199    2.3   6561    NA  NA       NA
##  7 Deck        fct     199    2.3      7    NA  NA       NA
##  8 Numara      int     199    2.3   1818     0 600.    1894
##  9 Side        fct     199    2.3      3    NA  NA       NA
## 10 Destination fct       0    0        3    NA  NA       NA
## # ℹ 14 more rows

summary(train$Deck)

## Diger     B     C     E     F     G  NA's 
##   739   779   747   876  2794  2559   199

deck_model <- rpart(
  Deck ~ HomePlanet + CryoSleep + Side + Age + Destination + VIP + RoomService + 
    FoodCourt + ShoppingMall + Spa + VRDeck + Grup_Buyuklugu_Kat+ Yalniz_Seyahat,
  data = train %>% filter(!is.na(Deck)),
  method = "class"
)

rpart.plot(
  deck_model
)

train$deck_tree_pred <- predict(
  deck_model,
  newdata = train,
  type   = "class"
)

test$deck_tree_pred <- predict(
  deck_model,
  newdata = test,
  type   = "class"
)

train <- train %>%
  mutate(
    Deck = ifelse(
      is.na(Deck),
      as.character(deck_tree_pred),
      as.character(Deck)
    ),
    Deck = factor(Deck)
  )

test <- test %>%
  mutate(
    Deck = ifelse(
      is.na(Deck),
      as.character(deck_tree_pred),
      as.character(Deck)
    ),
    Deck = factor(Deck)
  )

train <- train %>%
  select(-deck_tree_pred)
test <- test %>%
  select(-deck_tree_pred)

train <- train %>%
  select(-c(Cabin,Numara ))
test <- test %>%
  select(-c(Cabin,Numara ))

train <- train %>%
  select(-c(Name,Ad, Soyad, Grup_Buyuklugu ))
test <- test %>%
  select(-c(Name,Ad, Soyad, Grup_Buyuklugu ))

describe_all(train)

## # A tibble: 18 × 8
##    variable           type     na na_pct unique   min   mean   max
##    <chr>              <chr> <int>  <dbl>  <int> <dbl>  <dbl> <dbl>
##  1 PassengerId        chr       0    0     8693    NA  NA       NA
##  2 Grup_No            fct       0    0     6217    NA  NA       NA
##  3 Kisi_No            int       0    0        8     1   1.52     8
##  4 HomePlanet         fct       0    0        3    NA  NA       NA
##  5 CryoSleep          fct       0    0        2    NA  NA       NA
##  6 Deck               fct       0    0        6    NA  NA       NA
##  7 Side               fct     199    2.3      3    NA  NA       NA
##  8 Destination        fct       0    0        3    NA  NA       NA
##  9 Age                dbl     179    2.1     81     0  28.8     79
## 10 VIP                fct     203    2.3      3    NA  NA       NA
## 11 RoomService        dbl     181    2.1   1274     0 225.   14327
## 12 FoodCourt          dbl     183    2.1   1508     0 458.   29813
## 13 ShoppingMall       dbl     208    2.4   1116     0 174.   23492
## 14 Spa                dbl     183    2.1   1328     0 311.   22408
## 15 VRDeck             dbl     188    2.2   1307     0 305.   24133
## 16 Transported        fct       0    0        2    NA  NA       NA
## 17 Yalniz_Seyahat     dbl       0    0        2     0   0.55     1
## 18 Grup_Buyuklugu_Kat fct       0    0        4    NA  NA       NA

side_model <- rpart(
  Side ~ HomePlanet + CryoSleep + Deck + Age + Destination + VIP + RoomService + 
    FoodCourt + ShoppingMall + Spa + VRDeck + Grup_Buyuklugu_Kat+ Yalniz_Seyahat,
  data = train %>% filter(!is.na(Side)),
  method = "class",
  cp = 0.002
)

rpart.plot(
  side_model
)

train$side_tree_pred <- predict(
  side_model,
  newdata = train,
  type   = "class"
)
test$side_tree_pred <- predict(
  side_model,
  newdata = test,
  type   = "class"
)
train <- train %>%
  mutate(
    Side = ifelse(
      is.na(Side),
      as.character(side_tree_pred),
      as.character(Side)
    ),
    Side = factor(Side)
  )
test <- test %>%
    mutate(
    Side = ifelse(
      is.na(Side),
      as.character(side_tree_pred),
      as.character(Side)
    ),
    Side = factor(Side)
  )
train <- train %>%
  select(-side_tree_pred)
test <- test %>%
  select(-side_tree_pred)
describe_all(train)

## # A tibble: 18 × 8
##    variable           type     na na_pct unique   min   mean   max
##    <chr>              <chr> <int>  <dbl>  <int> <dbl>  <dbl> <dbl>
##  1 PassengerId        chr       0    0     8693    NA  NA       NA
##  2 Grup_No            fct       0    0     6217    NA  NA       NA
##  3 Kisi_No            int       0    0        8     1   1.52     8
##  4 HomePlanet         fct       0    0        3    NA  NA       NA
##  5 CryoSleep          fct       0    0        2    NA  NA       NA
##  6 Deck               fct       0    0        6    NA  NA       NA
##  7 Side               fct       0    0        2    NA  NA       NA
##  8 Destination        fct       0    0        3    NA  NA       NA
##  9 Age                dbl     179    2.1     81     0  28.8     79
## 10 VIP                fct     203    2.3      3    NA  NA       NA
## 11 RoomService        dbl     181    2.1   1274     0 225.   14327
## 12 FoodCourt          dbl     183    2.1   1508     0 458.   29813
## 13 ShoppingMall       dbl     208    2.4   1116     0 174.   23492
## 14 Spa                dbl     183    2.1   1328     0 311.   22408
## 15 VRDeck             dbl     188    2.2   1307     0 305.   24133
## 16 Transported        fct       0    0        2    NA  NA       NA
## 17 Yalniz_Seyahat     dbl       0    0        2     0   0.55     1
## 18 Grup_Buyuklugu_Kat fct       0    0        4    NA  NA       NA

vip_model <- rpart(
  VIP ~ HomePlanet + CryoSleep + Deck + Age + Destination + Side + RoomService + 
    FoodCourt + ShoppingMall + Spa + VRDeck + Grup_Buyuklugu_Kat+ Yalniz_Seyahat,
  data = train %>% filter(!is.na(VIP)),
  method = "class",
  cp = 0.002
)
rpart.plot(
  vip_model
)

train$vip_tree_pred <- predict(
  vip_model,
  newdata = train,
  type   = "class"
)
test$vip_tree_pred <- predict(
  vip_model,
  newdata = test,
  type   = "class"
)
train <- train %>%
  mutate(
    VIP = ifelse(
      is.na(VIP),
      as.character(vip_tree_pred),
      as.character(VIP)
    ),
    VIP = factor(VIP)
  )

test <- test %>%
    mutate(
    VIP = ifelse(
      is.na(VIP),
      as.character(vip_tree_pred),
      as.character(VIP)
    ),
    VIP = factor(VIP)
  )
train <- train %>%
  select(-vip_tree_pred)
test <- test %>%
  select(-vip_tree_pred)
describe_all(train)

## # A tibble: 18 × 8
##    variable           type     na na_pct unique   min   mean   max
##    <chr>              <chr> <int>  <dbl>  <int> <dbl>  <dbl> <dbl>
##  1 PassengerId        chr       0    0     8693    NA  NA       NA
##  2 Grup_No            fct       0    0     6217    NA  NA       NA
##  3 Kisi_No            int       0    0        8     1   1.52     8
##  4 HomePlanet         fct       0    0        3    NA  NA       NA
##  5 CryoSleep          fct       0    0        2    NA  NA       NA
##  6 Deck               fct       0    0        6    NA  NA       NA
##  7 Side               fct       0    0        2    NA  NA       NA
##  8 Destination        fct       0    0        3    NA  NA       NA
##  9 Age                dbl     179    2.1     81     0  28.8     79
## 10 VIP                fct       0    0        2    NA  NA       NA
## 11 RoomService        dbl     181    2.1   1274     0 225.   14327
## 12 FoodCourt          dbl     183    2.1   1508     0 458.   29813
## 13 ShoppingMall       dbl     208    2.4   1116     0 174.   23492
## 14 Spa                dbl     183    2.1   1328     0 311.   22408
## 15 VRDeck             dbl     188    2.2   1307     0 305.   24133
## 16 Transported        fct       0    0        2    NA  NA       NA
## 17 Yalniz_Seyahat     dbl       0    0        2     0   0.55     1
## 18 Grup_Buyuklugu_Kat fct       0    0        4    NA  NA       NA

age_model <- lm(
  Age ~ HomePlanet + CryoSleep + Deck + VIP + Destination + Side + RoomService + 
    FoodCourt + ShoppingMall + Spa + VRDeck + Grup_Buyuklugu_Kat+ Yalniz_Seyahat,
  data = train)
train$age_tree_pred <- predict(
  age_model,
  newdata = train
)
test$age_tree_pred <- predict(
  age_model,
  newdata = test
)
train <- train %>%
  mutate(
    Age = ifelse(
      is.na(Age),
      round(age_tree_pred), Age))
test <- test %>%
    mutate(
    Age = ifelse(
      is.na(Age),
      round(age_tree_pred), Age))
train <- train %>%
  select(-age_tree_pred)
test <- test %>%
  select(-age_tree_pred)
describe_all(train)

## # A tibble: 18 × 8
##    variable           type     na na_pct unique   min   mean   max
##    <chr>              <chr> <int>  <dbl>  <int> <dbl>  <dbl> <dbl>
##  1 PassengerId        chr       0    0     8693    NA  NA       NA
##  2 Grup_No            fct       0    0     6217    NA  NA       NA
##  3 Kisi_No            int       0    0        8     1   1.52     8
##  4 HomePlanet         fct       0    0        3    NA  NA       NA
##  5 CryoSleep          fct       0    0        2    NA  NA       NA
##  6 Deck               fct       0    0        6    NA  NA       NA
##  7 Side               fct       0    0        2    NA  NA       NA
##  8 Destination        fct       0    0        3    NA  NA       NA
##  9 Age                dbl      14    0.2     81     0  28.8     79
## 10 VIP                fct       0    0        2    NA  NA       NA
## 11 RoomService        dbl     181    2.1   1274     0 225.   14327
## 12 FoodCourt          dbl     183    2.1   1508     0 458.   29813
## 13 ShoppingMall       dbl     208    2.4   1116     0 174.   23492
## 14 Spa                dbl     183    2.1   1328     0 311.   22408
## 15 VRDeck             dbl     188    2.2   1307     0 305.   24133
## 16 Transported        fct       0    0        2    NA  NA       NA
## 17 Yalniz_Seyahat     dbl       0    0        2     0   0.55     1
## 18 Grup_Buyuklugu_Kat fct       0    0        4    NA  NA       NA

mean_age <- train %>%
  summarize(mean_age = mean(Age, na.rm=TRUE)) %>%
  pull(mean_age)
mean_age

## [1] 28.82844

train <- train %>%
  mutate(
    Age = ifelse(
      is.na(Age),
      round(mean_age), Age))
test <- test %>%
  mutate(
    Age = ifelse(
      is.na(Age),
      round(mean_age), Age))
describe_all(train)

## # A tibble: 18 × 8
##    variable           type     na na_pct unique   min   mean   max
##    <chr>              <chr> <int>  <dbl>  <int> <dbl>  <dbl> <dbl>
##  1 PassengerId        chr       0    0     8693    NA  NA       NA
##  2 Grup_No            fct       0    0     6217    NA  NA       NA
##  3 Kisi_No            int       0    0        8     1   1.52     8
##  4 HomePlanet         fct       0    0        3    NA  NA       NA
##  5 CryoSleep          fct       0    0        2    NA  NA       NA
##  6 Deck               fct       0    0        6    NA  NA       NA
##  7 Side               fct       0    0        2    NA  NA       NA
##  8 Destination        fct       0    0        3    NA  NA       NA
##  9 Age                dbl       0    0       80     0  28.8     79
## 10 VIP                fct       0    0        2    NA  NA       NA
## 11 RoomService        dbl     181    2.1   1274     0 225.   14327
## 12 FoodCourt          dbl     183    2.1   1508     0 458.   29813
## 13 ShoppingMall       dbl     208    2.4   1116     0 174.   23492
## 14 Spa                dbl     183    2.1   1328     0 311.   22408
## 15 VRDeck             dbl     188    2.2   1307     0 305.   24133
## 16 Transported        fct       0    0        2    NA  NA       NA
## 17 Yalniz_Seyahat     dbl       0    0        2     0   0.55     1
## 18 Grup_Buyuklugu_Kat fct       0    0        4    NA  NA       NA

median_table <- train %>%
  mutate(Age_Group = cut(Age,
                         breaks = c(0, 12, 18, 30, 45, 60, Inf),
                         labels = c("Child", "Teen", "YoungAdult", "Adult", "MidAge", "Senior"),
                         right = FALSE)) %>%
  group_by(HomePlanet, Deck, CryoSleep, Destination, VIP, Grup_Buyuklugu_Kat, Side, Age_Group) %>%
  summarize(med = median(RoomService, na.rm = TRUE))

## `summarise()` has grouped output by 'HomePlanet', 'Deck', 'CryoSleep',
## 'Destination', 'VIP', 'Grup_Buyuklugu_Kat', 'Side'. You can override using the
## `.groups` argument.

train <- train %>%
  mutate(Age_Group = cut(Age,
                         breaks = c(0, 12, 18, 30, 45, 60, Inf),
                         labels = c("Child", "Teen", "YoungAdult", "Adult", "MidAge", "Senior"),
                         right = FALSE)) %>%
  left_join(median_table, by = c("HomePlanet", "Deck", "CryoSleep", "Destination", "VIP", "Grup_Buyuklugu_Kat", "Side", "Age_Group"))

train <- train %>%
  mutate(RoomService = ifelse(is.na(RoomService), med, RoomService)) %>%
  select(-med)

median_table_fc <- train %>%
  group_by(HomePlanet, Deck, CryoSleep, Destination, VIP, Grup_Buyuklugu_Kat, Side, Age_Group) %>%
  summarize(med = median(FoodCourt, na.rm = TRUE))

## `summarise()` has grouped output by 'HomePlanet', 'Deck', 'CryoSleep',
## 'Destination', 'VIP', 'Grup_Buyuklugu_Kat', 'Side'. You can override using the
## `.groups` argument.

train <- train  %>%
  left_join(median_table_fc, by = c("HomePlanet", "Deck", "CryoSleep", "Destination", "VIP", "Grup_Buyuklugu_Kat", "Side", "Age_Group"))

train <- train %>%
  mutate(FoodCourt = ifelse(is.na(FoodCourt), med, FoodCourt)) %>%
  select(-med)

median_table_sm <- train %>%
  group_by(HomePlanet, Deck, CryoSleep, Destination, VIP, Grup_Buyuklugu_Kat, Side, Age_Group) %>%
  summarize(med = median(ShoppingMall, na.rm = TRUE))

## `summarise()` has grouped output by 'HomePlanet', 'Deck', 'CryoSleep',
## 'Destination', 'VIP', 'Grup_Buyuklugu_Kat', 'Side'. You can override using the
## `.groups` argument.

train <- train %>%
  left_join(median_table_sm, by = c("HomePlanet", "Deck", "CryoSleep", "Destination", "VIP", "Grup_Buyuklugu_Kat", "Side", "Age_Group"))

train <- train %>%
  mutate(Spa = ifelse(is.na(Spa), med, Spa)) %>%
  select(-med)

median_table_spa <- train %>%
  group_by(HomePlanet, Deck, CryoSleep, Destination, VIP, Grup_Buyuklugu_Kat, Side, Age_Group) %>%
  summarize(med = median(Spa, na.rm = TRUE))

## `summarise()` has grouped output by 'HomePlanet', 'Deck', 'CryoSleep',
## 'Destination', 'VIP', 'Grup_Buyuklugu_Kat', 'Side'. You can override using the
## `.groups` argument.

train <- train %>%
  left_join(median_table_spa, by = c("HomePlanet", "Deck", "CryoSleep", "Destination", "VIP", "Grup_Buyuklugu_Kat", "Side", "Age_Group"))

train <- train %>%
  mutate(Spa = ifelse(is.na(Spa), med, Spa)) %>%
  select(-med)

median_table_vrd <- train %>%
  group_by(HomePlanet, Deck, CryoSleep, Destination, VIP, Grup_Buyuklugu_Kat, Side, Age_Group) %>%
  summarize(med = median(VRDeck, na.rm = TRUE))

## `summarise()` has grouped output by 'HomePlanet', 'Deck', 'CryoSleep',
## 'Destination', 'VIP', 'Grup_Buyuklugu_Kat', 'Side'. You can override using the
## `.groups` argument.

train <- train %>%
  left_join(median_table_vrd, by = c("HomePlanet", "Deck", "CryoSleep", "Destination", "VIP", "Grup_Buyuklugu_Kat", "Side", "Age_Group"))

train <- train %>%
  mutate(VRDeck = ifelse(is.na(VRDeck), med, VRDeck)) %>%
  select(-med)

describe_all(train)

## # A tibble: 19 × 8
##    variable           type     na na_pct unique   min   mean   max
##    <chr>              <chr> <int>  <dbl>  <int> <dbl>  <dbl> <dbl>
##  1 PassengerId        chr       0    0     8693    NA  NA       NA
##  2 Grup_No            fct       0    0     6217    NA  NA       NA
##  3 Kisi_No            int       0    0        8     1   1.52     8
##  4 HomePlanet         fct       0    0        3    NA  NA       NA
##  5 CryoSleep          fct       0    0        2    NA  NA       NA
##  6 Deck               fct       0    0        6    NA  NA       NA
##  7 Side               fct       0    0        2    NA  NA       NA
##  8 Destination        fct       0    0        3    NA  NA       NA
##  9 Age                dbl       0    0       80     0  28.8     79
## 10 VIP                fct       0    0        2    NA  NA       NA
## 11 RoomService        dbl      14    0.2   1285     0 222.   14327
## 12 FoodCourt          dbl      14    0.2   1522     0 454.   29813
## 13 ShoppingMall       dbl     208    2.4   1116     0 174.   23492
## 14 Spa                dbl       0    0     1343     0 305.   22408
## 15 VRDeck             dbl      13    0.1   1316     0 300.   24133
## 16 Transported        fct       0    0        2    NA  NA       NA
## 17 Yalniz_Seyahat     dbl       0    0        2     0   0.55     1
## 18 Grup_Buyuklugu_Kat fct       0    0        4    NA  NA       NA
## 19 Age_Group          fct       0    0        6    NA  NA       NA

test <- test %>%
  mutate(Age_Group = cut(Age,
                         breaks = c(0, 12, 18, 30, 45, 60, Inf),
                         labels = c("Child", "Teen", "YoungAdult", "Adult", "MidAge", "Senior"),
                         right = FALSE)) %>%
  left_join(median_table, by = c("HomePlanet", "Deck", "CryoSleep", "Destination", "VIP", "Grup_Buyuklugu_Kat", "Side", "Age_Group"))

test <- test %>%
  mutate(RoomService = ifelse(is.na(RoomService), med, RoomService)) %>%
  select(-med)

test <- test  %>%
  left_join(median_table_fc, by = c("HomePlanet", "Deck", "CryoSleep", "Destination", "VIP", "Grup_Buyuklugu_Kat", "Side", "Age_Group"))

test <- test %>%
  mutate(FoodCourt = ifelse(is.na(FoodCourt), med, FoodCourt)) %>%
  select(-med)

test <- test %>%
  left_join(median_table_sm, by = c("HomePlanet", "Deck", "CryoSleep", "Destination", "VIP", "Grup_Buyuklugu_Kat", "Side", "Age_Group"))

test <- test %>%
  mutate(ShoppingMall = ifelse(is.na(ShoppingMall), med, ShoppingMall)) %>%
  select(-med)

test <- test %>%
  left_join(median_table_spa, by = c("HomePlanet", "Deck", "CryoSleep", "Destination", "VIP", "Grup_Buyuklugu_Kat", "Side", "Age_Group"))

test <- test %>%
  mutate(Spa = ifelse(is.na(Spa), med, Spa)) %>%
  select(-med)

test <- test %>%
  left_join(median_table_vrd, by = c("HomePlanet", "Deck", "CryoSleep", "Destination", "VIP", "Grup_Buyuklugu_Kat", "Side", "Age_Group"))

test <- test %>%
  mutate(VRDeck = ifelse(is.na(VRDeck), med, VRDeck)) %>%
  select(-med)

describe_all(test)

## # A tibble: 18 × 8
##    variable           type     na na_pct unique   min   mean   max
##    <chr>              <chr> <int>  <dbl>  <int> <dbl>  <dbl> <dbl>
##  1 PassengerId        chr       0    0     4277    NA  NA       NA
##  2 Grup_No            fct       0    0     3063    NA  NA       NA
##  3 Kisi_No            int       0    0        8     1   1.5      8
##  4 HomePlanet         fct       0    0        3    NA  NA       NA
##  5 CryoSleep          fct       0    0        2    NA  NA       NA
##  6 Deck               fct       0    0        6    NA  NA       NA
##  7 Side               fct       0    0        2    NA  NA       NA
##  8 Destination        fct       0    0        3    NA  NA       NA
##  9 Age                dbl       0    0       79     0  28.7     79
## 10 VIP                fct       0    0        2    NA  NA       NA
## 11 RoomService        dbl       0    0      850     0 217.   11567
## 12 FoodCourt          dbl       3    0.1    916     0 435.   25273
## 13 ShoppingMall       dbl       8    0.2    728     0 175.    8292
## 14 Spa                dbl       8    0.2    841     0 298    19844
## 15 VRDeck             dbl       4    0.1    807     0 308.   22272
## 16 Yalniz_Seyahat     dbl       0    0        2     0   0.55     1
## 17 Grup_Buyuklugu_Kat fct       0    0        4    NA  NA       NA
## 18 Age_Group          fct       0    0        6    NA  NA       NA

train <- train %>%
  mutate(RoomService = coalesce(RoomService, 0),
         FoodCourt   = coalesce(FoodCourt, 0),
         ShoppingMall= coalesce(ShoppingMall, 0),
         Spa         = coalesce(Spa, 0),
         VRDeck      = coalesce(VRDeck, 0)
  )

test <- test %>%
  mutate(RoomService = coalesce(RoomService, 0),
         FoodCourt   = coalesce(FoodCourt, 0),
         ShoppingMall= coalesce(ShoppingMall, 0),
         Spa         = coalesce(Spa, 0),
         VRDeck      = coalesce(VRDeck, 0)
  )

train <- train %>%
  select(-c(Grup_No, Kisi_No, Yalniz_Seyahat))
test <- test %>%
  select(-c(Grup_No, Kisi_No, Yalniz_Seyahat))

library(tidymodels)

## ── Attaching packages ────────────────────────────────────── tidymodels 1.4.1 ──

## ✔ broom        1.0.11     ✔ rsample      1.3.1 
## ✔ dials        1.4.2      ✔ tailor       0.1.0 
## ✔ infer        1.1.0      ✔ tune         2.0.1 
## ✔ modeldata    1.5.1      ✔ workflows    1.3.0 
## ✔ parsnip      1.4.0      ✔ workflowsets 1.1.1 
## ✔ recipes      1.3.1      ✔ yardstick    1.3.2

## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ scales::discard() masks purrr::discard()
## ✖ dplyr::filter()   masks stats::filter()
## ✖ recipes::fixed()  masks stringr::fixed()
## ✖ dplyr::lag()      masks stats::lag()
## ✖ dials::prune()    masks rpart::prune()
## ✖ yardstick::spec() masks readr::spec()
## ✖ recipes::step()   masks stats::step()

set.seed(123)

data_split <- initial_split(train, prop = 0.8, strata = Transported)
train_data  <- training(data_split)
test_data   <- testing(data_split)
rs_recipe <- recipe(Transported ~ ., data = train_data) %>%
  update_role(PassengerId, Age, new_role = "ID") %>%
  step_dummy(all_nominal_predictors(), one_hot = TRUE) %>%
  step_normalize(all_numeric_predictors())
log_spec <- logistic_reg() %>%
  set_engine("glm")
log_workflow <- workflow() %>%
  add_recipe(rs_recipe) %>%
  add_model(log_spec)
log_fit <- log_workflow %>%
  fit(data = train_data)

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

log_preds <- predict(log_fit, test_data, type = "prob")
head(log_preds)

## # A tibble: 6 × 2
##   .pred_FALSE .pred_TRUE
##         <dbl>      <dbl>
## 1       0.789    0.211  
## 2       0.205    0.795  
## 3       0.843    0.157  
## 4       0.302    0.698  
## 5       0.993    0.00690
## 6       0.223    0.777

log_class <- predict(log_fit, test_data)
results <- test_data %>%
  select(Transported) %>%
  bind_cols(log_class)

log_acc <- accuracy(data = results,
                    truth = Transported,
                    estimate = .pred_class)
log_acc

## # A tibble: 1 × 3
##   .metric  .estimator .estimate
##   <chr>    <chr>          <dbl>
## 1 accuracy binary         0.794

conf_mat_log <- conf_mat(data = results,
                         truth = Transported,
                         estimate = .pred_class)
conf_mat_log

##           Truth
## Prediction FALSE TRUE
##      FALSE   663  158
##      TRUE    200  718

pred_prob  <- predict(log_fit, test_data, type = "prob")
results <- results %>%
  bind_cols(pred_prob)

autoplot(conf_mat_log, type = "heatmap") +
  scale_fill_gradient(low = "white", high = "steelblue") +
  labs(title = "Confusion Matrix - Isı Haritası")

## Scale for fill is already present.
## Adding another scale for fill, which will replace the existing scale.

results <- results %>%
  mutate(Transported = relevel(Transported, ref = "TRUE"))

roc_data <- roc_curve(
  data  = results,
  truth = Transported,
  .pred_TRUE
)

autoplot(roc_data) +
  labs(title = "ROC Eğrisi", subtitle = "Lojistik Regresyon Modeli")

logistic_recipe <- recipe(Transported ~ ., data = train) %>%
  update_role(PassengerId, Age, new_role = "ID") %>%
  step_dummy(all_nominal_predictors(), one_hot = TRUE) %>%
  step_normalize(all_numeric_predictors())
log_spec <- logistic_reg() %>%
  set_engine("glm")
log_workflow <- workflow() %>%
  add_recipe(logistic_recipe) %>%
  add_model(log_spec)
log_fit <- log_workflow %>%
  fit(data = train)

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

library(stringr)

tahminler <- log_fit %>%
  predict(new_data = test) %>%     
  bind_cols(test)                  
submission <- tahminler %>% 
  select(PassengerId = PassengerId, .pred_class = .pred_class) %>%  
  rename(Transported = .pred_class)             
submission <- as.data.frame(submission)
submission$Transported <- str_to_title(submission$Transported)
write.csv(submission, "logistic.csv", row.names = FALSE, quote = FALSE)

library(tidyverse)
library(randomForest)

## randomForest 4.7-1.2

## Type rfNews() to see new features/changes/bug fixes.

## 
## Attaching package: 'randomForest'

## The following object is masked from 'package:dplyr':
## 
##     combine

## The following object is masked from 'package:ggplot2':
## 
##     margin

library(e1071)

## 
## Attaching package: 'e1071'

## The following object is masked from 'package:tune':
## 
##     tune

## The following object is masked from 'package:rsample':
## 
##     permutations

## The following object is masked from 'package:parsnip':
## 
##     tune

## The following object is masked from 'package:ggplot2':
## 
##     element

library(caret)

## Zorunlu paket yükleniyor: lattice

## 
## Attaching package: 'caret'

## The following objects are masked from 'package:yardstick':
## 
##     precision, recall, sensitivity, specificity

## The following object is masked from 'package:rsample':
## 
##     calibration

## The following object is masked from 'package:purrr':
## 
##     lift

library(xgboost)

train$Transported <- as.factor(train$Transported)

randomforest

library(tidyverse)
library(forcats)
library(randomForest)

rf_final <- randomForest(
  Transported ~ .,
  data = train,
  ntree = 500
)

rf_final

## 
## Call:
##  randomForest(formula = Transported ~ ., data = train, ntree = 500) 
##                Type of random forest: classification
##                      Number of trees: 500
## No. of variables tried at each split: 3
## 
##         OOB estimate of  error rate: 19.36%
## Confusion matrix:
##       FALSE TRUE class.error
## FALSE  3383  932   0.2159907
## TRUE    751 3627   0.1715395

library(caret)
rf_train_pred <- predict(rf_final, train)

confusionMatrix(rf_train_pred, train$Transported)

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction FALSE TRUE
##      FALSE  3700   16
##      TRUE    615 4362
##                                           
##                Accuracy : 0.9274          
##                  95% CI : (0.9218, 0.9328)
##     No Information Rate : 0.5036          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.8547          
##                                           
##  Mcnemar's Test P-Value : < 2.2e-16       
##                                           
##             Sensitivity : 0.8575          
##             Specificity : 0.9963          
##          Pos Pred Value : 0.9957          
##          Neg Pred Value : 0.8764          
##              Prevalence : 0.4964          
##          Detection Rate : 0.4256          
##    Detection Prevalence : 0.4275          
##       Balanced Accuracy : 0.9269          
##                                           
##        'Positive' Class : FALSE           
##

Modelin performansı genel olarak oldukça yüksektir. Confusion matrix incelendiğinde modelin FALSE sınıfını (pozitif sınıf) doğru tahmin etme konusunda çok başarılı olduğu görülmektedir. Gerçekten FALSE olan 3699 gözlem doğru şekilde FALSE olarak sınıflandırılmış, sadece 16 TRUE gözlem yanlışlıkla FALSE olarak tahmin edilmiştir. Bu durum modelin yanlış alarm üretme olasılığının çok düşük olduğunu göstermektedir.

Modelin genel doğruluğu %92.7’dir. Bu değer, modelin büyük çoğunlukta doğru tahmin yaptığını göstermektedir. Ayrıca No Information Rate %50.36 iken doğruluk oranının bunun çok üzerinde olması ve p-değerinin 2.2e-16’dan küçük olması, modelin rastgele ya da çoğunluk sınıfını tahmin eden basit bir modelden anlamlı derecede daha iyi olduğunu ortaya koymaktadır.

Kappa katsayısının 0.8544 olması, model ile gerçek değerler arasında çok güçlü bir uyum bulunduğunu ve bu başarının tesadüfe bağlı olmadığını göstermektedir. Bu değer, sınıflandırma modelleri için oldukça yüksek ve güven verici bir seviyededir.

Duyarlılık (Sensitivity) değeri %85.7’dir. Bu, modelin gerçekten FALSE olan gözlemlerin yaklaşık %86’sını doğru şekilde yakalayabildiğini, ancak %14 civarında FALSE gözlemin TRUE olarak kaçırıldığını göstermektedir. Buna karşılık özgüllük (Specificity) %99.6 gibi çok yüksek bir değere sahiptir; yani TRUE olan gözlemlerin neredeyse tamamı doğru şekilde TRUE olarak sınıflandırılmıştır.

Pozitif kestirim değeri (Positive Predictive Value) %99.6’dır. Bu, model FALSE tahmini yaptığında bunun neredeyse her zaman doğru olduğu anlamına gelir. Negatif kestirim değeri (Negative Predictive Value) ise %87.6 olup, TRUE tahminlerinin büyük ölçüde doğru olduğunu ancak FALSE tahminlerine göre biraz daha fazla hata içerdiğini göstermektedir.

Dengeli doğruluk (Balanced Accuracy) değerinin %92.7 olması, modelin her iki sınıfı da adil bir şekilde ve yüksek başarıyla ayırt edebildiğini göstermektedir. McNemar testinin anlamlı çıkması ise modelin hata türlerinin simetrik olmadığını, yani özellikle FALSE sınıfının bir kısmının TRUE olarak tahmin edilmesi yönünde hata eğilimi bulunduğunu göstermektedir.

Özetle, model genel olarak çok güçlü ve güvenilir, özellikle yanlış pozitif üretme riskinin düşük olması gereken senaryolar için oldukça uygundur. Ancak FALSE sınıfını kaçırmanın maliyeti yüksekse, duyarlılığı artıracak ek ayarlamalar (eşik değeri, sınıf ağırlıkları vb.) düşünülebilir.

rf_test_pred <- predict(rf_final, test)

pred_table <- data.frame(
  PassengerId = test$PassengerId,
  Prediction = rf_test_pred
)

head(pred_table)

##   PassengerId Prediction
## 1     0013_01       TRUE
## 2     0018_01      FALSE
## 3     0019_01       TRUE
## 4     0021_01       TRUE
## 5     0023_01       TRUE
## 6     0027_01       TRUE

pred_table$Group <- substr(pred_table$PassengerId, 1, 4)

final_pred <- pred_table
final_pred$FinalPrediction <- ave(
  final_pred$Prediction,
  final_pred$Group,
  FUN = function(x) {
    sum(x == TRUE) > sum(x == FALSE)
  }
)

head(final_pred, 20)

##    PassengerId Prediction Group FinalPrediction
## 1      0013_01       TRUE  0013            TRUE
## 2      0018_01      FALSE  0018           FALSE
## 3      0019_01       TRUE  0019            TRUE
## 4      0021_01       TRUE  0021            TRUE
## 5      0023_01       TRUE  0023            TRUE
## 6      0027_01       TRUE  0027            TRUE
## 7      0029_01       TRUE  0029            TRUE
## 8      0032_01       TRUE  0032            TRUE
## 9      0032_02       TRUE  0032            TRUE
## 10     0033_01       TRUE  0033            TRUE
## 11     0037_01      FALSE  0037           FALSE
## 12     0040_01      FALSE  0040           FALSE
## 13     0040_02       TRUE  0040           FALSE
## 14     0042_01       TRUE  0042            TRUE
## 15     0046_01      FALSE  0046           FALSE
## 16     0046_02      FALSE  0046           FALSE
## 17     0046_03      FALSE  0046           FALSE
## 18     0047_01       TRUE  0047            TRUE
## 19     0047_02       TRUE  0047            TRUE
## 20     0047_03      FALSE  0047            TRUE

submission <- data.frame(
  PassengerId = final_pred$PassengerId,
  Transported = final_pred$FinalPrediction
)

write.csv(submission, "submission_grouped.csv", row.names = FALSE)

rf_train_pred <- predict(rf_final, train)

cm_rf <- table(
  Prediction = rf_train_pred,
  Reference  = train$Transported
)

cm_rf_df <- as.data.frame(cm_rf)

library(ggplot2)

ggplot(cm_rf_df, aes(x = Reference, y = Prediction, fill = Freq)) +
  geom_tile() +
  geom_text(aes(label = Freq), color = "white", size = 6) +
  scale_fill_gradient(low = "steelblue", high = "darkred") +
  labs(
    title = "Random Forest Confusion Matrix",
    x = "Actual",
    y = "Predicted"
  ) +
  theme_minimal()

library(e1071)

train_svm <- train %>% select(-PassengerId)
test_svm  <- test  %>% select(-PassengerId)

library(e1071)

svm_final <- svm(
  Transported ~ .,
  data = train_svm,
  kernel = "radial",
  scale = TRUE
)

library(caret)

svm_train_pred <- predict(svm_final, train)
confusionMatrix(svm_train_pred, train$Transported)

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction FALSE TRUE
##      FALSE  3427  785
##      TRUE    888 3593
##                                           
##                Accuracy : 0.8075          
##                  95% CI : (0.7991, 0.8158)
##     No Information Rate : 0.5036          
##     P-Value [Acc > NIR] : < 2e-16         
##                                           
##                   Kappa : 0.615           
##                                           
##  Mcnemar's Test P-Value : 0.01264         
##                                           
##             Sensitivity : 0.7942          
##             Specificity : 0.8207          
##          Pos Pred Value : 0.8136          
##          Neg Pred Value : 0.8018          
##              Prevalence : 0.4964          
##          Detection Rate : 0.3942          
##    Detection Prevalence : 0.4845          
##       Balanced Accuracy : 0.8075          
##                                           
##        'Positive' Class : FALSE           
##

Modelin sonuçları incelendiğinde, performansın orta–iyi düzeyde olduğu görülmektedir. Confusion matrix’e göre model, FALSE (pozitif sınıf) olan 3427 gözlemi doğru şekilde FALSE olarak tahmin ederken, 888 FALSE gözlemi TRUE olarak tahmin ederek kaçırmıştır. Buna karşılık, TRUE olan 3593 gözlem doğru sınıflandırılmış, ancak 785 TRUE gözlem yanlışlıkla FALSE olarak tahmin edilmiştir. Bu tablo, modelin her iki sınıf için de hatalar yaptığını ve önceki modele kıyasla hata sayılarının belirgin biçimde arttığını göstermektedir.

Genel doğruluk (Accuracy) %80.75’tir. Bu değer, modelin tahminlerinin yaklaşık beşte dördünün doğru olduğunu ifade eder. No Information Rate’in %50.36 olduğu dikkate alındığında ve buna ait p-değerinin 2e-16’dan küçük olması, modelin çoğunluk sınıfını tahmin eden basit bir yaklaşımdan anlamlı derecede daha iyi olduğunu göstermektedir.

Kappa katsayısı 0.615’tir. Bu değer, model ile gerçek sınıflar arasında iyi fakat çok güçlü olmayan bir uyuma işaret eder. Yani modelin başarısı tesadüften uzak olsa da mükemmel değildir. McNemar testinin p-değerinin 0.01264 olması, modelin hata türlerinin simetrik olmadığını ve belirli bir yönde sistematik hata eğilimi bulunduğunu göstermektedir.

Duyarlılık (Sensitivity) %79.4’tür. Bu, modelin gerçekten FALSE olan gözlemlerin yaklaşık %79’unu doğru yakalayabildiğini, ancak %21’lik bir kısmını kaçırdığını gösterir. Özgüllük (Specificity) %82.1 olup, TRUE sınıfının da benzer düzeyde doğru tahmin edildiğini ifade eder. Yani model her iki sınıfa da benzer performansla yaklaşmaktadır, ancak hiçbirinde çok yüksek bir başarı yoktur.

Pozitif kestirim değeri %81.4’tür; model FALSE tahmini yaptığında bunun doğru olma olasılığı yaklaşık %81’dir. Negatif kestirim değeri %80.2 olup, TRUE tahminlerinin de benzer güvenilirlikte olduğunu göstermektedir. Dengeli doğruluk (Balanced Accuracy) değerinin %80.75 olması, modelin sınıflar arasında ciddi bir dengesizlik olmadan çalıştığını ancak genel başarının orta seviyede kaldığını göstermektedir.

Özetle bu model, her iki sınıfı da makul düzeyde ayırt edebilen, ancak önceki modele kıyasla belirgin biçimde daha zayıf olan bir performans sergilemektedir. Eğer hata maliyetleri yüksekse veya daha güvenilir tahminler gerekiyorsa, modelin iyileştirilmesi (özellik seçimi, model türü, eşik ayarı veya sınıf ağırlıkları) faydalı olacaktır.

cm_svm <- table(
  Prediction = svm_train_pred,
  Reference  = train$Transported
)

mosaicplot(
  cm_svm,
  color = TRUE,
  main = "SVM Confusion Matrix",
  xlab = "Reference",
  ylab = "Prediction"
)

fourfoldplot(
  cm_svm,
  color = c("#CC6666", "#99CC99"),
  main = "SVM Confusion Matrix"
)

library(ggplot2)

cm_df <- as.data.frame(cm_svm)

ggplot(cm_df, aes(x = Reference, y = Prediction, fill = Freq)) +
  geom_tile() +
  geom_text(aes(label = Freq), color = "white", size = 6) +
  scale_fill_gradient(low = "steelblue", high = "darkred") +
  labs(
    title = "SVM Confusion Matrix",
    x = "Actual",
    y = "Predicted"
  ) +
  theme_minimal()

library(xgboost)
library(caret)

y_train <- ifelse(train$Transported == TRUE, 1, 0)

x_train <- train %>%
  select(-Transported, -PassengerId) %>%
  mutate(across(where(is.logical), as.numeric)) %>%
  mutate(across(where(is.factor), as.numeric))

dtrain <- xgb.DMatrix(
  data = as.matrix(x_train),
  label = y_train
)

xgb_model <- xgb.train(
  data = dtrain,
  nrounds = 150,
  objective = "binary:logistic",
  eval_metric = "logloss",
  max_depth = 6,
  eta = 0.1
)

## Warning in check.deprecation(deprecated_train_params, match.call(), ...):
## Passed invalid function arguments: eval_metric, max_depth, eta. These should be
## passed as a list to argument 'params'. Conversion from argument to 'params'
## entry will be done automatically, but this behavior will become an error in a
## future version.

## Warning in check.custom.obj(params, objective): Argument 'objective' is only
## for custom objectives. For built-in objectives, pass the objective under
## 'params'. This warning will become an error in a future version.

xgb_prob <- predict(xgb_model, dtrain)
xgb_pred <- ifelse(xgb_prob > 0.5, TRUE, FALSE)

confusionMatrix(as.factor(xgb_pred), train$Transported)

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction FALSE TRUE
##      FALSE  3647  564
##      TRUE    668 3814
##                                           
##                Accuracy : 0.8583          
##                  95% CI : (0.8508, 0.8655)
##     No Information Rate : 0.5036          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.7165          
##                                           
##  Mcnemar's Test P-Value : 0.003341        
##                                           
##             Sensitivity : 0.8452          
##             Specificity : 0.8712          
##          Pos Pred Value : 0.8661          
##          Neg Pred Value : 0.8510          
##              Prevalence : 0.4964          
##          Detection Rate : 0.4195          
##    Detection Prevalence : 0.4844          
##       Balanced Accuracy : 0.8582          
##                                           
##        'Positive' Class : FALSE           
##

Bu modelin sonuçları, iyi düzeyde ve dengeli bir sınıflandırma performansına işaret etmektedir. Confusion matrix incelendiğinde, pozitif sınıf olarak tanımlanan FALSE gözlemlerinin büyük bir kısmı doğru şekilde sınıflandırılmıştır. Gerçekte FALSE olan 3647 gözlem doğru biçimde FALSE olarak tahmin edilirken, 668 FALSE gözlem TRUE olarak tahmin edilerek kaçırılmıştır. TRUE sınıfında ise 3814 gözlem doğru tahmin edilmiş, 564 gözlem yanlışlıkla FALSE olarak sınıflandırılmıştır. Bu durum, modelin her iki sınıfta da hata yaptığını ancak hataların makul ve dengeli olduğunu göstermektedir.

Modelin genel doğruluğu %85.8’dir. Bu oran, tahminlerin büyük çoğunluğunun doğru olduğunu ifade eder. No Information Rate’in %50.36 olması ve buna karşılık doğruluk oranının anlamlı biçimde yüksek çıkması (p < 2.2e-16), modelin rastgele ya da çoğunluk sınıfını tahmin eden basit bir yaklaşımdan çok daha iyi performans gösterdiğini kanıtlamaktadır.

Kappa katsayısının 0.7165 olması, model ile gerçek sınıflar arasında iyi düzeyde bir uyum bulunduğunu göstermektedir. Bu değer, model başarısının tesadüfe bağlı olmadığını ancak mükemmel seviyeye henüz ulaşmadığını ifade eder. McNemar testinin anlamlı çıkması (p = 0.003341) ise modelin hata türlerinin tamamen simetrik olmadığını, yani belirli bir yönde hata yapma eğilimi bulunduğunu göstermektedir.

Duyarlılık (Sensitivity) %84.5’tir. Bu, modelin gerçekten FALSE olan gözlemlerin yaklaşık %85’ini doğru şekilde yakalayabildiğini, %15 civarında FALSE gözlemi ise TRUE olarak kaçırdığını göstermektedir. Özgüllük (Specificity) %87.1 olup, TRUE sınıfının da benzer bir başarıyla sınıflandırıldığını ortaya koymaktadır. Bu değerler, modelin iki sınıf arasında dengeli bir ayırım yaptığını göstermektedir.

Pozitif kestirim değeri %86.6’dır; yani model FALSE tahmini yaptığında bunun doğru olma olasılığı yüksektir. Negatif kestirim değeri %85.1 olup, TRUE tahminlerinin de büyük ölçüde güvenilir olduğunu göstermektedir. Dengeli doğruluğun %85.8 olması da bu dengeli performansı desteklemektedir.

Özetle bu model, önceki modele kıyasla daha güçlü ve daha kararlı, ancak ilk paylaştığın çok yüksek performanslı modele göre biraz daha zayıf bir sonuç sunmaktadır. Yine de genel kullanım için güvenilir, hataların maliyetinin aşırı yüksek olmadığı senaryolarda rahatlıkla tercih edilebilecek bir modeldir.

list.files()

##  [1] "desktop.ini"            "ek.Rproj"               "hakkimizda.Rmd"        
##  [4] "index.Rmd"              "kggle.html"             "kggle.Rmd"             
##  [7] "kggle_files"            "logistic.csv"           "submission_grouped.csv"
## [10] "test.csv"               "train.csv"

RPUS

Şerife Sıla Özen