# Author: Amr Atrash
# packages ===================================================================
library(dplyr)
## Warning: package 'dplyr' was built under R version 4.2.3
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tibble)
## Warning: package 'tibble' was built under R version 4.2.3
library(rpart)
library(rpart.plot)
## Warning: package 'rpart.plot' was built under R version 4.2.3
# data =======================================================================
# Load the training and test sets as tibbles.
# as_tibble() is the documented data.frame -> tibble coercion; tibble() only
# worked here because it splices an unnamed data frame argument.
train <- tibble::as_tibble(read.csv("train.csv"))
test <- tibble::as_tibble(read.csv("test.csv"))
# exploratory analysis =======================================================
# List the variables available in the training set.
names(train)
## [1] "PassengerId" "HomePlanet" "CryoSleep" "Cabin" "Destination"
## [6] "Age" "VIP" "RoomService" "FoodCourt" "ShoppingMall"
## [11] "Spa" "VRDeck" "Name" "Transported"
# Cross-tabulate HomePlanet against Transported: one row per planet and one
# count column per outcome.
train %>%
  dplyr::count(Transported, HomePlanet) %>%
  tidyr::pivot_wider(
    id_cols = HomePlanet,
    names_from = Transported,
    values_from = n
  )
## # A tibble: 4 × 3
## HomePlanet False True
## <chr> <int> <int>
## 1 "" 98 103
## 2 "Earth" 2651 1951
## 3 "Europa" 727 1404
## 4 "Mars" 839 920
# Europa stands out: most of its passengers were transported (1404 of 2131).
# Cross-tabulate CryoSleep against Transported: one row per CryoSleep value
# and one count column per outcome.
train %>%
  dplyr::count(Transported, CryoSleep) %>%
  tidyr::pivot_wider(
    id_cols = CryoSleep,
    names_from = Transported,
    values_from = n
  )
## # A tibble: 3 × 3
## CryoSleep False True
## <chr> <int> <int>
## 1 "" 111 106
## 2 "False" 3650 1789
## 3 "True" 554 2483
# passengers in CryoSleep are much more likely to be transported (2483 of 3037)
# Same CryoSleep vs Transported table, restricted to passengers whose
# HomePlanet is "Europa".
train %>%
  dplyr::filter(HomePlanet == "Europa") %>%
  dplyr::count(Transported, CryoSleep) %>%
  tidyr::pivot_wider(
    id_cols = CryoSleep,
    names_from = Transported,
    values_from = n
  )
## # A tibble: 3 × 3
## CryoSleep False True
## <chr> <int> <int>
## 1 "" 20 38
## 2 "False" 697 465
## 3 "True" 10 901
# Cross-tabulate the cabin side against Transported.
# The side is whatever follows the last "/" in Cabin (Cabin appears to be
# formatted as deck/num/side); an empty Cabin yields an empty side.
# Only `side` is derived here: the deck and num columns the original pipeline
# also computed were never used by this table.
train %>%
  dplyr::mutate(side = stringr::str_remove(Cabin, "^.*/")) %>%
  dplyr::count(Transported, side) %>%
  tidyr::pivot_wider(
    id_cols = "side",
    names_from = "Transported",
    values_from = "n"
  )
## # A tibble: 3 × 3
## side False True
## <chr> <int> <int>
## 1 "" 99 100
## 2 "P" 2308 1898
## 3 "S" 1908 2380
# no conclusions
# Cross-tabulate Destination against Transported: one row per destination
# and one count column per outcome.
train %>%
  dplyr::count(Transported, Destination) %>%
  tidyr::pivot_wider(
    id_cols = Destination,
    names_from = Transported,
    values_from = n
  )
## # A tibble: 4 × 3
## Destination False True
## <chr> <int> <int>
## 1 "" 90 92
## 2 "55 Cancri e" 702 1098
## 3 "PSO J318.5-22" 395 401
## 4 "TRAPPIST-1e" 3128 2787
# no conclusions
# Cross-tabulate Age against Transported: one row per distinct age and one
# count column per outcome.
train %>%
  dplyr::count(Transported, Age) %>%
  tidyr::pivot_wider(
    id_cols = Age,
    names_from = Transported,
    values_from = n
  )
## # A tibble: 81 × 3
## Age False True
## <dbl> <int> <int>
## 1 0 34 144
## 2 1 18 49
## 3 2 22 53
## 4 3 16 59
## 5 4 18 53
## 6 5 13 20
## 7 6 17 23
## 8 7 20 32
## 9 8 20 26
## 10 9 18 24
## # ℹ 71 more rows
# no conclusions
# Cross-tabulate VIP against Transported: one row per VIP value and one
# count column per outcome.
train %>%
  dplyr::count(Transported, VIP) %>%
  tidyr::pivot_wider(
    id_cols = VIP,
    names_from = Transported,
    values_from = n
  )
## # A tibble: 3 × 3
## VIP False True
## <chr> <int> <int>
## 1 "" 99 104
## 2 "False" 4093 4198
## 3 "True" 123 76
# no conclusions
# Count passengers per VRDeck spend and outcome, highest spend first.
# na.omit() removes every row that contains an NA, which includes any spend
# value observed for only one outcome (its other count column is NA).
train %>%
  dplyr::count(Transported, VRDeck) %>%
  tidyr::pivot_wider(
    id_cols = VRDeck,
    names_from = Transported,
    values_from = n
  ) %>%
  dplyr::arrange(dplyr::desc(VRDeck)) %>%
  stats::na.omit()
## # A tibble: 296 × 3
## VRDeck False True
## <dbl> <int> <int>
## 1 2577 1 1
## 2 2376 1 1
## 3 2260 1 1
## 4 1804 1 1
## 5 1674 2 1
## 6 1539 1 1
## 7 1514 1 1
## 8 1479 1 1
## 9 1460 1 1
## 10 1384 1 1
## # ℹ 286 more rows
# using luxury amenities decreases chances of being transported
# Split Name into first and last name and, for Europa passengers, count each
# last name by outcome, most-transported names first.
#
# Fix: last_name used to take element [1] of the split name, exactly like
# first_name, so the table was keyed on first names. It now keeps the text
# after the last space. An empty Name still yields "" for both parts.
# NOTE: the table recorded below was printed before this fix and therefore
# lists first names.
train %>%
  # Filtering first gives the same Europa rows while counting far fewer groups.
  dplyr::filter(HomePlanet == "Europa") %>%
  dplyr::mutate(
    first_name = stringr::str_remove(Name, " .*$"),
    last_name = stringr::str_remove(Name, "^.* ")
  ) %>%
  dplyr::count(Transported, last_name, HomePlanet) %>%
  tidyr::pivot_wider(
    id_cols = c("last_name", "HomePlanet"),
    names_from = "Transported",
    values_from = "n"
  ) %>%
  dplyr::arrange(dplyr::desc(True))
## # A tibble: 890 × 4
## last_name HomePlanet False True
## <chr> <chr> <int> <int>
## 1 "" Europa 17 29
## 2 "Minoton" Europa 1 7
## 3 "Betenar" Europa NA 7
## 4 "Dyonon" Europa NA 7
## 5 "Okulas" Europa NA 6
## 6 "Terope" Europa NA 6
## 7 "Dabik" Europa 2 5
## 8 "Mothab" Europa 2 5
## 9 "Sinon" Europa 1 5
## 10 "Acruxon" Europa NA 5
## # ℹ 880 more rows
# no conclusions
# decision tree ==============================================================
# Modelling table built from train:
#   - Transported becomes a factor whose first level is "True";
#   - Cabin is split into deck, num (numeric) and side;
#   - luxury is the sum of the five spend columns, so it is NA whenever any
#     one of them is NA.
# A filter dropping rows with CryoSleep == "" was tried here and left disabled.
train_ok <- dplyr::mutate(
  train,
  Transported = factor(Transported, levels = c("True", "False")),
  deck = stringr::str_remove(Cabin, "\\/.*"),
  num = as.numeric(
    stringr::str_remove(stringr::str_remove(Cabin, "\\/.$"), ".*\\/")
  ),
  side = stringr::str_remove(Cabin, ".*/\\.*"),
  luxury = RoomService + FoodCourt + ShoppingMall + Spa + VRDeck
)
# Fit a classification tree for Transported using only CryoSleep and Age,
# then plot it.
vars <- c("Transported", "CryoSleep", "Age")
m <- rpart::rpart(
  formula = Transported ~ .,
  data = train_ok[, vars]
)
rpart.plot::rpart.plot(m)
# In-sample evaluation: the predictions are made on the same rows the tree
# was fitted on, so the reported metrics are optimistic.
p <- predict(m, newdata = train_ok, type = "class")
caret::confusionMatrix(
  data = p,
  reference = train_ok$Transported,
  positive = "True"
)
## Confusion Matrix and Statistics
##
## Reference
## Prediction True False
## True 2774 692
## False 1604 3623
##
## Accuracy : 0.7359
## 95% CI : (0.7265, 0.7451)
## No Information Rate : 0.5036
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.4725
##
## Mcnemar's Test P-Value : < 2.2e-16
##
## Sensitivity : 0.6336
## Specificity : 0.8396
## Pos Pred Value : 0.8003
## Neg Pred Value : 0.6931
## Prevalence : 0.5036
## Detection Rate : 0.3191
## Detection Prevalence : 0.3987
## Balanced Accuracy : 0.7366
##
## 'Positive' Class : True
##
# Score the test set: take the class probabilities from the tree and label a
# passenger "True" only when P(True) is strictly greater than P(False).
# The result is printed, not stored or written to disk.
m %>%
  predict(newdata = test, type = "prob") %>%
  tibble::as_tibble() %>%
  dplyr::transmute(
    PassengerId = test$PassengerId,
    Transported = ifelse(True > False, "True", "False")
  )
## # A tibble: 4,277 × 2
## PassengerId Transported
## <chr> <chr>
## 1 0013_01 True
## 2 0018_01 False
## 3 0019_01 True
## 4 0021_01 False
## 5 0023_01 False
## 6 0027_01 False
## 7 0029_01 True
## 8 0032_01 True
## 9 0032_02 True
## 10 0033_01 False
## # ℹ 4,267 more rows
knitr::include_graphics("5.jpeg")
# Accuracy: the model's overall accuracy is 73.59%, i.e. it predicts the
# actual outcome correctly about 73.59% of the time.
# Sensitivity (true positive rate): the model correctly identifies 63.36% of
# the passengers who really were transported.
# Specificity (true negative rate): the model correctly identifies 83.96% of
# the passengers who were not transported.
# Positive predictive value (precision): a passenger the model predicts as
# transported really was transported 80.03% of the time.
#
# Kaggle results.
knitr::include_graphics("4.jpeg")