library(readr)
train <- read_csv("train.csv")
## Rows: 8693 Columns: 14
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (5): PassengerId, HomePlanet, Cabin, Destination, Name
## dbl (6): Age, RoomService, FoodCourt, ShoppingMall, Spa, VRDeck
## lgl (3): CryoSleep, VIP, Transported
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
train.csv: Yolcuların yaklaşık üçte ikisinin (~8700) kişisel kayıtları, eğitim verileri olarak kullanılacak
PassengerId: Her yolcu için benzersiz bir Kimlik. Her kimlik, yolcunun birlikte seyahat ettiği grubu belirten ve grup içindeki numarası olan gggg_ppformu alır . Bir gruptaki insanlar çoğunlukla aile üyeleridir, ancak her zaman değil.ggggpp
HomePlanet:Yolcunun ayrıldığı gezegen, genellikle daimi ikamet ettikleri gezegen.
CryoSleep: Yolcunun yolculuk süresince askıya alınmış animasyona alınmayı seçip seçmediğini belirtir. Dondurucu uykudaki yolcular kabinlerine hapsedilir.
Cabin: Yolcunun kaldığı kabin numarası. deck/num/sideİskele sideveya PSancak formunu alır . _ _ S_
Destination: Yolcunun ineceği gezegen.
Age: Yolcunun yaşı.
VIP:Yolcunun yolculuk sırasında özel VIP hizmeti için ödeme yapıp yapmadığı.
RoomService, FoodCourt, ShoppingMall, Spa, VRDeck:Yolcunun Uzay Gemisi Titanic’in birçok lüks olanağının her birinde fatura ettiği tutar.
Name: Yolcunun adı ve soyadı.
Transported: Yolcunun başka bir boyuta taşınıp taşınmadığı. Bu hedeftir, tahmin etmeye çalıştığınız sütundur.
Test data yükleyelim
library(readr)
test<- read_csv("test.csv")
## Rows: 4277 Columns: 13
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (5): PassengerId, HomePlanet, Cabin, Destination, Name
## dbl (6): Age, RoomService, FoodCourt, ShoppingMall, Spa, VRDeck
## lgl (2): CryoSleep, VIP
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Yükleme sonucunda Test datamızda 4277 tane gözlem ve 13 tane değişkenimiz olduğunu görüdük.
Transported:Göreviniz bu setteki yolcular için değerini tahmin etmektir .
test.csv:Yolcuların geri kalan üçte birinin (~4300) kişisel kayıtları, test verisi olarak kullanılacak.
str(train)
## spc_tbl_ [8,693 × 14] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ PassengerId : chr [1:8693] "0001_01" "0002_01" "0003_01" "0003_02" ...
## $ HomePlanet : chr [1:8693] "Europa" "Earth" "Europa" "Europa" ...
## $ CryoSleep : logi [1:8693] FALSE FALSE FALSE FALSE FALSE FALSE ...
## $ Cabin : chr [1:8693] "B/0/P" "F/0/S" "A/0/S" "A/0/S" ...
## $ Destination : chr [1:8693] "TRAPPIST-1e" "TRAPPIST-1e" "TRAPPIST-1e" "TRAPPIST-1e" ...
## $ Age : num [1:8693] 39 24 58 33 16 44 26 28 35 14 ...
## $ VIP : logi [1:8693] FALSE FALSE TRUE FALSE FALSE FALSE ...
## $ RoomService : num [1:8693] 0 109 43 0 303 0 42 0 0 0 ...
## $ FoodCourt : num [1:8693] 0 9 3576 1283 70 ...
## $ ShoppingMall: num [1:8693] 0 25 0 371 151 0 3 0 17 0 ...
## $ Spa : num [1:8693] 0 549 6715 3329 565 ...
## $ VRDeck : num [1:8693] 0 44 49 193 2 0 0 NA 0 0 ...
## $ Name : chr [1:8693] "Maham Ofracculy" "Juanna Vines" "Altark Susent" "Solam Susent" ...
## $ Transported : logi [1:8693] FALSE TRUE FALSE FALSE TRUE TRUE ...
## - attr(*, "spec")=
## .. cols(
## .. PassengerId = col_character(),
## .. HomePlanet = col_character(),
## .. CryoSleep = col_logical(),
## .. Cabin = col_character(),
## .. Destination = col_character(),
## .. Age = col_double(),
## .. VIP = col_logical(),
## .. RoomService = col_double(),
## .. FoodCourt = col_double(),
## .. ShoppingMall = col_double(),
## .. Spa = col_double(),
## .. VRDeck = col_double(),
## .. Name = col_character(),
## .. Transported = col_logical()
## .. )
## - attr(*, "problems")=<externalptr>
str(test)
## spc_tbl_ [4,277 × 13] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ PassengerId : chr [1:4277] "0013_01" "0018_01" "0019_01" "0021_01" ...
## $ HomePlanet : chr [1:4277] "Earth" "Earth" "Europa" "Europa" ...
## $ CryoSleep : logi [1:4277] TRUE FALSE TRUE FALSE FALSE FALSE ...
## $ Cabin : chr [1:4277] "G/3/S" "F/4/S" "C/0/S" "C/1/S" ...
## $ Destination : chr [1:4277] "TRAPPIST-1e" "TRAPPIST-1e" "55 Cancri e" "TRAPPIST-1e" ...
## $ Age : num [1:4277] 27 19 31 38 20 31 21 20 23 24 ...
## $ VIP : logi [1:4277] FALSE FALSE FALSE FALSE FALSE FALSE ...
## $ RoomService : num [1:4277] 0 0 0 0 10 0 0 0 0 0 ...
## $ FoodCourt : num [1:4277] 0 9 0 6652 0 ...
## $ ShoppingMall: num [1:4277] 0 0 0 0 635 263 0 0 0 0 ...
## $ Spa : num [1:4277] 0 2823 0 181 0 ...
## $ VRDeck : num [1:4277] 0 0 0 585 0 60 0 0 0 0 ...
## $ Name : chr [1:4277] "Nelly Carsoning" "Lerome Peckers" "Sabih Unhearfus" "Meratz Caltilter" ...
## - attr(*, "spec")=
## .. cols(
## .. PassengerId = col_character(),
## .. HomePlanet = col_character(),
## .. CryoSleep = col_logical(),
## .. Cabin = col_character(),
## .. Destination = col_character(),
## .. Age = col_double(),
## .. VIP = col_logical(),
## .. RoomService = col_double(),
## .. FoodCourt = col_double(),
## .. ShoppingMall = col_double(),
## .. Spa = col_double(),
## .. VRDeck = col_double(),
## .. Name = col_character()
## .. )
## - attr(*, "problems")=<externalptr>
Name (AD) Value(Değer) Rows (Satır) 8,693 Columns (Sütun) 14 Discrete columns (Ayrık Sütunlar-Kesikli Data) 8 Continuous columns (Sürekli Sütunlar) 6 All missing columns (Tüm Eksik Sütunlar) 0 Missing observations (Eksik Gözlemler) 2,324 Complete Rows (Tam Sayılar) 6,606 Total observations (Toplam Gözlem Sayısı) 121,702 Memory allocation (Bellek Ayırma) 2.2 Mb
YÜZDE
Discrete columns (Ayrık Sütunlar-Kesikli Data) Continuous columns (Sürekli Sütunlar) All missing columns (Tüm Eksik Sütunlar) Complete Rows (Tam Sayılar) Missing observations (Eksik Gözlemler) Columns (Sütun) observation(Gözlem) Rows (Satır)
Transported: Verinin %0’ı kayıp PassengerId: Verinin %0’ı kayıp Age: Verinin %2.06’sı kayıp RoomService: Verinin %2.08’i kayıp Destination: Verinin %2.09’u kayıp Spa: Verinin %2.11’i kayıp FoodCourt: Verinin %2.11’i kayıp VRDeck: Verinin %2.16’sı kayıp Cabin: Verinin %2.29’u kayıp Name: Verinin %2.30’u kayıp HomePlanet: Verinin %2.31’i kayıp VIP: Verinin %2.34’ü kayıp ShoppingMall: Verinin %2.39’u kayıp CryoSleep: Verinin %2.50’si kayıp
## Yaş testi Yaş histogramında 0 ile 80 yaş
aralığında en çok 20’li yaşlarda insanların olduğu gözlemledik.
## Yemek alanın test edilmesi Yemek alanı
histogramında 0-30.000 fiyat aralığında çok az kişinin yemeklere para
harcadığı ve bunun gelirle ilgili olduğu gözlemleniyor.
## TEST ROOMSERVİCE RoomService
histogramında 0-15.000 fiyat aralığında çok az kişinin oda servisine
para harcadığı gözlemleniyor.
## Alışveriş merkezi test Alışveriş
histogramında 0-25.000 fiyat aralığında kişilerin çoğu alışveriş
yapmadığı gözlemleniyor.
## VRdeck histogramında VRdeck’e insanlar
neredeyse hiç para harcamadıkları gözlemleniyor.
3 columns ignored with more than 50 categories.(50’den fazla kategoride 3 sütun göz ardı edildi.) PassengerId: 8693 categories(Yolcu Kimliği: 8693 kategori) Cabin: 6561 categories(Kabin: 6561 kategori) Name: 8474 categories(Ad: 8474 kategori)
## VIP grafiğinde yolcunun yolculuk
sırasında özel VIP hizmeti için ödeme yapmadığı gözlemleniyor.
Warning: Removed 124 rows containing non-finite values
(stat_qq()). (Uyarı: Sonlu olmayan değerler
(stat_qq()) içeren 124 satır kaldırıldı.)
AGE-FOODCOURT-ROOMSERVICE-SHOPPINGMALL-SPA
ve VRDECK grafikleri gelirle ilgili olduğu için normal dağılım
göstermemektedir.
traindatasınıküçültme1 <-train [,c(1:6,14)]
library(DataExplorer)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ purrr 1.0.2
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.4.4 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(explore)
Train veri çerçevesindeki sayısal değişkenlerin istatistik özeti
train %>% describe_all()
## # A tibble: 14 × 8
## variable type na na_pct unique min mean max
## <chr> <chr> <int> <dbl> <int> <dbl> <dbl> <dbl>
## 1 PassengerId chr 0 0 8693 NA NA NA
## 2 HomePlanet chr 201 2.3 4 NA NA NA
## 3 CryoSleep lgl 217 2.5 3 0 0.36 1
## 4 Cabin chr 199 2.3 6561 NA NA NA
## 5 Destination chr 182 2.1 4 NA NA NA
## 6 Age dbl 179 2.1 81 0 28.8 79
## 7 VIP lgl 203 2.3 3 0 0.02 1
## 8 RoomService dbl 181 2.1 1274 0 225. 14327
## 9 FoodCourt dbl 183 2.1 1508 0 458. 29813
## 10 ShoppingMall dbl 208 2.4 1116 0 174. 23492
## 11 Spa dbl 183 2.1 1328 0 311. 22408
## 12 VRDeck dbl 188 2.2 1307 0 305. 24133
## 13 Name chr 200 2.3 8474 NA NA NA
## 14 Transported lgl 0 0 2 0 0.5 1
train[c('ailenum', 'ailesira')] <- str_split_fixed(train$PassengerId, "_", 2)
test[c('ailenum', 'ailesira')] <- str_split_fixed(test$PassengerId, "_", 2)
test[c('ailenum', 'ailesira')] <- str_split_fixed(test$PassengerId, "_", 2)
train[c('deck', 'num', 'side')] <- str_split_fixed(train$Cabin, '/', 3)
test[c('deck', 'num', 'side')] <- str_split_fixed(test$Cabin, '/', 3)
train[train == ''] <- NA
test[test == ''] <- NA
train %>% describe_all()
## # A tibble: 19 × 8
## variable type na na_pct unique min mean max
## <chr> <chr> <int> <dbl> <int> <dbl> <dbl> <dbl>
## 1 PassengerId chr 0 0 8693 NA NA NA
## 2 HomePlanet chr 201 2.3 4 NA NA NA
## 3 CryoSleep lgl 217 2.5 3 0 0.36 1
## 4 Cabin chr 199 2.3 6561 NA NA NA
## 5 Destination chr 182 2.1 4 NA NA NA
## 6 Age dbl 179 2.1 81 0 28.8 79
## 7 VIP lgl 203 2.3 3 0 0.02 1
## 8 RoomService dbl 181 2.1 1274 0 225. 14327
## 9 FoodCourt dbl 183 2.1 1508 0 458. 29813
## 10 ShoppingMall dbl 208 2.4 1116 0 174. 23492
## 11 Spa dbl 183 2.1 1328 0 311. 22408
## 12 VRDeck dbl 188 2.2 1307 0 305. 24133
## 13 Name chr 200 2.3 8474 NA NA NA
## 14 Transported lgl 0 0 2 0 0.5 1
## 15 ailenum chr 0 0 6217 NA NA NA
## 16 ailesira chr 0 0 8 NA NA NA
## 17 deck chr 199 2.3 9 NA NA NA
## 18 num chr 199 2.3 1818 NA NA NA
## 19 side chr 199 2.3 3 NA NA NA
train <- train %>% select(-Cabin)
test <- test %>% select(-Cabin)
unique(train$HomePlanet)
## [1] "Europa" "Earth" "Mars" NA
levels(train$HomePlanet)
## NULL
train$HomePlanet <- addNA(train$HomePlanet)
test$HomePlanet <- addNA(test$HomePlanet)
levels(train$HomePlanet)
## [1] "Earth" "Europa" "Mars" NA
levels(train$HomePlanet)[is.na(levels(train$HomePlanet))] <- "NA"
levels(test$HomePlanet)[is.na(levels(test$HomePlanet))] <- "NA"
levels(train$HomePlanet)
## [1] "Earth" "Europa" "Mars" "NA"
train %>% describe_all()
## # A tibble: 18 × 8
## variable type na na_pct unique min mean max
## <chr> <chr> <int> <dbl> <int> <dbl> <dbl> <dbl>
## 1 PassengerId chr 0 0 8693 NA NA NA
## 2 HomePlanet fct 0 0 4 NA NA NA
## 3 CryoSleep lgl 217 2.5 3 0 0.36 1
## 4 Destination chr 182 2.1 4 NA NA NA
## 5 Age dbl 179 2.1 81 0 28.8 79
## 6 VIP lgl 203 2.3 3 0 0.02 1
## 7 RoomService dbl 181 2.1 1274 0 225. 14327
## 8 FoodCourt dbl 183 2.1 1508 0 458. 29813
## 9 ShoppingMall dbl 208 2.4 1116 0 174. 23492
## 10 Spa dbl 183 2.1 1328 0 311. 22408
## 11 VRDeck dbl 188 2.2 1307 0 305. 24133
## 12 Name chr 200 2.3 8474 NA NA NA
## 13 Transported lgl 0 0 2 0 0.5 1
## 14 ailenum chr 0 0 6217 NA NA NA
## 15 ailesira chr 0 0 8 NA NA NA
## 16 deck chr 199 2.3 9 NA NA NA
## 17 num chr 199 2.3 1818 NA NA NA
## 18 side chr 199 2.3 3 NA NA NA
train <- train %>%
group_by(HomePlanet) %>%
mutate_at(vars(Age), ~replace_na(., mean(., na.rm = TRUE)))
test <- test %>%
group_by(HomePlanet) %>%
mutate_at(vars(Age), ~replace_na(., mean(., na.rm = TRUE)))
train %>% describe_all()
## # A tibble: 18 × 8
## variable type na na_pct unique min mean max
## <chr> <chr> <int> <dbl> <int> <dbl> <dbl> <dbl>
## 1 PassengerId chr 0 0 8693 NA NA NA
## 2 HomePlanet fct 0 0 4 NA NA NA
## 3 CryoSleep lgl 217 2.5 3 0 0.36 1
## 4 Destination chr 182 2.1 4 NA NA NA
## 5 Age dbl 0 0 84 0 28.8 79
## 6 VIP lgl 203 2.3 3 0 0.02 1
## 7 RoomService dbl 181 2.1 1274 0 225. 14327
## 8 FoodCourt dbl 183 2.1 1508 0 458. 29813
## 9 ShoppingMall dbl 208 2.4 1116 0 174. 23492
## 10 Spa dbl 183 2.1 1328 0 311. 22408
## 11 VRDeck dbl 188 2.2 1307 0 305. 24133
## 12 Name chr 200 2.3 8474 NA NA NA
## 13 Transported lgl 0 0 2 0 0.5 1
## 14 ailenum chr 0 0 6217 NA NA NA
## 15 ailesira chr 0 0 8 NA NA NA
## 16 deck chr 199 2.3 9 NA NA NA
## 17 num chr 199 2.3 1818 NA NA NA
## 18 side chr 199 2.3 3 NA NA NA
train$CryoSleep <- addNA(train$CryoSleep)
test$CryoSleep <- addNA(test$CryoSleep)
levels(train$CryoSleep)[is.na(levels(train$CryoSleep))] <- "NA"
levels(test$CryoSleep)[is.na(levels(test$CryoSleep))] <- "NA"
train %>% describe_all()
## # A tibble: 18 × 8
## variable type na na_pct unique min mean max
## <chr> <chr> <int> <dbl> <int> <dbl> <dbl> <dbl>
## 1 PassengerId chr 0 0 8693 NA NA NA
## 2 HomePlanet fct 0 0 4 NA NA NA
## 3 CryoSleep fct 0 0 3 NA NA NA
## 4 Destination chr 182 2.1 4 NA NA NA
## 5 Age dbl 0 0 84 0 28.8 79
## 6 VIP lgl 203 2.3 3 0 0.02 1
## 7 RoomService dbl 181 2.1 1274 0 225. 14327
## 8 FoodCourt dbl 183 2.1 1508 0 458. 29813
## 9 ShoppingMall dbl 208 2.4 1116 0 174. 23492
## 10 Spa dbl 183 2.1 1328 0 311. 22408
## 11 VRDeck dbl 188 2.2 1307 0 305. 24133
## 12 Name chr 200 2.3 8474 NA NA NA
## 13 Transported lgl 0 0 2 0 0.5 1
## 14 ailenum chr 0 0 6217 NA NA NA
## 15 ailesira chr 0 0 8 NA NA NA
## 16 deck chr 199 2.3 9 NA NA NA
## 17 num chr 199 2.3 1818 NA NA NA
## 18 side chr 199 2.3 3 NA NA NA
unique(train$Destination)
## [1] "TRAPPIST-1e" "PSO J318.5-22" "55 Cancri e" NA
train$Destination <- addNA(train$Destination)
test$Destination <- addNA(test$Destination)
levels(train$Destination)[is.na(levels(train$Destination))] <- "NA"
levels(test$Destination)[is.na(levels(test$Destination))] <- "NA"
train$side <- addNA(train$side)
test$side <- addNA(test$side)
levels(train$side)[is.na(levels(train$side))] <- "NA"
levels(test$side)[is.na(levels(test$side))] <- "NA"
train %>% describe_all()
## # A tibble: 18 × 8
## variable type na na_pct unique min mean max
## <chr> <chr> <int> <dbl> <int> <dbl> <dbl> <dbl>
## 1 PassengerId chr 0 0 8693 NA NA NA
## 2 HomePlanet fct 0 0 4 NA NA NA
## 3 CryoSleep fct 0 0 3 NA NA NA
## 4 Destination fct 0 0 4 NA NA NA
## 5 Age dbl 0 0 84 0 28.8 79
## 6 VIP lgl 203 2.3 3 0 0.02 1
## 7 RoomService dbl 181 2.1 1274 0 225. 14327
## 8 FoodCourt dbl 183 2.1 1508 0 458. 29813
## 9 ShoppingMall dbl 208 2.4 1116 0 174. 23492
## 10 Spa dbl 183 2.1 1328 0 311. 22408
## 11 VRDeck dbl 188 2.2 1307 0 305. 24133
## 12 Name chr 200 2.3 8474 NA NA NA
## 13 Transported lgl 0 0 2 0 0.5 1
## 14 ailenum chr 0 0 6217 NA NA NA
## 15 ailesira chr 0 0 8 NA NA NA
## 16 deck chr 199 2.3 9 NA NA NA
## 17 num chr 199 2.3 1818 NA NA NA
## 18 side fct 0 0 3 NA NA NA
train$VIP <- addNA(train$VIP)
test$VIP <- addNA(test$VIP)
levels(train$VIP)[is.na(levels(train$VIP))] <- "NA"
levels(test$VIP)[is.na(levels(test$VIP))] <- "NA"
train <- train %>%
group_by(Destination) %>%
mutate_at(vars(RoomService), ~replace_na(., mean(., na.rm = TRUE)))
test <- test %>%
group_by(Destination) %>%
mutate_at(vars(RoomService), ~replace_na(., mean(., na.rm = TRUE)))
train %>% describe_all()
## # A tibble: 18 × 8
## variable type na na_pct unique min mean max
## <chr> <chr> <int> <dbl> <int> <dbl> <dbl> <dbl>
## 1 PassengerId chr 0 0 8693 NA NA NA
## 2 HomePlanet fct 0 0 4 NA NA NA
## 3 CryoSleep fct 0 0 3 NA NA NA
## 4 Destination fct 0 0 4 NA NA NA
## 5 Age dbl 0 0 84 0 28.8 79
## 6 VIP fct 0 0 3 NA NA NA
## 7 RoomService dbl 0 0 1277 0 225. 14327
## 8 FoodCourt dbl 183 2.1 1508 0 458. 29813
## 9 ShoppingMall dbl 208 2.4 1116 0 174. 23492
## 10 Spa dbl 183 2.1 1328 0 311. 22408
## 11 VRDeck dbl 188 2.2 1307 0 305. 24133
## 12 Name chr 200 2.3 8474 NA NA NA
## 13 Transported lgl 0 0 2 0 0.5 1
## 14 ailenum chr 0 0 6217 NA NA NA
## 15 ailesira chr 0 0 8 NA NA NA
## 16 deck chr 199 2.3 9 NA NA NA
## 17 num chr 199 2.3 1818 NA NA NA
## 18 side fct 0 0 3 NA NA NA
hist(train$FoodCourt)
train <- train %>% mutate(FoodCourt = coalesce(FoodCourt, 0))
test <- test %>% mutate(FoodCourt = coalesce(FoodCourt, 0))
train %>% describe_all()
## # A tibble: 18 × 8
## variable type na na_pct unique min mean max
## <chr> <chr> <int> <dbl> <int> <dbl> <dbl> <dbl>
## 1 PassengerId chr 0 0 8693 NA NA NA
## 2 HomePlanet fct 0 0 4 NA NA NA
## 3 CryoSleep fct 0 0 3 NA NA NA
## 4 Destination fct 0 0 4 NA NA NA
## 5 Age dbl 0 0 84 0 28.8 79
## 6 VIP fct 0 0 3 NA NA NA
## 7 RoomService dbl 0 0 1277 0 225. 14327
## 8 FoodCourt dbl 0 0 1507 0 448. 29813
## 9 ShoppingMall dbl 208 2.4 1116 0 174. 23492
## 10 Spa dbl 183 2.1 1328 0 311. 22408
## 11 VRDeck dbl 188 2.2 1307 0 305. 24133
## 12 Name chr 200 2.3 8474 NA NA NA
## 13 Transported lgl 0 0 2 0 0.5 1
## 14 ailenum chr 0 0 6217 NA NA NA
## 15 ailesira chr 0 0 8 NA NA NA
## 16 deck chr 199 2.3 9 NA NA NA
## 17 num chr 199 2.3 1818 NA NA NA
## 18 side fct 0 0 3 NA NA NA
train <- train %>% mutate(ShoppingMall = coalesce(ShoppingMall, 0),
Spa = coalesce(Spa, 0),
VRDeck = coalesce(VRDeck, 0))
test <- test %>% mutate(ShoppingMall = coalesce(ShoppingMall, 0),
Spa = coalesce(Spa, 0),
VRDeck = coalesce(VRDeck, 0))
train %>% describe_all()
## # A tibble: 18 × 8
## variable type na na_pct unique min mean max
## <chr> <chr> <int> <dbl> <int> <dbl> <dbl> <dbl>
## 1 PassengerId chr 0 0 8693 NA NA NA
## 2 HomePlanet fct 0 0 4 NA NA NA
## 3 CryoSleep fct 0 0 3 NA NA NA
## 4 Destination fct 0 0 4 NA NA NA
## 5 Age dbl 0 0 84 0 28.8 79
## 6 VIP fct 0 0 3 NA NA NA
## 7 RoomService dbl 0 0 1277 0 225. 14327
## 8 FoodCourt dbl 0 0 1507 0 448. 29813
## 9 ShoppingMall dbl 0 0 1115 0 170. 23492
## 10 Spa dbl 0 0 1327 0 305. 22408
## 11 VRDeck dbl 0 0 1306 0 298. 24133
## 12 Name chr 200 2.3 8474 NA NA NA
## 13 Transported lgl 0 0 2 0 0.5 1
## 14 ailenum chr 0 0 6217 NA NA NA
## 15 ailesira chr 0 0 8 NA NA NA
## 16 deck chr 199 2.3 9 NA NA NA
## 17 num chr 199 2.3 1818 NA NA NA
## 18 side fct 0 0 3 NA NA NA
train <- train %>% select(-Name)
test <- test %>% select(-Name)
train %>% describe_all()
## # A tibble: 17 × 8
## variable type na na_pct unique min mean max
## <chr> <chr> <int> <dbl> <int> <dbl> <dbl> <dbl>
## 1 PassengerId chr 0 0 8693 NA NA NA
## 2 HomePlanet fct 0 0 4 NA NA NA
## 3 CryoSleep fct 0 0 3 NA NA NA
## 4 Destination fct 0 0 4 NA NA NA
## 5 Age dbl 0 0 84 0 28.8 79
## 6 VIP fct 0 0 3 NA NA NA
## 7 RoomService dbl 0 0 1277 0 225. 14327
## 8 FoodCourt dbl 0 0 1507 0 448. 29813
## 9 ShoppingMall dbl 0 0 1115 0 170. 23492
## 10 Spa dbl 0 0 1327 0 305. 22408
## 11 VRDeck dbl 0 0 1306 0 298. 24133
## 12 Transported lgl 0 0 2 0 0.5 1
## 13 ailenum chr 0 0 6217 NA NA NA
## 14 ailesira chr 0 0 8 NA NA NA
## 15 deck chr 199 2.3 9 NA NA NA
## 16 num chr 199 2.3 1818 NA NA NA
## 17 side fct 0 0 3 NA NA NA
train$deck <- addNA(train$deck)
test$deck <- addNA(test$deck)
levels(train$deck)[is.na(levels(train$deck))] <- "NA"
levels(test$deck)[is.na(levels(test$deck))] <- "NA"
train %>% describe_all()
## # A tibble: 17 × 8
## variable type na na_pct unique min mean max
## <chr> <chr> <int> <dbl> <int> <dbl> <dbl> <dbl>
## 1 PassengerId chr 0 0 8693 NA NA NA
## 2 HomePlanet fct 0 0 4 NA NA NA
## 3 CryoSleep fct 0 0 3 NA NA NA
## 4 Destination fct 0 0 4 NA NA NA
## 5 Age dbl 0 0 84 0 28.8 79
## 6 VIP fct 0 0 3 NA NA NA
## 7 RoomService dbl 0 0 1277 0 225. 14327
## 8 FoodCourt dbl 0 0 1507 0 448. 29813
## 9 ShoppingMall dbl 0 0 1115 0 170. 23492
## 10 Spa dbl 0 0 1327 0 305. 22408
## 11 VRDeck dbl 0 0 1306 0 298. 24133
## 12 Transported lgl 0 0 2 0 0.5 1
## 13 ailenum chr 0 0 6217 NA NA NA
## 14 ailesira chr 0 0 8 NA NA NA
## 15 deck fct 0 0 9 NA NA NA
## 16 num chr 199 2.3 1818 NA NA NA
## 17 side fct 0 0 3 NA NA NA
test %>% describe_all()
## # A tibble: 16 × 8
## variable type na na_pct unique min mean max
## <chr> <chr> <int> <dbl> <int> <dbl> <dbl> <dbl>
## 1 PassengerId chr 0 0 4277 NA NA NA
## 2 HomePlanet fct 0 0 4 NA NA NA
## 3 CryoSleep fct 0 0 3 NA NA NA
## 4 Destination fct 0 0 4 NA NA NA
## 5 Age dbl 0 0 83 0 28.7 79
## 6 VIP fct 0 0 3 NA NA NA
## 7 RoomService dbl 0 0 846 0 219. 11567
## 8 FoodCourt dbl 0 0 902 0 429. 25273
## 9 ShoppingMall dbl 0 0 715 0 173. 8292
## 10 Spa dbl 0 0 833 0 296. 19844
## 11 VRDeck dbl 0 0 796 0 305. 22272
## 12 ailenum chr 0 0 3063 NA NA NA
## 13 ailesira chr 0 0 8 NA NA NA
## 14 deck fct 0 0 9 NA NA NA
## 15 num chr 100 2.3 1506 NA NA NA
## 16 side fct 0 0 3 NA NA NA
train$aile <- ifelse(duplicated(train$ailenum) | duplicated(train$ailenum,
fromLast = TRUE), 1, 0)
test$aile <- ifelse(duplicated(test$ailenum) | duplicated(test$ailenum,
fromLast = TRUE), 1, 0)
head(train[, c("PassengerId", "ailenum", "ailesira", "aile")], 20)
## # A tibble: 20 × 4
## PassengerId ailenum ailesira aile
## <chr> <chr> <chr> <dbl>
## 1 0001_01 0001 01 0
## 2 0002_01 0002 01 0
## 3 0003_01 0003 01 1
## 4 0003_02 0003 02 1
## 5 0004_01 0004 01 0
## 6 0005_01 0005 01 0
## 7 0006_01 0006 01 1
## 8 0006_02 0006 02 1
## 9 0007_01 0007 01 0
## 10 0008_01 0008 01 1
## 11 0008_02 0008 02 1
## 12 0008_03 0008 03 1
## 13 0009_01 0009 01 0
## 14 0010_01 0010 01 0
## 15 0011_01 0011 01 0
## 16 0012_01 0012 01 0
## 17 0014_01 0014 01 0
## 18 0015_01 0015 01 0
## 19 0016_01 0016 01 0
## 20 0017_01 0017 01 1
train <- train %>% select(-c(ailenum, ailesira, num))
test <- test %>% select(-c(ailenum, ailesira, num))
train %>% describe_all()
## # A tibble: 15 × 8
## variable type na na_pct unique min mean max
## <chr> <chr> <int> <dbl> <int> <dbl> <dbl> <dbl>
## 1 PassengerId chr 0 0 8693 NA NA NA
## 2 HomePlanet fct 0 0 4 NA NA NA
## 3 CryoSleep fct 0 0 3 NA NA NA
## 4 Destination fct 0 0 4 NA NA NA
## 5 Age dbl 0 0 84 0 28.8 79
## 6 VIP fct 0 0 3 NA NA NA
## 7 RoomService dbl 0 0 1277 0 225. 14327
## 8 FoodCourt dbl 0 0 1507 0 448. 29813
## 9 ShoppingMall dbl 0 0 1115 0 170. 23492
## 10 Spa dbl 0 0 1327 0 305. 22408
## 11 VRDeck dbl 0 0 1306 0 298. 24133
## 12 Transported lgl 0 0 2 0 0.5 1
## 13 deck fct 0 0 9 NA NA NA
## 14 side fct 0 0 3 NA NA NA
## 15 aile dbl 0 0 2 0 0.45 1
train_set <- train[2:15]
test_set <- test[2:14]
library(caTools)
set.seed(123)
split = sample.split(train_set$Transported, SplitRatio = 0.75)
training_set = subset(train_set, split == TRUE)
testing_set = subset(train_set, split == FALSE)
logistic = glm(formula = Transported ~ .,
family = binomial,
data = training_set)
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
prob_pred = predict(logistic, type = 'response', newdata = testing_set[-11])
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from rank-deficient fit; attr(*, "non-estim") has doubtful cases
y_pred = ifelse(prob_pred > 0.5, 1, 0)
y_true <- ifelse(testing_set[11] == TRUE, 1, 0)
cm = table(y_true, y_pred)
cm
## y_pred
## y_true 0 1
## 0 819 260
## 1 189 905
(819+905)/(819+905+260+189)
## [1] 0.7933732
logistic_son = glm(formula = Transported ~ .,
family = binomial,
data = train_set)
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
prob_pred = predict(logistic_son, type = 'response', newdata = test_set)
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from rank-deficient fit; attr(*, "non-estim") has doubtful cases
y_pred = ifelse(prob_pred > 0.5, TRUE, FALSE)
Transported <- as.character(y_pred)
PassengerId <- test$PassengerId
Transported <-as.vector(Transported)
sample_submission <- cbind(PassengerId, Transported)
sample_submission <- as.data.frame(sample_submission)
library(stringr)
sample_submission$Transported <- str_to_title(sample_submission$Transported)
write.csv(sample_submission, "sub_logistic.csv", row.names = FALSE,
quote = FALSE)
library(e1071)
fit_nb <- naiveBayes(Transported ~ ., data =training_set)
preds <- predict(fit_nb, newdata = testing_set[-11], type = "raw") %>%
data.frame()
y_pred = ifelse(preds$TRUE. > 0.5, 1, 0)
cm = table(y_true, y_pred )
cm
## y_pred
## y_true 0 1
## 0 510 569
## 1 78 1016
(510+1016)/(510+1016+78+569)
## [1] 0.7022549
nb_son = naiveBayes(Transported ~ ., data = train_set)
preds <- predict(nb_son, newdata = test_set, type = "raw") %>%
data.frame()
y_pred = ifelse(preds$TRUE. > 0.5, TRUE, FALSE)
Transported <- as.character(y_pred)
PassengerId <- test$PassengerId
Transported <-as.vector(Transported)
sample_submission <- cbind(PassengerId, Transported)
sample_submission <- as.data.frame(sample_submission)
sample_submission$Transported <- str_to_title(sample_submission$Transported)
write.csv(sample_submission, "sub_nb_logistic.csv", row.names = FALSE,
quote = FALSE)
library(e1071)
fit_svm <- svm(Transported ~ ., data = training_set,
type= 'C-classification',
kernel = 'linear')
preds <- predict(fit_svm, newdata = testing_set, type = “raw”) %>% data.frame()
y_pred = ifelse(preds$. == TRUE, 1, 0)