library(readr)
test <- read_csv("test.csv")
## Rows: 4277 Columns: 13
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (5): PassengerId, HomePlanet, Cabin, Destination, Name
## dbl (6): Age, RoomService, FoodCourt, ShoppingMall, Spa, VRDeck
## lgl (2): CryoSleep, VIP
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
train <- read_csv("train.csv")
## Rows: 8693 Columns: 14
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (5): PassengerId, HomePlanet, Cabin, Destination, Name
## dbl (6): Age, RoomService, FoodCourt, ShoppingMall, Spa, VRDeck
## lgl (3): CryoSleep, VIP, Transported
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ purrr 1.0.2
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
## # A tibble: 13 × 8
## variable type na na_pct unique min mean max
## <chr> <chr> <int> <dbl> <int> <dbl> <dbl> <dbl>
## 1 PassengerId chr 0 0 4277 NA NA NA
## 2 HomePlanet chr 87 2 4 NA NA NA
## 3 CryoSleep lgl 93 2.2 3 0 0.37 1
## 4 Cabin chr 100 2.3 3266 NA NA NA
## 5 Destination chr 92 2.2 4 NA NA NA
## 6 Age dbl 91 2.1 80 0 28.7 79
## 7 VIP lgl 93 2.2 3 0 0.02 1
## 8 RoomService dbl 82 1.9 843 0 219. 11567
## 9 FoodCourt dbl 106 2.5 903 0 439. 25273
## 10 ShoppingMall dbl 98 2.3 716 0 177. 8292
## 11 Spa dbl 101 2.4 834 0 303. 19844
## 12 VRDeck dbl 80 1.9 797 0 311. 22272
## 13 Name chr 94 2.2 4177 NA NA NA
## # A tibble: 14 × 8
## variable type na na_pct unique min mean max
## <chr> <chr> <int> <dbl> <int> <dbl> <dbl> <dbl>
## 1 PassengerId chr 0 0 8693 NA NA NA
## 2 HomePlanet chr 201 2.3 4 NA NA NA
## 3 CryoSleep lgl 217 2.5 3 0 0.36 1
## 4 Cabin chr 199 2.3 6561 NA NA NA
## 5 Destination chr 182 2.1 4 NA NA NA
## 6 Age dbl 179 2.1 81 0 28.8 79
## 7 VIP lgl 203 2.3 3 0 0.02 1
## 8 RoomService dbl 181 2.1 1274 0 225. 14327
## 9 FoodCourt dbl 183 2.1 1508 0 458. 29813
## 10 ShoppingMall dbl 208 2.4 1116 0 174. 23492
## 11 Spa dbl 183 2.1 1328 0 311. 22408
## 12 VRDeck dbl 188 2.2 1307 0 305. 24133
## 13 Name chr 200 2.3 8474 NA NA NA
## 14 Transported lgl 0 0 2 0 0.5 1
test[c('group', 'pp')] <- str_split_fixed(test$PassengerId,'-', 2)
train[c('group', 'pp')] <- str_split_fixed(train$PassengerId,'-', 2)
head(train[,c("PassengerId", "group", "pp")])
## # A tibble: 6 × 3
## PassengerId group pp
## <chr> <chr> <chr>
## 1 0001_01 0001_01 ""
## 2 0002_01 0002_01 ""
## 3 0003_01 0003_01 ""
## 4 0003_02 0003_02 ""
## 5 0004_01 0004_01 ""
## 6 0005_01 0005_01 ""
test$withgroup <- ifelse(duplicated(test$group) | duplicated(test$group, fromLast = TRUE), 1,0)
train$withgroup <- ifelse(duplicated(train$group) | duplicated(train$group, fromLast = TRUE), 1,0)
head(test[,c("PassengerId", "group", "pp", "withgroup")])
## # A tibble: 6 × 4
## PassengerId group pp withgroup
## <chr> <chr> <chr> <dbl>
## 1 0013_01 0013_01 "" 0
## 2 0018_01 0018_01 "" 0
## 3 0019_01 0019_01 "" 0
## 4 0021_01 0021_01 "" 0
## 5 0023_01 0023_01 "" 0
## 6 0027_01 0027_01 "" 0
head(train[,c("PassengerId", "group", "pp", "withgroup")])
## # A tibble: 6 × 4
## PassengerId group pp withgroup
## <chr> <chr> <chr> <dbl>
## 1 0001_01 0001_01 "" 0
## 2 0002_01 0002_01 "" 0
## 3 0003_01 0003_01 "" 0
## 4 0003_02 0003_02 "" 0
## 5 0004_01 0004_01 "" 0
## 6 0005_01 0005_01 "" 0
train[c('deck', 'num', 'side')] <- str_split_fixed(train$Cabin, '/', 3)
test[c('deck', 'num', 'side')] <- str_split_fixed(test$Cabin, '/', 3)
test <-test %>% mutate_if(is.character,as.factor)
train <-train %>% mutate_if(is.character,as.factor)
## PassengerId HomePlanet CryoSleep Cabin
## 0013_01: 1 Earth :2263 Mode :logical G/160/P: 8
## 0018_01: 1 Europa:1002 FALSE:2640 B/31/P : 7
## 0019_01: 1 Mars : 925 TRUE :1544 D/273/S: 7
## 0021_01: 1 NA's : 87 NA's :93 E/228/S: 7
## 0023_01: 1 G/748/S: 7
## 0027_01: 1 (Other):4141
## (Other):4271 NA's : 100
## Destination Age VIP RoomService
## 55 Cancri e : 841 Min. : 0.00 Mode :logical Min. : 0.0
## PSO J318.5-22: 388 1st Qu.:19.00 FALSE:4110 1st Qu.: 0.0
## TRAPPIST-1e :2956 Median :26.00 TRUE :74 Median : 0.0
## NA's : 92 Mean :28.66 NA's :93 Mean : 219.3
## 3rd Qu.:37.00 3rd Qu.: 53.0
## Max. :79.00 Max. :11567.0
## NA's :91 NA's :82
## FoodCourt ShoppingMall Spa VRDeck
## Min. : 0.0 Min. : 0.0 Min. : 0.0 Min. : 0.0
## 1st Qu.: 0.0 1st Qu.: 0.0 1st Qu.: 0.0 1st Qu.: 0.0
## Median : 0.0 Median : 0.0 Median : 0.0 Median : 0.0
## Mean : 439.5 Mean : 177.3 Mean : 303.1 Mean : 310.7
## 3rd Qu.: 78.0 3rd Qu.: 33.0 3rd Qu.: 50.0 3rd Qu.: 36.0
## Max. :25273.0 Max. :8292.0 Max. :19844.0 Max. :22272.0
## NA's :106 NA's :98 NA's :101 NA's :80
## Name group pp withgroup deck
## Berta Barnolderg: 2 0013_01: 1 :4277 Min. :0 F :1445
## Chrey Colte : 2 0018_01: 1 1st Qu.:0 G :1222
## Cints Erle : 2 0019_01: 1 Median :0 E : 447
## Cocors Cola : 2 0021_01: 1 Mean :0 B : 362
## Con Pashe : 2 0023_01: 1 3rd Qu.:0 C : 355
## (Other) :4173 0027_01: 1 Max. :0 (Other): 346
## NA's : 94 (Other):4271 NA's : 100
## num side
## : 100 : 100
## 4 : 21 P:2084
## 31 : 18 S:2093
## 197 : 16
## 294 : 16
## 228 : 14
## (Other):4092
## PassengerId HomePlanet CryoSleep Cabin
## 0001_01: 1 Earth :4602 Mode :logical G/734/S: 8
## 0002_01: 1 Europa:2131 FALSE:5439 B/11/S : 7
## 0003_01: 1 Mars :1759 TRUE :3037 B/201/P: 7
## 0003_02: 1 NA's : 201 NA's :217 B/82/S : 7
## 0004_01: 1 C/137/S: 7
## 0005_01: 1 (Other):8458
## (Other):8687 NA's : 199
## Destination Age VIP RoomService
## 55 Cancri e :1800 Min. : 0.00 Mode :logical Min. : 0.0
## PSO J318.5-22: 796 1st Qu.:19.00 FALSE:8291 1st Qu.: 0.0
## TRAPPIST-1e :5915 Median :27.00 TRUE :199 Median : 0.0
## NA's : 182 Mean :28.83 NA's :203 Mean : 224.7
## 3rd Qu.:38.00 3rd Qu.: 47.0
## Max. :79.00 Max. :14327.0
## NA's :179 NA's :181
## FoodCourt ShoppingMall Spa VRDeck
## Min. : 0.0 Min. : 0.0 Min. : 0.0 Min. : 0.0
## 1st Qu.: 0.0 1st Qu.: 0.0 1st Qu.: 0.0 1st Qu.: 0.0
## Median : 0.0 Median : 0.0 Median : 0.0 Median : 0.0
## Mean : 458.1 Mean : 173.7 Mean : 311.1 Mean : 304.9
## 3rd Qu.: 76.0 3rd Qu.: 27.0 3rd Qu.: 59.0 3rd Qu.: 46.0
## Max. :29813.0 Max. :23492.0 Max. :22408.0 Max. :24133.0
## NA's :183 NA's :208 NA's :183 NA's :188
## Name Transported group pp withgroup
## Alraium Disivering: 2 Mode :logical 0001_01: 1 :8693 Min. :0
## Ankalik Nateansive: 2 FALSE:4315 0002_01: 1 1st Qu.:0
## Anton Woody : 2 TRUE :4378 0003_01: 1 Median :0
## Apix Wala : 2 0003_02: 1 Mean :0
## Asch Stradick : 2 0004_01: 1 3rd Qu.:0
## (Other) :8483 0005_01: 1 Max. :0
## NA's : 200 (Other):8687
## deck num side
## F :2794 : 199 : 199
## G :2559 82 : 28 P:4206
## E : 876 19 : 22 S:4288
## B : 779 86 : 22
## C : 747 176 : 21
## (Other): 739 56 : 21
## NA's : 199 (Other):8380
test [test == ' '] <- NA
train [train == ' '] <- NA
train$num <- droplevels(train$num)
test$num <- droplevels(test$num)
train$side <- droplevels(train$side)
test$side <- droplevels(test$side)
## PassengerId HomePlanet CryoSleep Cabin
## 0013_01: 1 Earth :2263 Mode :logical G/160/P: 8
## 0018_01: 1 Europa:1002 FALSE:2640 B/31/P : 7
## 0019_01: 1 Mars : 925 TRUE :1544 D/273/S: 7
## 0021_01: 1 NA's : 87 NA's :93 E/228/S: 7
## 0023_01: 1 G/748/S: 7
## 0027_01: 1 (Other):4141
## (Other):4271 NA's : 100
## Destination Age VIP RoomService
## 55 Cancri e : 841 Min. : 0.00 Mode :logical Min. : 0.0
## PSO J318.5-22: 388 1st Qu.:19.00 FALSE:4110 1st Qu.: 0.0
## TRAPPIST-1e :2956 Median :26.00 TRUE :74 Median : 0.0
## NA's : 92 Mean :28.66 NA's :93 Mean : 219.3
## 3rd Qu.:37.00 3rd Qu.: 53.0
## Max. :79.00 Max. :11567.0
## NA's :91 NA's :82
## FoodCourt ShoppingMall Spa VRDeck
## Min. : 0.0 Min. : 0.0 Min. : 0.0 Min. : 0.0
## 1st Qu.: 0.0 1st Qu.: 0.0 1st Qu.: 0.0 1st Qu.: 0.0
## Median : 0.0 Median : 0.0 Median : 0.0 Median : 0.0
## Mean : 439.5 Mean : 177.3 Mean : 303.1 Mean : 310.7
## 3rd Qu.: 78.0 3rd Qu.: 33.0 3rd Qu.: 50.0 3rd Qu.: 36.0
## Max. :25273.0 Max. :8292.0 Max. :19844.0 Max. :22272.0
## NA's :106 NA's :98 NA's :101 NA's :80
## Name group pp withgroup deck
## Berta Barnolderg: 2 0013_01: 1 :4277 Min. :0 F :1445
## Chrey Colte : 2 0018_01: 1 1st Qu.:0 G :1222
## Cints Erle : 2 0019_01: 1 Median :0 E : 447
## Cocors Cola : 2 0021_01: 1 Mean :0 B : 362
## Con Pashe : 2 0023_01: 1 3rd Qu.:0 C : 355
## (Other) :4173 0027_01: 1 Max. :0 (Other): 346
## NA's : 94 (Other):4271 NA's : 100
## num side
## : 100 : 100
## 4 : 21 P:2084
## 31 : 18 S:2093
## 197 : 16
## 294 : 16
## 228 : 14
## (Other):4092
## PassengerId HomePlanet CryoSleep Cabin
## 0001_01: 1 Earth :4602 Mode :logical G/734/S: 8
## 0002_01: 1 Europa:2131 FALSE:5439 B/11/S : 7
## 0003_01: 1 Mars :1759 TRUE :3037 B/201/P: 7
## 0003_02: 1 NA's : 201 NA's :217 B/82/S : 7
## 0004_01: 1 C/137/S: 7
## 0005_01: 1 (Other):8458
## (Other):8687 NA's : 199
## Destination Age VIP RoomService
## 55 Cancri e :1800 Min. : 0.00 Mode :logical Min. : 0.0
## PSO J318.5-22: 796 1st Qu.:19.00 FALSE:8291 1st Qu.: 0.0
## TRAPPIST-1e :5915 Median :27.00 TRUE :199 Median : 0.0
## NA's : 182 Mean :28.83 NA's :203 Mean : 224.7
## 3rd Qu.:38.00 3rd Qu.: 47.0
## Max. :79.00 Max. :14327.0
## NA's :179 NA's :181
## FoodCourt ShoppingMall Spa VRDeck
## Min. : 0.0 Min. : 0.0 Min. : 0.0 Min. : 0.0
## 1st Qu.: 0.0 1st Qu.: 0.0 1st Qu.: 0.0 1st Qu.: 0.0
## Median : 0.0 Median : 0.0 Median : 0.0 Median : 0.0
## Mean : 458.1 Mean : 173.7 Mean : 311.1 Mean : 304.9
## 3rd Qu.: 76.0 3rd Qu.: 27.0 3rd Qu.: 59.0 3rd Qu.: 46.0
## Max. :29813.0 Max. :23492.0 Max. :22408.0 Max. :24133.0
## NA's :183 NA's :208 NA's :183 NA's :188
## Name Transported group pp withgroup
## Alraium Disivering: 2 Mode :logical 0001_01: 1 :8693 Min. :0
## Ankalik Nateansive: 2 FALSE:4315 0002_01: 1 1st Qu.:0
## Anton Woody : 2 TRUE :4378 0003_01: 1 Median :0
## Apix Wala : 2 0003_02: 1 Mean :0
## Asch Stradick : 2 0004_01: 1 3rd Qu.:0
## (Other) :8483 0005_01: 1 Max. :0
## NA's : 200 (Other):8687
## deck num side
## F :2794 : 199 : 199
## G :2559 82 : 28 P:4206
## E : 876 19 : 22 S:4288
## B : 779 86 : 22
## C : 747 176 : 21
## (Other): 739 56 : 21
## NA's : 199 (Other):8380
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
test <- test %>%
group_by(group) %>%
mutate(HomePlanet = na.locf(HomePlanet, na.rm = FALSE))
train <- train %>%
group_by(group) %>%
mutate(HomePlanet = na.locf(HomePlanet, na.rm = FALSE))
most_frequent_hp <- train %>%
filter(!is.na(HomePlanet)) %>%
group_by(Destination, HomePlanet) %>%
summarize(count = n()) %>%
arrange(Destination, desc(count)) %>%
slice(1) %>%
ungroup()
## `summarise()` has grouped output by 'Destination'. You can override using the
## `.groups` argument.
## # A tibble: 4 × 3
## Destination HomePlanet count
## <fct> <fct> <int>
## 1 55 Cancri e Europa 886
## 2 PSO J318.5-22 Earth 712
## 3 TRAPPIST-1e Earth 3101
## 4 <NA> Earth 99
train$HomePlanet <- as.character(train$HomePlanet)
train$Destination <- as.character(train$Destination)
train <- train %>%
mutate(HomePlanet= ifelse(is.na(HomePlanet) & Destination == "55 Cancri e","Europa",
ifelse(is.na(HomePlanet), "Earth", HomePlanet)))
test$HomePlanet <- as.character(test$HomePlanet)
test$Destination <- as.character(test$Destination)
test <- test %>%
mutate(HomePlanet= ifelse(is.na(HomePlanet) & Destination == "55 Cancri e","Europa",
ifelse(is.na(HomePlanet), "Earth", HomePlanet)))
## # A tibble: 20 × 8
## variable type na na_pct unique min mean max
## <chr> <chr> <int> <dbl> <int> <dbl> <dbl> <dbl>
## 1 PassengerId fct 0 0 8693 NA NA NA
## 2 HomePlanet chr 4 0 4 NA NA NA
## 3 CryoSleep lgl 217 2.5 3 0 0.36 1
## 4 Cabin fct 199 2.3 6561 NA NA NA
## 5 Destination chr 182 2.1 4 NA NA NA
## 6 Age dbl 179 2.1 81 0 28.8 79
## 7 VIP lgl 203 2.3 3 0 0.02 1
## 8 RoomService dbl 181 2.1 1274 0 225. 14327
## 9 FoodCourt dbl 183 2.1 1508 0 458. 29813
## 10 ShoppingMall dbl 208 2.4 1116 0 174. 23492
## 11 Spa dbl 183 2.1 1328 0 311. 22408
## 12 VRDeck dbl 188 2.2 1307 0 305. 24133
## 13 Name fct 200 2.3 8474 NA NA NA
## 14 Transported lgl 0 0 2 0 0.5 1
## 15 group fct 0 0 8693 NA NA NA
## 16 pp fct 0 0 1 NA NA NA
## 17 withgroup dbl 0 0 1 0 0 0
## 18 deck fct 199 2.3 9 NA NA NA
## 19 num fct 0 0 1818 NA NA NA
## 20 side fct 0 0 3 NA NA NA
## # A tibble: 19 × 8
## variable type na na_pct unique min mean max
## <chr> <chr> <int> <dbl> <int> <dbl> <dbl> <dbl>
## 1 PassengerId fct 0 0 4277 NA NA NA
## 2 HomePlanet chr 2 0 4 NA NA NA
## 3 CryoSleep lgl 93 2.2 3 0 0.37 1
## 4 Cabin fct 100 2.3 3266 NA NA NA
## 5 Destination chr 92 2.2 4 NA NA NA
## 6 Age dbl 91 2.1 80 0 28.7 79
## 7 VIP lgl 93 2.2 3 0 0.02 1
## 8 RoomService dbl 82 1.9 843 0 219. 11567
## 9 FoodCourt dbl 106 2.5 903 0 439. 25273
## 10 ShoppingMall dbl 98 2.3 716 0 177. 8292
## 11 Spa dbl 101 2.4 834 0 303. 19844
## 12 VRDeck dbl 80 1.9 797 0 311. 22272
## 13 Name fct 94 2.2 4177 NA NA NA
## 14 group fct 0 0 4277 NA NA NA
## 15 pp fct 0 0 1 NA NA NA
## 16 withgroup dbl 0 0 1 0 0 0
## 17 deck fct 100 2.3 9 NA NA NA
## 18 num fct 0 0 1506 NA NA NA
## 19 side fct 0 0 3 NA NA NA
train <- transform(train, HomePlanet = replace(HomePlanet, is.na(HomePlanet),
"Earth"))
train <- transform(train, HomePlanet = replace(HomePlanet, is.na(HomePlanet), "Earth"))
most_frequent_destinations <- train %>%
filter(!is.na(Destination)) %>%
group_by(HomePlanet, Destination ) %>%
summarize(count = n()) %>%
arrange(HomePlanet, desc(count)) %>%
slice(1) %>%
ungroup()
## `summarise()` has grouped output by 'HomePlanet'. You can override using the
## `.groups` argument.
most_frequent_destinations
## # A tibble: 3 × 3
## HomePlanet Destination count
## <chr> <chr> <int>
## 1 Earth TRAPPIST-1e 3251
## 2 Europa TRAPPIST-1e 1189
## 3 Mars TRAPPIST-1e 1475
test <- transform(test, Destination = replace(Destination, is.na(Destination), "TRAPPIST-1e"))
train <- transform(train, Destination = replace(Destination, is.na(Destination), "TRAPPIST-1e"))
test$HomePlanet <- as.factor(test$HomePlanet)
test$Destination <- as.factor(test$Destination)
train$HomePlanet <- as.factor(train$HomePlanet)
train$Destination <- as.factor(train$Destination)
## PassengerId HomePlanet CryoSleep Cabin
## 0001_01: 1 Earth :4772 Mode :logical G/734/S: 8
## 0002_01: 1 Europa:2162 FALSE:5439 B/11/S : 7
## 0003_01: 1 Mars :1759 TRUE :3037 B/201/P: 7
## 0003_02: 1 NA's :217 B/82/S : 7
## 0004_01: 1 C/137/S: 7
## 0005_01: 1 (Other):8458
## (Other):8687 NA's : 199
## Destination Age VIP RoomService
## 55 Cancri e :1800 Min. : 0.00 Mode :logical Min. : 0.0
## PSO J318.5-22: 796 1st Qu.:19.00 FALSE:8291 1st Qu.: 0.0
## TRAPPIST-1e :6097 Median :27.00 TRUE :199 Median : 0.0
## Mean :28.83 NA's :203 Mean : 224.7
## 3rd Qu.:38.00 3rd Qu.: 47.0
## Max. :79.00 Max. :14327.0
## NA's :179 NA's :181
## FoodCourt ShoppingMall Spa VRDeck
## Min. : 0.0 Min. : 0.0 Min. : 0.0 Min. : 0.0
## 1st Qu.: 0.0 1st Qu.: 0.0 1st Qu.: 0.0 1st Qu.: 0.0
## Median : 0.0 Median : 0.0 Median : 0.0 Median : 0.0
## Mean : 458.1 Mean : 173.7 Mean : 311.1 Mean : 304.9
## 3rd Qu.: 76.0 3rd Qu.: 27.0 3rd Qu.: 59.0 3rd Qu.: 46.0
## Max. :29813.0 Max. :23492.0 Max. :22408.0 Max. :24133.0
## NA's :183 NA's :208 NA's :183 NA's :188
## Name Transported group pp withgroup
## Alraium Disivering: 2 Mode :logical 0001_01: 1 :8693 Min. :0
## Ankalik Nateansive: 2 FALSE:4315 0002_01: 1 1st Qu.:0
## Anton Woody : 2 TRUE :4378 0003_01: 1 Median :0
## Apix Wala : 2 0003_02: 1 Mean :0
## Asch Stradick : 2 0004_01: 1 3rd Qu.:0
## (Other) :8483 0005_01: 1 Max. :0
## NA's : 200 (Other):8687
## deck num side
## F :2794 : 199 : 199
## G :2559 82 : 28 P:4206
## E : 876 19 : 22 S:4288
## B : 779 86 : 22
## C : 747 176 : 21
## (Other): 739 56 : 21
## NA's : 199 (Other):8380
train <- train %>%
mutate(RoomService = coalesce(RoomService, 0),
FoodCourt = coalesce(FoodCourt, 0),
ShoppingMall = coalesce(ShoppingMall, 0),
Spa = coalesce(Spa, 0),
VRDeck = coalesce(VRDeck, 0))
test <- test %>%
mutate(RoomService = coalesce(RoomService, 0),
FoodCourt = coalesce(FoodCourt, 0),
ShoppingMall = coalesce(ShoppingMall, 0),
Spa = coalesce(Spa, 0),
VRDeck = coalesce(VRDeck, 0))
test <- test %>%
group_by(HomePlanet, Destination) %>%
mutate_at(vars(Age), ~replace_na(., mean(., na.rm = TRUE)))
train <- train %>%
group_by(HomePlanet, Destination) %>%
mutate_at(vars(Age), ~replace_na(., mean(., na.rm = TRUE)))
train$expense <- train$RoomService + train$FoodCourt + train$ShoppingMall + train$Spa + train$VRDeck
test$expense <- test$RoomService + test$FoodCourt + test$ShoppingMall + test$Spa + test$VRDeck
train <- transform(train, CryoSleep = replace(CryoSleep, expense>0 & Age>12, "FALSE"))
test <- transform(test, CryoSleep = replace(CryoSleep, expense>0 & Age>12, "FALSE"))
## # A tibble: 21 × 8
## variable type na na_pct unique min mean max
## <chr> <chr> <int> <dbl> <int> <dbl> <dbl> <dbl>
## 1 PassengerId fct 0 0 8693 NA NA NA
## 2 HomePlanet fct 0 0 3 NA NA NA
## 3 CryoSleep chr 98 1.1 3 NA NA NA
## 4 Cabin fct 199 2.3 6561 NA NA NA
## 5 Destination fct 0 0 3 NA NA NA
## 6 Age dbl 0 0 88 0 28.8 79
## 7 VIP lgl 203 2.3 3 0 0.02 1
## 8 RoomService dbl 0 0 1273 0 220. 14327
## 9 FoodCourt dbl 0 0 1507 0 448. 29813
## 10 ShoppingMall dbl 0 0 1115 0 170. 23492
## # ℹ 11 more rows
train <- transform(train, CryoSleep = replace(CryoSleep, is.na(CryoSleep) & expense-0 & Age>12,
"TRUE"))
test <- transform(train, CryoSleep = replace(CryoSleep, is.na(CryoSleep) & expense-0 & Age>12,
"TRUE"))
## # A tibble: 21 × 8
## variable type na na_pct unique min mean max
## <chr> <chr> <int> <dbl> <int> <dbl> <dbl> <dbl>
## 1 PassengerId fct 0 0 8693 NA NA NA
## 2 HomePlanet fct 0 0 3 NA NA NA
## 3 CryoSleep chr 98 1.1 3 NA NA NA
## 4 Cabin fct 199 2.3 6561 NA NA NA
## 5 Destination fct 0 0 3 NA NA NA
## 6 Age dbl 0 0 88 0 28.8 79
## 7 VIP lgl 203 2.3 3 0 0.02 1
## 8 RoomService dbl 0 0 1273 0 220. 14327
## 9 FoodCourt dbl 0 0 1507 0 448. 29813
## 10 ShoppingMall dbl 0 0 1115 0 170. 23492
## # ℹ 11 more rows
train <- transform(train, CryoSleep = replace(CryoSleep, is.na(CryoSleep), "FALSE"))
test <- transform(train, CryoSleep = replace(CryoSleep, is.na(CryoSleep), "FALSE"))
## # A tibble: 21 × 8
## variable type na na_pct unique min mean max
## <chr> <chr> <int> <dbl> <int> <dbl> <dbl> <dbl>
## 1 PassengerId fct 0 0 8693 NA NA NA
## 2 HomePlanet fct 0 0 3 NA NA NA
## 3 CryoSleep chr 0 0 2 NA NA NA
## 4 Cabin fct 199 2.3 6561 NA NA NA
## 5 Destination fct 0 0 3 NA NA NA
## 6 Age dbl 0 0 88 0 28.8 79
## 7 VIP lgl 203 2.3 3 0 0.02 1
## 8 RoomService dbl 0 0 1273 0 220. 14327
## 9 FoodCourt dbl 0 0 1507 0 448. 29813
## 10 ShoppingMall dbl 0 0 1115 0 170. 23492
## # ℹ 11 more rows
train$cryosleep <- as.factor(train$CryoSleep)
test$cryosleep <- as.factor(test$CryoSleep)
## PassengerId HomePlanet CryoSleep Cabin
## 0001_01: 1 Earth :4772 Length:8693 G/734/S: 8
## 0002_01: 1 Europa:2162 Class :character B/11/S : 7
## 0003_01: 1 Mars :1759 Mode :character B/201/P: 7
## 0003_02: 1 B/82/S : 7
## 0004_01: 1 C/137/S: 7
## 0005_01: 1 (Other):8458
## (Other):8687 NA's : 199
## Destination Age VIP RoomService
## 55 Cancri e :1800 Min. : 0.00 Mode :logical Min. : 0
## PSO J318.5-22: 796 1st Qu.:20.00 FALSE:8291 1st Qu.: 0
## TRAPPIST-1e :6097 Median :27.00 TRUE :199 Median : 0
## Mean :28.83 NA's :203 Mean : 220
## 3rd Qu.:37.00 3rd Qu.: 41
## Max. :79.00 Max. :14327
##
## FoodCourt ShoppingMall Spa VRDeck
## Min. : 0.0 Min. : 0.0 Min. : 0.0 Min. : 0.0
## 1st Qu.: 0.0 1st Qu.: 0.0 1st Qu.: 0.0 1st Qu.: 0.0
## Median : 0.0 Median : 0.0 Median : 0.0 Median : 0.0
## Mean : 448.4 Mean : 169.6 Mean : 304.6 Mean : 298.3
## 3rd Qu.: 61.0 3rd Qu.: 22.0 3rd Qu.: 53.0 3rd Qu.: 40.0
## Max. :29813.0 Max. :23492.0 Max. :22408.0 Max. :24133.0
##
## Name Transported group pp withgroup
## Alraium Disivering: 2 Mode :logical 0001_01: 1 :8693 Min. :0
## Ankalik Nateansive: 2 FALSE:4315 0002_01: 1 1st Qu.:0
## Anton Woody : 2 TRUE :4378 0003_01: 1 Median :0
## Apix Wala : 2 0003_02: 1 Mean :0
## Asch Stradick : 2 0004_01: 1 3rd Qu.:0
## (Other) :8483 0005_01: 1 Max. :0
## NA's : 200 (Other):8687
## deck num side expense cryosleep
## F :2794 : 199 : 199 Min. : 0 FALSE:5656
## G :2559 82 : 28 P:4206 1st Qu.: 0 TRUE :3037
## E : 876 19 : 22 S:4288 Median : 716
## B : 779 86 : 22 Mean : 1441
## C : 747 176 : 21 3rd Qu.: 1441
## (Other): 739 56 : 21 Max. :35987
## NA's : 199 (Other):8380
train <- transform(train, VIP = replace(VIP, is.na(VIP), FALSE))
test <- transform(train, VIP = replace(VIP, is.na(VIP), FALSE))
## # A tibble: 22 × 8
## variable type na na_pct unique min mean max
## <chr> <chr> <int> <dbl> <int> <dbl> <dbl> <dbl>
## 1 PassengerId fct 0 0 8693 NA NA NA
## 2 HomePlanet fct 0 0 3 NA NA NA
## 3 CryoSleep chr 0 0 2 NA NA NA
## 4 Cabin fct 199 2.3 6561 NA NA NA
## 5 Destination fct 0 0 3 NA NA NA
## 6 Age dbl 0 0 88 0 28.8 79
## 7 VIP lgl 0 0 2 0 0.02 1
## 8 RoomService dbl 0 0 1273 0 220. 14327
## 9 FoodCourt dbl 0 0 1507 0 448. 29813
## 10 ShoppingMall dbl 0 0 1115 0 170. 23492
## # ℹ 12 more rows
train <- train %>%
group_by(group) %>%
mutate(deck = na.locf(deck, na.rm = FALSE))
test <- test %>%
group_by(group) %>%
mutate(deck = na.locf(deck, na.rm = FALSE))
## PassengerId HomePlanet CryoSleep Cabin
## 0001_01: 1 Earth :4772 Length:8693 G/734/S: 8
## 0002_01: 1 Europa:2162 Class :character B/11/S : 7
## 0003_01: 1 Mars :1759 Mode :character B/201/P: 7
## 0003_02: 1 B/82/S : 7
## 0004_01: 1 C/137/S: 7
## 0005_01: 1 (Other):8458
## (Other):8687 NA's : 199
## Destination Age VIP RoomService
## 55 Cancri e :1800 Min. : 0.00 Mode :logical Min. : 0
## PSO J318.5-22: 796 1st Qu.:20.00 FALSE:8494 1st Qu.: 0
## TRAPPIST-1e :6097 Median :27.00 TRUE :199 Median : 0
## Mean :28.83 Mean : 220
## 3rd Qu.:37.00 3rd Qu.: 41
## Max. :79.00 Max. :14327
##
## FoodCourt ShoppingMall Spa VRDeck
## Min. : 0.0 Min. : 0.0 Min. : 0.0 Min. : 0.0
## 1st Qu.: 0.0 1st Qu.: 0.0 1st Qu.: 0.0 1st Qu.: 0.0
## Median : 0.0 Median : 0.0 Median : 0.0 Median : 0.0
## Mean : 448.4 Mean : 169.6 Mean : 304.6 Mean : 298.3
## 3rd Qu.: 61.0 3rd Qu.: 22.0 3rd Qu.: 53.0 3rd Qu.: 40.0
## Max. :29813.0 Max. :23492.0 Max. :22408.0 Max. :24133.0
##
## Name Transported group pp withgroup
## Alraium Disivering: 2 Mode :logical 0001_01: 1 :8693 Min. :0
## Ankalik Nateansive: 2 FALSE:4315 0002_01: 1 1st Qu.:0
## Anton Woody : 2 TRUE :4378 0003_01: 1 Median :0
## Apix Wala : 2 0003_02: 1 Mean :0
## Asch Stradick : 2 0004_01: 1 3rd Qu.:0
## (Other) :8483 0005_01: 1 Max. :0
## NA's : 200 (Other):8687
## deck num side expense cryosleep
## F :2794 : 199 : 199 Min. : 0 FALSE:5656
## G :2559 82 : 28 P:4206 1st Qu.: 0 TRUE :3037
## E : 876 19 : 22 S:4288 Median : 716
## B : 779 86 : 22 Mean : 1441
## C : 747 176 : 21 3rd Qu.: 1441
## (Other): 739 56 : 21 Max. :35987
## NA's : 199 (Other):8380
most_frequent_deck <- train %>%
filter(!is.na(deck)) %>%
group_by (HomePlanet, deck) %>%
summarize(count = n()) %>%
arrange(HomePlanet, desc(count)) %>%
slice (1) %>%
ungroup()
## `summarise()` has grouped output by 'HomePlanet'. You can override using the
## `.groups` argument.
## # A tibble: 3 × 3
## HomePlanet deck count
## <fct> <fct> <int>
## 1 Earth G 2553
## 2 Europa B 771
## 3 Mars F 1110
train$HomePlanet <- as.character(train$HomePlanet)
train$deck <- as.character(train$deck)
train <- train %>%
mutate(deck = ifelse(is.na(deck) & HomePlanet == "Earth", "G",
ifelse(is.na(deck) & HomePlanet == "Europa", "B",
ifelse(is.na(deck) & HomePlanet == "Mars", "F", deck))))
test$HomePlanet <- as.character(test$HomePlanet)
test$deck <- as.character(test$deck)
test <- test %>%
mutate(deck = ifelse(is.na(deck) & HomePlanet == "Earth", "G",
ifelse(is.na(deck) & HomePlanet == "Europa", "B",
ifelse(is.na(deck) & HomePlanet == "Mars", "F", deck))))
most_frequent_side <- train %>%
filter(!is.na(deck)) %>%
group_by (HomePlanet, deck) %>%
summarize(count = n()) %>%
arrange(HomePlanet, desc(count)) %>%
slice (1) %>%
ungroup()
## `summarise()` has grouped output by 'HomePlanet'. You can override using the
## `.groups` argument.
## # A tibble: 3 × 3
## HomePlanet deck count
## <chr> <chr> <int>
## 1 Earth G 2652
## 2 Europa B 834
## 3 Mars F 1147
train$side <- as.character(train$side)
train <- train %>%
mutate(deck = ifelse(is.na(side) & HomePlanet == "Earth", "G",
ifelse(is.na(side) & HomePlanet == "Europa", "B",
ifelse(is.na(side) & HomePlanet == "Mars", "F", deck))))
test$side <- as.character(test$side)
test <- test %>%
mutate(deck = ifelse(is.na(side) & HomePlanet == "Earth", "G",
ifelse(is.na(side) & HomePlanet == "Europa", "B",
ifelse(is.na(side) & HomePlanet == "Mars", "F", deck))))
## # A tibble: 22 × 8
## variable type na na_pct unique min mean max
## <chr> <chr> <int> <dbl> <int> <dbl> <dbl> <dbl>
## 1 PassengerId fct 0 0 8693 NA NA NA
## 2 HomePlanet chr 0 0 3 NA NA NA
## 3 CryoSleep chr 0 0 2 NA NA NA
## 4 Cabin fct 199 2.3 6561 NA NA NA
## 5 Destination fct 0 0 3 NA NA NA
## 6 Age dbl 0 0 88 0 28.8 79
## 7 VIP lgl 0 0 2 0 0.02 1
## 8 RoomService dbl 0 0 1273 0 220. 14327
## 9 FoodCourt dbl 0 0 1507 0 448. 29813
## 10 ShoppingMall dbl 0 0 1115 0 170. 23492
## # ℹ 12 more rows
train <- train %>% select(-c("Cabin", "Name", "group", "pp", "num"))
## Adding missing grouping variables: `group`
test <- test %>% select(-c("Cabin", "Name", "group", "pp", "num"))
## Adding missing grouping variables: `group`
## # A tibble: 18 × 8
## variable type na na_pct unique min mean max
## <chr> <chr> <int> <dbl> <int> <dbl> <dbl> <dbl>
## 1 group fct 0 0 8693 NA NA NA
## 2 PassengerId fct 0 0 8693 NA NA NA
## 3 HomePlanet chr 0 0 3 NA NA NA
## 4 CryoSleep chr 0 0 2 NA NA NA
## 5 Destination fct 0 0 3 NA NA NA
## 6 Age dbl 0 0 88 0 28.8 79
## 7 VIP lgl 0 0 2 0 0.02 1
## 8 RoomService dbl 0 0 1273 0 220. 14327
## 9 FoodCourt dbl 0 0 1507 0 448. 29813
## 10 ShoppingMall dbl 0 0 1115 0 170. 23492
## 11 Spa dbl 0 0 1327 0 305. 22408
## 12 VRDeck dbl 0 0 1306 0 298. 24133
## 13 Transported lgl 0 0 2 0 0.5 1
## 14 withgroup dbl 0 0 1 0 0 0
## 15 deck chr 0 0 8 NA NA NA
## 16 side chr 0 0 3 NA NA NA
## 17 expense dbl 0 0 2336 0 1441. 35987
## 18 cryosleep fct 0 0 2 NA NA NA
train <- train %>% mutate_if(is.character,as.factor)
## `mutate_if()` ignored the following grouping variables:
## • Column `group`
test <- test %>% mutate_if(is.character,as.factor)
## `mutate_if()` ignored the following grouping variables:
## • Column `group`
train$PassengerId <- as.character(train$PassengerId)
test$PassengerId <- as.character(test$PassengerId)
## group PassengerId HomePlanet CryoSleep
## 0001_01: 1 Length:8693 Europa:2162 FALSE:5656
## 0002_01: 1 Class :character Earth :4772 TRUE :3037
## 0003_01: 1 Mode :character Mars :1759
## 0003_02: 1
## 0004_01: 1
## 0005_01: 1
## (Other):8687
## Destination Age VIP RoomService
## 55 Cancri e :1800 Min. : 0.00 Mode :logical Min. : 0
## PSO J318.5-22: 796 1st Qu.:20.00 FALSE:8494 1st Qu.: 0
## TRAPPIST-1e :6097 Median :27.00 TRUE :199 Median : 0
## Mean :28.83 Mean : 220
## 3rd Qu.:37.00 3rd Qu.: 41
## Max. :79.00 Max. :14327
##
## FoodCourt ShoppingMall Spa VRDeck
## Min. : 0.0 Min. : 0.0 Min. : 0.0 Min. : 0.0
## 1st Qu.: 0.0 1st Qu.: 0.0 1st Qu.: 0.0 1st Qu.: 0.0
## Median : 0.0 Median : 0.0 Median : 0.0 Median : 0.0
## Mean : 448.4 Mean : 169.6 Mean : 304.6 Mean : 298.3
## 3rd Qu.: 61.0 3rd Qu.: 22.0 3rd Qu.: 53.0 3rd Qu.: 40.0
## Max. :29813.0 Max. :23492.0 Max. :22408.0 Max. :24133.0
##
## Transported withgroup deck side expense
## Mode :logical Min. :0 F :2831 P:4206 Min. : 0
## FALSE:4315 1st Qu.:0 G :2658 S:4288 1st Qu.: 0
## TRUE :4378 Median :0 E : 876 : 199 Median : 716
## Mean :0 B : 842 Mean : 1441
## 3rd Qu.:0 C : 747 3rd Qu.: 1441
## Max. :0 D : 478 Max. :35987
## (Other): 261
## cryosleep
## FALSE:5656
## TRUE :3037
##
##
##
##
##
write.csv(train, "train_c.csv", row.names=FALSE)
write.csv(test, "test_c.csv", row.names=FALSE)
train <- read.csv("train_c.csv")
test <- read.csv("test_c.csv")
test$HomePlanet <- as.factor(test$HomePlanet)
test$Destination <- as.factor(test$Destination)
test$deck <- as.factor(test$deck)
test$side <- as.factor(test$side)
train$HomePlanet <- as.factor(train$HomePlanet)
train$Destination <- as.factor(train$Destination)
train$deck <- as.factor(train$deck)
train$side <- as.factor(train$side)
D <- train[,2:16] %>% mutate(across(everything(), ~as.integer(.)))
## Warning in cor(D): the standard deviation is zero
## corrplot 0.92 loaded

library(DataExplorer)
create_report(train)
model <- lm(Transported ~ ., data = train[, 2:16])
##
## Call:
## lm(formula = Transported ~ ., data = train[, 2:16])
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.40936 -0.31038 -0.03383 0.29023 1.78012
##
## Coefficients: (2 not defined because of singularities)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.562e-01 4.909e-02 7.256 4.33e-13 ***
## HomePlanetEuropa 2.057e-01 2.771e-02 7.422 1.26e-13 ***
## HomePlanetMars 9.750e-02 1.456e-02 6.698 2.25e-11 ***
## CryoSleepTRUE 3.817e-01 1.152e-02 33.134 < 2e-16 ***
## DestinationPSO J318.5-22 -4.337e-02 1.807e-02 -2.399 0.0164 *
## DestinationTRAPPIST-1e -4.628e-02 1.133e-02 -4.085 4.45e-05 ***
## Age -2.306e-03 3.141e-04 -7.341 2.31e-13 ***
## VIPTRUE -3.816e-02 2.976e-02 -1.282 0.1998
## RoomService -1.182e-04 7.056e-06 -16.757 < 2e-16 ***
## FoodCourt 4.283e-05 3.057e-06 14.009 < 2e-16 ***
## ShoppingMall 7.870e-05 7.460e-06 10.550 < 2e-16 ***
## Spa -8.640e-05 4.116e-06 -20.993 < 2e-16 ***
## VRDeck -8.271e-05 4.117e-06 -20.090 < 2e-16 ***
## withgroup NA NA NA NA
## deckB 1.215e-01 2.898e-02 4.191 2.80e-05 ***
## deckC 1.518e-01 2.928e-02 5.183 2.23e-07 ***
## deckD 4.223e-02 3.477e-02 1.214 0.2246
## deckE -2.038e-03 3.582e-02 -0.057 0.9546
## deckF 9.154e-02 3.656e-02 2.504 0.0123 *
## deckG 4.482e-02 3.810e-02 1.176 0.2395
## deckT 5.289e-02 1.816e-01 0.291 0.7709
## sideP -2.210e-02 2.955e-02 -0.748 0.4546
## sideS 6.413e-02 2.952e-02 2.172 0.0299 *
## expense NA NA NA NA
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.4015 on 8671 degrees of freedom
## Multiple R-squared: 0.3567, Adjusted R-squared: 0.3552
## F-statistic: 229 on 21 and 8671 DF, p-value: < 2.2e-16
library(caTools)
set.seed(123)
split = sample.split(train$Transported, SplitRatio = 0.75)
train_train = subset(train, split == TRUE)
train_test = subset(train, split == FALSE)
regresyon <- lm(Transported ~ ., data = train_train[, -c(1)])
##
## Call:
## lm(formula = Transported ~ ., data = train_train[, -c(1)])
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.48713 -0.31346 -0.03176 0.29206 1.75532
##
## Coefficients: (3 not defined because of singularities)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.751e-01 5.686e-02 6.596 4.56e-11 ***
## HomePlanetEuropa 1.886e-01 3.217e-02 5.861 4.82e-09 ***
## HomePlanetMars 8.668e-02 1.675e-02 5.174 2.36e-07 ***
## CryoSleepTRUE 3.776e-01 1.328e-02 28.445 < 2e-16 ***
## DestinationPSO J318.5-22 -5.948e-02 2.081e-02 -2.858 0.004279 **
## DestinationTRAPPIST-1e -4.993e-02 1.308e-02 -3.816 0.000137 ***
## Age -2.442e-03 3.650e-04 -6.690 2.42e-11 ***
## VIPTRUE -1.618e-02 3.385e-02 -0.478 0.632715
## RoomService -1.178e-04 7.789e-06 -15.118 < 2e-16 ***
## FoodCourt 3.890e-05 3.468e-06 11.216 < 2e-16 ***
## ShoppingMall 8.188e-05 8.413e-06 9.733 < 2e-16 ***
## Spa -8.526e-05 4.711e-06 -18.099 < 2e-16 ***
## VRDeck -8.201e-05 4.776e-06 -17.170 < 2e-16 ***
## withgroup NA NA NA NA
## deckB 1.155e-01 3.313e-02 3.485 0.000495 ***
## deckC 1.449e-01 3.345e-02 4.333 1.49e-05 ***
## deckD 2.188e-02 3.961e-02 0.552 0.580706
## deckE -3.674e-02 4.147e-02 -0.886 0.375642
## deckF 7.079e-02 4.208e-02 1.682 0.092551 .
## deckG 2.552e-02 4.388e-02 0.582 0.560908
## deckT 4.365e-02 1.822e-01 0.240 0.810637
## sideP -7.773e-03 3.454e-02 -0.225 0.821954
## sideS 8.234e-02 3.451e-02 2.386 0.017047 *
## expense NA NA NA NA
## cryosleepTRUE NA NA NA NA
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.4014 on 6498 degrees of freedom
## Multiple R-squared: 0.3577, Adjusted R-squared: 0.3556
## F-statistic: 172.3 on 21 and 6498 DF, p-value: < 2.2e-16
reg_tahmin = predict(regresyon, newdata = train_test[, -c(1,12)])
reg_transported_tahmin <- ifelse(reg_tahmin > 0.5, 1, 0)
transported_gercek <- ifelse(train_test[12] == TRUE, 1,0)
cm = table(transported_gercek, reg_transported_tahmin)
## reg_transported_tahmin
## transported_gercek 0 1
## 0 905 174
## 1 325 769
(823 + 917)/(823 + 256 + 177 + 917)
## [1] 0.8007363
reg_tahmin = predict(regresyon, newdata = train_train[, -c(1,12)])
reg_transported_tahmin <- ifelse(reg_tahmin > 0.5, 1, 0)
transported_gercek <- ifelse(train_train[12] == TRUE, 1,0)
cm = table(transported_gercek, reg_transported_tahmin)
## reg_transported_tahmin
## transported_gercek 0 1
## 0 2680 556
## 1 947 2337
( 2680 + 2337)/(2680 + 556 + 947 + 2337)
## [1] 0.7694785
reg_tahmin_bd = predict(model, newdata = test[, -c(1)])
reg_transported_test_tahmin <- ifelse(reg_tahmin_bd > 0.5, TRUE, FALSE)
Transported <- as.character(reg_transported_test_tahmin)
PassengerId <- test$PassengerId
Transported<-as.vector(Transported)
submission_regrasyon <- cbind(PassengerId, Transported)
submission_regrasyon <- as.data.frame(submission_regrasyon)
submission_regrasyon$Transported <- str_to_title(submission_regrasyon$Transported)
write.csv(submission_regrasyon, "siniftahmini.csv", row.names = FALSE, quote = FALSE)
## Loading required package: rJava
## Loading required package: leaps
regresyon_opt <- glmulti(Transported ~ HomePlanet + Cryosleep + Destination + Age + VIP + RoomService + FoodCourt + ShoppingMall + Spa + VRDeck + withgroup + deck + side, + level = 1, crit = bic, data =train)
modelglmulti <- lm(Transported ~ 1 + HomePlanet + Destination + deck + side + Cryosleep + Age + RoomService + FoodCourt + ShoppingMall + Spa + VRDeck, data = train )
reg_tahmin_glmulti = predict(modelglmulti, newdata = test[, -c(1)])
reg_transported_test_tahmin_glmulti <- ifelse(reg_tahmin_glmulti > 0.5, TRUE, FALSE)
Transported <- as.character(reg_transported_test_tahmin_glmulti)
PassengerId <- test$PassengerId
Transported<-as.vector(Transported)
submission_regrasyon_glmulti <- cbind(PassengerId, Transported)
submission_regrasyon_glmulti <- as.data.frame(submission_regrasyon_glmulti)
submission_regrasyon_glmulti$Transported <- str_to_title(submission_regrasyon_glmulti$Transpored)
write.csv(submission_regrasyon_glmulti, "submission_regrasyon_glmulti.csv", row.names =FALSE, quote=FALSE)
train_log <- train %>%
mutate_at(c(5, 7:11, 16), ~log(1 + .))
test_log <- test %>%
mutate_at(c(5, 7:11, 16), ~log(1 + .))
modellog <- lm(Transported ~ 1 + HomePlanet + Destination + deck + side + cryosleep +Age + RoomService + FoodCourt + ShoppingMall + Spa + VRDeck, data = train_log)
reg_tahmin_log = predict(modellog, newdata = test_log[, -c(1)])
reg_transported_test_tahmin_log <- ifelse(reg_tahmin_log > 0.5, TRUE, FALSE)
Transported <- as.character(reg_transported_test_tahmin_log)
PassengerId <- test$PassengerId
Transported<-as.vector(Transported)
submission_regrasyon_log <- cbind(PassengerId, Transported)
submission_regrasyon_log <- as.data.frame(submission_regrasyon_log)
submission_regrasyon_log$Transported <- str_to_title(submission_regrasyon_log$Transported)
write.csv(submission_regrasyon_log, "submission_regrasyon_log_csv", row.names =FALSE, quote=FALSE)
logistic = glm(formula = Transported ~ .,
family = binomial,
data = train_train[, -c(1)])
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
##
## Call:
## glm(formula = Transported ~ ., family = binomial, data = train_train[,
## -c(1)])
##
## Coefficients: (3 not defined because of singularities)
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -6.785e-02 4.240e-01 -0.160 0.872875
## HomePlanetEuropa 1.386e+00 2.512e-01 5.519 3.41e-08 ***
## HomePlanetMars 5.227e-01 1.083e-01 4.824 1.41e-06 ***
## CryoSleepTRUE 1.294e+00 9.154e-02 14.139 < 2e-16 ***
## DestinationPSO J318.5-22 -5.017e-01 1.297e-01 -3.869 0.000109 ***
## DestinationTRAPPIST-1e -4.540e-01 9.401e-02 -4.829 1.37e-06 ***
## Age -9.359e-03 2.397e-03 -3.905 9.42e-05 ***
## VIPTRUE -1.870e-01 2.935e-01 -0.637 0.524085
## RoomService -1.731e-03 1.135e-04 -15.246 < 2e-16 ***
## FoodCourt 4.557e-04 4.444e-05 10.255 < 2e-16 ***
## ShoppingMall 5.211e-04 7.524e-05 6.927 4.30e-12 ***
## Spa -1.951e-03 1.161e-04 -16.810 < 2e-16 ***
## VRDeck -1.879e-03 1.157e-04 -16.243 < 2e-16 ***
## withgroup NA NA NA NA
## deckB 1.260e+00 2.916e-01 4.320 1.56e-05 ***
## deckC 2.398e+00 3.301e-01 7.266 3.71e-13 ***
## deckD 5.535e-01 3.191e-01 1.735 0.082774 .
## deckE -7.900e-02 3.255e-01 -0.243 0.808213
## deckF 5.733e-01 3.287e-01 1.744 0.081110 .
## deckG 1.611e-01 3.378e-01 0.477 0.633416
## deckT -2.762e-01 1.816e+00 -0.152 0.879136
## sideP -1.471e-01 2.452e-01 -0.600 0.548453
## sideS 4.591e-01 2.455e-01 1.870 0.061496 .
## expense NA NA NA NA
## cryosleepTRUE NA NA NA NA
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 9038.3 on 6519 degrees of freedom
## Residual deviance: 5612.7 on 6498 degrees of freedom
## AIC: 5656.7
##
## Number of Fisher Scoring iterations: 7
logistic_tahmin = predict(logistic, newdata = train_test[, -c(1,12)])
## 2 3 7 14 16 21
## -1.0391743 -11.0422312 0.8975435 -1.6060140 -0.2496921 -1.7609533
logistic_transported_tahmin <- ifelse(logistic_tahmin > 0.5, 1, 0)
transported_gercek <- ifelse(train_test[12] == TRUE,1,0)
## ── Attaching packages ────────────────────────────────────── tidymodels 1.2.0 ──
## ✔ broom 1.0.5 ✔ rsample 1.2.1
## ✔ dials 1.2.1 ✔ tune 1.2.1
## ✔ infer 1.0.7 ✔ workflows 1.1.4
## ✔ modeldata 1.3.0 ✔ workflowsets 1.1.0
## ✔ parsnip 1.2.1 ✔ yardstick 1.3.1
## ✔ recipes 1.0.10
## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ scales::discard() masks purrr::discard()
## ✖ dplyr::filter() masks stats::filter()
## ✖ recipes::fixed() masks stringr::fixed()
## ✖ dplyr::lag() masks stats::lag()
## ✖ yardstick::spec() masks readr::spec()
## ✖ recipes::step() masks stats::step()
## • Dig deeper into tidy modeling with R at https://www.tmwr.org
result = data.frame(cbind(transported_gercek, logistic_transported_tahmin))
result$Transported <- as.factor(result$Transported)
result$logistic_transported_tahmin <- as.factor(result$logistic_transported_tahmin)
accuracy(result,truth = Transported, estimate = logistic_transported_tahmin)
## # A tibble: 1 × 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 accuracy binary 0.785
conf_mat(result, truth = Transported, estimate = logistic_transported_tahmin)
## Truth
## Prediction 0 1
## 0 921 310
## 1 158 784
## Loading required package: lattice
##
## Attaching package: 'caret'
## The following objects are masked from 'package:yardstick':
##
## precision, recall, sensitivity, specificity
## The following object is masked from 'package:purrr':
##
## lift
cm = table(transported_gercek, logistic_transported_tahmin)
## logistic_transported_tahmin
## transported_gercek 0 1
## 0 921 158
## 1 310 784
(921 + 784)/ (921 + 158 + 310 + 784)
## [1] 0.7846295
confusionMatrix(as.factor(transported_gercek), as.factor(logistic_transported_tahmin))
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 921 158
## 1 310 784
##
## Accuracy : 0.7846
## 95% CI : (0.7667, 0.8018)
## No Information Rate : 0.5665
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.5697
##
## Mcnemar's Test P-Value : 2.952e-12
##
## Sensitivity : 0.7482
## Specificity : 0.8323
## Pos Pred Value : 0.8536
## Neg Pred Value : 0.7166
## Prevalence : 0.5665
## Detection Rate : 0.4238
## Detection Prevalence : 0.4965
## Balanced Accuracy : 0.7902
##
## 'Positive' Class : 0
##
logistic_bd = glm(formula = Transported ~ .,
family = binomial,
data = train[, -c(1)])
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
logistic_tahmin_bd = predict(logistic_bd, newdata = test[, -c(1)])
logistic_transported_test_tahmin <- ifelse(logistic_tahmin_bd > 0.5, TRUE, FALSE)
Transported <- as.character(logistic_transported_test_tahmin)
PassengerId <- test$PassengerId
Transported<-as.vector(Transported)
submision_logistic <- cbind(PassengerId, Transported)
submision_logistic <- as.data.frame(submision_logistic)
submision_logistic$Transported <-
str_to_title(submision_logistic$Transported)
write.csv(submision_logistic, "submision_logistic.csv", row.names =FALSE, quote = FALSE)
##
## Attaching package: 'e1071'
## The following object is masked from 'package:tune':
##
## tune
## The following object is masked from 'package:rsample':
##
## permutations
## The following object is masked from 'package:parsnip':
##
## tune
fit_nb <- naiveBayes(Transported ~ ., data =train_train[, -1])
##
## Naive Bayes Classifier for Discrete Predictors
##
## Call:
## naiveBayes.default(x = X, y = Y, laplace = laplace)
##
## A-priori probabilities:
## Y
## FALSE TRUE
## 0.496319 0.503681
##
## Conditional probabilities:
## HomePlanet
## Y Earth Europa Mars
## FALSE 0.6251545 0.1749073 0.1999382
## TRUE 0.4649817 0.3285627 0.2064555
##
## CryoSleep
## Y FALSE TRUE
## FALSE 0.8702101 0.1297899
## TRUE 0.4336175 0.5663825
##
## Destination
## Y 55 Cancri e PSO J318.5-22 TRAPPIST-1e
## FALSE 0.16069221 0.09363412 0.74567367
## TRUE 0.24969549 0.09043849 0.65986602
##
## Age
## Y [,1] [,2]
## FALSE 30.01152 13.45216
## TRUE 27.84835 14.87502
##
## VIP
## Y FALSE TRUE
## FALSE 0.97126082 0.02873918
## TRUE 0.98142509 0.01857491
##
## RoomService
## Y [,1] [,2]
## FALSE 402.20365 916.6547
## TRUE 56.93484 246.8105
##
## FoodCourt
## Y [,1] [,2]
## FALSE 396.2923 1258.123
## TRUE 514.8018 1918.620
##
## ShoppingMall
## Y [,1] [,2]
## FALSE 160.2250 432.1103
## TRUE 182.7135 748.5867
##
## Spa
## Y [,1] [,2]
## FALSE 561.16193 1554.2636
## TRUE 61.11571 264.0556
##
## VRDeck
## Y [,1] [,2]
## FALSE 533.98733 1542.8703
## TRUE 69.07186 295.0542
##
## withgroup
## Y [,1] [,2]
## FALSE 0 0
## TRUE 0 0
##
## deck
## Y A B C D E
## FALSE 0.0296662546 0.0553152040 0.0568603214 0.0655129790 0.1338071693
## TRUE 0.0301461632 0.1419001218 0.1178440926 0.0484165652 0.0672959805
## deck
## Y F G T
## FALSE 0.3634116193 0.2941903585 0.0012360939
## TRUE 0.2834957369 0.3105968331 0.0003045067
##
## side
## Y P S
## FALSE 0.02348578 0.54079110 0.43572311
## TRUE 0.02131547 0.43300853 0.54567600
##
## expense
## Y [,1] [,2]
## FALSE 2053.8702 3224.219
## TRUE 884.6376 2307.962
##
## cryosleep
## Y FALSE TRUE
## FALSE 0.8702101 0.1297899
## TRUE 0.4336175 0.5663825
pred_nb <- predict(fit_nb, newdata = train_test[, -c(1, 12)], type = "raw") %>% data.frame()
## FALSE. TRUE.
## 1 0.32131320 6.786868e-01
## 2 1.00000000 3.298257e-135
## 3 0.06619047 9.338095e-01
## 4 0.70820285 2.917971e-01
## 5 0.03490270 9.650973e-01
## 6 0.61151399 3.884860e-01
Transported_pred_nb = ifelse(pred_nb$TRUE. > 0.5, 1, 0)
Transported_test_train <- ifelse(train_test[12] == TRUE, 1, 0)
head(Transported_test_train)
## Transported
## 2 1
## 3 0
## 7 1
## 14 0
## 16 0
## 21 0
cm= table(Transported_test_train, Transported_pred_nb)
## Transported_pred_nb
## Transported_test_train 0 1
## 0 530 549
## 1 86 1008
(530 + 549) / (530 + 549 + 86 + 1008)
## [1] 0.4965486
fit_svm <- svm(Transported ~ ., data = train_train[, -1], type = 'C-classification')
## Warning in svm.default(x, y, scale = scale, ..., na.action = na.action):
## Variable(s) 'withgroup' constant. Cannot scale data.
preds <- predict(fit_svm, newdata = train_test[, -c(1, 12)], type = "raw") %>% data.frame()
## .
## 2 FALSE
## 3 FALSE
## 7 FALSE
## 14 FALSE
## 16 FALSE
## 21 FALSE
Transported_pred_svm = ifelse(preds$. == TRUE, 1, 0)
cm = table(Transported_test_train, Transported_pred_svm)
## Transported_pred_svm
## Transported_test_train 0 1
## 0 897 182
## 1 378 716
(897 + 182) / (897 + 182 + 378 + 812)
## [1] 0.4755399
fit_svm <- svm(Transported ~ ., data = train[, -1],
type = 'C-classification')
## Warning in svm.default(x, y, scale = scale, ..., na.action = na.action):
## Variable(s) 'withgroup' constant. Cannot scale data.
preds <- predict(fit_svm, newdata = train_test[, -c(1, 12)], type = "raw") %>% data.frame()
Transported_pred_svm = ifelse(preds$. == TRUE, TRUE, FALSE)
Transported <- as.character(Transported_pred_svm)
PassengerId <- test$PassengerId
Transported <- as.vector(Transported)
sample_submision <- cbind(PassengerId, Transported)
## Warning in cbind(PassengerId, Transported): number of rows of result is not a
## multiple of vector length (arg 2)
sample_submision <- as.data.frame(sample_submision)
write.csv(sample_submision, "sub_svm_csv", row.names = FALSE, quote = FALSE)
fit_svm <- svm(Transported ~ ., data = train_train[, -1],
type= 'C-classification',
kernel = 'radial' )
## Warning in svm.default(x, y, scale = scale, ..., na.action = na.action):
## Variable(s) 'withgroup' constant. Cannot scale data.
preds <- predict(fit_svm, newdata= train_test[, -c(1,12)], type= "raw") %>% data.frame()
Transported_pred_svm = ifelse(preds$.== TRUE, 1,0)
cm = table(Transported_test_train, Transported_pred_svm)
## Transported_pred_svm
## Transported_test_train 0 1
## 0 897 182
## 1 378 716
(897 + 182) / (897 + 182 + 378 + 812)
## [1] 0.4755399
fit_svm <- svm(Transported ~ ., data = train[, -1],
type= 'C-classification',
kernel = 'radial' )
## Warning in svm.default(x, y, scale = scale, ..., na.action = na.action):
## Variable(s) 'withgroup' constant. Cannot scale data.
preds <- predict(fit_svm, newdata= test, type= "raw") %>% data.frame()
Transported_pred_svm = ifelse(preds$.== TRUE,TRUE,FALSE)
Transported <- as.character(Transported_pred_svm)
PassengerId <- test$PassengerId
Transported <- as.vector(Transported)
sample_submission <- cbind(PassengerId,Transported)
sample_submission <- as.data.frame(sample_submission)
sample_submission$Transported <- str_to_title(sample_submission$Transported)
write.csv(sample_submission,"sub_svm_radial.csv",row.names = FALSE, quote = FALSE)
P <- ggplot(train_train,aes(x=HomePlanet, y=deck,color=factor(Transported))) +
geom_point(aes(shape=factor(Transported)), size=3) +
scale_color_viridis_d() +
labs(title = "", x="HomePlanet", y="deck") +
theme_minimal() +
theme(legend.position = "top")
P

##
## Attaching package: 'rpart'
## The following object is masked from 'package:dials':
##
## prune
library(rpart.plot)
library(randomForest)
## randomForest 4.7-1.1
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:dplyr':
##
## combine
## The following object is masked from 'package:ggplot2':
##
## margin
fit_tree <- rpart::rpart(Transported ~ ., data = train_train[, -1])
summary(fit_tree)
## Call:
## rpart::rpart(formula = Transported ~ ., data = train_train[,
## -1])
## n= 6520
##
## CP nsplit rel error xerror xstd
## 1 0.23321359 0 1.0000000 1.0001811 0.0001977604
## 2 0.04981598 1 0.7667864 0.7671015 0.0103212807
## 3 0.02807107 2 0.7169704 0.7173428 0.0090686704
## 4 0.02784643 3 0.6888994 0.7031918 0.0095398822
## 5 0.01770479 4 0.6610529 0.6654236 0.0096767137
## 6 0.01381587 5 0.6433481 0.6488743 0.0100411278
## 7 0.01194576 6 0.6295323 0.6381676 0.0101816720
## 8 0.01164524 7 0.6175865 0.6303299 0.0102037441
## 9 0.01000000 8 0.6059413 0.6235169 0.0102521030
##
## Variable importance
## expense cryosleep CryoSleep FoodCourt Spa VRDeck
## 21 16 16 12 11 10
## deck HomePlanet ShoppingMall Age Destination
## 5 5 2 1 1
##
## Node number 1: 6520 observations, complexity param=0.2332136
## mean=0.503681, MSE=0.2499865
## left son=2 (3795 obs) right son=3 (2725 obs)
## Primary splits:
## expense < 0.5 to the right, improve=0.2332136, (0 missing)
## CryoSleep < 0.5 to the left, improve=0.2095384, (0 missing)
## cryosleep < 0.5 to the left, improve=0.2095384, (0 missing)
## RoomService < 0.5 to the right, improve=0.1254121, (0 missing)
## Spa < 0.5 to the right, improve=0.1142180, (0 missing)
## Surrogate splits:
## CryoSleep < 0.5 to the left, agree=0.932, adj=0.837, (0 split)
## cryosleep < 0.5 to the left, agree=0.932, adj=0.837, (0 split)
## Spa < 0.5 to the right, agree=0.784, adj=0.484, (0 split)
## FoodCourt < 0.5 to the right, agree=0.770, adj=0.451, (0 split)
## VRDeck < 0.5 to the right, agree=0.768, adj=0.444, (0 split)
##
## Node number 2: 3795 observations, complexity param=0.02807107
## mean=0.2990777, MSE=0.2096302
## left son=4 (3243 obs) right son=5 (552 obs)
## Primary splits:
## FoodCourt < 1331 to the left, improve=0.05751186, (0 missing)
## ShoppingMall < 627.5 to the left, improve=0.04530804, (0 missing)
## RoomService < 365.5 to the right, improve=0.04440387, (0 missing)
## Spa < 257.5 to the right, improve=0.03347453, (0 missing)
## VRDeck < 721 to the right, improve=0.02290457, (0 missing)
## Surrogate splits:
## expense < 5981 to the left, agree=0.885, adj=0.210, (0 split)
## deck splits as RRRLLLLL, agree=0.884, adj=0.201, (0 split)
## HomePlanet splits as LRL, agree=0.878, adj=0.161, (0 split)
## Spa < 8955.5 to the left, agree=0.856, adj=0.009, (0 split)
## VRDeck < 11692 to the left, agree=0.856, adj=0.009, (0 split)
##
## Node number 3: 2725 observations, complexity param=0.04981598
## mean=0.7886239, MSE=0.1666963
## left son=6 (1449 obs) right son=7 (1276 obs)
## Primary splits:
## deck splits as RRRRLRL-, improve=0.17874760, (0 missing)
## HomePlanet splits as LRR, improve=0.12440710, (0 missing)
## Destination splits as RLL, improve=0.02625136, (0 missing)
## CryoSleep < 0.5 to the left, improve=0.02268236, (0 missing)
## cryosleep < 0.5 to the left, improve=0.02268236, (0 missing)
## Surrogate splits:
## HomePlanet splits as LRR, agree=0.933, adj=0.857, (0 split)
## Age < 24.5 to the left, agree=0.625, adj=0.200, (0 split)
## Destination splits as RLL, agree=0.591, adj=0.126, (0 split)
## VIP < 0.5 to the left, agree=0.538, adj=0.014, (0 split)
## side splits as RLL, agree=0.533, adj=0.002, (0 split)
##
## Node number 4: 3243 observations, complexity param=0.02784643
## mean=0.2537774, MSE=0.1893744
## left son=8 (2577 obs) right son=9 (666 obs)
## Primary splits:
## ShoppingMall < 541.5 to the left, improve=0.07390355, (0 missing)
## RoomService < 365.5 to the right, improve=0.03464407, (0 missing)
## Spa < 240.5 to the right, improve=0.03327259, (0 missing)
## VRDeck < 114 to the right, improve=0.02784287, (0 missing)
## expense < 2867.5 to the right, improve=0.01811461, (0 missing)
## Surrogate splits:
## expense < 18644 to the left, agree=0.795, adj=0.003, (0 split)
##
## Node number 5: 552 observations, complexity param=0.01770479
## mean=0.5652174, MSE=0.2457467
## left son=10 (123 obs) right son=11 (429 obs)
## Primary splits:
## Spa < 1372.5 to the right, improve=0.21272970, (0 missing)
## VRDeck < 1063.5 to the right, improve=0.17089500, (0 missing)
## expense < 5395 to the right, improve=0.06611166, (0 missing)
## deck splits as LLRLLRRL, improve=0.02812225, (0 missing)
## side splits as LLR, improve=0.02807513, (0 missing)
## Surrogate splits:
## expense < 12647 to the right, agree=0.790, adj=0.057, (0 split)
## Age < 13.5 to the left, agree=0.779, adj=0.008, (0 split)
## RoomService < 3895.5 to the right, agree=0.779, adj=0.008, (0 split)
##
## Node number 6: 1449 observations
## mean=0.6266391, MSE=0.2339625
##
## Node number 7: 1276 observations
## mean=0.9725705, MSE=0.02667709
##
## Node number 8: 2577 observations, complexity param=0.01194576
## mean=0.193636, MSE=0.1561411
## left son=16 (2067 obs) right son=17 (510 obs)
## Primary splits:
## FoodCourt < 456.5 to the left, improve=0.04838893, (0 missing)
## expense < 1447.5 to the right, improve=0.04016842, (0 missing)
## HomePlanet splits as RLL, improve=0.02438006, (0 missing)
## Spa < 537.5 to the right, improve=0.01895890, (0 missing)
## RoomService < 400.5 to the right, improve=0.01706521, (0 missing)
## Surrogate splits:
## expense < 12373 to the left, agree=0.804, adj=0.008, (0 split)
## Spa < 13650 to the left, agree=0.803, adj=0.006, (0 split)
## VRDeck < 10123.5 to the left, agree=0.802, adj=0.002, (0 split)
## deck splits as LLLLLLLR, agree=0.802, adj=0.002, (0 split)
##
## Node number 9: 666 observations
## mean=0.4864865, MSE=0.2498174
##
## Node number 10: 123 observations
## mean=0.1382114, MSE=0.119109
##
## Node number 11: 429 observations, complexity param=0.01381587
## mean=0.6876457, MSE=0.2147891
## left son=22 (143 obs) right son=23 (286 obs)
## Primary splits:
## VRDeck < 611 to the right, improve=0.24438400, (0 missing)
## Spa < 225 to the right, improve=0.05300377, (0 missing)
## FoodCourt < 3119.5 to the left, improve=0.05168044, (0 missing)
## side splits as LLR, improve=0.04323810, (0 missing)
## RoomService < 1719.5 to the right, improve=0.03930897, (0 missing)
## Surrogate splits:
## expense < 6032 to the right, agree=0.702, adj=0.105, (0 split)
## Age < 53.5 to the right, agree=0.674, adj=0.021, (0 split)
## FoodCourt < 12128.5 to the right, agree=0.671, adj=0.014, (0 split)
##
## Node number 16: 2067 observations
## mean=0.1504596, MSE=0.1278215
##
## Node number 17: 510 observations, complexity param=0.01164524
## mean=0.3686275, MSE=0.2327413
## left son=34 (204 obs) right son=35 (306 obs)
## Primary splits:
## expense < 1447.5 to the right, improve=0.15990760, (0 missing)
## VRDeck < 86.5 to the right, improve=0.10060710, (0 missing)
## HomePlanet splits as RLL, improve=0.07491751, (0 missing)
## Spa < 500 to the right, improve=0.07353369, (0 missing)
## deck splits as LLLLRRRL, improve=0.05075444, (0 missing)
## Surrogate splits:
## HomePlanet splits as RLL, agree=0.867, adj=0.667, (0 split)
## deck splits as LLLLRRRL, agree=0.839, adj=0.598, (0 split)
## VRDeck < 213.5 to the right, agree=0.818, adj=0.544, (0 split)
## Spa < 219.5 to the right, agree=0.792, adj=0.480, (0 split)
## FoodCourt < 907 to the right, agree=0.722, adj=0.304, (0 split)
##
## Node number 22: 143 observations
## mean=0.3636364, MSE=0.231405
##
## Node number 23: 286 observations
## mean=0.8496503, MSE=0.1277446
##
## Node number 34: 204 observations
## mean=0.1323529, MSE=0.1148356
##
## Node number 35: 306 observations
## mean=0.5261438, MSE=0.2493165

preds= predict(fit_tree,newdata = train_test[, -c(1,12)]) %>%
data.frame()
## .
## 2 0.1504596
## 3 0.1382114
## 7 0.8496503
## 14 0.1504596
## 16 0.4864865
## 21 0.1504596
Transported_pred_tree = ifelse(preds$. >0.5, 1, 0)
cm = table(Transported_test_train, Transported_pred_tree)
## Transported_pred_tree
## Transported_test_train 0 1
## 0 807 272
## 1 237 857
(807 + 272) / (807 + 272 + 237 + 272)
## [1] 0.679471
fit_tree <- rpart(Transported ~ ., data = train[, -1])
preds <- predict(fit_tree, newdata = test) %>%
data.frame()
Transported_pred_tree = ifelse(preds$. > 0.5, TRUE, FALSE)
Transported <- as.character(Transported_pred_tree)
PassengerId <- test$PassengerId
Transported <- as.vector(Transported)
sample_submission <- cbind(PassengerId, Transported)
sample_submission <- as.data.frame(sample_submission)
sample_submission$Transported <- str_to_title(sample_submission$Transported)
write.csv(sample_submission, "sub_tree.csv", row.names = FALSE, quote = FALSE)
fit_forest <- randomForest(Transported ~ ., data = train_train[, -1])
## Warning in randomForest.default(m, y, ...): The response has five or fewer
## unique values. Are you sure you want to do regression?
## IncNodePurity
## HomePlanet 39.895123
## CryoSleep 80.141134
## Destination 25.772116
## Age 94.120805
## VIP 2.593687
## RoomService 108.064499
## FoodCourt 116.320446
## ShoppingMall 93.053181
## Spa 115.939271
## VRDeck 105.156138
## withgroup 0.000000
## deck 97.745154
## side 26.504009
## expense 250.784767
## cryosleep 84.477854

preds = predict(fit_forest, newdata = train_test[, -c(1,12)]) %>%
data.frame()
## .
## 2 0.09394136
## 3 0.13646667
## 7 0.80923810
## 14 0.22278600
## 16 0.72546365
## 21 0.15576667
Transported_pred_forest = ifelse(preds$. >0.5, 1, 0)
cm = table(Transported_test_train, Transported_pred_forest)
## Transported_pred_forest
## Transported_test_train 0 1
## 0 823 256
## 1 177 917
( 820 + 259 ) / ( 820 + 259 + 176 + 918)
## [1] 0.4965486
fit_forest <- randomForest(Transported ~ ., data = train[, -1])
## Warning in randomForest.default(m, y, ...): The response has five or fewer
## unique values. Are you sure you want to do regression?
preds <- predict(fit_forest, newdata = test) %>%
data.frame()
Transported <- as.character(Transported_pred_tree)
PassengerId <- test$PassengerId
Transported <- as.vector(Transported)
sample_submission <- cbind(PassengerId, Transported)
sample_submission <- as.data.frame(sample_submission)
sample_submission$Transported <- str_to_title(sample_submission$Transported)
write.csv(sample_submission, "sub_forest.csv", row.names = FALSE, quote = FALSE)