# Hide warnings and messages in every rendered chunk of the report.
knitr::opts_chunk$set(warning = FALSE, message = FALSE)
library(tidyverse)
library(skimr)
# Read the test and training CSV files into data frames.
# NOTE(review): both paths are absolute and machine-specific; the script
# only runs where these files exist at exactly these locations.
test <- read.csv("D:\\wallpapers and photos\\test.csv")
train <- read.csv("D:\\wallpapers and photos\\train.csv")

# List the column names of the training set.
colnames(train)
## [1] "PassengerId" "Survived" "Pclass" "Name" "Sex"
## [6] "Age" "SibSp" "Parch" "Ticket" "Fare"
## [11] "Cabin" "Embarked"
[The mean can be distorted by outliers, so exploratory data analysis (EDA) is important before imputing the data.]
library(naniar)
# Plot which cells of the training set are missing.
train %>% vis_miss()

# Mean imputation: every missing Age becomes the mean of the observed ages.
train$Age <- replace(train$Age, is.na(train$Age), mean(train$Age, na.rm = TRUE))

# Per-column summary of the training set after imputation.
skim(train)
| Name | train |
| Number of rows | 891 |
| Number of columns | 12 |
| _______________________ | |
| Column type frequency: | |
| character | 5 |
| numeric | 7 |
| ________________________ | |
| Group variables | None |
Variable type: character
| skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
|---|---|---|---|---|---|---|---|
| Name | 0 | 1 | 12 | 82 | 0 | 891 | 0 |
| Sex | 0 | 1 | 4 | 6 | 0 | 2 | 0 |
| Ticket | 0 | 1 | 3 | 18 | 0 | 681 | 0 |
| Cabin | 0 | 1 | 0 | 15 | 687 | 148 | 0 |
| Embarked | 0 | 1 | 0 | 1 | 2 | 4 | 0 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| PassengerId | 0 | 1 | 446.00 | 257.35 | 1.00 | 223.50 | 446.00 | 668.5 | 891.00 | ▇▇▇▇▇ |
| Survived | 0 | 1 | 0.38 | 0.49 | 0.00 | 0.00 | 0.00 | 1.0 | 1.00 | ▇▁▁▁▅ |
| Pclass | 0 | 1 | 2.31 | 0.84 | 1.00 | 2.00 | 3.00 | 3.0 | 3.00 | ▃▁▃▁▇ |
| Age | 0 | 1 | 29.70 | 13.00 | 0.42 | 22.00 | 29.70 | 35.0 | 80.00 | ▂▇▃▁▁ |
| SibSp | 0 | 1 | 0.52 | 1.10 | 0.00 | 0.00 | 0.00 | 1.0 | 8.00 | ▇▁▁▁▁ |
| Parch | 0 | 1 | 0.38 | 0.81 | 0.00 | 0.00 | 0.00 | 0.0 | 6.00 | ▇▁▁▁▁ |
| Fare | 0 | 1 | 32.20 | 49.69 | 0.00 | 7.91 | 14.45 | 31.0 | 512.33 | ▇▁▁▁▁ |
# Redraw the missingness plot now that Age has been imputed.
train %>% vis_miss()
Visualising the outlier
# Histogram of Fare, to make the extreme values in the tail visible.
ggplot(select(train, Fare), aes(x = Fare)) +
  geom_histogram()

# Keep only passengers whose Fare is below 500; filter() also drops any
# row where the comparison is NA.
train <- filter(train, Fare < 500)

# Summarise again to see the effect of removing the outliers.
skim(train)
| Name | train |
| Number of rows | 888 |
| Number of columns | 12 |
| _______________________ | |
| Column type frequency: | |
| character | 5 |
| numeric | 7 |
| ________________________ | |
| Group variables | None |
Variable type: character
| skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
|---|---|---|---|---|---|---|---|
| Name | 0 | 1 | 12 | 82 | 0 | 888 | 0 |
| Sex | 0 | 1 | 4 | 6 | 0 | 2 | 0 |
| Ticket | 0 | 1 | 3 | 18 | 0 | 680 | 0 |
| Cabin | 0 | 1 | 0 | 15 | 686 | 147 | 0 |
| Embarked | 0 | 1 | 0 | 1 | 2 | 4 | 0 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| PassengerId | 0 | 1 | 445.62 | 257.41 | 1.00 | 222.75 | 445.50 | 667.25 | 891 | ▇▇▇▇▇ |
| Survived | 0 | 1 | 0.38 | 0.49 | 0.00 | 0.00 | 0.00 | 1.00 | 1 | ▇▁▁▁▅ |
| Pclass | 0 | 1 | 2.31 | 0.83 | 1.00 | 2.00 | 3.00 | 3.00 | 3 | ▃▁▃▁▇ |
| Age | 0 | 1 | 29.68 | 13.02 | 0.42 | 22.00 | 29.70 | 35.00 | 80 | ▂▇▃▁▁ |
| SibSp | 0 | 1 | 0.52 | 1.10 | 0.00 | 0.00 | 0.00 | 1.00 | 8 | ▇▁▁▁▁ |
| Parch | 0 | 1 | 0.38 | 0.81 | 0.00 | 0.00 | 0.00 | 0.00 | 6 | ▇▁▁▁▁ |
| Fare | 0 | 1 | 30.58 | 41.18 | 0.00 | 7.90 | 14.45 | 30.77 | 263 | ▇▁▁▁▁ |
# Distinct values stored in Embarked (before empty strings are recoded).
unique(train[["Embarked"]])
## [1] "S" "C" "Q" ""
Changing missing values (stored as empty strings) to NA
# Recode empty strings as NA so that missing text values are recognised by
# is.na(), na.omit() and the missingness plots.
#
# Only character columns are touched: an empty string can only occur there,
# and dplyr >= 1.1.0 makes na_if() cast its second argument to the type of
# the column, so applying "" to the numeric columns (as mutate_all() did)
# raises a type error. across() also replaces the superseded mutate_all().
train <- train %>%
  mutate(across(where(is.character), ~ na_if(.x, "")))
See the result: the empty string is now NA :)
# Distinct values stored in Embarked after the recoding step.
unique(train[["Embarked"]])
## [1] "S" "C" "Q" NA
Correlation plots can be very helpful for choosing which variables to use before modeling.
# Build the data frame used for the exploratory plots.
#   Embarked: missing ports are labelled "unknown"; known ports keep their
#             original code. The false branch must be the Embarked column
#             itself -- the quoted string "Embarked" would overwrite every
#             known port with the literal text "Embarked".
#   Cabin:    reduced to a flag, "null" when missing and "known" otherwise.
#   Survived: converted to a factor so ggplot treats it as discrete.
data <- train %>%
  mutate(
    Embarked = if_else(is.na(Embarked), "unknown", Embarked),
    Cabin = if_else(is.na(Cabin), "null", "known"),
    Survived = as.factor(Survived)
  )
library(DataExplorer)
# Correlation plot computed on the rows of `data` that have no NA.
plot_correlation(na.omit(data))

# Bar chart of Survived counts, drawn as one panel per value of the column
# named by `facet_var` (a single column name given as a string).
survival_bars <- function(df, facet_var) {
  ggplot(df, aes(x = Survived, color = Survived, fill = Survived)) +
    geom_bar() +
    facet_wrap(reformulate(facet_var))
}

# Survival counts by passenger class.
survival_bars(data, "Pclass")

# Fill any missing Fare with the mean of the observed fares.
data$Fare <- replace(data$Fare, is.na(data$Fare), mean(data$Fare, na.rm = TRUE))

# Survival counts by fare band: cut() splits the Fare range into three
# equal-width intervals labelled low / medium / high.
data %>%
  mutate(Fare = cut(Fare, breaks = 3, labels = c("low", "medium", "high"))) %>%
  survival_bars("Fare")

# Survival counts by sex and by port of embarkation.
survival_bars(data, "Sex")
survival_bars(data, "Embarked")
# Build the modelling data set.
#
# Identifier-like columns are removed. Ticket is dropped together with
# them: it has roughly one distinct value per passenger, so as a factor it
# hands the logistic regression almost one dummy variable per row. That
# makes the fit rank-deficient and lets the model memorise the training
# rows instead of learning anything that transfers to unseen passengers.
#
# Survived becomes a factor because the classification model needs a
# categorical outcome.
#
# NOTE(review): Embarked is reduced to a "known"/"unknown" flag here, which
# discards the port itself; confirm this is intended rather than keeping
# the port as a predictor.
data1 <- train %>%
  select(-c(PassengerId, SibSp, Parch, Cabin, Name, Ticket)) %>%
  mutate(
    Survived = as.factor(Survived),
    Embarked = if_else(is.na(Embarked), "unknown", "known")
  )
library(tidymodels)
# Fix the RNG seed so the random 75/25 split, and therefore every metric
# derived from it, is the same on each run.
set.seed(123)
data_split <- initial_split(data1, prop = 0.75)
data_train <- training(data_split)
data_test <- testing(data_split)

# Recipe without preprocessing steps: Survived is the outcome and every
# other column of the training data is a predictor.
recipe <- data_train %>%
  recipe(Survived ~ .)

# Logistic regression fitted through the "glm" engine.
model <- logistic_reg() %>%
  set_engine("glm") %>%
  set_mode("classification")

# Bundle recipe and model, then fit on the training portion only.
workflow_log <- workflow() %>%
  add_recipe(recipe) %>%
  add_model(model)
fit <- fit(workflow_log, data = data_train)

# Score the held-out rows. Predicting on data_train (the rows the model was
# fitted to) only measures how well the model reproduces data it has
# already seen and overstates its real performance.
# NOTE: the metric tables printed further down were produced by the earlier
# training-set evaluation and must be regenerated by re-running the report.
predict_class <- predict(fit, new_data = data_test, type = "class")
predict_prob <- predict(fit, new_data = data_test, type = "prob")
original_value <- data_test %>% select(Survived)

# Truth, predicted class and class probabilities side by side.
new_assess_data <- bind_cols(original_value, predict_class, predict_prob)

# Confusion-matrix based metrics (accuracy, sensitivity, specificity, ...).
conf_mat(new_assess_data, truth = Survived, estimate = .pred_class) %>%
  summary()
## # A tibble: 13 × 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 accuracy binary 0.985
## 2 kap binary 0.968
## 3 sens binary 0.990
## 4 spec binary 0.976
## 5 ppv binary 0.985
## 6 npv binary 0.984
## 7 mcc binary 0.968
## 8 j_index binary 0.967
## 9 bal_accuracy binary 0.983
## 10 detection_prevalence binary 0.620
## 11 precision binary 0.985
## 12 recall binary 0.990
## 13 f_meas binary 0.988
# ROC curve built from the predicted probability of class "0".
new_assess_data %>%
  roc_curve(truth = Survived, .pred_0) %>%
  autoplot()

# Area under that ROC curve.
new_assess_data %>%
  roc_auc(truth = Survived, .pred_0)
## # A tibble: 1 × 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 roc_auc binary 0.983