knitr::opts_chunk$set(warning=FALSE)
knitr::opts_chunk$set(message =FALSE)

reading csv file

library(tidyverse)
library(skimr)
test=read.csv("D:\\wallpapers and photos\\test.csv")
train=read.csv("D:\\wallpapers and photos\\train.csv")
names(train)
##  [1] "PassengerId" "Survived"    "Pclass"      "Name"        "Sex"        
##  [6] "Age"         "SibSp"       "Parch"       "Ticket"      "Fare"       
## [11] "Cabin"       "Embarked"

QUICK EDA ,having a bird eye view over the data ,filling the missing values

[mean can be affected by outlier so eda is important before imputing the data]

library(naniar)

vis_miss(train)

train$Age[is.na(train$Age)]=mean(train$Age,na.rm = TRUE)
skim(train)
Data summary
Name train
Number of rows 891
Number of columns 12
_______________________
Column type frequency:
character 5
numeric 7
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
Name 0 1 12 82 0 891 0
Sex 0 1 4 6 0 2 0
Ticket 0 1 3 18 0 681 0
Cabin 0 1 0 15 687 148 0
Embarked 0 1 0 1 2 4 0

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
PassengerId 0 1 446.00 257.35 1.00 223.50 446.00 668.5 891.00 ▇▇▇▇▇
Survived 0 1 0.38 0.49 0.00 0.00 0.00 1.0 1.00 ▇▁▁▁▅
Pclass 0 1 2.31 0.84 1.00 2.00 3.00 3.0 3.00 ▃▁▃▁▇
Age 0 1 29.70 13.00 0.42 22.00 29.70 35.0 80.00 ▂▇▃▁▁
SibSp 0 1 0.52 1.10 0.00 0.00 0.00 1.0 8.00 ▇▁▁▁▁
Parch 0 1 0.38 0.81 0.00 0.00 0.00 0.0 6.00 ▇▁▁▁▁
Fare 0 1 32.20 49.69 0.00 7.91 14.45 31.0 512.33 ▇▁▁▁▁
vis_miss(train)

vissulising the outlier

train %>% 
  select(Fare) %>% 
  ggplot(aes(x=Fare))+
  geom_histogram()

train=train %>% 
  filter(Fare<500)
skim(train)
Data summary
Name train
Number of rows 888
Number of columns 12
_______________________
Column type frequency:
character 5
numeric 7
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
Name 0 1 12 82 0 888 0
Sex 0 1 4 6 0 2 0
Ticket 0 1 3 18 0 680 0
Cabin 0 1 0 15 686 147 0
Embarked 0 1 0 1 2 4 0

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
PassengerId 0 1 445.62 257.41 1.00 222.75 445.50 667.25 891 ▇▇▇▇▇
Survived 0 1 0.38 0.49 0.00 0.00 0.00 1.00 1 ▇▁▁▁▅
Pclass 0 1 2.31 0.83 1.00 2.00 3.00 3.00 3 ▃▁▃▁▇
Age 0 1 29.68 13.02 0.42 22.00 29.70 35.00 80 ▂▇▃▁▁
SibSp 0 1 0.52 1.10 0.00 0.00 0.00 1.00 8 ▇▁▁▁▁
Parch 0 1 0.38 0.81 0.00 0.00 0.00 0.00 6 ▇▁▁▁▁
Fare 0 1 30.58 41.18 0.00 7.90 14.45 30.77 263 ▇▁▁▁▁
unique(train$Embarked)
## [1] "S" "C" "Q" ""

changing missing values to NA

train=train %>% 
  mutate_all(na_if,"")

see……..:)

unique(train$Embarked)
## [1] "S" "C" "Q" NA

correlation plots can be very helpful to choose which variable to choose before modeling

data=train %>% 
  mutate(Embarked=if_else(is.na(Embarked),"unknown","Embarked")) %>% 
  mutate(Cabin=if_else(is.na(Cabin),"null","known")) %>% 
  mutate(Survived=as.factor(Survived)) 
library(DataExplorer)
plot_correlation(na.omit(data))

ggplot(data=data,aes(x=Survived,color=Survived,fill=Survived))+
  geom_bar()+facet_wrap(~Pclass)

data$Fare[is.na(data$Fare)]=mean(data$Fare,na.rm=TRUE)
data %>% 
  mutate(Fare=cut(Fare,breaks=3,labels=c("low","medium","high"))) %>% 
  ggplot(aes(x=Survived,color=Survived,fill=Survived))+
  geom_bar()+facet_wrap(~Fare)

ggplot(data=data,aes(x=Survived,color=Survived,fill=Survived))+
  geom_bar()+facet_wrap(~Sex)

ggplot(data=data,aes(x=Survived,color=Survived,fill=Survived))+
  geom_bar()+facet_wrap(~Embarked)

data1=train %>% 
  select(-c(PassengerId,SibSp,Parch,Cabin,Name)) %>% 
  mutate(Survived=as.factor(Survived)) %>% 
  mutate(Ticket=as.factor(Ticket)) %>% 
  mutate(Embarked=if_else(is.na(Embarked),"unknown","known"))
library(tidymodels)
data_split=initial_split(data1,prop = 0.75)
data_train=training(data_split)
data_test=testing(data_split)
recipe=data_train %>% 
  recipe(Survived~.)
model=logistic_reg() %>% 
  set_engine("glm") %>% 
  set_mode("classification")
workflow_log=workflow() %>% 
  add_recipe(recipe) %>% 
  add_model(model)
fit=fit(workflow_log,data=data_train)
predict_class=predict(fit,new_data = data_train,type = "class")
predict_prob=predict(fit,new_data = data_train,type = "prob")
original_value=data_train %>% select(Survived)
new_assess_data=bind_cols(original_value,predict_class,predict_prob)
conf_mat(new_assess_data,truth = Survived,estimate = .pred_class) %>% 
summary()
## # A tibble: 13 × 3
##    .metric              .estimator .estimate
##    <chr>                <chr>          <dbl>
##  1 accuracy             binary         0.985
##  2 kap                  binary         0.968
##  3 sens                 binary         0.990
##  4 spec                 binary         0.976
##  5 ppv                  binary         0.985
##  6 npv                  binary         0.984
##  7 mcc                  binary         0.968
##  8 j_index              binary         0.967
##  9 bal_accuracy         binary         0.983
## 10 detection_prevalence binary         0.620
## 11 precision            binary         0.985
## 12 recall               binary         0.990
## 13 f_meas               binary         0.988
roc_curve(new_assess_data,truth = Survived, .pred_0) %>% 
  autoplot()

roc_auc(new_assess_data,truth = Survived, .pred_0)
## # A tibble: 1 × 3
##   .metric .estimator .estimate
##   <chr>   <chr>          <dbl>
## 1 roc_auc binary         0.983