# Assignment_3_TelcoChurn

# Show the version string of the running R session.
R.version.string

## [1] "R version 4.3.1 (2023-06-16 ucrt)"

#install.packages('plyr', repos = "http://cran.us.r-project.org")
#install.packages("tidyverse");
#install.packages("lmtest");
#install.packages("dplyr");
#install.packages("ggplot2");
#install.packages("entropy");
#install.packages('RTransferEntropy');
#install.packages("multcomp");
#install.packages("party");
#install.packages("partykit");
#install.packages("rsample");
#install.packages("inspectdf");
#install.packages("MASS");
#install.packages("Lahman");
#install.packages("caTools");
#install.packages("rpart");
#install.packages("investr");
#install.packages("tree");
#install.packages("readxl");
#install.packages("devtools");
#install.packages("factoextra");
#install.packages("lime");
#install.packages("shinythemes")
#install.packages("AER");
#install.packages("flexdashboard");
#install.packages("rpart.plot");
#install.packages("fancyRpartPlot");
#install.packages("rattle");
#install.packages("caret");
#install.packages("glm2");
#install.packages("xgboost");
#install.packages("gmodels");
#install.packages("eeptools");
#install.packages("tinytex");

library(DataExplorer);
library(entropy);
library(readxl);
library(dplyr);

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(gridExtra);

## 
## Attaching package: 'gridExtra'

## The following object is masked from 'package:dplyr':
## 
##     combine

library(tibble);
library(ggplot2);
library(tidyverse);

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats   1.0.0     ✔ readr     2.1.4
## ✔ lubridate 1.9.2     ✔ stringr   1.5.0
## ✔ purrr     1.0.1     ✔ tidyr     1.3.0

## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ gridExtra::combine() masks dplyr::combine()
## ✖ dplyr::filter()      masks stats::filter()
## ✖ dplyr::lag()         masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(RTransferEntropy);
library(e1071);
library(caret);

## Loading required package: lattice
## 
## Attaching package: 'caret'
## 
## The following object is masked from 'package:purrr':
## 
##     lift

library(ROCR);
library(multcomp);

## Loading required package: mvtnorm
## Loading required package: survival
## 
## Attaching package: 'survival'
## 
## The following object is masked from 'package:caret':
## 
##     cluster
## 
## Loading required package: TH.data
## Loading required package: MASS
## 
## Attaching package: 'MASS'
## 
## The following object is masked from 'package:dplyr':
## 
##     select
## 
## 
## Attaching package: 'TH.data'
## 
## The following object is masked from 'package:MASS':
## 
##     geyser

library(partykit);

## Loading required package: grid
## Loading required package: libcoin

library(rsample);

## 
## Attaching package: 'rsample'
## 
## The following object is masked from 'package:e1071':
## 
##     permutations

library(randomForest);

## randomForest 4.7-1.1
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## 
## The following object is masked from 'package:ggplot2':
## 
##     margin
## 
## The following object is masked from 'package:gridExtra':
## 
##     combine
## 
## The following object is masked from 'package:dplyr':
## 
##     combine

library(inspectdf);
library(MASS);
library(Lahman);
library(caTools);
library(rpart);
library(investr);

## 
## Attaching package: 'investr'
## 
## The following object is masked from 'package:survival':
## 
##     bladder

library(purrr);
library(forcats);
library(tree);
library(devtools);

## Loading required package: usethis

library(factoextra);

## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa

library(lime);

## 
## Attaching package: 'lime'
## 
## The following object is masked from 'package:dplyr':
## 
##     explain

library(shiny);
library(shinythemes);
library(AER);

## Loading required package: car
## Loading required package: carData
## 
## Attaching package: 'carData'
## 
## The following object is masked from 'package:Lahman':
## 
##     Salaries
## 
## 
## Attaching package: 'car'
## 
## The following object is masked from 'package:purrr':
## 
##     some
## 
## The following object is masked from 'package:dplyr':
## 
##     recode
## 
## Loading required package: lmtest
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## 
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
## 
## Loading required package: sandwich

library(flexdashboard);
library(rpart.plot);
#library(fancyRpartPlot);
library(rattle);

## Loading required package: bitops
## Rattle: A free graphical interface for data science with R.
## Version 5.5.1 Copyright (c) 2006-2021 Togaware Pty Ltd.
## Type 'rattle()' to shake, rattle, and roll your data.
## 
## Attaching package: 'rattle'
## 
## The following object is masked from 'package:randomForest':
## 
##     importance

library(knitr);
library(glm2);

## 
## Attaching package: 'glm2'
## 
## The following object is masked from 'package:MASS':
## 
##     crabs
## 
## The following object is masked from 'package:survival':
## 
##     heart

library(xgboost);

## 
## Attaching package: 'xgboost'
## 
## The following object is masked from 'package:rattle':
## 
##     xgboost
## 
## The following object is masked from 'package:dplyr':
## 
##     slice

library(gmodels);
library(eeptools);

# Show the working directory; the relative CSV path below is resolved against it.
print(getwd())

## [1] "C:/Users/gaura/OneDrive/Documents/IIT Kanpur/Assignments"

# Read the Telco churn CSV; text columns are converted to factors.
mydata <- read.csv(
  file = "WA_Fn-UseC_-Telco-Customer-Churn.csv",
  stringsAsFactors = TRUE
)
# Drop the 'spec' attribute (intended to silence a warning about column
# specifications); this is a no-op when the attribute is not present.
attr(mydata, "spec") <- NULL

# Parsed with column specification:
# (the cols() call below only builds and prints a specification; its result is
# neither assigned nor passed to the read.csv() call above)
cols(
  .default = col_character(),
  SeniorCitizen = col_double(),
  tenure = col_double(),
  MonthlyCharges = col_double(),
  TotalCharges = col_double()
)

## cols(
##   .default = col_character(),
##   SeniorCitizen = col_double(),
##   tenure = col_double(),
##   MonthlyCharges = col_double(),
##   TotalCharges = col_double()
## )

  # Compact per-column summary (type and first values) of the loaded data.
  utils::str(object = mydata)

## 'data.frame':    7043 obs. of  21 variables:
##  $ customerID      : Factor w/ 7043 levels "0002-ORFBO","0003-MKNFE",..: 5376 3963 2565 5536 6512 6552 1003 4771 5605 4535 ...
##  $ gender          : Factor w/ 2 levels "Female","Male": 1 2 2 2 1 1 2 1 1 2 ...
##  $ SeniorCitizen   : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Partner         : Factor w/ 2 levels "No","Yes": 2 1 1 1 1 1 1 1 2 1 ...
##  $ Dependents      : Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 2 1 1 2 ...
##  $ tenure          : int  1 34 2 45 2 8 22 10 28 62 ...
##  $ PhoneService    : Factor w/ 2 levels "No","Yes": 1 2 2 1 2 2 2 1 2 2 ...
##  $ MultipleLines   : Factor w/ 3 levels "No","No phone service",..: 2 1 1 2 1 3 3 2 3 1 ...
##  $ InternetService : Factor w/ 3 levels "DSL","Fiber optic",..: 1 1 1 1 2 2 2 1 2 1 ...
##  $ OnlineSecurity  : Factor w/ 3 levels "No","No internet service",..: 1 3 3 3 1 1 1 3 1 3 ...
##  $ OnlineBackup    : Factor w/ 3 levels "No","No internet service",..: 3 1 3 1 1 1 3 1 1 3 ...
##  $ DeviceProtection: Factor w/ 3 levels "No","No internet service",..: 1 3 1 3 1 3 1 1 3 1 ...
##  $ TechSupport     : Factor w/ 3 levels "No","No internet service",..: 1 1 1 3 1 1 1 1 3 1 ...
##  $ StreamingTV     : Factor w/ 3 levels "No","No internet service",..: 1 1 1 1 1 3 3 1 3 1 ...
##  $ StreamingMovies : Factor w/ 3 levels "No","No internet service",..: 1 1 1 1 1 3 1 1 3 1 ...
##  $ Contract        : Factor w/ 3 levels "Month-to-month",..: 1 2 1 2 1 1 1 1 1 2 ...
##  $ PaperlessBilling: Factor w/ 2 levels "No","Yes": 2 1 2 1 2 2 2 1 2 1 ...
##  $ PaymentMethod   : Factor w/ 4 levels "Bank transfer (automatic)",..: 3 4 4 1 3 3 2 4 3 1 ...
##  $ MonthlyCharges  : num  29.9 57 53.9 42.3 70.7 ...
##  $ TotalCharges    : num  29.9 1889.5 108.2 1840.8 151.7 ...
##  $ Churn           : Factor w/ 2 levels "No","Yes": 1 1 2 1 2 2 1 1 2 1 ...

  # Take an overview of some basic aspects of the data.
  DataExplorer::plot_intro(data = mydata)

# Most of the data is discrete. There are not many missing values, but they
# still have to be dealt with (by removing or imputing them), and no column is
# missing completely.

# Distribution of the target variable

# Comparing Churn against "Yes" and casting to integer gives a 0/1 indicator.
head(as.integer(mydata$Churn == "Yes"))

## [1] 0 0 1 0 1 1

hist(
  as.integer(mydata$Churn == "Yes"),
  col = "darkblue",
  main = "distributuin of the target varible"
)

# The target is imbalanced: far more values are 0 than 1, so it is worth
# checking later whether the model ends up biased toward 0.

# Fraction of "No" and "Yes" in the data.
table(mydata$Churn) / nrow(mydata)

## 
##        No       Yes 
## 0.7346301 0.2653699

# customerID: number of distinct IDs
dplyr::n_distinct(mydata$customerID)

## [1] 7043

# Factor variables
options(repr.plot.width = 15, repr.plot.height = 5)
ggplot(data = mydata, mapping = aes(y = Churn, fill = Churn)) +
  geom_bar() +
  facet_wrap(vars(gender)) +
  labs(title = "For each gender what is the distribution of zeros and ones") +
  theme(plot.title = element_text(hjust = 0.5))

# Neither gender stands out as being associated with churn: the churned /
# not-churned counts look alike for both genders.

# Senior Citizen
# Row-wise proportions: share of Churn No/Yes within each SeniorCitizen level.
prop.table(table(mydata$SeniorCitizen, mydata$Churn), margin = 1)

##    
##            No       Yes
##   0 0.7639383 0.2360617
##   1 0.5831874 0.4168126

options(repr.plot.width = 15, repr.plot.height = 5)

# SeniorCitizen is read as an integer 0/1 column, so mapping it to `fill`
# directly produces a continuous colour gradient with a colour-bar legend.
# Converting it to a factor gives two discrete colours and legend keys;
# labs(fill = ...) keeps the legend title readable.
ggplot(mydata, aes(y = Churn, fill = factor(SeniorCitizen))) +
  geom_bar() +
  facet_wrap(. ~ SeniorCitizen) +
  labs(fill = "SeniorCitizen") +
  ggtitle("what is the distribution of Churn for SeniorCitizen and non SeniorCitizen") +
  theme(plot.title = element_text(hjust = 0.5))

# The share of churned customers among senior citizens (about 42%) is much
# larger than among non-senior customers (about 24%), so being a senior citizen
# is clearly associated with churning.

# Partner
# Share of customers with and without a partner.
table(mydata$Partner) / nrow(mydata)

## 
##        No       Yes 
## 0.5169672 0.4830328

options(repr.plot.width = 15, repr.plot.height = 5)
# Churn counts, one panel per Partner level.
ggplot(data = mydata, mapping = aes(y = Churn, fill = Partner)) +
  geom_bar() +
  facet_wrap(vars(Partner)) +
  labs(title = "what is the distribution of Churn for people with Partner and vice versa") +
  theme(plot.title = element_text(hjust = 0.5))

print("So half the people have Partner and they have less probability of Churning than people who don't have 
as we can see the fraction of churned in Partner group is less than no Partner group")

## [1] "So half the people have Partner and they have less probability of Churning than people who don't have \nas we can see the fraction of churned in Partner group is less than no Partner group"

# Dependents
# Row-wise proportions: share of Churn No/Yes within each Dependents level.
prop.table(table(mydata$Dependents, mydata$Churn), margin = 1)

##      
##              No       Yes
##   No  0.6872086 0.3127914
##   Yes 0.8454976 0.1545024

options(repr.plot.width = 15, repr.plot.height = 5)
# Churn counts, one panel per Dependents level.
# The title previously said "Partner" (copied from the Partner plot) although
# this plot is filled and faceted by Dependents.
ggplot(mydata, aes(y = Churn, fill = Dependents)) +
  geom_bar() +
  facet_wrap(. ~ Dependents) +
  ggtitle("what is the distribution of Churn for people with Dependents and vice versa") +
  theme(plot.title = element_text(hjust = 0.5))

print("So the ratio of churned people who don't have Dependents is 31% wehre in the people who have dependent is 15% only which is half the ratio which means that not having dependent increse the probability of churn

")

## [1] "So the ratio of churned people who don't have Dependents is 31% wehre in the people who have dependent is 15% only which is half the ratio which means that not having dependent increse the probability of churn\n\n"

options(repr.plot.width = 20, repr.plot.height = 8)

# Churn share (bars stacked to 100%) per level of PhoneService, MultipleLines,
# InternetService and OnlineSecurity, arranged in two rows.
# The first and third plots hide the legend; the second and fourth drop the
# y-axis title instead.
grid.arrange(
  grobs = list(
    ggplot(data = mydata, mapping = aes(x = PhoneService, fill = Churn)) +
      geom_bar(position = "fill", show.legend = FALSE) +
      scale_fill_ordinal() +
      theme_minimal(),
    ggplot(data = mydata, mapping = aes(x = MultipleLines, fill = Churn)) +
      geom_bar(position = "fill") +
      labs(y = NULL) +
      scale_fill_ordinal() +
      theme_minimal(),
    ggplot(data = mydata, mapping = aes(x = InternetService, fill = Churn)) +
      geom_bar(position = "fill", show.legend = FALSE) +
      scale_fill_ordinal() +
      theme_minimal(),
    ggplot(data = mydata, mapping = aes(x = OnlineSecurity, fill = Churn)) +
      geom_bar(position = "fill") +
      labs(y = NULL) +
      scale_fill_ordinal() +
      theme_minimal()
  ),
  nrow = 2
)

print("customers who have Fiber optic in InternestService and customers who have no internet service has high 
      probability to churn compared with the others in the grid plot")

## [1] "customers who have Fiber optic in InternestService and customers who have no internet service has high \n      probability to churn compared with the others in the grid plot"

# OnlineBackup, DeviceProtection, TechSupport, StreamingTV
options(repr.plot.width = 20, repr.plot.height = 8)

# Same layout as the previous grid: churn share per level of each column, with
# the legend hidden on the first and third plots and the y-axis title dropped
# on the second and fourth.
grid.arrange(
  grobs = list(
    ggplot(data = mydata, mapping = aes(x = OnlineBackup, fill = Churn)) +
      geom_bar(position = "fill", show.legend = FALSE) +
      scale_fill_ordinal() +
      theme_minimal(),
    ggplot(data = mydata, mapping = aes(x = DeviceProtection, fill = Churn)) +
      geom_bar(position = "fill") +
      labs(y = NULL) +
      scale_fill_ordinal() +
      theme_minimal(),
    ggplot(data = mydata, mapping = aes(x = TechSupport, fill = Churn)) +
      geom_bar(position = "fill", show.legend = FALSE) +
      scale_fill_ordinal() +
      theme_minimal(),
    ggplot(data = mydata, mapping = aes(x = StreamingTV, fill = Churn)) +
      geom_bar(position = "fill") +
      labs(y = NULL) +
      scale_fill_ordinal() +
      theme_minimal()
  ),
  nrow = 2
)

print("StreamingMovies
Contract The contract term of the customer (Month-to-month, One year, Two year)
PaperlessBilling
PaymentMethod The customer’s payment method (Electronic check, Mailed check, Bank transfer (automatic), Credit card (automatic))")

## [1] "StreamingMovies\nContract The contract term of the customer (Month-to-month, One year, Two year)\nPaperlessBilling\nPaymentMethod The customer’s payment method (Electronic check, Mailed check, Bank transfer (automatic), Credit card (automatic))"

options(repr.plot.width = 15, repr.plot.height = 12)

# Churn share per level of StreamingMovies, Contract, PaperlessBilling and
# PaymentMethod. The PaymentMethod axis labels are wrapped at width 10 so the
# long level names do not overlap. nrow = 3 is kept as in the original call.
grid.arrange(
  grobs = list(
    ggplot(data = mydata, mapping = aes(x = StreamingMovies, fill = Churn)) +
      geom_bar(position = "fill", show.legend = FALSE) +
      scale_fill_ordinal() +
      theme_minimal(),
    ggplot(data = mydata, mapping = aes(x = Contract, fill = Churn)) +
      geom_bar(position = "fill") +
      labs(y = NULL) +
      scale_fill_ordinal() +
      theme_minimal(),
    ggplot(data = mydata, mapping = aes(x = PaperlessBilling, fill = Churn)) +
      geom_bar(position = "fill", show.legend = FALSE) +
      scale_fill_ordinal() +
      theme_minimal(),
    ggplot(data = mydata, mapping = aes(x = PaymentMethod, fill = Churn)) +
      geom_bar(position = "fill") +
      labs(y = NULL) +
      scale_fill_ordinal() +
      theme_minimal() +
      scale_x_discrete(labels = function(lbl) stringr::str_wrap(lbl, width = 10))
  ),
  nrow = 3
)

print("It's predictible that people with month to month contract are more likely to churn so we have to give them more care and offers")

## [1] "It's predictible that people with month to month contract are more likely to churn so we have to give them more care and offers"

# Continuous variables
# The missing values have to be handled first.
names(mydata)[colSums(is.na(mydata)) > 0] # columns that contain missing values

## [1] "TotalCharges"

# Compare TotalCharges with tenure * MonthlyCharges to see how closely the
# product approximates the recorded total.
mydata2 <- mydata %>%
  dplyr::select(tenure, MonthlyCharges, TotalCharges) %>%
  mutate(
    tenure_TotalCharges = tenure * MonthlyCharges,
    difference = tenure_TotalCharges - TotalCharges
  )
head(mydata2)

##   tenure MonthlyCharges TotalCharges tenure_TotalCharges difference
## 1      1          29.85        29.85               29.85       0.00
## 2     34          56.95      1889.50             1936.30      46.80
## 3      2          53.85       108.15              107.70      -0.45
## 4     45          42.30      1840.75             1903.50      62.75
## 5      2          70.70       151.65              141.40     -10.25
## 6      8          99.65       820.50              797.20     -23.30

# Five-number summary (plus mean and NA count) of the approximation error.
summary(mydata2[["difference"]])

##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max.      NA's 
## -373.2500  -28.7000    0.0000   -0.1532   28.6500  370.8500        11

# Standard deviation of the approximation error, ignoring missing values.
sd(mydata2$difference, na.rm = TRUE)

## [1] 67.25533

print("we can see that if we multipli the tenure in the monthly charge we will get a column that is very neer to the totalCharge with mean difference than TotalCharge = -0.1532, median = 0 and std of 67 so we can replace the na with the values from this calculated column")

## [1] "we can see that if we multipli the tenure in the monthly charge we will get a column that is very neer to the totalCharge with mean difference than TotalCharge = -0.1532, median = 0 and std of 67 so we can replace the na with the values from this calculated column"

# Show the rows that contain at least one missing value.
mydata2[!complete.cases(mydata2), ]

##      tenure MonthlyCharges TotalCharges tenure_TotalCharges difference
## 489       0          52.55           NA                   0         NA
## 754       0          20.25           NA                   0         NA
## 937       0          80.85           NA                   0         NA
## 1083      0          25.75           NA                   0         NA
## 1341      0          56.05           NA                   0         NA
## 3332      0          19.85           NA                   0         NA
## 3827      0          25.35           NA                   0         NA
## 4381      0          20.00           NA                   0         NA
## 5219      0          19.70           NA                   0         NA
## 6671      0          73.35           NA                   0         NA
## 6755      0          61.90           NA                   0         NA

print("It's a little bit wired to have all the tenure column = 0 for these rows if we were in real life i would ask the data provider what that means to have tenure = 0 but still they have MonthlyCharges.
any ways i will replace the nans with the MonthlyCharges as my calculated column equal to zero")

## [1] "It's a little bit wired to have all the tenure column = 0 for these rows if we were in real life i would ask the data provider what that means to have tenure = 0 but still they have MonthlyCharges.\nany ways i will replace the nans with the MonthlyCharges as my calculated column equal to zero"

# For every row that is incomplete in the derived frame, fill TotalCharges in
# the main data with that row's MonthlyCharges.
mydata[!complete.cases(mydata2), "TotalCharges"] <-
  mydata2[!complete.cases(mydata2), "MonthlyCharges"]
anyNA(mydata) # is any missing value left in the data?

## [1] FALSE

options(repr.plot.width = 15, repr.plot.height = 8)

# Tenure histogram (30 bins), one panel per Churn level.
ggplot(data = mydata, mapping = aes(x = tenure, fill = Churn)) +
  geom_histogram(bins = 30) +
  theme(plot.title = element_text(hjust = 0.5)) +
  facet_wrap(vars(Churn))

print("The distibution of the churned customer is clearly positively skewed which means that more customers has low tenure period and vice versa on not churned customer although there is a spike at low tenure period also which indicat another factors inetacting with the period a cutomer stays which makes sense")

## [1] "The distibution of the churned customer is clearly positively skewed which means that more customers has low tenure period and vice versa on not churned customer although there is a spike at low tenure period also which indicat another factors inetacting with the period a cutomer stays which makes sense"

# TotalCharges histogram (30 bins), one panel per Churn level.
ggplot(data = mydata, mapping = aes(x = TotalCharges, fill = Churn)) +
  geom_histogram(bins = 30) +
  theme(plot.title = element_text(hjust = 0.5)) +
  facet_wrap(vars(Churn))

print("Same distribution which means this column may not be very important for the model")

## [1] "Same distribution which means this column may not be very important for the model"

# MonthlyCharges histogram (30 bins), one panel per Churn level.
ggplot(data = mydata, mapping = aes(x = MonthlyCharges, fill = Churn)) +
  geom_histogram(bins = 30) +
  theme(plot.title = element_text(hjust = 0.5)) +
  facet_wrap(vars(Churn))

print("Base Model and Model Selection")

## [1] "Base Model and Model Selection"

print("I will remove the customerID and convert the columns to be numerical, i will start with a simple label encoding to get a quick base model")

## [1] "I will remove the customerID and convert the columns to be numerical, i will start with a simple label encoding to get a quick base model"

# Work on a copy without the identifier column so `mydata` keeps the original
# values.
mydata2 <- subset(mydata, select = -customerID)
binary_cols <- c("Partner", "Dependents", "PhoneService", "PaperlessBilling", "Churn")
# Yes/No columns become integer 1 (Yes) / 0 (No).
for (col in binary_cols) {
  mydata2[[col]] <- as.integer(mydata2[[col]] == "Yes")
}
# Label-encode what is left: character columns are turned into factors, and
# every factor is replaced by its integer level code (stored as numeric).
mydata2 <- mydata2 %>%
  mutate(across(where(is.character), as.factor)) %>%
  mutate(across(where(is.factor), as.numeric))
str(mydata2) # check that every column is numeric now

## 'data.frame':    7043 obs. of  20 variables:
##  $ gender          : num  1 2 2 2 1 1 2 1 1 2 ...
##  $ SeniorCitizen   : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Partner         : int  1 0 0 0 0 0 0 0 1 0 ...
##  $ Dependents      : int  0 0 0 0 0 0 1 0 0 1 ...
##  $ tenure          : int  1 34 2 45 2 8 22 10 28 62 ...
##  $ PhoneService    : int  0 1 1 0 1 1 1 0 1 1 ...
##  $ MultipleLines   : num  2 1 1 2 1 3 3 2 3 1 ...
##  $ InternetService : num  1 1 1 1 2 2 2 1 2 1 ...
##  $ OnlineSecurity  : num  1 3 3 3 1 1 1 3 1 3 ...
##  $ OnlineBackup    : num  3 1 3 1 1 1 3 1 1 3 ...
##  $ DeviceProtection: num  1 3 1 3 1 3 1 1 3 1 ...
##  $ TechSupport     : num  1 1 1 3 1 1 1 1 3 1 ...
##  $ StreamingTV     : num  1 1 1 1 1 3 3 1 3 1 ...
##  $ StreamingMovies : num  1 1 1 1 1 3 1 1 3 1 ...
##  $ Contract        : num  1 2 1 2 1 1 1 1 1 2 ...
##  $ PaperlessBilling: int  1 0 1 0 1 1 1 0 1 0 ...
##  $ PaymentMethod   : num  3 4 4 1 3 3 2 4 3 1 ...
##  $ MonthlyCharges  : num  29.9 57 53.9 42.3 70.7 ...
##  $ TotalCharges    : num  29.9 1889.5 108.2 1840.8 151.7 ...
##  $ Churn           : int  0 0 1 0 1 1 0 0 1 0 ...

# Split the data into train (70%), validation (15%) and test (15%) sets.
mydata2 <- as.data.frame(mydata2)

# Churn is an integer 0/1 column at this point. createDataPartition() treats a
# numeric outcome as continuous and bins it by percentiles instead of sampling
# within each class, so the outcome is passed as a factor to get a split that
# is stratified on Churn.
# NOTE(review): the model outputs recorded further down were produced with the
# earlier split; re-run the script to refresh them.
set.seed(2)
train_index <- createDataPartition(factor(mydata2$Churn), p = 0.7, list = FALSE)
train <- mydata2[train_index, ]
val_test <- mydata2[-train_index, ] # the remaining 30%

# Halve the hold-out part into the validation and test sets (15% each).
set.seed(3)
val_index <- createDataPartition(factor(val_test$Churn), p = 0.5, list = FALSE)
val <- val_test[val_index, ]
test <- val_test[-val_index, ]

nrow(train)

## [1] 4931

nrow(val)

## [1] 1056

nrow(test)

## [1] 1056

print("Chossing the accuracy :
In our problem the False negative will very very costly as it means my model predicts this customer will not churn but actually he/she will churn which means loosing a customer and also the False positive means the model predicts the cutomer will churn hence efforts and money will be exerted to keep him while he/she already will saty so, i want the minimum false negative and the the minimum fals negative")

## [1] "Chossing the accuracy :\nIn our problem the False negative will very very costly as it means my model predicts this customer will not churn but actually he/she will churn which means loosing a customer and also the False positive means the model predicts the cutomer will churn hence efforts and money will be exerted to keep him while he/she already will saty so, i want the minimum false negative and the the minimum fals negative"

# Decision tree
# Fit a classification tree on the training set and predict the class of every
# validation row.
set.seed(123)
Dtree <- rpart(
  formula = Churn ~ .,
  data = train,
  method = "class" # classification rather than regression
)
preds <- predict(Dtree, newdata = val, type = "class")

# Evaluate the model on the validation set.
confusion_table <- table(preds, val$Churn)
# NOTE(review): the result is stored under the same name as the function that
# creates it; later references to `confusionMatrix` print this object.
confusionMatrix <- confusionMatrix(
  data = as.factor(preds),
  reference = as.factor(val$Churn),
  positive = "1"
)

confusion_table

##      
## preds   0   1
##     0 713 158
##     1  70 115

confusionMatrix

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 713 158
##          1  70 115
##                                          
##                Accuracy : 0.7841         
##                  95% CI : (0.758, 0.8086)
##     No Information Rate : 0.7415         
##     P-Value [Acc > NIR] : 0.0007304      
##                                          
##                   Kappa : 0.3708         
##                                          
##  Mcnemar's Test P-Value : 8.326e-09      
##                                          
##             Sensitivity : 0.4212         
##             Specificity : 0.9106         
##          Pos Pred Value : 0.6216         
##          Neg Pred Value : 0.8186         
##              Prevalence : 0.2585         
##          Detection Rate : 0.1089         
##    Detection Prevalence : 0.1752         
##       Balanced Accuracy : 0.6659         
##                                          
##        'Positive' Class : 1              
##

options(repr.plot.width = 15, repr.plot.height = 8)
# Draw the fitted decision tree.
rpart.plot::rpart.plot(x = Dtree)

# Assignment_3_TelcoChurn_DecisionTree.R

# gaura

# 2023-08-21