library(tidyverse)
## -- Attaching packages ------------------------------------------------------------- tidyverse 1.2.1 --
## <U+221A> ggplot2 3.1.0 <U+221A> purrr 0.2.5
## <U+221A> tibble 2.1.3 <U+221A> dplyr 0.8.3
## <U+221A> tidyr 0.8.2 <U+221A> stringr 1.3.1
## <U+221A> readr 1.2.1 <U+221A> forcats 0.3.0
## -- Conflicts ---------------------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(readxl)
library(Amelia)
## Loading required package: Rcpp
## ##
## ## Amelia II: Multiple Imputation
## ## (Version 1.7.5, built: 2018-05-07)
## ## Copyright (C) 2005-2019 James Honaker, Gary King and Matthew Blackwell
## ## Refer to http://gking.harvard.edu/amelia/ for more information
## ##
library(data.table)
##
## Attaching package: 'data.table'
## The following objects are masked from 'package:dplyr':
##
## between, first, last
## The following object is masked from 'package:purrr':
##
## transpose
library(psycho)
library(ggplot2)
library(MASS)
##
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
##
## select
library(DescTools)
##
## Attaching package: 'DescTools'
## The following object is masked from 'package:data.table':
##
## %like%
library(ggpubr)
## Loading required package: magrittr
##
## Attaching package: 'magrittr'
## The following object is masked from 'package:purrr':
##
## set_names
## The following object is masked from 'package:tidyr':
##
## extract
library(car)
## Loading required package: carData
##
## Attaching package: 'car'
## The following object is masked from 'package:DescTools':
##
## Recode
## The following object is masked from 'package:dplyr':
##
## recode
## The following object is masked from 'package:purrr':
##
## some
library(tibble)
library(nnet)
library(InformationValue)
library(DMwR)
## Loading required package: lattice
## Loading required package: grid
library(caTools)
library(lattice)
library(grid)
library(dplyr)
library(randomForest)
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:dplyr':
##
## combine
## The following object is masked from 'package:ggplot2':
##
## margin
library(ROCR)
## Loading required package: gplots
##
## Attaching package: 'gplots'
## The following object is masked from 'package:stats':
##
## lowess
library(dbplyr)
##
## Attaching package: 'dbplyr'
## The following objects are masked from 'package:dplyr':
##
## ident, sql
library(tidyselect)
all_churn_2017 <- read_excel("C:/Users/vjovanovic/Desktop/A1/all churn 2017.xlsx")
Blizoo_total_Base <- read_excel("C:/Users/vjovanovic/Desktop/A1/Blizoo_total_Base.xlsx")
CABLE_ALL_total_Base <- read_excel("C:/Users/vjovanovic/Desktop/A1/CABLE_ALL_total_Base.xlsx")
CABLE_AMDOCS <- read_excel("C:/Users/vjovanovic/Desktop/A1/CABLE_AMDOCS.xlsx")
trouble_tickets_Q1_2017 <- read_excel("C:/Users/vjovanovic/Desktop/A1/trouble_tickets_Q1_2017.xlsx")
#We will try to make an innerjoin of two data systems of Blizoo and Cable_all for checking if there is enough
#fully populated customers. This will give as a databse with higher number of variables (potential predictors).
dim(Blizoo_total_Base) #75855 customers
## [1] 75855 24
dim(CABLE_ALL_total_Base) #85777
## [1] 85777 32
inner_cable <- merge(CABLE_ALL_total_Base, Blizoo_total_Base, copy=FALSE, suffix = c(".x", ".y"), by.x="client_id", by.y="client_id", all.x=F, all.y=F, no.dups = TRUE)
dim(inner_cable) #75894
## [1] 75894 55
#We see that we have 40 more customers than in Blizzo, but we could have 75855 or less.
#But we got about 50 warnings and those are maybe duplicates, we suppose that ALL_Cable have all customers that are in Blizoo.
missmap(inner_cable,y.at=c(1),y.labels = c(''),col=c('yellow','black'))

#Now we are looking for an intersection of inner_cable and churns.
dim(all_churn_2017)
## [1] 7023 30
churns_inner <- merge(all_churn_2017, inner_cable, copy=FALSE, suffix = c(".x", ".y"), by.x="client_id", by.y="client_id", all.x=F, all.y=F, no.dups = TRUE)
dim(churns_inner) #2598
## [1] 2598 84
#We halved the number of churns, half of churns are no in inner_cable, we will try to find better intersection (inner_join).
#Exploring the intersection between churns and Blizoo
dim(all_churn_2017)
## [1] 7023 30
churns_blizoo <- merge(all_churn_2017, Blizoo_total_Base, copy=FALSE, suffix = c(".x", ".y"), by.x="client_id", by.y="client_id", all.x=F, all.y=F, no.dups = TRUE)
dim(churns_blizoo) #2558
## [1] 2558 53
#No duplicates.
#But still a lot of churns that are no in the Blizoo database and that createsa lot of missing values
#in churns extended with blizoo variables (left join, x=churns).
#Now we want to see how many churns are there who are in Blizoo. We are doing inner_join or natural join.
churns_intersect_blizoo <- merge(all_churn_2017, Blizoo_total_Base, copy=FALSE, suffix = c(".x", ".y"), by.x="client_id", by.y="client_id", all.x=F, all.y=F, no.dups = TRUE)
dim(churns_intersect_blizoo) #2558
## [1] 2558 53
#How many churns are in all cable?
churns_intersect_cableall <- merge(all_churn_2017, CABLE_ALL_total_Base, copy=FALSE, suffix = c(".x", ".y"), by.x="client_id", by.y="client_id", all.x=F, all.y=F, no.dups = TRUE)
dim(churns_intersect_cableall) #3225
## [1] 3255 61
missmap(churns_intersect_cableall)

###FOR THIS PURPOSE OF PRACTICE WE WILL DO ON BLIZOO BUT MODEL WILL BE BETTER WHEN DOING ON CABLE_ALL database becease of higher number of churns!
#Now we see a very few variables that have a lot of missing values, but we halved the number of churns.
#We already have a very few churns and model would have even the weaker predictive power with
#smaller number of churns. So we chose to have higher number of churns and fewer number of variables.
#But we can try both. We will clean the churns_intersect_blizoo
#Now we will delete columns (variables) with highest percentage of missings in order to preserve
#the highest possible number of customers
churns_intersect_blizoo$`start date of churn request` <- NULL
churns_intersect_blizoo$`end date of churn request` <- NULL
churns_intersect_blizoo$`Churn date` <- NULL
churns_intersect_blizoo$`Migration Y/N` <- NULL
churns_intersect_blizoo$Reason <- NULL
churns_intersect_blizoo$`Reason description` <- NULL
churns_intersect_blizoo$`System information` <- NULL
churns_intersect_blizoo$`Blizoo-Vip tariff` <- NULL
churns_intersect_blizoo$contract_end <- NULL
churns_intersect_blizoo$`CPE name` <- NULL
dim(churns_intersect_blizoo)
## [1] 2558 43
missmap(churns_intersect_blizoo,y.at=c(1),y.labels = c(''),col=c('yellow','black'))

#We aggregate the averages instead of missings to save as more churns as possible.
churns_intersect_blizoo[] <- lapply(churns_intersect_blizoo, function(x) ifelse(is.na(x), mean(x, na.rm = TRUE), x))
dim(churns_intersect_blizoo)
## [1] 2558 43
#We are recoding month to contract_end
churns_intersect_blizoo$MONTHS_TO_CONTRACT_END <- as.character(churns_intersect_blizoo$MONTHS_TO_CONTRACT_END)
table(churns_intersect_blizoo$MONTHS_TO_CONTRACT_END)
##
## 0 1 10 11 12 13
## 105 100 31 35 18 2
## 14 15 19 2 3 4
## 1 2 4 69 16 25
## 5 6 7 8 9 NO_CONTRACT
## 31 29 32 57 41 1686
#We have decided to recode this variable to be possible to treat it as numeric and to
#make imputations with averages.
churns_intersect_blizoo$MONTHS_TO_CONTRACT_END <- as.character(churns_intersect_blizoo$MONTHS_TO_CONTRACT_END)
churns_intersect_blizoo$MONTHS_TO_CONTRACT_END[churns_intersect_blizoo$MONTHS_TO_CONTRACT_END == "NO_CONTRACT"] <- "0"
churns_intersect_blizoo[] <- lapply(churns_intersect_blizoo, function(x) ifelse(is.na(x), mean(x, na.rm = TRUE), x))
churns_intersect_blizoo$MONTHS_TO_CONTRACT_END <- as.numeric(churns_intersect_blizoo$MONTHS_TO_CONTRACT_END)
table(churns_intersect_blizoo$MONTHS_TO_CONTRACT_END) #We suceedeed.
##
## 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14
## 1791 100 69 16 25 31 29 32 57 41 31 35 18 2 1
## 15 19
## 2 4
#We are also trying to make an imputation in factor variable adding more frequent category.
churns_intersect_blizoo$`CPE old-new` <- as.character(churns_intersect_blizoo$`CPE old-new`)
churns_intersect_blizoo$`CPE old-new`[is.na(churns_intersect_blizoo$`CPE old-new`)] <- 1
churns_intersect_blizoo$`CPE old-new` <- as.factor(churns_intersect_blizoo$`CPE old-new`)
#Now we have churn_intersect_blizoo cleaned.
#We are adding churn variable to churns_intersect blizoo. This will be our TARGET VARIABLE!
churns_intersect_blizoo$churn <- 1
table(churns_intersect_blizoo$churn)
##
## 1
## 2558
#Now we have churn_blizoo dataframe without missings.
#Now we want to see whether is possible to merge all_troubles and Blizoo, we suppose
#that maybe contract number is possibly Custom id.
dim(trouble_tickets_Q1_2017) #18659
## [1] 18659 9
dim(Blizoo_total_Base)
## [1] 75855 24
blizoo_troubles <- merge (Blizoo_total_Base, trouble_tickets_Q1_2017, copy=FALSE, suffix = c(".x", ".y"), by.x = "Customer ID", by.y = "contract_number", all.x=T, all.y=F, no.dups = TRUE)
dim(blizoo_troubles)
## [1] 149280 32
#It seems that we cannot use this database and that matching is impossible becease we got the mess, as we
#do not have an key variable for matching.
#Now we will merge Blizoo dataframe and churn_intersect blizoo (left join) adding churns to a Blizoo.
blizoo_with_churns <- merge (Blizoo_total_Base, churns_intersect_blizoo, copy=FALSE, suffix = c(".x", ".y"), by.x = "client_id", by.y = "client_id", all.x=T, all.y=F, no.dups = TRUE)
dim(blizoo_with_churns)
## [1] 75857 67
missmap(blizoo_with_churns)

#According to the missing map we see that in Blizoo we added a lot of variable from churns database
#which are empty and we will have to clean them, but first we will check are blizoo_with_churn$churn variable.
table(blizoo_with_churns$churn)
##
## 1
## 2558
#We see that customers with churns NA are not in churn database
filter(churns_intersect_blizoo, client_id==27)
## [1] client_id client id 2
## [3] Customer ID.x City
## [5] telephone mobile
## [7] service_type service_description
## [9] package_name monthly fee
## [11] network_type operator (blizoo/waw)
## [13] tech_tickets_last_3_months.x tech_tickets_last_6_months
## [15] tech_tickets_last_12_months date_first_activation
## [17] date_last_renewal account_type (res/bis)
## [19] date of contract expiry System churn date
## [21] Churn type Current status
## [23] Customer ID.y telephone_new
## [25] Price_package mobile_new
## [27] bus_res first_activation
## [29] contract_start service type
## [31] Service description Blizoo_Vip_tarifa
## [33] network type Monthly fee
## [35] tech_tickets_last_3_months.y average_invoice_last_3_months
## [37] CPE old-new Subscriber status
## [39] operator ADX_TT_count
## [41] tech_tickets_last_3_months_final TENURE_MONTHS
## [43] MONTHS_TO_CONTRACT_END churn
## <0 rows> (or 0-length row.names)
filter(churns_intersect_blizoo, client_id==109)
## [1] client_id client id 2
## [3] Customer ID.x City
## [5] telephone mobile
## [7] service_type service_description
## [9] package_name monthly fee
## [11] network_type operator (blizoo/waw)
## [13] tech_tickets_last_3_months.x tech_tickets_last_6_months
## [15] tech_tickets_last_12_months date_first_activation
## [17] date_last_renewal account_type (res/bis)
## [19] date of contract expiry System churn date
## [21] Churn type Current status
## [23] Customer ID.y telephone_new
## [25] Price_package mobile_new
## [27] bus_res first_activation
## [29] contract_start service type
## [31] Service description Blizoo_Vip_tarifa
## [33] network type Monthly fee
## [35] tech_tickets_last_3_months.y average_invoice_last_3_months
## [37] CPE old-new Subscriber status
## [39] operator ADX_TT_count
## [41] tech_tickets_last_3_months_final TENURE_MONTHS
## [43] MONTHS_TO_CONTRACT_END churn
## <0 rows> (or 0-length row.names)
#While those who are with churns =1 can be found.
filter(churns_intersect_blizoo, client_id==79)
## client_id client id 2 Customer ID.x City telephone mobile
## 1 79 79 0.9260636 City I 0.5044875 0.4634362
## service_type service_description
## 1 2p AtvDtvTel
## package_name
## 1 <U+0414><U+0422><U+0412> (<U+0424><U+0435><U+0432>. 2015),<U+0424><U+0438><U+043A><U+0441><U+043D><U+0430> <U+0422><U+0435><U+043B><U+0435><U+0444><U+043E><U+043D><U+0438><U+0458><U+0430> (<U+041D><U+043E><U+0435><U+043C><U+0432><U+0440><U+0438> 2012)
## monthly fee network_type operator (blizoo/waw)
## 1 598 DOCSIS Blizoo
## tech_tickets_last_3_months.x tech_tickets_last_6_months
## 1 0 0
## tech_tickets_last_12_months date_first_activation date_last_renewal
## 1 0 2008-05-01 2015-04-04
## account_type (res/bis) date of contract expiry System churn date
## 1 residental not available 1496188800
## Churn type Current status Customer ID.y telephone_new Price_package
## 1 voluntary_churn inactive 0.9260636 0.1193234 598
## mobile_new bus_res first_activation contract_start service type
## 1 0.006346465 residental 39569 1428105600 2p
## Service description Blizoo_Vip_tarifa network type Monthly fee
## 1 AtvDtvTel Blizoo docsis2.0 598
## tech_tickets_last_3_months.y average_invoice_last_3_months CPE old-new
## 1 0 598 old
## Subscriber status operator ADX_TT_count tech_tickets_last_3_months_final
## 1 active Blizoo 0 0
## TENURE_MONTHS MONTHS_TO_CONTRACT_END churn
## 1 110 1 1
str(blizoo_with_churns)
## 'data.frame': 75857 obs. of 67 variables:
## $ client_id : num 5 15 19 25 27 29 31 37 43 53 ...
## $ Customer ID : num 0.787 0.15 0.891 0.145 0.985 ...
## $ telephone_new.x : num 0.6276 0.0169 0.0905 0.1394 0.3512 ...
## $ Price_package.x : num 1199 0 691 1199 0 ...
## $ mobile_new.x : num 0.67 0.899 0.94 0.264 0.554 ...
## $ bus_res.x : chr "residental" "residental" "residental" "residental" ...
## $ first_activation.x : chr "39569" "39569" "39569" "39569" ...
## $ contract_start.x : POSIXct, format: "2015-12-01" "2017-02-06" ...
## $ contract_end : chr "43070" NA NA "43070" ...
## $ service type.x : chr "3p" "3p" "3p" "3p" ...
## $ Service description.x : chr "AtvDtvIntTel" "AtvDtvIntTel" "AtvDtvIntTel" "AtvDtvIntTel" ...
## $ Blizoo_Vip_tarifa.x : chr "Blizoo" "Vip" "Vip" "Blizoo" ...
## $ network type.x : chr "docsis3.0" "docsis3.0" "docsis3.0" "docsis3.0" ...
## $ Monthly fee.x : num 1258 NA NA 1764 NA ...
## $ tech_tickets_last_3_months : num 0 0 0 0 0 0 0 1 0 0 ...
## $ average_invoice_last_3_months.x : num 1258 1708 0 1474 1699 ...
## $ CPE name : chr "Powerbox HD-1822,Technicolor TC7200" "BLIZOO CAM modul chipset pairing,Powerbox HD-1822,Technicolor TC7200" "Powerbox HD-1822,Technicolor TC7200" "ADB ADB-2840C,Powerbox HD-1822,Technicolor TC7200" ...
## $ CPE old-new.x : chr "new" "new" "new" "new" ...
## $ Subscriber status.x : chr "active" "active" "active" "active" ...
## $ operator.x : chr "Blizoo" "Blizoo" "Blizoo" "Blizoo" ...
## $ ADX_TT_count.x : num 1 0 0 0 0 0 0 0 0 0 ...
## $ tech_tickets_last_3_months_final.x: num 1 0 0 0 0 0 0 1 0 0 ...
## $ TENURE_MONTHS.x : num 110 110 110 110 110 128 110 110 110 110 ...
## $ MONTHS_TO_CONTRACT_END.x : chr "7" "NO_CONTRACT" "NO_CONTRACT" "7" ...
## $ client id 2 : num NA NA NA NA NA NA NA NA NA 53 ...
## $ Customer ID.x : num NA NA NA NA NA ...
## $ City : chr NA NA NA NA ...
## $ telephone : num NA NA NA NA NA ...
## $ mobile : num NA NA NA NA NA ...
## $ service_type : chr NA NA NA NA ...
## $ service_description : chr NA NA NA NA ...
## $ package_name : chr NA NA NA NA ...
## $ monthly fee : num NA NA NA NA NA ...
## $ network_type : chr NA NA NA NA ...
## $ operator (blizoo/waw) : chr NA NA NA NA ...
## $ tech_tickets_last_3_months.x : num NA NA NA NA NA NA NA NA NA 0 ...
## $ tech_tickets_last_6_months : num NA NA NA NA NA NA NA NA NA 0 ...
## $ tech_tickets_last_12_months : num NA NA NA NA NA NA NA NA NA 0 ...
## $ date_first_activation : chr NA NA NA NA ...
## $ date_last_renewal : chr NA NA NA NA ...
## $ account_type (res/bis) : chr NA NA NA NA ...
## $ date of contract expiry : chr NA NA NA NA ...
## $ System churn date : num NA NA NA NA NA ...
## $ Churn type : chr NA NA NA NA ...
## $ Current status : chr NA NA NA NA ...
## $ Customer ID.y : num NA NA NA NA NA ...
## $ telephone_new.y : num NA NA NA NA NA ...
## $ Price_package.y : num NA NA NA NA NA ...
## $ mobile_new.y : num NA NA NA NA NA ...
## $ bus_res.y : chr NA NA NA NA ...
## $ first_activation.y : chr NA NA NA NA ...
## $ contract_start.y : num NA NA NA NA NA ...
## $ service type.y : chr NA NA NA NA ...
## $ Service description.y : chr NA NA NA NA ...
## $ Blizoo_Vip_tarifa.y : chr NA NA NA NA ...
## $ network type.y : chr NA NA NA NA ...
## $ Monthly fee.y : num NA NA NA NA NA ...
## $ tech_tickets_last_3_months.y : num NA NA NA NA NA NA NA NA NA 0 ...
## $ average_invoice_last_3_months.y : num NA NA NA NA NA ...
## $ CPE old-new.y : Factor w/ 3 levels "1","new","old": NA NA NA NA NA NA NA NA NA 2 ...
## $ Subscriber status.y : chr NA NA NA NA ...
## $ operator.y : chr NA NA NA NA ...
## $ ADX_TT_count.y : num NA NA NA NA NA NA NA NA NA 0 ...
## $ tech_tickets_last_3_months_final.y: num NA NA NA NA NA NA NA NA NA 0 ...
## $ TENURE_MONTHS.y : num NA NA NA NA NA NA NA NA NA 110 ...
## $ MONTHS_TO_CONTRACT_END.y : num NA NA NA NA NA NA NA NA NA 10 ...
## $ churn : num NA NA NA NA NA NA NA NA NA 1 ...
#We will use only full variables with ".x" extension
library(dplyr)
library(dbplyr)
library(tidyselect)
df <- blizoo_with_churns
#We select only variables with no missings, with .x extension.
df <- dplyr::select(df, ends_with (".x"), churn)
#So if we assume that all NA in blizoo_with_churns are actually not churns we can do:
df$churn[is.na(df$churn)] <- 0
table(df$churn)
##
## 0 1
## 73299 2558
#We are deleting the variables with very huge number of missings in order to save higher possible number
#of churns for better predictions.
df$average_invoice_last_3_months.x <- NULL
df$`Customer ID.x` <- NULL
df$`Monthly fee.x` <- NULL
#We want to again make imputation with average.
df$MONTHS_TO_CONTRACT_END <- as.character(df$MONTHS_TO_CONTRACT_END)
df$MONTHS_TO_CONTRACT_END[df$MONTHS_TO_CONTRACT_END.x == "NO_CONTRACT"] <- "0"
df$MONTHS_TO_CONTRACT_END <- as.numeric(df$MONTHS_TO_CONTRACT_END)
#We are filling the factor varaible with the more common factor.
str(df$`CPE old-new.x`)
## chr [1:75857] "new" "new" "new" "new" "new" "new" "new" "new" "new" ...
df$`CPE old-new.x`[is.na(df$`CPE old-new.x`)] <- "1"
df$`CPE old-new.x` <- as.factor(df$`CPE old-new.x`)
df[] <- lapply(df, function(x) ifelse(is.na(x), mean(x, na.rm = TRUE), x))
missmap(df)

df$MONTHS_TO_CONTRACT_END.x <- NULL
str(df)
## 'data.frame': 75857 obs. of 19 variables:
## $ telephone_new.x : num 0.6276 0.0169 0.0905 0.1394 0.3512 ...
## $ Price_package.x : num 1199 0 691 1199 0 ...
## $ mobile_new.x : num 0.67 0.899 0.94 0.264 0.554 ...
## $ bus_res.x : chr "residental" "residental" "residental" "residental" ...
## $ first_activation.x : chr "39569" "39569" "39569" "39569" ...
## $ contract_start.x : num 1.45e+09 1.49e+09 1.45e+09 1.45e+09 1.48e+09 ...
## $ service type.x : chr "3p" "3p" "3p" "3p" ...
## $ Service description.x : chr "AtvDtvIntTel" "AtvDtvIntTel" "AtvDtvIntTel" "AtvDtvIntTel" ...
## $ Blizoo_Vip_tarifa.x : chr "Blizoo" "Vip" "Vip" "Blizoo" ...
## $ network type.x : chr "docsis3.0" "docsis3.0" "docsis3.0" "docsis3.0" ...
## $ CPE old-new.x : int 2 2 2 2 2 2 2 2 2 2 ...
## $ Subscriber status.x : chr "active" "active" "active" "active" ...
## $ operator.x : chr "Blizoo" "Blizoo" "Blizoo" "Blizoo" ...
## $ ADX_TT_count.x : num 1 0 0 0 0 0 0 0 0 0 ...
## $ tech_tickets_last_3_months_final.x: num 1 0 0 0 0 0 0 1 0 0 ...
## $ TENURE_MONTHS.x : num 110 110 110 110 110 128 110 110 110 110 ...
## $ tech_tickets_last_3_months.x : num 0.113 0.113 0.113 0.113 0.113 ...
## $ churn : num 0 0 0 0 0 0 0 0 0 1 ...
## $ MONTHS_TO_CONTRACT_END : num 7 0 0 7 0 0 0 0 0 10 ...
dim(df)
## [1] 75857 19
#We have our FINAL cleaned database (although cleaning is "fast and dirty").
#We will explore a little our database. We see that we have only 3,5% of churns.
#Price_package can vary a lot, it may be important predictor.
#We have a lot of character variables that could be recoded as factor variables.
summary(df)
## telephone_new.x Price_package.x mobile_new.x
## Min. :0.0000114 Min. : 0.0 Min. :0.000009
## 1st Qu.:0.2530128 1st Qu.: 0.0 1st Qu.:0.275781
## Median :0.5035861 Median : 499.0 Median :0.501933
## Mean :0.5023232 Mean : 470.7 Mean :0.501933
## 3rd Qu.:0.7531127 3rd Qu.: 750.0 3rd Qu.:0.727331
## Max. :0.9999751 Max. :18587.0 Max. :0.999984
## bus_res.x first_activation.x contract_start.x
## Length:75857 Length:75857 Min. :1.014e+09
## Class :character Class :character 1st Qu.:1.417e+09
## Mode :character Mode :character Median :1.450e+09
## Mean :1.431e+09
## 3rd Qu.:1.472e+09
## Max. :1.491e+09
## service type.x Service description.x Blizoo_Vip_tarifa.x
## Length:75857 Length:75857 Length:75857
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
## network type.x CPE old-new.x Subscriber status.x operator.x
## Length:75857 Min. :1.000 Length:75857 Length:75857
## Class :character 1st Qu.:2.000 Class :character Class :character
## Mode :character Median :2.000 Mode :character Mode :character
## Mean :2.147
## 3rd Qu.:2.000
## Max. :3.000
## ADX_TT_count.x tech_tickets_last_3_months_final.x TENURE_MONTHS.x
## Min. : 0.0000 Min. : 0.0000 Min. : 1.00
## 1st Qu.: 0.0000 1st Qu.: 0.0000 1st Qu.: 22.00
## Median : 0.0000 Median : 0.0000 Median : 72.88
## Mean : 0.1214 Mean : 0.3594 Mean : 72.88
## 3rd Qu.: 0.0000 3rd Qu.: 0.0000 3rd Qu.:117.00
## Max. :11.0000 Max. :21.0000 Max. :214.00
## tech_tickets_last_3_months.x churn MONTHS_TO_CONTRACT_END
## Min. :0.0000 Min. :0.00000 Min. : 0.000
## 1st Qu.:0.1126 1st Qu.:0.00000 1st Qu.: 0.000
## Median :0.1126 Median :0.00000 Median : 0.000
## Mean :0.1126 Mean :0.03372 Mean : 1.235
## 3rd Qu.:0.1126 3rd Qu.:0.00000 3rd Qu.: 0.000
## Max. :6.0000 Max. :1.00000 Max. :22.000
str(df)
## 'data.frame': 75857 obs. of 19 variables:
## $ telephone_new.x : num 0.6276 0.0169 0.0905 0.1394 0.3512 ...
## $ Price_package.x : num 1199 0 691 1199 0 ...
## $ mobile_new.x : num 0.67 0.899 0.94 0.264 0.554 ...
## $ bus_res.x : chr "residental" "residental" "residental" "residental" ...
## $ first_activation.x : chr "39569" "39569" "39569" "39569" ...
## $ contract_start.x : num 1.45e+09 1.49e+09 1.45e+09 1.45e+09 1.48e+09 ...
## $ service type.x : chr "3p" "3p" "3p" "3p" ...
## $ Service description.x : chr "AtvDtvIntTel" "AtvDtvIntTel" "AtvDtvIntTel" "AtvDtvIntTel" ...
## $ Blizoo_Vip_tarifa.x : chr "Blizoo" "Vip" "Vip" "Blizoo" ...
## $ network type.x : chr "docsis3.0" "docsis3.0" "docsis3.0" "docsis3.0" ...
## $ CPE old-new.x : int 2 2 2 2 2 2 2 2 2 2 ...
## $ Subscriber status.x : chr "active" "active" "active" "active" ...
## $ operator.x : chr "Blizoo" "Blizoo" "Blizoo" "Blizoo" ...
## $ ADX_TT_count.x : num 1 0 0 0 0 0 0 0 0 0 ...
## $ tech_tickets_last_3_months_final.x: num 1 0 0 0 0 0 0 1 0 0 ...
## $ TENURE_MONTHS.x : num 110 110 110 110 110 128 110 110 110 110 ...
## $ tech_tickets_last_3_months.x : num 0.113 0.113 0.113 0.113 0.113 ...
## $ churn : num 0 0 0 0 0 0 0 0 0 1 ...
## $ MONTHS_TO_CONTRACT_END : num 7 0 0 7 0 0 0 0 0 10 ...
table(df$churn)
##
## 0 1
## 73299 2558
df$churn <- as.factor(df$churn)
df$bus_res <- as.factor(df$bus_res)
df$`Subscriber status` <- as.factor(df$`Subscriber status`)
df$`service type` <- as.factor(df$`service type`)
df$`Service description` <- as.factor(df$`Service description`)
df$Blizoo_Vip_tarifa <- as.factor(df$Blizoo_Vip_tarifa)
df$`network type` <- as.factor(df$`network type`)
df$operator <- as.factor(df$operator)
#When we look at the structure of the date we see some factor variable with only one factor
#e.g. bus_res which is useless, and we see some factors with huge number of levels (cpe name, service description)
#service type is combination of four possible services (from 1 to 4). We could basically recode
#this variable into four category categorical variable according to number of services
str(df)
## 'data.frame': 75857 obs. of 26 variables:
## $ telephone_new.x : num 0.6276 0.0169 0.0905 0.1394 0.3512 ...
## $ Price_package.x : num 1199 0 691 1199 0 ...
## $ mobile_new.x : num 0.67 0.899 0.94 0.264 0.554 ...
## $ bus_res.x : chr "residental" "residental" "residental" "residental" ...
## $ first_activation.x : chr "39569" "39569" "39569" "39569" ...
## $ contract_start.x : num 1.45e+09 1.49e+09 1.45e+09 1.45e+09 1.48e+09 ...
## $ service type.x : chr "3p" "3p" "3p" "3p" ...
## $ Service description.x : chr "AtvDtvIntTel" "AtvDtvIntTel" "AtvDtvIntTel" "AtvDtvIntTel" ...
## $ Blizoo_Vip_tarifa.x : chr "Blizoo" "Vip" "Vip" "Blizoo" ...
## $ network type.x : chr "docsis3.0" "docsis3.0" "docsis3.0" "docsis3.0" ...
## $ CPE old-new.x : int 2 2 2 2 2 2 2 2 2 2 ...
## $ Subscriber status.x : chr "active" "active" "active" "active" ...
## $ operator.x : chr "Blizoo" "Blizoo" "Blizoo" "Blizoo" ...
## $ ADX_TT_count.x : num 1 0 0 0 0 0 0 0 0 0 ...
## $ tech_tickets_last_3_months_final.x: num 1 0 0 0 0 0 0 1 0 0 ...
## $ TENURE_MONTHS.x : num 110 110 110 110 110 128 110 110 110 110 ...
## $ tech_tickets_last_3_months.x : num 0.113 0.113 0.113 0.113 0.113 ...
## $ churn : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 2 ...
## $ MONTHS_TO_CONTRACT_END : num 7 0 0 7 0 0 0 0 0 10 ...
## $ bus_res : Factor w/ 1 level "residental": 1 1 1 1 1 1 1 1 1 1 ...
## $ Subscriber status : Factor w/ 2 levels "active","ctv": 1 1 1 1 1 1 1 1 1 1 ...
## $ service type : Factor w/ 3 levels "1p","2p","3p": 3 3 3 3 1 3 3 3 3 3 ...
## $ Service description : Factor w/ 15 levels "Atv","AtvDtv",..: 4 4 4 4 13 4 4 4 4 4 ...
## $ Blizoo_Vip_tarifa : Factor w/ 2 levels "Blizoo","Vip": 1 2 2 1 2 2 1 1 2 1 ...
## $ network type : Factor w/ 4 levels "docsis2.0","docsis3.0",..: 2 2 2 2 2 2 2 2 2 1 ...
## $ operator : Factor w/ 1 level "Blizoo": 1 1 1 1 1 1 1 1 1 1 ...
table(df$`Service description`)
##
## Atv AtvDtv AtvDtvInt AtvDtvIntTel AtvDtvTel
## 718 9714 2536 52220 2290
## AtvInt AtvIntTel AtvTel Dtv DtvInt
## 3175 357 126 47 3
## DtvIntTel DtvTel Int IntTel Tel
## 22 2 4154 450 43
service <- function(atv){
atv <- as.character(atv)
if (atv=='Atv'| atv=='Dtv' | atv=='Int' | atv=='Tel'){
return('one service')
}else if(atv=='AtvDtv'| atv=='AtvInt' | atv=='AtvTel' | atv=='DtvInt' | atv=='DtvTel' | atv=='DtvTel' | atv=='IntTel'){
return('two services')
}else if(atv=='AtvDtvInt'| atv=='AtvDtvTel' | atv=='AtvIntTel' | atv=='DtvIntTel'){
return('three services')
}else{
return('four services')
}
}
df$`Service description` <- sapply(df$`Service description`,service)
table(df$`Service description`)
##
## four services one service three services two services
## 52220 4962 5205 13470
df$`Service description` <- as.factor(df$`Service description`)
#Now we will see how are predictors connected with our target churn variable
library(ggplot2)
g <- ggplot (df, aes(df$churn, df$`Subscriber status`)) + geom_jitter()
g

#we will check chi-square test and cramer's V. There is an assosiation but weak. Majority of churns are active users.
tbl <- table(df$churn, df$`Subscriber status`)
tbl
##
## active ctv
## 0 72863 436
## 1 2339 219
chisq.test(tbl)
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: tbl
## X-squared = 1823.3, df = 1, p-value < 2.2e-16
CramerV(df$churn, df$`Subscriber status`)
## [1] 0.1554298
#See the equipment. This connection is even weaker.
p <- ggplot (df, aes(df$churn, df$`CPE old-new`)) + geom_jitter()
p

tbl <- table(df$churn, df$`CPE old-new`)
tbl
##
## 1 2 3
## 0 4681 53190 15428
## 1 342 1434 782
chisq.test(tbl)
##
## Pearson's Chi-squared test
##
## data: tbl
## X-squared = 380.47, df = 2, p-value < 2.2e-16
CramerV(df$churn, df$`CPE old-new`)
## [1] 0.07082085
#Let' try to see proportion of the churns in relation to price. We see that churns have
#cheaper packages but the differens is small and not significant.
str(df)
## 'data.frame': 75857 obs. of 26 variables:
## $ telephone_new.x : num 0.6276 0.0169 0.0905 0.1394 0.3512 ...
## $ Price_package.x : num 1199 0 691 1199 0 ...
## $ mobile_new.x : num 0.67 0.899 0.94 0.264 0.554 ...
## $ bus_res.x : chr "residental" "residental" "residental" "residental" ...
## $ first_activation.x : chr "39569" "39569" "39569" "39569" ...
## $ contract_start.x : num 1.45e+09 1.49e+09 1.45e+09 1.45e+09 1.48e+09 ...
## $ service type.x : chr "3p" "3p" "3p" "3p" ...
## $ Service description.x : chr "AtvDtvIntTel" "AtvDtvIntTel" "AtvDtvIntTel" "AtvDtvIntTel" ...
## $ Blizoo_Vip_tarifa.x : chr "Blizoo" "Vip" "Vip" "Blizoo" ...
## $ network type.x : chr "docsis3.0" "docsis3.0" "docsis3.0" "docsis3.0" ...
## $ CPE old-new.x : int 2 2 2 2 2 2 2 2 2 2 ...
## $ Subscriber status.x : chr "active" "active" "active" "active" ...
## $ operator.x : chr "Blizoo" "Blizoo" "Blizoo" "Blizoo" ...
## $ ADX_TT_count.x : num 1 0 0 0 0 0 0 0 0 0 ...
## $ tech_tickets_last_3_months_final.x: num 1 0 0 0 0 0 0 1 0 0 ...
## $ TENURE_MONTHS.x : num 110 110 110 110 110 128 110 110 110 110 ...
## $ tech_tickets_last_3_months.x : num 0.113 0.113 0.113 0.113 0.113 ...
## $ churn : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 2 ...
## $ MONTHS_TO_CONTRACT_END : num 7 0 0 7 0 0 0 0 0 10 ...
## $ bus_res : Factor w/ 1 level "residental": 1 1 1 1 1 1 1 1 1 1 ...
## $ Subscriber status : Factor w/ 2 levels "active","ctv": 1 1 1 1 1 1 1 1 1 1 ...
## $ service type : Factor w/ 3 levels "1p","2p","3p": 3 3 3 3 1 3 3 3 3 3 ...
## $ Service description : Factor w/ 4 levels "four services",..: 1 1 1 1 2 1 1 1 1 1 ...
## $ Blizoo_Vip_tarifa : Factor w/ 2 levels "Blizoo","Vip": 1 2 2 1 2 2 1 1 2 1 ...
## $ network type : Factor w/ 4 levels "docsis2.0","docsis3.0",..: 2 2 2 2 2 2 2 2 2 1 ...
## $ operator : Factor w/ 1 level "Blizoo": 1 1 1 1 1 1 1 1 1 1 ...
#For example, here we see the difference, churns have longer contratcs , maybe they are
#new but unsatisfied users.
k <- ggplot(df, aes(MONTHS_TO_CONTRACT_END, fill=churn)) + geom_density(alpha=0.7, adjust=1.5) + xlim(0,20) + theme_bw()
k

o <- ggplot(df, aes(Price_package.x, fill=churn)) + geom_histogram(binwidth = 100, alpha=0.7, position = "identity") + xlim(0,2000) + theme_bw()
o

#We have a high number of not churns with price package zero. Maybe we should delete them
p <- ggplot(df, aes(Price_package.x, fill=churn)) + geom_density(alpha=0.7, adjust=1.5) + xlim(0,2000) + theme_bw()
p

df$Price_package[df$Price_package.x<200] <- mean(df$Price_package.x)
boxplot(df$Price_package.x)$out
## [1] 1999 1999 1999 1999 1999 1999 1999 1999 1999 1999 1999
## [12] 1999 1899 1999 1999 2599 1999 1999 1999 1999 1999 1999
## [23] 1949 1999 1999 1999 3197 2098 1999 1999 1999 3998 1999
## [34] 1999 1999 1999 1999 1999 1999 1999 1998 1999 1999 1999
## [45] 1999 1999 1999 1999 1999 1899 2158 1999 1999 1999 1999
## [56] 1999 1999 1999 1999 1999 1999 1999 2198 1999 1999 1999
## [67] 1999 2299 1899 15587 18587 18587 1999 1999 1999 1999 2000
## [78] 12320 4500 1999 1999 1999 1999 1999 1999 1999 1999 1999
## [89] 1999 1999 1999 1999 1999 1999 1999 1999 1999 1999 1999
outliers <- boxplot(df$Price_package.x, plot=FALSE)$out
print(outliers)
## [1] 1999 1999 1999 1999 1999 1999 1999 1999 1999 1999 1999
## [12] 1999 1899 1999 1999 2599 1999 1999 1999 1999 1999 1999
## [23] 1949 1999 1999 1999 3197 2098 1999 1999 1999 3998 1999
## [34] 1999 1999 1999 1999 1999 1999 1999 1998 1999 1999 1999
## [45] 1999 1999 1999 1999 1999 1899 2158 1999 1999 1999 1999
## [56] 1999 1999 1999 1999 1999 1999 1999 2198 1999 1999 1999
## [67] 1999 2299 1899 15587 18587 18587 1999 1999 1999 1999 2000
## [78] 12320 4500 1999 1999 1999 1999 1999 1999 1999 1999 1999
## [89] 1999 1999 1999 1999 1999 1999 1999 1999 1999 1999 1999
df <- df[-which(df$Price_package.x %in% outliers),]
boxplot(df$Price_package.x)

p <- ggplot(df, aes(Price_package.x, fill=churn)) + geom_density(alpha=0.7, adjust=1.5) + xlim(0,2000) + theme_bw()
p

#We do not see any groupings in this churns.
j <- ggplot(df, aes (df$`network type`, df$tech_tickets_last_3_months_final)) + geom_bar(aes(color=churn), stat = "identity")
j

t.test(Price_package.x ~ churn, data = df,
var.equal = TRUE, alternative = "less")
##
## Two Sample t-test
##
## data: Price_package.x by churn
## t = -12.73, df = 75756, p-value < 2.2e-16
## alternative hypothesis: true difference in means is less than 0
## 95 percent confidence interval:
## -Inf -93.69321
## sample estimates:
## mean in group 0 mean in group 1
## 464.2760 571.8725
res.aov <- aov(df$Price_package.x ~ df$churn, data = df)
# Summary of the analysis
summary(res.aov)
## Df Sum Sq Mean Sq F value Pr(>F)
## df$churn 1 2.860e+07 28603238 162 <2e-16 ***
## Residuals 75756 1.337e+10 176517
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
EtaSq(res.aov)
## eta.sq eta.sq.part
## df$churn 0.002134439 0.002134439
#We saw an intresting trend that docsis 2.0 network users have more tech tickets. This is important to
#us becease higher churn rates. Results of Anova are significant.
str(df)
## 'data.frame': 75758 obs. of 27 variables:
## $ telephone_new.x : num 0.6276 0.0169 0.0905 0.1394 0.3512 ...
## $ Price_package.x : num 1199 0 691 1199 0 ...
## $ mobile_new.x : num 0.67 0.899 0.94 0.264 0.554 ...
## $ bus_res.x : chr "residental" "residental" "residental" "residental" ...
## $ first_activation.x : chr "39569" "39569" "39569" "39569" ...
## $ contract_start.x : num 1.45e+09 1.49e+09 1.45e+09 1.45e+09 1.48e+09 ...
## $ service type.x : chr "3p" "3p" "3p" "3p" ...
## $ Service description.x : chr "AtvDtvIntTel" "AtvDtvIntTel" "AtvDtvIntTel" "AtvDtvIntTel" ...
## $ Blizoo_Vip_tarifa.x : chr "Blizoo" "Vip" "Vip" "Blizoo" ...
## $ network type.x : chr "docsis3.0" "docsis3.0" "docsis3.0" "docsis3.0" ...
## $ CPE old-new.x : int 2 2 2 2 2 2 2 2 2 2 ...
## $ Subscriber status.x : chr "active" "active" "active" "active" ...
## $ operator.x : chr "Blizoo" "Blizoo" "Blizoo" "Blizoo" ...
## $ ADX_TT_count.x : num 1 0 0 0 0 0 0 0 0 0 ...
## $ tech_tickets_last_3_months_final.x: num 1 0 0 0 0 0 0 1 0 0 ...
## $ TENURE_MONTHS.x : num 110 110 110 110 110 128 110 110 110 110 ...
## $ tech_tickets_last_3_months.x : num 0.113 0.113 0.113 0.113 0.113 ...
## $ churn : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 2 ...
## $ MONTHS_TO_CONTRACT_END : num 7 0 0 7 0 0 0 0 0 10 ...
## $ bus_res : Factor w/ 1 level "residental": 1 1 1 1 1 1 1 1 1 1 ...
## $ Subscriber status : Factor w/ 2 levels "active","ctv": 1 1 1 1 1 1 1 1 1 1 ...
## $ service type : Factor w/ 3 levels "1p","2p","3p": 3 3 3 3 1 3 3 3 3 3 ...
## $ Service description : Factor w/ 4 levels "four services",..: 1 1 1 1 2 1 1 1 1 1 ...
## $ Blizoo_Vip_tarifa : Factor w/ 2 levels "Blizoo","Vip": 1 2 2 1 2 2 1 1 2 1 ...
## $ network type : Factor w/ 4 levels "docsis2.0","docsis3.0",..: 2 2 2 2 2 2 2 2 2 1 ...
## $ operator : Factor w/ 1 level "Blizoo": 1 1 1 1 1 1 1 1 1 1 ...
## $ Price_package : num 1199 471 691 1199 471 ...
res.aov <- aov(df$tech_tickets_last_3_months.x ~ df$`network type`, data = df)
# Summary of the analysis
summary(res.aov)
## Df Sum Sq Mean Sq F value Pr(>F)
## df$`network type` 3 0.1 0.04092 6.576 0.000194 ***
## Residuals 75754 471.4 0.00622
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
EtaSq(res.aov)
## eta.sq eta.sq.part
## df$`network type` 0.0002603378 0.0002603378
#We want to see the proportion of variance explained by network type. Basicaly less of 1% of tickets
#is explained by network type.
#Let' see the connection between churns and network type. Very weak assossiation.
tbl1 <- table(df$churn, df$`network type`)
tbl1
##
## docsis2.0 docsis3.0 lan TV
## 0 21010 35057 7704 9430
## 1 871 778 253 655
chisq.test(tbl1)
##
## Pearson's Chi-squared test
##
## data: tbl1
## X-squared = 485.79, df = 3, p-value < 2.2e-16
CramerV(df$churn, df$`network type`)
## [1] 0.0800771
i <- ggplot(df, aes(df$`network type`, fill=churn)) + geom_bar(alpha=0.7, stat = "count", position = "dodge") + theme_bw()
i

j <- ggplot(df, aes(df$`Service description`, fill=churn)) + geom_bar(alpha=0.7, stat = "count", position = "dodge") + theme_bw()
j

CramerV(df$churn, df$`Service description`)
## [1] 0.06572216
#Now we will deploy logit model. But first we must delete unimportant variables from the database
#e.g. character variables and IDs, as well as factors with one factor. Character variables could be useful when categorised
#if possible but we do not have time for this at the moment.
str(df)
## 'data.frame': 75758 obs. of 27 variables:
## $ telephone_new.x : num 0.6276 0.0169 0.0905 0.1394 0.3512 ...
## $ Price_package.x : num 1199 0 691 1199 0 ...
## $ mobile_new.x : num 0.67 0.899 0.94 0.264 0.554 ...
## $ bus_res.x : chr "residental" "residental" "residental" "residental" ...
## $ first_activation.x : chr "39569" "39569" "39569" "39569" ...
## $ contract_start.x : num 1.45e+09 1.49e+09 1.45e+09 1.45e+09 1.48e+09 ...
## $ service type.x : chr "3p" "3p" "3p" "3p" ...
## $ Service description.x : chr "AtvDtvIntTel" "AtvDtvIntTel" "AtvDtvIntTel" "AtvDtvIntTel" ...
## $ Blizoo_Vip_tarifa.x : chr "Blizoo" "Vip" "Vip" "Blizoo" ...
## $ network type.x : chr "docsis3.0" "docsis3.0" "docsis3.0" "docsis3.0" ...
## $ CPE old-new.x : int 2 2 2 2 2 2 2 2 2 2 ...
## $ Subscriber status.x : chr "active" "active" "active" "active" ...
## $ operator.x : chr "Blizoo" "Blizoo" "Blizoo" "Blizoo" ...
## $ ADX_TT_count.x : num 1 0 0 0 0 0 0 0 0 0 ...
## $ tech_tickets_last_3_months_final.x: num 1 0 0 0 0 0 0 1 0 0 ...
## $ TENURE_MONTHS.x : num 110 110 110 110 110 128 110 110 110 110 ...
## $ tech_tickets_last_3_months.x : num 0.113 0.113 0.113 0.113 0.113 ...
## $ churn : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 2 ...
## $ MONTHS_TO_CONTRACT_END : num 7 0 0 7 0 0 0 0 0 10 ...
## $ bus_res : Factor w/ 1 level "residental": 1 1 1 1 1 1 1 1 1 1 ...
## $ Subscriber status : Factor w/ 2 levels "active","ctv": 1 1 1 1 1 1 1 1 1 1 ...
## $ service type : Factor w/ 3 levels "1p","2p","3p": 3 3 3 3 1 3 3 3 3 3 ...
## $ Service description : Factor w/ 4 levels "four services",..: 1 1 1 1 2 1 1 1 1 1 ...
## $ Blizoo_Vip_tarifa : Factor w/ 2 levels "Blizoo","Vip": 1 2 2 1 2 2 1 1 2 1 ...
## $ network type : Factor w/ 4 levels "docsis2.0","docsis3.0",..: 2 2 2 2 2 2 2 2 2 1 ...
## $ operator : Factor w/ 1 level "Blizoo": 1 1 1 1 1 1 1 1 1 1 ...
## $ Price_package : num 1199 471 691 1199 471 ...
#We should explore a little bit more numeric variable that appear to be factors. We
#will make categorical out of numeric becease it seems that smaller values are far more frequent.
table(df$ADX_TT_count)
##
## 0 1 2 3 4 5 6 7 9 10 11
## 68743 5432 1155 309 86 17 11 2 1 1 1
tt <- function(adx){
adx <- as.numeric(adx)
if (adx>2){
return('thre and more')
}else if(adx==2){
return('two_adx')
}else if(adx==1){
return('one_adx')
}else{
return('zero')
}
}
df$ADX_TT_count <- sapply(df$ADX_TT_count,tt)
table(df$ADX_TT_count)
##
## one_adx thre and more two_adx zero
## 5432 428 1155 68743
df$ADX_TT_count <- as.factor(df$ADX_TT_count)
table(df$tech_tickets_last_3_months_final.x)
##
## 0 1 2 3 4 5 6 7 8 9 10 11
## 59819 8952 4803 1053 716 168 133 45 34 11 9 4
## 12 13 14 15 17 21
## 2 2 3 1 1 2
tick <- function(ticket){
ticket <- as.numeric(ticket)
if (ticket>4){
return('four and more')
}else if(ticket==3){
return('three_ticket')
}else if(ticket==2){
return('two_ticket')
}else if(ticket==1){
return('one_ticket')
}else{
return('zero')
}
}
df$tech_tickets_last_3_months_final.x <- sapply(df$tech_tickets_last_3_months_final.x,tick)
table(df$tech_tickets_last_3_months_final.x)
##
## four and more one_ticket three_ticket two_ticket zero
## 415 8952 1053 4803 60535
df$tech_tickets_last_3_months_final.x <- as.factor(df$tech_tickets_last_3_months_final.x)
# Drop the columns of the dataframe. Many of this variables could be usefull, but this is a pracise
#example and we do not have time to dig deeper. For example, date varibles could be very usefull.
str(df)
## 'data.frame': 75758 obs. of 28 variables:
## $ telephone_new.x : num 0.6276 0.0169 0.0905 0.1394 0.3512 ...
## $ Price_package.x : num 1199 0 691 1199 0 ...
## $ mobile_new.x : num 0.67 0.899 0.94 0.264 0.554 ...
## $ bus_res.x : chr "residental" "residental" "residental" "residental" ...
## $ first_activation.x : chr "39569" "39569" "39569" "39569" ...
## $ contract_start.x : num 1.45e+09 1.49e+09 1.45e+09 1.45e+09 1.48e+09 ...
## $ service type.x : chr "3p" "3p" "3p" "3p" ...
## $ Service description.x : chr "AtvDtvIntTel" "AtvDtvIntTel" "AtvDtvIntTel" "AtvDtvIntTel" ...
## $ Blizoo_Vip_tarifa.x : chr "Blizoo" "Vip" "Vip" "Blizoo" ...
## $ network type.x : chr "docsis3.0" "docsis3.0" "docsis3.0" "docsis3.0" ...
## $ CPE old-new.x : int 2 2 2 2 2 2 2 2 2 2 ...
## $ Subscriber status.x : chr "active" "active" "active" "active" ...
## $ operator.x : chr "Blizoo" "Blizoo" "Blizoo" "Blizoo" ...
## $ ADX_TT_count.x : num 1 0 0 0 0 0 0 0 0 0 ...
## $ tech_tickets_last_3_months_final.x: Factor w/ 5 levels "four and more",..: 2 5 5 5 5 5 5 2 5 5 ...
## $ TENURE_MONTHS.x : num 110 110 110 110 110 128 110 110 110 110 ...
## $ tech_tickets_last_3_months.x : num 0.113 0.113 0.113 0.113 0.113 ...
## $ churn : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 2 ...
## $ MONTHS_TO_CONTRACT_END : num 7 0 0 7 0 0 0 0 0 10 ...
## $ bus_res : Factor w/ 1 level "residental": 1 1 1 1 1 1 1 1 1 1 ...
## $ Subscriber status : Factor w/ 2 levels "active","ctv": 1 1 1 1 1 1 1 1 1 1 ...
## $ service type : Factor w/ 3 levels "1p","2p","3p": 3 3 3 3 1 3 3 3 3 3 ...
## $ Service description : Factor w/ 4 levels "four services",..: 1 1 1 1 2 1 1 1 1 1 ...
## $ Blizoo_Vip_tarifa : Factor w/ 2 levels "Blizoo","Vip": 1 2 2 1 2 2 1 1 2 1 ...
## $ network type : Factor w/ 4 levels "docsis2.0","docsis3.0",..: 2 2 2 2 2 2 2 2 2 1 ...
## $ operator : Factor w/ 1 level "Blizoo": 1 1 1 1 1 1 1 1 1 1 ...
## $ Price_package : num 1199 471 691 1199 471 ...
## $ ADX_TT_count : Factor w/ 4 levels "one_adx","thre and more",..: 1 4 4 4 4 4 4 4 4 4 ...
missmap(df)

df$`service type.x` <- as.factor(df$`service type.x`)
table(df$`service type.x`)
##
## 1p 2p 3p
## 14674 8577 52507
df$`CPE old-new.x` <- as.factor(df$`CPE old-new.x`)
table(df$`CPE old-new.x`)
##
## 1 2 3
## 5022 54534 16202
df1 <- subset(df,select = -c(bus_res, bus_res.x, first_activation.x, contract_start.x, operator, `Service description.x`, Blizoo_Vip_tarifa.x, `network type.x`, `Subscriber status.x`))
str(df1)
## 'data.frame': 75758 obs. of 19 variables:
## $ telephone_new.x : num 0.6276 0.0169 0.0905 0.1394 0.3512 ...
## $ Price_package.x : num 1199 0 691 1199 0 ...
## $ mobile_new.x : num 0.67 0.899 0.94 0.264 0.554 ...
## $ service type.x : Factor w/ 3 levels "1p","2p","3p": 3 3 3 3 1 3 3 3 3 3 ...
## $ CPE old-new.x : Factor w/ 3 levels "1","2","3": 2 2 2 2 2 2 2 2 2 2 ...
## $ operator.x : chr "Blizoo" "Blizoo" "Blizoo" "Blizoo" ...
## $ ADX_TT_count.x : num 1 0 0 0 0 0 0 0 0 0 ...
## $ tech_tickets_last_3_months_final.x: Factor w/ 5 levels "four and more",..: 2 5 5 5 5 5 5 2 5 5 ...
## $ TENURE_MONTHS.x : num 110 110 110 110 110 128 110 110 110 110 ...
## $ tech_tickets_last_3_months.x : num 0.113 0.113 0.113 0.113 0.113 ...
## $ churn : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 2 ...
## $ MONTHS_TO_CONTRACT_END : num 7 0 0 7 0 0 0 0 0 10 ...
## $ Subscriber status : Factor w/ 2 levels "active","ctv": 1 1 1 1 1 1 1 1 1 1 ...
## $ service type : Factor w/ 3 levels "1p","2p","3p": 3 3 3 3 1 3 3 3 3 3 ...
## $ Service description : Factor w/ 4 levels "four services",..: 1 1 1 1 2 1 1 1 1 1 ...
## $ Blizoo_Vip_tarifa : Factor w/ 2 levels "Blizoo","Vip": 1 2 2 1 2 2 1 1 2 1 ...
## $ network type : Factor w/ 4 levels "docsis2.0","docsis3.0",..: 2 2 2 2 2 2 2 2 2 1 ...
## $ Price_package : num 1199 471 691 1199 471 ...
## $ ADX_TT_count : Factor w/ 4 levels "one_adx","thre and more",..: 1 4 4 4 4 4 4 4 4 4 ...
#We should standardize the numeric variables in order to have more interpretable results in logit model.
library(psycho)
df2 <- df %>%
psycho::standardize()
summary(df2)
## telephone_new.x Price_package.x mobile_new.x
## Min. :-1.740718 Min. :-1.11251 Min. :-1.8222775
## 1st Qu.:-0.863982 1st Qu.:-1.11251 1st Qu.:-0.8209182
## Median : 0.004379 Median : 0.07393 Median :-0.0000719
## Mean : 0.000000 Mean : 0.00000 Mean : 0.0000000
## 3rd Qu.: 0.868890 3rd Qu.: 0.66834 3rd Qu.: 0.8183075
## Max. : 1.724818 Max. : 3.16723 Max. : 1.8080704
## bus_res.x first_activation.x contract_start.x service type.x
## Length:75758 Length:75758 Min. :-6.3953 1p:14674
## Class :character Class :character 1st Qu.:-0.2021 2p: 8577
## Mode :character Mode :character Median : 0.3037 3p:52507
## Mean : 0.0000
## 3rd Qu.: 0.6449
## Max. : 0.9317
## Service description.x Blizoo_Vip_tarifa.x network type.x
## Length:75758 Length:75758 Length:75758
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
## CPE old-new.x Subscriber status.x operator.x ADX_TT_count.x
## 1: 5022 Length:75758 Length:75758 Min. :-0.2797
## 2:54534 Class :character Class :character 1st Qu.:-0.2797
## 3:16202 Mode :character Mode :character Median :-0.2797
## Mean : 0.0000
## 3rd Qu.:-0.2797
## Max. :25.0292
## tech_tickets_last_3_months_final.x TENURE_MONTHS.x
## four and more: 415 Min. :-1.4720090
## one_ticket : 8952 1st Qu.:-1.0419249
## three_ticket : 1053 Median : 0.0001635
## two_ticket : 4803 Mean : 0.0000000
## zero :60535 3rd Qu.: 0.9036938
## Max. : 2.8902729
## tech_tickets_last_3_months.x churn MONTHS_TO_CONTRACT_END
## Min. :-1.42705 0:73201 Min. :-0.4311
## 1st Qu.:-0.00002 1: 2557 1st Qu.:-0.4311
## Median :-0.00002 Median :-0.4311
## Mean : 0.00000 Mean : 0.0000
## 3rd Qu.:-0.00002 3rd Qu.:-0.4311
## Max. :74.62190 Max. : 7.2731
## bus_res Subscriber status service type Service description
## residental:75758 active:75103 1p:14674 four services :52128
## ctv : 655 2p: 8577 one service : 4960
## 3p:52507 three services: 5201
## two services :13469
##
##
## Blizoo_Vip_tarifa network type operator Price_package
## Blizoo:38444 docsis2.0:21881 Blizoo:75758 Min. :-1.6066
## Vip :37314 docsis3.0:35835 1st Qu.:-0.6028
## lan : 7957 Median :-0.4981
## TV :10085 Mean : 0.0000
## 3rd Qu.: 0.4288
## Max. : 4.3255
## ADX_TT_count
## one_adx : 5432
## thre and more: 428
## two_adx : 1155
## zero :68743
##
##
str(df2)
## 'data.frame': 75758 obs. of 28 variables:
## $ telephone_new.x : num 0.434 -1.682 -1.427 -1.258 -0.524 ...
## $ Price_package.x : num 1.74 -1.11 0.53 1.74 -1.11 ...
## $ mobile_new.x : num 0.609 1.441 1.592 -0.864 0.188 ...
## $ bus_res.x : chr "residental" "residental" "residental" "residental" ...
## $ first_activation.x : chr "39569" "39569" "39569" "39569" ...
## $ contract_start.x : num 0.282 0.857 0.252 0.282 0.731 ...
## $ service type.x : Factor w/ 3 levels "1p","2p","3p": 3 3 3 3 1 3 3 3 3 3 ...
## $ Service description.x : chr "AtvDtvIntTel" "AtvDtvIntTel" "AtvDtvIntTel" "AtvDtvIntTel" ...
## $ Blizoo_Vip_tarifa.x : chr "Blizoo" "Vip" "Vip" "Blizoo" ...
## $ network type.x : chr "docsis3.0" "docsis3.0" "docsis3.0" "docsis3.0" ...
## $ CPE old-new.x : Factor w/ 3 levels "1","2","3": 2 2 2 2 2 2 2 2 2 2 ...
## $ Subscriber status.x : chr "active" "active" "active" "active" ...
## $ operator.x : chr "Blizoo" "Blizoo" "Blizoo" "Blizoo" ...
## $ ADX_TT_count.x : num 2.02 -0.28 -0.28 -0.28 -0.28 ...
## $ tech_tickets_last_3_months_final.x: Factor w/ 5 levels "four and more",..: 2 5 5 5 5 5 5 2 5 5 ...
## $ TENURE_MONTHS.x : num 0.76 0.76 0.76 0.76 0.76 ...
## $ tech_tickets_last_3_months.x : num -1.88e-05 -1.88e-05 -1.88e-05 -1.88e-05 -1.88e-05 ...
## $ churn : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 2 ...
## $ MONTHS_TO_CONTRACT_END : num 2.02 -0.431 -0.431 2.02 -0.431 ...
## $ bus_res : Factor w/ 1 level "residental": 1 1 1 1 1 1 1 1 1 1 ...
## $ Subscriber status : Factor w/ 2 levels "active","ctv": 1 1 1 1 1 1 1 1 1 1 ...
## $ service type : Factor w/ 3 levels "1p","2p","3p": 3 3 3 3 1 3 3 3 3 3 ...
## $ Service description : Factor w/ 4 levels "four services",..: 1 1 1 1 2 1 1 1 1 1 ...
## $ Blizoo_Vip_tarifa : Factor w/ 2 levels "Blizoo","Vip": 1 2 2 1 2 2 1 1 2 1 ...
## $ network type : Factor w/ 4 levels "docsis2.0","docsis3.0",..: 2 2 2 2 2 2 2 2 2 1 ...
## $ operator : Factor w/ 1 level "Blizoo": 1 1 1 1 1 1 1 1 1 1 ...
## $ Price_package : num 2.097 -0.603 0.214 2.097 -0.603 ...
## $ ADX_TT_count : Factor w/ 4 levels "one_adx","thre and more",..: 1 4 4 4 4 4 4 4 4 4 ...
df2 %>%
dplyr::select(Price_package.x, TENURE_MONTHS.x) %>%
gather(Variable, Value) %>%
ggplot(aes(x=Value, fill=Variable)) +
geom_density(alpha=0.5) +
geom_vline(aes(xintercept=0)) +
xlim(-5, 5) +
theme_bw() +
scale_fill_brewer(palette="Spectral")

#We see that variables ara far away from normal distributions. Linear transformation
#would make a better fit. We will leave this as standrised in this moment as it is.
#We will try a logit model for nnet library to make a rough accuracy rate.We will put
#all variables as predictors.
df$`CPE old-new` <- as.factor(df$`CPE old-new`)
df1 <- subset(df,select = -c(operator, bus_res, telephone_new.x, bus_res.x, first_activation.x, `service type.x`, `Service description.x`, Blizoo_Vip_tarifa.x, `network type.x`, `CPE old-new.x`, `Subscriber status.x`, operator.x, ADX_TT_count.x, MONTHS_TO_CONTRACT_END, tech_tickets_last_3_months.x))
str(df1)
## 'data.frame': 75758 obs. of 14 variables:
## $ Price_package.x : num 1199 0 691 1199 0 ...
## $ mobile_new.x : num 0.67 0.899 0.94 0.264 0.554 ...
## $ contract_start.x : num 1.45e+09 1.49e+09 1.45e+09 1.45e+09 1.48e+09 ...
## $ tech_tickets_last_3_months_final.x: Factor w/ 5 levels "four and more",..: 2 5 5 5 5 5 5 2 5 5 ...
## $ TENURE_MONTHS.x : num 110 110 110 110 110 128 110 110 110 110 ...
## $ churn : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 2 ...
## $ Subscriber status : Factor w/ 2 levels "active","ctv": 1 1 1 1 1 1 1 1 1 1 ...
## $ service type : Factor w/ 3 levels "1p","2p","3p": 3 3 3 3 1 3 3 3 3 3 ...
## $ Service description : Factor w/ 4 levels "four services",..: 1 1 1 1 2 1 1 1 1 1 ...
## $ Blizoo_Vip_tarifa : Factor w/ 2 levels "Blizoo","Vip": 1 2 2 1 2 2 1 1 2 1 ...
## $ network type : Factor w/ 4 levels "docsis2.0","docsis3.0",..: 2 2 2 2 2 2 2 2 2 1 ...
## $ Price_package : num 1199 471 691 1199 471 ...
## $ ADX_TT_count : Factor w/ 4 levels "one_adx","thre and more",..: 1 4 4 4 4 4 4 4 4 4 ...
## $ CPE old-new : Factor w/ 3 levels "1","2","3": 2 2 2 2 2 2 2 2 2 2 ...
#Explore the matrice of interrcorelation
cor <- psycho::correlation(df1,
type = "full",
method = "pearson",
adjust = "none")
summary(cor)
## Price_package.x mobile_new.x contract_start.x
## Price_package.x
## mobile_new.x 0.01
## contract_start.x -0.36*** 0
## TENURE_MONTHS.x 0.24*** 0 -0.33***
## Price_package 0.88*** 0.01 -0.14***
## TENURE_MONTHS.x
## Price_package.x
## mobile_new.x
## contract_start.x
## TENURE_MONTHS.x
## Price_package 0.19***
plot(cor)

#creating training and test subset
library(caTools)
set.seed(101)
split = sample.split(df1$churn, SplitRatio = 0.50)
final.train = subset(df1, split == TRUE)
final.test = subset(df1, split == FALSE)
summary(final.train)
## Price_package.x mobile_new.x contract_start.x
## Min. : 0.0 Min. :0.0000094 Min. :1.017e+09
## 1st Qu.: 0.0 1st Qu.:0.2738808 1st Qu.:1.417e+09
## Median : 499.0 Median :0.5019332 Median :1.450e+09
## Mean : 469.8 Mean :0.5019495 Mean :1.430e+09
## 3rd Qu.: 798.0 3rd Qu.:0.7278121 3rd Qu.:1.472e+09
## Max. :1798.0 Max. :0.9999836 Max. :1.491e+09
## tech_tickets_last_3_months_final.x TENURE_MONTHS.x churn
## four and more: 203 Min. : 1.00 0:36600
## one_ticket : 4478 1st Qu.: 23.00 1: 1278
## three_ticket : 513 Median : 73.00
## two_ticket : 2419 Mean : 73.31
## zero :30265 3rd Qu.:117.00
## Max. :214.00
## Subscriber status service type Service description Blizoo_Vip_tarifa
## active:37552 1p: 7369 four services :26060 Blizoo:19351
## ctv : 326 2p: 4258 one service : 2508 Vip :18527
## 3p:26251 three services: 2603
## two services : 6707
##
##
## network type Price_package ADX_TT_count CPE old-new
## docsis2.0:11056 Min. : 200.0 one_adx : 2747 1: 2519
## docsis3.0:17797 1st Qu.: 470.7 thre and more: 208 2:27120
## lan : 3985 Median : 499.0 two_adx : 565 3: 8239
## TV : 5040 Mean : 634.4 zero :34358
## 3rd Qu.: 798.0
## Max. :1798.0
summary(final.test)
## Price_package.x mobile_new.x contract_start.x
## Min. : 0 Min. :0.000009 Min. :1.014e+09
## 1st Qu.: 0 1st Qu.:0.277662 1st Qu.:1.417e+09
## Median : 499 Median :0.501933 Median :1.450e+09
## Mean : 466 Mean :0.501956 Mean :1.431e+09
## 3rd Qu.: 749 3rd Qu.:0.726951 3rd Qu.:1.473e+09
## Max. :1800 Max. :0.999945 Max. :1.491e+09
## tech_tickets_last_3_months_final.x TENURE_MONTHS.x churn
## four and more: 212 Min. : 1.00 0:36601
## one_ticket : 4474 1st Qu.: 22.00 1: 1279
## three_ticket : 540 Median : 72.88
## two_ticket : 2384 Mean : 72.44
## zero :30270 3rd Qu.:116.00
## Max. :214.00
## Subscriber status service type Service description Blizoo_Vip_tarifa
## active:37551 1p: 7305 four services :26068 Blizoo:19093
## ctv : 329 2p: 4319 one service : 2452 Vip :18787
## 3p:26256 three services: 2598
## two services : 6762
##
##
## network type Price_package ADX_TT_count CPE old-new
## docsis2.0:10825 Min. : 200.0 one_adx : 2685 1: 2503
## docsis3.0:18038 1st Qu.: 470.7 thre and more: 220 2:27414
## lan : 3972 Median : 499.0 two_adx : 590 3: 7963
## TV : 5045 Mean : 632.3 zero :34385
## 3rd Qu.: 749.0
## Max. :1800.0
#logit model with all predictor. We see that only "subscriber status", "Tenure months", "old CPE"
#are significant preditcors.
str(df1)
## 'data.frame': 75758 obs. of 14 variables:
## $ Price_package.x : num 1199 0 691 1199 0 ...
## $ mobile_new.x : num 0.67 0.899 0.94 0.264 0.554 ...
## $ contract_start.x : num 1.45e+09 1.49e+09 1.45e+09 1.45e+09 1.48e+09 ...
## $ tech_tickets_last_3_months_final.x: Factor w/ 5 levels "four and more",..: 2 5 5 5 5 5 5 2 5 5 ...
## $ TENURE_MONTHS.x : num 110 110 110 110 110 128 110 110 110 110 ...
## $ churn : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 2 ...
## $ Subscriber status : Factor w/ 2 levels "active","ctv": 1 1 1 1 1 1 1 1 1 1 ...
## $ service type : Factor w/ 3 levels "1p","2p","3p": 3 3 3 3 1 3 3 3 3 3 ...
## $ Service description : Factor w/ 4 levels "four services",..: 1 1 1 1 2 1 1 1 1 1 ...
## $ Blizoo_Vip_tarifa : Factor w/ 2 levels "Blizoo","Vip": 1 2 2 1 2 2 1 1 2 1 ...
## $ network type : Factor w/ 4 levels "docsis2.0","docsis3.0",..: 2 2 2 2 2 2 2 2 2 1 ...
## $ Price_package : num 1199 471 691 1199 471 ...
## $ ADX_TT_count : Factor w/ 4 levels "one_adx","thre and more",..: 1 4 4 4 4 4 4 4 4 4 ...
## $ CPE old-new : Factor w/ 3 levels "1","2","3": 2 2 2 2 2 2 2 2 2 2 ...
missmap(df1)

logit <- glm(formula = churn ~ ., family = binomial(logit), data = df1)
summary(logit)
##
## Call:
## glm(formula = churn ~ ., family = binomial(logit), data = df1)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.4888 -0.2941 -0.2173 -0.1678 3.2595
##
## Coefficients:
## Estimate Std. Error
## (Intercept) -9.829e-01 7.402e-01
## Price_package.x 2.293e-04 1.715e-04
## mobile_new.x -2.144e-02 7.625e-02
## contract_start.x -1.532e-09 4.102e-10
## tech_tickets_last_3_months_final.xone_ticket -2.108e-01 4.124e-01
## tech_tickets_last_3_months_final.xthree_ticket -6.786e-01 4.509e-01
## tech_tickets_last_3_months_final.xtwo_ticket -5.116e-01 4.207e-01
## tech_tickets_last_3_months_final.xzero -9.381e-02 4.084e-01
## TENURE_MONTHS.x -8.558e-03 5.381e-04
## `Subscriber status`ctv 2.799e+00 9.130e-02
## `service type`2p 1.324e-01 1.836e-01
## `service type`3p 5.063e-01 3.138e-01
## `Service description`one service 4.974e-01 3.125e-01
## `Service description`three services 3.861e-01 2.422e-01
## `Service description`two services 5.254e-01 2.709e-01
## Blizoo_Vip_tarifaVip -4.425e-01 1.102e-01
## `network type`docsis3.0 -5.783e-03 7.213e-02
## `network type`lan 1.034e-03 8.244e-02
## `network type`TV 7.640e-01 1.688e-01
## Price_package 8.463e-04 2.038e-04
## ADX_TT_countthre and more -2.848e-01 4.605e-01
## ADX_TT_counttwo_adx 2.163e-03 2.246e-01
## ADX_TT_countzero -3.275e-01 1.192e-01
## `CPE old-new`2 -5.375e-01 9.670e-02
## `CPE old-new`3 -2.860e-01 8.623e-02
## z value Pr(>|z|)
## (Intercept) -1.328 0.184251
## Price_package.x 1.337 0.181140
## mobile_new.x -0.281 0.778608
## contract_start.x -3.736 0.000187 ***
## tech_tickets_last_3_months_final.xone_ticket -0.511 0.609220
## tech_tickets_last_3_months_final.xthree_ticket -1.505 0.132312
## tech_tickets_last_3_months_final.xtwo_ticket -1.216 0.223947
## tech_tickets_last_3_months_final.xzero -0.230 0.818315
## TENURE_MONTHS.x -15.904 < 2e-16 ***
## `Subscriber status`ctv 30.663 < 2e-16 ***
## `service type`2p 0.721 0.470708
## `service type`3p 1.614 0.106627
## `Service description`one service 1.592 0.111495
## `Service description`three services 1.594 0.110988
## `Service description`two services 1.940 0.052432 .
## Blizoo_Vip_tarifaVip -4.015 5.94e-05 ***
## `network type`docsis3.0 -0.080 0.936099
## `network type`lan 0.013 0.989994
## `network type`TV 4.527 5.99e-06 ***
## Price_package 4.152 3.30e-05 ***
## ADX_TT_countthre and more -0.618 0.536278
## ADX_TT_counttwo_adx 0.010 0.992314
## ADX_TT_countzero -2.747 0.006017 **
## `CPE old-new`2 -5.559 2.71e-08 ***
## `CPE old-new`3 -3.316 0.000912 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 22357 on 75757 degrees of freedom
## Residual deviance: 20634 on 75733 degrees of freedom
## AIC: 20684
##
## Number of Fisher Scoring iterations: 7
#We run a stepwise algorithm which puts in model predictors one by one
#and calculates AIC (AIC = 2k - 2ln(L), L is maximum of maximum likelihood funcion and k is the number
#of predictors. Smallest the AIC it is better - with fewer predictors we have more information and
#better prediction.
#Now we left only important predictors.
logit1 <- step(logit)
## Start: AIC=20684.06
## churn ~ Price_package.x + mobile_new.x + contract_start.x + tech_tickets_last_3_months_final.x +
## TENURE_MONTHS.x + `Subscriber status` + `service type` +
## `Service description` + Blizoo_Vip_tarifa + `network type` +
## Price_package + ADX_TT_count + `CPE old-new`
##
## Df Deviance AIC
## - `Service description` 3 20638 20682
## - mobile_new.x 1 20634 20682
## - `service type` 2 20637 20683
## - Price_package.x 1 20636 20684
## <none> 20634 20684
## - ADX_TT_count 3 20643 20687
## - tech_tickets_last_3_months_final.x 4 20653 20695
## - contract_start.x 1 20648 20696
## - Blizoo_Vip_tarifa 1 20651 20699
## - Price_package 1 20652 20700
## - `network type` 3 20656 20700
## - `CPE old-new` 2 20668 20714
## - TENURE_MONTHS.x 1 20910 20958
## - `Subscriber status` 1 21310 21358
##
## Step: AIC=20681.76
## churn ~ Price_package.x + mobile_new.x + contract_start.x + tech_tickets_last_3_months_final.x +
## TENURE_MONTHS.x + `Subscriber status` + `service type` +
## Blizoo_Vip_tarifa + `network type` + Price_package + ADX_TT_count +
## `CPE old-new`
##
## Df Deviance AIC
## - `service type` 2 20639 20679
## - mobile_new.x 1 20638 20680
## - Price_package.x 1 20640 20682
## <none> 20638 20682
## - ADX_TT_count 3 20647 20685
## - contract_start.x 1 20650 20692
## - tech_tickets_last_3_months_final.x 4 20657 20693
## - Price_package 1 20654 20696
## - Blizoo_Vip_tarifa 1 20656 20698
## - `CPE old-new` 2 20675 20715
## - `network type` 3 20703 20741
## - TENURE_MONTHS.x 1 20917 20959
## - `Subscriber status` 1 21314 21356
##
## Step: AIC=20678.72
## churn ~ Price_package.x + mobile_new.x + contract_start.x + tech_tickets_last_3_months_final.x +
## TENURE_MONTHS.x + `Subscriber status` + Blizoo_Vip_tarifa +
## `network type` + Price_package + ADX_TT_count + `CPE old-new`
##
## Df Deviance AIC
## - mobile_new.x 1 20639 20677
## - Price_package.x 1 20641 20679
## <none> 20639 20679
## - ADX_TT_count 3 20647 20681
## - tech_tickets_last_3_months_final.x 4 20657 20689
## - contract_start.x 1 20653 20691
## - Price_package 1 20654 20692
## - Blizoo_Vip_tarifa 1 20659 20697
## - `CPE old-new` 2 20677 20713
## - `network type` 3 20756 20790
## - TENURE_MONTHS.x 1 20919 20957
## - `Subscriber status` 1 21317 21355
##
## Step: AIC=20676.79
## churn ~ Price_package.x + contract_start.x + tech_tickets_last_3_months_final.x +
## TENURE_MONTHS.x + `Subscriber status` + Blizoo_Vip_tarifa +
## `network type` + Price_package + ADX_TT_count + `CPE old-new`
##
## Df Deviance AIC
## - Price_package.x 1 20641 20677
## <none> 20639 20677
## - ADX_TT_count 3 20647 20679
## - tech_tickets_last_3_months_final.x 4 20657 20687
## - contract_start.x 1 20653 20689
## - Price_package 1 20654 20690
## - Blizoo_Vip_tarifa 1 20659 20695
## - `CPE old-new` 2 20677 20711
## - `network type` 3 20756 20788
## - TENURE_MONTHS.x 1 20919 20955
## - `Subscriber status` 1 21317 21353
##
## Step: AIC=20676.69
## churn ~ contract_start.x + tech_tickets_last_3_months_final.x +
## TENURE_MONTHS.x + `Subscriber status` + Blizoo_Vip_tarifa +
## `network type` + Price_package + ADX_TT_count + `CPE old-new`
##
## Df Deviance AIC
## <none> 20641 20677
## - ADX_TT_count 3 20649 20679
## - tech_tickets_last_3_months_final.x 4 20659 20687
## - contract_start.x 1 20656 20690
## - `CPE old-new` 2 20678 20710
## - Blizoo_Vip_tarifa 1 20680 20714
## - Price_package 1 20719 20753
## - `network type` 3 20757 20787
## - TENURE_MONTHS.x 1 20919 20953
## - `Subscriber status` 1 21317 21351
summary(logit1)
##
## Call:
## glm(formula = churn ~ contract_start.x + tech_tickets_last_3_months_final.x +
## TENURE_MONTHS.x + `Subscriber status` + Blizoo_Vip_tarifa +
## `network type` + Price_package + ADX_TT_count + `CPE old-new`,
## family = binomial(logit), data = df1)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.5037 -0.2949 -0.2138 -0.1670 3.2440
##
## Coefficients:
## Estimate Std. Error
## (Intercept) -4.552e-01 6.806e-01
## contract_start.x -1.516e-09 3.854e-10
## tech_tickets_last_3_months_final.xone_ticket -2.084e-01 4.120e-01
## tech_tickets_last_3_months_final.xthree_ticket -6.746e-01 4.507e-01
## tech_tickets_last_3_months_final.xtwo_ticket -5.067e-01 4.203e-01
## tech_tickets_last_3_months_final.xzero -8.984e-02 4.080e-01
## TENURE_MONTHS.x -8.532e-03 5.342e-04
## `Subscriber status`ctv 2.793e+00 9.078e-02
## Blizoo_Vip_tarifaVip -5.547e-01 8.973e-02
## `network type`docsis3.0 -8.156e-03 7.142e-02
## `network type`lan -1.692e-03 8.190e-02
## `network type`TV 7.277e-01 7.105e-02
## Price_package 1.014e-03 1.138e-04
## ADX_TT_countthre and more -2.863e-01 4.601e-01
## ADX_TT_counttwo_adx 1.868e-03 2.245e-01
## ADX_TT_countzero -3.221e-01 1.191e-01
## `CPE old-new`2 -5.307e-01 9.230e-02
## `CPE old-new`3 -2.647e-01 8.219e-02
## z value Pr(>|z|)
## (Intercept) -0.669 0.50358
## contract_start.x -3.932 8.41e-05 ***
## tech_tickets_last_3_months_final.xone_ticket -0.506 0.61301
## tech_tickets_last_3_months_final.xthree_ticket -1.497 0.13443
## tech_tickets_last_3_months_final.xtwo_ticket -1.206 0.22797
## tech_tickets_last_3_months_final.xzero -0.220 0.82570
## TENURE_MONTHS.x -15.973 < 2e-16 ***
## `Subscriber status`ctv 30.762 < 2e-16 ***
## Blizoo_Vip_tarifaVip -6.183 6.31e-10 ***
## `network type`docsis3.0 -0.114 0.90908
## `network type`lan -0.021 0.98351
## `network type`TV 10.242 < 2e-16 ***
## Price_package 8.916 < 2e-16 ***
## ADX_TT_countthre and more -0.622 0.53378
## ADX_TT_counttwo_adx 0.008 0.99336
## ADX_TT_countzero -2.705 0.00683 **
## `CPE old-new`2 -5.749 8.95e-09 ***
## `CPE old-new`3 -3.221 0.00128 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 22357 on 75757 degrees of freedom
## Residual deviance: 20641 on 75740 degrees of freedom
## AIC: 20677
##
## Number of Fisher Scoring iterations: 7
capture.output(summary(logit1), file = "logit_without_smotes.xls")
#Now we will do exponentaion of every coefficiten to conclude the odd ratio for every predictor.
#This shows how odd ratio changes when predictor change for one unite while holding other
#predictors constant. If exp(b)=2 that means that when this predictor changes
#for unite there is twice bigger chance of churn. We do this on step logit model
#with the best predictors.
exp(cbind(OR = coef(logit1), confint(logit1)))
## Waiting for profiling to be done...
## OR 2.5 %
## (Intercept) 0.6343135 0.1629550
## contract_start.x 1.0000000 1.0000000
## tech_tickets_last_3_months_final.xone_ticket 0.8118924 0.3813783
## tech_tickets_last_3_months_final.xthree_ticket 0.5093796 0.2160549
## tech_tickets_last_3_months_final.xtwo_ticket 0.6024860 0.2777721
## tech_tickets_last_3_months_final.xzero 0.9140777 0.4331511
## TENURE_MONTHS.x 0.9915040 0.9904604
## `Subscriber status`ctv 16.3217736 13.6444959
## Blizoo_Vip_tarifaVip 0.5742236 0.4813648
## `network type`docsis3.0 0.9918770 0.8623676
## `network type`lan 0.9983089 0.8488594
## `network type`TV 2.0703426 1.8005992
## Price_package 1.0010147 1.0007911
## ADX_TT_countthre and more 0.7510533 0.2906648
## ADX_TT_counttwo_adx 1.0018695 0.6340033
## ADX_TT_countzero 0.7246200 0.5752190
## `CPE old-new`2 0.5882139 0.4910162
## `CPE old-new`3 0.7674379 0.6534652
## 97.5 %
## (Intercept) 2.3524299
## contract_start.x 1.0000000
## tech_tickets_last_3_months_final.xone_ticket 1.9210408
## tech_tickets_last_3_months_final.xthree_ticket 1.2737550
## tech_tickets_last_3_months_final.xtwo_ticket 1.4461157
## tech_tickets_last_3_months_final.xzero 2.1470236
## TENURE_MONTHS.x 0.9925366
## `Subscriber status`ctv 19.4791030
## Blizoo_Vip_tarifaVip 0.6842770
## `network type`docsis3.0 1.1410350
## `network type`lan 1.1703165
## `network type`TV 2.3789701
## Price_package 1.0012374
## ADX_TT_countthre and more 1.7729407
## ADX_TT_counttwo_adx 1.5329786
## ADX_TT_countzero 0.9176355
## `CPE old-new`2 0.7050968
## `CPE old-new`3 0.9019437
PseudoR2(logit1, "Nagelkerke")
## Nagelkerke
## 0.08763531
#pravljenje confusion matrice
test.predicted.income = predict(logit1, newdata = final.test, type = "response" )
table(final.test$churn, test.predicted.income > 0.5)
##
## FALSE TRUE
## 0 36572 29
## 1 1260 19
tab <- table(final.test$churn, test.predicted.income > 0.5)
tab
##
## FALSE TRUE
## 0 36572 29
## 1 1260 19
#accuracy rate
acc <- sum(diag(tab))/sum(tab)
acc
## [1] 0.9659715
#mislasification rate
mfc <- 1-acc
mfc
## [1] 0.03402851
# (true positive/true positive + false positive) - precision
15/(15+30)
## [1] 0.3333333
#(true negative/true negative + false negative) - recall
36571/(36571+1264)
## [1] 0.9665918
#Here we just wanted to check the best of cut off value.
optCutOff <- optimalCutoff(final.test$churn, test.predicted.income)
optCutOff
## [1] 0.603401
plotROC(final.test$churn, test.predicted.income)

#Now we will try to balance the data through SMOTES algorithm. Perc.over is 200 - that means that
#we oversample the churns and perc.under is 150 that means that we undersample the no churns.
#k is the number of the nearest neighbours that are taken as the basis for sampling.
split <- sample.split(df1$churn, SplitRatio = 0.50)
traindf <- subset(df1, split == TRUE)
testdf <- subset(df1, split == FALSE)
as.data.frame(table(traindf$churn))
## Var1 Freq
## 1 0 36600
## 2 1 1278
str(traindf)
## 'data.frame': 37878 obs. of 14 variables:
## $ Price_package.x : num 1199 1199 850 1199 0 ...
## $ mobile_new.x : num 0.6697 0.2639 0.0943 0.9522 0.9271 ...
## $ contract_start.x : num 1.45e+09 1.45e+09 1.41e+09 1.43e+09 1.49e+09 ...
## $ tech_tickets_last_3_months_final.x: Factor w/ 5 levels "four and more",..: 2 5 5 2 5 5 5 5 5 5 ...
## $ TENURE_MONTHS.x : num 110 110 110 110 110 110 110 110 110 110 ...
## $ churn : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ Subscriber status : Factor w/ 2 levels "active","ctv": 1 1 1 1 1 1 1 1 1 1 ...
## $ service type : Factor w/ 3 levels "1p","2p","3p": 3 3 3 3 3 1 3 3 3 3 ...
## $ Service description : Factor w/ 4 levels "four services",..: 1 1 1 1 1 4 1 1 1 1 ...
## $ Blizoo_Vip_tarifa : Factor w/ 2 levels "Blizoo","Vip": 1 1 1 1 2 1 2 1 1 1 ...
## $ network type : Factor w/ 4 levels "docsis2.0","docsis3.0",..: 2 2 2 2 2 4 2 1 2 1 ...
## $ Price_package : num 1199 1199 850 1199 471 ...
## $ ADX_TT_count : Factor w/ 4 levels "one_adx","thre and more",..: 1 4 4 4 4 4 4 4 4 4 ...
## $ CPE old-new : Factor w/ 3 levels "1","2","3": 2 2 2 2 2 2 2 2 2 2 ...
## Smote : Synthetic Minority Oversampling Technique To Handle Class Imbalancy In Binary Classification
balanced.data <- SMOTE(churn ~., traindf, perc.over = 200, k = 5, perc.under = 100)
as.data.frame(table(balanced.data$churn))
## Var1 Freq
## 1 0 2556
## 2 1 3834
logit_balanced <- glm(formula = churn ~ ., family = binomial(logit), data = balanced.data)
summary(logit_balanced)
##
## Call:
## glm(formula = churn ~ ., family = binomial(logit), data = balanced.data)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -3.1196 -0.8588 0.1583 0.8473 2.1406
##
## Coefficients:
## Estimate Std. Error
## (Intercept) -3.187e-01 1.056e+00
## Price_package.x 8.907e-04 2.155e-04
## mobile_new.x -1.522e-02 1.192e-01
## contract_start.x 1.127e-09 6.566e-10
## tech_tickets_last_3_months_final.xone_ticket 1.513e+00 5.570e-01
## tech_tickets_last_3_months_final.xthree_ticket 1.359e+00 5.775e-01
## tech_tickets_last_3_months_final.xtwo_ticket 1.276e+00 5.598e-01
## tech_tickets_last_3_months_final.xzero 8.632e-01 5.507e-01
## TENURE_MONTHS.x -1.004e-02 7.815e-04
## `Subscriber status`ctv 3.755e+00 2.375e-01
## `service type`2p -1.622e-02 1.399e-01
## `service type`3p -4.012e-01 1.132e-01
## `Service description`one service -1.039e-01 1.509e-01
## `Service description`three services 5.311e-01 1.528e-01
## `Service description`two services 4.751e-01 1.073e-01
## Blizoo_Vip_tarifaVip 1.781e-01 9.576e-02
## `network type`docsis3.0 -2.602e-01 9.206e-02
## `network type`lan -3.690e-01 1.237e-01
## `network type`TV 5.976e-01 1.208e-01
## Price_package 3.112e-04 2.917e-04
## ADX_TT_countthre and more 3.147e-01 4.150e-01
## ADX_TT_counttwo_adx -1.310e+00 2.643e-01
## ADX_TT_countzero -1.580e+00 1.024e-01
## `CPE old-new`2 -1.055e+00 1.333e-01
## `CPE old-new`3 -6.883e-01 1.330e-01
## z value Pr(>|z|)
## (Intercept) -0.302 0.762743
## Price_package.x 4.132 3.59e-05 ***
## mobile_new.x -0.128 0.898385
## contract_start.x 1.716 0.086138 .
## tech_tickets_last_3_months_final.xone_ticket 2.717 0.006585 **
## tech_tickets_last_3_months_final.xthree_ticket 2.353 0.018599 *
## tech_tickets_last_3_months_final.xtwo_ticket 2.279 0.022674 *
## tech_tickets_last_3_months_final.xzero 1.568 0.116970
## TENURE_MONTHS.x -12.851 < 2e-16 ***
## `Subscriber status`ctv 15.812 < 2e-16 ***
## `service type`2p -0.116 0.907701
## `service type`3p -3.544 0.000394 ***
## `Service description`one service -0.689 0.491079
## `Service description`three services 3.477 0.000508 ***
## `Service description`two services 4.427 9.55e-06 ***
## Blizoo_Vip_tarifaVip 1.860 0.062860 .
## `network type`docsis3.0 -2.827 0.004701 **
## `network type`lan -2.982 0.002862 **
## `network type`TV 4.947 7.55e-07 ***
## Price_package 1.067 0.285948
## ADX_TT_countthre and more 0.758 0.448243
## ADX_TT_counttwo_adx -4.957 7.15e-07 ***
## ADX_TT_countzero -15.431 < 2e-16 ***
## `CPE old-new`2 -7.918 2.41e-15 ***
## `CPE old-new`3 -5.177 2.25e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 8601.1 on 6389 degrees of freedom
## Residual deviance: 6260.1 on 6365 degrees of freedom
## AIC: 6310.1
##
## Number of Fisher Scoring iterations: 6
logit1_balanced <- step(logit_balanced)
## Start: AIC=6310.07
## churn ~ Price_package.x + mobile_new.x + contract_start.x + tech_tickets_last_3_months_final.x +
## TENURE_MONTHS.x + `Subscriber status` + `service type` +
## `Service description` + Blizoo_Vip_tarifa + `network type` +
## Price_package + ADX_TT_count + `CPE old-new`
##
## Df Deviance AIC
## - mobile_new.x 1 6260.1 6308.1
## - Price_package 1 6261.2 6309.2
## <none> 6260.1 6310.1
## - contract_start.x 1 6263.0 6311.0
## - Blizoo_Vip_tarifa 1 6263.5 6311.5
## - `service type` 2 6276.5 6322.5
## - Price_package.x 1 6277.1 6325.1
## - `Service description` 3 6290.7 6334.7
## - `network type` 3 6321.5 6365.5
## - tech_tickets_last_3_months_final.x 4 6328.3 6370.3
## - `CPE old-new` 2 6331.0 6377.0
## - TENURE_MONTHS.x 1 6434.2 6482.2
## - ADX_TT_count 3 6562.3 6606.3
## - `Subscriber status` 1 6992.7 7040.7
##
## Step: AIC=6308.09
## churn ~ Price_package.x + contract_start.x + tech_tickets_last_3_months_final.x +
## TENURE_MONTHS.x + `Subscriber status` + `service type` +
## `Service description` + Blizoo_Vip_tarifa + `network type` +
## Price_package + ADX_TT_count + `CPE old-new`
##
## Df Deviance AIC
## - Price_package 1 6261.2 6307.2
## <none> 6260.1 6308.1
## - contract_start.x 1 6263.0 6309.0
## - Blizoo_Vip_tarifa 1 6263.6 6309.6
## - `service type` 2 6276.6 6320.6
## - Price_package.x 1 6277.1 6323.1
## - `Service description` 3 6290.7 6332.7
## - `network type` 3 6321.6 6363.6
## - tech_tickets_last_3_months_final.x 4 6328.4 6368.4
## - `CPE old-new` 2 6331.2 6375.2
## - TENURE_MONTHS.x 1 6434.2 6480.2
## - ADX_TT_count 3 6562.4 6604.4
## - `Subscriber status` 1 6992.8 7038.8
##
## Step: AIC=6307.24
## churn ~ Price_package.x + contract_start.x + tech_tickets_last_3_months_final.x +
## TENURE_MONTHS.x + `Subscriber status` + `service type` +
## `Service description` + Blizoo_Vip_tarifa + `network type` +
## ADX_TT_count + `CPE old-new`
##
## Df Deviance AIC
## <none> 6261.2 6307.2
## - Blizoo_Vip_tarifa 1 6264.7 6308.7
## - contract_start.x 1 6265.0 6309.0
## - `service type` 2 6277.0 6319.0
## - `Service description` 3 6291.3 6331.3
## - `network type` 3 6321.7 6361.7
## - tech_tickets_last_3_months_final.x 4 6329.6 6367.6
## - `CPE old-new` 2 6331.2 6373.2
## - Price_package.x 1 6345.6 6389.6
## - TENURE_MONTHS.x 1 6438.4 6482.4
## - ADX_TT_count 3 6562.5 6602.5
## - `Subscriber status` 1 6993.2 7037.2
summary(logit1_balanced)
##
## Call:
## glm(formula = churn ~ Price_package.x + contract_start.x + tech_tickets_last_3_months_final.x +
## TENURE_MONTHS.x + `Subscriber status` + `service type` +
## `Service description` + Blizoo_Vip_tarifa + `network type` +
## ADX_TT_count + `CPE old-new`, family = binomial(logit), data = balanced.data)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -3.1105 -0.8537 0.1586 0.8484 2.1489
##
## Coefficients:
## Estimate Std. Error
## (Intercept) -3.993e-01 1.051e+00
## Price_package.x 1.082e-03 1.195e-04
## contract_start.x 1.256e-09 6.455e-10
## tech_tickets_last_3_months_final.xone_ticket 1.491e+00 5.556e-01
## tech_tickets_last_3_months_final.xthree_ticket 1.327e+00 5.756e-01
## tech_tickets_last_3_months_final.xtwo_ticket 1.262e+00 5.588e-01
## tech_tickets_last_3_months_final.xzero 8.410e-01 5.493e-01
## TENURE_MONTHS.x -1.011e-02 7.797e-04
## `Subscriber status`ctv 3.747e+00 2.373e-01
## `service type`2p -2.033e-02 1.398e-01
## `service type`3p -3.912e-01 1.127e-01
## `Service description`one service -1.120e-01 1.508e-01
## `Service description`three services 5.283e-01 1.527e-01
## `Service description`two services 4.639e-01 1.067e-01
## Blizoo_Vip_tarifaVip 1.771e-01 9.553e-02
## `network type`docsis3.0 -2.570e-01 9.196e-02
## `network type`lan -3.727e-01 1.236e-01
## `network type`TV 5.860e-01 1.202e-01
## ADX_TT_countthre and more 3.015e-01 4.150e-01
## ADX_TT_counttwo_adx -1.303e+00 2.640e-01
## ADX_TT_countzero -1.576e+00 1.022e-01
## `CPE old-new`2 -1.039e+00 1.322e-01
## `CPE old-new`3 -6.784e-01 1.326e-01
## z value Pr(>|z|)
## (Intercept) -0.380 0.704125
## Price_package.x 9.058 < 2e-16 ***
## contract_start.x 1.946 0.051624 .
## tech_tickets_last_3_months_final.xone_ticket 2.684 0.007281 **
## tech_tickets_last_3_months_final.xthree_ticket 2.305 0.021152 *
## tech_tickets_last_3_months_final.xtwo_ticket 2.259 0.023900 *
## tech_tickets_last_3_months_final.xzero 1.531 0.125782
## TENURE_MONTHS.x -12.962 < 2e-16 ***
## `Subscriber status`ctv 15.787 < 2e-16 ***
## `service type`2p -0.145 0.884391
## `service type`3p -3.473 0.000515 ***
## `Service description`one service -0.743 0.457688
## `Service description`three services 3.461 0.000539 ***
## `Service description`two services 4.349 1.37e-05 ***
## Blizoo_Vip_tarifaVip 1.853 0.063820 .
## `network type`docsis3.0 -2.795 0.005191 **
## `network type`lan -3.016 0.002565 **
## `network type`TV 4.878 1.07e-06 ***
## ADX_TT_countthre and more 0.727 0.467502
## ADX_TT_counttwo_adx -4.934 8.04e-07 ***
## ADX_TT_countzero -15.416 < 2e-16 ***
## `CPE old-new`2 -7.858 3.90e-15 ***
## `CPE old-new`3 -5.117 3.10e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 8601.1 on 6389 degrees of freedom
## Residual deviance: 6261.2 on 6367 degrees of freedom
## AIC: 6307.2
##
## Number of Fisher Scoring iterations: 6
PseudoR2(logit1_balanced, "Nagelkerke")
## Nagelkerke
## 0.4145004
capture.output(summary(logit1_balanced), file = "logit_smotes.xls")
exp(cbind(OR = coef(logit1_balanced), confint(logit1_balanced)))
## Waiting for profiling to be done...
## OR 2.5 %
## (Intercept) 0.6707990 0.08448574
## Price_package.x 1.0010829 1.00084939
## contract_start.x 1.0000000 1.00000000
## tech_tickets_last_3_months_final.xone_ticket 4.4421086 1.51917415
## tech_tickets_last_3_months_final.xthree_ticket 3.7695394 1.23862103
## tech_tickets_last_3_months_final.xtwo_ticket 3.5329331 1.20070996
## tech_tickets_last_3_months_final.xzero 2.3187387 0.80295457
## TENURE_MONTHS.x 0.9899440 0.98842434
## `Subscriber status`ctv 42.3805708 27.39509963
## `service type`2p 0.9798764 0.74511775
## `service type`3p 0.6762185 0.54201152
## `Service description`one service 0.8940827 0.66520001
## `Service description`three services 1.6960222 1.25848468
## `Service description`two services 1.5902260 1.29048525
## Blizoo_Vip_tarifaVip 1.1936927 0.99025361
## `network type`docsis3.0 0.7733605 0.64571966
## `network type`lan 0.6888417 0.54023880
## `network type`TV 1.7968738 1.42040815
## ADX_TT_countthre and more 1.3519508 0.62693105
## ADX_TT_counttwo_adx 0.2718247 0.16201371
## ADX_TT_countzero 0.2068306 0.16884691
## `CPE old-new`2 0.3539334 0.27244218
## `CPE old-new`3 0.5074185 0.39046982
## 97.5 %
## (Intercept) 5.2301075
## Price_package.x 1.0013184
## contract_start.x 1.0000000
## tech_tickets_last_3_months_final.xone_ticket 13.6283632
## tech_tickets_last_3_months_final.xthree_ticket 12.0031288
## tech_tickets_last_3_months_final.xtwo_ticket 10.9029528
## tech_tickets_last_3_months_final.xzero 7.0318295
## TENURE_MONTHS.x 0.9914507
## `Subscriber status`ctv 69.8604823
## `service type`2p 1.2891822
## `service type`3p 0.8430646
## `Service description`one service 1.2014331
## `Service description`three services 2.2901396
## `Service description`two services 1.9607128
## Blizoo_Vip_tarifaVip 1.4401729
## `network type`docsis3.0 0.9260171
## `network type`lan 0.8771942
## `network type`TV 2.2752146
## ADX_TT_countthre and more 3.2433400
## ADX_TT_counttwo_adx 0.4571275
## ADX_TT_countzero 0.2521164
## `CPE old-new`2 0.4575094
## `CPE old-new`3 0.6567256
logit1_odds_balanced <- exp(cbind(OR = coef(logit1), confint(logit1)))
## Waiting for profiling to be done...
capture.output(summary(logit1_odds_balanced), file = "logit1_odds.xls")
#pravljenje confusion matrice
test.predicted.income = predict(logit1_balanced, newdata = testdf, type = "response" )
table(testdf$churn, test.predicted.income > 0.5)
##
## FALSE TRUE
## 0 24501 12100
## 1 596 683
tab <- table(testdf$churn, test.predicted.income > 0.5)
acc <- sum(diag(tab))/sum(tab)
acc
## [1] 0.6648363
#mislasification rate
mfc <- 1-acc
mfc
## [1] 0.3351637
plotROC(testdf$churn, test.predicted.income)

#Now we will deploy random forest model, without much tuning (default). We will use cross validation
#(deviding into 3 samples and checking wether the default random forest model has the same
#accuracy accross the samples in order to avoid overfitting)
tr <- caret::trainControl(method="cv", number=3, search = "random")
default_model <- caret::train(churn ~ ., data = traindf, method = 'rf', metric = 'Accuracy', trControl = tr)
##
## Attaching package: 'caret'
## The following objects are masked from 'package:InformationValue':
##
## confusionMatrix, precision, sensitivity, specificity
## The following objects are masked from 'package:DescTools':
##
## MAE, RMSE
## The following object is masked from 'package:purrr':
##
## lift
print(default_model)
## Random Forest
##
## 37878 samples
## 13 predictor
## 2 classes: '0', '1'
##
## No pre-processing
## Resampling: Cross-Validated (3 fold)
## Summary of sample sizes: 25252, 25252, 25252
## Resampling results across tuning parameters:
##
## mtry Accuracy Kappa
## 2 0.9673953 0.0711089
## 18 0.9662601 0.1634497
## 22 0.9658113 0.1600487
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 2.
test.predicted.income = predict(default_model, newdata = testdf, type = "raw")
table(testdf$churn, test.predicted.income)
## test.predicted.income
## 0 1
## 0 36597 4
## 1 1208 71
#We see that default model with mtry=5 has a very high accuracy, but Kappa is poor
#(number of variable is randomly collected to be sampled at each split time.)
pred <- predict(default_model, newdata = testdf, type = "raw")
prob <- predict(default_model, newdata = testdf, type = "prob")
head(prob)
## 0 1
## 2 1 0
## 3 1 0
## 5 1 0
## 6 1 0
## 10 1 0
## 11 1 0
confusionMatrix(table(pred, testdf$churn))
## Confusion Matrix and Statistics
##
##
## pred 0 1
## 0 36597 1208
## 1 4 71
##
## Accuracy : 0.968
## 95% CI : (0.9662, 0.9698)
## No Information Rate : 0.9662
## P-Value [Acc > NIR] : 0.0285
##
## Kappa : 0.1015
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.99989
## Specificity : 0.05551
## Pos Pred Value : 0.96805
## Neg Pred Value : 0.94667
## Prevalence : 0.96624
## Detection Rate : 0.96613
## Detection Prevalence : 0.99802
## Balanced Accuracy : 0.52770
##
## 'Positive' Class : 0
##
table(pred, testdf$churn)
##
## pred 0 1
## 0 36597 1208
## 1 4 71
prob <- predict(default_model, newdata = testdf, type = "prob")
forestpred = prediction(prob[,2], testdf$churn)
forestperf = performance(forestpred, "tpr", "fpr")
plot(forestperf, main="ROC", colorize=T)
auc_ROCR <- performance(forestpred, measure = "auc")
auc_ROCR <- auc_ROCR@y.values[[1]]
print(auc_ROCR)
## [1] 0.657174
