library(tidyverse)
## -- Attaching packages ------------------------------------------------------------- tidyverse 1.2.1 --
## <U+221A> ggplot2 3.1.0     <U+221A> purrr   0.2.5
## <U+221A> tibble  2.1.3     <U+221A> dplyr   0.8.3
## <U+221A> tidyr   0.8.2     <U+221A> stringr 1.3.1
## <U+221A> readr   1.2.1     <U+221A> forcats 0.3.0
## -- Conflicts ---------------------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(readxl)
library(Amelia)
## Loading required package: Rcpp
## ## 
## ## Amelia II: Multiple Imputation
## ## (Version 1.7.5, built: 2018-05-07)
## ## Copyright (C) 2005-2019 James Honaker, Gary King and Matthew Blackwell
## ## Refer to http://gking.harvard.edu/amelia/ for more information
## ##
library(data.table)
## 
## Attaching package: 'data.table'
## The following objects are masked from 'package:dplyr':
## 
##     between, first, last
## The following object is masked from 'package:purrr':
## 
##     transpose
library(psycho)
library(ggplot2)
library(MASS)
## 
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
## 
##     select
library(DescTools)
## 
## Attaching package: 'DescTools'
## The following object is masked from 'package:data.table':
## 
##     %like%
library(ggpubr)
## Loading required package: magrittr
## 
## Attaching package: 'magrittr'
## The following object is masked from 'package:purrr':
## 
##     set_names
## The following object is masked from 'package:tidyr':
## 
##     extract
library(car)
## Loading required package: carData
## 
## Attaching package: 'car'
## The following object is masked from 'package:DescTools':
## 
##     Recode
## The following object is masked from 'package:dplyr':
## 
##     recode
## The following object is masked from 'package:purrr':
## 
##     some
library(tibble)
library(nnet)
library(InformationValue)
library(DMwR)
## Loading required package: lattice
## Loading required package: grid
library(caTools)
library(lattice)
library(grid)
library(dplyr)
library(randomForest)
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## The following object is masked from 'package:dplyr':
## 
##     combine
## The following object is masked from 'package:ggplot2':
## 
##     margin
library(ROCR)
## Loading required package: gplots
## 
## Attaching package: 'gplots'
## The following object is masked from 'package:stats':
## 
##     lowess
library(dbplyr)
## 
## Attaching package: 'dbplyr'
## The following objects are masked from 'package:dplyr':
## 
##     ident, sql
library(tidyselect)

all_churn_2017 <- read_excel("C:/Users/vjovanovic/Desktop/A1/all churn 2017.xlsx")
Blizoo_total_Base <- read_excel("C:/Users/vjovanovic/Desktop/A1/Blizoo_total_Base.xlsx")
CABLE_ALL_total_Base <- read_excel("C:/Users/vjovanovic/Desktop/A1/CABLE_ALL_total_Base.xlsx")
CABLE_AMDOCS <- read_excel("C:/Users/vjovanovic/Desktop/A1/CABLE_AMDOCS.xlsx")
trouble_tickets_Q1_2017 <- read_excel("C:/Users/vjovanovic/Desktop/A1/trouble_tickets_Q1_2017.xlsx")


#We will try to make an innerjoin of two data systems of Blizoo and Cable_all for checking if there is enough
#fully populated customers. This will give as a databse with higher number of variables (potential predictors).


dim(Blizoo_total_Base) #75855 customers
## [1] 75855    24
dim(CABLE_ALL_total_Base) #85777
## [1] 85777    32
inner_cable <- merge(CABLE_ALL_total_Base, Blizoo_total_Base, copy=FALSE, suffix = c(".x", ".y"), by.x="client_id", by.y="client_id", all.x=F, all.y=F, no.dups = TRUE)

dim(inner_cable) #75894
## [1] 75894    55
#We see that we have 40 more customers than in Blizzo, but we could have 75855 or less. 
#But we got about 50 warnings and those are maybe duplicates, we suppose that ALL_Cable have all customers that are in Blizoo.


missmap(inner_cable,y.at=c(1),y.labels = c(''),col=c('yellow','black'))

#Now we are looking for an intersection of inner_cable and churns.

dim(all_churn_2017)
## [1] 7023   30
churns_inner <- merge(all_churn_2017, inner_cable, copy=FALSE, suffix = c(".x", ".y"), by.x="client_id", by.y="client_id", all.x=F, all.y=F, no.dups = TRUE)
dim(churns_inner) #2598
## [1] 2598   84
#We halved the number of churns, half of churns are no in inner_cable, we will try to find better intersection (inner_join).
#Exploring the intersection between churns and Blizoo

dim(all_churn_2017)
## [1] 7023   30
churns_blizoo <- merge(all_churn_2017, Blizoo_total_Base, copy=FALSE, suffix = c(".x", ".y"), by.x="client_id", by.y="client_id", all.x=F, all.y=F, no.dups = TRUE)
dim(churns_blizoo) #2558
## [1] 2558   53
#No duplicates.



#But still a lot of churns that are no in the Blizoo database and that createsa lot of missing values
#in churns extended with blizoo variables (left join, x=churns).


#Now we want to see how many churns are there who are in Blizoo. We are doing inner_join or natural join.

churns_intersect_blizoo <- merge(all_churn_2017, Blizoo_total_Base, copy=FALSE, suffix = c(".x", ".y"), by.x="client_id", by.y="client_id", all.x=F, all.y=F, no.dups = TRUE)
dim(churns_intersect_blizoo) #2558
## [1] 2558   53
#How many churns are in all cable?

churns_intersect_cableall <- merge(all_churn_2017, CABLE_ALL_total_Base, copy=FALSE, suffix = c(".x", ".y"), by.x="client_id", by.y="client_id", all.x=F, all.y=F, no.dups = TRUE)
dim(churns_intersect_cableall) #3225
## [1] 3255   61
missmap(churns_intersect_cableall)

###FOR THIS PURPOSE OF PRACTICE WE WILL DO ON BLIZOO BUT MODEL WILL BE BETTER WHEN DOING ON CABLE_ALL database becease of higher number of churns!

#Now we see a very few variables that have a lot of missing values, but we halved the number of churns.
#We already have a very few churns and model would have even the weaker predictive power with 
#smaller number of churns. So we chose to have higher number of churns and fewer number of variables.
#But we can try both. We will clean the churns_intersect_blizoo

#Now we will delete columns (variables) with highest percentage of missings in order to preserve
#the highest possible number of customers


churns_intersect_blizoo$`start date of churn request` <- NULL
churns_intersect_blizoo$`end date of churn request` <- NULL
churns_intersect_blizoo$`Churn date` <- NULL
churns_intersect_blizoo$`Migration Y/N` <- NULL
churns_intersect_blizoo$Reason <- NULL
churns_intersect_blizoo$`Reason description` <- NULL
churns_intersect_blizoo$`System information` <- NULL
churns_intersect_blizoo$`Blizoo-Vip tariff` <- NULL
churns_intersect_blizoo$contract_end <- NULL
churns_intersect_blizoo$`CPE name` <- NULL


dim(churns_intersect_blizoo)
## [1] 2558   43
missmap(churns_intersect_blizoo,y.at=c(1),y.labels = c(''),col=c('yellow','black'))

#We aggregate the averages instead of missings to save as more churns as possible.


churns_intersect_blizoo[] <- lapply(churns_intersect_blizoo, function(x) ifelse(is.na(x), mean(x, na.rm = TRUE), x))
dim(churns_intersect_blizoo)
## [1] 2558   43
#We are recoding month to contract_end

churns_intersect_blizoo$MONTHS_TO_CONTRACT_END <- as.character(churns_intersect_blizoo$MONTHS_TO_CONTRACT_END)
table(churns_intersect_blizoo$MONTHS_TO_CONTRACT_END)
## 
##           0           1          10          11          12          13 
##         105         100          31          35          18           2 
##          14          15          19           2           3           4 
##           1           2           4          69          16          25 
##           5           6           7           8           9 NO_CONTRACT 
##          31          29          32          57          41        1686
#We have decided to recode this variable to be possible to treat it as numeric and to 
#make imputations with averages.
churns_intersect_blizoo$MONTHS_TO_CONTRACT_END <- as.character(churns_intersect_blizoo$MONTHS_TO_CONTRACT_END)
churns_intersect_blizoo$MONTHS_TO_CONTRACT_END[churns_intersect_blizoo$MONTHS_TO_CONTRACT_END == "NO_CONTRACT"] <- "0"
churns_intersect_blizoo[] <- lapply(churns_intersect_blizoo, function(x) ifelse(is.na(x), mean(x, na.rm = TRUE), x))

churns_intersect_blizoo$MONTHS_TO_CONTRACT_END <- as.numeric(churns_intersect_blizoo$MONTHS_TO_CONTRACT_END)
table(churns_intersect_blizoo$MONTHS_TO_CONTRACT_END) #We suceedeed.
## 
##    0    1    2    3    4    5    6    7    8    9   10   11   12   13   14 
## 1791  100   69   16   25   31   29   32   57   41   31   35   18    2    1 
##   15   19 
##    2    4
#We are also trying to make an imputation in factor variable adding more frequent category.

churns_intersect_blizoo$`CPE old-new` <- as.character(churns_intersect_blizoo$`CPE old-new`)
churns_intersect_blizoo$`CPE old-new`[is.na(churns_intersect_blizoo$`CPE old-new`)] <- 1
churns_intersect_blizoo$`CPE old-new` <- as.factor(churns_intersect_blizoo$`CPE old-new`)


#Now we have churn_intersect_blizoo cleaned.

#We are adding churn variable to churns_intersect blizoo. This will be our TARGET VARIABLE!

churns_intersect_blizoo$churn <- 1
table(churns_intersect_blizoo$churn)
## 
##    1 
## 2558
#Now we have churn_blizoo dataframe without missings.

#Now we want to see whether is possible to merge all_troubles and Blizoo, we suppose
#that maybe contract number is possibly Custom id.

dim(trouble_tickets_Q1_2017) #18659
## [1] 18659     9
dim(Blizoo_total_Base)
## [1] 75855    24
blizoo_troubles <- merge (Blizoo_total_Base, trouble_tickets_Q1_2017, copy=FALSE, suffix = c(".x", ".y"), by.x = "Customer ID", by.y = "contract_number", all.x=T, all.y=F, no.dups = TRUE)
dim(blizoo_troubles)
## [1] 149280     32
#It seems that we cannot use this database and that matching is impossible becease we got the mess, as we 
#do not have an key variable for matching.

#Now we will merge Blizoo dataframe and churn_intersect blizoo (left join) adding churns to a Blizoo.

blizoo_with_churns <- merge (Blizoo_total_Base, churns_intersect_blizoo, copy=FALSE, suffix = c(".x", ".y"), by.x = "client_id", by.y = "client_id", all.x=T, all.y=F, no.dups = TRUE)
dim(blizoo_with_churns)
## [1] 75857    67
missmap(blizoo_with_churns)

#According to the missing map we see that in Blizoo we added a lot of variable from churns database
#which are empty and we will have to clean them, but first we will check are blizoo_with_churn$churn variable.

table(blizoo_with_churns$churn)
## 
##    1 
## 2558
#We see that customers with churns NA are not in churn database
filter(churns_intersect_blizoo, client_id==27)
##  [1] client_id                        client id 2                     
##  [3] Customer ID.x                    City                            
##  [5] telephone                        mobile                          
##  [7] service_type                     service_description             
##  [9] package_name                     monthly fee                     
## [11] network_type                     operator (blizoo/waw)           
## [13] tech_tickets_last_3_months.x     tech_tickets_last_6_months      
## [15] tech_tickets_last_12_months      date_first_activation           
## [17] date_last_renewal                account_type (res/bis)          
## [19] date of contract expiry          System churn date               
## [21] Churn type                       Current status                  
## [23] Customer ID.y                    telephone_new                   
## [25] Price_package                    mobile_new                      
## [27] bus_res                          first_activation                
## [29] contract_start                   service type                    
## [31] Service description              Blizoo_Vip_tarifa               
## [33] network type                     Monthly fee                     
## [35] tech_tickets_last_3_months.y     average_invoice_last_3_months   
## [37] CPE old-new                      Subscriber status               
## [39] operator                         ADX_TT_count                    
## [41] tech_tickets_last_3_months_final TENURE_MONTHS                   
## [43] MONTHS_TO_CONTRACT_END           churn                           
## <0 rows> (or 0-length row.names)
filter(churns_intersect_blizoo, client_id==109)
##  [1] client_id                        client id 2                     
##  [3] Customer ID.x                    City                            
##  [5] telephone                        mobile                          
##  [7] service_type                     service_description             
##  [9] package_name                     monthly fee                     
## [11] network_type                     operator (blizoo/waw)           
## [13] tech_tickets_last_3_months.x     tech_tickets_last_6_months      
## [15] tech_tickets_last_12_months      date_first_activation           
## [17] date_last_renewal                account_type (res/bis)          
## [19] date of contract expiry          System churn date               
## [21] Churn type                       Current status                  
## [23] Customer ID.y                    telephone_new                   
## [25] Price_package                    mobile_new                      
## [27] bus_res                          first_activation                
## [29] contract_start                   service type                    
## [31] Service description              Blizoo_Vip_tarifa               
## [33] network type                     Monthly fee                     
## [35] tech_tickets_last_3_months.y     average_invoice_last_3_months   
## [37] CPE old-new                      Subscriber status               
## [39] operator                         ADX_TT_count                    
## [41] tech_tickets_last_3_months_final TENURE_MONTHS                   
## [43] MONTHS_TO_CONTRACT_END           churn                           
## <0 rows> (or 0-length row.names)
#While those who are with churns =1 can be found.
filter(churns_intersect_blizoo, client_id==79)
##   client_id client id 2 Customer ID.x   City telephone    mobile
## 1        79          79     0.9260636 City I 0.5044875 0.4634362
##   service_type service_description
## 1           2p           AtvDtvTel
##                                                                                                                                                                                                                                                  package_name
## 1 <U+0414><U+0422><U+0412> (<U+0424><U+0435><U+0432>. 2015),<U+0424><U+0438><U+043A><U+0441><U+043D><U+0430> <U+0422><U+0435><U+043B><U+0435><U+0444><U+043E><U+043D><U+0438><U+0458><U+0430> (<U+041D><U+043E><U+0435><U+043C><U+0432><U+0440><U+0438> 2012)
##   monthly fee network_type operator (blizoo/waw)
## 1         598       DOCSIS                Blizoo
##   tech_tickets_last_3_months.x tech_tickets_last_6_months
## 1                            0                          0
##   tech_tickets_last_12_months date_first_activation date_last_renewal
## 1                           0            2008-05-01        2015-04-04
##   account_type (res/bis) date of contract expiry System churn date
## 1             residental           not available        1496188800
##        Churn type Current status Customer ID.y telephone_new Price_package
## 1 voluntary_churn       inactive     0.9260636     0.1193234           598
##    mobile_new    bus_res first_activation contract_start service type
## 1 0.006346465 residental            39569     1428105600           2p
##   Service description Blizoo_Vip_tarifa network type Monthly fee
## 1           AtvDtvTel            Blizoo    docsis2.0         598
##   tech_tickets_last_3_months.y average_invoice_last_3_months CPE old-new
## 1                            0                           598         old
##   Subscriber status operator ADX_TT_count tech_tickets_last_3_months_final
## 1            active   Blizoo            0                                0
##   TENURE_MONTHS MONTHS_TO_CONTRACT_END churn
## 1           110                      1     1
str(blizoo_with_churns)
## 'data.frame':    75857 obs. of  67 variables:
##  $ client_id                         : num  5 15 19 25 27 29 31 37 43 53 ...
##  $ Customer ID                       : num  0.787 0.15 0.891 0.145 0.985 ...
##  $ telephone_new.x                   : num  0.6276 0.0169 0.0905 0.1394 0.3512 ...
##  $ Price_package.x                   : num  1199 0 691 1199 0 ...
##  $ mobile_new.x                      : num  0.67 0.899 0.94 0.264 0.554 ...
##  $ bus_res.x                         : chr  "residental" "residental" "residental" "residental" ...
##  $ first_activation.x                : chr  "39569" "39569" "39569" "39569" ...
##  $ contract_start.x                  : POSIXct, format: "2015-12-01" "2017-02-06" ...
##  $ contract_end                      : chr  "43070" NA NA "43070" ...
##  $ service type.x                    : chr  "3p" "3p" "3p" "3p" ...
##  $ Service description.x             : chr  "AtvDtvIntTel" "AtvDtvIntTel" "AtvDtvIntTel" "AtvDtvIntTel" ...
##  $ Blizoo_Vip_tarifa.x               : chr  "Blizoo" "Vip" "Vip" "Blizoo" ...
##  $ network type.x                    : chr  "docsis3.0" "docsis3.0" "docsis3.0" "docsis3.0" ...
##  $ Monthly fee.x                     : num  1258 NA NA 1764 NA ...
##  $ tech_tickets_last_3_months        : num  0 0 0 0 0 0 0 1 0 0 ...
##  $ average_invoice_last_3_months.x   : num  1258 1708 0 1474 1699 ...
##  $ CPE name                          : chr  "Powerbox HD-1822,Technicolor TC7200" "BLIZOO CAM modul chipset pairing,Powerbox HD-1822,Technicolor TC7200" "Powerbox HD-1822,Technicolor TC7200" "ADB ADB-2840C,Powerbox HD-1822,Technicolor TC7200" ...
##  $ CPE old-new.x                     : chr  "new" "new" "new" "new" ...
##  $ Subscriber status.x               : chr  "active" "active" "active" "active" ...
##  $ operator.x                        : chr  "Blizoo" "Blizoo" "Blizoo" "Blizoo" ...
##  $ ADX_TT_count.x                    : num  1 0 0 0 0 0 0 0 0 0 ...
##  $ tech_tickets_last_3_months_final.x: num  1 0 0 0 0 0 0 1 0 0 ...
##  $ TENURE_MONTHS.x                   : num  110 110 110 110 110 128 110 110 110 110 ...
##  $ MONTHS_TO_CONTRACT_END.x          : chr  "7" "NO_CONTRACT" "NO_CONTRACT" "7" ...
##  $ client id 2                       : num  NA NA NA NA NA NA NA NA NA 53 ...
##  $ Customer ID.x                     : num  NA NA NA NA NA ...
##  $ City                              : chr  NA NA NA NA ...
##  $ telephone                         : num  NA NA NA NA NA ...
##  $ mobile                            : num  NA NA NA NA NA ...
##  $ service_type                      : chr  NA NA NA NA ...
##  $ service_description               : chr  NA NA NA NA ...
##  $ package_name                      : chr  NA NA NA NA ...
##  $ monthly fee                       : num  NA NA NA NA NA ...
##  $ network_type                      : chr  NA NA NA NA ...
##  $ operator (blizoo/waw)             : chr  NA NA NA NA ...
##  $ tech_tickets_last_3_months.x      : num  NA NA NA NA NA NA NA NA NA 0 ...
##  $ tech_tickets_last_6_months        : num  NA NA NA NA NA NA NA NA NA 0 ...
##  $ tech_tickets_last_12_months       : num  NA NA NA NA NA NA NA NA NA 0 ...
##  $ date_first_activation             : chr  NA NA NA NA ...
##  $ date_last_renewal                 : chr  NA NA NA NA ...
##  $ account_type (res/bis)            : chr  NA NA NA NA ...
##  $ date of contract expiry           : chr  NA NA NA NA ...
##  $ System churn date                 : num  NA NA NA NA NA ...
##  $ Churn type                        : chr  NA NA NA NA ...
##  $ Current status                    : chr  NA NA NA NA ...
##  $ Customer ID.y                     : num  NA NA NA NA NA ...
##  $ telephone_new.y                   : num  NA NA NA NA NA ...
##  $ Price_package.y                   : num  NA NA NA NA NA ...
##  $ mobile_new.y                      : num  NA NA NA NA NA ...
##  $ bus_res.y                         : chr  NA NA NA NA ...
##  $ first_activation.y                : chr  NA NA NA NA ...
##  $ contract_start.y                  : num  NA NA NA NA NA ...
##  $ service type.y                    : chr  NA NA NA NA ...
##  $ Service description.y             : chr  NA NA NA NA ...
##  $ Blizoo_Vip_tarifa.y               : chr  NA NA NA NA ...
##  $ network type.y                    : chr  NA NA NA NA ...
##  $ Monthly fee.y                     : num  NA NA NA NA NA ...
##  $ tech_tickets_last_3_months.y      : num  NA NA NA NA NA NA NA NA NA 0 ...
##  $ average_invoice_last_3_months.y   : num  NA NA NA NA NA ...
##  $ CPE old-new.y                     : Factor w/ 3 levels "1","new","old": NA NA NA NA NA NA NA NA NA 2 ...
##  $ Subscriber status.y               : chr  NA NA NA NA ...
##  $ operator.y                        : chr  NA NA NA NA ...
##  $ ADX_TT_count.y                    : num  NA NA NA NA NA NA NA NA NA 0 ...
##  $ tech_tickets_last_3_months_final.y: num  NA NA NA NA NA NA NA NA NA 0 ...
##  $ TENURE_MONTHS.y                   : num  NA NA NA NA NA NA NA NA NA 110 ...
##  $ MONTHS_TO_CONTRACT_END.y          : num  NA NA NA NA NA NA NA NA NA 10 ...
##  $ churn                             : num  NA NA NA NA NA NA NA NA NA 1 ...
#We will use only full variables with ".x" extension

library(dplyr)
library(dbplyr)
library(tidyselect)

df <- blizoo_with_churns


#We select only variables with no missings, with .x extension.


df <-  dplyr::select(df, ends_with (".x"), churn)

           
#So if we assume that all NA in blizoo_with_churns are actually not churns we can do:

df$churn[is.na(df$churn)] <- 0
table(df$churn)
## 
##     0     1 
## 73299  2558
#We are deleting the variables with very huge number of missings in order to save higher possible number
#of churns for better predictions.

df$average_invoice_last_3_months.x <- NULL
df$`Customer ID.x` <- NULL
df$`Monthly fee.x` <- NULL

#We want to again make imputation with average.

df$MONTHS_TO_CONTRACT_END <- as.character(df$MONTHS_TO_CONTRACT_END)
df$MONTHS_TO_CONTRACT_END[df$MONTHS_TO_CONTRACT_END.x == "NO_CONTRACT"] <- "0"
df$MONTHS_TO_CONTRACT_END <- as.numeric(df$MONTHS_TO_CONTRACT_END)

#We are filling the factor varaible with the more common factor.
str(df$`CPE old-new.x`)
##  chr [1:75857] "new" "new" "new" "new" "new" "new" "new" "new" "new" ...
df$`CPE old-new.x`[is.na(df$`CPE old-new.x`)] <- "1"
df$`CPE old-new.x` <- as.factor(df$`CPE old-new.x`)

df[] <- lapply(df, function(x) ifelse(is.na(x), mean(x, na.rm = TRUE), x))

missmap(df)

df$MONTHS_TO_CONTRACT_END.x <- NULL
str(df)
## 'data.frame':    75857 obs. of  19 variables:
##  $ telephone_new.x                   : num  0.6276 0.0169 0.0905 0.1394 0.3512 ...
##  $ Price_package.x                   : num  1199 0 691 1199 0 ...
##  $ mobile_new.x                      : num  0.67 0.899 0.94 0.264 0.554 ...
##  $ bus_res.x                         : chr  "residental" "residental" "residental" "residental" ...
##  $ first_activation.x                : chr  "39569" "39569" "39569" "39569" ...
##  $ contract_start.x                  : num  1.45e+09 1.49e+09 1.45e+09 1.45e+09 1.48e+09 ...
##  $ service type.x                    : chr  "3p" "3p" "3p" "3p" ...
##  $ Service description.x             : chr  "AtvDtvIntTel" "AtvDtvIntTel" "AtvDtvIntTel" "AtvDtvIntTel" ...
##  $ Blizoo_Vip_tarifa.x               : chr  "Blizoo" "Vip" "Vip" "Blizoo" ...
##  $ network type.x                    : chr  "docsis3.0" "docsis3.0" "docsis3.0" "docsis3.0" ...
##  $ CPE old-new.x                     : int  2 2 2 2 2 2 2 2 2 2 ...
##  $ Subscriber status.x               : chr  "active" "active" "active" "active" ...
##  $ operator.x                        : chr  "Blizoo" "Blizoo" "Blizoo" "Blizoo" ...
##  $ ADX_TT_count.x                    : num  1 0 0 0 0 0 0 0 0 0 ...
##  $ tech_tickets_last_3_months_final.x: num  1 0 0 0 0 0 0 1 0 0 ...
##  $ TENURE_MONTHS.x                   : num  110 110 110 110 110 128 110 110 110 110 ...
##  $ tech_tickets_last_3_months.x      : num  0.113 0.113 0.113 0.113 0.113 ...
##  $ churn                             : num  0 0 0 0 0 0 0 0 0 1 ...
##  $ MONTHS_TO_CONTRACT_END            : num  7 0 0 7 0 0 0 0 0 10 ...
dim(df)
## [1] 75857    19
#We have our FINAL cleaned database (although cleaning is "fast and dirty").

#We will explore a little our database. We see that we have only 3,5% of churns.
#Price_package can vary a lot, it may be important predictor.
#We have a lot of character variables that could be recoded as factor variables.
summary(df)
##  telephone_new.x     Price_package.x    mobile_new.x     
##  Min.   :0.0000114   Min.   :    0.0   Min.   :0.000009  
##  1st Qu.:0.2530128   1st Qu.:    0.0   1st Qu.:0.275781  
##  Median :0.5035861   Median :  499.0   Median :0.501933  
##  Mean   :0.5023232   Mean   :  470.7   Mean   :0.501933  
##  3rd Qu.:0.7531127   3rd Qu.:  750.0   3rd Qu.:0.727331  
##  Max.   :0.9999751   Max.   :18587.0   Max.   :0.999984  
##   bus_res.x         first_activation.x contract_start.x   
##  Length:75857       Length:75857       Min.   :1.014e+09  
##  Class :character   Class :character   1st Qu.:1.417e+09  
##  Mode  :character   Mode  :character   Median :1.450e+09  
##                                        Mean   :1.431e+09  
##                                        3rd Qu.:1.472e+09  
##                                        Max.   :1.491e+09  
##  service type.x     Service description.x Blizoo_Vip_tarifa.x
##  Length:75857       Length:75857          Length:75857       
##  Class :character   Class :character      Class :character   
##  Mode  :character   Mode  :character      Mode  :character   
##                                                              
##                                                              
##                                                              
##  network type.x     CPE old-new.x   Subscriber status.x  operator.x       
##  Length:75857       Min.   :1.000   Length:75857        Length:75857      
##  Class :character   1st Qu.:2.000   Class :character    Class :character  
##  Mode  :character   Median :2.000   Mode  :character    Mode  :character  
##                     Mean   :2.147                                         
##                     3rd Qu.:2.000                                         
##                     Max.   :3.000                                         
##  ADX_TT_count.x    tech_tickets_last_3_months_final.x TENURE_MONTHS.x 
##  Min.   : 0.0000   Min.   : 0.0000                    Min.   :  1.00  
##  1st Qu.: 0.0000   1st Qu.: 0.0000                    1st Qu.: 22.00  
##  Median : 0.0000   Median : 0.0000                    Median : 72.88  
##  Mean   : 0.1214   Mean   : 0.3594                    Mean   : 72.88  
##  3rd Qu.: 0.0000   3rd Qu.: 0.0000                    3rd Qu.:117.00  
##  Max.   :11.0000   Max.   :21.0000                    Max.   :214.00  
##  tech_tickets_last_3_months.x     churn         MONTHS_TO_CONTRACT_END
##  Min.   :0.0000               Min.   :0.00000   Min.   : 0.000        
##  1st Qu.:0.1126               1st Qu.:0.00000   1st Qu.: 0.000        
##  Median :0.1126               Median :0.00000   Median : 0.000        
##  Mean   :0.1126               Mean   :0.03372   Mean   : 1.235        
##  3rd Qu.:0.1126               3rd Qu.:0.00000   3rd Qu.: 0.000        
##  Max.   :6.0000               Max.   :1.00000   Max.   :22.000
str(df)
## 'data.frame':    75857 obs. of  19 variables:
##  $ telephone_new.x                   : num  0.6276 0.0169 0.0905 0.1394 0.3512 ...
##  $ Price_package.x                   : num  1199 0 691 1199 0 ...
##  $ mobile_new.x                      : num  0.67 0.899 0.94 0.264 0.554 ...
##  $ bus_res.x                         : chr  "residental" "residental" "residental" "residental" ...
##  $ first_activation.x                : chr  "39569" "39569" "39569" "39569" ...
##  $ contract_start.x                  : num  1.45e+09 1.49e+09 1.45e+09 1.45e+09 1.48e+09 ...
##  $ service type.x                    : chr  "3p" "3p" "3p" "3p" ...
##  $ Service description.x             : chr  "AtvDtvIntTel" "AtvDtvIntTel" "AtvDtvIntTel" "AtvDtvIntTel" ...
##  $ Blizoo_Vip_tarifa.x               : chr  "Blizoo" "Vip" "Vip" "Blizoo" ...
##  $ network type.x                    : chr  "docsis3.0" "docsis3.0" "docsis3.0" "docsis3.0" ...
##  $ CPE old-new.x                     : int  2 2 2 2 2 2 2 2 2 2 ...
##  $ Subscriber status.x               : chr  "active" "active" "active" "active" ...
##  $ operator.x                        : chr  "Blizoo" "Blizoo" "Blizoo" "Blizoo" ...
##  $ ADX_TT_count.x                    : num  1 0 0 0 0 0 0 0 0 0 ...
##  $ tech_tickets_last_3_months_final.x: num  1 0 0 0 0 0 0 1 0 0 ...
##  $ TENURE_MONTHS.x                   : num  110 110 110 110 110 128 110 110 110 110 ...
##  $ tech_tickets_last_3_months.x      : num  0.113 0.113 0.113 0.113 0.113 ...
##  $ churn                             : num  0 0 0 0 0 0 0 0 0 1 ...
##  $ MONTHS_TO_CONTRACT_END            : num  7 0 0 7 0 0 0 0 0 10 ...
table(df$churn)
## 
##     0     1 
## 73299  2558
df$churn <- as.factor(df$churn)
df$bus_res <- as.factor(df$bus_res)
df$`Subscriber status` <- as.factor(df$`Subscriber status`)
df$`service type` <- as.factor(df$`service type`)
df$`Service description` <- as.factor(df$`Service description`)
df$Blizoo_Vip_tarifa <- as.factor(df$Blizoo_Vip_tarifa)
df$`network type` <- as.factor(df$`network type`)
df$operator <- as.factor(df$operator)



#When we look at the structure of the date we see some factor variable with only one factor
#e.g. bus_res which is useless, and we see some factors with huge number of levels (cpe name, service description)
#service type is combination of four possible services (from 1 to 4). We could basically recode
#this variable into four category categorical variable according to number of services

str(df)
## 'data.frame':    75857 obs. of  26 variables:
##  $ telephone_new.x                   : num  0.6276 0.0169 0.0905 0.1394 0.3512 ...
##  $ Price_package.x                   : num  1199 0 691 1199 0 ...
##  $ mobile_new.x                      : num  0.67 0.899 0.94 0.264 0.554 ...
##  $ bus_res.x                         : chr  "residental" "residental" "residental" "residental" ...
##  $ first_activation.x                : chr  "39569" "39569" "39569" "39569" ...
##  $ contract_start.x                  : num  1.45e+09 1.49e+09 1.45e+09 1.45e+09 1.48e+09 ...
##  $ service type.x                    : chr  "3p" "3p" "3p" "3p" ...
##  $ Service description.x             : chr  "AtvDtvIntTel" "AtvDtvIntTel" "AtvDtvIntTel" "AtvDtvIntTel" ...
##  $ Blizoo_Vip_tarifa.x               : chr  "Blizoo" "Vip" "Vip" "Blizoo" ...
##  $ network type.x                    : chr  "docsis3.0" "docsis3.0" "docsis3.0" "docsis3.0" ...
##  $ CPE old-new.x                     : int  2 2 2 2 2 2 2 2 2 2 ...
##  $ Subscriber status.x               : chr  "active" "active" "active" "active" ...
##  $ operator.x                        : chr  "Blizoo" "Blizoo" "Blizoo" "Blizoo" ...
##  $ ADX_TT_count.x                    : num  1 0 0 0 0 0 0 0 0 0 ...
##  $ tech_tickets_last_3_months_final.x: num  1 0 0 0 0 0 0 1 0 0 ...
##  $ TENURE_MONTHS.x                   : num  110 110 110 110 110 128 110 110 110 110 ...
##  $ tech_tickets_last_3_months.x      : num  0.113 0.113 0.113 0.113 0.113 ...
##  $ churn                             : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 2 ...
##  $ MONTHS_TO_CONTRACT_END            : num  7 0 0 7 0 0 0 0 0 10 ...
##  $ bus_res                           : Factor w/ 1 level "residental": 1 1 1 1 1 1 1 1 1 1 ...
##  $ Subscriber status                 : Factor w/ 2 levels "active","ctv": 1 1 1 1 1 1 1 1 1 1 ...
##  $ service type                      : Factor w/ 3 levels "1p","2p","3p": 3 3 3 3 1 3 3 3 3 3 ...
##  $ Service description               : Factor w/ 15 levels "Atv","AtvDtv",..: 4 4 4 4 13 4 4 4 4 4 ...
##  $ Blizoo_Vip_tarifa                 : Factor w/ 2 levels "Blizoo","Vip": 1 2 2 1 2 2 1 1 2 1 ...
##  $ network type                      : Factor w/ 4 levels "docsis2.0","docsis3.0",..: 2 2 2 2 2 2 2 2 2 1 ...
##  $ operator                          : Factor w/ 1 level "Blizoo": 1 1 1 1 1 1 1 1 1 1 ...
table(df$`Service description`)
## 
##          Atv       AtvDtv    AtvDtvInt AtvDtvIntTel    AtvDtvTel 
##          718         9714         2536        52220         2290 
##       AtvInt    AtvIntTel       AtvTel          Dtv       DtvInt 
##         3175          357          126           47            3 
##    DtvIntTel       DtvTel          Int       IntTel          Tel 
##           22            2         4154          450           43
service <- function(atv){
  atv <- as.character(atv)
  if (atv=='Atv'| atv=='Dtv' | atv=='Int' | atv=='Tel'){
    return('one service')
  }else if(atv=='AtvDtv'| atv=='AtvInt' | atv=='AtvTel' | atv=='DtvInt' | atv=='DtvTel' | atv=='DtvTel' | atv=='IntTel'){
    return('two services')
  }else if(atv=='AtvDtvInt'| atv=='AtvDtvTel' | atv=='AtvIntTel' | atv=='DtvIntTel'){
    return('three services')
  }else{
    return('four services')
  }
}

df$`Service description` <- sapply(df$`Service description`,service)
table(df$`Service description`)
## 
##  four services    one service three services   two services 
##          52220           4962           5205          13470
df$`Service description` <- as.factor(df$`Service description`)

#Now we will see how are predictors connected with our target churn variable

library(ggplot2)
g <- ggplot (df, aes(df$churn, df$`Subscriber status`)) + geom_jitter()
g

#we will check chi-square test and cramer's V. There is an assosiation but weak. Majority of churns are active users.

        
tbl <-  table(df$churn, df$`Subscriber status`) 
tbl       
##    
##     active   ctv
##   0  72863   436
##   1   2339   219
chisq.test(tbl)
## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  tbl
## X-squared = 1823.3, df = 1, p-value < 2.2e-16
CramerV(df$churn, df$`Subscriber status`)
## [1] 0.1554298
#See the equipment. This connection is even weaker. 

p <- ggplot (df, aes(df$churn, df$`CPE old-new`)) + geom_jitter()
p

tbl <-  table(df$churn, df$`CPE old-new`) 
tbl 
##    
##         1     2     3
##   0  4681 53190 15428
##   1   342  1434   782
chisq.test(tbl)
## 
##  Pearson's Chi-squared test
## 
## data:  tbl
## X-squared = 380.47, df = 2, p-value < 2.2e-16
CramerV(df$churn, df$`CPE old-new`)
## [1] 0.07082085
#Let' try to see proportion of the churns in relation to price. We see that churns have 
#cheaper packages but the differens is small and not significant.

str(df)
## 'data.frame':    75857 obs. of  26 variables:
##  $ telephone_new.x                   : num  0.6276 0.0169 0.0905 0.1394 0.3512 ...
##  $ Price_package.x                   : num  1199 0 691 1199 0 ...
##  $ mobile_new.x                      : num  0.67 0.899 0.94 0.264 0.554 ...
##  $ bus_res.x                         : chr  "residental" "residental" "residental" "residental" ...
##  $ first_activation.x                : chr  "39569" "39569" "39569" "39569" ...
##  $ contract_start.x                  : num  1.45e+09 1.49e+09 1.45e+09 1.45e+09 1.48e+09 ...
##  $ service type.x                    : chr  "3p" "3p" "3p" "3p" ...
##  $ Service description.x             : chr  "AtvDtvIntTel" "AtvDtvIntTel" "AtvDtvIntTel" "AtvDtvIntTel" ...
##  $ Blizoo_Vip_tarifa.x               : chr  "Blizoo" "Vip" "Vip" "Blizoo" ...
##  $ network type.x                    : chr  "docsis3.0" "docsis3.0" "docsis3.0" "docsis3.0" ...
##  $ CPE old-new.x                     : int  2 2 2 2 2 2 2 2 2 2 ...
##  $ Subscriber status.x               : chr  "active" "active" "active" "active" ...
##  $ operator.x                        : chr  "Blizoo" "Blizoo" "Blizoo" "Blizoo" ...
##  $ ADX_TT_count.x                    : num  1 0 0 0 0 0 0 0 0 0 ...
##  $ tech_tickets_last_3_months_final.x: num  1 0 0 0 0 0 0 1 0 0 ...
##  $ TENURE_MONTHS.x                   : num  110 110 110 110 110 128 110 110 110 110 ...
##  $ tech_tickets_last_3_months.x      : num  0.113 0.113 0.113 0.113 0.113 ...
##  $ churn                             : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 2 ...
##  $ MONTHS_TO_CONTRACT_END            : num  7 0 0 7 0 0 0 0 0 10 ...
##  $ bus_res                           : Factor w/ 1 level "residental": 1 1 1 1 1 1 1 1 1 1 ...
##  $ Subscriber status                 : Factor w/ 2 levels "active","ctv": 1 1 1 1 1 1 1 1 1 1 ...
##  $ service type                      : Factor w/ 3 levels "1p","2p","3p": 3 3 3 3 1 3 3 3 3 3 ...
##  $ Service description               : Factor w/ 4 levels "four services",..: 1 1 1 1 2 1 1 1 1 1 ...
##  $ Blizoo_Vip_tarifa                 : Factor w/ 2 levels "Blizoo","Vip": 1 2 2 1 2 2 1 1 2 1 ...
##  $ network type                      : Factor w/ 4 levels "docsis2.0","docsis3.0",..: 2 2 2 2 2 2 2 2 2 1 ...
##  $ operator                          : Factor w/ 1 level "Blizoo": 1 1 1 1 1 1 1 1 1 1 ...
#For example, here we see the difference, churns have longer contratcs , maybe they are 
#new but unsatisfied users.
k <- ggplot(df, aes(MONTHS_TO_CONTRACT_END, fill=churn)) + geom_density(alpha=0.7, adjust=1.5) + xlim(0,20) + theme_bw()
k

o <- ggplot(df, aes(Price_package.x, fill=churn)) + geom_histogram(binwidth = 100, alpha=0.7, position = "identity") + xlim(0,2000) + theme_bw()
o

#We have a high number of not churns with price package zero. Maybe we should delete them
p <- ggplot(df, aes(Price_package.x, fill=churn)) + geom_density(alpha=0.7, adjust=1.5) + xlim(0,2000) + theme_bw()
p

df$Price_package[df$Price_package.x<200] <- mean(df$Price_package.x)

boxplot(df$Price_package.x)$out
##  [1]  1999  1999  1999  1999  1999  1999  1999  1999  1999  1999  1999
## [12]  1999  1899  1999  1999  2599  1999  1999  1999  1999  1999  1999
## [23]  1949  1999  1999  1999  3197  2098  1999  1999  1999  3998  1999
## [34]  1999  1999  1999  1999  1999  1999  1999  1998  1999  1999  1999
## [45]  1999  1999  1999  1999  1999  1899  2158  1999  1999  1999  1999
## [56]  1999  1999  1999  1999  1999  1999  1999  2198  1999  1999  1999
## [67]  1999  2299  1899 15587 18587 18587  1999  1999  1999  1999  2000
## [78] 12320  4500  1999  1999  1999  1999  1999  1999  1999  1999  1999
## [89]  1999  1999  1999  1999  1999  1999  1999  1999  1999  1999  1999
outliers <- boxplot(df$Price_package.x, plot=FALSE)$out
print(outliers)
##  [1]  1999  1999  1999  1999  1999  1999  1999  1999  1999  1999  1999
## [12]  1999  1899  1999  1999  2599  1999  1999  1999  1999  1999  1999
## [23]  1949  1999  1999  1999  3197  2098  1999  1999  1999  3998  1999
## [34]  1999  1999  1999  1999  1999  1999  1999  1998  1999  1999  1999
## [45]  1999  1999  1999  1999  1999  1899  2158  1999  1999  1999  1999
## [56]  1999  1999  1999  1999  1999  1999  1999  2198  1999  1999  1999
## [67]  1999  2299  1899 15587 18587 18587  1999  1999  1999  1999  2000
## [78] 12320  4500  1999  1999  1999  1999  1999  1999  1999  1999  1999
## [89]  1999  1999  1999  1999  1999  1999  1999  1999  1999  1999  1999
df <- df[-which(df$Price_package.x %in% outliers),]
boxplot(df$Price_package.x)

p <- ggplot(df, aes(Price_package.x, fill=churn)) + geom_density(alpha=0.7, adjust=1.5) + xlim(0,2000) + theme_bw()
p

#We do not see any groupings in this churns. 
j <- ggplot(df, aes (df$`network type`, df$tech_tickets_last_3_months_final)) + geom_bar(aes(color=churn), stat = "identity")
j

t.test(Price_package.x ~ churn, data = df,
       var.equal = TRUE, alternative = "less")
## 
##  Two Sample t-test
## 
## data:  Price_package.x by churn
## t = -12.73, df = 75756, p-value < 2.2e-16
## alternative hypothesis: true difference in means is less than 0
## 95 percent confidence interval:
##       -Inf -93.69321
## sample estimates:
## mean in group 0 mean in group 1 
##        464.2760        571.8725
res.aov <- aov(df$Price_package.x ~ df$churn, data = df)
# Summary of the analysis
summary(res.aov)
##                Df    Sum Sq  Mean Sq F value Pr(>F)    
## df$churn        1 2.860e+07 28603238     162 <2e-16 ***
## Residuals   75756 1.337e+10   176517                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
EtaSq(res.aov)
##               eta.sq eta.sq.part
## df$churn 0.002134439 0.002134439
#We saw an intresting trend that docsis 2.0 network users have more tech tickets. This is important to 
#us becease higher churn rates. Results of Anova are significant. 
str(df)
## 'data.frame':    75758 obs. of  27 variables:
##  $ telephone_new.x                   : num  0.6276 0.0169 0.0905 0.1394 0.3512 ...
##  $ Price_package.x                   : num  1199 0 691 1199 0 ...
##  $ mobile_new.x                      : num  0.67 0.899 0.94 0.264 0.554 ...
##  $ bus_res.x                         : chr  "residental" "residental" "residental" "residental" ...
##  $ first_activation.x                : chr  "39569" "39569" "39569" "39569" ...
##  $ contract_start.x                  : num  1.45e+09 1.49e+09 1.45e+09 1.45e+09 1.48e+09 ...
##  $ service type.x                    : chr  "3p" "3p" "3p" "3p" ...
##  $ Service description.x             : chr  "AtvDtvIntTel" "AtvDtvIntTel" "AtvDtvIntTel" "AtvDtvIntTel" ...
##  $ Blizoo_Vip_tarifa.x               : chr  "Blizoo" "Vip" "Vip" "Blizoo" ...
##  $ network type.x                    : chr  "docsis3.0" "docsis3.0" "docsis3.0" "docsis3.0" ...
##  $ CPE old-new.x                     : int  2 2 2 2 2 2 2 2 2 2 ...
##  $ Subscriber status.x               : chr  "active" "active" "active" "active" ...
##  $ operator.x                        : chr  "Blizoo" "Blizoo" "Blizoo" "Blizoo" ...
##  $ ADX_TT_count.x                    : num  1 0 0 0 0 0 0 0 0 0 ...
##  $ tech_tickets_last_3_months_final.x: num  1 0 0 0 0 0 0 1 0 0 ...
##  $ TENURE_MONTHS.x                   : num  110 110 110 110 110 128 110 110 110 110 ...
##  $ tech_tickets_last_3_months.x      : num  0.113 0.113 0.113 0.113 0.113 ...
##  $ churn                             : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 2 ...
##  $ MONTHS_TO_CONTRACT_END            : num  7 0 0 7 0 0 0 0 0 10 ...
##  $ bus_res                           : Factor w/ 1 level "residental": 1 1 1 1 1 1 1 1 1 1 ...
##  $ Subscriber status                 : Factor w/ 2 levels "active","ctv": 1 1 1 1 1 1 1 1 1 1 ...
##  $ service type                      : Factor w/ 3 levels "1p","2p","3p": 3 3 3 3 1 3 3 3 3 3 ...
##  $ Service description               : Factor w/ 4 levels "four services",..: 1 1 1 1 2 1 1 1 1 1 ...
##  $ Blizoo_Vip_tarifa                 : Factor w/ 2 levels "Blizoo","Vip": 1 2 2 1 2 2 1 1 2 1 ...
##  $ network type                      : Factor w/ 4 levels "docsis2.0","docsis3.0",..: 2 2 2 2 2 2 2 2 2 1 ...
##  $ operator                          : Factor w/ 1 level "Blizoo": 1 1 1 1 1 1 1 1 1 1 ...
##  $ Price_package                     : num  1199 471 691 1199 471 ...
res.aov <- aov(df$tech_tickets_last_3_months.x ~ df$`network type`, data = df)
# Summary of the analysis
summary(res.aov)
##                      Df Sum Sq Mean Sq F value   Pr(>F)    
## df$`network type`     3    0.1 0.04092   6.576 0.000194 ***
## Residuals         75754  471.4 0.00622                     
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
EtaSq(res.aov)
##                         eta.sq  eta.sq.part
## df$`network type` 0.0002603378 0.0002603378
#We want to see the proportion of variance explained by network type. Basicaly less of 1% of tickets
#is explained by network type. 
#Let' see the connection between churns and network type. Very weak assossiation. 

tbl1 <-  table(df$churn, df$`network type`) 
tbl1
##    
##     docsis2.0 docsis3.0   lan    TV
##   0     21010     35057  7704  9430
##   1       871       778   253   655
chisq.test(tbl1)
## 
##  Pearson's Chi-squared test
## 
## data:  tbl1
## X-squared = 485.79, df = 3, p-value < 2.2e-16
CramerV(df$churn, df$`network type`)
## [1] 0.0800771
i <- ggplot(df, aes(df$`network type`, fill=churn)) + geom_bar(alpha=0.7, stat = "count", position = "dodge") + theme_bw()
i

j <- ggplot(df, aes(df$`Service description`, fill=churn)) + geom_bar(alpha=0.7, stat = "count", position = "dodge") + theme_bw()
j

CramerV(df$churn, df$`Service description`)
## [1] 0.06572216
#Now we will deploy logit model. But first we must delete unimportant variables from the database
#e.g. character variables and IDs, as well as factors with one factor. Character variables could be useful when categorised
#if possible but we do not have time for this at the moment.

str(df)
## 'data.frame':    75758 obs. of  27 variables:
##  $ telephone_new.x                   : num  0.6276 0.0169 0.0905 0.1394 0.3512 ...
##  $ Price_package.x                   : num  1199 0 691 1199 0 ...
##  $ mobile_new.x                      : num  0.67 0.899 0.94 0.264 0.554 ...
##  $ bus_res.x                         : chr  "residental" "residental" "residental" "residental" ...
##  $ first_activation.x                : chr  "39569" "39569" "39569" "39569" ...
##  $ contract_start.x                  : num  1.45e+09 1.49e+09 1.45e+09 1.45e+09 1.48e+09 ...
##  $ service type.x                    : chr  "3p" "3p" "3p" "3p" ...
##  $ Service description.x             : chr  "AtvDtvIntTel" "AtvDtvIntTel" "AtvDtvIntTel" "AtvDtvIntTel" ...
##  $ Blizoo_Vip_tarifa.x               : chr  "Blizoo" "Vip" "Vip" "Blizoo" ...
##  $ network type.x                    : chr  "docsis3.0" "docsis3.0" "docsis3.0" "docsis3.0" ...
##  $ CPE old-new.x                     : int  2 2 2 2 2 2 2 2 2 2 ...
##  $ Subscriber status.x               : chr  "active" "active" "active" "active" ...
##  $ operator.x                        : chr  "Blizoo" "Blizoo" "Blizoo" "Blizoo" ...
##  $ ADX_TT_count.x                    : num  1 0 0 0 0 0 0 0 0 0 ...
##  $ tech_tickets_last_3_months_final.x: num  1 0 0 0 0 0 0 1 0 0 ...
##  $ TENURE_MONTHS.x                   : num  110 110 110 110 110 128 110 110 110 110 ...
##  $ tech_tickets_last_3_months.x      : num  0.113 0.113 0.113 0.113 0.113 ...
##  $ churn                             : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 2 ...
##  $ MONTHS_TO_CONTRACT_END            : num  7 0 0 7 0 0 0 0 0 10 ...
##  $ bus_res                           : Factor w/ 1 level "residental": 1 1 1 1 1 1 1 1 1 1 ...
##  $ Subscriber status                 : Factor w/ 2 levels "active","ctv": 1 1 1 1 1 1 1 1 1 1 ...
##  $ service type                      : Factor w/ 3 levels "1p","2p","3p": 3 3 3 3 1 3 3 3 3 3 ...
##  $ Service description               : Factor w/ 4 levels "four services",..: 1 1 1 1 2 1 1 1 1 1 ...
##  $ Blizoo_Vip_tarifa                 : Factor w/ 2 levels "Blizoo","Vip": 1 2 2 1 2 2 1 1 2 1 ...
##  $ network type                      : Factor w/ 4 levels "docsis2.0","docsis3.0",..: 2 2 2 2 2 2 2 2 2 1 ...
##  $ operator                          : Factor w/ 1 level "Blizoo": 1 1 1 1 1 1 1 1 1 1 ...
##  $ Price_package                     : num  1199 471 691 1199 471 ...
#We should explore a little bit more numeric variable that appear to be factors. We 
#will make categorical out of numeric becease it seems that smaller values are far more frequent.
table(df$ADX_TT_count)
## 
##     0     1     2     3     4     5     6     7     9    10    11 
## 68743  5432  1155   309    86    17    11     2     1     1     1
tt <- function(adx){
  adx <- as.numeric(adx)
  if (adx>2){
    return('thre and more')
  }else if(adx==2){
    return('two_adx')
  }else if(adx==1){
    return('one_adx')
  }else{
    return('zero')
  }
}

df$ADX_TT_count <- sapply(df$ADX_TT_count,tt)
table(df$ADX_TT_count)
## 
##       one_adx thre and more       two_adx          zero 
##          5432           428          1155         68743
df$ADX_TT_count <- as.factor(df$ADX_TT_count)

table(df$tech_tickets_last_3_months_final.x)
## 
##     0     1     2     3     4     5     6     7     8     9    10    11 
## 59819  8952  4803  1053   716   168   133    45    34    11     9     4 
##    12    13    14    15    17    21 
##     2     2     3     1     1     2
tick <- function(ticket){
  ticket <- as.numeric(ticket)
  if (ticket>4){
    return('four and more')
  }else if(ticket==3){
    return('three_ticket')
  }else if(ticket==2){
    return('two_ticket')
  }else if(ticket==1){
      return('one_ticket')
  }else{
    return('zero')
  }
}

df$tech_tickets_last_3_months_final.x <- sapply(df$tech_tickets_last_3_months_final.x,tick)
table(df$tech_tickets_last_3_months_final.x)
## 
## four and more    one_ticket  three_ticket    two_ticket          zero 
##           415          8952          1053          4803         60535
df$tech_tickets_last_3_months_final.x <- as.factor(df$tech_tickets_last_3_months_final.x)

# Drop the columns of the dataframe. Many of this variables could be usefull, but this is a pracise
#example and we do not have time to dig deeper. For example, date varibles could be very usefull.
str(df)
## 'data.frame':    75758 obs. of  28 variables:
##  $ telephone_new.x                   : num  0.6276 0.0169 0.0905 0.1394 0.3512 ...
##  $ Price_package.x                   : num  1199 0 691 1199 0 ...
##  $ mobile_new.x                      : num  0.67 0.899 0.94 0.264 0.554 ...
##  $ bus_res.x                         : chr  "residental" "residental" "residental" "residental" ...
##  $ first_activation.x                : chr  "39569" "39569" "39569" "39569" ...
##  $ contract_start.x                  : num  1.45e+09 1.49e+09 1.45e+09 1.45e+09 1.48e+09 ...
##  $ service type.x                    : chr  "3p" "3p" "3p" "3p" ...
##  $ Service description.x             : chr  "AtvDtvIntTel" "AtvDtvIntTel" "AtvDtvIntTel" "AtvDtvIntTel" ...
##  $ Blizoo_Vip_tarifa.x               : chr  "Blizoo" "Vip" "Vip" "Blizoo" ...
##  $ network type.x                    : chr  "docsis3.0" "docsis3.0" "docsis3.0" "docsis3.0" ...
##  $ CPE old-new.x                     : int  2 2 2 2 2 2 2 2 2 2 ...
##  $ Subscriber status.x               : chr  "active" "active" "active" "active" ...
##  $ operator.x                        : chr  "Blizoo" "Blizoo" "Blizoo" "Blizoo" ...
##  $ ADX_TT_count.x                    : num  1 0 0 0 0 0 0 0 0 0 ...
##  $ tech_tickets_last_3_months_final.x: Factor w/ 5 levels "four and more",..: 2 5 5 5 5 5 5 2 5 5 ...
##  $ TENURE_MONTHS.x                   : num  110 110 110 110 110 128 110 110 110 110 ...
##  $ tech_tickets_last_3_months.x      : num  0.113 0.113 0.113 0.113 0.113 ...
##  $ churn                             : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 2 ...
##  $ MONTHS_TO_CONTRACT_END            : num  7 0 0 7 0 0 0 0 0 10 ...
##  $ bus_res                           : Factor w/ 1 level "residental": 1 1 1 1 1 1 1 1 1 1 ...
##  $ Subscriber status                 : Factor w/ 2 levels "active","ctv": 1 1 1 1 1 1 1 1 1 1 ...
##  $ service type                      : Factor w/ 3 levels "1p","2p","3p": 3 3 3 3 1 3 3 3 3 3 ...
##  $ Service description               : Factor w/ 4 levels "four services",..: 1 1 1 1 2 1 1 1 1 1 ...
##  $ Blizoo_Vip_tarifa                 : Factor w/ 2 levels "Blizoo","Vip": 1 2 2 1 2 2 1 1 2 1 ...
##  $ network type                      : Factor w/ 4 levels "docsis2.0","docsis3.0",..: 2 2 2 2 2 2 2 2 2 1 ...
##  $ operator                          : Factor w/ 1 level "Blizoo": 1 1 1 1 1 1 1 1 1 1 ...
##  $ Price_package                     : num  1199 471 691 1199 471 ...
##  $ ADX_TT_count                      : Factor w/ 4 levels "one_adx","thre and more",..: 1 4 4 4 4 4 4 4 4 4 ...
missmap(df)

df$`service type.x` <- as.factor(df$`service type.x`)
table(df$`service type.x`)
## 
##    1p    2p    3p 
## 14674  8577 52507
df$`CPE old-new.x` <- as.factor(df$`CPE old-new.x`)
table(df$`CPE old-new.x`)
## 
##     1     2     3 
##  5022 54534 16202
df1 <- subset(df,select = -c(bus_res, bus_res.x, first_activation.x, contract_start.x, operator, `Service description.x`, Blizoo_Vip_tarifa.x, `network type.x`, `Subscriber status.x`))
str(df1)
## 'data.frame':    75758 obs. of  19 variables:
##  $ telephone_new.x                   : num  0.6276 0.0169 0.0905 0.1394 0.3512 ...
##  $ Price_package.x                   : num  1199 0 691 1199 0 ...
##  $ mobile_new.x                      : num  0.67 0.899 0.94 0.264 0.554 ...
##  $ service type.x                    : Factor w/ 3 levels "1p","2p","3p": 3 3 3 3 1 3 3 3 3 3 ...
##  $ CPE old-new.x                     : Factor w/ 3 levels "1","2","3": 2 2 2 2 2 2 2 2 2 2 ...
##  $ operator.x                        : chr  "Blizoo" "Blizoo" "Blizoo" "Blizoo" ...
##  $ ADX_TT_count.x                    : num  1 0 0 0 0 0 0 0 0 0 ...
##  $ tech_tickets_last_3_months_final.x: Factor w/ 5 levels "four and more",..: 2 5 5 5 5 5 5 2 5 5 ...
##  $ TENURE_MONTHS.x                   : num  110 110 110 110 110 128 110 110 110 110 ...
##  $ tech_tickets_last_3_months.x      : num  0.113 0.113 0.113 0.113 0.113 ...
##  $ churn                             : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 2 ...
##  $ MONTHS_TO_CONTRACT_END            : num  7 0 0 7 0 0 0 0 0 10 ...
##  $ Subscriber status                 : Factor w/ 2 levels "active","ctv": 1 1 1 1 1 1 1 1 1 1 ...
##  $ service type                      : Factor w/ 3 levels "1p","2p","3p": 3 3 3 3 1 3 3 3 3 3 ...
##  $ Service description               : Factor w/ 4 levels "four services",..: 1 1 1 1 2 1 1 1 1 1 ...
##  $ Blizoo_Vip_tarifa                 : Factor w/ 2 levels "Blizoo","Vip": 1 2 2 1 2 2 1 1 2 1 ...
##  $ network type                      : Factor w/ 4 levels "docsis2.0","docsis3.0",..: 2 2 2 2 2 2 2 2 2 1 ...
##  $ Price_package                     : num  1199 471 691 1199 471 ...
##  $ ADX_TT_count                      : Factor w/ 4 levels "one_adx","thre and more",..: 1 4 4 4 4 4 4 4 4 4 ...
#We should standardize the numeric variables in order to have more interpretable results in logit model.

library(psycho)
df2 <- df %>% 
  psycho::standardize() 

summary(df2)
##  telephone_new.x     Price_package.x     mobile_new.x       
##  Min.   :-1.740718   Min.   :-1.11251   Min.   :-1.8222775  
##  1st Qu.:-0.863982   1st Qu.:-1.11251   1st Qu.:-0.8209182  
##  Median : 0.004379   Median : 0.07393   Median :-0.0000719  
##  Mean   : 0.000000   Mean   : 0.00000   Mean   : 0.0000000  
##  3rd Qu.: 0.868890   3rd Qu.: 0.66834   3rd Qu.: 0.8183075  
##  Max.   : 1.724818   Max.   : 3.16723   Max.   : 1.8080704  
##   bus_res.x         first_activation.x contract_start.x  service type.x
##  Length:75758       Length:75758       Min.   :-6.3953   1p:14674      
##  Class :character   Class :character   1st Qu.:-0.2021   2p: 8577      
##  Mode  :character   Mode  :character   Median : 0.3037   3p:52507      
##                                        Mean   : 0.0000                 
##                                        3rd Qu.: 0.6449                 
##                                        Max.   : 0.9317                 
##  Service description.x Blizoo_Vip_tarifa.x network type.x    
##  Length:75758          Length:75758        Length:75758      
##  Class :character      Class :character    Class :character  
##  Mode  :character      Mode  :character    Mode  :character  
##                                                              
##                                                              
##                                                              
##  CPE old-new.x Subscriber status.x  operator.x        ADX_TT_count.x   
##  1: 5022       Length:75758        Length:75758       Min.   :-0.2797  
##  2:54534       Class :character    Class :character   1st Qu.:-0.2797  
##  3:16202       Mode  :character    Mode  :character   Median :-0.2797  
##                                                       Mean   : 0.0000  
##                                                       3rd Qu.:-0.2797  
##                                                       Max.   :25.0292  
##  tech_tickets_last_3_months_final.x TENURE_MONTHS.x     
##  four and more:  415                Min.   :-1.4720090  
##  one_ticket   : 8952                1st Qu.:-1.0419249  
##  three_ticket : 1053                Median : 0.0001635  
##  two_ticket   : 4803                Mean   : 0.0000000  
##  zero         :60535                3rd Qu.: 0.9036938  
##                                     Max.   : 2.8902729  
##  tech_tickets_last_3_months.x churn     MONTHS_TO_CONTRACT_END
##  Min.   :-1.42705             0:73201   Min.   :-0.4311       
##  1st Qu.:-0.00002             1: 2557   1st Qu.:-0.4311       
##  Median :-0.00002                       Median :-0.4311       
##  Mean   : 0.00000                       Mean   : 0.0000       
##  3rd Qu.:-0.00002                       3rd Qu.:-0.4311       
##  Max.   :74.62190                       Max.   : 7.2731       
##        bus_res      Subscriber status service type     Service description
##  residental:75758   active:75103      1p:14674     four services :52128   
##                     ctv   :  655      2p: 8577     one service   : 4960   
##                                       3p:52507     three services: 5201   
##                                                    two services  :13469   
##                                                                           
##                                                                           
##  Blizoo_Vip_tarifa    network type     operator     Price_package    
##  Blizoo:38444      docsis2.0:21881   Blizoo:75758   Min.   :-1.6066  
##  Vip   :37314      docsis3.0:35835                  1st Qu.:-0.6028  
##                    lan      : 7957                  Median :-0.4981  
##                    TV       :10085                  Mean   : 0.0000  
##                                                     3rd Qu.: 0.4288  
##                                                     Max.   : 4.3255  
##         ADX_TT_count  
##  one_adx      : 5432  
##  thre and more:  428  
##  two_adx      : 1155  
##  zero         :68743  
##                       
## 
str(df2)
## 'data.frame':    75758 obs. of  28 variables:
##  $ telephone_new.x                   : num  0.434 -1.682 -1.427 -1.258 -0.524 ...
##  $ Price_package.x                   : num  1.74 -1.11 0.53 1.74 -1.11 ...
##  $ mobile_new.x                      : num  0.609 1.441 1.592 -0.864 0.188 ...
##  $ bus_res.x                         : chr  "residental" "residental" "residental" "residental" ...
##  $ first_activation.x                : chr  "39569" "39569" "39569" "39569" ...
##  $ contract_start.x                  : num  0.282 0.857 0.252 0.282 0.731 ...
##  $ service type.x                    : Factor w/ 3 levels "1p","2p","3p": 3 3 3 3 1 3 3 3 3 3 ...
##  $ Service description.x             : chr  "AtvDtvIntTel" "AtvDtvIntTel" "AtvDtvIntTel" "AtvDtvIntTel" ...
##  $ Blizoo_Vip_tarifa.x               : chr  "Blizoo" "Vip" "Vip" "Blizoo" ...
##  $ network type.x                    : chr  "docsis3.0" "docsis3.0" "docsis3.0" "docsis3.0" ...
##  $ CPE old-new.x                     : Factor w/ 3 levels "1","2","3": 2 2 2 2 2 2 2 2 2 2 ...
##  $ Subscriber status.x               : chr  "active" "active" "active" "active" ...
##  $ operator.x                        : chr  "Blizoo" "Blizoo" "Blizoo" "Blizoo" ...
##  $ ADX_TT_count.x                    : num  2.02 -0.28 -0.28 -0.28 -0.28 ...
##  $ tech_tickets_last_3_months_final.x: Factor w/ 5 levels "four and more",..: 2 5 5 5 5 5 5 2 5 5 ...
##  $ TENURE_MONTHS.x                   : num  0.76 0.76 0.76 0.76 0.76 ...
##  $ tech_tickets_last_3_months.x      : num  -1.88e-05 -1.88e-05 -1.88e-05 -1.88e-05 -1.88e-05 ...
##  $ churn                             : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 2 ...
##  $ MONTHS_TO_CONTRACT_END            : num  2.02 -0.431 -0.431 2.02 -0.431 ...
##  $ bus_res                           : Factor w/ 1 level "residental": 1 1 1 1 1 1 1 1 1 1 ...
##  $ Subscriber status                 : Factor w/ 2 levels "active","ctv": 1 1 1 1 1 1 1 1 1 1 ...
##  $ service type                      : Factor w/ 3 levels "1p","2p","3p": 3 3 3 3 1 3 3 3 3 3 ...
##  $ Service description               : Factor w/ 4 levels "four services",..: 1 1 1 1 2 1 1 1 1 1 ...
##  $ Blizoo_Vip_tarifa                 : Factor w/ 2 levels "Blizoo","Vip": 1 2 2 1 2 2 1 1 2 1 ...
##  $ network type                      : Factor w/ 4 levels "docsis2.0","docsis3.0",..: 2 2 2 2 2 2 2 2 2 1 ...
##  $ operator                          : Factor w/ 1 level "Blizoo": 1 1 1 1 1 1 1 1 1 1 ...
##  $ Price_package                     : num  2.097 -0.603 0.214 2.097 -0.603 ...
##  $ ADX_TT_count                      : Factor w/ 4 levels "one_adx","thre and more",..: 1 4 4 4 4 4 4 4 4 4 ...
df2 %>% 
  dplyr::select(Price_package.x, TENURE_MONTHS.x) %>% 
  gather(Variable, Value) %>% 
  ggplot(aes(x=Value, fill=Variable)) +
  geom_density(alpha=0.5) +
  geom_vline(aes(xintercept=0)) +
  xlim(-5, 5) +
  theme_bw() +
  scale_fill_brewer(palette="Spectral")

#We see that variables ara far away from normal distributions. Linear transformation
#would make a better fit. We will leave this as standrised in this moment as it is.

#We will try a logit model for nnet library to make a rough accuracy rate.We will put
#all variables as predictors.

df$`CPE old-new` <- as.factor(df$`CPE old-new`)

df1 <- subset(df,select = -c(operator, bus_res, telephone_new.x, bus_res.x, first_activation.x, `service type.x`, `Service description.x`, Blizoo_Vip_tarifa.x, `network type.x`, `CPE old-new.x`, `Subscriber status.x`, operator.x, ADX_TT_count.x, MONTHS_TO_CONTRACT_END, tech_tickets_last_3_months.x))
str(df1)
## 'data.frame':    75758 obs. of  14 variables:
##  $ Price_package.x                   : num  1199 0 691 1199 0 ...
##  $ mobile_new.x                      : num  0.67 0.899 0.94 0.264 0.554 ...
##  $ contract_start.x                  : num  1.45e+09 1.49e+09 1.45e+09 1.45e+09 1.48e+09 ...
##  $ tech_tickets_last_3_months_final.x: Factor w/ 5 levels "four and more",..: 2 5 5 5 5 5 5 2 5 5 ...
##  $ TENURE_MONTHS.x                   : num  110 110 110 110 110 128 110 110 110 110 ...
##  $ churn                             : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 2 ...
##  $ Subscriber status                 : Factor w/ 2 levels "active","ctv": 1 1 1 1 1 1 1 1 1 1 ...
##  $ service type                      : Factor w/ 3 levels "1p","2p","3p": 3 3 3 3 1 3 3 3 3 3 ...
##  $ Service description               : Factor w/ 4 levels "four services",..: 1 1 1 1 2 1 1 1 1 1 ...
##  $ Blizoo_Vip_tarifa                 : Factor w/ 2 levels "Blizoo","Vip": 1 2 2 1 2 2 1 1 2 1 ...
##  $ network type                      : Factor w/ 4 levels "docsis2.0","docsis3.0",..: 2 2 2 2 2 2 2 2 2 1 ...
##  $ Price_package                     : num  1199 471 691 1199 471 ...
##  $ ADX_TT_count                      : Factor w/ 4 levels "one_adx","thre and more",..: 1 4 4 4 4 4 4 4 4 4 ...
##  $ CPE old-new                       : Factor w/ 3 levels "1","2","3": 2 2 2 2 2 2 2 2 2 2 ...
#Explore the matrice of interrcorelation


cor <- psycho::correlation(df1, 
                           type = "full",
                           method = "pearson",
                           adjust = "none")

summary(cor)
##                  Price_package.x mobile_new.x contract_start.x
## Price_package.x                                               
## mobile_new.x               0.01                               
## contract_start.x        -0.36***           0                  
## TENURE_MONTHS.x          0.24***           0          -0.33***
## Price_package            0.88***        0.01          -0.14***
##                  TENURE_MONTHS.x
## Price_package.x                 
## mobile_new.x                    
## contract_start.x                
## TENURE_MONTHS.x                 
## Price_package            0.19***
plot(cor)

#creating training and test subset


library(caTools)
set.seed(101)

split = sample.split(df1$churn, SplitRatio = 0.50)

final.train = subset(df1, split == TRUE)
final.test = subset(df1, split == FALSE)

summary(final.train)
##  Price_package.x   mobile_new.x       contract_start.x   
##  Min.   :   0.0   Min.   :0.0000094   Min.   :1.017e+09  
##  1st Qu.:   0.0   1st Qu.:0.2738808   1st Qu.:1.417e+09  
##  Median : 499.0   Median :0.5019332   Median :1.450e+09  
##  Mean   : 469.8   Mean   :0.5019495   Mean   :1.430e+09  
##  3rd Qu.: 798.0   3rd Qu.:0.7278121   3rd Qu.:1.472e+09  
##  Max.   :1798.0   Max.   :0.9999836   Max.   :1.491e+09  
##  tech_tickets_last_3_months_final.x TENURE_MONTHS.x  churn    
##  four and more:  203                Min.   :  1.00   0:36600  
##  one_ticket   : 4478                1st Qu.: 23.00   1: 1278  
##  three_ticket :  513                Median : 73.00            
##  two_ticket   : 2419                Mean   : 73.31            
##  zero         :30265                3rd Qu.:117.00            
##                                     Max.   :214.00            
##  Subscriber status service type     Service description Blizoo_Vip_tarifa
##  active:37552      1p: 7369     four services :26060    Blizoo:19351     
##  ctv   :  326      2p: 4258     one service   : 2508    Vip   :18527     
##                    3p:26251     three services: 2603                     
##                                 two services  : 6707                     
##                                                                          
##                                                                          
##     network type   Price_package           ADX_TT_count   CPE old-new
##  docsis2.0:11056   Min.   : 200.0   one_adx      : 2747   1: 2519    
##  docsis3.0:17797   1st Qu.: 470.7   thre and more:  208   2:27120    
##  lan      : 3985   Median : 499.0   two_adx      :  565   3: 8239    
##  TV       : 5040   Mean   : 634.4   zero         :34358              
##                    3rd Qu.: 798.0                                    
##                    Max.   :1798.0
summary(final.test)
##  Price_package.x  mobile_new.x      contract_start.x   
##  Min.   :   0    Min.   :0.000009   Min.   :1.014e+09  
##  1st Qu.:   0    1st Qu.:0.277662   1st Qu.:1.417e+09  
##  Median : 499    Median :0.501933   Median :1.450e+09  
##  Mean   : 466    Mean   :0.501956   Mean   :1.431e+09  
##  3rd Qu.: 749    3rd Qu.:0.726951   3rd Qu.:1.473e+09  
##  Max.   :1800    Max.   :0.999945   Max.   :1.491e+09  
##  tech_tickets_last_3_months_final.x TENURE_MONTHS.x  churn    
##  four and more:  212                Min.   :  1.00   0:36601  
##  one_ticket   : 4474                1st Qu.: 22.00   1: 1279  
##  three_ticket :  540                Median : 72.88            
##  two_ticket   : 2384                Mean   : 72.44            
##  zero         :30270                3rd Qu.:116.00            
##                                     Max.   :214.00            
##  Subscriber status service type     Service description Blizoo_Vip_tarifa
##  active:37551      1p: 7305     four services :26068    Blizoo:19093     
##  ctv   :  329      2p: 4319     one service   : 2452    Vip   :18787     
##                    3p:26256     three services: 2598                     
##                                 two services  : 6762                     
##                                                                          
##                                                                          
##     network type   Price_package           ADX_TT_count   CPE old-new
##  docsis2.0:10825   Min.   : 200.0   one_adx      : 2685   1: 2503    
##  docsis3.0:18038   1st Qu.: 470.7   thre and more:  220   2:27414    
##  lan      : 3972   Median : 499.0   two_adx      :  590   3: 7963    
##  TV       : 5045   Mean   : 632.3   zero         :34385              
##                    3rd Qu.: 749.0                                    
##                    Max.   :1800.0
#logit model with all predictor. We see that only "subscriber status", "Tenure months", "old CPE"
#are significant preditcors. 
str(df1)
## 'data.frame':    75758 obs. of  14 variables:
##  $ Price_package.x                   : num  1199 0 691 1199 0 ...
##  $ mobile_new.x                      : num  0.67 0.899 0.94 0.264 0.554 ...
##  $ contract_start.x                  : num  1.45e+09 1.49e+09 1.45e+09 1.45e+09 1.48e+09 ...
##  $ tech_tickets_last_3_months_final.x: Factor w/ 5 levels "four and more",..: 2 5 5 5 5 5 5 2 5 5 ...
##  $ TENURE_MONTHS.x                   : num  110 110 110 110 110 128 110 110 110 110 ...
##  $ churn                             : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 2 ...
##  $ Subscriber status                 : Factor w/ 2 levels "active","ctv": 1 1 1 1 1 1 1 1 1 1 ...
##  $ service type                      : Factor w/ 3 levels "1p","2p","3p": 3 3 3 3 1 3 3 3 3 3 ...
##  $ Service description               : Factor w/ 4 levels "four services",..: 1 1 1 1 2 1 1 1 1 1 ...
##  $ Blizoo_Vip_tarifa                 : Factor w/ 2 levels "Blizoo","Vip": 1 2 2 1 2 2 1 1 2 1 ...
##  $ network type                      : Factor w/ 4 levels "docsis2.0","docsis3.0",..: 2 2 2 2 2 2 2 2 2 1 ...
##  $ Price_package                     : num  1199 471 691 1199 471 ...
##  $ ADX_TT_count                      : Factor w/ 4 levels "one_adx","thre and more",..: 1 4 4 4 4 4 4 4 4 4 ...
##  $ CPE old-new                       : Factor w/ 3 levels "1","2","3": 2 2 2 2 2 2 2 2 2 2 ...
missmap(df1)

logit <- glm(formula = churn ~ ., family = binomial(logit), data = df1)
summary(logit)
## 
## Call:
## glm(formula = churn ~ ., family = binomial(logit), data = df1)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.4888  -0.2941  -0.2173  -0.1678   3.2595  
## 
## Coefficients:
##                                                  Estimate Std. Error
## (Intercept)                                    -9.829e-01  7.402e-01
## Price_package.x                                 2.293e-04  1.715e-04
## mobile_new.x                                   -2.144e-02  7.625e-02
## contract_start.x                               -1.532e-09  4.102e-10
## tech_tickets_last_3_months_final.xone_ticket   -2.108e-01  4.124e-01
## tech_tickets_last_3_months_final.xthree_ticket -6.786e-01  4.509e-01
## tech_tickets_last_3_months_final.xtwo_ticket   -5.116e-01  4.207e-01
## tech_tickets_last_3_months_final.xzero         -9.381e-02  4.084e-01
## TENURE_MONTHS.x                                -8.558e-03  5.381e-04
## `Subscriber status`ctv                          2.799e+00  9.130e-02
## `service type`2p                                1.324e-01  1.836e-01
## `service type`3p                                5.063e-01  3.138e-01
## `Service description`one service                4.974e-01  3.125e-01
## `Service description`three services             3.861e-01  2.422e-01
## `Service description`two services               5.254e-01  2.709e-01
## Blizoo_Vip_tarifaVip                           -4.425e-01  1.102e-01
## `network type`docsis3.0                        -5.783e-03  7.213e-02
## `network type`lan                               1.034e-03  8.244e-02
## `network type`TV                                7.640e-01  1.688e-01
## Price_package                                   8.463e-04  2.038e-04
## ADX_TT_countthre and more                      -2.848e-01  4.605e-01
## ADX_TT_counttwo_adx                             2.163e-03  2.246e-01
## ADX_TT_countzero                               -3.275e-01  1.192e-01
## `CPE old-new`2                                 -5.375e-01  9.670e-02
## `CPE old-new`3                                 -2.860e-01  8.623e-02
##                                                z value Pr(>|z|)    
## (Intercept)                                     -1.328 0.184251    
## Price_package.x                                  1.337 0.181140    
## mobile_new.x                                    -0.281 0.778608    
## contract_start.x                                -3.736 0.000187 ***
## tech_tickets_last_3_months_final.xone_ticket    -0.511 0.609220    
## tech_tickets_last_3_months_final.xthree_ticket  -1.505 0.132312    
## tech_tickets_last_3_months_final.xtwo_ticket    -1.216 0.223947    
## tech_tickets_last_3_months_final.xzero          -0.230 0.818315    
## TENURE_MONTHS.x                                -15.904  < 2e-16 ***
## `Subscriber status`ctv                          30.663  < 2e-16 ***
## `service type`2p                                 0.721 0.470708    
## `service type`3p                                 1.614 0.106627    
## `Service description`one service                 1.592 0.111495    
## `Service description`three services              1.594 0.110988    
## `Service description`two services                1.940 0.052432 .  
## Blizoo_Vip_tarifaVip                            -4.015 5.94e-05 ***
## `network type`docsis3.0                         -0.080 0.936099    
## `network type`lan                                0.013 0.989994    
## `network type`TV                                 4.527 5.99e-06 ***
## Price_package                                    4.152 3.30e-05 ***
## ADX_TT_countthre and more                       -0.618 0.536278    
## ADX_TT_counttwo_adx                              0.010 0.992314    
## ADX_TT_countzero                                -2.747 0.006017 ** 
## `CPE old-new`2                                  -5.559 2.71e-08 ***
## `CPE old-new`3                                  -3.316 0.000912 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 22357  on 75757  degrees of freedom
## Residual deviance: 20634  on 75733  degrees of freedom
## AIC: 20684
## 
## Number of Fisher Scoring iterations: 7
#We run a stepwise algorithm which puts in model predictors one by one 
#and calculates AIC (AIC = 2k - 2ln(L), L is maximum of maximum likelihood funcion and k is the number
#of predictors. Smallest the AIC it is better - with fewer predictors we have more information and 
#better prediction.
#Now we left only important predictors.

logit1 <- step(logit)
## Start:  AIC=20684.06
## churn ~ Price_package.x + mobile_new.x + contract_start.x + tech_tickets_last_3_months_final.x + 
##     TENURE_MONTHS.x + `Subscriber status` + `service type` + 
##     `Service description` + Blizoo_Vip_tarifa + `network type` + 
##     Price_package + ADX_TT_count + `CPE old-new`
## 
##                                      Df Deviance   AIC
## - `Service description`               3    20638 20682
## - mobile_new.x                        1    20634 20682
## - `service type`                      2    20637 20683
## - Price_package.x                     1    20636 20684
## <none>                                     20634 20684
## - ADX_TT_count                        3    20643 20687
## - tech_tickets_last_3_months_final.x  4    20653 20695
## - contract_start.x                    1    20648 20696
## - Blizoo_Vip_tarifa                   1    20651 20699
## - Price_package                       1    20652 20700
## - `network type`                      3    20656 20700
## - `CPE old-new`                       2    20668 20714
## - TENURE_MONTHS.x                     1    20910 20958
## - `Subscriber status`                 1    21310 21358
## 
## Step:  AIC=20681.76
## churn ~ Price_package.x + mobile_new.x + contract_start.x + tech_tickets_last_3_months_final.x + 
##     TENURE_MONTHS.x + `Subscriber status` + `service type` + 
##     Blizoo_Vip_tarifa + `network type` + Price_package + ADX_TT_count + 
##     `CPE old-new`
## 
##                                      Df Deviance   AIC
## - `service type`                      2    20639 20679
## - mobile_new.x                        1    20638 20680
## - Price_package.x                     1    20640 20682
## <none>                                     20638 20682
## - ADX_TT_count                        3    20647 20685
## - contract_start.x                    1    20650 20692
## - tech_tickets_last_3_months_final.x  4    20657 20693
## - Price_package                       1    20654 20696
## - Blizoo_Vip_tarifa                   1    20656 20698
## - `CPE old-new`                       2    20675 20715
## - `network type`                      3    20703 20741
## - TENURE_MONTHS.x                     1    20917 20959
## - `Subscriber status`                 1    21314 21356
## 
## Step:  AIC=20678.72
## churn ~ Price_package.x + mobile_new.x + contract_start.x + tech_tickets_last_3_months_final.x + 
##     TENURE_MONTHS.x + `Subscriber status` + Blizoo_Vip_tarifa + 
##     `network type` + Price_package + ADX_TT_count + `CPE old-new`
## 
##                                      Df Deviance   AIC
## - mobile_new.x                        1    20639 20677
## - Price_package.x                     1    20641 20679
## <none>                                     20639 20679
## - ADX_TT_count                        3    20647 20681
## - tech_tickets_last_3_months_final.x  4    20657 20689
## - contract_start.x                    1    20653 20691
## - Price_package                       1    20654 20692
## - Blizoo_Vip_tarifa                   1    20659 20697
## - `CPE old-new`                       2    20677 20713
## - `network type`                      3    20756 20790
## - TENURE_MONTHS.x                     1    20919 20957
## - `Subscriber status`                 1    21317 21355
## 
## Step:  AIC=20676.79
## churn ~ Price_package.x + contract_start.x + tech_tickets_last_3_months_final.x + 
##     TENURE_MONTHS.x + `Subscriber status` + Blizoo_Vip_tarifa + 
##     `network type` + Price_package + ADX_TT_count + `CPE old-new`
## 
##                                      Df Deviance   AIC
## - Price_package.x                     1    20641 20677
## <none>                                     20639 20677
## - ADX_TT_count                        3    20647 20679
## - tech_tickets_last_3_months_final.x  4    20657 20687
## - contract_start.x                    1    20653 20689
## - Price_package                       1    20654 20690
## - Blizoo_Vip_tarifa                   1    20659 20695
## - `CPE old-new`                       2    20677 20711
## - `network type`                      3    20756 20788
## - TENURE_MONTHS.x                     1    20919 20955
## - `Subscriber status`                 1    21317 21353
## 
## Step:  AIC=20676.69
## churn ~ contract_start.x + tech_tickets_last_3_months_final.x + 
##     TENURE_MONTHS.x + `Subscriber status` + Blizoo_Vip_tarifa + 
##     `network type` + Price_package + ADX_TT_count + `CPE old-new`
## 
##                                      Df Deviance   AIC
## <none>                                     20641 20677
## - ADX_TT_count                        3    20649 20679
## - tech_tickets_last_3_months_final.x  4    20659 20687
## - contract_start.x                    1    20656 20690
## - `CPE old-new`                       2    20678 20710
## - Blizoo_Vip_tarifa                   1    20680 20714
## - Price_package                       1    20719 20753
## - `network type`                      3    20757 20787
## - TENURE_MONTHS.x                     1    20919 20953
## - `Subscriber status`                 1    21317 21351
summary(logit1)
## 
## Call:
## glm(formula = churn ~ contract_start.x + tech_tickets_last_3_months_final.x + 
##     TENURE_MONTHS.x + `Subscriber status` + Blizoo_Vip_tarifa + 
##     `network type` + Price_package + ADX_TT_count + `CPE old-new`, 
##     family = binomial(logit), data = df1)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.5037  -0.2949  -0.2138  -0.1670   3.2440  
## 
## Coefficients:
##                                                  Estimate Std. Error
## (Intercept)                                    -4.552e-01  6.806e-01
## contract_start.x                               -1.516e-09  3.854e-10
## tech_tickets_last_3_months_final.xone_ticket   -2.084e-01  4.120e-01
## tech_tickets_last_3_months_final.xthree_ticket -6.746e-01  4.507e-01
## tech_tickets_last_3_months_final.xtwo_ticket   -5.067e-01  4.203e-01
## tech_tickets_last_3_months_final.xzero         -8.984e-02  4.080e-01
## TENURE_MONTHS.x                                -8.532e-03  5.342e-04
## `Subscriber status`ctv                          2.793e+00  9.078e-02
## Blizoo_Vip_tarifaVip                           -5.547e-01  8.973e-02
## `network type`docsis3.0                        -8.156e-03  7.142e-02
## `network type`lan                              -1.692e-03  8.190e-02
## `network type`TV                                7.277e-01  7.105e-02
## Price_package                                   1.014e-03  1.138e-04
## ADX_TT_countthre and more                      -2.863e-01  4.601e-01
## ADX_TT_counttwo_adx                             1.868e-03  2.245e-01
## ADX_TT_countzero                               -3.221e-01  1.191e-01
## `CPE old-new`2                                 -5.307e-01  9.230e-02
## `CPE old-new`3                                 -2.647e-01  8.219e-02
##                                                z value Pr(>|z|)    
## (Intercept)                                     -0.669  0.50358    
## contract_start.x                                -3.932 8.41e-05 ***
## tech_tickets_last_3_months_final.xone_ticket    -0.506  0.61301    
## tech_tickets_last_3_months_final.xthree_ticket  -1.497  0.13443    
## tech_tickets_last_3_months_final.xtwo_ticket    -1.206  0.22797    
## tech_tickets_last_3_months_final.xzero          -0.220  0.82570    
## TENURE_MONTHS.x                                -15.973  < 2e-16 ***
## `Subscriber status`ctv                          30.762  < 2e-16 ***
## Blizoo_Vip_tarifaVip                            -6.183 6.31e-10 ***
## `network type`docsis3.0                         -0.114  0.90908    
## `network type`lan                               -0.021  0.98351    
## `network type`TV                                10.242  < 2e-16 ***
## Price_package                                    8.916  < 2e-16 ***
## ADX_TT_countthre and more                       -0.622  0.53378    
## ADX_TT_counttwo_adx                              0.008  0.99336    
## ADX_TT_countzero                                -2.705  0.00683 ** 
## `CPE old-new`2                                  -5.749 8.95e-09 ***
## `CPE old-new`3                                  -3.221  0.00128 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 22357  on 75757  degrees of freedom
## Residual deviance: 20641  on 75740  degrees of freedom
## AIC: 20677
## 
## Number of Fisher Scoring iterations: 7
capture.output(summary(logit1), file = "logit_without_smotes.xls")


#Now we will do exponentaion of every coefficiten to conclude the odd ratio for every predictor.
#This shows how odd ratio changes when predictor change for one unite while holding other 
#predictors constant. If exp(b)=2 that means that when this predictor changes
#for unite there is twice bigger chance of churn. We do this on step logit model
#with the best predictors.

exp(cbind(OR = coef(logit1), confint(logit1)))
## Waiting for profiling to be done...
##                                                        OR      2.5 %
## (Intercept)                                     0.6343135  0.1629550
## contract_start.x                                1.0000000  1.0000000
## tech_tickets_last_3_months_final.xone_ticket    0.8118924  0.3813783
## tech_tickets_last_3_months_final.xthree_ticket  0.5093796  0.2160549
## tech_tickets_last_3_months_final.xtwo_ticket    0.6024860  0.2777721
## tech_tickets_last_3_months_final.xzero          0.9140777  0.4331511
## TENURE_MONTHS.x                                 0.9915040  0.9904604
## `Subscriber status`ctv                         16.3217736 13.6444959
## Blizoo_Vip_tarifaVip                            0.5742236  0.4813648
## `network type`docsis3.0                         0.9918770  0.8623676
## `network type`lan                               0.9983089  0.8488594
## `network type`TV                                2.0703426  1.8005992
## Price_package                                   1.0010147  1.0007911
## ADX_TT_countthre and more                       0.7510533  0.2906648
## ADX_TT_counttwo_adx                             1.0018695  0.6340033
## ADX_TT_countzero                                0.7246200  0.5752190
## `CPE old-new`2                                  0.5882139  0.4910162
## `CPE old-new`3                                  0.7674379  0.6534652
##                                                    97.5 %
## (Intercept)                                     2.3524299
## contract_start.x                                1.0000000
## tech_tickets_last_3_months_final.xone_ticket    1.9210408
## tech_tickets_last_3_months_final.xthree_ticket  1.2737550
## tech_tickets_last_3_months_final.xtwo_ticket    1.4461157
## tech_tickets_last_3_months_final.xzero          2.1470236
## TENURE_MONTHS.x                                 0.9925366
## `Subscriber status`ctv                         19.4791030
## Blizoo_Vip_tarifaVip                            0.6842770
## `network type`docsis3.0                         1.1410350
## `network type`lan                               1.1703165
## `network type`TV                                2.3789701
## Price_package                                   1.0012374
## ADX_TT_countthre and more                       1.7729407
## ADX_TT_counttwo_adx                             1.5329786
## ADX_TT_countzero                                0.9176355
## `CPE old-new`2                                  0.7050968
## `CPE old-new`3                                  0.9019437
PseudoR2(logit1, "Nagelkerke")
## Nagelkerke 
## 0.08763531
#pravljenje confusion matrice

test.predicted.income = predict(logit1, newdata = final.test, type = "response" )
table(final.test$churn, test.predicted.income > 0.5)
##    
##     FALSE  TRUE
##   0 36572    29
##   1  1260    19
tab <- table(final.test$churn, test.predicted.income > 0.5)
tab
##    
##     FALSE  TRUE
##   0 36572    29
##   1  1260    19
#accuracy rate
acc <- sum(diag(tab))/sum(tab)
acc
## [1] 0.9659715
#mislasification rate

mfc <- 1-acc

mfc
## [1] 0.03402851
# (true positive/true positive + false positive) - precision
15/(15+30)
## [1] 0.3333333
#(true negative/true negative + false negative) - recall
36571/(36571+1264)
## [1] 0.9665918
#Here we just wanted to check the best of cut off value.



optCutOff <- optimalCutoff(final.test$churn, test.predicted.income)
optCutOff
## [1] 0.603401
plotROC(final.test$churn, test.predicted.income)

#Now we will try to balance the data through SMOTES algorithm. Perc.over is 200 - that means that
#we oversample the churns and perc.under is 150 that means that we undersample the no churns.
#k is the number of the nearest neighbours that are taken as the basis for sampling.

split <- sample.split(df1$churn, SplitRatio = 0.50)

traindf <- subset(df1, split == TRUE)
testdf <- subset(df1, split == FALSE)

as.data.frame(table(traindf$churn))
##   Var1  Freq
## 1    0 36600
## 2    1  1278
str(traindf)
## 'data.frame':    37878 obs. of  14 variables:
##  $ Price_package.x                   : num  1199 1199 850 1199 0 ...
##  $ mobile_new.x                      : num  0.6697 0.2639 0.0943 0.9522 0.9271 ...
##  $ contract_start.x                  : num  1.45e+09 1.45e+09 1.41e+09 1.43e+09 1.49e+09 ...
##  $ tech_tickets_last_3_months_final.x: Factor w/ 5 levels "four and more",..: 2 5 5 2 5 5 5 5 5 5 ...
##  $ TENURE_MONTHS.x                   : num  110 110 110 110 110 110 110 110 110 110 ...
##  $ churn                             : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
##  $ Subscriber status                 : Factor w/ 2 levels "active","ctv": 1 1 1 1 1 1 1 1 1 1 ...
##  $ service type                      : Factor w/ 3 levels "1p","2p","3p": 3 3 3 3 3 1 3 3 3 3 ...
##  $ Service description               : Factor w/ 4 levels "four services",..: 1 1 1 1 1 4 1 1 1 1 ...
##  $ Blizoo_Vip_tarifa                 : Factor w/ 2 levels "Blizoo","Vip": 1 1 1 1 2 1 2 1 1 1 ...
##  $ network type                      : Factor w/ 4 levels "docsis2.0","docsis3.0",..: 2 2 2 2 2 4 2 1 2 1 ...
##  $ Price_package                     : num  1199 1199 850 1199 471 ...
##  $ ADX_TT_count                      : Factor w/ 4 levels "one_adx","thre and more",..: 1 4 4 4 4 4 4 4 4 4 ...
##  $ CPE old-new                       : Factor w/ 3 levels "1","2","3": 2 2 2 2 2 2 2 2 2 2 ...
## Smote : Synthetic Minority Oversampling Technique To Handle Class Imbalancy In Binary Classification

balanced.data <- SMOTE(churn ~., traindf, perc.over = 200, k = 5, perc.under = 100)

as.data.frame(table(balanced.data$churn))
##   Var1 Freq
## 1    0 2556
## 2    1 3834
logit_balanced <- glm(formula = churn ~ ., family = binomial(logit), data = balanced.data)
summary(logit_balanced)
## 
## Call:
## glm(formula = churn ~ ., family = binomial(logit), data = balanced.data)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -3.1196  -0.8588   0.1583   0.8473   2.1406  
## 
## Coefficients:
##                                                  Estimate Std. Error
## (Intercept)                                    -3.187e-01  1.056e+00
## Price_package.x                                 8.907e-04  2.155e-04
## mobile_new.x                                   -1.522e-02  1.192e-01
## contract_start.x                                1.127e-09  6.566e-10
## tech_tickets_last_3_months_final.xone_ticket    1.513e+00  5.570e-01
## tech_tickets_last_3_months_final.xthree_ticket  1.359e+00  5.775e-01
## tech_tickets_last_3_months_final.xtwo_ticket    1.276e+00  5.598e-01
## tech_tickets_last_3_months_final.xzero          8.632e-01  5.507e-01
## TENURE_MONTHS.x                                -1.004e-02  7.815e-04
## `Subscriber status`ctv                          3.755e+00  2.375e-01
## `service type`2p                               -1.622e-02  1.399e-01
## `service type`3p                               -4.012e-01  1.132e-01
## `Service description`one service               -1.039e-01  1.509e-01
## `Service description`three services             5.311e-01  1.528e-01
## `Service description`two services               4.751e-01  1.073e-01
## Blizoo_Vip_tarifaVip                            1.781e-01  9.576e-02
## `network type`docsis3.0                        -2.602e-01  9.206e-02
## `network type`lan                              -3.690e-01  1.237e-01
## `network type`TV                                5.976e-01  1.208e-01
## Price_package                                   3.112e-04  2.917e-04
## ADX_TT_countthre and more                       3.147e-01  4.150e-01
## ADX_TT_counttwo_adx                            -1.310e+00  2.643e-01
## ADX_TT_countzero                               -1.580e+00  1.024e-01
## `CPE old-new`2                                 -1.055e+00  1.333e-01
## `CPE old-new`3                                 -6.883e-01  1.330e-01
##                                                z value Pr(>|z|)    
## (Intercept)                                     -0.302 0.762743    
## Price_package.x                                  4.132 3.59e-05 ***
## mobile_new.x                                    -0.128 0.898385    
## contract_start.x                                 1.716 0.086138 .  
## tech_tickets_last_3_months_final.xone_ticket     2.717 0.006585 ** 
## tech_tickets_last_3_months_final.xthree_ticket   2.353 0.018599 *  
## tech_tickets_last_3_months_final.xtwo_ticket     2.279 0.022674 *  
## tech_tickets_last_3_months_final.xzero           1.568 0.116970    
## TENURE_MONTHS.x                                -12.851  < 2e-16 ***
## `Subscriber status`ctv                          15.812  < 2e-16 ***
## `service type`2p                                -0.116 0.907701    
## `service type`3p                                -3.544 0.000394 ***
## `Service description`one service                -0.689 0.491079    
## `Service description`three services              3.477 0.000508 ***
## `Service description`two services                4.427 9.55e-06 ***
## Blizoo_Vip_tarifaVip                             1.860 0.062860 .  
## `network type`docsis3.0                         -2.827 0.004701 ** 
## `network type`lan                               -2.982 0.002862 ** 
## `network type`TV                                 4.947 7.55e-07 ***
## Price_package                                    1.067 0.285948    
## ADX_TT_countthre and more                        0.758 0.448243    
## ADX_TT_counttwo_adx                             -4.957 7.15e-07 ***
## ADX_TT_countzero                               -15.431  < 2e-16 ***
## `CPE old-new`2                                  -7.918 2.41e-15 ***
## `CPE old-new`3                                  -5.177 2.25e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 8601.1  on 6389  degrees of freedom
## Residual deviance: 6260.1  on 6365  degrees of freedom
## AIC: 6310.1
## 
## Number of Fisher Scoring iterations: 6
logit1_balanced <- step(logit_balanced)
## Start:  AIC=6310.07
## churn ~ Price_package.x + mobile_new.x + contract_start.x + tech_tickets_last_3_months_final.x + 
##     TENURE_MONTHS.x + `Subscriber status` + `service type` + 
##     `Service description` + Blizoo_Vip_tarifa + `network type` + 
##     Price_package + ADX_TT_count + `CPE old-new`
## 
##                                      Df Deviance    AIC
## - mobile_new.x                        1   6260.1 6308.1
## - Price_package                       1   6261.2 6309.2
## <none>                                    6260.1 6310.1
## - contract_start.x                    1   6263.0 6311.0
## - Blizoo_Vip_tarifa                   1   6263.5 6311.5
## - `service type`                      2   6276.5 6322.5
## - Price_package.x                     1   6277.1 6325.1
## - `Service description`               3   6290.7 6334.7
## - `network type`                      3   6321.5 6365.5
## - tech_tickets_last_3_months_final.x  4   6328.3 6370.3
## - `CPE old-new`                       2   6331.0 6377.0
## - TENURE_MONTHS.x                     1   6434.2 6482.2
## - ADX_TT_count                        3   6562.3 6606.3
## - `Subscriber status`                 1   6992.7 7040.7
## 
## Step:  AIC=6308.09
## churn ~ Price_package.x + contract_start.x + tech_tickets_last_3_months_final.x + 
##     TENURE_MONTHS.x + `Subscriber status` + `service type` + 
##     `Service description` + Blizoo_Vip_tarifa + `network type` + 
##     Price_package + ADX_TT_count + `CPE old-new`
## 
##                                      Df Deviance    AIC
## - Price_package                       1   6261.2 6307.2
## <none>                                    6260.1 6308.1
## - contract_start.x                    1   6263.0 6309.0
## - Blizoo_Vip_tarifa                   1   6263.6 6309.6
## - `service type`                      2   6276.6 6320.6
## - Price_package.x                     1   6277.1 6323.1
## - `Service description`               3   6290.7 6332.7
## - `network type`                      3   6321.6 6363.6
## - tech_tickets_last_3_months_final.x  4   6328.4 6368.4
## - `CPE old-new`                       2   6331.2 6375.2
## - TENURE_MONTHS.x                     1   6434.2 6480.2
## - ADX_TT_count                        3   6562.4 6604.4
## - `Subscriber status`                 1   6992.8 7038.8
## 
## Step:  AIC=6307.24
## churn ~ Price_package.x + contract_start.x + tech_tickets_last_3_months_final.x + 
##     TENURE_MONTHS.x + `Subscriber status` + `service type` + 
##     `Service description` + Blizoo_Vip_tarifa + `network type` + 
##     ADX_TT_count + `CPE old-new`
## 
##                                      Df Deviance    AIC
## <none>                                    6261.2 6307.2
## - Blizoo_Vip_tarifa                   1   6264.7 6308.7
## - contract_start.x                    1   6265.0 6309.0
## - `service type`                      2   6277.0 6319.0
## - `Service description`               3   6291.3 6331.3
## - `network type`                      3   6321.7 6361.7
## - tech_tickets_last_3_months_final.x  4   6329.6 6367.6
## - `CPE old-new`                       2   6331.2 6373.2
## - Price_package.x                     1   6345.6 6389.6
## - TENURE_MONTHS.x                     1   6438.4 6482.4
## - ADX_TT_count                        3   6562.5 6602.5
## - `Subscriber status`                 1   6993.2 7037.2
summary(logit1_balanced)
## 
## Call:
## glm(formula = churn ~ Price_package.x + contract_start.x + tech_tickets_last_3_months_final.x + 
##     TENURE_MONTHS.x + `Subscriber status` + `service type` + 
##     `Service description` + Blizoo_Vip_tarifa + `network type` + 
##     ADX_TT_count + `CPE old-new`, family = binomial(logit), data = balanced.data)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -3.1105  -0.8537   0.1586   0.8484   2.1489  
## 
## Coefficients:
##                                                  Estimate Std. Error
## (Intercept)                                    -3.993e-01  1.051e+00
## Price_package.x                                 1.082e-03  1.195e-04
## contract_start.x                                1.256e-09  6.455e-10
## tech_tickets_last_3_months_final.xone_ticket    1.491e+00  5.556e-01
## tech_tickets_last_3_months_final.xthree_ticket  1.327e+00  5.756e-01
## tech_tickets_last_3_months_final.xtwo_ticket    1.262e+00  5.588e-01
## tech_tickets_last_3_months_final.xzero          8.410e-01  5.493e-01
## TENURE_MONTHS.x                                -1.011e-02  7.797e-04
## `Subscriber status`ctv                          3.747e+00  2.373e-01
## `service type`2p                               -2.033e-02  1.398e-01
## `service type`3p                               -3.912e-01  1.127e-01
## `Service description`one service               -1.120e-01  1.508e-01
## `Service description`three services             5.283e-01  1.527e-01
## `Service description`two services               4.639e-01  1.067e-01
## Blizoo_Vip_tarifaVip                            1.771e-01  9.553e-02
## `network type`docsis3.0                        -2.570e-01  9.196e-02
## `network type`lan                              -3.727e-01  1.236e-01
## `network type`TV                                5.860e-01  1.202e-01
## ADX_TT_countthre and more                       3.015e-01  4.150e-01
## ADX_TT_counttwo_adx                            -1.303e+00  2.640e-01
## ADX_TT_countzero                               -1.576e+00  1.022e-01
## `CPE old-new`2                                 -1.039e+00  1.322e-01
## `CPE old-new`3                                 -6.784e-01  1.326e-01
##                                                z value Pr(>|z|)    
## (Intercept)                                     -0.380 0.704125    
## Price_package.x                                  9.058  < 2e-16 ***
## contract_start.x                                 1.946 0.051624 .  
## tech_tickets_last_3_months_final.xone_ticket     2.684 0.007281 ** 
## tech_tickets_last_3_months_final.xthree_ticket   2.305 0.021152 *  
## tech_tickets_last_3_months_final.xtwo_ticket     2.259 0.023900 *  
## tech_tickets_last_3_months_final.xzero           1.531 0.125782    
## TENURE_MONTHS.x                                -12.962  < 2e-16 ***
## `Subscriber status`ctv                          15.787  < 2e-16 ***
## `service type`2p                                -0.145 0.884391    
## `service type`3p                                -3.473 0.000515 ***
## `Service description`one service                -0.743 0.457688    
## `Service description`three services              3.461 0.000539 ***
## `Service description`two services                4.349 1.37e-05 ***
## Blizoo_Vip_tarifaVip                             1.853 0.063820 .  
## `network type`docsis3.0                         -2.795 0.005191 ** 
## `network type`lan                               -3.016 0.002565 ** 
## `network type`TV                                 4.878 1.07e-06 ***
## ADX_TT_countthre and more                        0.727 0.467502    
## ADX_TT_counttwo_adx                             -4.934 8.04e-07 ***
## ADX_TT_countzero                               -15.416  < 2e-16 ***
## `CPE old-new`2                                  -7.858 3.90e-15 ***
## `CPE old-new`3                                  -5.117 3.10e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 8601.1  on 6389  degrees of freedom
## Residual deviance: 6261.2  on 6367  degrees of freedom
## AIC: 6307.2
## 
## Number of Fisher Scoring iterations: 6
PseudoR2(logit1_balanced, "Nagelkerke")
## Nagelkerke 
##  0.4145004
capture.output(summary(logit1_balanced), file = "logit_smotes.xls")
exp(cbind(OR = coef(logit1_balanced), confint(logit1_balanced)))
## Waiting for profiling to be done...
##                                                        OR       2.5 %
## (Intercept)                                     0.6707990  0.08448574
## Price_package.x                                 1.0010829  1.00084939
## contract_start.x                                1.0000000  1.00000000
## tech_tickets_last_3_months_final.xone_ticket    4.4421086  1.51917415
## tech_tickets_last_3_months_final.xthree_ticket  3.7695394  1.23862103
## tech_tickets_last_3_months_final.xtwo_ticket    3.5329331  1.20070996
## tech_tickets_last_3_months_final.xzero          2.3187387  0.80295457
## TENURE_MONTHS.x                                 0.9899440  0.98842434
## `Subscriber status`ctv                         42.3805708 27.39509963
## `service type`2p                                0.9798764  0.74511775
## `service type`3p                                0.6762185  0.54201152
## `Service description`one service                0.8940827  0.66520001
## `Service description`three services             1.6960222  1.25848468
## `Service description`two services               1.5902260  1.29048525
## Blizoo_Vip_tarifaVip                            1.1936927  0.99025361
## `network type`docsis3.0                         0.7733605  0.64571966
## `network type`lan                               0.6888417  0.54023880
## `network type`TV                                1.7968738  1.42040815
## ADX_TT_countthre and more                       1.3519508  0.62693105
## ADX_TT_counttwo_adx                             0.2718247  0.16201371
## ADX_TT_countzero                                0.2068306  0.16884691
## `CPE old-new`2                                  0.3539334  0.27244218
## `CPE old-new`3                                  0.5074185  0.39046982
##                                                    97.5 %
## (Intercept)                                     5.2301075
## Price_package.x                                 1.0013184
## contract_start.x                                1.0000000
## tech_tickets_last_3_months_final.xone_ticket   13.6283632
## tech_tickets_last_3_months_final.xthree_ticket 12.0031288
## tech_tickets_last_3_months_final.xtwo_ticket   10.9029528
## tech_tickets_last_3_months_final.xzero          7.0318295
## TENURE_MONTHS.x                                 0.9914507
## `Subscriber status`ctv                         69.8604823
## `service type`2p                                1.2891822
## `service type`3p                                0.8430646
## `Service description`one service                1.2014331
## `Service description`three services             2.2901396
## `Service description`two services               1.9607128
## Blizoo_Vip_tarifaVip                            1.4401729
## `network type`docsis3.0                         0.9260171
## `network type`lan                               0.8771942
## `network type`TV                                2.2752146
## ADX_TT_countthre and more                       3.2433400
## ADX_TT_counttwo_adx                             0.4571275
## ADX_TT_countzero                                0.2521164
## `CPE old-new`2                                  0.4575094
## `CPE old-new`3                                  0.6567256
logit1_odds_balanced <- exp(cbind(OR = coef(logit1), confint(logit1)))
## Waiting for profiling to be done...
capture.output(summary(logit1_odds_balanced), file = "logit1_odds.xls")



#pravljenje confusion matrice

test.predicted.income = predict(logit1_balanced, newdata = testdf, type = "response" )
table(testdf$churn, test.predicted.income > 0.5)
##    
##     FALSE  TRUE
##   0 24501 12100
##   1   596   683
tab <- table(testdf$churn, test.predicted.income > 0.5)
acc <- sum(diag(tab))/sum(tab)
acc
## [1] 0.6648363
#mislasification rate
mfc <- 1-acc
mfc
## [1] 0.3351637
plotROC(testdf$churn, test.predicted.income)

#Now we will deploy random forest model, without much tuning (default). We will use cross validation
#(deviding into 3 samples and checking wether the default random forest model has the same
#accuracy accross the samples in order to avoid overfitting)

tr <- caret::trainControl(method="cv", number=3, search = "random")
default_model <- caret::train(churn ~ ., data = traindf, method = 'rf', metric = 'Accuracy', trControl = tr)
## 
## Attaching package: 'caret'
## The following objects are masked from 'package:InformationValue':
## 
##     confusionMatrix, precision, sensitivity, specificity
## The following objects are masked from 'package:DescTools':
## 
##     MAE, RMSE
## The following object is masked from 'package:purrr':
## 
##     lift
print(default_model)
## Random Forest 
## 
## 37878 samples
##    13 predictor
##     2 classes: '0', '1' 
## 
## No pre-processing
## Resampling: Cross-Validated (3 fold) 
## Summary of sample sizes: 25252, 25252, 25252 
## Resampling results across tuning parameters:
## 
##   mtry  Accuracy   Kappa    
##    2    0.9673953  0.0711089
##   18    0.9662601  0.1634497
##   22    0.9658113  0.1600487
## 
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 2.
test.predicted.income = predict(default_model, newdata = testdf, type = "raw")
table(testdf$churn, test.predicted.income)
##    test.predicted.income
##         0     1
##   0 36597     4
##   1  1208    71
#We see that default model with mtry=5 has a very high accuracy, but Kappa is poor 
#(number of variable is randomly collected to be sampled at each split time.)

pred <- predict(default_model, newdata = testdf, type = "raw")
prob <- predict(default_model, newdata = testdf, type = "prob")
head(prob)
##    0 1
## 2  1 0
## 3  1 0
## 5  1 0
## 6  1 0
## 10 1 0
## 11 1 0
confusionMatrix(table(pred, testdf$churn))
## Confusion Matrix and Statistics
## 
##     
## pred     0     1
##    0 36597  1208
##    1     4    71
##                                           
##                Accuracy : 0.968           
##                  95% CI : (0.9662, 0.9698)
##     No Information Rate : 0.9662          
##     P-Value [Acc > NIR] : 0.0285          
##                                           
##                   Kappa : 0.1015          
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.99989         
##             Specificity : 0.05551         
##          Pos Pred Value : 0.96805         
##          Neg Pred Value : 0.94667         
##              Prevalence : 0.96624         
##          Detection Rate : 0.96613         
##    Detection Prevalence : 0.99802         
##       Balanced Accuracy : 0.52770         
##                                           
##        'Positive' Class : 0               
## 
table(pred, testdf$churn)
##     
## pred     0     1
##    0 36597  1208
##    1     4    71
prob <- predict(default_model, newdata = testdf, type = "prob")
forestpred = prediction(prob[,2], testdf$churn)
forestperf = performance(forestpred, "tpr", "fpr")
plot(forestperf, main="ROC", colorize=T)

auc_ROCR <- performance(forestpred, measure = "auc")
auc_ROCR <- auc_ROCR@y.values[[1]]
print(auc_ROCR)
## [1] 0.657174