library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.0 ──
## ✓ ggplot2 3.3.2 ✓ purrr 0.3.4
## ✓ tibble 3.0.4 ✓ dplyr 1.0.2
## ✓ tidyr 1.1.2 ✓ stringr 1.4.0
## ✓ readr 1.4.0 ✓ forcats 0.5.0
## Warning: package 'ggplot2' was built under R version 3.6.2
## Warning: package 'tibble' was built under R version 3.6.2
## Warning: package 'tidyr' was built under R version 3.6.2
## Warning: package 'readr' was built under R version 3.6.2
## Warning: package 'purrr' was built under R version 3.6.2
## Warning: package 'dplyr' was built under R version 3.6.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(gtools)
## Warning: package 'gtools' was built under R version 3.6.2
library(ggplot2)
library(caret)
## Loading required package: lattice
##
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
##
## lift
library(corrplot)
## corrplot 0.84 loaded
library(viridis)
## Loading required package: viridisLite
library(hrbrthemes)
## NOTE: Either Arial Narrow or Roboto Condensed fonts are required to use these themes.
## Please use hrbrthemes::import_roboto_condensed() to install Roboto Condensed and
## if Arial Narrow is not on your system, please see https://bit.ly/arialnarrow
train_data <- read.csv("churnTrain.csv")
test_data <- read.csv("churnTest.csv")
head(train_data)
## State Account_Length Area_Code Phone_No International_Plan Voice_Mail_Plan
## 1 KS 128 415 3824657 no yes
## 2 OH 107 415 3717191 no yes
## 3 NJ 137 415 3581921 no no
## 4 OH 84 408 3759999 yes no
## 5 OK 75 415 3306626 yes no
## 6 AL 118 510 3918027 yes no
## No_Vmail_Messages Total_Day_minutes Total_Day_Calls Total_Day_charge
## 1 25 265.1 110 45.07
## 2 26 161.6 123 27.47
## 3 0 243.4 114 41.38
## 4 0 299.4 71 50.90
## 5 0 166.7 113 28.34
## 6 0 223.4 98 37.98
## Total_Eve_Minutes Total_Eve_Calls Total_Eve_Charge Total_Night_Minutes
## 1 197.4 99 16.78 244.7
## 2 195.5 103 16.62 254.4
## 3 121.2 110 10.30 162.6
## 4 61.9 88 5.26 196.9
## 5 148.3 122 12.61 186.9
## 6 220.6 101 18.75 203.9
## Total_Night_Calls Total_Night_Charge Total_Intl_Minutes Total_Intl_Calls
## 1 91 11.01 10.0 3
## 2 103 11.45 13.7 3
## 3 104 7.32 12.2 5
## 4 89 8.86 6.6 7
## 5 121 8.41 10.1 3
## 6 118 9.18 6.3 6
## Total_Intl_Charge No_CS_Calls Churn
## 1 2.70 1 FALSE
## 2 3.70 1 FALSE
## 3 3.29 0 FALSE
## 4 1.78 2 FALSE
## 5 2.73 3 FALSE
## 6 1.70 0 FALSE
head(test_data)
## State Account_Length Area_Code Phone_No International_Plan Voice_Mail_Plan
## 1 HI 101 510 3548815 no no
## 2 MT 137 510 3817211 no no
## 3 OH 103 408 4119481 no yes
## 4 NM 99 415 4189100 no no
## 5 SC 108 415 4133643 no no
## 6 IA 117 415 3756180 no no
## No_Vmail_Messages Total_Day_minutes Total_Day_Calls Total_Day_charge
## 1 0 70.9 123 12.05
## 2 0 223.6 86 38.01
## 3 29 294.7 95 50.10
## 4 0 216.8 123 36.86
## 5 0 197.4 78 33.56
## 6 0 226.5 85 38.51
## Total_Eve_Minutes Total_Eve_Calls Total_Eve_Charge Total_Night_Minutes
## 1 211.9 73 18.01 236.0
## 2 244.8 139 20.81 94.2
## 3 237.3 105 20.17 300.3
## 4 126.4 88 10.74 220.6
## 5 124.0 101 10.54 204.5
## 6 141.6 68 12.04 223.0
## Total_Night_Calls Total_Night_Charge Total_Intl_Minutes Total_Intl_Calls
## 1 73 10.62 10.6 3
## 2 81 4.24 9.5 7
## 3 127 13.51 13.7 6
## 4 82 9.93 15.7 2
## 5 107 9.20 7.7 4
## 6 90 10.04 6.9 5
## Total_Intl_Charge No_CS_Calls
## 1 2.86 3
## 2 2.57 0
## 3 3.70 1
## 4 4.24 1
## 5 2.08 2
## 6 1.86 1
df <- smartbind(train_data, test_data)
# view the first 6 rows of the train_data
head(df)
## State Account_Length Area_Code Phone_No International_Plan Voice_Mail_Plan
## 1:1 KS 128 415 3824657 no yes
## 1:2 OH 107 415 3717191 no yes
## 1:3 NJ 137 415 3581921 no no
## 1:4 OH 84 408 3759999 yes no
## 1:5 OK 75 415 3306626 yes no
## 1:6 AL 118 510 3918027 yes no
## No_Vmail_Messages Total_Day_minutes Total_Day_Calls Total_Day_charge
## 1:1 25 265.1 110 45.07
## 1:2 26 161.6 123 27.47
## 1:3 0 243.4 114 41.38
## 1:4 0 299.4 71 50.90
## 1:5 0 166.7 113 28.34
## 1:6 0 223.4 98 37.98
## Total_Eve_Minutes Total_Eve_Calls Total_Eve_Charge Total_Night_Minutes
## 1:1 197.4 99 16.78 244.7
## 1:2 195.5 103 16.62 254.4
## 1:3 121.2 110 10.30 162.6
## 1:4 61.9 88 5.26 196.9
## 1:5 148.3 122 12.61 186.9
## 1:6 220.6 101 18.75 203.9
## Total_Night_Calls Total_Night_Charge Total_Intl_Minutes Total_Intl_Calls
## 1:1 91 11.01 10.0 3
## 1:2 103 11.45 13.7 3
## 1:3 104 7.32 12.2 5
## 1:4 89 8.86 6.6 7
## 1:5 121 8.41 10.1 3
## 1:6 118 9.18 6.3 6
## Total_Intl_Charge No_CS_Calls Churn
## 1:1 2.70 1 FALSE
## 1:2 3.70 1 FALSE
## 1:3 3.29 0 FALSE
## 1:4 1.78 2 FALSE
## 1:5 2.73 3 FALSE
## 1:6 1.70 0 FALSE
# view the dimension of the df
dim(df)
## [1] 3758 21
# view the structure of the df
str(df)
## 'data.frame': 3758 obs. of 21 variables:
## $ State : Factor w/ 51 levels "AK","AL","AR",..: 17 36 32 36 37 2 20 25 19 50 ...
## $ Account_Length : int 128 107 137 84 75 118 121 147 117 141 ...
## $ Area_Code : int 415 415 415 408 415 510 510 415 408 415 ...
## $ Phone_No : int 3824657 3717191 3581921 3759999 3306626 3918027 3559993 3299001 3354719 3308173 ...
## $ International_Plan : Factor w/ 3 levels ""," no"," yes": 2 2 2 3 3 3 2 3 2 3 ...
## $ Voice_Mail_Plan : Factor w/ 3 levels ""," no"," yes": 3 3 2 2 2 2 3 2 2 3 ...
## $ No_Vmail_Messages : int 25 26 0 0 0 0 24 0 0 37 ...
## $ Total_Day_minutes : num 265 162 243 299 167 ...
## $ Total_Day_Calls : int 110 123 114 71 113 98 88 79 97 84 ...
## $ Total_Day_charge : num 45.1 27.5 41.4 50.9 28.3 ...
## $ Total_Eve_Minutes : num 197.4 195.5 121.2 61.9 148.3 ...
## $ Total_Eve_Calls : int 99 103 110 88 122 101 108 94 80 111 ...
## $ Total_Eve_Charge : num 16.78 16.62 10.3 5.26 12.61 ...
## $ Total_Night_Minutes: num 245 254 163 197 187 ...
## $ Total_Night_Calls : int 91 103 104 89 121 118 118 96 90 97 ...
## $ Total_Night_Charge : num 11.01 11.45 7.32 8.86 8.41 ...
## $ Total_Intl_Minutes : num 10 13.7 12.2 6.6 10.1 6.3 7.5 7.1 8.7 11.2 ...
## $ Total_Intl_Calls : int 3 3 5 7 3 6 7 6 4 5 ...
## $ Total_Intl_Charge : num 2.7 3.7 3.29 1.78 2.73 1.7 2.03 1.92 2.35 3.02 ...
## $ No_CS_Calls : int 1 1 0 2 3 0 3 0 1 0 ...
## $ Churn : logi FALSE FALSE FALSE FALSE FALSE FALSE ...
# view the summary of the df
summary(df)
## State Account_Length Area_Code Phone_No
## WV : 122 Min. : 1 Min. :408.0 Min. :3271058
## MN : 94 1st Qu.: 74 1st Qu.:415.0 1st Qu.:3511101
## AL : 93 Median :100 Median :415.0 Median :3752748
## VA : 91 Mean :101 Mean :437.3 Mean :3748047
## NY : 89 3rd Qu.:127 3rd Qu.:510.0 3rd Qu.:3987837
## OH : 88 Max. :243 Max. :510.0 Max. :4229964
## (Other):3181 NA's :1 NA's :1
## International_Plan Voice_Mail_Plan No_Vmail_Messages Total_Day_minutes
## : 5 : 4 Min. : 0.000 Min. : 0.0
## no :3390 no :2726 1st Qu.: 0.000 1st Qu.:143.7
## yes: 363 yes:1028 Median : 0.000 Median :179.4
## Mean : 8.026 Mean :180.0
## 3rd Qu.:19.000 3rd Qu.:216.7
## Max. :51.000 Max. :350.8
## NA's :7
## Total_Day_Calls Total_Day_charge Total_Eve_Minutes Total_Eve_Calls
## Min. : 0.0 Min. : 0.00 Min. : 0.0 Min. : 0.0
## 1st Qu.: 87.0 1st Qu.:24.42 1st Qu.:166.4 1st Qu.: 87.0
## Median :101.0 Median :30.50 Median :201.0 Median :100.0
## Mean :100.4 Mean :30.60 Mean :200.8 Mean :100.2
## 3rd Qu.:114.0 3rd Qu.:36.80 3rd Qu.:235.1 3rd Qu.:114.0
## Max. :165.0 Max. :59.64 Max. :363.7 Max. :170.0
## NA's :4 NA's :11 NA's :11 NA's :4
## Total_Eve_Charge Total_Night_Minutes Total_Night_Calls Total_Night_Charge
## Min. : 0.00 Min. : 23.2 Min. : 33.0 Min. : 1.040
## 1st Qu.:14.17 1st Qu.:167.1 1st Qu.: 87.0 1st Qu.: 7.520
## Median :17.10 Median :201.1 Median :100.0 Median : 9.050
## Mean :17.08 Mean :200.8 Mean :100.1 Mean : 9.041
## 3rd Qu.:19.98 3rd Qu.:235.3 3rd Qu.:113.0 3rd Qu.:10.590
## Max. :30.91 Max. :395.0 Max. :175.0 Max. :17.770
## NA's :5 NA's :11 NA's :6 NA's :4
## Total_Intl_Minutes Total_Intl_Calls Total_Intl_Charge No_CS_Calls
## Min. : 0.00 Min. : 0.000 Min. :0.000 Min. :0.000
## 1st Qu.: 8.50 1st Qu.: 3.000 1st Qu.:2.300 1st Qu.:1.000
## Median :10.30 Median : 4.000 Median :2.780 Median :1.000
## Mean :10.25 Mean : 4.471 Mean :2.769 Mean :1.563
## 3rd Qu.:12.10 3rd Qu.: 6.000 3rd Qu.:3.270 3rd Qu.:2.000
## Max. :20.00 Max. :20.000 Max. :5.400 Max. :9.000
## NA's :3 NA's :5
## Churn
## Mode :logical
## FALSE:2850
## TRUE :483
## NA's :425
##
##
##
cat_data <- names(df[, c("State", "Area_Code", "Phone_No", "International_Plan",
"Voice_Mail_Plan", "Churn")])
num_data <- setdiff(names(df), c(cat_data))
cat_data
## [1] "State" "Area_Code" "Phone_No"
## [4] "International_Plan" "Voice_Mail_Plan" "Churn"
num_data
## [1] "Account_Length" "No_Vmail_Messages" "Total_Day_minutes"
## [4] "Total_Day_Calls" "Total_Day_charge" "Total_Eve_Minutes"
## [7] "Total_Eve_Calls" "Total_Eve_Charge" "Total_Night_Minutes"
## [10] "Total_Night_Calls" "Total_Night_Charge" "Total_Intl_Minutes"
## [13] "Total_Intl_Calls" "Total_Intl_Charge" "No_CS_Calls"
df <- df %>%
mutate_at(num_data, as.double) %>%
mutate_at(cat_data, as.factor)
# view structure of the data
str(df)
## 'data.frame': 3758 obs. of 21 variables:
## $ State : Factor w/ 51 levels "AK","AL","AR",..: 17 36 32 36 37 2 20 25 19 50 ...
## $ Account_Length : num 128 107 137 84 75 118 121 147 117 141 ...
## $ Area_Code : Factor w/ 3 levels "408","415","510": 2 2 2 1 2 3 3 2 1 2 ...
## $ Phone_No : Factor w/ 3758 levels "3271058","3271319",..: 2157 1762 1249 1910 125 2543 1172 93 324 133 ...
## $ International_Plan : Factor w/ 3 levels ""," no"," yes": 2 2 2 3 3 3 2 3 2 3 ...
## $ Voice_Mail_Plan : Factor w/ 3 levels ""," no"," yes": 3 3 2 2 2 2 3 2 2 3 ...
## $ No_Vmail_Messages : num 25 26 0 0 0 0 24 0 0 37 ...
## $ Total_Day_minutes : num 265 162 243 299 167 ...
## $ Total_Day_Calls : num 110 123 114 71 113 98 88 79 97 84 ...
## $ Total_Day_charge : num 45.1 27.5 41.4 50.9 28.3 ...
## $ Total_Eve_Minutes : num 197.4 195.5 121.2 61.9 148.3 ...
## $ Total_Eve_Calls : num 99 103 110 88 122 101 108 94 80 111 ...
## $ Total_Eve_Charge : num 16.78 16.62 10.3 5.26 12.61 ...
## $ Total_Night_Minutes: num 245 254 163 197 187 ...
## $ Total_Night_Calls : num 91 103 104 89 121 118 118 96 90 97 ...
## $ Total_Night_Charge : num 11.01 11.45 7.32 8.86 8.41 ...
## $ Total_Intl_Minutes : num 10 13.7 12.2 6.6 10.1 6.3 7.5 7.1 8.7 11.2 ...
## $ Total_Intl_Calls : num 3 3 5 7 3 6 7 6 4 5 ...
## $ Total_Intl_Charge : num 2.7 3.7 3.29 1.78 2.73 1.7 2.03 1.92 2.35 3.02 ...
## $ No_CS_Calls : num 1 1 0 2 3 0 3 0 1 0 ...
## $ Churn : Factor w/ 2 levels "FALSE","TRUE": 1 1 1 1 1 1 1 1 1 1 ...
# view summary of the data
summary(df)
## State Account_Length Area_Code Phone_No International_Plan
## WV : 122 Min. : 1 408 : 934 3271058: 1 : 5
## MN : 94 1st Qu.: 74 415 :1871 3271319: 1 no :3390
## AL : 93 Median :100 510 : 952 3272040: 1 yes: 363
## VA : 91 Mean :101 NA's: 1 3272475: 1
## NY : 89 3rd Qu.:127 3273053: 1
## OH : 88 Max. :243 3273587: 1
## (Other):3181 NA's :1 (Other):3752
## Voice_Mail_Plan No_Vmail_Messages Total_Day_minutes Total_Day_Calls
## : 4 Min. : 0.000 Min. : 0.0 Min. : 0.0
## no :2726 1st Qu.: 0.000 1st Qu.:143.7 1st Qu.: 87.0
## yes:1028 Median : 0.000 Median :179.4 Median :101.0
## Mean : 8.026 Mean :180.0 Mean :100.4
## 3rd Qu.:19.000 3rd Qu.:216.7 3rd Qu.:114.0
## Max. :51.000 Max. :350.8 Max. :165.0
## NA's :7 NA's :4
## Total_Day_charge Total_Eve_Minutes Total_Eve_Calls Total_Eve_Charge
## Min. : 0.00 Min. : 0.0 Min. : 0.0 Min. : 0.00
## 1st Qu.:24.42 1st Qu.:166.4 1st Qu.: 87.0 1st Qu.:14.17
## Median :30.50 Median :201.0 Median :100.0 Median :17.10
## Mean :30.60 Mean :200.8 Mean :100.2 Mean :17.08
## 3rd Qu.:36.80 3rd Qu.:235.1 3rd Qu.:114.0 3rd Qu.:19.98
## Max. :59.64 Max. :363.7 Max. :170.0 Max. :30.91
## NA's :11 NA's :11 NA's :4 NA's :5
## Total_Night_Minutes Total_Night_Calls Total_Night_Charge Total_Intl_Minutes
## Min. : 23.2 Min. : 33.0 Min. : 1.040 Min. : 0.00
## 1st Qu.:167.1 1st Qu.: 87.0 1st Qu.: 7.520 1st Qu.: 8.50
## Median :201.1 Median :100.0 Median : 9.050 Median :10.30
## Mean :200.8 Mean :100.1 Mean : 9.041 Mean :10.25
## 3rd Qu.:235.3 3rd Qu.:113.0 3rd Qu.:10.590 3rd Qu.:12.10
## Max. :395.0 Max. :175.0 Max. :17.770 Max. :20.00
## NA's :11 NA's :6 NA's :4 NA's :3
## Total_Intl_Calls Total_Intl_Charge No_CS_Calls Churn
## Min. : 0.000 Min. :0.000 Min. :0.000 FALSE:2850
## 1st Qu.: 3.000 1st Qu.:2.300 1st Qu.:1.000 TRUE : 483
## Median : 4.000 Median :2.780 Median :1.000 NA's : 425
## Mean : 4.471 Mean :2.769 Mean :1.563
## 3rd Qu.: 6.000 3rd Qu.:3.270 3rd Qu.:2.000
## Max. :20.000 Max. :5.400 Max. :9.000
## NA's :5
colSums(is.na(df))
## State Account_Length Area_Code Phone_No
## 0 1 1 0
## International_Plan Voice_Mail_Plan No_Vmail_Messages Total_Day_minutes
## 0 0 0 7
## Total_Day_Calls Total_Day_charge Total_Eve_Minutes Total_Eve_Calls
## 4 11 11 4
## Total_Eve_Charge Total_Night_Minutes Total_Night_Calls Total_Night_Charge
## 5 11 6 4
## Total_Intl_Minutes Total_Intl_Calls Total_Intl_Charge No_CS_Calls
## 3 5 0 0
## Churn
## 425
rm_acc_area_df <- df %>%
filter(is.na(Area_Code) == FALSE)
# Double check on if the data has been removed
colSums(is.na(rm_acc_area_df))
## State Account_Length Area_Code Phone_No
## 0 1 0 0
## International_Plan Voice_Mail_Plan No_Vmail_Messages Total_Day_minutes
## 0 0 0 7
## Total_Day_Calls Total_Day_charge Total_Eve_Minutes Total_Eve_Calls
## 4 11 11 4
## Total_Eve_Charge Total_Night_Minutes Total_Night_Calls Total_Night_Charge
## 5 11 6 4
## Total_Intl_Minutes Total_Intl_Calls Total_Intl_Charge No_CS_Calls
## 3 5 0 0
## Churn
## 424
# 1. Create a list of numerical variables with missing values
list_na <- rm_acc_area_df %>%
select(-c("Churn"))
list_na2 <- colnames(list_na)[ apply(list_na, 2, anyNA) ]
# Create mean
average_missing <- apply(rm_acc_area_df[,colnames(rm_acc_area_df) %in% list_na2],
2,
mean,
na.rm = TRUE)
average_missing
## Account_Length Total_Day_minutes Total_Day_Calls Total_Day_charge
## 101.005857 179.997573 100.389289 30.595846
## Total_Eve_Minutes Total_Eve_Calls Total_Eve_Charge Total_Night_Minutes
## 200.790897 100.178257 17.075810 200.851415
## Total_Night_Calls Total_Night_Charge Total_Intl_Minutes Total_Intl_Calls
## 100.060784 9.041071 10.253090 4.471215
# Create a new variable with the mean and median
df_replace_missing_values <- rm_acc_area_df %>%
mutate(Total_Day_minutes = ifelse(is.na(Total_Day_minutes), average_missing['Total_Day_minutes'], Total_Day_minutes),
Total_Day_Calls = ifelse(is.na(Total_Day_Calls), average_missing['Total_Day_Calls'], Total_Day_Calls),
Total_Day_charge = ifelse(is.na(Total_Day_charge), average_missing['Total_Day_charge'], Total_Day_charge),
Total_Eve_Minutes = ifelse(is.na(Total_Eve_Minutes), average_missing['Total_Eve_Minutes'], Total_Eve_Minutes),
Total_Eve_Calls = ifelse(is.na(Total_Eve_Calls), average_missing['Total_Eve_Calls'], Total_Eve_Calls),
Total_Eve_Charge = ifelse(is.na(Total_Eve_Charge), average_missing['Total_Eve_Charge'], Total_Eve_Charge),
Total_Night_Minutes = ifelse(is.na(Total_Night_Minutes), average_missing['Total_Night_Minutes'], Total_Night_Minutes),
Total_Night_Calls = ifelse(is.na(Total_Night_Calls), average_missing['Total_Night_Calls'], Total_Night_Calls),
Total_Night_Charge = ifelse(is.na(Total_Night_Charge), average_missing['Total_Night_Charge'], Total_Night_Charge),
Total_Intl_Minutes = ifelse(is.na(Total_Intl_Minutes), average_missing['Total_Intl_Minutes'], Total_Intl_Minutes),
Total_Intl_Calls = ifelse(is.na(Total_Intl_Calls), average_missing['Total_Intl_Calls'], Total_Intl_Calls),
Account_Length = ifelse(is.na(Account_Length), average_missing['Account_Length'], Account_Length)
)
# double check if still have missing values
colSums(is.na(df_replace_missing_values))
## State Account_Length Area_Code Phone_No
## 0 0 0 0
## International_Plan Voice_Mail_Plan No_Vmail_Messages Total_Day_minutes
## 0 0 0 0
## Total_Day_Calls Total_Day_charge Total_Eve_Minutes Total_Eve_Calls
## 0 0 0 0
## Total_Eve_Charge Total_Night_Minutes Total_Night_Calls Total_Night_Charge
## 0 0 0 0
## Total_Intl_Minutes Total_Intl_Calls Total_Intl_Charge No_CS_Calls
## 0 0 0 0
## Churn
## 424
boxplot(df_replace_missing_values[,c(num_data)])
rm_outliers <- df_replace_missing_values %>%
filter(No_Vmail_Messages < quantile(df_replace_missing_values$No_Vmail_Messages, 0.99, na.rm = TRUE),
Total_Day_minutes < quantile(df_replace_missing_values$Total_Day_minutes, 0.99, na.rm = TRUE),
Total_Day_Calls < quantile(df_replace_missing_values$Total_Day_Calls, 0.99, na.rm = TRUE),
Total_Day_charge < quantile(df_replace_missing_values$Total_Day_charge, 0.99, na.rm = TRUE),
Total_Eve_Minutes < quantile(df_replace_missing_values$Total_Eve_Minutes, 0.99, na.rm = TRUE),
Total_Eve_Calls < quantile(df_replace_missing_values$Total_Eve_Calls, 0.99, na.rm = TRUE),
Total_Eve_Charge < quantile(df_replace_missing_values$Total_Eve_Charge, 0.99, na.rm = TRUE),
Total_Night_Minutes < quantile(df_replace_missing_values$Total_Night_Minutes ,0.99, na.rm = TRUE),
Total_Night_Calls < quantile(df_replace_missing_values$Total_Night_Calls ,0.99, na.rm = TRUE),
Total_Night_Charge < quantile(df_replace_missing_values$Total_Night_Charge ,0.99, na.rm = TRUE),
Total_Intl_Minutes < quantile(df_replace_missing_values$Total_Intl_Minutes ,0.99, na.rm = TRUE),
Total_Intl_Calls < quantile(df_replace_missing_values$Total_Intl_Calls ,0.99, na.rm = TRUE),
Total_Intl_Charge < quantile(df_replace_missing_values$Total_Intl_Charge ,0.99, na.rm = TRUE),
No_CS_Calls < quantile(df_replace_missing_values$No_CS_Calls ,0.99, na.rm = TRUE))
#Check the dimensions of before outliers removal vs after outliers removal
dim(rm_outliers)
## [1] 3303 21
dim(df_replace_missing_values)
## [1] 3757 21
# take only the Churn Data with no missing values
temp_df <- rm_outliers %>% filter(!is.na(Churn))
unique(temp_df$Churn)
## [1] FALSE TRUE
## Levels: FALSE TRUE
# Find the mean of Churn and No Churn
temp_df2 <- temp_df %>%
group_by(Churn) %>%
summarise(Mean_No_Of_CS_Calls = mean(No_CS_Calls))
## `summarise()` ungrouping output (override with `.groups` argument)
# Visualize using Bar chart
ggplot(temp_df2, aes(x = Churn, y = Mean_No_Of_CS_Calls)) +
geom_bar(stat = "identity")
Findings:
Customers with more Service Calls made are more likely to Churn. Hence, we are also interested to predict the No. Of Customer Service Calls to make a hypothesis on how likely a customer will churn.
# Z-Score Normalization
pre_proc <- preProcess(rm_outliers %>% select(c(num_data)), method = c('center','scale', 'YeoJohnson'))
## Note: Using an external vector in selections is ambiguous.
## ℹ Use `all_of(num_data)` instead of `num_data` to silence this message.
## ℹ See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.
## This message is displayed once per session.
# Transform the data to Z-Score form
transformed_num_df <- predict(pre_proc, rm_outliers)
# view data
head(transformed_num_df)
## State Account_Length Area_Code Phone_No International_Plan Voice_Mail_Plan
## 1 KS 0.6859880 415 3824657 no yes
## 2 OH 0.1705578 415 3717191 no yes
## 3 NJ 0.9040278 415 3581921 no no
## 4 OH -0.4067150 408 3759999 yes no
## 5 OK -0.6370261 415 3306626 yes no
## 6 AL 0.4417770 510 3918027 yes no
## No_Vmail_Messages Total_Day_minutes Total_Day_Calls Total_Day_charge
## 1 1.6665326 1.6650638 0.4988108 1.6679404
## 2 1.6699846 -0.3440224 1.2141869 -0.3454267
## 3 -0.5978186 1.2366500 0.7159743 1.2378804
## 4 -0.5978186 2.3487505 -1.4650229 2.3546493
## 5 -0.5978186 -0.2472975 0.6614328 -0.2484118
## 6 -0.5978186 0.8448907 -0.1361516 0.8450450
## Total_Eve_Minutes Total_Eve_Calls Total_Eve_Charge Total_Night_Minutes
## 1 -0.06790512 -0.06234612 -0.07061632 0.92773910
## 2 -0.10678094 0.14914717 -0.10910569 1.13602654
## 3 -1.57236556 0.52496149 -1.57220332 -0.76873477
## 4 -2.64058065 -0.63108841 -2.63578362 -0.07551299
## 5 -1.05121654 1.18528072 -1.05138699 -0.28010693
## 6 0.41160424 0.04309829 0.40836828 0.06885042
## Total_Night_Calls Total_Night_Charge Total_Intl_Minutes Total_Intl_Calls
## 1 -0.4585665 0.92578064 -0.1233752 -0.5326307
## 2 0.1743618 1.13603960 1.3746706 -0.5326307
## 3 0.2276297 -0.76732531 0.7452051 0.4236901
## 4 -0.5628760 -0.07696591 -1.3217243 1.1692059
## 5 1.1445676 -0.28126987 -0.0853833 -0.5326307
## 6 0.9812507 0.06954524 -1.4180320 0.8158933
## Total_Intl_Charge No_CS_Calls Churn
## 1 -0.12894237 -0.2557483 FALSE
## 2 1.37894093 -0.2557483 FALSE
## 3 0.73490717 -1.5050552 FALSE
## 4 -1.31737248 0.6124466 FALSE
## 5 -0.08686774 1.2995078 FALSE
## 6 -1.41097785 -1.5050552 FALSE
# one hot encode the categorical variable
cat_data
## [1] "State" "Area_Code" "Phone_No"
## [4] "International_Plan" "Voice_Mail_Plan" "Churn"
# select features to be one hot encoded (categorical variables)
# Redo this step to ensure all the to be one-hot-encoded data is factor variables
to_onehot <- transformed_num_df %>%
select(c(cat_data)) %>%
mutate_all(as.factor)
## Note: Using an external vector in selections is ambiguous.
## ℹ Use `all_of(cat_data)` instead of `cat_data` to silence this message.
## ℹ See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.
## This message is displayed once per session.
head(to_onehot)
## State Area_Code Phone_No International_Plan Voice_Mail_Plan Churn
## 1 KS 415 3824657 no yes FALSE
## 2 OH 415 3717191 no yes FALSE
## 3 NJ 415 3581921 no no FALSE
## 4 OH 408 3759999 yes no FALSE
## 5 OK 415 3306626 yes no FALSE
## 6 AL 510 3918027 yes no FALSE
str(to_onehot)
## 'data.frame': 3303 obs. of 6 variables:
## $ State : Factor w/ 51 levels "AK","AL","AR",..: 17 36 32 36 37 2 25 16 13 27 ...
## $ Area_Code : Factor w/ 3 levels "408","415","510": 2 2 2 1 2 3 2 2 1 3 ...
## $ Phone_No : Factor w/ 3758 levels "3271058","3271319",..: 2157 1762 1249 1910 125 2543 93 83 1435 2665 ...
## $ International_Plan: Factor w/ 3 levels ""," no"," yes": 2 2 2 3 3 3 3 2 2 2 ...
## $ Voice_Mail_Plan : Factor w/ 3 levels ""," no"," yes": 3 3 2 2 2 2 2 2 2 2 ...
## $ Churn : Factor w/ 2 levels "FALSE","TRUE": 1 1 1 1 1 1 1 2 1 1 ...
dmy <- dummyVars(Phone_No ~ ., data = to_onehot)
onehot_df <- data.frame(predict(dmy, newdata = to_onehot)) %>%
cbind(to_onehot %>% select(Phone_No))
## Warning in model.frame.default(Terms, newdata, na.action = na.action, xlev =
## object$lvls): variable 'Phone_No' is not a factor
# take a look at the data
glimpse(onehot_df)
## Rows: 3,303
## Columns: 63
## $ State.AK <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ State.AL <dbl> 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ State.AR <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ State.AZ <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ State.CA <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ State.CO <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ State.CT <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ State.DC <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ State.DE <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ State.FL <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ State.GA <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ State.HI <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ State.IA <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, …
## $ State.ID <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, …
## $ State.IL <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ State.IN <dbl> 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, …
## $ State.KS <dbl> 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ State.KY <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ State.LA <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ State.MA <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ State.MD <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ State.ME <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ State.MI <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ State.MN <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ State.MO <dbl> 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ State.MS <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ State.MT <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, …
## $ State.NC <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ State.ND <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ State.NE <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ State.NH <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ State.NJ <dbl> 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ State.NM <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ State.NV <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ State.NY <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ State.OH <dbl> 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ State.OK <dbl> 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ State.OR <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ State.PA <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ State.RI <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ State.SC <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ State.SD <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ State.TN <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ State.TX <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, …
## $ State.UT <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ State.VA <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, …
## $ State.VT <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, …
## $ State.WA <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ State.WI <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ State.WV <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ State.WY <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ Area_Code.408 <dbl> 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, …
## $ Area_Code.415 <dbl> 1, 1, 1, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, …
## $ Area_Code.510 <dbl> 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, …
## $ International_Plan. <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ International_Plan..no <dbl> 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ International_Plan..yes <dbl> 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ Voice_Mail_Plan. <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ Voice_Mail_Plan..no <dbl> 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, …
## $ Voice_Mail_Plan..yes <dbl> 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, …
## $ Churn.FALSE <dbl> 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, …
## $ Churn.TRUE <dbl> 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, …
## $ Phone_No <fct> 3824657, 3717191, 3581921, 3759999, 3306626, …
# join numerical and onehot encoded features
# Data set for classification
final_df_class <- transformed_num_df
glimpse(final_df_class)
## Rows: 3,303
## Columns: 21
## $ State <fct> KS, OH, NJ, OH, OK, AL, MO, IN, IA, MT, IA, ID, V…
## $ Account_Length <dbl> 0.6859880, 0.1705578, 0.9040278, -0.4067150, -0.6…
## $ Area_Code <fct> 415, 415, 415, 408, 415, 510, 415, 415, 408, 510,…
## $ Phone_No <fct> 3824657, 3717191, 3581921, 3759999, 3306626, 3918…
## $ International_Plan <fct> no, no, no, yes, yes, yes, yes, no, no, …
## $ Voice_Mail_Plan <fct> yes, yes, no, no, no, no, no, no, no, n…
## $ No_Vmail_Messages <dbl> 1.6665326, 1.6699846, -0.5978186, -0.5978186, -0.…
## $ Total_Day_minutes <dbl> 1.6650638, -0.3440224, 1.2366500, 2.3487505, -0.2…
## $ Total_Day_Calls <dbl> 0.4988108, 1.2141869, 0.7159743, -1.4650229, 0.66…
## $ Total_Day_charge <dbl> 1.6679404, -0.3454267, 1.2378804, 2.3546493, -0.2…
## $ Total_Eve_Minutes <dbl> -0.06790512, -0.10678094, -1.57236556, -2.6405806…
## $ Total_Eve_Calls <dbl> -0.062346121, 0.149147168, 0.524961493, -0.631088…
## $ Total_Eve_Charge <dbl> -0.07061632, -0.10910569, -1.57220332, -2.6357836…
## $ Total_Night_Minutes <dbl> 0.92773910, 1.13602654, -0.76873477, -0.07551299,…
## $ Total_Night_Calls <dbl> -0.45856646, 0.17436180, 0.22762974, -0.56287596,…
## $ Total_Night_Charge <dbl> 0.92578064, 1.13603960, -0.76732531, -0.07696591,…
## $ Total_Intl_Minutes <dbl> -0.123375195, 1.374670631, 0.745205132, -1.321724…
## $ Total_Intl_Calls <dbl> -0.53263066, -0.53263066, 0.42369010, 1.16920585,…
## $ Total_Intl_Charge <dbl> -0.12894237, 1.37894093, 0.73490717, -1.31737248,…
## $ No_CS_Calls <dbl> -0.2557483, -0.2557483, -1.5050552, 0.6124466, 1.…
## $ Churn <fct> FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, …
# Data set for Regression
final_df_regres <- rm_outliers %>%
left_join(onehot_df, by ='Phone_No')
glimpse(final_df_regres)
## Rows: 3,303
## Columns: 83
## $ State <fct> KS, OH, NJ, OH, OK, AL, MO, IN, IA, MT, IA, I…
## $ Account_Length <dbl> 128, 107, 137, 84, 75, 118, 147, 65, 168, 95,…
## $ Area_Code <fct> 415, 415, 415, 408, 415, 510, 415, 415, 408, …
## $ Phone_No <fct> 3824657, 3717191, 3581921, 3759999, 3306626, …
## $ International_Plan <fct> no, no, no, yes, yes, yes, yes, no, …
## $ Voice_Mail_Plan <fct> yes, yes, no, no, no, no, no, no, no…
## $ No_Vmail_Messages <dbl> 25, 26, 0, 0, 0, 0, 0, 0, 0, 0, 0, 27, 0, 33,…
## $ Total_Day_minutes <dbl> 265.1, 161.6, 243.4, 299.4, 166.7, 223.4, 157…
## $ Total_Day_Calls <dbl> 110, 123, 114, 71, 113, 98, 79, 137, 96, 88, …
## $ Total_Day_charge <dbl> 45.07, 27.47, 41.38, 50.90, 28.34, 37.98, 26.…
## $ Total_Eve_Minutes <dbl> 197.4, 195.5, 121.2, 61.9, 148.3, 220.6, 103.…
## $ Total_Eve_Calls <dbl> 99, 103, 110, 88, 122, 101, 94, 83, 71, 75, 7…
## $ Total_Eve_Charge <dbl> 16.78, 16.62, 10.30, 5.26, 12.61, 18.75, 8.76…
## $ Total_Night_Minutes <dbl> 244.7, 254.4, 162.6, 196.9, 186.9, 203.9, 211…
## $ Total_Night_Calls <dbl> 91, 103, 104, 89, 121, 118, 96, 111, 128, 115…
## $ Total_Night_Charge <dbl> 11.01, 11.45, 7.32, 8.86, 8.41, 9.18, 9.53, 9…
## $ Total_Intl_Minutes <dbl> 10.0, 13.7, 12.2, 6.6, 10.1, 6.3, 7.1, 12.7, …
## $ Total_Intl_Calls <dbl> 3, 3, 5, 7, 3, 6, 6, 6, 2, 5, 6, 4, 3, 5, 2, …
## $ Total_Intl_Charge <dbl> 2.70, 3.70, 3.29, 1.78, 2.73, 1.70, 1.92, 3.4…
## $ No_CS_Calls <dbl> 1, 1, 0, 2, 3, 0, 0, 4, 1, 3, 4, 1, 3, 1, 1, …
## $ Churn <fct> FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FAL…
## $ State.AK <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ State.AL <dbl> 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ State.AR <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ State.AZ <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ State.CA <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ State.CO <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ State.CT <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ State.DC <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ State.DE <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ State.FL <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ State.GA <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ State.HI <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ State.IA <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, …
## $ State.ID <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, …
## $ State.IL <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ State.IN <dbl> 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, …
## $ State.KS <dbl> 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ State.KY <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ State.LA <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ State.MA <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ State.MD <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ State.ME <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ State.MI <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ State.MN <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ State.MO <dbl> 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ State.MS <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ State.MT <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, …
## $ State.NC <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ State.ND <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ State.NE <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ State.NH <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ State.NJ <dbl> 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ State.NM <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ State.NV <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ State.NY <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ State.OH <dbl> 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ State.OK <dbl> 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ State.OR <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ State.PA <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ State.RI <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ State.SC <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ State.SD <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ State.TN <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ State.TX <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, …
## $ State.UT <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ State.VA <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, …
## $ State.VT <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, …
## $ State.WA <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ State.WI <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ State.WV <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ State.WY <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ Area_Code.408 <dbl> 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, …
## $ Area_Code.415 <dbl> 1, 1, 1, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, …
## $ Area_Code.510 <dbl> 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, …
## $ International_Plan. <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ International_Plan..no <dbl> 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ International_Plan..yes <dbl> 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ Voice_Mail_Plan. <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ Voice_Mail_Plan..no <dbl> 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, …
## $ Voice_Mail_Plan..yes <dbl> 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, …
## $ Churn.FALSE <dbl> 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, …
## $ Churn.TRUE <dbl> 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, …
dim(final_df_class)
## [1] 3303 21
dim(final_df_regres)
## [1] 3303 83
final_df_class1 <- data.frame(final_df_class)
final_df_regres1 <- data.frame(final_df_regres)
#Find any na values
colSums(is.na(final_df_class1))
## State Account_Length Area_Code Phone_No
## 0 0 0 0
## International_Plan Voice_Mail_Plan No_Vmail_Messages Total_Day_minutes
## 0 0 0 0
## Total_Day_Calls Total_Day_charge Total_Eve_Minutes Total_Eve_Calls
## 0 0 0 0
## Total_Eve_Charge Total_Night_Minutes Total_Night_Calls Total_Night_Charge
## 0 0 0 0
## Total_Intl_Minutes Total_Intl_Calls Total_Intl_Charge No_CS_Calls
## 0 0 0 0
## Churn
## 371
head(final_df_class1)
## State Account_Length Area_Code Phone_No International_Plan Voice_Mail_Plan
## 1 KS 0.6859880 415 3824657 no yes
## 2 OH 0.1705578 415 3717191 no yes
## 3 NJ 0.9040278 415 3581921 no no
## 4 OH -0.4067150 408 3759999 yes no
## 5 OK -0.6370261 415 3306626 yes no
## 6 AL 0.4417770 510 3918027 yes no
## No_Vmail_Messages Total_Day_minutes Total_Day_Calls Total_Day_charge
## 1 1.6665326 1.6650638 0.4988108 1.6679404
## 2 1.6699846 -0.3440224 1.2141869 -0.3454267
## 3 -0.5978186 1.2366500 0.7159743 1.2378804
## 4 -0.5978186 2.3487505 -1.4650229 2.3546493
## 5 -0.5978186 -0.2472975 0.6614328 -0.2484118
## 6 -0.5978186 0.8448907 -0.1361516 0.8450450
## Total_Eve_Minutes Total_Eve_Calls Total_Eve_Charge Total_Night_Minutes
## 1 -0.06790512 -0.06234612 -0.07061632 0.92773910
## 2 -0.10678094 0.14914717 -0.10910569 1.13602654
## 3 -1.57236556 0.52496149 -1.57220332 -0.76873477
## 4 -2.64058065 -0.63108841 -2.63578362 -0.07551299
## 5 -1.05121654 1.18528072 -1.05138699 -0.28010693
## 6 0.41160424 0.04309829 0.40836828 0.06885042
## Total_Night_Calls Total_Night_Charge Total_Intl_Minutes Total_Intl_Calls
## 1 -0.4585665 0.92578064 -0.1233752 -0.5326307
## 2 0.1743618 1.13603960 1.3746706 -0.5326307
## 3 0.2276297 -0.76732531 0.7452051 0.4236901
## 4 -0.5628760 -0.07696591 -1.3217243 1.1692059
## 5 1.1445676 -0.28126987 -0.0853833 -0.5326307
## 6 0.9812507 0.06954524 -1.4180320 0.8158933
## Total_Intl_Charge No_CS_Calls Churn
## 1 -0.12894237 -0.2557483 FALSE
## 2 1.37894093 -0.2557483 FALSE
## 3 0.73490717 -1.5050552 FALSE
## 4 -1.31737248 0.6124466 FALSE
## 5 -0.08686774 1.2995078 FALSE
## 6 -1.41097785 -1.5050552 FALSE
#Remove na values
final_df_class1 <- final_df_class1[complete.cases(final_df_class1), ]
#Check for na values again
colSums(is.na(final_df_class1))
## State Account_Length Area_Code Phone_No
## 0 0 0 0
## International_Plan Voice_Mail_Plan No_Vmail_Messages Total_Day_minutes
## 0 0 0 0
## Total_Day_Calls Total_Day_charge Total_Eve_Minutes Total_Eve_Calls
## 0 0 0 0
## Total_Eve_Charge Total_Night_Minutes Total_Night_Calls Total_Night_Charge
## 0 0 0 0
## Total_Intl_Minutes Total_Intl_Calls Total_Intl_Charge No_CS_Calls
## 0 0 0 0
## Churn
## 0
#Check data types for each varable
str(final_df_class1)
## 'data.frame': 2932 obs. of 21 variables:
## $ State : Factor w/ 51 levels "AK","AL","AR",..: 17 36 32 36 37 2 25 16 13 27 ...
## $ Account_Length : num 0.686 0.171 0.904 -0.407 -0.637 ...
## $ Area_Code : Factor w/ 3 levels "408","415","510": 2 2 2 1 2 3 2 2 1 3 ...
## $ Phone_No : Factor w/ 3758 levels "3271058","3271319",..: 2157 1762 1249 1910 125 2543 93 83 1435 2665 ...
## $ International_Plan : Factor w/ 3 levels ""," no"," yes": 2 2 2 3 3 3 3 2 2 2 ...
## $ Voice_Mail_Plan : Factor w/ 3 levels ""," no"," yes": 3 3 2 2 2 2 2 2 2 2 ...
## $ No_Vmail_Messages : num 1.667 1.67 -0.598 -0.598 -0.598 ...
## $ Total_Day_minutes : num 1.665 -0.344 1.237 2.349 -0.247 ...
## $ Total_Day_Calls : num 0.499 1.214 0.716 -1.465 0.661 ...
## $ Total_Day_charge : num 1.668 -0.345 1.238 2.355 -0.248 ...
## $ Total_Eve_Minutes : num -0.0679 -0.1068 -1.5724 -2.6406 -1.0512 ...
## $ Total_Eve_Calls : num -0.0623 0.1491 0.525 -0.6311 1.1853 ...
## $ Total_Eve_Charge : num -0.0706 -0.1091 -1.5722 -2.6358 -1.0514 ...
## $ Total_Night_Minutes: num 0.9277 1.136 -0.7687 -0.0755 -0.2801 ...
## $ Total_Night_Calls : num -0.459 0.174 0.228 -0.563 1.145 ...
## $ Total_Night_Charge : num 0.926 1.136 -0.767 -0.077 -0.281 ...
## $ Total_Intl_Minutes : num -0.1234 1.3747 0.7452 -1.3217 -0.0854 ...
## $ Total_Intl_Calls : num -0.533 -0.533 0.424 1.169 -0.533 ...
## $ Total_Intl_Charge : num -0.1289 1.3789 0.7349 -1.3174 -0.0869 ...
## $ No_CS_Calls : num -0.256 -0.256 -1.505 0.612 1.3 ...
## $ Churn : Factor w/ 2 levels "FALSE","TRUE": 1 1 1 1 1 1 1 2 1 1 ...
validation_index <- createDataPartition(final_df_class1$Churn, p=0.80, list=FALSE)
final_df_class_test <- final_df_class1[-validation_index,]
final_df_class_train <- final_df_class1[validation_index,]
dim(final_df_class_train)
## [1] 2347 21
dim(final_df_class_test)
## [1] 585 21
# Run algorithms using 3-fold cross validation
method1 <- trainControl(method="cv", number=3)
# Random Forest
set.seed(7)
fit.rf <- train(Churn~., data=final_df_class_train, method="ranger", metric='Accuracy', trControl=method1)
## Growing trees.. Progress: 29%. Estimated remaining time: 1 minute, 17 seconds.
## Growing trees.. Progress: 59%. Estimated remaining time: 43 seconds.
## Growing trees.. Progress: 92%. Estimated remaining time: 7 seconds.
## Growing trees.. Progress: 27%. Estimated remaining time: 1 minute, 24 seconds.
## Growing trees.. Progress: 55%. Estimated remaining time: 51 seconds.
## Growing trees.. Progress: 83%. Estimated remaining time: 18 seconds.
## Growing trees.. Progress: 35%. Estimated remaining time: 57 seconds.
## Growing trees.. Progress: 70%. Estimated remaining time: 26 seconds.
## Growing trees.. Progress: 28%. Estimated remaining time: 1 minute, 20 seconds.
## Growing trees.. Progress: 55%. Estimated remaining time: 49 seconds.
## Growing trees.. Progress: 83%. Estimated remaining time: 19 seconds.
## Growing trees.. Progress: 31%. Estimated remaining time: 1 minute, 8 seconds.
## Growing trees.. Progress: 64%. Estimated remaining time: 35 seconds.
## Growing trees.. Progress: 97%. Estimated remaining time: 3 seconds.
## Growing trees.. Progress: 26%. Estimated remaining time: 1 minute, 26 seconds.
## Growing trees.. Progress: 54%. Estimated remaining time: 52 seconds.
## Growing trees.. Progress: 80%. Estimated remaining time: 22 seconds.
## Growing trees.. Progress: 14%. Estimated remaining time: 3 minutes, 16 seconds.
## Growing trees.. Progress: 28%. Estimated remaining time: 2 minutes, 40 seconds.
## Growing trees.. Progress: 42%. Estimated remaining time: 2 minutes, 13 seconds.
## Growing trees.. Progress: 56%. Estimated remaining time: 1 minute, 39 seconds.
## Growing trees.. Progress: 69%. Estimated remaining time: 1 minute, 12 seconds.
## Growing trees.. Progress: 83%. Estimated remaining time: 39 seconds.
## Growing trees.. Progress: 97%. Estimated remaining time: 5 seconds.
fit.rf
## Random Forest
##
## 2347 samples
## 20 predictor
## 2 classes: 'FALSE', 'TRUE'
##
## No pre-processing
## Resampling: Cross-Validated (3 fold)
## Summary of sample sizes: 1564, 1564, 1566
## Resampling results across tuning parameters:
##
## mtry splitrule Accuracy Kappa
## 2 gini 0.8789950 0.0000000
## 2 extratrees 0.8789950 0.0000000
## 87 gini 0.8956174 0.2170439
## 87 extratrees 0.8789950 0.0000000
## 3827 gini 0.9454684 0.7039203
## 3827 extratrees 0.9467444 0.7019253
##
## Tuning parameter 'min.node.size' was held constant at a value of 1
## Accuracy was used to select the optimal model using the largest value.
## The final values used for the model were mtry = 3827, splitrule =
## extratrees and min.node.size = 1.
plot(fit.rf)
#Test the RF model on the test set
predictions <- predict(fit.rf, final_df_class_test)
#Show the prediction stats on the test set
confusionMatrix(predictions, final_df_class_test$Churn)
## Confusion Matrix and Statistics
##
## Reference
## Prediction FALSE TRUE
## FALSE 508 18
## TRUE 7 52
##
## Accuracy : 0.9573
## 95% CI : (0.9376, 0.9722)
## No Information Rate : 0.8803
## P-Value [Acc > NIR] : 7.659e-11
##
## Kappa : 0.7824
##
## Mcnemar's Test P-Value : 0.0455
##
## Sensitivity : 0.9864
## Specificity : 0.7429
## Pos Pred Value : 0.9658
## Neg Pred Value : 0.8814
## Prevalence : 0.8803
## Detection Rate : 0.8684
## Detection Prevalence : 0.8991
## Balanced Accuracy : 0.8646
##
## 'Positive' Class : FALSE
##
validation_index <- createDataPartition(final_df_regres1$No_CS_Calls, p=0.80, list=FALSE)
final_df_regres_test <- final_df_regres1[-validation_index,]
final_df_regres_train <- final_df_regres1[validation_index,]
dim(final_df_regres_train)
## [1] 2643 80
dim(final_df_regres_test)
## [1] 660 80
# Run algorithms using 3-fold cross validation
method1 <- trainControl(method="cv", number=3)
# Regression with Neural Network model(Multilayer Perceptron model)
set.seed(7)
fit.mlp <- train(No_CS_Calls~., data=final_df_regres_train, method="mlp", metric='MAE', trControl=method1)
## Warning in snnsObject$setUnitName(num, iNames[[i]]): SNNS error message in
## setUnitName : SNNS-Kernel Error: Symbol pattern invalid (must match [A-Za-z]
## [^|, ]*)
## Warning in snnsObject$setUnitName(num, iNames[[i]]): SNNS error message in
## setUnitName : SNNS-Kernel Error: Symbol pattern invalid (must match [A-Za-z]
## [^|, ]*)
## Warning in snnsObject$setUnitName(num, iNames[[i]]): SNNS error message in
## setUnitName : SNNS-Kernel Error: Symbol pattern invalid (must match [A-Za-z]
## [^|, ]*)
## Warning in snnsObject$setUnitName(num, iNames[[i]]): SNNS error message in
## setUnitName : SNNS-Kernel Error: Symbol pattern invalid (must match [A-Za-z]
## [^|, ]*)
## Warning in snnsObject$setUnitName(num, iNames[[i]]): SNNS error message in
## setUnitName : SNNS-Kernel Error: Symbol pattern invalid (must match [A-Za-z]
## [^|, ]*)
## Warning in snnsObject$setUnitName(num, iNames[[i]]): SNNS error message in
## setUnitName : SNNS-Kernel Error: Symbol pattern invalid (must match [A-Za-z]
## [^|, ]*)
## Warning in snnsObject$setUnitName(num, iNames[[i]]): SNNS error message in
## setUnitName : SNNS-Kernel Error: Symbol pattern invalid (must match [A-Za-z]
## [^|, ]*)
## Warning in snnsObject$setUnitName(num, iNames[[i]]): SNNS error message in
## setUnitName : SNNS-Kernel Error: Symbol pattern invalid (must match [A-Za-z]
## [^|, ]*)
## Warning in snnsObject$setUnitName(num, iNames[[i]]): SNNS error message in
## setUnitName : SNNS-Kernel Error: Symbol pattern invalid (must match [A-Za-z]
## [^|, ]*)
## Warning in snnsObject$setUnitName(num, iNames[[i]]): SNNS error message in
## setUnitName : SNNS-Kernel Error: Symbol pattern invalid (must match [A-Za-z]
## [^|, ]*)
## Warning in snnsObject$setUnitName(num, iNames[[i]]): SNNS error message in
## setUnitName : SNNS-Kernel Error: Symbol pattern invalid (must match [A-Za-z]
## [^|, ]*)
## Warning in snnsObject$setUnitName(num, iNames[[i]]): SNNS error message in
## setUnitName : SNNS-Kernel Error: Symbol pattern invalid (must match [A-Za-z]
## [^|, ]*)
## Warning in snnsObject$setUnitName(num, iNames[[i]]): SNNS error message in
## setUnitName : SNNS-Kernel Error: Symbol pattern invalid (must match [A-Za-z]
## [^|, ]*)
## Warning in snnsObject$setUnitName(num, iNames[[i]]): SNNS error message in
## setUnitName : SNNS-Kernel Error: Symbol pattern invalid (must match [A-Za-z]
## [^|, ]*)
## Warning in snnsObject$setUnitName(num, iNames[[i]]): SNNS error message in
## setUnitName : SNNS-Kernel Error: Symbol pattern invalid (must match [A-Za-z]
## [^|, ]*)
## Warning in snnsObject$setUnitName(num, iNames[[i]]): SNNS error message in
## setUnitName : SNNS-Kernel Error: Symbol pattern invalid (must match [A-Za-z]
## [^|, ]*)
## Warning in snnsObject$setUnitName(num, iNames[[i]]): SNNS error message in
## setUnitName : SNNS-Kernel Error: Symbol pattern invalid (must match [A-Za-z]
## [^|, ]*)
## Warning in snnsObject$setUnitName(num, iNames[[i]]): SNNS error message in
## setUnitName : SNNS-Kernel Error: Symbol pattern invalid (must match [A-Za-z]
## [^|, ]*)
## Warning in snnsObject$setUnitName(num, iNames[[i]]): SNNS error message in
## setUnitName : SNNS-Kernel Error: Symbol pattern invalid (must match [A-Za-z]
## [^|, ]*)
## Warning in snnsObject$setUnitName(num, iNames[[i]]): SNNS error message in
## setUnitName : SNNS-Kernel Error: Symbol pattern invalid (must match [A-Za-z]
## [^|, ]*)
## Warning in snnsObject$setUnitName(num, iNames[[i]]): SNNS error message in
## setUnitName : SNNS-Kernel Error: Symbol pattern invalid (must match [A-Za-z]
## [^|, ]*)
## Warning in snnsObject$setUnitName(num, iNames[[i]]): SNNS error message in
## setUnitName : SNNS-Kernel Error: Symbol pattern invalid (must match [A-Za-z]
## [^|, ]*)
## Warning in snnsObject$setUnitName(num, iNames[[i]]): SNNS error message in
## setUnitName : SNNS-Kernel Error: Symbol pattern invalid (must match [A-Za-z]
## [^|, ]*)
## Warning in snnsObject$setUnitName(num, iNames[[i]]): SNNS error message in
## setUnitName : SNNS-Kernel Error: Symbol pattern invalid (must match [A-Za-z]
## [^|, ]*)
## Warning in snnsObject$setUnitName(num, iNames[[i]]): SNNS error message in
## setUnitName : SNNS-Kernel Error: Symbol pattern invalid (must match [A-Za-z]
## [^|, ]*)
## Warning in snnsObject$setUnitName(num, iNames[[i]]): SNNS error message in
## setUnitName : SNNS-Kernel Error: Symbol pattern invalid (must match [A-Za-z]
## [^|, ]*)
## Warning in snnsObject$setUnitName(num, iNames[[i]]): SNNS error message in
## setUnitName : SNNS-Kernel Error: Symbol pattern invalid (must match [A-Za-z]
## [^|, ]*)
## Warning in snnsObject$setUnitName(num, iNames[[i]]): SNNS error message in
## setUnitName : SNNS-Kernel Error: Symbol pattern invalid (must match [A-Za-z]
## [^|, ]*)
## Warning in snnsObject$setUnitName(num, iNames[[i]]): SNNS error message in
## setUnitName : SNNS-Kernel Error: Symbol pattern invalid (must match [A-Za-z]
## [^|, ]*)
## Warning in snnsObject$setUnitName(num, iNames[[i]]): SNNS error message in
## setUnitName : SNNS-Kernel Error: Symbol pattern invalid (must match [A-Za-z]
## [^|, ]*)
## Warning in snnsObject$setUnitName(num, iNames[[i]]): SNNS error message in
## setUnitName : SNNS-Kernel Error: Symbol pattern invalid (must match [A-Za-z]
## [^|, ]*)
## Warning in snnsObject$setUnitName(num, iNames[[i]]): SNNS error message in
## setUnitName : SNNS-Kernel Error: Symbol pattern invalid (must match [A-Za-z]
## [^|, ]*)
## Warning in snnsObject$setUnitName(num, iNames[[i]]): SNNS error message in
## setUnitName : SNNS-Kernel Error: Symbol pattern invalid (must match [A-Za-z]
## [^|, ]*)
## Warning in snnsObject$setUnitName(num, iNames[[i]]): SNNS error message in
## setUnitName : SNNS-Kernel Error: Symbol pattern invalid (must match [A-Za-z]
## [^|, ]*)
## Warning in snnsObject$setUnitName(num, iNames[[i]]): SNNS error message in
## setUnitName : SNNS-Kernel Error: Symbol pattern invalid (must match [A-Za-z]
## [^|, ]*)
## Warning in snnsObject$setUnitName(num, iNames[[i]]): SNNS error message in
## setUnitName : SNNS-Kernel Error: Symbol pattern invalid (must match [A-Za-z]
## [^|, ]*)
## Warning in nominalTrainWorkflow(x = x, y = y, wts = weights, info = trainInfo, :
## There were missing values in resampled performance measures.
## Warning in snnsObject$setUnitName(num, iNames[[i]]): SNNS error message in
## setUnitName : SNNS-Kernel Error: Symbol pattern invalid (must match [A-Za-z]
## [^|, ]*)
## Warning in snnsObject$setUnitName(num, iNames[[i]]): SNNS error message in
## setUnitName : SNNS-Kernel Error: Symbol pattern invalid (must match [A-Za-z]
## [^|, ]*)
## Warning in snnsObject$setUnitName(num, iNames[[i]]): SNNS error message in
## setUnitName : SNNS-Kernel Error: Symbol pattern invalid (must match [A-Za-z]
## [^|, ]*)
## Warning in snnsObject$setUnitName(num, iNames[[i]]): SNNS error message in
## setUnitName : SNNS-Kernel Error: Symbol pattern invalid (must match [A-Za-z]
## [^|, ]*)
fit.mlp
## Multi-Layer Perceptron
##
## 2643 samples
## 79 predictor
##
## No pre-processing
## Resampling: Cross-Validated (3 fold)
## Summary of sample sizes: 1762, 1761, 1763
## Resampling results across tuning parameters:
##
## size RMSE Rsquared MAE
## 1 1.661514 0.0001705958 1.343567
## 3 1.470485 0.0014212059 1.146664
## 5 1.374573 NaN 1.137586
##
## MAE was used to select the optimal model using the smallest value.
## The final value used for the model was size = 5.
#Test the MLP model on the test set
predictions <- predict(fit.mlp, final_df_regres_test)
#Create function to calculate RMSE for test set
calc_rmse = function(actual, predicted) {
sqrt(mean((actual - predicted) ^ 2))
}
#Show the RMSE on the test set
calc_rmse(actual = final_df_regres_test$No_CS_Calls,
predicted = predict(fit.mlp, final_df_regres_test))
## [1] 1.162633