HW1

library(readr)
churn <- read_csv("/tmp/Churn_Modelling.csv")
## Parsed with column specification:
## cols(
##   RowNumber = col_integer(),
##   CustomerId = col_integer(),
##   Surname = col_character(),
##   CreditScore = col_integer(),
##   Geography = col_character(),
##   Gender = col_character(),
##   Age = col_integer(),
##   Tenure = col_integer(),
##   Balance = col_double(),
##   NumOfProducts = col_integer(),
##   HasCrCard = col_integer(),
##   IsActiveMember = col_integer(),
##   EstimatedSalary = col_double(),
##   Exited = col_integer()
## )
class(churn)
## [1] "tbl_df"     "tbl"        "data.frame"
str(churn)
## Classes 'tbl_df', 'tbl' and 'data.frame':    10000 obs. of  14 variables:
##  $ RowNumber      : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ CustomerId     : int  15634602 15647311 15619304 15701354 15737888 15574012 15592531 15656148 15792365 15592389 ...
##  $ Surname        : chr  "Hargrave" "Hill" "Onio" "Boni" ...
##  $ CreditScore    : int  619 608 502 699 850 645 822 376 501 684 ...
##  $ Geography      : chr  "France" "Spain" "France" "France" ...
##  $ Gender         : chr  "Female" "Female" "Female" "Female" ...
##  $ Age            : int  42 41 42 39 43 44 50 29 44 27 ...
##  $ Tenure         : int  2 1 8 1 2 8 7 4 4 2 ...
##  $ Balance        : num  0 83808 159661 0 125511 ...
##  $ NumOfProducts  : int  1 1 3 2 1 2 2 4 2 1 ...
##  $ HasCrCard      : int  1 0 1 0 1 1 1 1 0 1 ...
##  $ IsActiveMember : int  1 1 0 0 1 0 1 0 1 1 ...
##  $ EstimatedSalary: num  101349 112543 113932 93827 79084 ...
##  $ Exited         : int  1 0 1 0 0 1 0 1 0 0 ...
##  - attr(*, "spec")=List of 2
##   ..$ cols   :List of 14
##   .. ..$ RowNumber      : list()
##   .. .. ..- attr(*, "class")= chr  "collector_integer" "collector"
##   .. ..$ CustomerId     : list()
##   .. .. ..- attr(*, "class")= chr  "collector_integer" "collector"
##   .. ..$ Surname        : list()
##   .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
##   .. ..$ CreditScore    : list()
##   .. .. ..- attr(*, "class")= chr  "collector_integer" "collector"
##   .. ..$ Geography      : list()
##   .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
##   .. ..$ Gender         : list()
##   .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
##   .. ..$ Age            : list()
##   .. .. ..- attr(*, "class")= chr  "collector_integer" "collector"
##   .. ..$ Tenure         : list()
##   .. .. ..- attr(*, "class")= chr  "collector_integer" "collector"
##   .. ..$ Balance        : list()
##   .. .. ..- attr(*, "class")= chr  "collector_double" "collector"
##   .. ..$ NumOfProducts  : list()
##   .. .. ..- attr(*, "class")= chr  "collector_integer" "collector"
##   .. ..$ HasCrCard      : list()
##   .. .. ..- attr(*, "class")= chr  "collector_integer" "collector"
##   .. ..$ IsActiveMember : list()
##   .. .. ..- attr(*, "class")= chr  "collector_integer" "collector"
##   .. ..$ EstimatedSalary: list()
##   .. .. ..- attr(*, "class")= chr  "collector_double" "collector"
##   .. ..$ Exited         : list()
##   .. .. ..- attr(*, "class")= chr  "collector_integer" "collector"
##   ..$ default: list()
##   .. ..- attr(*, "class")= chr  "collector_guess" "collector"
##   ..- attr(*, "class")= chr "col_spec"
# Answer 1
table(churn$HasCrCard)
## 
##    0    1 
## 2945 7055
# Answer 1 (dplyr)
# SELECT HasCrCard, COUNT(*) FROM churn GROUP BY HasCrCard; 
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
churn %>% 
  select(HasCrCard) %>%
  group_by(HasCrCard) %>%
  summarize(cnt = n())
## # A tibble: 2 x 2
##   HasCrCard   cnt
##       <int> <int>
## 1         0  2945
## 2         1  7055
# Answer 1
tb <- table(churn$HasCrCard)
tb
## 
##    0    1 
## 2945 7055
l1 <- paste((tb / sum(tb)) * 100, '%' )
l2 <- c('Without Credit Card', 'With Credit Card')
paste(l2, l1)
## [1] "Without Credit Card 29.45 %" "With Credit Card 70.55 %"
pie(tb, init.angle = 90, labels = paste(l2, l1) ,main = 'Credit Card Holder Propotion', col = c('#7998cB', '#89C7B6')  )

barplot(tb,names.arg = c('Without Credit Card', 'With Credit Card'), main = 'Credit Card Holder Propotion', col = c('#7998cB', '#89C7B6') )

# Answer 2
max(churn$EstimatedSalary)
## [1] 199992.5
min(churn$EstimatedSalary)
## [1] 11.58
mean(churn$EstimatedSalary)
## [1] 100090.2
summary(churn$EstimatedSalary)
##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
##     11.58  51000.00 100200.00 100100.00 149400.00 200000.00
# Answer 3
#sort(churn$EstimatedSalary)
churn[order(churn$EstimatedSalary, decreasing = TRUE)[0:3], 'CustomerId' ]
## # A tibble: 3 x 1
##   CustomerId
##        <int>
## 1   15662021
## 2   15634359
## 3   15697270
# Answer 3
# SELECT CustomerId FROM churn ORDER BY EstimatedSalary DESC LIMIT 3
churn %>% 
  arrange(desc(EstimatedSalary)) %>%
  select(CustomerId) %>%
  head(3)
## # A tibble: 3 x 1
##   CustomerId
##        <int>
## 1   15662021
## 2   15634359
## 3   15697270
# Answer 4 
table(churn[churn$EstimatedSalary >= 100000, 'Exited'])
## 
##    0    1 
## 3966 1044
# Answer 4 (dplyr)
churn %>%
  filter(EstimatedSalary >= 100000) %>%
  select(Exited) %>% 
  group_by(Exited) %>%
  summarize(cnt = n())
## # A tibble: 2 x 2
##   Exited   cnt
##    <int> <int>
## 1      0  3966
## 2      1  1044
# Answer 5
churn %>% 
  group_by(Geography, Gender) %>%
  summarise(mean_salary = mean(EstimatedSalary))
## # A tibble: 6 x 3
## # Groups:   Geography [?]
##   Geography Gender mean_salary
##   <chr>     <chr>        <dbl>
## 1 France    Female      99564.
## 2 France    Male       100174.
## 3 Germany   Female     102446.
## 4 Germany   Male        99905.
## 5 Spain     Female     100734.
## 6 Spain     Male        98426.
# Answer 6
churn$RowNumber  <- NULL
churn$CustomerId <- NULL
churn$Surname    <- NULL

head(churn)
## # A tibble: 6 x 11
##   CreditScore Geography Gender   Age Tenure Balance NumOfProducts
##         <int> <chr>     <chr>  <int>  <int>   <dbl>         <int>
## 1         619 France    Female    42      2      0.             1
## 2         608 Spain     Female    41      1  83808.             1
## 3         502 France    Female    42      8 159661.             3
## 4         699 France    Female    39      1      0.             2
## 5         850 Spain     Female    43      2 125511.             1
## 6         645 Spain     Male      44      8 113756.             2
## # ... with 4 more variables: HasCrCard <int>, IsActiveMember <int>,
## #   EstimatedSalary <dbl>, Exited <int>
# Answer 7
str(churn)
## Classes 'tbl_df', 'tbl' and 'data.frame':    10000 obs. of  11 variables:
##  $ CreditScore    : int  619 608 502 699 850 645 822 376 501 684 ...
##  $ Geography      : chr  "France" "Spain" "France" "France" ...
##  $ Gender         : chr  "Female" "Female" "Female" "Female" ...
##  $ Age            : int  42 41 42 39 43 44 50 29 44 27 ...
##  $ Tenure         : int  2 1 8 1 2 8 7 4 4 2 ...
##  $ Balance        : num  0 83808 159661 0 125511 ...
##  $ NumOfProducts  : int  1 1 3 2 1 2 2 4 2 1 ...
##  $ HasCrCard      : int  1 0 1 0 1 1 1 1 0 1 ...
##  $ IsActiveMember : int  1 1 0 0 1 0 1 0 1 1 ...
##  $ EstimatedSalary: num  101349 112543 113932 93827 79084 ...
##  $ Exited         : int  1 0 1 0 0 1 0 1 0 0 ...
##  - attr(*, "spec")=List of 2
##   ..$ cols   :List of 14
##   .. ..$ RowNumber      : list()
##   .. .. ..- attr(*, "class")= chr  "collector_integer" "collector"
##   .. ..$ CustomerId     : list()
##   .. .. ..- attr(*, "class")= chr  "collector_integer" "collector"
##   .. ..$ Surname        : list()
##   .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
##   .. ..$ CreditScore    : list()
##   .. .. ..- attr(*, "class")= chr  "collector_integer" "collector"
##   .. ..$ Geography      : list()
##   .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
##   .. ..$ Gender         : list()
##   .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
##   .. ..$ Age            : list()
##   .. .. ..- attr(*, "class")= chr  "collector_integer" "collector"
##   .. ..$ Tenure         : list()
##   .. .. ..- attr(*, "class")= chr  "collector_integer" "collector"
##   .. ..$ Balance        : list()
##   .. .. ..- attr(*, "class")= chr  "collector_double" "collector"
##   .. ..$ NumOfProducts  : list()
##   .. .. ..- attr(*, "class")= chr  "collector_integer" "collector"
##   .. ..$ HasCrCard      : list()
##   .. .. ..- attr(*, "class")= chr  "collector_integer" "collector"
##   .. ..$ IsActiveMember : list()
##   .. .. ..- attr(*, "class")= chr  "collector_integer" "collector"
##   .. ..$ EstimatedSalary: list()
##   .. .. ..- attr(*, "class")= chr  "collector_double" "collector"
##   .. ..$ Exited         : list()
##   .. .. ..- attr(*, "class")= chr  "collector_integer" "collector"
##   ..$ default: list()
##   .. ..- attr(*, "class")= chr  "collector_guess" "collector"
##   ..- attr(*, "class")= chr "col_spec"
#Geography, Gender,HasCrCard,IsActiveMember,Exited 
churn$Geography <- as.factor(churn$Geography)
churn$Gender            <- as.factor(churn$Gender)
churn$HasCrCard         <- as.factor(churn$HasCrCard)
churn$IsActiveMember    <- as.factor(churn$IsActiveMember)
churn$Exited            <- as.factor(churn$Exited)

str(churn)
## Classes 'tbl_df', 'tbl' and 'data.frame':    10000 obs. of  11 variables:
##  $ CreditScore    : int  619 608 502 699 850 645 822 376 501 684 ...
##  $ Geography      : Factor w/ 3 levels "France","Germany",..: 1 3 1 1 3 3 1 2 1 1 ...
##  $ Gender         : Factor w/ 2 levels "Female","Male": 1 1 1 1 1 2 2 1 2 2 ...
##  $ Age            : int  42 41 42 39 43 44 50 29 44 27 ...
##  $ Tenure         : int  2 1 8 1 2 8 7 4 4 2 ...
##  $ Balance        : num  0 83808 159661 0 125511 ...
##  $ NumOfProducts  : int  1 1 3 2 1 2 2 4 2 1 ...
##  $ HasCrCard      : Factor w/ 2 levels "0","1": 2 1 2 1 2 2 2 2 1 2 ...
##  $ IsActiveMember : Factor w/ 2 levels "0","1": 2 2 1 1 2 1 2 1 2 2 ...
##  $ EstimatedSalary: num  101349 112543 113932 93827 79084 ...
##  $ Exited         : Factor w/ 2 levels "0","1": 2 1 2 1 1 2 1 2 1 1 ...
##  - attr(*, "spec")=List of 2
##   ..$ cols   :List of 14
##   .. ..$ RowNumber      : list()
##   .. .. ..- attr(*, "class")= chr  "collector_integer" "collector"
##   .. ..$ CustomerId     : list()
##   .. .. ..- attr(*, "class")= chr  "collector_integer" "collector"
##   .. ..$ Surname        : list()
##   .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
##   .. ..$ CreditScore    : list()
##   .. .. ..- attr(*, "class")= chr  "collector_integer" "collector"
##   .. ..$ Geography      : list()
##   .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
##   .. ..$ Gender         : list()
##   .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
##   .. ..$ Age            : list()
##   .. .. ..- attr(*, "class")= chr  "collector_integer" "collector"
##   .. ..$ Tenure         : list()
##   .. .. ..- attr(*, "class")= chr  "collector_integer" "collector"
##   .. ..$ Balance        : list()
##   .. .. ..- attr(*, "class")= chr  "collector_double" "collector"
##   .. ..$ NumOfProducts  : list()
##   .. .. ..- attr(*, "class")= chr  "collector_integer" "collector"
##   .. ..$ HasCrCard      : list()
##   .. .. ..- attr(*, "class")= chr  "collector_integer" "collector"
##   .. ..$ IsActiveMember : list()
##   .. .. ..- attr(*, "class")= chr  "collector_integer" "collector"
##   .. ..$ EstimatedSalary: list()
##   .. .. ..- attr(*, "class")= chr  "collector_double" "collector"
##   .. ..$ Exited         : list()
##   .. .. ..- attr(*, "class")= chr  "collector_integer" "collector"
##   ..$ default: list()
##   .. ..- attr(*, "class")= chr  "collector_guess" "collector"
##   ..- attr(*, "class")= chr "col_spec"
# Answer 8
library(rpart)
fit <- rpart(Exited ~ ., data = churn)
plot(fit, margin = 0.1)
text(fit)

# Answer 9
predicted <- predict(fit, churn, type= 'class')
sum(predicted == churn$Exited) / length(churn$Exited)
## [1] 0.8593
table(churn$Exited, predicted)
##    predicted
##        0    1
##   0 7781  182
##   1 1225  812
# Answer 10 
library(ROCR)
## Loading required package: gplots
## 
## Attaching package: 'gplots'
## The following object is masked from 'package:stats':
## 
##     lowess
predictions <- predict(fit, churn, type="prob")
pred.to.roc <- predictions[, 2] 
pred.rocr <- prediction(pred.to.roc, as.factor(churn$Exited)) 
perf.rocr <- performance(pred.rocr, measure = "auc", x.measure = "cutoff") 
perf.tpr.rocr <- performance(pred.rocr, "tpr","fpr") 
plot(perf.tpr.rocr, colorize=T,main=paste("AUC:",(perf.rocr@y.values)))

Clustering

x <- c(0, 0, 1, 1, 1, 1)
y <- c(1, 0, 1, 1, 0, 1)

sum(abs(x - y)) 
## [1] 2
sum((x - y) ^ 2)
## [1] 2
sqrt(sum((x - y) ^ 2))
## [1] 1.414214
dist(rbind(x, y), method = 'euclidean')
##          x
## y 1.414214
dist(rbind(x, y), method = 'minkowski', p = 2)
##          x
## y 1.414214
sum(abs(x - y))
## [1] 2
dist(rbind(x, y), method = 'manhattan')
##   x
## y 2
hc <- hclust(dist(iris[  ,  -5], method = 'euclidean'), method = 'ward.D2')
plot(hc)

fit <- cutree(hc, k= 3)
fit
##   [1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
##  [36] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
##  [71] 2 2 2 2 2 2 2 3 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 3 2 3 3 3
## [106] 3 2 3 3 3 3 3 3 2 2 3 3 3 3 2 3 2 3 2 3 3 2 2 3 3 3 3 3 2 2 3 3 3 2 3
## [141] 3 3 2 3 3 3 2 3 3 2
plot(hc)
rect.hclust(hc, k = 3, border = 'red')

par(mfrow = c(1, 2) )
plot(iris$Petal.Length, iris$Petal.Width, col = iris$Species, main = 'Original')
plot(iris$Petal.Length, iris$Petal.Width, col = fit, main = 'Clustered')

#.Random.seed
set.seed(42)
sample.int(42, 6)
## [1] 39 42 12 33 25 20
sample.int(42, 6)
## [1] 31  6 27 28 18 40
sample.int(42, 6)
## [1] 40 11 19 37 38  5
set.seed(22)
fit <- kmeans(iris[,-5], 3)
fit
## K-means clustering with 3 clusters of sizes 50, 62, 38
## 
## Cluster means:
##   Sepal.Length Sepal.Width Petal.Length Petal.Width
## 1     5.006000    3.428000     1.462000    0.246000
## 2     5.901613    2.748387     4.393548    1.433871
## 3     6.850000    3.073684     5.742105    2.071053
## 
## Clustering vector:
##   [1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
##  [36] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 3 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
##  [71] 2 2 2 2 2 2 2 3 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 3 2 3 3 3
## [106] 3 2 3 3 3 3 3 3 2 2 3 3 3 3 2 3 2 3 2 3 3 2 2 3 3 3 3 3 2 3 3 3 3 2 3
## [141] 3 3 2 3 3 3 2 3 3 2
## 
## Within cluster sum of squares by cluster:
## [1] 15.15100 39.82097 23.87947
##  (between_SS / total_SS =  88.4 %)
## 
## Available components:
## 
## [1] "cluster"      "centers"      "totss"        "withinss"    
## [5] "tot.withinss" "betweenss"    "size"         "iter"        
## [9] "ifault"
fit$cluster
##   [1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
##  [36] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 3 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
##  [71] 2 2 2 2 2 2 2 3 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 3 2 3 3 3
## [106] 3 2 3 3 3 3 3 3 2 2 3 3 3 3 2 3 2 3 2 3 3 2 2 3 3 3 3 3 2 3 3 3 3 2 3
## [141] 3 3 2 3 3 3 2 3 3 2
barplot(t(fit$centers), beside = TRUE,xlab="cluster", ylab="value")


par(mfrow = c(1, 2) )

plot(iris$Petal.Length, iris$Petal.Width, col = iris$Species, main = 'Original')
plot(iris$Petal.Length, iris$Petal.Width, col = fit$cluster, main = 'Clustered')

library(cluster)

set.seed(22)
km  <- kmeans(iris[,-5], 3)

kms <- silhouette(km$cluster, dist(iris[,-5]))
plot(kms)



set.seed(22)
wss <- c()
for (k in 2:10){
  km  <- kmeans(iris[,-5], k)
  wss <- c(wss, km$tot.withinss)
}
plot(2:10, wss, type = 'l')

set.seed(22)
km  <- kmeans(iris[,-5], 3)

kms <- silhouette(km$cluster, dist(iris[,-5]))
plot(kms)


sil <- c()
for (k in 2:10){
  set.seed(22)
  km  <- kmeans(iris[,-5], k)
  kms <- silhouette(km$cluster, dist(iris[,-5]))
  m <- as.matrix(kms)
  sil <- c(sil, mean(m[,3]))
}
plot(2:10, sil, type = 'l')

single_c   <-  hclust(dist(iris[,-5]), method="single")
hc_single  <-  cutree(single_c, k = 3)
kms_single <- silhouette(hc_single, dist(iris[,-5]))
m_single <- as.matrix(kms_single)
#0.512 
mean(m_single[,3])
## [1] 0.5121108
complete_c <-  hclust(dist(iris[,-5]), method="complete")
hc_complete <-  cutree(complete_c, k = 3)
kms_complete <- silhouette(hc_complete, dist(iris[,-5]))
m_complete <- as.matrix(kms_complete)
# 0.513
mean(m_complete[,3])
## [1] 0.5135953
set.seed(42)
km <- kmeans(iris[,-5], 3)
kms <- silhouette(km$cluster, dist(iris[,-5]))
m <- as.matrix(kms)
# 0.552
mean(m[,3])
## [1] 0.552819
par(mfrow=c(1,1))

Customer Segmentation

customer <- read.csv('/tmp/customers.csv')
head(customer)
##   CustomerID  Genre Age Annual_Income Spending_Score
## 1          1   Male  19            15             39
## 2          2   Male  21            15             81
## 3          3 Female  20            16              6
## 4          4 Female  23            16             77
## 5          5 Female  31            17             40
## 6          6 Female  22            17             76
customer <- customer[,c('Annual_Income', 'Spending_Score')]

hc <- hclust(dist(customer), method = 'ward.D2')
plot(hc)

library(cluster)
set.seed(123)
sil <- c()
for (k in 2:10){
  km <- kmeans(customer, center = k)
  kms <- silhouette(km$cluster, dist(customer))
  m <- as.matrix(kms)
  sil <- c(sil, mean(m[,3]))
}
plot(2:10, sil, type= 'l')

set.seed(22)
km <- kmeans(customer, center = 5)
str(customer)
## 'data.frame':    200 obs. of  2 variables:
##  $ Annual_Income : int  15 15 16 16 17 17 18 18 19 19 ...
##  $ Spending_Score: int  39 81 6 77 40 76 6 94 3 72 ...
plot(customer$Annual_Income, customer$Spending_Score, col = km$cluster)

#plot(customer)

Community Detection

library(igraph)
## 
## Attaching package: 'igraph'
## The following objects are masked from 'package:dplyr':
## 
##     as_data_frame, groups, union
## The following objects are masked from 'package:stats':
## 
##     decompose, spectrum
## The following object is masked from 'package:base':
## 
##     union
m <- as.matrix(dist(customer))
#m <- (m < 50)
m2 <- ifelse(m < 30, 1, 0)

G <- graph_from_adjacency_matrix(m2)

wc <- cluster_walktrap(G)

modularity(wc)
## [1] 0.3163856
table(membership(wc))
## 
##  1  2  3  4  5 
## 39 32 22 94 13
group <- membership(wc)

plot(customer$Annual_Income, customer$Spending_Score, col = group)

Cross Validation

dataset <- read.csv('/tmp/Churn_Modelling.csv') 
dataset <- dataset[4:14]
# 資料預處理
dataset$Geography <- as.numeric(factor(dataset$Geography,
    levels = c('France', 'Spain', 'Germany'),
    labels = c(1, 2, 3))) 

dataset$Gender <- as.numeric(factor(dataset$Gender,
           levels = c('Female', 'Male'), 
           labels = c(1, 2)))

dataset$Exited <- as.factor(dataset$Exited)


set.seed(123)
idx <- sample.int(2, nrow(dataset), replace=TRUE,prob=c(0.7,0.3))
training_set <- dataset[idx==1,]
test_set <- dataset[idx==2,]

library(rpart)
fit <- rpart(Exited ~ ., data = training_set)

predicted <- predict(fit, test_set, type = 'class') 
cm <- table(test_set$Exited,predicted)
cm
##    predicted
##        0    1
##   0 2313   54
##   1  332  253
library(caret)
## Loading required package: lattice
## Loading required package: ggplot2
## Warning: replacing previous import by 'plyr::ddply' when loading 'caret'
## Warning: replacing previous import by 'tidyr::%>%' when loading 'broom'
## Warning: replacing previous import by 'tidyr::gather' when loading 'broom'
## Warning: replacing previous import by 'tidyr::spread' when loading 'broom'
## Warning: replacing previous import by 'rlang::!!' when loading 'recipes'
## Warning: replacing previous import by 'rlang::expr' when loading 'recipes'
## Warning: replacing previous import by 'rlang::f_lhs' when loading 'recipes'
## Warning: replacing previous import by 'rlang::f_rhs' when loading 'recipes'
## Warning: replacing previous import by 'rlang::is_empty' when loading
## 'recipes'
## Warning: replacing previous import by 'rlang::lang' when loading 'recipes'
## Warning: replacing previous import by 'rlang::na_dbl' when loading
## 'recipes'
## Warning: replacing previous import by 'rlang::names2' when loading
## 'recipes'
## Warning: replacing previous import by 'rlang::quos' when loading 'recipes'
## Warning: replacing previous import by 'rlang::sym' when loading 'recipes'
## Warning: replacing previous import by 'rlang::syms' when loading 'recipes'
control <- trainControl(method="repeatedcv", number=10, repeats=3)

model <- train(Exited~., data=training_set, method="rpart", trControl=control) 
#predict(model, test_set)

Early Termination

library(rpart)
fit <- rpart(Exited ~ ., data = training_set)
fit$cptable
##           CP nsplit rel error    xerror       xstd
## 1 0.05268595      0 1.0000000 1.0000000 0.02338420
## 2 0.03753444      2 0.8946281 0.8863636 0.02233771
## 3 0.03443526      4 0.8195592 0.8181818 0.02164499
## 4 0.03305785      5 0.7851240 0.8023416 0.02147647
## 5 0.02823691      6 0.7520661 0.7720386 0.02114563
## 6 0.01033058      7 0.7238292 0.7327824 0.02069984
## 7 0.01000000      9 0.7031680 0.7265840 0.02062760
plot(fit, margin= 0.1)
text(fit)

min_split <- which.min(fit$cptable[,"xerror"])
stop_criteria <- fit$cptable[min_split ,"CP"]
prune.fit <- prune(fit, cp= stop_criteria)

par(mfrow=c(1,2))
plot(fit, margin =0.1)
text(fit)
plot(prune.fit, margin =0.1)
text(prune.fit)

library(party)
## Loading required package: grid
## Loading required package: mvtnorm
## Loading required package: modeltools
## Loading required package: stats4
## 
## Attaching package: 'modeltools'
## The following object is masked from 'package:igraph':
## 
##     clusters
## Loading required package: strucchange
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
## Loading required package: sandwich
fit <- ctree(Exited ~ ., data = training_set)
plot(fit)

fit2 <- ctree(Species ~., data = iris)
plot(fit2)

Regularization

# Logistic Regression
fit <- glm(Exited ~ ., data = training_set, family= 'binomial')

library(glmnet)
## Loading required package: Matrix
## Loading required package: foreach
## Loaded glmnet 2.0-16
x <- model.matrix(Exited~.,training_set)
#x
?cv.glmnet
cv.out <- cv.glmnet(x,training_set$Exited,alpha=0,family="binomial") 
plot(cv.out)

Ensemble Method

library(randomForest)
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## The following object is masked from 'package:ggplot2':
## 
##     margin
## The following object is masked from 'package:dplyr':
## 
##     combine
forest <- randomForest(Exited ~., data = training_set, ntree=100,importance=T, proximity=T)

forest.predicted <- predict(forest, test_set, type = "class") 
cm <- table(forest.predicted, test_set$Exited)
cm
##                 
## forest.predicted    0    1
##                0 2278  298
##                1   89  287
#forest.predicted    0    1
#               0 2275  293
#               1   92  292


tree <- rpart(Exited ~., data = training_set)

tree.predicted <- predict(tree, test_set, type = "class") 
cm <- table(tree.predicted, test_set$Exited)
cm
##               
## tree.predicted    0    1
##              0 2313  332
##              1   54  253