HW1
library(readr)
churn <- read_csv("/tmp/Churn_Modelling.csv")
## Parsed with column specification:
## cols(
## RowNumber = col_integer(),
## CustomerId = col_integer(),
## Surname = col_character(),
## CreditScore = col_integer(),
## Geography = col_character(),
## Gender = col_character(),
## Age = col_integer(),
## Tenure = col_integer(),
## Balance = col_double(),
## NumOfProducts = col_integer(),
## HasCrCard = col_integer(),
## IsActiveMember = col_integer(),
## EstimatedSalary = col_double(),
## Exited = col_integer()
## )
class(churn)
## [1] "tbl_df" "tbl" "data.frame"
str(churn)
## Classes 'tbl_df', 'tbl' and 'data.frame': 10000 obs. of 14 variables:
## $ RowNumber : int 1 2 3 4 5 6 7 8 9 10 ...
## $ CustomerId : int 15634602 15647311 15619304 15701354 15737888 15574012 15592531 15656148 15792365 15592389 ...
## $ Surname : chr "Hargrave" "Hill" "Onio" "Boni" ...
## $ CreditScore : int 619 608 502 699 850 645 822 376 501 684 ...
## $ Geography : chr "France" "Spain" "France" "France" ...
## $ Gender : chr "Female" "Female" "Female" "Female" ...
## $ Age : int 42 41 42 39 43 44 50 29 44 27 ...
## $ Tenure : int 2 1 8 1 2 8 7 4 4 2 ...
## $ Balance : num 0 83808 159661 0 125511 ...
## $ NumOfProducts : int 1 1 3 2 1 2 2 4 2 1 ...
## $ HasCrCard : int 1 0 1 0 1 1 1 1 0 1 ...
## $ IsActiveMember : int 1 1 0 0 1 0 1 0 1 1 ...
## $ EstimatedSalary: num 101349 112543 113932 93827 79084 ...
## $ Exited : int 1 0 1 0 0 1 0 1 0 0 ...
## - attr(*, "spec")=List of 2
## ..$ cols :List of 14
## .. ..$ RowNumber : list()
## .. .. ..- attr(*, "class")= chr "collector_integer" "collector"
## .. ..$ CustomerId : list()
## .. .. ..- attr(*, "class")= chr "collector_integer" "collector"
## .. ..$ Surname : list()
## .. .. ..- attr(*, "class")= chr "collector_character" "collector"
## .. ..$ CreditScore : list()
## .. .. ..- attr(*, "class")= chr "collector_integer" "collector"
## .. ..$ Geography : list()
## .. .. ..- attr(*, "class")= chr "collector_character" "collector"
## .. ..$ Gender : list()
## .. .. ..- attr(*, "class")= chr "collector_character" "collector"
## .. ..$ Age : list()
## .. .. ..- attr(*, "class")= chr "collector_integer" "collector"
## .. ..$ Tenure : list()
## .. .. ..- attr(*, "class")= chr "collector_integer" "collector"
## .. ..$ Balance : list()
## .. .. ..- attr(*, "class")= chr "collector_double" "collector"
## .. ..$ NumOfProducts : list()
## .. .. ..- attr(*, "class")= chr "collector_integer" "collector"
## .. ..$ HasCrCard : list()
## .. .. ..- attr(*, "class")= chr "collector_integer" "collector"
## .. ..$ IsActiveMember : list()
## .. .. ..- attr(*, "class")= chr "collector_integer" "collector"
## .. ..$ EstimatedSalary: list()
## .. .. ..- attr(*, "class")= chr "collector_double" "collector"
## .. ..$ Exited : list()
## .. .. ..- attr(*, "class")= chr "collector_integer" "collector"
## ..$ default: list()
## .. ..- attr(*, "class")= chr "collector_guess" "collector"
## ..- attr(*, "class")= chr "col_spec"
# Answer 1
table(churn$HasCrCard)
##
## 0 1
## 2945 7055
# Answer 1 (dplyr)
# SELECT HasCrCard, COUNT(*) FROM churn GROUP BY HasCrCard;
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
churn %>%
select(HasCrCard) %>%
group_by(HasCrCard) %>%
summarize(cnt = n())
## # A tibble: 2 x 2
## HasCrCard cnt
## <int> <int>
## 1 0 2945
## 2 1 7055
# Answer 1
tb <- table(churn$HasCrCard)
tb
##
## 0 1
## 2945 7055
l1 <- paste((tb / sum(tb)) * 100, '%' )
l2 <- c('Without Credit Card', 'With Credit Card')
paste(l2, l1)
## [1] "Without Credit Card 29.45 %" "With Credit Card 70.55 %"
pie(tb, init.angle = 90, labels = paste(l2, l1) ,main = 'Credit Card Holder Propotion', col = c('#7998cB', '#89C7B6') )

barplot(tb,names.arg = c('Without Credit Card', 'With Credit Card'), main = 'Credit Card Holder Propotion', col = c('#7998cB', '#89C7B6') )

# Answer 2
max(churn$EstimatedSalary)
## [1] 199992.5
min(churn$EstimatedSalary)
## [1] 11.58
mean(churn$EstimatedSalary)
## [1] 100090.2
summary(churn$EstimatedSalary)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 11.58 51000.00 100200.00 100100.00 149400.00 200000.00
# Answer 3
#sort(churn$EstimatedSalary)
churn[order(churn$EstimatedSalary, decreasing = TRUE)[0:3], 'CustomerId' ]
## # A tibble: 3 x 1
## CustomerId
## <int>
## 1 15662021
## 2 15634359
## 3 15697270
# Answer 3
# SELECT CustomerId FROM churn ORDER BY EstimatedSalary DESC LIMIT 3
churn %>%
arrange(desc(EstimatedSalary)) %>%
select(CustomerId) %>%
head(3)
## # A tibble: 3 x 1
## CustomerId
## <int>
## 1 15662021
## 2 15634359
## 3 15697270
# Answer 4
table(churn[churn$EstimatedSalary >= 100000, 'Exited'])
##
## 0 1
## 3966 1044
# Answer 4 (dplyr)
churn %>%
filter(EstimatedSalary >= 100000) %>%
select(Exited) %>%
group_by(Exited) %>%
summarize(cnt = n())
## # A tibble: 2 x 2
## Exited cnt
## <int> <int>
## 1 0 3966
## 2 1 1044
# Answer 5
churn %>%
group_by(Geography, Gender) %>%
summarise(mean_salary = mean(EstimatedSalary))
## # A tibble: 6 x 3
## # Groups: Geography [?]
## Geography Gender mean_salary
## <chr> <chr> <dbl>
## 1 France Female 99564.
## 2 France Male 100174.
## 3 Germany Female 102446.
## 4 Germany Male 99905.
## 5 Spain Female 100734.
## 6 Spain Male 98426.
# Answer 6
churn$RowNumber <- NULL
churn$CustomerId <- NULL
churn$Surname <- NULL
head(churn)
## # A tibble: 6 x 11
## CreditScore Geography Gender Age Tenure Balance NumOfProducts
## <int> <chr> <chr> <int> <int> <dbl> <int>
## 1 619 France Female 42 2 0. 1
## 2 608 Spain Female 41 1 83808. 1
## 3 502 France Female 42 8 159661. 3
## 4 699 France Female 39 1 0. 2
## 5 850 Spain Female 43 2 125511. 1
## 6 645 Spain Male 44 8 113756. 2
## # ... with 4 more variables: HasCrCard <int>, IsActiveMember <int>,
## # EstimatedSalary <dbl>, Exited <int>
# Answer 7
str(churn)
## Classes 'tbl_df', 'tbl' and 'data.frame': 10000 obs. of 11 variables:
## $ CreditScore : int 619 608 502 699 850 645 822 376 501 684 ...
## $ Geography : chr "France" "Spain" "France" "France" ...
## $ Gender : chr "Female" "Female" "Female" "Female" ...
## $ Age : int 42 41 42 39 43 44 50 29 44 27 ...
## $ Tenure : int 2 1 8 1 2 8 7 4 4 2 ...
## $ Balance : num 0 83808 159661 0 125511 ...
## $ NumOfProducts : int 1 1 3 2 1 2 2 4 2 1 ...
## $ HasCrCard : int 1 0 1 0 1 1 1 1 0 1 ...
## $ IsActiveMember : int 1 1 0 0 1 0 1 0 1 1 ...
## $ EstimatedSalary: num 101349 112543 113932 93827 79084 ...
## $ Exited : int 1 0 1 0 0 1 0 1 0 0 ...
## - attr(*, "spec")=List of 2
## ..$ cols :List of 14
## .. ..$ RowNumber : list()
## .. .. ..- attr(*, "class")= chr "collector_integer" "collector"
## .. ..$ CustomerId : list()
## .. .. ..- attr(*, "class")= chr "collector_integer" "collector"
## .. ..$ Surname : list()
## .. .. ..- attr(*, "class")= chr "collector_character" "collector"
## .. ..$ CreditScore : list()
## .. .. ..- attr(*, "class")= chr "collector_integer" "collector"
## .. ..$ Geography : list()
## .. .. ..- attr(*, "class")= chr "collector_character" "collector"
## .. ..$ Gender : list()
## .. .. ..- attr(*, "class")= chr "collector_character" "collector"
## .. ..$ Age : list()
## .. .. ..- attr(*, "class")= chr "collector_integer" "collector"
## .. ..$ Tenure : list()
## .. .. ..- attr(*, "class")= chr "collector_integer" "collector"
## .. ..$ Balance : list()
## .. .. ..- attr(*, "class")= chr "collector_double" "collector"
## .. ..$ NumOfProducts : list()
## .. .. ..- attr(*, "class")= chr "collector_integer" "collector"
## .. ..$ HasCrCard : list()
## .. .. ..- attr(*, "class")= chr "collector_integer" "collector"
## .. ..$ IsActiveMember : list()
## .. .. ..- attr(*, "class")= chr "collector_integer" "collector"
## .. ..$ EstimatedSalary: list()
## .. .. ..- attr(*, "class")= chr "collector_double" "collector"
## .. ..$ Exited : list()
## .. .. ..- attr(*, "class")= chr "collector_integer" "collector"
## ..$ default: list()
## .. ..- attr(*, "class")= chr "collector_guess" "collector"
## ..- attr(*, "class")= chr "col_spec"
#Geography, Gender,HasCrCard,IsActiveMember,Exited
churn$Geography <- as.factor(churn$Geography)
churn$Gender <- as.factor(churn$Gender)
churn$HasCrCard <- as.factor(churn$HasCrCard)
churn$IsActiveMember <- as.factor(churn$IsActiveMember)
churn$Exited <- as.factor(churn$Exited)
str(churn)
## Classes 'tbl_df', 'tbl' and 'data.frame': 10000 obs. of 11 variables:
## $ CreditScore : int 619 608 502 699 850 645 822 376 501 684 ...
## $ Geography : Factor w/ 3 levels "France","Germany",..: 1 3 1 1 3 3 1 2 1 1 ...
## $ Gender : Factor w/ 2 levels "Female","Male": 1 1 1 1 1 2 2 1 2 2 ...
## $ Age : int 42 41 42 39 43 44 50 29 44 27 ...
## $ Tenure : int 2 1 8 1 2 8 7 4 4 2 ...
## $ Balance : num 0 83808 159661 0 125511 ...
## $ NumOfProducts : int 1 1 3 2 1 2 2 4 2 1 ...
## $ HasCrCard : Factor w/ 2 levels "0","1": 2 1 2 1 2 2 2 2 1 2 ...
## $ IsActiveMember : Factor w/ 2 levels "0","1": 2 2 1 1 2 1 2 1 2 2 ...
## $ EstimatedSalary: num 101349 112543 113932 93827 79084 ...
## $ Exited : Factor w/ 2 levels "0","1": 2 1 2 1 1 2 1 2 1 1 ...
## - attr(*, "spec")=List of 2
## ..$ cols :List of 14
## .. ..$ RowNumber : list()
## .. .. ..- attr(*, "class")= chr "collector_integer" "collector"
## .. ..$ CustomerId : list()
## .. .. ..- attr(*, "class")= chr "collector_integer" "collector"
## .. ..$ Surname : list()
## .. .. ..- attr(*, "class")= chr "collector_character" "collector"
## .. ..$ CreditScore : list()
## .. .. ..- attr(*, "class")= chr "collector_integer" "collector"
## .. ..$ Geography : list()
## .. .. ..- attr(*, "class")= chr "collector_character" "collector"
## .. ..$ Gender : list()
## .. .. ..- attr(*, "class")= chr "collector_character" "collector"
## .. ..$ Age : list()
## .. .. ..- attr(*, "class")= chr "collector_integer" "collector"
## .. ..$ Tenure : list()
## .. .. ..- attr(*, "class")= chr "collector_integer" "collector"
## .. ..$ Balance : list()
## .. .. ..- attr(*, "class")= chr "collector_double" "collector"
## .. ..$ NumOfProducts : list()
## .. .. ..- attr(*, "class")= chr "collector_integer" "collector"
## .. ..$ HasCrCard : list()
## .. .. ..- attr(*, "class")= chr "collector_integer" "collector"
## .. ..$ IsActiveMember : list()
## .. .. ..- attr(*, "class")= chr "collector_integer" "collector"
## .. ..$ EstimatedSalary: list()
## .. .. ..- attr(*, "class")= chr "collector_double" "collector"
## .. ..$ Exited : list()
## .. .. ..- attr(*, "class")= chr "collector_integer" "collector"
## ..$ default: list()
## .. ..- attr(*, "class")= chr "collector_guess" "collector"
## ..- attr(*, "class")= chr "col_spec"
# Answer 8
library(rpart)
fit <- rpart(Exited ~ ., data = churn)
plot(fit, margin = 0.1)
text(fit)

# Answer 9
predicted <- predict(fit, churn, type= 'class')
sum(predicted == churn$Exited) / length(churn$Exited)
## [1] 0.8593
table(churn$Exited, predicted)
## predicted
## 0 1
## 0 7781 182
## 1 1225 812
# Answer 10
library(ROCR)
## Loading required package: gplots
##
## Attaching package: 'gplots'
## The following object is masked from 'package:stats':
##
## lowess
predictions <- predict(fit, churn, type="prob")
pred.to.roc <- predictions[, 2]
pred.rocr <- prediction(pred.to.roc, as.factor(churn$Exited))
perf.rocr <- performance(pred.rocr, measure = "auc", x.measure = "cutoff")
perf.tpr.rocr <- performance(pred.rocr, "tpr","fpr")
plot(perf.tpr.rocr, colorize=T,main=paste("AUC:",(perf.rocr@y.values)))

Clustering
x <- c(0, 0, 1, 1, 1, 1)
y <- c(1, 0, 1, 1, 0, 1)
sum(abs(x - y))
## [1] 2
sum((x - y) ^ 2)
## [1] 2
sqrt(sum((x - y) ^ 2))
## [1] 1.414214
dist(rbind(x, y), method = 'euclidean')
## x
## y 1.414214
dist(rbind(x, y), method = 'minkowski', p = 2)
## x
## y 1.414214
sum(abs(x - y))
## [1] 2
dist(rbind(x, y), method = 'manhattan')
## x
## y 2
hc <- hclust(dist(iris[ , -5], method = 'euclidean'), method = 'ward.D2')
plot(hc)
fit <- cutree(hc, k= 3)
fit
## [1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [36] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
## [71] 2 2 2 2 2 2 2 3 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 3 2 3 3 3
## [106] 3 2 3 3 3 3 3 3 2 2 3 3 3 3 2 3 2 3 2 3 3 2 2 3 3 3 3 3 2 2 3 3 3 2 3
## [141] 3 3 2 3 3 3 2 3 3 2
plot(hc)
rect.hclust(hc, k = 3, border = 'red')

par(mfrow = c(1, 2) )
plot(iris$Petal.Length, iris$Petal.Width, col = iris$Species, main = 'Original')
plot(iris$Petal.Length, iris$Petal.Width, col = fit, main = 'Clustered')

#.Random.seed
set.seed(42)
sample.int(42, 6)
## [1] 39 42 12 33 25 20
sample.int(42, 6)
## [1] 31 6 27 28 18 40
sample.int(42, 6)
## [1] 40 11 19 37 38 5
set.seed(22)
fit <- kmeans(iris[,-5], 3)
fit
## K-means clustering with 3 clusters of sizes 50, 62, 38
##
## Cluster means:
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## 1 5.006000 3.428000 1.462000 0.246000
## 2 5.901613 2.748387 4.393548 1.433871
## 3 6.850000 3.073684 5.742105 2.071053
##
## Clustering vector:
## [1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [36] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 3 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
## [71] 2 2 2 2 2 2 2 3 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 3 2 3 3 3
## [106] 3 2 3 3 3 3 3 3 2 2 3 3 3 3 2 3 2 3 2 3 3 2 2 3 3 3 3 3 2 3 3 3 3 2 3
## [141] 3 3 2 3 3 3 2 3 3 2
##
## Within cluster sum of squares by cluster:
## [1] 15.15100 39.82097 23.87947
## (between_SS / total_SS = 88.4 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss"
## [5] "tot.withinss" "betweenss" "size" "iter"
## [9] "ifault"
fit$cluster
## [1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [36] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 3 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
## [71] 2 2 2 2 2 2 2 3 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 3 2 3 3 3
## [106] 3 2 3 3 3 3 3 3 2 2 3 3 3 3 2 3 2 3 2 3 3 2 2 3 3 3 3 3 2 3 3 3 3 2 3
## [141] 3 3 2 3 3 3 2 3 3 2
barplot(t(fit$centers), beside = TRUE,xlab="cluster", ylab="value")
par(mfrow = c(1, 2) )

plot(iris$Petal.Length, iris$Petal.Width, col = iris$Species, main = 'Original')
plot(iris$Petal.Length, iris$Petal.Width, col = fit$cluster, main = 'Clustered')

library(cluster)
set.seed(22)
km <- kmeans(iris[,-5], 3)
kms <- silhouette(km$cluster, dist(iris[,-5]))
plot(kms)
set.seed(22)
wss <- c()
for (k in 2:10){
km <- kmeans(iris[,-5], k)
wss <- c(wss, km$tot.withinss)
}
plot(2:10, wss, type = 'l')

set.seed(22)
km <- kmeans(iris[,-5], 3)
kms <- silhouette(km$cluster, dist(iris[,-5]))
plot(kms)
sil <- c()
for (k in 2:10){
set.seed(22)
km <- kmeans(iris[,-5], k)
kms <- silhouette(km$cluster, dist(iris[,-5]))
m <- as.matrix(kms)
sil <- c(sil, mean(m[,3]))
}
plot(2:10, sil, type = 'l')

single_c <- hclust(dist(iris[,-5]), method="single")
hc_single <- cutree(single_c, k = 3)
kms_single <- silhouette(hc_single, dist(iris[,-5]))
m_single <- as.matrix(kms_single)
#0.512
mean(m_single[,3])
## [1] 0.5121108
complete_c <- hclust(dist(iris[,-5]), method="complete")
hc_complete <- cutree(complete_c, k = 3)
kms_complete <- silhouette(hc_complete, dist(iris[,-5]))
m_complete <- as.matrix(kms_complete)
# 0.513
mean(m_complete[,3])
## [1] 0.5135953
set.seed(42)
km <- kmeans(iris[,-5], 3)
kms <- silhouette(km$cluster, dist(iris[,-5]))
m <- as.matrix(kms)
# 0.552
mean(m[,3])
## [1] 0.552819
par(mfrow=c(1,1))
Customer Segmentation
customer <- read.csv('/tmp/customers.csv')
head(customer)
## CustomerID Genre Age Annual_Income Spending_Score
## 1 1 Male 19 15 39
## 2 2 Male 21 15 81
## 3 3 Female 20 16 6
## 4 4 Female 23 16 77
## 5 5 Female 31 17 40
## 6 6 Female 22 17 76
customer <- customer[,c('Annual_Income', 'Spending_Score')]
hc <- hclust(dist(customer), method = 'ward.D2')
plot(hc)

library(cluster)
set.seed(123)
sil <- c()
for (k in 2:10){
km <- kmeans(customer, center = k)
kms <- silhouette(km$cluster, dist(customer))
m <- as.matrix(kms)
sil <- c(sil, mean(m[,3]))
}
plot(2:10, sil, type= 'l')

set.seed(22)
km <- kmeans(customer, center = 5)
str(customer)
## 'data.frame': 200 obs. of 2 variables:
## $ Annual_Income : int 15 15 16 16 17 17 18 18 19 19 ...
## $ Spending_Score: int 39 81 6 77 40 76 6 94 3 72 ...
plot(customer$Annual_Income, customer$Spending_Score, col = km$cluster)

#plot(customer)
Cross Validation
dataset <- read.csv('/tmp/Churn_Modelling.csv')
dataset <- dataset[4:14]
# 資料預處理
dataset$Geography <- as.numeric(factor(dataset$Geography,
levels = c('France', 'Spain', 'Germany'),
labels = c(1, 2, 3)))
dataset$Gender <- as.numeric(factor(dataset$Gender,
levels = c('Female', 'Male'),
labels = c(1, 2)))
dataset$Exited <- as.factor(dataset$Exited)
set.seed(123)
idx <- sample.int(2, nrow(dataset), replace=TRUE,prob=c(0.7,0.3))
training_set <- dataset[idx==1,]
test_set <- dataset[idx==2,]
library(rpart)
fit <- rpart(Exited ~ ., data = training_set)
predicted <- predict(fit, test_set, type = 'class')
cm <- table(test_set$Exited,predicted)
cm
## predicted
## 0 1
## 0 2313 54
## 1 332 253
library(caret)
## Loading required package: lattice
## Loading required package: ggplot2
## Warning: replacing previous import by 'plyr::ddply' when loading 'caret'
## Warning: replacing previous import by 'tidyr::%>%' when loading 'broom'
## Warning: replacing previous import by 'tidyr::gather' when loading 'broom'
## Warning: replacing previous import by 'tidyr::spread' when loading 'broom'
## Warning: replacing previous import by 'rlang::!!' when loading 'recipes'
## Warning: replacing previous import by 'rlang::expr' when loading 'recipes'
## Warning: replacing previous import by 'rlang::f_lhs' when loading 'recipes'
## Warning: replacing previous import by 'rlang::f_rhs' when loading 'recipes'
## Warning: replacing previous import by 'rlang::is_empty' when loading
## 'recipes'
## Warning: replacing previous import by 'rlang::lang' when loading 'recipes'
## Warning: replacing previous import by 'rlang::na_dbl' when loading
## 'recipes'
## Warning: replacing previous import by 'rlang::names2' when loading
## 'recipes'
## Warning: replacing previous import by 'rlang::quos' when loading 'recipes'
## Warning: replacing previous import by 'rlang::sym' when loading 'recipes'
## Warning: replacing previous import by 'rlang::syms' when loading 'recipes'
control <- trainControl(method="repeatedcv", number=10, repeats=3)
model <- train(Exited~., data=training_set, method="rpart", trControl=control)
#predict(model, test_set)
Early Termination
library(rpart)
fit <- rpart(Exited ~ ., data = training_set)
fit$cptable
## CP nsplit rel error xerror xstd
## 1 0.05268595 0 1.0000000 1.0000000 0.02338420
## 2 0.03753444 2 0.8946281 0.8863636 0.02233771
## 3 0.03443526 4 0.8195592 0.8181818 0.02164499
## 4 0.03305785 5 0.7851240 0.8023416 0.02147647
## 5 0.02823691 6 0.7520661 0.7720386 0.02114563
## 6 0.01033058 7 0.7238292 0.7327824 0.02069984
## 7 0.01000000 9 0.7031680 0.7265840 0.02062760
plot(fit, margin= 0.1)
text(fit)
min_split <- which.min(fit$cptable[,"xerror"])
stop_criteria <- fit$cptable[min_split ,"CP"]
prune.fit <- prune(fit, cp= stop_criteria)
par(mfrow=c(1,2))
plot(fit, margin =0.1)
text(fit)
plot(prune.fit, margin =0.1)
text(prune.fit)

library(party)
## Loading required package: grid
## Loading required package: mvtnorm
## Loading required package: modeltools
## Loading required package: stats4
##
## Attaching package: 'modeltools'
## The following object is masked from 'package:igraph':
##
## clusters
## Loading required package: strucchange
## Loading required package: zoo
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
## Loading required package: sandwich
fit <- ctree(Exited ~ ., data = training_set)
plot(fit)

fit2 <- ctree(Species ~., data = iris)
plot(fit2)

Regularization
# Logistic Regression
fit <- glm(Exited ~ ., data = training_set, family= 'binomial')
library(glmnet)
## Loading required package: Matrix
## Loading required package: foreach
## Loaded glmnet 2.0-16
x <- model.matrix(Exited~.,training_set)
#x
?cv.glmnet
cv.out <- cv.glmnet(x,training_set$Exited,alpha=0,family="binomial")
plot(cv.out)

Ensemble Method
library(randomForest)
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:ggplot2':
##
## margin
## The following object is masked from 'package:dplyr':
##
## combine
forest <- randomForest(Exited ~., data = training_set, ntree=100,importance=T, proximity=T)
forest.predicted <- predict(forest, test_set, type = "class")
cm <- table(forest.predicted, test_set$Exited)
cm
##
## forest.predicted 0 1
## 0 2278 298
## 1 89 287
#forest.predicted 0 1
# 0 2275 293
# 1 92 292
tree <- rpart(Exited ~., data = training_set)
tree.predicted <- predict(tree, test_set, type = "class")
cm <- table(tree.predicted, test_set$Exited)
cm
##
## tree.predicted 0 1
## 0 2313 332
## 1 54 253
Grid Search
library(caret)
tune.gridcart <- expand.grid(maxdepth = 2:10)
control <- trainControl(method="repeatedcv", number=10, repeats=1)
rpartFit2 <- train(Exited~., data=training_set, method = "rpart2",
tuneGrid =tune.gridcart, trControl = control)
#?train