Data Preprocessing
dataset <- read.csv('/tmp/Churn_Modelling.csv')
head(dataset)
## RowNumber CustomerId Surname CreditScore Geography Gender Age Tenure
## 1 1 15634602 Hargrave 619 France Female 42 2
## 2 2 15647311 Hill 608 Spain Female 41 1
## 3 3 15619304 Onio 502 France Female 42 8
## 4 4 15701354 Boni 699 France Female 39 1
## 5 5 15737888 Mitchell 850 Spain Female 43 2
## 6 6 15574012 Chu 645 Spain Male 44 8
## Balance NumOfProducts HasCrCard IsActiveMember EstimatedSalary Exited
## 1 0.00 1 1 1 101348.88 1
## 2 83807.86 1 0 1 112542.58 0
## 3 159660.80 3 1 0 113931.57 1
## 4 0.00 2 0 0 93826.63 0
## 5 125510.82 1 1 1 79084.10 0
## 6 113755.78 2 1 0 149756.71 1
dataset <- dataset[,4:14]
dataset$Geography <- as.numeric(factor(dataset$Geography,
levels = c('France', 'Spain', 'Germany'),
labels = c(1, 2, 3)))
dataset$Gender <- as.numeric(factor(dataset$Gender,
levels = c('Female', 'Male'),
labels = c(1, 2)))
dataset$Exited <- as.factor(dataset$Exited)
str(dataset)
## 'data.frame': 10000 obs. of 11 variables:
## $ CreditScore : int 619 608 502 699 850 645 822 376 501 684 ...
## $ Geography : num 1 2 1 1 2 2 1 3 1 1 ...
## $ Gender : num 1 1 1 1 1 2 2 1 2 2 ...
## $ Age : int 42 41 42 39 43 44 50 29 44 27 ...
## $ Tenure : int 2 1 8 1 2 8 7 4 4 2 ...
## $ Balance : num 0 83808 159661 0 125511 ...
## $ NumOfProducts : int 1 1 3 2 1 2 2 4 2 1 ...
## $ HasCrCard : int 1 0 1 0 1 1 1 1 0 1 ...
## $ IsActiveMember : int 1 1 0 0 1 0 1 0 1 1 ...
## $ EstimatedSalary: num 101349 112543 113932 93827 79084 ...
## $ Exited : Factor w/ 2 levels "0","1": 2 1 2 1 1 2 1 2 1 1 ...
set.seed(123)
idx <- sample.int(2, nrow(dataset), replace=TRUE, prob=c(0.7,0.3))
training_set <- dataset[idx==1,]
test_set <- dataset[idx==2,]
Feature Ranking
library(FSelector)
weights <- random.forest.importance(Exited~., training_set, importance.type = 1)
weights
## attr_importance
## CreditScore 2.2411688
## Geography 39.5350434
## Gender 13.0318623
## Age 163.0292160
## Tenure 0.4476493
## Balance 68.2936181
## NumOfProducts 173.1493877
## HasCrCard 2.2607808
## IsActiveMember 75.1710978
## EstimatedSalary 1.8798426
subset <- cutoff.k(weights, 5)
subset
## [1] "NumOfProducts" "Age" "IsActiveMember" "Balance"
## [5] "Geography"
f <- as.simple.formula(subset, "Exited")
print(f)
## Exited ~ NumOfProducts + Age + IsActiveMember + Balance + Geography
## <environment: 0x306a188>
library(caret)
## Loading required package: lattice
## Loading required package: ggplot2
## Warning: replacing previous import by 'plyr::ddply' when loading 'caret'
## Warning: replacing previous import by 'tidyr::%>%' when loading 'broom'
## Warning: replacing previous import by 'tidyr::gather' when loading 'broom'
## Warning: replacing previous import by 'tidyr::spread' when loading 'broom'
## Warning: replacing previous import by 'rlang::!!' when loading 'recipes'
## Warning: replacing previous import by 'rlang::expr' when loading 'recipes'
## Warning: replacing previous import by 'rlang::f_lhs' when loading 'recipes'
## Warning: replacing previous import by 'rlang::f_rhs' when loading 'recipes'
## Warning: replacing previous import by 'rlang::is_empty' when loading
## 'recipes'
## Warning: replacing previous import by 'rlang::lang' when loading 'recipes'
## Warning: replacing previous import by 'rlang::na_dbl' when loading
## 'recipes'
## Warning: replacing previous import by 'rlang::names2' when loading
## 'recipes'
## Warning: replacing previous import by 'rlang::quos' when loading 'recipes'
## Warning: replacing previous import by 'rlang::sym' when loading 'recipes'
## Warning: replacing previous import by 'rlang::syms' when loading 'recipes'
control <- trainControl(method="repeatedcv", number=10, repeats=3)
model <- train(Exited~., data=training_set, method="rpart"
, trControl=control)
importance <- varImp(model, scale=FALSE)
importance
## rpart variable importance
##
## Overall
## NumOfProducts 361.254
## Age 347.343
## IsActiveMember 144.105
## Geography 138.329
## Balance 79.412
## EstimatedSalary 4.889
## CreditScore 4.121
## Tenure 0.000
## HasCrCard 0.000
## Gender 0.000
plot(importance)

#library(rminer)
#? random.forest.importance
Feature Selection
library(rpart)
evaluator <- function(subset) {
k <- 5
set.seed(123)
ind <- sample(5, nrow(dataset), replace = TRUE)
results = sapply(1:k, function(i) {
training_set <- dataset[ind ==i,]
test_set <- dataset[ind !=i,]
fit <- rpart(as.simple.formula(subset, "Exited"), training_set)
error.rate = sum(test_set$Exited != predict(fit, test_set, type="class")) / nrow(test_set)
return(1 - error.rate)
})
return(mean(results))
}
attr.subset <- hill.climbing.search(names(dataset)[!names(dataset) %in% "Exited"], evaluator)
f <- as.simple.formula(attr.subset, "Exited")
print(f)
## Exited ~ CreditScore + Geography + Gender + Age + Balance + NumOfProducts +
## HasCrCard + IsActiveMember + EstimatedSalary
## <environment: 0xe0e2eb0>
names(dataset)[!names(dataset) %in% "Exited"]
## [1] "CreditScore" "Geography" "Gender"
## [4] "Age" "Tenure" "Balance"
## [7] "NumOfProducts" "HasCrCard" "IsActiveMember"
## [10] "EstimatedSalary"
StepAIC
fit <- glm(Exited ~., data= training_set, family='binomial')
summary(fit)
##
## Call:
## glm(formula = Exited ~ ., family = "binomial", data = training_set)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.8847 -0.6715 -0.4741 -0.2803 2.9750
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.984e+00 3.010e-01 -9.914 < 2e-16 ***
## CreditScore -7.994e-04 3.302e-04 -2.421 0.0155 *
## Geography 3.425e-01 3.967e-02 8.632 < 2e-16 ***
## Gender -5.204e-01 6.394e-02 -8.139 3.97e-16 ***
## Age 6.820e-02 2.986e-03 22.840 < 2e-16 ***
## Tenure -1.729e-02 1.104e-02 -1.566 0.1172
## Balance 3.271e-06 5.853e-07 5.588 2.29e-08 ***
## NumOfProducts -1.252e-01 5.523e-02 -2.267 0.0234 *
## HasCrCard -6.631e-02 6.931e-02 -0.957 0.3387
## IsActiveMember -1.003e+00 6.730e-02 -14.910 < 2e-16 ***
## EstimatedSalary 3.948e-07 5.587e-07 0.707 0.4798
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 7169.6 on 7047 degrees of freedom
## Residual deviance: 6192.4 on 7037 degrees of freedom
## AIC: 6214.4
##
## Number of Fisher Scoring iterations: 5
step(fit)
## Start: AIC=6214.42
## Exited ~ CreditScore + Geography + Gender + Age + Tenure + Balance +
## NumOfProducts + HasCrCard + IsActiveMember + EstimatedSalary
##
## Df Deviance AIC
## - EstimatedSalary 1 6192.9 6212.9
## - HasCrCard 1 6193.3 6213.3
## <none> 6192.4 6214.4
## - Tenure 1 6194.9 6214.9
## - NumOfProducts 1 6197.6 6217.6
## - CreditScore 1 6198.3 6218.3
## - Balance 1 6223.7 6243.7
## - Gender 1 6259.2 6279.2
## - Geography 1 6267.8 6287.8
## - IsActiveMember 1 6428.7 6448.7
## - Age 1 6751.1 6771.1
##
## Step: AIC=6212.92
## Exited ~ CreditScore + Geography + Gender + Age + Tenure + Balance +
## NumOfProducts + HasCrCard + IsActiveMember
##
## Df Deviance AIC
## - HasCrCard 1 6193.8 6211.8
## <none> 6192.9 6212.9
## - Tenure 1 6195.3 6213.3
## - NumOfProducts 1 6198.1 6216.1
## - CreditScore 1 6198.8 6216.8
## - Balance 1 6224.4 6242.4
## - Gender 1 6259.6 6277.6
## - Geography 1 6268.2 6286.2
## - IsActiveMember 1 6429.5 6447.5
## - Age 1 6751.4 6769.4
##
## Step: AIC=6211.82
## Exited ~ CreditScore + Geography + Gender + Age + Tenure + Balance +
## NumOfProducts + IsActiveMember
##
## Df Deviance AIC
## <none> 6193.8 6211.8
## - Tenure 1 6196.3 6212.3
## - NumOfProducts 1 6199.0 6215.0
## - CreditScore 1 6199.7 6215.7
## - Balance 1 6225.6 6241.6
## - Gender 1 6260.5 6276.5
## - Geography 1 6268.9 6284.9
## - IsActiveMember 1 6429.9 6445.9
## - Age 1 6752.7 6768.7
##
## Call: glm(formula = Exited ~ CreditScore + Geography + Gender + Age +
## Tenure + Balance + NumOfProducts + IsActiveMember, family = "binomial",
## data = training_set)
##
## Coefficients:
## (Intercept) CreditScore Geography Gender
## -2.991e+00 -8.014e-04 3.419e-01 -5.201e-01
## Age Tenure Balance NumOfProducts
## 6.819e-02 -1.744e-02 3.292e-06 -1.245e-01
## IsActiveMember
## -1.003e+00
##
## Degrees of Freedom: 7047 Total (i.e. Null); 7039 Residual
## Null Deviance: 7170
## Residual Deviance: 6194 AIC: 6212
Good R Coding Style
# Vectorized programming
# good style
c(1,2,3) + 3
## [1] 4 5 6
# bad style
for(i in seq_along(c(1,2,3))){
print(i + 3)
}
## [1] 4
## [1] 5
## [1] 6
# Use built in function
# good style
sum(1:100)
## [1] 5050
# bad style
s <- 0
for (i in 1:100){
s <- s+ i
}
s
## [1] 5050
# preallocating memory
# good style
a <- c('David', 'Marry', 'John')
b <- c('', '' ,'')
for(i in seq_along(a) ){
b[i] <- paste('Hello',a[i])
}
b
## [1] "Hello David" "Hello Marry" "Hello John"
# bad style
a <- c('David', 'Marry', 'John')
b <- c()
for(i in a){
b <- c(b, paste('Hello',i) )
}
b
## [1] "Hello David" "Hello Marry" "Hello John"
# use apply function instead of for loop
## best method
paste('Hello', a)
## [1] "Hello David" "Hello Marry" "Hello John"
## second best method
sapply(a, function(i) paste('Hello',i) )
## David Marry John
## "Hello David" "Hello Marry" "Hello John"
## worst style
a <- c('David', 'Marry', 'John')
b <- c()
for(i in a){
b <- c(b, paste('Hello',i) )
}
b
## [1] "Hello David" "Hello Marry" "Hello John"
Iterative Programming v.s. Functional Programming
a <- c(1,3,4,6,2,5,8)
b <- c()
for(i in a){
#print(i)
if (i %% 2 == 0){
b <- c(b, i)
}
}
b
## [1] 4 6 2 8
s <- sum(b)
s
## [1] 20
a <- c(1,3,4,6,2,5,8)
s <- sum(a[a %% 2 == 0])
s
## [1] 20
sparkR
if (nchar(Sys.getenv("SPARK_HOME")) < 1) {
Sys.setenv(SPARK_HOME = "/usr/local/spark") }
library(SparkR, lib.loc = c(file.path(Sys.getenv("SPARK_HOME"), "R", "lib")))
?glm
sparkR.session(master = "local")
lvr_prices <- read.csv('/tmp/lvr_prices.csv')
head(lvr_prices)
str(lvr_prices)
class(lvr_prices)
hist(lvr_prices$total_price)
hist(log(lvr_prices$total_price))
#?log
lvr_data <- as.DataFrame(lvr_prices)
lvr_data
head(lvr_data)
#showDF(lvr_data)
class(lvr_data)
printSchema(lvr_data)
head(select(lvr_data, lvr_data$area, lvr_data$total_price))
head(filter(lvr_data, lvr_data$area == '大安區'))
a <- select(lvr_data, lvr_data$area, lvr_data$total_price)
b <- filter(a, a$area == '大安區')
class(b)
head(b)
library(dplyr)
lvr_prices %>%
dplyr::select(address, area, total_price) %>%
dplyr::filter(area == '大安區') %>%
head()
lvr_data %>%
SparkR::select(lvr_data$address, lvr_data$area, lvr_data$total_price) %>%
SparkR::filter(lvr_data$area == '大安區') %>%
head()
lvr_data %>%
SparkR::groupBy(lvr_data$area) %>%
SparkR::summarize(price_sum = sum(lvr_data$total_price)) %>%
head()
library(SparkR, lib.loc = c(file.path(Sys.getenv("SPARK_HOME"), "R", "lib")))
lvr_data$house_age <- (datediff(date_format(lvr_data$trading_ymd, "yyyy-MM-dd"), date_format(lvr_data$finish_ymd, "yyyy-MM-dd")))/ 365
printSchema(lvr_data)
head(lvr_data)
lvr_data$trading_ym <- date_format(lvr_data$trading_ymd, "yyyy-MM-01")
a <- lvr_data %>%
SparkR::groupBy(lvr_data$area, lvr_data$trading_ym) %>%
SparkR::summarize(price_avg = mean(lvr_data$total_price))
mean_df <- head(a)
mean_df
SparkSQL
printSchema(lvr_data)
createOrReplaceTempView(lvr_data, "lvr_data")
lvr_sql <- SparkR::sql("SELECT area, avg(total_price) FROM lvr_data WHERE house_age < 30 group by area")
head(lvr_sql)
Regression
if (nchar(Sys.getenv("SPARK_HOME")) < 1) {
Sys.setenv(SPARK_HOME = "/usr/local/spark") }
library(SparkR, lib.loc = c(file.path(Sys.getenv("SPARK_HOME"), "R", "lib")))
sparkR.session(master = "local")
lvr_prices <- read.csv('/tmp/lvr_prices.csv')
lvr_data <- as.DataFrame(lvr_prices)
fit <- SparkR::glm(total_price ~ building_sqmeter, data = lvr_data, family = 'gaussian')
pred <- summary(fit)
pred
data(iris)
iris.data <- iris[1:100,]
iris.data$Species <- factor(iris.data$Species, labels = c(0,1))
fit <- stats::glm(Species ~., data = iris.data, family=binomial(logit))
predict(fit, iris.data)
iris.data.sp <- as.DataFrame(iris.data)
fit3 <- SparkR::glm(Species ~., data = iris.data.sp, family=binomial(logit))
write.ml(fit3, '/tmp/model2')
model <- read.ml('/tmp/model2')