Data Preprocessing

dataset <- read.csv('/tmp/Churn_Modelling.csv')
head(dataset)
##   RowNumber CustomerId  Surname CreditScore Geography Gender Age Tenure
## 1         1   15634602 Hargrave         619    France Female  42      2
## 2         2   15647311     Hill         608     Spain Female  41      1
## 3         3   15619304     Onio         502    France Female  42      8
## 4         4   15701354     Boni         699    France Female  39      1
## 5         5   15737888 Mitchell         850     Spain Female  43      2
## 6         6   15574012      Chu         645     Spain   Male  44      8
##     Balance NumOfProducts HasCrCard IsActiveMember EstimatedSalary Exited
## 1      0.00             1         1              1       101348.88      1
## 2  83807.86             1         0              1       112542.58      0
## 3 159660.80             3         1              0       113931.57      1
## 4      0.00             2         0              0        93826.63      0
## 5 125510.82             1         1              1        79084.10      0
## 6 113755.78             2         1              0       149756.71      1
dataset <- dataset[,4:14]

dataset$Geography <- as.numeric(factor(dataset$Geography,
levels = c('France', 'Spain', 'Germany'),
labels = c(1, 2, 3)))

dataset$Gender <- as.numeric(factor(dataset$Gender,
levels = c('Female', 'Male'),
labels = c(1, 2)))

dataset$Exited <- as.factor(dataset$Exited)

str(dataset)
## 'data.frame':    10000 obs. of  11 variables:
##  $ CreditScore    : int  619 608 502 699 850 645 822 376 501 684 ...
##  $ Geography      : num  1 2 1 1 2 2 1 3 1 1 ...
##  $ Gender         : num  1 1 1 1 1 2 2 1 2 2 ...
##  $ Age            : int  42 41 42 39 43 44 50 29 44 27 ...
##  $ Tenure         : int  2 1 8 1 2 8 7 4 4 2 ...
##  $ Balance        : num  0 83808 159661 0 125511 ...
##  $ NumOfProducts  : int  1 1 3 2 1 2 2 4 2 1 ...
##  $ HasCrCard      : int  1 0 1 0 1 1 1 1 0 1 ...
##  $ IsActiveMember : int  1 1 0 0 1 0 1 0 1 1 ...
##  $ EstimatedSalary: num  101349 112543 113932 93827 79084 ...
##  $ Exited         : Factor w/ 2 levels "0","1": 2 1 2 1 1 2 1 2 1 1 ...
set.seed(123)
idx <- sample.int(2, nrow(dataset), replace=TRUE, prob=c(0.7,0.3))

training_set <- dataset[idx==1,]
test_set     <- dataset[idx==2,]

Feature Ranking

library(FSelector)
weights <- random.forest.importance(Exited~., training_set, importance.type = 1)
weights
##                 attr_importance
## CreditScore           2.2411688
## Geography            39.5350434
## Gender               13.0318623
## Age                 163.0292160
## Tenure                0.4476493
## Balance              68.2936181
## NumOfProducts       173.1493877
## HasCrCard             2.2607808
## IsActiveMember       75.1710978
## EstimatedSalary       1.8798426
subset <- cutoff.k(weights, 5)
subset
## [1] "NumOfProducts"  "Age"            "IsActiveMember" "Balance"       
## [5] "Geography"
f <- as.simple.formula(subset, "Exited")
print(f)
## Exited ~ NumOfProducts + Age + IsActiveMember + Balance + Geography
## <environment: 0x306a188>
library(caret)
## Loading required package: lattice
## Loading required package: ggplot2
## Warning: replacing previous import by 'plyr::ddply' when loading 'caret'
## Warning: replacing previous import by 'tidyr::%>%' when loading 'broom'
## Warning: replacing previous import by 'tidyr::gather' when loading 'broom'
## Warning: replacing previous import by 'tidyr::spread' when loading 'broom'
## Warning: replacing previous import by 'rlang::!!' when loading 'recipes'
## Warning: replacing previous import by 'rlang::expr' when loading 'recipes'
## Warning: replacing previous import by 'rlang::f_lhs' when loading 'recipes'
## Warning: replacing previous import by 'rlang::f_rhs' when loading 'recipes'
## Warning: replacing previous import by 'rlang::is_empty' when loading
## 'recipes'
## Warning: replacing previous import by 'rlang::lang' when loading 'recipes'
## Warning: replacing previous import by 'rlang::na_dbl' when loading
## 'recipes'
## Warning: replacing previous import by 'rlang::names2' when loading
## 'recipes'
## Warning: replacing previous import by 'rlang::quos' when loading 'recipes'
## Warning: replacing previous import by 'rlang::sym' when loading 'recipes'
## Warning: replacing previous import by 'rlang::syms' when loading 'recipes'
control <- trainControl(method="repeatedcv", number=10, repeats=3)
model <- train(Exited~., data=training_set, method="rpart"
, trControl=control)
importance <- varImp(model, scale=FALSE)
importance
## rpart variable importance
## 
##                 Overall
## NumOfProducts   361.254
## Age             347.343
## IsActiveMember  144.105
## Geography       138.329
## Balance          79.412
## EstimatedSalary   4.889
## CreditScore       4.121
## Tenure            0.000
## HasCrCard         0.000
## Gender            0.000
plot(importance)

#library(rminer)
#? random.forest.importance

Feature Selection

library(rpart)
evaluator <- function(subset) {
  k <- 5
  set.seed(123)
  ind <- sample(5, nrow(dataset), replace = TRUE)
  results = sapply(1:k, function(i) {
    training_set <- dataset[ind ==i,]
    test_set <- dataset[ind !=i,]
    fit <- rpart(as.simple.formula(subset, "Exited"), training_set)
    
  error.rate = sum(test_set$Exited != predict(fit, test_set, type="class")) / nrow(test_set)
return(1 - error.rate)
})
return(mean(results))
}

attr.subset <- hill.climbing.search(names(dataset)[!names(dataset) %in% "Exited"], evaluator)
f <- as.simple.formula(attr.subset, "Exited")
print(f)
## Exited ~ CreditScore + Geography + Gender + Age + Balance + NumOfProducts + 
##     HasCrCard + IsActiveMember + EstimatedSalary
## <environment: 0xe0e2eb0>
names(dataset)[!names(dataset) %in% "Exited"]
##  [1] "CreditScore"     "Geography"       "Gender"         
##  [4] "Age"             "Tenure"          "Balance"        
##  [7] "NumOfProducts"   "HasCrCard"       "IsActiveMember" 
## [10] "EstimatedSalary"

StepAIC

fit <- glm(Exited ~., data= training_set, family='binomial')

summary(fit)
## 
## Call:
## glm(formula = Exited ~ ., family = "binomial", data = training_set)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.8847  -0.6715  -0.4741  -0.2803   2.9750  
## 
## Coefficients:
##                   Estimate Std. Error z value Pr(>|z|)    
## (Intercept)     -2.984e+00  3.010e-01  -9.914  < 2e-16 ***
## CreditScore     -7.994e-04  3.302e-04  -2.421   0.0155 *  
## Geography        3.425e-01  3.967e-02   8.632  < 2e-16 ***
## Gender          -5.204e-01  6.394e-02  -8.139 3.97e-16 ***
## Age              6.820e-02  2.986e-03  22.840  < 2e-16 ***
## Tenure          -1.729e-02  1.104e-02  -1.566   0.1172    
## Balance          3.271e-06  5.853e-07   5.588 2.29e-08 ***
## NumOfProducts   -1.252e-01  5.523e-02  -2.267   0.0234 *  
## HasCrCard       -6.631e-02  6.931e-02  -0.957   0.3387    
## IsActiveMember  -1.003e+00  6.730e-02 -14.910  < 2e-16 ***
## EstimatedSalary  3.948e-07  5.587e-07   0.707   0.4798    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 7169.6  on 7047  degrees of freedom
## Residual deviance: 6192.4  on 7037  degrees of freedom
## AIC: 6214.4
## 
## Number of Fisher Scoring iterations: 5
step(fit)
## Start:  AIC=6214.42
## Exited ~ CreditScore + Geography + Gender + Age + Tenure + Balance + 
##     NumOfProducts + HasCrCard + IsActiveMember + EstimatedSalary
## 
##                   Df Deviance    AIC
## - EstimatedSalary  1   6192.9 6212.9
## - HasCrCard        1   6193.3 6213.3
## <none>                 6192.4 6214.4
## - Tenure           1   6194.9 6214.9
## - NumOfProducts    1   6197.6 6217.6
## - CreditScore      1   6198.3 6218.3
## - Balance          1   6223.7 6243.7
## - Gender           1   6259.2 6279.2
## - Geography        1   6267.8 6287.8
## - IsActiveMember   1   6428.7 6448.7
## - Age              1   6751.1 6771.1
## 
## Step:  AIC=6212.92
## Exited ~ CreditScore + Geography + Gender + Age + Tenure + Balance + 
##     NumOfProducts + HasCrCard + IsActiveMember
## 
##                  Df Deviance    AIC
## - HasCrCard       1   6193.8 6211.8
## <none>                6192.9 6212.9
## - Tenure          1   6195.3 6213.3
## - NumOfProducts   1   6198.1 6216.1
## - CreditScore     1   6198.8 6216.8
## - Balance         1   6224.4 6242.4
## - Gender          1   6259.6 6277.6
## - Geography       1   6268.2 6286.2
## - IsActiveMember  1   6429.5 6447.5
## - Age             1   6751.4 6769.4
## 
## Step:  AIC=6211.82
## Exited ~ CreditScore + Geography + Gender + Age + Tenure + Balance + 
##     NumOfProducts + IsActiveMember
## 
##                  Df Deviance    AIC
## <none>                6193.8 6211.8
## - Tenure          1   6196.3 6212.3
## - NumOfProducts   1   6199.0 6215.0
## - CreditScore     1   6199.7 6215.7
## - Balance         1   6225.6 6241.6
## - Gender          1   6260.5 6276.5
## - Geography       1   6268.9 6284.9
## - IsActiveMember  1   6429.9 6445.9
## - Age             1   6752.7 6768.7
## 
## Call:  glm(formula = Exited ~ CreditScore + Geography + Gender + Age + 
##     Tenure + Balance + NumOfProducts + IsActiveMember, family = "binomial", 
##     data = training_set)
## 
## Coefficients:
##    (Intercept)     CreditScore       Geography          Gender  
##     -2.991e+00      -8.014e-04       3.419e-01      -5.201e-01  
##            Age          Tenure         Balance   NumOfProducts  
##      6.819e-02      -1.744e-02       3.292e-06      -1.245e-01  
## IsActiveMember  
##     -1.003e+00  
## 
## Degrees of Freedom: 7047 Total (i.e. Null);  7039 Residual
## Null Deviance:       7170 
## Residual Deviance: 6194  AIC: 6212

Good R Coding Style

# Vectorized programming
# good style
c(1,2,3) + 3
## [1] 4 5 6
# bad style
for(i in seq_along(c(1,2,3))){
  print(i + 3)
}
## [1] 4
## [1] 5
## [1] 6
# Use built in function
# good style
sum(1:100)
## [1] 5050
# bad style
s <- 0
for (i in 1:100){
  s <- s+ i
}
s
## [1] 5050
# preallocating memory
# good style
a <- c('David', 'Marry', 'John')
b <- c('', '' ,'')
for(i in seq_along(a) ){
  b[i] <- paste('Hello',a[i])
}
b
## [1] "Hello David" "Hello Marry" "Hello John"
# bad style
a <- c('David', 'Marry', 'John')
b <- c()
for(i in a){
  b <- c(b, paste('Hello',i) )
}
b
## [1] "Hello David" "Hello Marry" "Hello John"
# use apply function instead of for loop

## best method
paste('Hello', a)
## [1] "Hello David" "Hello Marry" "Hello John"
## second best method
sapply(a, function(i) paste('Hello',i) )
##         David         Marry          John 
## "Hello David" "Hello Marry"  "Hello John"
## worst style
a <- c('David', 'Marry', 'John')
b <- c()
for(i in a){
  b <- c(b, paste('Hello',i) )
}
b
## [1] "Hello David" "Hello Marry" "Hello John"

Iterative Programming v.s. Functional Programming

a <- c(1,3,4,6,2,5,8)

b <- c()
for(i in a){
  #print(i)
  if (i %% 2 == 0){
    b <- c(b, i)
  }
}
b
## [1] 4 6 2 8
s <- sum(b)
s
## [1] 20
a <- c(1,3,4,6,2,5,8)
s <- sum(a[a %% 2 == 0]) 
s
## [1] 20

sparkR

if (nchar(Sys.getenv("SPARK_HOME")) < 1) {
Sys.setenv(SPARK_HOME = "/usr/local/spark") }
library(SparkR, lib.loc = c(file.path(Sys.getenv("SPARK_HOME"), "R", "lib")))
?glm

sparkR.session(master = "local")
lvr_prices <- read.csv('/tmp/lvr_prices.csv')

head(lvr_prices)

str(lvr_prices)

class(lvr_prices)
hist(lvr_prices$total_price)
hist(log(lvr_prices$total_price))
#?log

lvr_data <- as.DataFrame(lvr_prices)
lvr_data

head(lvr_data) 

#showDF(lvr_data)
class(lvr_data)
printSchema(lvr_data)

head(select(lvr_data, lvr_data$area, lvr_data$total_price))

head(filter(lvr_data, lvr_data$area == '大安區'))

a <- select(lvr_data, lvr_data$area, lvr_data$total_price)

b <- filter(a, a$area == '大安區')

class(b)

head(b)

library(dplyr)
lvr_prices %>%
  dplyr::select(address, area, total_price) %>%
  dplyr::filter(area == '大安區') %>%
  head()

lvr_data %>%
  SparkR::select(lvr_data$address, lvr_data$area, lvr_data$total_price) %>%
  SparkR::filter(lvr_data$area == '大安區') %>%
  head()
  
lvr_data %>%
  SparkR::groupBy(lvr_data$area) %>%
  SparkR::summarize(price_sum = sum(lvr_data$total_price)) %>%
  head()

library(SparkR, lib.loc = c(file.path(Sys.getenv("SPARK_HOME"), "R", "lib")))

lvr_data$house_age <- (datediff(date_format(lvr_data$trading_ymd, "yyyy-MM-dd"), date_format(lvr_data$finish_ymd, "yyyy-MM-dd")))/ 365

printSchema(lvr_data)

head(lvr_data)

lvr_data$trading_ym <- date_format(lvr_data$trading_ymd, "yyyy-MM-01")

a <- lvr_data %>%
  SparkR::groupBy(lvr_data$area, lvr_data$trading_ym) %>%
  SparkR::summarize(price_avg = mean(lvr_data$total_price)) 

mean_df <- head(a)
mean_df

SparkSQL


printSchema(lvr_data)
createOrReplaceTempView(lvr_data, "lvr_data")

lvr_sql <- SparkR::sql("SELECT area, avg(total_price) FROM lvr_data WHERE  house_age < 30 group by area")
head(lvr_sql)

Regression

if (nchar(Sys.getenv("SPARK_HOME")) < 1) {
Sys.setenv(SPARK_HOME = "/usr/local/spark") }
library(SparkR, lib.loc = c(file.path(Sys.getenv("SPARK_HOME"), "R", "lib")))

sparkR.session(master = "local")
lvr_prices <- read.csv('/tmp/lvr_prices.csv')
lvr_data <- as.DataFrame(lvr_prices)
fit <- SparkR::glm(total_price ~ building_sqmeter, data = lvr_data, family = 'gaussian')
pred <- summary(fit) 
pred

data(iris)
iris.data <- iris[1:100,]
iris.data$Species <- factor(iris.data$Species, labels = c(0,1))

fit <- stats::glm(Species ~., data = iris.data, family=binomial(logit)) 
predict(fit, iris.data)


iris.data.sp <- as.DataFrame(iris.data)

fit3 <- SparkR::glm(Species ~., data = iris.data.sp, family=binomial(logit)) 

write.ml(fit3, '/tmp/model2')
model <- read.ml('/tmp/model2')