library(arules)
library(dplyr)
library(ggplot2)
library(magrittr)
library(caret)
library(corrplot)
library(arulesViz)

load the data into the global environment

data("AdultUCI")
str(AdultUCI)

CRISP-DM Data Understanding

get the factor and integer variable names to pass

factor_vars <- sapply(AdultUCI, function(x) is.factor(x)) %>% which() %>% names()  
int_vars <- AdultUCI[, -which(names(AdultUCI) %in% factor_vars)] %>% names()  

plot histogram for factor variables

factor_histogram <- function(variables) {
  for(var in variables) {
    myVector <- AdultUCI[[var]] 
    g <- ggplot(AdultUCI, aes(x=myVector)) +
            geom_bar(color="black",fill="white") +
            geom_text(stat="count", aes(x=as.numeric(myVector), label=..count..),vjust=0) +
            theme(axis.text.x=element_text(angle=90,hjust=1,vjust=.5)) +
            labs(title=var, x=var)
    print(g)
  }
}
factor_histogram(factor_vars[factor_vars %in% c("workclass","occupation","income")])
## Warning: Removed 2799 rows containing non-finite values (stat_count).

## Warning: Removed 2809 rows containing non-finite values (stat_count).

## Warning: Removed 16281 rows containing non-finite values (stat_count).

find out which variables have NA values

AdultUCI %>% sapply(function(x) sum(is.na(x)))
##            age      workclass         fnlwgt      education  education-num 
##              0           2799              0              0              0 
## marital-status     occupation   relationship           race            sex 
##              0           2809              0              0              0 
##   capital-gain   capital-loss hours-per-week native-country         income 
##              0              0              0            857          16281
AdultUCI %>% sapply(function(x) sum(is.na(x)) > 0) %>% which() %>% names()
## [1] "workclass"      "occupation"     "native-country" "income"

Summary Analysis of EDA for factor variables

  • The following attributes have NA values:“workclass”,“occupation”,“native-country”,“income”
  • There are 18680 rows with NA values
  • 0.9% of the observations have native-country = ‘United States’
  • The race:White makes up 0.86% of the observations
  • Males and Female ratio of the observations are: M-0.67% F-0.33%
  • Workclass:Gov’t workers are 0.13% of the observations, Workclass:Private are 0.69%

plot histogram for integer variables

get_binwidth <- function(v) {  
  my_min <- min(v)
  my_max <- max(v)
  l <- length(unique(v))
  binwidth = ((my_max - my_min)/sqrt(l)) %>% round()
  return(binwidth)
}

integer_histogram <- function(variables) {
  for(var in variables) {
    myVector = AdultUCI[[var]]
    g <- ggplot(AdultUCI, aes(x=myVector)) +
      geom_histogram(binwidth=get_binwidth(myVector), color='black', fill='white') +
      annotate("text",label=round(mean(myVector)), x=round(mean(myVector)),y=0,vjust=0) +
      geom_vline(aes(xintercept=round(mean(myVector))), color='red') +
      labs(title=var, x=var)
    print(g)
  }
}
integer_histogram(int_vars[int_vars %in% c("capital-loss","capital-gain","fnlwgt")])

Summary Analysis of EDA for integer variables

  • variables capital-loss/capital-gain are very right skewed
  • fnlwgt has a wide range of values and is very right skewed

CRISP-DM Data Preparation

find highly correlated income ~ variables

  • create binary income variable first
AdultUCI$income_binary <- ifelse(AdultUCI$income == 'small',0,1)
cor_matrix <- cor(AdultUCI[complete.cases(AdultUCI), sapply(AdultUCI, is.numeric)], method='pearson')
corrplot(cor_matrix,type=c('full'))

+ age, education-num, capital-gain/loss, and hours-per-week have a positive correlation with income

apply the knn algorithm to label the missing values

  • dropped fnlwgt before imputing because it is not correlated to income and has high variance; this should reduce the runtime of the euclidean calculation
AdultUCI2 <- AdultUCI[,!colnames(AdultUCI) %in% c('fnlwgt')]  
preprocess <- preProcess(AdultUCI2, method=c('knnImpute','center','scale'))
predictions <- predict(preprocess, AdultUCI2)

missing values for income_binary were replaced by the values of their closest knn

  • add the predicted income column back to original dataset
  • convert the income_binary column back to a factor with levels c(‘small’,‘large’)
AdultUCI2$income_knn <- predictions$income_binary
AdultUCI2$income_knn_factor <- discretize(AdultUCI2$income_knn, method='fixed', breaks=c(-0.56319,-0.001504,Inf), labels=c('small','large'),ordered_result = T) # breaks c(min, mean) 

compare the original dataset with the imputed one to ensure accuracy of the income transformation

  • find index of NA values and verify that all other (non-NA) values from original dataset and imputed dataset are the same
na_index <- which(AdultUCI$income %>% is.na) # NA values are at index 32562:nrow(AdultUCI)
all(AdultUCI$income[1:32561]==AdultUCI2$income[1:32561])  # returns TRUE, which confirms item for item match
## [1] TRUE

discretize variables

# cut function creates breaks from -infinity, 0, median(capital-gain) > 0, infinity with labels: none (-inf --> 0), low(0 --> median(captial-gain) > 0), high(`low` --> infinity)
AdultUCI2$`capital-gain` %<>% cut(breaks=c(-Inf,0,median(AdultUCI2[['capital-gain']][AdultUCI2[['capital-gain']]>0]),Inf),labels=c('None','Low','High'),ordered_result = T)  
# median function uses AND operation to get only the capital-gain values that are greater than 0

AdultUCI2$`capital-loss` %<>% cut(breaks=c(-Inf,0,median(AdultUCI2[['capital-loss']][AdultUCI2[['capital-loss']]>0]),Inf),labels=c('None','Low','High'),ordered_result = T)  

# create intervals for age: "(16,28]" "(28,37]" "(37,48]" "(48,90]"
AdultUCI2$age %<>% cut(breaks=c(16,28,37,48,90))

# summary(AdultUCI2_df$`hours-per-week`)
AdultUCI2$`hours-per-week` %<>% cut(breaks=c(0,39,45,99),labels=c('part-time','full-time','over-time'),ordered_result=T)

remove pre-impute and irrelevant variables

  • original income variable is no longer needed because of imputed income_knn_factor variable
  • education-num was removed because it is redundant to education
AdultUCI2 %>% sapply(function(x) sum(is.na(x)))
##               age         workclass         education     education-num 
##                 0              2799                 0                 0 
##    marital-status        occupation      relationship              race 
##                 0              2809                 0                 0 
##               sex      capital-gain      capital-loss    hours-per-week 
##                 0                 0                 0                 0 
##    native-country            income     income_binary        income_knn 
##               857             16281             16281                 0 
## income_knn_factor 
##                 0
AdultUCI2 <- AdultUCI2[,!colnames(AdultUCI2) %in% c('income','income_binary','income_knn','education-num')]

remove remaining rows with NA values (NA values are all from categorial variables)

AdultUCI2 <- AdultUCI2[-which(!complete.cases(AdultUCI2)),]

CRISP-DM Modeling

trans <- as(AdultUCI2, "transactions")
ruleset <- apriori(trans, parameter = list(support=.09, confidence=.5, minlen=2),
                   appearance = list(default="lhs", rhs=c("income_knn_factor=large")),
                   control = list(verbose = F))
inspect(head(ruleset,5))
##     lhs                               rhs                          support confidence     lift count
## [1] {sex=Male,                                                                                      
##      hours-per-week=over-time}     => {income_knn_factor=large} 0.09639114  0.5086348 1.851971  4359
## [2] {race=White,                                                                                    
##      sex=Male,                                                                                      
##      hours-per-week=over-time}     => {income_knn_factor=large} 0.09013312  0.5139327 1.871261  4076
## [3] {sex=Male,                                                                                      
##      hours-per-week=over-time,                                                                      
##      native-country=United-States} => {income_knn_factor=large} 0.09077440  0.5130609 1.868087  4105

item frequency plot

itemFrequencyPlot(trans, support=.1, cex.names=.7)

plot(ruleset)

filter ruleset to include lift > 2 and rhs: income_knn_factor=large

ruleset_filter <- subset(ruleset, lift > 1) %>% sort(decreasing = T, by='lift')
plot(ruleset_filter)

inspect(ruleset_filter[1:3])
##     lhs                               rhs                          support confidence     lift count
## [1] {race=White,                                                                                    
##      sex=Male,                                                                                      
##      hours-per-week=over-time}     => {income_knn_factor=large} 0.09013312  0.5139327 1.871261  4076
## [2] {sex=Male,                                                                                      
##      hours-per-week=over-time,                                                                      
##      native-country=United-States} => {income_knn_factor=large} 0.09077440  0.5130609 1.868087  4105
## [3] {sex=Male,                                                                                      
##      hours-per-week=over-time}     => {income_knn_factor=large} 0.09639114  0.5086348 1.851971  4359
plot(ruleset_filter[1:3],method='graph')

Find associations for income=small

ruleset <- apriori(trans, parameter = list(support=.11, confidence=.7, minlen=2),
                   appearance = list(default="lhs", rhs=c("income_knn_factor=small")),
                   control = list(verbose = F))
ruleset_filter <- subset(ruleset, lift > 1) %>% sort(decreasing = T, by='lift')
inspect(ruleset_filter[1:5])
##     lhs                               rhs                         support confidence     lift count
## [1] {age=(16,28],                                                                                  
##      relationship=Own-child}       => {income_knn_factor=small} 0.1114944  0.9933018 1.369401  5042
## [2] {age=(16,28],                                                                                  
##      workclass=Private,                                                                            
##      marital-status=Never-married,                                                                 
##      capital-gain=None,                                                                            
##      capital-loss=None}            => {income_knn_factor=small} 0.1640573  0.9914473 1.366844  7419
## [3] {age=(16,28],                                                                                  
##      workclass=Private,                                                                            
##      marital-status=Never-married,                                                                 
##      capital-gain=None,                                                                            
##      capital-loss=None,                                                                            
##      native-country=United-States} => {income_knn_factor=small} 0.1493521  0.9909038 1.366095  6754
## [4] {age=(16,28],                                                                                  
##      workclass=Private,                                                                            
##      marital-status=Never-married,                                                                 
##      race=White,                                                                                   
##      capital-gain=None,                                                                            
##      capital-loss=None}            => {income_knn_factor=small} 0.1394675  0.9908877 1.366073  6307
## [5] {age=(16,28],                                                                                  
##      workclass=Private,                                                                            
##      marital-status=Never-married,                                                                 
##      race=White,                                                                                   
##      capital-gain=None,                                                                            
##      capital-loss=None,                                                                            
##      native-country=United-States} => {income_knn_factor=small} 0.1296493  0.9903716 1.365361  5863
plot(ruleset_filter[1:10], method='graph')

Analysis of top rules

Income=large

  • Summary: In general, the rules with the highest support (larger circles) and lift (dark red) are for nodes that have the following attributes: sex=male, race=white, income-large, native.country=United States, hours.per.week=over-time. In layman’s terms: white males living in the United States that work over-time, have the largest income in the dataset.
  • The ruleset outlined above has support of 9% in the dataset. This means that the attributes(white, male, over-time, united states) are observed in 9% of the dataset. The confidence level for this association is 51% and can be interpreted as attributes(white, male, over-time, united states) lead to the observation of income-large 51% of the time given the attributes are present.
  • The confidence metric is biased towards the prevelance of the attributes(white, male, united states)–which dominate the dataset. Therefore the lift metric is a better indicator to assess how likely the occurence of large-income is–given the occurence of the attributes(white, male, over-time, united states), while also controlling for how frequent income-large is observed in the dataset. The confidence score for the association sex=male, race=white, native.country=United States, hours.per.week=over-time -> income-large is 1.87 and indicates a strong reciprical relationship.

Income=small

  • Summary: In general, the rules with the highest support and lift are for nodes that have the following attributes: age=(16,28], capital gain/loss=None, native.country=United States, marital.status=Never-married, race=White, worclass=Private. In layman’s terms: White Americans age 16-28 that work in the private sector with low net-worth; and, that have a child have the lowest income in the dataset.
  • The ruleset outlined above has support of 11% in the dataset. The confidence level for this association is 99% which suggests a strong correlation between the antecedent attributes and income-low
  • The lift for this association is 1.36 and supports an recipriocal relationship