IST 707 HW 1 - Association Rule Mining

library(arules)
library(dplyr)
library(ggplot2)
library(magrittr)
library(caret)
library(corrplot)
library(arulesViz)

load the data into the global environment

data("AdultUCI")

str(AdultUCI)

CRISP-DM Data Understanding

Exploratory Data Anlaysis and Summary Statistics
Create histograms to see the distribution of the variables

get the factor and integer variable names to pass

factor_vars <- sapply(AdultUCI, function(x) is.factor(x)) %>% which() %>% names()  
int_vars <- AdultUCI[, -which(names(AdultUCI) %in% factor_vars)] %>% names()

plot histogram for factor variables

factor_histogram <- function(variables) {
  for(var in variables) {
    myVector <- AdultUCI[[var]] 
    g <- ggplot(AdultUCI, aes(x=myVector)) +
            geom_bar(color="black",fill="white") +
            geom_text(stat="count", aes(x=as.numeric(myVector), label=..count..),vjust=0) +
            theme(axis.text.x=element_text(angle=90,hjust=1,vjust=.5)) +
            labs(title=var, x=var)
    print(g)
  }
}

factor_histogram(factor_vars[factor_vars %in% c("workclass","occupation","income")])

## Warning: Removed 2799 rows containing non-finite values (stat_count).

## Warning: Removed 2809 rows containing non-finite values (stat_count).

## Warning: Removed 16281 rows containing non-finite values (stat_count).

find out which variables have NA values

AdultUCI %>% sapply(function(x) sum(is.na(x)))

##            age      workclass         fnlwgt      education  education-num 
##              0           2799              0              0              0 
## marital-status     occupation   relationship           race            sex 
##              0           2809              0              0              0 
##   capital-gain   capital-loss hours-per-week native-country         income 
##              0              0              0            857          16281

AdultUCI %>% sapply(function(x) sum(is.na(x)) > 0) %>% which() %>% names()

## [1] "workclass"      "occupation"     "native-country" "income"

Summary Analysis of EDA for factor variables

The following attributes have NA values:“workclass”,“occupation”,“native-country”,“income”
There are 18680 rows with NA values
0.9% of the observations have native-country = ‘United States’
The race:White makes up 0.86% of the observations
Males and Female ratio of the observations are: M-0.67% F-0.33%
Workclass:Gov’t workers are 0.13% of the observations, Workclass:Private are 0.69%

plot histogram for integer variables

get_binwidth <- function(v) {  
  my_min <- min(v)
  my_max <- max(v)
  l <- length(unique(v))
  binwidth = ((my_max - my_min)/sqrt(l)) %>% round()
  return(binwidth)
}

integer_histogram <- function(variables) {
  for(var in variables) {
    myVector = AdultUCI[[var]]
    g <- ggplot(AdultUCI, aes(x=myVector)) +
      geom_histogram(binwidth=get_binwidth(myVector), color='black', fill='white') +
      annotate("text",label=round(mean(myVector)), x=round(mean(myVector)),y=0,vjust=0) +
      geom_vline(aes(xintercept=round(mean(myVector))), color='red') +
      labs(title=var, x=var)
    print(g)
  }
}

integer_histogram(int_vars[int_vars %in% c("capital-loss","capital-gain","fnlwgt")])

Summary Analysis of EDA for integer variables

variables capital-loss/capital-gain are very right skewed
fnlwgt has a wide range of values and is very right skewed

CRISP-DM Data Preparation

clean data: remove/replace NA values
clean data: remove/replace outliers
clean data: discretize data

find highly correlated income ~ variables

create binary income variable first

AdultUCI$income_binary <- ifelse(AdultUCI$income == 'small',0,1)
cor_matrix <- cor(AdultUCI[complete.cases(AdultUCI), sapply(AdultUCI, is.numeric)], method='pearson')
corrplot(cor_matrix,type=c('full'))

+ age, education-num, capital-gain/loss, and hours-per-week have a positive correlation with income

apply the knn algorithm to label the missing values

dropped fnlwgt before imputing because it is not correlated to income and has high variance; this should reduce the runtime of the euclidean calculation

AdultUCI2 <- AdultUCI[,!colnames(AdultUCI) %in% c('fnlwgt')]  
preprocess <- preProcess(AdultUCI2, method=c('knnImpute','center','scale'))
predictions <- predict(preprocess, AdultUCI2)

missing values for income_binary were replaced by the values of their closest knn

add the predicted income column back to original dataset
convert the income_binary column back to a factor with levels c(‘small’,‘large’)

AdultUCI2$income_knn <- predictions$income_binary
AdultUCI2$income_knn_factor <- discretize(AdultUCI2$income_knn, method='fixed', breaks=c(-0.56319,-0.001504,Inf), labels=c('small','large'),ordered_result = T) # breaks c(min, mean)

compare the original dataset with the imputed one to ensure accuracy of the income transformation

find index of NA values and verify that all other (non-NA) values from original dataset and imputed dataset are the same

na_index <- which(AdultUCI$income %>% is.na) # NA values are at index 32562:nrow(AdultUCI)
all(AdultUCI$income[1:32561]==AdultUCI2$income[1:32561])  # returns TRUE, which confirms item for item match

## [1] TRUE

discretize variables

# cut function creates breaks from -infinity, 0, median(capital-gain) > 0, infinity with labels: none (-inf --> 0), low(0 --> median(captial-gain) > 0), high(`low` --> infinity)
AdultUCI2$`capital-gain` %<>% cut(breaks=c(-Inf,0,median(AdultUCI2[['capital-gain']][AdultUCI2[['capital-gain']]>0]),Inf),labels=c('None','Low','High'),ordered_result = T)  
# median function uses AND operation to get only the capital-gain values that are greater than 0

AdultUCI2$`capital-loss` %<>% cut(breaks=c(-Inf,0,median(AdultUCI2[['capital-loss']][AdultUCI2[['capital-loss']]>0]),Inf),labels=c('None','Low','High'),ordered_result = T)  

# create intervals for age: "(16,28]" "(28,37]" "(37,48]" "(48,90]"
AdultUCI2$age %<>% cut(breaks=c(16,28,37,48,90))

# summary(AdultUCI2_df$`hours-per-week`)
AdultUCI2$`hours-per-week` %<>% cut(breaks=c(0,39,45,99),labels=c('part-time','full-time','over-time'),ordered_result=T)

remove pre-impute and irrelevant variables

original income variable is no longer needed because of imputed income_knn_factor variable
education-num was removed because it is redundant to education

AdultUCI2 %>% sapply(function(x) sum(is.na(x)))

##               age         workclass         education     education-num 
##                 0              2799                 0                 0 
##    marital-status        occupation      relationship              race 
##                 0              2809                 0                 0 
##               sex      capital-gain      capital-loss    hours-per-week 
##                 0                 0                 0                 0 
##    native-country            income     income_binary        income_knn 
##               857             16281             16281                 0 
## income_knn_factor 
##                 0

AdultUCI2 <- AdultUCI2[,!colnames(AdultUCI2) %in% c('income','income_binary','income_knn','education-num')]

remove remaining rows with NA values (NA values are all from categorial variables)

AdultUCI2 <- AdultUCI2[-which(!complete.cases(AdultUCI2)),]

CRISP-DM Modeling

convert record data to transaction dtype
create apriori model

trans <- as(AdultUCI2, "transactions")
ruleset <- apriori(trans, parameter = list(support=.09, confidence=.5, minlen=2),
                   appearance = list(default="lhs", rhs=c("income_knn_factor=large")),
                   control = list(verbose = F))
inspect(head(ruleset,5))

##     lhs                               rhs                          support confidence     lift count
## [1] {sex=Male,                                                                                      
##      hours-per-week=over-time}     => {income_knn_factor=large} 0.09639114  0.5086348 1.851971  4359
## [2] {race=White,                                                                                    
##      sex=Male,                                                                                      
##      hours-per-week=over-time}     => {income_knn_factor=large} 0.09013312  0.5139327 1.871261  4076
## [3] {sex=Male,                                                                                      
##      hours-per-week=over-time,                                                                      
##      native-country=United-States} => {income_knn_factor=large} 0.09077440  0.5130609 1.868087  4105

item frequency plot

itemFrequencyPlot(trans, support=.1, cex.names=.7)

plot(ruleset)

filter ruleset to include lift > 2 and rhs: income_knn_factor=large

ruleset_filter <- subset(ruleset, lift > 1) %>% sort(decreasing = T, by='lift')
plot(ruleset_filter)

inspect(ruleset_filter[1:3])

##     lhs                               rhs                          support confidence     lift count
## [1] {race=White,                                                                                    
##      sex=Male,                                                                                      
##      hours-per-week=over-time}     => {income_knn_factor=large} 0.09013312  0.5139327 1.871261  4076
## [2] {sex=Male,                                                                                      
##      hours-per-week=over-time,                                                                      
##      native-country=United-States} => {income_knn_factor=large} 0.09077440  0.5130609 1.868087  4105
## [3] {sex=Male,                                                                                      
##      hours-per-week=over-time}     => {income_knn_factor=large} 0.09639114  0.5086348 1.851971  4359

plot(ruleset_filter[1:3],method='graph')

Find associations for income=small

ruleset <- apriori(trans, parameter = list(support=.11, confidence=.7, minlen=2),
                   appearance = list(default="lhs", rhs=c("income_knn_factor=small")),
                   control = list(verbose = F))
ruleset_filter <- subset(ruleset, lift > 1) %>% sort(decreasing = T, by='lift')
inspect(ruleset_filter[1:5])

##     lhs                               rhs                         support confidence     lift count
## [1] {age=(16,28],                                                                                  
##      relationship=Own-child}       => {income_knn_factor=small} 0.1114944  0.9933018 1.369401  5042
## [2] {age=(16,28],                                                                                  
##      workclass=Private,                                                                            
##      marital-status=Never-married,                                                                 
##      capital-gain=None,                                                                            
##      capital-loss=None}            => {income_knn_factor=small} 0.1640573  0.9914473 1.366844  7419
## [3] {age=(16,28],                                                                                  
##      workclass=Private,                                                                            
##      marital-status=Never-married,                                                                 
##      capital-gain=None,                                                                            
##      capital-loss=None,                                                                            
##      native-country=United-States} => {income_knn_factor=small} 0.1493521  0.9909038 1.366095  6754
## [4] {age=(16,28],                                                                                  
##      workclass=Private,                                                                            
##      marital-status=Never-married,                                                                 
##      race=White,                                                                                   
##      capital-gain=None,                                                                            
##      capital-loss=None}            => {income_knn_factor=small} 0.1394675  0.9908877 1.366073  6307
## [5] {age=(16,28],                                                                                  
##      workclass=Private,                                                                            
##      marital-status=Never-married,                                                                 
##      race=White,                                                                                   
##      capital-gain=None,                                                                            
##      capital-loss=None,                                                                            
##      native-country=United-States} => {income_knn_factor=small} 0.1296493  0.9903716 1.365361  5863

plot(ruleset_filter[1:10], method='graph')

Analysis of top rules

Income=large

Summary: In general, the rules with the highest support (larger circles) and lift (dark red) are for nodes that have the following attributes: sex=male, race=white, income-large, native.country=United States, hours.per.week=over-time. In layman’s terms: white males living in the United States that work over-time, have the largest income in the dataset.
The ruleset outlined above has support of 9% in the dataset. This means that the attributes(white, male, over-time, united states) are observed in 9% of the dataset. The confidence level for this association is 51% and can be interpreted as attributes(white, male, over-time, united states) lead to the observation of income-large 51% of the time given the attributes are present.
The confidence metric is biased towards the prevelance of the attributes(white, male, united states)–which dominate the dataset. Therefore the lift metric is a better indicator to assess how likely the occurence of large-income is–given the occurence of the attributes(white, male, over-time, united states), while also controlling for how frequent income-large is observed in the dataset. The confidence score for the association sex=male, race=white, native.country=United States, hours.per.week=over-time -> income-large is 1.87 and indicates a strong reciprical relationship.

Income=small

Summary: In general, the rules with the highest support and lift are for nodes that have the following attributes: age=(16,28], capital gain/loss=None, native.country=United States, marital.status=Never-married, race=White, worclass=Private. In layman’s terms: White Americans age 16-28 that work in the private sector with low net-worth; and, that have a child have the lowest income in the dataset.
The ruleset outlined above has support of 11% in the dataset. The confidence level for this association is 99% which suggests a strong correlation between the antecedent attributes and income-low
The lift for this association is 1.36 and supports an recipriocal relationship

IST 707 HW 1 - Association Rule Mining

Alex Bailey

2/2/2019

load the data into the global environment

CRISP-DM Data Understanding

get the factor and integer variable names to pass

plot histogram for factor variables

find out which variables have NA values

Summary Analysis of EDA for factor variables

plot histogram for integer variables

Summary Analysis of EDA for integer variables

CRISP-DM Data Preparation

find highly correlated income ~ variables

apply the knn algorithm to label the missing values

missing values for income_binary were replaced by the values of their closest knn

compare the original dataset with the imputed one to ensure accuracy of the income transformation

discretize variables

remove pre-impute and irrelevant variables

remove remaining rows with NA values (NA values are all from categorial variables)

CRISP-DM Modeling

item frequency plot

filter ruleset to include lift > 2 and rhs: income_knn_factor=large

Find associations for income=small

Analysis of top rules

Income=large

Income=small