CRISP-DM Data Understanding
- Exploratory Data Anlaysis and Summary Statistics
- Create histograms to see the distribution of the variables
get the factor and integer variable names to pass
factor_vars <- sapply(AdultUCI, function(x) is.factor(x)) %>% which() %>% names()
int_vars <- AdultUCI[, -which(names(AdultUCI) %in% factor_vars)] %>% names()
plot histogram for factor variables
factor_histogram <- function(variables) {
for(var in variables) {
myVector <- AdultUCI[[var]]
g <- ggplot(AdultUCI, aes(x=myVector)) +
geom_bar(color="black",fill="white") +
geom_text(stat="count", aes(x=as.numeric(myVector), label=..count..),vjust=0) +
theme(axis.text.x=element_text(angle=90,hjust=1,vjust=.5)) +
labs(title=var, x=var)
print(g)
}
}
factor_histogram(factor_vars[factor_vars %in% c("workclass","occupation","income")])
## Warning: Removed 2799 rows containing non-finite values (stat_count).

## Warning: Removed 2809 rows containing non-finite values (stat_count).

## Warning: Removed 16281 rows containing non-finite values (stat_count).

find out which variables have NA values
AdultUCI %>% sapply(function(x) sum(is.na(x)))
## age workclass fnlwgt education education-num
## 0 2799 0 0 0
## marital-status occupation relationship race sex
## 0 2809 0 0 0
## capital-gain capital-loss hours-per-week native-country income
## 0 0 0 857 16281
AdultUCI %>% sapply(function(x) sum(is.na(x)) > 0) %>% which() %>% names()
## [1] "workclass" "occupation" "native-country" "income"
Summary Analysis of EDA for factor variables
- The following attributes have NA values:“workclass”,“occupation”,“native-country”,“income”
- There are 18680 rows with NA values
- 0.9% of the observations have native-country = ‘United States’
- The race:White makes up 0.86% of the observations
- Males and Female ratio of the observations are: M-0.67% F-0.33%
- Workclass:Gov’t workers are 0.13% of the observations, Workclass:Private are 0.69%
plot histogram for integer variables
get_binwidth <- function(v) {
my_min <- min(v)
my_max <- max(v)
l <- length(unique(v))
binwidth = ((my_max - my_min)/sqrt(l)) %>% round()
return(binwidth)
}
integer_histogram <- function(variables) {
for(var in variables) {
myVector = AdultUCI[[var]]
g <- ggplot(AdultUCI, aes(x=myVector)) +
geom_histogram(binwidth=get_binwidth(myVector), color='black', fill='white') +
annotate("text",label=round(mean(myVector)), x=round(mean(myVector)),y=0,vjust=0) +
geom_vline(aes(xintercept=round(mean(myVector))), color='red') +
labs(title=var, x=var)
print(g)
}
}
integer_histogram(int_vars[int_vars %in% c("capital-loss","capital-gain","fnlwgt")])



Summary Analysis of EDA for integer variables
- variables capital-loss/capital-gain are very right skewed
- fnlwgt has a wide range of values and is very right skewed
CRISP-DM Data Preparation
- clean data: remove/replace NA values
- clean data: remove/replace outliers
- clean data: discretize data
apply the knn algorithm to label the missing values
- dropped fnlwgt before imputing because it is not correlated to income and has high variance; this should reduce the runtime of the euclidean calculation
AdultUCI2 <- AdultUCI[,!colnames(AdultUCI) %in% c('fnlwgt')]
preprocess <- preProcess(AdultUCI2, method=c('knnImpute','center','scale'))
predictions <- predict(preprocess, AdultUCI2)
missing values for income_binary were replaced by the values of their closest knn
- add the predicted income column back to original dataset
- convert the income_binary column back to a factor with levels c(‘small’,‘large’)
AdultUCI2$income_knn <- predictions$income_binary
AdultUCI2$income_knn_factor <- discretize(AdultUCI2$income_knn, method='fixed', breaks=c(-0.56319,-0.001504,Inf), labels=c('small','large'),ordered_result = T) # breaks c(min, mean)
discretize variables
# cut function creates breaks from -infinity, 0, median(capital-gain) > 0, infinity with labels: none (-inf --> 0), low(0 --> median(captial-gain) > 0), high(`low` --> infinity)
AdultUCI2$`capital-gain` %<>% cut(breaks=c(-Inf,0,median(AdultUCI2[['capital-gain']][AdultUCI2[['capital-gain']]>0]),Inf),labels=c('None','Low','High'),ordered_result = T)
# median function uses AND operation to get only the capital-gain values that are greater than 0
AdultUCI2$`capital-loss` %<>% cut(breaks=c(-Inf,0,median(AdultUCI2[['capital-loss']][AdultUCI2[['capital-loss']]>0]),Inf),labels=c('None','Low','High'),ordered_result = T)
# create intervals for age: "(16,28]" "(28,37]" "(37,48]" "(48,90]"
AdultUCI2$age %<>% cut(breaks=c(16,28,37,48,90))
# summary(AdultUCI2_df$`hours-per-week`)
AdultUCI2$`hours-per-week` %<>% cut(breaks=c(0,39,45,99),labels=c('part-time','full-time','over-time'),ordered_result=T)
remove pre-impute and irrelevant variables
- original income variable is no longer needed because of imputed income_knn_factor variable
- education-num was removed because it is redundant to education
AdultUCI2 %>% sapply(function(x) sum(is.na(x)))
## age workclass education education-num
## 0 2799 0 0
## marital-status occupation relationship race
## 0 2809 0 0
## sex capital-gain capital-loss hours-per-week
## 0 0 0 0
## native-country income income_binary income_knn
## 857 16281 16281 0
## income_knn_factor
## 0
AdultUCI2 <- AdultUCI2[,!colnames(AdultUCI2) %in% c('income','income_binary','income_knn','education-num')]
remove remaining rows with NA values (NA values are all from categorial variables)
AdultUCI2 <- AdultUCI2[-which(!complete.cases(AdultUCI2)),]
CRISP-DM Modeling
- convert record data to transaction dtype
- create apriori model
trans <- as(AdultUCI2, "transactions")
ruleset <- apriori(trans, parameter = list(support=.09, confidence=.5, minlen=2),
appearance = list(default="lhs", rhs=c("income_knn_factor=large")),
control = list(verbose = F))
inspect(head(ruleset,5))
## lhs rhs support confidence lift count
## [1] {sex=Male,
## hours-per-week=over-time} => {income_knn_factor=large} 0.09639114 0.5086348 1.851971 4359
## [2] {race=White,
## sex=Male,
## hours-per-week=over-time} => {income_knn_factor=large} 0.09013312 0.5139327 1.871261 4076
## [3] {sex=Male,
## hours-per-week=over-time,
## native-country=United-States} => {income_knn_factor=large} 0.09077440 0.5130609 1.868087 4105
item frequency plot
itemFrequencyPlot(trans, support=.1, cex.names=.7)

plot(ruleset)

filter ruleset to include lift > 2 and rhs: income_knn_factor=large
ruleset_filter <- subset(ruleset, lift > 1) %>% sort(decreasing = T, by='lift')
plot(ruleset_filter)

inspect(ruleset_filter[1:3])
## lhs rhs support confidence lift count
## [1] {race=White,
## sex=Male,
## hours-per-week=over-time} => {income_knn_factor=large} 0.09013312 0.5139327 1.871261 4076
## [2] {sex=Male,
## hours-per-week=over-time,
## native-country=United-States} => {income_knn_factor=large} 0.09077440 0.5130609 1.868087 4105
## [3] {sex=Male,
## hours-per-week=over-time} => {income_knn_factor=large} 0.09639114 0.5086348 1.851971 4359
plot(ruleset_filter[1:3],method='graph')

Find associations for income=small
ruleset <- apriori(trans, parameter = list(support=.11, confidence=.7, minlen=2),
appearance = list(default="lhs", rhs=c("income_knn_factor=small")),
control = list(verbose = F))
ruleset_filter <- subset(ruleset, lift > 1) %>% sort(decreasing = T, by='lift')
inspect(ruleset_filter[1:5])
## lhs rhs support confidence lift count
## [1] {age=(16,28],
## relationship=Own-child} => {income_knn_factor=small} 0.1114944 0.9933018 1.369401 5042
## [2] {age=(16,28],
## workclass=Private,
## marital-status=Never-married,
## capital-gain=None,
## capital-loss=None} => {income_knn_factor=small} 0.1640573 0.9914473 1.366844 7419
## [3] {age=(16,28],
## workclass=Private,
## marital-status=Never-married,
## capital-gain=None,
## capital-loss=None,
## native-country=United-States} => {income_knn_factor=small} 0.1493521 0.9909038 1.366095 6754
## [4] {age=(16,28],
## workclass=Private,
## marital-status=Never-married,
## race=White,
## capital-gain=None,
## capital-loss=None} => {income_knn_factor=small} 0.1394675 0.9908877 1.366073 6307
## [5] {age=(16,28],
## workclass=Private,
## marital-status=Never-married,
## race=White,
## capital-gain=None,
## capital-loss=None,
## native-country=United-States} => {income_knn_factor=small} 0.1296493 0.9903716 1.365361 5863
plot(ruleset_filter[1:10], method='graph')
