## Dependency modeling
## page 112 (Data Mining with R)
rm(list=ls())
##update to newer version R
# install.packages("installr")
# library(installr)
# updateR(GUI=FALSE)
#install.packages("arules", dependencies = TRUE)
library(arules)
library(dplyr)
#library("MASS")
library(tidyverse)
##
## Boston data from MASS package
##
## Boston data: Housing Values in Suburbs of Boston
# ---- Variable description
#?Boston
# crimtab:per capita crime rate by town.
# zn:proportion of residential land zoned for lots over 25,000 sq.ft.
# indus:proportion of non-retail business acres per town.
# chas:Charles River dummy variable (= 1 if tract bounds river; 0 otherwise).
# nox:
# nitrogen oxides concentration (parts per 10 million).
# rm:
# average number of rooms per dwelling.
# age:
# proportion of owner-occupied units built prior to 1940.
# dis:
# weighted mean of distances to five Boston employment centres.
# rad:
# index of accessibility to radial highways.
# tax:
# full-value property-tax rate per $10,000.
# ptratio:
# pupil-teacher ratio by town.
# black:
# 1000(Bk−0.63)^2, where Bk is the proportion of blacks by town.
# lstat
# lower status of the population (percent).
# medv
# median value of owner-occupied homes in $1000s.
data(Boston,package="MASS")
dim(Boston)
## [1] 506 14
head(Boston)
## crim zn indus chas nox rm age dis rad tax ptratio black lstat
## 1 0.00632 18 2.31 0 0.538 6.575 65.2 4.0900 1 296 15.3 396.90 4.98
## 2 0.02731 0 7.07 0 0.469 6.421 78.9 4.9671 2 242 17.8 396.90 9.14
## 3 0.02729 0 7.07 0 0.469 7.185 61.1 4.9671 2 242 17.8 392.83 4.03
## 4 0.03237 0 2.18 0 0.458 6.998 45.8 6.0622 3 222 18.7 394.63 2.94
## 5 0.06905 0 2.18 0 0.458 7.147 54.2 6.0622 3 222 18.7 396.90 5.33
## 6 0.02985 0 2.18 0 0.458 6.430 58.7 6.0622 3 222 18.7 394.12 5.21
## medv
## 1 24.0
## 2 21.6
## 3 34.7
## 4 33.4
## 5 36.2
## 6 28.7
b <- Boston
table(b$chas)
##
## 0 1
## 471 35
b$chas <- factor(b$chas,labels=c("river","noriver"))
b$rad <- factor(b$rad)
table(b$rad)
##
## 1 2 3 4 5 6 7 8 24
## 20 24 38 110 115 26 17 24 132
summary(b$black)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.32 375.38 391.44 356.67 396.23 396.90
hist(b$black)

b$black <- cut(b$black,breaks=4,labels=c(">31.5%","18.5-31.5%","8-18.5%","<8%"))
table(b$black)
##
## >31.5% 18.5-31.5% 8-18.5% <8%
## 31 8 15 452
## Create category variable for continuous
discr <- function(x) cut(x,breaks=4, labels=c("low","medLow","medHigh","high"))
#
b <- select(b, -one_of( c("chas","rad","black") )) %>%
mutate_each( funs(discr) ) %>%
bind_cols(select(b,one_of(c("chas","rad","black"))))
head(b)
## crim zn indus nox rm age dis tax ptratio lstat medv chas
## 1 low low low medLow medHigh medHigh medLow low medLow low medLow river
## 2 low low low low medHigh high medLow low medHigh low medLow river
## 3 low low low low medHigh medHigh medLow low medHigh low medHigh river
## 4 low low low low medHigh medLow medLow low medHigh low medHigh river
## 5 low low low low medHigh medHigh medLow low medHigh low medHigh river
## 6 low low low low medHigh medHigh medLow low medHigh low medHigh river
## rad black
## 1 1 <8%
## 2 2 <8%
## 3 2 <8%
## 4 3 <8%
## 5 3 <8%
## 6 3 <8%
summary(b)
## crim zn indus nox rm
## low :491 low :429 low :202 low :200 low : 8
## medLow : 10 medLow : 32 medLow :112 medLow :182 medLow :234
## medHigh: 2 medHigh: 16 medHigh:165 medHigh:100 medHigh:236
## high : 3 high : 29 high : 27 high : 24 high : 28
##
##
##
## age dis tax ptratio lstat
## low : 51 low :305 low :240 low : 58 low :243
## medLow : 97 medLow :144 medLow :128 medLow : 68 medLow :187
## medHigh: 96 medHigh: 52 medHigh: 1 medHigh:171 medHigh: 57
## high :262 high : 5 high :137 high :209 high : 19
##
##
##
## medv chas rad black
## low :116 river :471 24 :132 >31.5% : 31
## medLow :284 noriver: 35 5 :115 18.5-31.5%: 8
## medHigh: 74 4 :110 8-18.5% : 15
## high : 32 3 : 38 <8% :452
## 6 : 26
## 2 : 24
## (Other): 61
class(b)
## [1] "data.frame"
glimpse(b)
## Rows: 506
## Columns: 14
## $ crim <fct> low, low, low, low, low, low, low, low, low, low, low, low, lo…
## $ zn <fct> low, low, low, low, low, low, low, low, low, low, low, low, lo…
## $ indus <fct> low, low, low, low, low, low, medLow, medLow, medLow, medLow, …
## $ nox <fct> medLow, low, low, low, low, low, medLow, medLow, medLow, medLo…
## $ rm <fct> medHigh, medHigh, medHigh, medHigh, medHigh, medHigh, medLow, …
## $ age <fct> medHigh, high, medHigh, medLow, medHigh, medHigh, medHigh, hig…
## $ dis <fct> medLow, medLow, medLow, medLow, medLow, medLow, medLow, medLow…
## $ tax <fct> low, low, low, low, low, low, low, low, low, low, low, low, lo…
## $ ptratio <fct> medLow, medHigh, medHigh, medHigh, medHigh, medHigh, medLow, m…
## $ lstat <fct> low, low, low, low, low, low, medLow, medLow, high, medLow, me…
## $ medv <fct> medLow, medLow, medHigh, medHigh, medHigh, medHigh, medLow, me…
## $ chas <fct> river, river, river, river, river, river, river, river, river,…
## $ rad <fct> 1, 2, 2, 3, 3, 3, 5, 5, 5, 5, 5, 5, 5, 4, 4, 4, 4, 4, 4, 4, 4,…
## $ black <fct> <8%, <8%, <8%, <8%, <8%, <8%, <8%, <8%, <8%, <8%, <8%, <8%, <8…
## Make transaction object
b <- as(b,"transactions")
class(b)
## [1] "transactions"
## attr(,"package")
## [1] "arules"
summary(b)
## transactions as itemMatrix in sparse format with
## 506 rows (elements/itemsets/transactions) and
## 59 columns (items) and a density of 0.2372881
##
## most frequent items:
## crim=low chas=river black=<8% zn=low dis=low (Other)
## 491 471 452 429 305 4936
##
## element (itemset/transaction) length distribution:
## sizes
## 14
## 506
##
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 14 14 14 14 14 14
##
## includes extended item information - examples:
## labels variables levels
## 1 crim=low crim low
## 2 crim=medLow crim medLow
## 3 crim=medHigh crim medHigh
##
## includes extended transaction information - examples:
## transactionID
## 1 1
## 2 2
## 3 3
itemFrequencyPlot(b, support=0.3,cex.names=0.8)

itemFrequencyPlot(b, support=0.3,cex.names=0.8)
ars <- apriori(b, parameter=list(support=0.025, confidence=0.75))
## Apriori
##
## Parameter specification:
## confidence minval smax arem aval originalSupport maxtime support minlen
## 0.75 0.1 1 none FALSE TRUE 5 0.025 1
## maxlen target ext
## 10 rules TRUE
##
## Algorithmic control:
## filter tree heap memopt load sort verbose
## 0.1 TRUE TRUE FALSE TRUE 2 TRUE
##
## Absolute minimum support count: 12
##
## set item appearances ...[0 item(s)] done [0.00s].
## set transactions ...[59 item(s), 506 transaction(s)] done [0.00s].
## sorting and recoding items ... [52 item(s)] done [0.00s].
## creating transaction tree ... done [0.00s].
## checking subsets of size 1 2 3 4 5 6 7 8 9 10
## done [0.02s].
## writing ... [408638 rule(s)] done [0.02s].
## creating S4 object ... done [0.06s].
ars
## set of 408638 rules
table(discr(Boston$medv))
##
## low medLow medHigh high
## 116 284 74 32
inspect(head(subset(ars, subset=rhs %in% "medv=high"),5,by="confidence"))
## lhs rhs support confidence coverage lift count
## [1] {rm=high,
## ptratio=low} => {medv=high} 0.02964427 1 0.02964427 15.8125 15
## [2] {rm=high,
## ptratio=low,
## lstat=low} => {medv=high} 0.02964427 1 0.02964427 15.8125 15
## [3] {rm=high,
## ptratio=low,
## black=<8%} => {medv=high} 0.02964427 1 0.02964427 15.8125 15
## [4] {crim=low,
## rm=high,
## ptratio=low} => {medv=high} 0.02964427 1 0.02964427 15.8125 15
## [5] {rm=high,
## ptratio=low,
## lstat=low,
## black=<8%} => {medv=high} 0.02964427 1 0.02964427 15.8125 15
inspect(head(subset(ars, subset=rhs %in% "medv=low"),5,by="confidence"))
## lhs rhs support confidence coverage lift count
## [1] {nox=medHigh,
## lstat=medHigh} => {medv=low} 0.05928854 1 0.05928854 4.362069 30
## [2] {nox=medHigh,
## lstat=medHigh,
## rad=24} => {medv=low} 0.05928854 1 0.05928854 4.362069 30
## [3] {nox=medHigh,
## tax=high,
## lstat=medHigh} => {medv=low} 0.05928854 1 0.05928854 4.362069 30
## [4] {indus=medHigh,
## nox=medHigh,
## lstat=medHigh} => {medv=low} 0.05928854 1 0.05928854 4.362069 30
## [5] {nox=medHigh,
## ptratio=high,
## lstat=medHigh} => {medv=low} 0.05928854 1 0.05928854 4.362069 30
inspect(head(subset(ars, subset=lhs %in% "nox=high" | rhs %in% "nox=high"),
5,by="confidence"))
## lhs rhs support confidence coverage lift
## [1] {nox=high} => {indus=medHigh} 0.04743083 1 0.04743083 3.066667
## [2] {nox=high} => {age=high} 0.04743083 1 0.04743083 1.931298
## [3] {nox=high} => {dis=low} 0.04743083 1 0.04743083 1.659016
## [4] {nox=high} => {zn=low} 0.04743083 1 0.04743083 1.179487
## [5] {nox=high} => {crim=low} 0.04743083 1 0.04743083 1.030550
## count
## [1] 24
## [2] 24
## [3] 24
## [4] 24
## [5] 24