## Dependency modeling
## page 112 (Data Mining with R)
rm(list=ls())
##update to newer version R
# install.packages("installr")
# library(installr)
# updateR(GUI=FALSE)

#install.packages("arules", dependencies = TRUE)
library(arules)
library(dplyr)
#library("MASS")
library(tidyverse)

##
## Boston data from MASS package
##
## Boston data: Housing Values in Suburbs of Boston
# ---- Variable description
#?Boston

# crimtab:per capita crime rate by town.
# zn:proportion of residential land zoned for lots over 25,000 sq.ft.
# indus:proportion of non-retail business acres per town.
# chas:Charles River dummy variable (= 1 if tract bounds river; 0 otherwise).
# nox:  
#  nitrogen oxides concentration (parts per 10 million).
# rm:   
#  average number of rooms per dwelling.
# age:  
#  proportion of owner-occupied units built prior to 1940.
# dis:  
#  weighted mean of distances to five Boston employment centres.
# rad:  
#  index of accessibility to radial highways.
# tax:  
#  full-value property-tax rate per $10,000.
# ptratio:  
#  pupil-teacher ratio by town.
# black:
#  1000(Bk−0.63)^2, where Bk is the proportion of blacks by town.
# lstat
# lower status of the population (percent).
# medv
# median value of owner-occupied homes in $1000s.

data(Boston,package="MASS")
dim(Boston)
## [1] 506  14
head(Boston)
##      crim zn indus chas   nox    rm  age    dis rad tax ptratio  black lstat
## 1 0.00632 18  2.31    0 0.538 6.575 65.2 4.0900   1 296    15.3 396.90  4.98
## 2 0.02731  0  7.07    0 0.469 6.421 78.9 4.9671   2 242    17.8 396.90  9.14
## 3 0.02729  0  7.07    0 0.469 7.185 61.1 4.9671   2 242    17.8 392.83  4.03
## 4 0.03237  0  2.18    0 0.458 6.998 45.8 6.0622   3 222    18.7 394.63  2.94
## 5 0.06905  0  2.18    0 0.458 7.147 54.2 6.0622   3 222    18.7 396.90  5.33
## 6 0.02985  0  2.18    0 0.458 6.430 58.7 6.0622   3 222    18.7 394.12  5.21
##   medv
## 1 24.0
## 2 21.6
## 3 34.7
## 4 33.4
## 5 36.2
## 6 28.7
b <- Boston
table(b$chas)
## 
##   0   1 
## 471  35
b$chas <- factor(b$chas,labels=c("river","noriver"))
b$rad <- factor(b$rad)
table(b$rad)
## 
##   1   2   3   4   5   6   7   8  24 
##  20  24  38 110 115  26  17  24 132
summary(b$black)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    0.32  375.38  391.44  356.67  396.23  396.90
hist(b$black)

b$black <- cut(b$black,breaks=4,labels=c(">31.5%","18.5-31.5%","8-18.5%","<8%"))
table(b$black)
## 
##     >31.5% 18.5-31.5%    8-18.5%        <8% 
##         31          8         15        452
## Create category variable for continuous
discr <- function(x) cut(x,breaks=4, labels=c("low","medLow","medHigh","high"))
#
b <- select(b,  -one_of( c("chas","rad","black") )) %>% 
    mutate_each( funs(discr) ) %>% 
 bind_cols(select(b,one_of(c("chas","rad","black"))))
head(b)
##   crim  zn indus    nox      rm     age    dis tax ptratio lstat    medv  chas
## 1  low low   low medLow medHigh medHigh medLow low  medLow   low  medLow river
## 2  low low   low    low medHigh    high medLow low medHigh   low  medLow river
## 3  low low   low    low medHigh medHigh medLow low medHigh   low medHigh river
## 4  low low   low    low medHigh  medLow medLow low medHigh   low medHigh river
## 5  low low   low    low medHigh medHigh medLow low medHigh   low medHigh river
## 6  low low   low    low medHigh medHigh medLow low medHigh   low medHigh river
##   rad black
## 1   1   <8%
## 2   2   <8%
## 3   2   <8%
## 4   3   <8%
## 5   3   <8%
## 6   3   <8%
summary(b)
##       crim           zn          indus          nox            rm     
##  low    :491   low    :429   low    :202   low    :200   low    :  8  
##  medLow : 10   medLow : 32   medLow :112   medLow :182   medLow :234  
##  medHigh:  2   medHigh: 16   medHigh:165   medHigh:100   medHigh:236  
##  high   :  3   high   : 29   high   : 27   high   : 24   high   : 28  
##                                                                       
##                                                                       
##                                                                       
##       age           dis           tax         ptratio        lstat    
##  low    : 51   low    :305   low    :240   low    : 58   low    :243  
##  medLow : 97   medLow :144   medLow :128   medLow : 68   medLow :187  
##  medHigh: 96   medHigh: 52   medHigh:  1   medHigh:171   medHigh: 57  
##  high   :262   high   :  5   high   :137   high   :209   high   : 19  
##                                                                       
##                                                                       
##                                                                       
##       medv          chas          rad             black    
##  low    :116   river  :471   24     :132   >31.5%    : 31  
##  medLow :284   noriver: 35   5      :115   18.5-31.5%:  8  
##  medHigh: 74                 4      :110   8-18.5%   : 15  
##  high   : 32                 3      : 38   <8%       :452  
##                              6      : 26                   
##                              2      : 24                   
##                              (Other): 61
class(b)
## [1] "data.frame"
glimpse(b)
## Rows: 506
## Columns: 14
## $ crim    <fct> low, low, low, low, low, low, low, low, low, low, low, low, lo…
## $ zn      <fct> low, low, low, low, low, low, low, low, low, low, low, low, lo…
## $ indus   <fct> low, low, low, low, low, low, medLow, medLow, medLow, medLow, …
## $ nox     <fct> medLow, low, low, low, low, low, medLow, medLow, medLow, medLo…
## $ rm      <fct> medHigh, medHigh, medHigh, medHigh, medHigh, medHigh, medLow, …
## $ age     <fct> medHigh, high, medHigh, medLow, medHigh, medHigh, medHigh, hig…
## $ dis     <fct> medLow, medLow, medLow, medLow, medLow, medLow, medLow, medLow…
## $ tax     <fct> low, low, low, low, low, low, low, low, low, low, low, low, lo…
## $ ptratio <fct> medLow, medHigh, medHigh, medHigh, medHigh, medHigh, medLow, m…
## $ lstat   <fct> low, low, low, low, low, low, medLow, medLow, high, medLow, me…
## $ medv    <fct> medLow, medLow, medHigh, medHigh, medHigh, medHigh, medLow, me…
## $ chas    <fct> river, river, river, river, river, river, river, river, river,…
## $ rad     <fct> 1, 2, 2, 3, 3, 3, 5, 5, 5, 5, 5, 5, 5, 4, 4, 4, 4, 4, 4, 4, 4,…
## $ black   <fct> <8%, <8%, <8%, <8%, <8%, <8%, <8%, <8%, <8%, <8%, <8%, <8%, <8…
## Make transaction object
b <- as(b,"transactions")
class(b)
## [1] "transactions"
## attr(,"package")
## [1] "arules"
summary(b)
## transactions as itemMatrix in sparse format with
##  506 rows (elements/itemsets/transactions) and
##  59 columns (items) and a density of 0.2372881 
## 
## most frequent items:
##   crim=low chas=river  black=<8%     zn=low    dis=low    (Other) 
##        491        471        452        429        305       4936 
## 
## element (itemset/transaction) length distribution:
## sizes
##  14 
## 506 
## 
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##      14      14      14      14      14      14 
## 
## includes extended item information - examples:
##         labels variables  levels
## 1     crim=low      crim     low
## 2  crim=medLow      crim  medLow
## 3 crim=medHigh      crim medHigh
## 
## includes extended transaction information - examples:
##   transactionID
## 1             1
## 2             2
## 3             3
itemFrequencyPlot(b, support=0.3,cex.names=0.8)

itemFrequencyPlot(b, support=0.3,cex.names=0.8)
ars <- apriori(b, parameter=list(support=0.025, confidence=0.75))
## Apriori
## 
## Parameter specification:
##  confidence minval smax arem  aval originalSupport maxtime support minlen
##        0.75    0.1    1 none FALSE            TRUE       5   0.025      1
##  maxlen target  ext
##      10  rules TRUE
## 
## Algorithmic control:
##  filter tree heap memopt load sort verbose
##     0.1 TRUE TRUE  FALSE TRUE    2    TRUE
## 
## Absolute minimum support count: 12 
## 
## set item appearances ...[0 item(s)] done [0.00s].
## set transactions ...[59 item(s), 506 transaction(s)] done [0.00s].
## sorting and recoding items ... [52 item(s)] done [0.00s].
## creating transaction tree ... done [0.00s].
## checking subsets of size 1 2 3 4 5 6 7 8 9 10
##  done [0.02s].
## writing ... [408638 rule(s)] done [0.02s].
## creating S4 object  ... done [0.06s].
ars
## set of 408638 rules
table(discr(Boston$medv))
## 
##     low  medLow medHigh    high 
##     116     284      74      32
inspect(head(subset(ars, subset=rhs %in% "medv=high"),5,by="confidence"))
##     lhs               rhs            support confidence   coverage    lift count
## [1] {rm=high,                                                                   
##      ptratio=low}  => {medv=high} 0.02964427          1 0.02964427 15.8125    15
## [2] {rm=high,                                                                   
##      ptratio=low,                                                               
##      lstat=low}    => {medv=high} 0.02964427          1 0.02964427 15.8125    15
## [3] {rm=high,                                                                   
##      ptratio=low,                                                               
##      black=<8%}    => {medv=high} 0.02964427          1 0.02964427 15.8125    15
## [4] {crim=low,                                                                  
##      rm=high,                                                                   
##      ptratio=low}  => {medv=high} 0.02964427          1 0.02964427 15.8125    15
## [5] {rm=high,                                                                   
##      ptratio=low,                                                               
##      lstat=low,                                                                 
##      black=<8%}    => {medv=high} 0.02964427          1 0.02964427 15.8125    15
inspect(head(subset(ars, subset=rhs %in% "medv=low"),5,by="confidence"))
##     lhs                 rhs           support confidence   coverage     lift count
## [1] {nox=medHigh,                                                                 
##      lstat=medHigh}  => {medv=low} 0.05928854          1 0.05928854 4.362069    30
## [2] {nox=medHigh,                                                                 
##      lstat=medHigh,                                                               
##      rad=24}         => {medv=low} 0.05928854          1 0.05928854 4.362069    30
## [3] {nox=medHigh,                                                                 
##      tax=high,                                                                    
##      lstat=medHigh}  => {medv=low} 0.05928854          1 0.05928854 4.362069    30
## [4] {indus=medHigh,                                                               
##      nox=medHigh,                                                                 
##      lstat=medHigh}  => {medv=low} 0.05928854          1 0.05928854 4.362069    30
## [5] {nox=medHigh,                                                                 
##      ptratio=high,                                                                
##      lstat=medHigh}  => {medv=low} 0.05928854          1 0.05928854 4.362069    30
inspect(head(subset(ars, subset=lhs %in% "nox=high" | rhs %in% "nox=high"),
             5,by="confidence"))
##     lhs           rhs             support    confidence coverage   lift    
## [1] {nox=high} => {indus=medHigh} 0.04743083 1          0.04743083 3.066667
## [2] {nox=high} => {age=high}      0.04743083 1          0.04743083 1.931298
## [3] {nox=high} => {dis=low}       0.04743083 1          0.04743083 1.659016
## [4] {nox=high} => {zn=low}        0.04743083 1          0.04743083 1.179487
## [5] {nox=high} => {crim=low}      0.04743083 1          0.04743083 1.030550
##     count
## [1] 24   
## [2] 24   
## [3] 24   
## [4] 24   
## [5] 24