# Similarities in adult behavior
library("arules")
## Loading required package: Matrix
##
## Attaching package: 'arules'
## The following objects are masked from 'package:base':
##
## abbreviate, write
# Load the Adult transactions data set and report its dimensions
# (rows are transactions, columns are items).
data(list = "Adult")
dim(Adult)
## [1] 48842 115
# Overview: most frequent items and the transaction length distribution.
summary(object = Adult)
## transactions as itemMatrix in sparse format with
## 48842 rows (elements/itemsets/transactions) and
## 115 columns (items) and a density of 0.1089939
##
## most frequent items:
## capital-loss=None capital-gain=None
## 46560 44807
## native-country=United-States race=White
## 43832 41762
## workclass=Private (Other)
## 33906 401333
##
## element (itemset/transaction) length distribution:
## sizes
## 9 10 11 12 13
## 19 971 2067 15623 30162
##
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 9.00 12.00 13.00 12.53 13.00 13.00
##
## includes extended item information - examples:
## labels variables levels
## 1 age=Young age Young
## 2 age=Middle-aged age Middle-aged
## 3 age=Senior age Senior
##
## includes extended transaction information - examples:
## transactionID
## 1 1
## 2 2
## 3 3
# Mine frequent itemsets from Adult with Eclat, leaving every mining
# parameter at its default (the defaults in effect are echoed in the output).
data(list = "Adult")
itemsets <- eclat(data = Adult)
## Eclat
##
## parameter specification:
## tidLists support minlen maxlen target ext
## FALSE 0.1 1 10 frequent itemsets FALSE
##
## algorithmic control:
## sparse sort verbose
## 7 -2 TRUE
##
## Absolute minimum support count: 4884
##
## create itemset ...
## set transactions ...[115 item(s), 48842 transaction(s)] done [0.03s].
## sorting and recoding items ... [31 item(s)] done [0.02s].
## creating bit matrix ... [31 row(s), 48842 column(s)] done [0.00s].
## writing ... [2616 set(s)] done [0.02s].
## Creating S4 object ... done [0.00s].
# Rank the frequent itemsets by support, highest first, and show the top ten.
# The sort key and direction are spelled out rather than left to defaults.
itemsets.sorted <- sort(itemsets, by = "support", decreasing = TRUE)
# Clamp the index to the number of itemsets actually mined: a fixed 1:10
# reaches past the end whenever fewer than ten itemsets are available.
inspect(itemsets.sorted[seq_len(min(10L, length(itemsets.sorted)))])
## items support
## [1] {capital-loss=None} 0.9532779
## [2] {capital-gain=None} 0.9173867
## [3] {native-country=United-States} 0.8974243
## [4] {capital-gain=None,capital-loss=None} 0.8706646
## [5] {race=White} 0.8550428
## [6] {capital-loss=None,native-country=United-States} 0.8548380
## [7] {capital-gain=None,native-country=United-States} 0.8219565
## [8] {race=White,capital-loss=None} 0.8136849
## [9] {race=White,native-country=United-States} 0.7881127
## [10] {race=White,capital-gain=None} 0.7817862
# Re-run Eclat keeping only itemsets of at least eight items; all other
# parameters stay at their defaults. This overwrites the earlier `itemsets`.
itemsets <- eclat(data = Adult, parameter = list(minlen = 8))
## Eclat
##
## parameter specification:
## tidLists support minlen maxlen target ext
## FALSE 0.1 8 10 frequent itemsets FALSE
##
## algorithmic control:
## sparse sort verbose
## 7 -2 TRUE
##
## Absolute minimum support count: 4884
##
## create itemset ...
## set transactions ...[115 item(s), 48842 transaction(s)] done [0.03s].
## sorting and recoding items ... [31 item(s)] done [0.02s].
## creating bit matrix ... [31 row(s), 48842 column(s)] done [0.00s].
## writing ... [17 set(s)] done [0.00s].
## Creating S4 object ... done [0.00s].
# Print every itemset from the minlen = 8 run together with its support.
inspect(itemsets)
## items support
## [1] {marital-status=Married-civ-spouse,
## relationship=Husband,
## race=White,
## sex=Male,
## capital-gain=None,
## capital-loss=None,
## hours-per-week=Over-time,
## native-country=United-States} 0.1026575
## [2] {age=Middle-aged,
## workclass=Private,
## marital-status=Married-civ-spouse,
## relationship=Husband,
## race=White,
## sex=Male,
## capital-gain=None,
## capital-loss=None,
## native-country=United-States} 0.1056673
## [3] {age=Middle-aged,
## workclass=Private,
## marital-status=Married-civ-spouse,
## relationship=Husband,
## race=White,
## sex=Male,
## capital-loss=None,
## native-country=United-States} 0.1199992
## [4] {age=Middle-aged,
## workclass=Private,
## marital-status=Married-civ-spouse,
## relationship=Husband,
## race=White,
## sex=Male,
## capital-gain=None,
## native-country=United-States} 0.1140207
## [5] {age=Middle-aged,
## workclass=Private,
## marital-status=Married-civ-spouse,
## relationship=Husband,
## race=White,
## sex=Male,
## capital-gain=None,
## capital-loss=None} 0.1163138
## [6] {age=Middle-aged,
## workclass=Private,
## marital-status=Married-civ-spouse,
## relationship=Husband,
## sex=Male,
## capital-gain=None,
## capital-loss=None,
## native-country=United-States} 0.1136931
## [7] {age=Middle-aged,
## marital-status=Married-civ-spouse,
## relationship=Husband,
## race=White,
## sex=Male,
## capital-gain=None,
## capital-loss=None,
## native-country=United-States} 0.1510995
## [8] {age=Middle-aged,
## workclass=Private,
## marital-status=Married-civ-spouse,
## relationship=Husband,
## race=White,
## capital-gain=None,
## capital-loss=None,
## native-country=United-States} 0.1056877
## [9] {marital-status=Married-civ-spouse,
## relationship=Husband,
## race=White,
## sex=Male,
## capital-gain=None,
## capital-loss=None,
## native-country=United-States,
## income=small} 0.1101716
## [10] {workclass=Private,
## marital-status=Married-civ-spouse,
## relationship=Husband,
## race=White,
## sex=Male,
## capital-loss=None,
## hours-per-week=Full-time,
## native-country=United-States} 0.1085951
## [11] {workclass=Private,
## marital-status=Married-civ-spouse,
## relationship=Husband,
## race=White,
## sex=Male,
## capital-gain=None,
## hours-per-week=Full-time,
## native-country=United-States} 0.1028213
## [12] {workclass=Private,
## marital-status=Married-civ-spouse,
## relationship=Husband,
## race=White,
## sex=Male,
## capital-gain=None,
## capital-loss=None,
## hours-per-week=Full-time} 0.1086155
## [13] {workclass=Private,
## marital-status=Married-civ-spouse,
## relationship=Husband,
## sex=Male,
## capital-gain=None,
## capital-loss=None,
## hours-per-week=Full-time,
## native-country=United-States} 0.1063020
## [14] {marital-status=Married-civ-spouse,
## relationship=Husband,
## race=White,
## sex=Male,
## capital-gain=None,
## capital-loss=None,
## hours-per-week=Full-time,
## native-country=United-States} 0.1429712
## [15] {workclass=Private,
## marital-status=Married-civ-spouse,
## relationship=Husband,
## race=White,
## sex=Male,
## capital-gain=None,
## capital-loss=None,
## native-country=United-States} 0.1769993
## [16] {age=Middle-aged,
## workclass=Private,
## relationship=Husband,
## race=White,
## sex=Male,
## capital-gain=None,
## capital-loss=None,
## native-country=United-States} 0.1058106
## [17] {age=Middle-aged,
## workclass=Private,
## marital-status=Married-civ-spouse,
## race=White,
## sex=Male,
## capital-gain=None,
## capital-loss=None,
## native-country=United-States} 0.1065067
library(arulesNBMiner)
## Loading required package: rJava
# Load the Agrawal example data; the objects Agrawal.db and Agrawal.pat
# used below come from this call.
data(list = "Agrawal")
# Summarise the transaction database.
summary(object = Agrawal.db)
## transactions as itemMatrix in sparse format with
## 20000 rows (elements/itemsets/transactions) and
## 1000 columns (items) and a density of 0.00997795
##
## most frequent items:
## item540 item155 item803 item741 item399 (Other)
## 1848 1477 1332 1295 1264 192343
##
## element (itemset/transaction) length distribution:
## sizes
## 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
## 15 88 204 413 737 1233 1802 2217 2452 2444 2304 1858 1492 1072 706
## 16 17 18 19 20 21 22 23 24 25
## 431 233 138 83 46 19 10 1 1 1
##
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.000 8.000 10.000 9.978 12.000 25.000
##
## includes extended item information - examples:
## labels
## 1 item1
## 2 item2
## 3 item3
# Summarise the itemsets stored in Agrawal.pat (sizes and quality measures).
summary(Agrawal.pat)
## set of 2000 itemsets
##
## most frequent items:
## item399 item475 item756 item594 item293 (Other)
## 29 29 29 28 26 3960
##
## element (itemset/transaction) length distribution:sizes
## 1 2 3 4 5 6
## 702 733 385 134 34 12
##
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.00 1.00 2.00 2.05 3.00 6.00
##
## summary of quality measures:
## pWeights pCorrupts
## Min. :2.100e-08 Min. :0.0000
## 1st Qu.:1.426e-04 1st Qu.:0.2885
## Median :3.431e-04 Median :0.5129
## Mean :5.000e-04 Mean :0.5061
## 3rd Qu.:6.861e-04 3rd Qu.:0.7232
## Max. :3.898e-03 Max. :1.0000
##
## includes transaction ID lists: FALSE
# Estimate the NB-miner parameters from the transaction database, mine
# itemsets with those parameters, then summarise what was found.
mynbparameters <- NBMinerParameters(data = Agrawal.db)
mynbminer <- NBMiner(data = Agrawal.db, parameter = mynbparameters)
summary(object = mynbminer)
## set of 3332 itemsets
##
## most frequent items:
## item540 item615 item258 item594 item293 (Other)
## 69 57 55 50 46 6813
##
## element (itemset/transaction) length distribution:sizes
## 1 2 3 4 5
## 1000 1287 725 259 61
##
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.000 1.000 2.000 2.128 3.000 5.000
##
## summary of quality measures:
## precision
## Min. :0.9901
## 1st Qu.:1.0000
## Median :1.0000
## Mean :0.9997
## 3rd Qu.:1.0000
## Max. :1.0000
##
## includes transaction ID lists: FALSE
library(arules)
# Read the retail basket data straight from its remote location; in
# "basket" format each input line is one transaction.
# NOTE(review): this needs network access and depends on a file in a
# third-party repository staying available.
tr <- read.transactions(
  file = "https://raw.githubusercontent.com/dirkweissenborn/mahout-rbmClassifier/master/core/src/test/resources/retail.dat",
  format = "basket"
)
summary(object = tr)
## transactions as itemMatrix in sparse format with
## 88162 rows (elements/itemsets/transactions) and
## 16470 columns (items) and a density of 0.0006257289
##
## most frequent items:
## 39 48 38 32 41 (Other)
## 50675 42135 15596 15167 14945 770058
##
## element (itemset/transaction) length distribution:
## sizes
## 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
## 3016 5516 6919 7210 6814 6163 5746 5143 4660 4086 3751 3285 2866 2620 2310
## 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30
## 2115 1874 1645 1469 1290 1205 981 887 819 684 586 582 472 480 355
## 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45
## 310 303 272 234 194 136 153 123 115 112 76 66 71 60 50
## 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60
## 44 37 37 33 22 24 21 21 10 11 10 9 11 4 9
## 61 62 63 64 65 66 67 68 71 73 74 76
## 7 4 5 2 2 5 3 3 1 1 1 1
##
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.00 4.00 8.00 10.31 14.00 76.00
##
## includes extended item information - examples:
## labels
## 1 0
## 2 1
## 3 10
# Plot the relative frequency of items whose support is at least 10%.
itemFrequencyPlot(x = tr, support = 0.1)
# Mine association rules with minimum support and confidence of 0.5.
# The parameter names are written in full instead of the abbreviated
# supp / conf forms.
# NOTE(review): at this support level, and with minlen left at 1, the only
# rule reported in the output is {} => {39}, which has an empty left-hand
# side. Consider a lower support and minlen = 2 if real associations are
# wanted.
rules <- apriori(
  data = tr,
  parameter = list(support = 0.5, confidence = 0.5)
)
## Apriori
##
## Parameter specification:
## confidence minval smax arem aval originalSupport maxtime support minlen
## 0.5 0.1 1 none FALSE TRUE 5 0.5 1
## maxlen target ext
## 10 rules FALSE
##
## Algorithmic control:
## filter tree heap memopt load sort verbose
## 0.1 TRUE TRUE FALSE TRUE 2 TRUE
##
## Absolute minimum support count: 44081
##
## set item appearances ...[0 item(s)] done [0.00s].
## set transactions ...[16470 item(s), 88162 transaction(s)] done [0.19s].
## sorting and recoding items ... [1 item(s)] done [0.00s].
## creating transaction tree ... done [0.02s].
## checking subsets of size 1 done [0.00s].
## writing ... [1 rule(s)] done [0.00s].
## creating S4 object ... done [0.02s].
# Summarise the mined rule set: rule lengths and quality measures.
summary(rules)
## set of 1 rules
##
## rule length distribution (lhs + rhs):sizes
## 1
## 1
##
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1 1 1 1 1 1
##
## summary of quality measures:
## support confidence lift
## Min. :0.5748 Min. :0.5748 Min. :1
## 1st Qu.:0.5748 1st Qu.:0.5748 1st Qu.:1
## Median :0.5748 Median :0.5748 Median :1
## Mean :0.5748 Mean :0.5748 Mean :1
## 3rd Qu.:0.5748 3rd Qu.:0.5748 3rd Qu.:1
## Max. :0.5748 Max. :0.5748 Max. :1
##
## mining info:
## data ntransactions support confidence
## tr 88162 0.5 0.5
# Print each rule with its support, confidence and lift.
inspect(rules)
## lhs rhs support confidence lift
## [1] {} => {39} 0.5747941 0.5747941 1
library ("TraMineR")
##
## TraMineR stable version 2.0-6 (Built: 2017-08-16)
## Website: http://traminer.unige.ch
## Please type 'citation("TraMineR")' for citation information.
# Load the mvad example data and summarise every column: the covariates
# first, then one state column per month.
data(list = "mvad")
summary(object = mvad)
## id weight male catholic Belfast N.Eastern
## Min. : 1.0 Min. :0.1300 no :342 no :368 no :624 no :503
## 1st Qu.:178.8 1st Qu.:0.4500 yes:370 yes:344 yes: 88 yes:209
## Median :356.5 Median :0.6900
## Mean :356.5 Mean :0.9994
## 3rd Qu.:534.2 3rd Qu.:1.0700
## Max. :712.0 Max. :4.4600
## Southern S.Eastern Western Grammar funemp gcse5eq fmpr
## no :497 no :629 no :595 no :583 no :595 no :452 no :537
## yes:215 yes: 83 yes:117 yes:129 yes:117 yes:260 yes:175
##
##
##
##
## livboth Jul.93 Aug.93 Sep.93
## no :261 school :135 school :135 school :179
## yes:451 FE : 97 FE : 98 FE :275
## employment :173 employment :178 employment : 83
## training :122 training :127 training :158
## joblessness:185 joblessness:174 joblessness: 17
## HE : 0 HE : 0 HE : 0
## Oct.93 Nov.93 Dec.93 Jan.94
## school :175 school :174 school :172 school :171
## FE :276 FE :272 FE :271 FE :263
## employment : 88 employment : 95 employment : 98 employment :100
## training :158 training :157 training :156 training :158
## joblessness: 15 joblessness: 14 joblessness: 15 joblessness: 20
## HE : 0 HE : 0 HE : 0 HE : 0
## Feb.94 Mar.94 Apr.94 May.94
## school :172 school :171 school :171 school :170
## FE :259 FE :257 FE :251 FE :247
## employment :100 employment :106 employment :112 employment :117
## training :154 training :154 training :153 training :150
## joblessness: 27 joblessness: 24 joblessness: 25 joblessness: 28
## HE : 0 HE : 0 HE : 0 HE : 0
## Jun.94 Jul.94 Aug.94 Sep.94
## school :165 school :140 school :139 school :143
## FE :232 FE :196 FE :196 FE :221
## employment :130 employment :178 employment :184 employment :167
## training :151 training :142 training :144 training :146
## joblessness: 34 joblessness: 56 joblessness: 49 joblessness: 35
## HE : 0 HE : 0 HE : 0 HE : 0
## Oct.94 Nov.94 Dec.94 Jan.95
## school :144 school :144 school :143 school :144
## FE :222 FE :220 FE :219 FE :218
## employment :172 employment :176 employment :181 employment :182
## training :137 training :137 training :133 training :128
## joblessness: 37 joblessness: 35 joblessness: 36 joblessness: 40
## HE : 0 HE : 0 HE : 0 HE : 0
## Feb.95 Mar.95 Apr.95 May.95
## school :143 school :143 school :142 school :142
## FE :211 FE :210 FE :203 FE :200
## employment :185 employment :190 employment :199 employment :205
## training :127 training :124 training :120 training :118
## joblessness: 46 joblessness: 45 joblessness: 48 joblessness: 47
## HE : 0 HE : 0 HE : 0 HE : 0
## Jun.95 Jul.95 Aug.95 Sep.95
## school :139 school :149 school :149 school : 58
## FE :189 FE :140 FE :138 FE :152
## employment :215 employment :269 employment :273 employment :305
## training :112 training : 93 training : 88 training : 84
## joblessness: 57 joblessness: 58 joblessness: 61 joblessness: 61
## HE : 0 HE : 3 HE : 3 HE : 52
## Oct.95 Nov.95 Dec.95 Jan.96
## school : 30 school : 29 school : 29 school : 27
## FE :137 FE :136 FE :135 FE :132
## employment :294 employment :296 employment :296 employment :301
## training : 81 training : 79 training : 80 training : 81
## joblessness: 57 joblessness: 56 joblessness: 56 joblessness: 57
## HE :113 HE :116 HE :116 HE :114
## Feb.96 Mar.96 Apr.96 May.96
## school : 27 school : 27 school : 27 school : 27
## FE :132 FE :125 FE :125 FE :124
## employment :300 employment :308 employment :313 employment :315
## training : 80 training : 78 training : 78 training : 78
## joblessness: 60 joblessness: 61 joblessness: 56 joblessness: 55
## HE :113 HE :113 HE :113 HE :113
## Jun.96 Jul.96 Aug.96 Sep.96
## school : 27 school : 18 school : 17 school : 8
## FE :122 FE : 83 FE : 83 FE : 82
## employment :324 employment :388 employment :392 employment :387
## training : 74 training : 58 training : 55 training : 51
## joblessness: 53 joblessness: 58 joblessness: 59 joblessness: 59
## HE :112 HE :107 HE :106 HE :125
## Oct.96 Nov.96 Dec.96 Jan.97
## school : 0 school : 0 school : 0 school : 0
## FE : 79 FE : 80 FE : 80 FE : 79
## employment :379 employment :378 employment :380 employment :382
## training : 51 training : 50 training : 49 training : 46
## joblessness: 56 joblessness: 56 joblessness: 56 joblessness: 59
## HE :147 HE :148 HE :147 HE :146
## Feb.97 Mar.97 Apr.97 May.97
## school : 0 school : 0 school : 0 school : 0
## FE : 79 FE : 76 FE : 75 FE : 74
## employment :385 employment :386 employment :392 employment :394
## training : 43 training : 42 training : 40 training : 38
## joblessness: 59 joblessness: 61 joblessness: 60 joblessness: 61
## HE :146 HE :147 HE :145 HE :145
## Jun.97 Jul.97 Aug.97 Sep.97
## school : 0 school : 0 school : 0 school : 0
## FE : 72 FE : 44 FE : 44 FE : 37
## employment :400 employment :429 employment :431 employment :435
## training : 37 training : 26 training : 22 training : 24
## joblessness: 60 joblessness: 78 joblessness: 80 joblessness: 75
## HE :143 HE :135 HE :135 HE :141
## Oct.97 Nov.97 Dec.97 Jan.98
## school : 0 school : 0 school : 0 school : 0
## FE : 29 FE : 29 FE : 29 FE : 27
## employment :434 employment :441 employment :443 employment :443
## training : 23 training : 22 training : 22 training : 21
## joblessness: 73 joblessness: 67 joblessness: 66 joblessness: 70
## HE :153 HE :153 HE :152 HE :151
## Feb.98 Mar.98 Apr.98 May.98
## school : 0 school : 0 school : 0 school : 0
## FE : 26 FE : 26 FE : 26 FE : 25
## employment :444 employment :447 employment :449 employment :450
## training : 17 training : 17 training : 17 training : 16
## joblessness: 74 joblessness: 72 joblessness: 71 joblessness: 72
## HE :151 HE :150 HE :149 HE :149
## Jun.98 Jul.98 Aug.98 Sep.98
## school : 0 school : 0 school : 0 school : 0
## FE : 25 FE : 14 FE : 14 FE : 14
## employment :454 employment :477 employment :482 employment :479
## training : 15 training : 11 training : 11 training : 13
## joblessness: 71 joblessness: 81 joblessness: 80 joblessness: 85
## HE :147 HE :129 HE :125 HE :121
## Oct.98 Nov.98 Dec.98 Jan.99
## school : 0 school : 0 school : 0 school : 0
## FE : 9 FE : 8 FE : 8 FE : 9
## employment :482 employment :484 employment :481 employment :484
## training : 13 training : 12 training : 13 training : 13
## joblessness: 82 joblessness: 83 joblessness: 85 joblessness: 82
## HE :126 HE :125 HE :125 HE :124
## Feb.99 Mar.99 Apr.99 May.99
## school : 0 school : 0 school : 0 school : 0
## FE : 9 FE : 9 FE : 9 FE : 9
## employment :485 employment :483 employment :483 employment :482
## training : 10 training : 9 training : 9 training : 8
## joblessness: 85 joblessness: 88 joblessness: 89 joblessness: 93
## HE :123 HE :123 HE :122 HE :120
## Jun.99
## school : 0
## FE : 9
## employment :484
## training : 8
## joblessness: 93
## HE :118
# Build a state-sequence object from columns 40 to 80 of mvad, which gives
# 41 positions per sequence.
# NOTE(review): going by the column order printed by summary(mvad), the
# monthly state columns start at column 15 (Jul.93) and run to Jun.99, so
# 40:80 appears to cover only Aug.95 to Dec.98 - confirm that dropping the
# earlier and later months is intended.
myseq <- seqdef(mvad, var = 40:80)
## [>] 6 distinct states appear in the data:
## 1 = employment
## 2 = FE
## 3 = HE
## 4 = joblessness
## 5 = school
## 6 = training
## [>] state coding:
## [alphabet] [label] [long label]
## 1 employment employment employment
## 2 FE FE FE
## 3 HE HE HE
## 4 joblessness joblessness joblessness
## 5 school school school
## 6 training training training
## [>] 712 sequences in the data set
## [>] min/max sequence length: 41/41
# Standard TraMineR sequence plots for myseq (per the TraMineR docs: index
# plot, most frequent sequences, state distribution, transversal entropy).
seqiplot(myseq)
seqfplot(myseq)
seqdplot(myseq)
seqHtplot(myseq)
# Compute one turbulence value per sequence and plot their distribution.
myturbulence <- seqST(myseq)
## [>] extracting symbols and durations ...
## [>] computing turbulence for 712 sequence(s) ...
hist(myturbulence)
# Load the famform example data and turn it into a state-sequence object.
# NOTE(review): the variable name `seq` shadows base::seq; it is left
# unchanged here because the statements that follow refer to it.
data(list = "famform")
seq <- seqdef(data = famform)
## [>] found missing values ('NA') in sequence data
## [>] preparing 5 sequences
## [>] coding void elements with '%' and missing values with '*'
## [>] 5 distinct states appear in the data:
## 1 = M
## 2 = MC
## 3 = S
## 4 = SC
## 5 = U
## [>] state coding:
## [alphabet] [label] [long label]
## 1 M M M
## 2 MC MC MC
## 3 S S S
## 4 SC SC SC
## 5 U U U
## [>] 5 sequences in the data set
## [>] min/max sequence length: 2/5
# Print the five famform sequences.
seq
## Sequence
## [1] S-U
## [2] S-U-M
## [3] S-U-M-MC
## [4] S-U-M-MC-SC
## [5] U-M-MC
# Length of the longest common prefix of sequences 3 and 4.
seqLLCP(seq1 = seq[3, ], seq2 = seq[4, ])
## [1] 4
# Length of the longest common subsequence of sequences 1 and 2.
seqLLCS(seq1 = seq[1, ], seq2 = seq[2, ])
## [1] 2
# Build a substitution-cost matrix in which replacing any state by a
# different one costs a constant 2 (the diagonal is 0), then print it.
cost <- seqsubm(seq, method = "CONSTANT", cval = 2)
## [>] creating 5x5 substitution-cost matrix using 2 as constant value
print(cost)
## M-> MC-> S-> SC-> U->
## M-> 0 2 2 2 2
## MC-> 2 0 2 2 2
## S-> 2 2 0 2 2
## SC-> 2 2 2 0 2
## U-> 2 2 2 2 0