R Notebook

This is an R Markdown Notebook. When you execute code within the notebook, the results appear beneath the code.

Try executing this chunk by clicking the Run button within the chunk or by placing your cursor inside it and pressing Ctrl+Shift+Enter.

#### Part 2: Rule Learners -------------------
## Example: Identifying Poisonous Mushrooms ----
## Step 2: Exploring and preparing the data ---- 
mushrooms <- read.csv("mushrooms.csv", stringsAsFactors = TRUE)
# examine the structure of the data frame
str(mushrooms)

'data.frame':   8124 obs. of  23 variables:
 $ type                    : Factor w/ 2 levels "edible","poisonous": 2 1 1 2 1 1 1 1 2 1 ...
 $ cap_shape               : Factor w/ 6 levels "bell","conical",..: 3 3 1 3 3 3 1 1 3 1 ...
 $ cap_surface             : Factor w/ 4 levels "fibrous","grooves",..: 4 4 4 3 4 3 4 3 3 4 ...
 $ cap_color               : Factor w/ 10 levels "brown","buff",..: 1 10 9 9 4 10 9 9 9 10 ...
 $ bruises                 : Factor w/ 2 levels "no","yes": 2 2 2 2 1 2 2 2 2 2 ...
 $ odor                    : Factor w/ 9 levels "almond","anise",..: 8 1 2 8 7 1 1 2 8 1 ...
 $ gill_attachment         : Factor w/ 2 levels "attached","free": 2 2 2 2 2 2 2 2 2 2 ...
 $ gill_spacing            : Factor w/ 2 levels "close","crowded": 1 1 1 1 2 1 1 1 1 1 ...
 $ gill_size               : Factor w/ 2 levels "broad","narrow": 2 1 1 2 1 1 1 1 2 1 ...
 $ gill_color              : Factor w/ 12 levels "black","brown",..: 1 1 2 2 1 2 5 2 8 5 ...
 $ stalk_shape             : Factor w/ 2 levels "enlarging","tapering": 1 1 1 1 2 1 1 1 1 1 ...
 $ stalk_root              : Factor w/ 5 levels "bulbous","club",..: 3 2 2 3 3 2 2 2 3 2 ...
 $ stalk_surface_above_ring: Factor w/ 4 levels "fibrous","scaly",..: 4 4 4 4 4 4 4 4 4 4 ...
 $ stalk_surface_below_ring: Factor w/ 4 levels "fibrous","scaly",..: 4 4 4 4 4 4 4 4 4 4 ...
 $ stalk_color_above_ring  : Factor w/ 9 levels "brown","buff",..: 8 8 8 8 8 8 8 8 8 8 ...
 $ stalk_color_below_ring  : Factor w/ 9 levels "brown","buff",..: 8 8 8 8 8 8 8 8 8 8 ...
 $ veil_type               : Factor w/ 1 level "partial": 1 1 1 1 1 1 1 1 1 1 ...
 $ veil_color              : Factor w/ 4 levels "brown","orange",..: 3 3 3 3 3 3 3 3 3 3 ...
 $ ring_number             : Factor w/ 3 levels "none","one","two": 2 2 2 2 2 2 2 2 2 2 ...
 $ ring_type               : Factor w/ 5 levels "evanescent","flaring",..: 5 5 5 5 1 5 5 5 5 5 ...
 $ spore_print_color       : Factor w/ 9 levels "black","brown",..: 1 2 2 1 2 1 1 2 1 1 ...
 $ population              : Factor w/ 6 levels "abundant","clustered",..: 4 3 3 4 1 3 3 4 5 4 ...
 $ habitat                 : Factor w/ 7 levels "grasses","leaves",..: 5 1 3 5 1 1 3 3 1 3 ...

# drop the veil_type feature
mushrooms$veil_type <- NULL
# examine the class distribution
table(mushrooms$type)


   edible poisonous 
     4208      3916

set.seed(123)
train_sample <- sample(8124, 7000)
str(train_sample)

 int [1:7000] 2337 6404 3322 7171 7637 370 4288 7244 4476 3706 ...

# split the data frames
mushrooms_train <- mushrooms[train_sample, ]
mushrooms_test  <- mushrooms[-train_sample, ]
## Step 3: Training a model on the data ----
library(RWeka)
# train OneR() on the data
mushroom_1R <- OneR(type ~ ., data = mushrooms_train)
## Step 4: Evaluating model performance ----
mushroom_1R

odor:
    almond  -> edible
    anise   -> edible
    creosote    -> poisonous
    fishy   -> poisonous
    foul    -> poisonous
    musty   -> poisonous
    none    -> edible
    pungent -> poisonous
    spicy   -> poisonous
(6895/7000 instances correct)

summary(mushroom_1R)


=== Summary ===

Correctly Classified Instances        6895               98.5    %
Incorrectly Classified Instances       105                1.5    %
Kappa statistic                          0.9699
Mean absolute error                      0.015 
Root mean squared error                  0.1225
Relative absolute error                  3.0039 %
Root relative squared error             24.5108 %
Total Number of Instances             7000     

=== Confusion Matrix ===

    a    b   <-- classified as
 3626    0 |    a = edible
  105 3269 |    b = poisonous

mushroom_pred <- predict(mushroom_1R, mushrooms_test)
# cross tabulation of predicted versus actual classes
library(gmodels)
CrossTable(mushrooms_test$type, mushroom_pred,
           prop.chisq = FALSE, prop.c = FALSE, prop.r = FALSE,
           dnn = c('actual default', 'predicted default'))


 
   Cell Contents
|-------------------------|
|                       N |
|         N / Table Total |
|-------------------------|

 
Total Observations in Table:  1124 

 
               | predicted default 
actual default |    edible | poisonous | Row Total | 
---------------|-----------|-----------|-----------|
        edible |       582 |         0 |       582 | 
               |     0.518 |     0.000 |           | 
---------------|-----------|-----------|-----------|
     poisonous |        15 |       527 |       542 | 
               |     0.013 |     0.469 |           | 
---------------|-----------|-----------|-----------|
  Column Total |       597 |       527 |      1124 | 
---------------|-----------|-----------|-----------|

## Step 5: Improving model performance ----
mushroom_JRip <- JRip(type ~ ., data = mushrooms_train)
mushroom_JRip

JRIP rules:
===========

(odor = foul) => type=poisonous (1860.0/0.0)
(gill_size = narrow) and (gill_color = buff) => type=poisonous (986.0/0.0)
(gill_size = narrow) and (odor = pungent) => type=poisonous (222.0/0.0)
(odor = creosote) => type=poisonous (171.0/0.0)
(spore_print_color = green) => type=poisonous (65.0/0.0)
(stalk_surface_below_ring = scaly) and (stalk_surface_above_ring = silky) => type=poisonous (58.0/0.0)
(habitat = leaves) and (cap_surface = scaly) and (population = clustered) => type=poisonous (10.0/0.0)
(cap_surface = grooves) => type=poisonous (2.0/0.0)
 => type=edible (3626.0/0.0)

Number of Rules : 9

summary(mushroom_JRip)


=== Summary ===

Correctly Classified Instances        7000              100      %
Incorrectly Classified Instances         0                0      %
Kappa statistic                          1     
Mean absolute error                      0     
Root mean squared error                  0     
Relative absolute error                  0      %
Root relative squared error              0      %
Total Number of Instances             7000     

=== Confusion Matrix ===

    a    b   <-- classified as
 3626    0 |    a = edible
    0 3374 |    b = poisonous

mushroom_pred <- predict(mushroom_JRip, mushrooms_test)
# cross tabulation of predicted versus actual classes
library(gmodels)
CrossTable(mushrooms_test$type, mushroom_pred,
           prop.chisq = FALSE, prop.c = FALSE, prop.r = FALSE,
           dnn = c('actual default', 'predicted default'))


 
   Cell Contents
|-------------------------|
|                       N |
|         N / Table Total |
|-------------------------|

 
Total Observations in Table:  1124 

 
               | predicted default 
actual default |    edible | poisonous | Row Total | 
---------------|-----------|-----------|-----------|
        edible |       582 |         0 |       582 | 
               |     0.518 |     0.000 |           | 
---------------|-----------|-----------|-----------|
     poisonous |         0 |       542 |       542 | 
               |     0.000 |     0.482 |           | 
---------------|-----------|-----------|-----------|
  Column Total |       582 |       542 |      1124 | 
---------------|-----------|-----------|-----------|

# Rule Learner Using C5.0 Decision Trees (not in text)
library(C50)
mushroom_c5rules <- C5.0(type ~ odor + gill_size, data = mushrooms_train, rules = TRUE)
mushroom_c5rules


Call:
C5.0.formula(formula = type ~ odor + gill_size, data = mushrooms_train, rules = TRUE)

Rule-Based Model
Number of samples: 7000 
Number of predictors: 2 

Number of Rules: 2 

Non-standard options: attempt to group attributes

summary(mushroom_c5rules)


Call:
C5.0.formula(formula = type ~ odor + gill_size, data = mushrooms_train, rules = TRUE)


C5.0 [Release 2.07 GPL Edition]     Wed May 03 17:27:26 2017
-------------------------------

Class specified by attribute `outcome'

Read 7000 cases (3 attributes) from undefined.data

Rules:

Rule 1: (3731/105, lift 1.9)
    odor in {almond, anise, none}
    ->  class edible  [0.972]

Rule 2: (3269, lift 2.1)
    odor in {creosote, fishy, foul, musty, pungent, spicy}
    ->  class poisonous  [1.000]

Default class: edible


Evaluation on training data (7000 cases):

            Rules     
      ----------------
        No      Errors

         2  105( 1.5%)   <<


       (a)   (b)    <-classified as
      ----  ----
      3626          (a): class edible
       105  3269    (b): class poisonous


    Attribute usage:

    100.00% odor


Time: 0.0 secs

mushroom_pred <- predict(mushroom_c5rules, mushrooms_test)
# cross tabulation of predicted versus actual classes
library(gmodels)
CrossTable(mushrooms_test$type, mushroom_pred,
           prop.chisq = FALSE, prop.c = FALSE, prop.r = FALSE,
           dnn = c('actual default', 'predicted default'))


 
   Cell Contents
|-------------------------|
|                       N |
|         N / Table Total |
|-------------------------|

 
Total Observations in Table:  1124 

 
               | predicted default 
actual default |    edible | poisonous | Row Total | 
---------------|-----------|-----------|-----------|
        edible |       582 |         0 |       582 | 
               |     0.518 |     0.000 |           | 
---------------|-----------|-----------|-----------|
     poisonous |        15 |       527 |       542 | 
               |     0.013 |     0.469 |           | 
---------------|-----------|-----------|-----------|
  Column Total |       597 |       527 |      1124 | 
---------------|-----------|-----------|-----------|

Add a new chunk by clicking the Insert Chunk button on the toolbar or by pressing Ctrl+Alt+I.

When you save the notebook, an HTML file containing the code and output will be saved alongside it (click the Preview button or press Ctrl+Shift+K to preview the HTML file).

LS0tDQp0aXRsZTogIlIgTm90ZWJvb2siDQpvdXRwdXQ6IGh0bWxfbm90ZWJvb2sNCi0tLQ0KDQpUaGlzIGlzIGFuIFtSIE1hcmtkb3duXShodHRwOi8vcm1hcmtkb3duLnJzdHVkaW8uY29tKSBOb3RlYm9vay4gV2hlbiB5b3UgZXhlY3V0ZSBjb2RlIHdpdGhpbiB0aGUgbm90ZWJvb2ssIHRoZSByZXN1bHRzIGFwcGVhciBiZW5lYXRoIHRoZSBjb2RlLiANCg0KVHJ5IGV4ZWN1dGluZyB0aGlzIGNodW5rIGJ5IGNsaWNraW5nIHRoZSAqUnVuKiBidXR0b24gd2l0aGluIHRoZSBjaHVuayBvciBieSBwbGFjaW5nIHlvdXIgY3Vyc29yIGluc2lkZSBpdCBhbmQgcHJlc3NpbmcgKkN0cmwrU2hpZnQrRW50ZXIqLiANCg0KYGBge3J9DQojIyMjIFBhcnQgMjogUnVsZSBMZWFybmVycyAtLS0tLS0tLS0tLS0tLS0tLS0tDQoNCiMjIEV4YW1wbGU6IElkZW50aWZ5aW5nIFBvaXNvbm91cyBNdXNocm9vbXMgLS0tLQ0KIyMgU3RlcCAyOiBFeHBsb3JpbmcgYW5kIHByZXBhcmluZyB0aGUgZGF0YSAtLS0tIA0KbXVzaHJvb21zIDwtIHJlYWQuY3N2KCJtdXNocm9vbXMuY3N2Iiwgc3RyaW5nc0FzRmFjdG9ycyA9IFRSVUUpDQoNCiMgZXhhbWluZSB0aGUgc3RydWN0dXJlIG9mIHRoZSBkYXRhIGZyYW1lDQpzdHIobXVzaHJvb21zKQ0KDQojIGRyb3AgdGhlIHZlaWxfdHlwZSBmZWF0dXJlDQptdXNocm9vbXMkdmVpbF90eXBlIDwtIE5VTEwNCg0KIyBleGFtaW5lIHRoZSBjbGFzcyBkaXN0cmlidXRpb24NCnRhYmxlKG11c2hyb29tcyR0eXBlKQ0KDQoNCnNldC5zZWVkKDEyMykNCnRyYWluX3NhbXBsZSA8LSBzYW1wbGUoODEyNCwgNzAwMCkNCg0Kc3RyKHRyYWluX3NhbXBsZSkNCg0KIyBzcGxpdCB0aGUgZGF0YSBmcmFtZXMNCm11c2hyb29tc190cmFpbiA8LSBtdXNocm9vbXNbdHJhaW5fc2FtcGxlLCBdDQptdXNocm9vbXNfdGVzdCAgPC0gbXVzaHJvb21zWy10cmFpbl9zYW1wbGUsIF0NCg0KDQoNCiMjIFN0ZXAgMzogVHJhaW5pbmcgYSBtb2RlbCBvbiB0aGUgZGF0YSAtLS0tDQpsaWJyYXJ5KFJXZWthKQ0KDQojIHRyYWluIE9uZVIoKSBvbiB0aGUgZGF0YQ0KbXVzaHJvb21fMVIgPC0gT25lUih0eXBlIH4gLiwgZGF0YSA9IG11c2hyb29tc190cmFpbikNCg0KIyMgU3RlcCA0OiBFdmFsdWF0aW5nIG1vZGVsIHBlcmZvcm1hbmNlIC0tLS0NCm11c2hyb29tXzFSDQpzdW1tYXJ5KG11c2hyb29tXzFSKQ0KDQptdXNocm9vbV9wcmVkIDwtIHByZWRpY3QobXVzaHJvb21fMVIsIG11c2hyb29tc190ZXN0KQ0KDQojIGNyb3NzIHRhYnVsYXRpb24gb2YgcHJlZGljdGVkIHZlcnN1cyBhY3R1YWwgY2xhc3Nlcw0KbGlicmFyeShnbW9kZWxzKQ0KQ3Jvc3NUYWJsZShtdXNocm9vbXNfdGVzdCR0eXBlLCBtdXNocm9vbV9wcmVkLA0KICAgICAgICAgICBwcm9wLmNoaXNxID0gRkFMU0UsIHByb3AuYyA9IEZBTFNFLCBwcm9wLnIgPSBGQUxTRSwNCiAgICAgICAgICAgZG5uID0gYygnYWN0dWFsIGRlZmF1bHQnLCAncHJlZGljdGVkIGRlZmF1bHQnKSkNCg0KIyMgU3RlcCA1OiBJbXByb3ZpbmcgbW9kZWwgcGVyZm9ybWFuY2UgLS0tLQ0KbXVzaHJvb21fSlJpcCA8LSBKUmlwKHR5cGUgfiAuLCBkYXRhID0gbXVzaHJvb21zX3RyYWluKQ0KbXVzaHJvb21fSlJpcA0Kc3VtbWFyeShtdXNocm9vbV9KUmlwKQ0KDQptdXNocm9vbV9wcmVkIDwtIHByZWRpY3QobXVzaHJvb21fSlJpcCwgbXVzaHJvb21zX3Rlc3QpDQoNCiMgY3Jvc3MgdGFidWxhdGlvbiBvZiBwcmVkaWN0ZWQgdmVyc3VzIGFjdHVhbCBjbGFzc2VzDQpsaWJyYXJ5KGdtb2RlbHMpDQpDcm9zc1RhYmxlKG11c2hyb29tc190ZXN0JHR5cGUsIG11c2hyb29tX3ByZWQsDQogICAgICAgICAgIHByb3AuY2hpc3EgPSBGQUxTRSwgcHJvcC5jID0gRkFMU0UsIHByb3AuciA9IEZBTFNFLA0KICAgICAgICAgICBkbm4gPSBjKCdhY3R1YWwgZGVmYXVsdCcsICdwcmVkaWN0ZWQgZGVmYXVsdCcpKQ0KDQojIFJ1bGUgTGVhcm5lciBVc2luZyBDNS4wIERlY2lzaW9uIFRyZWVzIChub3QgaW4gdGV4dCkNCmxpYnJhcnkoQzUwKQ0KbXVzaHJvb21fYzVydWxlcyA8LSBDNS4wKHR5cGUgfiBvZG9yICsgZ2lsbF9zaXplLCBkYXRhID0gbXVzaHJvb21zX3RyYWluLCBydWxlcyA9IFRSVUUpDQptdXNocm9vbV9jNXJ1bGVzDQpzdW1tYXJ5KG11c2hyb29tX2M1cnVsZXMpDQoNCm11c2hyb29tX3ByZWQgPC0gcHJlZGljdChtdXNocm9vbV9jNXJ1bGVzLCBtdXNocm9vbXNfdGVzdCkNCg0KIyBjcm9zcyB0YWJ1bGF0aW9uIG9mIHByZWRpY3RlZCB2ZXJzdXMgYWN0dWFsIGNsYXNzZXMNCmxpYnJhcnkoZ21vZGVscykNCkNyb3NzVGFibGUobXVzaHJvb21zX3Rlc3QkdHlwZSwgbXVzaHJvb21fcHJlZCwNCiAgICAgICAgICAgcHJvcC5jaGlzcSA9IEZBTFNFLCBwcm9wLmMgPSBGQUxTRSwgcHJvcC5yID0gRkFMU0UsDQogICAgICAgICAgIGRubiA9IGMoJ2FjdHVhbCBkZWZhdWx0JywgJ3ByZWRpY3RlZCBkZWZhdWx0JykpDQoNCmBgYA0KDQpBZGQgYSBuZXcgY2h1bmsgYnkgY2xpY2tpbmcgdGhlICpJbnNlcnQgQ2h1bmsqIGJ1dHRvbiBvbiB0aGUgdG9vbGJhciBvciBieSBwcmVzc2luZyAqQ3RybCtBbHQrSSouDQoNCldoZW4geW91IHNhdmUgdGhlIG5vdGVib29rLCBhbiBIVE1MIGZpbGUgY29udGFpbmluZyB0aGUgY29kZSBhbmQgb3V0cHV0IHdpbGwgYmUgc2F2ZWQgYWxvbmdzaWRlIGl0IChjbGljayB0aGUgKlByZXZpZXcqIGJ1dHRvbiBvciBwcmVzcyAqQ3RybCtTaGlmdCtLKiB0byBwcmV2aWV3IHRoZSBIVE1MIGZpbGUpLg0K