library(openxlsx)
library(tm)
library(car)
library(foreign) 
library(readr)
library(dplyr)
library(RWeka)
library(RODBC)
library(class)
library(gmodels)

# This project explores a basic application of "rule learner" classification.
# The data are used for practice and come from the mushroom data set that
# accompanies the text "Machine Learning with R".

# Methods: the 1R classifier and the RIPPER algorithm.

# import the mushroom data set from the Excel workbook
# NOTE(review): the path is an absolute, machine-specific Windows path; the
# script only runs where this exact file location exists.
mushroom <- read.xlsx(
  xlsxFile = "C:\\Users\\Jaire\\OneDrive\\Desktop\\Exploratory Research\\ML\\mushroom.xlsx"
)
# inspect the structure of the imported data (all columns are read as character)
str(mushroom)
## 'data.frame':    8124 obs. of  23 variables:
##  $ type                    : chr  "p" "e" "e" "p" ...
##  $ cap_shape               : chr  "x" "x" "b" "x" ...
##  $ cap_surface             : chr  "s" "s" "s" "y" ...
##  $ cap_color               : chr  "n" "y" "w" "w" ...
##  $ bruises                 : chr  "t" "t" "t" "t" ...
##  $ odor                    : chr  "p" "a" "l" "p" ...
##  $ gill_attachment         : chr  "f" "f" "f" "f" ...
##  $ gill_spacing            : chr  "c" "c" "c" "c" ...
##  $ gill_size               : chr  "n" "b" "b" "n" ...
##  $ gill_color              : chr  "k" "k" "n" "n" ...
##  $ stalk_shape             : chr  "e" "e" "e" "e" ...
##  $ stalk_root              : chr  "e" "c" "c" "e" ...
##  $ stalk_surface_above_ring: chr  "s" "s" "s" "s" ...
##  $ stalk_surface_below_ring: chr  "s" "s" "s" "s" ...
##  $ stalk_color_above_ring  : chr  "w" "w" "w" "w" ...
##  $ stalk_color_below_ring  : chr  "w" "w" "w" "w" ...
##  $ veil_type               : chr  "p" "p" "p" "p" ...
##  $ veil_color              : chr  "w" "w" "w" "w" ...
##  $ ring_number             : chr  "o" "o" "o" "o" ...
##  $ ring_type               : chr  "p" "p" "p" "p" ...
##  $ spore_print_color       : chr  "k" "n" "n" "k" ...
##  $ population              : chr  "s" "n" "n" "s" ...
##  $ habitat                 : chr  "u" "g" "m" "u" ...
# examine some features and remove veil_type (only has 1 level)
# NOTE: the column is spelled `veil_type`; the earlier `viel_type` spelling
# silently returned NULL, so the checks below never looked at the real column.
# The veil_type outputs shown are the expected results after that correction
# (a single level "p" for all 8124 rows) -- re-run to confirm.
table(mushroom$type)
## 
##    e    p 
## 4208 3916
table(mushroom$veil_type)
## 
##    p 
## 8124
summary(mushroom$type)
##    Length     Class      Mode 
##      8124 character character
summary(mushroom$veil_type)
##    Length     Class      Mode 
##      8124 character character
# a feature with a single level carries no information for classification
mushroom$veil_type <- NULL
# recheck data frame and convert character columns to factors
# `mushroom[] <-` replaces the columns in place so the object stays a
# data.frame; assigning the bare lapply() result would turn it into a list.
mushroom[] <- lapply(mushroom, as.factor)
str(mushroom)
## 'data.frame':    8124 obs. of  22 variables:
##  $ type                    : Factor w/ 2 levels "e","p": 2 1 1 2 1 1 1 1 2 1 ...
##  $ cap_shape               : Factor w/ 6 levels "b","c","f","k",..: 6 6 1 6 6 6 1 1 6 1 ...
##  $ cap_surface             : Factor w/ 4 levels "f","g","s","y": 3 3 3 4 3 4 3 4 4 3 ...
##  $ cap_color               : Factor w/ 10 levels "b","c","e","g",..: 5 10 9 9 4 10 9 9 9 10 ...
##  $ bruises                 : Factor w/ 2 levels "f","t": 2 2 2 2 1 2 2 2 2 2 ...
##  $ odor                    : Factor w/ 9 levels "a","c","f","l",..: 7 1 4 7 6 1 1 4 7 1 ...
##  $ gill_attachment         : Factor w/ 2 levels "a","f": 2 2 2 2 2 2 2 2 2 2 ...
##  $ gill_spacing            : Factor w/ 2 levels "c","w": 1 1 1 1 2 1 1 1 1 1 ...
##  $ gill_size               : Factor w/ 2 levels "b","n": 2 1 1 2 1 1 1 1 2 1 ...
##  $ gill_color              : Factor w/ 12 levels "b","e","g","h",..: 5 5 6 6 5 6 3 6 8 3 ...
##  $ stalk_shape             : Factor w/ 2 levels "e","t": 1 1 1 1 2 1 1 1 1 1 ...
##  $ stalk_root              : Factor w/ 5 levels "?","b","c","e",..: 4 3 3 4 4 3 3 3 4 3 ...
##  $ stalk_surface_above_ring: Factor w/ 4 levels "f","k","s","y": 3 3 3 3 3 3 3 3 3 3 ...
##  $ stalk_surface_below_ring: Factor w/ 4 levels "f","k","s","y": 3 3 3 3 3 3 3 3 3 3 ...
##  $ stalk_color_above_ring  : Factor w/ 9 levels "b","c","e","g",..: 8 8 8 8 8 8 8 8 8 8 ...
##  $ stalk_color_below_ring  : Factor w/ 9 levels "b","c","e","g",..: 8 8 8 8 8 8 8 8 8 8 ...
##  $ veil_color              : Factor w/ 4 levels "n","o","w","y": 3 3 3 3 3 3 3 3 3 3 ...
##  $ ring_number             : Factor w/ 3 levels "n","o","t": 2 2 2 2 2 2 2 2 2 2 ...
##  $ ring_type               : Factor w/ 5 levels "e","f","l","n",..: 5 5 5 5 1 5 5 5 5 5 ...
##  $ spore_print_color       : Factor w/ 9 levels "b","h","k","n",..: 3 4 4 3 4 3 3 4 3 3 ...
##  $ population              : Factor w/ 6 levels "a","c","n","s",..: 4 3 3 4 1 3 3 4 5 4 ...
##  $ habitat                 : Factor w/ 7 levels "d","g","l","m",..: 6 2 4 6 2 2 4 4 2 4 ...
# class distribution of the target (e = edible, p = poisonous)
table(mushroom[["type"]])
## 
##    e    p 
## 4208 3916
# fit the 1R rule learner, offering every remaining feature as a predictor
mushroom_1R <- OneR(formula = type ~ ., data = mushroom)
# show the single rule chosen by 1R (it is based on odor)
print(mushroom_1R)
## odor:
##  a   -> e
##  c   -> p
##  f   -> p
##  l   -> e
##  m   -> p
##  n   -> e
##  p   -> p
##  s   -> p
##  y   -> p
## (8004/8124 instances correct)
# frequency of each odor value, for reference against the rule above
table(mushroom[["odor"]])
## 
##    a    c    f    l    m    n    p    s    y 
##  400  192 2160  400   36 3528  256  576  576
# performance summary, evaluated on the same data the model was trained on
summary(mushroom_1R)
## 
## === Summary ===
## 
## Correctly Classified Instances        8004               98.5229 %
## Incorrectly Classified Instances       120                1.4771 %
## Kappa statistic                          0.9704
## Mean absolute error                      0.0148
## Root mean squared error                  0.1215
## Relative absolute error                  2.958  %
## Root relative squared error             24.323  %
## Total Number of Instances             8124     
## 
## === Confusion Matrix ===
## 
##     a    b   <-- classified as
##  4208    0 |    a = e
##   120 3796 |    b = p

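# The 1R rule learner selected a single rule based on specimen odor and
# classified 98.5% of the training instances correctly (8004 of 8124). All 120
# errors were poisonous mushrooms classified as edible.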

# fit the RIPPER rule learner (JRip) on all features to improve on 1R
mushroom_JRip <- JRip(formula = type ~ ., data = mushroom)
# show the rule list learned by RIPPER
print(mushroom_JRip)
## JRIP rules:
## ===========
## 
## (odor = f) => type=p (2160.0/0.0)
## (gill_size = n) and (gill_color = b) => type=p (1152.0/0.0)
## (gill_size = n) and (odor = p) => type=p (256.0/0.0)
## (odor = c) => type=p (192.0/0.0)
## (spore_print_color = r) => type=p (72.0/0.0)
## (stalk_surface_below_ring = y) and (stalk_surface_above_ring = k) => type=p (68.0/0.0)
## (habitat = l) and (cap_color = w) => type=p (8.0/0.0)
## (stalk_color_above_ring = y) => type=p (8.0/0.0)
##  => type=e (4208.0/0.0)
## 
## Number of Rules : 9
# performance summary, evaluated on the same data the model was trained on
summary(mushroom_JRip)
## 
## === Summary ===
## 
## Correctly Classified Instances        8124              100      %
## Incorrectly Classified Instances         0                0      %
## Kappa statistic                          1     
## Mean absolute error                      0     
## Root mean squared error                  0     
## Relative absolute error                  0      %
## Root relative squared error              0      %
## Total Number of Instances             8124     
## 
## === Confusion Matrix ===
## 
##     a    b   <-- classified as
##  4208    0 |    a = e
##     0 3916 |    b = p

# The RIPPER rule learner identified 9 rules from the specimen characteristics
# (8 rules that flag poisonous mushrooms plus a default rule for edible ones)
# and classified 100% of the training instances correctly.