library(openxlsx)
library(tm)
library(car)
library(foreign)
library(readr)
library(dplyr)
library(RWeka)
library(RODBC)
library(class)
library(gmodels)
This project explores a basic application of “rule learner” classification. The data are used for practice only and come from the mushroom dataset that accompanies the text “Machine Learning with R”.
Methods: the 1R classifier and the RIPPER algorithm.
# load the mushroom workbook into a data frame
# NOTE(review): absolute, machine-specific path — the script only runs where this file exists
mushroom <- read.xlsx(
  xlsxFile = "C:\\Users\\Jaire\\OneDrive\\Desktop\\Exploratory Research\\ML\\mushroom.xlsx"
)
# inspect column names and types of the loaded data
str(object = mushroom)
## 'data.frame': 8124 obs. of 23 variables:
## $ type : chr "p" "e" "e" "p" ...
## $ cap_shape : chr "x" "x" "b" "x" ...
## $ cap_surface : chr "s" "s" "s" "y" ...
## $ cap_color : chr "n" "y" "w" "w" ...
## $ bruises : chr "t" "t" "t" "t" ...
## $ odor : chr "p" "a" "l" "p" ...
## $ gill_attachment : chr "f" "f" "f" "f" ...
## $ gill_spacing : chr "c" "c" "c" "c" ...
## $ gill_size : chr "n" "b" "b" "n" ...
## $ gill_color : chr "k" "k" "n" "n" ...
## $ stalk_shape : chr "e" "e" "e" "e" ...
## $ stalk_root : chr "e" "c" "c" "e" ...
## $ stalk_surface_above_ring: chr "s" "s" "s" "s" ...
## $ stalk_surface_below_ring: chr "s" "s" "s" "s" ...
## $ stalk_color_above_ring : chr "w" "w" "w" "w" ...
## $ stalk_color_below_ring : chr "w" "w" "w" "w" ...
## $ veil_type : chr "p" "p" "p" "p" ...
## $ veil_color : chr "w" "w" "w" "w" ...
## $ ring_number : chr "o" "o" "o" "o" ...
## $ ring_type : chr "p" "p" "p" "p" ...
## $ spore_print_color : chr "k" "n" "n" "k" ...
## $ population : chr "s" "n" "n" "s" ...
## $ habitat : chr "u" "g" "m" "u" ...
# examine some features and remove veil_type (only has 1 level)
# class balance of the target column
table(mushroom$type)
##
## e p
## 4208 3916
# column name corrected from the misspelled `viel_type`, which matched no column,
# so `$` returned NULL and produced an empty table / zero-length summary
# NOTE(review): the veil_type outputs below are the expected results after the fix - re-run to confirm
table(mushroom$veil_type)
##
## p
## 8124
summary(mushroom$type)
## Length Class Mode
## 8124 character character
summary(mushroom$veil_type)
## Length Class Mode
## 8124 character character
# drop the constant column: a feature with a single level cannot help separate the classes
mushroom$veil_type <- NULL
# convert every character column to a factor, then recheck the data frame
# `mushroom[] <-` assigns column by column and keeps the data.frame class;
# a plain `mushroom <- lapply(...)` would replace the data frame with a bare list
mushroom[] <- lapply(mushroom, as.factor)
str(mushroom)
## 'data.frame': 8124 obs. of 22 variables:
## $ type : Factor w/ 2 levels "e","p": 2 1 1 2 1 1 1 1 2 1 ...
## $ cap_shape : Factor w/ 6 levels "b","c","f","k",..: 6 6 1 6 6 6 1 1 6 1 ...
## $ cap_surface : Factor w/ 4 levels "f","g","s","y": 3 3 3 4 3 4 3 4 4 3 ...
## $ cap_color : Factor w/ 10 levels "b","c","e","g",..: 5 10 9 9 4 10 9 9 9 10 ...
## $ bruises : Factor w/ 2 levels "f","t": 2 2 2 2 1 2 2 2 2 2 ...
## $ odor : Factor w/ 9 levels "a","c","f","l",..: 7 1 4 7 6 1 1 4 7 1 ...
## $ gill_attachment : Factor w/ 2 levels "a","f": 2 2 2 2 2 2 2 2 2 2 ...
## $ gill_spacing : Factor w/ 2 levels "c","w": 1 1 1 1 2 1 1 1 1 1 ...
## $ gill_size : Factor w/ 2 levels "b","n": 2 1 1 2 1 1 1 1 2 1 ...
## $ gill_color : Factor w/ 12 levels "b","e","g","h",..: 5 5 6 6 5 6 3 6 8 3 ...
## $ stalk_shape : Factor w/ 2 levels "e","t": 1 1 1 1 2 1 1 1 1 1 ...
## $ stalk_root : Factor w/ 5 levels "?","b","c","e",..: 4 3 3 4 4 3 3 3 4 3 ...
## $ stalk_surface_above_ring: Factor w/ 4 levels "f","k","s","y": 3 3 3 3 3 3 3 3 3 3 ...
## $ stalk_surface_below_ring: Factor w/ 4 levels "f","k","s","y": 3 3 3 3 3 3 3 3 3 3 ...
## $ stalk_color_above_ring : Factor w/ 9 levels "b","c","e","g",..: 8 8 8 8 8 8 8 8 8 8 ...
## $ stalk_color_below_ring : Factor w/ 9 levels "b","c","e","g",..: 8 8 8 8 8 8 8 8 8 8 ...
## $ veil_color : Factor w/ 4 levels "n","o","w","y": 3 3 3 3 3 3 3 3 3 3 ...
## $ ring_number : Factor w/ 3 levels "n","o","t": 2 2 2 2 2 2 2 2 2 2 ...
## $ ring_type : Factor w/ 5 levels "e","f","l","n",..: 5 5 5 5 1 5 5 5 5 5 ...
## $ spore_print_color : Factor w/ 9 levels "b","h","k","n",..: 3 4 4 3 4 3 3 4 3 3 ...
## $ population : Factor w/ 6 levels "a","c","n","s",..: 4 3 3 4 1 3 3 4 5 4 ...
## $ habitat : Factor w/ 7 levels "d","g","l","m",..: 6 2 4 6 2 2 4 4 2 4 ...
# class counts of the target column (e = edible, p = poisonous)
table(mushroom[["type"]])
##
## e p
## 4208 3916
# fit a 1R (single-rule) classifier predicting type from all remaining features
mushroom_1R <- OneR(formula = type ~ ., data = mushroom)
# show the rule that 1R learned (it selects odor) and how many instances it gets right
print(mushroom_1R)
## odor:
## a -> e
## c -> p
## f -> p
## l -> e
## m -> p
## n -> e
## p -> p
## s -> p
## y -> p
## (8004/8124 instances correct)
# frequency of each odor level, the single feature the 1R rule is built on
table(mushroom[["odor"]])
##
## a c f l m n p s y
## 400 192 2160 400 36 3528 256 576 576
# evaluation statistics and confusion matrix for the 1R model
# (computed on the same 8124 instances the model was trained on)
print(summary(mushroom_1R))
##
## === Summary ===
##
## Correctly Classified Instances 8004 98.5229 %
## Incorrectly Classified Instances 120 1.4771 %
## Kappa statistic 0.9704
## Mean absolute error 0.0148
## Root mean squared error 0.1215
## Relative absolute error 2.958 %
## Root relative squared error 24.323 %
## Total Number of Instances 8124
##
## === Confusion Matrix ===
##
## a b <-- classified as
## 4208 0 | a = e
## 120 3796 | b = p
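The 1R learner selected a single rule based on specimen odor and correctly classified 98.5% of the training instances (8004 of 8124). All 120 errors were poisonous mushrooms classified as edible.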
# fit a RIPPER rule learner (JRip) on all features to improve on the 1R result
mushroom_JRip <- JRip(formula = type ~ ., data = mushroom)
# show the ordered rule list that RIPPER learned
print(mushroom_JRip)
## JRIP rules:
## ===========
##
## (odor = f) => type=p (2160.0/0.0)
## (gill_size = n) and (gill_color = b) => type=p (1152.0/0.0)
## (gill_size = n) and (odor = p) => type=p (256.0/0.0)
## (odor = c) => type=p (192.0/0.0)
## (spore_print_color = r) => type=p (72.0/0.0)
## (stalk_surface_below_ring = y) and (stalk_surface_above_ring = k) => type=p (68.0/0.0)
## (habitat = l) and (cap_color = w) => type=p (8.0/0.0)
## (stalk_color_above_ring = y) => type=p (8.0/0.0)
## => type=e (4208.0/0.0)
##
## Number of Rules : 9
# evaluation statistics and confusion matrix for the RIPPER model
# (computed on the same 8124 instances the model was trained on)
print(summary(mushroom_JRip))
##
## === Summary ===
##
## Correctly Classified Instances 8124 100 %
## Incorrectly Classified Instances 0 0 %
## Kappa statistic 1
## Mean absolute error 0
## Root mean squared error 0
## Relative absolute error 0 %
## Root relative squared error 0 %
## Total Number of Instances 8124
##
## === Confusion Matrix ===
##
## a b <-- classified as
## 4208 0 | a = e
## 0 3916 | b = p
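The RIPPER rule learner identified 9 rules from the specimen characteristics (8 rules for poisonous mushrooms plus a default rule for edible ones) and correctly classified all 8124 training instances (100% accuracy on the training data).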