load the data

# NOTE(review): machine-specific absolute path. The relative read.csv() paths
# used in this file are resolved against this directory.
setwd("C:/Users/Maxwell/Desktop/Learn R")
library(tidyverse)
## Loading tidyverse: ggplot2
## Loading tidyverse: tibble
## Loading tidyverse: tidyr
## Loading tidyverse: readr
## Loading tidyverse: purrr
## Loading tidyverse: dplyr
## Conflicts with tidy packages ----------------------------------------------
## filter(): dplyr, stats
## lag():    dplyr, stats
# Read the credit data set. stringsAsFactors = TRUE is passed explicitly
# because R >= 4.0.0 no longer converts strings to factors by default; the
# factor columns shown by str()/summary() and used by the models rely on it.
credit <- read.csv("credit.csv", stringsAsFactors = TRUE)

# Inspect the type and the first few values of every column.
str(credit)
## 'data.frame':    1000 obs. of  21 variables:
##  $ checking_balance    : Factor w/ 4 levels "< 0 DM","> 200 DM",..: 1 3 4 1 1 4 4 3 4 3 ...
##  $ months_loan_duration: int  6 48 12 42 24 36 24 36 12 30 ...
##  $ credit_history      : Factor w/ 5 levels "critical","delayed",..: 1 5 1 5 2 5 5 5 5 1 ...
##  $ purpose             : Factor w/ 10 levels "business","car (new)",..: 8 8 5 6 2 5 6 3 8 2 ...
##  $ amount              : int  1169 5951 2096 7882 4870 9055 2835 6948 3059 5234 ...
##  $ savings_balance     : Factor w/ 5 levels "< 100 DM","> 1000 DM",..: 5 1 1 1 1 5 4 1 2 1 ...
##  $ employment_length   : Factor w/ 5 levels "> 7 yrs","0 - 1 yrs",..: 1 3 4 4 3 3 1 3 4 5 ...
##  $ installment_rate    : int  4 2 2 2 3 2 3 2 2 4 ...
##  $ personal_status     : Factor w/ 4 levels "divorced male",..: 4 2 4 4 4 4 4 4 1 3 ...
##  $ other_debtors       : Factor w/ 3 levels "co-applicant",..: 3 3 3 2 3 3 3 3 3 3 ...
##  $ residence_history   : int  4 2 3 4 4 4 4 2 4 2 ...
##  $ property            : Factor w/ 4 levels "building society savings",..: 3 3 3 1 4 4 1 2 3 2 ...
##  $ age                 : int  67 22 49 45 53 35 53 35 61 28 ...
##  $ installment_plan    : Factor w/ 3 levels "bank","none",..: 2 2 2 2 2 2 2 2 2 2 ...
##  $ housing             : Factor w/ 3 levels "for free","own",..: 2 2 2 1 1 1 2 3 2 2 ...
##  $ existing_credits    : int  2 1 1 1 2 1 1 1 1 2 ...
##  $ default             : int  1 2 1 1 2 1 1 1 1 2 ...
##  $ dependents          : int  1 1 2 2 2 2 1 1 1 1 ...
##  $ telephone           : Factor w/ 2 levels "none","yes": 2 1 1 1 1 2 1 2 1 1 ...
##  $ foreign_worker      : Factor w/ 2 levels "no","yes": 2 2 2 2 2 2 2 2 2 2 ...
##  $ job                 : Factor w/ 4 levels "mangement self-employed",..: 2 2 4 2 2 4 2 1 4 1 ...
# Count the rows in each checking-balance category.
table(credit$checking_balance)
## 
##     < 0 DM   > 200 DM 1 - 200 DM    unknown 
##        274         63        269        394
# Count the rows in each savings-balance category.
table(credit$savings_balance)
## 
##      < 100 DM     > 1000 DM  101 - 500 DM 501 - 1000 DM       unknown 
##           603            48           103            63           183

convert the default variable into a factor (required by the C5.0 decision tree)

# Recode the numeric default flag (1 -> "no", otherwise "yes") and store it
# as a factor, then review the distribution of every column.
credit$default <- factor(ifelse(credit$default == 1, "no", "yes"))
summary(credit)
##    checking_balance months_loan_duration                credit_history
##  < 0 DM    :274     Min.   : 4.0         critical              :293   
##  > 200 DM  : 63     1st Qu.:12.0         delayed               : 88   
##  1 - 200 DM:269     Median :18.0         fully repaid          : 40   
##  unknown   :394     Mean   :20.9         fully repaid this bank: 49   
##                     3rd Qu.:24.0         repaid                :530   
##                     Max.   :72.0                                      
##                                                                       
##        purpose        amount           savings_balance  employment_length
##  radio/tv  :280   Min.   :  250   < 100 DM     :603    > 7 yrs   :253    
##  car (new) :234   1st Qu.: 1366   > 1000 DM    : 48    0 - 1 yrs :172    
##  furniture :181   Median : 2320   101 - 500 DM :103    1 - 4 yrs :339    
##  car (used):103   Mean   : 3271   501 - 1000 DM: 63    4 - 7 yrs :174    
##  business  : 97   3rd Qu.: 3972   unknown      :183    unemployed: 62    
##  education : 50   Max.   :18424                                          
##  (Other)   : 55                                                          
##  installment_rate      personal_status      other_debtors
##  Min.   :1.000    divorced male: 50    co-applicant: 41  
##  1st Qu.:2.000    female       :310    guarantor   : 52  
##  Median :3.000    married male : 92    none        :907  
##  Mean   :2.973    single male  :548                      
##  3rd Qu.:4.000                                           
##  Max.   :4.000                                           
##                                                          
##  residence_history                     property        age       
##  Min.   :1.000     building society savings:232   Min.   :19.00  
##  1st Qu.:2.000     other                   :332   1st Qu.:27.00  
##  Median :3.000     real estate             :282   Median :33.00  
##  Mean   :2.845     unknown/none            :154   Mean   :35.55  
##  3rd Qu.:4.000                                    3rd Qu.:42.00  
##  Max.   :4.000                                    Max.   :75.00  
##                                                                  
##  installment_plan     housing    existing_credits default  
##  bank  :139       for free:108   Min.   :1.000    no :700  
##  none  :814       own     :713   1st Qu.:1.000    yes:300  
##  stores: 47       rent    :179   Median :1.000             
##                                  Mean   :1.407             
##                                  3rd Qu.:2.000             
##                                  Max.   :4.000             
##                                                            
##    dependents    telephone  foreign_worker                      job     
##  Min.   :1.000   none:596   no : 37        mangement self-employed:148  
##  1st Qu.:1.000   yes :404   yes:963        skilled employee       :630  
##  Median :1.000                             unemployed non-resident: 22  
##  Mean   :1.155                             unskilled resident     :200  
##  3rd Qu.:1.000                                                          
##  Max.   :2.000                                                          
## 

use caret package for splitting the data

library(caret)
## Loading required package: lattice
## 
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
## 
##     lift
# Hold out roughly 10% of the rows for testing.
# The split is stratified on the outcome (default), not on a predictor, so the
# training and test sets keep a similar share of each class. The fixed seed
# makes the random partition, and every model fitted on it, reproducible.
set.seed(123)
index <- createDataPartition(credit$default, p = 0.9, list = FALSE)

train_credit <- credit[index, ]

test_credit <- credit[-index, ]

load the C50 decision tree package

library(C50)
library(tidyverse)

# Fit a single C5.0 classification tree predicting default from all other
# columns of the training set, then print its short description.
credit_dtree <- C5.0(default ~ ., data = train_credit)
credit_dtree
## 
## Call:
## C5.0.formula(formula = default ~ ., data = train_credit)
## 
## Classification Tree
## Number of samples: 902 
## Number of predictors: 20 
## 
## Tree size: 48 
## 
## Non-standard options: attempt to group attributes
# Print the full tree, its evaluation on the training data and attribute usage.
summary(credit_dtree)
## 
## Call:
## C5.0.formula(formula = default ~ ., data = train_credit)
## 
## 
## C5.0 [Release 2.07 GPL Edition]      Fri Feb 17 20:58:16 2017
## -------------------------------
## 
## Class specified by attribute `outcome'
## 
## Read 902 cases (21 attributes) from undefined.data
## 
## Decision tree:
## 
## checking_balance in {> 200 DM,unknown}: no (412/57)
## checking_balance in {< 0 DM,1 - 200 DM}:
## :...credit_history in {fully repaid,fully repaid this bank}:
##     :...dependents > 1: yes (11)
##     :   dependents <= 1:
##     :   :...savings_balance = > 1000 DM: yes (0)
##     :       savings_balance in {501 - 1000 DM,unknown}: no (7)
##     :       savings_balance in {< 100 DM,101 - 500 DM}:
##     :       :...other_debtors = co-applicant: no (1)
##     :           other_debtors = none: yes (40/9)
##     :           other_debtors = guarantor:
##     :           :...employment_length in {> 7 yrs,0 - 1 yrs,1 - 4 yrs,
##     :               :                     unemployed}: no (2)
##     :               employment_length = 4 - 7 yrs: yes (2)
##     credit_history in {critical,delayed,repaid}:
##     :...other_debtors = co-applicant:
##         :...foreign_worker = no: no (2)
##         :   foreign_worker = yes: yes (17/5)
##         other_debtors = guarantor:
##         :...purpose in {business,car (used),domestic appliances,education,
##         :   :           furniture,others,repairs,retraining}: no (10)
##         :   purpose = car (new):
##         :   :...installment_plan in {bank,stores}: yes (4)
##         :   :   installment_plan = none: no (1)
##         :   purpose = radio/tv:
##         :   :...months_loan_duration <= 33: no (20)
##         :       months_loan_duration > 33: yes (3)
##         other_debtors = none:
##         :...savings_balance = > 1000 DM: no (16/1)
##             savings_balance = 501 - 1000 DM:
##             :...months_loan_duration <= 21: no (10/1)
##             :   months_loan_duration > 21: yes (2)
##             savings_balance = 101 - 500 DM:
##             :...personal_status = divorced male:
##             :   :...credit_history in {critical,delayed}: no (2)
##             :   :   credit_history = repaid: yes (2)
##             :   personal_status = female:
##             :   :...credit_history = delayed: no (1)
##             :   :   credit_history in {critical,repaid}: yes (6/1)
##             :   personal_status = married male:
##             :   :...credit_history = critical: no (1)
##             :   :   credit_history in {delayed,repaid}: yes (3)
##             :   personal_status = single male:
##             :   :...existing_credits <= 1: no (15/1)
##             :       existing_credits > 1:
##             :       :...residence_history <= 3: no (3)
##             :           residence_history > 3: yes (2)
##             savings_balance = unknown:
##             :...existing_credits > 1: no (12)
##             :   existing_credits <= 1:
##             :   :...credit_history in {critical,delayed}: no (5)
##             :       credit_history = repaid:
##             :       :...checking_balance = < 0 DM:
##             :           :...telephone = none: yes (9/1)
##             :           :   telephone = yes:
##             :           :   :...months_loan_duration <= 30: no (7/1)
##             :           :       months_loan_duration > 30: yes (2)
##             :           checking_balance = 1 - 200 DM:
##             :           :...property = building society savings: yes (4/1)
##             :               property in {other,real estate}: no (14/2)
##             :               property = unknown/none:
##             :               :...amount <= 5848: no (3)
##             :                   amount > 5848: yes (2)
##             savings_balance = < 100 DM:
##             :...months_loan_duration > 27:
##                 :...residence_history > 1: yes (49/11)
##                 :   residence_history <= 1:
##                 :   :...checking_balance = 1 - 200 DM: no (3)
##                 :       checking_balance = < 0 DM:
##                 :       :...months_loan_duration <= 40: yes (3)
##                 :           months_loan_duration > 40: no (2)
##                 months_loan_duration <= 27:
##                 :...credit_history = critical: no (56/12)
##                     credit_history in {delayed,repaid}:
##                     :...installment_rate > 2:
##                         :...months_loan_duration <= 11: no (23/7)
##                         :   months_loan_duration > 11: yes (64/21)
##                         installment_rate <= 2:
##                         :...foreign_worker = no: yes (2)
##                             foreign_worker = yes:
##                             :...amount <= 1372:
##                                 :...telephone = none: yes (7)
##                                 :   telephone = yes: no (2)
##                                 amount > 1372:
##                                 :...amount <= 9629: no (36/5)
##                                     amount > 9629: yes (2)
## 
## 
## Evaluation on training data (902 cases):
## 
##      Decision Tree   
##    ----------------  
##    Size      Errors  
## 
##      47  136(15.1%)   <<
## 
## 
##     (a)   (b)    <-classified as
##    ----  ----
##     579    49    (a): class no
##      87   187    (b): class yes
## 
## 
##  Attribute usage:
## 
##  100.00% checking_balance
##   54.32% credit_history
##   52.33% other_debtors
##   46.78% savings_balance
##   32.48% months_loan_duration
##   15.08% installment_rate
##    8.65% existing_credits
##    7.54% foreign_worker
##    6.98% dependents
##    6.87% residence_history
##    5.76% amount
##    4.21% purpose
##    3.88% personal_status
##    2.99% telephone
##    2.55% property
##    0.55% installment_plan
##    0.44% employment_length
## 
## 
## Time: 0.0 secs

predict using the model

# Classify the held-out rows with the single-tree model.
credit_dtree_pred <- predict(credit_dtree, newdata = test_credit)

# Cross-tabulate predicted labels (rows) against true labels (columns).
library(gmodels)
CrossTable(credit_dtree_pred, test_credit$default)
## 
##  
##    Cell Contents
## |-------------------------|
## |                       N |
## | Chi-square contribution |
## |           N / Row Total |
## |           N / Col Total |
## |         N / Table Total |
## |-------------------------|
## 
##  
## Total Observations in Table:  98 
## 
##  
##                   | test_credit$default 
## credit_dtree_pred |        no |       yes | Row Total | 
## ------------------|-----------|-----------|-----------|
##                no |        59 |        13 |        72 | 
##                   |     0.704 |     1.949 |           | 
##                   |     0.819 |     0.181 |     0.735 | 
##                   |     0.819 |     0.500 |           | 
##                   |     0.602 |     0.133 |           | 
## ------------------|-----------|-----------|-----------|
##               yes |        13 |        13 |        26 | 
##                   |     1.949 |     5.398 |           | 
##                   |     0.500 |     0.500 |     0.265 | 
##                   |     0.181 |     0.500 |           | 
##                   |     0.133 |     0.133 |           | 
## ------------------|-----------|-----------|-----------|
##      Column Total |        72 |        26 |        98 | 
##                   |     0.735 |     0.265 |           | 
## ------------------|-----------|-----------|-----------|
## 
## 

improved decision tree

# Refit C5.0 with boosting; trials = 10 requests ten boosting iterations.
credit_bdtree <- C5.0(default ~ ., data = train_credit, trials = 10)
credit_bdtree
## 
## Call:
## C5.0.formula(formula = default ~ ., data = train_credit, trials = 10)
## 
## Classification Tree
## Number of samples: 902 
## Number of predictors: 20 
## 
## Number of boosting iterations: 10 
## Average tree size: 41.2 
## 
## Non-standard options: attempt to group attributes
# Classify the held-out rows with the boosted model.
credit_bdtree_pred <- predict(credit_bdtree, newdata = test_credit)

library(gmodels)

# Cross-tabulate predicted labels (rows) against true labels (columns).
CrossTable(credit_bdtree_pred, test_credit$default)
## 
##  
##    Cell Contents
## |-------------------------|
## |                       N |
## | Chi-square contribution |
## |           N / Row Total |
## |           N / Col Total |
## |         N / Table Total |
## |-------------------------|
## 
##  
## Total Observations in Table:  98 
## 
##  
##                    | test_credit$default 
## credit_bdtree_pred |        no |       yes | Row Total | 
## -------------------|-----------|-----------|-----------|
##                 no |        65 |        12 |        77 | 
##                    |     1.256 |     3.478 |           | 
##                    |     0.844 |     0.156 |     0.786 | 
##                    |     0.903 |     0.462 |           | 
##                    |     0.663 |     0.122 |           | 
## -------------------|-----------|-----------|-----------|
##                yes |         7 |        14 |        21 | 
##                    |     4.604 |    12.751 |           | 
##                    |     0.333 |     0.667 |     0.214 | 
##                    |     0.097 |     0.538 |           | 
##                    |     0.071 |     0.143 |           | 
## -------------------|-----------|-----------|-----------|
##       Column Total |        72 |        26 |        98 | 
##                    |     0.735 |     0.265 |           | 
## -------------------|-----------|-----------|-----------|
## 
## 

add a cost matrix, matrix(c(0,1,25,0),nrow=2), so that the two kinds of misclassification are penalised differently

# 2 x 2 misclassification-cost matrix labelled by class.
# The values are filled column-wise, so costM["yes", "no"] is 1 and
# costM["no", "yes"] is 25; the diagonal (correct predictions) costs 0.
# NOTE(review): C5.0 is understood to read rows as the predicted class and
# columns as the actual class -- confirm against the C50 documentation.
costM <- matrix(
  c(0, 1, 25, 0),
  nrow = 2,
  dimnames = list(c("no", "yes"), c("no", "yes"))
)

costM
##     no yes
## no   0  25
## yes  1   0
# Boosted C5.0 model (ten trials) that also takes the misclassification
# costs in costM into account.
credit_cbdtree <- C5.0(
  default ~ .,
  data = train_credit,
  trials = 10,
  costs = costM
)

# Classify the held-out rows with the cost-sensitive model.
credit_cbdtree_pred <- predict(credit_cbdtree, newdata = test_credit)

# Cross-tabulate predicted labels (rows) against true labels (columns).
library(gmodels)
CrossTable(credit_cbdtree_pred, test_credit$default)
## 
##  
##    Cell Contents
## |-------------------------|
## |                       N |
## | Chi-square contribution |
## |           N / Row Total |
## |           N / Col Total |
## |         N / Table Total |
## |-------------------------|
## 
##  
## Total Observations in Table:  98 
## 
##  
##                     | test_credit$default 
## credit_cbdtree_pred |        no |       yes | Row Total | 
## --------------------|-----------|-----------|-----------|
##                  no |        64 |        13 |        77 | 
##                     |     0.975 |     2.701 |           | 
##                     |     0.831 |     0.169 |     0.786 | 
##                     |     0.889 |     0.500 |           | 
##                     |     0.653 |     0.133 |           | 
## --------------------|-----------|-----------|-----------|
##                 yes |         8 |        13 |        21 | 
##                     |     3.577 |     9.905 |           | 
##                     |     0.381 |     0.619 |     0.214 | 
##                     |     0.111 |     0.500 |           | 
##                     |     0.082 |     0.133 |           | 
## --------------------|-----------|-----------|-----------|
##        Column Total |        72 |        26 |        98 | 
##                     |     0.735 |     0.265 |           | 
## --------------------|-----------|-----------|-----------|
## 
## 

continue for the mushroom data

# Show the working directory that the relative file path below resolves against.
getwd()
## [1] "C:/Users/Maxwell/Desktop/Learn R"
# Read the mushroom data set. stringsAsFactors = TRUE is passed explicitly
# because R >= 4.0.0 no longer converts strings to factors by default; every
# column here is categorical and is expected to be a factor further down.
mushroom <- read.csv("mushrooms.csv", stringsAsFactors = TRUE)

# Inspect the type and the first few values of every column.
str(mushroom)
## 'data.frame':    8124 obs. of  23 variables:
##  $ type                    : Factor w/ 2 levels "e","p": 2 1 1 2 1 1 1 1 2 1 ...
##  $ cap_shape               : Factor w/ 6 levels "b","c","f","k",..: 6 6 1 6 6 6 1 1 6 1 ...
##  $ cap_surface             : Factor w/ 4 levels "f","g","s","y": 3 3 3 4 3 4 3 4 4 3 ...
##  $ cap_color               : Factor w/ 10 levels "b","c","e","g",..: 5 10 9 9 4 10 9 9 9 10 ...
##  $ bruises                 : Factor w/ 2 levels "f","t": 2 2 2 2 1 2 2 2 2 2 ...
##  $ odor                    : Factor w/ 9 levels "a","c","f","l",..: 7 1 4 7 6 1 1 4 7 1 ...
##  $ gill_attachment         : Factor w/ 2 levels "a","f": 2 2 2 2 2 2 2 2 2 2 ...
##  $ gill_spacing            : Factor w/ 2 levels "c","w": 1 1 1 1 2 1 1 1 1 1 ...
##  $ gill_size               : Factor w/ 2 levels "b","n": 2 1 1 2 1 1 1 1 2 1 ...
##  $ gill_color              : Factor w/ 12 levels "b","e","g","h",..: 5 5 6 6 5 6 3 6 8 3 ...
##  $ stalk_shape             : Factor w/ 2 levels "e","t": 1 1 1 1 2 1 1 1 1 1 ...
##  $ stalk_root              : Factor w/ 5 levels "?","b","c","e",..: 4 3 3 4 4 3 3 3 4 3 ...
##  $ stalk_surface_above_ring: Factor w/ 4 levels "f","k","s","y": 3 3 3 3 3 3 3 3 3 3 ...
##  $ stalk_surface_below_ring: Factor w/ 4 levels "f","k","s","y": 3 3 3 3 3 3 3 3 3 3 ...
##  $ stalk_color_above_ring  : Factor w/ 9 levels "b","c","e","g",..: 8 8 8 8 8 8 8 8 8 8 ...
##  $ stalk_color_below_ring  : Factor w/ 9 levels "b","c","e","g",..: 8 8 8 8 8 8 8 8 8 8 ...
##  $ veil_type               : Factor w/ 1 level "p": 1 1 1 1 1 1 1 1 1 1 ...
##  $ veil_color              : Factor w/ 4 levels "n","o","w","y": 3 3 3 3 3 3 3 3 3 3 ...
##  $ ring_number             : Factor w/ 3 levels "n","o","t": 2 2 2 2 2 2 2 2 2 2 ...
##  $ ring_type               : Factor w/ 5 levels "e","f","l","n",..: 5 5 5 5 1 5 5 5 5 5 ...
##  $ spore_print_color       : Factor w/ 9 levels "b","h","k","n",..: 3 4 4 3 4 3 3 4 3 3 ...
##  $ population              : Factor w/ 6 levels "a","c","n","s",..: 4 3 3 4 1 3 3 4 5 4 ...
##  $ habitat                 : Factor w/ 7 levels "d","g","l","m",..: 6 2 4 6 2 2 4 4 2 4 ...
# Level counts for every column; used to spot uninformative variables.
summary(mushroom)
##  type     cap_shape cap_surface   cap_color    bruises       odor     
##  e:4208   b: 452    f:2320      n      :2284   f:4748   n      :3528  
##  p:3916   c:   4    g:   4      g      :1840   t:3376   f      :2160  
##           f:3152    s:2556      e      :1500            s      : 576  
##           k: 828    y:3244      y      :1072            y      : 576  
##           s:  32                w      :1040            a      : 400  
##           x:3656                b      : 168            l      : 400  
##                                 (Other): 220            (Other): 484  
##  gill_attachment gill_spacing gill_size   gill_color   stalk_shape
##  a: 210          c:6812       b:5612    b      :1728   e:3516     
##  f:7914          w:1312       n:2512    p      :1492   t:4608     
##                                         w      :1202              
##                                         n      :1048              
##                                         g      : 752              
##                                         h      : 732              
##                                         (Other):1170              
##  stalk_root stalk_surface_above_ring stalk_surface_below_ring
##  ?:2480     f: 552                   f: 600                  
##  b:3776     k:2372                   k:2304                  
##  c: 556     s:5176                   s:4936                  
##  e:1120     y:  24                   y: 284                  
##  r: 192                                                      
##                                                              
##                                                              
##  stalk_color_above_ring stalk_color_below_ring veil_type veil_color
##  w      :4464           w      :4384           p:8124    n:  96    
##  p      :1872           p      :1872                     o:  96    
##  g      : 576           g      : 576                     w:7924    
##  n      : 448           n      : 512                     y:   8    
##  b      : 432           b      : 432                               
##  o      : 192           o      : 192                               
##  (Other): 140           (Other): 156                               
##  ring_number ring_type spore_print_color population habitat 
##  n:  36      e:2776    w      :2388      a: 384     d:3148  
##  o:7488      f:  48    n      :1968      c: 340     g:2148  
##  t: 600      l:1296    k      :1872      n: 400     l: 832  
##              n:  36    h      :1632      s:1248     m: 292  
##              p:3968    r      :  72      v:4040     p:1144  
##                        b      :  48      y:1712     u: 368  
##                        (Other): 144                 w: 192

notice the veil_type column: it has only one level, so it carries no information and needs to be removed.

# Remove the veil_type column from the data frame, then re-check the
# remaining columns.
mushroom[["veil_type"]] <- NULL
summary(mushroom)
##  type     cap_shape cap_surface   cap_color    bruises       odor     
##  e:4208   b: 452    f:2320      n      :2284   f:4748   n      :3528  
##  p:3916   c:   4    g:   4      g      :1840   t:3376   f      :2160  
##           f:3152    s:2556      e      :1500            s      : 576  
##           k: 828    y:3244      y      :1072            y      : 576  
##           s:  32                w      :1040            a      : 400  
##           x:3656                b      : 168            l      : 400  
##                                 (Other): 220            (Other): 484  
##  gill_attachment gill_spacing gill_size   gill_color   stalk_shape
##  a: 210          c:6812       b:5612    b      :1728   e:3516     
##  f:7914          w:1312       n:2512    p      :1492   t:4608     
##                                         w      :1202              
##                                         n      :1048              
##                                         g      : 752              
##                                         h      : 732              
##                                         (Other):1170              
##  stalk_root stalk_surface_above_ring stalk_surface_below_ring
##  ?:2480     f: 552                   f: 600                  
##  b:3776     k:2372                   k:2304                  
##  c: 556     s:5176                   s:4936                  
##  e:1120     y:  24                   y: 284                  
##  r: 192                                                      
##                                                              
##                                                              
##  stalk_color_above_ring stalk_color_below_ring veil_color ring_number
##  w      :4464           w      :4384           n:  96     n:  36     
##  p      :1872           p      :1872           o:  96     o:7488     
##  g      : 576           g      : 576           w:7924     t: 600     
##  n      : 448           n      : 512           y:   8                
##  b      : 432           b      : 432                                 
##  o      : 192           o      : 192                                 
##  (Other): 140           (Other): 156                                 
##  ring_type spore_print_color population habitat 
##  e:2776    w      :2388      a: 384     d:3148  
##  f:  48    n      :1968      c: 340     g:2148  
##  l:1296    k      :1872      n: 400     l: 832  
##  n:  36    h      :1632      s:1248     m: 292  
##  p:3968    r      :  72      v:4040     p:1144  
##            b      :  48      y:1712     u: 368  
##            (Other): 144                 w: 192

use RWeka package for rule learning.

library(RWeka)

# Fit a OneR rule learner predicting type from all other columns.
mush_rule <- OneR(type ~ ., data = mushroom)

# Print the rule set the learner produced.
mush_rule
## odor:
##  a   -> e
##  c   -> p
##  f   -> p
##  l   -> e
##  m   -> p
##  n   -> e
##  p   -> p
##  s   -> p
##  y   -> p
## (8004/8124 instances correct)
# Accuracy statistics and confusion matrix for the OneR model.
summary(mush_rule)
## 
## === Summary ===
## 
## Correctly Classified Instances        8004               98.5229 %
## Incorrectly Classified Instances       120                1.4771 %
## Kappa statistic                          0.9704
## Mean absolute error                      0.0148
## Root mean squared error                  0.1215
## Relative absolute error                  2.958  %
## Root relative squared error             24.323  %
## Total Number of Instances             8124     
## 
## === Confusion Matrix ===
## 
##     a    b   <-- classified as
##  4208    0 |    a = e
##   120 3796 |    b = p

use another algorithm in RWeka

library(RWeka)

# Fit a JRip rule learner predicting type from all other columns.
mush_rip <- JRip(type ~ ., data = mushroom)

# Print the learned rule list.
mush_rip
## JRIP rules:
## ===========
## 
## (odor = f) => type=p (2160.0/0.0)
## (gill_size = n) and (gill_color = b) => type=p (1152.0/0.0)
## (gill_size = n) and (odor = p) => type=p (256.0/0.0)
## (odor = c) => type=p (192.0/0.0)
## (spore_print_color = r) => type=p (72.0/0.0)
## (stalk_surface_below_ring = y) and (stalk_surface_above_ring = k) => type=p (68.0/0.0)
## (habitat = l) and (cap_color = w) => type=p (8.0/0.0)
## (stalk_color_above_ring = y) => type=p (8.0/0.0)
##  => type=e (4208.0/0.0)
## 
## Number of Rules : 9
# Accuracy statistics and confusion matrix for the JRip model.
summary(mush_rip)
## 
## === Summary ===
## 
## Correctly Classified Instances        8124              100      %
## Incorrectly Classified Instances         0                0      %
## Kappa statistic                          1     
## Mean absolute error                      0     
## Root mean squared error                  0     
## Relative absolute error                  0      %
## Root relative squared error              0      %
## Total Number of Instances             8124     
## 
## === Confusion Matrix ===
## 
##     a    b   <-- classified as
##  4208    0 |    a = e
##     0 3916 |    b = p

How about the rule learner’s prediction accuracy when it is trained on only 80% of the data and tested on the held-out rest, as we did with the credit data above?

split training and testing data

library(caret)

# List the column names of the mushroom data frame.
colnames(mushroom)
##  [1] "type"                     "cap_shape"               
##  [3] "cap_surface"              "cap_color"               
##  [5] "bruises"                  "odor"                    
##  [7] "gill_attachment"          "gill_spacing"            
##  [9] "gill_size"                "gill_color"              
## [11] "stalk_shape"              "stalk_root"              
## [13] "stalk_surface_above_ring" "stalk_surface_below_ring"
## [15] "stalk_color_above_ring"   "stalk_color_below_ring"  
## [17] "veil_color"               "ring_number"             
## [19] "ring_type"                "spore_print_color"       
## [21] "population"               "habitat"
# Hold out roughly 20% of the rows for testing, stratified on the outcome
# (type). The fixed seed makes the random partition, and the model fitted on
# it, reproducible across runs.
set.seed(123)
index <- createDataPartition(mushroom$type, p = 0.8, list = FALSE)

train_mush <- mushroom[index, ]

test_mush <- mushroom[-index, ]

train the JRip on training data

# Fit JRip on the training split only.
mush_ript <- JRip(type ~ ., data = train_mush)

# Classify the held-out rows.
mush_ript_pred <- predict(mush_ript, newdata = test_mush)

library(gmodels)

# Cross-tabulate predicted labels (rows) against true labels (columns).
CrossTable(mush_ript_pred, test_mush$type)
## 
##  
##    Cell Contents
## |-------------------------|
## |                       N |
## | Chi-square contribution |
## |           N / Row Total |
## |           N / Col Total |
## |         N / Table Total |
## |-------------------------|
## 
##  
## Total Observations in Table:  1624 
## 
##  
##                | test_mush$type 
## mush_ript_pred |         e |         p | Row Total | 
## ---------------|-----------|-----------|-----------|
##              e |       841 |         0 |       841 | 
##                |   377.518 |   405.482 |           | 
##                |     1.000 |     0.000 |     0.518 | 
##                |     1.000 |     0.000 |           | 
##                |     0.518 |     0.000 |           | 
## ---------------|-----------|-----------|-----------|
##              p |         0 |       783 |       783 | 
##                |   405.482 |   435.518 |           | 
##                |     0.000 |     1.000 |     0.482 | 
##                |     0.000 |     1.000 |           | 
##                |     0.000 |     0.482 |           | 
## ---------------|-----------|-----------|-----------|
##   Column Total |       841 |       783 |      1624 | 
##                |     0.518 |     0.482 |           | 
## ---------------|-----------|-----------|-----------|
## 
## 

The JRip method is pretty powerful according to this untuned model.