load the data

# Switch the session to the folder used for the relative read.csv() calls
# later in this document.
# NOTE(review): machine-specific absolute path; this call errors on any
# machine where the folder does not exist. Consider running the script from
# the data folder instead of hard-coding it.
setwd("C:/Users/Maxwell/Desktop/Learn R")
library(tidyverse)

## Loading tidyverse: ggplot2
## Loading tidyverse: tibble
## Loading tidyverse: tidyr
## Loading tidyverse: readr
## Loading tidyverse: purrr
## Loading tidyverse: dplyr

## Conflicts with tidy packages ----------------------------------------------

## filter(): dplyr, stats
## lag():    dplyr, stats

# Read the credit data from the working directory.
# stringsAsFactors = TRUE is spelled out because read.csv() stopped turning
# text columns into factors by default in R 4.0.0; the str() output shown
# for this data has factor columns, so the conversion is requested explicitly.
credit <- read.csv("credit.csv", stringsAsFactors = TRUE)

# Inspect column types and the first few values of each column.
str(credit)

## 'data.frame':    1000 obs. of  21 variables:
##  $ checking_balance    : Factor w/ 4 levels "< 0 DM","> 200 DM",..: 1 3 4 1 1 4 4 3 4 3 ...
##  $ months_loan_duration: int  6 48 12 42 24 36 24 36 12 30 ...
##  $ credit_history      : Factor w/ 5 levels "critical","delayed",..: 1 5 1 5 2 5 5 5 5 1 ...
##  $ purpose             : Factor w/ 10 levels "business","car (new)",..: 8 8 5 6 2 5 6 3 8 2 ...
##  $ amount              : int  1169 5951 2096 7882 4870 9055 2835 6948 3059 5234 ...
##  $ savings_balance     : Factor w/ 5 levels "< 100 DM","> 1000 DM",..: 5 1 1 1 1 5 4 1 2 1 ...
##  $ employment_length   : Factor w/ 5 levels "> 7 yrs","0 - 1 yrs",..: 1 3 4 4 3 3 1 3 4 5 ...
##  $ installment_rate    : int  4 2 2 2 3 2 3 2 2 4 ...
##  $ personal_status     : Factor w/ 4 levels "divorced male",..: 4 2 4 4 4 4 4 4 1 3 ...
##  $ other_debtors       : Factor w/ 3 levels "co-applicant",..: 3 3 3 2 3 3 3 3 3 3 ...
##  $ residence_history   : int  4 2 3 4 4 4 4 2 4 2 ...
##  $ property            : Factor w/ 4 levels "building society savings",..: 3 3 3 1 4 4 1 2 3 2 ...
##  $ age                 : int  67 22 49 45 53 35 53 35 61 28 ...
##  $ installment_plan    : Factor w/ 3 levels "bank","none",..: 2 2 2 2 2 2 2 2 2 2 ...
##  $ housing             : Factor w/ 3 levels "for free","own",..: 2 2 2 1 1 1 2 3 2 2 ...
##  $ existing_credits    : int  2 1 1 1 2 1 1 1 1 2 ...
##  $ default             : int  1 2 1 1 2 1 1 1 1 2 ...
##  $ dependents          : int  1 1 2 2 2 2 1 1 1 1 ...
##  $ telephone           : Factor w/ 2 levels "none","yes": 2 1 1 1 1 2 1 2 1 1 ...
##  $ foreign_worker      : Factor w/ 2 levels "no","yes": 2 2 2 2 2 2 2 2 2 2 ...
##  $ job                 : Factor w/ 4 levels "mangement self-employed",..: 2 2 4 2 2 4 2 1 4 1 ...

# Frequency of each checking-account balance category.
table(credit[["checking_balance"]])

## 
##     < 0 DM   > 200 DM 1 - 200 DM    unknown 
##        274         63        269        394

# Frequency of each savings-account balance category.
table(credit[["savings_balance"]])

## 
##      < 100 DM     > 1000 DM  101 - 500 DM 501 - 1000 DM       unknown 
##           603            48           103            63           183

convert the default variable into a factor (required by the C5.0 decision tree)

# Recode the outcome: the value 1 becomes "no", every other value becomes
# "yes"; the result is stored back as a factor.
credit$default <- factor(ifelse(credit$default == 1, "no", "yes"))
summary(credit)

##    checking_balance months_loan_duration                credit_history
##  < 0 DM    :274     Min.   : 4.0         critical              :293   
##  > 200 DM  : 63     1st Qu.:12.0         delayed               : 88   
##  1 - 200 DM:269     Median :18.0         fully repaid          : 40   
##  unknown   :394     Mean   :20.9         fully repaid this bank: 49   
##                     3rd Qu.:24.0         repaid                :530   
##                     Max.   :72.0                                      
##                                                                       
##        purpose        amount           savings_balance  employment_length
##  radio/tv  :280   Min.   :  250   < 100 DM     :603    > 7 yrs   :253    
##  car (new) :234   1st Qu.: 1366   > 1000 DM    : 48    0 - 1 yrs :172    
##  furniture :181   Median : 2320   101 - 500 DM :103    1 - 4 yrs :339    
##  car (used):103   Mean   : 3271   501 - 1000 DM: 63    4 - 7 yrs :174    
##  business  : 97   3rd Qu.: 3972   unknown      :183    unemployed: 62    
##  education : 50   Max.   :18424                                          
##  (Other)   : 55                                                          
##  installment_rate      personal_status      other_debtors
##  Min.   :1.000    divorced male: 50    co-applicant: 41  
##  1st Qu.:2.000    female       :310    guarantor   : 52  
##  Median :3.000    married male : 92    none        :907  
##  Mean   :2.973    single male  :548                      
##  3rd Qu.:4.000                                           
##  Max.   :4.000                                           
##                                                          
##  residence_history                     property        age       
##  Min.   :1.000     building society savings:232   Min.   :19.00  
##  1st Qu.:2.000     other                   :332   1st Qu.:27.00  
##  Median :3.000     real estate             :282   Median :33.00  
##  Mean   :2.845     unknown/none            :154   Mean   :35.55  
##  3rd Qu.:4.000                                    3rd Qu.:42.00  
##  Max.   :4.000                                    Max.   :75.00  
##                                                                  
##  installment_plan     housing    existing_credits default  
##  bank  :139       for free:108   Min.   :1.000    no :700  
##  none  :814       own     :713   1st Qu.:1.000    yes:300  
##  stores: 47       rent    :179   Median :1.000             
##                                  Mean   :1.407             
##                                  3rd Qu.:2.000             
##                                  Max.   :4.000             
##                                                            
##    dependents    telephone  foreign_worker                      job     
##  Min.   :1.000   none:596   no : 37        mangement self-employed:148  
##  1st Qu.:1.000   yes :404   yes:963        skilled employee       :630  
##  Median :1.000                             unemployed non-resident: 22  
##  Mean   :1.155                             unskilled resident     :200  
##  3rd Qu.:1.000                                                          
##  Max.   :2.000                                                          
##

use caret package for splitting the data

library(caret)

## Loading required package: lattice

## 
## Attaching package: 'caret'

## The following object is masked from 'package:purrr':
## 
##     lift

# Fix the RNG seed so the random train/test partition below (and every
# model fitted on it) is the same from run to run.
set.seed(123)

# Draw the row indices for a 90% training sample.
# NOTE(review): the split is stratified on checking_balance, not on the
# outcome `default`; confirm this is intended before changing it.
index <- createDataPartition(credit$checking_balance, p = 0.9, list = FALSE)

# Rows selected by the partition form the training set ...
train_credit <- credit[index, ]

# ... and all remaining rows form the test set.
test_credit <- credit[-index, ]

load the C50 decision tree package

library(C50)
library(tidyverse)

# Fit a single C5.0 classification tree that predicts `default` from all
# other columns of the training set.
credit_dtree <- C5.0(default ~ ., data = train_credit)

# Show the short model description.
credit_dtree

## 
## Call:
## C5.0.formula(formula = default ~ ., data = train_credit)
## 
## Classification Tree
## Number of samples: 902 
## Number of predictors: 20 
## 
## Tree size: 48 
## 
## Non-standard options: attempt to group attributes

# Print the fitted tree together with its evaluation on the training data
# and the attribute-usage table.
summary(credit_dtree)

## 
## Call:
## C5.0.formula(formula = default ~ ., data = train_credit)
## 
## 
## C5.0 [Release 2.07 GPL Edition]      Fri Feb 17 20:58:16 2017
## -------------------------------
## 
## Class specified by attribute `outcome'
## 
## Read 902 cases (21 attributes) from undefined.data
## 
## Decision tree:
## 
## checking_balance in {> 200 DM,unknown}: no (412/57)
## checking_balance in {< 0 DM,1 - 200 DM}:
## :...credit_history in {fully repaid,fully repaid this bank}:
##     :...dependents > 1: yes (11)
##     :   dependents <= 1:
##     :   :...savings_balance = > 1000 DM: yes (0)
##     :       savings_balance in {501 - 1000 DM,unknown}: no (7)
##     :       savings_balance in {< 100 DM,101 - 500 DM}:
##     :       :...other_debtors = co-applicant: no (1)
##     :           other_debtors = none: yes (40/9)
##     :           other_debtors = guarantor:
##     :           :...employment_length in {> 7 yrs,0 - 1 yrs,1 - 4 yrs,
##     :               :                     unemployed}: no (2)
##     :               employment_length = 4 - 7 yrs: yes (2)
##     credit_history in {critical,delayed,repaid}:
##     :...other_debtors = co-applicant:
##         :...foreign_worker = no: no (2)
##         :   foreign_worker = yes: yes (17/5)
##         other_debtors = guarantor:
##         :...purpose in {business,car (used),domestic appliances,education,
##         :   :           furniture,others,repairs,retraining}: no (10)
##         :   purpose = car (new):
##         :   :...installment_plan in {bank,stores}: yes (4)
##         :   :   installment_plan = none: no (1)
##         :   purpose = radio/tv:
##         :   :...months_loan_duration <= 33: no (20)
##         :       months_loan_duration > 33: yes (3)
##         other_debtors = none:
##         :...savings_balance = > 1000 DM: no (16/1)
##             savings_balance = 501 - 1000 DM:
##             :...months_loan_duration <= 21: no (10/1)
##             :   months_loan_duration > 21: yes (2)
##             savings_balance = 101 - 500 DM:
##             :...personal_status = divorced male:
##             :   :...credit_history in {critical,delayed}: no (2)
##             :   :   credit_history = repaid: yes (2)
##             :   personal_status = female:
##             :   :...credit_history = delayed: no (1)
##             :   :   credit_history in {critical,repaid}: yes (6/1)
##             :   personal_status = married male:
##             :   :...credit_history = critical: no (1)
##             :   :   credit_history in {delayed,repaid}: yes (3)
##             :   personal_status = single male:
##             :   :...existing_credits <= 1: no (15/1)
##             :       existing_credits > 1:
##             :       :...residence_history <= 3: no (3)
##             :           residence_history > 3: yes (2)
##             savings_balance = unknown:
##             :...existing_credits > 1: no (12)
##             :   existing_credits <= 1:
##             :   :...credit_history in {critical,delayed}: no (5)
##             :       credit_history = repaid:
##             :       :...checking_balance = < 0 DM:
##             :           :...telephone = none: yes (9/1)
##             :           :   telephone = yes:
##             :           :   :...months_loan_duration <= 30: no (7/1)
##             :           :       months_loan_duration > 30: yes (2)
##             :           checking_balance = 1 - 200 DM:
##             :           :...property = building society savings: yes (4/1)
##             :               property in {other,real estate}: no (14/2)
##             :               property = unknown/none:
##             :               :...amount <= 5848: no (3)
##             :                   amount > 5848: yes (2)
##             savings_balance = < 100 DM:
##             :...months_loan_duration > 27:
##                 :...residence_history > 1: yes (49/11)
##                 :   residence_history <= 1:
##                 :   :...checking_balance = 1 - 200 DM: no (3)
##                 :       checking_balance = < 0 DM:
##                 :       :...months_loan_duration <= 40: yes (3)
##                 :           months_loan_duration > 40: no (2)
##                 months_loan_duration <= 27:
##                 :...credit_history = critical: no (56/12)
##                     credit_history in {delayed,repaid}:
##                     :...installment_rate > 2:
##                         :...months_loan_duration <= 11: no (23/7)
##                         :   months_loan_duration > 11: yes (64/21)
##                         installment_rate <= 2:
##                         :...foreign_worker = no: yes (2)
##                             foreign_worker = yes:
##                             :...amount <= 1372:
##                                 :...telephone = none: yes (7)
##                                 :   telephone = yes: no (2)
##                                 amount > 1372:
##                                 :...amount <= 9629: no (36/5)
##                                     amount > 9629: yes (2)
## 
## 
## Evaluation on training data (902 cases):
## 
##      Decision Tree   
##    ----------------  
##    Size      Errors  
## 
##      47  136(15.1%)   <<
## 
## 
##     (a)   (b)    <-classified as
##    ----  ----
##     579    49    (a): class no
##      87   187    (b): class yes
## 
## 
##  Attribute usage:
## 
##  100.00% checking_balance
##   54.32% credit_history
##   52.33% other_debtors
##   46.78% savings_balance
##   32.48% months_loan_duration
##   15.08% installment_rate
##    8.65% existing_credits
##    7.54% foreign_worker
##    6.98% dependents
##    6.87% residence_history
##    5.76% amount
##    4.21% purpose
##    3.88% personal_status
##    2.99% telephone
##    2.55% property
##    0.55% installment_plan
##    0.44% employment_length
## 
## 
## Time: 0.0 secs

predict using the model

library(gmodels)

# Classify the held-out rows with the single tree.
credit_dtree_pred <- predict(credit_dtree, newdata = test_credit)

# Cross-tabulate predicted class (rows) against the recorded class (columns).
CrossTable(credit_dtree_pred, test_credit$default)

## 
##  
##    Cell Contents
## |-------------------------|
## |                       N |
## | Chi-square contribution |
## |           N / Row Total |
## |           N / Col Total |
## |         N / Table Total |
## |-------------------------|
## 
##  
## Total Observations in Table:  98 
## 
##  
##                   | test_credit$default 
## credit_dtree_pred |        no |       yes | Row Total | 
## ------------------|-----------|-----------|-----------|
##                no |        59 |        13 |        72 | 
##                   |     0.704 |     1.949 |           | 
##                   |     0.819 |     0.181 |     0.735 | 
##                   |     0.819 |     0.500 |           | 
##                   |     0.602 |     0.133 |           | 
## ------------------|-----------|-----------|-----------|
##               yes |        13 |        13 |        26 | 
##                   |     1.949 |     5.398 |           | 
##                   |     0.500 |     0.500 |     0.265 | 
##                   |     0.181 |     0.500 |           | 
##                   |     0.133 |     0.133 |           | 
## ------------------|-----------|-----------|-----------|
##      Column Total |        72 |        26 |        98 | 
##                   |     0.735 |     0.265 |           | 
## ------------------|-----------|-----------|-----------|
## 
##

improved decision tree

# Refit the tree with 10 boosting iterations.
credit_bdtree <- C5.0(
  default ~ .,
  data = train_credit,
  trials = 10
)
credit_bdtree

## 
## Call:
## C5.0.formula(formula = default ~ ., data = train_credit, trials = 10)
## 
## Classification Tree
## Number of samples: 902 
## Number of predictors: 20 
## 
## Number of boosting iterations: 10 
## Average tree size: 41.2 
## 
## Non-standard options: attempt to group attributes

library(gmodels)

# Classify the held-out rows with the boosted model.
credit_bdtree_pred <- predict(credit_bdtree, newdata = test_credit)

# Cross-tabulate predicted class (rows) against the recorded class (columns).
CrossTable(credit_bdtree_pred, test_credit$default)

## 
##  
##    Cell Contents
## |-------------------------|
## |                       N |
## | Chi-square contribution |
## |           N / Row Total |
## |           N / Col Total |
## |         N / Table Total |
## |-------------------------|
## 
##  
## Total Observations in Table:  98 
## 
##  
##                    | test_credit$default 
## credit_bdtree_pred |        no |       yes | Row Total | 
## -------------------|-----------|-----------|-----------|
##                 no |        65 |        12 |        77 | 
##                    |     1.256 |     3.478 |           | 
##                    |     0.844 |     0.156 |     0.786 | 
##                    |     0.903 |     0.462 |           | 
##                    |     0.663 |     0.122 |           | 
## -------------------|-----------|-----------|-----------|
##                yes |         7 |        14 |        21 | 
##                    |     4.604 |    12.751 |           | 
##                    |     0.333 |     0.667 |     0.214 | 
##                    |     0.097 |     0.538 |           | 
##                    |     0.071 |     0.143 |           | 
## -------------------|-----------|-----------|-----------|
##       Column Total |        72 |        26 |        98 | 
##                    |     0.735 |     0.265 |           | 
## -------------------|-----------|-----------|-----------|
## 
##

add a cost matrix, matrix(c(0,1,25,0),nrow=2)

# 2 x 2 misclassification-cost matrix with "no"/"yes" labels on both
# dimensions. Values fill column by column, so the ("no", "yes") cell is 25
# and the ("yes", "no") cell is 1; the diagonal is 0.
# NOTE(review): C5.0 is documented to read rows as predicted and columns as
# actual classes -- confirm the orientation for the installed C50 version.
costM <- matrix(
  c(0, 1, 25, 0),
  nrow = 2,
  dimnames = list(c("no", "yes"), c("no", "yes"))
)

costM

##     no yes
## no   0  25
## yes  1   0

library(gmodels)

# Boosted C5.0 model (10 trials) that also takes the misclassification
# costs in costM into account.
credit_cbdtree <- C5.0(
  formula = default ~ .,
  data = train_credit,
  trials = 10,
  costs = costM
)

# Classify the held-out rows with the cost-sensitive model.
credit_cbdtree_pred <- predict(credit_cbdtree, newdata = test_credit)

# Cross-tabulate predicted class (rows) against the recorded class (columns).
CrossTable(credit_cbdtree_pred, test_credit$default)

## 
##  
##    Cell Contents
## |-------------------------|
## |                       N |
## | Chi-square contribution |
## |           N / Row Total |
## |           N / Col Total |
## |         N / Table Total |
## |-------------------------|
## 
##  
## Total Observations in Table:  98 
## 
##  
##                     | test_credit$default 
## credit_cbdtree_pred |        no |       yes | Row Total | 
## --------------------|-----------|-----------|-----------|
##                  no |        64 |        13 |        77 | 
##                     |     0.975 |     2.701 |           | 
##                     |     0.831 |     0.169 |     0.786 | 
##                     |     0.889 |     0.500 |           | 
##                     |     0.653 |     0.133 |           | 
## --------------------|-----------|-----------|-----------|
##                 yes |         8 |        13 |        21 | 
##                     |     3.577 |     9.905 |           | 
##                     |     0.381 |     0.619 |     0.214 | 
##                     |     0.111 |     0.500 |           | 
##                     |     0.082 |     0.133 |           | 
## --------------------|-----------|-----------|-----------|
##        Column Total |        72 |        26 |        98 | 
##                     |     0.735 |     0.265 |           | 
## --------------------|-----------|-----------|-----------|
## 
##

continue for the mushroom data

# Show the current working directory; the relative path in the read.csv()
# call that follows is resolved against it.
getwd()

## [1] "C:/Users/Maxwell/Desktop/Learn R"

# Read the mushroom data from the working directory.
# stringsAsFactors = TRUE is spelled out because read.csv() stopped turning
# text columns into factors by default in R 4.0.0; the str() output shown
# for this data has every column as a factor.
mushroom <- read.csv("mushrooms.csv", stringsAsFactors = TRUE)

# Inspect column types and factor levels.
str(mushroom)

## 'data.frame':    8124 obs. of  23 variables:
##  $ type                    : Factor w/ 2 levels "e","p": 2 1 1 2 1 1 1 1 2 1 ...
##  $ cap_shape               : Factor w/ 6 levels "b","c","f","k",..: 6 6 1 6 6 6 1 1 6 1 ...
##  $ cap_surface             : Factor w/ 4 levels "f","g","s","y": 3 3 3 4 3 4 3 4 4 3 ...
##  $ cap_color               : Factor w/ 10 levels "b","c","e","g",..: 5 10 9 9 4 10 9 9 9 10 ...
##  $ bruises                 : Factor w/ 2 levels "f","t": 2 2 2 2 1 2 2 2 2 2 ...
##  $ odor                    : Factor w/ 9 levels "a","c","f","l",..: 7 1 4 7 6 1 1 4 7 1 ...
##  $ gill_attachment         : Factor w/ 2 levels "a","f": 2 2 2 2 2 2 2 2 2 2 ...
##  $ gill_spacing            : Factor w/ 2 levels "c","w": 1 1 1 1 2 1 1 1 1 1 ...
##  $ gill_size               : Factor w/ 2 levels "b","n": 2 1 1 2 1 1 1 1 2 1 ...
##  $ gill_color              : Factor w/ 12 levels "b","e","g","h",..: 5 5 6 6 5 6 3 6 8 3 ...
##  $ stalk_shape             : Factor w/ 2 levels "e","t": 1 1 1 1 2 1 1 1 1 1 ...
##  $ stalk_root              : Factor w/ 5 levels "?","b","c","e",..: 4 3 3 4 4 3 3 3 4 3 ...
##  $ stalk_surface_above_ring: Factor w/ 4 levels "f","k","s","y": 3 3 3 3 3 3 3 3 3 3 ...
##  $ stalk_surface_below_ring: Factor w/ 4 levels "f","k","s","y": 3 3 3 3 3 3 3 3 3 3 ...
##  $ stalk_color_above_ring  : Factor w/ 9 levels "b","c","e","g",..: 8 8 8 8 8 8 8 8 8 8 ...
##  $ stalk_color_below_ring  : Factor w/ 9 levels "b","c","e","g",..: 8 8 8 8 8 8 8 8 8 8 ...
##  $ veil_type               : Factor w/ 1 level "p": 1 1 1 1 1 1 1 1 1 1 ...
##  $ veil_color              : Factor w/ 4 levels "n","o","w","y": 3 3 3 3 3 3 3 3 3 3 ...
##  $ ring_number             : Factor w/ 3 levels "n","o","t": 2 2 2 2 2 2 2 2 2 2 ...
##  $ ring_type               : Factor w/ 5 levels "e","f","l","n",..: 5 5 5 5 1 5 5 5 5 5 ...
##  $ spore_print_color       : Factor w/ 9 levels "b","h","k","n",..: 3 4 4 3 4 3 3 4 3 3 ...
##  $ population              : Factor w/ 6 levels "a","c","n","s",..: 4 3 3 4 1 3 3 4 5 4 ...
##  $ habitat                 : Factor w/ 7 levels "d","g","l","m",..: 6 2 4 6 2 2 4 4 2 4 ...

# Per-column summary of the mushroom data (counts for each factor level).
summary(mushroom)

##  type     cap_shape cap_surface   cap_color    bruises       odor     
##  e:4208   b: 452    f:2320      n      :2284   f:4748   n      :3528  
##  p:3916   c:   4    g:   4      g      :1840   t:3376   f      :2160  
##           f:3152    s:2556      e      :1500            s      : 576  
##           k: 828    y:3244      y      :1072            y      : 576  
##           s:  32                w      :1040            a      : 400  
##           x:3656                b      : 168            l      : 400  
##                                 (Other): 220            (Other): 484  
##  gill_attachment gill_spacing gill_size   gill_color   stalk_shape
##  a: 210          c:6812       b:5612    b      :1728   e:3516     
##  f:7914          w:1312       n:2512    p      :1492   t:4608     
##                                         w      :1202              
##                                         n      :1048              
##                                         g      : 752              
##                                         h      : 732              
##                                         (Other):1170              
##  stalk_root stalk_surface_above_ring stalk_surface_below_ring
##  ?:2480     f: 552                   f: 600                  
##  b:3776     k:2372                   k:2304                  
##  c: 556     s:5176                   s:4936                  
##  e:1120     y:  24                   y: 284                  
##  r: 192                                                      
##                                                              
##                                                              
##  stalk_color_above_ring stalk_color_below_ring veil_type veil_color
##  w      :4464           w      :4384           p:8124    n:  96    
##  p      :1872           p      :1872                     o:  96    
##  g      : 576           g      : 576                     w:7924    
##  n      : 448           n      : 512                     y:   8    
##  b      : 432           b      : 432                               
##  o      : 192           o      : 192                               
##  (Other): 140           (Other): 156                               
##  ring_number ring_type spore_print_color population habitat 
##  n:  36      e:2776    w      :2388      a: 384     d:3148  
##  o:7488      f:  48    n      :1968      c: 340     g:2148  
##  t: 600      l:1296    k      :1872      n: 400     l: 832  
##              n:  36    h      :1632      s:1248     m: 292  
##              p:3968    r      :  72      v:4040     p:1144  
##                        b      :  48      y:1712     u: 368  
##                        (Other): 144                 w: 192

notice that the veil_type column has only one level, so it needs to be removed.

# Drop veil_type by assigning NULL to the column, then re-check the summary.
mushroom[["veil_type"]] <- NULL
summary(mushroom)

##  type     cap_shape cap_surface   cap_color    bruises       odor     
##  e:4208   b: 452    f:2320      n      :2284   f:4748   n      :3528  
##  p:3916   c:   4    g:   4      g      :1840   t:3376   f      :2160  
##           f:3152    s:2556      e      :1500            s      : 576  
##           k: 828    y:3244      y      :1072            y      : 576  
##           s:  32                w      :1040            a      : 400  
##           x:3656                b      : 168            l      : 400  
##                                 (Other): 220            (Other): 484  
##  gill_attachment gill_spacing gill_size   gill_color   stalk_shape
##  a: 210          c:6812       b:5612    b      :1728   e:3516     
##  f:7914          w:1312       n:2512    p      :1492   t:4608     
##                                         w      :1202              
##                                         n      :1048              
##                                         g      : 752              
##                                         h      : 732              
##                                         (Other):1170              
##  stalk_root stalk_surface_above_ring stalk_surface_below_ring
##  ?:2480     f: 552                   f: 600                  
##  b:3776     k:2372                   k:2304                  
##  c: 556     s:5176                   s:4936                  
##  e:1120     y:  24                   y: 284                  
##  r: 192                                                      
##                                                              
##                                                              
##  stalk_color_above_ring stalk_color_below_ring veil_color ring_number
##  w      :4464           w      :4384           n:  96     n:  36     
##  p      :1872           p      :1872           o:  96     o:7488     
##  g      : 576           g      : 576           w:7924     t: 600     
##  n      : 448           n      : 512           y:   8                
##  b      : 432           b      : 432                                 
##  o      : 192           o      : 192                                 
##  (Other): 140           (Other): 156                                 
##  ring_type spore_print_color population habitat 
##  e:2776    w      :2388      a: 384     d:3148  
##  f:  48    n      :1968      c: 340     g:2148  
##  l:1296    k      :1872      n: 400     l: 832  
##  n:  36    h      :1632      s:1248     m: 292  
##  p:3968    r      :  72      v:4040     p:1144  
##            b      :  48      y:1712     u: 368  
##            (Other): 144                 w: 192

use RWeka package for rule learning.

library(RWeka)

# Fit a OneR rule learner predicting `type` from all other columns.
mush_rule <- OneR(type ~ ., data = mushroom)

# Show the rule that was learned.
mush_rule

## odor:
##  a   -> e
##  c   -> p
##  f   -> p
##  l   -> e
##  m   -> p
##  n   -> e
##  p   -> p
##  s   -> p
##  y   -> p
## (8004/8124 instances correct)

# Print the evaluation statistics and confusion matrix for the OneR model.
summary(mush_rule)

## 
## === Summary ===
## 
## Correctly Classified Instances        8004               98.5229 %
## Incorrectly Classified Instances       120                1.4771 %
## Kappa statistic                          0.9704
## Mean absolute error                      0.0148
## Root mean squared error                  0.1215
## Relative absolute error                  2.958  %
## Root relative squared error             24.323  %
## Total Number of Instances             8124     
## 
## === Confusion Matrix ===
## 
##     a    b   <-- classified as
##  4208    0 |    a = e
##   120 3796 |    b = p

use another algorithm in RWeka

library(RWeka)

# Fit a JRip rule learner predicting `type` from all other columns.
mush_rip <- JRip(type ~ ., data = mushroom)

# Show the rule list that was learned.
mush_rip

## JRIP rules:
## ===========
## 
## (odor = f) => type=p (2160.0/0.0)
## (gill_size = n) and (gill_color = b) => type=p (1152.0/0.0)
## (gill_size = n) and (odor = p) => type=p (256.0/0.0)
## (odor = c) => type=p (192.0/0.0)
## (spore_print_color = r) => type=p (72.0/0.0)
## (stalk_surface_below_ring = y) and (stalk_surface_above_ring = k) => type=p (68.0/0.0)
## (habitat = l) and (cap_color = w) => type=p (8.0/0.0)
## (stalk_color_above_ring = y) => type=p (8.0/0.0)
##  => type=e (4208.0/0.0)
## 
## Number of Rules : 9

# Print the evaluation statistics and confusion matrix for the JRip model.
summary(mush_rip)

## 
## === Summary ===
## 
## Correctly Classified Instances        8124              100      %
## Incorrectly Classified Instances         0                0      %
## Kappa statistic                          1     
## Mean absolute error                      0     
## Root mean squared error                  0     
## Relative absolute error                  0      %
## Root relative squared error              0      %
## Total Number of Instances             8124     
## 
## === Confusion Matrix ===
## 
##     a    b   <-- classified as
##  4208    0 |    a = e
##     0 3916 |    b = p

how about the rule learner’s prediction accuracy when it is trained on only 80% of the data? hold out a test set, as we did for the decision tree above

split training and testing data

library(caret)

# List the column names that remain in the mushroom data.
colnames(mushroom)

##  [1] "type"                     "cap_shape"               
##  [3] "cap_surface"              "cap_color"               
##  [5] "bruises"                  "odor"                    
##  [7] "gill_attachment"          "gill_spacing"            
##  [9] "gill_size"                "gill_color"              
## [11] "stalk_shape"              "stalk_root"              
## [13] "stalk_surface_above_ring" "stalk_surface_below_ring"
## [15] "stalk_color_above_ring"   "stalk_color_below_ring"  
## [17] "veil_color"               "ring_number"             
## [19] "ring_type"                "spore_print_color"       
## [21] "population"               "habitat"

# Fix the RNG seed so the random train/test partition below is the same
# from run to run.
set.seed(123)

# Draw the row indices for an 80% training sample, stratified on the
# outcome column `type`.
index <- createDataPartition(mushroom$type, p = 0.8, list = FALSE)

# Rows selected by the partition form the training set ...
train_mush <- mushroom[index, ]

# ... and all remaining rows form the test set.
test_mush <- mushroom[-index, ]

train the JRip on training data

library(gmodels)

# Fit JRip on the training rows only.
mush_ript <- JRip(type ~ ., data = train_mush)

# Classify the held-out rows.
mush_ript_pred <- predict(mush_ript, newdata = test_mush)

# Cross-tabulate predicted type (rows) against the recorded type (columns).
CrossTable(mush_ript_pred, test_mush$type)

## 
##  
##    Cell Contents
## |-------------------------|
## |                       N |
## | Chi-square contribution |
## |           N / Row Total |
## |           N / Col Total |
## |         N / Table Total |
## |-------------------------|
## 
##  
## Total Observations in Table:  1624 
## 
##  
##                | test_mush$type 
## mush_ript_pred |         e |         p | Row Total | 
## ---------------|-----------|-----------|-----------|
##              e |       841 |         0 |       841 | 
##                |   377.518 |   405.482 |           | 
##                |     1.000 |     0.000 |     0.518 | 
##                |     1.000 |     0.000 |           | 
##                |     0.518 |     0.000 |           | 
## ---------------|-----------|-----------|-----------|
##              p |         0 |       783 |       783 | 
##                |   405.482 |   435.518 |           | 
##                |     0.000 |     1.000 |     0.482 | 
##                |     0.000 |     1.000 |           | 
##                |     0.000 |     0.482 |           | 
## ---------------|-----------|-----------|-----------|
##   Column Total |       841 |       783 |      1624 | 
##                |     0.518 |     0.482 |           | 
## ---------------|-----------|-----------|-----------|
## 
##

decision tree

load the data

convert the default variable into a factor (required by the C5.0 decision tree)

use caret package for splitting the data

load the C50 decision tree package

predict using the model

improved decision tree

add a cost matrix, matrix(c(0,1,25,0),nrow=2)

continue for the mushroom data

notice that the veil_type column has only one level, so it needs to be removed.

use RWeka package for rule learning.

use another algorithm in RWeka

how about the rule learner’s prediction accuracy when it is trained on only 80% of the data? hold out a test set, as we did for the decision tree above

split training and testing data

train the JRip on training data

The JRip method is pretty powerful according to this untuned model.