Classification on Pistachio Species

library(readxl)
# Location of the Excel workbook that holds the pistachio measurements.
pistachio_path <- "C:/BIBIB/MTI Data Mining/Pistachio.xlsx"

# Import the workbook as a tibble.
Pistachio <- read_excel(path = pistachio_path)

# Preview the first six rows to confirm the import worked.
head(x = Pistachio, n = 6L)
## # A tibble: 6 x 17
##    AREA PERIMETER MAJOR_AXIS MINOR_AXIS ECCENTRICITY EQDIASQ SOLIDITY
##   <dbl>     <dbl>      <dbl>      <dbl>        <dbl>   <dbl>    <dbl>
## 1 63391     1568.       390.       237.        0.795    284.    0.866
## 2 68358     1942.       411.       235.        0.821    295.    0.876
## 3 73589     1247.       452.       221.        0.873    306.    0.917
## 4 71106     1445.       430.       216.        0.864    301.    0.959
## 5 80087     1252.       469.       221.        0.882    319.    0.966
## 6 52268     1154.       384.       198.        0.858    258.    0.856
## # ... with 10 more variables: CONVEX_AREA <dbl>, EXTENT <dbl>,
## #   ASPECT_RATIO <dbl>, ROUNDNESS <dbl>, COMPACTNESS <dbl>,
## #   SHAPEFACTOR_1 <dbl>, SHAPEFACTOR_2 <dbl>, SHAPEFACTOR_3 <dbl>,
## #   SHAPEFACTOR_4 <dbl>, Class <chr>

Mengubah kolom Class menjadi bentuk faktor agar kelas pistachio terbaca oleh algoritma klasifikasi

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# Convert every character column to a factor so that the classifiers treat
# `Class` as a categorical target. across() + where() replaces the superseded
# mutate_if() and produces the same result (requires dplyr >= 1.0.0).
Pistachio_Fix <- Pistachio %>%
  mutate(across(where(is.character), as.factor))

# Class should now be reported as <fct> instead of <chr>.
head(Pistachio_Fix)
## # A tibble: 6 x 17
##    AREA PERIMETER MAJOR_AXIS MINOR_AXIS ECCENTRICITY EQDIASQ SOLIDITY
##   <dbl>     <dbl>      <dbl>      <dbl>        <dbl>   <dbl>    <dbl>
## 1 63391     1568.       390.       237.        0.795    284.    0.866
## 2 68358     1942.       411.       235.        0.821    295.    0.876
## 3 73589     1247.       452.       221.        0.873    306.    0.917
## 4 71106     1445.       430.       216.        0.864    301.    0.959
## 5 80087     1252.       469.       221.        0.882    319.    0.966
## 6 52268     1154.       384.       198.        0.858    258.    0.856
## # ... with 10 more variables: CONVEX_AREA <dbl>, EXTENT <dbl>,
## #   ASPECT_RATIO <dbl>, ROUNDNESS <dbl>, COMPACTNESS <dbl>,
## #   SHAPEFACTOR_1 <dbl>, SHAPEFACTOR_2 <dbl>, SHAPEFACTOR_3 <dbl>,
## #   SHAPEFACTOR_4 <dbl>, Class <fct>
# Per-column summary statistics; for the factor column Class this reports the
# number of rows in each level.
summary(object = Pistachio_Fix)
##       AREA          PERIMETER        MAJOR_AXIS      MINOR_AXIS   
##  Min.   : 29808   Min.   : 858.4   Min.   :320.3   Min.   :133.5  
##  1st Qu.: 71937   1st Qu.:1171.0   1st Qu.:426.5   1st Qu.:217.9  
##  Median : 79906   Median :1262.8   Median :448.6   Median :236.4  
##  Mean   : 79951   Mean   :1426.0   Mean   :446.2   Mean   :238.3  
##  3rd Qu.: 89031   3rd Qu.:1607.9   3rd Qu.:468.5   3rd Qu.:257.8  
##  Max.   :124008   Max.   :2755.0   Max.   :542.0   Max.   :383.0  
##   ECCENTRICITY       EQDIASQ         SOLIDITY       CONVEX_AREA    
##  Min.   :0.5049   Min.   :194.8   Min.   :0.5880   Min.   : 37935  
##  1st Qu.:0.8175   1st Qu.:302.6   1st Qu.:0.9198   1st Qu.: 76467  
##  Median :0.8497   Median :319.0   Median :0.9542   Median : 85076  
##  Mean   :0.8402   Mean   :317.9   Mean   :0.9401   Mean   : 85016  
##  3rd Qu.:0.8752   3rd Qu.:336.7   3rd Qu.:0.9769   3rd Qu.: 93894  
##  Max.   :0.9460   Max.   :397.4   Max.   :0.9951   Max.   :132478  
##      EXTENT        ASPECT_RATIO     ROUNDNESS       COMPACTNESS    
##  Min.   :0.4272   Min.   :1.159   Min.   :0.0628   Min.   :0.4760  
##  1st Qu.:0.6870   1st Qu.:1.736   1st Qu.:0.3713   1st Qu.:0.6815  
##  Median :0.7265   Median :1.896   Median :0.6434   Median :0.7107  
##  Mean   :0.7161   Mean   :1.898   Mean   :0.5692   Mean   :0.7131  
##  3rd Qu.:0.7536   3rd Qu.:2.067   3rd Qu.:0.7441   3rd Qu.:0.7417  
##  Max.   :0.8204   Max.   :3.086   Max.   :0.9336   Max.   :0.8779  
##  SHAPEFACTOR_1      SHAPEFACTOR_2      SHAPEFACTOR_3    SHAPEFACTOR_4   
##  Min.   :0.004000   Min.   :0.002400   Min.   :0.2266   Min.   :0.6204  
##  1st Qu.:0.005200   1st Qu.:0.002800   1st Qu.:0.4645   1st Qu.:0.9440  
##  Median :0.005600   Median :0.003000   Median :0.5051   Median :0.9731  
##  Mean   :0.005701   Mean   :0.003017   Mean   :0.5105   Mean   :0.9552  
##  3rd Qu.:0.006100   3rd Qu.:0.003200   3rd Qu.:0.5501   3rd Qu.:0.9873  
##  Max.   :0.013100   Max.   :0.005300   Max.   :0.7706   Max.   :0.9990  
##      Class     
##  Kirmizi:1232  
##  Siit   : 916  
##                
##                
##                
## 

Membagi ke dalam data training dan data testing

Cross validation

Kita akan melakukan validasi sederhana dengan metode hold-out (membagi data menjadi bagian-bagian tertentu). Pada kesempatan kali ini, kita membagi data menjadi 2, yakni data train dan data test, dengan proporsi data train 80% dan data test 20%.

# Fix the seed so the random train/test partition is reproducible.
set.seed(100)
# Use the "Rejection" sampling algorithm. "Rounding" was the default before
# R 3.6.0, made sample() noticeably non-uniform on large populations, and
# should only be used to reproduce old results.
RNGkind(sample.kind = "Rejection")

# Draw 80% of the row indices (rounded down) for the training set.
n_Pistachio <- nrow(Pistachio_Fix)
idx_Pistachio <- sample(n_Pistachio, size = floor(n_Pistachio * 0.8))

# Training rows are the sampled indices; test rows are all remaining rows.
# The negative index is essential: selecting the test set with idx_Pistachio
# itself would make it identical to the training set, so every "test" metric
# would really be a training metric.
train_Pistachio <- Pistachio_Fix[idx_Pistachio, ]
test_Pistachio <- Pistachio_Fix[-idx_Pistachio, ]

# NOTE: any test-set output recorded in this document before this fix
# (head of the test set, prediction counts, confusion matrices) was computed
# on the training rows and must be regenerated.
head(train_Pistachio)
## # A tibble: 6 x 17
##     AREA PERIMETER MAJOR_AXIS MINOR_AXIS ECCENTRICITY EQDIASQ SOLIDITY
##    <dbl>     <dbl>      <dbl>      <dbl>        <dbl>   <dbl>    <dbl>
## 1  77146     1344.       439.       230.        0.851    313.    0.933
## 2 103409     2053.       488.       281.        0.818    363.    0.937
## 3  72189     1184.       463.       209.        0.893    303.    0.928
## 4  92362     1216.       464.       255.        0.835    343.    0.983
## 5  73212     1277.       411.       250.        0.793    305.    0.896
## 6  79982     1218.       469.       220.        0.882    319.    0.969
## # ... with 10 more variables: CONVEX_AREA <dbl>, EXTENT <dbl>,
## #   ASPECT_RATIO <dbl>, ROUNDNESS <dbl>, COMPACTNESS <dbl>,
## #   SHAPEFACTOR_1 <dbl>, SHAPEFACTOR_2 <dbl>, SHAPEFACTOR_3 <dbl>,
## #   SHAPEFACTOR_4 <dbl>, Class <fct>
# Preview the first six rows of the test set.
# NOTE(review): the output recorded below matches head(train_Pistachio) row for
# row, which suggests both sets were selected with the same indices -- verify.
head(x = test_Pistachio, n = 6L)
## # A tibble: 6 x 17
##     AREA PERIMETER MAJOR_AXIS MINOR_AXIS ECCENTRICITY EQDIASQ SOLIDITY
##    <dbl>     <dbl>      <dbl>      <dbl>        <dbl>   <dbl>    <dbl>
## 1  77146     1344.       439.       230.        0.851    313.    0.933
## 2 103409     2053.       488.       281.        0.818    363.    0.937
## 3  72189     1184.       463.       209.        0.893    303.    0.928
## 4  92362     1216.       464.       255.        0.835    343.    0.983
## 5  73212     1277.       411.       250.        0.793    305.    0.896
## 6  79982     1218.       469.       220.        0.882    319.    0.969
## # ... with 10 more variables: CONVEX_AREA <dbl>, EXTENT <dbl>,
## #   ASPECT_RATIO <dbl>, ROUNDNESS <dbl>, COMPACTNESS <dbl>,
## #   SHAPEFACTOR_1 <dbl>, SHAPEFACTOR_2 <dbl>, SHAPEFACTOR_3 <dbl>,
## #   SHAPEFACTOR_4 <dbl>, Class <fct>
library(RWeka)
# Fit a J48 (Weka C4.5) decision tree that predicts Class from all other
# columns of the training set.
model_Pistachio1 <- J48(formula = Class ~ ., data = train_Pistachio)

# Print evaluation statistics and the confusion matrix for the fitted tree.
summary(object = model_Pistachio1)
## 
## === Summary ===
## 
## Correctly Classified Instances        1566               91.1525 %
## Incorrectly Classified Instances       152                8.8475 %
## Kappa statistic                          0.8185
## Mean absolute error                      0.1411
## Root mean squared error                  0.2656
## Relative absolute error                 28.8708 %
## Root relative squared error             53.7323 %
## Total Number of Instances             1718     
## 
## === Confusion Matrix ===
## 
##    a   b   <-- classified as
##  919  70 |   a = Kirmizi
##   82 647 |   b = Siit

Menggunakan packages partykit untuk menggambarkan hasil klasifikasi

Install package partykit terlebih dahulu di console: install.packages("partykit")

# Visualization of the fitted J48 tree.
# The plot is only attempted when the partykit package can be loaded;
# otherwise this step is skipped.
partykit_ready <- require("partykit", quietly = TRUE)
if (partykit_ready) {
  plot(model_Pistachio1)
}

### Prediksi model

Selanjutnya model yang telah dibuat digunakan untuk memprediksi data baru

# Predict a class label for every row of test_Pistachio with the J48 model.
pred_Pistachio1 <- predict(
  object = model_Pistachio1,
  newdata = test_Pistachio
)

# Count how many rows were assigned to each class.
summary(object = pred_Pistachio1)
## Kirmizi    Siit 
##    1001     717
library(dplyr)
# Append the predicted labels as an extra column next to the true Class so the
# two can be compared row by row.
pred_result1 <- cbind(test_Pistachio, pred_Pistachio1 = pred_Pistachio1)

# Show the first six combined rows.
head(x = pred_result1, n = 6L)
##     AREA PERIMETER MAJOR_AXIS MINOR_AXIS ECCENTRICITY  EQDIASQ SOLIDITY
## 1  77146  1344.253   438.5641   230.0513       0.8514 313.4092   0.9329
## 2 103409  2052.611   488.2375   280.9924       0.8178 362.8559   0.9369
## 3  72189  1184.424   462.6273   208.5969       0.8926 303.1730   0.9282
## 4  92362  1215.607   463.6023   255.0006       0.8351 342.9270   0.9831
## 5  73212  1276.671   410.6384   250.3580       0.7926 305.3136   0.8961
## 6  79982  1218.160   468.5190   220.3987       0.8824 319.1179   0.9689
##   CONVEX_AREA EXTENT ASPECT_RATIO ROUNDNESS COMPACTNESS SHAPEFACTOR_1
## 1       82697 0.6922       1.9064    0.5365      0.7146        0.0057
## 2      110373 0.6557       1.7375    0.3084      0.7432        0.0047
## 3       77775 0.7361       2.2178    0.6466      0.6553        0.0064
## 4       93945 0.7419       1.8180    0.7854      0.7397        0.0050
## 5       81705 0.6990       1.6402    0.5645      0.7435        0.0056
## 6       82552 0.6927       2.1258    0.6773      0.6811        0.0059
##   SHAPEFACTOR_2 SHAPEFACTOR_3 SHAPEFACTOR_4   Class pred_Pistachio1
## 1        0.0030        0.5107        0.9736 Kirmizi         Kirmizi
## 2        0.0027        0.5523        0.9597    Siit            Siit
## 3        0.0029        0.4295        0.9524 Kirmizi         Kirmizi
## 4        0.0028        0.5472        0.9948    Siit            Siit
## 5        0.0034        0.5528        0.9067    Siit         Kirmizi
## 6        0.0028        0.4639        0.9862 Kirmizi         Kirmizi

Mengevaluasi model lebih lengkap

library(caret) # mengevaluasi model dengan confusion matrix
## Loading required package: ggplot2
## Loading required package: lattice
# Cross-tabulate the J48 predictions against the true labels and report the
# derived statistics (accuracy, kappa, sensitivity, specificity, ...).
confusionMatrix(
  data = pred_Pistachio1,
  reference = test_Pistachio[["Class"]]
)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction Kirmizi Siit
##    Kirmizi     919   82
##    Siit         70  647
##                                           
##                Accuracy : 0.9115          
##                  95% CI : (0.8971, 0.9245)
##     No Information Rate : 0.5757          
##     P-Value [Acc > NIR] : <2e-16          
##                                           
##                   Kappa : 0.8185          
##                                           
##  Mcnemar's Test P-Value : 0.3723          
##                                           
##             Sensitivity : 0.9292          
##             Specificity : 0.8875          
##          Pos Pred Value : 0.9181          
##          Neg Pred Value : 0.9024          
##              Prevalence : 0.5757          
##          Detection Rate : 0.5349          
##    Detection Prevalence : 0.5827          
##       Balanced Accuracy : 0.9084          
##                                           
##        'Positive' Class : Kirmizi         
## 

Membuat Decision Tree dengan fungsi ctree (Conditional Inference Trees) dari library partykit

library(caret) # model evaluation with a confusion matrix
library(partykit) # decision tree modelling

# Build the model: a conditional inference tree that predicts Class from all
# other columns of the training set.
model_Pistachio2 <- ctree(Class ~ ., data = train_Pistachio)

# Draw the tree using the compact "simple" layout.
plot(x = model_Pistachio2, type = "simple")

Evaluasi Model2

# Predict a class label for every row of test_Pistachio with the ctree model.
pred_Pistachio2 <- predict(
  object = model_Pistachio2,
  newdata = test_Pistachio
)

# Cross-tabulate the ctree predictions against the true labels.
confusionMatrix(
  data = pred_Pistachio2,
  reference = test_Pistachio[["Class"]]
)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction Kirmizi Siit
##    Kirmizi     924  160
##    Siit         65  569
##                                           
##                Accuracy : 0.869           
##                  95% CI : (0.8522, 0.8846)
##     No Information Rate : 0.5757          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.7273          
##                                           
##  Mcnemar's Test P-Value : 3.689e-10       
##                                           
##             Sensitivity : 0.9343          
##             Specificity : 0.7805          
##          Pos Pred Value : 0.8524          
##          Neg Pred Value : 0.8975          
##              Prevalence : 0.5757          
##          Detection Rate : 0.5378          
##    Detection Prevalence : 0.6310          
##       Balanced Accuracy : 0.8574          
##                                           
##        'Positive' Class : Kirmizi         
##