library(readxl)
Pistachio <- read_excel("C:/BIBIB/MTI Data Mining/Pistachio.xlsx")
head(Pistachio)
## # A tibble: 6 x 17
## AREA PERIMETER MAJOR_AXIS MINOR_AXIS ECCENTRICITY EQDIASQ SOLIDITY
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 63391 1568. 390. 237. 0.795 284. 0.866
## 2 68358 1942. 411. 235. 0.821 295. 0.876
## 3 73589 1247. 452. 221. 0.873 306. 0.917
## 4 71106 1445. 430. 216. 0.864 301. 0.959
## 5 80087 1252. 469. 221. 0.882 319. 0.966
## 6 52268 1154. 384. 198. 0.858 258. 0.856
## # ... with 10 more variables: CONVEX_AREA <dbl>, EXTENT <dbl>,
## # ASPECT_RATIO <dbl>, ROUNDNESS <dbl>, COMPACTNESS <dbl>,
## # SHAPEFACTOR_1 <dbl>, SHAPEFACTOR_2 <dbl>, SHAPEFACTOR_3 <dbl>,
## # SHAPEFACTOR_4 <dbl>, Class <chr>
Mengubah class menjadi bentuk faktor agar klasifikasi pistachio terbaca
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
Pistachio_Fix<-Pistachio%>%mutate_if(is.character, as.factor)
head(Pistachio_Fix)
## # A tibble: 6 x 17
## AREA PERIMETER MAJOR_AXIS MINOR_AXIS ECCENTRICITY EQDIASQ SOLIDITY
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 63391 1568. 390. 237. 0.795 284. 0.866
## 2 68358 1942. 411. 235. 0.821 295. 0.876
## 3 73589 1247. 452. 221. 0.873 306. 0.917
## 4 71106 1445. 430. 216. 0.864 301. 0.959
## 5 80087 1252. 469. 221. 0.882 319. 0.966
## 6 52268 1154. 384. 198. 0.858 258. 0.856
## # ... with 10 more variables: CONVEX_AREA <dbl>, EXTENT <dbl>,
## # ASPECT_RATIO <dbl>, ROUNDNESS <dbl>, COMPACTNESS <dbl>,
## # SHAPEFACTOR_1 <dbl>, SHAPEFACTOR_2 <dbl>, SHAPEFACTOR_3 <dbl>,
## # SHAPEFACTOR_4 <dbl>, Class <fct>
summary(Pistachio_Fix)
## AREA PERIMETER MAJOR_AXIS MINOR_AXIS
## Min. : 29808 Min. : 858.4 Min. :320.3 Min. :133.5
## 1st Qu.: 71937 1st Qu.:1171.0 1st Qu.:426.5 1st Qu.:217.9
## Median : 79906 Median :1262.8 Median :448.6 Median :236.4
## Mean : 79951 Mean :1426.0 Mean :446.2 Mean :238.3
## 3rd Qu.: 89031 3rd Qu.:1607.9 3rd Qu.:468.5 3rd Qu.:257.8
## Max. :124008 Max. :2755.0 Max. :542.0 Max. :383.0
## ECCENTRICITY EQDIASQ SOLIDITY CONVEX_AREA
## Min. :0.5049 Min. :194.8 Min. :0.5880 Min. : 37935
## 1st Qu.:0.8175 1st Qu.:302.6 1st Qu.:0.9198 1st Qu.: 76467
## Median :0.8497 Median :319.0 Median :0.9542 Median : 85076
## Mean :0.8402 Mean :317.9 Mean :0.9401 Mean : 85016
## 3rd Qu.:0.8752 3rd Qu.:336.7 3rd Qu.:0.9769 3rd Qu.: 93894
## Max. :0.9460 Max. :397.4 Max. :0.9951 Max. :132478
## EXTENT ASPECT_RATIO ROUNDNESS COMPACTNESS
## Min. :0.4272 Min. :1.159 Min. :0.0628 Min. :0.4760
## 1st Qu.:0.6870 1st Qu.:1.736 1st Qu.:0.3713 1st Qu.:0.6815
## Median :0.7265 Median :1.896 Median :0.6434 Median :0.7107
## Mean :0.7161 Mean :1.898 Mean :0.5692 Mean :0.7131
## 3rd Qu.:0.7536 3rd Qu.:2.067 3rd Qu.:0.7441 3rd Qu.:0.7417
## Max. :0.8204 Max. :3.086 Max. :0.9336 Max. :0.8779
## SHAPEFACTOR_1 SHAPEFACTOR_2 SHAPEFACTOR_3 SHAPEFACTOR_4
## Min. :0.004000 Min. :0.002400 Min. :0.2266 Min. :0.6204
## 1st Qu.:0.005200 1st Qu.:0.002800 1st Qu.:0.4645 1st Qu.:0.9440
## Median :0.005600 Median :0.003000 Median :0.5051 Median :0.9731
## Mean :0.005701 Mean :0.003017 Mean :0.5105 Mean :0.9552
## 3rd Qu.:0.006100 3rd Qu.:0.003200 3rd Qu.:0.5501 3rd Qu.:0.9873
## Max. :0.013100 Max. :0.005300 Max. :0.7706 Max. :0.9990
## Class
## Kirmizi:1232
## Siit : 916
##
##
##
##
Kita akan melakukan cross validation (membagi data menjadi bagian-bagian tertentu). Pada kesempatan kali ini, kita membagi dtaa menjadi 2, yakni data train dan test dengan proporsi data train adalah 80% dan data test 20%.
set.seed(100)
#Set the seed of R‘s random number generator, which is useful for creating simulations or random objects that can be reproduced
RNGkind(sample.kind = "Rejection") #sample.kind can be "Rounding" or "Rejection", or partial matches to these. The former was the default in versions prior to 3.6.0: it made sample noticeably non-uniform on large populations, and should only be used for reproduction of old results.
idx_Pistachio<-sample(nrow(Pistachio_Fix),nrow(Pistachio_Fix)*0.8) #membagi 80% dan 20%
train_Pistachio<-Pistachio_Fix[idx_Pistachio,]
test_Pistachio<-Pistachio_Fix[idx_Pistachio,]
head(train_Pistachio)
## # A tibble: 6 x 17
## AREA PERIMETER MAJOR_AXIS MINOR_AXIS ECCENTRICITY EQDIASQ SOLIDITY
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 77146 1344. 439. 230. 0.851 313. 0.933
## 2 103409 2053. 488. 281. 0.818 363. 0.937
## 3 72189 1184. 463. 209. 0.893 303. 0.928
## 4 92362 1216. 464. 255. 0.835 343. 0.983
## 5 73212 1277. 411. 250. 0.793 305. 0.896
## 6 79982 1218. 469. 220. 0.882 319. 0.969
## # ... with 10 more variables: CONVEX_AREA <dbl>, EXTENT <dbl>,
## # ASPECT_RATIO <dbl>, ROUNDNESS <dbl>, COMPACTNESS <dbl>,
## # SHAPEFACTOR_1 <dbl>, SHAPEFACTOR_2 <dbl>, SHAPEFACTOR_3 <dbl>,
## # SHAPEFACTOR_4 <dbl>, Class <fct>
head(test_Pistachio)
## # A tibble: 6 x 17
## AREA PERIMETER MAJOR_AXIS MINOR_AXIS ECCENTRICITY EQDIASQ SOLIDITY
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 77146 1344. 439. 230. 0.851 313. 0.933
## 2 103409 2053. 488. 281. 0.818 363. 0.937
## 3 72189 1184. 463. 209. 0.893 303. 0.928
## 4 92362 1216. 464. 255. 0.835 343. 0.983
## 5 73212 1277. 411. 250. 0.793 305. 0.896
## 6 79982 1218. 469. 220. 0.882 319. 0.969
## # ... with 10 more variables: CONVEX_AREA <dbl>, EXTENT <dbl>,
## # ASPECT_RATIO <dbl>, ROUNDNESS <dbl>, COMPACTNESS <dbl>,
## # SHAPEFACTOR_1 <dbl>, SHAPEFACTOR_2 <dbl>, SHAPEFACTOR_3 <dbl>,
## # SHAPEFACTOR_4 <dbl>, Class <fct>
library(RWeka)
# fit model
model_Pistachio1<-J48(Class~.,data=train_Pistachio)
# summarize the fit
summary(model_Pistachio1)
##
## === Summary ===
##
## Correctly Classified Instances 1566 91.1525 %
## Incorrectly Classified Instances 152 8.8475 %
## Kappa statistic 0.8185
## Mean absolute error 0.1411
## Root mean squared error 0.2656
## Relative absolute error 28.8708 %
## Root relative squared error 53.7323 %
## Total Number of Instances 1718
##
## === Confusion Matrix ===
##
## a b <-- classified as
## 919 70 | a = Kirmizi
## 82 647 | b = Siit
install packages partykit di console
## visualization
## use partykit package
if(require("partykit", quietly = TRUE))
plot(model_Pistachio1)
### Prediksi model
Selanjutnya model yang telah dibuat digunakan untuk memprediksi data baru
pred_Pistachio1 <- predict(model_Pistachio1, test_Pistachio)
summary(pred_Pistachio1)
## Kirmizi Siit
## 1001 717
library(dplyr)
pred_result1 <- cbind(test_Pistachio, pred_Pistachio1)
head(pred_result1)
## AREA PERIMETER MAJOR_AXIS MINOR_AXIS ECCENTRICITY EQDIASQ SOLIDITY
## 1 77146 1344.253 438.5641 230.0513 0.8514 313.4092 0.9329
## 2 103409 2052.611 488.2375 280.9924 0.8178 362.8559 0.9369
## 3 72189 1184.424 462.6273 208.5969 0.8926 303.1730 0.9282
## 4 92362 1215.607 463.6023 255.0006 0.8351 342.9270 0.9831
## 5 73212 1276.671 410.6384 250.3580 0.7926 305.3136 0.8961
## 6 79982 1218.160 468.5190 220.3987 0.8824 319.1179 0.9689
## CONVEX_AREA EXTENT ASPECT_RATIO ROUNDNESS COMPACTNESS SHAPEFACTOR_1
## 1 82697 0.6922 1.9064 0.5365 0.7146 0.0057
## 2 110373 0.6557 1.7375 0.3084 0.7432 0.0047
## 3 77775 0.7361 2.2178 0.6466 0.6553 0.0064
## 4 93945 0.7419 1.8180 0.7854 0.7397 0.0050
## 5 81705 0.6990 1.6402 0.5645 0.7435 0.0056
## 6 82552 0.6927 2.1258 0.6773 0.6811 0.0059
## SHAPEFACTOR_2 SHAPEFACTOR_3 SHAPEFACTOR_4 Class pred_Pistachio1
## 1 0.0030 0.5107 0.9736 Kirmizi Kirmizi
## 2 0.0027 0.5523 0.9597 Siit Siit
## 3 0.0029 0.4295 0.9524 Kirmizi Kirmizi
## 4 0.0028 0.5472 0.9948 Siit Siit
## 5 0.0034 0.5528 0.9067 Siit Kirmizi
## 6 0.0028 0.4639 0.9862 Kirmizi Kirmizi
library(caret) # mengevaluasi model dengan confusion matrix
## Loading required package: ggplot2
## Loading required package: lattice
confusionMatrix(pred_Pistachio1, test_Pistachio$Class)
## Confusion Matrix and Statistics
##
## Reference
## Prediction Kirmizi Siit
## Kirmizi 919 82
## Siit 70 647
##
## Accuracy : 0.9115
## 95% CI : (0.8971, 0.9245)
## No Information Rate : 0.5757
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.8185
##
## Mcnemar's Test P-Value : 0.3723
##
## Sensitivity : 0.9292
## Specificity : 0.8875
## Pos Pred Value : 0.9181
## Neg Pred Value : 0.9024
## Prevalence : 0.5757
## Detection Rate : 0.5349
## Detection Prevalence : 0.5827
## Balanced Accuracy : 0.9084
##
## 'Positive' Class : Kirmizi
##
library(caret) # mengevaluasi model dengan confusion matrix
library(partykit) # pemodelan decision tree
#PEMBUATAN MODEL
model_Pistachio2<- ctree(formula = Class~.,data= train_Pistachio)
plot(model_Pistachio2, type = "simple")
Evaluasi Model2
pred_Pistachio2 <- predict(model_Pistachio2, test_Pistachio)
confusionMatrix(pred_Pistachio2, test_Pistachio$Class)
## Confusion Matrix and Statistics
##
## Reference
## Prediction Kirmizi Siit
## Kirmizi 924 160
## Siit 65 569
##
## Accuracy : 0.869
## 95% CI : (0.8522, 0.8846)
## No Information Rate : 0.5757
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.7273
##
## Mcnemar's Test P-Value : 3.689e-10
##
## Sensitivity : 0.9343
## Specificity : 0.7805
## Pos Pred Value : 0.8524
## Neg Pred Value : 0.8975
## Prevalence : 0.5757
## Detection Rate : 0.5378
## Detection Prevalence : 0.6310
## Balanced Accuracy : 0.8574
##
## 'Positive' Class : Kirmizi
##