# Tahapan Penerapan Algoritma: 1. Input Data, 2. Eksplorasi Data, 3. Preprocessing Data, 4. Membagi Data, 5. Terapkan Algoritma, 6. Lakukan Evaluasi
# Location of the dataset CSV (absolute path on the author's machine).
cs <- "D:/Bahan Ajar Semester Genap 2022-2023/Modern Predic dan Machine Learning/dataR2.csv"
# Read the comma-separated file into a data frame and preview its first rows.
data <- read.csv(file = cs, sep = ",")
head(data, n = 6L)
## Age BMI Glucose Insulin HOMA Leptin Adiponectin Resistin MCP.1
## 1 48 23.50000 70 2.707 0.4674087 8.8071 9.702400 7.99585 417.114
## 2 83 20.69049 92 3.115 0.7068973 8.8438 5.429285 4.06405 468.786
## 3 82 23.12467 91 4.498 1.0096511 17.9393 22.432040 9.27715 554.697
## 4 68 21.36752 77 3.226 0.6127249 9.8827 7.169560 12.76600 928.220
## 5 86 21.11111 92 3.549 0.8053864 6.6994 4.819240 10.57635 773.920
## 6 49 22.85446 92 3.226 0.7320869 6.8317 13.679750 10.31760 530.410
## Classification
## 1 1
## 2 1
## 3 1
## 4 1
## 5 1
## 6 1
# Show the structure of the data frame: column names, types and leading values.
str(data)
## 'data.frame': 116 obs. of 10 variables:
## $ Age : int 48 83 82 68 86 49 89 76 73 75 ...
## $ BMI : num 23.5 20.7 23.1 21.4 21.1 ...
## $ Glucose : int 70 92 91 77 92 92 77 118 97 83 ...
## $ Insulin : num 2.71 3.12 4.5 3.23 3.55 ...
## $ HOMA : num 0.467 0.707 1.01 0.613 0.805 ...
## $ Leptin : num 8.81 8.84 17.94 9.88 6.7 ...
## $ Adiponectin : num 9.7 5.43 22.43 7.17 4.82 ...
## $ Resistin : num 8 4.06 9.28 12.77 10.58 ...
## $ MCP.1 : num 417 469 555 928 774 ...
## $ Classification: int 1 1 1 1 1 1 1 1 1 1 ...
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.2.3
library(dplyr)
## Warning: package 'dplyr' was built under R version 4.2.3
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Pie chart of the class distribution.
# Count the rows per class, convert the counts to percentages and compute the
# mid-point of every slice so the text label can be centred on it.
plotdata <- data %>%
  count(Classification) %>%
  arrange(desc(Classification)) %>%
  mutate(
    prop = round(n * 100 / sum(n), 1),
    lab.ypos = cumsum(prop) - 0.5 * prop
  )
plotdata$label <- paste0(plotdata$Classification, "\n", round(plotdata$prop), "%")

# Classification is stored as an integer code; map it to `fill` as a factor so
# each class is a separate group with its own discrete colour.
ggplot(plotdata, aes(x = "", y = prop, fill = factor(Classification))) +
  geom_bar(width = 1, stat = "identity", color = "black") +
  geom_text(aes(y = lab.ypos, label = label), color = "black") +
  coord_polar("y", start = 0, direction = -1) +
  theme_void() +
  # "none" is the documented value for hiding the legend.
  theme(legend.position = "none") +
  labs(title = "Klasifikasi kanker")
# 1. Missing-data detection
# Print per-column summary statistics as a first look at the value ranges.
summary(data)
## Age BMI Glucose Insulin
## Min. :24.0 Min. :18.37 Min. : 60.00 Min. : 2.432
## 1st Qu.:45.0 1st Qu.:22.97 1st Qu.: 85.75 1st Qu.: 4.359
## Median :56.0 Median :27.66 Median : 92.00 Median : 5.925
## Mean :57.3 Mean :27.58 Mean : 97.79 Mean :10.012
## 3rd Qu.:71.0 3rd Qu.:31.24 3rd Qu.:102.00 3rd Qu.:11.189
## Max. :89.0 Max. :38.58 Max. :201.00 Max. :58.460
## HOMA Leptin Adiponectin Resistin
## Min. : 0.4674 Min. : 4.311 Min. : 1.656 Min. : 3.210
## 1st Qu.: 0.9180 1st Qu.:12.314 1st Qu.: 5.474 1st Qu.: 6.882
## Median : 1.3809 Median :20.271 Median : 8.353 Median :10.828
## Mean : 2.6950 Mean :26.615 Mean :10.181 Mean :14.726
## 3rd Qu.: 2.8578 3rd Qu.:37.378 3rd Qu.:11.816 3rd Qu.:17.755
## Max. :25.0503 Max. :90.280 Max. :38.040 Max. :82.100
## MCP.1 Classification
## Min. : 45.84 Min. :1.000
## 1st Qu.: 269.98 1st Qu.:1.000
## Median : 471.32 Median :2.000
## Mean : 534.65 Mean :1.517
## 3rd Qu.: 700.09 3rd Qu.:2.000
## Max. :1698.44 Max. :2.000
# Total number of missing cells across the whole data frame.
sum(is.na(data))
## [1] 0
# 2. Feature selection
# Pairwise correlation matrix of all columns, including Classification.
# NOTE(review): in the recorded output no attribute reaches 0.5 against
# Classification, yet six attributes are selected afterwards - confirm the
# intended selection criterion.
cor(data) # Note: do not take attributes that are below 0.5
## Age BMI Glucose Insulin HOMA
## Age 1.000000000 0.008529857 0.2301056 0.03249535 0.12703259
## BMI 0.008529857 1.000000000 0.1388452 0.14529526 0.11448013
## Glucose 0.230105617 0.138845189 1.0000000 0.50465307 0.69621182
## Insulin 0.032495353 0.145295260 0.5046531 1.00000000 0.93219777
## HOMA 0.127032593 0.114480131 0.6962118 0.93219777 1.00000000
## Leptin 0.102626049 0.569592606 0.3050799 0.30146162 0.32720986
## Adiponectin -0.219812891 -0.302734758 -0.1221213 -0.03129608 -0.05633712
## Resistin 0.002741708 0.195350206 0.2913275 0.14673099 0.23110123
## MCP.1 0.013461678 0.224038215 0.2648793 0.17435580 0.25952919
## Classification -0.071082789 -0.125107364 0.3310768 0.27511819 0.27519979
## Leptin Adiponectin Resistin MCP.1 Classification
## Age 0.10262605 -0.219812891 0.002741708 0.01346168 -0.071082789
## BMI 0.56959261 -0.302734758 0.195350206 0.22403821 -0.125107364
## Glucose 0.30507994 -0.122121312 0.291327462 0.26487927 0.331076762
## Insulin 0.30146162 -0.031296082 0.146730986 0.17435580 0.275118192
## HOMA 0.32720986 -0.056337123 0.231101229 0.25952919 0.275199786
## Leptin 1.00000000 -0.095388740 0.256233522 0.01400862 -0.085339334
## Adiponectin -0.09538874 1.000000000 -0.252363303 -0.20069450 -0.008080435
## Resistin 0.25623352 -0.252363303 1.000000000 0.36647421 0.259822770
## MCP.1 0.01400862 -0.200694496 0.366474210 1.00000000 0.144790706
## Classification -0.08533933 -0.008080435 0.259822770 0.14479071 1.000000000
# Result of cleaning and feature selection:
# keep the predictors BMI, Glucose, Insulin, HOMA, Resistin and MCP.1 together
# with the Classification target.
dataml <- data[
  ,
  c("BMI", "Glucose", "Insulin", "HOMA", "Resistin", "MCP.1", "Classification")
]
head(dataml, n = 6L)
## BMI Glucose Insulin HOMA Resistin MCP.1 Classification
## 1 23.50000 70 2.707 0.4674087 7.99585 417.114 1
## 2 20.69049 92 3.115 0.7068973 4.06405 468.786 1
## 3 23.12467 91 4.498 1.0096511 9.27715 554.697 1
## 4 21.36752 77 3.226 0.6127249 12.76600 928.220 1
## 5 21.11111 92 3.549 0.8053864 10.57635 773.920 1
## 6 22.85446 92 3.226 0.7320869 10.31760 530.410 1
#Membagi Data
library(caTools)
## Warning: package 'caTools' was built under R version 4.2.3
# Fix the RNG seed so the random split is reproducible.
set.seed(123)
# sample.split() expects the vector of class labels, not the data frame.
# Given the data frame it returns one flag per COLUMN (7 values); that short
# mask is then recycled over the rows and selects a fixed periodic pattern
# instead of a random 80/20 split that preserves the class ratio.
dataml1 <- sample.split(dataml$Classification, SplitRatio = 0.8)
# NOTE: the outputs recorded further down were produced with the previous
# split; re-run the script to refresh them.
# 1. Training data: rows flagged TRUE by the split
data_training <- subset(dataml, dataml1 == TRUE)
head(data_training)
## BMI Glucose Insulin HOMA Resistin MCP.1 Classification
## 1 23.50000 70 2.707 0.4674087 7.99585 417.114 1
## 2 20.69049 92 3.115 0.7068973 4.06405 468.786 1
## 3 23.12467 91 4.498 1.0096511 9.27715 554.697 1
## 6 22.85446 92 3.226 0.7320869 10.31760 530.410 1
## 7 22.70000 77 4.690 0.8907873 12.93610 1256.083 1
## 8 23.80000 118 6.470 1.8832013 5.10420 280.694 1
# Number of rows and columns in the training set.
dim(data_training)
## [1] 83 7
# 2. Testing data: rows flagged FALSE by the split
data_testing <- subset(dataml, dataml1 == FALSE)
# Preview the test set itself rather than the training set.
# NOTE: the output recorded below still shows training rows; re-run to refresh.
head(data_testing)
## BMI Glucose Insulin HOMA Resistin MCP.1 Classification
## 1 23.50000 70 2.707 0.4674087 7.99585 417.114 1
## 2 20.69049 92 3.115 0.7068973 4.06405 468.786 1
## 3 23.12467 91 4.498 1.0096511 9.27715 554.697 1
## 6 22.85446 92 3.226 0.7320869 10.31760 530.410 1
## 7 22.70000 77 4.690 0.8907873 12.93610 1256.083 1
## 8 23.80000 118 6.470 1.8832013 5.10420 280.694 1
# Number of rows and columns in the test set.
dim(data_testing)
## [1] 33 7
#1. Metode CART
library(rpart)
## Warning: package 'rpart' was built under R version 4.2.3
library(rpart.plot)
## Warning: package 'rpart.plot' was built under R version 4.2.3
# Fit a classification tree (CART) on the training set, using every other
# column as a predictor, and draw the fitted tree.
pohon <- rpart(
  formula = Classification ~ .,
  data = data_training,
  method = "class"
)
rpart.plot(pohon)
# 1.1 CART prediction on the test set, cross-tabulated against the true labels
pred <- predict(object = pohon, newdata = data_testing, type = "class")
table(pred, data_testing$Classification)
##
## pred 1 2
## 1 9 6
## 2 7 11
library(caret)
## Warning: package 'caret' was built under R version 4.2.3
## Loading required package: lattice
# Confusion matrix and derived metrics for the CART predictions
# (rows = predicted class, columns = true class).
# NOTE(review): the recorded output reports class 1 as the 'Positive' class -
# confirm that this is the outcome of interest for sensitivity/specificity.
table(data = pred, reference = data_testing$Classification) %>%
  confusionMatrix()
## Confusion Matrix and Statistics
##
## reference
## data 1 2
## 1 9 6
## 2 7 11
##
## Accuracy : 0.6061
## 95% CI : (0.4214, 0.7709)
## No Information Rate : 0.5152
## P-Value [Acc > NIR] : 0.1922
##
## Kappa : 0.2099
##
## Mcnemar's Test P-Value : 1.0000
##
## Sensitivity : 0.5625
## Specificity : 0.6471
## Pos Pred Value : 0.6000
## Neg Pred Value : 0.6111
## Prevalence : 0.4848
## Detection Rate : 0.2727
## Detection Prevalence : 0.4545
## Balanced Accuracy : 0.6048
##
## 'Positive' Class : 1
##
# 2. ID3-style tree (package: rpart)
# rpart splits on the Gini index by default, so without `parms` this model is
# the CART tree again. ID3 chooses splits by information gain, hence
# split = "information". cp = 0 removes the complexity threshold, so splits are
# kept regardless of how little they improve the fit.
# NOTE: rpart still grows binary trees, so this approximates ID3 rather than
# reproducing it exactly. The outputs recorded below predate this change.
pohon1 <- rpart(
  Classification ~ .,
  data = data_training,
  method = "class",
  parms = list(split = "information"),
  control = rpart.control(cp = 0)
)
rpart.plot(pohon1)
# 2.1 ID3 prediction on the test set, cross-tabulated against the true labels
pred1 <- predict(pohon1, newdata = data_testing, type = "class")
table(pred1, data_testing$Classification)
##
## pred1 1 2
## 1 9 6
## 2 7 11
library(caret)
# Confusion matrix and derived metrics for the second tree's predictions
# (rows = predicted class, columns = true class).
table(data = pred1, reference = data_testing$Classification) %>%
  confusionMatrix()
## Confusion Matrix and Statistics
##
## reference
## data 1 2
## 1 9 6
## 2 7 11
##
## Accuracy : 0.6061
## 95% CI : (0.4214, 0.7709)
## No Information Rate : 0.5152
## P-Value [Acc > NIR] : 0.1922
##
## Kappa : 0.2099
##
## Mcnemar's Test P-Value : 1.0000
##
## Sensitivity : 0.5625
## Specificity : 0.6471
## Pos Pred Value : 0.6000
## Neg Pred Value : 0.6111
## Prevalence : 0.4848
## Detection Rate : 0.2727
## Detection Prevalence : 0.4545
## Balanced Accuracy : 0.6048
##
## 'Positive' Class : 1
##
#3. Metode C4.5 (package: C50)
library(C50)
## Warning: package 'C50' was built under R version 4.2.3
# Convert the integer class codes to a factor before fitting C5.0.
data_training$Classification <- factor(data_training$Classification)
# Fit the C5.0 tree with all remaining columns as predictors, then print the
# fitted tree together with its evaluation on the training data.
pohon2 <- C5.0(formula = Classification ~ ., data = data_training)
summary(pohon2)
##
## Call:
## C5.0.formula(formula = Classification ~ ., data = data_training)
##
##
## C5.0 [Release 2.07 GPL Edition] Sat May 13 10:45:53 2023
## -------------------------------
##
## Class specified by attribute `outcome'
##
## Read 83 cases (7 attributes) from undefined.data
##
## Decision tree:
##
## Glucose > 91: 2 (48/14)
## Glucose <= 91:
## :...Resistin <= 12.9361: 1 (21/1)
## Resistin > 12.9361:
## :...BMI <= 29.66655: 2 (7)
## BMI > 29.66655: 1 (7/1)
##
##
## Evaluation on training data (83 cases):
##
## Decision Tree
## ----------------
## Size Errors
##
## 4 16(19.3%) <<
##
##
## (a) (b) <-classified as
## ---- ----
## 26 14 (a): class 1
## 2 41 (b): class 2
##
##
## Attribute usage:
##
## 100.00% Glucose
## 42.17% Resistin
## 16.87% BMI
##
##
## Time: 0.0 secs
# Draw the fitted C5.0 tree.
plot(pohon2)
# 3.1 C5.0 prediction on the test set, cross-tabulated against the true labels
pred2 <- predict(object = pohon2, newdata = data_testing, type = "class")
table(pred2, data_testing$Classification)
##
## pred2 1 2
## 1 9 3
## 2 7 14
library(caret)
# Confusion matrix and derived metrics for the C5.0 predictions
# (rows = predicted class, columns = true class).
table(data = pred2, reference = data_testing$Classification) %>%
  confusionMatrix()
## Confusion Matrix and Statistics
##
## reference
## data 1 2
## 1 9 3
## 2 7 14
##
## Accuracy : 0.697
## 95% CI : (0.5129, 0.8441)
## No Information Rate : 0.5152
## P-Value [Acc > NIR] : 0.02655
##
## Kappa : 0.3889
##
## Mcnemar's Test P-Value : 0.34278
##
## Sensitivity : 0.5625
## Specificity : 0.8235
## Pos Pred Value : 0.7500
## Neg Pred Value : 0.6667
## Prevalence : 0.4848
## Detection Rate : 0.2727
## Detection Prevalence : 0.3636
## Balanced Accuracy : 0.6930
##
## 'Positive' Class : 1
##