# Tahapan Penerapan Algoritma: (1) Input Data; (2) Eksplorasi Data; (3) Preprocessing Data; (4) Membagi Data; (5) Terapkan Algoritma; (6) Lakukan Evaluasi.
# Read the dataset from a local CSV file and take a first look at it.
# NOTE(review): this is an absolute, machine-specific path; the script only
# runs where this exact file exists.
cs <- "D:/Bahan Ajar Semester Genap 2022-2023/Modern Predic dan Machine Learning/dataR2.csv"
data <- read.csv(file = cs, sep = ",")
head(data) # first rows of the table
str(data) # column types and a preview of the values
## 'data.frame': 116 obs. of 10 variables:
## $ Age : int 48 83 82 68 86 49 89 76 73 75 ...
## $ BMI : num 23.5 20.7 23.1 21.4 21.1 ...
## $ Glucose : int 70 92 91 77 92 92 77 118 97 83 ...
## $ Insulin : num 2.71 3.12 4.5 3.23 3.55 ...
## $ HOMA : num 0.467 0.707 1.01 0.613 0.805 ...
## $ Leptin : num 8.81 8.84 17.94 9.88 6.7 ...
## $ Adiponectin : num 9.7 5.43 22.43 7.17 4.82 ...
## $ Resistin : num 8 4.06 9.28 12.77 10.58 ...
## $ MCP.1 : num 417 469 555 928 774 ...
## $ Classification: int 1 1 1 1 1 1 1 1 1 1 ...
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.2.3
library(dplyr)
## Warning: package 'dplyr' was built under R version 4.2.3
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Pie chart of the class distribution.
# Build one row per class with its share of the records (in percent) and the
# position along the stacked axis where the slice label is drawn (the middle
# of the slice).
plotdata <- data %>%
  count(Classification) %>%
  arrange(desc(Classification)) %>%
  mutate(
    prop = round(n * 100 / sum(n), 1),
    lab.ypos = cumsum(prop) - 0.5 * prop
  )
# Slice label: class value on the first line, rounded percentage on the second.
plotdata$label <- paste0(
  plotdata$Classification, "\n",
  round(plotdata$prop), "%"
)
# A single stacked bar drawn in polar coordinates gives the pie.
# The class label is categorical, so it is mapped as a factor to get one
# discrete fill per slice rather than a continuous colour gradient.
ggplot(
  plotdata,
  aes(x = "", y = prop, fill = factor(Classification))
) +
  geom_bar(width = 1, stat = "identity", color = "black") +
  geom_text(aes(y = lab.ypos, label = label), color = "black") +
  coord_polar("y", start = 0, direction = -1) +
  theme_void() +
  # "none" is the documented value for hiding the legend; the previous
  # string "FALSE" is not a valid legend position.
  theme(legend.position = "none") +
  labs(title = "Klasifikasi kanker")
# 1. Detect missing data
# Per-column summary statistics; summary() also reports an NA count for any
# column that contains missing values.
summary(data)
## Age BMI Glucose Insulin
## Min. :24.0 Min. :18.37 Min. : 60.00 Min. : 2.432
## 1st Qu.:45.0 1st Qu.:22.97 1st Qu.: 85.75 1st Qu.: 4.359
## Median :56.0 Median :27.66 Median : 92.00 Median : 5.925
## Mean :57.3 Mean :27.58 Mean : 97.79 Mean :10.012
## 3rd Qu.:71.0 3rd Qu.:31.24 3rd Qu.:102.00 3rd Qu.:11.189
## Max. :89.0 Max. :38.58 Max. :201.00 Max. :58.460
## HOMA Leptin Adiponectin Resistin
## Min. : 0.4674 Min. : 4.311 Min. : 1.656 Min. : 3.210
## 1st Qu.: 0.9180 1st Qu.:12.314 1st Qu.: 5.474 1st Qu.: 6.882
## Median : 1.3809 Median :20.271 Median : 8.353 Median :10.828
## Mean : 2.6950 Mean :26.615 Mean :10.181 Mean :14.726
## 3rd Qu.: 2.8578 3rd Qu.:37.378 3rd Qu.:11.816 3rd Qu.:17.755
## Max. :25.0503 Max. :90.280 Max. :38.040 Max. :82.100
## MCP.1 Classification
## Min. : 45.84 Min. :1.000
## 1st Qu.: 269.98 1st Qu.:1.000
## Median : 471.32 Median :2.000
## Mean : 534.65 Mean :1.517
## 3rd Qu.: 700.09 3rd Qu.:2.000
## Max. :1698.44 Max. :2.000
# Total number of missing cells across the whole data frame.
sum(is.na(data))
## [1] 0
# 2. Feature selection
# Pairwise Pearson correlation matrix over all columns.
# Original note: do not take attributes whose correlation is below 0.5.
# NOTE(review): in the recorded output no column reaches 0.5 against
# Classification (the largest is Glucose at about 0.33), and the feature list
# chosen afterwards does not follow this threshold - confirm the intended rule.
cor(data, method = "pearson")
## Age BMI Glucose Insulin HOMA
## Age 1.000000000 0.008529857 0.2301056 0.03249535 0.12703259
## BMI 0.008529857 1.000000000 0.1388452 0.14529526 0.11448013
## Glucose 0.230105617 0.138845189 1.0000000 0.50465307 0.69621182
## Insulin 0.032495353 0.145295260 0.5046531 1.00000000 0.93219777
## HOMA 0.127032593 0.114480131 0.6962118 0.93219777 1.00000000
## Leptin 0.102626049 0.569592606 0.3050799 0.30146162 0.32720986
## Adiponectin -0.219812891 -0.302734758 -0.1221213 -0.03129608 -0.05633712
## Resistin 0.002741708 0.195350206 0.2913275 0.14673099 0.23110123
## MCP.1 0.013461678 0.224038215 0.2648793 0.17435580 0.25952919
## Classification -0.071082789 -0.125107364 0.3310768 0.27511819 0.27519979
## Leptin Adiponectin Resistin MCP.1 Classification
## Age 0.10262605 -0.219812891 0.002741708 0.01346168 -0.071082789
## BMI 0.56959261 -0.302734758 0.195350206 0.22403821 -0.125107364
## Glucose 0.30507994 -0.122121312 0.291327462 0.26487927 0.331076762
## Insulin 0.30146162 -0.031296082 0.146730986 0.17435580 0.275118192
## HOMA 0.32720986 -0.056337123 0.231101229 0.25952919 0.275199786
## Leptin 1.00000000 -0.095388740 0.256233522 0.01400862 -0.085339334
## Adiponectin -0.09538874 1.000000000 -0.252363303 -0.20069450 -0.008080435
## Resistin 0.25623352 -0.252363303 1.000000000 0.36647421 0.259822770
## MCP.1 0.01400862 -0.200694496 0.366474210 1.00000000 0.144790706
## Classification -0.08533933 -0.008080435 0.259822770 0.14479071 1.000000000
# Result of cleaning and feature selection.
# Attributes kept: BMI, Glucose, Insulin, HOMA, Resistin, MCP.1, together with
# the Classification label.
dataml <- data[
  ,
  c("BMI", "Glucose", "Insulin", "HOMA", "Resistin", "MCP.1", "Classification"),
  drop = FALSE
]
head(dataml)
#Membagi Data
library(caTools)
## Warning: package 'caTools' was built under R version 4.2.3
# Fix the RNG seed so the split is reproducible.
set.seed(123)
# Row mask for an 80/20 train/test split: TRUE = training, FALSE = testing.
# sample.split() expects the vector of class labels, not the whole data frame.
# Given the data frame it returned one entry per column (7), and that short
# mask was then recycled over the 116 rows, which produced the 83/33 split
# seen in the recorded str() output instead of 80/20. Passing the label column
# yields one entry per row and keeps the class proportions in both parts.
# NOTE: recorded outputs further down were produced with the previous mask and
# will differ after this change.
dataml1 <- sample.split(dataml$Classification, SplitRatio = 0.8)
# 1. Training data: the rows for which the split mask is TRUE.
data_training <- subset(dataml, subset = dataml1)
head(data_training)
str(data_training)
## 'data.frame': 83 obs. of 7 variables:
## $ BMI : num 23.5 20.7 23.1 22.9 22.7 ...
## $ Glucose : int 70 92 91 92 77 118 97 83 82 88 ...
## $ Insulin : num 2.71 3.12 4.5 3.23 4.69 ...
## $ HOMA : num 0.467 0.707 1.01 0.732 0.891 ...
## $ Resistin : num 8 4.06 9.28 10.32 12.94 ...
## $ MCP.1 : num 417 469 555 530 1256 ...
## $ Classification: int 1 1 1 1 1 1 1 1 1 1 ...
# 2. Testing data: the rows for which the split mask is FALSE.
data_testing <- subset(dataml, subset = !dataml1)
str(data_testing)
## 'data.frame': 33 obs. of 7 variables:
## $ BMI : num 21.4 21.1 21.5 23 32 ...
## $ Glucose : int 77 92 78 82 85 95 90 83 101 87 ...
## $ Insulin : num 3.23 3.55 3.47 5.66 18.08 ...
## $ HOMA : num 0.613 0.805 0.667 1.145 3.79 ...
## $ Resistin : num 12.77 10.58 6.92 4.58 13.68 ...
## $ MCP.1 : num 928 774 355 175 444 ...
## $ Classification: int 1 1 1 1 1 1 1 1 1 1 ...
#Algoritma NaiveBayes
library(e1071)
## Warning: package 'e1071' was built under R version 4.2.3
# Fit a Naive Bayes classifier with every remaining column as a predictor.
# The label is stored as an integer; it is converted to a factor on a copy of
# the training set so the model is fitted on an explicit categorical response
# and class prediction does not depend on how the installed e1071 version
# coerces a numeric response. data_training itself is left unchanged.
model_naive <- naiveBayes(
  Classification ~ .,
  data = transform(data_training, Classification = factor(Classification))
)
# Predict the class of each test row and keep it next to the true label.
data_testing$predicted <- predict(model_naive, newdata = data_testing)
data_testing$actual <- data_testing$Classification
library(caret)
## Warning: package 'caret' was built under R version 4.2.3
## Loading required package: lattice
# Evaluate the predictions against the true labels.
# Both vectors are converted with the same explicit level set; calling
# factor() on each one separately can give different levels when a class is
# missing from the predictions or from the test labels.
# NOTE(review): the recorded output reports class 1 as the 'Positive' class;
# confirm that this is the class of interest, otherwise pass `positive =`.
class_levels <- sort(union(
  as.character(data_testing$predicted),
  as.character(data_testing$actual)
))
confusionMatrix(
  data = factor(data_testing$predicted, levels = class_levels),
  reference = factor(data_testing$actual, levels = class_levels)
)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 1 2
## 1 14 10
## 2 2 7
##
## Accuracy : 0.6364
## 95% CI : (0.4512, 0.796)
## No Information Rate : 0.5152
## P-Value [Acc > NIR] : 0.11097
##
## Kappa : 0.2826
##
## Mcnemar's Test P-Value : 0.04331
##
## Sensitivity : 0.8750
## Specificity : 0.4118
## Pos Pred Value : 0.5833
## Neg Pred Value : 0.7778
## Prevalence : 0.4848
## Detection Rate : 0.4242
## Detection Prevalence : 0.7273
## Balanced Accuracy : 0.6434
##
## 'Positive' Class : 1
##