Objectives

Analisis ini bertujuan untuk mengamati fitur mana yang paling membantu dalam memprediksi apakah kanker bersifat ganas atau jinak, serta melihat tren umum yang dapat membantu kita dalam pemilihan model dan hiperparameter. Tujuan akhirnya adalah mengklasifikasikan apakah kanker payudara tersebut jinak atau ganas. Untuk mencapainya, saya menggunakan metode Naive Bayes.

library

library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.4.0      ✔ purrr   0.3.5 
## ✔ tibble  3.1.8      ✔ dplyr   1.0.10
## ✔ tidyr   1.2.1      ✔ stringr 1.4.1 
## ✔ readr   2.1.3      ✔ forcats 0.5.2
## Warning: package 'ggplot2' was built under R version 4.2.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
library(caret)
## Warning: package 'caret' was built under R version 4.2.2
## Loading required package: lattice
## 
## Attaching package: 'caret'
## 
## The following object is masked from 'package:purrr':
## 
##     lift
library(e1071)
## Warning: package 'e1071' was built under R version 4.2.2
library(visdat)
## Warning: package 'visdat' was built under R version 4.2.2
library(cowplot)
## Warning: package 'cowplot' was built under R version 4.2.2
library(corrplot)
## corrplot 0.92 loaded

load data

# Load the breast-cancer table from the project-relative CSV file and
# preview its first rows.
df <- read.csv("data_input/data.csv")
head(df)
##         id diagnosis radius_mean texture_mean perimeter_mean area_mean
## 1   842302         M       17.99        10.38         122.80    1001.0
## 2   842517         M       20.57        17.77         132.90    1326.0
## 3 84300903         M       19.69        21.25         130.00    1203.0
## 4 84348301         M       11.42        20.38          77.58     386.1
## 5 84358402         M       20.29        14.34         135.10    1297.0
## 6   843786         M       12.45        15.70          82.57     477.1
##   smoothness_mean compactness_mean concavity_mean concave.points_mean
## 1         0.11840          0.27760         0.3001             0.14710
## 2         0.08474          0.07864         0.0869             0.07017
## 3         0.10960          0.15990         0.1974             0.12790
## 4         0.14250          0.28390         0.2414             0.10520
## 5         0.10030          0.13280         0.1980             0.10430
## 6         0.12780          0.17000         0.1578             0.08089
##   symmetry_mean fractal_dimension_mean radius_se texture_se perimeter_se
## 1        0.2419                0.07871    1.0950     0.9053        8.589
## 2        0.1812                0.05667    0.5435     0.7339        3.398
## 3        0.2069                0.05999    0.7456     0.7869        4.585
## 4        0.2597                0.09744    0.4956     1.1560        3.445
## 5        0.1809                0.05883    0.7572     0.7813        5.438
## 6        0.2087                0.07613    0.3345     0.8902        2.217
##   area_se smoothness_se compactness_se concavity_se concave.points_se
## 1  153.40      0.006399        0.04904      0.05373           0.01587
## 2   74.08      0.005225        0.01308      0.01860           0.01340
## 3   94.03      0.006150        0.04006      0.03832           0.02058
## 4   27.23      0.009110        0.07458      0.05661           0.01867
## 5   94.44      0.011490        0.02461      0.05688           0.01885
## 6   27.19      0.007510        0.03345      0.03672           0.01137
##   symmetry_se fractal_dimension_se radius_worst texture_worst perimeter_worst
## 1     0.03003             0.006193        25.38         17.33          184.60
## 2     0.01389             0.003532        24.99         23.41          158.80
## 3     0.02250             0.004571        23.57         25.53          152.50
## 4     0.05963             0.009208        14.91         26.50           98.87
## 5     0.01756             0.005115        22.54         16.67          152.20
## 6     0.02165             0.005082        15.47         23.75          103.40
##   area_worst smoothness_worst compactness_worst concavity_worst
## 1     2019.0           0.1622            0.6656          0.7119
## 2     1956.0           0.1238            0.1866          0.2416
## 3     1709.0           0.1444            0.4245          0.4504
## 4      567.7           0.2098            0.8663          0.6869
## 5     1575.0           0.1374            0.2050          0.4000
## 6      741.6           0.1791            0.5249          0.5355
##   concave.points_worst symmetry_worst fractal_dimension_worst  X
## 1               0.2654         0.4601                 0.11890 NA
## 2               0.1860         0.2750                 0.08902 NA
## 3               0.2430         0.3613                 0.08758 NA
## 4               0.2575         0.6638                 0.17300 NA
## 5               0.1625         0.2364                 0.07678 NA
## 6               0.1741         0.3985                 0.12440 NA

Data Preparation

Check Missing Value

# Count the missing values in every column.
df %>%
  is.na() %>%
  colSums()
##                      id               diagnosis             radius_mean 
##                       0                       0                       0 
##            texture_mean          perimeter_mean               area_mean 
##                       0                       0                       0 
##         smoothness_mean        compactness_mean          concavity_mean 
##                       0                       0                       0 
##     concave.points_mean           symmetry_mean  fractal_dimension_mean 
##                       0                       0                       0 
##               radius_se              texture_se            perimeter_se 
##                       0                       0                       0 
##                 area_se           smoothness_se          compactness_se 
##                       0                       0                       0 
##            concavity_se       concave.points_se             symmetry_se 
##                       0                       0                       0 
##    fractal_dimension_se            radius_worst           texture_worst 
##                       0                       0                       0 
##         perimeter_worst              area_worst        smoothness_worst 
##                       0                       0                       0 
##       compactness_worst         concavity_worst    concave.points_worst 
##                       0                       0                       0 
##          symmetry_worst fractal_dimension_worst                       X 
##                       0                       0                     569
# Compact overview: one line per column with its type and first values.
df %>% glimpse()
## Rows: 569
## Columns: 33
## $ id                      <int> 842302, 842517, 84300903, 84348301, 84358402, …
## $ diagnosis               <chr> "M", "M", "M", "M", "M", "M", "M", "M", "M", "…
## $ radius_mean             <dbl> 17.990, 20.570, 19.690, 11.420, 20.290, 12.450…
## $ texture_mean            <dbl> 10.38, 17.77, 21.25, 20.38, 14.34, 15.70, 19.9…
## $ perimeter_mean          <dbl> 122.80, 132.90, 130.00, 77.58, 135.10, 82.57, …
## $ area_mean               <dbl> 1001.0, 1326.0, 1203.0, 386.1, 1297.0, 477.1, …
## $ smoothness_mean         <dbl> 0.11840, 0.08474, 0.10960, 0.14250, 0.10030, 0…
## $ compactness_mean        <dbl> 0.27760, 0.07864, 0.15990, 0.28390, 0.13280, 0…
## $ concavity_mean          <dbl> 0.30010, 0.08690, 0.19740, 0.24140, 0.19800, 0…
## $ concave.points_mean     <dbl> 0.14710, 0.07017, 0.12790, 0.10520, 0.10430, 0…
## $ symmetry_mean           <dbl> 0.2419, 0.1812, 0.2069, 0.2597, 0.1809, 0.2087…
## $ fractal_dimension_mean  <dbl> 0.07871, 0.05667, 0.05999, 0.09744, 0.05883, 0…
## $ radius_se               <dbl> 1.0950, 0.5435, 0.7456, 0.4956, 0.7572, 0.3345…
## $ texture_se              <dbl> 0.9053, 0.7339, 0.7869, 1.1560, 0.7813, 0.8902…
## $ perimeter_se            <dbl> 8.589, 3.398, 4.585, 3.445, 5.438, 2.217, 3.18…
## $ area_se                 <dbl> 153.40, 74.08, 94.03, 27.23, 94.44, 27.19, 53.…
## $ smoothness_se           <dbl> 0.006399, 0.005225, 0.006150, 0.009110, 0.0114…
## $ compactness_se          <dbl> 0.049040, 0.013080, 0.040060, 0.074580, 0.0246…
## $ concavity_se            <dbl> 0.05373, 0.01860, 0.03832, 0.05661, 0.05688, 0…
## $ concave.points_se       <dbl> 0.015870, 0.013400, 0.020580, 0.018670, 0.0188…
## $ symmetry_se             <dbl> 0.03003, 0.01389, 0.02250, 0.05963, 0.01756, 0…
## $ fractal_dimension_se    <dbl> 0.006193, 0.003532, 0.004571, 0.009208, 0.0051…
## $ radius_worst            <dbl> 25.38, 24.99, 23.57, 14.91, 22.54, 15.47, 22.8…
## $ texture_worst           <dbl> 17.33, 23.41, 25.53, 26.50, 16.67, 23.75, 27.6…
## $ perimeter_worst         <dbl> 184.60, 158.80, 152.50, 98.87, 152.20, 103.40,…
## $ area_worst              <dbl> 2019.0, 1956.0, 1709.0, 567.7, 1575.0, 741.6, …
## $ smoothness_worst        <dbl> 0.1622, 0.1238, 0.1444, 0.2098, 0.1374, 0.1791…
## $ compactness_worst       <dbl> 0.6656, 0.1866, 0.4245, 0.8663, 0.2050, 0.5249…
## $ concavity_worst         <dbl> 0.71190, 0.24160, 0.45040, 0.68690, 0.40000, 0…
## $ concave.points_worst    <dbl> 0.26540, 0.18600, 0.24300, 0.25750, 0.16250, 0…
## $ symmetry_worst          <dbl> 0.4601, 0.2750, 0.3613, 0.6638, 0.2364, 0.3985…
## $ fractal_dimension_worst <dbl> 0.11890, 0.08902, 0.08758, 0.17300, 0.07678, 0…
## $ X                       <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
  • insight Kumpulan data kanker payudara Wisconsin berisi 569 entri yang berasal dari analisis gambar aspirasi jarum halus dari massa payudara. Kolom pertama berisi id unik dan kolom kedua berisi diagnosis.

Tidak ada nilai yang hilang, kecuali kolom ke-33 (X) yang seluruhnya kosong (569 NA).

# Select features: drop the empty trailing `X` column by name rather than by
# position, so the selection does not depend on the column order of the CSV.
df_selected <- df %>%
  select(-any_of("X"))

# Build a numeric matrix for scaling and PCA:
#   * drop the id variable,
#   * encode diagnosis as 1 ("M") / 0 ("B"); any other label becomes NA,
#   * convert to matrix.
Matrix <- df_selected %>%
  select(-id) %>%
  mutate(diagnosis = case_when(
    diagnosis == "M" ~ 1,
    diagnosis == "B" ~ 0
  )) %>%
  as.matrix()

# Use the sample ids as row names so every row stays identifiable.
row.names(Matrix) <- df_selected$id
head(Matrix)
##          diagnosis radius_mean texture_mean perimeter_mean area_mean
## 842302           1       17.99        10.38         122.80    1001.0
## 842517           1       20.57        17.77         132.90    1326.0
## 84300903         1       19.69        21.25         130.00    1203.0
## 84348301         1       11.42        20.38          77.58     386.1
## 84358402         1       20.29        14.34         135.10    1297.0
## 843786           1       12.45        15.70          82.57     477.1
##          smoothness_mean compactness_mean concavity_mean concave.points_mean
## 842302           0.11840          0.27760         0.3001             0.14710
## 842517           0.08474          0.07864         0.0869             0.07017
## 84300903         0.10960          0.15990         0.1974             0.12790
## 84348301         0.14250          0.28390         0.2414             0.10520
## 84358402         0.10030          0.13280         0.1980             0.10430
## 843786           0.12780          0.17000         0.1578             0.08089
##          symmetry_mean fractal_dimension_mean radius_se texture_se perimeter_se
## 842302          0.2419                0.07871    1.0950     0.9053        8.589
## 842517          0.1812                0.05667    0.5435     0.7339        3.398
## 84300903        0.2069                0.05999    0.7456     0.7869        4.585
## 84348301        0.2597                0.09744    0.4956     1.1560        3.445
## 84358402        0.1809                0.05883    0.7572     0.7813        5.438
## 843786          0.2087                0.07613    0.3345     0.8902        2.217
##          area_se smoothness_se compactness_se concavity_se concave.points_se
## 842302    153.40      0.006399        0.04904      0.05373           0.01587
## 842517     74.08      0.005225        0.01308      0.01860           0.01340
## 84300903   94.03      0.006150        0.04006      0.03832           0.02058
## 84348301   27.23      0.009110        0.07458      0.05661           0.01867
## 84358402   94.44      0.011490        0.02461      0.05688           0.01885
## 843786     27.19      0.007510        0.03345      0.03672           0.01137
##          symmetry_se fractal_dimension_se radius_worst texture_worst
## 842302       0.03003             0.006193        25.38         17.33
## 842517       0.01389             0.003532        24.99         23.41
## 84300903     0.02250             0.004571        23.57         25.53
## 84348301     0.05963             0.009208        14.91         26.50
## 84358402     0.01756             0.005115        22.54         16.67
## 843786       0.02165             0.005082        15.47         23.75
##          perimeter_worst area_worst smoothness_worst compactness_worst
## 842302            184.60     2019.0           0.1622            0.6656
## 842517            158.80     1956.0           0.1238            0.1866
## 84300903          152.50     1709.0           0.1444            0.4245
## 84348301           98.87      567.7           0.2098            0.8663
## 84358402          152.20     1575.0           0.1374            0.2050
## 843786            103.40      741.6           0.1791            0.5249
##          concavity_worst concave.points_worst symmetry_worst
## 842302            0.7119               0.2654         0.4601
## 842517            0.2416               0.1860         0.2750
## 84300903          0.4504               0.2430         0.3613
## 84348301          0.6869               0.2575         0.6638
## 84358402          0.4000               0.1625         0.2364
## 843786            0.5355               0.1741         0.3985
##          fractal_dimension_worst
## 842302                   0.11890
## 842517                   0.08902
## 84300903                 0.08758
## 84348301                 0.17300
## 84358402                 0.07678
## 843786                   0.12440
# Sanity check: as.matrix() yields a character matrix if any column is
# non-numeric, so confirm the conversion produced numbers.
Matrix %>% is.numeric()
## [1] TRUE

Scaling

# Per-column mean and standard deviation of the unscaled matrix, one row per
# variable, to show how different the feature scales are.
mean_sd <- data.frame(
  Mean = colMeans(Matrix),
  SD = apply(Matrix, 2, sd)
)

mean_sd
##                                 Mean           SD
## diagnosis               3.725835e-01 4.839180e-01
## radius_mean             1.412729e+01 3.524049e+00
## texture_mean            1.928965e+01 4.301036e+00
## perimeter_mean          9.196903e+01 2.429898e+01
## area_mean               6.548891e+02 3.519141e+02
## smoothness_mean         9.636028e-02 1.406413e-02
## compactness_mean        1.043410e-01 5.281276e-02
## concavity_mean          8.879932e-02 7.971981e-02
## concave.points_mean     4.891915e-02 3.880284e-02
## symmetry_mean           1.811619e-01 2.741428e-02
## fractal_dimension_mean  6.279761e-02 7.060363e-03
## radius_se               4.051721e-01 2.773127e-01
## texture_se              1.216853e+00 5.516484e-01
## perimeter_se            2.866059e+00 2.021855e+00
## area_se                 4.033708e+01 4.549101e+01
## smoothness_se           7.040979e-03 3.002518e-03
## compactness_se          2.547814e-02 1.790818e-02
## concavity_se            3.189372e-02 3.018606e-02
## concave.points_se       1.179614e-02 6.170285e-03
## symmetry_se             2.054230e-02 8.266372e-03
## fractal_dimension_se    3.794904e-03 2.646071e-03
## radius_worst            1.626919e+01 4.833242e+00
## texture_worst           2.567722e+01 6.146258e+00
## perimeter_worst         1.072612e+02 3.360254e+01
## area_worst              8.805831e+02 5.693570e+02
## smoothness_worst        1.323686e-01 2.283243e-02
## compactness_worst       2.542650e-01 1.573365e-01
## concavity_worst         2.721885e-01 2.086243e-01
## concave.points_worst    1.146062e-01 6.573234e-02
## symmetry_worst          2.900756e-01 6.186747e-02
## fractal_dimension_worst 8.394582e-02 1.806127e-02
# Standardise every column (including the 0/1 diagnosis column) to mean 0
# and standard deviation 1, then preview the result.
ScaledMatrix <- scale(Matrix, center = TRUE, scale = TRUE)
ScaledMatrix %>% head()
##          diagnosis radius_mean texture_mean perimeter_mean  area_mean
## 842302    1.296535   1.0960995   -2.0715123      1.2688173  0.9835095
## 842517    1.296535   1.8282120   -0.3533215      1.6844726  1.9070303
## 84300903  1.296535   1.5784992    0.4557859      1.5651260  1.5575132
## 84348301  1.296535  -0.7682333    0.2535091     -0.5921661 -0.7637917
## 84358402  1.296535   1.7487579   -1.1508038      1.7750113  1.8246238
## 843786    1.296535  -0.4759559   -0.8346009     -0.3868077 -0.5052059
##          smoothness_mean compactness_mean concavity_mean concave.points_mean
## 842302         1.5670875        3.2806281     2.65054179           2.5302489
## 842517        -0.8262354       -0.4866435    -0.02382489           0.5476623
## 84300903       0.9413821        1.0519999     1.36227979           2.0354398
## 84348301       3.2806668        3.3999174     1.91421287           1.4504311
## 84358402       0.2801253        0.5388663     1.36980615           1.4272370
## 843786         2.2354545        1.2432416     0.86554001           0.8239307
##          symmetry_mean fractal_dimension_mean  radius_se texture_se
## 842302     2.215565542              2.2537638  2.4875451 -0.5647681
## 842517     0.001391139             -0.8678888  0.4988157 -0.8754733
## 84300903   0.938858720             -0.3976580  1.2275958 -0.7793976
## 84348301   2.864862154              4.9066020  0.3260865 -0.1103120
## 84358402  -0.009552062             -0.5619555  1.2694258 -0.7895490
## 843786     1.004517928              1.8883435 -0.2548461 -0.5921406
##          perimeter_se    area_se smoothness_se compactness_se concavity_se
## 842302      2.8305403  2.4853907    -0.2138135     1.31570389    0.7233897
## 842517      0.2630955  0.7417493    -0.6048187    -0.69231710   -0.4403926
## 84300903    0.8501802  1.1802975    -0.2967439     0.81425704    0.2128891
## 84348301    0.2863415 -0.2881246     0.6890953     2.74186785    0.8187979
## 84358402    1.2720701  1.1893103     1.4817634    -0.04847723    0.8277425
## 843786     -0.3210217 -0.2890039     0.1562093     0.44515196    0.1598845
##          concave.points_se symmetry_se fractal_dimension_se radius_worst
## 842302          0.66023900   1.1477468           0.90628565    1.8850310
## 842517          0.25993335  -0.8047423          -0.09935632    1.8043398
## 84300903        1.42357487   0.2368272           0.29330133    1.5105411
## 84348301        1.11402678   4.7285198           2.04571087   -0.2812170
## 84358402        1.14319885  -0.3607748           0.49888916    1.2974336
## 843786         -0.06906279   0.1340009           0.48641784   -0.1653528
##          texture_worst perimeter_worst area_worst smoothness_worst
## 842302     -1.35809849       2.3015755  1.9994782        1.3065367
## 842517     -0.36887865       1.5337764  1.8888270       -0.3752817
## 84300903   -0.02395331       1.3462906  1.4550043        0.5269438
## 84348301    0.13386631      -0.2497196 -0.5495377        3.3912907
## 84358402   -1.46548091       1.3373627  1.2196511        0.2203623
## 843786     -0.31356043      -0.1149083 -0.2441054        2.0467119
##          compactness_worst concavity_worst concave.points_worst symmetry_worst
## 842302           2.6143647       2.1076718            2.2940576      2.7482041
## 842517          -0.4300658      -0.1466200            1.0861286     -0.2436753
## 84300903         1.0819801       0.8542223            1.9532817      1.1512420
## 84348301         3.8899747       1.9878392            2.1738732      6.0407261
## 84358402        -0.3131190       0.6126397            0.7286181     -0.8675896
## 843786           1.7201029       1.2621327            0.9050914      1.7525273
##          fractal_dimension_worst
## 842302                 1.9353117
## 842517                 0.2809428
## 84300903               0.2012142
## 84348301               4.9306719
## 84358402              -0.3967505
## 843786                 2.2398308
# Set the dimensions of the plot.
options(repr.plot.height = 15, repr.plot.width = 15)
# Correlation plot of all standardised columns (diagnosis included).
# Numeric coefficient labels are left disabled; to show them, pass
# `addCoef.col = 1` plus a label-size argument (presumably `number.cex = 0.8`
# -- the original note was truncated, so confirm before enabling).
corrplot(
  corr = cor(ScaledMatrix),
  order = "hclust", # order variables by hierarchical clustering
  addrect = 8,      # draw rectangles around 8 clusters
  tl.cex = 1        # size of the text labels
)

Exploratory Data Analysis

# Class balance: number of samples per diagnosis and their share of the
# whole data set, formatted as a whole-number percentage.
df %>%
  group_by(diagnosis) %>%
  summarise(n = n()) %>%
  mutate(
    percent = paste0(round(100 * n / sum(n), digits = 0), "%")
  )
## # A tibble: 2 × 3
##   diagnosis     n percent
##   <chr>     <int> <chr>  
## 1 B           357 63%    
## 2 M           212 37%
  • insight Variabel hasil diagnosis memiliki dua level: B untuk jinak dan M untuk ganas. Kasus ganas mencakup 37% dari data, proporsi yang cukup besar.

Modelling

PCA

# Seed the random number generator. prcomp() itself is deterministic; the
# seed is kept because the random train/test partition further down runs
# after this point.
set.seed(1)
# Perform PCA on the scaled matrix, excluding the "diagnosis" outcome column
# by name. The argument is spelled `scale.` in full so the call does not
# rely on partial argument matching.
pca <- prcomp(
  x = ScaledMatrix[, colnames(ScaledMatrix) != "diagnosis"],
  center = TRUE,
  scale. = TRUE
)
summary(pca)
## Importance of components:
##                           PC1    PC2     PC3     PC4     PC5     PC6     PC7
## Standard deviation     3.6444 2.3857 1.67867 1.40735 1.28403 1.09880 0.82172
## Proportion of Variance 0.4427 0.1897 0.09393 0.06602 0.05496 0.04025 0.02251
## Cumulative Proportion  0.4427 0.6324 0.72636 0.79239 0.84734 0.88759 0.91010
##                            PC8    PC9    PC10   PC11    PC12    PC13    PC14
## Standard deviation     0.69037 0.6457 0.59219 0.5421 0.51104 0.49128 0.39624
## Proportion of Variance 0.01589 0.0139 0.01169 0.0098 0.00871 0.00805 0.00523
## Cumulative Proportion  0.92598 0.9399 0.95157 0.9614 0.97007 0.97812 0.98335
##                           PC15    PC16    PC17    PC18    PC19    PC20   PC21
## Standard deviation     0.30681 0.28260 0.24372 0.22939 0.22244 0.17652 0.1731
## Proportion of Variance 0.00314 0.00266 0.00198 0.00175 0.00165 0.00104 0.0010
## Cumulative Proportion  0.98649 0.98915 0.99113 0.99288 0.99453 0.99557 0.9966
##                           PC22    PC23   PC24    PC25    PC26    PC27    PC28
## Standard deviation     0.16565 0.15602 0.1344 0.12442 0.09043 0.08307 0.03987
## Proportion of Variance 0.00091 0.00081 0.0006 0.00052 0.00027 0.00023 0.00005
## Cumulative Proportion  0.99749 0.99830 0.9989 0.99942 0.99969 0.99992 0.99997
##                           PC29    PC30
## Standard deviation     0.02736 0.01153
## Proportion of Variance 0.00002 0.00000
## Cumulative Proportion  1.00000 1.00000
# Keep the scores of the first seven principal components.
pca_res <- as.data.frame(pca$x[, 1:7])
# Extract the diagnosis column, labelled with the sample ids.
diagnosis <- df_selected["diagnosis"]
row.names(diagnosis) <- df_selected$id

# Combine diagnosis and PCA scores column-wise. Both objects are in the row
# order of `df`, so a plain column bind keeps every label on its own sample.
# A merge on row names would re-sort the rows by id (as character), and
# re-assigning the ids afterwards would only be correct if the file happened
# to be in that order already.
stopifnot(identical(row.names(diagnosis), row.names(pca_res)))
PCA_data <- cbind(diagnosis, pca_res)
PCA_data$diagnosis <- factor(PCA_data$diagnosis, levels = c("B", "M"))
#head(PCA_data)
str(PCA_data)
## 'data.frame':    569 obs. of  8 variables:
##  $ diagnosis: Factor w/ 2 levels "B","M": 2 2 2 2 2 2 2 2 2 2 ...
##  $ PC1      : num  -9.18 -2.39 -5.73 -7.12 -3.93 ...
##  $ PC2      : num  -1.95 3.76 1.07 -10.27 1.95 ...
##  $ PC3      : num  -1.122 -0.529 -0.551 -3.23 1.389 ...
##  $ PC4      : num  3.631 1.117 0.911 0.152 2.938 ...
##  $ PC5      : num  1.194 -0.621 0.177 2.958 -0.546 ...
##  $ PC6      : num  1.4102 0.0286 0.541 3.0507 -1.2254 ...
##  $ PC7      : num  2.1575 0.0133 -0.6676 1.4287 -0.9354 ...

Train Test Split

# Re-seed here so the random partitions below are reproducible even when
# this section is run on its own, instead of depending on a seed set in an
# earlier section.
set.seed(1)

# NOTE(review): the scaling uses the mean/SD of all rows, including the rows
# that end up in the test set; consider fitting the scaling on the training
# rows only.
# scale data
sc_Matrix <- scale(Matrix)
# convert to df
Scaled_df <- as.data.frame(sc_Matrix)
# replace the standardised 0/1 diagnosis column with the original labels
Scaled_df <- cbind(diagnosis = df$diagnosis, Scaled_df[, -1])
# define diagnosis as factor
Scaled_df <- Scaled_df %>%
  mutate(diagnosis = factor(diagnosis, levels = c("B", "M")))

# split into training (70%) and testing (30%) data, stratified by diagnosis
Scaled_df_index <- createDataPartition(Scaled_df$diagnosis, p = 0.7, list = FALSE)
training_data <- Scaled_df[Scaled_df_index, ]
testing_data <- Scaled_df[-Scaled_df_index, ]

# split PCA data into training and testing data
PCA_data_index <- createDataPartition(PCA_data$diagnosis, p = 0.7, list = FALSE)
training_data_PCA <- PCA_data[PCA_data_index, ]
testing_data_PCA <- PCA_data[-PCA_data_index, ]
#str(training_data_PCA)
#str(testing_data_PCA)

Naive Bayes

# Fit a Naive Bayes classifier that predicts diagnosis from all remaining
# columns of the training data.
# NOTE(review): all predictors here are numeric; Laplace smoothing
# presumably only affects categorical predictors -- confirm against the
# e1071 documentation.
set.seed(1)
model_NB <- naiveBayes(
  diagnosis ~ .,
  data = training_data,
  laplace = 1
)
summary(model_NB)
##           Length Class  Mode     
## apriori    2     table  numeric  
## tables    30     -none- list     
## levels     2     -none- character
## isnumeric 30     -none- logical  
## call       4     -none- call
options(repr.plot.width = 5, repr.plot.height = 5) # set dimensions of plots
# Model predictions on the held-out data: predict once and reuse the result,
# instead of calling predict() a second time on a data frame that already
# contains the prediction column.
predictions_NB <- predict(model_NB, testing_data)
testing_data$predictions_NB <- predictions_NB
# Confusion matrix with malignant ("M") as the positive class.
CM_NB <- confusionMatrix(predictions_NB, testing_data$diagnosis, positive = "M")
CM_NB
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   B   M
##          B 103   7
##          M   4  56
##                                           
##                Accuracy : 0.9353          
##                  95% CI : (0.8872, 0.9673)
##     No Information Rate : 0.6294          
##     P-Value [Acc > NIR] : <2e-16          
##                                           
##                   Kappa : 0.8599          
##                                           
##  Mcnemar's Test P-Value : 0.5465          
##                                           
##             Sensitivity : 0.8889          
##             Specificity : 0.9626          
##          Pos Pred Value : 0.9333          
##          Neg Pred Value : 0.9364          
##              Prevalence : 0.3706          
##          Detection Rate : 0.3294          
##    Detection Prevalence : 0.3529          
##       Balanced Accuracy : 0.9258          
##                                           
##        'Positive' Class : M               
## 
# Confusion matrix as a long table (Prediction, Observed, Freq) for plotting.
confusion_matrix <- confusionMatrix(
  testing_data$predictions_NB,
  testing_data$diagnosis
)$table %>%
  data.frame() %>%
  rename(Observed = Reference)

# Heat map of the confusion matrix: one tile per (observed, predicted) pair,
# coloured by count and labelled with the count.
ggplot(confusion_matrix, aes(x = Observed, y = Prediction)) +
  geom_tile(aes(fill = Freq), colour = "grey") +
  geom_text(aes(label = sprintf("%1.0f", Freq)), vjust = 1, size = 7) +
  scale_fill_gradient(low = '#009ADC', high = '#FF1F5B') +
  labs(title = "Confusion matrix", subtitle = "") +
  theme_bw() +
  theme(legend.position = "none")

Kesimpulan

Tujuan dari analisis ini adalah untuk memprediksi diagnosis sampel jaringan payudara dengan benar, berdasarkan 30 fitur jaringan. Kami menemukan bahwa model Naive Bayes mencapai akurasi 93,53% pada data uji (sensitivitas 88,89% dan spesifisitas 96,26%). Secara keseluruhan, ini adalah skor kinerja yang sangat baik, yang menunjukkan bahwa jaringan biopsi dapat menjadi sumber yang berharga untuk diagnosis kanker.