Studi Kasus - Kanker Payudara Wisconsin Ini adalah contoh klasifikasi lain. Kita harus mengklasifikasikan tumor payudara sebagai ganas atau jinak.

Himpunan data tersedia di situs web pembelajaran Mesin UCI serta di [Kaggle](https://www.kaggle.com/uciml/breast-cancer-wisconsin-data.

library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.3.6      ✔ purrr   0.3.4 
## ✔ tibble  3.1.8      ✔ dplyr   1.0.10
## ✔ tidyr   1.2.1      ✔ stringr 1.4.1 
## ✔ readr   2.1.3      ✔ forcats 0.5.2 
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
library(caret)
## Loading required package: lattice
## 
## Attaching package: 'caret'
## 
## The following object is masked from 'package:purrr':
## 
##     lift
library(ggcorrplot)
library(GGally)
## Registered S3 method overwritten by 'GGally':
##   method from   
##   +.gg   ggplot2
library(randomForest)
## randomForest 4.7-1.1
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## 
## The following object is masked from 'package:dplyr':
## 
##     combine
## 
## The following object is masked from 'package:ggplot2':
## 
##     margin
library(e1071)
library(ROCR)
library(pROC)
## Type 'citation("pROC")' for a citation.
## 
## Attaching package: 'pROC'
## 
## The following objects are masked from 'package:stats':
## 
##     cov, smooth, var
library(RCurl)
## 
## Attaching package: 'RCurl'
## 
## The following object is masked from 'package:tidyr':
## 
##     complete
df <- read_csv("data.csv")
## New names:
## • `` -> `...33`
## Warning: One or more parsing issues, call `problems()` on your data frame for details,
## e.g.:
##   dat <- vroom(...)
##   problems(dat)
## Rows: 568 Columns: 33
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (1): diagnosis
## dbl (31): id, radius_mean, texture_mean, perimeter_mean, area_mean, smoothne...
## lgl  (1): ...33
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
df
## # A tibble: 568 × 33
##          id diagnosis radius_m…¹ textu…² perim…³ area_…⁴ smoot…⁵ compa…⁶ conca…⁷
##       <dbl> <chr>          <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>
##  1   842302 M               18.0    10.4   123.    1001   0.118   0.278   0.300 
##  2   842517 M               20.6    17.8   133.    1326   0.0847  0.0786  0.0869
##  3 84300903 M               19.7    21.2   130     1203   0.110   0.160   0.197 
##  4 84348301 M               11.4    20.4    77.6    386.  0.142   0.284   0.241 
##  5 84358402 M               20.3    14.3   135.    1297   0.100   0.133   0.198 
##  6   843786 M               12.4    15.7    82.6    477.  0.128   0.17    0.158 
##  7   844359 M               18.2    20.0   120.    1040   0.0946  0.109   0.113 
##  8 84458202 M               13.7    20.8    90.2    578.  0.119   0.164   0.0937
##  9   844981 M               13      21.8    87.5    520.  0.127   0.193   0.186 
## 10 84501001 M               12.5    24.0    84.0    476.  0.119   0.240   0.227 
## # … with 558 more rows, 24 more variables: `concave points_mean` <dbl>,
## #   symmetry_mean <dbl>, fractal_dimension_mean <dbl>, radius_se <dbl>,
## #   texture_se <dbl>, perimeter_se <dbl>, area_se <dbl>, smoothness_se <dbl>,
## #   compactness_se <dbl>, concavity_se <dbl>, `concave points_se` <dbl>,
## #   symmetry_se <dbl>, fractal_dimension_se <dbl>, radius_worst <dbl>,
## #   texture_worst <dbl>, perimeter_worst <dbl>, area_worst <dbl>,
## #   smoothness_worst <dbl>, compactness_worst <dbl>, concavity_worst <dbl>, …
glimpse(df)
## Rows: 568
## Columns: 33
## $ id                      <dbl> 842302, 842517, 84300903, 84348301, 84358402, …
## $ diagnosis               <chr> "M", "M", "M", "M", "M", "M", "M", "M", "M", "…
## $ radius_mean             <dbl> 17.990, 20.570, 19.690, 11.420, 20.290, 12.450…
## $ texture_mean            <dbl> 10.38, 17.77, 21.25, 20.38, 14.34, 15.70, 19.9…
## $ perimeter_mean          <dbl> 122.80, 132.90, 130.00, 77.58, 135.10, 82.57, …
## $ area_mean               <dbl> 1001.0, 1326.0, 1203.0, 386.1, 1297.0, 477.1, …
## $ smoothness_mean         <dbl> 0.11840, 0.08474, 0.10960, 0.14250, 0.10030, 0…
## $ compactness_mean        <dbl> 0.27760, 0.07864, 0.15990, 0.28390, 0.13280, 0…
## $ concavity_mean          <dbl> 0.30010, 0.08690, 0.19740, 0.24140, 0.19800, 0…
## $ `concave points_mean`   <dbl> 0.14710, 0.07017, 0.12790, 0.10520, 0.10430, 0…
## $ symmetry_mean           <dbl> 0.2419, 0.1812, 0.2069, 0.2597, 0.1809, 0.2087…
## $ fractal_dimension_mean  <dbl> 0.07871, 0.05667, 0.05999, 0.09744, 0.05883, 0…
## $ radius_se               <dbl> 1.0950, 0.5435, 0.7456, 0.4956, 0.7572, 0.3345…
## $ texture_se              <dbl> 0.9053, 0.7339, 0.7869, 1.1560, 0.7813, 0.8902…
## $ perimeter_se            <dbl> 8.589, 3.398, 4.585, 3.445, 5.438, 2.217, 3.18…
## $ area_se                 <dbl> 153.40, 74.08, 94.03, 27.23, 94.44, 27.19, 53.…
## $ smoothness_se           <dbl> 0.006399, 0.005225, 0.006150, 0.009110, 0.0114…
## $ compactness_se          <dbl> 0.049040, 0.013080, 0.040060, 0.074580, 0.0246…
## $ concavity_se            <dbl> 0.05373, 0.01860, 0.03832, 0.05661, 0.05688, 0…
## $ `concave points_se`     <dbl> 0.015870, 0.013400, 0.020580, 0.018670, 0.0188…
## $ symmetry_se             <dbl> 0.03003, 0.01389, 0.02250, 0.05963, 0.01756, 0…
## $ fractal_dimension_se    <dbl> 0.006193, 0.003532, 0.004571, 0.009208, 0.0051…
## $ radius_worst            <dbl> 25.38, 24.99, 23.57, 14.91, 22.54, 15.47, 22.8…
## $ texture_worst           <dbl> 17.33, 23.41, 25.53, 26.50, 16.67, 23.75, 27.6…
## $ perimeter_worst         <dbl> 184.60, 158.80, 152.50, 98.87, 152.20, 103.40,…
## $ area_worst              <dbl> 2019.0, 1956.0, 1709.0, 567.7, 1575.0, 741.6, …
## $ smoothness_worst        <dbl> 0.1622, 0.1238, 0.1444, 0.2098, 0.1374, 0.1791…
## $ compactness_worst       <dbl> 0.6656, 0.1866, 0.4245, 0.8663, 0.2050, 0.5249…
## $ concavity_worst         <dbl> 0.71190, 0.24160, 0.45040, 0.68690, 0.40000, 0…
## $ `concave points_worst`  <dbl> 0.26540, 0.18600, 0.24300, 0.25750, 0.16250, 0…
## $ symmetry_worst          <dbl> 0.4601, 0.2750, 0.3613, 0.6638, 0.2364, 0.3985…
## $ fractal_dimension_worst <dbl> 0.11890, 0.08902, 0.08758, 0.17300, 0.07678, 0…
## $ ...33                   <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…

Jadi kami memiliki 569 pengamatan dengan 32 variabel. Idealnya untuk begitu banyak variabel, akan lebih tepat untuk mendapatkan beberapa pengamatan lagi.

df <- select(df, -...33)
glimpse(df)
## Rows: 568
## Columns: 32
## $ id                      <dbl> 842302, 842517, 84300903, 84348301, 84358402, …
## $ diagnosis               <chr> "M", "M", "M", "M", "M", "M", "M", "M", "M", "…
## $ radius_mean             <dbl> 17.990, 20.570, 19.690, 11.420, 20.290, 12.450…
## $ texture_mean            <dbl> 10.38, 17.77, 21.25, 20.38, 14.34, 15.70, 19.9…
## $ perimeter_mean          <dbl> 122.80, 132.90, 130.00, 77.58, 135.10, 82.57, …
## $ area_mean               <dbl> 1001.0, 1326.0, 1203.0, 386.1, 1297.0, 477.1, …
## $ smoothness_mean         <dbl> 0.11840, 0.08474, 0.10960, 0.14250, 0.10030, 0…
## $ compactness_mean        <dbl> 0.27760, 0.07864, 0.15990, 0.28390, 0.13280, 0…
## $ concavity_mean          <dbl> 0.30010, 0.08690, 0.19740, 0.24140, 0.19800, 0…
## $ `concave points_mean`   <dbl> 0.14710, 0.07017, 0.12790, 0.10520, 0.10430, 0…
## $ symmetry_mean           <dbl> 0.2419, 0.1812, 0.2069, 0.2597, 0.1809, 0.2087…
## $ fractal_dimension_mean  <dbl> 0.07871, 0.05667, 0.05999, 0.09744, 0.05883, 0…
## $ radius_se               <dbl> 1.0950, 0.5435, 0.7456, 0.4956, 0.7572, 0.3345…
## $ texture_se              <dbl> 0.9053, 0.7339, 0.7869, 1.1560, 0.7813, 0.8902…
## $ perimeter_se            <dbl> 8.589, 3.398, 4.585, 3.445, 5.438, 2.217, 3.18…
## $ area_se                 <dbl> 153.40, 74.08, 94.03, 27.23, 94.44, 27.19, 53.…
## $ smoothness_se           <dbl> 0.006399, 0.005225, 0.006150, 0.009110, 0.0114…
## $ compactness_se          <dbl> 0.049040, 0.013080, 0.040060, 0.074580, 0.0246…
## $ concavity_se            <dbl> 0.05373, 0.01860, 0.03832, 0.05661, 0.05688, 0…
## $ `concave points_se`     <dbl> 0.015870, 0.013400, 0.020580, 0.018670, 0.0188…
## $ symmetry_se             <dbl> 0.03003, 0.01389, 0.02250, 0.05963, 0.01756, 0…
## $ fractal_dimension_se    <dbl> 0.006193, 0.003532, 0.004571, 0.009208, 0.0051…
## $ radius_worst            <dbl> 25.38, 24.99, 23.57, 14.91, 22.54, 15.47, 22.8…
## $ texture_worst           <dbl> 17.33, 23.41, 25.53, 26.50, 16.67, 23.75, 27.6…
## $ perimeter_worst         <dbl> 184.60, 158.80, 152.50, 98.87, 152.20, 103.40,…
## $ area_worst              <dbl> 2019.0, 1956.0, 1709.0, 567.7, 1575.0, 741.6, …
## $ smoothness_worst        <dbl> 0.1622, 0.1238, 0.1444, 0.2098, 0.1374, 0.1791…
## $ compactness_worst       <dbl> 0.6656, 0.1866, 0.4245, 0.8663, 0.2050, 0.5249…
## $ concavity_worst         <dbl> 0.71190, 0.24160, 0.45040, 0.68690, 0.40000, 0…
## $ `concave points_worst`  <dbl> 0.26540, 0.18600, 0.24300, 0.25750, 0.16250, 0…
## $ symmetry_worst          <dbl> 0.4601, 0.2750, 0.3613, 0.6638, 0.2364, 0.3985…
## $ fractal_dimension_worst <dbl> 0.11890, 0.08902, 0.08758, 0.17300, 0.07678, 0…
df$diagnosis <- as.factor(df$diagnosis)
map_int(df, function(.x) sum(is.na(.x)))
##                      id               diagnosis             radius_mean 
##                       0                       0                       0 
##            texture_mean          perimeter_mean               area_mean 
##                       0                       0                       0 
##         smoothness_mean        compactness_mean          concavity_mean 
##                       0                       0                       0 
##     concave points_mean           symmetry_mean  fractal_dimension_mean 
##                       0                       0                       0 
##               radius_se              texture_se            perimeter_se 
##                       0                       0                       0 
##                 area_se           smoothness_se          compactness_se 
##                       0                       0                       0 
##            concavity_se       concave points_se             symmetry_se 
##                       0                       0                       0 
##    fractal_dimension_se            radius_worst           texture_worst 
##                       0                       0                       0 
##         perimeter_worst              area_worst        smoothness_worst 
##                       0                       0                       0 
##       compactness_worst         concavity_worst    concave points_worst 
##                       0                       0                       0 
##          symmetry_worst fractal_dimension_worst 
##                       0                       0
round(prop.table(table(df$diagnosis)), 2)
## 
##    B    M 
## 0.63 0.37
library(corrplot)
## corrplot 0.92 loaded
df_corr <- cor(df %>% select(-id, -diagnosis))
corrplot(df_corr, order = "hclust", tl.cex = 1, addrect = 8)

#Number of columns for our new data frame
ncol(df)
## [1] 32
library(dplyr)
library(caret)
# The findcorrelation() function from caret package remove highly correlated predictors
# based on whose correlation is above 0.9. This function uses a heuristic algorithm 
# to determine which variable should be removed instead of selecting blindly
df2 <- df %>% select(-findCorrelation(df_corr, cutoff = 0.9))

#Number of columns for our new data frame
ncol(df2)
## [1] 22
library(corrplot)
df2_corr <- cor(df2)
corrplot(df2_corr, order = "hclust", tl.cex = 1, addrect = 8)

#Number of columns for our new data frame
ncol(df2)
## [1] 22

Jadi bingkai data baru kami adalah 10 variabel lebih pendek.df2

Pertama-tama mari kita lanjutkan analisis tanpa pengawasan dengan analisis PCA. Untuk melakukannya, kita akan menghapus variabel and, maka kita juga akan menskalakan dan menghentikan variabel.iddiagnosis

preproc_pca_df <- prcomp(df %>% select(-id, -diagnosis), scale = TRUE, center = TRUE)
summary(preproc_pca_df)
## Importance of components:
##                           PC1    PC2     PC3     PC4     PC5    PC6     PC7
## Standard deviation     3.6430 2.3887 1.67894 1.40544 1.28662 1.0982 0.81949
## Proportion of Variance 0.4424 0.1902 0.09396 0.06584 0.05518 0.0402 0.02239
## Cumulative Proportion  0.4424 0.6326 0.72653 0.79237 0.84755 0.8878 0.91013
##                            PC8     PC9    PC10    PC11    PC12    PC13    PC14
## Standard deviation     0.68973 0.64618 0.59266 0.54282 0.51175 0.49126 0.39418
## Proportion of Variance 0.01586 0.01392 0.01171 0.00982 0.00873 0.00804 0.00518
## Cumulative Proportion  0.92599 0.93991 0.95162 0.96144 0.97017 0.97821 0.98339
##                           PC15    PC16    PC17    PC18    PC19    PC20   PC21
## Standard deviation     0.30696 0.28022 0.24367 0.22980 0.22256 0.17656 0.1729
## Proportion of Variance 0.00314 0.00262 0.00198 0.00176 0.00165 0.00104 0.0010
## Cumulative Proportion  0.98653 0.98915 0.99113 0.99289 0.99454 0.99558 0.9966
##                           PC22    PC23   PC24    PC25    PC26    PC27    PC28
## Standard deviation     0.16547 0.15629 0.1344 0.12458 0.08929 0.08295 0.03993
## Proportion of Variance 0.00091 0.00081 0.0006 0.00052 0.00027 0.00023 0.00005
## Cumulative Proportion  0.99749 0.99830 0.9989 0.99942 0.99969 0.99992 0.99997
##                           PC29    PC30
## Standard deviation     0.02728 0.01153
## Proportion of Variance 0.00002 0.00000
## Cumulative Proportion  1.00000 1.00000
df2
## # A tibble: 568 × 22
##    texture_mean perime…¹ area_…² conca…³ conca…⁴ symme…⁵ fract…⁶ perim…⁷ area_se
##           <dbl>    <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>
##  1         10.4    123.    1001   0.300   0.147    0.242  0.0787    8.59   153. 
##  2         17.8    133.    1326   0.0869  0.0702   0.181  0.0567    3.40    74.1
##  3         21.2    130     1203   0.197   0.128    0.207  0.0600    4.58    94.0
##  4         20.4     77.6    386.  0.241   0.105    0.260  0.0974    3.44    27.2
##  5         14.3    135.    1297   0.198   0.104    0.181  0.0588    5.44    94.4
##  6         15.7     82.6    477.  0.158   0.0809   0.209  0.0761    2.22    27.2
##  7         20.0    120.    1040   0.113   0.074    0.179  0.0574    3.18    53.9
##  8         20.8     90.2    578.  0.0937  0.0598   0.220  0.0745    3.86    51.0
##  9         21.8     87.5    520.  0.186   0.0935   0.235  0.0739    2.41    24.3
## 10         24.0     84.0    476.  0.227   0.0854   0.203  0.0824    2.04    23.9
## # … with 558 more rows, 13 more variables: smoothness_se <dbl>,
## #   compactness_se <dbl>, concavity_se <dbl>, `concave points_se` <dbl>,
## #   fractal_dimension_se <dbl>, perimeter_worst <dbl>, area_worst <dbl>,
## #   smoothness_worst <dbl>, compactness_worst <dbl>, concavity_worst <dbl>,
## #   `concave points_worst` <dbl>, symmetry_worst <dbl>,
## #   fractal_dimension_worst <dbl>, and abbreviated variable names
## #   ¹​perimeter_mean, ²​area_mean, ³​concavity_mean, ⁴​`concave points_mean`, …
#menambahkan colomn diagnosis pada tabel data frame df2
df2$diagnosis <- df$diagnosis
df2
## # A tibble: 568 × 23
##    texture_mean perime…¹ area_…² conca…³ conca…⁴ symme…⁵ fract…⁶ perim…⁷ area_se
##           <dbl>    <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>
##  1         10.4    123.    1001   0.300   0.147    0.242  0.0787    8.59   153. 
##  2         17.8    133.    1326   0.0869  0.0702   0.181  0.0567    3.40    74.1
##  3         21.2    130     1203   0.197   0.128    0.207  0.0600    4.58    94.0
##  4         20.4     77.6    386.  0.241   0.105    0.260  0.0974    3.44    27.2
##  5         14.3    135.    1297   0.198   0.104    0.181  0.0588    5.44    94.4
##  6         15.7     82.6    477.  0.158   0.0809   0.209  0.0761    2.22    27.2
##  7         20.0    120.    1040   0.113   0.074    0.179  0.0574    3.18    53.9
##  8         20.8     90.2    578.  0.0937  0.0598   0.220  0.0745    3.86    51.0
##  9         21.8     87.5    520.  0.186   0.0935   0.235  0.0739    2.41    24.3
## 10         24.0     84.0    476.  0.227   0.0854   0.203  0.0824    2.04    23.9
## # … with 558 more rows, 14 more variables: smoothness_se <dbl>,
## #   compactness_se <dbl>, concavity_se <dbl>, `concave points_se` <dbl>,
## #   fractal_dimension_se <dbl>, perimeter_worst <dbl>, area_worst <dbl>,
## #   smoothness_worst <dbl>, compactness_worst <dbl>, concavity_worst <dbl>,
## #   `concave points_worst` <dbl>, symmetry_worst <dbl>,
## #   fractal_dimension_worst <dbl>, diagnosis <fct>, and abbreviated variable
## #   names ¹​perimeter_mean, ²​area_mean, ³​concavity_mean, …
#merubah tipe data dari colomn diagnosis menjadi numeric
diagnosis <- as.numeric(df2$diagnosis == "M")
df2$diagnosis <- diagnosis
require(fuzzyreg)
## Loading required package: fuzzyreg
f = fuzzylm(diagnosis ~ fractal_dimension_worst, data = df2)
print(f)
## 
## Fuzzy linear model using the PLRLS method
## 
## Call:
## fuzzylm(formula = diagnosis ~ fractal_dimension_worst, data = df2)
## 
## Coefficients in form of non-symmetric triangular fuzzy numbers:
## 
##                            center   left.spread right.spread
## (Intercept)             0.3732394  3.732394e-01    0.6267606
## fractal_dimension_worst 0.0000000 -4.440892e-15    0.0000000
summary(f)
## 
## Central tendency of the fuzzy regression model:
## diagnosis = 0.3732 + 0 * fractal_dimension_worst
## 
## Lower boundary of the model support interval:
## diagnosis = 0 + 0 * fractal_dimension_worst
## 
## Upper boundary of the model support interval:
## diagnosis = 1 + 0 * fractal_dimension_worst
## 
## The total error of fit: 2.84e+08
## The mean squared distance between response and prediction: 1.23