# Load package 'caret'
library(caret)
## Warning: package 'caret' was built under R version 4.2.2
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 4.2.2
## Loading required package: lattice
library(e1071)
## Warning: package 'e1071' was built under R version 4.2.2
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.2.2
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ tibble  3.1.8      ✔ dplyr   1.0.10
## ✔ tidyr   1.2.1      ✔ stringr 1.4.1 
## ✔ readr   2.1.3      ✔ forcats 0.5.2 
## ✔ purrr   0.3.5
## Warning: package 'dplyr' was built under R version 4.2.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ✖ purrr::lift()   masks caret::lift()
library(psych)
## Warning: package 'psych' was built under R version 4.2.2
## 
## Attaching package: 'psych'
## The following objects are masked from 'package:ggplot2':
## 
##     %+%, alpha
library(Amelia)
## Warning: package 'Amelia' was built under R version 4.2.2
## Loading required package: Rcpp
## ## 
## ## Amelia II: Multiple Imputation
## ## (Version 1.8.1, built: 2022-11-18)
## ## Copyright (C) 2005-2022 James Honaker, Gary King and Matthew Blackwell
## ## Refer to http://gking.harvard.edu/amelia/ for more information
## ##
library(ggplot2)
library(GGally)
## Warning: package 'GGally' was built under R version 4.2.2
## Registered S3 method overwritten by 'GGally':
##   method from   
##   +.gg   ggplot2
library(readr)
wdbc <- read_csv("https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/wdbc.data", 
    col_names = FALSE)
## Rows: 569 Columns: 32
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (1): X2
## dbl (31): X1, X3, X4, X5, X6, X7, X8, X9, X10, X11, X12, X13, X14, X15, X16,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
names(wdbc) <- c('id_number', 'diagnosis', 'radius_mean', 
         'texture_mean', 'perimeter_mean', 'area_mean', 
         'smoothness_mean', 'compactness_mean', 
         'concavity_mean','concave_points_mean', 
         'symmetry_mean', 'fractal_dimension_mean',
         'radius_se', 'texture_se', 'perimeter_se', 
         'area_se', 'smoothness_se', 'compactness_se', 
         'concavity_se', 'concave_points_se', 
         'symmetry_se', 'fractal_dimension_se', 
         'radius_worst', 'texture_worst', 
         'perimeter_worst', 'area_worst', 
         'smoothness_worst', 'compactness_worst', 
         'concavity_worst', 'concave_points_worst', 
         'symmetry_worst', 'fractal_dimension_worst')
glimpse(wdbc)
## Rows: 569
## Columns: 32
## $ id_number               <dbl> 842302, 842517, 84300903, 84348301, 84358402, …
## $ diagnosis               <chr> "M", "M", "M", "M", "M", "M", "M", "M", "M", "…
## $ radius_mean             <dbl> 17.990, 20.570, 19.690, 11.420, 20.290, 12.450…
## $ texture_mean            <dbl> 10.38, 17.77, 21.25, 20.38, 14.34, 15.70, 19.9…
## $ perimeter_mean          <dbl> 122.80, 132.90, 130.00, 77.58, 135.10, 82.57, …
## $ area_mean               <dbl> 1001.0, 1326.0, 1203.0, 386.1, 1297.0, 477.1, …
## $ smoothness_mean         <dbl> 0.11840, 0.08474, 0.10960, 0.14250, 0.10030, 0…
## $ compactness_mean        <dbl> 0.27760, 0.07864, 0.15990, 0.28390, 0.13280, 0…
## $ concavity_mean          <dbl> 0.30010, 0.08690, 0.19740, 0.24140, 0.19800, 0…
## $ concave_points_mean     <dbl> 0.14710, 0.07017, 0.12790, 0.10520, 0.10430, 0…
## $ symmetry_mean           <dbl> 0.2419, 0.1812, 0.2069, 0.2597, 0.1809, 0.2087…
## $ fractal_dimension_mean  <dbl> 0.07871, 0.05667, 0.05999, 0.09744, 0.05883, 0…
## $ radius_se               <dbl> 1.0950, 0.5435, 0.7456, 0.4956, 0.7572, 0.3345…
## $ texture_se              <dbl> 0.9053, 0.7339, 0.7869, 1.1560, 0.7813, 0.8902…
## $ perimeter_se            <dbl> 8.589, 3.398, 4.585, 3.445, 5.438, 2.217, 3.18…
## $ area_se                 <dbl> 153.40, 74.08, 94.03, 27.23, 94.44, 27.19, 53.…
## $ smoothness_se           <dbl> 0.006399, 0.005225, 0.006150, 0.009110, 0.0114…
## $ compactness_se          <dbl> 0.049040, 0.013080, 0.040060, 0.074580, 0.0246…
## $ concavity_se            <dbl> 0.05373, 0.01860, 0.03832, 0.05661, 0.05688, 0…
## $ concave_points_se       <dbl> 0.015870, 0.013400, 0.020580, 0.018670, 0.0188…
## $ symmetry_se             <dbl> 0.03003, 0.01389, 0.02250, 0.05963, 0.01756, 0…
## $ fractal_dimension_se    <dbl> 0.006193, 0.003532, 0.004571, 0.009208, 0.0051…
## $ radius_worst            <dbl> 25.38, 24.99, 23.57, 14.91, 22.54, 15.47, 22.8…
## $ texture_worst           <dbl> 17.33, 23.41, 25.53, 26.50, 16.67, 23.75, 27.6…
## $ perimeter_worst         <dbl> 184.60, 158.80, 152.50, 98.87, 152.20, 103.40,…
## $ area_worst              <dbl> 2019.0, 1956.0, 1709.0, 567.7, 1575.0, 741.6, …
## $ smoothness_worst        <dbl> 0.1622, 0.1238, 0.1444, 0.2098, 0.1374, 0.1791…
## $ compactness_worst       <dbl> 0.6656, 0.1866, 0.4245, 0.8663, 0.2050, 0.5249…
## $ concavity_worst         <dbl> 0.71190, 0.24160, 0.45040, 0.68690, 0.40000, 0…
## $ concave_points_worst    <dbl> 0.26540, 0.18600, 0.24300, 0.25750, 0.16250, 0…
## $ symmetry_worst          <dbl> 0.4601, 0.2750, 0.3613, 0.6638, 0.2364, 0.3985…
## $ fractal_dimension_worst <dbl> 0.11890, 0.08902, 0.08758, 0.17300, 0.07678, 0…
#Check dimensions of the split
 
prop.table(table(wdbc$diagnosis)) * 100
## 
##        B        M 
## 62.74165 37.25835
# Removing the first column (id_number) because it is not a real predictor.
# Keep the response variable ( diagnosis) and the predictors containing 'mean'
# Checking the values (B/M) of the response variable and convert to 0/1

wdbc <- wdbc %>%
  dplyr::select(diagnosis, contains("mean"))%>%
  mutate(bc = ifelse(diagnosis=="M", 1, ifelse(diagnosis=="B",0, NA))) %>%
  dplyr::select(-diagnosis)
head(wdbc)
## # A tibble: 6 × 11
##   radius_mean texture_…¹ perim…² area_…³ smoot…⁴ compa…⁵ conca…⁶ conca…⁷ symme…⁸
##         <dbl>      <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>
## 1        18.0       10.4   123.    1001   0.118   0.278   0.300   0.147    0.242
## 2        20.6       17.8   133.    1326   0.0847  0.0786  0.0869  0.0702   0.181
## 3        19.7       21.2   130     1203   0.110   0.160   0.197   0.128    0.207
## 4        11.4       20.4    77.6    386.  0.142   0.284   0.241   0.105    0.260
## 5        20.3       14.3   135.    1297   0.100   0.133   0.198   0.104    0.181
## 6        12.4       15.7    82.6    477.  0.128   0.17    0.158   0.0809   0.209
## # … with 2 more variables: fractal_dimension_mean <dbl>, bc <dbl>, and
## #   abbreviated variable names ¹​texture_mean, ²​perimeter_mean, ³​area_mean,
## #   ⁴​smoothness_mean, ⁵​compactness_mean, ⁶​concavity_mean, ⁷​concave_points_mean,
## #   ⁸​symmetry_mean
dim(wdbc)
## [1] 569  11
table(wdbc$bc)
## 
##   0   1 
## 357 212
# Checking missing values
library(mice)
## Warning: package 'mice' was built under R version 4.2.2
## 
## Attaching package: 'mice'
## The following object is masked from 'package:stats':
## 
##     filter
## The following objects are masked from 'package:base':
## 
##     cbind, rbind
missmap(wdbc)
## Warning: Unknown or uninitialised column: `arguments`.
## Warning: Unknown or uninitialised column: `arguments`.
## Warning: Unknown or uninitialised column: `imputations`.

# Checking bivariate relationships among variables
ggpairs(wdbc)

library(dplyr)
# Standardization
scale2 <- function(x, na.rm = FALSE) (x - mean(x, na.rm = na.rm)) / sd(x, na.rm)
wdbc <- wdbc %>%
  mutate_at(vars(-bc), scale2)
# Provide a seed number for reproducibility
set.seed(34859)

nr <- nrow(wdbc)
tr.id <- sample(nr, floor(0.7*nr), replace=FALSE, prob=NULL)

tr.dat <- wdbc[tr.id,]
ts.dat <- wdbc[-tr.id,]

tr.x <- tr.dat %>% dplyr::select(-bc)

ts.x <- ts.dat %>% dplyr::select(-bc)
ts.y <- ts.dat %>% dplyr::select(bc)  

R Markdown

This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.

When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:

summary(cars)
##      speed           dist       
##  Min.   : 4.0   Min.   :  2.00  
##  1st Qu.:12.0   1st Qu.: 26.00  
##  Median :15.0   Median : 36.00  
##  Mean   :15.4   Mean   : 42.98  
##  3rd Qu.:19.0   3rd Qu.: 56.00  
##  Max.   :25.0   Max.   :120.00

Including Plots

You can also embed plots, for example:

Note that the echo = FALSE parameter was added to the code chunk to prevent printing of the R code that generated the plot.