# Load package 'caret'
library(caret)
## Warning: package 'caret' was built under R version 4.2.2
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 4.2.2
## Loading required package: lattice
library(e1071)
## Warning: package 'e1071' was built under R version 4.2.2
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.2.2
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ tibble 3.1.8 ✔ dplyr 1.0.10
## ✔ tidyr 1.2.1 ✔ stringr 1.4.1
## ✔ readr 2.1.3 ✔ forcats 0.5.2
## ✔ purrr 0.3.5
## Warning: package 'dplyr' was built under R version 4.2.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ✖ purrr::lift() masks caret::lift()
library(psych)
## Warning: package 'psych' was built under R version 4.2.2
##
## Attaching package: 'psych'
## The following objects are masked from 'package:ggplot2':
##
## %+%, alpha
library(Amelia)
## Warning: package 'Amelia' was built under R version 4.2.2
## Loading required package: Rcpp
## ##
## ## Amelia II: Multiple Imputation
## ## (Version 1.8.1, built: 2022-11-18)
## ## Copyright (C) 2005-2022 James Honaker, Gary King and Matthew Blackwell
## ## Refer to http://gking.harvard.edu/amelia/ for more information
## ##
library(ggplot2)
library(GGally)
## Warning: package 'GGally' was built under R version 4.2.2
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
library(readr)
wdbc <- read_csv("https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/wdbc.data",
col_names = FALSE)
## Rows: 569 Columns: 32
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): X2
## dbl (31): X1, X3, X4, X5, X6, X7, X8, X9, X10, X11, X12, X13, X14, X15, X16,...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
names(wdbc) <- c('id_number', 'diagnosis', 'radius_mean',
'texture_mean', 'perimeter_mean', 'area_mean',
'smoothness_mean', 'compactness_mean',
'concavity_mean','concave_points_mean',
'symmetry_mean', 'fractal_dimension_mean',
'radius_se', 'texture_se', 'perimeter_se',
'area_se', 'smoothness_se', 'compactness_se',
'concavity_se', 'concave_points_se',
'symmetry_se', 'fractal_dimension_se',
'radius_worst', 'texture_worst',
'perimeter_worst', 'area_worst',
'smoothness_worst', 'compactness_worst',
'concavity_worst', 'concave_points_worst',
'symmetry_worst', 'fractal_dimension_worst')
glimpse(wdbc)
## Rows: 569
## Columns: 32
## $ id_number <dbl> 842302, 842517, 84300903, 84348301, 84358402, …
## $ diagnosis <chr> "M", "M", "M", "M", "M", "M", "M", "M", "M", "…
## $ radius_mean <dbl> 17.990, 20.570, 19.690, 11.420, 20.290, 12.450…
## $ texture_mean <dbl> 10.38, 17.77, 21.25, 20.38, 14.34, 15.70, 19.9…
## $ perimeter_mean <dbl> 122.80, 132.90, 130.00, 77.58, 135.10, 82.57, …
## $ area_mean <dbl> 1001.0, 1326.0, 1203.0, 386.1, 1297.0, 477.1, …
## $ smoothness_mean <dbl> 0.11840, 0.08474, 0.10960, 0.14250, 0.10030, 0…
## $ compactness_mean <dbl> 0.27760, 0.07864, 0.15990, 0.28390, 0.13280, 0…
## $ concavity_mean <dbl> 0.30010, 0.08690, 0.19740, 0.24140, 0.19800, 0…
## $ concave_points_mean <dbl> 0.14710, 0.07017, 0.12790, 0.10520, 0.10430, 0…
## $ symmetry_mean <dbl> 0.2419, 0.1812, 0.2069, 0.2597, 0.1809, 0.2087…
## $ fractal_dimension_mean <dbl> 0.07871, 0.05667, 0.05999, 0.09744, 0.05883, 0…
## $ radius_se <dbl> 1.0950, 0.5435, 0.7456, 0.4956, 0.7572, 0.3345…
## $ texture_se <dbl> 0.9053, 0.7339, 0.7869, 1.1560, 0.7813, 0.8902…
## $ perimeter_se <dbl> 8.589, 3.398, 4.585, 3.445, 5.438, 2.217, 3.18…
## $ area_se <dbl> 153.40, 74.08, 94.03, 27.23, 94.44, 27.19, 53.…
## $ smoothness_se <dbl> 0.006399, 0.005225, 0.006150, 0.009110, 0.0114…
## $ compactness_se <dbl> 0.049040, 0.013080, 0.040060, 0.074580, 0.0246…
## $ concavity_se <dbl> 0.05373, 0.01860, 0.03832, 0.05661, 0.05688, 0…
## $ concave_points_se <dbl> 0.015870, 0.013400, 0.020580, 0.018670, 0.0188…
## $ symmetry_se <dbl> 0.03003, 0.01389, 0.02250, 0.05963, 0.01756, 0…
## $ fractal_dimension_se <dbl> 0.006193, 0.003532, 0.004571, 0.009208, 0.0051…
## $ radius_worst <dbl> 25.38, 24.99, 23.57, 14.91, 22.54, 15.47, 22.8…
## $ texture_worst <dbl> 17.33, 23.41, 25.53, 26.50, 16.67, 23.75, 27.6…
## $ perimeter_worst <dbl> 184.60, 158.80, 152.50, 98.87, 152.20, 103.40,…
## $ area_worst <dbl> 2019.0, 1956.0, 1709.0, 567.7, 1575.0, 741.6, …
## $ smoothness_worst <dbl> 0.1622, 0.1238, 0.1444, 0.2098, 0.1374, 0.1791…
## $ compactness_worst <dbl> 0.6656, 0.1866, 0.4245, 0.8663, 0.2050, 0.5249…
## $ concavity_worst <dbl> 0.71190, 0.24160, 0.45040, 0.68690, 0.40000, 0…
## $ concave_points_worst <dbl> 0.26540, 0.18600, 0.24300, 0.25750, 0.16250, 0…
## $ symmetry_worst <dbl> 0.4601, 0.2750, 0.3613, 0.6638, 0.2364, 0.3985…
## $ fractal_dimension_worst <dbl> 0.11890, 0.08902, 0.08758, 0.17300, 0.07678, 0…
#Check dimensions of the split
prop.table(table(wdbc$diagnosis)) * 100
##
## B M
## 62.74165 37.25835
# Removing the first column (id_number) because it is not a real predictor.
# Keep the response variable ( diagnosis) and the predictors containing 'mean'
# Checking the values (B/M) of the response variable and convert to 0/1
wdbc <- wdbc %>%
dplyr::select(diagnosis, contains("mean"))%>%
mutate(bc = ifelse(diagnosis=="M", 1, ifelse(diagnosis=="B",0, NA))) %>%
dplyr::select(-diagnosis)
head(wdbc)
## # A tibble: 6 × 11
## radius_mean texture_…¹ perim…² area_…³ smoot…⁴ compa…⁵ conca…⁶ conca…⁷ symme…⁸
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 18.0 10.4 123. 1001 0.118 0.278 0.300 0.147 0.242
## 2 20.6 17.8 133. 1326 0.0847 0.0786 0.0869 0.0702 0.181
## 3 19.7 21.2 130 1203 0.110 0.160 0.197 0.128 0.207
## 4 11.4 20.4 77.6 386. 0.142 0.284 0.241 0.105 0.260
## 5 20.3 14.3 135. 1297 0.100 0.133 0.198 0.104 0.181
## 6 12.4 15.7 82.6 477. 0.128 0.17 0.158 0.0809 0.209
## # … with 2 more variables: fractal_dimension_mean <dbl>, bc <dbl>, and
## # abbreviated variable names ¹texture_mean, ²perimeter_mean, ³area_mean,
## # ⁴smoothness_mean, ⁵compactness_mean, ⁶concavity_mean, ⁷concave_points_mean,
## # ⁸symmetry_mean
dim(wdbc)
## [1] 569 11
table(wdbc$bc)
##
## 0 1
## 357 212
# Checking missing values
library(mice)
## Warning: package 'mice' was built under R version 4.2.2
##
## Attaching package: 'mice'
## The following object is masked from 'package:stats':
##
## filter
## The following objects are masked from 'package:base':
##
## cbind, rbind
missmap(wdbc)
## Warning: Unknown or uninitialised column: `arguments`.
## Warning: Unknown or uninitialised column: `arguments`.
## Warning: Unknown or uninitialised column: `imputations`.
# Checking bivariate relationships among variables
ggpairs(wdbc)
library(dplyr)
# Standardization
scale2 <- function(x, na.rm = FALSE) (x - mean(x, na.rm = na.rm)) / sd(x, na.rm)
wdbc <- wdbc %>%
mutate_at(vars(-bc), scale2)
# Provide a seed number for reproducibility
set.seed(34859)
nr <- nrow(wdbc)
tr.id <- sample(nr, floor(0.7*nr), replace=FALSE, prob=NULL)
tr.dat <- wdbc[tr.id,]
ts.dat <- wdbc[-tr.id,]
tr.x <- tr.dat %>% dplyr::select(-bc)
ts.x <- ts.dat %>% dplyr::select(-bc)
ts.y <- ts.dat %>% dplyr::select(bc)
This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.
When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:
summary(cars)
## speed dist
## Min. : 4.0 Min. : 2.00
## 1st Qu.:12.0 1st Qu.: 26.00
## Median :15.0 Median : 36.00
## Mean :15.4 Mean : 42.98
## 3rd Qu.:19.0 3rd Qu.: 56.00
## Max. :25.0 Max. :120.00
You can also embed plots, for example:
Note that the echo = FALSE parameter was added to the
code chunk to prevent printing of the R code that generated the
plot.