# Import the Required Libraries for Data Analysis and Model Building
# For data manipulation
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(readr)
library(stringr)
# For Statistics
library(stats)
# For data visualization
library(ggplot2)
# For splitting data into training and testing sets
library(caTools)
# For feature scaling
library(caret)
## Loading required package: lattice
# Decision Tree
library(rpart)
# Random Forest
library(randomForest)
## randomForest 4.7-1.1
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:ggplot2':
##
## margin
## The following object is masked from 'package:dplyr':
##
## combine
# For model evaluation
library(caret)
# Load the breast-cancer data from "wisconsin.csv" (looked up relative to the
# working directory); the first line of the file supplies the column names.
df <- read.csv(file = "wisconsin.csv", header = TRUE)
# Preview the first 10 observations
head(df, n = 10)
## Cl.thickness Cell.size Cell.shape Marg.adhesion Epith.c.size Bare.nuclei
## 1 5 1 1 1 2 1
## 2 5 4 4 5 7 10
## 3 3 1 1 1 2 2
## 4 6 8 8 1 3 4
## 5 4 1 1 3 2 1
## 6 8 10 10 8 7 10
## 7 1 1 1 1 2 10
## 8 2 1 2 1 2 1
## 9 2 1 1 1 2 1
## 10 4 2 1 1 2 1
## Bl.cromatin Normal.nucleoli Mitoses Class
## 1 3 1 1 benign
## 2 3 2 1 benign
## 3 3 1 1 benign
## 4 3 7 1 benign
## 5 3 1 1 benign
## 6 9 7 1 malignant
## 7 3 1 1 benign
## 8 3 1 1 benign
## 9 1 1 5 benign
## 10 2 1 1 benign
# Preview the last 10 observations
tail(df, n = 10)
## Cl.thickness Cell.size Cell.shape Marg.adhesion Epith.c.size Bare.nuclei
## 690 1 1 1 1 2 1
## 691 1 1 1 3 2 1
## 692 5 10 10 5 4 5
## 693 3 1 1 1 2 1
## 694 3 1 1 1 2 1
## 695 3 1 1 1 3 2
## 696 2 1 1 1 2 1
## 697 5 10 10 3 7 3
## 698 4 8 6 4 3 4
## 699 4 8 8 5 4 5
## Bl.cromatin Normal.nucleoli Mitoses Class
## 690 1 1 8 benign
## 691 1 1 1 benign
## 692 4 4 1 malignant
## 693 1 1 1 benign
## 694 2 1 2 benign
## 695 1 1 1 benign
## 696 1 1 1 benign
## 697 8 10 2 malignant
## 698 10 6 1 malignant
## 699 10 4 1 malignant
# Report the dimensions of the dataset (observations x variables)
num_rows <- dim(df)[1L]
num_columns <- dim(df)[2L]
cat(
  "There are", num_rows, "rows and", num_columns,
  "columns present in our dataset.\n"
)
## There are 699 rows and 10 columns present in our dataset.
# Compact overview of the data frame: one line per column showing its type
# and its first few values
str(object = df)
## 'data.frame': 699 obs. of 10 variables:
## $ Cl.thickness : int 5 5 3 6 4 8 1 2 2 4 ...
## $ Cell.size : int 1 4 1 8 1 10 1 1 1 2 ...
## $ Cell.shape : int 1 4 1 8 1 10 1 2 1 1 ...
## $ Marg.adhesion : int 1 5 1 1 3 8 1 1 1 1 ...
## $ Epith.c.size : int 2 7 2 3 2 7 2 2 2 2 ...
## $ Bare.nuclei : int 1 10 2 4 1 10 10 1 1 1 ...
## $ Bl.cromatin : int 3 3 3 3 3 9 3 3 1 2 ...
## $ Normal.nucleoli: int 1 2 1 7 1 7 1 1 1 1 ...
## $ Mitoses : int 1 1 1 1 1 1 1 1 5 1 ...
## $ Class : chr "benign" "benign" "benign" "benign" ...
# Count the missing values (NA) in every column
vapply(df, function(column) sum(is.na(column)), numeric(1))
## Cl.thickness Cell.size Cell.shape Marg.adhesion Epith.c.size
## 0 0 0 0 0
## Bare.nuclei Bl.cromatin Normal.nucleoli Mitoses Class
## 16 0 0 0 0
# Drop every observation that has at least one missing value
df <- na.omit(object = df)
# Confirm that no missing values remain in any column
colSums(x = is.na(df))
## Cl.thickness Cell.size Cell.shape Marg.adhesion Epith.c.size
## 0 0 0 0 0
## Bare.nuclei Bl.cromatin Normal.nucleoli Mitoses Class
## 0 0 0 0 0
# Inspect the class (data type) of every column
vapply(df, class, character(1))
## Cl.thickness Cell.size Cell.shape Marg.adhesion Epith.c.size
## "integer" "integer" "integer" "integer" "integer"
## Bare.nuclei Bl.cromatin Normal.nucleoli Mitoses Class
## "integer" "integer" "integer" "integer" "character"
# Turn the 'Class' label into a factor so it is treated as categorical
df[["Class"]] <- as.factor(df[["Class"]])
# Inspect the column classes again to confirm the conversion
vapply(df, class, character(1))
## Cl.thickness Cell.size Cell.shape Marg.adhesion Epith.c.size
## "integer" "integer" "integer" "integer" "integer"
## Bare.nuclei Bl.cromatin Normal.nucleoli Mitoses Class
## "integer" "integer" "integer" "integer" "factor"
# Summary of every column: quartiles and mean for the numeric columns,
# level counts for the factor column 'Class'
summary(object = df)
## Cl.thickness Cell.size Cell.shape Marg.adhesion
## Min. : 1.000 Min. : 1.000 Min. : 1.000 Min. : 1.00
## 1st Qu.: 2.000 1st Qu.: 1.000 1st Qu.: 1.000 1st Qu.: 1.00
## Median : 4.000 Median : 1.000 Median : 1.000 Median : 1.00
## Mean : 4.442 Mean : 3.151 Mean : 3.215 Mean : 2.83
## 3rd Qu.: 6.000 3rd Qu.: 5.000 3rd Qu.: 5.000 3rd Qu.: 4.00
## Max. :10.000 Max. :10.000 Max. :10.000 Max. :10.00
## Epith.c.size Bare.nuclei Bl.cromatin Normal.nucleoli
## Min. : 1.000 Min. : 1.000 Min. : 1.000 Min. : 1.00
## 1st Qu.: 2.000 1st Qu.: 1.000 1st Qu.: 2.000 1st Qu.: 1.00
## Median : 2.000 Median : 1.000 Median : 3.000 Median : 1.00
## Mean : 3.234 Mean : 3.545 Mean : 3.445 Mean : 2.87
## 3rd Qu.: 4.000 3rd Qu.: 6.000 3rd Qu.: 5.000 3rd Qu.: 4.00
## Max. :10.000 Max. :10.000 Max. :10.000 Max. :10.00
## Mitoses Class
## Min. : 1.000 benign :444
## 1st Qu.: 1.000 malignant:239
## Median : 1.000
## Mean : 1.603
## 3rd Qu.: 1.000
## Max. :10.000
# Pairwise correlation matrix, restricted to the numeric columns
cor(x = df[, vapply(df, is.numeric, logical(1))])
## Cl.thickness Cell.size Cell.shape Marg.adhesion Epith.c.size
## Cl.thickness 1.0000000 0.6424815 0.6534700 0.4878287 0.5235960
## Cell.size 0.6424815 1.0000000 0.9072282 0.7069770 0.7535440
## Cell.shape 0.6534700 0.9072282 1.0000000 0.6859481 0.7224624
## Marg.adhesion 0.4878287 0.7069770 0.6859481 1.0000000 0.5945478
## Epith.c.size 0.5235960 0.7535440 0.7224624 0.5945478 1.0000000
## Bare.nuclei 0.5930914 0.6917088 0.7138775 0.6706483 0.5857161
## Bl.cromatin 0.5537424 0.7555592 0.7353435 0.6685671 0.6181279
## Normal.nucleoli 0.5340659 0.7193460 0.7179634 0.6031211 0.6289264
## Mitoses 0.3509572 0.4607547 0.4412576 0.4188983 0.4805833
## Bare.nuclei Bl.cromatin Normal.nucleoli Mitoses
## Cl.thickness 0.5930914 0.5537424 0.5340659 0.3509572
## Cell.size 0.6917088 0.7555592 0.7193460 0.4607547
## Cell.shape 0.7138775 0.7353435 0.7179634 0.4412576
## Marg.adhesion 0.6706483 0.6685671 0.6031211 0.4188983
## Epith.c.size 0.5857161 0.6181279 0.6289264 0.4805833
## Bare.nuclei 1.0000000 0.6806149 0.5842802 0.3392104
## Bl.cromatin 0.6806149 1.0000000 0.6656015 0.3460109
## Normal.nucleoli 0.5842802 0.6656015 1.0000000 0.4337573
## Mitoses 0.3392104 0.3460109 0.4337573 1.0000000
# Number of observations in each level of 'Class'
table(df[["Class"]])
##
## benign malignant
## 444 239
# Scatter-plot matrix of the nine predictors; points are coloured by 'Class'
pairs(
  x = df[, c(
    "Cl.thickness", "Cell.size", "Cell.shape", "Marg.adhesion",
    "Epith.c.size", "Bare.nuclei", "Bl.cromatin", "Normal.nucleoli",
    "Mitoses"
  )],
  col = df[["Class"]]
)

# Bar chart of the number of observations in each diagnosis class
ggplot(df, aes(x = Class, fill = Class)) +
  geom_bar() +
  # Name the colours so each one stays tied to its class level
  scale_fill_manual(values = c(benign = "green", malignant = "red")) +
  labs(
    x = "Class",
    y = "Count",
    title = "Visualization of Patients with Cancer Diagnosis or Not Diagnosis"
  ) +
  # theme_minimal() is a complete theme and resets every theme element, so it
  # has to come first; the axis-text tweak is then layered on top of it.
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))

# Heatmap of the correlation matrix of the numeric columns. Row and column
# dendrograms are turned off (Rowv/Colv = NA) and the values are drawn
# without rescaling, on a blue-white-red palette of 100 colours.
heatmap(
  x = cor(df[, vapply(df, is.numeric, logical(1))]),
  Rowv = NA,
  Colv = NA,
  symm = TRUE,
  scale = "none",
  col = colorRampPalette(colors = c("blue", "white", "red"))(n = 100),
  margins = c(10, 10),
  main = "Heatmap of Correlation Matrix of Breast Cancer Data",
  xlab = "Variables",
  ylab = "Variables",
  cex.main = 1.2,
  cex.axis = 1.2,
  cex.lab = 1.2
)

# Predictors: every column except the 'Class' label
X <- df[, names(df) != "Class"]
# Preview the first five rows of the predictors
head(X, n = 5)
## Cl.thickness Cell.size Cell.shape Marg.adhesion Epith.c.size Bare.nuclei
## 1 5 1 1 1 2 1
## 2 5 4 4 5 7 10
## 3 3 1 1 1 2 2
## 4 6 8 8 1 3 4
## 5 4 1 1 3 2 1
## Bl.cromatin Normal.nucleoli Mitoses
## 1 3 1 1
## 2 3 2 1
## 3 3 1 1
## 4 3 7 1
## 5 3 1 1
# Response: the 'Class' label
y <- df[["Class"]]
# Preview the first five values of the response
head(y, n = 5)
## [1] benign benign benign benign benign
## Levels: benign malignant
# Fix the random seed so the split is reproducible
set.seed(seed = 5)
# Logical mask over the observations, requested with an 80/20 split ratio;
# TRUE marks the training rows
split <- sample.split(Y = y, SplitRatio = 0.8)
# Partition the predictors and the response with the mask
X_train <- X[split, ]
y_train <- y[split]
X_test <- X[!split, ]
y_test <- y[!split]
# Standardise the predictors (centre each column and divide by its standard
# deviation). The centring and scaling values are estimated on the training
# set only and reused for the test set, so both sets share one scale and no
# test-set statistics enter the preprocessing.
X_train_sc <- scale(X_train)
X_test_sc <- scale(
  X_test,
  center = attr(X_train_sc, "scaled:center"),
  scale = attr(X_train_sc, "scaled:scale")
)
# Assemble the training frame: predictors plus the label column 'y_train'
train_data <- cbind(X_train, y_train = y_train)
# Fit a classification tree of the label on all predictors
dt_classifier <- rpart(
  formula = y_train ~ .,
  data = train_data,
  method = "class"
)
# Predicted class labels for the test set
y_pred_dt <- predict(
  object = dt_classifier,
  newdata = cbind(X_test, y_test = y_test),
  type = "class"
)
# Accuracy: share of test observations whose prediction matches the label
accuracy_dt <- mean(y_test == y_pred_dt)
accuracy_dt
## [1] 0.9489051
# Calculate Decision Tree precision for the positive ("malignant") class:
#   precision = true positives / all predicted positives
# The predictions and labels are factors with levels "benign"/"malignant",
# so they are compared against the level name. Comparing them with the
# number 1 never matches and always produced a precision of 0.
dt_pred_positive <- y_pred_dt == "malignant"
if (any(dt_pred_positive)) {
  precision_dt <- sum(dt_pred_positive & y_test == "malignant") /
    sum(dt_pred_positive)
} else {
  precision_dt <- 0 # Set precision to 0 if there are no positive predictions
}
precision_dt
# Fit a random forest of 100 trees on the training frame, keeping the
# variable-importance measures
rf_classifier <- randomForest(
  y_train ~ .,
  data = train_data,
  ntree = 100,
  importance = TRUE
)
# Predicted class labels for the test set
y_pred_rf <- predict(
  object = rf_classifier,
  newdata = cbind(X_test, y_test = y_test)
)
# Accuracy: share of test observations whose prediction matches the label
accuracy_rf <- mean(y_test == y_pred_rf)
accuracy_rf
## [1] 0.9854015
# Calculate Random Forest precision for the positive ("malignant") class:
#   precision = true positives / all predicted positives
# The predictions and labels are factors with levels "benign"/"malignant",
# so they are compared against the level name. Comparing them with the
# number 1 never matches and always produced a precision of 0.
rf_pred_positive <- y_pred_rf == "malignant"
if (any(rf_pred_positive)) {
  precision_rf <- sum(rf_pred_positive & y_test == "malignant") /
    sum(rf_pred_positive)
} else {
  precision_rf <- 0 # Set precision to 0 if there are no positive predictions
}
precision_rf