# Import the Required Libraries for Data Analysis and Model Building

# For data manipulation
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(readr)
library(stringr) 

# For Statistics
library(stats)

# For data visualization
library(ggplot2)

# For splitting data into training and testing sets
library(caTools)

# For feature scaling
library(caret)
## Loading required package: lattice
# Decision Tree
library(rpart)

# Random Forest
library(randomForest)
## randomForest 4.7-1.1
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## The following object is masked from 'package:ggplot2':
## 
##     margin
## The following object is masked from 'package:dplyr':
## 
##     combine
# For model evaluation
library(caret)

# Import the dataset from CSV; the file's first line supplies the column names
df <- read.csv(file = "wisconsin.csv", header = TRUE)

# Preview the top 10 observations
head(df, n = 10)
##    Cl.thickness Cell.size Cell.shape Marg.adhesion Epith.c.size Bare.nuclei
## 1             5         1          1             1            2           1
## 2             5         4          4             5            7          10
## 3             3         1          1             1            2           2
## 4             6         8          8             1            3           4
## 5             4         1          1             3            2           1
## 6             8        10         10             8            7          10
## 7             1         1          1             1            2          10
## 8             2         1          2             1            2           1
## 9             2         1          1             1            2           1
## 10            4         2          1             1            2           1
##    Bl.cromatin Normal.nucleoli Mitoses     Class
## 1            3               1       1    benign
## 2            3               2       1    benign
## 3            3               1       1    benign
## 4            3               7       1    benign
## 5            3               1       1    benign
## 6            9               7       1 malignant
## 7            3               1       1    benign
## 8            3               1       1    benign
## 9            1               1       5    benign
## 10           2               1       1    benign
# Preview the bottom 10 observations
tail(df, n = 10)
##     Cl.thickness Cell.size Cell.shape Marg.adhesion Epith.c.size Bare.nuclei
## 690            1         1          1             1            2           1
## 691            1         1          1             3            2           1
## 692            5        10         10             5            4           5
## 693            3         1          1             1            2           1
## 694            3         1          1             1            2           1
## 695            3         1          1             1            3           2
## 696            2         1          1             1            2           1
## 697            5        10         10             3            7           3
## 698            4         8          6             4            3           4
## 699            4         8          8             5            4           5
##     Bl.cromatin Normal.nucleoli Mitoses     Class
## 690           1               1       8    benign
## 691           1               1       1    benign
## 692           4               4       1 malignant
## 693           1               1       1    benign
## 694           2               1       2    benign
## 695           1               1       1    benign
## 696           1               1       1    benign
## 697           8              10       2 malignant
## 698          10               6       1 malignant
## 699          10               4       1 malignant
# Report the dataset dimensions (dim() returns c(rows, columns))
num_rows <- dim(df)[1]
num_columns <- dim(df)[2]
cat("There are", num_rows, "rows and", num_columns, "columns present in our dataset.\n")
## There are 699 rows and 10 columns present in our dataset.
# Compact structural overview: column names, storage types and leading values
str(object = df)
## 'data.frame':    699 obs. of  10 variables:
##  $ Cl.thickness   : int  5 5 3 6 4 8 1 2 2 4 ...
##  $ Cell.size      : int  1 4 1 8 1 10 1 1 1 2 ...
##  $ Cell.shape     : int  1 4 1 8 1 10 1 2 1 1 ...
##  $ Marg.adhesion  : int  1 5 1 1 3 8 1 1 1 1 ...
##  $ Epith.c.size   : int  2 7 2 3 2 7 2 2 2 2 ...
##  $ Bare.nuclei    : int  1 10 2 4 1 10 10 1 1 1 ...
##  $ Bl.cromatin    : int  3 3 3 3 3 9 3 3 1 2 ...
##  $ Normal.nucleoli: int  1 2 1 7 1 7 1 1 1 1 ...
##  $ Mitoses        : int  1 1 1 1 1 1 1 1 5 1 ...
##  $ Class          : chr  "benign" "benign" "benign" "benign" ...
# Count the NA entries in each column
colSums(x = is.na(df))
##    Cl.thickness       Cell.size      Cell.shape   Marg.adhesion    Epith.c.size 
##               0               0               0               0               0 
##     Bare.nuclei     Bl.cromatin Normal.nucleoli         Mitoses           Class 
##              16               0               0               0               0
# Drop every observation that contains at least one NA
df <- na.omit(object = df)

# Confirm that no missing values remain in any column
colSums(x = is.na(df))
##    Cl.thickness       Cell.size      Cell.shape   Marg.adhesion    Epith.c.size 
##               0               0               0               0               0 
##     Bare.nuclei     Bl.cromatin Normal.nucleoli         Mitoses           Class 
##               0               0               0               0               0
# Inspect the class of every column
sapply(X = df, FUN = class)
##    Cl.thickness       Cell.size      Cell.shape   Marg.adhesion    Epith.c.size 
##       "integer"       "integer"       "integer"       "integer"       "integer" 
##     Bare.nuclei     Bl.cromatin Normal.nucleoli         Mitoses           Class 
##       "integer"       "integer"       "integer"       "integer"     "character"
# Store 'Class' as a factor so it is handled as a categorical outcome
df[["Class"]] <- as.factor(df[["Class"]])


# Re-inspect the column classes after the conversion
sapply(X = df, FUN = class)
##    Cl.thickness       Cell.size      Cell.shape   Marg.adhesion    Epith.c.size 
##       "integer"       "integer"       "integer"       "integer"       "integer" 
##     Bare.nuclei     Bl.cromatin Normal.nucleoli         Mitoses           Class 
##       "integer"       "integer"       "integer"       "integer"        "factor"
# Per-column summaries: quantiles and mean for numeric columns, level counts for factors
summary(object = df)
##   Cl.thickness      Cell.size        Cell.shape     Marg.adhesion  
##  Min.   : 1.000   Min.   : 1.000   Min.   : 1.000   Min.   : 1.00  
##  1st Qu.: 2.000   1st Qu.: 1.000   1st Qu.: 1.000   1st Qu.: 1.00  
##  Median : 4.000   Median : 1.000   Median : 1.000   Median : 1.00  
##  Mean   : 4.442   Mean   : 3.151   Mean   : 3.215   Mean   : 2.83  
##  3rd Qu.: 6.000   3rd Qu.: 5.000   3rd Qu.: 5.000   3rd Qu.: 4.00  
##  Max.   :10.000   Max.   :10.000   Max.   :10.000   Max.   :10.00  
##   Epith.c.size     Bare.nuclei      Bl.cromatin     Normal.nucleoli
##  Min.   : 1.000   Min.   : 1.000   Min.   : 1.000   Min.   : 1.00  
##  1st Qu.: 2.000   1st Qu.: 1.000   1st Qu.: 2.000   1st Qu.: 1.00  
##  Median : 2.000   Median : 1.000   Median : 3.000   Median : 1.00  
##  Mean   : 3.234   Mean   : 3.545   Mean   : 3.445   Mean   : 2.87  
##  3rd Qu.: 4.000   3rd Qu.: 6.000   3rd Qu.: 5.000   3rd Qu.: 4.00  
##  Max.   :10.000   Max.   :10.000   Max.   :10.000   Max.   :10.00  
##     Mitoses             Class    
##  Min.   : 1.000   benign   :444  
##  1st Qu.: 1.000   malignant:239  
##  Median : 1.000                  
##  Mean   : 1.603                  
##  3rd Qu.: 1.000                  
##  Max.   :10.000
# Pairwise correlations, restricted to the numeric columns
# (vapply pins the selector to exactly one logical flag per column)
cor(x = df[, vapply(df, is.numeric, logical(1))])
##                 Cl.thickness Cell.size Cell.shape Marg.adhesion Epith.c.size
## Cl.thickness       1.0000000 0.6424815  0.6534700     0.4878287    0.5235960
## Cell.size          0.6424815 1.0000000  0.9072282     0.7069770    0.7535440
## Cell.shape         0.6534700 0.9072282  1.0000000     0.6859481    0.7224624
## Marg.adhesion      0.4878287 0.7069770  0.6859481     1.0000000    0.5945478
## Epith.c.size       0.5235960 0.7535440  0.7224624     0.5945478    1.0000000
## Bare.nuclei        0.5930914 0.6917088  0.7138775     0.6706483    0.5857161
## Bl.cromatin        0.5537424 0.7555592  0.7353435     0.6685671    0.6181279
## Normal.nucleoli    0.5340659 0.7193460  0.7179634     0.6031211    0.6289264
## Mitoses            0.3509572 0.4607547  0.4412576     0.4188983    0.4805833
##                 Bare.nuclei Bl.cromatin Normal.nucleoli   Mitoses
## Cl.thickness      0.5930914   0.5537424       0.5340659 0.3509572
## Cell.size         0.6917088   0.7555592       0.7193460 0.4607547
## Cell.shape        0.7138775   0.7353435       0.7179634 0.4412576
## Marg.adhesion     0.6706483   0.6685671       0.6031211 0.4188983
## Epith.c.size      0.5857161   0.6181279       0.6289264 0.4805833
## Bare.nuclei       1.0000000   0.6806149       0.5842802 0.3392104
## Bl.cromatin       0.6806149   1.0000000       0.6656015 0.3460109
## Normal.nucleoli   0.5842802   0.6656015       1.0000000 0.4337573
## Mitoses           0.3392104   0.3460109       0.4337573 1.0000000
# Frequency of each level of the 'Class' column
table(df[["Class"]])
## 
##    benign malignant 
##       444       239
# Scatter-plot matrix of the nine feature columns; point colour follows 'Class'
pairs(
  x = df[, c(
    "Cl.thickness", "Cell.size", "Cell.shape",
    "Marg.adhesion", "Epith.c.size", "Bare.nuclei",
    "Bl.cromatin", "Normal.nucleoli", "Mitoses"
  )],
  col = df[["Class"]]
)

# Create count plot: one bar per diagnosis class, bar height = number of rows.
# Fill colours are keyed by level name so they cannot be swapped if the level
# order ever changes.
# theme_minimal() is a complete theme and replaces every theme setting added
# before it, so it must come BEFORE the theme() tweak; in the opposite order
# the axis-text rotation is silently discarded.
ggplot(df, aes(x = Class, fill = Class)) +
  geom_bar() +
  scale_fill_manual(values = c(benign = "green", malignant = "red")) +
  labs(x = "Class", y = "Count", title = "Visualization of Patients with Cancer Diagnosis or Not Diagnosis") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))

# Draw the correlation matrix of the numeric columns as a heatmap.
# Rowv = NA / Colv = NA: no dendrograms and no reordering of rows or columns.
# scale = "none": plot the raw correlation values without row/column rescaling.
# Colours come from a 100-step blue -> white -> red ramp.
heatmap(
  x = cor(df[, vapply(df, is.numeric, logical(1))]),
  Rowv = NA,
  Colv = NA,
  symm = TRUE,
  scale = "none",
  margins = c(10, 10),
  main = "Heatmap of Correlation Matrix of Breast Cancer Data",
  xlab = "Variables",
  ylab = "Variables",
  col = colorRampPalette(c("blue", "white", "red"))(100),
  cex.main = 1.2,
  cex.axis = 1.2,
  cex.lab = 1.2
)

# Feature frame: every column of df except the 'Class' label
X <- df[, names(df) != "Class"]

# Preview the first five feature rows
head(X, n = 5)
##   Cl.thickness Cell.size Cell.shape Marg.adhesion Epith.c.size Bare.nuclei
## 1            5         1          1             1            2           1
## 2            5         4          4             5            7          10
## 3            3         1          1             1            2           2
## 4            6         8          8             1            3           4
## 5            4         1          1             3            2           1
##   Bl.cromatin Normal.nucleoli Mitoses
## 1           3               1       1
## 2           3               2       1
## 3           3               1       1
## 4           3               7       1
## 5           3               1       1
# Target vector: the 'Class' label of every observation
y <- df[["Class"]]

# Preview the first five labels
head(y, n = 5)
## [1] benign benign benign benign benign
## Levels: benign malignant
# Fix the RNG state so the random split below is repeatable
set.seed(5)

# Logical mask from caTools: TRUE marks observations assigned to training
# (SplitRatio = 0.8 requests an 80/20 split)
split <- sample.split(Y = y, SplitRatio = 0.8)

# Partition features and labels with the same mask so rows stay aligned
X_train <- X[split == TRUE, ]
X_test <- X[split == FALSE, ]
y_train <- y[split == TRUE]
y_test <- y[split == FALSE]


# Standardise the input variables (zero mean, unit standard deviation).
# The centring and scaling statistics are estimated on the TRAINING set only
# and then reused for the test set. Scaling the test set with its own mean/sd
# would put the two sets on different scales and leak test-set statistics.
# scale() stores the statistics it used in the "scaled:center" and
# "scaled:scale" attributes of its result.
X_train_sc <- scale(X_train)
X_test_sc <- scale(
  X_test,
  center = attr(X_train_sc, "scaled:center"),
  scale = attr(X_train_sc, "scaled:scale")
)


# Attach the training labels to the training features; the label column is
# named after the variable, i.e. `y_train`
train_data <- cbind(X_train, y_train)

# Fit a classification tree, using every other column as a predictor
dt_classifier <- rpart(formula = y_train ~ ., data = train_data, method = "class")

# Predicted class labels for the held-out observations
y_pred_dt <- predict(object = dt_classifier, newdata = cbind(X_test, y_test), type = "class")

# Accuracy: share of held-out observations whose prediction matches the label
accuracy_dt <- mean(y_test == y_pred_dt)
accuracy_dt
## [1] 0.9489051
# Calculate Decision Tree precision: TP / (TP + FP) for the positive class.
# The predictions and labels are factors with levels "benign"/"malignant", so
# they must be compared with the level label. Comparing them with the number 1
# never matches, which made the precision always 0.
# NOTE(review): "malignant" is taken as the positive class - confirm.
positive_class <- "malignant"
dt_flagged_positive <- y_pred_dt == positive_class
if (any(dt_flagged_positive)) {
  # True positives are counted from the decision tree's own predictions
  precision_dt <- sum(dt_flagged_positive & y_test == positive_class) /
    sum(dt_flagged_positive)
} else {
  precision_dt <- 0  # Set precision to 0 if there are no positive predictions
}
precision_dt
## (re-run to refresh this output; the previously recorded `[1] 0` came from
## comparing the factor predictions with the number 1)
# Fit a 100-tree random forest on the training frame; importance = TRUE also
# records variable-importance measures on the fitted object
rf_classifier <- randomForest(
  y_train ~ .,
  data = train_data,
  ntree = 100,
  importance = TRUE
)

# Predicted class labels for the held-out observations
y_pred_rf <- predict(rf_classifier, newdata = cbind(X_test, y_test))

# Accuracy: share of held-out observations whose prediction matches the label
accuracy_rf <- mean(y_test == y_pred_rf)
accuracy_rf
## [1] 0.9854015
# Calculate Random Forest precision: TP / (TP + FP) for the positive class.
# The predictions and labels are factors with levels "benign"/"malignant", so
# they must be compared with the level label. Comparing them with the number 1
# never matches, which made the precision always 0.
# NOTE(review): "malignant" is taken as the positive class - confirm.
positive_class <- "malignant"
rf_flagged_positive <- y_pred_rf == positive_class
if (any(rf_flagged_positive)) {
  precision_rf <- sum(rf_flagged_positive & y_test == positive_class) /
    sum(rf_flagged_positive)
} else {
  precision_rf <- 0  # Set precision to 0 if there are no positive predictions
}
precision_rf
## (re-run to refresh this output; the previously recorded `[1] 0` came from
## comparing the factor predictions with the number 1)