#Research question: Can we categorize the cancer cells in the data set into homogeneous groups based on four cluster variables (radius, texture, perimeter and area)?

mydata <- read.csv("C:/Users/nadja/Downloads/breast-cancer-wisconsin-data_data.csv")

#Keeping only id, diagnosis, and the four cluster variables (dropping columns 7-33)
mydata<- mydata[, -c(7:33)]

head(mydata)
##         id diagnosis radius_mean texture_mean perimeter_mean area_mean
## 1   842302         M       17.99        10.38         122.80    1001.0
## 2   842517         M       20.57        17.77         132.90    1326.0
## 3 84300903         M       19.69        21.25         130.00    1203.0
## 4 84348301         M       11.42        20.38          77.58     386.1
## 5 84358402         M       20.29        14.34         135.10    1297.0
## 6   843786         M       12.45        15.70          82.57     477.1

##Description

##Data manipulation and Descriptive statistics

library(tidyr)
mydata <- drop_na(mydata) #Removing units with missing values
mydata$diagnosisF <- factor(mydata$diagnosis,
                        levels = c("M", "B"),
                        labels = c("Malignant","Benign"))

head(mydata)
##         id diagnosis radius_mean texture_mean perimeter_mean area_mean
## 1   842302         M       17.99        10.38         122.80    1001.0
## 2   842517         M       20.57        17.77         132.90    1326.0
## 3 84300903         M       19.69        21.25         130.00    1203.0
## 4 84348301         M       11.42        20.38          77.58     386.1
## 5 84358402         M       20.29        14.34         135.10    1297.0
## 6   843786         M       12.45        15.70          82.57     477.1
##   diagnosisF
## 1  Malignant
## 2  Malignant
## 3  Malignant
## 4  Malignant
## 5  Malignant
## 6  Malignant
summary(mydata[,c(-1,-2)]) #Descriptive statistics (excluding id and the character diagnosis variable)
##   radius_mean      texture_mean   perimeter_mean     area_mean     
##  Min.   : 6.981   Min.   : 9.71   Min.   : 43.79   Min.   : 143.5  
##  1st Qu.:11.700   1st Qu.:16.17   1st Qu.: 75.17   1st Qu.: 420.3  
##  Median :13.370   Median :18.84   Median : 86.24   Median : 551.1  
##  Mean   :14.127   Mean   :19.29   Mean   : 91.97   Mean   : 654.9  
##  3rd Qu.:15.780   3rd Qu.:21.80   3rd Qu.:104.10   3rd Qu.: 782.7  
##  Max.   :28.110   Max.   :39.28   Max.   :188.50   Max.   :2501.0  
##      diagnosisF 
##  Malignant:212  
##  Benign   :357  
##                 
##                 
##                 
## 

##Clustering

#Standardizing cluster variables
mydata$radius_z <- scale(mydata$radius_mean)
mydata$texture_z   <- scale(mydata$texture_mean)
mydata$perimeter_z <- scale(mydata$perimeter_mean)
mydata$area_z <- scale(mydata$area_mean)

mydata$Dissimilarity <- sqrt(mydata$radius_z^2 + mydata$texture_z^2 + mydata$perimeter_z^2 + mydata$area_z^2) #Euclidean distance from the overall mean in standardized space, used to flag outliers
head(mydata[order(-mydata$Dissimilarity), c("id", "Dissimilarity") ], 10) #10 units with the highest value of dissimilarity
##            id Dissimilarity
## 462 911296202      7.722987
## 213   8810703      7.682549
## 181    873592      6.961796
## 353    899987      6.107474
## 83    8611555      5.865842
## 522  91762702      5.448662
## 123    865423      5.244761
## 203    878796      5.085291
## 340     89812      4.981009
## 237  88299702      4.956238

* If outliers were removed, the cluster variables would have to be standardized again. Here all 569 units are kept, so re-running scale() below leaves the standardized values unchanged.
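
* For illustration only, removing extreme units could look like the sketch below; the cutoff of 6 is a hypothetical choice, and the trimmed data set is not used in the rest of this analysis.

#Hypothetical sketch: dropping units above an illustrative dissimilarity cutoff
mydata_trimmed <- subset(mydata, Dissimilarity <= 6) #The cutoff of 6 is an assumption, not part of the analysis
nrow(mydata_trimmed) #Number of units kept after the hypothetical trimming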

#Standardizing cluster variables
mydata$radius_z <- scale(mydata$radius_mean)
mydata$texture_z   <- scale(mydata$texture_mean)
mydata$perimeter_z <- scale(mydata$perimeter_mean)
mydata$area_z <- scale(mydata$area_mean)
library(factoextra)
distance <- get_dist(mydata[c("radius_z", "texture_z", "perimeter_z", "area_z")],
                     method = "euclidean")

distance2 <- distance^2 #Squared Euclidean distances

fviz_dist(distance2) #Dissimilarity matrix

##Hopkins statistic

get_clust_tendency(mydata[c("radius_z", "texture_z", "perimeter_z", "area_z")], #Hopkins statistic
                   n = nrow(mydata) - 1, 
                   graph = FALSE)
## $hopkins_stat
## [1] 0.9473338
## 
## $plot
## NULL

* Hopkins statistic = 0.95, far above 0.5 and close to 1, which indicates that the data are highly clusterable.

##Hierarchical clustering

library(dplyr) #Allowing use of %>%
WARD <- mydata[c("radius_z", "texture_z", "perimeter_z", "area_z")] %>%
  #scale() %>% #Not needed here: the variables are already standardized
  get_dist(method = "euclidean") %>% #Selecting the distance measure
  hclust(method = "ward.D2") #Selecting the clustering algorithm

WARD
## 
## Call:
## hclust(d = ., method = "ward.D2")
## 
## Cluster method   : ward.D2 
## Distance         : euclidean 
## Number of objects: 569

##Dendrogram

library(factoextra)
fviz_dend(WARD)

fviz_dend(WARD, 
          k = 3,
          cex = 0.5, 
          palette = "jama",
          color_labels_by_k = TRUE, 
          rect = TRUE)

* We cut the dendrogram into 3 groups.

mydata$ClusterWard <- cutree(WARD, 
                             k = 3) #Cutting the dendrogram
head(mydata)
##         id diagnosis radius_mean texture_mean perimeter_mean area_mean
## 1   842302         M       17.99        10.38         122.80    1001.0
## 2   842517         M       20.57        17.77         132.90    1326.0
## 3 84300903         M       19.69        21.25         130.00    1203.0
## 4 84348301         M       11.42        20.38          77.58     386.1
## 5 84358402         M       20.29        14.34         135.10    1297.0
## 6   843786         M       12.45        15.70          82.57     477.1
##   diagnosisF   radius_z  texture_z perimeter_z     area_z Dissimilarity
## 1  Malignant  1.0960995 -2.0715123   1.2688173  0.9835095      2.840737
## 2  Malignant  1.8282120 -0.3533215   1.6844726  1.9070303      3.153000
## 3  Malignant  1.5784992  0.4557859   1.5651260  1.5575132      2.752248
## 4  Malignant -0.7682333  0.2535091  -0.5921661 -0.7637917      1.260352
## 5  Malignant  1.7487579 -1.1508038   1.7750113  1.8246238      3.295819
## 6  Malignant -0.4759559 -0.8346009  -0.3868077 -0.5052059      1.152365
##   ClusterWard
## 1           1
## 2           1
## 3           1
## 4           2
## 5           1
## 6           2
#Calculating positions of the initial leaders (means of the cluster variables within each Ward cluster)

Initial_leaders <- aggregate(mydata[, c("radius_z", "texture_z", "perimeter_z", "area_z")], 
                             by = list(mydata$ClusterWard), 
                             FUN = mean)

Initial_leaders
##   Group.1   radius_z  texture_z perimeter_z     area_z
## 1       1  1.7088083  0.4953569   1.7074300  1.7605512
## 2       2 -0.5714602 -0.3351839  -0.5754811 -0.5559728
## 3       3  0.4029972  0.7806526   0.4202220  0.2867965
library(factoextra)

#Performing hierarchical K-means clustering (hkmeans uses the hierarchical solution as initial leaders)

K_MEANS <- hkmeans(mydata[c("radius_z", "texture_z", "perimeter_z", "area_z")], 
                   k = 3,
                   hc.metric = "euclidean",
                   hc.method = "ward.D2")

K_MEANS
## Hierarchical K-means clustering with 3 clusters of sizes 107, 296, 166
## 
## Cluster means:
##      radius_z  texture_z perimeter_z     area_z
## 1  1.67511174  0.5291436  1.67462636  1.7212771
## 2 -0.64646288 -0.6141324 -0.64983176 -0.6133779
## 3  0.07298829  0.7540049  0.07930831 -0.0157639
## 
## Clustering vector:
##   [1] 1 1 1 2 1 2 1 3 3 3 3 3 1 3 3 3 3 3 1 2 2 2 2 1 3 1 3 1 3 1 1 2 3 1 3 3 3
##  [38] 2 3 3 3 2 1 3 3 1 2 2 2 3 2 2 2 1 3 2 1 3 2 2 2 2 3 2 3 3 2 2 2 2 1 2 3 2
##  [75] 2 3 2 1 1 2 2 2 1 1 2 1 3 1 3 2 3 3 2 2 3 1 2 2 2 3 3 2 2 2 2 2 2 2 1 2 2
## [112] 3 3 2 2 2 2 3 3 1 2 1 1 2 2 2 3 1 3 1 2 3 3 2 1 3 2 2 3 2 2 3 2 2 2 2 2 3
## [149] 2 2 3 2 2 2 2 2 1 3 2 2 2 1 1 3 1 3 2 3 1 3 2 3 2 2 2 2 2 3 3 2 1 1 3 2 3
## [186] 2 1 2 2 2 3 3 2 3 3 2 3 1 1 3 2 1 1 3 2 3 2 3 3 2 1 2 1 3 3 2 2 2 1 1 2 2
## [223] 2 3 2 2 2 2 3 3 3 3 3 1 2 3 1 1 3 3 2 2 2 3 1 2 2 2 3 2 1 2 1 1 1 2 1 3 3
## [260] 3 1 3 1 3 3 1 2 3 2 2 2 2 1 2 1 2 2 1 2 2 1 2 1 3 2 2 2 2 2 2 3 3 2 2 2 2
## [297] 2 2 3 2 1 2 1 2 2 3 2 2 2 2 2 2 2 2 2 2 2 1 2 2 2 1 2 1 2 2 2 2 3 3 3 2 2
## [334] 2 2 3 2 1 2 1 2 2 2 1 2 2 2 2 2 2 2 3 1 3 2 2 2 2 2 2 2 3 2 3 2 1 1 2 1 1
## [371] 3 2 1 1 2 3 2 3 2 2 2 2 3 2 2 3 2 2 2 1 2 2 3 1 2 2 2 2 2 2 1 2 2 2 2 2 3
## [408] 3 1 2 2 2 2 3 3 2 2 3 2 2 2 2 2 3 2 2 2 2 2 2 3 2 1 1 3 3 2 2 3 2 2 3 2 2
## [445] 1 3 1 3 3 1 2 1 3 2 2 3 3 3 3 3 3 1 3 2 2 3 3 2 1 2 2 3 2 3 2 2 3 2 2 3 2
## [482] 3 2 2 2 2 2 1 2 3 3 1 1 2 3 3 2 2 1 1 3 3 2 1 2 2 2 2 3 3 2 2 3 2 3 2 1 1
## [519] 2 2 2 1 2 2 2 2 2 2 2 2 2 2 2 1 2 1 3 3 2 2 2 3 3 3 3 3 2 2 2 3 2 2 3 2 3
## [556] 3 2 3 3 3 3 3 3 1 1 1 3 1 2
## 
## Within cluster sum of squares by cluster:
## [1] 242.7607 278.0061 235.8988
##  (between_SS / total_SS =  66.7 %)
## 
## Available components:
## 
##  [1] "cluster"      "centers"      "totss"        "withinss"     "tot.withinss"
##  [6] "betweenss"    "size"         "iter"         "ifault"       "data"        
## [11] "hclust"

###Size of clusters

* The first cluster contains the fewest units: 19% (107/569).
* The second cluster contains the most units: 52% (296/569).
* The third cluster contains 29% (166/569).
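
* The cluster sizes can also be read directly from the hkmeans result:

K_MEANS$size #Number of units per cluster: 107, 296, 166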

* Between_SS/Total_SS = 66.7%, meaning the three clusters account for 66.7% of the total variability in the cluster variables; the higher this ratio, the better the clusters separate the units.
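
* The ratio can be recomputed from the components of the result object listed above:

round(K_MEANS$betweenss / K_MEANS$totss * 100, 1) #Between-group share of total variability: 66.7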

##Cluster Plot

fviz_cluster(K_MEANS, 
             palette = "jama", 
             repel = FALSE,
             ggtheme = theme_classic()) #Visualization

mydata$ClusterK_Means <- K_MEANS$cluster
head(mydata[c("id", "ClusterWard", "ClusterK_Means")])
##         id ClusterWard ClusterK_Means
## 1   842302           1              1
## 2   842517           1              1
## 3 84300903           1              1
## 4 84348301           2              2
## 5 84358402           1              1
## 6   843786           2              2
#Checking for reclassifications
table(mydata$ClusterWard)
## 
##   1   2   3 
## 102 372  95
table(mydata$ClusterK_Means)
## 
##   1   2   3 
## 107 296 166
table(mydata$ClusterWard, mydata$ClusterK_Means)
##    
##       1   2   3
##   1 102   0   0
##   2   0 296  76
##   3   5   0  90

###First group

* Initially the first group contained 102 units; all of them stayed in the same group.

###Second group

* Initially the second group contained 372 units; 296 stayed in the same group, while 76 were reclassified into the third group.

###Third group

* Initially the third group contained 95 units; 90 stayed in the same group, while 5 were reclassified into the first group.
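
* As a quick check, the share of units that kept their initial assignment follows from the diagonal of the cross-tabulation: (102 + 296 + 90)/569 is roughly 86%.

tab <- table(mydata$ClusterWard, mydata$ClusterK_Means)
sum(diag(tab)) / sum(tab) #Share of units keeping their initial cluster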

##Final leaders (Centroids)

Centroids <- K_MEANS$centers
Centroids
##      radius_z  texture_z perimeter_z     area_z
## 1  1.67511174  0.5291436  1.67462636  1.7212771
## 2 -0.64646288 -0.6141324 -0.64983176 -0.6133779
## 3  0.07298829  0.7540049  0.07930831 -0.0157639
library(ggplot2)
library(tidyr)

Figure <- as.data.frame(Centroids)
Figure$id <- 1:nrow(Figure)
Figure <- pivot_longer(Figure, cols = c("radius_z", "texture_z", "perimeter_z", "area_z"))

Figure$Groups <- factor(Figure$id,
                        levels = c(1, 2, 3),
                        labels = c("1", "2", "3"))

Figure$NameFactor <- factor(Figure$name,
                            levels = c("radius_z", "texture_z", "perimeter_z", "area_z"),
                            labels = c("radius_z", "texture_z", "perimeter_z", "area_z")) #Fixing the display order of the cluster variables

ggplot(Figure, aes(x = NameFactor, y = value)) +
  geom_hline(yintercept = 0) +
  theme_bw() +
  geom_point(aes(shape = Groups, col = Groups), size = 3) +
  geom_line(aes(group = id), linewidth = 1) +
  ylab("Averages") +
  xlab("Cluster Variables") +
  ylim(-3, 3)

* Group 1: all four variables are well above average; texture has the lowest value, but it is still above average.
* Group 2: all four variables are below average.
* Group 3: texture is above average, while radius, perimeter, and area are close to the overall average.
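
* If descriptive cluster labels are useful for reporting, they could be attached as in the sketch below; the label names are our own shorthand based on the centroid profiles, not part of the original analysis.

#Hypothetical sketch: attaching descriptive labels to the clusters
mydata$ClusterLabel <- factor(mydata$ClusterK_Means,
                              levels = c(1, 2, 3),
                              labels = c("Large cells", "Small cells", "High texture")) #Labels are illustrative assumptions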

##Performing ANOVA

#Are the cluster variables successful at separating the units into groups? Performing a one-way ANOVA for each cluster variable

fit <- aov(cbind(radius_z, texture_z, perimeter_z, area_z) ~ as.factor(ClusterK_Means), 
                 data = mydata)

summary(fit) #Responses 1-4 correspond to radius_z, texture_z, perimeter_z, and area_z
##  Response 1 :
##                            Df Sum Sq Mean Sq F value    Pr(>F)    
## as.factor(ClusterK_Means)   2 424.83 212.414  839.74 < 2.2e-16 ***
## Residuals                 566 143.17   0.253                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##  Response 2 :
##                            Df Sum Sq Mean Sq F value    Pr(>F)    
## as.factor(ClusterK_Means)   2 235.97 117.987  201.13 < 2.2e-16 ***
## Residuals                 566 332.03   0.587                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##  Response 3 :
##                            Df Sum Sq Mean Sq F value    Pr(>F)    
## as.factor(ClusterK_Means)   2 426.11 213.054  849.86 < 2.2e-16 ***
## Residuals                 566 141.89   0.251                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##  Response 4 :
##                            Df Sum Sq Mean Sq F value    Pr(>F)    
## as.factor(ClusterK_Means)   2 428.43 214.213  868.67 < 2.2e-16 ***
## Residuals                 566 139.57   0.247                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

###Radius

* H0: arithmetic mean (radius, G1) = arithmetic mean (radius, G2) = arithmetic mean (radius, G3)
* H1: At least one μ is different

* We reject the null hypothesis at p < 0.001 and conclude that at least one group has a different arithmetic mean of radius than the others.

###Texture

* H0: arithmetic mean (texture, G1) = arithmetic mean (texture, G2) = arithmetic mean (texture, G3)
* H1: At least one μ is different

* We reject the null hypothesis at p < 0.001 and conclude that at least one group has a different arithmetic mean of texture than the others.

###Perimeter

* H0: arithmetic mean (perimeter, G1) = arithmetic mean (perimeter, G2) = arithmetic mean (perimeter, G3)
* H1: At least one μ is different

* We reject the null hypothesis at p < 0.001 and conclude that at least one group has a different arithmetic mean of perimeter than the others.

###Area

* H0: arithmetic mean (area, G1) = arithmetic mean (area, G2) = arithmetic mean (area, G3)
* H1: At least one μ is different

* We reject the null hypothesis at p < 0.001 and conclude that at least one group has a different arithmetic mean of area than the others.
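
* A possible follow-up, not part of the original analysis: Tukey's HSD test shows which pairs of groups differ on a given variable. The sketch below uses radius_z as an example.

#Hedged sketch: pairwise comparisons of group means for one cluster variable
fit_radius <- aov(as.numeric(radius_z) ~ factor(ClusterK_Means), data = mydata)
TukeyHSD(fit_radius) #Confidence intervals for all pairwise differences in mean radius_z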

##Validation of results

#Pearson's Chi-squared test of association between diagnosis and cluster membership

chisq_results <- chisq.test(mydata$diagnosisF, as.factor(mydata$ClusterK_Means))
chisq_results
## 
##  Pearson's Chi-squared test
## 
## data:  mydata$diagnosisF and as.factor(mydata$ClusterK_Means)
## X-squared = 332.26, df = 2, p-value < 2.2e-16

* We reject the null hypothesis at p < 0.001, which means that there is an association between the diagnosis and the K-means clusters.
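
* As an additional check, not in the original analysis, Cramér's V summarizes the strength of this association: sqrt(Chi2 / (n * min(r - 1, c - 1))), which equals 1 degree of freedom in the denominator for a 2x3 table.

#Effect size (Cramér's V) computed from the chi-squared results
V <- sqrt(chisq_results$statistic / (sum(chisq_results$observed) * (min(dim(chisq_results$observed)) - 1)))
unname(V) #Approximately 0.76, a strong association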

addmargins(chisq_results$observed) #Observed frequencies with row and column totals
##                  
## mydata$diagnosisF   1   2   3 Sum
##         Malignant 106  14  92 212
##         Benign      1 282  74 357
##         Sum       107 296 166 569
round(chisq_results$expected, 2) #Expected frequencies under independence
##                  
## mydata$diagnosisF     1      2      3
##         Malignant 39.87 110.28  61.85
##         Benign    67.13 185.72 104.15
round(chisq_results$residuals, 2) #Pearson residuals: (observed - expected)/sqrt(expected)
##                  
## mydata$diagnosisF     1     2     3
##         Malignant 10.47 -9.17  3.83
##         Benign    -8.07  7.07 -2.95

##Conclusion

* Group 1: After K-means clustering, 107 cancer cells (19% of the sample) fall into the first group. Within this group, 99% are malignant and the remaining 1% are benign; there are far more malignant and far fewer benign cells than expected under independence. This is also the group with the highest values on all four cluster variables (texture is the lowest, but still above average).
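
* The within-group percentages quoted above can be computed from the observed frequencies:

round(prop.table(chisq_results$observed, margin = 2) * 100, 1) #Column percentages within each cluster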