Breast cancer

#NAIVE BAYES - CLASSIFICATION #1.IMPORT DATASET

url <- url("https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/breast-cancer-wisconsin.data")
dataset = read.csv(url,header = F,col.names = c("Sample code number","Clump Thickness","Uniformity of Cell Size","Uniformity of Cell Shape","Marginal Adhesion","Single Epithelial Cell Size","Bare Nuclei","Bland Chromatin","Normal Nucleoli"," Mitoses","Class" ))

View(dataset)

#2.ENCODING THE TARGET FEATURE AS FACTOR
summary(dataset)

##  Sample.code.number Clump.Thickness  Uniformity.of.Cell.Size
##  Min.   :   61634   Min.   : 1.000   Min.   : 1.000         
##  1st Qu.:  870688   1st Qu.: 2.000   1st Qu.: 1.000         
##  Median : 1171710   Median : 4.000   Median : 1.000         
##  Mean   : 1071704   Mean   : 4.418   Mean   : 3.134         
##  3rd Qu.: 1238298   3rd Qu.: 6.000   3rd Qu.: 5.000         
##  Max.   :13454352   Max.   :10.000   Max.   :10.000         
##  Uniformity.of.Cell.Shape Marginal.Adhesion Single.Epithelial.Cell.Size
##  Min.   : 1.000           Min.   : 1.000    Min.   : 1.000             
##  1st Qu.: 1.000           1st Qu.: 1.000    1st Qu.: 2.000             
##  Median : 1.000           Median : 1.000    Median : 2.000             
##  Mean   : 3.207           Mean   : 2.807    Mean   : 3.216             
##  3rd Qu.: 5.000           3rd Qu.: 4.000    3rd Qu.: 4.000             
##  Max.   :10.000           Max.   :10.000    Max.   :10.000             
##  Bare.Nuclei        Bland.Chromatin  Normal.Nucleoli    X.Mitoses     
##  Length:699         Min.   : 1.000   Min.   : 1.000   Min.   : 1.000  
##  Class :character   1st Qu.: 2.000   1st Qu.: 1.000   1st Qu.: 1.000  
##  Mode  :character   Median : 3.000   Median : 1.000   Median : 1.000  
##                     Mean   : 3.438   Mean   : 2.867   Mean   : 1.589  
##                     3rd Qu.: 5.000   3rd Qu.: 4.000   3rd Qu.: 1.000  
##                     Max.   :10.000   Max.   :10.000   Max.   :10.000  
##      Class     
##  Min.   :2.00  
##  1st Qu.:2.00  
##  Median :2.00  
##  Mean   :2.69  
##  3rd Qu.:4.00  
##  Max.   :4.00

str(dataset)

## 'data.frame':    699 obs. of  11 variables:
##  $ Sample.code.number         : int  1000025 1002945 1015425 1016277 1017023 1017122 1018099 1018561 1033078 1033078 ...
##  $ Clump.Thickness            : int  5 5 3 6 4 8 1 2 2 4 ...
##  $ Uniformity.of.Cell.Size    : int  1 4 1 8 1 10 1 1 1 2 ...
##  $ Uniformity.of.Cell.Shape   : int  1 4 1 8 1 10 1 2 1 1 ...
##  $ Marginal.Adhesion          : int  1 5 1 1 3 8 1 1 1 1 ...
##  $ Single.Epithelial.Cell.Size: int  2 7 2 3 2 7 2 2 2 2 ...
##  $ Bare.Nuclei                : chr  "1" "10" "2" "4" ...
##  $ Bland.Chromatin            : int  3 3 3 3 3 9 3 3 1 2 ...
##  $ Normal.Nucleoli            : int  1 2 1 7 1 7 1 1 1 1 ...
##  $ X.Mitoses                  : int  1 1 1 1 1 1 1 1 5 1 ...
##  $ Class                      : int  2 2 2 2 2 4 2 2 2 2 ...

dataset_Class = factor(x=dataset$Class,levels = c(2,4))

#3.SPLIT THE DATA TO TRAINING SET AND TEST SET
dataset$Bare.Nuclei <- as.numeric(dataset$Bare.Nuclei)

## Warning: NAs introduced by coercion

library(caTools)
set.seed(123)
split = sample.split(dataset$Class,SplitRatio = 0.75)
tr_set = subset(dataset,split==TRUE)
ts_set = subset(dataset,split==FALSE)
View(tr_set)
View(ts_set)
dim(tr_set)

## [1] 525  11

dim(ts_set)

## [1] 174  11

#4.feature scaling 
tr_set[-3] = scale(tr_set[-3])
ts_set[-3] = scale(ts_set[-3])
View(ts_set[-3])
View(tr_set[-3])

#5.Fitting to the training set
library(e1071)
classifier = naiveBayes(tr_set[-3],tr_set$Class)
summary(classifier)

##           Length Class  Mode     
## apriori    2     table  numeric  
## tables    10     -none- list     
## levels     2     -none- character
## isnumeric 10     -none- logical  
## call       3     -none- call

#6.prediction
y_predict = predict(object=classifier,newdata=ts_set[-3])
y_predict

##   [1] 1.3772916852338    1.3772916852338    -0.724679636707318
##   [4] 1.3772916852338    -0.724679636707318 1.3772916852338   
##   [7] -0.724679636707318 -0.724679636707318 -0.724679636707318
##  [10] -0.724679636707318 1.3772916852338    1.3772916852338   
##  [13] -0.724679636707318 1.3772916852338    1.3772916852338   
##  [16] -0.724679636707318 1.3772916852338    1.3772916852338   
##  [19] -0.724679636707318 -0.724679636707318 1.3772916852338   
##  [22] 1.3772916852338    -0.724679636707318 -0.724679636707318
##  [25] 1.3772916852338    -0.724679636707318 -0.724679636707318
##  [28] 1.3772916852338    -0.724679636707318 -0.724679636707318
##  [31] -0.724679636707318 1.3772916852338    -0.724679636707318
##  [34] 1.3772916852338    -0.724679636707318 -0.724679636707318
##  [37] -0.724679636707318 -0.724679636707318 1.3772916852338   
##  [40] 1.3772916852338    -0.724679636707318 1.3772916852338   
##  [43] 1.3772916852338    -0.724679636707318 -0.724679636707318
##  [46] -0.724679636707318 1.3772916852338    1.3772916852338   
##  [49] -0.724679636707318 -0.724679636707318 1.3772916852338   
##  [52] -0.724679636707318 1.3772916852338    1.3772916852338   
##  [55] 1.3772916852338    -0.724679636707318 -0.724679636707318
##  [58] 1.3772916852338    -0.724679636707318 -0.724679636707318
##  [61] -0.724679636707318 -0.724679636707318 1.3772916852338   
##  [64] -0.724679636707318 1.3772916852338    1.3772916852338   
##  [67] -0.724679636707318 1.3772916852338    1.3772916852338   
##  [70] 1.3772916852338    1.3772916852338    -0.724679636707318
##  [73] 1.3772916852338    1.3772916852338    1.3772916852338   
##  [76] 1.3772916852338    1.3772916852338    -0.724679636707318
##  [79] 1.3772916852338    -0.724679636707318 -0.724679636707318
##  [82] 1.3772916852338    1.3772916852338    -0.724679636707318
##  [85] 1.3772916852338    -0.724679636707318 -0.724679636707318
##  [88] 1.3772916852338    1.3772916852338    -0.724679636707318
##  [91] -0.724679636707318 1.3772916852338    1.3772916852338   
##  [94] 1.3772916852338    -0.724679636707318 -0.724679636707318
##  [97] 1.3772916852338    -0.724679636707318 -0.724679636707318
## [100] -0.724679636707318 -0.724679636707318 -0.724679636707318
## [103] -0.724679636707318 1.3772916852338    -0.724679636707318
## [106] -0.724679636707318 -0.724679636707318 1.3772916852338   
## [109] -0.724679636707318 1.3772916852338    -0.724679636707318
## [112] -0.724679636707318 -0.724679636707318 -0.724679636707318
## [115] 1.3772916852338    1.3772916852338    -0.724679636707318
## [118] -0.724679636707318 -0.724679636707318 -0.724679636707318
## [121] -0.724679636707318 1.3772916852338    -0.724679636707318
## [124] -0.724679636707318 -0.724679636707318 -0.724679636707318
## [127] -0.724679636707318 1.3772916852338    -0.724679636707318
## [130] 1.3772916852338    -0.724679636707318 -0.724679636707318
## [133] -0.724679636707318 1.3772916852338    -0.724679636707318
## [136] 1.3772916852338    -0.724679636707318 -0.724679636707318
## [139] -0.724679636707318 -0.724679636707318 -0.724679636707318
## [142] -0.724679636707318 -0.724679636707318 -0.724679636707318
## [145] 1.3772916852338    -0.724679636707318 -0.724679636707318
## [148] -0.724679636707318 1.3772916852338    -0.724679636707318
## [151] -0.724679636707318 -0.724679636707318 -0.724679636707318
## [154] -0.724679636707318 -0.724679636707318 1.3772916852338   
## [157] 1.3772916852338    1.3772916852338    -0.724679636707318
## [160] -0.724679636707318 -0.724679636707318 -0.724679636707318
## [163] -0.724679636707318 -0.724679636707318 -0.724679636707318
## [166] -0.724679636707318 -0.724679636707318 -0.724679636707318
## [169] -0.724679636707318 -0.724679636707318 -0.724679636707318
## [172] -0.724679636707318 -0.724679636707318 -0.724679636707318
## Levels: -0.724679636707318 1.3772916852338

#7.confusion matrix
cm = table(ts_set$Class,y_predict)
cm

##                     y_predict
##                      -0.724679636707318 1.3772916852338
##   -0.723388544342461                109               5
##   1.37443823425068                    1              59

sum(diag(cm))/sum(cm)

## [1] 0.9655172

#Decision tree

#1.import data
url <- url("https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/breast-cancer-wisconsin.data")
dataset = read.csv(url,header = F,col.names = c("Sample code number","Clump Thickness","Uniformity of Cell Size","Uniformity of Cell Shape","Marginal Adhesion","Single Epithelial Cell Size","Bare Nuclei","Bland Chromatin","Normal Nucleoli"," Mitoses","Class" ))
View(dataset)
str(dataset)

## 'data.frame':    699 obs. of  11 variables:
##  $ Sample.code.number         : int  1000025 1002945 1015425 1016277 1017023 1017122 1018099 1018561 1033078 1033078 ...
##  $ Clump.Thickness            : int  5 5 3 6 4 8 1 2 2 4 ...
##  $ Uniformity.of.Cell.Size    : int  1 4 1 8 1 10 1 1 1 2 ...
##  $ Uniformity.of.Cell.Shape   : int  1 4 1 8 1 10 1 2 1 1 ...
##  $ Marginal.Adhesion          : int  1 5 1 1 3 8 1 1 1 1 ...
##  $ Single.Epithelial.Cell.Size: int  2 7 2 3 2 7 2 2 2 2 ...
##  $ Bare.Nuclei                : chr  "1" "10" "2" "4" ...
##  $ Bland.Chromatin            : int  3 3 3 3 3 9 3 3 1 2 ...
##  $ Normal.Nucleoli            : int  1 2 1 7 1 7 1 1 1 1 ...
##  $ X.Mitoses                  : int  1 1 1 1 1 1 1 1 5 1 ...
##  $ Class                      : int  2 2 2 2 2 4 2 2 2 2 ...

dataset$Bare.Nuclei <- as.numeric(dataset$Bare.Nuclei)

## Warning: NAs introduced by coercion

set.seed(123)

#2.shuffle dataset
shuffle_index = sample(1:nrow(dataset))
shuffle_index

##   [1] 415 463 179 526 195 118 299 229 244  14 374 665 602 603  91 348 649 355
##  [19]  26 519 426 683 211 590 593 555 373 143 544 490 621  23 309 135 224 166
##  [37] 217 290 581  72 588 575 141 153 294 277 698  41 431  90 316 223 528 116
##  [55] 606 456 598  39 159 209 689  34 516  13  69 409 308 278  89 537 291 424
##  [73] 286 121 110 158  64 483 477 480  67  85 165  51  74 178 362 236 610 330
##  [91] 127 212 310 243 113 669 151 160 391 155 679   5 326 280 567 238 339 642
## [109] 137 455 560  83 592 196 627 500 344 607 459  20 695 164  52 534 177 554
## [127]  84 523 392 302 430 428 250 429 398 647 381 545  40 522 473 200 125 265
## [145] 186 252 458 152  54 538 235 289 185 413 589 667 551 205 630 601 634 346
## [163] 687 468 509  57 457 357 279 270 347 129 218 337 681 680 390 498 222 421
## [181] 576 163 594 637 611 225 389 117  25 136  55 663 618  45 146 170 699 134
## [199] 199 445 361 176 401 245 566 476 447 671 104 492 319 501 536 210 349 497
## [217] 514 258 658 386 657  24 433 451 466 130 617 191  76 377 269 504 584 198
## [235] 613 234 422 368 623  80  36 629 253 343 323  48 450 111 531 393 317 295
## [253] 521 287 493  73 292 226 632 378 172 297 684  93 557 693 587 237 559 107
## [271]  33 659 396 354 654 640 471  94 553  30 508 175 639 676 115 470 338  96
## [289] 358 487 412 525 397 404 230 148 350 571 202  81 546 232 645 106 375  11
## [307] 605 364 620 479  31 503 441 653 484  16 197 614 481  12  66  50 204 418
## [325] 673 502 122 315 259 353 248 548 454 331 100 108 301  10 410 596 437 628
## [343] 372 591 670 402   8 114 261  29 306 597 615 282 444 267 262 688 414 677
## [361] 219 184 600 119 619 625 417 661 535 240 120 304 465 656 365 105 281 180
## [379] 387 241 478 167  47 472  37 174 543 303 207  19 558 641 532 103 691 367
## [397] 188 139 434 624 189 363 524  38 573 314 474 329 682  21 488  87 660 385
## [415] 651 400 366 333 305   6 128 156  49 227 239 193 383 406 190 112 678 336
## [433] 388 327 583  59  61 599 668 556  88 132 674 215 251 203 246 622 533 399
## [451] 697 382 328 131 495 467 162 379 461 168 257 580  78 359 462 515 298 276
## [469]  95 150 208 123  86 221 595 161 644 192 181 633  46 520 489  60 300 686
## [487]  17 187 547 171 351 506 510  79 578 335 604 342 664  28 288 442 102 138
## [505] 109 273 652 311 564 460 694 432 140 446 609 274 370 577 341 464   2  65
## [523] 666 403 517 124 271   7 692 423 511 685 572 586 540 340 380 183  71  15
## [541] 568 549 395 550 585   4 284 407 475 648 494 690 321 356 646 675 320 582
## [559]  98 469  68 650 157  75 133 142  35 263 425 507 635 672 443 499 249 233
## [577]  18 154 427 452 256 655 275 369 416 101 394 206 530 438  56 626 541 542
## [595] 182 482 631 332   3 439 570  53 419  99 214 453 529 268 643 371  32 285
## [613] 436 405 216 513  92 486 264  97 563 266 149 512  44  42 485 254 325 296
## [631] 213 496 435 242 612 449 616   9 411 169 569  22 247 552 562 574 662 334
## [649]  62 638 313  43 201 505 696  77 539 260  70 318 312 352 565 440 527 636
## [667] 579 231  27 360 491 608 272  82 194 324 420 220 126 307 408 518 376 173
## [685] 145 448 561  58   1 228 345  63 144 255 384 322 293 283 147

#3.Create Training set and Test set
library(caTools)
split = sample.split(Y = dataset$Class,SplitRatio = 0.75)
training_set = subset(dataset,split==T)
test_set = subset(dataset,split==F)
dim(training_set)

## [1] 525  11

str(test_set)

## 'data.frame':    174 obs. of  11 variables:
##  $ Sample.code.number         : int  1002945 1035283 1036172 1047630 1048672 1049815 1050670 1100524 1102573 1105524 ...
##  $ Clump.Thickness            : int  5 1 2 7 4 4 10 6 5 1 ...
##  $ Uniformity.of.Cell.Size    : int  4 1 1 4 1 1 7 10 6 1 ...
##  $ Uniformity.of.Cell.Shape   : int  4 1 1 6 1 1 7 10 5 1 ...
##  $ Marginal.Adhesion          : int  5 1 1 4 1 1 6 2 6 1 ...
##  $ Single.Epithelial.Cell.Size: int  7 1 2 6 2 2 4 8 10 2 ...
##  $ Bare.Nuclei                : num  10 1 1 1 1 1 10 10 1 1 ...
##  $ Bland.Chromatin            : int  3 3 2 4 2 3 4 7 3 2 ...
##  $ Normal.Nucleoli            : int  2 1 1 3 1 1 1 3 1 1 ...
##  $ X.Mitoses                  : int  1 1 1 1 1 1 2 3 1 1 ...
##  $ Class                      : int  2 2 2 4 2 2 4 4 4 2 ...

#4.Build a model
library(rpart)
fit = rpart(formula = Class~.,data = training_set,method = 'class')
summary(fit)

## Call:
## rpart(formula = Class ~ ., data = training_set, method = "class")
##   n= 525 
## 
##           CP nsplit rel error    xerror       xstd
## 1 0.79005525      0 1.0000000 1.0000000 0.06016727
## 2 0.05524862      1 0.2099448 0.2375691 0.03471359
## 3 0.02762431      2 0.1546961 0.1602210 0.02891889
## 4 0.01000000      3 0.1270718 0.1602210 0.02891889
## 
## Variable importance
##     Uniformity.of.Cell.Size    Uniformity.of.Cell.Shape 
##                          21                          19 
## Single.Epithelial.Cell.Size             Normal.Nucleoli 
##                          16                          15 
##             Bland.Chromatin                 Bare.Nuclei 
##                          15                          13 
##             Clump.Thickness 
##                           1 
## 
## Node number 1: 525 observations,    complexity param=0.7900552
##   predicted class=2  expected loss=0.3447619  P(node) =1
##     class counts:   344   181
##    probabilities: 0.655 0.345 
##   left son=2 (328 obs) right son=3 (197 obs)
##   Primary splits:
##       Uniformity.of.Cell.Size     < 2.5 to the left,  improve=169.3350, (0 missing)
##       Uniformity.of.Cell.Shape    < 3.5 to the left,  improve=166.7428, (0 missing)
##       Single.Epithelial.Cell.Size < 2.5 to the left,  improve=154.2208, (0 missing)
##       Bare.Nuclei                 < 3.5 to the left,  improve=150.0674, (12 missing)
##       Bland.Chromatin             < 3.5 to the left,  improve=148.3829, (0 missing)
##   Surrogate splits:
##       Uniformity.of.Cell.Shape    < 3.5 to the left,  agree=0.928, adj=0.807, (0 split)
##       Single.Epithelial.Cell.Size < 2.5 to the left,  agree=0.903, adj=0.741, (0 split)
##       Normal.Nucleoli             < 2.5 to the left,  agree=0.891, adj=0.711, (0 split)
##       Bland.Chromatin             < 3.5 to the left,  agree=0.888, adj=0.701, (0 split)
##       Bare.Nuclei                 < 2.5 to the left,  agree=0.859, adj=0.624, (0 split)
## 
## Node number 2: 328 observations,    complexity param=0.02762431
##   predicted class=2  expected loss=0.03353659  P(node) =0.6247619
##     class counts:   317    11
##    probabilities: 0.966 0.034 
##   left son=4 (321 obs) right son=5 (7 obs)
##   Primary splits:
##       Clump.Thickness             < 6.5 to the left,  improve=9.703673, (0 missing)
##       Bare.Nuclei                 < 5.5 to the left,  improve=9.685480, (10 missing)
##       Normal.Nucleoli             < 3.5 to the left,  improve=8.418445, (0 missing)
##       Bland.Chromatin             < 4.5 to the left,  improve=7.530128, (0 missing)
##       Single.Epithelial.Cell.Size < 3.5 to the left,  improve=4.034779, (0 missing)
## 
## Node number 3: 197 observations,    complexity param=0.05524862
##   predicted class=4  expected loss=0.1370558  P(node) =0.3752381
##     class counts:    27   170
##    probabilities: 0.137 0.863 
##   left son=6 (16 obs) right son=7 (181 obs)
##   Primary splits:
##       Uniformity.of.Cell.Shape    < 2.5 to the left,  improve=15.889730, (0 missing)
##       Uniformity.of.Cell.Size     < 4.5 to the left,  improve=11.616580, (0 missing)
##       Bare.Nuclei                 < 1.5 to the left,  improve=10.384060, (2 missing)
##       Single.Epithelial.Cell.Size < 2.5 to the left,  improve= 9.105739, (0 missing)
##       Bland.Chromatin             < 2.5 to the left,  improve= 8.292355, (0 missing)
##   Surrogate splits:
##       Bland.Chromatin < 1.5 to the left,  agree=0.924, adj=0.063, (0 split)
## 
## Node number 4: 321 observations
##   predicted class=2  expected loss=0.01557632  P(node) =0.6114286
##     class counts:   316     5
##    probabilities: 0.984 0.016 
## 
## Node number 5: 7 observations
##   predicted class=4  expected loss=0.1428571  P(node) =0.01333333
##     class counts:     1     6
##    probabilities: 0.143 0.857 
## 
## Node number 6: 16 observations
##   predicted class=2  expected loss=0.1875  P(node) =0.03047619
##     class counts:    13     3
##    probabilities: 0.812 0.187 
## 
## Node number 7: 181 observations
##   predicted class=4  expected loss=0.07734807  P(node) =0.3447619
##     class counts:    14   167
##    probabilities: 0.077 0.923

plot(fit)
text(fit)

library(rpart.plot)

## Warning: package 'rpart.plot' was built under R version 4.2.2

rpart.plot(fit)

#APRIORI ALGORITHM

library(arules)

## Loading required package: Matrix

## 
## Attaching package: 'arules'

## The following objects are masked from 'package:base':
## 
##     abbreviate, write

url <- url("https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/breast-cancer-wisconsin.data")
dataset = read.csv(url,header = F,col.names = c("Sample code number","Clump Thickness","Uniformity of Cell Size","Uniformity of Cell Shape","Marginal Adhesion","Single Epithelial Cell Size","Bare Nuclei","Bland Chromatin","Normal Nucleoli"," Mitoses","Class" ))
View(dataset)
dataset = dataset[2:6]
set.seed(123)

#CLASSIFICATOION - 3 #CLUSTERING -2 #REGREESSION -2

Breast cancer

20MIC0058

2022-11-11