# NAIVE BAYES - CLASSIFICATION ----
# 1. Import dataset
# Open a connection to the UCI breast-cancer-wisconsin data file. The file has
# no header row, so the column labels are supplied explicitly; read.csv()
# turns them into syntactic names (spaces become dots, and the leading space
# in " Mitoses" yields "X.Mitoses").
url <- url(
  "https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/breast-cancer-wisconsin.data"
)
dataset <- read.csv(
  file = url,
  header = FALSE,
  col.names = c(
    "Sample code number",
    "Clump Thickness",
    "Uniformity of Cell Size",
    "Uniformity of Cell Shape",
    "Marginal Adhesion",
    "Single Epithelial Cell Size",
    "Bare Nuclei",
    "Bland Chromatin",
    "Normal Nucleoli",
    " Mitoses",
    "Class"
  )
)
# Open the data in the interactive viewer.
View(dataset)
#2.ENCODING THE TARGET FEATURE AS FACTOR
# Print per-column summary statistics first, to inspect the columns (including
# Class) before any encoding is applied.
summary(dataset)
## Sample.code.number Clump.Thickness Uniformity.of.Cell.Size
## Min. : 61634 Min. : 1.000 Min. : 1.000
## 1st Qu.: 870688 1st Qu.: 2.000 1st Qu.: 1.000
## Median : 1171710 Median : 4.000 Median : 1.000
## Mean : 1071704 Mean : 4.418 Mean : 3.134
## 3rd Qu.: 1238298 3rd Qu.: 6.000 3rd Qu.: 5.000
## Max. :13454352 Max. :10.000 Max. :10.000
## Uniformity.of.Cell.Shape Marginal.Adhesion Single.Epithelial.Cell.Size
## Min. : 1.000 Min. : 1.000 Min. : 1.000
## 1st Qu.: 1.000 1st Qu.: 1.000 1st Qu.: 2.000
## Median : 1.000 Median : 1.000 Median : 2.000
## Mean : 3.207 Mean : 2.807 Mean : 3.216
## 3rd Qu.: 5.000 3rd Qu.: 4.000 3rd Qu.: 4.000
## Max. :10.000 Max. :10.000 Max. :10.000
## Bare.Nuclei Bland.Chromatin Normal.Nucleoli X.Mitoses
## Length:699 Min. : 1.000 Min. : 1.000 Min. : 1.000
## Class :character 1st Qu.: 2.000 1st Qu.: 1.000 1st Qu.: 1.000
## Mode :character Median : 3.000 Median : 1.000 Median : 1.000
## Mean : 3.438 Mean : 2.867 Mean : 1.589
## 3rd Qu.: 5.000 3rd Qu.: 4.000 3rd Qu.: 1.000
## Max. :10.000 Max. :10.000 Max. :10.000
## Class
## Min. :2.00
## 1st Qu.:2.00
## Median :2.00
## Mean :2.69
## 3rd Qu.:4.00
## Max. :4.00
# Show each column's storage type and first few values.
str(dataset)
## 'data.frame': 699 obs. of 11 variables:
## $ Sample.code.number : int 1000025 1002945 1015425 1016277 1017023 1017122 1018099 1018561 1033078 1033078 ...
## $ Clump.Thickness : int 5 5 3 6 4 8 1 2 2 4 ...
## $ Uniformity.of.Cell.Size : int 1 4 1 8 1 10 1 1 1 2 ...
## $ Uniformity.of.Cell.Shape : int 1 4 1 8 1 10 1 2 1 1 ...
## $ Marginal.Adhesion : int 1 5 1 1 3 8 1 1 1 1 ...
## $ Single.Epithelial.Cell.Size: int 2 7 2 3 2 7 2 2 2 2 ...
## $ Bare.Nuclei : chr "1" "10" "2" "4" ...
## $ Bland.Chromatin : int 3 3 3 3 3 9 3 3 1 2 ...
## $ Normal.Nucleoli : int 1 2 1 7 1 7 1 1 1 1 ...
## $ X.Mitoses : int 1 1 1 1 1 1 1 1 5 1 ...
## $ Class : int 2 2 2 2 2 4 2 2 2 2 ...
# Encode the target as a factor *inside* the data frame. Previously the factor
# was stored in a separate, unused variable, so dataset$Class stayed integer
# and was later z-scaled and fed to the classifier as a predictor.
dataset$Class <- factor(x = dataset$Class, levels = c(2, 4))
# Kept only for backward compatibility with code that may read this variable.
dataset_Class <- dataset$Class
#3.SPLIT THE DATA TO TRAINING SET AND TEST SET
# Bare.Nuclei is read as character; entries that are not numbers become NA
# (presumably the file's missing-value marker -- confirm against the data).
dataset$Bare.Nuclei <- as.numeric(dataset$Bare.Nuclei)
## Warning: NAs introduced by coercion
library(caTools)
set.seed(123)
# Split 75% / 25% on the class label.
split <- sample.split(dataset$Class, SplitRatio = 0.75)
tr_set <- subset(dataset, split == TRUE)
ts_set <- subset(dataset, split == FALSE)
View(tr_set)
View(ts_set)
dim(tr_set)
## [1] 525 11
dim(ts_set)
## [1] 174 11
#4.feature scaling
# Predictors are every column except the record identifier and the target.
# (The old `[-3]` index dropped Uniformity.of.Cell.Size instead, and kept both
# Sample.code.number and Class among the "features".)
feature_cols <- setdiff(names(dataset), c("Sample.code.number", "Class"))
# Learn centring/scaling parameters on the training set only, then apply the
# SAME parameters to the test set so both live on one scale and no test-set
# statistics influence preprocessing.
tr_scaled <- scale(tr_set[feature_cols])
tr_set[feature_cols] <- as.data.frame(tr_scaled)
ts_set[feature_cols] <- as.data.frame(scale(
  ts_set[feature_cols],
  center = attr(tr_scaled, "scaled:center"),
  scale = attr(tr_scaled, "scaled:scale")
))
View(ts_set[feature_cols])
View(tr_set[feature_cols])
#5.Fitting to the training set
library(e1071)
# x holds predictors only; the label is passed separately as y.
classifier <- naiveBayes(x = tr_set[feature_cols], y = tr_set$Class)
summary(classifier)
#6.prediction
y_predict <- predict(object = classifier, newdata = ts_set[feature_cols])
y_predict
#7.confusion matrix
# Rows are the actual classes, columns the predicted classes.
cm <- table(ts_set$Class, y_predict)
cm
# Accuracy = correctly classified / total.
sum(diag(cm)) / sum(cm)
# NOTE: the console output that used to follow these calls was produced by the
# earlier run in which Class leaked into the predictors (the factor levels
# printed there were scaled class values). It has been removed; re-run the
# script to regenerate it.
# Decision tree ----
# 1. Import data
# Re-read the raw UCI file so this section starts from an unmodified data
# frame. The file has no header row; column labels are supplied explicitly.
url <- url(
  "https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/breast-cancer-wisconsin.data"
)
dataset <- read.csv(
  file = url,
  header = FALSE,
  col.names = c(
    "Sample code number",
    "Clump Thickness",
    "Uniformity of Cell Size",
    "Uniformity of Cell Shape",
    "Marginal Adhesion",
    "Single Epithelial Cell Size",
    "Bare Nuclei",
    "Bland Chromatin",
    "Normal Nucleoli",
    " Mitoses",
    "Class"
  )
)
View(dataset)
# Show each column's storage type and first few values.
str(dataset)
## 'data.frame': 699 obs. of 11 variables:
## $ Sample.code.number : int 1000025 1002945 1015425 1016277 1017023 1017122 1018099 1018561 1033078 1033078 ...
## $ Clump.Thickness : int 5 5 3 6 4 8 1 2 2 4 ...
## $ Uniformity.of.Cell.Size : int 1 4 1 8 1 10 1 1 1 2 ...
## $ Uniformity.of.Cell.Shape : int 1 4 1 8 1 10 1 2 1 1 ...
## $ Marginal.Adhesion : int 1 5 1 1 3 8 1 1 1 1 ...
## $ Single.Epithelial.Cell.Size: int 2 7 2 3 2 7 2 2 2 2 ...
## $ Bare.Nuclei : chr "1" "10" "2" "4" ...
## $ Bland.Chromatin : int 3 3 3 3 3 9 3 3 1 2 ...
## $ Normal.Nucleoli : int 1 2 1 7 1 7 1 1 1 1 ...
## $ X.Mitoses : int 1 1 1 1 1 1 1 1 5 1 ...
## $ Class : int 2 2 2 2 2 4 2 2 2 2 ...
# Bare.Nuclei is read as character (see str() output above). Coerce it to
# numeric; entries that cannot be parsed as numbers become NA, which is what
# triggers the coercion warning below (presumably the file's missing-value
# marker -- confirm against the raw data).
dataset$Bare.Nuclei <- as.numeric(dataset$Bare.Nuclei)
## Warning: NAs introduced by coercion
# Fix the RNG state so the permutation and the later split are reproducible.
set.seed(123)
#2.shuffle dataset
# Random permutation of the row indices. seq_len() is used instead of
# 1:nrow(dataset) so that a zero-row data frame yields an empty index rather
# than c(1, 0).
# NOTE(review): shuffle_index is printed but not applied to `dataset` in the
# code visible below; the train/test split does its own random sampling.
shuffle_index <- sample(seq_len(nrow(dataset)))
shuffle_index
## [1] 415 463 179 526 195 118 299 229 244 14 374 665 602 603 91 348 649 355
## [19] 26 519 426 683 211 590 593 555 373 143 544 490 621 23 309 135 224 166
## [37] 217 290 581 72 588 575 141 153 294 277 698 41 431 90 316 223 528 116
## [55] 606 456 598 39 159 209 689 34 516 13 69 409 308 278 89 537 291 424
## [73] 286 121 110 158 64 483 477 480 67 85 165 51 74 178 362 236 610 330
## [91] 127 212 310 243 113 669 151 160 391 155 679 5 326 280 567 238 339 642
## [109] 137 455 560 83 592 196 627 500 344 607 459 20 695 164 52 534 177 554
## [127] 84 523 392 302 430 428 250 429 398 647 381 545 40 522 473 200 125 265
## [145] 186 252 458 152 54 538 235 289 185 413 589 667 551 205 630 601 634 346
## [163] 687 468 509 57 457 357 279 270 347 129 218 337 681 680 390 498 222 421
## [181] 576 163 594 637 611 225 389 117 25 136 55 663 618 45 146 170 699 134
## [199] 199 445 361 176 401 245 566 476 447 671 104 492 319 501 536 210 349 497
## [217] 514 258 658 386 657 24 433 451 466 130 617 191 76 377 269 504 584 198
## [235] 613 234 422 368 623 80 36 629 253 343 323 48 450 111 531 393 317 295
## [253] 521 287 493 73 292 226 632 378 172 297 684 93 557 693 587 237 559 107
## [271] 33 659 396 354 654 640 471 94 553 30 508 175 639 676 115 470 338 96
## [289] 358 487 412 525 397 404 230 148 350 571 202 81 546 232 645 106 375 11
## [307] 605 364 620 479 31 503 441 653 484 16 197 614 481 12 66 50 204 418
## [325] 673 502 122 315 259 353 248 548 454 331 100 108 301 10 410 596 437 628
## [343] 372 591 670 402 8 114 261 29 306 597 615 282 444 267 262 688 414 677
## [361] 219 184 600 119 619 625 417 661 535 240 120 304 465 656 365 105 281 180
## [379] 387 241 478 167 47 472 37 174 543 303 207 19 558 641 532 103 691 367
## [397] 188 139 434 624 189 363 524 38 573 314 474 329 682 21 488 87 660 385
## [415] 651 400 366 333 305 6 128 156 49 227 239 193 383 406 190 112 678 336
## [433] 388 327 583 59 61 599 668 556 88 132 674 215 251 203 246 622 533 399
## [451] 697 382 328 131 495 467 162 379 461 168 257 580 78 359 462 515 298 276
## [469] 95 150 208 123 86 221 595 161 644 192 181 633 46 520 489 60 300 686
## [487] 17 187 547 171 351 506 510 79 578 335 604 342 664 28 288 442 102 138
## [505] 109 273 652 311 564 460 694 432 140 446 609 274 370 577 341 464 2 65
## [523] 666 403 517 124 271 7 692 423 511 685 572 586 540 340 380 183 71 15
## [541] 568 549 395 550 585 4 284 407 475 648 494 690 321 356 646 675 320 582
## [559] 98 469 68 650 157 75 133 142 35 263 425 507 635 672 443 499 249 233
## [577] 18 154 427 452 256 655 275 369 416 101 394 206 530 438 56 626 541 542
## [595] 182 482 631 332 3 439 570 53 419 99 214 453 529 268 643 371 32 285
## [613] 436 405 216 513 92 486 264 97 563 266 149 512 44 42 485 254 325 296
## [631] 213 496 435 242 612 449 616 9 411 169 569 22 247 552 562 574 662 334
## [649] 62 638 313 43 201 505 696 77 539 260 70 318 312 352 565 440 527 636
## [667] 579 231 27 360 491 608 272 82 194 324 420 220 126 307 408 518 376 173
## [685] 145 448 561 58 1 228 345 63 144 255 384 322 293 283 147
# 3. Create training set and test set
library(caTools)
# Logical mask with one entry per row: TRUE marks the 75% of rows assigned to
# training, sampled with respect to the Class column.
split <- sample.split(Y = dataset$Class, SplitRatio = 0.75)
# Rows flagged TRUE form the training set; the remaining rows form the test set.
training_set <- dataset[split, ]
test_set <- dataset[!split, ]
dim(training_set)
## [1] 525 11
str(test_set)
## 'data.frame': 174 obs. of 11 variables:
## $ Sample.code.number : int 1002945 1035283 1036172 1047630 1048672 1049815 1050670 1100524 1102573 1105524 ...
## $ Clump.Thickness : int 5 1 2 7 4 4 10 6 5 1 ...
## $ Uniformity.of.Cell.Size : int 4 1 1 4 1 1 7 10 6 1 ...
## $ Uniformity.of.Cell.Shape : int 4 1 1 6 1 1 7 10 5 1 ...
## $ Marginal.Adhesion : int 5 1 1 4 1 1 6 2 6 1 ...
## $ Single.Epithelial.Cell.Size: int 7 1 2 6 2 2 4 8 10 2 ...
## $ Bare.Nuclei : num 10 1 1 1 1 1 10 10 1 1 ...
## $ Bland.Chromatin : int 3 3 2 4 2 3 4 7 3 2 ...
## $ Normal.Nucleoli : int 2 1 1 3 1 1 1 3 1 1 ...
## $ X.Mitoses : int 1 1 1 1 1 1 2 3 1 1 ...
## $ Class : int 2 2 2 4 2 2 4 4 4 2 ...
#4.Build a model
library(rpart)
# Fit a classification tree for Class on the cytology measurements.
# Sample.code.number is a record identifier, not a measurement, so it is
# removed from the predictors (`Class ~ .` alone would offer it to the tree
# as a splitting variable).
# NOTE: the printed summary that follows was generated with the original
# `Class ~ .` formula; re-run to refresh it.
fit <- rpart(
  formula = Class ~ . - Sample.code.number,
  data = training_set,
  method = "class"
)
summary(fit)
## Call:
## rpart(formula = Class ~ ., data = training_set, method = "class")
## n= 525
##
## CP nsplit rel error xerror xstd
## 1 0.79005525 0 1.0000000 1.0000000 0.06016727
## 2 0.05524862 1 0.2099448 0.2375691 0.03471359
## 3 0.02762431 2 0.1546961 0.1602210 0.02891889
## 4 0.01000000 3 0.1270718 0.1602210 0.02891889
##
## Variable importance
## Uniformity.of.Cell.Size Uniformity.of.Cell.Shape
## 21 19
## Single.Epithelial.Cell.Size Normal.Nucleoli
## 16 15
## Bland.Chromatin Bare.Nuclei
## 15 13
## Clump.Thickness
## 1
##
## Node number 1: 525 observations, complexity param=0.7900552
## predicted class=2 expected loss=0.3447619 P(node) =1
## class counts: 344 181
## probabilities: 0.655 0.345
## left son=2 (328 obs) right son=3 (197 obs)
## Primary splits:
## Uniformity.of.Cell.Size < 2.5 to the left, improve=169.3350, (0 missing)
## Uniformity.of.Cell.Shape < 3.5 to the left, improve=166.7428, (0 missing)
## Single.Epithelial.Cell.Size < 2.5 to the left, improve=154.2208, (0 missing)
## Bare.Nuclei < 3.5 to the left, improve=150.0674, (12 missing)
## Bland.Chromatin < 3.5 to the left, improve=148.3829, (0 missing)
## Surrogate splits:
## Uniformity.of.Cell.Shape < 3.5 to the left, agree=0.928, adj=0.807, (0 split)
## Single.Epithelial.Cell.Size < 2.5 to the left, agree=0.903, adj=0.741, (0 split)
## Normal.Nucleoli < 2.5 to the left, agree=0.891, adj=0.711, (0 split)
## Bland.Chromatin < 3.5 to the left, agree=0.888, adj=0.701, (0 split)
## Bare.Nuclei < 2.5 to the left, agree=0.859, adj=0.624, (0 split)
##
## Node number 2: 328 observations, complexity param=0.02762431
## predicted class=2 expected loss=0.03353659 P(node) =0.6247619
## class counts: 317 11
## probabilities: 0.966 0.034
## left son=4 (321 obs) right son=5 (7 obs)
## Primary splits:
## Clump.Thickness < 6.5 to the left, improve=9.703673, (0 missing)
## Bare.Nuclei < 5.5 to the left, improve=9.685480, (10 missing)
## Normal.Nucleoli < 3.5 to the left, improve=8.418445, (0 missing)
## Bland.Chromatin < 4.5 to the left, improve=7.530128, (0 missing)
## Single.Epithelial.Cell.Size < 3.5 to the left, improve=4.034779, (0 missing)
##
## Node number 3: 197 observations, complexity param=0.05524862
## predicted class=4 expected loss=0.1370558 P(node) =0.3752381
## class counts: 27 170
## probabilities: 0.137 0.863
## left son=6 (16 obs) right son=7 (181 obs)
## Primary splits:
## Uniformity.of.Cell.Shape < 2.5 to the left, improve=15.889730, (0 missing)
## Uniformity.of.Cell.Size < 4.5 to the left, improve=11.616580, (0 missing)
## Bare.Nuclei < 1.5 to the left, improve=10.384060, (2 missing)
## Single.Epithelial.Cell.Size < 2.5 to the left, improve= 9.105739, (0 missing)
## Bland.Chromatin < 2.5 to the left, improve= 8.292355, (0 missing)
## Surrogate splits:
## Bland.Chromatin < 1.5 to the left, agree=0.924, adj=0.063, (0 split)
##
## Node number 4: 321 observations
## predicted class=2 expected loss=0.01557632 P(node) =0.6114286
## class counts: 316 5
## probabilities: 0.984 0.016
##
## Node number 5: 7 observations
## predicted class=4 expected loss=0.1428571 P(node) =0.01333333
## class counts: 1 6
## probabilities: 0.143 0.857
##
## Node number 6: 16 observations
## predicted class=2 expected loss=0.1875 P(node) =0.03047619
## class counts: 13 3
## probabilities: 0.812 0.187
##
## Node number 7: 181 observations
## predicted class=4 expected loss=0.07734807 P(node) =0.3447619
## class counts: 14 167
## probabilities: 0.077 0.923
# Draw the fitted tree with base graphics, then add the text labels to it.
# text() draws onto the plot opened by plot(), so the order matters.
plot(fit)
text(fit)
# Draw the same tree again using the rpart.plot package.
library(rpart.plot)
## Warning: package 'rpart.plot' was built under R version 4.2.2
rpart.plot(fit)
#APRIORI ALGORITHM
library(arules)
## Loading required package: Matrix
##
## Attaching package: 'arules'
## The following objects are masked from 'package:base':
##
## abbreviate, write
# Re-read the raw UCI file for the Apriori section. The file has no header
# row; column labels are supplied explicitly.
url <- url(
  "https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/breast-cancer-wisconsin.data"
)
dataset <- read.csv(
  file = url,
  header = FALSE,
  col.names = c(
    "Sample code number",
    "Clump Thickness",
    "Uniformity of Cell Size",
    "Uniformity of Cell Shape",
    "Marginal Adhesion",
    "Single Epithelial Cell Size",
    "Bare Nuclei",
    "Bland Chromatin",
    "Normal Nucleoli",
    " Mitoses",
    "Class"
  )
)
View(dataset)
# Keep only columns 2 to 6 (Clump.Thickness through
# Single.Epithelial.Cell.Size).
dataset <- dataset[2:6]
set.seed(123)
#CLASSIFICATION - 3 #CLUSTERING - 2 #REGRESSION - 2