Task Description

The main aim of this task is to apply classifier in training and testing data set to determine the disorder of the patient (patient is healthy or suffers from hyperthyroidism or hypothyroidism).

Solution

Library

library(rpart) 
library(rpart.plot)
library(caTools) 
library(class) 
library(ggplot2)
library(dplyr)

Import data

thayroid_data<-read.csv("Thyroid_data.csv", header = TRUE, sep = ',')

Structure of the data

str(thayroid_data)
## 'data.frame':    215 obs. of  6 variables:
##  $ CLASS  : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ T3     : int  107 113 127 109 105 105 110 114 106 107 ...
##  $ TST    : num  10.1 9.9 12.9 5.3 7.3 6.1 10.4 9.9 9.4 13 ...
##  $ TSTR   : num  2.2 3.1 2.4 1.6 1.5 2.1 1.6 2.4 2.2 1.1 ...
##  $ TSH    : num  0.9 2 1.4 1.4 1.5 1.4 1.6 1.5 1.5 0.9 ...
##  $ MAD.TSH: num  2.7 5.9 0.6 1.5 -0.1 7 2.7 5.7 0 3.1 ...

Missing value treatment of the data set

missing_value<-is.na(thayroid_data)
which(is.na(thayroid_data))
## integer(0)
### There are no missing value in dataframe so the dataframe is clean

Specify the class distribution

Class1<-filter(thayroid_data,CLASS==1)
Class2<-filter(thayroid_data,CLASS==2)
Class3<-filter(thayroid_data,CLASS==3)

Find out how many observations are in different classes

table(thayroid_data$CLASS)
## 
##   1   2   3 
## 150  35  30

Normalization of the data set (Min-Max method)

normalize <- function(x) {
  return ((x - min(x)) / (max(x) - min(x))) }
thayroid_normalise_data <- as.data.frame(lapply(thayroid_data[,2:6], normalize))

Randomly cross-validate the data (70% of the data)

dat.d <- sample(1:nrow(thayroid_normalise_data),size=nrow(thayroid_normalise_data)*0.7,replace = FALSE)

Split the data set into train and test data set

train.thayroid<- thayroid_data[dat.d,] 
test.thayroid <- thayroid_data[-dat.d,] 

Classification Analysis: Decission Tree

Tree for train data set

tree1=rpart(CLASS ~.,data=train.thayroid,method="class")
tree1
## n= 150 
## 
## node), split, n, loss, yval, (yprob)
##       * denotes terminal node
## 
## 1) root 150 46 1 (0.69333333 0.16000000 0.14666667)  
##   2) TST>=5.65 129 26 1 (0.79844961 0.18604651 0.01550388)  
##     4) TST< 14 108  7 1 (0.93518519 0.04629630 0.01851852) *
##     5) TST>=14 21  2 2 (0.09523810 0.90476190 0.00000000) *
##   3) TST< 5.65 21  1 3 (0.04761905 0.00000000 0.95238095) *

Visualization of the tree for train data set

Tree for test data set

tree_test=rpart(CLASS ~.,data=test.thayroid,method="class")
tree_test
## n= 65 
## 
## node), split, n, loss, yval, (yprob)
##       * denotes terminal node
## 
## 1) root 65 19 1 (0.70769231 0.16923077 0.12307692)  
##   2) TST< 12.8 55  9 1 (0.83636364 0.01818182 0.14545455)  
##     4) TST>=5.2 48  2 1 (0.95833333 0.02083333 0.02083333) *
##     5) TST< 5.2 7  0 3 (0.00000000 0.00000000 1.00000000) *
##   3) TST>=12.8 10  0 2 (0.00000000 1.00000000 0.00000000) *

Visualization of the tree for test data set

rpart.plot(tree_test)

Make prediction for train and test data set

treeTrain=predict(tree1,train.thayroid,type="class")
treeTest=predict(tree_test,test.thayroid,type="class")

Construct a confusion matrix for train data set

tab_train=table(train.thayroid$CLASS,treeTrain)
tab_train
##    treeTrain
##       1   2   3
##   1 101   2   1
##   2   5  19   0
##   3   2   0  20

Construct confusion matrix for test data set

tab_test=table(test.thayroid$CLASS,treeTest)
tab_test
##    treeTest
##      1  2  3
##   1 46  0  0
##   2  1 10  0
##   3  1  0  7

Calculate accuracy for training data set

n_train=sum(tab_train) 
nc_train = nrow(tab_train) 
diag_train = diag(tab_train) 
rowsums_train = apply(tab_train, 1, sum) 
colsums_train = apply(tab_train, 2, sum) 
p_train = rowsums_train / n_train 
q_train = colsums_train / n_train 
accuracy_train=sum(diag_train)/n_train
accuracy_train
## [1] 0.9333333

Calculate accuracy for test data

n_test=sum(tab_test) 
nc_test = nrow(tab_test) 
diag_test = diag(tab_test) 
rowsums_test = apply(tab_test, 1, sum) 
colsums_test = apply(tab_test, 2, sum) 
p_test = rowsums_test / n_test
q_test = colsums_test / n_test 
accuracy_test=sum(diag_test)/n_test
accuracy_test
## [1] 0.9692308

Accuracy Calculation of training data set while minsplit=30 and minbucket=10

fit <- rpart(CLASS~., data = train.thayroid, method = 'class')
accuracy_tune <- function(fit) {
  predict_unseen <- predict(fit, test.thayroid, type = 'class')
  table_mat <- table(test.thayroid$CLASS, predict_unseen)
  accuracy_Test <- sum(diag(table_mat)) / sum(table_mat)
  accuracy_Test
}
control <- rpart.control(minsplit = 30,
                         minbucket = round(30/ 3),
                         maxdepth = 3,
                         cp = 0)
tune_fit_train <- rpart(CLASS~., data = train.thayroid, method = 'class', control = control)
accuracy_tune(tune_fit_train)
## [1] 0.9384615

Accuracy Calculation of test data set while minsplit=30 and minbucket=10

control <- rpart.control(minsplit = 30,
                         minbucket = round(30/ 3),
                         maxdepth = 3,
                         cp = 0)
tune_fit_test <- rpart(CLASS~., data = test.thayroid, method = 'class', control = control)
accuracy_tune(tune_fit_test)
## [1] 0.9230769

Classification Problem: KNN Classifier

Make prediction for the train and test data set

trainpred_n = knn(train.thayroid[,1:3], train.thayroid[,1:3], factor(train.thayroid$CLASS), 10)
testpred_n = knn(train.thayroid[,1:3], test.thayroid[,1:3], factor(train.thayroid$CLASS), 10)

Construct confusion matrix for train and test data set

tab_knn_n=table(train.thayroid$CLASS,trainpred_n)
tab_knn_n
##    trainpred_n
##       1   2   3
##   1 104   0   0
##   2   7  17   0
##   3   7   0  15
tab_knn_test_n=table(test.thayroid$CLASS,testpred_n)
tab_knn_test_n
##    testpred_n
##      1  2  3
##   1 46  0  0
##   2  4  7  0
##   3  2  0  6

Calculate accuracy for training data set

normalise_train_knn=sum(tab_knn_n) 
norm_nc_train_knn = nrow(tab_knn_n) 
normalise_diag_train_knn = diag(tab_knn_n)
normalise_knn_accuracy_train=sum(normalise_diag_train_knn)/normalise_train_knn
normalise_knn_accuracy_train
## [1] 0.9066667

Calculate accuracy for test data set

normalise_test_knn=sum(tab_knn_test_n) 
norm_nc_test_knn = nrow(tab_knn_test_n) 
normalise_diag_test_knn = diag(tab_knn_test_n) 
normalise_knn_accuracy_test=sum(normalise_diag_test_knn)/normalise_test_knn
normalise_knn_accuracy_test
## [1] 0.9076923

Alternate way for looking at the accuracy

norm_KNNTrain_m=mean(train.thayroid$CLASS==trainpred_n)
norm_KNNTest_m=mean(test.thayroid$CLASS==testpred_n)

Calculate error

error1=1-mean(train.thayroid$CLASS==trainpred_n)
error2=1-mean(test.thayroid$CLASS==testpred_n)

Construct a loop to see how the number of neighbors impacts the training and test performance

norm_KNNTrain_m=matrix(0,nrow=50,ncol=200)
norm_KNNTest_m=matrix(0,nrow=50,ncol=200)
for(i in 1:50){
  for(j in 1:200){
    
    dat.d <- sample(1:nrow(thayroid_normalise_data),size=nrow(thayroid_normalise_data)*0.7,replace = FALSE)
    train.thayroid<- thayroid_data[dat.d,] 
    test.thayroid <- thayroid_data[-dat.d,]
    trainpred = knn(train.thayroid[,1:3], train.thayroid[,1:3], factor(train.thayroid$CLASS), i)
    testpred = knn(train.thayroid[,1:3], test.thayroid[,1:3], factor(train.thayroid$CLASS), i)
    trainpred_n = knn(train.thayroid[,1:3], train.thayroid[,1:3], factor(train.thayroid$CLASS), i)
    testpred_n = knn(train.thayroid[,1:3], test.thayroid[,1:3], factor(train.thayroid$CLASS), i)
    norm_KNNTrain_m[i,j]=mean(train.thayroid$CLASS==trainpred_n)
    norm_KNNTest_m[i,j]=mean(test.thayroid$CLASS==testpred_n)
    
  }
}

norm_KNNTrain_m=rowMeans(norm_KNNTrain_m)
norm_KNNTest_m=rowMeans(norm_KNNTest_m)
norm_KNNTrain_m
##  [1] 1.0000000 0.9718333 0.9654000 0.9473667 0.9348667 0.9247333 0.9175667
##  [8] 0.9095333 0.9063667 0.9014333 0.8997333 0.8943333 0.8872667 0.8824333
## [15] 0.8810000 0.8747333 0.8730333 0.8674333 0.8623333 0.8596000 0.8556333
## [22] 0.8479000 0.8452333 0.8413333 0.8309000 0.8249667 0.8180333 0.8085667
## [29] 0.8039667 0.7953333 0.7891667 0.7784667 0.7701667 0.7614000 0.7535000
## [36] 0.7457333 0.7368333 0.7284667 0.7229667 0.7140667 0.7099333 0.7097333
## [43] 0.7076000 0.7037667 0.7049667 0.6996000 0.7021333 0.6986333 0.6979000
## [50] 0.6968333
norm_KNNTest_m
##  [1] 0.9454615 0.9220000 0.9192308 0.9108462 0.9020000 0.8951538 0.8931538
##  [8] 0.8923846 0.8926923 0.8872308 0.8802308 0.8793077 0.8781538 0.8701538
## [15] 0.8633077 0.8631538 0.8590000 0.8483846 0.8469231 0.8445385 0.8365385
## [22] 0.8333846 0.8256923 0.8282308 0.8140000 0.8113077 0.7907692 0.7931538
## [29] 0.7876923 0.7785385 0.7731538 0.7549231 0.7508462 0.7548462 0.7328462
## [36] 0.7313077 0.7209231 0.7187692 0.7170769 0.7090769 0.7067692 0.7043846
## [43] 0.7010000 0.7033846 0.6988462 0.7025385 0.6938462 0.7020769 0.7020769
## [50] 0.7017692

Store the result as dataframe

resultsdf_norm=data.frame(neighbors=1:50,train_perf1=norm_KNNTrain_m, test_perf1=norm_KNNTest_m)

Visualization of Train and Test Accuracy for different k-neighbors(red=test data, blue=train data)

ggplot(resultsdf_norm,aes(x=neighbors, y=train_perf1))+
  geom_line(col='blue')+
  geom_line(aes(x=neighbors, y=test_perf1),col='red')

Recommendation

### Data scientist should focuS on the size of Minbucket and k value for choosing best accuracy for training and testing data set.