Library
library(rpart)
library(rpart.plot)
library(caTools)
library(class)
library(ggplot2)
library(dplyr)
Import data
thayroid_data<-read.csv("Thyroid_data.csv", header = TRUE, sep = ',')
Structure of the data
str(thayroid_data)
## 'data.frame': 215 obs. of 6 variables:
## $ CLASS : int 1 1 1 1 1 1 1 1 1 1 ...
## $ T3 : int 107 113 127 109 105 105 110 114 106 107 ...
## $ TST : num 10.1 9.9 12.9 5.3 7.3 6.1 10.4 9.9 9.4 13 ...
## $ TSTR : num 2.2 3.1 2.4 1.6 1.5 2.1 1.6 2.4 2.2 1.1 ...
## $ TSH : num 0.9 2 1.4 1.4 1.5 1.4 1.6 1.5 1.5 0.9 ...
## $ MAD.TSH: num 2.7 5.9 0.6 1.5 -0.1 7 2.7 5.7 0 3.1 ...
Missing value treatment of the data set
missing_value<-is.na(thayroid_data)
which(is.na(thayroid_data))
## integer(0)
### There are no missing value in dataframe so the dataframe is clean
Specify the class distribution
Class1<-filter(thayroid_data,CLASS==1)
Class2<-filter(thayroid_data,CLASS==2)
Class3<-filter(thayroid_data,CLASS==3)
Find out how many observations are in different classes
table(thayroid_data$CLASS)
##
## 1 2 3
## 150 35 30
Normalization of the data set (Min-Max method)
normalize <- function(x) {
return ((x - min(x)) / (max(x) - min(x))) }
thayroid_normalise_data <- as.data.frame(lapply(thayroid_data[,2:6], normalize))
Randomly cross-validate the data (70% of the data)
dat.d <- sample(1:nrow(thayroid_normalise_data),size=nrow(thayroid_normalise_data)*0.7,replace = FALSE)
Split the data set into train and test data set
train.thayroid<- thayroid_data[dat.d,]
test.thayroid <- thayroid_data[-dat.d,]
Classification Analysis: Decission Tree
Tree for train data set
tree1=rpart(CLASS ~.,data=train.thayroid,method="class")
tree1
## n= 150
##
## node), split, n, loss, yval, (yprob)
## * denotes terminal node
##
## 1) root 150 46 1 (0.69333333 0.16000000 0.14666667)
## 2) TST>=5.65 129 26 1 (0.79844961 0.18604651 0.01550388)
## 4) TST< 14 108 7 1 (0.93518519 0.04629630 0.01851852) *
## 5) TST>=14 21 2 2 (0.09523810 0.90476190 0.00000000) *
## 3) TST< 5.65 21 1 3 (0.04761905 0.00000000 0.95238095) *
Visualization of the tree for train data set

Tree for test data set
tree_test=rpart(CLASS ~.,data=test.thayroid,method="class")
tree_test
## n= 65
##
## node), split, n, loss, yval, (yprob)
## * denotes terminal node
##
## 1) root 65 19 1 (0.70769231 0.16923077 0.12307692)
## 2) TST< 12.8 55 9 1 (0.83636364 0.01818182 0.14545455)
## 4) TST>=5.2 48 2 1 (0.95833333 0.02083333 0.02083333) *
## 5) TST< 5.2 7 0 3 (0.00000000 0.00000000 1.00000000) *
## 3) TST>=12.8 10 0 2 (0.00000000 1.00000000 0.00000000) *
Visualization of the tree for test data set
rpart.plot(tree_test)

Make prediction for train and test data set
treeTrain=predict(tree1,train.thayroid,type="class")
treeTest=predict(tree_test,test.thayroid,type="class")
Construct a confusion matrix for train data set
tab_train=table(train.thayroid$CLASS,treeTrain)
tab_train
## treeTrain
## 1 2 3
## 1 101 2 1
## 2 5 19 0
## 3 2 0 20
Construct confusion matrix for test data set
tab_test=table(test.thayroid$CLASS,treeTest)
tab_test
## treeTest
## 1 2 3
## 1 46 0 0
## 2 1 10 0
## 3 1 0 7
Calculate accuracy for training data set
n_train=sum(tab_train)
nc_train = nrow(tab_train)
diag_train = diag(tab_train)
rowsums_train = apply(tab_train, 1, sum)
colsums_train = apply(tab_train, 2, sum)
p_train = rowsums_train / n_train
q_train = colsums_train / n_train
accuracy_train=sum(diag_train)/n_train
accuracy_train
## [1] 0.9333333
Calculate accuracy for test data
n_test=sum(tab_test)
nc_test = nrow(tab_test)
diag_test = diag(tab_test)
rowsums_test = apply(tab_test, 1, sum)
colsums_test = apply(tab_test, 2, sum)
p_test = rowsums_test / n_test
q_test = colsums_test / n_test
accuracy_test=sum(diag_test)/n_test
accuracy_test
## [1] 0.9692308
Accuracy Calculation of training data set while minsplit=30 and minbucket=10
fit <- rpart(CLASS~., data = train.thayroid, method = 'class')
accuracy_tune <- function(fit) {
predict_unseen <- predict(fit, test.thayroid, type = 'class')
table_mat <- table(test.thayroid$CLASS, predict_unseen)
accuracy_Test <- sum(diag(table_mat)) / sum(table_mat)
accuracy_Test
}
control <- rpart.control(minsplit = 30,
minbucket = round(30/ 3),
maxdepth = 3,
cp = 0)
tune_fit_train <- rpart(CLASS~., data = train.thayroid, method = 'class', control = control)
accuracy_tune(tune_fit_train)
## [1] 0.9384615
Accuracy Calculation of test data set while minsplit=30 and minbucket=10
control <- rpart.control(minsplit = 30,
minbucket = round(30/ 3),
maxdepth = 3,
cp = 0)
tune_fit_test <- rpart(CLASS~., data = test.thayroid, method = 'class', control = control)
accuracy_tune(tune_fit_test)
## [1] 0.9230769
Classification Problem: KNN Classifier
Make prediction for the train and test data set
trainpred_n = knn(train.thayroid[,1:3], train.thayroid[,1:3], factor(train.thayroid$CLASS), 10)
testpred_n = knn(train.thayroid[,1:3], test.thayroid[,1:3], factor(train.thayroid$CLASS), 10)
Construct confusion matrix for train and test data set
tab_knn_n=table(train.thayroid$CLASS,trainpred_n)
tab_knn_n
## trainpred_n
## 1 2 3
## 1 104 0 0
## 2 7 17 0
## 3 7 0 15
tab_knn_test_n=table(test.thayroid$CLASS,testpred_n)
tab_knn_test_n
## testpred_n
## 1 2 3
## 1 46 0 0
## 2 4 7 0
## 3 2 0 6
Calculate accuracy for training data set
normalise_train_knn=sum(tab_knn_n)
norm_nc_train_knn = nrow(tab_knn_n)
normalise_diag_train_knn = diag(tab_knn_n)
normalise_knn_accuracy_train=sum(normalise_diag_train_knn)/normalise_train_knn
normalise_knn_accuracy_train
## [1] 0.9066667
Calculate accuracy for test data set
normalise_test_knn=sum(tab_knn_test_n)
norm_nc_test_knn = nrow(tab_knn_test_n)
normalise_diag_test_knn = diag(tab_knn_test_n)
normalise_knn_accuracy_test=sum(normalise_diag_test_knn)/normalise_test_knn
normalise_knn_accuracy_test
## [1] 0.9076923
Alternate way for looking at the accuracy
norm_KNNTrain_m=mean(train.thayroid$CLASS==trainpred_n)
norm_KNNTest_m=mean(test.thayroid$CLASS==testpred_n)
Calculate error
error1=1-mean(train.thayroid$CLASS==trainpred_n)
error2=1-mean(test.thayroid$CLASS==testpred_n)
Construct a loop to see how the number of neighbors impacts the training and test performance
norm_KNNTrain_m=matrix(0,nrow=50,ncol=200)
norm_KNNTest_m=matrix(0,nrow=50,ncol=200)
for(i in 1:50){
for(j in 1:200){
dat.d <- sample(1:nrow(thayroid_normalise_data),size=nrow(thayroid_normalise_data)*0.7,replace = FALSE)
train.thayroid<- thayroid_data[dat.d,]
test.thayroid <- thayroid_data[-dat.d,]
trainpred = knn(train.thayroid[,1:3], train.thayroid[,1:3], factor(train.thayroid$CLASS), i)
testpred = knn(train.thayroid[,1:3], test.thayroid[,1:3], factor(train.thayroid$CLASS), i)
trainpred_n = knn(train.thayroid[,1:3], train.thayroid[,1:3], factor(train.thayroid$CLASS), i)
testpred_n = knn(train.thayroid[,1:3], test.thayroid[,1:3], factor(train.thayroid$CLASS), i)
norm_KNNTrain_m[i,j]=mean(train.thayroid$CLASS==trainpred_n)
norm_KNNTest_m[i,j]=mean(test.thayroid$CLASS==testpred_n)
}
}
norm_KNNTrain_m=rowMeans(norm_KNNTrain_m)
norm_KNNTest_m=rowMeans(norm_KNNTest_m)
norm_KNNTrain_m
## [1] 1.0000000 0.9718333 0.9654000 0.9473667 0.9348667 0.9247333 0.9175667
## [8] 0.9095333 0.9063667 0.9014333 0.8997333 0.8943333 0.8872667 0.8824333
## [15] 0.8810000 0.8747333 0.8730333 0.8674333 0.8623333 0.8596000 0.8556333
## [22] 0.8479000 0.8452333 0.8413333 0.8309000 0.8249667 0.8180333 0.8085667
## [29] 0.8039667 0.7953333 0.7891667 0.7784667 0.7701667 0.7614000 0.7535000
## [36] 0.7457333 0.7368333 0.7284667 0.7229667 0.7140667 0.7099333 0.7097333
## [43] 0.7076000 0.7037667 0.7049667 0.6996000 0.7021333 0.6986333 0.6979000
## [50] 0.6968333
norm_KNNTest_m
## [1] 0.9454615 0.9220000 0.9192308 0.9108462 0.9020000 0.8951538 0.8931538
## [8] 0.8923846 0.8926923 0.8872308 0.8802308 0.8793077 0.8781538 0.8701538
## [15] 0.8633077 0.8631538 0.8590000 0.8483846 0.8469231 0.8445385 0.8365385
## [22] 0.8333846 0.8256923 0.8282308 0.8140000 0.8113077 0.7907692 0.7931538
## [29] 0.7876923 0.7785385 0.7731538 0.7549231 0.7508462 0.7548462 0.7328462
## [36] 0.7313077 0.7209231 0.7187692 0.7170769 0.7090769 0.7067692 0.7043846
## [43] 0.7010000 0.7033846 0.6988462 0.7025385 0.6938462 0.7020769 0.7020769
## [50] 0.7017692
Store the result as dataframe
resultsdf_norm=data.frame(neighbors=1:50,train_perf1=norm_KNNTrain_m, test_perf1=norm_KNNTest_m)
Visualization of Train and Test Accuracy for different k-neighbors(red=test data, blue=train data)
ggplot(resultsdf_norm,aes(x=neighbors, y=train_perf1))+
geom_line(col='blue')+
geom_line(aes(x=neighbors, y=test_perf1),col='red')

Recommendation
### Data scientist should focuS on the size of Minbucket and k value for choosing best accuracy for training and testing data set.