# Load the packages used in this analysis; all required libraries are listed in this section for clarity
library('ggplot2') # visualization
## Warning: package 'ggplot2' was built under R version 3.4.1
library('ggthemes') # visualization
## Warning: package 'ggthemes' was built under R version 3.4.1
library('scales') # visualization
## Warning: package 'scales' was built under R version 3.4.3
library('dplyr') # data manipulation
## Warning: package 'dplyr' was built under R version 3.4.2
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library('mice') # imputation
## Warning: package 'mice' was built under R version 3.4.2
## Loading required package: lattice
library('randomForest') # classification algorithm
## Warning: package 'randomForest' was built under R version 3.4.1
## randomForest 4.6-12
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:dplyr':
##
## combine
## The following object is masked from 'package:ggplot2':
##
## margin
library('rpart') # for decision tree
## Warning: package 'rpart' was built under R version 3.4.3
library('ROCR')
## Warning: package 'ROCR' was built under R version 3.4.1
## Loading required package: gplots
## Warning: package 'gplots' was built under R version 3.4.1
##
## Attaching package: 'gplots'
## The following object is masked from 'package:stats':
##
## lowess
library('ROCR')
library('randomForest')
library('corrr')
## Warning: package 'corrr' was built under R version 3.4.1
library('corrplot')
## Warning: package 'corrplot' was built under R version 3.4.2
## corrplot 0.84 loaded
library('glue')
## Warning: package 'glue' was built under R version 3.4.2
##
## Attaching package: 'glue'
## The following object is masked from 'package:dplyr':
##
## collapse
library('caTools')
## Warning: package 'caTools' was built under R version 3.4.1
library('data.table')
## Warning: package 'data.table' was built under R version 3.4.2
##
## Attaching package: 'data.table'
## The following objects are masked from 'package:dplyr':
##
## between, first, last
require("knitr")
## Loading required package: knitr
## Warning: package 'knitr' was built under R version 3.4.3
require("geosphere")
## Loading required package: geosphere
## Warning: package 'geosphere' was built under R version 3.4.2
require("gmapsdistance")
## Loading required package: gmapsdistance
## Warning: package 'gmapsdistance' was built under R version 3.4.2
require("tidyr")
## Loading required package: tidyr
## Warning: package 'tidyr' was built under R version 3.4.2
##
## Attaching package: 'tidyr'
## The following object is masked from 'package:mice':
##
## complete
library('corrplot')
#source("distance.R")
library('car')
## Warning: package 'car' was built under R version 3.4.2
##
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
##
## recode
library('caret')
## Warning: package 'caret' was built under R version 3.4.3
library('gclus')
## Warning: package 'gclus' was built under R version 3.4.1
## Loading required package: cluster
## Warning: package 'cluster' was built under R version 3.4.2
library('visdat')
## Warning: package 'visdat' was built under R version 3.4.1
library('psych')
## Warning: package 'psych' was built under R version 3.4.2
##
## Attaching package: 'psych'
## The following object is masked from 'package:car':
##
## logit
## The following object is masked from 'package:randomForest':
##
## outlier
## The following objects are masked from 'package:scales':
##
## alpha, rescale
## The following objects are masked from 'package:ggplot2':
##
## %+%, alpha
library('leaflet')
## Warning: package 'leaflet' was built under R version 3.4.1
library('leaflet.extras')
## Warning: package 'leaflet.extras' was built under R version 3.4.1
library("PerformanceAnalytics")
## Warning: package 'PerformanceAnalytics' was built under R version 3.4.3
## Loading required package: xts
## Warning: package 'xts' was built under R version 3.4.3
## Loading required package: zoo
## Warning: package 'zoo' was built under R version 3.4.3
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
##
## Attaching package: 'xts'
## The following object is masked from 'package:leaflet':
##
## addLegend
## The following objects are masked from 'package:data.table':
##
## first, last
## The following objects are masked from 'package:dplyr':
##
## first, last
##
## Attaching package: 'PerformanceAnalytics'
## The following object is masked from 'package:gplots':
##
## textplot
## The following object is masked from 'package:graphics':
##
## legend
library('GPArotation')
## Warning: package 'GPArotation' was built under R version 3.4.1
library('MVN')
## Warning: package 'MVN' was built under R version 3.4.2
## sROC 0.1-2 loaded
##
## Attaching package: 'MVN'
## The following object is masked from 'package:psych':
##
## mardia
library('psych')
library('MASS')
## Warning: package 'MASS' was built under R version 3.4.3
##
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
##
## select
library('psy')
## Warning: package 'psy' was built under R version 3.4.1
##
## Attaching package: 'psy'
## The following object is masked from 'package:psych':
##
## wkappa
library('corpcor')
## Warning: package 'corpcor' was built under R version 3.4.1
library('fastmatch')
## Warning: package 'fastmatch' was built under R version 3.4.1
##
## Attaching package: 'fastmatch'
## The following object is masked from 'package:dplyr':
##
## coalesce
library('plyr')
## Warning: package 'plyr' was built under R version 3.4.1
## -------------------------------------------------------------------------
## You have loaded plyr after dplyr - this is likely to cause problems.
## If you need functions from both plyr and dplyr, please load plyr first, then dplyr:
## library(plyr); library(dplyr)
## -------------------------------------------------------------------------
##
## Attaching package: 'plyr'
## The following objects are masked from 'package:dplyr':
##
## arrange, count, desc, failwith, id, mutate, rename, summarise,
## summarize
library('car')
library("PerformanceAnalytics")
library('ggcorrplot')
## Warning: package 'ggcorrplot' was built under R version 3.4.2
library('cluster')
library('caTools')
library('rpart')
library('rpart.plot')
## Warning: package 'rpart.plot' was built under R version 3.4.3
library('rattle')
## Warning: package 'rattle' was built under R version 3.4.2
## Rattle: A free graphical interface for data science with R.
## Version 5.1.0 Copyright (c) 2006-2017 Togaware Pty Ltd.
## Type 'rattle()' to shake, rattle, and roll your data.
##
## Attaching package: 'rattle'
## The following object is masked from 'package:randomForest':
##
## importance
library('RColorBrewer')
## Warning: package 'RColorBrewer' was built under R version 3.4.1
library('data.table')
library('ROCR')
library('maptree')
## Warning: package 'maptree' was built under R version 3.4.2
library('tree')
## Warning: package 'tree' was built under R version 3.4.3
library('dummies') # for converting categorical into dummy one
## Warning: package 'dummies' was built under R version 3.4.1
## dummies-1.5.6 provided by Decision Patterns
library('caret')
library('pscl') ## for McFadden R2
## Warning: package 'pscl' was built under R version 3.4.3
## Classes and Methods for R developed in the
## Political Science Computational Laboratory
## Department of Political Science
## Stanford University
## Simon Jackman
## hurdle and zeroinfl functions by Achim Zeileis
library('randomForest')
library('StatMeasures')
## Warning: package 'StatMeasures' was built under R version 3.4.3
library('sqldf')
## Warning: package 'sqldf' was built under R version 3.4.3
## Loading required package: gsubfn
## Warning: package 'gsubfn' was built under R version 3.4.1
## Loading required package: proto
## Warning: package 'proto' was built under R version 3.4.1
## Loading required package: RSQLite
## Warning: package 'RSQLite' was built under R version 3.4.1
library('class')
## Warning: package 'class' was built under R version 3.4.3
library('caret')
library('sjPlot')
## Warning: package 'sjPlot' was built under R version 3.4.3
Load the data, which is in CSV format, and remove the unnecessary column (Id). Then take a basic first look at the data with a summary.
# Read the prostate-cancer data set from the working directory.
# stringsAsFactors = TRUE is set explicitly: R >= 4.0.0 changed the default to
# FALSE, while the summary printed in this report (B:38 / M:62) shows
# diagnosis_result being handled as a factor.
medical_data <- read.csv("Prostate_Cancer.csv", stringsAsFactors = TRUE)
# Drop the first column (the Id column), which adds no value to the analysis.
medical_data <- medical_data[, -1]
# Per-column overview: class counts for the label, quartiles for the numerics.
summary(medical_data)
## diagnosis_result radius texture perimeter
## B:38 Min. : 9.00 Min. :11.00 Min. : 52.00
## M:62 1st Qu.:12.00 1st Qu.:14.00 1st Qu.: 82.50
## Median :17.00 Median :17.50 Median : 94.00
## Mean :16.85 Mean :18.23 Mean : 96.78
## 3rd Qu.:21.00 3rd Qu.:22.25 3rd Qu.:114.25
## Max. :25.00 Max. :27.00 Max. :172.00
## area smoothness compactness symmetry
## Min. : 202.0 Min. :0.0700 Min. :0.0380 Min. :0.1350
## 1st Qu.: 476.8 1st Qu.:0.0935 1st Qu.:0.0805 1st Qu.:0.1720
## Median : 644.0 Median :0.1020 Median :0.1185 Median :0.1900
## Mean : 702.9 Mean :0.1027 Mean :0.1267 Mean :0.1932
## 3rd Qu.: 917.0 3rd Qu.:0.1120 3rd Qu.:0.1570 3rd Qu.:0.2090
## Max. :1878.0 Max. :0.1430 Max. :0.3450 Max. :0.3040
## fractal_dimension
## Min. :0.05300
## 1st Qu.:0.05900
## Median :0.06300
## Mean :0.06469
## 3rd Qu.:0.06900
## Max. :0.09700
Analyzing the data further shows that the classes are imbalanced (38 B vs. 62 M).
# Count how many observations fall into each diagnosis class.
table(medical_data[["diagnosis_result"]])
##
## B M
## 38 62
We need to scale the data because kNN relies on Euclidean distance calculations, so variables measured on larger scales would otherwise dominate the distance.
# Build the modelling data set: the diagnosis label stays in the first column
# and the numeric predictors (columns 2 to 9) are standardised with scale(),
# which by default centres each column and divides it by its standard deviation.
medical <- data.frame(
  medical_data$diagnosis_result,
  scale(medical_data[, 2:9])
)
# medical <- data.frame(medical_data_scaled , medical_data$diagnosis_result)
# Check the result of the scaling.
summary(medical)
## medical_data.diagnosis_result radius texture
## B:38 Min. :-1.60891 Min. :-1.3923
## M:62 1st Qu.:-0.99404 1st Qu.:-0.8146
## Median : 0.03074 Median :-0.1406
## Mean : 0.00000 Mean : 0.0000
## 3rd Qu.: 0.85057 3rd Qu.: 0.7741
## Max. : 1.67039 Max. : 1.6888
## perimeter area smoothness compactness
## Min. :-1.8914 Min. :-1.5667 Min. :-2.23539 Min. :-1.4507
## 1st Qu.:-0.6031 1st Qu.:-0.7073 1st Qu.:-0.63039 1st Qu.:-0.7556
## Median :-0.1174 Median :-0.1842 Median :-0.04986 Median :-0.1341
## Mean : 0.0000 Mean : 0.0000 Mean : 0.00000 Mean : 0.0000
## 3rd Qu.: 0.7379 3rd Qu.: 0.6697 3rd Qu.: 0.63312 3rd Qu.: 0.4956
## Max. : 3.1770 Max. : 3.6756 Max. : 2.75035 Max. : 3.5703
## symmetry fractal_dimension
## Min. :-1.8896 Min. :-1.4342
## 1st Qu.:-0.6877 1st Qu.:-0.6981
## Median :-0.1030 Median :-0.2073
## Mean : 0.0000 Mean : 0.0000
## 3rd Qu.: 0.5142 3rd Qu.: 0.5288
## Max. : 3.6001 Max. : 3.9639
Now let's randomly split the data into training and test sets. A blank column index in the subsetting statements below (e.g. `medical[train_ind, ]`) indicates that all columns should be included. Our target variable is ‘diagnosis_result’, which is also included in the training and test data sets and will be compared with the predicted response later.
# Random 75% / 25% split into training and test sets.
set.seed(123) # fixed seed so the same rows are drawn on every run
# Training-set size: 75% of the rows, rounded down.
smp_size <- floor(0.75 * nrow(medical))
# Row numbers drawn (without replacement) for the training set.
train_ind <- sample.int(nrow(medical), size = smp_size)
train_medical <- medical[train_ind, ]
# Every row that was not drawn goes to the test set.
test_medical <- medical[-train_ind, ]
# Disabled: backups taken before the label column is removed.
# train_medical_backup <- train_medical
# test_medical_backup <- test_medical
Now let's train our model with the training data. Note that I have dropped the first column here, because it is a string column that stores the result, so KNN does not require it for the distance calculation. The KNN function classifies data points by calculating the Euclidean distance between the points. That’s a mathematical calculation requiring numbers. All variables in KNN must therefore be coercible to numerics. The data preparation for KNN often involves three tasks: handling missing values, converting categorical variables to numeric (dummy) variables, and scaling the variables so that no single variable dominates the distance.
The value for k is generally chosen as the square root of the number of observations.
# Class labels (first column) of the training and test rows. They are kept as
# separate label vectors so the predictions can be compared with them later.
prc_train_labels <- train_medical[, 1]
prc_test_labels <- test_medical[, 1]
# First kNN fit. k follows the rule of thumb "square root of the number of
# observations"; it must be a whole number of neighbours, so the root is
# floored explicitly (sqrt(100) = 10 for this data set).
# The label column is dropped from both feature sets with [, -1].
prc_test_pred <- knn(
  train = train_medical[, -1],
  test = test_medical[, -1],
  cl = prc_train_labels,
  k = floor(sqrt(nrow(medical_data))),
  prob = FALSE
)
Now it is time to compare the model performance
# Cross-tabulate actual (rows) against predicted (columns) classes.
Medicaldatatable <- table(
  actualclass = prc_test_labels,
  predictedclass = prc_test_pred
)
# caret::confusionMatrix() reads a table as predictions in rows and reference
# (actual) classes in columns, so the table is transposed before it is passed
# in. Without this, sensitivity, specificity, predictive values and prevalence
# are computed with the roles of actual and predicted swapped.
# NOTE: the console output printed in this report was generated from the
# untransposed table; accuracy and kappa are unaffected, the per-class
# statistics are not.
MedicaldatatableconfusionVImatrix <- confusionMatrix(t(Medicaldatatable))
print(MedicaldatatableconfusionVImatrix)
## Confusion Matrix and Statistics
##
## predictedclass
## actualclass B M
## B 5 0
## M 5 15
##
## Accuracy : 0.8
## 95% CI : (0.593, 0.9317)
## No Information Rate : 0.6
## P-Value [Acc > NIR] : 0.02936
##
## Kappa : 0.5455
## Mcnemar's Test P-Value : 0.07364
##
## Sensitivity : 0.50
## Specificity : 1.00
## Pos Pred Value : 1.00
## Neg Pred Value : 0.75
## Prevalence : 0.40
## Detection Rate : 0.20
## Detection Prevalence : 0.20
## Balanced Accuracy : 0.75
##
## 'Positive' Class : B
##
# Draw the elbow plot for the predictor columns (label column dropped); the
# author reads an optimal k value of 7 or 8 from it.
# NOTE(review): this runs on the unscaled medical_data, whereas knn() is fitted
# on the scaled `medical` data -- confirm this is intended.
# NOTE(review): an elbow plot is normally used to choose a number of clusters,
# which is not the same quantity as kNN's neighbour count -- verify the
# reasoning before relying on this k.
sjc.elbow(medical_data[,-1])
Now let's run the model with k = 8 and compare the prediction accuracy. Note that the output below reports an accuracy of 80%, the same as for the first model, rather than the improvement to 88% originally stated here.
# Refit kNN on the same training/test split, this time with k = 8 neighbours.
prc_test_pred_revised <- knn(
  train = train_medical[, -1],
  test = test_medical[, -1],
  cl = prc_train_labels,
  k = 8,
  prob = FALSE
)
# Cross-tabulate actual (rows) against predicted (columns) classes.
MedicaldatatableRev <- table(
  actualclass = prc_test_labels,
  predictedclass = prc_test_pred_revised
)
# caret::confusionMatrix() reads a table as predictions in rows and reference
# (actual) classes in columns, so the table is transposed before it is passed
# in; otherwise the per-class statistics are computed with the roles swapped.
# NOTE: the console output printed in this report was generated from the
# untransposed table.
MedicaldatatableconfusionVImatrixRev <- confusionMatrix(t(MedicaldatatableRev))
print(MedicaldatatableconfusionVImatrixRev)
## Confusion Matrix and Statistics
##
## predictedclass
## actualclass B M
## B 5 0
## M 5 15
##
## Accuracy : 0.8
## 95% CI : (0.593, 0.9317)
## No Information Rate : 0.6
## P-Value [Acc > NIR] : 0.02936
##
## Kappa : 0.5455
## Mcnemar's Test P-Value : 0.07364
##
## Sensitivity : 0.50
## Specificity : 1.00
## Pos Pred Value : 1.00
## Neg Pred Value : 0.75
## Prevalence : 0.40
## Detection Rate : 0.20
## Detection Prevalence : 0.20
## Balanced Accuracy : 0.75
##
## 'Positive' Class : B
##