data <- read.csv("Wholesale_customers_data.csv")
summary(data)
##     Channel          Region          Fresh             Milk      
##  Min.   :1.000   Min.   :1.000   Min.   :     3   Min.   :   55  
##  1st Qu.:1.000   1st Qu.:2.000   1st Qu.:  3128   1st Qu.: 1533  
##  Median :1.000   Median :3.000   Median :  8504   Median : 3627  
##  Mean   :1.323   Mean   :2.543   Mean   : 12000   Mean   : 5796  
##  3rd Qu.:2.000   3rd Qu.:3.000   3rd Qu.: 16934   3rd Qu.: 7190  
##  Max.   :2.000   Max.   :3.000   Max.   :112151   Max.   :73498  
##     Grocery          Frozen        Detergents_Paper    Delicassen     
##  Min.   :    3   Min.   :   25.0   Min.   :    3.0   Min.   :    3.0  
##  1st Qu.: 2153   1st Qu.:  742.2   1st Qu.:  256.8   1st Qu.:  408.2  
##  Median : 4756   Median : 1526.0   Median :  816.5   Median :  965.5  
##  Mean   : 7951   Mean   : 3071.9   Mean   : 2881.5   Mean   : 1524.9  
##  3rd Qu.:10656   3rd Qu.: 3554.2   3rd Qu.: 3922.0   3rd Qu.: 1820.2  
##  Max.   :92780   Max.   :60869.0   Max.   :40827.0   Max.   :47943.0
str(data)
## 'data.frame':    440 obs. of  8 variables:
##  $ Channel         : int  2 2 2 1 2 2 2 2 1 2 ...
##  $ Region          : int  3 3 3 3 3 3 3 3 3 3 ...
##  $ Fresh           : int  12669 7057 6353 13265 22615 9413 12126 7579 5963 6006 ...
##  $ Milk            : int  9656 9810 8808 1196 5410 8259 3199 4956 3648 11093 ...
##  $ Grocery         : int  7561 9568 7684 4221 7198 5126 6975 9426 6192 18881 ...
##  $ Frozen          : int  214 1762 2405 6404 3915 666 480 1669 425 1159 ...
##  $ Detergents_Paper: int  2674 3293 3516 507 1777 1795 3140 3321 1716 7425 ...
##  $ Delicassen      : int  1338 1776 7844 1788 5185 1451 545 2566 750 2098 ...
#Part I

#Requires some data frame and the top N to remove
data <- read.csv("Wholesale_customers_data.csv")
summary(data)
##     Channel          Region          Fresh             Milk      
##  Min.   :1.000   Min.   :1.000   Min.   :     3   Min.   :   55  
##  1st Qu.:1.000   1st Qu.:2.000   1st Qu.:  3128   1st Qu.: 1533  
##  Median :1.000   Median :3.000   Median :  8504   Median : 3627  
##  Mean   :1.323   Mean   :2.543   Mean   : 12000   Mean   : 5796  
##  3rd Qu.:2.000   3rd Qu.:3.000   3rd Qu.: 16934   3rd Qu.: 7190  
##  Max.   :2.000   Max.   :3.000   Max.   :112151   Max.   :73498  
##     Grocery          Frozen        Detergents_Paper    Delicassen     
##  Min.   :    3   Min.   :   25.0   Min.   :    3.0   Min.   :    3.0  
##  1st Qu.: 2153   1st Qu.:  742.2   1st Qu.:  256.8   1st Qu.:  408.2  
##  Median : 4756   Median : 1526.0   Median :  816.5   Median :  965.5  
##  Mean   : 7951   Mean   : 3071.9   Mean   : 2881.5   Mean   : 1524.9  
##  3rd Qu.:10656   3rd Qu.: 3554.2   3rd Qu.: 3922.0   3rd Qu.: 1820.2  
##  Max.   :92780   Max.   :60869.0   Max.   :40827.0   Max.   :47943.0
str(data)
## 'data.frame':    440 obs. of  8 variables:
##  $ Channel         : int  2 2 2 1 2 2 2 2 1 2 ...
##  $ Region          : int  3 3 3 3 3 3 3 3 3 3 ...
##  $ Fresh           : int  12669 7057 6353 13265 22615 9413 12126 7579 5963 6006 ...
##  $ Milk            : int  9656 9810 8808 1196 5410 8259 3199 4956 3648 11093 ...
##  $ Grocery         : int  7561 9568 7684 4221 7198 5126 6975 9426 6192 18881 ...
##  $ Frozen          : int  214 1762 2405 6404 3915 666 480 1669 425 1159 ...
##  $ Detergents_Paper: int  2674 3293 3516 507 1777 1795 3140 3321 1716 7425 ...
##  $ Delicassen      : int  1338 1776 7844 1788 5185 1451 545 2566 750 2098 ...
top.n.custs <- function (data,cols,n=5) {
#Initialize a vector to hold customers being removed
idx.to.remove <-integer(0)
for (c in cols){
# For every column in the data we passed to this function
#Sort column "c" in descending order (bigger on top)
#Order returns the sorted index (e.g. row 15, 3, 7, 1, ...) rather than the actual values sorted.
col.order <-order(data[,c],decreasing=T)
#Take the first n of the sorted column C to
#combine and de-duplicate the row ids that need to be removed
idx <-head(col.order, n)
idx.to.remove <-union(idx.to.remove,idx)
}
#Return the indexes of customers to be removed
return(idx.to.remove)
}


##Data Pre-processing
#How Many Customers to be Removed?
top.custs <-top.n.custs(data, cols = 1:5,n=5)
length(top.custs)
## [1] 18
#Examine the customers
data[top.custs,]
##     Channel Region  Fresh  Milk Grocery Frozen Detergents_Paper Delicassen
## 1         2      3  12669  9656    7561    214             2674       1338
## 2         2      3   7057  9810    9568   1762             3293       1776
## 3         2      3   6353  8808    7684   2405             3516       7844
## 5         2      3  22615  5410    7198   3915             1777       5185
## 6         2      3   9413  8259    5126    666             1795       1451
## 4         1      3  13265  1196    4221   6404              507       1788
## 182       1      3 112151 29627   18148  16745             4948       8550
## 126       1      3  76237  3473    7102  16538              778        918
## 285       1      3  68951  4411   12609   8692              751       2406
## 40        1      3  56159   555     902  10002              212       2916
## 259       1      1  56083  4563    2124   6422              730       3321
## 87        2      3  22925 73498   32114    987            20070        903
## 48        2      3  44466 54259   55571   7782            24171       6465
## 86        2      3  16117 46197   92780   1026            40827       2944
## 184       1      3  36847 43950   20170  36534              239      47943
## 62        2      3  35942 38369   59598   3254            26701       2017
## 334       2      2   8565  4980   67298    131            38102       1215
## 66        2      3     85 20959   45828     36            24231       1423
#Remove the Customers
data.rm.top<-data[-c(top.custs),]
#Examine summary stats for the remaining data
print(summary(data.rm.top))
##     Channel         Region          Fresh            Milk          Grocery     
##  Min.   :1.00   Min.   :1.000   Min.   :    3   Min.   :   55   Min.   :    3  
##  1st Qu.:1.00   1st Qu.:2.000   1st Qu.: 3072   1st Qu.: 1497   1st Qu.: 2132  
##  Median :1.00   Median :3.000   Median : 8130   Median : 3582   Median : 4603  
##  Mean   :1.31   Mean   :2.531   Mean   :11076   Mean   : 5172   Mean   : 7211  
##  3rd Qu.:2.00   3rd Qu.:3.000   3rd Qu.:16251   3rd Qu.: 6962   3rd Qu.:10391  
##  Max.   :2.00   Max.   :3.000   Max.   :56082   Max.   :36423   Max.   :39694  
##      Frozen        Detergents_Paper    Delicassen     
##  Min.   :   25.0   Min.   :    3.0   Min.   :    3.0  
##  1st Qu.:  738.8   1st Qu.:  255.2   1st Qu.:  398.0  
##  Median : 1487.5   Median :  799.5   Median :  904.5  
##  Mean   : 2910.3   Mean   : 2541.6   Mean   : 1352.0  
##  3rd Qu.: 3428.0   3rd Qu.: 3879.2   3rd Qu.: 1752.2  
##  Max.   :60869.0   Max.   :19410.0   Max.   :16523.0
#Set the seed for reproducibility

set.seed(76964057)
#Try K from 2 to 20
rng<-2:20
#Number of times to run the K Means algorithm
tries <-100
#Set up an empty vector to hold all of points
avg.totw.ss <-integer(length(rng))
avg.totb.ss <- integer(length(rng))
avg.tot.ss <- integer(length(rng))
# For each value of the range variable
for(v in rng){
#Set up an empty vectors to hold the tries

  v.totw.ss <-integer(tries)
b.totb.ss <- integer(tries)
tot.ss <- integer(tries)
#Run kmeans
for(i in 1:tries){
k.temp <-kmeans(data.rm.top,centers=v)
#Store the total withinss
v.totw.ss[i] <-k.temp$tot.withinss
#Store the betweenss
b.totb.ss[i] <- k.temp$betweenss
#Store the total sum of squares
tot.ss[i] <- k.temp$totss
}
#Average the withinss and betweenss
avg.totw.ss[v-1] <-mean(v.totw.ss)
avg.totb.ss[v-1] <-mean(b.totb.ss)
avg.tot.ss[v-1] <- mean(tot.ss)
}
plot(rng,avg.totw.ss,type="b", main="Total Within SS by Various K",
ylab="Average Total Within Sum of Squares",
xlab="Value of K")

plot(rng,avg.totb.ss,type="b", main="Total between SS by Various K",

ylab="Average Total Between Sum of Squares",
xlab="Value of K")

#Plot the ratio of betweenss/total ss and withinss / total ss for evaluation
plot(rng,avg.totb.ss/avg.tot.ss,type="b", main="Ratio of between ss / the total ss by Various K",
ylab="Ratio Between SS / Total SS",
xlab="Value of K")
abline(h=0.85, col="red")

plot(rng,avg.totw.ss/avg.tot.ss,type="b", main="Ratio of within ss / the total ss by Various K",
ylab="Ratio Between SS / Total SS",
xlab="Value of K")
abline(h=0.15, col="red")

##Creating the best number of clusters

#Create the best number of clusters, Remove columns 1 and 2

##Will take k value =5

k <-kmeans(data.rm.top[,-c(1,2)], centers=5)
#Display&nbsp;cluster centers
print(k$centers)
##       Fresh      Milk   Grocery   Frozen Detergents_Paper Delicassen
## 1  4344.942  8698.233 12642.663 1430.802        5375.9302  1430.2442
## 2 37275.906  5219.844  5850.094 6883.125         824.8125  2169.3438
## 3  6910.586 17347.655 25997.931 2053.552       11446.7586  2569.4138
## 4 18857.735  3371.235  4775.049 3492.618        1131.5196  1547.4020
## 5  5686.942  2429.746  3049.092 2711.133         788.6705   842.6127
#Give a count of data points in each cluster
print(table(k$cluster))
## 
##   1   2   3   4   5 
##  86  32  29 102 173
#Generate a plot of the clusters
library(cluster)
clusplot(data.rm.top, k$cluster, main='2D representation of the Cluster solution',
color=TRUE, shade=TRUE,
labels=2, lines=0)

##Part 2 - K means for Wine dataset

This R script will perform a k-Means analysis on the wine dataset from UCI. This dataset has 13 chemical measurements on 178 observations of Italian wine.

Pre-work

#Plot the within (cluster) sum of squares to determine the initial value for "k"
wssplot <- function(data, nc=15, seed=1234){
wss <- (nrow(data)-1)*sum(apply(data,2,var))
for (i in 2:nc){
set.seed(seed)
wss[i] <- sum(kmeans(data, centers=i)$withinss)}
plot(1:nc, wss, type="b", xlab="Number of Clusters",
ylab="Within groups sum of squares")}

#Loading the data

wine <- read.csv("wine.csv")
str(wine)
## 'data.frame':    178 obs. of  14 variables:
##  $ Wine                : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ Alcohol             : num  14.2 13.2 13.2 14.4 13.2 ...
##  $ Malic.acid          : num  1.71 1.78 2.36 1.95 2.59 1.76 1.87 2.15 1.64 1.35 ...
##  $ Ash                 : num  2.43 2.14 2.67 2.5 2.87 2.45 2.45 2.61 2.17 2.27 ...
##  $ Acl                 : num  15.6 11.2 18.6 16.8 21 15.2 14.6 17.6 14 16 ...
##  $ Mg                  : int  127 100 101 113 118 112 96 121 97 98 ...
##  $ Phenols             : num  2.8 2.65 2.8 3.85 2.8 3.27 2.5 2.6 2.8 2.98 ...
##  $ Flavanoids          : num  3.06 2.76 3.24 3.49 2.69 3.39 2.52 2.51 2.98 3.15 ...
##  $ Nonflavanoid.phenols: num  0.28 0.26 0.3 0.24 0.39 0.34 0.3 0.31 0.29 0.22 ...
##  $ Proanth             : num  2.29 1.28 2.81 2.18 1.82 1.97 1.98 1.25 1.98 1.85 ...
##  $ Color.int           : num  5.64 4.38 5.68 7.8 4.32 6.75 5.25 5.05 5.2 7.22 ...
##  $ Hue                 : num  1.04 1.05 1.03 0.86 1.04 1.05 1.02 1.06 1.08 1.01 ...
##  $ OD                  : num  3.92 3.4 3.17 3.45 2.93 2.85 3.58 3.58 2.85 3.55 ...
##  $ Proline             : int  1065 1050 1185 1480 735 1450 1290 1295 1045 1045 ...
df <- scale(wine[-1])

#Examine the data frame and plot the within sum of squares
head(df)
##        Alcohol  Malic.acid        Ash        Acl         Mg   Phenols
## [1,] 1.5143408 -0.56066822  0.2313998 -1.1663032 1.90852151 0.8067217
## [2,] 0.2455968 -0.49800856 -0.8256672 -2.4838405 0.01809398 0.5670481
## [3,] 0.1963252  0.02117152  1.1062139 -0.2679823 0.08810981 0.8067217
## [4,] 1.6867914 -0.34583508  0.4865539 -0.8069748 0.92829983 2.4844372
## [5,] 0.2948684  0.22705328  1.8352256  0.4506745 1.27837900 0.8067217
## [6,] 1.4773871 -0.51591132  0.3043010 -1.2860793 0.85828399 1.5576991
##      Flavanoids Nonflavanoid.phenols    Proanth  Color.int        Hue        OD
## [1,]  1.0319081           -0.6577078  1.2214385  0.2510088  0.3611585 1.8427215
## [2,]  0.7315653           -0.8184106 -0.5431887 -0.2924962  0.4049085 1.1103172
## [3,]  1.2121137           -0.4970050  2.1299594  0.2682629  0.3174085 0.7863692
## [4,]  1.4623994           -0.9791134  1.0292513  1.1827317 -0.4263410 1.1807407
## [5,]  0.6614853            0.2261576  0.4002753 -0.3183774  0.3611585 0.4483365
## [6,]  1.3622851           -0.1755994  0.6623487  0.7298108  0.4049085 0.3356589
##          Proline
## [1,]  1.01015939
## [2,]  0.96252635
## [3,]  1.39122370
## [4,]  2.32800680
## [5,] -0.03776747
## [6,]  2.23274072
wssplot(df)

#Start the k-Means analysis using the variable "nc" for the number of clusters
library(NbClust)
set.seed(1234)
nc <- NbClust(df, min.nc=2, max.nc = 15, method = "kmeans")

## *** : The Hubert index is a graphical method of determining the number of clusters.
##                 In the plot of Hubert index, we seek a significant knee that corresponds to a 
##                 significant increase of the value of the measure i.e the significant peak in Hubert
##                 index second differences plot. 
## 

## *** : The D index is a graphical method of determining the number of clusters. 
##                 In the plot of D index, we seek a significant knee (the significant peak in Dindex
##                 second differences plot) that corresponds to a significant increase of the value of
##                 the measure. 
##  
## ******************************************************************* 
## * Among all indices:                                                
## * 2 proposed 2 as the best number of clusters 
## * 19 proposed 3 as the best number of clusters 
## * 1 proposed 14 as the best number of clusters 
## * 1 proposed 15 as the best number of clusters 
## 
##                    ***** Conclusion *****                            
##  
## * According to the majority rule, the best number of clusters is  3 
##  
##  
## *******************************************************************
## *** : The Hubert index is a graphical method of determining the number of clusters.
##In the plot of Hubert index, we seek a significant knee that corresponds to a 
##significant increase of the value of the measure i.e the significant peak in Hubert
##index second differences plot. 
## 

## *** : The D index is a graphical method of determining the number of clusters. 
##                 In the plot of D index, we seek a significant knee (the significant peak in Dindex
##                 second differences plot) that corresponds to a significant increase of the value of
##                 the measure. 
##  
## ******************************************************************* 
## * Among all indices:                                                
## * 4 proposed 2 as the best number of clusters 
## * 15 proposed 3 as the best number of clusters 
## * 1 proposed 10 as the best number of clusters 
## * 1 proposed 12 as the best number of clusters 
## * 1 proposed 14 as the best number of clusters 
## * 1 proposed 15 as the best number of clusters 
## 
##                    ***** Conclusion *****                            
##  
## * According to the majority rule, the best number of clusters is  3 
##  
##  
## *******************************************************************

print(table(nc$Best.n[1,]))
## 
##  0  1  2  3 14 15 
##  2  1  2 19  1  1
barplot(table(nc$Best.n[1,]), xlab = "Number of Clusters", ylab = "Number of Criteria", main = "Number of Clusters Chosen by 26 Criteria")

#Conduct the k-Means analysis using the best number of clusters
##Will take k value as 3
set.seed(1234)
fit.km <- kmeans(df, 3, nstart=25)
print(fit.km$size)
## [1] 62 65 51
print(fit.km$centers)
##      Alcohol Malic.acid        Ash        Acl          Mg     Phenols
## 1  0.8328826 -0.3029551  0.3636801 -0.6084749  0.57596208  0.88274724
## 2 -0.9234669 -0.3929331 -0.4931257  0.1701220 -0.49032869 -0.07576891
## 3  0.1644436  0.8690954  0.1863726  0.5228924 -0.07526047 -0.97657548
##    Flavanoids Nonflavanoid.phenols     Proanth  Color.int        Hue         OD
## 1  0.97506900          -0.56050853  0.57865427  0.1705823  0.4726504  0.7770551
## 2  0.02075402          -0.03343924  0.05810161 -0.8993770  0.4605046  0.2700025
## 3 -1.21182921           0.72402116 -0.77751312  0.9388902 -1.1615122 -1.2887761
##      Proline
## 1  1.1220202
## 2 -0.7517257
## 3 -0.4059428
print(aggregate(wine[-1], by=list(cluster=fit.km$cluster), mean))
##   cluster  Alcohol Malic.acid      Ash      Acl        Mg  Phenols Flavanoids
## 1       1 13.67677   1.997903 2.466290 17.46290 107.96774 2.847581  3.0032258
## 2       2 12.25092   1.897385 2.231231 20.06308  92.73846 2.247692  2.0500000
## 3       3 13.13412   3.307255 2.417647 21.24118  98.66667 1.683922  0.8188235
##   Nonflavanoid.phenols  Proanth Color.int       Hue       OD   Proline
## 1            0.2920968 1.922097  5.453548 1.0654839 3.163387 1100.2258
## 2            0.3576923 1.624154  2.973077 1.0627077 2.803385  510.1692
## 3            0.4519608 1.145882  7.234706 0.6919608 1.696667  619.0588
#Use a confusion or truth table to evaluate how well the k-Means analysis performed
#ct.km <- table(wine$Type, fit.km$cluster)
#print(ct.km)
#Generate a plot of the clusters
library(cluster)
clusplot(df, fit.km$cluster, main='2D representation of the Cluster solution',
color=TRUE, shade=TRUE,
labels=2, lines=0)

##Part 3 - Training a classifier to classify wines using the clusters we obtained in Part 2

Training the model and Evaluate

#Set-up to train a model for classification of wines
library(rpart)

df_rp <- data.frame(k=fit.km$cluster, df)
print(str(df_rp))
## 'data.frame':    178 obs. of  14 variables:
##  $ k                   : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ Alcohol             : num  1.514 0.246 0.196 1.687 0.295 ...
##  $ Malic.acid          : num  -0.5607 -0.498 0.0212 -0.3458 0.2271 ...
##  $ Ash                 : num  0.231 -0.826 1.106 0.487 1.835 ...
##  $ Acl                 : num  -1.166 -2.484 -0.268 -0.807 0.451 ...
##  $ Mg                  : num  1.9085 0.0181 0.0881 0.9283 1.2784 ...
##  $ Phenols             : num  0.807 0.567 0.807 2.484 0.807 ...
##  $ Flavanoids          : num  1.032 0.732 1.212 1.462 0.661 ...
##  $ Nonflavanoid.phenols: num  -0.658 -0.818 -0.497 -0.979 0.226 ...
##  $ Proanth             : num  1.221 -0.543 2.13 1.029 0.4 ...
##  $ Color.int           : num  0.251 -0.292 0.268 1.183 -0.318 ...
##  $ Hue                 : num  0.361 0.405 0.317 -0.426 0.361 ...
##  $ OD                  : num  1.843 1.11 0.786 1.181 0.448 ...
##  $ Proline             : num  1.0102 0.9625 1.3912 2.328 -0.0378 ...
## NULL
#Randomize the dataset
rdf <- df_rp[sample(1:nrow(df_rp)), ]
print(head(rdf))
##     k    Alcohol Malic.acid        Ash        Acl          Mg    Phenols
## 127 2 -0.7028817 -0.7217931 -0.2789084  0.6003946 -0.96212770  0.7108523
## 155 3 -0.5181131 -0.9366262 -0.9714696  0.1512342  0.22814148 -1.3024063
## 72  2  1.0585784 -0.7396958  1.1062139  1.6484357 -0.96212770  1.0463954
## 48  1  1.1078500 -0.5875224 -0.8985684 -1.0465271  0.08810981  1.2860690
## 60  2 -0.7767891 -1.2499245 -3.6688130 -2.6635047 -0.82209603 -0.5034942
## 36  1  0.5904981 -0.4711544  0.1584986  0.3009543  0.01809398  0.6469393
##     Flavanoids Nonflavanoid.phenols    Proanth   Color.int        Hue
## 127  1.1220109            0.2261576  0.3129175 -0.48229163 -1.1700906
## 155 -1.4509256            1.3510772 -0.3335300  1.09646103 -1.6513403
## 72   0.8316795           -1.2201676  0.4876331 -0.72384942  1.7611577
## 48   1.3622851           -1.2201676  0.9593651  0.44943125 -0.2075912
## 60  -1.4609371           -0.6577078 -2.0457425 -1.34068448  0.4049085
## 36   0.9518167           -0.8184106  0.4701615  0.01807806  0.3611585
##             OD    Proline
## 127  0.3215742 -1.2539977
## 155 -1.4953517 -0.3394434
## 72   0.7722845 -1.0698166
## 48   1.0117244  0.7561165
## 60  -1.1150649 -0.7205077
## 36   1.2089101  0.5497067
train <- rdf[1:(as.integer(.8*nrow(rdf))-1), ]
test <- rdf[(as.integer(.8*nrow(rdf))):nrow(rdf), ]
#Train the classifier and plot the results
fit <- rpart(k ~ ., data=train, method="class")
library(rpart.plot)
## Warning: package 'rpart.plot' was built under R version 3.6.3
library(RColorBrewer)
library(rattle)
## Warning: package 'rattle' was built under R version 3.6.3
## Loading required package: tibble
## Warning: package 'tibble' was built under R version 3.6.3
## Loading required package: bitops
## Rattle: A free graphical interface for data science with R.
## Version 5.4.0 Copyright (c) 2006-2020 Togaware Pty Ltd.
## Type 'rattle()' to shake, rattle, and roll your data.
## 
## Attaching package: 'rattle'
## The following object is masked _by_ '.GlobalEnv':
## 
##     wine
fancyRpartPlot(fit)

#Model Evaluation

pred <- predict(fit, test, type="class")
news_tbl <- table(pred, test$k)
Accuracy <- (sum(diag(news_tbl))/sum(news_tbl)*100)
news_tbl
##     
## pred  1  2  3
##    1 14  1  0
##    2  2  9  0
##    3  0  2  9
Accuracy
## [1] 86.48649

Part 4 KNN For Breast cancer dataset

wbcd <- read.csv("wisc_bc_data.csv", stringsAsFactors = FALSE)
str(wbcd)
## 'data.frame':    569 obs. of  32 variables:
##  $ id               : int  87139402 8910251 905520 868871 9012568 906539 925291 87880 862989 89827 ...
##  $ diagnosis        : chr  "B" "B" "B" "B" ...
##  $ radius_mean      : num  12.3 10.6 11 11.3 15.2 ...
##  $ texture_mean     : num  12.4 18.9 16.8 13.4 13.2 ...
##  $ perimeter_mean   : num  78.8 69.3 70.9 73 97.7 ...
##  $ area_mean        : num  464 346 373 385 712 ...
##  $ smoothness_mean  : num  0.1028 0.0969 0.1077 0.1164 0.0796 ...
##  $ compactness_mean : num  0.0698 0.1147 0.078 0.1136 0.0693 ...
##  $ concavity_mean   : num  0.0399 0.0639 0.0305 0.0464 0.0339 ...
##  $ points_mean      : num  0.037 0.0264 0.0248 0.048 0.0266 ...
##  $ symmetry_mean    : num  0.196 0.192 0.171 0.177 0.172 ...
##  $ dimension_mean   : num  0.0595 0.0649 0.0634 0.0607 0.0554 ...
##  $ radius_se        : num  0.236 0.451 0.197 0.338 0.178 ...
##  $ texture_se       : num  0.666 1.197 1.387 1.343 0.412 ...
##  $ perimeter_se     : num  1.67 3.43 1.34 1.85 1.34 ...
##  $ area_se          : num  17.4 27.1 13.5 26.3 17.7 ...
##  $ smoothness_se    : num  0.00805 0.00747 0.00516 0.01127 0.00501 ...
##  $ compactness_se   : num  0.0118 0.03581 0.00936 0.03498 0.01485 ...
##  $ concavity_se     : num  0.0168 0.0335 0.0106 0.0219 0.0155 ...
##  $ points_se        : num  0.01241 0.01365 0.00748 0.01965 0.00915 ...
##  $ symmetry_se      : num  0.0192 0.035 0.0172 0.0158 0.0165 ...
##  $ dimension_se     : num  0.00225 0.00332 0.0022 0.00344 0.00177 ...
##  $ radius_worst     : num  13.5 11.9 12.4 11.9 16.2 ...
##  $ texture_worst    : num  15.6 22.9 26.4 15.8 15.7 ...
##  $ perimeter_worst  : num  87 78.3 79.9 76.5 104.5 ...
##  $ area_worst       : num  549 425 471 434 819 ...
##  $ smoothness_worst : num  0.139 0.121 0.137 0.137 0.113 ...
##  $ compactness_worst: num  0.127 0.252 0.148 0.182 0.174 ...
##  $ concavity_worst  : num  0.1242 0.1916 0.1067 0.0867 0.1362 ...
##  $ points_worst     : num  0.0939 0.0793 0.0743 0.0861 0.0818 ...
##  $ symmetry_worst   : num  0.283 0.294 0.3 0.21 0.249 ...
##  $ dimension_worst  : num  0.0677 0.0759 0.0788 0.0678 0.0677 ...
summary(wbcd)
##        id             diagnosis          radius_mean      texture_mean  
##  Min.   :     8670   Length:569         Min.   : 6.981   Min.   : 9.71  
##  1st Qu.:   869218   Class :character   1st Qu.:11.700   1st Qu.:16.17  
##  Median :   906024   Mode  :character   Median :13.370   Median :18.84  
##  Mean   : 30371831                      Mean   :14.127   Mean   :19.29  
##  3rd Qu.:  8813129                      3rd Qu.:15.780   3rd Qu.:21.80  
##  Max.   :911320502                      Max.   :28.110   Max.   :39.28  
##  perimeter_mean     area_mean      smoothness_mean   compactness_mean 
##  Min.   : 43.79   Min.   : 143.5   Min.   :0.05263   Min.   :0.01938  
##  1st Qu.: 75.17   1st Qu.: 420.3   1st Qu.:0.08637   1st Qu.:0.06492  
##  Median : 86.24   Median : 551.1   Median :0.09587   Median :0.09263  
##  Mean   : 91.97   Mean   : 654.9   Mean   :0.09636   Mean   :0.10434  
##  3rd Qu.:104.10   3rd Qu.: 782.7   3rd Qu.:0.10530   3rd Qu.:0.13040  
##  Max.   :188.50   Max.   :2501.0   Max.   :0.16340   Max.   :0.34540  
##  concavity_mean     points_mean      symmetry_mean    dimension_mean   
##  Min.   :0.00000   Min.   :0.00000   Min.   :0.1060   Min.   :0.04996  
##  1st Qu.:0.02956   1st Qu.:0.02031   1st Qu.:0.1619   1st Qu.:0.05770  
##  Median :0.06154   Median :0.03350   Median :0.1792   Median :0.06154  
##  Mean   :0.08880   Mean   :0.04892   Mean   :0.1812   Mean   :0.06280  
##  3rd Qu.:0.13070   3rd Qu.:0.07400   3rd Qu.:0.1957   3rd Qu.:0.06612  
##  Max.   :0.42680   Max.   :0.20120   Max.   :0.3040   Max.   :0.09744  
##    radius_se        texture_se      perimeter_se       area_se       
##  Min.   :0.1115   Min.   :0.3602   Min.   : 0.757   Min.   :  6.802  
##  1st Qu.:0.2324   1st Qu.:0.8339   1st Qu.: 1.606   1st Qu.: 17.850  
##  Median :0.3242   Median :1.1080   Median : 2.287   Median : 24.530  
##  Mean   :0.4052   Mean   :1.2169   Mean   : 2.866   Mean   : 40.337  
##  3rd Qu.:0.4789   3rd Qu.:1.4740   3rd Qu.: 3.357   3rd Qu.: 45.190  
##  Max.   :2.8730   Max.   :4.8850   Max.   :21.980   Max.   :542.200  
##  smoothness_se      compactness_se      concavity_se       points_se       
##  Min.   :0.001713   Min.   :0.002252   Min.   :0.00000   Min.   :0.000000  
##  1st Qu.:0.005169   1st Qu.:0.013080   1st Qu.:0.01509   1st Qu.:0.007638  
##  Median :0.006380   Median :0.020450   Median :0.02589   Median :0.010930  
##  Mean   :0.007041   Mean   :0.025478   Mean   :0.03189   Mean   :0.011796  
##  3rd Qu.:0.008146   3rd Qu.:0.032450   3rd Qu.:0.04205   3rd Qu.:0.014710  
##  Max.   :0.031130   Max.   :0.135400   Max.   :0.39600   Max.   :0.052790  
##   symmetry_se        dimension_se        radius_worst   texture_worst  
##  Min.   :0.007882   Min.   :0.0008948   Min.   : 7.93   Min.   :12.02  
##  1st Qu.:0.015160   1st Qu.:0.0022480   1st Qu.:13.01   1st Qu.:21.08  
##  Median :0.018730   Median :0.0031870   Median :14.97   Median :25.41  
##  Mean   :0.020542   Mean   :0.0037949   Mean   :16.27   Mean   :25.68  
##  3rd Qu.:0.023480   3rd Qu.:0.0045580   3rd Qu.:18.79   3rd Qu.:29.72  
##  Max.   :0.078950   Max.   :0.0298400   Max.   :36.04   Max.   :49.54  
##  perimeter_worst    area_worst     smoothness_worst  compactness_worst
##  Min.   : 50.41   Min.   : 185.2   Min.   :0.07117   Min.   :0.02729  
##  1st Qu.: 84.11   1st Qu.: 515.3   1st Qu.:0.11660   1st Qu.:0.14720  
##  Median : 97.66   Median : 686.5   Median :0.13130   Median :0.21190  
##  Mean   :107.26   Mean   : 880.6   Mean   :0.13237   Mean   :0.25427  
##  3rd Qu.:125.40   3rd Qu.:1084.0   3rd Qu.:0.14600   3rd Qu.:0.33910  
##  Max.   :251.20   Max.   :4254.0   Max.   :0.22260   Max.   :1.05800  
##  concavity_worst   points_worst     symmetry_worst   dimension_worst  
##  Min.   :0.0000   Min.   :0.00000   Min.   :0.1565   Min.   :0.05504  
##  1st Qu.:0.1145   1st Qu.:0.06493   1st Qu.:0.2504   1st Qu.:0.07146  
##  Median :0.2267   Median :0.09993   Median :0.2822   Median :0.08004  
##  Mean   :0.2722   Mean   :0.11461   Mean   :0.2901   Mean   :0.08395  
##  3rd Qu.:0.3829   3rd Qu.:0.16140   3rd Qu.:0.3179   3rd Qu.:0.09208  
##  Max.   :1.2520   Max.   :0.29100   Max.   :0.6638   Max.   :0.20750
#Pre-processing
wbcd <- wbcd[-1]
table(wbcd$diagnosis)
## 
##   B   M 
## 357 212
wbcd$diagnosis <- factor(wbcd$diagnosis, levels = c("B", "M"), labels = c("Benign", "Malignant"))
round(prop.table(table(wbcd$diagnosis)) * 100, digits = 1)
## 
##    Benign Malignant 
##      62.7      37.3
#Normalising the data

##We can also use scale
##Here we are using the function below and scaled data between 0 and 1
normalize <- function(x) {
    return ((x - min(x)) / (max(x) - min(x)))
}
wbcd_n <- as.data.frame(lapply(wbcd[2:31], normalize))
summary(wbcd_n)
##   radius_mean      texture_mean    perimeter_mean     area_mean     
##  Min.   :0.0000   Min.   :0.0000   Min.   :0.0000   Min.   :0.0000  
##  1st Qu.:0.2233   1st Qu.:0.2185   1st Qu.:0.2168   1st Qu.:0.1174  
##  Median :0.3024   Median :0.3088   Median :0.2933   Median :0.1729  
##  Mean   :0.3382   Mean   :0.3240   Mean   :0.3329   Mean   :0.2169  
##  3rd Qu.:0.4164   3rd Qu.:0.4089   3rd Qu.:0.4168   3rd Qu.:0.2711  
##  Max.   :1.0000   Max.   :1.0000   Max.   :1.0000   Max.   :1.0000  
##  smoothness_mean  compactness_mean concavity_mean     points_mean    
##  Min.   :0.0000   Min.   :0.0000   Min.   :0.00000   Min.   :0.0000  
##  1st Qu.:0.3046   1st Qu.:0.1397   1st Qu.:0.06926   1st Qu.:0.1009  
##  Median :0.3904   Median :0.2247   Median :0.14419   Median :0.1665  
##  Mean   :0.3948   Mean   :0.2606   Mean   :0.20806   Mean   :0.2431  
##  3rd Qu.:0.4755   3rd Qu.:0.3405   3rd Qu.:0.30623   3rd Qu.:0.3678  
##  Max.   :1.0000   Max.   :1.0000   Max.   :1.00000   Max.   :1.0000  
##  symmetry_mean    dimension_mean     radius_se         texture_se    
##  Min.   :0.0000   Min.   :0.0000   Min.   :0.00000   Min.   :0.0000  
##  1st Qu.:0.2823   1st Qu.:0.1630   1st Qu.:0.04378   1st Qu.:0.1047  
##  Median :0.3697   Median :0.2439   Median :0.07702   Median :0.1653  
##  Mean   :0.3796   Mean   :0.2704   Mean   :0.10635   Mean   :0.1893  
##  3rd Qu.:0.4530   3rd Qu.:0.3404   3rd Qu.:0.13304   3rd Qu.:0.2462  
##  Max.   :1.0000   Max.   :1.0000   Max.   :1.00000   Max.   :1.0000  
##   perimeter_se        area_se        smoothness_se    compactness_se   
##  Min.   :0.00000   Min.   :0.00000   Min.   :0.0000   Min.   :0.00000  
##  1st Qu.:0.04000   1st Qu.:0.02064   1st Qu.:0.1175   1st Qu.:0.08132  
##  Median :0.07209   Median :0.03311   Median :0.1586   Median :0.13667  
##  Mean   :0.09938   Mean   :0.06264   Mean   :0.1811   Mean   :0.17444  
##  3rd Qu.:0.12251   3rd Qu.:0.07170   3rd Qu.:0.2187   3rd Qu.:0.22680  
##  Max.   :1.00000   Max.   :1.00000   Max.   :1.0000   Max.   :1.00000  
##   concavity_se       points_se       symmetry_se      dimension_se    
##  Min.   :0.00000   Min.   :0.0000   Min.   :0.0000   Min.   :0.00000  
##  1st Qu.:0.03811   1st Qu.:0.1447   1st Qu.:0.1024   1st Qu.:0.04675  
##  Median :0.06538   Median :0.2070   Median :0.1526   Median :0.07919  
##  Mean   :0.08054   Mean   :0.2235   Mean   :0.1781   Mean   :0.10019  
##  3rd Qu.:0.10619   3rd Qu.:0.2787   3rd Qu.:0.2195   3rd Qu.:0.12656  
##  Max.   :1.00000   Max.   :1.0000   Max.   :1.0000   Max.   :1.00000  
##   radius_worst    texture_worst    perimeter_worst    area_worst     
##  Min.   :0.0000   Min.   :0.0000   Min.   :0.0000   Min.   :0.00000  
##  1st Qu.:0.1807   1st Qu.:0.2415   1st Qu.:0.1678   1st Qu.:0.08113  
##  Median :0.2504   Median :0.3569   Median :0.2353   Median :0.12321  
##  Mean   :0.2967   Mean   :0.3640   Mean   :0.2831   Mean   :0.17091  
##  3rd Qu.:0.3863   3rd Qu.:0.4717   3rd Qu.:0.3735   3rd Qu.:0.22090  
##  Max.   :1.0000   Max.   :1.0000   Max.   :1.0000   Max.   :1.00000  
##  smoothness_worst compactness_worst concavity_worst    points_worst   
##  Min.   :0.0000   Min.   :0.0000    Min.   :0.00000   Min.   :0.0000  
##  1st Qu.:0.3000   1st Qu.:0.1163    1st Qu.:0.09145   1st Qu.:0.2231  
##  Median :0.3971   Median :0.1791    Median :0.18107   Median :0.3434  
##  Mean   :0.4041   Mean   :0.2202    Mean   :0.21740   Mean   :0.3938  
##  3rd Qu.:0.4942   3rd Qu.:0.3025    3rd Qu.:0.30583   3rd Qu.:0.5546  
##  Max.   :1.0000   Max.   :1.0000    Max.   :1.00000   Max.   :1.0000  
##  symmetry_worst   dimension_worst 
##  Min.   :0.0000   Min.   :0.0000  
##  1st Qu.:0.1851   1st Qu.:0.1077  
##  Median :0.2478   Median :0.1640  
##  Mean   :0.2633   Mean   :0.1896  
##  3rd Qu.:0.3182   3rd Qu.:0.2429  
##  Max.   :1.0000   Max.   :1.0000
#Assign 82% of data for training and the rest for testing Define target variables separately

wbcd_train <- wbcd_n[1:469, ]
wbcd_test <- wbcd_n[470:569, ]
wbcd_train_labels <- wbcd[1:469, 1]
wbcd_test_labels <- wbcd[470:569, 1]


library(class)
## Warning: package 'class' was built under R version 3.6.3
wbcd_test_pred <- knn(train = wbcd_train, test = wbcd_test, cl=wbcd_train_labels, k = 21)
wbcd_test_pred
##   [1] Benign    Benign    Benign    Benign    Malignant Benign    Malignant
##   [8] Benign    Malignant Benign    Malignant Benign    Malignant Malignant
##  [15] Benign    Benign    Malignant Benign    Malignant Benign    Malignant
##  [22] Malignant Malignant Malignant Benign    Benign    Benign    Benign   
##  [29] Malignant Malignant Malignant Benign    Malignant Malignant Benign   
##  [36] Benign    Benign    Benign    Benign    Malignant Malignant Benign   
##  [43] Malignant Malignant Benign    Malignant Malignant Malignant Malignant
##  [50] Malignant Benign    Benign    Benign    Benign    Benign    Benign   
##  [57] Benign    Benign    Malignant Benign    Benign    Benign    Benign   
##  [64] Benign    Malignant Malignant Benign    Benign    Benign    Benign   
##  [71] Benign    Malignant Benign    Benign    Malignant Malignant Benign   
##  [78] Benign    Benign    Benign    Benign    Benign    Benign    Malignant
##  [85] Benign    Benign    Malignant Benign    Benign    Benign    Benign   
##  [92] Malignant Benign    Benign    Benign    Benign    Benign    Malignant
##  [99] Benign    Malignant
## Levels: Benign Malignant
##Evaluation

#Evaluation
(wbcd_tbl <- table(wbcd_test_pred, wbcd_test_labels))
##               wbcd_test_labels
## wbcd_test_pred Benign Malignant
##      Benign        61         2
##      Malignant      0        37
(Accuracy <- (wbcd_tbl[1]+wbcd_tbl[4])/sum(wbcd_tbl)*100)
## [1] 98

Sensitivity and Specificity Calculation of Sensitivity and Specificity

library(caret)
## Warning: package 'caret' was built under R version 3.6.3
## Loading required package: lattice
## Warning: package 'lattice' was built under R version 3.6.3
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 3.6.3
cat("The Sensitivity and Specificity values are", "\n")
## The Sensitivity and Specificity values are
sensitivity(wbcd_tbl)
## [1] 1
specificity(wbcd_tbl)
## [1] 0.9487179
##Sensitivity = TP / (TP+FN) Specificity = TN / (FP+TN)

##Sensitivity is simply a reflection of how many patients with the diagnosed disease test positive. Specificity is a measure of your false positive rate.

##By looking at the sensitivity rate to be 100%, it means that 100% of the diseased patients will have positive results when tested. And by looking at the specificity of 94.8%, it means that about 5.2% of healthy patients will have positive results when tested.

##We can also calculate likelihood ratios for more accuracy statistics of a test result. The positive predictive value is defined as the percent of predicted positives that are actually positive while the negative predictive value is defined as the percent of negative positives that are actually negative.

Part 5 KNN technique on News Popularity problem

news <- read.csv("OnlineNewsPopularity_for_R.csv")

newsShort <- data.frame(news$n_tokens_title, news$n_tokens_content, news$n_unique_tokens, news$n_non_stop_words, news$num_hrefs, news$num_imgs, news$num_videos, news$average_token_length, news$num_keywords, news$kw_max_max, news$global_sentiment_polarity, news$avg_positive_polarity, news$title_subjectivity, news$title_sentiment_polarity, news$abs_title_subjectivity, news$abs_title_sentiment_polarity, news$shares)

colnames(newsShort) <- c("n_tokens_title", "n_tokens_content", "n_unique_tokens", "n_non_stop_words", "num_hrefs", "num_imgs", "num_videos", "average_token_length", "num_keywords", "kw_max_max", "global_sentiment_polarity", "avg_positive_polarity", "title_subjectivity", "title_sentiment_polarity", "abs_title_subjectivity", "abs_title_sentiment_polarity", "shares")

newsShort$popular = rep('na', nrow(newsShort))
for(i in 1:39644) {
     if(newsShort$shares[i] >= 1400) {
         newsShort$popular[i] = "yes"} 
     else {newsShort$popular[i] = "no"}
}
newsShort$shares = newsShort$popular
#for(i in 1:39644) {
#  if(newsShort$shares[i] >= 1400) {
#    newsShort$shares[i] = "yes"} 
 # else {newsShort$shares[i] = "no"}
  #cat("i=,",i," shares=",newsShort$shares[i],"\n")
#}

newsShort$shares <- as.factor(newsShort$shares)
newsShort <- newsShort[-18]
news_n <- as.data.frame(lapply(newsShort[1:16], normalize))


#news_rand <- news_n[order(runif(10000)), ]
#set.seed(12345)
#Split the data into training and test datasets
news_train <- news_n[1:9000, ]
news_test <- news_n[9001:10000, ]
news_train_labels <- newsShort[1:9000, 17]
news_test_labels <- newsShort[9001:10000, 17]
set.seed(76964057)
#Try K from 2 to 20
rng<-2:20
#Number of times to run the K Means algorithm
tries <-100
#Set up an empty vector to hold all of points
avg.totw.ss <-integer(length(rng))
avg.totb.ss <- integer(length(rng))
avg.tot.ss <- integer(length(rng))
# For each value of the range variable
for(v in rng){
#Set up an empty vectors to hold the tries

  v.totw.ss <-integer(tries)
b.totb.ss <- integer(tries)
tot.ss <- integer(tries)
#Run kmeans
for(i in 1:tries){
k.temp <-kmeans(news_train,centers=v)
#Store the total withinss
v.totw.ss[i] <-k.temp$tot.withinss
#Store the betweenss
b.totb.ss[i] <- k.temp$betweenss
#Store the total sum of squares
tot.ss[i] <- k.temp$totss
}
#Average the withinss and betweenss
avg.totw.ss[v-1] <-mean(v.totw.ss)
avg.totb.ss[v-1] <-mean(b.totb.ss)
avg.tot.ss[v-1] <- mean(tot.ss)
}
## Warning: did not converge in 10 iterations

## Warning: did not converge in 10 iterations

## Warning: did not converge in 10 iterations

## Warning: did not converge in 10 iterations

## Warning: did not converge in 10 iterations
plot(rng,avg.totw.ss,type="b", main="Total Within SS by Various K",
ylab="Average Total Within Sum of Squares",
xlab="Value of K")

plot(rng,avg.totb.ss,type="b", main="Total between SS by Various K",

ylab="Average Total Between Sum of Squares",
xlab="Value of K")

#Plot the ratio of betweenss/total ss and withinss / total ss for evaluation
plot(rng,avg.totb.ss/avg.tot.ss,type="b", main="Ratio of between ss / the total ss by Various K",
ylab="Ratio Between SS / Total SS",
xlab="Value of K")
abline(h=0.85, col="red")

plot(rng,avg.totw.ss/avg.tot.ss,type="b", main="Ratio of within ss / the total ss by Various K",
ylab="Ratio Between SS / Total SS",
xlab="Value of K")
abline(h=0.15, col="red")

Taking k-value as 5 we get the following prediction and accuracy rate

news_test_pred <- knn(train = news_train, test = news_test, cl=news_train_labels, k = 5)

(news_tbl <- table(news_test_pred, news_test_labels))
##               news_test_labels
## news_test_pred  no yes
##            no  225 173
##            yes 322 280
(Accuracy <- (news_tbl[1]+news_tbl[4])/sum(news_tbl)*100)
## [1] 50.5
##Taking k-value as 7 we get the following prediction and accuracy rate

news_test_pred <- knn(train = news_train, test = news_test, cl=news_train_labels, k = 7)

(news_tbl <- table(news_test_pred, news_test_labels))
##               news_test_labels
## news_test_pred  no yes
##            no  227 154
##            yes 320 299
(Accuracy <- (news_tbl[1]+news_tbl[4])/sum(news_tbl)*100)
## [1] 52.6
#Hence when k=7 it gives better performance.