Part 1 K Means Clustering

Step 1: Loading data into r

library(readr)
Wholesale_customers_data <- read.csv("C:/Users/gmutya048/Downloads/Wholesale_customers_data.csv")
str(Wholesale_customers_data)
## 'data.frame':    440 obs. of  8 variables:
##  $ Channel         : int  2 2 2 1 2 2 2 2 1 2 ...
##  $ Region          : int  3 3 3 3 3 3 3 3 3 3 ...
##  $ Fresh           : int  12669 7057 6353 13265 22615 9413 12126 7579 5963 6006 ...
##  $ Milk            : int  9656 9810 8808 1196 5410 8259 3199 4956 3648 11093 ...
##  $ Grocery         : int  7561 9568 7684 4221 7198 5126 6975 9426 6192 18881 ...
##  $ Frozen          : int  214 1762 2405 6404 3915 666 480 1669 425 1159 ...
##  $ Detergents_Paper: int  2674 3293 3516 507 1777 1795 3140 3321 1716 7425 ...
##  $ Delicassen      : int  1338 1776 7844 1788 5185 1451 545 2566 750 2098 ...
summary(Wholesale_customers_data)
##     Channel          Region          Fresh             Milk      
##  Min.   :1.000   Min.   :1.000   Min.   :     3   Min.   :   55  
##  1st Qu.:1.000   1st Qu.:2.000   1st Qu.:  3128   1st Qu.: 1533  
##  Median :1.000   Median :3.000   Median :  8504   Median : 3627  
##  Mean   :1.323   Mean   :2.543   Mean   : 12000   Mean   : 5796  
##  3rd Qu.:2.000   3rd Qu.:3.000   3rd Qu.: 16934   3rd Qu.: 7190  
##  Max.   :2.000   Max.   :3.000   Max.   :112151   Max.   :73498  
##     Grocery          Frozen        Detergents_Paper    Delicassen     
##  Min.   :    3   Min.   :   25.0   Min.   :    3.0   Min.   :    3.0  
##  1st Qu.: 2153   1st Qu.:  742.2   1st Qu.:  256.8   1st Qu.:  408.2  
##  Median : 4756   Median : 1526.0   Median :  816.5   Median :  965.5  
##  Mean   : 7951   Mean   : 3071.9   Mean   : 2881.5   Mean   : 1524.9  
##  3rd Qu.:10656   3rd Qu.: 3554.2   3rd Qu.: 3922.0   3rd Qu.: 1820.2  
##  Max.   :92780   Max.   :60869.0   Max.   :40827.0   Max.   :47943.0
data<-data.frame(Wholesale_customers_data)

Step 2: Pre-Processing

top.n.custs <- function (data,cols,n=5) { #Requires some data frame and the top N to remove
  idx.to.remove <-integer(0) #Initialize a vector to hold customers being removed
  for (c in cols){ # For every column in the data we passed to this function
    col.order <-order(data[,c],decreasing=T) #Sort column "c" in descending order (bigger on top)
    #Order returns the sorted index (e.g. row 15, 3, 7, 1, ...) rather than the actual values sorted.
    idx <-head(col.order, n) #Take the first n of the sorted column C to
    idx.to.remove <-union(idx.to.remove,idx) #Combine and de-duplicate the row ids that need to be   removed
  }
  return(idx.to.remove) #Return the indexes of customers to be removed
}

How many customers to be remove?

top.custs<-top.n.custs(data,cols=3:8,n=5)
length(top.custs)
## [1] 19

Examine the customers

data[top.custs,]
##     Channel Region  Fresh  Milk Grocery Frozen Detergents_Paper Delicassen
## 182       1      3 112151 29627   18148  16745             4948       8550
## 126       1      3  76237  3473    7102  16538              778        918
## 285       1      3  68951  4411   12609   8692              751       2406
## 40        1      3  56159   555     902  10002              212       2916
## 259       1      1  56083  4563    2124   6422              730       3321
## 87        2      3  22925 73498   32114    987            20070        903
## 48        2      3  44466 54259   55571   7782            24171       6465
## 86        2      3  16117 46197   92780   1026            40827       2944
## 184       1      3  36847 43950   20170  36534              239      47943
## 62        2      3  35942 38369   59598   3254            26701       2017
## 334       2      2   8565  4980   67298    131            38102       1215
## 66        2      3     85 20959   45828     36            24231       1423
## 326       1      2  32717 16784   13626  60869             1272       5609
## 94        1      3  11314  3090    2062  35009               71       2698
## 197       1      1  30624  7209    4897  18711              763       2876
## 104       1      3  56082  3504    8906  18028             1480       2498
## 24        2      3  26373 36423   22019   5154             4337      16523
## 72        1      3  18291  1266   21042   5373             4173      14472
## 88        1      3  43265  5025    8117   6312             1579      14351

Step 3 : Model Design

Remove the customers and examine the remaining data

data.rm.top<-data[-c(top.custs),] 
summary(data.rm.top)
##     Channel          Region          Fresh            Milk      
##  Min.   :1.000   Min.   :1.000   Min.   :    3   Min.   :   55  
##  1st Qu.:1.000   1st Qu.:2.000   1st Qu.: 3067   1st Qu.: 1492  
##  Median :1.000   Median :3.000   Median : 8040   Median : 3587  
##  Mean   :1.321   Mean   :2.537   Mean   :10753   Mean   : 5112  
##  3rd Qu.:2.000   3rd Qu.:3.000   3rd Qu.:15615   3rd Qu.: 7027  
##  Max.   :2.000   Max.   :3.000   Max.   :53205   Max.   :29892  
##     Grocery          Frozen      Detergents_Paper   Delicassen  
##  Min.   :    3   Min.   :   25   Min.   :    3    Min.   :   3  
##  1st Qu.: 2146   1st Qu.:  688   1st Qu.:  256    1st Qu.: 396  
##  Median : 4602   Median : 1457   Median :  811    Median : 898  
##  Mean   : 7135   Mean   : 2599   Mean   : 2547    Mean   :1261  
##  3rd Qu.: 9965   3rd Qu.: 3242   3rd Qu.: 3843    3rd Qu.:1697  
##  Max.   :39694   Max.   :17866   Max.   :19410    Max.   :7844

Set the seed for reproducibility

set.seed(76964057)

Create 5 clusters, Remove columns 1 and 2

k <-kmeans(data.rm.top[,-c(1,2)], centers=5)

Display cluster centers

k$centers
##       Fresh      Milk   Grocery   Frozen Detergents_Paper Delicassen
## 1  5830.214 15295.048 23449.167 1936.452       10361.6429   1912.738
## 2 18649.606  3335.586  4497.848 3301.747        1046.5859   1450.566
## 3  5845.392  2337.319  2878.205 2766.596         660.2952    858.994
## 4  4238.892  7725.289 11011.747 1336.566        4733.3614   1400.530
## 5 35922.387  4851.806  5862.581 3730.677        1004.6129   1552.161

Give a count of data points in each cluster

table(k$cluster)
## 
##   1   2   3   4   5 
##  42  99 166  83  31

set K from 2 to 20 and run the Kmeans Algorithm 100 times

rng<-2:20
tries<-100

Set up an empty vector to hold all of points

avg.totw.ss <-integer(length(rng)) 
avg.totb.ss <- integer(length(rng))
avg.tot.ss <- integer(length(rng))

# For each value of the range variable 
for(v in rng){
#Set up an empty vectors to hold the tries 
  v.totw.ss <-integer(tries)
b.totb.ss <- integer(tries)
tot.ss <- integer(tries)
#Run kmeans 
for(i in 1:tries){
k.temp <-kmeans(data.rm.top,centers=v)
#Store the total withinss 
v.totw.ss[i] <-k.temp$tot.withinss
#Store the betweenss
b.totb.ss[i] <- k.temp$betweenss
#Store the total sum of squares
tot.ss[i] <- k.temp$totss 
}
#Average the withinss and betweenss 
avg.totw.ss[v-1] <-mean(v.totw.ss) 
avg.totb.ss[v-1] <-mean(b.totb.ss) 
avg.tot.ss[v-1] <- mean(tot.ss)
}
## Warning: did not converge in 10 iterations

For each value of the range variable

plot(rng,avg.totw.ss,type="b", main="Total Within SS by Various K",
 ylab="Average Total Within Sum of Squares",
 xlab="Value of K")

plot(rng,avg.totb.ss,type="b", main="Total between SS by Various K",
ylab="Average Total Between Sum of Squares", xlab="Value of K")

plot(rng,avg.totb.ss/avg.tot.ss,type="b", main="Ratio of between ss / the total ss by Various K",
ylab="Ratio Between SS / Total SS",
xlab="Value of K") 
abline(h=0.85, col="blue")

plot(rng,avg.totw.ss/avg.tot.ss,type="b", main="Ratio of within ss / the total ss by Various K", ylab="Ratio Between SS / Total SS",
xlab="Value of K")
abline(h=0.15, col="blue")

k <-kmeans(data.rm.top[,-c(1,2)], centers=2)
k$withinss
## [1] 39113465976 14572965733
table(k$cluster)
## 
##   1   2 
## 300 121
print(k$centers)
##      Fresh     Milk  Grocery   Frozen Detergents_Paper Delicassen
## 1  5577.38 5668.003 8001.727 2244.843         3121.060   1169.410
## 2 23584.49 3734.000 4984.554 3475.967         1124.826   1488.198

Cluster Plot

library(cluster)
clusplot(data.rm.top, k$cluster, main='2D representation of the Cluster solution',
color=TRUE, shade=TRUE, labels=2, lines=0)

Question 1: Given this is an imperfect real-world, you need to determine what you believe is the best value for “k” and write-up this portion of your lab report.

Answer 1: As per the elbow method (plot 1 "Avg total with sum of squares by number of clusters(k))the optimal K value was found to be 3.This method is used to find the percentage of variance explained by k. we choose the optimal k value at the point where the change rate starts to show a drop. We can also use other methods like Bayesian inference criterion or sillhouette method.

When we consider K=5 Cluster 4 seems to be those customers who heavily consume milk and grocery, followed by Cluster 1. Cluster 3 seems to be those customers who heavily consume fresh, followed by Cluster 2. Cluster 5 seems to represent the low (or regular) consumption customers.

Question 2: How many points do you see in each cluster?

Answer 2: In the above Cluster Plot we can see 304 points in one cluster and 117 in the other.

If we consider overall data we can see 83 points in fresh 108 points in Milk 43 points in Grocery 43 points in Detergents_Paper 145 points in Delicassen

Part 2 Cluster Analysis

Step 1: Loading data into r

library(readr)
wine <- read.csv("C:/Users/gmutya048/Downloads/wine.csv")
str(wine)
## 'data.frame':    178 obs. of  14 variables:
##  $ Wine                : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ Alcohol             : num  14.2 13.2 13.2 14.4 13.2 ...
##  $ Malic.acid          : num  1.71 1.78 2.36 1.95 2.59 1.76 1.87 2.15 1.64 1.35 ...
##  $ Ash                 : num  2.43 2.14 2.67 2.5 2.87 2.45 2.45 2.61 2.17 2.27 ...
##  $ Acl                 : num  15.6 11.2 18.6 16.8 21 15.2 14.6 17.6 14 16 ...
##  $ Mg                  : int  127 100 101 113 118 112 96 121 97 98 ...
##  $ Phenols             : num  2.8 2.65 2.8 3.85 2.8 3.27 2.5 2.6 2.8 2.98 ...
##  $ Flavanoids          : num  3.06 2.76 3.24 3.49 2.69 3.39 2.52 2.51 2.98 3.15 ...
##  $ Nonflavanoid.phenols: num  0.28 0.26 0.3 0.24 0.39 0.34 0.3 0.31 0.29 0.22 ...
##  $ Proanth             : num  2.29 1.28 2.81 2.18 1.82 1.97 1.98 1.25 1.98 1.85 ...
##  $ Color.int           : num  5.64 4.38 5.68 7.8 4.32 6.75 5.25 5.05 5.2 7.22 ...
##  $ Hue                 : num  1.04 1.05 1.03 0.86 1.04 1.05 1.02 1.06 1.08 1.01 ...
##  $ OD                  : num  3.92 3.4 3.17 3.45 2.93 2.85 3.58 3.58 2.85 3.55 ...
##  $ Proline             : int  1065 1050 1185 1480 735 1450 1290 1295 1045 1045 ...
wssplot <- function(data, nc=15, seed=1234){
  wss <- (nrow(data)-1)*sum(apply(data,2,var))
  for (i in 2:nc){
    set.seed(seed)
    wss[i] <- sum(kmeans(data, centers=i)$withinss)}
  plot(1:nc, wss, type="b", xlab="Number of Clusters",
       ylab="Within groups sum of squares")}

Scale The data

df <- scale(wine[-1])

Optimum K

wssplot(df)

K-means analysis

library(NbClust)
set.seed(1234)
nc <- NbClust(df, min.nc=2, max.nc = 15, method = "kmeans")

## *** : The Hubert index is a graphical method of determining the number of clusters.
##                 In the plot of Hubert index, we seek a significant knee that corresponds to a 
##                 significant increase of the value of the measure i.e the significant peak in Hubert
##                 index second differences plot. 
## 

## *** : The D index is a graphical method of determining the number of clusters. 
##                 In the plot of D index, we seek a significant knee (the significant peak in Dindex
##                 second differences plot) that corresponds to a significant increase of the value of
##                 the measure. 
##  
## ******************************************************************* 
## * Among all indices:                                                
## * 2 proposed 2 as the best number of clusters 
## * 19 proposed 3 as the best number of clusters 
## * 1 proposed 14 as the best number of clusters 
## * 1 proposed 15 as the best number of clusters 
## 
##                    ***** Conclusion *****                            
##  
## * According to the majority rule, the best number of clusters is  3 
##  
##  
## *******************************************************************
barplot(table(nc$Best.n[1,]), xlab = "Number of Clusters", ylab = "Number of Criteria", main = "Number of Clusters Chosen by 26 Criteria")

K-means Train and evaluation

set.seed(1234)
fit.km <- kmeans(df, 3, nstart=25)
table(fit.km$cluster)
## 
##  1  2  3 
## 62 65 51
(df_km <- table(wine$Wine, fit.km$cluster))
##    
##      1  2  3
##   1 59  0  0
##   2  3 65  3
##   3  0  0 48
(Accuracy <- (sum(diag(df_km))/sum(df_km)*100))
## [1] 96.62921

K-means Plot

library(cluster)
clusplot(df, fit.km$cluster, main='2D representation of the Cluster solution',
         color=TRUE, shade=TRUE,
         labels=2, lines=0)

df_rpart <- data.frame(k=fit.km$cluster, df)
rdf <- df_rpart[sample(1:nrow(df_rpart)), ]
train <- rdf[1:(as.integer(.8*nrow(rdf))-1), ]
test <- rdf[(as.integer(.8*nrow(rdf))):nrow(rdf), ]
library(rpart)
fit <- rpart(k ~ ., data=train, method="class")
library(rattle)
## Loading required package: tibble
## Loading required package: bitops
## Rattle: A free graphical interface for data science with R.
## Version 5.4.0 Copyright (c) 2006-2020 Togaware Pty Ltd.
## Type 'rattle()' to shake, rattle, and roll your data.
## 
## Attaching package: 'rattle'
## The following object is masked _by_ '.GlobalEnv':
## 
##     wine
fancyRpartPlot(fit)

Model Evaluation

pred <- predict(fit, test, type="class")
(news_tbl <- table(pred, test$k))
##     
## pred  1  2  3
##    1 14  1  0
##    2  2  9  0
##    3  0  2  9
(Accuracy <- (sum(diag(news_tbl))/sum(news_tbl)*100))
## [1] 86.48649

Part 4: Breast Cancer Wisconsin Diagnostic data set

Q3: Load the data set of breast cancer. Do the preliminary analysis and implement a KNN (Knearest neighbors) model for this data set and don’t forget that whenever it is required you should use: set.seed(12345)

Answer 3: There are 32 variables and 469 observations in the data set.No randomization in sampling. For reproducibility data is divided into test and train data sets in 80:20. Using KNN Model there is a 98% accuracy observed in this model at k=21. K is set to be positive Square root of the number of rows in the training set.

library(readr)
wbcd <- read.csv("C:/Users/gmutya048/Downloads/wisc_bc_data.csv")
str(wbcd)
## 'data.frame':    569 obs. of  32 variables:
##  $ id               : int  87139402 8910251 905520 868871 9012568 906539 925291 87880 862989 89827 ...
##  $ diagnosis        : chr  "B" "B" "B" "B" ...
##  $ radius_mean      : num  12.3 10.6 11 11.3 15.2 ...
##  $ texture_mean     : num  12.4 18.9 16.8 13.4 13.2 ...
##  $ perimeter_mean   : num  78.8 69.3 70.9 73 97.7 ...
##  $ area_mean        : num  464 346 373 385 712 ...
##  $ smoothness_mean  : num  0.1028 0.0969 0.1077 0.1164 0.0796 ...
##  $ compactness_mean : num  0.0698 0.1147 0.078 0.1136 0.0693 ...
##  $ concavity_mean   : num  0.0399 0.0639 0.0305 0.0464 0.0339 ...
##  $ points_mean      : num  0.037 0.0264 0.0248 0.048 0.0266 ...
##  $ symmetry_mean    : num  0.196 0.192 0.171 0.177 0.172 ...
##  $ dimension_mean   : num  0.0595 0.0649 0.0634 0.0607 0.0554 ...
##  $ radius_se        : num  0.236 0.451 0.197 0.338 0.178 ...
##  $ texture_se       : num  0.666 1.197 1.387 1.343 0.412 ...
##  $ perimeter_se     : num  1.67 3.43 1.34 1.85 1.34 ...
##  $ area_se          : num  17.4 27.1 13.5 26.3 17.7 ...
##  $ smoothness_se    : num  0.00805 0.00747 0.00516 0.01127 0.00501 ...
##  $ compactness_se   : num  0.0118 0.03581 0.00936 0.03498 0.01485 ...
##  $ concavity_se     : num  0.0168 0.0335 0.0106 0.0219 0.0155 ...
##  $ points_se        : num  0.01241 0.01365 0.00748 0.01965 0.00915 ...
##  $ symmetry_se      : num  0.0192 0.035 0.0172 0.0158 0.0165 ...
##  $ dimension_se     : num  0.00225 0.00332 0.0022 0.00344 0.00177 ...
##  $ radius_worst     : num  13.5 11.9 12.4 11.9 16.2 ...
##  $ texture_worst    : num  15.6 22.9 26.4 15.8 15.7 ...
##  $ perimeter_worst  : num  87 78.3 79.9 76.5 104.5 ...
##  $ area_worst       : num  549 425 471 434 819 ...
##  $ smoothness_worst : num  0.139 0.121 0.137 0.137 0.113 ...
##  $ compactness_worst: num  0.127 0.252 0.148 0.182 0.174 ...
##  $ concavity_worst  : num  0.1242 0.1916 0.1067 0.0867 0.1362 ...
##  $ points_worst     : num  0.0939 0.0793 0.0743 0.0861 0.0818 ...
##  $ symmetry_worst   : num  0.283 0.294 0.3 0.21 0.249 ...
##  $ dimension_worst  : num  0.0677 0.0759 0.0788 0.0678 0.0677 ...

Eliminating the first attribute

wbcd <- wbcd[-1]

Pre-processing

table(wbcd$diagnosis)
## 
##   B   M 
## 357 212
wbcd$diagnosis <- factor(wbcd$diagnosis, levels = c("B", "M"), labels = c("Benign", "Malignant"))
round(prop.table(table(wbcd$diagnosis)) * 100, digits = 1)
## 
##    Benign Malignant 
##      62.7      37.3
normalize <- function(x) {
    return ((x - min(x)) / (max(x) - min(x)))
}
wbcd_n <- as.data.frame(lapply(wbcd[2:31], normalize))
summary(wbcd_n)
##   radius_mean      texture_mean    perimeter_mean     area_mean     
##  Min.   :0.0000   Min.   :0.0000   Min.   :0.0000   Min.   :0.0000  
##  1st Qu.:0.2233   1st Qu.:0.2185   1st Qu.:0.2168   1st Qu.:0.1174  
##  Median :0.3024   Median :0.3088   Median :0.2933   Median :0.1729  
##  Mean   :0.3382   Mean   :0.3240   Mean   :0.3329   Mean   :0.2169  
##  3rd Qu.:0.4164   3rd Qu.:0.4089   3rd Qu.:0.4168   3rd Qu.:0.2711  
##  Max.   :1.0000   Max.   :1.0000   Max.   :1.0000   Max.   :1.0000  
##  smoothness_mean  compactness_mean concavity_mean     points_mean    
##  Min.   :0.0000   Min.   :0.0000   Min.   :0.00000   Min.   :0.0000  
##  1st Qu.:0.3046   1st Qu.:0.1397   1st Qu.:0.06926   1st Qu.:0.1009  
##  Median :0.3904   Median :0.2247   Median :0.14419   Median :0.1665  
##  Mean   :0.3948   Mean   :0.2606   Mean   :0.20806   Mean   :0.2431  
##  3rd Qu.:0.4755   3rd Qu.:0.3405   3rd Qu.:0.30623   3rd Qu.:0.3678  
##  Max.   :1.0000   Max.   :1.0000   Max.   :1.00000   Max.   :1.0000  
##  symmetry_mean    dimension_mean     radius_se         texture_se    
##  Min.   :0.0000   Min.   :0.0000   Min.   :0.00000   Min.   :0.0000  
##  1st Qu.:0.2823   1st Qu.:0.1630   1st Qu.:0.04378   1st Qu.:0.1047  
##  Median :0.3697   Median :0.2439   Median :0.07702   Median :0.1653  
##  Mean   :0.3796   Mean   :0.2704   Mean   :0.10635   Mean   :0.1893  
##  3rd Qu.:0.4530   3rd Qu.:0.3404   3rd Qu.:0.13304   3rd Qu.:0.2462  
##  Max.   :1.0000   Max.   :1.0000   Max.   :1.00000   Max.   :1.0000  
##   perimeter_se        area_se        smoothness_se    compactness_se   
##  Min.   :0.00000   Min.   :0.00000   Min.   :0.0000   Min.   :0.00000  
##  1st Qu.:0.04000   1st Qu.:0.02064   1st Qu.:0.1175   1st Qu.:0.08132  
##  Median :0.07209   Median :0.03311   Median :0.1586   Median :0.13667  
##  Mean   :0.09938   Mean   :0.06264   Mean   :0.1811   Mean   :0.17444  
##  3rd Qu.:0.12251   3rd Qu.:0.07170   3rd Qu.:0.2187   3rd Qu.:0.22680  
##  Max.   :1.00000   Max.   :1.00000   Max.   :1.0000   Max.   :1.00000  
##   concavity_se       points_se       symmetry_se      dimension_se    
##  Min.   :0.00000   Min.   :0.0000   Min.   :0.0000   Min.   :0.00000  
##  1st Qu.:0.03811   1st Qu.:0.1447   1st Qu.:0.1024   1st Qu.:0.04675  
##  Median :0.06538   Median :0.2070   Median :0.1526   Median :0.07919  
##  Mean   :0.08054   Mean   :0.2235   Mean   :0.1781   Mean   :0.10019  
##  3rd Qu.:0.10619   3rd Qu.:0.2787   3rd Qu.:0.2195   3rd Qu.:0.12656  
##  Max.   :1.00000   Max.   :1.0000   Max.   :1.0000   Max.   :1.00000  
##   radius_worst    texture_worst    perimeter_worst    area_worst     
##  Min.   :0.0000   Min.   :0.0000   Min.   :0.0000   Min.   :0.00000  
##  1st Qu.:0.1807   1st Qu.:0.2415   1st Qu.:0.1678   1st Qu.:0.08113  
##  Median :0.2504   Median :0.3569   Median :0.2353   Median :0.12321  
##  Mean   :0.2967   Mean   :0.3640   Mean   :0.2831   Mean   :0.17091  
##  3rd Qu.:0.3863   3rd Qu.:0.4717   3rd Qu.:0.3735   3rd Qu.:0.22090  
##  Max.   :1.0000   Max.   :1.0000   Max.   :1.0000   Max.   :1.00000  
##  smoothness_worst compactness_worst concavity_worst    points_worst   
##  Min.   :0.0000   Min.   :0.0000    Min.   :0.00000   Min.   :0.0000  
##  1st Qu.:0.3000   1st Qu.:0.1163    1st Qu.:0.09145   1st Qu.:0.2231  
##  Median :0.3971   Median :0.1791    Median :0.18107   Median :0.3434  
##  Mean   :0.4041   Mean   :0.2202    Mean   :0.21740   Mean   :0.3938  
##  3rd Qu.:0.4942   3rd Qu.:0.3025    3rd Qu.:0.30583   3rd Qu.:0.5546  
##  Max.   :1.0000   Max.   :1.0000    Max.   :1.00000   Max.   :1.0000  
##  symmetry_worst   dimension_worst 
##  Min.   :0.0000   Min.   :0.0000  
##  1st Qu.:0.1851   1st Qu.:0.1077  
##  Median :0.2478   Median :0.1640  
##  Mean   :0.2633   Mean   :0.1896  
##  3rd Qu.:0.3182   3rd Qu.:0.2429  
##  Max.   :1.0000   Max.   :1.0000

Assign 82% of data to training. Define the target variables separately

wbcd_train <- wbcd_n[1:469, ]
wbcd_test <- wbcd_n[470:569, ]
wbcd_train_labels <- wbcd[1:469, 1]
wbcd_test_labels <- wbcd[470:569, 1]

Train the Model

library(class)
wbcd_test_pred <- knn(train = wbcd_train, test = wbcd_test, cl=wbcd_train_labels, k = 21)

Evaluation

(wbcd_tbl <- table(wbcd_test_pred, wbcd_test_labels))
##               wbcd_test_labels
## wbcd_test_pred Benign Malignant
##      Benign        61         2
##      Malignant      0        37
(Accuracy <- (wbcd_tbl[1]+wbcd_tbl[4])/sum(wbcd_tbl)*100)
## [1] 98

We can observe 98% of accuracy in Breast cancer data set using KNN model. This implies that the model is performing at an optimum level

News Popularity

Loading Data into R

library(readr)
news<- read.csv("C:/Users/gmutya048/Downloads/OnlineNewsPopularity_for_R.csv")
str(news)
## 'data.frame':    39644 obs. of  61 variables:
##  $ url                          : chr  "http://mashable.com/2013/01/07/amazon-instant-video-browser/" "http://mashable.com/2013/01/07/ap-samsung-sponsored-tweets/" "http://mashable.com/2013/01/07/apple-40-billion-app-downloads/" "http://mashable.com/2013/01/07/astronaut-notre-dame-bcs/" ...
##  $ timedelta                    : num  731 731 731 731 731 731 731 731 731 731 ...
##  $ n_tokens_title               : num  12 9 9 9 13 10 8 12 11 10 ...
##  $ n_tokens_content             : num  219 255 211 531 1072 ...
##  $ n_unique_tokens              : num  0.664 0.605 0.575 0.504 0.416 ...
##  $ n_non_stop_words             : num  1 1 1 1 1 ...
##  $ n_non_stop_unique_tokens     : num  0.815 0.792 0.664 0.666 0.541 ...
##  $ num_hrefs                    : num  4 3 3 9 19 2 21 20 2 4 ...
##  $ num_self_hrefs               : num  2 1 1 0 19 2 20 20 0 1 ...
##  $ num_imgs                     : num  1 1 1 1 20 0 20 20 0 1 ...
##  $ num_videos                   : num  0 0 0 0 0 0 0 0 0 1 ...
##  $ average_token_length         : num  4.68 4.91 4.39 4.4 4.68 ...
##  $ num_keywords                 : num  5 4 6 7 7 9 10 9 7 5 ...
##  $ data_channel_is_lifestyle    : num  0 0 0 0 0 0 1 0 0 0 ...
##  $ data_channel_is_entertainment: num  1 0 0 1 0 0 0 0 0 0 ...
##  $ data_channel_is_bus          : num  0 1 1 0 0 0 0 0 0 0 ...
##  $ data_channel_is_socmed       : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ data_channel_is_tech         : num  0 0 0 0 1 1 0 1 1 0 ...
##  $ data_channel_is_world        : num  0 0 0 0 0 0 0 0 0 1 ...
##  $ kw_min_min                   : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ kw_max_min                   : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ kw_avg_min                   : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ kw_min_max                   : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ kw_max_max                   : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ kw_avg_max                   : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ kw_min_avg                   : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ kw_max_avg                   : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ kw_avg_avg                   : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ self_reference_min_shares    : num  496 0 918 0 545 8500 545 545 0 0 ...
##  $ self_reference_max_shares    : num  496 0 918 0 16000 8500 16000 16000 0 0 ...
##  $ self_reference_avg_sharess   : num  496 0 918 0 3151 ...
##  $ weekday_is_monday            : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ weekday_is_tuesday           : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ weekday_is_wednesday         : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ weekday_is_thursday          : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ weekday_is_friday            : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ weekday_is_saturday          : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ weekday_is_sunday            : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ is_weekend                   : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ LDA_00                       : num  0.5003 0.7998 0.2178 0.0286 0.0286 ...
##  $ LDA_01                       : num  0.3783 0.05 0.0333 0.4193 0.0288 ...
##  $ LDA_02                       : num  0.04 0.0501 0.0334 0.4947 0.0286 ...
##  $ LDA_03                       : num  0.0413 0.0501 0.0333 0.0289 0.0286 ...
##  $ LDA_04                       : num  0.0401 0.05 0.6822 0.0286 0.8854 ...
##  $ global_subjectivity          : num  0.522 0.341 0.702 0.43 0.514 ...
##  $ global_sentiment_polarity    : num  0.0926 0.1489 0.3233 0.1007 0.281 ...
##  $ global_rate_positive_words   : num  0.0457 0.0431 0.0569 0.0414 0.0746 ...
##  $ global_rate_negative_words   : num  0.0137 0.01569 0.00948 0.02072 0.01213 ...
##  $ rate_positive_words          : num  0.769 0.733 0.857 0.667 0.86 ...
##  $ rate_negative_words          : num  0.231 0.267 0.143 0.333 0.14 ...
##  $ avg_positive_polarity        : num  0.379 0.287 0.496 0.386 0.411 ...
##  $ min_positive_polarity        : num  0.1 0.0333 0.1 0.1364 0.0333 ...
##  $ max_positive_polarity        : num  0.7 0.7 1 0.8 1 0.6 1 1 0.8 0.5 ...
##  $ avg_negative_polarity        : num  -0.35 -0.119 -0.467 -0.37 -0.22 ...
##  $ min_negative_polarity        : num  -0.6 -0.125 -0.8 -0.6 -0.5 -0.4 -0.5 -0.5 -0.125 -0.5 ...
##  $ max_negative_polarity        : num  -0.2 -0.1 -0.133 -0.167 -0.05 ...
##  $ title_subjectivity           : num  0.5 0 0 0 0.455 ...
##  $ title_sentiment_polarity     : num  -0.188 0 0 0 0.136 ...
##  $ abs_title_subjectivity       : num  0 0.5 0.5 0.5 0.0455 ...
##  $ abs_title_sentiment_polarity : num  0.188 0 0 0 0.136 ...
##  $ shares                       : int  593 711 1500 1200 505 855 556 891 3600 710 ...

Pre Processing

newsShort <- data.frame(news$n_tokens_title, news$n_tokens_content, news$n_unique_tokens, news$n_non_stop_words, news$num_hrefs, news$num_imgs, news$num_videos, news$average_token_length, news$num_keywords, news$kw_max_max, news$global_sentiment_polarity, news$avg_positive_polarity, news$title_subjectivity, news$title_sentiment_polarity, news$abs_title_subjectivity, news$abs_title_sentiment_polarity, news$shares)
colnames(newsShort) <- c("n_tokens_title", "n_tokens_content", "n_unique_tokens", "n_non_stop_words", "num_hrefs", "num_imgs", "num_videos", "average_token_length", "num_keywords", "kw_max_max", "global_sentiment_polarity", "avg_positive_polarity", "title_subjectivity", "title_sentiment_polarity", "abs_title_subjectivity", "abs_title_sentiment_polarity", "shares")

newsShort$popular = rep('na', nrow(newsShort))
for(i in 1:39644) {
     if(newsShort$shares[i] >= 1400) {
         newsShort$popular[i] = "yes"} 
     else {newsShort$popular[i] = "no"}
}
newsShort$shares = newsShort$popular
#for(i in 1:39644) {
#  if(newsShort$shares[i] >= 1400) {
#    newsShort$shares[i] = "yes"} 
 # else {newsShort$shares[i] = "no"}
  #cat("i=,",i," shares=",newsShort$shares[i],"\n")
#}
newsShort$shares <- as.factor(newsShort$shares)
newsShort <- newsShort[-18]
news_n <- as.data.frame(lapply(newsShort[1:16], normalize))


news_rand <- news_n[order(runif(10000)), ]
set.seed(12345)
#Split the data into training and test datasets
news_train <- news_n[1:9000, ]
news_test <- news_n[9001:10000, ]
news_train_labels <- newsShort[1:9000, 17]
news_test_labels <- newsShort[9001:10000, 17]

Considering k=5

news_test_pred <- knn(train = news_train, test = news_test, cl = news_train_labels, k= 5)

(news_tbl <- table(news_test_pred, news_test_labels))
##               news_test_labels
## news_test_pred  no yes
##            no  225 174
##            yes 322 279
(Accuracy <- (news_tbl[1] + news_tbl[4])/sum(news_tbl)*100)
## [1] 50.4

Considering K=95

news_test_pred1  <- knn(train = news_train, test = news_test, cl = news_train_labels, k= 95)
(news_tbl_1 <- table(news_test_pred1, news_test_labels))
##                news_test_labels
## news_test_pred1  no yes
##             no  108  62
##             yes 439 391
(Accuracy <- (news_tbl_1[1] + news_tbl_1[4])/sum(news_tbl_1)*100)
## [1] 49.9

Q4: Now let’s get back to our problem of news popularity and see if we can apply KNN (K- nearest neighbors) to improve the accuracy of the model. Use the same strategy of training and testing that we did on first 2 labs, and don’t forget that whenever it is required you should use: set.seed(12345).

Answer 4: The above KNN Method does not improve the accuracy of the model for news popularity data set when K is set to be 5 or 95.

Accuracy when k=5 is 50.4

Accuracy when k=95 is 49.9