Introduction

The main purpose of this lab is to understand how to integrate unsupervised and supervised learning techniques. First, we will apply K-means clustering, and then use its result as input to a supervised learning analysis. This is the basis of a mixed-methods, or semi-supervised, learning analysis.

Part 1 Wholesale Customer Dataset

We will use the Wholesale customer dataset in Part 1 to perform a K-means analysis and find the best value of K.

Step 1: Pre-processing

getwd()
## [1] "C:/Users/dkkan/Desktop/Harrisburg/530machinelearning/lab3"
setwd("C:/Users/dkkan/Desktop/Harrisburg/530machinelearning/lab3")
#Load the wholesale customer dataset
library(readr)
data <- read_csv("Wholesale_customers_data.csv")
## 
## -- Column specification --------------------------------------------------------
## cols(
##   Channel = col_double(),
##   Region = col_double(),
##   Fresh = col_double(),
##   Milk = col_double(),
##   Grocery = col_double(),
##   Frozen = col_double(),
##   Detergents_Paper = col_double(),
##   Delicassen = col_double()
## )
#Pre-Processing
str(data)
## tibble [440 x 8] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ Channel         : num [1:440] 2 2 2 1 2 2 2 2 1 2 ...
##  $ Region          : num [1:440] 3 3 3 3 3 3 3 3 3 3 ...
##  $ Fresh           : num [1:440] 12669 7057 6353 13265 22615 ...
##  $ Milk            : num [1:440] 9656 9810 8808 1196 5410 ...
##  $ Grocery         : num [1:440] 7561 9568 7684 4221 7198 ...
##  $ Frozen          : num [1:440] 214 1762 2405 6404 3915 ...
##  $ Detergents_Paper: num [1:440] 2674 3293 3516 507 1777 ...
##  $ Delicassen      : num [1:440] 1338 1776 7844 1788 5185 ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   Channel = col_double(),
##   ..   Region = col_double(),
##   ..   Fresh = col_double(),
##   ..   Milk = col_double(),
##   ..   Grocery = col_double(),
##   ..   Frozen = col_double(),
##   ..   Detergents_Paper = col_double(),
##   ..   Delicassen = col_double()
##   .. )
summary(data)
##     Channel          Region          Fresh             Milk      
##  Min.   :1.000   Min.   :1.000   Min.   :     3   Min.   :   55  
##  1st Qu.:1.000   1st Qu.:2.000   1st Qu.:  3128   1st Qu.: 1533  
##  Median :1.000   Median :3.000   Median :  8504   Median : 3627  
##  Mean   :1.323   Mean   :2.543   Mean   : 12000   Mean   : 5796  
##  3rd Qu.:2.000   3rd Qu.:3.000   3rd Qu.: 16934   3rd Qu.: 7190  
##  Max.   :2.000   Max.   :3.000   Max.   :112151   Max.   :73498  
##     Grocery          Frozen        Detergents_Paper    Delicassen     
##  Min.   :    3   Min.   :   25.0   Min.   :    3.0   Min.   :    3.0  
##  1st Qu.: 2153   1st Qu.:  742.2   1st Qu.:  256.8   1st Qu.:  408.2  
##  Median : 4756   Median : 1526.0   Median :  816.5   Median :  965.5  
##  Mean   : 7951   Mean   : 3071.9   Mean   : 2881.5   Mean   : 1524.9  
##  3rd Qu.:10656   3rd Qu.: 3554.2   3rd Qu.: 3922.0   3rd Qu.: 1820.2  
##  Max.   :92780   Max.   :60869.0   Max.   :40827.0   Max.   :47943.0
#Return the row indices of the top-n customers in each of the given columns
top.n.custs <- function(data, cols, n = 5) {
  idx.to.remove <- integer(0)
  for (c in cols) {
    #Sort the column in decreasing order and take the first n row indices
    col.order <- order(data[, c], decreasing = TRUE)
    idx <- head(col.order, n)
    #union() de-duplicates customers that are extreme in several columns
    idx.to.remove <- union(idx.to.remove, idx)
  }
  return(idx.to.remove)
}

#How many customers will be removed?
data = data.frame(data) #order() on a column needs a plain data frame, not a tibble
top.custs <- top.n.custs(data, cols = 3:8, n = 5)
length(top.custs) #6 columns x 5 = 30 candidates, but union() leaves 19 unique customers
## [1] 19
#Examine the customers
data[top.custs,]
##     Channel Region  Fresh  Milk Grocery Frozen Detergents_Paper Delicassen
## 182       1      3 112151 29627   18148  16745             4948       8550
## 126       1      3  76237  3473    7102  16538              778        918
## 285       1      3  68951  4411   12609   8692              751       2406
## 40        1      3  56159   555     902  10002              212       2916
## 259       1      1  56083  4563    2124   6422              730       3321
## 87        2      3  22925 73498   32114    987            20070        903
## 48        2      3  44466 54259   55571   7782            24171       6465
## 86        2      3  16117 46197   92780   1026            40827       2944
## 184       1      3  36847 43950   20170  36534              239      47943
## 62        2      3  35942 38369   59598   3254            26701       2017
## 334       2      2   8565  4980   67298    131            38102       1215
## 66        2      3     85 20959   45828     36            24231       1423
## 326       1      2  32717 16784   13626  60869             1272       5609
## 94        1      3  11314  3090    2062  35009               71       2698
## 197       1      1  30624  7209    4897  18711              763       2876
## 104       1      3  56082  3504    8906  18028             1480       2498
## 24        2      3  26373 36423   22019   5154             4337      16523
## 72        1      3  18291  1266   21042   5373             4173      14472
## 88        1      3  43265  5025    8117   6312             1579      14351
Step 2: Model Design

#Model Design
data.rm.top<-data[-c(top.custs),] #Remove the Customers
set.seed(76964057) #Set the seed for reproducibility
k <-kmeans(data.rm.top[,-c(1,2)], centers=5) #Create 5 clusters, Remove columns 1 and 2
k$centers #Display cluster centers
##       Fresh      Milk   Grocery   Frozen Detergents_Paper Delicassen
## 1  5830.214 15295.048 23449.167 1936.452       10361.6429   1912.738
## 2 18649.606  3335.586  4497.848 3301.747        1046.5859   1450.566
## 3  5845.392  2337.319  2878.205 2766.596         660.2952    858.994
## 4  4238.892  7725.289 11011.747 1336.566        4733.3614   1400.530
## 5 35922.387  4851.806  5862.581 3730.677        1004.6129   1552.161
#Data points in each cluster
table(k$cluster)
## 
##   1   2   3   4   5 
##  42  99 166  83  31
Step 3: Optimum K

#Evaluation of the model: sweep k from 2 to 20
rng <- 2:20
tries <- 100 #Run k-means 100 times per k and average the results

#Set up an empty vector to hold the average total within-cluster SS for each k
avg.totw.ss <- integer(length(rng))
for (v in rng) {
  v.totw.ss <- integer(tries)
  for (i in 1:tries) {
    #Note: unlike Step 2, this sweep clusters on all columns, including Channel and Region
    k.temp <- kmeans(data.rm.top, centers = v)
    v.totw.ss[i] <- k.temp$tot.withinss
  }
  avg.totw.ss[v - 1] <- mean(v.totw.ss) #Store the average for this k
}
## Warning: did not converge in 10 iterations
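
This warning means some kmeans runs hit the default limit of 10 Lloyd iterations before converging. If desired, the sweep can be re-run with more generous settings; a sketch with illustrative values (not the ones used for Fig. 1 below):

#Same sweep, allowing up to 50 iterations and 10 random restarts per fit
avg.totw.ss2 <- sapply(rng, function(v) {
  mean(replicate(tries, kmeans(data.rm.top, centers = v,
                               iter.max = 50, nstart = 10)$tot.withinss))
})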
library(ggplot2)
ggplot(aes(x, y), data = data.frame("x" = 2:20, "y" = avg.totw.ss / 1e9)) +
  geom_line() + geom_point(size = 2.5) +
  labs(x = "Number of Clusters (starting with k=2)",
       y = "Average Total Within Sum of Squares (×10^9)") +
  ggtitle("Fig. 1. Plot of TotW.SS by k in \"Wholesale\" Clustering") +
  theme_classic()

Q1- Given that this is an imperfect real-world dataset, you need to determine what you believe is the best value for "k" and write up this portion of your lab report.

Ans: As the plot above shows, the optimal K is 3, because the rate of decrease in the average total within-cluster sum of squares drops off after K = 3. By the elbow method, the best value of K could be 2, 3, or 5. In my opinion, K = 3 is better than K = 2 or K = 5: at K = 5 there is no distinct clustering, since the within-cluster sum of squares remains very large.
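
As a rough numeric check on the elbow, the percentage drop in the averaged within-cluster sum of squares between successive values of k can be computed from the avg.totw.ss vector above (a minimal sketch):

#Percentage decrease in average total within-cluster SS from each k to k+1
pct.drop <- -diff(avg.totw.ss) / head(avg.totw.ss, -1) * 100
names(pct.drop) <- paste0("k=", 3:20)
round(pct.drop, 1)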

Q2- How many points do you see in each cluster?

Ans: There are 421 points in total. With the seed set to 76964057 for reproducibility, the clusters contain the following numbers of points: cluster 1: 42, cluster 2: 99, cluster 3: 166, cluster 4: 83, cluster 5: 31.

Part-2 Wine Dataset

The Wine dataset, used in Part 2, records the quantities of 13 constituents found in each of three types of wine. We will use the K-means algorithm to find the best value of K for these data.

Exploring and Building k-Means Clustering models

#Load the data Wine
wine <- read_csv("wine.csv")
## 
## -- Column specification --------------------------------------------------------
## cols(
##   Wine = col_double(),
##   Alcohol = col_double(),
##   Malic.acid = col_double(),
##   Ash = col_double(),
##   Acl = col_double(),
##   Mg = col_double(),
##   Phenols = col_double(),
##   Flavanoids = col_double(),
##   Nonflavanoid.phenols = col_double(),
##   Proanth = col_double(),
##   Color.int = col_double(),
##   Hue = col_double(),
##   OD = col_double(),
##   Proline = col_double()
## )
str(wine)
## tibble [178 x 14] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ Wine                : num [1:178] 1 1 1 1 1 1 1 1 1 1 ...
##  $ Alcohol             : num [1:178] 14.2 13.2 13.2 14.4 13.2 ...
##  $ Malic.acid          : num [1:178] 1.71 1.78 2.36 1.95 2.59 1.76 1.87 2.15 1.64 1.35 ...
##  $ Ash                 : num [1:178] 2.43 2.14 2.67 2.5 2.87 2.45 2.45 2.61 2.17 2.27 ...
##  $ Acl                 : num [1:178] 15.6 11.2 18.6 16.8 21 15.2 14.6 17.6 14 16 ...
##  $ Mg                  : num [1:178] 127 100 101 113 118 112 96 121 97 98 ...
##  $ Phenols             : num [1:178] 2.8 2.65 2.8 3.85 2.8 3.27 2.5 2.6 2.8 2.98 ...
##  $ Flavanoids          : num [1:178] 3.06 2.76 3.24 3.49 2.69 3.39 2.52 2.51 2.98 3.15 ...
##  $ Nonflavanoid.phenols: num [1:178] 0.28 0.26 0.3 0.24 0.39 0.34 0.3 0.31 0.29 0.22 ...
##  $ Proanth             : num [1:178] 2.29 1.28 2.81 2.18 1.82 1.97 1.98 1.25 1.98 1.85 ...
##  $ Color.int           : num [1:178] 5.64 4.38 5.68 7.8 4.32 6.75 5.25 5.05 5.2 7.22 ...
##  $ Hue                 : num [1:178] 1.04 1.05 1.03 0.86 1.04 1.05 1.02 1.06 1.08 1.01 ...
##  $ OD                  : num [1:178] 3.92 3.4 3.17 3.45 2.93 2.85 3.58 3.58 2.85 3.55 ...
##  $ Proline             : num [1:178] 1065 1050 1185 1480 735 ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   Wine = col_double(),
##   ..   Alcohol = col_double(),
##   ..   Malic.acid = col_double(),
##   ..   Ash = col_double(),
##   ..   Acl = col_double(),
##   ..   Mg = col_double(),
##   ..   Phenols = col_double(),
##   ..   Flavanoids = col_double(),
##   ..   Nonflavanoid.phenols = col_double(),
##   ..   Proanth = col_double(),
##   ..   Color.int = col_double(),
##   ..   Hue = col_double(),
##   ..   OD = col_double(),
##   ..   Proline = col_double()
##   .. )
#Best K value: plot the within-groups sum of squares for k = 1 to nc
wssplot <- function(data, nc=15, seed=1234){
  #For k = 1, the within-groups SS is (n - 1) times the sum of the column variances
  wss <- (nrow(data)-1)*sum(apply(data,2,var))
  for (i in 2:nc){
    set.seed(seed)
    wss[i] <- sum(kmeans(data, centers=i)$withinss)}
  plot(1:nc, wss, type="b", xlab="Number of Clusters",
       ylab="Within groups sum of squares")}

#Scale the dataset (k-means is distance-based, so the 13 constituents should be on comparable scales)
df <- scale(wine[-1])

#Optimum K
wssplot(df)

#K means Analysis
library(NbClust)
set.seed(1234)
nc <- NbClust(df, min.nc=2, max.nc = 15, method = "kmeans")

## *** : The Hubert index is a graphical method of determining the number of clusters.
##                 In the plot of Hubert index, we seek a significant knee that corresponds to a 
##                 significant increase of the value of the measure i.e the significant peak in Hubert
##                 index second differences plot. 
## 

## *** : The D index is a graphical method of determining the number of clusters. 
##                 In the plot of D index, we seek a significant knee (the significant peak in Dindex
##                 second differences plot) that corresponds to a significant increase of the value of
##                 the measure. 
##  
## ******************************************************************* 
## * Among all indices:                                                
## * 2 proposed 2 as the best number of clusters 
## * 19 proposed 3 as the best number of clusters 
## * 1 proposed 14 as the best number of clusters 
## * 1 proposed 15 as the best number of clusters 
## 
##                    ***** Conclusion *****                            
##  
## * According to the majority rule, the best number of clusters is  3 
##  
##  
## *******************************************************************
barplot(table(nc$Best.n[1,]), xlab = "Number of Clusters", ylab = "Number of Criteria", main = "Number of Clusters Chosen by 26 Criteria")

#K-means: train and evaluate
set.seed(1234)
fit.km <- kmeans(df, 3, nstart=25)
(df_km <- table(wine$Wine, fit.km$cluster))
##    
##      1  2  3
##   1 59  0  0
##   2  3 65  3
##   3  0  0 48
#Check Accuracy
(Accuracy <- (sum(diag(df_km))/sum(df_km)*100))
## [1] 96.62921
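
Note that this accuracy relies on the cluster numbers happening to line up with the wine classes along the diagonal of the table. A label-free agreement measure such as the adjusted Rand index avoids that assumption; a quick check, assuming the flexclust package is installed:

library(flexclust)
#Adjusted Rand index of the class-by-cluster table (1 = perfect agreement, 0 = chance)
randIndex(df_km)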
#K means Plot
library(cluster)
clusplot(df, fit.km$cluster, main='2D representation of the Cluster solution',
         color=TRUE, shade=TRUE,
         labels=2, lines=0)

Explanation: Similar to Part 1, the optimal K is 3, and the confusion matrix shows 96.6% accuracy. Cluster 1 is heavy in Alcohol, Ash, Magnesium, Total Phenols, Flavanoids, and Proline. Cluster 2 is heavy in Hue, Color Intensity, and Proline, and contains the outliers of the group. Cluster 3 is heavy in Malic Acid, Alcalinity of Ash, Nonflavanoid Phenols, and Color Intensity, with relatively low values of Total Phenols, Flavanoids, Proanthocyanins, and Hue, the opposite of Cluster 1.
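
These per-cluster profiles can be read directly off the fitted centers, which are in z-score units because the data were scaled beforehand:

#Mean scaled value of each constituent per cluster; positive values mean the
#cluster sits above the overall average for that variable
round(fit.km$centers, 2)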

Part-3 Classification Model

For Part 3 we will train a classifier to assign wines to the clusters we obtained in Part 2. The first thing we'll do is label the data frame we have been using with the cluster labels.

df_rpart <- data.frame(k=fit.km$cluster, df)

#Randomize the row order of the dataset (note: sample() is not seeded here,
#so this shuffle is not reproducible)
rdf <- df_rpart[sample(1:nrow(df_rpart)), ]

#Prepare train (80%) and test (20%) data
train <- rdf[1:(as.integer(.8*nrow(rdf))-1), ]
test <- rdf[(as.integer(.8*nrow(rdf))):nrow(rdf), ]

#Training the model 
library(rpart)
fit <- rpart(k ~ ., data=train, method="class")
library(rattle)
## Loading required package: tibble
## Loading required package: bitops
## Rattle: A free graphical interface for data science with R.
## Version 5.4.0 Copyright (c) 2006-2020 Togaware Pty Ltd.
## Type 'rattle()' to shake, rattle, and roll your data.
## 
## Attaching package: 'rattle'
## The following object is masked _by_ '.GlobalEnv':
## 
##     wine
fancyRpartPlot(fit)

#Model Evaluation
pred <- predict(fit, test, type="class")
(wine_tbl <- table(pred, test$k))
##     
## pred  1  2  3
##    1 14  1  0
##    2  2  9  0
##    3  0  2  9
#Check Accuracy
(Accuracy <- (sum(diag(wine_tbl))/sum(wine_tbl)*100))
## [1] 86.48649

Explanation: As shown above, we trained a classifier to assign wines to the 3 clusters identified in Part 2. First, we randomized the data and then split it into 80% training and 20% test sets. The model achieved 86.48% accuracy; part of the error traces back to the points that were already misclassified in the Part 2 clusters. The three most important features in this model are "Flavanoids", "Phenols", and "OD".
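
The feature ranking can be confirmed from the fitted tree itself, since rpart stores a named vector of importance scores:

#Variable importance accumulated over primary and surrogate splits
round(fit$variable.importance, 1)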

Part-4 Breast Cancer Dataset

Q3- Load the breast cancer dataset. Do the preliminary analysis and implement a KNN (k-nearest neighbors) model for this dataset, and don't forget that whenever it is required you should use: set.seed(12345).

wbcd <- read_csv("wisc_bc_data.csv")
## 
## -- Column specification --------------------------------------------------------
## cols(
##   .default = col_double(),
##   diagnosis = col_character()
## )
## i Use `spec()` for the full column specifications.
wbcd<- as.data.frame(wbcd)
str(wbcd)
## 'data.frame':    569 obs. of  32 variables:
##  $ id               : num  87139402 8910251 905520 868871 9012568 ...
##  $ diagnosis        : chr  "B" "B" "B" "B" ...
##  $ radius_mean      : num  12.3 10.6 11 11.3 15.2 ...
##  $ texture_mean     : num  12.4 18.9 16.8 13.4 13.2 ...
##  $ perimeter_mean   : num  78.8 69.3 70.9 73 97.7 ...
##  $ area_mean        : num  464 346 373 385 712 ...
##  $ smoothness_mean  : num  0.1028 0.0969 0.1077 0.1164 0.0796 ...
##  $ compactness_mean : num  0.0698 0.1147 0.078 0.1136 0.0693 ...
##  $ concavity_mean   : num  0.0399 0.0639 0.0305 0.0464 0.0339 ...
##  $ points_mean      : num  0.037 0.0264 0.0248 0.048 0.0266 ...
##  $ symmetry_mean    : num  0.196 0.192 0.171 0.177 0.172 ...
##  $ dimension_mean   : num  0.0595 0.0649 0.0634 0.0607 0.0554 ...
##  $ radius_se        : num  0.236 0.451 0.197 0.338 0.178 ...
##  $ texture_se       : num  0.666 1.197 1.387 1.343 0.412 ...
##  $ perimeter_se     : num  1.67 3.43 1.34 1.85 1.34 ...
##  $ area_se          : num  17.4 27.1 13.5 26.3 17.7 ...
##  $ smoothness_se    : num  0.00805 0.00747 0.00516 0.01127 0.00501 ...
##  $ compactness_se   : num  0.0118 0.03581 0.00936 0.03498 0.01485 ...
##  $ concavity_se     : num  0.0168 0.0335 0.0106 0.0219 0.0155 ...
##  $ points_se        : num  0.01241 0.01365 0.00748 0.01965 0.00915 ...
##  $ symmetry_se      : num  0.0192 0.035 0.0172 0.0158 0.0165 ...
##  $ dimension_se     : num  0.00225 0.00332 0.0022 0.00344 0.00177 ...
##  $ radius_worst     : num  13.5 11.9 12.4 11.9 16.2 ...
##  $ texture_worst    : num  15.6 22.9 26.4 15.8 15.7 ...
##  $ perimeter_worst  : num  87 78.3 79.9 76.5 104.5 ...
##  $ area_worst       : num  549 425 471 434 819 ...
##  $ smoothness_worst : num  0.139 0.121 0.137 0.137 0.113 ...
##  $ compactness_worst: num  0.127 0.252 0.148 0.182 0.174 ...
##  $ concavity_worst  : num  0.1242 0.1916 0.1067 0.0867 0.1362 ...
##  $ points_worst     : num  0.0939 0.0793 0.0743 0.0861 0.0818 ...
##  $ symmetry_worst   : num  0.283 0.294 0.3 0.21 0.249 ...
##  $ dimension_worst  : num  0.0677 0.0759 0.0788 0.0678 0.0677 ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   id = col_double(),
##   ..   diagnosis = col_character(),
##   ..   radius_mean = col_double(),
##   ..   texture_mean = col_double(),
##   ..   perimeter_mean = col_double(),
##   ..   area_mean = col_double(),
##   ..   smoothness_mean = col_double(),
##   ..   compactness_mean = col_double(),
##   ..   concavity_mean = col_double(),
##   ..   points_mean = col_double(),
##   ..   symmetry_mean = col_double(),
##   ..   dimension_mean = col_double(),
##   ..   radius_se = col_double(),
##   ..   texture_se = col_double(),
##   ..   perimeter_se = col_double(),
##   ..   area_se = col_double(),
##   ..   smoothness_se = col_double(),
##   ..   compactness_se = col_double(),
##   ..   concavity_se = col_double(),
##   ..   points_se = col_double(),
##   ..   symmetry_se = col_double(),
##   ..   dimension_se = col_double(),
##   ..   radius_worst = col_double(),
##   ..   texture_worst = col_double(),
##   ..   perimeter_worst = col_double(),
##   ..   area_worst = col_double(),
##   ..   smoothness_worst = col_double(),
##   ..   compactness_worst = col_double(),
##   ..   concavity_worst = col_double(),
##   ..   points_worst = col_double(),
##   ..   symmetry_worst = col_double(),
##   ..   dimension_worst = col_double()
##   .. )
wbcd <- wbcd[-1] #Drop the id column
table(wbcd$diagnosis)
## 
##   B   M 
## 357 212
wbcd$diagnosis <- factor(wbcd$diagnosis, levels = c("B", "M"), labels = c("Benign", "Malignant"))
round(prop.table(table(wbcd$diagnosis)) * 100, digits = 1)
## 
##    Benign Malignant 
##      62.7      37.3
#Min-max normalization: rescale a numeric vector to the [0, 1] range
normalize <- function(x) {
    return ((x - min(x)) / (max(x) - min(x)))
}
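
A quick sanity check of the function on a toy vector:

normalize(c(1, 2, 3, 4, 5))
## [1] 0.00 0.25 0.50 0.75 1.00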
wbcd_n <- as.data.frame(lapply(wbcd[2:31], normalize))
summary(wbcd_n)
##   radius_mean      texture_mean    perimeter_mean     area_mean     
##  Min.   :0.0000   Min.   :0.0000   Min.   :0.0000   Min.   :0.0000  
##  1st Qu.:0.2233   1st Qu.:0.2185   1st Qu.:0.2168   1st Qu.:0.1174  
##  Median :0.3024   Median :0.3088   Median :0.2933   Median :0.1729  
##  Mean   :0.3382   Mean   :0.3240   Mean   :0.3329   Mean   :0.2169  
##  3rd Qu.:0.4164   3rd Qu.:0.4089   3rd Qu.:0.4168   3rd Qu.:0.2711  
##  Max.   :1.0000   Max.   :1.0000   Max.   :1.0000   Max.   :1.0000  
##  smoothness_mean  compactness_mean concavity_mean     points_mean    
##  Min.   :0.0000   Min.   :0.0000   Min.   :0.00000   Min.   :0.0000  
##  1st Qu.:0.3046   1st Qu.:0.1397   1st Qu.:0.06926   1st Qu.:0.1009  
##  Median :0.3904   Median :0.2247   Median :0.14419   Median :0.1665  
##  Mean   :0.3948   Mean   :0.2606   Mean   :0.20806   Mean   :0.2431  
##  3rd Qu.:0.4755   3rd Qu.:0.3405   3rd Qu.:0.30623   3rd Qu.:0.3678  
##  Max.   :1.0000   Max.   :1.0000   Max.   :1.00000   Max.   :1.0000  
##  symmetry_mean    dimension_mean     radius_se         texture_se    
##  Min.   :0.0000   Min.   :0.0000   Min.   :0.00000   Min.   :0.0000  
##  1st Qu.:0.2823   1st Qu.:0.1630   1st Qu.:0.04378   1st Qu.:0.1047  
##  Median :0.3697   Median :0.2439   Median :0.07702   Median :0.1653  
##  Mean   :0.3796   Mean   :0.2704   Mean   :0.10635   Mean   :0.1893  
##  3rd Qu.:0.4530   3rd Qu.:0.3404   3rd Qu.:0.13304   3rd Qu.:0.2462  
##  Max.   :1.0000   Max.   :1.0000   Max.   :1.00000   Max.   :1.0000  
##   perimeter_se        area_se        smoothness_se    compactness_se   
##  Min.   :0.00000   Min.   :0.00000   Min.   :0.0000   Min.   :0.00000  
##  1st Qu.:0.04000   1st Qu.:0.02064   1st Qu.:0.1175   1st Qu.:0.08132  
##  Median :0.07209   Median :0.03311   Median :0.1586   Median :0.13667  
##  Mean   :0.09938   Mean   :0.06264   Mean   :0.1811   Mean   :0.17444  
##  3rd Qu.:0.12251   3rd Qu.:0.07170   3rd Qu.:0.2187   3rd Qu.:0.22680  
##  Max.   :1.00000   Max.   :1.00000   Max.   :1.0000   Max.   :1.00000  
##   concavity_se       points_se       symmetry_se      dimension_se    
##  Min.   :0.00000   Min.   :0.0000   Min.   :0.0000   Min.   :0.00000  
##  1st Qu.:0.03811   1st Qu.:0.1447   1st Qu.:0.1024   1st Qu.:0.04675  
##  Median :0.06538   Median :0.2070   Median :0.1526   Median :0.07919  
##  Mean   :0.08054   Mean   :0.2235   Mean   :0.1781   Mean   :0.10019  
##  3rd Qu.:0.10619   3rd Qu.:0.2787   3rd Qu.:0.2195   3rd Qu.:0.12656  
##  Max.   :1.00000   Max.   :1.0000   Max.   :1.0000   Max.   :1.00000  
##   radius_worst    texture_worst    perimeter_worst    area_worst     
##  Min.   :0.0000   Min.   :0.0000   Min.   :0.0000   Min.   :0.00000  
##  1st Qu.:0.1807   1st Qu.:0.2415   1st Qu.:0.1678   1st Qu.:0.08113  
##  Median :0.2504   Median :0.3569   Median :0.2353   Median :0.12321  
##  Mean   :0.2967   Mean   :0.3640   Mean   :0.2831   Mean   :0.17091  
##  3rd Qu.:0.3863   3rd Qu.:0.4717   3rd Qu.:0.3735   3rd Qu.:0.22090  
##  Max.   :1.0000   Max.   :1.0000   Max.   :1.0000   Max.   :1.00000  
##  smoothness_worst compactness_worst concavity_worst    points_worst   
##  Min.   :0.0000   Min.   :0.0000    Min.   :0.00000   Min.   :0.0000  
##  1st Qu.:0.3000   1st Qu.:0.1163    1st Qu.:0.09145   1st Qu.:0.2231  
##  Median :0.3971   Median :0.1791    Median :0.18107   Median :0.3434  
##  Mean   :0.4041   Mean   :0.2202    Mean   :0.21740   Mean   :0.3938  
##  3rd Qu.:0.4942   3rd Qu.:0.3025    3rd Qu.:0.30583   3rd Qu.:0.5546  
##  Max.   :1.0000   Max.   :1.0000    Max.   :1.00000   Max.   :1.0000  
##  symmetry_worst   dimension_worst 
##  Min.   :0.0000   Min.   :0.0000  
##  1st Qu.:0.1851   1st Qu.:0.1077  
##  Median :0.2478   Median :0.1640  
##  Mean   :0.2633   Mean   :0.1896  
##  3rd Qu.:0.3182   3rd Qu.:0.2429  
##  Max.   :1.0000   Max.   :1.0000
set.seed(12345)

#Split into training (rows 1-469) and test (rows 470-569) datasets
wbcd_train <- wbcd_n[1:469, ]
wbcd_test <- wbcd_n[470:569, ]
wbcd_train_labels <- wbcd[1:469, 1]
wbcd_test_labels <- wbcd[470:569, 1]

library(class)

#Train and classify with k = 21, roughly the square root of the 469 training rows
wbcd_test_pred <- knn(train = wbcd_train, test = wbcd_test, cl=wbcd_train_labels, k = 21)
(wbcd_tbl <- table(wbcd_test_pred, wbcd_test_labels))
##               wbcd_test_labels
## wbcd_test_pred Benign Malignant
##      Benign        61         2
##      Malignant      0        37
#Check the accuracy
(Accuracy <- (wbcd_tbl[1]+wbcd_tbl[4])/sum(wbcd_tbl)*100)
## [1] 98
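
To see how sensitive the result is to the choice of k, a small sweep over candidate values can be run (a minimal sketch; the seed is reset before each fit because knn breaks voting ties at random):

for (k in c(1, 5, 11, 15, 21, 27)) {
  set.seed(12345)
  pred <- knn(train = wbcd_train, test = wbcd_test, cl = wbcd_train_labels, k = k)
  cat("k =", k, "accuracy =", mean(pred == wbcd_test_labels) * 100, "%\n")
}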

Part-5 News Popularity

Q4- Now let's get back to our problem of news popularity and see if we can apply KNN (k-nearest neighbors) to improve the accuracy of the model. Use the same strategy of training and testing that we used in the first 2 labs, and don't forget that whenever it is required you should use: set.seed(12345). Use PCA to reduce the number of features.

#Load the data
news<- read_csv("OnlineNewsPopularity.csv")
## 
## -- Column specification --------------------------------------------------------
## cols(
##   .default = col_double(),
##   url = col_character()
## )
## i Use `spec()` for the full column specifications.
newsShort <- data.frame(news$n_tokens_title, news$n_tokens_content,
                        news$n_unique_tokens, news$n_non_stop_words,
                        news$num_hrefs, news$num_imgs, news$num_videos,
                        news$average_token_length, news$num_keywords,
                        news$kw_max_max, news$global_sentiment_polarity,
                        news$avg_positive_polarity, news$title_subjectivity,
                        news$title_sentiment_polarity, news$abs_title_subjectivity,
                        news$abs_title_sentiment_polarity, news$shares)

colnames(newsShort) <- c("n_tokens_title", "n_tokens_content", "n_unique_tokens",
                         "n_non_stop_words", "num_hrefs", "num_imgs", "num_videos",
                         "average_token_length", "num_keywords", "kw_max_max",
                         "global_sentiment_polarity", "avg_positive_polarity",
                         "title_subjectivity", "title_sentiment_polarity",
                         "abs_title_subjectivity", "abs_title_sentiment_polarity",
                         "shares")
#Label articles with at least 1,400 shares as popular
newsShort$popular <- ifelse(newsShort$shares >= 1400, "yes", "no")

#Replace the numeric shares column with the yes/no popularity label
newsShort$shares = newsShort$popular

newsShort$shares <- as.factor(newsShort$shares)
newsShort <- newsShort[-18] #Drop the helper "popular" column
news_n <- as.data.frame(lapply(newsShort[1:16], normalize)) #Min-max normalize the 16 numeric features

set.seed(12345)
#Split the data into training (first 9,000 rows) and test (next 1,000 rows) datasets
news_train <- news_n[1:9000, ]
news_test <- news_n[9001:10000, ]
news_train_labels <- newsShort[1:9000, 17]
news_test_labels <- newsShort[9001:10000, 17]

#Use KNN to classify the test articles, starting with k = 5
news_test_pred <- knn(train = news_train, test = news_test, cl = news_train_labels, k= 5)

(news_tbl <- table(news_test_pred, news_test_labels))
##               news_test_labels
## news_test_pred  no yes
##            no  225 174
##            yes 322 279
(Accuracy <- (news_tbl[1] + news_tbl[4])/sum(news_tbl)*100)
## [1] 50.4
#Let’s try another k: k = 95 is about the square root of 9000, the number of training records in the data set.
news_test_pred1  <- knn(train = news_train, test = news_test, cl = news_train_labels, k= 95)
(news_tbl_1 <- table(news_test_pred1, news_test_labels))
##                news_test_labels
## news_test_pred1  no yes
##             no  108  62
##             yes 439 391
(Accuracy <- (news_tbl_1[1] + news_tbl_1[4])/sum(news_tbl_1)*100)
## [1] 49.9

Result: After loading the news popularity data, we set the random seed to 12345. In the first KNN run we used k = 5 and obtained 50.4% accuracy, a high error rate driven by misclassification. We then tried k = 95 to see whether the accuracy improved. Unfortunately, the second run does not improve the model: it makes 501 mistakes on the 1,000 test examples, more than the 496 made at k = 5, for an accuracy of 49.9%, slightly below the first model.
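
The question also asks for PCA to reduce the number of features, which the runs above do not apply. A minimal sketch of how PCA could be wired into the same split (retaining 8 components is an illustrative assumption, not a tuned choice):

#Learn the rotation on the training rows only, then project the test rows
pca <- prcomp(news_train) #Columns are already min-max scaled to [0, 1]
news_train_pc <- pca$x[, 1:8]
news_test_pc <- predict(pca, news_test)[, 1:8]

set.seed(12345)
news_pred_pca <- knn(train = news_train_pc, test = news_test_pc,
                     cl = news_train_labels, k = 95)
mean(news_pred_pca == news_test_labels) * 100 #Accuracy in percent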