#OBJECTIVE

#This lab is intended to help you understand how to integrate unsupervised and supervised learning techniques. You’ll start by considering unsupervised learning using k-Means clustering. Then, you’ll use the results of that analysis as input for a supervised learning analysis, specifically a classification analysis. This is the basis for a mixed methods or semi-supervised learning analysis.

###PART 1

##PRE-PROCESSING the data for wholesale customers

#Load the wholesale customers dataset
data <- read.csv("C:/Users/Priya/Downloads/Wholesale customers data.csv")
summary(data)
##     Channel          Region          Fresh             Milk      
##  Min.   :1.000   Min.   :1.000   Min.   :     3   Min.   :   55  
##  1st Qu.:1.000   1st Qu.:2.000   1st Qu.:  3128   1st Qu.: 1533  
##  Median :1.000   Median :3.000   Median :  8504   Median : 3627  
##  Mean   :1.323   Mean   :2.543   Mean   : 12000   Mean   : 5796  
##  3rd Qu.:2.000   3rd Qu.:3.000   3rd Qu.: 16934   3rd Qu.: 7190  
##  Max.   :2.000   Max.   :3.000   Max.   :112151   Max.   :73498  
##     Grocery          Frozen        Detergents_Paper    Delicassen     
##  Min.   :    3   Min.   :   25.0   Min.   :    3.0   Min.   :    3.0  
##  1st Qu.: 2153   1st Qu.:  742.2   1st Qu.:  256.8   1st Qu.:  408.2  
##  Median : 4756   Median : 1526.0   Median :  816.5   Median :  965.5  
##  Mean   : 7951   Mean   : 3071.9   Mean   : 2881.5   Mean   : 1524.9  
##  3rd Qu.:10656   3rd Qu.: 3554.2   3rd Qu.: 3922.0   3rd Qu.: 1820.2  
##  Max.   :92780   Max.   :60869.0   Max.   :40827.0   Max.   :47943.0
# This is the function that will remove the n highest instances (outliers) in each given column from the dataset
top.n.custs <- function(data, cols, n=5) {
  #Requires a data frame, the columns to scan, and the top n to remove
  idx.to.remove <- integer(0)
  #Initialize a vector to hold the row indexes of customers being removed
  for (c in cols) {
    #For every column in the data we passed to this function
    col.order <- order(data[,c], decreasing=TRUE)
    #Sort column "c" in descending order (biggest on top)
    #order() returns the sorted row indexes (e.g. row 15, 3, 7, 1, ...) rather than the actual sorted values
    idx <- head(col.order, n)
    #Take the row indexes of the top n values of column c
    idx.to.remove <- union(idx.to.remove, idx)
    #Combine and de-duplicate the row ids that need to be removed
  }
  return(idx.to.remove)
  #Return the indexes of customers to be removed
}
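#A small toy illustration of the pieces used above (hypothetical values, not part of the lab data):
x <- c(10, 50, 30, 40, 20)
order(x, decreasing=TRUE)           #returns 2 4 3 5 1 (row indexes, largest value first)
head(order(x, decreasing=TRUE), 2)  #returns 2 4, the indexes of the two largest values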
#Call the function on the six spending columns: how many customers will be removed?
top.custs <- top.n.custs(data, cols=3:8, n=5)
length(top.custs)
## [1] 19
#Call the function again, this time on columns 1:5 (note this range includes Channel and Region)
top.custs <- top.n.custs(data, cols=1:5, n=5)
length(top.custs)
## [1] 18
#Examine the customers flagged for removal
data[top.custs,]
##     Channel Region  Fresh  Milk Grocery Frozen Detergents_Paper Delicassen
## 1         2      3  12669  9656    7561    214             2674       1338
## 2         2      3   7057  9810    9568   1762             3293       1776
## 3         2      3   6353  8808    7684   2405             3516       7844
## 5         2      3  22615  5410    7198   3915             1777       5185
## 6         2      3   9413  8259    5126    666             1795       1451
## 4         1      3  13265  1196    4221   6404              507       1788
## 182       1      3 112151 29627   18148  16745             4948       8550
## 126       1      3  76237  3473    7102  16538              778        918
## 285       1      3  68951  4411   12609   8692              751       2406
## 40        1      3  56159   555     902  10002              212       2916
## 259       1      1  56083  4563    2124   6422              730       3321
## 87        2      3  22925 73498   32114    987            20070        903
## 48        2      3  44466 54259   55571   7782            24171       6465
## 86        2      3  16117 46197   92780   1026            40827       2944
## 184       1      3  36847 43950   20170  36534              239      47943
## 62        2      3  35942 38369   59598   3254            26701       2017
## 334       2      2   8565  4980   67298    131            38102       1215
## 66        2      3     85 20959   45828     36            24231       1423
#Remove the flagged customers and re-check the summary
data.rm.top<-data[-c(top.custs),] #Remove the Customers
summary(data.rm.top)
##     Channel         Region          Fresh            Milk      
##  Min.   :1.00   Min.   :1.000   Min.   :    3   Min.   :   55  
##  1st Qu.:1.00   1st Qu.:2.000   1st Qu.: 3072   1st Qu.: 1497  
##  Median :1.00   Median :3.000   Median : 8130   Median : 3582  
##  Mean   :1.31   Mean   :2.531   Mean   :11076   Mean   : 5172  
##  3rd Qu.:2.00   3rd Qu.:3.000   3rd Qu.:16251   3rd Qu.: 6962  
##  Max.   :2.00   Max.   :3.000   Max.   :56082   Max.   :36423  
##     Grocery          Frozen        Detergents_Paper    Delicassen     
##  Min.   :    3   Min.   :   25.0   Min.   :    3.0   Min.   :    3.0  
##  1st Qu.: 2132   1st Qu.:  738.8   1st Qu.:  255.2   1st Qu.:  398.0  
##  Median : 4603   Median : 1487.5   Median :  799.5   Median :  904.5  
##  Mean   : 7211   Mean   : 2910.3   Mean   : 2541.6   Mean   : 1352.0  
##  3rd Qu.:10391   3rd Qu.: 3428.0   3rd Qu.: 3879.2   3rd Qu.: 1752.2  
##  Max.   :39694   Max.   :60869.0   Max.   :19410.0   Max.   :16523.0
set.seed(76964057) #Set the seed for reproducibility
rng<-2:20
tries<-100
#Set up empty vectors to hold the averaged results for each value of k
avg.totw.ss <-integer(length(rng)) 
avg.totb.ss <- integer(length(rng))
avg.tot.ss <- integer(length(rng))

#For each value of k in the range
for (v in rng) {
  #Set up empty vectors to hold the results of each try
  v.totw.ss <- integer(tries)
  b.totb.ss <- integer(tries)
  tot.ss <- integer(tries)
  #Run kmeans "tries" times for this value of k
  for (i in 1:tries) {
    k.temp <- kmeans(data.rm.top, centers=v)
    #Store the total within-cluster sum of squares
    v.totw.ss[i] <- k.temp$tot.withinss
    #Store the between-cluster sum of squares
    b.totb.ss[i] <- k.temp$betweenss
    #Store the total sum of squares
    tot.ss[i] <- k.temp$totss
  }
  #Average the within, between, and total sums of squares over the tries (stored at position k-1)
  avg.totw.ss[v-1] <- mean(v.totw.ss)
  avg.totb.ss[v-1] <- mean(b.totb.ss)
  avg.tot.ss[v-1] <- mean(tot.ss)
}

plot(rng,avg.totw.ss,type="b", main="Total Within SS by Various K",
ylab="Average Total Within Sum of Squares",
xlab="Value of K")

plot(rng,avg.totb.ss,type="b", main="Total between SS by Various K",
ylab="Average Total Between Sum of Squares", xlab="Value of K")

#Plot the ratio of betweenss/total ss and withinss / total ss for evaluation
plot(rng,avg.totb.ss/avg.tot.ss,type="b", main="Ratio of between ss / the total ss by Various K",
ylab="Ratio Between SS / Total SS",
xlab="Value of K") 
abline(h=0.85, col="red")

plot(rng,avg.totw.ss/avg.tot.ss,type="b", main="Ratio of within ss / the total ss by Various K",
ylab="Ratio Within SS / Total SS",
xlab="Value of K")
abline(h=0.15, col="red")

k <-kmeans(data.rm.top[,-c(1,2)], centers=2)
k$withinss
## [1] 16873576067 50486095378
#Display cluster centers 
print(k$centers)
##       Fresh      Milk   Grocery   Frozen Detergents_Paper Delicassen
## 1  4589.846 10573.068 15829.726 1552.718        6876.3504   1730.547
## 2 13564.652  3099.436  3904.531 3431.039         878.6951   1206.774
library(cluster)

clusplot(data.rm.top, k$cluster, main='2D representation of the Cluster solution',
color=TRUE, shade=TRUE, labels=2, lines=0)

###Q1- Given this is an imperfect real-world dataset, you need to determine what you believe is the best value for “k” and write up this portion of your lab report. You should include a brief discussion of your k-Means analysis as well as the best value of “k” that you determine. You should describe the mixture of variables within the clusters that this value of “k” results in. That is, you need to interpret your k-Means analysis and discuss what it means.

###ANSWER-Q1: The objective is to perform a k-means analysis and find the best (optimum) value of k. The wholesale customers dataset records annual spending on categories such as “Fresh”, “Milk”, and “Grocery”, along with Channel and Region. First we preprocess the data by reading it into a data frame and removing the top 5 customers from each of the selected columns; the Channel and Region variables are then dropped for the final model. After preprocessing, we run the k-means algorithm across a range of k values to find the optimum k. The plots cover k from 2 to 20; looking for the elbow, the candidates are k = 2, 3 or 5. k = 2 appears to be the better choice, because at k = 5 there is no distinct clustering and the within-cluster sum of squares remains very large. The output shows cluster 1 is heavy on “Grocery” and “Milk”, while cluster 2 is heavy on “Fresh” (a quick check of the ratios at the candidate values of k follows below).
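#A quick check on the elbow reading above, using the vectors computed in the loop
#(results are stored at position k-1, so these indexes pick out k = 2, 3 and 5):
round((avg.totb.ss/avg.tot.ss)[c(2, 3, 5) - 1], 3) #ratio of between SS to total SS at k = 2, 3, 5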

###Q2- How many points do you see in each cluster?

###ANSWER-Q2: The first cluster has about 117 points and the second has about 304 (the cluster sizes can also be read directly from the kmeans object, as shown below).
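#One-line check of the cluster sizes from the k-means fit above:
k$size #number of points assigned to each cluster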

###PART 2

#Now switching to the wine dataset, which contains a chemical analysis of wines grown in the same region of Italy but derived from three different cultivars. The objective is to examine the quantities of 13 constituents found in each of the three types of wine by running the k-means algorithm on the dataset and finding the best value of k. First, the wssplot function (together with NbClust) is used to determine the best value of k; the output and the bar plot indicate k = 3. Cluster 1 is heavy on Phenols, Flavanoids and Proline, cluster 2 is heavy on Hue and contains the more atypical samples of the group, and cluster 3 is heavy on Color.int and Malic.acid. The confusion matrix shows that wine types 1 and 3 are recovered cleanly, with all of their points in a single cluster, whereas a few wine type 2 samples fall into clusters 1 and 3, indicating some overlap and a lack of perfectly clean separation for that group.

#This R script will perform a k-Means analysis on the wine dataset from UCI. This dataset
#has 13 chemical measurements on 178 observations of Italian wine.
wine <- read.csv("C:/Users/Priya/Downloads/wine.csv")

#Plot the within-cluster sum of squares to determine the initial value for "k"
wssplot <- function(data, nc=15, seed=1234){
  wss <- (nrow(data)-1)*sum(apply(data, 2, var))
  for (i in 2:nc){
    set.seed(seed)
    wss[i] <- sum(kmeans(data, centers=i)$withinss)
  }
  plot(1:nc, wss, type="b", xlab="Number of Clusters",
       ylab="Within groups sum of squares")
}

#Scale the data
df <- scale(wine[-1])
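#The wssplot function defined above is presumably called on the scaled data to produce the
#elbow plot referred to in the discussion; the call would look like this (plot not reproduced here):
wssplot(df)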

#k-means analysis: use NbClust to suggest the number of clusters
library(NbClust)
## Warning: package 'NbClust' was built under R version 3.5.2
set.seed(1234)
nc <- NbClust(df, min.nc=2, max.nc = 15, method = "kmeans")

## *** : The Hubert index is a graphical method of determining the number of clusters.
##                 In the plot of Hubert index, we seek a significant knee that corresponds to a 
##                 significant increase of the value of the measure i.e the significant peak in Hubert
##                 index second differences plot. 
## 

## *** : The D index is a graphical method of determining the number of clusters. 
##                 In the plot of D index, we seek a significant knee (the significant peak in Dindex
##                 second differences plot) that corresponds to a significant increase of the value of
##                 the measure. 
##  
## ******************************************************************* 
## * Among all indices:                                                
## * 4 proposed 2 as the best number of clusters 
## * 15 proposed 3 as the best number of clusters 
## * 1 proposed 10 as the best number of clusters 
## * 1 proposed 12 as the best number of clusters 
## * 1 proposed 14 as the best number of clusters 
## * 1 proposed 15 as the best number of clusters 
## 
##                    ***** Conclusion *****                            
##  
## * According to the majority rule, the best number of clusters is  3 
##  
##  
## *******************************************************************
barplot(table(nc$Best.n[1,]), xlab = "Number of Clusters", ylab = "Number of Criteria", main = "Number of Clusters Chosen by 26 Criteria")

#Train Kmeans and evaluate
set.seed(1234)
fit.km<-kmeans(df,3,nstart=25) #n=3
(df_km<-table(wine$Wine,fit.km$cluster))
##    
##      1  2  3
##   1 59  0  0
##   2  3 65  3
##   3  0  0 48
(Accuracy <- (sum(diag(df_km))/sum(df_km)*100))
## [1] 96.62921
#Conduct the k-Means analysis using the best number of clusters
set.seed(1234)
fit.km <- kmeans(df, 3, nstart=25) #n=3
print(fit.km$size)
## [1] 62 65 51
print(fit.km$centers)
##      Alcohol Malic.acid        Ash        Acl          Mg     Phenols
## 1  0.8328826 -0.3029551  0.3636801 -0.6084749  0.57596208  0.88274724
## 2 -0.9234669 -0.3929331 -0.4931257  0.1701220 -0.49032869 -0.07576891
## 3  0.1644436  0.8690954  0.1863726  0.5228924 -0.07526047 -0.97657548
##    Flavanoids Nonflavanoid.phenols     Proanth  Color.int        Hue
## 1  0.97506900          -0.56050853  0.57865427  0.1705823  0.4726504
## 2  0.02075402          -0.03343924  0.05810161 -0.8993770  0.4605046
## 3 -1.21182921           0.72402116 -0.77751312  0.9388902 -1.1615122
##           OD    Proline
## 1  0.7770551  1.1220202
## 2  0.2700025 -0.7517257
## 3 -1.2887761 -0.4059428
print(aggregate(wine[-1], by=list(cluster=fit.km$cluster), mean))
##   cluster  Alcohol Malic.acid      Ash      Acl        Mg  Phenols
## 1       1 13.67677   1.997903 2.466290 17.46290 107.96774 2.847581
## 2       2 12.25092   1.897385 2.231231 20.06308  92.73846 2.247692
## 3       3 13.13412   3.307255 2.417647 21.24118  98.66667 1.683922
##   Flavanoids Nonflavanoid.phenols  Proanth Color.int       Hue       OD
## 1  3.0032258            0.2920968 1.922097  5.453548 1.0654839 3.163387
## 2  2.0500000            0.3576923 1.624154  2.973077 1.0627077 2.803385
## 3  0.8188235            0.4519608 1.145882  7.234706 0.6919608 1.696667
##     Proline
## 1 1100.2258
## 2  510.1692
## 3  619.0588
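#As a quick aid for the interpretation of which variables each cluster is "heavy" on,
#the scaled centers can be sorted per cluster (a small sketch using the fit from above):
apply(fit.km$centers, 1, function(ctr) names(sort(ctr, decreasing=TRUE))[1:3]) #top 3 variables per cluster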
#Use a confusion matrix to evaluate the performance of the k-means analysis
ct.km <- table(wine$Wine, fit.km$cluster) 
print(ct.km)
##    
##      1  2  3
##   1 59  0  0
##   2  3 65  3
##   3  0  0 48
#Generating the plot of clusters
library(cluster)
clusplot(df, fit.km$cluster, main='2D representation of the Cluster solution',
color=TRUE, shade=TRUE, labels=2, lines=0)

###PART 3

#Now, we train a classifier to predict the 3 cluster labels identified earlier. The first step is to attach the cluster labels to the data frame. I decided to use a decision tree for the classification. The data needs to be randomized first, and I selected an 80% train / 20% test split. Running the model on the test set gives an accuracy of about 92% (see the confusion matrix and accuracy below); the few misclassifications involve clusters 2 and 3, which is consistent with the overlap around wine type 2 already seen in the k-means confusion matrix.

###Training a model for classifying wines from the wine dataset

library(rpart)
df <- data.frame(k=fit.km$cluster, df) 
print(str(df))
## 'data.frame':    178 obs. of  14 variables:
##  $ k                   : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ Alcohol             : num  1.514 0.246 0.196 1.687 0.295 ...
##  $ Malic.acid          : num  -0.5607 -0.498 0.0212 -0.3458 0.2271 ...
##  $ Ash                 : num  0.231 -0.826 1.106 0.487 1.835 ...
##  $ Acl                 : num  -1.166 -2.484 -0.268 -0.807 0.451 ...
##  $ Mg                  : num  1.9085 0.0181 0.0881 0.9283 1.2784 ...
##  $ Phenols             : num  0.807 0.567 0.807 2.484 0.807 ...
##  $ Flavanoids          : num  1.032 0.732 1.212 1.462 0.661 ...
##  $ Nonflavanoid.phenols: num  -0.658 -0.818 -0.497 -0.979 0.226 ...
##  $ Proanth             : num  1.221 -0.543 2.13 1.029 0.4 ...
##  $ Color.int           : num  0.251 -0.292 0.268 1.183 -0.318 ...
##  $ Hue                 : num  0.361 0.405 0.317 -0.426 0.361 ...
##  $ OD                  : num  1.843 1.11 0.786 1.181 0.448 ...
##  $ Proline             : num  1.0102 0.9625 1.3912 2.328 -0.0378 ...
## NULL
rdf <- df[sample(1:nrow(df)), ] 
print(head(rdf))
##     k    Alcohol Malic.acid         Ash        Acl         Mg      Phenols
## 93  2 -0.3826162 -0.7217931 -0.38826018  0.3608424 -1.3822227 -1.462188745
## 69  2  0.4180475 -1.2499245 -0.02375431 -0.7470867  0.7182523  0.375309174
## 13  1  0.9230815 -0.5427655  0.15849862 -1.0465271 -0.7520802  0.487156874
## 57  1  1.5020229 -0.5696196 -0.24245783 -0.9566950  1.2783790  1.445851440
## 117 2 -1.4542737 -0.7755014 -1.37242601  0.3907864 -0.9621277 -0.503494178
## 161 3 -0.7891070  1.3370245  0.04914686  0.4506745 -0.8220960  0.007809591
##     Flavanoids Nonflavanoid.phenols     Proanth  Color.int         Hue
## 93  -0.5699201            1.7528342  0.05084419 -0.8661960  0.01115870
## 69  -0.7301029            1.5117800 -2.04574255 -0.8144336  0.27365854
## 13   0.7315653           -0.5773564  0.38280376  0.2337547  0.84240820
## 57   0.9718395           -0.8184106  0.76717799  0.5702101 -0.07634125
## 117 -0.4297602           -0.4970050 -0.10639981 -1.3406845 -0.03259127
## 161 -1.1105371            1.1100230 -0.96250606  1.1180287 -1.73884025
##             OD      Proline
## 93  -0.7770322 -0.799896093
## 69  -0.9601332  0.009865569
## 13   0.4060824  1.819921051
## 57   0.9835550  0.708483475
## 117  1.0117244 -0.799896093
## 161 -1.4530976 -0.720507695
train <- rdf[1:(as.integer(.8*nrow(rdf))-1), ]
test <- rdf[(as.integer(.8*nrow(rdf))):nrow(rdf), ]
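#Note: the shuffle above was not seeded, so this exact split (and the accuracy reported below)
#may not reproduce exactly on a re-run. A seeded version of the same split would look like this (sketch only):
#   set.seed(12345)
#   rdf <- df[sample(1:nrow(df)), ]
#   train <- rdf[1:floor(0.8*nrow(rdf)), ]
#   test  <- rdf[(floor(0.8*nrow(rdf))+1):nrow(rdf), ]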
#Train the classifier and plot the results
fit <- rpart(k ~ ., data=train, method="class")

library(rpart.plot) 
## Warning: package 'rpart.plot' was built under R version 3.5.3
rpart.plot(fit)

#Now let's assess the model performance
pred<-predict(fit, test, type="class")
print(table(pred, test$k))
##     
## pred  1  2  3
##    1 11  1  0
##    2  0 12  1
##    3  0  1 11
p<-table(pred, test$k)
(Accuracy <- (sum(diag(p))/sum(p)*100))
## [1] 91.89189

###PART 4

###Q3- Load the breast cancer dataset. Do the preliminary analysis and implement a KNN (K-nearest neighbors) model for this dataset, and don’t forget that whenever it is required you should use set.seed(12345). For designing the model, use the following command: knn(train = , test = , cl = , k = custom value). Use 80% of the data for training and report the accuracy.

###ANSWER-Q3: Let’s apply KNN (K-nearest neighbors) to this dataset. After loading the data as shown below and doing some preprocessing, set.seed(12345) is used for reproducibility and the data is split into roughly 80% training and 20% test sets.

wbdata <- read.csv("C:/Users/Priya/Downloads/wisc_bc_data.csv")

library(class)

#Remove the first attribute (the patient ID)
wbdata <- wbdata[-1]

#Preprocess the data: examine the class distribution
table(wbdata$diagnosis)
## 
##   B   M 
## 357 212
wbdata$diagnosis <- factor(wbdata$diagnosis, levels = c("B", "M"), labels = c("Benign", "Malignant"))
round(prop.table(table(wbdata$diagnosis)) * 100, digits = 1)
## 
##    Benign Malignant 
##      62.7      37.3
normalize <- function(x) {
    return ((x - min(x)) / (max(x) - min(x)))
}
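#The normalize function rescales a numeric vector to the [0, 1] range; a toy illustration
#(hypothetical values, not from the dataset):
normalize(c(1, 2, 3, 4, 5)) #returns 0.00 0.25 0.50 0.75 1.00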
wbdata_n <- as.data.frame(lapply(wbdata[2:31], normalize))
summary(wbdata_n)
##   radius_mean      texture_mean    perimeter_mean     area_mean     
##  Min.   :0.0000   Min.   :0.0000   Min.   :0.0000   Min.   :0.0000  
##  1st Qu.:0.2233   1st Qu.:0.2185   1st Qu.:0.2168   1st Qu.:0.1174  
##  Median :0.3024   Median :0.3088   Median :0.2933   Median :0.1729  
##  Mean   :0.3382   Mean   :0.3240   Mean   :0.3329   Mean   :0.2169  
##  3rd Qu.:0.4164   3rd Qu.:0.4089   3rd Qu.:0.4168   3rd Qu.:0.2711  
##  Max.   :1.0000   Max.   :1.0000   Max.   :1.0000   Max.   :1.0000  
##  smoothness_mean  compactness_mean concavity_mean     points_mean    
##  Min.   :0.0000   Min.   :0.0000   Min.   :0.00000   Min.   :0.0000  
##  1st Qu.:0.3046   1st Qu.:0.1397   1st Qu.:0.06926   1st Qu.:0.1009  
##  Median :0.3904   Median :0.2247   Median :0.14419   Median :0.1665  
##  Mean   :0.3948   Mean   :0.2606   Mean   :0.20806   Mean   :0.2431  
##  3rd Qu.:0.4755   3rd Qu.:0.3405   3rd Qu.:0.30623   3rd Qu.:0.3678  
##  Max.   :1.0000   Max.   :1.0000   Max.   :1.00000   Max.   :1.0000  
##  symmetry_mean    dimension_mean     radius_se         texture_se    
##  Min.   :0.0000   Min.   :0.0000   Min.   :0.00000   Min.   :0.0000  
##  1st Qu.:0.2823   1st Qu.:0.1630   1st Qu.:0.04378   1st Qu.:0.1047  
##  Median :0.3697   Median :0.2439   Median :0.07702   Median :0.1653  
##  Mean   :0.3796   Mean   :0.2704   Mean   :0.10635   Mean   :0.1893  
##  3rd Qu.:0.4530   3rd Qu.:0.3404   3rd Qu.:0.13304   3rd Qu.:0.2462  
##  Max.   :1.0000   Max.   :1.0000   Max.   :1.00000   Max.   :1.0000  
##   perimeter_se        area_se        smoothness_se    compactness_se   
##  Min.   :0.00000   Min.   :0.00000   Min.   :0.0000   Min.   :0.00000  
##  1st Qu.:0.04000   1st Qu.:0.02064   1st Qu.:0.1175   1st Qu.:0.08132  
##  Median :0.07209   Median :0.03311   Median :0.1586   Median :0.13667  
##  Mean   :0.09938   Mean   :0.06264   Mean   :0.1811   Mean   :0.17444  
##  3rd Qu.:0.12251   3rd Qu.:0.07170   3rd Qu.:0.2187   3rd Qu.:0.22680  
##  Max.   :1.00000   Max.   :1.00000   Max.   :1.0000   Max.   :1.00000  
##   concavity_se       points_se       symmetry_se      dimension_se    
##  Min.   :0.00000   Min.   :0.0000   Min.   :0.0000   Min.   :0.00000  
##  1st Qu.:0.03811   1st Qu.:0.1447   1st Qu.:0.1024   1st Qu.:0.04675  
##  Median :0.06538   Median :0.2070   Median :0.1526   Median :0.07919  
##  Mean   :0.08054   Mean   :0.2235   Mean   :0.1781   Mean   :0.10019  
##  3rd Qu.:0.10619   3rd Qu.:0.2787   3rd Qu.:0.2195   3rd Qu.:0.12656  
##  Max.   :1.00000   Max.   :1.0000   Max.   :1.0000   Max.   :1.00000  
##   radius_worst    texture_worst    perimeter_worst    area_worst     
##  Min.   :0.0000   Min.   :0.0000   Min.   :0.0000   Min.   :0.00000  
##  1st Qu.:0.1807   1st Qu.:0.2415   1st Qu.:0.1678   1st Qu.:0.08113  
##  Median :0.2504   Median :0.3569   Median :0.2353   Median :0.12321  
##  Mean   :0.2967   Mean   :0.3640   Mean   :0.2831   Mean   :0.17091  
##  3rd Qu.:0.3863   3rd Qu.:0.4717   3rd Qu.:0.3735   3rd Qu.:0.22090  
##  Max.   :1.0000   Max.   :1.0000   Max.   :1.0000   Max.   :1.00000  
##  smoothness_worst compactness_worst concavity_worst    points_worst   
##  Min.   :0.0000   Min.   :0.0000    Min.   :0.00000   Min.   :0.0000  
##  1st Qu.:0.3000   1st Qu.:0.1163    1st Qu.:0.09145   1st Qu.:0.2231  
##  Median :0.3971   Median :0.1791    Median :0.18107   Median :0.3434  
##  Mean   :0.4041   Mean   :0.2202    Mean   :0.21740   Mean   :0.3938  
##  3rd Qu.:0.4942   3rd Qu.:0.3025    3rd Qu.:0.30583   3rd Qu.:0.5546  
##  Max.   :1.0000   Max.   :1.0000    Max.   :1.00000   Max.   :1.0000  
##  symmetry_worst   dimension_worst 
##  Min.   :0.0000   Min.   :0.0000  
##  1st Qu.:0.1851   1st Qu.:0.1077  
##  Median :0.2478   Median :0.1640  
##  Mean   :0.2633   Mean   :0.1896  
##  3rd Qu.:0.3182   3rd Qu.:0.2429  
##  Max.   :1.0000   Max.   :1.0000
#Split into training and test sets (roughly 80:20) and set the seed to 12345
set.seed(12345)
wbdata_train <- wbdata_n[1:469, ]
wbdata_test <- wbdata_n[470:569, ]
wbdata_train_labels <- wbdata[1:469, 1]
wbdata_test_labels <- wbdata[470:569, 1]

#Train the model
wbdata_test_pred <- knn(train = wbdata_train, test = wbdata_test, cl=wbdata_train_labels, k = 21) #k = 21 is roughly the square root of the number of training records
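#A quick check of that rule of thumb (square root of the number of training rows, rounded down):
floor(sqrt(nrow(wbdata_train))) #gives 21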

#Evaluation

(wbdata_tbl <- table(wbdata_test_pred, wbdata_test_labels))
##                 wbdata_test_labels
## wbdata_test_pred Benign Malignant
##        Benign        61         2
##        Malignant      0        37
(Accuracy <- (wbdata_tbl[1]+wbdata_tbl[4])/sum(wbdata_tbl)*100)
## [1] 98

###Here we get an accuracy of 98% with the KNN model on the breast cancer dataset. That is a strong result and suggests the model is performing close to its optimum (a quick comparison across nearby values of k follows below).
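#As a sanity check on the choice of k, nearby values could be compared on the same split
#(a sketch only; the resulting numbers depend on the data and are not reported here):
for (kk in c(11, 15, 21, 25)) {
  pred_k <- knn(train = wbdata_train, test = wbdata_test, cl = wbdata_train_labels, k = kk)
  cat("k =", kk, "accuracy =", mean(pred_k == wbdata_test_labels) * 100, "\n")
}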

###Q4 - Now let’s get back to our problem of news popularity and see if we can apply KNN (K-nearest neighbors) to improve the accuracy of the model. Use the same strategy of training and testing that we used in the first 2 labs, and don’t forget that whenever it is required you should use set.seed(12345).

#NEWS Popularity Dataset

#Load the dataset
news <- read.csv("C:/Users/Priya/Downloads/OnlineNewsPopularity_for_R.csv")

newsShort <- data.frame(news$n_tokens_title, news$n_tokens_content, news$n_unique_tokens, news$n_non_stop_words, news$num_hrefs, news$num_imgs, news$num_videos, news$average_token_length, news$num_keywords, news$kw_max_max, news$global_sentiment_polarity, news$avg_positive_polarity, news$title_subjectivity, news$title_sentiment_polarity, news$abs_title_subjectivity, news$abs_title_sentiment_polarity, news$shares)

#Some preprocessing
colnames(newsShort) <- c("n_tokens_title", "n_tokens_content", "n_unique_tokens", "n_non_stop_words", "num_hrefs", "num_imgs", "num_videos", "average_token_length", "num_keywords", "kw_max_max", "global_sentiment_polarity", "avg_positive_polarity", "title_subjectivity", "title_sentiment_polarity", "abs_title_subjectivity", "abs_title_sentiment_polarity", "shares")
newsShort$popular = rep('na', nrow(newsShort))
for (i in 1:nrow(newsShort)) {
  if (newsShort$shares[i] >= 1400) {
    newsShort$popular[i] = "yes"
  } else {
    newsShort$popular[i] = "no"
  }
}
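#The same labeling can be done in one vectorized line (equivalent to the loop above, shown as an alternative):
newsShort$popular <- ifelse(newsShort$shares >= 1400, "yes", "no")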

newsShort$shares = newsShort$popular

#Normalize the data (reusing the normalize function defined earlier)
newsShort$shares <- as.factor(newsShort$shares)
newsShort <- newsShort[-18]
news_n <- as.data.frame(lapply(newsShort[1:16], normalize))

news_rand <- news_n[order(runif(10000)), ] #note: this shuffled copy is not actually used below; the split is taken from news_n in its original row order
set.seed(12345)
#Split the data into training and test datasets
news_train <- news_n[1:9000, ]
news_test <- news_n[9001:10000, ]
news_train_labels <- newsShort[1:9000, 17]
news_test_labels <- newsShort[9001:10000, 17]

news_test_pred <- knn(train = news_train, test = news_test, cl = news_train_labels, k= 5) #choosing a nominal k value of 5.

(news_tbl <- table(news_test_pred, news_test_labels))
##               news_test_labels
## news_test_pred  no yes
##            no  225 174
##            yes 322 279
#Let's check the performance
(Accuracy <- (news_tbl[1] + news_tbl[4])/sum(news_tbl)*100)
## [1] 50.4
#Next iteration with k = 95, roughly the square root of the number of training records (9000)

news_test_pred1  <- knn(train = news_train, test = news_test, cl = news_train_labels, k= 95)
(news_tbl_1 <- table(news_test_pred1, news_test_labels))
##                news_test_labels
## news_test_pred1  no yes
##             no  108  62
##             yes 439 391
#Let's check the performance again
(Accuracy <- (news_tbl_1[1] + news_tbl_1[4])/sum(news_tbl_1)*100)
## [1] 49.9

###ANSWER-Q4: Using k = 5 and k = 95 produces roughly the same accuracy, around 50% (50.4% and 49.9% respectively). The accuracy drops to about half of what the previous (breast cancer) model achieved. This classifier performed poorly: at k = 95 it made 501 errors on the 1000 test examples. That is a high error rate, so the model performs poorly on the news popularity data.
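#The error count quoted above can be read off the confusion matrix (off-diagonal cells are errors):
sum(news_tbl_1) - sum(diag(news_tbl_1)) #number of misclassified test examples for the k = 95 run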