#OBJECTIVE
#This lab is intended to help you understand how to integrate unsupervised and supervised learning techniques. You’ll start by considering unsupervised learning using k-Means clustering. Then, you’ll use the results of that analysis as input for a supervised learning analysis, specifically a classification analysis. This is the basis for a mixed methods or semi-supervised learning analysis.
###PART 1
##PRE-PROCESSING the data for wholesale customers
#Load the wholesale customers dataset
data <- read.csv("C:/Users/Priya/Downloads/Wholesale customers data.csv")
summary(data)
## Channel Region Fresh Milk
## Min. :1.000 Min. :1.000 Min. : 3 Min. : 55
## 1st Qu.:1.000 1st Qu.:2.000 1st Qu.: 3128 1st Qu.: 1533
## Median :1.000 Median :3.000 Median : 8504 Median : 3627
## Mean :1.323 Mean :2.543 Mean : 12000 Mean : 5796
## 3rd Qu.:2.000 3rd Qu.:3.000 3rd Qu.: 16934 3rd Qu.: 7190
## Max. :2.000 Max. :3.000 Max. :112151 Max. :73498
## Grocery Frozen Detergents_Paper Delicassen
## Min. : 3 Min. : 25.0 Min. : 3.0 Min. : 3.0
## 1st Qu.: 2153 1st Qu.: 742.2 1st Qu.: 256.8 1st Qu.: 408.2
## Median : 4756 Median : 1526.0 Median : 816.5 Median : 965.5
## Mean : 7951 Mean : 3071.9 Mean : 2881.5 Mean : 1524.9
## 3rd Qu.:10656 3rd Qu.: 3554.2 3rd Qu.: 3922.0 3rd Qu.: 1820.2
## Max. :92780 Max. :60869.0 Max. :40827.0 Max. :47943.0
# This function returns the indexes of the n highest instances (outliers) in each specified column so they can be removed from the dataset
top.n.custs <- function (data,cols,n=5) {
#Requires some data frame and the top N to remove
idx.to.remove <-integer(0)
#Initialize a vector to hold customers being removed
for (c in cols){
# For every column in the data we passed to this function
col.order <-order(data[,c],decreasing=T)
#Sort column "c" in descending order (bigger on top)
#Order returns the sorted index (e.g. row 15, 3, 7, 1, ...) rather than the actual values sorted.
idx <-head(col.order, n)
#Take the indexes of the n largest values in column c
idx.to.remove <-union(idx.to.remove,idx)
#Combine and de-duplicate the row ids that need to be removed
}
return(idx.to.remove)
#Return the indexes of customers to be removed
}
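#To illustrate the helper, a minimal made-up example (hypothetical toy data frame, not part of the lab data): remove the single largest row in each of two columns
toy <- data.frame(a = c(10, 50, 30, 20), b = c(5, 1, 9, 2))
top.n.custs(toy, cols = 1:2, n = 1)
#Row 2 holds the largest "a" and row 3 the largest "b", so this returns 2 and 3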
#Call the function over the six spending columns (columns 3 through 8)
top.custs <-top.n.custs(data,cols=3:8,n=5)
length(top.custs) #How Many Customers to be Removed?
## [1] 19
#Call the function again, this time over columns 1 through 5 (Channel through Grocery)
top.custs <-top.n.custs(data, cols = 1:5,n=5)
length(top.custs)
## [1] 18
#Examine the customers
data[top.custs,]
## Channel Region Fresh Milk Grocery Frozen Detergents_Paper Delicassen
## 1 2 3 12669 9656 7561 214 2674 1338
## 2 2 3 7057 9810 9568 1762 3293 1776
## 3 2 3 6353 8808 7684 2405 3516 7844
## 5 2 3 22615 5410 7198 3915 1777 5185
## 6 2 3 9413 8259 5126 666 1795 1451
## 4 1 3 13265 1196 4221 6404 507 1788
## 182 1 3 112151 29627 18148 16745 4948 8550
## 126 1 3 76237 3473 7102 16538 778 918
## 285 1 3 68951 4411 12609 8692 751 2406
## 40 1 3 56159 555 902 10002 212 2916
## 259 1 1 56083 4563 2124 6422 730 3321
## 87 2 3 22925 73498 32114 987 20070 903
## 48 2 3 44466 54259 55571 7782 24171 6465
## 86 2 3 16117 46197 92780 1026 40827 2944
## 184 1 3 36847 43950 20170 36534 239 47943
## 62 2 3 35942 38369 59598 3254 26701 2017
## 334 2 2 8565 4980 67298 131 38102 1215
## 66 2 3 85 20959 45828 36 24231 1423
#Remove the outlier customers and re-examine the distribution
data.rm.top<-data[-c(top.custs),] #Remove the Customers
summary(data.rm.top)
## Channel Region Fresh Milk
## Min. :1.00 Min. :1.000 Min. : 3 Min. : 55
## 1st Qu.:1.00 1st Qu.:2.000 1st Qu.: 3072 1st Qu.: 1497
## Median :1.00 Median :3.000 Median : 8130 Median : 3582
## Mean :1.31 Mean :2.531 Mean :11076 Mean : 5172
## 3rd Qu.:2.00 3rd Qu.:3.000 3rd Qu.:16251 3rd Qu.: 6962
## Max. :2.00 Max. :3.000 Max. :56082 Max. :36423
## Grocery Frozen Detergents_Paper Delicassen
## Min. : 3 Min. : 25.0 Min. : 3.0 Min. : 3.0
## 1st Qu.: 2132 1st Qu.: 738.8 1st Qu.: 255.2 1st Qu.: 398.0
## Median : 4603 Median : 1487.5 Median : 799.5 Median : 904.5
## Mean : 7211 Mean : 2910.3 Mean : 2541.6 Mean : 1352.0
## 3rd Qu.:10391 3rd Qu.: 3428.0 3rd Qu.: 3879.2 3rd Qu.: 1752.2
## Max. :39694 Max. :60869.0 Max. :19410.0 Max. :16523.0
set.seed(76964057) #Set the seed for reproducibility
rng<-2:20
tries<-100
#Set up empty vectors to hold the averaged results for each value of k
avg.totw.ss <-integer(length(rng))
avg.totb.ss <- integer(length(rng))
avg.tot.ss <- integer(length(rng))
# For each value of the range variable
for(v in rng){
#Set up empty vectors to hold the results of each try
v.totw.ss <-integer(tries)
b.totb.ss <- integer(tries)
tot.ss <- integer(tries)
#Run kmeans
for(i in 1:tries){
k.temp <-kmeans(data.rm.top,centers=v)
#Store the total withinss
v.totw.ss[i] <-k.temp$tot.withinss
#Store the betweenss
b.totb.ss[i] <- k.temp$betweenss
#Store the total sum of squares
tot.ss[i] <- k.temp$totss
}
#Average the withinss and betweenss
avg.totw.ss[v-1] <-mean(v.totw.ss)
avg.totb.ss[v-1] <-mean(b.totb.ss)
avg.tot.ss[v-1] <- mean(tot.ss)
}
plot(rng,avg.totw.ss,type="b", main="Total Within SS by Various K",
ylab="Average Total Within Sum of Squares",
xlab="Value of K")
plot(rng,avg.totb.ss,type="b", main="Total between SS by Various K",
ylab="Average Total Between Sum of Squares", xlab="Value of K")
#Plot the ratio of betweenss/total ss and withinss / total ss for evaluation
plot(rng,avg.totb.ss/avg.tot.ss,type="b", main="Ratio of between ss / the total ss by Various K",
ylab="Ratio Between SS / Total SS",
xlab="Value of K")
abline(h=0.85, col="red")
plot(rng,avg.totw.ss/avg.tot.ss,type="b", main="Ratio of within ss / the total ss by Various K", ylab="Ratio Within SS / Total SS",
xlab="Value of K")
abline(h=0.15, col="red")
k <-kmeans(data.rm.top[,-c(1,2)], centers=2)
k$withinss
## [1] 16873576067 50486095378
#Display cluster centers
print(k$centers)
## Fresh Milk Grocery Frozen Detergents_Paper Delicassen
## 1 4589.846 10573.068 15829.726 1552.718 6876.3504 1730.547
## 2 13564.652 3099.436 3904.531 3431.039 878.6951 1206.774
library(cluster)
clusplot(data.rm.top, k$cluster, main='2D representation of the Cluster solution',
color=TRUE, shade=TRUE, labels=2, lines=0)
###Q1- Given that this is imperfect real-world data, you need to determine what you believe is the best value for “k” and write up this portion of your lab report. You should include a brief discussion of your k-Means analysis as well as the best value of “k” that you determine. You should include what mixture of variables within the clusters this value of “k” results in. That is, you need to interpret your k-Means analysis and discuss what it means.
###ANSWER-Q1: The objective is to perform a k-means analysis and find the best (optimum) value of k. The wholesale customers dataset records annual spending on product categories such as Fresh, Milk, and Grocery, along with Channel and Region. First we preprocess the data by reading it into a data frame and removing the 5 highest-spending customers in each of the selected columns. The Channel and Region variables are then dropped before clustering. After preprocessing, we run the k-means algorithm for k from 2 to 20 to find the optimum value of k. Looking for the elbow in the plots, the candidates are k = 2, 3, or 5; k = 2 appears to be the better choice, since at k = 5 there is no distinct clustering and the within-cluster sum of squares remains very large. The cluster centers show that cluster 1 is heavy in Grocery and Milk, while cluster 2 is heavy in Fresh. A quick check of the between/total SS ratio at the candidate values is sketched below.
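#As a rough check on the elbow choice, tabulate the between-SS / total-SS ratio at the candidate values of k from the vectors computed above (a sketch; exact values depend on the random starts)
ratios <- data.frame(k = rng, ratio = round(avg.totb.ss / avg.tot.ss, 3))
ratios[ratios$k %in% c(2, 3, 5), ]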
###Q2- How many points do you see in each cluster?
###ANSWER-Q2: The first cluster contains roughly 117 points and the second roughly 304 (this can be confirmed from the fitted object, as sketched below).
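#The cluster sizes can be read directly from the fitted k-means object (a quick check using the object k fitted above); table() over the assignments gives the same counts
k$size
table(k$cluster)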
###PART 2
#Now we switch to the wine dataset, which contains the results of a chemical analysis that determined the quantities of 13 constituents in wines grown in the same region of Italy but derived from three different cultivars. The objective is to apply the k-means algorithm to these 13 measurements and find the best value of k. First, the wssplot function and NbClust are used to choose k; the output and the criteria bar plot point to k = 3. The cluster centers show that cluster 1 is heavy in Phenols, Flavanoids, and Proline, cluster 2 is relatively high in Hue and appears to absorb the group's more atypical wines, and cluster 3 is heavy in Color.int and Malic.acid. From the confusion matrix against the true wine types, clusters 1 and 3 are clean (those wine types fall entirely in a single cluster), whereas type 2 wines are shared across clusters 1, 2, and 3, indicating overlap and a lack of perfectly clear separation.
#This R script will perform a k-Means analysis on the wine dataset from UCI. This dataset
#has 13 chemical measurements on 178 observations of Italian wine.
wine <- read.csv("C:/Users/Priya/Downloads/wine.csv")
#Plot the within (cluster) sum of squares to determine the initial value for "k"
wssplot <- function(data, nc=15, seed=1234){
wss <- (nrow(data)-1)*sum(apply(data,2,var))
for (i in 2:nc){
set.seed(seed)
wss[i] <- sum(kmeans(data, centers=i)$withinss)}
plot(1:nc, wss, type="b", xlab="Number of Clusters",
ylab="Within groups sum of squares")}
#Scale the data
df <- scale(wine[-1])
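#Apply the wssplot helper defined above to the scaled data to look for an elbow (its plot is not reproduced here)
wssplot(df)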
#k-means analysis with NbClust
library(NbClust)
## Warning: package 'NbClust' was built under R version 3.5.2
set.seed(1234)
nc <- NbClust(df, min.nc=2, max.nc = 15, method = "kmeans")
## *** : The Hubert index is a graphical method of determining the number of clusters.
## In the plot of Hubert index, we seek a significant knee that corresponds to a
## significant increase of the value of the measure i.e the significant peak in Hubert
## index second differences plot.
##
## *** : The D index is a graphical method of determining the number of clusters.
## In the plot of D index, we seek a significant knee (the significant peak in Dindex
## second differences plot) that corresponds to a significant increase of the value of
## the measure.
##
## *******************************************************************
## * Among all indices:
## * 4 proposed 2 as the best number of clusters
## * 15 proposed 3 as the best number of clusters
## * 1 proposed 10 as the best number of clusters
## * 1 proposed 12 as the best number of clusters
## * 1 proposed 14 as the best number of clusters
## * 1 proposed 15 as the best number of clusters
##
## ***** Conclusion *****
##
## * According to the majority rule, the best number of clusters is 3
##
##
## *******************************************************************
barplot(table(nc$Best.n[1,]), xlab = "Number of Clusters", ylab = "Number of Criteria", main = "Number of Clusters Chosen by 26 Criteria")
#Train Kmeans and evaluate
set.seed(1234)
fit.km<-kmeans(df,3,nstart=25) #n=3
(df_km<-table(wine$Wine,fit.km$cluster))
##
## 1 2 3
## 1 59 0 0
## 2 3 65 3
## 3 0 0 48
(Accuracy <- (sum(diag(df_km))/sum(df_km)*100))
## [1] 96.62921
#Conduct the k-Means analysis using the best number of clusters
set.seed(1234)
fit.km <- kmeans(df, 3, nstart=25) #n=3
print(fit.km$size)
## [1] 62 65 51
print(fit.km$centers)
## Alcohol Malic.acid Ash Acl Mg Phenols
## 1 0.8328826 -0.3029551 0.3636801 -0.6084749 0.57596208 0.88274724
## 2 -0.9234669 -0.3929331 -0.4931257 0.1701220 -0.49032869 -0.07576891
## 3 0.1644436 0.8690954 0.1863726 0.5228924 -0.07526047 -0.97657548
## Flavanoids Nonflavanoid.phenols Proanth Color.int Hue
## 1 0.97506900 -0.56050853 0.57865427 0.1705823 0.4726504
## 2 0.02075402 -0.03343924 0.05810161 -0.8993770 0.4605046
## 3 -1.21182921 0.72402116 -0.77751312 0.9388902 -1.1615122
## OD Proline
## 1 0.7770551 1.1220202
## 2 0.2700025 -0.7517257
## 3 -1.2887761 -0.4059428
print(aggregate(wine[-1], by=list(cluster=fit.km$cluster), mean))
## cluster Alcohol Malic.acid Ash Acl Mg Phenols
## 1 1 13.67677 1.997903 2.466290 17.46290 107.96774 2.847581
## 2 2 12.25092 1.897385 2.231231 20.06308 92.73846 2.247692
## 3 3 13.13412 3.307255 2.417647 21.24118 98.66667 1.683922
## Flavanoids Nonflavanoid.phenols Proanth Color.int Hue OD
## 1 3.0032258 0.2920968 1.922097 5.453548 1.0654839 3.163387
## 2 2.0500000 0.3576923 1.624154 2.973077 1.0627077 2.803385
## 3 0.8188235 0.4519608 1.145882 7.234706 0.6919608 1.696667
## Proline
## 1 1100.2258
## 2 510.1692
## 3 619.0588
#Use a confusion matrix to evaluate the performance of the k-means analysis
ct.km <- table(wine$Wine, fit.km$cluster)
print(ct.km)
##
## 1 2 3
## 1 59 0 0
## 2 3 65 3
## 3 0 0 48
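#Optionally, agreement between the clusters and the true wine types can be quantified with an adjusted Rand index, e.g. via the flexclust package (a sketch, assuming flexclust is installed)
library(flexclust)
randIndex(ct.km)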
#Generating the plot of clusters
library(cluster)
clusplot(df, fit.km$cluster, main='2D representation of the Cluster solution',
color=TRUE, shade=TRUE, labels=2, lines=0)
###PART 3
#Now we train a classifier to assign wines to the 3 clusters identified earlier. The first step is to label the data frame with the cluster assignments. I decided to use a decision tree (rpart) for this task. The rows are randomized first and then split into roughly 80% training and 20% test data. Evaluating the tree on the held-out test set gives an accuracy of about 92% (see the confusion matrix and accuracy below): cluster 1 is predicted perfectly, while a few points from clusters 2 and 3 are misclassified. This residual confusion mirrors the earlier k-means result, where some wines (mostly of type 2) were not cleanly separated because of their atypical constituent profiles.
###Training a model to classify wines from the wine dataset
library(rpart)
df <- data.frame(k=fit.km$cluster, df)
print(str(df))
## 'data.frame': 178 obs. of 14 variables:
## $ k : int 1 1 1 1 1 1 1 1 1 1 ...
## $ Alcohol : num 1.514 0.246 0.196 1.687 0.295 ...
## $ Malic.acid : num -0.5607 -0.498 0.0212 -0.3458 0.2271 ...
## $ Ash : num 0.231 -0.826 1.106 0.487 1.835 ...
## $ Acl : num -1.166 -2.484 -0.268 -0.807 0.451 ...
## $ Mg : num 1.9085 0.0181 0.0881 0.9283 1.2784 ...
## $ Phenols : num 0.807 0.567 0.807 2.484 0.807 ...
## $ Flavanoids : num 1.032 0.732 1.212 1.462 0.661 ...
## $ Nonflavanoid.phenols: num -0.658 -0.818 -0.497 -0.979 0.226 ...
## $ Proanth : num 1.221 -0.543 2.13 1.029 0.4 ...
## $ Color.int : num 0.251 -0.292 0.268 1.183 -0.318 ...
## $ Hue : num 0.361 0.405 0.317 -0.426 0.361 ...
## $ OD : num 1.843 1.11 0.786 1.181 0.448 ...
## $ Proline : num 1.0102 0.9625 1.3912 2.328 -0.0378 ...
## NULL
rdf <- df[sample(1:nrow(df)), ]
print(head(rdf))
## k Alcohol Malic.acid Ash Acl Mg Phenols
## 93 2 -0.3826162 -0.7217931 -0.38826018 0.3608424 -1.3822227 -1.462188745
## 69 2 0.4180475 -1.2499245 -0.02375431 -0.7470867 0.7182523 0.375309174
## 13 1 0.9230815 -0.5427655 0.15849862 -1.0465271 -0.7520802 0.487156874
## 57 1 1.5020229 -0.5696196 -0.24245783 -0.9566950 1.2783790 1.445851440
## 117 2 -1.4542737 -0.7755014 -1.37242601 0.3907864 -0.9621277 -0.503494178
## 161 3 -0.7891070 1.3370245 0.04914686 0.4506745 -0.8220960 0.007809591
## Flavanoids Nonflavanoid.phenols Proanth Color.int Hue
## 93 -0.5699201 1.7528342 0.05084419 -0.8661960 0.01115870
## 69 -0.7301029 1.5117800 -2.04574255 -0.8144336 0.27365854
## 13 0.7315653 -0.5773564 0.38280376 0.2337547 0.84240820
## 57 0.9718395 -0.8184106 0.76717799 0.5702101 -0.07634125
## 117 -0.4297602 -0.4970050 -0.10639981 -1.3406845 -0.03259127
## 161 -1.1105371 1.1100230 -0.96250606 1.1180287 -1.73884025
## OD Proline
## 93 -0.7770322 -0.799896093
## 69 -0.9601332 0.009865569
## 13 0.4060824 1.819921051
## 57 0.9835550 0.708483475
## 117 1.0117244 -0.799896093
## 161 -1.4530976 -0.720507695
train <- rdf[1:(as.integer(.8*nrow(rdf))-1), ]
test <- rdf[(as.integer(.8*nrow(rdf))):nrow(rdf), ]
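#Quick sanity check on the split sizes: with 178 rows this gives 141 training and 37 test observations
nrow(train)
nrow(test)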
#Train the classifier and plot the results
fit <- rpart(k ~ ., data=train, method="class")
library(rpart.plot)
## Warning: package 'rpart.plot' was built under R version 3.5.3
rpart.plot(fit)
#Now let's assess the model performance
pred<-predict(fit, test, type="class")
print(table(pred, test$k))
##
## pred 1 2 3
## 1 11 1 0
## 2 0 12 1
## 3 0 1 11
p<-table(pred, test$k)
(Accuracy <- (sum(diag(p))/sum(p)*100))
## [1] 91.89189
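#Per-cluster accuracy (recall) from the confusion matrix above: the diagonal over the column totals; for the table shown, cluster 1 is recovered perfectly while clusters 2 and 3 each have a point or two misassigned
round(diag(p) / colSums(p), 3)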
###PART 4
###Q3- Load the breast cancer dataset. Do the preliminary analysis and implement a KNN (K-nearest neighbors) model for this dataset, and don’t forget that whenever it is required you should use: set.seed(12345). For designing the model, use the following command: knn(train =
###ANSWER-Q3: We apply KNN (K-nearest neighbors) to the breast cancer dataset. After loading the data as shown below and doing some preprocessing, set.seed(12345) is used for reproducibility and the data is split into roughly 80% training and 20% test sets.
wbdata <- read.csv("C:/Users/Priya/Downloads/wisc_bc_data.csv")
library(class)
#Remove the first attribute (the patient ID)
wbdata <- wbdata[-1]
#Preprocess the data
table(wbdata$diagnosis)
##
## B M
## 357 212
wbdata$diagnosis <- factor(wbdata$diagnosis, levels = c("B", "M"), labels = c("Benign", "Malignant"))
round(prop.table(table(wbdata$diagnosis)) * 100, digits = 1)
##
## Benign Malignant
## 62.7 37.3
normalize <- function(x) {
return ((x - min(x)) / (max(x) - min(x)))
}
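#Quick check with made-up values: normalize rescales any numeric vector onto [0, 1]
normalize(c(1, 2, 3, 4, 5)) #returns 0.00 0.25 0.50 0.75 1.00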
wbdata_n <- as.data.frame(lapply(wbdata[2:31], normalize))
summary(wbdata_n)
## radius_mean texture_mean perimeter_mean area_mean
## Min. :0.0000 Min. :0.0000 Min. :0.0000 Min. :0.0000
## 1st Qu.:0.2233 1st Qu.:0.2185 1st Qu.:0.2168 1st Qu.:0.1174
## Median :0.3024 Median :0.3088 Median :0.2933 Median :0.1729
## Mean :0.3382 Mean :0.3240 Mean :0.3329 Mean :0.2169
## 3rd Qu.:0.4164 3rd Qu.:0.4089 3rd Qu.:0.4168 3rd Qu.:0.2711
## Max. :1.0000 Max. :1.0000 Max. :1.0000 Max. :1.0000
## smoothness_mean compactness_mean concavity_mean points_mean
## Min. :0.0000 Min. :0.0000 Min. :0.00000 Min. :0.0000
## 1st Qu.:0.3046 1st Qu.:0.1397 1st Qu.:0.06926 1st Qu.:0.1009
## Median :0.3904 Median :0.2247 Median :0.14419 Median :0.1665
## Mean :0.3948 Mean :0.2606 Mean :0.20806 Mean :0.2431
## 3rd Qu.:0.4755 3rd Qu.:0.3405 3rd Qu.:0.30623 3rd Qu.:0.3678
## Max. :1.0000 Max. :1.0000 Max. :1.00000 Max. :1.0000
## symmetry_mean dimension_mean radius_se texture_se
## Min. :0.0000 Min. :0.0000 Min. :0.00000 Min. :0.0000
## 1st Qu.:0.2823 1st Qu.:0.1630 1st Qu.:0.04378 1st Qu.:0.1047
## Median :0.3697 Median :0.2439 Median :0.07702 Median :0.1653
## Mean :0.3796 Mean :0.2704 Mean :0.10635 Mean :0.1893
## 3rd Qu.:0.4530 3rd Qu.:0.3404 3rd Qu.:0.13304 3rd Qu.:0.2462
## Max. :1.0000 Max. :1.0000 Max. :1.00000 Max. :1.0000
## perimeter_se area_se smoothness_se compactness_se
## Min. :0.00000 Min. :0.00000 Min. :0.0000 Min. :0.00000
## 1st Qu.:0.04000 1st Qu.:0.02064 1st Qu.:0.1175 1st Qu.:0.08132
## Median :0.07209 Median :0.03311 Median :0.1586 Median :0.13667
## Mean :0.09938 Mean :0.06264 Mean :0.1811 Mean :0.17444
## 3rd Qu.:0.12251 3rd Qu.:0.07170 3rd Qu.:0.2187 3rd Qu.:0.22680
## Max. :1.00000 Max. :1.00000 Max. :1.0000 Max. :1.00000
## concavity_se points_se symmetry_se dimension_se
## Min. :0.00000 Min. :0.0000 Min. :0.0000 Min. :0.00000
## 1st Qu.:0.03811 1st Qu.:0.1447 1st Qu.:0.1024 1st Qu.:0.04675
## Median :0.06538 Median :0.2070 Median :0.1526 Median :0.07919
## Mean :0.08054 Mean :0.2235 Mean :0.1781 Mean :0.10019
## 3rd Qu.:0.10619 3rd Qu.:0.2787 3rd Qu.:0.2195 3rd Qu.:0.12656
## Max. :1.00000 Max. :1.0000 Max. :1.0000 Max. :1.00000
## radius_worst texture_worst perimeter_worst area_worst
## Min. :0.0000 Min. :0.0000 Min. :0.0000 Min. :0.00000
## 1st Qu.:0.1807 1st Qu.:0.2415 1st Qu.:0.1678 1st Qu.:0.08113
## Median :0.2504 Median :0.3569 Median :0.2353 Median :0.12321
## Mean :0.2967 Mean :0.3640 Mean :0.2831 Mean :0.17091
## 3rd Qu.:0.3863 3rd Qu.:0.4717 3rd Qu.:0.3735 3rd Qu.:0.22090
## Max. :1.0000 Max. :1.0000 Max. :1.0000 Max. :1.00000
## smoothness_worst compactness_worst concavity_worst points_worst
## Min. :0.0000 Min. :0.0000 Min. :0.00000 Min. :0.0000
## 1st Qu.:0.3000 1st Qu.:0.1163 1st Qu.:0.09145 1st Qu.:0.2231
## Median :0.3971 Median :0.1791 Median :0.18107 Median :0.3434
## Mean :0.4041 Mean :0.2202 Mean :0.21740 Mean :0.3938
## 3rd Qu.:0.4942 3rd Qu.:0.3025 3rd Qu.:0.30583 3rd Qu.:0.5546
## Max. :1.0000 Max. :1.0000 Max. :1.00000 Max. :1.0000
## symmetry_worst dimension_worst
## Min. :0.0000 Min. :0.0000
## 1st Qu.:0.1851 1st Qu.:0.1077
## Median :0.2478 Median :0.1640
## Mean :0.2633 Mean :0.1896
## 3rd Qu.:0.3182 3rd Qu.:0.2429
## Max. :1.0000 Max. :1.0000
#Split into training and test datasets (about 80:20) and set the seed to 12345
set.seed(12345)
wbdata_train <- wbdata_n[1:469, ]
wbdata_test <- wbdata_n[470:569, ]
wbdata_train_labels <- wbdata[1:469, 1]
wbdata_test_labels <- wbdata[470:569, 1]
#Train the model
wbdata_test_pred <- knn(train = wbdata_train, test = wbdata_test, cl=wbdata_train_labels, k = 21) #k = 21 is approximately the square root of the number of training records (469)
#Evaluation
(wbdata_tbl <- table(wbdata_test_pred, wbdata_test_labels))
## wbdata_test_labels
## wbdata_test_pred Benign Malignant
## Benign 61 2
## Malignant 0 37
(Accuracy <- (wbdata_tbl[1]+wbdata_tbl[4])/sum(wbdata_tbl)*100)
## [1] 98
###Here we get an accuracy of 98% with the KNN model on the breast cancer dataset. That is a good result: as the confusion matrix shows, the only errors are 2 malignant cases predicted as benign.
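#Class-specific rates can also be read from the confusion matrix above (rows are predictions, columns are true labels); a small sketch, where sensitivity and specificity are names introduced here for illustration
sensitivity <- wbdata_tbl["Malignant", "Malignant"] / sum(wbdata_tbl[, "Malignant"]) #true positive rate for malignant cases
specificity <- wbdata_tbl["Benign", "Benign"] / sum(wbdata_tbl[, "Benign"]) #true negative rate for benign cases
c(sensitivity = sensitivity, specificity = specificity)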
###Q4 - Now let’s get back to our problem of news popularity and see if we can apply KNN (K-nearest neighbors) to improve the accuracy of the model. Use the same strategy of training and testing that we used in the first 2 labs, and don’t forget that whenever it is required you should use: set.seed(12345).
#NEWS Popularity Dataset
#Load the dataset
news <- read.csv("C:/Users/Priya/Downloads/OnlineNewsPopularity_for_R.csv")
newsShort <- data.frame(news$n_tokens_title, news$n_tokens_content, news$n_unique_tokens, news$n_non_stop_words, news$num_hrefs, news$num_imgs, news$num_videos, news$average_token_length, news$num_keywords, news$kw_max_max, news$global_sentiment_polarity, news$avg_positive_polarity, news$title_subjectivity, news$title_sentiment_polarity, news$abs_title_subjectivity, news$abs_title_sentiment_polarity, news$shares)
#Some preprocessing
colnames(newsShort) <- c("n_tokens_title", "n_tokens_content", "n_unique_tokens", "n_non_stop_words", "num_hrefs", "num_imgs", "num_videos", "average_token_length", "num_keywords", "kw_max_max", "global_sentiment_polarity", "avg_positive_polarity", "title_subjectivity", "title_sentiment_polarity", "abs_title_subjectivity", "abs_title_sentiment_polarity", "shares")
newsShort$popular = rep('na', nrow(newsShort))
for(i in 1:nrow(newsShort)) {
if(newsShort$shares[i] >= 1400) {
newsShort$popular[i] = "yes"}
else {newsShort$popular[i] = "no"}
}
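#An equivalent, more idiomatic vectorized alternative to the loop above (produces the same labels)
newsShort$popular <- ifelse(newsShort$shares >= 1400, "yes", "no")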
newsShort$shares = newsShort$popular
#Convert the target to a factor, drop the helper column, and normalize the predictors
newsShort$shares <- as.factor(newsShort$shares)
newsShort <- newsShort[-18]
news_n <- as.data.frame(lapply(newsShort[1:16], normalize))
news_rand <- news_n[order(runif(10000)), ] #Shuffled 10,000-row subset (note: the split below uses news_n directly)
set.seed(12345)
#Split the data into training and test datasets
news_train <- news_n[1:9000, ]
news_test <- news_n[9001:10000, ]
news_train_labels <- newsShort[1:9000, 17]
news_test_labels <- newsShort[9001:10000, 17]
news_test_pred <- knn(train = news_train, test = news_test, cl = news_train_labels, k= 5) #choosing a nominal k value of 5.
(news_tbl <- table(news_test_pred, news_test_labels))
## news_test_labels
## news_test_pred no yes
## no 225 174
## yes 322 279
#Let's check the performance
(Accuracy <- (news_tbl[1] + news_tbl[4])/sum(news_tbl)*100)
## [1] 50.4
#Next iteration with k = 95, which is approximately the square root of the number of training records (9,000)
news_test_pred1 <- knn(train = news_train, test = news_test, cl = news_train_labels, k= 95)
(news_tbl_1 <- table(news_test_pred1, news_test_labels))
## news_test_labels
## news_test_pred1 no yes
## no 108 62
## yes 439 391
#Let's check the performance again
(Accuracy <- (news_tbl_1[1] + news_tbl_1[4])/sum(news_tbl_1)*100)
## [1] 49.9
###ANSWER-Q4: With k = 5 and k = 95 the model produces roughly the same accuracy, about 50%. That is a drop of almost half compared with the 98% we just obtained on the breast cancer dataset. With k = 95 the classifier makes 501 errors on the 1,000 test examples, which is a very high error rate, so KNN performs poorly on the news popularity data with these features. A quick scan over other values of k is sketched below.
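#One way to probe whether a different k would help: scan a few candidate values on the same train/test split (a sketch; output not shown, and kk / pred_k are names introduced here for illustration)
for (kk in c(5, 15, 45, 95)) {
  pred_k <- knn(train = news_train, test = news_test, cl = news_train_labels, k = kk)
  cat("k =", kk, "accuracy =", round(mean(pred_k == news_test_labels) * 100, 1), "%\n")
}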