Part 1: K-Means Clustering
Step 1: Loading data into R
library(readr)
Wholesale_customers_data <- read.csv("C:/Users/gmutya048/Downloads/Wholesale_customers_data.csv")
str(Wholesale_customers_data)
## 'data.frame': 440 obs. of 8 variables:
## $ Channel : int 2 2 2 1 2 2 2 2 1 2 ...
## $ Region : int 3 3 3 3 3 3 3 3 3 3 ...
## $ Fresh : int 12669 7057 6353 13265 22615 9413 12126 7579 5963 6006 ...
## $ Milk : int 9656 9810 8808 1196 5410 8259 3199 4956 3648 11093 ...
## $ Grocery : int 7561 9568 7684 4221 7198 5126 6975 9426 6192 18881 ...
## $ Frozen : int 214 1762 2405 6404 3915 666 480 1669 425 1159 ...
## $ Detergents_Paper: int 2674 3293 3516 507 1777 1795 3140 3321 1716 7425 ...
## $ Delicassen : int 1338 1776 7844 1788 5185 1451 545 2566 750 2098 ...
summary(Wholesale_customers_data)
## Channel Region Fresh Milk
## Min. :1.000 Min. :1.000 Min. : 3 Min. : 55
## 1st Qu.:1.000 1st Qu.:2.000 1st Qu.: 3128 1st Qu.: 1533
## Median :1.000 Median :3.000 Median : 8504 Median : 3627
## Mean :1.323 Mean :2.543 Mean : 12000 Mean : 5796
## 3rd Qu.:2.000 3rd Qu.:3.000 3rd Qu.: 16934 3rd Qu.: 7190
## Max. :2.000 Max. :3.000 Max. :112151 Max. :73498
## Grocery Frozen Detergents_Paper Delicassen
## Min. : 3 Min. : 25.0 Min. : 3.0 Min. : 3.0
## 1st Qu.: 2153 1st Qu.: 742.2 1st Qu.: 256.8 1st Qu.: 408.2
## Median : 4756 Median : 1526.0 Median : 816.5 Median : 965.5
## Mean : 7951 Mean : 3071.9 Mean : 2881.5 Mean : 1524.9
## 3rd Qu.:10656 3rd Qu.: 3554.2 3rd Qu.: 3922.0 3rd Qu.: 1820.2
## Max. :92780 Max. :60869.0 Max. :40827.0 Max. :47943.0
data<-data.frame(Wholesale_customers_data)
Step 2: Pre-Processing
top.n.custs <- function (data,cols,n=5) { #Requires some data frame and the top N to remove
idx.to.remove <-integer(0) #Initialize a vector to hold customers being removed
for (c in cols){ # For every column in the data we passed to this function
col.order <-order(data[,c],decreasing=T) #Sort column "c" in descending order (bigger on top)
#Order returns the sorted index (e.g. row 15, 3, 7, 1, ...) rather than the actual values sorted.
idx <-head(col.order, n) #Take the first n row indices of the sorted column c
idx.to.remove <-union(idx.to.remove,idx) #Combine and de-duplicate the row ids that need to be removed
}
return(idx.to.remove) #Return the indexes of customers to be removed
}
How many customers should be removed?
top.custs<-top.n.custs(data,cols=3:8,n=5)
length(top.custs)
## [1] 19
Examine the customers
data[top.custs,]
## Channel Region Fresh Milk Grocery Frozen Detergents_Paper Delicassen
## 182 1 3 112151 29627 18148 16745 4948 8550
## 126 1 3 76237 3473 7102 16538 778 918
## 285 1 3 68951 4411 12609 8692 751 2406
## 40 1 3 56159 555 902 10002 212 2916
## 259 1 1 56083 4563 2124 6422 730 3321
## 87 2 3 22925 73498 32114 987 20070 903
## 48 2 3 44466 54259 55571 7782 24171 6465
## 86 2 3 16117 46197 92780 1026 40827 2944
## 184 1 3 36847 43950 20170 36534 239 47943
## 62 2 3 35942 38369 59598 3254 26701 2017
## 334 2 2 8565 4980 67298 131 38102 1215
## 66 2 3 85 20959 45828 36 24231 1423
## 326 1 2 32717 16784 13626 60869 1272 5609
## 94 1 3 11314 3090 2062 35009 71 2698
## 197 1 1 30624 7209 4897 18711 763 2876
## 104 1 3 56082 3504 8906 18028 1480 2498
## 24 2 3 26373 36423 22019 5154 4337 16523
## 72 1 3 18291 1266 21042 5373 4173 14472
## 88 1 3 43265 5025 8117 6312 1579 14351
Step 3: Model Design
Remove the customers and examine the remaining data
data.rm.top<-data[-c(top.custs),]
summary(data.rm.top)
## Channel Region Fresh Milk
## Min. :1.000 Min. :1.000 Min. : 3 Min. : 55
## 1st Qu.:1.000 1st Qu.:2.000 1st Qu.: 3067 1st Qu.: 1492
## Median :1.000 Median :3.000 Median : 8040 Median : 3587
## Mean :1.321 Mean :2.537 Mean :10753 Mean : 5112
## 3rd Qu.:2.000 3rd Qu.:3.000 3rd Qu.:15615 3rd Qu.: 7027
## Max. :2.000 Max. :3.000 Max. :53205 Max. :29892
## Grocery Frozen Detergents_Paper Delicassen
## Min. : 3 Min. : 25 Min. : 3 Min. : 3
## 1st Qu.: 2146 1st Qu.: 688 1st Qu.: 256 1st Qu.: 396
## Median : 4602 Median : 1457 Median : 811 Median : 898
## Mean : 7135 Mean : 2599 Mean : 2547 Mean :1261
## 3rd Qu.: 9965 3rd Qu.: 3242 3rd Qu.: 3843 3rd Qu.:1697
## Max. :39694 Max. :17866 Max. :19410 Max. :7844
Set the seed for reproducibility
set.seed(76964057)
Create 5 clusters, removing columns 1 and 2 (Channel and Region)
k <-kmeans(data.rm.top[,-c(1,2)], centers=5)
Display cluster centers
k$centers
## Fresh Milk Grocery Frozen Detergents_Paper Delicassen
## 1 5830.214 15295.048 23449.167 1936.452 10361.6429 1912.738
## 2 18649.606 3335.586 4497.848 3301.747 1046.5859 1450.566
## 3 5845.392 2337.319 2878.205 2766.596 660.2952 858.994
## 4 4238.892 7725.289 11011.747 1336.566 4733.3614 1400.530
## 5 35922.387 4851.806 5862.581 3730.677 1004.6129 1552.161
Give a count of data points in each cluster
table(k$cluster)
##
## 1 2 3 4 5
## 42 99 166 83 31
Set K from 2 to 20 and run the k-means algorithm 100 times for each value
rng<-2:20
tries<-100
Set up empty vectors to hold the averaged results
avg.totw.ss <-integer(length(rng))
avg.totb.ss <- integer(length(rng))
avg.tot.ss <- integer(length(rng))
# For each value of the range variable
for(v in rng){
#Set up empty vectors to hold the results of each try
v.totw.ss <-integer(tries)
b.totb.ss <- integer(tries)
tot.ss <- integer(tries)
#Run kmeans
for(i in 1:tries){
k.temp <-kmeans(data.rm.top,centers=v)
#Store the total withinss
v.totw.ss[i] <-k.temp$tot.withinss
#Store the betweenss
b.totb.ss[i] <- k.temp$betweenss
#Store the total sum of squares
tot.ss[i] <- k.temp$totss
}
#Average the withinss and betweenss
avg.totw.ss[v-1] <-mean(v.totw.ss)
avg.totb.ss[v-1] <-mean(b.totb.ss)
avg.tot.ss[v-1] <- mean(tot.ss)
}
## Warning: did not converge in 10 iterations
Plot the results for each value of K
plot(rng,avg.totw.ss,type="b", main="Total Within SS by Various K",
ylab="Average Total Within Sum of Squares",
xlab="Value of K")
plot(rng,avg.totb.ss,type="b", main="Total between SS by Various K",
ylab="Average Total Between Sum of Squares", xlab="Value of K")
plot(rng,avg.totb.ss/avg.tot.ss,type="b", main="Ratio of between ss / the total ss by Various K",
ylab="Ratio Between SS / Total SS",
xlab="Value of K")
abline(h=0.85, col="blue")
plot(rng,avg.totw.ss/avg.tot.ss,type="b", main="Ratio of within ss / the total ss by Various K", ylab="Ratio Within SS / Total SS",
xlab="Value of K")
abline(h=0.15, col="blue")
k <-kmeans(data.rm.top[,-c(1,2)], centers=2)
k$withinss
## [1] 39113465976 14572965733
table(k$cluster)
##
## 1 2
## 300 121
print(k$centers)
## Fresh Milk Grocery Frozen Detergents_Paper Delicassen
## 1 5577.38 5668.003 8001.727 2244.843 3121.060 1169.410
## 2 23584.49 3734.000 4984.554 3475.967 1124.826 1488.198
Cluster Plot
library(cluster)
clusplot(data.rm.top, k$cluster, main='2D representation of the Cluster solution',
color=TRUE, shade=TRUE, labels=2, lines=0)
Question 1: Given that this is an imperfect, real-world data set, you need to determine what you believe is the best value for "k" and write up this portion of your lab report.
Answer 1: Per the elbow method (plot 1, "Total Within SS by Various K"), the optimal K value appears to be 3. This method looks at the variance explained as a function of k; we choose the optimal k at the point where the rate of decrease in the within-cluster sum of squares starts to flatten. Other methods, such as the Bayesian information criterion or the silhouette method, could also be used.
When we consider K = 5, Cluster 1 appears to contain customers who heavily purchase Milk, Grocery, and Detergents_Paper, followed by Cluster 4. Cluster 5 appears to contain customers who heavily purchase Fresh products, followed by Cluster 2. Cluster 3 appears to represent the low (or regular) consumption customers.
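As an additional check on the choice of k, the average silhouette width can be compared across several candidate values; larger values indicate better-separated clusters. The following is a minimal sketch, assuming the data.rm.top data frame and the cluster package from above:
library(cluster)
set.seed(76964057)
d <- dist(data.rm.top[, -c(1, 2)])   # pairwise distances on the six spending columns
for (k.try in 2:6) {
  km <- kmeans(data.rm.top[, -c(1, 2)], centers = k.try)
  sil <- silhouette(km$cluster, d)   # per-observation silhouette widths
  cat("k =", k.try, " average silhouette width =", round(mean(sil[, "sil_width"]), 3), "\n")
}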
Question 2: How many points do you see in each cluster?
Answer 2: In the cluster plot above (K = 2) we can see 300 points in one cluster and 121 in the other.
If we consider the overall data, we can see 83 points in Fresh, 108 points in Milk, 43 points in Grocery, 43 points in Detergents_Paper, and 145 points in Delicassen.
Part 2: Cluster Analysis
Step 1: Loading data into R
library(readr)
wine <- read.csv("C:/Users/gmutya048/Downloads/wine.csv")
str(wine)
## 'data.frame': 178 obs. of 14 variables:
## $ Wine : int 1 1 1 1 1 1 1 1 1 1 ...
## $ Alcohol : num 14.2 13.2 13.2 14.4 13.2 ...
## $ Malic.acid : num 1.71 1.78 2.36 1.95 2.59 1.76 1.87 2.15 1.64 1.35 ...
## $ Ash : num 2.43 2.14 2.67 2.5 2.87 2.45 2.45 2.61 2.17 2.27 ...
## $ Acl : num 15.6 11.2 18.6 16.8 21 15.2 14.6 17.6 14 16 ...
## $ Mg : int 127 100 101 113 118 112 96 121 97 98 ...
## $ Phenols : num 2.8 2.65 2.8 3.85 2.8 3.27 2.5 2.6 2.8 2.98 ...
## $ Flavanoids : num 3.06 2.76 3.24 3.49 2.69 3.39 2.52 2.51 2.98 3.15 ...
## $ Nonflavanoid.phenols: num 0.28 0.26 0.3 0.24 0.39 0.34 0.3 0.31 0.29 0.22 ...
## $ Proanth : num 2.29 1.28 2.81 2.18 1.82 1.97 1.98 1.25 1.98 1.85 ...
## $ Color.int : num 5.64 4.38 5.68 7.8 4.32 6.75 5.25 5.05 5.2 7.22 ...
## $ Hue : num 1.04 1.05 1.03 0.86 1.04 1.05 1.02 1.06 1.08 1.01 ...
## $ OD : num 3.92 3.4 3.17 3.45 2.93 2.85 3.58 3.58 2.85 3.55 ...
## $ Proline : int 1065 1050 1185 1480 735 1450 1290 1295 1045 1045 ...
wssplot <- function(data, nc=15, seed=1234){
  # For k = 1, the within-group sum of squares equals the total variance of the data
  wss <- (nrow(data)-1)*sum(apply(data,2,var))
  for (i in 2:nc){
    set.seed(seed)
    # Total within-cluster sum of squares for each candidate k
    wss[i] <- sum(kmeans(data, centers=i)$withinss)
  }
  plot(1:nc, wss, type="b", xlab="Number of Clusters",
       ylab="Within groups sum of squares")
}
Scale the data
df <- scale(wine[-1])
Optimum K
wssplot(df)
Determine the number of clusters with NbClust
library(NbClust)
set.seed(1234)
nc <- NbClust(df, min.nc=2, max.nc = 15, method = "kmeans")
## *** : The Hubert index is a graphical method of determining the number of clusters.
## In the plot of Hubert index, we seek a significant knee that corresponds to a
## significant increase of the value of the measure i.e the significant peak in Hubert
## index second differences plot.
##
## *** : The D index is a graphical method of determining the number of clusters.
## In the plot of D index, we seek a significant knee (the significant peak in Dindex
## second differences plot) that corresponds to a significant increase of the value of
## the measure.
##
## *******************************************************************
## * Among all indices:
## * 2 proposed 2 as the best number of clusters
## * 19 proposed 3 as the best number of clusters
## * 1 proposed 14 as the best number of clusters
## * 1 proposed 15 as the best number of clusters
##
## ***** Conclusion *****
##
## * According to the majority rule, the best number of clusters is 3
##
##
## *******************************************************************
barplot(table(nc$Best.n[1,]), xlab = "Number of Clusters", ylab = "Number of Criteria", main = "Number of Clusters Chosen by 26 Criteria")
K-means training and evaluation
set.seed(1234)
fit.km <- kmeans(df, 3, nstart=25)
table(fit.km$cluster)
##
## 1 2 3
## 62 65 51
(df_km <- table(wine$Wine, fit.km$cluster))
##
## 1 2 3
## 1 59 0 0
## 2 3 65 3
## 3 0 0 48
(Accuracy <- (sum(diag(df_km))/sum(df_km)*100))
## [1] 96.62921
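As an additional check on this agreement, the adjusted Rand index quantifies how well the k-means clusters match the true wine types, ranging from -1 (no agreement) to 1 (perfect agreement). A sketch, assuming the flexclust package is installed and using the df_km cross-tabulation above:
library(flexclust)
randIndex(df_km)   # adjusted Rand index for the cross-tabulation above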
K-means Plot
library(cluster)
clusplot(df, fit.km$cluster, main='2D representation of the Cluster solution',
color=TRUE, shade=TRUE,
labels=2, lines=0)
Decision Tree Training
# Attach the cluster assignment (k) as the target variable and shuffle the rows
df_rpart <- data.frame(k=fit.km$cluster, df)
rdf <- df_rpart[sample(1:nrow(df_rpart)), ]
# Split roughly 80/20 into training and test sets
train <- rdf[1:(as.integer(.8*nrow(rdf))-1), ]
test <- rdf[(as.integer(.8*nrow(rdf))):nrow(rdf), ]
library(rpart)
fit <- rpart(k ~ ., data=train, method="class")
library(rattle)
## Loading required package: tibble
## Loading required package: bitops
## Rattle: A free graphical interface for data science with R.
## Version 5.4.0 Copyright (c) 2006-2020 Togaware Pty Ltd.
## Type 'rattle()' to shake, rattle, and roll your data.
##
## Attaching package: 'rattle'
## The following object is masked _by_ '.GlobalEnv':
##
## wine
fancyRpartPlot(fit)
Model Evaluation
pred <- predict(fit, test, type="class")
(news_tbl <- table(pred, test$k))
##
## pred 1 2 3
## 1 14 1 0
## 2 2 9 0
## 3 0 2 9
(Accuracy <- (sum(diag(news_tbl))/sum(news_tbl)*100))
## [1] 86.48649
Part 4: Breast Cancer Wisconsin Diagnostic data set
Q3: Load the breast cancer data set. Do the preliminary analysis and implement a KNN (K-nearest neighbors) model for this data set, and don't forget that whenever it is required you should use: set.seed(12345)
Answer 3: There are 32 variables and 569 observations in the data set. No randomization is applied when sampling; for reproducibility the data is split into training and test sets in roughly an 82:18 ratio (469 training rows, 100 test rows). Using the KNN model, 98% accuracy is observed at k = 21, where k is chosen as the rounded-down positive square root of the number of rows in the training set.
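A small sketch of that rule of thumb (469 is the size of the training set created below):
k.choice <- floor(sqrt(469))   # rounded-down square root of the number of training rows
k.choice                       # 21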
library(readr)
wbcd <- read.csv("C:/Users/gmutya048/Downloads/wisc_bc_data.csv")
str(wbcd)
## 'data.frame': 569 obs. of 32 variables:
## $ id : int 87139402 8910251 905520 868871 9012568 906539 925291 87880 862989 89827 ...
## $ diagnosis : chr "B" "B" "B" "B" ...
## $ radius_mean : num 12.3 10.6 11 11.3 15.2 ...
## $ texture_mean : num 12.4 18.9 16.8 13.4 13.2 ...
## $ perimeter_mean : num 78.8 69.3 70.9 73 97.7 ...
## $ area_mean : num 464 346 373 385 712 ...
## $ smoothness_mean : num 0.1028 0.0969 0.1077 0.1164 0.0796 ...
## $ compactness_mean : num 0.0698 0.1147 0.078 0.1136 0.0693 ...
## $ concavity_mean : num 0.0399 0.0639 0.0305 0.0464 0.0339 ...
## $ points_mean : num 0.037 0.0264 0.0248 0.048 0.0266 ...
## $ symmetry_mean : num 0.196 0.192 0.171 0.177 0.172 ...
## $ dimension_mean : num 0.0595 0.0649 0.0634 0.0607 0.0554 ...
## $ radius_se : num 0.236 0.451 0.197 0.338 0.178 ...
## $ texture_se : num 0.666 1.197 1.387 1.343 0.412 ...
## $ perimeter_se : num 1.67 3.43 1.34 1.85 1.34 ...
## $ area_se : num 17.4 27.1 13.5 26.3 17.7 ...
## $ smoothness_se : num 0.00805 0.00747 0.00516 0.01127 0.00501 ...
## $ compactness_se : num 0.0118 0.03581 0.00936 0.03498 0.01485 ...
## $ concavity_se : num 0.0168 0.0335 0.0106 0.0219 0.0155 ...
## $ points_se : num 0.01241 0.01365 0.00748 0.01965 0.00915 ...
## $ symmetry_se : num 0.0192 0.035 0.0172 0.0158 0.0165 ...
## $ dimension_se : num 0.00225 0.00332 0.0022 0.00344 0.00177 ...
## $ radius_worst : num 13.5 11.9 12.4 11.9 16.2 ...
## $ texture_worst : num 15.6 22.9 26.4 15.8 15.7 ...
## $ perimeter_worst : num 87 78.3 79.9 76.5 104.5 ...
## $ area_worst : num 549 425 471 434 819 ...
## $ smoothness_worst : num 0.139 0.121 0.137 0.137 0.113 ...
## $ compactness_worst: num 0.127 0.252 0.148 0.182 0.174 ...
## $ concavity_worst : num 0.1242 0.1916 0.1067 0.0867 0.1362 ...
## $ points_worst : num 0.0939 0.0793 0.0743 0.0861 0.0818 ...
## $ symmetry_worst : num 0.283 0.294 0.3 0.21 0.249 ...
## $ dimension_worst : num 0.0677 0.0759 0.0788 0.0678 0.0677 ...
Eliminate the first attribute (id)
wbcd <- wbcd[-1]
Pre-processing
table(wbcd$diagnosis)
##
## B M
## 357 212
wbcd$diagnosis <- factor(wbcd$diagnosis, levels = c("B", "M"), labels = c("Benign", "Malignant"))
round(prop.table(table(wbcd$diagnosis)) * 100, digits = 1)
##
## Benign Malignant
## 62.7 37.3
normalize <- function(x) {
return ((x - min(x)) / (max(x) - min(x)))
}
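A quick illustrative check of normalize() on a toy vector (not part of the data set):
normalize(c(1, 2, 3, 4, 5))   # returns 0.00 0.25 0.50 0.75 1.00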
wbcd_n <- as.data.frame(lapply(wbcd[2:31], normalize))
summary(wbcd_n)
## radius_mean texture_mean perimeter_mean area_mean
## Min. :0.0000 Min. :0.0000 Min. :0.0000 Min. :0.0000
## 1st Qu.:0.2233 1st Qu.:0.2185 1st Qu.:0.2168 1st Qu.:0.1174
## Median :0.3024 Median :0.3088 Median :0.2933 Median :0.1729
## Mean :0.3382 Mean :0.3240 Mean :0.3329 Mean :0.2169
## 3rd Qu.:0.4164 3rd Qu.:0.4089 3rd Qu.:0.4168 3rd Qu.:0.2711
## Max. :1.0000 Max. :1.0000 Max. :1.0000 Max. :1.0000
## smoothness_mean compactness_mean concavity_mean points_mean
## Min. :0.0000 Min. :0.0000 Min. :0.00000 Min. :0.0000
## 1st Qu.:0.3046 1st Qu.:0.1397 1st Qu.:0.06926 1st Qu.:0.1009
## Median :0.3904 Median :0.2247 Median :0.14419 Median :0.1665
## Mean :0.3948 Mean :0.2606 Mean :0.20806 Mean :0.2431
## 3rd Qu.:0.4755 3rd Qu.:0.3405 3rd Qu.:0.30623 3rd Qu.:0.3678
## Max. :1.0000 Max. :1.0000 Max. :1.00000 Max. :1.0000
## symmetry_mean dimension_mean radius_se texture_se
## Min. :0.0000 Min. :0.0000 Min. :0.00000 Min. :0.0000
## 1st Qu.:0.2823 1st Qu.:0.1630 1st Qu.:0.04378 1st Qu.:0.1047
## Median :0.3697 Median :0.2439 Median :0.07702 Median :0.1653
## Mean :0.3796 Mean :0.2704 Mean :0.10635 Mean :0.1893
## 3rd Qu.:0.4530 3rd Qu.:0.3404 3rd Qu.:0.13304 3rd Qu.:0.2462
## Max. :1.0000 Max. :1.0000 Max. :1.00000 Max. :1.0000
## perimeter_se area_se smoothness_se compactness_se
## Min. :0.00000 Min. :0.00000 Min. :0.0000 Min. :0.00000
## 1st Qu.:0.04000 1st Qu.:0.02064 1st Qu.:0.1175 1st Qu.:0.08132
## Median :0.07209 Median :0.03311 Median :0.1586 Median :0.13667
## Mean :0.09938 Mean :0.06264 Mean :0.1811 Mean :0.17444
## 3rd Qu.:0.12251 3rd Qu.:0.07170 3rd Qu.:0.2187 3rd Qu.:0.22680
## Max. :1.00000 Max. :1.00000 Max. :1.0000 Max. :1.00000
## concavity_se points_se symmetry_se dimension_se
## Min. :0.00000 Min. :0.0000 Min. :0.0000 Min. :0.00000
## 1st Qu.:0.03811 1st Qu.:0.1447 1st Qu.:0.1024 1st Qu.:0.04675
## Median :0.06538 Median :0.2070 Median :0.1526 Median :0.07919
## Mean :0.08054 Mean :0.2235 Mean :0.1781 Mean :0.10019
## 3rd Qu.:0.10619 3rd Qu.:0.2787 3rd Qu.:0.2195 3rd Qu.:0.12656
## Max. :1.00000 Max. :1.0000 Max. :1.0000 Max. :1.00000
## radius_worst texture_worst perimeter_worst area_worst
## Min. :0.0000 Min. :0.0000 Min. :0.0000 Min. :0.00000
## 1st Qu.:0.1807 1st Qu.:0.2415 1st Qu.:0.1678 1st Qu.:0.08113
## Median :0.2504 Median :0.3569 Median :0.2353 Median :0.12321
## Mean :0.2967 Mean :0.3640 Mean :0.2831 Mean :0.17091
## 3rd Qu.:0.3863 3rd Qu.:0.4717 3rd Qu.:0.3735 3rd Qu.:0.22090
## Max. :1.0000 Max. :1.0000 Max. :1.0000 Max. :1.00000
## smoothness_worst compactness_worst concavity_worst points_worst
## Min. :0.0000 Min. :0.0000 Min. :0.00000 Min. :0.0000
## 1st Qu.:0.3000 1st Qu.:0.1163 1st Qu.:0.09145 1st Qu.:0.2231
## Median :0.3971 Median :0.1791 Median :0.18107 Median :0.3434
## Mean :0.4041 Mean :0.2202 Mean :0.21740 Mean :0.3938
## 3rd Qu.:0.4942 3rd Qu.:0.3025 3rd Qu.:0.30583 3rd Qu.:0.5546
## Max. :1.0000 Max. :1.0000 Max. :1.00000 Max. :1.0000
## symmetry_worst dimension_worst
## Min. :0.0000 Min. :0.0000
## 1st Qu.:0.1851 1st Qu.:0.1077
## Median :0.2478 Median :0.1640
## Mean :0.2633 Mean :0.1896
## 3rd Qu.:0.3182 3rd Qu.:0.2429
## Max. :1.0000 Max. :1.0000
Assign 82% of the data to training and define the target variable separately
wbcd_train <- wbcd_n[1:469, ]
wbcd_test <- wbcd_n[470:569, ]
wbcd_train_labels <- wbcd[1:469, 1]
wbcd_test_labels <- wbcd[470:569, 1]
Train the Model
library(class)
wbcd_test_pred <- knn(train = wbcd_train, test = wbcd_test, cl=wbcd_train_labels, k = 21)
Evaluation
(wbcd_tbl <- table(wbcd_test_pred, wbcd_test_labels))
## wbcd_test_labels
## wbcd_test_pred Benign Malignant
## Benign 61 2
## Malignant 0 37
(Accuracy <- (wbcd_tbl[1]+wbcd_tbl[4])/sum(wbcd_tbl)*100)
## [1] 98
We observe 98% accuracy on the breast cancer data set using the KNN model, which suggests the model is performing at a near-optimal level.
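Beyond raw accuracy, the confusion matrix can be broken down into sensitivity and specificity. A sketch using the wbcd_tbl object from the evaluation step, with Malignant treated as the positive class:
TP <- wbcd_tbl["Malignant", "Malignant"]  # malignant cases correctly flagged (37)
FN <- wbcd_tbl["Benign", "Malignant"]     # malignant cases predicted benign (2)
TN <- wbcd_tbl["Benign", "Benign"]        # benign cases correctly identified (61)
FP <- wbcd_tbl["Malignant", "Benign"]     # benign cases flagged as malignant (0)
(Sensitivity <- TP / (TP + FN))           # about 0.949
(Specificity <- TN / (TN + FP))           # 1.0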
News Popularity
Loading Data into R
library(readr)
news<- read.csv("C:/Users/gmutya048/Downloads/OnlineNewsPopularity_for_R.csv")
str(news)
## 'data.frame': 39644 obs. of 61 variables:
## $ url : chr "http://mashable.com/2013/01/07/amazon-instant-video-browser/" "http://mashable.com/2013/01/07/ap-samsung-sponsored-tweets/" "http://mashable.com/2013/01/07/apple-40-billion-app-downloads/" "http://mashable.com/2013/01/07/astronaut-notre-dame-bcs/" ...
## $ timedelta : num 731 731 731 731 731 731 731 731 731 731 ...
## $ n_tokens_title : num 12 9 9 9 13 10 8 12 11 10 ...
## $ n_tokens_content : num 219 255 211 531 1072 ...
## $ n_unique_tokens : num 0.664 0.605 0.575 0.504 0.416 ...
## $ n_non_stop_words : num 1 1 1 1 1 ...
## $ n_non_stop_unique_tokens : num 0.815 0.792 0.664 0.666 0.541 ...
## $ num_hrefs : num 4 3 3 9 19 2 21 20 2 4 ...
## $ num_self_hrefs : num 2 1 1 0 19 2 20 20 0 1 ...
## $ num_imgs : num 1 1 1 1 20 0 20 20 0 1 ...
## $ num_videos : num 0 0 0 0 0 0 0 0 0 1 ...
## $ average_token_length : num 4.68 4.91 4.39 4.4 4.68 ...
## $ num_keywords : num 5 4 6 7 7 9 10 9 7 5 ...
## $ data_channel_is_lifestyle : num 0 0 0 0 0 0 1 0 0 0 ...
## $ data_channel_is_entertainment: num 1 0 0 1 0 0 0 0 0 0 ...
## $ data_channel_is_bus : num 0 1 1 0 0 0 0 0 0 0 ...
## $ data_channel_is_socmed : num 0 0 0 0 0 0 0 0 0 0 ...
## $ data_channel_is_tech : num 0 0 0 0 1 1 0 1 1 0 ...
## $ data_channel_is_world : num 0 0 0 0 0 0 0 0 0 1 ...
## $ kw_min_min : num 0 0 0 0 0 0 0 0 0 0 ...
## $ kw_max_min : num 0 0 0 0 0 0 0 0 0 0 ...
## $ kw_avg_min : num 0 0 0 0 0 0 0 0 0 0 ...
## $ kw_min_max : num 0 0 0 0 0 0 0 0 0 0 ...
## $ kw_max_max : num 0 0 0 0 0 0 0 0 0 0 ...
## $ kw_avg_max : num 0 0 0 0 0 0 0 0 0 0 ...
## $ kw_min_avg : num 0 0 0 0 0 0 0 0 0 0 ...
## $ kw_max_avg : num 0 0 0 0 0 0 0 0 0 0 ...
## $ kw_avg_avg : num 0 0 0 0 0 0 0 0 0 0 ...
## $ self_reference_min_shares : num 496 0 918 0 545 8500 545 545 0 0 ...
## $ self_reference_max_shares : num 496 0 918 0 16000 8500 16000 16000 0 0 ...
## $ self_reference_avg_sharess : num 496 0 918 0 3151 ...
## $ weekday_is_monday : num 1 1 1 1 1 1 1 1 1 1 ...
## $ weekday_is_tuesday : num 0 0 0 0 0 0 0 0 0 0 ...
## $ weekday_is_wednesday : num 0 0 0 0 0 0 0 0 0 0 ...
## $ weekday_is_thursday : num 0 0 0 0 0 0 0 0 0 0 ...
## $ weekday_is_friday : num 0 0 0 0 0 0 0 0 0 0 ...
## $ weekday_is_saturday : num 0 0 0 0 0 0 0 0 0 0 ...
## $ weekday_is_sunday : num 0 0 0 0 0 0 0 0 0 0 ...
## $ is_weekend : num 0 0 0 0 0 0 0 0 0 0 ...
## $ LDA_00 : num 0.5003 0.7998 0.2178 0.0286 0.0286 ...
## $ LDA_01 : num 0.3783 0.05 0.0333 0.4193 0.0288 ...
## $ LDA_02 : num 0.04 0.0501 0.0334 0.4947 0.0286 ...
## $ LDA_03 : num 0.0413 0.0501 0.0333 0.0289 0.0286 ...
## $ LDA_04 : num 0.0401 0.05 0.6822 0.0286 0.8854 ...
## $ global_subjectivity : num 0.522 0.341 0.702 0.43 0.514 ...
## $ global_sentiment_polarity : num 0.0926 0.1489 0.3233 0.1007 0.281 ...
## $ global_rate_positive_words : num 0.0457 0.0431 0.0569 0.0414 0.0746 ...
## $ global_rate_negative_words : num 0.0137 0.01569 0.00948 0.02072 0.01213 ...
## $ rate_positive_words : num 0.769 0.733 0.857 0.667 0.86 ...
## $ rate_negative_words : num 0.231 0.267 0.143 0.333 0.14 ...
## $ avg_positive_polarity : num 0.379 0.287 0.496 0.386 0.411 ...
## $ min_positive_polarity : num 0.1 0.0333 0.1 0.1364 0.0333 ...
## $ max_positive_polarity : num 0.7 0.7 1 0.8 1 0.6 1 1 0.8 0.5 ...
## $ avg_negative_polarity : num -0.35 -0.119 -0.467 -0.37 -0.22 ...
## $ min_negative_polarity : num -0.6 -0.125 -0.8 -0.6 -0.5 -0.4 -0.5 -0.5 -0.125 -0.5 ...
## $ max_negative_polarity : num -0.2 -0.1 -0.133 -0.167 -0.05 ...
## $ title_subjectivity : num 0.5 0 0 0 0.455 ...
## $ title_sentiment_polarity : num -0.188 0 0 0 0.136 ...
## $ abs_title_subjectivity : num 0 0.5 0.5 0.5 0.0455 ...
## $ abs_title_sentiment_polarity : num 0.188 0 0 0 0.136 ...
## $ shares : int 593 711 1500 1200 505 855 556 891 3600 710 ...
Pre-Processing
newsShort <- data.frame(news$n_tokens_title, news$n_tokens_content, news$n_unique_tokens, news$n_non_stop_words, news$num_hrefs, news$num_imgs, news$num_videos, news$average_token_length, news$num_keywords, news$kw_max_max, news$global_sentiment_polarity, news$avg_positive_polarity, news$title_subjectivity, news$title_sentiment_polarity, news$abs_title_subjectivity, news$abs_title_sentiment_polarity, news$shares)
colnames(newsShort) <- c("n_tokens_title", "n_tokens_content", "n_unique_tokens", "n_non_stop_words", "num_hrefs", "num_imgs", "num_videos", "average_token_length", "num_keywords", "kw_max_max", "global_sentiment_polarity", "avg_positive_polarity", "title_subjectivity", "title_sentiment_polarity", "abs_title_subjectivity", "abs_title_sentiment_polarity", "shares")
newsShort$popular = rep('na', nrow(newsShort))
for(i in 1:nrow(newsShort)) {
  if(newsShort$shares[i] >= 1400) {
    newsShort$popular[i] = "yes"
  } else {
    newsShort$popular[i] = "no"
  }
}
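An equivalent vectorized version of this loop (a sketch; it produces the same popular flag without explicit looping):
newsShort$popular <- ifelse(newsShort$shares >= 1400, "yes", "no")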
newsShort$shares = newsShort$popular
newsShort$shares <- as.factor(newsShort$shares)
newsShort <- newsShort[-18]
news_n <- as.data.frame(lapply(newsShort[1:16], normalize))
# Shuffle a 10,000-row subset (note: the train/test split below uses the first 9,000 rows of news_n in their original order)
news_rand <- news_n[order(runif(10000)), ]
set.seed(12345)
#Split the data into training and test datasets
news_train <- news_n[1:9000, ]
news_test <- news_n[9001:10000, ]
news_train_labels <- newsShort[1:9000, 17]
news_test_labels <- newsShort[9001:10000, 17]
Considering k=5
news_test_pred <- knn(train = news_train, test = news_test, cl = news_train_labels, k= 5)
(news_tbl <- table(news_test_pred, news_test_labels))
## news_test_labels
## news_test_pred no yes
## no 225 174
## yes 322 279
(Accuracy <- (news_tbl[1] + news_tbl[4])/sum(news_tbl)*100)
## [1] 50.4
Considering K=95
news_test_pred1 <- knn(train = news_train, test = news_test, cl = news_train_labels, k= 95)
(news_tbl_1 <- table(news_test_pred1, news_test_labels))
## news_test_labels
## news_test_pred1 no yes
## no 108 62
## yes 439 391
(Accuracy <- (news_tbl_1[1] + news_tbl_1[4])/sum(news_tbl_1)*100)
## [1] 49.9
Q4: Now let's get back to our problem of news popularity and see if we can apply KNN (K-nearest neighbors) to improve the accuracy of the model. Use the same strategy of training and testing that we did in the first 2 labs, and don't forget that whenever it is required you should use: set.seed(12345).
Answer 4: The KNN method above does not improve the accuracy of the model for the news popularity data set when K is set to 5 or 95.
Accuracy when k = 5 is 50.4%.
Accuracy when k = 95 is 49.9%.
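As a possible extension, one could scan several values of K and compare the resulting test-set accuracies before settling on a final model. A sketch, assuming the news_train/news_test split and label vectors defined above:
library(class)
set.seed(12345)
for (k.try in c(1, 5, 11, 21, 51, 95)) {
  pred <- knn(train = news_train, test = news_test, cl = news_train_labels, k = k.try)
  # Proportion of test articles whose popularity class is predicted correctly
  acc <- sum(pred == news_test_labels) / length(news_test_labels) * 100
  cat("k =", k.try, " accuracy =", round(acc, 1), "%\n")
}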