Part 1
Wholesale_customers_data <- read.csv("Wholesale customers data.csv")
str(Wholesale_customers_data)
## 'data.frame': 440 obs. of 8 variables:
## $ Channel : int 2 2 2 1 2 2 2 2 1 2 ...
## $ Region : int 3 3 3 3 3 3 3 3 3 3 ...
## $ Fresh : int 12669 7057 6353 13265 22615 9413 12126 7579 5963 6006 ...
## $ Milk : int 9656 9810 8808 1196 5410 8259 3199 4956 3648 11093 ...
## $ Grocery : int 7561 9568 7684 4221 7198 5126 6975 9426 6192 18881 ...
## $ Frozen : int 214 1762 2405 6404 3915 666 480 1669 425 1159 ...
## $ Detergents_Paper: int 2674 3293 3516 507 1777 1795 3140 3321 1716 7425 ...
## $ Delicassen : int 1338 1776 7844 1788 5185 1451 545 2566 750 2098 ...
#Ensure we are working with a data frame copy of the raw data
data <- data.frame(Wholesale_customers_data)
#top.n.custs() takes a data frame, the columns to scan, and the top n
#customers per column to flag for removal
top.n.custs <- function(data, cols, n = 5)
{
  #Initialize a vector to hold the row indices of customers being removed
  idx.to.remove <- integer(0)
  for (c in cols)
  {
    #For every column passed to this function, sort column "c" in descending
    #order (biggest values on top). order() returns the sorted row indices
    #(e.g. rows 15, 3, 7, 1, ...) rather than the sorted values themselves.
    col.order <- order(data[, c], decreasing = TRUE)
    #Take the first n indices of the sorted column and union them with the
    #running set, de-duplicating row ids flagged by more than one column
    idx <- head(col.order, n)
    idx.to.remove <- union(idx.to.remove, idx)
  }
  #Return the indices of the customers to be removed
  return(idx.to.remove)
}
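#A tiny illustration (hypothetical values) of the order() behavior described
#above: it returns positions sorted by value, not the values themselves
x <- c(10, 40, 20, 30)
order(x, decreasing = TRUE)   #returns 2 4 3 1, the positions of 40, 30, 20, 10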
#How many customers will be removed? (Note that cols = 1:5 also scans the
#Channel and Region code columns, which is why a few early rows are flagged.)
top.custs <- top.n.custs(data, cols = 1:5, n = 5)
length(top.custs)
## [1] 18
#Examine the customers
data[top.custs,]
## Channel Region Fresh Milk Grocery Frozen Detergents_Paper Delicassen
## 1 2 3 12669 9656 7561 214 2674 1338
## 2 2 3 7057 9810 9568 1762 3293 1776
## 3 2 3 6353 8808 7684 2405 3516 7844
## 5 2 3 22615 5410 7198 3915 1777 5185
## 6 2 3 9413 8259 5126 666 1795 1451
## 4 1 3 13265 1196 4221 6404 507 1788
## 182 1 3 112151 29627 18148 16745 4948 8550
## 126 1 3 76237 3473 7102 16538 778 918
## 285 1 3 68951 4411 12609 8692 751 2406
## 40 1 3 56159 555 902 10002 212 2916
## 259 1 1 56083 4563 2124 6422 730 3321
## 87 2 3 22925 73498 32114 987 20070 903
## 48 2 3 44466 54259 55571 7782 24171 6465
## 86 2 3 16117 46197 92780 1026 40827 2944
## 184 1 3 36847 43950 20170 36534 239 47943
## 62 2 3 35942 38369 59598 3254 26701 2017
## 334 2 2 8565 4980 67298 131 38102 1215
## 66 2 3 85 20959 45828 36 24231 1423
#Remove those customers
data.rm.top <- data[-top.custs, ]
#Examine summary stats for the remaining data
print(summary(data.rm.top))
## Channel Region Fresh Milk
## Min. :1.00 Min. :1.000 Min. : 3 Min. : 55
## 1st Qu.:1.00 1st Qu.:2.000 1st Qu.: 3072 1st Qu.: 1497
## Median :1.00 Median :3.000 Median : 8130 Median : 3582
## Mean :1.31 Mean :2.531 Mean :11076 Mean : 5172
## 3rd Qu.:2.00 3rd Qu.:3.000 3rd Qu.:16251 3rd Qu.: 6962
## Max. :2.00 Max. :3.000 Max. :56082 Max. :36423
## Grocery Frozen Detergents_Paper Delicassen
## Min. : 3 Min. : 25.0 Min. : 3.0 Min. : 3.0
## 1st Qu.: 2132 1st Qu.: 738.8 1st Qu.: 255.2 1st Qu.: 398.0
## Median : 4603 Median : 1487.5 Median : 799.5 Median : 904.5
## Mean : 7211 Mean : 2910.3 Mean : 2541.6 Mean : 1352.0
## 3rd Qu.:10391 3rd Qu.: 3428.0 3rd Qu.: 3879.2 3rd Qu.: 1752.2
## Max. :39694 Max. :60869.0 Max. :19410.0 Max. :16523.0
#Set the seed for reproducibility
set.seed(76964057)
#Try K from 2 to 20
rng <- 2:20
#Number of times to run the K-Means algorithm for each K
tries <- 100
#Set up empty vectors to hold the averaged sums of squares
avg.totw.ss <- numeric(length(rng))
avg.totb.ss <- numeric(length(rng))
avg.tot.ss <- numeric(length(rng))
# For each value of the range variable
for (v in rng)
{
  #Set up empty vectors to hold the individual tries
  v.totw.ss <- numeric(tries)
  v.totb.ss <- numeric(tries)
  tot.ss <- numeric(tries)
  #Run kmeans "tries" times for this value of K
  for (i in 1:tries)
  {
    k.temp <- kmeans(data.rm.top, centers = v)
    #Store the total within-cluster sum of squares
    v.totw.ss[i] <- k.temp$tot.withinss
    #Store the between-cluster sum of squares
    v.totb.ss[i] <- k.temp$betweenss
    #Store the total sum of squares
    tot.ss[i] <- k.temp$totss
  }
  #Average the within, between, and total sums of squares over the tries
  avg.totw.ss[v - 1] <- mean(v.totw.ss)
  avg.totb.ss[v - 1] <- mean(v.totb.ss)
  avg.tot.ss[v - 1] <- mean(tot.ss)
}
plot(rng,avg.totw.ss,type="b", main="Total Within SS by Various K",
ylab="Average Total Within Sum of Squares",
xlab="Value of K")

plot(rng,avg.totb.ss,type="b", main="Total Between SS by Various K",
ylab="Average Total Between Sum of Squares",
xlab="Value of K")

#Plot the ratios betweenss/totss and withinss/totss for evaluation
plot(rng,avg.totb.ss/avg.tot.ss,type="b", main="Ratio of Between SS to Total SS by Various K",
ylab="Ratio Between SS / Total SS",
xlab="Value of K")
abline(h=0.85, col="red")

plot(rng,avg.totw.ss/avg.tot.ss,type="b", main="Ratio of Within SS to Total SS by Various K",
ylab="Ratio Within SS / Total SS",
xlab="Value of K")
abline(h=0.15, col="red")
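#A small sketch (not part of the original analysis): programmatically find the
#smallest K whose between/total ratio clears the 0.85 reference line above
ratio <- avg.totb.ss / avg.tot.ss
rng[which(ratio >= 0.85)[1]]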

#Fit k-means with the chosen number of clusters, dropping the Channel and
#Region code columns (1 and 2) so only the spending variables drive the clusters
k <- kmeans(data.rm.top[,-c(1,2)], centers=2)
#Display cluster centers
print(k$centers)
## Fresh Milk Grocery Frozen Detergents_Paper Delicassen
## 1 4589.846 10573.068 15829.726 1552.718 6876.3504 1730.547
## 2 13564.652 3099.436 3904.531 3431.039 878.6951 1206.774
#Give a count of data points in each cluster
print(table(k$cluster))
##
## 1 2
## 117 305
#Generate a plot of the clusters
library(cluster)
clusplot(data.rm.top, k$cluster, main='2D representation of the Cluster solution',
color=TRUE, shade=TRUE, labels=2, lines=0)

Part 2
#Plot the within (cluster) sum of squares to determine the initial value for "k"
wssplot <- function(data, nc = 15, seed = 1234)
{
  #For k = 1 the within-groups SS is just the total SS of the data
  wss <- (nrow(data) - 1) * sum(apply(data, 2, var))
  for (i in 2:nc)
  {
    set.seed(seed)
    wss[i] <- sum(kmeans(data, centers = i)$withinss)
  }
  plot(1:nc, wss, type = "b", xlab = "Number of Clusters",
       ylab = "Within groups sum of squares")
}
#Load the data and standardize every column except the wine type label
wine <- read.csv("wine.csv")
df <- scale(wine[-1])
#Examine the data frame and plot the within sum of squares
head(df)
## Alcohol Malic.acid Ash Acl Mg Phenols
## [1,] 1.5143408 -0.56066822 0.2313998 -1.1663032 1.90852151 0.8067217
## [2,] 0.2455968 -0.49800856 -0.8256672 -2.4838405 0.01809398 0.5670481
## [3,] 0.1963252 0.02117152 1.1062139 -0.2679823 0.08810981 0.8067217
## [4,] 1.6867914 -0.34583508 0.4865539 -0.8069748 0.92829983 2.4844372
## [5,] 0.2948684 0.22705328 1.8352256 0.4506745 1.27837900 0.8067217
## [6,] 1.4773871 -0.51591132 0.3043010 -1.2860793 0.85828399 1.5576991
## Flavanoids Nonflavanoid.phenols Proanth Color.int Hue
## [1,] 1.0319081 -0.6577078 1.2214385 0.2510088 0.3611585
## [2,] 0.7315653 -0.8184106 -0.5431887 -0.2924962 0.4049085
## [3,] 1.2121137 -0.4970050 2.1299594 0.2682629 0.3174085
## [4,] 1.4623994 -0.9791134 1.0292513 1.1827317 -0.4263410
## [5,] 0.6614853 0.2261576 0.4002753 -0.3183774 0.3611585
## [6,] 1.3622851 -0.1755994 0.6623487 0.7298108 0.4049085
## OD Proline
## [1,] 1.8427215 1.01015939
## [2,] 1.1103172 0.96252635
## [3,] 0.7863692 1.39122370
## [4,] 1.1807407 2.32800680
## [5,] 0.4483365 -0.03776747
## [6,] 0.3356589 2.23274072
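#A quick sanity check of the standardization (a sketch, not in the original
#analysis): each scaled column should have mean ~0 and standard deviation 1
round(colMeans(df), 3)
apply(df, 2, sd)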
wssplot(df)

#Determine the best number of clusters with NbClust, storing the result in "nc"
library(NbClust)
set.seed(1234)
nc <- NbClust(df, min.nc=2, max.nc = 15, method = "kmeans")

## *** : The Hubert index is a graphical method of determining the number of clusters.
## In the plot of Hubert index, we seek a significant knee that corresponds to a
## significant increase of the value of the measure i.e the significant peak in Hubert
## index second differences plot.
##

## *** : The D index is a graphical method of determining the number of clusters.
## In the plot of D index, we seek a significant knee (the significant peak in Dindex
## second differences plot) that corresponds to a significant increase of the value of
## the measure.
##
## *******************************************************************
## * Among all indices:
## * 4 proposed 2 as the best number of clusters
## * 15 proposed 3 as the best number of clusters
## * 1 proposed 10 as the best number of clusters
## * 1 proposed 12 as the best number of clusters
## * 1 proposed 14 as the best number of clusters
## * 1 proposed 15 as the best number of clusters
##
## ***** Conclusion *****
##
## * According to the majority rule, the best number of clusters is 3
##
##
## *******************************************************************
print(table(nc$Best.n[1,]))
##
## 0 1 2 3 10 12 14 15
## 2 1 4 15 1 1 1 1
barplot(table(nc$Best.n[1,]), xlab = "Number of Clusters", ylab = "Number of Criteria",
main = "Number of Clusters Chosen by 26 Criteria")
#Conduct the k-Means analysis using the best number of clusters
set.seed(1234)
fit.km <- kmeans(df, 3, nstart=25)
print(fit.km$size)
## [1] 62 65 51
print(fit.km$centers)
## Alcohol Malic.acid Ash Acl Mg Phenols
## 1 0.8328826 -0.3029551 0.3636801 -0.6084749 0.57596208 0.88274724
## 2 -0.9234669 -0.3929331 -0.4931257 0.1701220 -0.49032869 -0.07576891
## 3 0.1644436 0.8690954 0.1863726 0.5228924 -0.07526047 -0.97657548
## Flavanoids Nonflavanoid.phenols Proanth Color.int Hue
## 1 0.97506900 -0.56050853 0.57865427 0.1705823 0.4726504
## 2 0.02075402 -0.03343924 0.05810161 -0.8993770 0.4605046
## 3 -1.21182921 0.72402116 -0.77751312 0.9388902 -1.1615122
## OD Proline
## 1 0.7770551 1.1220202
## 2 0.2700025 -0.7517257
## 3 -1.2887761 -0.4059428
print(aggregate(wine[-1], by=list(cluster=fit.km$cluster), mean))
## cluster Alcohol Malic.acid Ash Acl Mg Phenols
## 1 1 13.67677 1.997903 2.466290 17.46290 107.96774 2.847581
## 2 2 12.25092 1.897385 2.231231 20.06308 92.73846 2.247692
## 3 3 13.13412 3.307255 2.417647 21.24118 98.66667 1.683922
## Flavanoids Nonflavanoid.phenols Proanth Color.int Hue OD
## 1 3.0032258 0.2920968 1.922097 5.453548 1.0654839 3.163387
## 2 2.0500000 0.3576923 1.624154 2.973077 1.0627077 2.803385
## 3 0.8188235 0.4519608 1.145882 7.234706 0.6919608 1.696667
## Proline
## 1 1100.2258
## 2 510.1692
## 3 619.0588
#Use a confusion or truth table to evaluate how well the k-Means analysis performed
ct.km <- table(wine$Wine, fit.km$cluster)
print(ct.km)
##
## 1 2 3
## 1 59 0 0
## 2 3 65 3
## 3 0 0 48
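#To quantify the agreement between the clusters and the true wine types, one
#option (assuming the flexclust package is installed) is the adjusted Rand
#index, which ranges from -1 (no agreement) to 1 (perfect agreement)
library(flexclust)
randIndex(ct.km)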
#Generate a plot of the clusters
library(cluster)
clusplot(df, fit.km$cluster, main='2D representation of the Cluster solution',
color=TRUE, shade=TRUE,
labels=2, lines=0)

Part 3
#Set up to train a classification model for the wines
library(rpart)
#Prepend the k-means cluster assignment as the class label "k"
df <- data.frame(k=fit.km$cluster, df)
str(df)
## 'data.frame': 178 obs. of 14 variables:
## $ k : int 1 1 1 1 1 1 1 1 1 1 ...
## $ Alcohol : num 1.514 0.246 0.196 1.687 0.295 ...
## $ Malic.acid : num -0.5607 -0.498 0.0212 -0.3458 0.2271 ...
## $ Ash : num 0.231 -0.826 1.106 0.487 1.835 ...
## $ Acl : num -1.166 -2.484 -0.268 -0.807 0.451 ...
## $ Mg : num 1.9085 0.0181 0.0881 0.9283 1.2784 ...
## $ Phenols : num 0.807 0.567 0.807 2.484 0.807 ...
## $ Flavanoids : num 1.032 0.732 1.212 1.462 0.661 ...
## $ Nonflavanoid.phenols: num -0.658 -0.818 -0.497 -0.979 0.226 ...
## $ Proanth : num 1.221 -0.543 2.13 1.029 0.4 ...
## $ Color.int : num 0.251 -0.292 0.268 1.183 -0.318 ...
## $ Hue : num 0.361 0.405 0.317 -0.426 0.361 ...
## $ OD : num 1.843 1.11 0.786 1.181 0.448 ...
## $ Proline : num 1.0102 0.9625 1.3912 2.328 -0.0378 ...
#Randomize the dataset
rdf <- df[sample(1:nrow(df)), ]
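#Note: sample() is random, so a set.seed() call just before the line above
#would be needed to make the split and the confusion table below reproducible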
print(head(rdf))
## k Alcohol Malic.acid Ash Acl Mg Phenols
## 93 2 -0.3826162 -0.7217931 -0.38826018 0.3608424 -1.3822227 -1.462188745
## 69 2 0.4180475 -1.2499245 -0.02375431 -0.7470867 0.7182523 0.375309174
## 13 1 0.9230815 -0.5427655 0.15849862 -1.0465271 -0.7520802 0.487156874
## 57 1 1.5020229 -0.5696196 -0.24245783 -0.9566950 1.2783790 1.445851440
## 117 2 -1.4542737 -0.7755014 -1.37242601 0.3907864 -0.9621277 -0.503494178
## 161 3 -0.7891070 1.3370245 0.04914686 0.4506745 -0.8220960 0.007809591
## Flavanoids Nonflavanoid.phenols Proanth Color.int Hue
## 93 -0.5699201 1.7528342 0.05084419 -0.8661960 0.01115870
## 69 -0.7301029 1.5117800 -2.04574255 -0.8144336 0.27365854
## 13 0.7315653 -0.5773564 0.38280376 0.2337547 0.84240820
## 57 0.9718395 -0.8184106 0.76717799 0.5702101 -0.07634125
## 117 -0.4297602 -0.4970050 -0.10639981 -1.3406845 -0.03259127
## 161 -1.1105371 1.1100230 -0.96250606 1.1180287 -1.73884025
## OD Proline
## 93 -0.7770322 -0.799896093
## 69 -0.9601332 0.009865569
## 13 0.4060824 1.819921051
## 57 0.9835550 0.708483475
## 117 1.0117244 -0.799896093
## 161 -1.4530976 -0.720507695
#Hold out roughly the last 20% of the shuffled rows as a test set
train <- rdf[1:(as.integer(.8*nrow(rdf))-1), ]
test <- rdf[(as.integer(.8*nrow(rdf))):nrow(rdf), ]
#Train the classifier and plot the results
fit <- rpart(k ~ ., data=train, method="class")
library(rpart.plot)
library(RColorBrewer)
library(rattle)
fancyRpartPlot(fit)

#Now use the predict() function to see how well the model works
pred <- predict(fit, test, type="class")
print(table(pred, test$k))
##
## pred 1 2 3
## 1 11 1 0
## 2 0 12 1
## 3 0 1 11
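#A quick sketch of overall accuracy from the confusion table above: correct
#predictions (the diagonal) divided by the number of test cases
tree.tbl <- table(pred, test$k)
sum(diag(tree.tbl)) / sum(tree.tbl)
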
Part 4
wbcd<-read.csv("wisc_bc_data.csv")
str(wbcd)
## 'data.frame': 569 obs. of 32 variables:
## $ id : int 87139402 8910251 905520 868871 9012568 906539 925291 87880 862989 89827 ...
## $ diagnosis : Factor w/ 2 levels "B","M": 1 1 1 1 1 1 1 2 1 1 ...
## $ radius_mean : num 12.3 10.6 11 11.3 15.2 ...
## $ texture_mean : num 12.4 18.9 16.8 13.4 13.2 ...
## $ perimeter_mean : num 78.8 69.3 70.9 73 97.7 ...
## $ area_mean : num 464 346 373 385 712 ...
## $ smoothness_mean : num 0.1028 0.0969 0.1077 0.1164 0.0796 ...
## $ compactness_mean : num 0.0698 0.1147 0.078 0.1136 0.0693 ...
## $ concavity_mean : num 0.0399 0.0639 0.0305 0.0464 0.0339 ...
## $ points_mean : num 0.037 0.0264 0.0248 0.048 0.0266 ...
## $ symmetry_mean : num 0.196 0.192 0.171 0.177 0.172 ...
## $ dimension_mean : num 0.0595 0.0649 0.0634 0.0607 0.0554 ...
## $ radius_se : num 0.236 0.451 0.197 0.338 0.178 ...
## $ texture_se : num 0.666 1.197 1.387 1.343 0.412 ...
## $ perimeter_se : num 1.67 3.43 1.34 1.85 1.34 ...
## $ area_se : num 17.4 27.1 13.5 26.3 17.7 ...
## $ smoothness_se : num 0.00805 0.00747 0.00516 0.01127 0.00501 ...
## $ compactness_se : num 0.0118 0.03581 0.00936 0.03498 0.01485 ...
## $ concavity_se : num 0.0168 0.0335 0.0106 0.0219 0.0155 ...
## $ points_se : num 0.01241 0.01365 0.00748 0.01965 0.00915 ...
## $ symmetry_se : num 0.0192 0.035 0.0172 0.0158 0.0165 ...
## $ dimension_se : num 0.00225 0.00332 0.0022 0.00344 0.00177 ...
## $ radius_worst : num 13.5 11.9 12.4 11.9 16.2 ...
## $ texture_worst : num 15.6 22.9 26.4 15.8 15.7 ...
## $ perimeter_worst : num 87 78.3 79.9 76.5 104.5 ...
## $ area_worst : num 549 425 471 434 819 ...
## $ smoothness_worst : num 0.139 0.121 0.137 0.137 0.113 ...
## $ compactness_worst: num 0.127 0.252 0.148 0.182 0.174 ...
## $ concavity_worst : num 0.1242 0.1916 0.1067 0.0867 0.1362 ...
## $ points_worst : num 0.0939 0.0793 0.0743 0.0861 0.0818 ...
## $ symmetry_worst : num 0.283 0.294 0.3 0.21 0.249 ...
## $ dimension_worst : num 0.0677 0.0759 0.0788 0.0678 0.0677 ...
#Drop the first attribute (the patient ID) and preprocess the data
wbcd <- wbcd[-1]
table(wbcd$diagnosis)
##
## B M
## 357 212
wbcd$diagnosis <- factor(wbcd$diagnosis, levels = c("B", "M"), labels = c("Benign", "Malignant"))
round(prop.table(table(wbcd$diagnosis)) * 100, digits = 1)
##
## Benign Malignant
## 62.7 37.3
#Min-max normalization: rescale a numeric vector to the [0, 1] range
normalize <- function(x)
{
  return((x - min(x)) / (max(x) - min(x)))
}
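#A quick sanity check on toy vectors (a sketch): normalize() should map any
#numeric vector onto [0, 1] regardless of the original scale
normalize(c(1, 2, 3, 4, 5))
normalize(c(10, 20, 30, 40, 50))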
wbcd_n <- as.data.frame(lapply(wbcd[2:31], normalize))
summary(wbcd_n)
## radius_mean texture_mean perimeter_mean area_mean
## Min. :0.0000 Min. :0.0000 Min. :0.0000 Min. :0.0000
## 1st Qu.:0.2233 1st Qu.:0.2185 1st Qu.:0.2168 1st Qu.:0.1174
## Median :0.3024 Median :0.3088 Median :0.2933 Median :0.1729
## Mean :0.3382 Mean :0.3240 Mean :0.3329 Mean :0.2169
## 3rd Qu.:0.4164 3rd Qu.:0.4089 3rd Qu.:0.4168 3rd Qu.:0.2711
## Max. :1.0000 Max. :1.0000 Max. :1.0000 Max. :1.0000
## smoothness_mean compactness_mean concavity_mean points_mean
## Min. :0.0000 Min. :0.0000 Min. :0.00000 Min. :0.0000
## 1st Qu.:0.3046 1st Qu.:0.1397 1st Qu.:0.06926 1st Qu.:0.1009
## Median :0.3904 Median :0.2247 Median :0.14419 Median :0.1665
## Mean :0.3948 Mean :0.2606 Mean :0.20806 Mean :0.2431
## 3rd Qu.:0.4755 3rd Qu.:0.3405 3rd Qu.:0.30623 3rd Qu.:0.3678
## Max. :1.0000 Max. :1.0000 Max. :1.00000 Max. :1.0000
## symmetry_mean dimension_mean radius_se texture_se
## Min. :0.0000 Min. :0.0000 Min. :0.00000 Min. :0.0000
## 1st Qu.:0.2823 1st Qu.:0.1630 1st Qu.:0.04378 1st Qu.:0.1047
## Median :0.3697 Median :0.2439 Median :0.07702 Median :0.1653
## Mean :0.3796 Mean :0.2704 Mean :0.10635 Mean :0.1893
## 3rd Qu.:0.4530 3rd Qu.:0.3404 3rd Qu.:0.13304 3rd Qu.:0.2462
## Max. :1.0000 Max. :1.0000 Max. :1.00000 Max. :1.0000
## perimeter_se area_se smoothness_se compactness_se
## Min. :0.00000 Min. :0.00000 Min. :0.0000 Min. :0.00000
## 1st Qu.:0.04000 1st Qu.:0.02064 1st Qu.:0.1175 1st Qu.:0.08132
## Median :0.07209 Median :0.03311 Median :0.1586 Median :0.13667
## Mean :0.09938 Mean :0.06264 Mean :0.1811 Mean :0.17444
## 3rd Qu.:0.12251 3rd Qu.:0.07170 3rd Qu.:0.2187 3rd Qu.:0.22680
## Max. :1.00000 Max. :1.00000 Max. :1.0000 Max. :1.00000
## concavity_se points_se symmetry_se dimension_se
## Min. :0.00000 Min. :0.0000 Min. :0.0000 Min. :0.00000
## 1st Qu.:0.03811 1st Qu.:0.1447 1st Qu.:0.1024 1st Qu.:0.04675
## Median :0.06538 Median :0.2070 Median :0.1526 Median :0.07919
## Mean :0.08054 Mean :0.2235 Mean :0.1781 Mean :0.10019
## 3rd Qu.:0.10619 3rd Qu.:0.2787 3rd Qu.:0.2195 3rd Qu.:0.12656
## Max. :1.00000 Max. :1.0000 Max. :1.0000 Max. :1.00000
## radius_worst texture_worst perimeter_worst area_worst
## Min. :0.0000 Min. :0.0000 Min. :0.0000 Min. :0.00000
## 1st Qu.:0.1807 1st Qu.:0.2415 1st Qu.:0.1678 1st Qu.:0.08113
## Median :0.2504 Median :0.3569 Median :0.2353 Median :0.12321
## Mean :0.2967 Mean :0.3640 Mean :0.2831 Mean :0.17091
## 3rd Qu.:0.3863 3rd Qu.:0.4717 3rd Qu.:0.3735 3rd Qu.:0.22090
## Max. :1.0000 Max. :1.0000 Max. :1.0000 Max. :1.00000
## smoothness_worst compactness_worst concavity_worst points_worst
## Min. :0.0000 Min. :0.0000 Min. :0.00000 Min. :0.0000
## 1st Qu.:0.3000 1st Qu.:0.1163 1st Qu.:0.09145 1st Qu.:0.2231
## Median :0.3971 Median :0.1791 Median :0.18107 Median :0.3434
## Mean :0.4041 Mean :0.2202 Mean :0.21740 Mean :0.3938
## 3rd Qu.:0.4942 3rd Qu.:0.3025 3rd Qu.:0.30583 3rd Qu.:0.5546
## Max. :1.0000 Max. :1.0000 Max. :1.00000 Max. :1.0000
## symmetry_worst dimension_worst
## Min. :0.0000 Min. :0.0000
## 1st Qu.:0.1851 1st Qu.:0.1077
## Median :0.2478 Median :0.1640
## Mean :0.2633 Mean :0.1896
## 3rd Qu.:0.3182 3rd Qu.:0.2429
## Max. :1.0000 Max. :1.0000
#Splitting data into train and test
wbcd_train <- wbcd_n[1:456, ]
wbcd_test <- wbcd_n[457:569, ]
wbcd_train_labels <- wbcd[1:456, 1]
wbcd_test_labels <- wbcd[457:569, 1]
#Using kNN to classify the test set
#Use k = 21 neighbors, roughly the square root of the number of training records
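#A quick check of that heuristic: the square root of the 456 training rows is
#about 21.35, rounded to the odd value 21
sqrt(nrow(wbcd_train))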
library(class)
wbcd_test_pred <- knn(train = wbcd_train, test = wbcd_test, cl=wbcd_train_labels, k = 21)
(wbcd_tbl <- table(wbcd_test_pred, wbcd_test_labels))
## wbcd_test_labels
## wbcd_test_pred Benign Malignant
## Benign 69 2
## Malignant 0 42
(Accuracy <- (wbcd_tbl[1]+wbcd_tbl[4])/sum(wbcd_tbl)*100)
## [1] 98.23009
#Applying kNN to the online news popularity data
news <- read.csv("OnlineNewsPopularity.csv")
#Keep a short list of candidate predictors plus the shares column
keep <- c("n_tokens_title", "n_tokens_content", "n_unique_tokens", "n_non_stop_words",
          "num_hrefs", "num_imgs", "num_videos", "average_token_length", "num_keywords",
          "kw_max_max", "global_sentiment_polarity", "avg_positive_polarity",
          "title_subjectivity", "title_sentiment_polarity", "abs_title_subjectivity",
          "abs_title_sentiment_polarity", "shares")
newsShort <- news[, keep]
#Flag an article as popular if it received at least 1400 shares
newsShort$popular <- ifelse(newsShort$shares >= 1400, "yes", "no")
#Replace the raw share counts with the yes/no factor and drop the helper column
newsShort$shares <- as.factor(newsShort$popular)
newsShort <- newsShort[-18]
#Normalize the 16 predictors with the normalize() function defined earlier
news_n <- as.data.frame(lapply(newsShort[1:16], normalize))
news_train <- news_n[1:9000, ]
news_test <- news_n[9001:10000, ]
news_train_labels <- newsShort[1:9000, 17]
news_test_labels <- newsShort[9001:10000, 17]
#Trying with k = 10
news_test_pred <- knn(train = news_train, test = news_test, cl=news_train_labels, k = 10)
(news_tbl <- table(news_test_pred, news_test_labels))
## news_test_labels
## news_test_pred no yes
## no 229 166
## yes 318 287
(Accuracy <- (news_tbl[1]+news_tbl[4])/sum(news_tbl)*100)
## [1] 51.6
#Trying with k = 20
news_test_pred <- knn(train = news_train, test = news_test, cl=news_train_labels, k = 20)
(news_tbl <- table(news_test_pred, news_test_labels))
## news_test_labels
## news_test_pred no yes
## no 198 141
## yes 349 312
(Accuracy <- (news_tbl[1]+news_tbl[4])/sum(news_tbl)*100)
## [1] 51
#Trying with k = 30
news_test_pred <- knn(train = news_train, test = news_test, cl=news_train_labels, k = 30)
(news_tbl <- table(news_test_pred, news_test_labels))
## news_test_labels
## news_test_pred no yes
## no 168 120
## yes 379 333
(Accuracy <- (news_tbl[1]+news_tbl[4])/sum(news_tbl)*100)
## [1] 50.1
#Trying with k = 40
news_test_pred <- knn(train = news_train, test = news_test, cl=news_train_labels, k = 40)
(news_tbl <- table(news_test_pred, news_test_labels))
## news_test_labels
## news_test_pred no yes
## no 158 92
## yes 389 361
(Accuracy <- (news_tbl[1]+news_tbl[4])/sum(news_tbl)*100)
## [1] 51.9
#Trying with k = 50
news_test_pred <- knn(train = news_train, test = news_test, cl=news_train_labels, k = 50)
(news_tbl <- table(news_test_pred, news_test_labels))
## news_test_labels
## news_test_pred no yes
## no 132 80
## yes 415 373
(Accuracy <- (news_tbl[1]+news_tbl[4])/sum(news_tbl)*100)
## [1] 50.5
#Accuracy did not really improve for the different values of k
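#A compact sketch that automates the sweep above instead of copy-pasting:
#loop over candidate values of k and report the accuracy for each
for (k in seq(10, 50, by = 10))
{
  pred <- knn(train = news_train, test = news_test, cl = news_train_labels, k = k)
  tbl <- table(pred, news_test_labels)
  cat("k =", k, "accuracy =", round(sum(diag(tbl)) / sum(tbl) * 100, 1), "\n")
}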