#Data Pre-processing
library(readxl)
Wholesale <- read.csv("C:/Users/NehaKatti/Desktop/Wholesale_customers_data.csv")
str(Wholesale)
## 'data.frame': 440 obs. of 8 variables:
## $ Channel : int 2 2 2 1 2 2 2 2 1 2 ...
## $ Region : int 3 3 3 3 3 3 3 3 3 3 ...
## $ Fresh : int 12669 7057 6353 13265 22615 9413 12126 7579 5963 6006 ...
## $ Milk : int 9656 9810 8808 1196 5410 8259 3199 4956 3648 11093 ...
## $ Grocery : int 7561 9568 7684 4221 7198 5126 6975 9426 6192 18881 ...
## $ Frozen : int 214 1762 2405 6404 3915 666 480 1669 425 1159 ...
## $ Detergents_Paper: int 2674 3293 3516 507 1777 1795 3140 3321 1716 7425 ...
## $ Delicassen : int 1338 1776 7844 1788 5185 1451 545 2566 750 2098 ...
summary(Wholesale)
## Channel Region Fresh Milk
## Min. :1.000 Min. :1.000 Min. : 3 Min. : 55
## 1st Qu.:1.000 1st Qu.:2.000 1st Qu.: 3128 1st Qu.: 1533
## Median :1.000 Median :3.000 Median : 8504 Median : 3627
## Mean :1.323 Mean :2.543 Mean : 12000 Mean : 5796
## 3rd Qu.:2.000 3rd Qu.:3.000 3rd Qu.: 16934 3rd Qu.: 7190
## Max. :2.000 Max. :3.000 Max. :112151 Max. :73498
## Grocery Frozen Detergents_Paper Delicassen
## Min. : 3 Min. : 25.0 Min. : 3.0 Min. : 3.0
## 1st Qu.: 2153 1st Qu.: 742.2 1st Qu.: 256.8 1st Qu.: 408.2
## Median : 4756 Median : 1526.0 Median : 816.5 Median : 965.5
## Mean : 7951 Mean : 3071.9 Mean : 2881.5 Mean : 1524.9
## 3rd Qu.:10656 3rd Qu.: 3554.2 3rd Qu.: 3922.0 3rd Qu.: 1820.2
## Max. :92780 Max. :60869.0 Max. :40827.0 Max. :47943.0
data<-data.frame(Wholesale)
top.custs <- function (data,cols,n=5) { #Requires some data frame and the top N to remove
idx.to.remove <-integer(0) #Initialize a vector to hold customers being removed
for (c in cols){ # For every column in the data we passed to this function
col.order <-order(data[,c],decreasing=T) #Sort column "c" in descending order (bigger on top)
#Order returns the sorted index (e.g. row 15, 3, 7, 1, ...) rather than the actual values sorted.
idx <-head(col.order, n) #Take the first n of the sorted column C to
idx.to.remove <-union(idx.to.remove,idx) #Combine and de-duplicate the row ids that need to be removed
}
return(idx.to.remove) #Return the indexes of customers to be removed
}
top.cus<-top.custs(data,cols=3:8,n=5)
length(top.cus)
## [1] 19
#Examine the customers
data[top.cus,]
## Channel Region Fresh Milk Grocery Frozen Detergents_Paper Delicassen
## 182 1 3 112151 29627 18148 16745 4948 8550
## 126 1 3 76237 3473 7102 16538 778 918
## 285 1 3 68951 4411 12609 8692 751 2406
## 40 1 3 56159 555 902 10002 212 2916
## 259 1 1 56083 4563 2124 6422 730 3321
## 87 2 3 22925 73498 32114 987 20070 903
## 48 2 3 44466 54259 55571 7782 24171 6465
## 86 2 3 16117 46197 92780 1026 40827 2944
## 184 1 3 36847 43950 20170 36534 239 47943
## 62 2 3 35942 38369 59598 3254 26701 2017
## 334 2 2 8565 4980 67298 131 38102 1215
## 66 2 3 85 20959 45828 36 24231 1423
## 326 1 2 32717 16784 13626 60869 1272 5609
## 94 1 3 11314 3090 2062 35009 71 2698
## 197 1 1 30624 7209 4897 18711 763 2876
## 104 1 3 56082 3504 8906 18028 1480 2498
## 24 2 3 26373 36423 22019 5154 4337 16523
## 72 1 3 18291 1266 21042 5373 4173 14472
## 88 1 3 43265 5025 8117 6312 1579 14351
#Remove the customers and check the other data
data.r.top<-data[-c(top.cus),]
summary(data.r.top)
## Channel Region Fresh Milk
## Min. :1.000 Min. :1.000 Min. : 3 Min. : 55
## 1st Qu.:1.000 1st Qu.:2.000 1st Qu.: 3067 1st Qu.: 1492
## Median :1.000 Median :3.000 Median : 8040 Median : 3587
## Mean :1.321 Mean :2.537 Mean :10753 Mean : 5112
## 3rd Qu.:2.000 3rd Qu.:3.000 3rd Qu.:15615 3rd Qu.: 7027
## Max. :2.000 Max. :3.000 Max. :53205 Max. :29892
## Grocery Frozen Detergents_Paper Delicassen
## Min. : 3 Min. : 25 Min. : 3 Min. : 3
## 1st Qu.: 2146 1st Qu.: 688 1st Qu.: 256 1st Qu.: 396
## Median : 4602 Median : 1457 Median : 811 Median : 898
## Mean : 7135 Mean : 2599 Mean : 2547 Mean :1261
## 3rd Qu.: 9965 3rd Qu.: 3242 3rd Qu.: 3843 3rd Qu.:1697
## Max. :39694 Max. :17866 Max. :19410 Max. :7844
##Designing the model
set.seed(76964057) #Set the seed for reproducibility
n <-kmeans(data.r.top[,-c(1,2)], centers=5) #Create 5 clusters, Remove columns 1 and 2
n$centers #Display cluster centers
## Fresh Milk Grocery Frozen Detergents_Paper Delicassen
## 1 5830.214 15295.048 23449.167 1936.452 10361.6429 1912.738
## 2 18649.606 3335.586 4497.848 3301.747 1046.5859 1450.566
## 3 5845.392 2337.319 2878.205 2766.596 660.2952 858.994
## 4 4238.892 7725.289 11011.747 1336.566 4733.3614 1400.530
## 5 35922.387 4851.806 5862.581 3730.677 1004.6129 1552.161
##Optimum value of k
r<-2:20 #K from 2 to 20
tri <-100 #Run the K Means algorithm 100 times
avg.totw.ss <-integer(length(r)) #Set up an empty vector to hold all of points
for(v in r){ # For each value of the range variable
v.totw.ss <-integer(tri) #Set up an empty vector to hold the 100 tries
for(i in 1:tri){
n.temp <-kmeans(data.r.top,centers=v) #Run kmeans
v.totw.ss[i] <-n.temp$tot.withinss#Store the total withinss
}
avg.totw.ss[v-1] <-mean(v.totw.ss) #Average the 100 total withinss
}
## Warning: did not converge in 10 iterations
plot(r,avg.totw.ss,type="b", main="Total Within SS by Various K",
ylab="Average Total Within Sum of Squares",
xlab="Value of K")

#Part2
wine <- read.csv("C:/Users/NehaKatti/Desktop/wine.csv")
str(wine)
## 'data.frame': 178 obs. of 14 variables:
## $ Wine : int 1 1 1 1 1 1 1 1 1 1 ...
## $ Alcohol : num 14.2 13.2 13.2 14.4 13.2 ...
## $ Malic.acid : num 1.71 1.78 2.36 1.95 2.59 1.76 1.87 2.15 1.64 1.35 ...
## $ Ash : num 2.43 2.14 2.67 2.5 2.87 2.45 2.45 2.61 2.17 2.27 ...
## $ Acl : num 15.6 11.2 18.6 16.8 21 15.2 14.6 17.6 14 16 ...
## $ Mg : int 127 100 101 113 118 112 96 121 97 98 ...
## $ Phenols : num 2.8 2.65 2.8 3.85 2.8 3.27 2.5 2.6 2.8 2.98 ...
## $ Flavanoids : num 3.06 2.76 3.24 3.49 2.69 3.39 2.52 2.51 2.98 3.15 ...
## $ Nonflavanoid.phenols: num 0.28 0.26 0.3 0.24 0.39 0.34 0.3 0.31 0.29 0.22 ...
## $ Proanth : num 2.29 1.28 2.81 2.18 1.82 1.97 1.98 1.25 1.98 1.85 ...
## $ Color.int : num 5.64 4.38 5.68 7.8 4.32 6.75 5.25 5.05 5.2 7.22 ...
## $ Hue : num 1.04 1.05 1.03 0.86 1.04 1.05 1.02 1.06 1.08 1.01 ...
## $ OD : num 3.92 3.4 3.17 3.45 2.93 2.85 3.58 3.58 2.85 3.55 ...
## $ Proline : int 1065 1050 1185 1480 735 1450 1290 1295 1045 1045 ...
wssplot <- function(data, nc=15, seed=1234){
wss <- (nrow(data)-1)*sum(apply(data,2,var))
for (i in 2:nc){
set.seed(seed)
wss[i] <- sum(kmeans(data, centers=i)$withinss)}
plot(1:nc, wss, type="b", xlab="Number of Clusters",
ylab="Within groups sum of squares")}
df <- scale(wine[-1])
head(df)
## Alcohol Malic.acid Ash Acl Mg Phenols
## [1,] 1.5143408 -0.56066822 0.2313998 -1.1663032 1.90852151 0.8067217
## [2,] 0.2455968 -0.49800856 -0.8256672 -2.4838405 0.01809398 0.5670481
## [3,] 0.1963252 0.02117152 1.1062139 -0.2679823 0.08810981 0.8067217
## [4,] 1.6867914 -0.34583508 0.4865539 -0.8069748 0.92829983 2.4844372
## [5,] 0.2948684 0.22705328 1.8352256 0.4506745 1.27837900 0.8067217
## [6,] 1.4773871 -0.51591132 0.3043010 -1.2860793 0.85828399 1.5576991
## Flavanoids Nonflavanoid.phenols Proanth Color.int Hue
## [1,] 1.0319081 -0.6577078 1.2214385 0.2510088 0.3611585
## [2,] 0.7315653 -0.8184106 -0.5431887 -0.2924962 0.4049085
## [3,] 1.2121137 -0.4970050 2.1299594 0.2682629 0.3174085
## [4,] 1.4623994 -0.9791134 1.0292513 1.1827317 -0.4263410
## [5,] 0.6614853 0.2261576 0.4002753 -0.3183774 0.3611585
## [6,] 1.3622851 -0.1755994 0.6623487 0.7298108 0.4049085
## OD Proline
## [1,] 1.8427215 1.01015939
## [2,] 1.1103172 0.96252635
## [3,] 0.7863692 1.39122370
## [4,] 1.1807407 2.32800680
## [5,] 0.4483365 -0.03776747
## [6,] 0.3356589 2.23274072
##Start the k-Means analysis using the variable "nc" for the number of clusters
library(NbClust)
set.seed(1234)
nc <- NbClust(df, min.nc=2, max.nc = 15, method = "kmeans")

## *** : The Hubert index is a graphical method of determining the number of clusters.
## In the plot of Hubert index, we seek a significant knee that corresponds to a
## significant increase of the value of the measure i.e the significant peak in Hubert
## index second differences plot.
##

## *** : The D index is a graphical method of determining the number of clusters.
## In the plot of D index, we seek a significant knee (the significant peak in Dindex
## second differences plot) that corresponds to a significant increase of the value of
## the measure.
##
## *******************************************************************
## * Among all indices:
## * 2 proposed 2 as the best number of clusters
## * 19 proposed 3 as the best number of clusters
## * 1 proposed 14 as the best number of clusters
## * 1 proposed 15 as the best number of clusters
##
## ***** Conclusion *****
##
## * According to the majority rule, the best number of clusters is 3
##
##
## *******************************************************************
print(table(nc$Best.n[1,]))
##
## 0 1 2 3 14 15
## 2 1 2 19 1 1
barplot(table(nc$Best.n[1,]), xlab = "Number of Clusters", ylab = "Number of Criteria", main = "Number of Clusters Chosen by 26 Criteria")

#Enter the best number of clusters based on the information in the table and barplot
n <- readline(prompt = "Enter the best number of clusters: ")
## Enter the best number of clusters:
n <- as.integer(n)
##Conduct the k-Means analysis using the best number of clusters
set.seed(1234)
fit.km <- kmeans(df, 3, nstart=25)
print(fit.km$size)
## [1] 62 65 51
print(fit.km$centers)
## Alcohol Malic.acid Ash Acl Mg Phenols
## 1 0.8328826 -0.3029551 0.3636801 -0.6084749 0.57596208 0.88274724
## 2 -0.9234669 -0.3929331 -0.4931257 0.1701220 -0.49032869 -0.07576891
## 3 0.1644436 0.8690954 0.1863726 0.5228924 -0.07526047 -0.97657548
## Flavanoids Nonflavanoid.phenols Proanth Color.int Hue
## 1 0.97506900 -0.56050853 0.57865427 0.1705823 0.4726504
## 2 0.02075402 -0.03343924 0.05810161 -0.8993770 0.4605046
## 3 -1.21182921 0.72402116 -0.77751312 0.9388902 -1.1615122
## OD Proline
## 1 0.7770551 1.1220202
## 2 0.2700025 -0.7517257
## 3 -1.2887761 -0.4059428
print(aggregate(wine[-1], by=list(cluster=fit.km$cluster), mean))
## cluster Alcohol Malic.acid Ash Acl Mg Phenols
## 1 1 13.67677 1.997903 2.466290 17.46290 107.96774 2.847581
## 2 2 12.25092 1.897385 2.231231 20.06308 92.73846 2.247692
## 3 3 13.13412 3.307255 2.417647 21.24118 98.66667 1.683922
## Flavanoids Nonflavanoid.phenols Proanth Color.int Hue OD
## 1 3.0032258 0.2920968 1.922097 5.453548 1.0654839 3.163387
## 2 2.0500000 0.3576923 1.624154 2.973077 1.0627077 2.803385
## 3 0.8188235 0.4519608 1.145882 7.234706 0.6919608 1.696667
## Proline
## 1 1100.2258
## 2 510.1692
## 3 619.0588
ct.km <- table(wine$Wine, fit.km$cluster)
print(ct.km)
##
## 1 2 3
## 1 59 0 0
## 2 3 65 3
## 3 0 0 48
##Generate a plot of the clusters
library(cluster)
clusplot(df, fit.km$cluster, main='2D representation of the Cluster solution', color=TRUE, shade=TRUE,labels=2, lines=0)

# Part 3
##Set-up to train a model for classification of wines
library(rpart)
df <- data.frame(k=fit.km$cluster, df)
print(str(df))
## 'data.frame': 178 obs. of 14 variables:
## $ k : int 1 1 1 1 1 1 1 1 1 1 ...
## $ Alcohol : num 1.514 0.246 0.196 1.687 0.295 ...
## $ Malic.acid : num -0.5607 -0.498 0.0212 -0.3458 0.2271 ...
## $ Ash : num 0.231 -0.826 1.106 0.487 1.835 ...
## $ Acl : num -1.166 -2.484 -0.268 -0.807 0.451 ...
## $ Mg : num 1.9085 0.0181 0.0881 0.9283 1.2784 ...
## $ Phenols : num 0.807 0.567 0.807 2.484 0.807 ...
## $ Flavanoids : num 1.032 0.732 1.212 1.462 0.661 ...
## $ Nonflavanoid.phenols: num -0.658 -0.818 -0.497 -0.979 0.226 ...
## $ Proanth : num 1.221 -0.543 2.13 1.029 0.4 ...
## $ Color.int : num 0.251 -0.292 0.268 1.183 -0.318 ...
## $ Hue : num 0.361 0.405 0.317 -0.426 0.361 ...
## $ OD : num 1.843 1.11 0.786 1.181 0.448 ...
## $ Proline : num 1.0102 0.9625 1.3912 2.328 -0.0378 ...
## NULL
##Randomize the dataset
rdf <- df[sample(1:nrow(df)), ]
print(head(rdf))
## k Alcohol Malic.acid Ash Acl Mg Phenols
## 127 2 -0.7028817 -0.7217931 -0.2789084 0.6003946 -0.96212770 0.7108523
## 155 3 -0.5181131 -0.9366262 -0.9714696 0.1512342 0.22814148 -1.3024063
## 72 2 1.0585784 -0.7396958 1.1062139 1.6484357 -0.96212770 1.0463954
## 48 1 1.1078500 -0.5875224 -0.8985684 -1.0465271 0.08810981 1.2860690
## 60 2 -0.7767891 -1.2499245 -3.6688130 -2.6635047 -0.82209603 -0.5034942
## 36 1 0.5904981 -0.4711544 0.1584986 0.3009543 0.01809398 0.6469393
## Flavanoids Nonflavanoid.phenols Proanth Color.int Hue
## 127 1.1220109 0.2261576 0.3129175 -0.48229163 -1.1700906
## 155 -1.4509256 1.3510772 -0.3335300 1.09646103 -1.6513403
## 72 0.8316795 -1.2201676 0.4876331 -0.72384942 1.7611577
## 48 1.3622851 -1.2201676 0.9593651 0.44943125 -0.2075912
## 60 -1.4609371 -0.6577078 -2.0457425 -1.34068448 0.4049085
## 36 0.9518167 -0.8184106 0.4701615 0.01807806 0.3611585
## OD Proline
## 127 0.3215742 -1.2539977
## 155 -1.4953517 -0.3394434
## 72 0.7722845 -1.0698166
## 48 1.0117244 0.7561165
## 60 -1.1150649 -0.7205077
## 36 1.2089101 0.5497067
train <- rdf[1:(as.integer(.8*nrow(rdf))-1), ]
test <- rdf[(as.integer(.8*nrow(rdf))):nrow(rdf), ]
##Train the classifier and plot the results
fit <- rpart(k ~ ., data=train, method="class")
library(rpart.plot)
## Warning: package 'rpart.plot' was built under R version 3.6.2
library(RColorBrewer)
library(rattle)
## Warning: package 'rattle' was built under R version 3.6.2
## Rattle: A free graphical interface for data science with R.
## Version 5.3.0 Copyright (c) 2006-2018 Togaware Pty Ltd.
## Type 'rattle()' to shake, rattle, and roll your data.
##
## Attaching package: 'rattle'
## The following object is masked _by_ '.GlobalEnv':
##
## wine
fancyRpartPlot(fit)

##Now use the predict() function to see how well the model works
pred<-predict(fit, test, type="class")
print(table(pred, test$k))
##
## pred 1 2 3
## 1 14 1 0
## 2 2 9 0
## 3 0 2 9
p<-table(pred, test$k)
Accuracy <- (sum(diag(p))/sum(p)*100)
Accuracy
## [1] 86.48649
# Part 4
##Q3
wbcd <- read.csv("C:/Users/NehaKatti/Desktop/wisc_bc_data.csv")
str(wbcd)
## 'data.frame': 569 obs. of 32 variables:
## $ id : int 87139402 8910251 905520 868871 9012568 906539 925291 87880 862989 89827 ...
## $ diagnosis : Factor w/ 2 levels "B","M": 1 1 1 1 1 1 1 2 1 1 ...
## $ radius_mean : num 12.3 10.6 11 11.3 15.2 ...
## $ texture_mean : num 12.4 18.9 16.8 13.4 13.2 ...
## $ perimeter_mean : num 78.8 69.3 70.9 73 97.7 ...
## $ area_mean : num 464 346 373 385 712 ...
## $ smoothness_mean : num 0.1028 0.0969 0.1077 0.1164 0.0796 ...
## $ compactness_mean : num 0.0698 0.1147 0.078 0.1136 0.0693 ...
## $ concavity_mean : num 0.0399 0.0639 0.0305 0.0464 0.0339 ...
## $ points_mean : num 0.037 0.0264 0.0248 0.048 0.0266 ...
## $ symmetry_mean : num 0.196 0.192 0.171 0.177 0.172 ...
## $ dimension_mean : num 0.0595 0.0649 0.0634 0.0607 0.0554 ...
## $ radius_se : num 0.236 0.451 0.197 0.338 0.178 ...
## $ texture_se : num 0.666 1.197 1.387 1.343 0.412 ...
## $ perimeter_se : num 1.67 3.43 1.34 1.85 1.34 ...
## $ area_se : num 17.4 27.1 13.5 26.3 17.7 ...
## $ smoothness_se : num 0.00805 0.00747 0.00516 0.01127 0.00501 ...
## $ compactness_se : num 0.0118 0.03581 0.00936 0.03498 0.01485 ...
## $ concavity_se : num 0.0168 0.0335 0.0106 0.0219 0.0155 ...
## $ points_se : num 0.01241 0.01365 0.00748 0.01965 0.00915 ...
## $ symmetry_se : num 0.0192 0.035 0.0172 0.0158 0.0165 ...
## $ dimension_se : num 0.00225 0.00332 0.0022 0.00344 0.00177 ...
## $ radius_worst : num 13.5 11.9 12.4 11.9 16.2 ...
## $ texture_worst : num 15.6 22.9 26.4 15.8 15.7 ...
## $ perimeter_worst : num 87 78.3 79.9 76.5 104.5 ...
## $ area_worst : num 549 425 471 434 819 ...
## $ smoothness_worst : num 0.139 0.121 0.137 0.137 0.113 ...
## $ compactness_worst: num 0.127 0.252 0.148 0.182 0.174 ...
## $ concavity_worst : num 0.1242 0.1916 0.1067 0.0867 0.1362 ...
## $ points_worst : num 0.0939 0.0793 0.0743 0.0861 0.0818 ...
## $ symmetry_worst : num 0.283 0.294 0.3 0.21 0.249 ...
## $ dimension_worst : num 0.0677 0.0759 0.0788 0.0678 0.0677 ...
wbcd <- wbcd[-1]
table(wbcd$diagnosis)
##
## B M
## 357 212
wbcd$diagnosis <- factor(wbcd$diagnosis, levels = c("B", "M"), labels = c("Benign", "Malignant"))
round(prop.table(table(wbcd$diagnosis)) * 100, digits = 1)
##
## Benign Malignant
## 62.7 37.3
normalize <- function(x) {
return ((x - min(x)) / (max(x) - min(x)))
}
wbcd_n <- as.data.frame(lapply(wbcd[2:31], normalize))
summary(wbcd_n)
## radius_mean texture_mean perimeter_mean area_mean
## Min. :0.0000 Min. :0.0000 Min. :0.0000 Min. :0.0000
## 1st Qu.:0.2233 1st Qu.:0.2185 1st Qu.:0.2168 1st Qu.:0.1174
## Median :0.3024 Median :0.3088 Median :0.2933 Median :0.1729
## Mean :0.3382 Mean :0.3240 Mean :0.3329 Mean :0.2169
## 3rd Qu.:0.4164 3rd Qu.:0.4089 3rd Qu.:0.4168 3rd Qu.:0.2711
## Max. :1.0000 Max. :1.0000 Max. :1.0000 Max. :1.0000
## smoothness_mean compactness_mean concavity_mean points_mean
## Min. :0.0000 Min. :0.0000 Min. :0.00000 Min. :0.0000
## 1st Qu.:0.3046 1st Qu.:0.1397 1st Qu.:0.06926 1st Qu.:0.1009
## Median :0.3904 Median :0.2247 Median :0.14419 Median :0.1665
## Mean :0.3948 Mean :0.2606 Mean :0.20806 Mean :0.2431
## 3rd Qu.:0.4755 3rd Qu.:0.3405 3rd Qu.:0.30623 3rd Qu.:0.3678
## Max. :1.0000 Max. :1.0000 Max. :1.00000 Max. :1.0000
## symmetry_mean dimension_mean radius_se texture_se
## Min. :0.0000 Min. :0.0000 Min. :0.00000 Min. :0.0000
## 1st Qu.:0.2823 1st Qu.:0.1630 1st Qu.:0.04378 1st Qu.:0.1047
## Median :0.3697 Median :0.2439 Median :0.07702 Median :0.1653
## Mean :0.3796 Mean :0.2704 Mean :0.10635 Mean :0.1893
## 3rd Qu.:0.4530 3rd Qu.:0.3404 3rd Qu.:0.13304 3rd Qu.:0.2462
## Max. :1.0000 Max. :1.0000 Max. :1.00000 Max. :1.0000
## perimeter_se area_se smoothness_se compactness_se
## Min. :0.00000 Min. :0.00000 Min. :0.0000 Min. :0.00000
## 1st Qu.:0.04000 1st Qu.:0.02064 1st Qu.:0.1175 1st Qu.:0.08132
## Median :0.07209 Median :0.03311 Median :0.1586 Median :0.13667
## Mean :0.09938 Mean :0.06264 Mean :0.1811 Mean :0.17444
## 3rd Qu.:0.12251 3rd Qu.:0.07170 3rd Qu.:0.2187 3rd Qu.:0.22680
## Max. :1.00000 Max. :1.00000 Max. :1.0000 Max. :1.00000
## concavity_se points_se symmetry_se dimension_se
## Min. :0.00000 Min. :0.0000 Min. :0.0000 Min. :0.00000
## 1st Qu.:0.03811 1st Qu.:0.1447 1st Qu.:0.1024 1st Qu.:0.04675
## Median :0.06538 Median :0.2070 Median :0.1526 Median :0.07919
## Mean :0.08054 Mean :0.2235 Mean :0.1781 Mean :0.10019
## 3rd Qu.:0.10619 3rd Qu.:0.2787 3rd Qu.:0.2195 3rd Qu.:0.12656
## Max. :1.00000 Max. :1.0000 Max. :1.0000 Max. :1.00000
## radius_worst texture_worst perimeter_worst area_worst
## Min. :0.0000 Min. :0.0000 Min. :0.0000 Min. :0.00000
## 1st Qu.:0.1807 1st Qu.:0.2415 1st Qu.:0.1678 1st Qu.:0.08113
## Median :0.2504 Median :0.3569 Median :0.2353 Median :0.12321
## Mean :0.2967 Mean :0.3640 Mean :0.2831 Mean :0.17091
## 3rd Qu.:0.3863 3rd Qu.:0.4717 3rd Qu.:0.3735 3rd Qu.:0.22090
## Max. :1.0000 Max. :1.0000 Max. :1.0000 Max. :1.00000
## smoothness_worst compactness_worst concavity_worst points_worst
## Min. :0.0000 Min. :0.0000 Min. :0.00000 Min. :0.0000
## 1st Qu.:0.3000 1st Qu.:0.1163 1st Qu.:0.09145 1st Qu.:0.2231
## Median :0.3971 Median :0.1791 Median :0.18107 Median :0.3434
## Mean :0.4041 Mean :0.2202 Mean :0.21740 Mean :0.3938
## 3rd Qu.:0.4942 3rd Qu.:0.3025 3rd Qu.:0.30583 3rd Qu.:0.5546
## Max. :1.0000 Max. :1.0000 Max. :1.00000 Max. :1.0000
## symmetry_worst dimension_worst
## Min. :0.0000 Min. :0.0000
## 1st Qu.:0.1851 1st Qu.:0.1077
## Median :0.2478 Median :0.1640
## Mean :0.2633 Mean :0.1896
## 3rd Qu.:0.3182 3rd Qu.:0.2429
## Max. :1.0000 Max. :1.0000
wbcd_train <- wbcd_n[1:469, ]
wbcd_test <- wbcd_n[470:569, ]
wbcd_train_labels <- wbcd[1:469, 1]
wbcd_test_labels <- wbcd[470:569, 1]
library(class)
wbcd_test_pred <- knn(train = wbcd_train, test = wbcd_test, cl=wbcd_train_labels, k = 21)
(wbcd_tbl <- table(wbcd_test_pred, wbcd_test_labels))
## wbcd_test_labels
## wbcd_test_pred Benign Malignant
## Benign 61 2
## Malignant 0 37
(Accuracy <- (wbcd_tbl[1]+wbcd_tbl[4])/sum(wbcd_tbl)*100)
## [1] 98
##Q4
news <- read.csv("C:/Users/NehaKatti/Desktop/OnlineNews.csv")
newsShort <- data.frame(news$n_tokens_title, news$n_tokens_content, news$n_unique_tokens, news$n_non_stop_words, news$num_hrefs, news$num_imgs, news$num_videos, news$average_token_length, news$num_keywords, news$kw_max_max, news$global_sentiment_polarity, news$avg_positive_polarity, news$title_subjectivity, news$title_sentiment_polarity, news$abs_title_subjectivity, news$abs_title_sentiment_polarity, news$shares)
colnames(newsShort) <- c("n_tokens_title", "n_tokens_content", "n_unique_tokens", "n_non_stop_words", "num_hrefs", "num_imgs", "num_videos", "average_token_length", "num_keywords", "kw_max_max", "global_sentiment_polarity", "avg_positive_polarity", "title_subjectivity", "title_sentiment_polarity", "abs_title_subjectivity", "abs_title_sentiment_polarity", "shares")
newsShort$popular = rep('na', nrow(newsShort))
for(i in 1:39644) {
if(newsShort$shares[i] >= 1400) {
newsShort$popular[i] = "yes"}
else {newsShort$popular[i] = "no"}
}
newsShort$shares = newsShort$popular
newsShort$shares <- as.factor(newsShort$shares)
newsShort <- newsShort[-18]
news_n <- as.data.frame(lapply(newsShort[1:16], normalize))
news_train <- news_n[1:9000, ]
news_test <- news_n[9001:10000, ]
news_train_labels <- newsShort[1:9000, 17]
news_test_labels <- newsShort[9001:10000, 17]
news_test_pred <- knn(train = news_train, test = news_test, cl=news_train_labels, k = 20)
(news_tbl <- table(news_test_pred, news_test_labels))
## news_test_labels
## news_test_pred no yes
## no 190 146
## yes 357 307
(Accuracy <- (news_tbl[1]+news_tbl[4])/sum(news_tbl)*100)
## [1] 49.7
#Classifier has an error rate of 50%, slightly higher. These errors might be caused by misclassifications.