##Load the wholesale customers dataset and examine its summary statistics
data <- read.csv("Wholesale_customers_data.csv")
summary(data)
## Channel Region Fresh Milk
## Min. :1.000 Min. :1.000 Min. : 3 Min. : 55
## 1st Qu.:1.000 1st Qu.:2.000 1st Qu.: 3128 1st Qu.: 1533
## Median :1.000 Median :3.000 Median : 8504 Median : 3627
## Mean :1.323 Mean :2.543 Mean : 12000 Mean : 5796
## 3rd Qu.:2.000 3rd Qu.:3.000 3rd Qu.: 16934 3rd Qu.: 7190
## Max. :2.000 Max. :3.000 Max. :112151 Max. :73498
## Grocery Frozen Detergents_Paper Delicassen
## Min. : 3 Min. : 25.0 Min. : 3.0 Min. : 3.0
## 1st Qu.: 2153 1st Qu.: 742.2 1st Qu.: 256.8 1st Qu.: 408.2
## Median : 4756 Median : 1526.0 Median : 816.5 Median : 965.5
## Mean : 7951 Mean : 3071.9 Mean : 2881.5 Mean : 1524.9
## 3rd Qu.:10656 3rd Qu.: 3554.2 3rd Qu.: 3922.0 3rd Qu.: 1820.2
## Max. :92780 Max. :60869.0 Max. :40827.0 Max. :47943.0
##This function removes the n highest values (likely outliers) from each of the chosen columns
top.n.custs <- function(data, cols, n = 5) { #Requires a data frame, the columns to scan, and the top n to remove
  idx.to.remove <- integer(0) #Initialize a vector to hold the rows being removed
  for (c in cols) { #For every column passed to this function
    col.order <- order(data[, c], decreasing = TRUE) #Sort column "c" in descending order (biggest on top)
    #order() returns the sorted row indices (e.g. row 15, 3, 7, 1, ...) rather than the sorted values
    idx <- head(col.order, n) #Take the row indices of the n largest values in column c
    idx.to.remove <- union(idx.to.remove, idx) #Combine and de-duplicate the row ids to be removed
  }
  return(idx.to.remove) #Return the indices of customers to be removed
}
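##A toy illustration of the order()/head() logic inside the function (the
##vector x is hypothetical, not part of the dataset):
x <- c(10, 50, 30)
order(x, decreasing = TRUE) #Returns row positions 2 3 1, not the sorted values
head(order(x, decreasing = TRUE), 2) #Positions of the two largest values: 2 3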
##How many customers would be removed?
top.custs <- top.n.custs(data, cols = 3:8, n = 5) #Scan the six product-spending columns (Fresh through Delicassen)
length(top.custs)
## [1] 19
##Call the function again, this time over columns 1:5 (note this range covers Channel through Grocery)
top.custs <- top.n.custs(data, cols = 1:5, n = 5)
length(top.custs)
## [1] 18
##Examine the customers flagged for removal
data[top.custs, ]
## Channel Region Fresh Milk Grocery Frozen Detergents_Paper Delicassen
## 1 2 3 12669 9656 7561 214 2674 1338
## 2 2 3 7057 9810 9568 1762 3293 1776
## 3 2 3 6353 8808 7684 2405 3516 7844
## 5 2 3 22615 5410 7198 3915 1777 5185
## 6 2 3 9413 8259 5126 666 1795 1451
## 4 1 3 13265 1196 4221 6404 507 1788
## 182 1 3 112151 29627 18148 16745 4948 8550
## 126 1 3 76237 3473 7102 16538 778 918
## 285 1 3 68951 4411 12609 8692 751 2406
## 40 1 3 56159 555 902 10002 212 2916
## 259 1 1 56083 4563 2124 6422 730 3321
## 87 2 3 22925 73498 32114 987 20070 903
## 48 2 3 44466 54259 55571 7782 24171 6465
## 86 2 3 16117 46197 92780 1026 40827 2944
## 184 1 3 36847 43950 20170 36534 239 47943
## 62 2 3 35942 38369 59598 3254 26701 2017
## 334 2 2 8565 4980 67298 131 38102 1215
## 66 2 3 85 20959 45828 36 24231 1423
##Fit the k-means model
data.rm.top <- data[-top.custs, ] #Remove the flagged customers
set.seed(76964057) #Set the seed for reproducibility
k <- kmeans(data.rm.top[, -c(1, 2)], centers = 5) #Create 5 clusters, dropping the Channel and Region columns (1 and 2)
k$centers #Display the cluster centers
## Fresh Milk Grocery Frozen Detergents_Paper Delicassen
## 1 18857.735 3371.235 4775.049 3492.618 1131.5196 1547.4020
## 2 6910.586 17347.655 25997.931 2053.552 11446.7586 2569.4138
## 3 4344.942 8698.233 12642.663 1430.802 5375.9302 1430.2442
## 4 5686.942 2429.746 3049.092 2711.133 788.6705 842.6127
## 5 37275.906 5219.844 5850.094 6883.125 824.8125 2169.3438
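##Other pieces of the fitted model worth a look (output not shown here):
k$size #Number of customers assigned to each of the 5 clusters
k$withinss #Within-cluster sum of squares, one value per cluster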
##Find the optimum value of k with an elbow plot
rng <- 2:20 #Try k from 2 to 20
tries <- 100 #Run the k-means algorithm 100 times for each k
avg.totw.ss <- integer(length(rng)) #Set up an empty vector to hold the averages
for (v in rng) { #For each value of k in the range
  v.totw.ss <- integer(tries) #Set up an empty vector to hold the 100 tries
  for (i in 1:tries) {
    k.temp <- kmeans(data.rm.top[, -c(1, 2)], centers = v) #Run kmeans on the same columns used above
    v.totw.ss[i] <- k.temp$tot.withinss #Store the total within-cluster sum of squares
  }
  avg.totw.ss[v - 1] <- mean(v.totw.ss) #Average the 100 values of total within-SS
}
plot(rng,avg.totw.ss,type="b", main="Total Within SS by Various K",
ylab="Average Total Within Sum of Squares",
xlab="Value of K")
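##A rough numeric companion to the elbow plot (a sketch, not part of the
##original run): the drop in average within-SS between successive values of k
##shows where extra clusters stop paying off.
improvement <- -diff(avg.totw.ss) #Reduction in within-SS for each step up in k
round(improvement) #Output not shown; look for where the drops level off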
# Part 2
##This part performs a k-means analysis on the wine dataset from UCI: 13 chemical measurements on 178 observations of Italian wine.
wine <- read.csv("wine.csv")
str(wine)
## 'data.frame': 178 obs. of 14 variables:
## $ Wine : int 1 1 1 1 1 1 1 1 1 1 ...
## $ Alcohol : num 14.2 13.2 13.2 14.4 13.2 ...
## $ Malic.acid : num 1.71 1.78 2.36 1.95 2.59 1.76 1.87 2.15 1.64 1.35 ...
## $ Ash : num 2.43 2.14 2.67 2.5 2.87 2.45 2.45 2.61 2.17 2.27 ...
## $ Acl : num 15.6 11.2 18.6 16.8 21 15.2 14.6 17.6 14 16 ...
## $ Mg : int 127 100 101 113 118 112 96 121 97 98 ...
## $ Phenols : num 2.8 2.65 2.8 3.85 2.8 3.27 2.5 2.6 2.8 2.98 ...
## $ Flavanoids : num 3.06 2.76 3.24 3.49 2.69 3.39 2.52 2.51 2.98 3.15 ...
## $ Nonflavanoid.phenols: num 0.28 0.26 0.3 0.24 0.39 0.34 0.3 0.31 0.29 0.22 ...
## $ Proanth : num 2.29 1.28 2.81 2.18 1.82 1.97 1.98 1.25 1.98 1.85 ...
## $ Color.int : num 5.64 4.38 5.68 7.8 4.32 6.75 5.25 5.05 5.2 7.22 ...
## $ Hue : num 1.04 1.05 1.03 0.86 1.04 1.05 1.02 1.06 1.08 1.01 ...
## $ OD : num 3.92 3.4 3.17 3.45 2.93 2.85 3.58 3.58 2.85 3.55 ...
## $ Proline : int 1065 1050 1185 1480 735 1450 1290 1295 1045 1045 ...
df <- scale(wine[-1]) #Standardize the 13 measurements (drop the Wine type column)
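##Scaling matters here: k-means is distance-based, so unscaled variables such
##as Proline (values in the hundreds to thousands) would dominate. A quick
##sanity check that each column now has mean 0 and sd 1 (sketch; output not shown):
round(colMeans(df), 10) #All effectively zero
round(apply(df, 2, sd), 10) #All exactly 1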
##Plot the within (cluster) sum of squares to determine the initial value for “k”
wssplot <- function(data, nc = 15, seed = 1234) {
  wss <- (nrow(data) - 1) * sum(apply(data, 2, var)) #Within-group SS for k = 1 equals the total SS
  for (i in 2:nc) {
    set.seed(seed)
    wss[i] <- sum(kmeans(data, centers = i)$withinss)
  }
  plot(1:nc, wss, type = "b", xlab = "Number of Clusters",
       ylab = "Within groups sum of squares")
}
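##A quick check of the identity used for the first point on that curve
##(a sketch; output not shown):
all.equal((nrow(df) - 1) * sum(apply(df, 2, var)),
          sum(kmeans(df, centers = 1)$withinss)) #Should report TRUE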
##Examine the data frame and plot the within sum of squares
head(df)
## Alcohol Malic.acid Ash Acl Mg Phenols
## [1,] 1.5143408 -0.56066822 0.2313998 -1.1663032 1.90852151 0.8067217
## [2,] 0.2455968 -0.49800856 -0.8256672 -2.4838405 0.01809398 0.5670481
## [3,] 0.1963252 0.02117152 1.1062139 -0.2679823 0.08810981 0.8067217
## [4,] 1.6867914 -0.34583508 0.4865539 -0.8069748 0.92829983 2.4844372
## [5,] 0.2948684 0.22705328 1.8352256 0.4506745 1.27837900 0.8067217
## [6,] 1.4773871 -0.51591132 0.3043010 -1.2860793 0.85828399 1.5576991
## Flavanoids Nonflavanoid.phenols Proanth Color.int Hue OD
## [1,] 1.0319081 -0.6577078 1.2214385 0.2510088 0.3611585 1.8427215
## [2,] 0.7315653 -0.8184106 -0.5431887 -0.2924962 0.4049085 1.1103172
## [3,] 1.2121137 -0.4970050 2.1299594 0.2682629 0.3174085 0.7863692
## [4,] 1.4623994 -0.9791134 1.0292513 1.1827317 -0.4263410 1.1807407
## [5,] 0.6614853 0.2261576 0.4002753 -0.3183774 0.3611585 0.4483365
## [6,] 1.3622851 -0.1755994 0.6623487 0.7298108 0.4049085 0.3356589
## Proline
## [1,] 1.01015939
## [2,] 0.96252635
## [3,] 1.39122370
## [4,] 2.32800680
## [5,] -0.03776747
## [6,] 2.23274072
wssplot(df)
##Use NbClust to recommend the number of clusters; the result is stored in the variable “nc”
library(NbClust)
set.seed(1234)
nc <- NbClust(df, min.nc=2, max.nc = 15, method = "kmeans")
## *** : The Hubert index is a graphical method of determining the number of clusters.
## In the plot of Hubert index, we seek a significant knee that corresponds to a
## significant increase of the value of the measure i.e the significant peak in Hubert
## index second differences plot.
##
## *** : The D index is a graphical method of determining the number of clusters.
## In the plot of D index, we seek a significant knee (the significant peak in Dindex
## second differences plot) that corresponds to a significant increase of the value of
## the measure.
##
## *******************************************************************
## * Among all indices:
## * 2 proposed 2 as the best number of clusters
## * 19 proposed 3 as the best number of clusters
## * 1 proposed 14 as the best number of clusters
## * 1 proposed 15 as the best number of clusters
##
## ***** Conclusion *****
##
## * According to the majority rule, the best number of clusters is 3
##
##
## *******************************************************************
print(table(nc$Best.nc[1, ])) #Tally the cluster counts recommended by the 26 criteria
##
## 0 1 2 3 14 15
## 2 1 2 19 1 1
barplot(table(nc$Best.nc[1, ]), xlab = "Number of Clusters", ylab = "Number of Criteria",
        main = "Number of Clusters Chosen by 26 Criteria")
##Enter the best number of clusters based on the information in the table and barplot
n <- readline(prompt = "Enter the best number of clusters: ")
## Enter the best number of clusters:
n <- as.integer(n)
##Conduct the k-means analysis using the best number of clusters (3, matching the value read into “n” above)
set.seed(1234)
fit.km <- kmeans(df, centers = 3, nstart = 25) #25 random starts; the best solution is kept
print(fit.km$size)
## [1] 62 65 51
print(fit.km$centers)
## Alcohol Malic.acid Ash Acl Mg Phenols
## 1 0.8328826 -0.3029551 0.3636801 -0.6084749 0.57596208 0.88274724
## 2 -0.9234669 -0.3929331 -0.4931257 0.1701220 -0.49032869 -0.07576891
## 3 0.1644436 0.8690954 0.1863726 0.5228924 -0.07526047 -0.97657548
## Flavanoids Nonflavanoid.phenols Proanth Color.int Hue OD
## 1 0.97506900 -0.56050853 0.57865427 0.1705823 0.4726504 0.7770551
## 2 0.02075402 -0.03343924 0.05810161 -0.8993770 0.4605046 0.2700025
## 3 -1.21182921 0.72402116 -0.77751312 0.9388902 -1.1615122 -1.2887761
## Proline
## 1 1.1220202
## 2 -0.7517257
## 3 -0.4059428
print(aggregate(wine[-1], by=list(cluster=fit.km$cluster), mean))
## cluster Alcohol Malic.acid Ash Acl Mg Phenols Flavanoids
## 1 1 13.67677 1.997903 2.466290 17.46290 107.96774 2.847581 3.0032258
## 2 2 12.25092 1.897385 2.231231 20.06308 92.73846 2.247692 2.0500000
## 3 3 13.13412 3.307255 2.417647 21.24118 98.66667 1.683922 0.8188235
## Nonflavanoid.phenols Proanth Color.int Hue OD Proline
## 1 0.2920968 1.922097 5.453548 1.0654839 3.163387 1100.2258
## 2 0.3576923 1.624154 2.973077 1.0627077 2.803385 510.1692
## 3 0.4519608 1.145882 7.234706 0.6919608 1.696667 619.0588
##Use a confusion or truth table to evaluate how well the k-Means analysis performed
ct.km <- table(wine$Wine, fit.km$cluster)
print(ct.km)
##
## 1 2 3
## 1 59 0 0
## 2 3 65 3
## 3 0 0 48
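##To quantify the agreement between wine type and cluster (a sketch, assuming
##the flexclust package is installed), the adjusted Rand index can be computed
##directly from the cross-tabulation; values near 1 indicate near-perfect agreement.
library(flexclust)
randIndex(ct.km)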
##Generate a plot of the clusters
library(cluster)
clusplot(df, fit.km$cluster, main = "2D representation of the Cluster solution",
         color = TRUE, shade = TRUE, labels = 2, lines = 0)
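##clusplot() draws the clusters on the first two principal components of the
##13 scaled variables. How much variance those two components explain can be
##checked with (a sketch; output not shown):
summary(prcomp(df))$importance[3, 1:2] #Cumulative proportion of variance through PC2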
# Part 3
##Set up to train a model that classifies wines into the k-means clusters
library(rpart)
df <- data.frame(k = fit.km$cluster, df) #Prepend the cluster assignment as the class label
str(df)
## 'data.frame': 178 obs. of 14 variables:
## $ k : int 1 1 1 1 1 1 1 1 1 1 ...
## $ Alcohol : num 1.514 0.246 0.196 1.687 0.295 ...
## $ Malic.acid : num -0.5607 -0.498 0.0212 -0.3458 0.2271 ...
## $ Ash : num 0.231 -0.826 1.106 0.487 1.835 ...
## $ Acl : num -1.166 -2.484 -0.268 -0.807 0.451 ...
## $ Mg : num 1.9085 0.0181 0.0881 0.9283 1.2784 ...
## $ Phenols : num 0.807 0.567 0.807 2.484 0.807 ...
## $ Flavanoids : num 1.032 0.732 1.212 1.462 0.661 ...
## $ Nonflavanoid.phenols: num -0.658 -0.818 -0.497 -0.979 0.226 ...
## $ Proanth : num 1.221 -0.543 2.13 1.029 0.4 ...
## $ Color.int : num 0.251 -0.292 0.268 1.183 -0.318 ...
## $ Hue : num 0.361 0.405 0.317 -0.426 0.361 ...
## $ OD : num 1.843 1.11 0.786 1.181 0.448 ...
## $ Proline : num 1.0102 0.9625 1.3912 2.328 -0.0378 ...
##Randomize the dataset so the train/test split is not ordered by wine type
rdf <- df[sample(1:nrow(df)), ] #Shuffle the rows (unseeded, so this shuffle is not reproducible)
print(head(rdf))
## k Alcohol Malic.acid Ash Acl Mg Phenols
## 127 2 -0.7028817 -0.7217931 -0.2789084 0.6003946 -0.96212770 0.7108523
## 155 3 -0.5181131 -0.9366262 -0.9714696 0.1512342 0.22814148 -1.3024063
## 72 2 1.0585784 -0.7396958 1.1062139 1.6484357 -0.96212770 1.0463954
## 48 1 1.1078500 -0.5875224 -0.8985684 -1.0465271 0.08810981 1.2860690
## 60 2 -0.7767891 -1.2499245 -3.6688130 -2.6635047 -0.82209603 -0.5034942
## 36 1 0.5904981 -0.4711544 0.1584986 0.3009543 0.01809398 0.6469393
## Flavanoids Nonflavanoid.phenols Proanth Color.int Hue
## 127 1.1220109 0.2261576 0.3129175 -0.48229163 -1.1700906
## 155 -1.4509256 1.3510772 -0.3335300 1.09646103 -1.6513403
## 72 0.8316795 -1.2201676 0.4876331 -0.72384942 1.7611577
## 48 1.3622851 -1.2201676 0.9593651 0.44943125 -0.2075912
## 60 -1.4609371 -0.6577078 -2.0457425 -1.34068448 0.4049085
## 36 0.9518167 -0.8184106 0.4701615 0.01807806 0.3611585
## OD Proline
## 127 0.3215742 -1.2539977
## 155 -1.4953517 -0.3394434
## 72 0.7722845 -1.0698166
## 48 1.0117244 0.7561165
## 60 -1.1150649 -0.7205077
## 36 1.2089101 0.5497067
train <- rdf[1:(as.integer(.8 * nrow(rdf)) - 1), ] #First ~80% of the shuffled rows (141 of 178)
test <- rdf[as.integer(.8 * nrow(rdf)):nrow(rdf), ] #Remaining ~20% (37 rows)
##Train the classifier and plot the results
fit <- rpart(k ~ ., data = train, method = "class") #Grow a classification tree predicting the cluster label k
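##Optional diagnostics before plotting (a sketch; output not shown):
printcp(fit) #Complexity-parameter table, useful when deciding whether to prune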
library(rpart.plot)
library(RColorBrewer)
library(rattle)
## Rattle: A free graphical interface for data science with R.
## Version 5.3.0 Copyright (c) 2006-2018 Togaware Pty Ltd.
## Type 'rattle()' to shake, rattle, and roll your data.
##
## Attaching package: 'rattle'
## The following object is masked _by_ '.GlobalEnv':
##
## wine
fancyRpartPlot(fit)
##Now use the predict() function to see how well the model works
pred<-predict(fit, test, type="class")
print(table(pred, test$k))
##
## pred 1 2 3
## 1 14 1 0
## 2 2 9 0
## 3 0 2 9
p <- table(pred, test$k) #Store the confusion table for the accuracy calculation
Accuracy <- sum(diag(p)) / sum(p) * 100 #Percent of test wines assigned to the correct cluster
Accuracy
## [1] 86.48649
##Q3: k-NN classification of the Wisconsin breast cancer dataset
wbcd <- read.csv("wisc_bc_data.csv", stringsAsFactors = FALSE)
str(wbcd)
## 'data.frame': 569 obs. of 32 variables:
## $ id : int 87139402 8910251 905520 868871 9012568 906539 925291 87880 862989 89827 ...
## $ diagnosis : chr "B" "B" "B" "B" ...
## $ radius_mean : num 12.3 10.6 11 11.3 15.2 ...
## $ texture_mean : num 12.4 18.9 16.8 13.4 13.2 ...
## $ perimeter_mean : num 78.8 69.3 70.9 73 97.7 ...
## $ area_mean : num 464 346 373 385 712 ...
## $ smoothness_mean : num 0.1028 0.0969 0.1077 0.1164 0.0796 ...
## $ compactness_mean : num 0.0698 0.1147 0.078 0.1136 0.0693 ...
## $ concavity_mean : num 0.0399 0.0639 0.0305 0.0464 0.0339 ...
## $ points_mean : num 0.037 0.0264 0.0248 0.048 0.0266 ...
## $ symmetry_mean : num 0.196 0.192 0.171 0.177 0.172 ...
## $ dimension_mean : num 0.0595 0.0649 0.0634 0.0607 0.0554 ...
## $ radius_se : num 0.236 0.451 0.197 0.338 0.178 ...
## $ texture_se : num 0.666 1.197 1.387 1.343 0.412 ...
## $ perimeter_se : num 1.67 3.43 1.34 1.85 1.34 ...
## $ area_se : num 17.4 27.1 13.5 26.3 17.7 ...
## $ smoothness_se : num 0.00805 0.00747 0.00516 0.01127 0.00501 ...
## $ compactness_se : num 0.0118 0.03581 0.00936 0.03498 0.01485 ...
## $ concavity_se : num 0.0168 0.0335 0.0106 0.0219 0.0155 ...
## $ points_se : num 0.01241 0.01365 0.00748 0.01965 0.00915 ...
## $ symmetry_se : num 0.0192 0.035 0.0172 0.0158 0.0165 ...
## $ dimension_se : num 0.00225 0.00332 0.0022 0.00344 0.00177 ...
## $ radius_worst : num 13.5 11.9 12.4 11.9 16.2 ...
## $ texture_worst : num 15.6 22.9 26.4 15.8 15.7 ...
## $ perimeter_worst : num 87 78.3 79.9 76.5 104.5 ...
## $ area_worst : num 549 425 471 434 819 ...
## $ smoothness_worst : num 0.139 0.121 0.137 0.137 0.113 ...
## $ compactness_worst: num 0.127 0.252 0.148 0.182 0.174 ...
## $ concavity_worst : num 0.1242 0.1916 0.1067 0.0867 0.1362 ...
## $ points_worst : num 0.0939 0.0793 0.0743 0.0861 0.0818 ...
## $ symmetry_worst : num 0.283 0.294 0.3 0.21 0.249 ...
## $ dimension_worst : num 0.0677 0.0759 0.0788 0.0678 0.0677 ...
wbcd <- wbcd[-1] #Drop the id column
table(wbcd$diagnosis)
##
## B M
## 357 212
wbcd$diagnosis <- factor(wbcd$diagnosis, levels = c("B", "M"), labels = c("Benign", "Malignant"))
round(prop.table(table(wbcd$diagnosis)) * 100, digits = 1)
##
## Benign Malignant
## 62.7 37.3
normalize <- function(x) { #Min-max scaling onto [0, 1]
  return((x - min(x)) / (max(x) - min(x)))
}
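##A quick check of the normalizer on a toy vector (hypothetical values):
normalize(c(1, 2, 3, 4, 5)) #Returns 0.00 0.25 0.50 0.75 1.00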
wbcd_n <- as.data.frame(lapply(wbcd[2:31], normalize)) #Normalize all 30 numeric features
summary(wbcd_n)
## radius_mean texture_mean perimeter_mean area_mean
## Min. :0.0000 Min. :0.0000 Min. :0.0000 Min. :0.0000
## 1st Qu.:0.2233 1st Qu.:0.2185 1st Qu.:0.2168 1st Qu.:0.1174
## Median :0.3024 Median :0.3088 Median :0.2933 Median :0.1729
## Mean :0.3382 Mean :0.3240 Mean :0.3329 Mean :0.2169
## 3rd Qu.:0.4164 3rd Qu.:0.4089 3rd Qu.:0.4168 3rd Qu.:0.2711
## Max. :1.0000 Max. :1.0000 Max. :1.0000 Max. :1.0000
## smoothness_mean compactness_mean concavity_mean points_mean
## Min. :0.0000 Min. :0.0000 Min. :0.00000 Min. :0.0000
## 1st Qu.:0.3046 1st Qu.:0.1397 1st Qu.:0.06926 1st Qu.:0.1009
## Median :0.3904 Median :0.2247 Median :0.14419 Median :0.1665
## Mean :0.3948 Mean :0.2606 Mean :0.20806 Mean :0.2431
## 3rd Qu.:0.4755 3rd Qu.:0.3405 3rd Qu.:0.30623 3rd Qu.:0.3678
## Max. :1.0000 Max. :1.0000 Max. :1.00000 Max. :1.0000
## symmetry_mean dimension_mean radius_se texture_se
## Min. :0.0000 Min. :0.0000 Min. :0.00000 Min. :0.0000
## 1st Qu.:0.2823 1st Qu.:0.1630 1st Qu.:0.04378 1st Qu.:0.1047
## Median :0.3697 Median :0.2439 Median :0.07702 Median :0.1653
## Mean :0.3796 Mean :0.2704 Mean :0.10635 Mean :0.1893
## 3rd Qu.:0.4530 3rd Qu.:0.3404 3rd Qu.:0.13304 3rd Qu.:0.2462
## Max. :1.0000 Max. :1.0000 Max. :1.00000 Max. :1.0000
## perimeter_se area_se smoothness_se compactness_se
## Min. :0.00000 Min. :0.00000 Min. :0.0000 Min. :0.00000
## 1st Qu.:0.04000 1st Qu.:0.02064 1st Qu.:0.1175 1st Qu.:0.08132
## Median :0.07209 Median :0.03311 Median :0.1586 Median :0.13667
## Mean :0.09938 Mean :0.06264 Mean :0.1811 Mean :0.17444
## 3rd Qu.:0.12251 3rd Qu.:0.07170 3rd Qu.:0.2187 3rd Qu.:0.22680
## Max. :1.00000 Max. :1.00000 Max. :1.0000 Max. :1.00000
## concavity_se points_se symmetry_se dimension_se
## Min. :0.00000 Min. :0.0000 Min. :0.0000 Min. :0.00000
## 1st Qu.:0.03811 1st Qu.:0.1447 1st Qu.:0.1024 1st Qu.:0.04675
## Median :0.06538 Median :0.2070 Median :0.1526 Median :0.07919
## Mean :0.08054 Mean :0.2235 Mean :0.1781 Mean :0.10019
## 3rd Qu.:0.10619 3rd Qu.:0.2787 3rd Qu.:0.2195 3rd Qu.:0.12656
## Max. :1.00000 Max. :1.0000 Max. :1.0000 Max. :1.00000
## radius_worst texture_worst perimeter_worst area_worst
## Min. :0.0000 Min. :0.0000 Min. :0.0000 Min. :0.00000
## 1st Qu.:0.1807 1st Qu.:0.2415 1st Qu.:0.1678 1st Qu.:0.08113
## Median :0.2504 Median :0.3569 Median :0.2353 Median :0.12321
## Mean :0.2967 Mean :0.3640 Mean :0.2831 Mean :0.17091
## 3rd Qu.:0.3863 3rd Qu.:0.4717 3rd Qu.:0.3735 3rd Qu.:0.22090
## Max. :1.0000 Max. :1.0000 Max. :1.0000 Max. :1.00000
## smoothness_worst compactness_worst concavity_worst points_worst
## Min. :0.0000 Min. :0.0000 Min. :0.00000 Min. :0.0000
## 1st Qu.:0.3000 1st Qu.:0.1163 1st Qu.:0.09145 1st Qu.:0.2231
## Median :0.3971 Median :0.1791 Median :0.18107 Median :0.3434
## Mean :0.4041 Mean :0.2202 Mean :0.21740 Mean :0.3938
## 3rd Qu.:0.4942 3rd Qu.:0.3025 3rd Qu.:0.30583 3rd Qu.:0.5546
## Max. :1.0000 Max. :1.0000 Max. :1.00000 Max. :1.0000
## symmetry_worst dimension_worst
## Min. :0.0000 Min. :0.0000
## 1st Qu.:0.1851 1st Qu.:0.1077
## Median :0.2478 Median :0.1640
## Mean :0.2633 Mean :0.1896
## 3rd Qu.:0.3182 3rd Qu.:0.2429
## Max. :1.0000 Max. :1.0000
wbcd_train <- wbcd_n[1:469, ] #First 469 records for training
wbcd_test <- wbcd_n[470:569, ] #Last 100 records for testing
wbcd_train_labels <- wbcd[1:469, 1] #Diagnosis labels for the training rows
wbcd_test_labels <- wbcd[470:569, 1] #Diagnosis labels for the test rows
library(class)
wbcd_test_pred <- knn(train = wbcd_train, test = wbcd_test, cl = wbcd_train_labels, k = 21)
(wbcd_tbl <- table(wbcd_test_pred, wbcd_test_labels))
## wbcd_test_labels
## wbcd_test_pred Benign Malignant
## Benign 61 2
## Malignant 0 37
(Accuracy <- sum(diag(wbcd_tbl)) / sum(wbcd_tbl) * 100) #Percent of test cases classified correctly
## [1] 98
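##The choice k = 21 above is close to sqrt(469), a common rule of thumb for
##k-NN. A sketch for trying a few alternatives on the same split (output not shown):
for (k.try in c(1, 5, 11, 21)) {
  pred <- knn(train = wbcd_train, test = wbcd_test, cl = wbcd_train_labels, k = k.try)
  cat("k =", k.try, "accuracy =", round(mean(pred == wbcd_test_labels) * 100, 1), "\n")
}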
##Q4: k-NN on the online news popularity dataset
news <- read.csv("OnlineNewsPopularity_for_R.csv")
##Keep 16 predictor columns plus shares
keep <- c("n_tokens_title", "n_tokens_content", "n_unique_tokens", "n_non_stop_words",
          "num_hrefs", "num_imgs", "num_videos", "average_token_length", "num_keywords",
          "kw_max_max", "global_sentiment_polarity", "avg_positive_polarity",
          "title_subjectivity", "title_sentiment_polarity", "abs_title_subjectivity",
          "abs_title_sentiment_polarity", "shares")
newsShort <- news[, keep]
##Label articles with at least 1400 shares as popular ("yes"), the rest as "no"
newsShort$popular <- ifelse(newsShort$shares >= 1400, "yes", "no")
newsShort$shares <- as.factor(newsShort$popular) #Replace the raw share counts with the binary label
newsShort <- newsShort[-18] #Drop the helper "popular" column
news_n <- as.data.frame(lapply(newsShort[1:16], normalize)) #Reuse normalize() from Q3 on the 16 predictors
news_train <- news_n[1:9000, ] #First 9000 records for training
news_test <- news_n[9001:10000, ] #Next 1000 records for testing
news_train_labels <- newsShort[1:9000, 17] #Matching "yes"/"no" labels (column 17 is now shares)
news_test_labels <- newsShort[9001:10000, 17]
news_test_pred <- knn(train = news_train, test = news_test, cl = news_train_labels, k = 20)
(news_tbl <- table(news_test_pred, news_test_labels))
## news_test_labels
## news_test_pred no yes
## no 190 146
## yes 357 307
(Accuracy <- sum(diag(news_tbl)) / sum(news_tbl) * 100) #Percent correct on the 1000 test articles
## [1] 49.7
#The classifier's accuracy is only about 50% (the error rate was slightly higher with k = 5), no better than chance on this near-balanced split, which suggests these 16 features carry little signal for predicting whether an article reaches 1400 shares.
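##A sketch for probing other neighborhood sizes on the same split (output not shown):
for (k.try in c(5, 10, 20, 40)) {
  pred <- knn(train = news_train, test = news_test, cl = news_train_labels, k = k.try)
  cat("k =", k.try, "accuracy =", round(mean(pred == news_test_labels) * 100, 1), "\n")
}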