Part 1
Wholesale_customers_data <- read.csv("Wholesale customers data.csv")
str(Wholesale_customers_data)
## 'data.frame': 440 obs. of 8 variables:
## $ Channel : int 2 2 2 1 2 2 2 2 1 2 ...
## $ Region : int 3 3 3 3 3 3 3 3 3 3 ...
## $ Fresh : int 12669 7057 6353 13265 22615 9413 12126 7579 5963 6006 ...
## $ Milk : int 9656 9810 8808 1196 5410 8259 3199 4956 3648 11093 ...
## $ Grocery : int 7561 9568 7684 4221 7198 5126 6975 9426 6192 18881 ...
## $ Frozen : int 214 1762 2405 6404 3915 666 480 1669 425 1159 ...
## $ Detergents_Paper: int 2674 3293 3516 507 1777 1795 3140 3321 1716 7425 ...
## $ Delicassen : int 1338 1776 7844 1788 5185 1451 545 2566 750 2098 ...
#Ensure we are working with a data frame copy of the raw data
data <- data.frame(Wholesale_customers_data)
#top.n.custs() takes a data frame, the columns to scan, and the top n
#customers per column to flag for removal
top.n.custs <- function(data, cols, n = 5)
{
  #Initialize a vector to hold the row indices of customers being removed
  idx.to.remove <- integer(0)
  for (c in cols)
  {
    #For every column passed to this function, sort column "c" in descending
    #order (biggest values on top). order() returns the sorted row indices
    #(e.g. rows 15, 3, 7, 1, ...) rather than the sorted values themselves.
    col.order <- order(data[, c], decreasing = TRUE)
    #Take the first n indices of the sorted column and union them with the
    #running set, de-duplicating row ids flagged by more than one column
    idx <- head(col.order, n)
    idx.to.remove <- union(idx.to.remove, idx)
  }
  #Return the indices of the customers to be removed
  return(idx.to.remove)
}
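#A tiny illustration (hypothetical values) of the order() behavior described
#above: it returns positions sorted by value, not the values themselves
x <- c(10, 40, 20, 30)
order(x, decreasing = TRUE)   #returns 2 4 3 1, the positions of 40, 30, 20, 10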
#How many customers will be removed? (Note that cols = 1:5 also scans the
#Channel and Region code columns, which is why a few early rows are flagged.)
top.custs <- top.n.custs(data, cols = 1:5, n = 5)
length(top.custs)
## [1] 18
#Examine the customers
data[top.custs,]
## Channel Region Fresh Milk Grocery Frozen Detergents_Paper Delicassen
## 1 2 3 12669 9656 7561 214 2674 1338
## 2 2 3 7057 9810 9568 1762 3293 1776
## 3 2 3 6353 8808 7684 2405 3516 7844
## 5 2 3 22615 5410 7198 3915 1777 5185
## 6 2 3 9413 8259 5126 666 1795 1451
## 4 1 3 13265 1196 4221 6404 507 1788
## 182 1 3 112151 29627 18148 16745 4948 8550
## 126 1 3 76237 3473 7102 16538 778 918
## 285 1 3 68951 4411 12609 8692 751 2406
## 40 1 3 56159 555 902 10002 212 2916
## 259 1 1 56083 4563 2124 6422 730 3321
## 87 2 3 22925 73498 32114 987 20070 903
## 48 2 3 44466 54259 55571 7782 24171 6465
## 86 2 3 16117 46197 92780 1026 40827 2944
## 184 1 3 36847 43950 20170 36534 239 47943
## 62 2 3 35942 38369 59598 3254 26701 2017
## 334 2 2 8565 4980 67298 131 38102 1215
## 66 2 3 85 20959 45828 36 24231 1423
#Remove those customers
data.rm.top <- data[-top.custs, ]
#Examine summary stats for the remaining data
print(summary(data.rm.top))
## Channel Region Fresh Milk
## Min. :1.00 Min. :1.000 Min. : 3 Min. : 55
## 1st Qu.:1.00 1st Qu.:2.000 1st Qu.: 3072 1st Qu.: 1497
## Median :1.00 Median :3.000 Median : 8130 Median : 3582
## Mean :1.31 Mean :2.531 Mean :11076 Mean : 5172
## 3rd Qu.:2.00 3rd Qu.:3.000 3rd Qu.:16251 3rd Qu.: 6962
## Max. :2.00 Max. :3.000 Max. :56082 Max. :36423
## Grocery Frozen Detergents_Paper Delicassen
## Min. : 3 Min. : 25.0 Min. : 3.0 Min. : 3.0
## 1st Qu.: 2132 1st Qu.: 738.8 1st Qu.: 255.2 1st Qu.: 398.0
## Median : 4603 Median : 1487.5 Median : 799.5 Median : 904.5
## Mean : 7211 Mean : 2910.3 Mean : 2541.6 Mean : 1352.0
## 3rd Qu.:10391 3rd Qu.: 3428.0 3rd Qu.: 3879.2 3rd Qu.: 1752.2
## Max. :39694 Max. :60869.0 Max. :19410.0 Max. :16523.0
#Set the seed for reproducibility
set.seed(76964057)
#Try K from 2 to 20
rng <- 2:20
#Number of times to run the K-Means algorithm for each K
tries <- 100
#Set up empty vectors to hold the averaged sums of squares
avg.totw.ss <- numeric(length(rng))
avg.totb.ss <- numeric(length(rng))
avg.tot.ss <- numeric(length(rng))
# For each value of the range variable
for (v in rng)
{
  #Set up empty vectors to hold the individual tries
  v.totw.ss <- numeric(tries)
  v.totb.ss <- numeric(tries)
  tot.ss <- numeric(tries)
  #Run kmeans "tries" times for this value of K
  for (i in 1:tries)
  {
    k.temp <- kmeans(data.rm.top, centers = v)
    #Store the total within-cluster sum of squares
    v.totw.ss[i] <- k.temp$tot.withinss
    #Store the between-cluster sum of squares
    v.totb.ss[i] <- k.temp$betweenss
    #Store the total sum of squares
    tot.ss[i] <- k.temp$totss
  }
  #Average the within, between, and total sums of squares over the tries
  avg.totw.ss[v - 1] <- mean(v.totw.ss)
  avg.totb.ss[v - 1] <- mean(v.totb.ss)
  avg.tot.ss[v - 1] <- mean(tot.ss)
}
plot(rng,avg.totw.ss,type="b", main="Total Within SS by Various K",
ylab="Average Total Within Sum of Squares",
xlab="Value of K")

plot(rng,avg.totb.ss,type="b", main="Total Between SS by Various K",
ylab="Average Total Between Sum of Squares",
xlab="Value of K")

#Plot the ratios betweenss/totss and withinss/totss for evaluation
plot(rng,avg.totb.ss/avg.tot.ss,type="b", main="Ratio of Between SS to Total SS by Various K",
ylab="Ratio Between SS / Total SS",
xlab="Value of K")
abline(h=0.85, col="red")

plot(rng,avg.totw.ss/avg.tot.ss,type="b", main="Ratio of Within SS to Total SS by Various K",
ylab="Ratio Within SS / Total SS",
xlab="Value of K")
abline(h=0.15, col="red")
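#A small sketch (not part of the original analysis): programmatically find the
#smallest K whose between/total ratio clears the 0.85 reference line above
ratio <- avg.totb.ss / avg.tot.ss
rng[which(ratio >= 0.85)[1]]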

#Fit k-means with the chosen number of clusters, dropping the Channel and
#Region code columns (1 and 2) so only the spending variables drive the clusters
k <- kmeans(data.rm.top[,-c(1,2)], centers=2)
#Display cluster centers
print(k$centers)
## Fresh Milk Grocery Frozen Detergents_Paper Delicassen
## 1 4589.846 10573.068 15829.726 1552.718 6876.3504 1730.547
## 2 13564.652 3099.436 3904.531 3431.039 878.6951 1206.774
#Give a count of data points in each cluster
print(table(k$cluster))
##
## 1 2
## 117 305
#Generate a plot of the clusters
library(cluster)
clusplot(data.rm.top, k$cluster, main='2D representation of the Cluster solution',
color=TRUE, shade=TRUE, labels=2, lines=0)

Part 2
#Plot the within (cluster) sum of squares to determine the initial value for "k"
wssplot <- function(data, nc = 15, seed = 1234)
{
  #For k = 1 the within-groups SS is just the total SS of the data
  wss <- (nrow(data) - 1) * sum(apply(data, 2, var))
  for (i in 2:nc)
  {
    set.seed(seed)
    wss[i] <- sum(kmeans(data, centers = i)$withinss)
  }
  plot(1:nc, wss, type = "b", xlab = "Number of Clusters",
       ylab = "Within groups sum of squares")
}
#Load the data and standardize every column except the wine type label
wine <- read.csv("wine.csv")
df <- scale(wine[-1])
#Examine the data frame and plot the within sum of squares
head(df)
## Alcohol Malic.acid Ash Acl Mg Phenols
## [1,] 1.5143408 -0.56066822 0.2313998 -1.1663032 1.90852151 0.8067217
## [2,] 0.2455968 -0.49800856 -0.8256672 -2.4838405 0.01809398 0.5670481
## [3,] 0.1963252 0.02117152 1.1062139 -0.2679823 0.08810981 0.8067217
## [4,] 1.6867914 -0.34583508 0.4865539 -0.8069748 0.92829983 2.4844372
## [5,] 0.2948684 0.22705328 1.8352256 0.4506745 1.27837900 0.8067217
## [6,] 1.4773871 -0.51591132 0.3043010 -1.2860793 0.85828399 1.5576991
## Flavanoids Nonflavanoid.phenols Proanth Color.int Hue
## [1,] 1.0319081 -0.6577078 1.2214385 0.2510088 0.3611585
## [2,] 0.7315653 -0.8184106 -0.5431887 -0.2924962 0.4049085
## [3,] 1.2121137 -0.4970050 2.1299594 0.2682629 0.3174085
## [4,] 1.4623994 -0.9791134 1.0292513 1.1827317 -0.4263410
## [5,] 0.6614853 0.2261576 0.4002753 -0.3183774 0.3611585
## [6,] 1.3622851 -0.1755994 0.6623487 0.7298108 0.4049085
## OD Proline
## [1,] 1.8427215 1.01015939
## [2,] 1.1103172 0.96252635
## [3,] 0.7863692 1.39122370
## [4,] 1.1807407 2.32800680
## [5,] 0.4483365 -0.03776747
## [6,] 0.3356589 2.23274072
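#A quick sanity check of the standardization (a sketch, not in the original
#analysis): each scaled column should have mean ~0 and standard deviation 1
round(colMeans(df), 3)
apply(df, 2, sd)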
wssplot(df)

#Determine the best number of clusters with NbClust, storing the result in "nc"
library(NbClust)
set.seed(1234)
nc <- NbClust(df, min.nc=2, max.nc = 15, method = "kmeans")

## *** : The Hubert index is a graphical method of determining the number of clusters.
## In the plot of Hubert index, we seek a significant knee that corresponds to a
## significant increase of the value of the measure i.e the significant peak in Hubert
## index second differences plot.
##

## *** : The D index is a graphical method of determining the number of clusters.
## In the plot of D index, we seek a significant knee (the significant peak in Dindex
## second differences plot) that corresponds to a significant increase of the value of
## the measure.
##
## *******************************************************************
## * Among all indices:
## * 4 proposed 2 as the best number of clusters
## * 15 proposed 3 as the best number of clusters
## * 1 proposed 10 as the best number of clusters
## * 1 proposed 12 as the best number of clusters
## * 1 proposed 14 as the best number of clusters
## * 1 proposed 15 as the best number of clusters
##
## ***** Conclusion *****
##
## * According to the majority rule, the best number of clusters is 3
##
##
## *******************************************************************
print(table(nc$Best.n[1,]))
##
## 0 1 2 3 10 12 14 15
## 2 1 4 15 1 1 1 1
barplot(table(nc$Best.n[1,]), xlab = "Number of Clusters", ylab = "Number of Criteria",
main = "Number of Clusters Chosen by 26 Criteria")
#Conduct the k-Means analysis using the best number of clusters
set.seed(1234)
fit.km <- kmeans(df, 3, nstart=25)
print(fit.km$size)
## [1] 62 65 51
print(fit.km$centers)
## Alcohol Malic.acid Ash Acl Mg Phenols
## 1 0.8328826 -0.3029551 0.3636801 -0.6084749 0.57596208 0.88274724
## 2 -0.9234669 -0.3929331 -0.4931257 0.1701220 -0.49032869 -0.07576891
## 3 0.1644436 0.8690954 0.1863726 0.5228924 -0.07526047 -0.97657548
## Flavanoids Nonflavanoid.phenols Proanth Color.int Hue
## 1 0.97506900 -0.56050853 0.57865427 0.1705823 0.4726504
## 2 0.02075402 -0.03343924 0.05810161 -0.8993770 0.4605046
## 3 -1.21182921 0.72402116 -0.77751312 0.9388902 -1.1615122
## OD Proline
## 1 0.7770551 1.1220202
## 2 0.2700025 -0.7517257
## 3 -1.2887761 -0.4059428
print(aggregate(wine[-1], by=list(cluster=fit.km$cluster), mean))
## cluster Alcohol Malic.acid Ash Acl Mg Phenols
## 1 1 13.67677 1.997903 2.466290 17.46290 107.96774 2.847581
## 2 2 12.25092 1.897385 2.231231 20.06308 92.73846 2.247692
## 3 3 13.13412 3.307255 2.417647 21.24118 98.66667 1.683922
## Flavanoids Nonflavanoid.phenols Proanth Color.int Hue OD
## 1 3.0032258 0.2920968 1.922097 5.453548 1.0654839 3.163387
## 2 2.0500000 0.3576923 1.624154 2.973077 1.0627077 2.803385
## 3 0.8188235 0.4519608 1.145882 7.234706 0.6919608 1.696667
## Proline
## 1 1100.2258
## 2 510.1692
## 3 619.0588
#Use a confusion or truth table to evaluate how well the k-Means analysis performed
ct.km <- table(wine$Wine, fit.km$cluster)
print(ct.km)
##
## 1 2 3
## 1 59 0 0
## 2 3 65 3
## 3 0 0 48
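#To quantify the agreement between the clusters and the true wine types, one
#option (assuming the flexclust package is installed) is the adjusted Rand
#index, which ranges from -1 (no agreement) to 1 (perfect agreement)
library(flexclust)
randIndex(ct.km)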
#Generate a plot of the clusters
library(cluster)
clusplot(df, fit.km$cluster, main='2D representation of the Cluster solution',
color=TRUE, shade=TRUE,
labels=2, lines=0)

Part 3
#Set up to train a classification model for the wines
library(rpart)
#Prepend the k-means cluster assignment as the class label "k"
df <- data.frame(k=fit.km$cluster, df)
str(df)
## 'data.frame': 178 obs. of 14 variables:
## $ k : int 1 1 1 1 1 1 1 1 1 1 ...
## $ Alcohol : num 1.514 0.246 0.196 1.687 0.295 ...
## $ Malic.acid : num -0.5607 -0.498 0.0212 -0.3458 0.2271 ...
## $ Ash : num 0.231 -0.826 1.106 0.487 1.835 ...
## $ Acl : num -1.166 -2.484 -0.268 -0.807 0.451 ...
## $ Mg : num 1.9085 0.0181 0.0881 0.9283 1.2784 ...
## $ Phenols : num 0.807 0.567 0.807 2.484 0.807 ...
## $ Flavanoids : num 1.032 0.732 1.212 1.462 0.661 ...
## $ Nonflavanoid.phenols: num -0.658 -0.818 -0.497 -0.979 0.226 ...
## $ Proanth : num 1.221 -0.543 2.13 1.029 0.4 ...
## $ Color.int : num 0.251 -0.292 0.268 1.183 -0.318 ...
## $ Hue : num 0.361 0.405 0.317 -0.426 0.361 ...
## $ OD : num 1.843 1.11 0.786 1.181 0.448 ...
## $ Proline : num 1.0102 0.9625 1.3912 2.328 -0.0378 ...
#Randomize the dataset
rdf <- df[sample(1:nrow(df)), ]
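#Note: sample() is random, so a set.seed() call just before the line above
#would be needed to make the split and the confusion table below reproducible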
print(head(rdf))
## k Alcohol Malic.acid Ash Acl Mg Phenols
## 93 2 -0.3826162 -0.7217931 -0.38826018 0.3608424 -1.3822227 -1.462188745
## 69 2 0.4180475 -1.2499245 -0.02375431 -0.7470867 0.7182523 0.375309174
## 13 1 0.9230815 -0.5427655 0.15849862 -1.0465271 -0.7520802 0.487156874
## 57 1 1.5020229 -0.5696196 -0.24245783 -0.9566950 1.2783790 1.445851440
## 117 2 -1.4542737 -0.7755014 -1.37242601 0.3907864 -0.9621277 -0.503494178
## 161 3 -0.7891070 1.3370245 0.04914686 0.4506745 -0.8220960 0.007809591
## Flavanoids Nonflavanoid.phenols Proanth Color.int Hue
## 93 -0.5699201 1.7528342 0.05084419 -0.8661960 0.01115870
## 69 -0.7301029 1.5117800 -2.04574255 -0.8144336 0.27365854
## 13 0.7315653 -0.5773564 0.38280376 0.2337547 0.84240820
## 57 0.9718395 -0.8184106 0.76717799 0.5702101 -0.07634125
## 117 -0.4297602 -0.4970050 -0.10639981 -1.3406845 -0.03259127
## 161 -1.1105371 1.1100230 -0.96250606 1.1180287 -1.73884025
## OD Proline
## 93 -0.7770322 -0.799896093
## 69 -0.9601332 0.009865569
## 13 0.4060824 1.819921051
## 57 0.9835550 0.708483475
## 117 1.0117244 -0.799896093
## 161 -1.4530976 -0.720507695
#Hold out roughly the last 20% of the shuffled rows as a test set
train <- rdf[1:(as.integer(.8*nrow(rdf))-1), ]
test <- rdf[(as.integer(.8*nrow(rdf))):nrow(rdf), ]
#Train the classifier and plot the results
fit <- rpart(k ~ ., data=train, method="class")
library(rpart.plot)
library(RColorBrewer)
library(rattle)
fancyRpartPlot(fit)

#Now use the predict() function to see how well the model works
pred <- predict(fit, test, type="class")
print(table(pred, test$k))
##
## pred 1 2 3
## 1 11 1 0
## 2 0 12 1
## 3 0 1 11
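#A quick sketch of overall accuracy from the confusion table above: correct
#predictions (the diagonal) divided by the number of test cases
tree.tbl <- table(pred, test$k)
sum(diag(tree.tbl)) / sum(tree.tbl)
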
Part 4
wbcd<-read.csv("wisc_bc_data.csv")
str(wbcd)
## 'data.frame': 569 obs. of 32 variables:
## $ id : int 87139402 8910251 905520 868871 9012568 906539 925291 87880 862989 89827 ...
## $ diagnosis : Factor w/ 2 levels "B","M": 1 1 1 1 1 1 1 2 1 1 ...
## $ radius_mean : num 12.3 10.6 11 11.3 15.2 ...
## $ texture_mean : num 12.4 18.9 16.8 13.4 13.2 ...
## $ perimeter_mean : num 78.8 69.3 70.9 73 97.7 ...
## $ area_mean : num 464 346 373 385 712 ...
## $ smoothness_mean : num 0.1028 0.0969 0.1077 0.1164 0.0796 ...
## $ compactness_mean : num 0.0698 0.1147 0.078 0.1136 0.0693 ...
## $ concavity_mean : num 0.0399 0.0639 0.0305 0.0464 0.0339 ...
## $ points_mean : num 0.037 0.0264 0.0248 0.048 0.0266 ...
## $ symmetry_mean : num 0.196 0.192 0.171 0.177 0.172 ...
## $ dimension_mean : num 0.0595 0.0649 0.0634 0.0607 0.0554 ...
## $ radius_se : num 0.236 0.451 0.197 0.338 0.178 ...
## $ texture_se : num 0.666 1.197 1.387 1.343 0.412 ...
## $ perimeter_se : num 1.67 3.43 1.34 1.85 1.34 ...
## $ area_se : num 17.4 27.1 13.5 26.3 17.7 ...
## $ smoothness_se : num 0.00805 0.00747 0.00516 0.01127 0.00501 ...
## $ compactness_se : num 0.0118 0.03581 0.00936 0.03498 0.01485 ...
## $ concavity_se : num 0.0168 0.0335 0.0106 0.0219 0.0155 ...
## $ points_se : num 0.01241 0.01365 0.00748 0.01965 0.00915 ...
## $ symmetry_se : num 0.0192 0.035 0.0172 0.0158 0.0165 ...
## $ dimension_se : num 0.00225 0.00332 0.0022 0.00344 0.00177 ...
## $ radius_worst : num 13.5 11.9 12.4 11.9 16.2 ...
## $ texture_worst : num 15.6 22.9 26.4 15.8 15.7 ...
## $ perimeter_worst : num 87 78.3 79.9 76.5 104.5 ...
## $ area_worst : num 549 425 471 434 819 ...
## $ smoothness_worst : num 0.139 0.121 0.137 0.137 0.113 ...
## $ compactness_worst: num 0.127 0.252 0.148 0.182 0.174 ...
## $ concavity_worst : num 0.1242 0.1916 0.1067 0.0867 0.1362 ...
## $ points_worst : num 0.0939 0.0793 0.0743 0.0861 0.0818 ...
## $ symmetry_worst : num 0.283 0.294 0.3 0.21 0.249 ...
## $ dimension_worst : num 0.0677 0.0759 0.0788 0.0678 0.0677 ...
#Drop the first attribute (the patient ID) and preprocess the data
wbcd <- wbcd[-1]
table(wbcd$diagnosis)
##
## B M
## 357 212
wbcd$diagnosis <- factor(wbcd$diagnosis, levels = c("B", "M"), labels = c("Benign", "Malignant"))
round(prop.table(table(wbcd$diagnosis)) * 100, digits = 1)
##
## Benign Malignant
## 62.7 37.3
#Min-max normalization: rescale a numeric vector to the [0, 1] range
normalize <- function(x)
{
  return((x - min(x)) / (max(x) - min(x)))
}
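#A quick sanity check on toy vectors (a sketch): normalize() should map any
#numeric vector onto [0, 1] regardless of the original scale
normalize(c(1, 2, 3, 4, 5))
normalize(c(10, 20, 30, 40, 50))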
wbcd_n <- as.data.frame(lapply(wbcd[2:31], normalize))
summary(wbcd_n)
## radius_mean texture_mean perimeter_mean area_mean
## Min. :0.0000 Min. :0.0000 Min. :0.0000 Min. :0.0000
## 1st Qu.:0.2233 1st Qu.:0.2185 1st Qu.:0.2168 1st Qu.:0.1174
## Median :0.3024 Median :0.3088 Median :0.2933 Median :0.1729
## Mean :0.3382 Mean :0.3240 Mean :0.3329 Mean :0.2169
## 3rd Qu.:0.4164 3rd Qu.:0.4089 3rd Qu.:0.4168 3rd Qu.:0.2711
## Max. :1.0000 Max. :1.0000 Max. :1.0000 Max. :1.0000
## smoothness_mean compactness_mean concavity_mean points_mean
## Min. :0.0000 Min. :0.0000 Min. :0.00000 Min. :0.0000
## 1st Qu.:0.3046 1st Qu.:0.1397 1st Qu.:0.06926 1st Qu.:0.1009
## Median :0.3904 Median :0.2247 Median :0.14419 Median :0.1665
## Mean :0.3948 Mean :0.2606 Mean :0.20806 Mean :0.2431
## 3rd Qu.:0.4755 3rd Qu.:0.3405 3rd Qu.:0.30623 3rd Qu.:0.3678
## Max. :1.0000 Max. :1.0000 Max. :1.00000 Max. :1.0000
## symmetry_mean dimension_mean radius_se texture_se
## Min. :0.0000 Min. :0.0000 Min. :0.00000 Min. :0.0000
## 1st Qu.:0.2823 1st Qu.:0.1630 1st Qu.:0.04378 1st Qu.:0.1047
## Median :0.3697 Median :0.2439 Median :0.07702 Median :0.1653
## Mean :0.3796 Mean :0.2704 Mean :0.10635 Mean :0.1893
## 3rd Qu.:0.4530 3rd Qu.:0.3404 3rd Qu.:0.13304 3rd Qu.:0.2462
## Max. :1.0000 Max. :1.0000 Max. :1.00000 Max. :1.0000
## perimeter_se area_se smoothness_se compactness_se
## Min. :0.00000 Min. :0.00000 Min. :0.0000 Min. :0.00000
## 1st Qu.:0.04000 1st Qu.:0.02064 1st Qu.:0.1175 1st Qu.:0.08132
## Median :0.07209 Median :0.03311 Median :0.1586 Median :0.13667
## Mean :0.09938 Mean :0.06264 Mean :0.1811 Mean :0.17444
## 3rd Qu.:0.12251 3rd Qu.:0.07170 3rd Qu.:0.2187 3rd Qu.:0.22680
## Max. :1.00000 Max. :1.00000 Max. :1.0000 Max. :1.00000
## concavity_se points_se symmetry_se dimension_se
## Min. :0.00000 Min. :0.0000 Min. :0.0000 Min. :0.00000
## 1st Qu.:0.03811 1st Qu.:0.1447 1st Qu.:0.1024 1st Qu.:0.04675
## Median :0.06538 Median :0.2070 Median :0.1526 Median :0.07919
## Mean :0.08054 Mean :0.2235 Mean :0.1781 Mean :0.10019
## 3rd Qu.:0.10619 3rd Qu.:0.2787 3rd Qu.:0.2195 3rd Qu.:0.12656
## Max. :1.00000 Max. :1.0000 Max. :1.0000 Max. :1.00000
## radius_worst texture_worst perimeter_worst area_worst
## Min. :0.0000 Min. :0.0000 Min. :0.0000 Min. :0.00000
## 1st Qu.:0.1807 1st Qu.:0.2415 1st Qu.:0.1678 1st Qu.:0.08113
## Median :0.2504 Median :0.3569 Median :0.2353 Median :0.12321
## Mean :0.2967 Mean :0.3640 Mean :0.2831 Mean :0.17091
## 3rd Qu.:0.3863 3rd Qu.:0.4717 3rd Qu.:0.3735 3rd Qu.:0.22090
## Max. :1.0000 Max. :1.0000 Max. :1.0000 Max. :1.00000
## smoothness_worst compactness_worst concavity_worst points_worst
## Min. :0.0000 Min. :0.0000 Min. :0.00000 Min. :0.0000
## 1st Qu.:0.3000 1st Qu.:0.1163 1st Qu.:0.09145 1st Qu.:0.2231
## Median :0.3971 Median :0.1791 Median :0.18107 Median :0.3434
## Mean :0.4041 Mean :0.2202 Mean :0.21740 Mean :0.3938
## 3rd Qu.:0.4942 3rd Qu.:0.3025 3rd Qu.:0.30583 3rd Qu.:0.5546
## Max. :1.0000 Max. :1.0000 Max. :1.00000 Max. :1.0000
## symmetry_worst dimension_worst
## Min. :0.0000 Min. :0.0000
## 1st Qu.:0.1851 1st Qu.:0.1077
## Median :0.2478 Median :0.1640
## Mean :0.2633 Mean :0.1896
## 3rd Qu.:0.3182 3rd Qu.:0.2429
## Max. :1.0000 Max. :1.0000
#Splitting data into train and test
wbcd_train <- wbcd_n[1:456, ]
wbcd_test <- wbcd_n[457:569, ]
wbcd_train_labels <- wbcd[1:456, 1]
wbcd_test_labels <- wbcd[457:569, 1]
#Using kNN to classify the test set
#Use k = 21 neighbors, roughly the square root of the number of training records
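#A quick check of that heuristic: the square root of the 456 training rows is
#about 21.35, rounded to the odd value 21
sqrt(nrow(wbcd_train))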
library(class)
wbcd_test_pred <- knn(train = wbcd_train, test = wbcd_test, cl=wbcd_train_labels, k = 21)
(wbcd_tbl <- table(wbcd_test_pred, wbcd_test_labels))
## wbcd_test_labels
## wbcd_test_pred Benign Malignant
## Benign 69 2
## Malignant 0 42
(Accuracy <- (wbcd_tbl[1]+wbcd_tbl[4])/sum(wbcd_tbl)*100)
## [1] 98.23009
#Applying kNN to the online news popularity data
news <- read.csv("OnlineNewsPopularity.csv")
#Keep a short list of candidate predictors plus the shares column
keep <- c("n_tokens_title", "n_tokens_content", "n_unique_tokens", "n_non_stop_words",
          "num_hrefs", "num_imgs", "num_videos", "average_token_length", "num_keywords",
          "kw_max_max", "global_sentiment_polarity", "avg_positive_polarity",
          "title_subjectivity", "title_sentiment_polarity", "abs_title_subjectivity",
          "abs_title_sentiment_polarity", "shares")
newsShort <- news[, keep]
#Flag an article as popular if it received at least 1400 shares
newsShort$popular <- ifelse(newsShort$shares >= 1400, "yes", "no")
#Replace the raw share counts with the yes/no factor and drop the helper column
newsShort$shares <- as.factor(newsShort$popular)
newsShort <- newsShort[-18]
#Normalize the 16 predictors with the normalize() function defined earlier
news_n <- as.data.frame(lapply(newsShort[1:16], normalize))
news_train <- news_n[1:9000, ]
news_test <- news_n[9001:10000, ]
news_train_labels <- newsShort[1:9000, 17]
news_test_labels <- newsShort[9001:10000, 17]
#Trying with k = 10
news_test_pred <- knn(train = news_train, test = news_test, cl=news_train_labels, k = 10)
(news_tbl <- table(news_test_pred, news_test_labels))
## news_test_labels
## news_test_pred no yes
## no 229 166
## yes 318 287
(Accuracy <- (news_tbl[1]+news_tbl[4])/sum(news_tbl)*100)
## [1] 51.6
#Trying with k = 20
news_test_pred <- knn(train = news_train, test = news_test, cl=news_train_labels, k = 20)
(news_tbl <- table(news_test_pred, news_test_labels))
## news_test_labels
## news_test_pred no yes
## no 198 141
## yes 349 312
(Accuracy <- (news_tbl[1]+news_tbl[4])/sum(news_tbl)*100)
## [1] 51
#Trying with k = 30
news_test_pred <- knn(train = news_train, test = news_test, cl=news_train_labels, k = 30)
(news_tbl <- table(news_test_pred, news_test_labels))
## news_test_labels
## news_test_pred no yes
## no 168 120
## yes 379 333
(Accuracy <- (news_tbl[1]+news_tbl[4])/sum(news_tbl)*100)
## [1] 50.1
#Trying with k = 40
news_test_pred <- knn(train = news_train, test = news_test, cl=news_train_labels, k = 40)
(news_tbl <- table(news_test_pred, news_test_labels))
## news_test_labels
## news_test_pred no yes
## no 158 92
## yes 389 361
(Accuracy <- (news_tbl[1]+news_tbl[4])/sum(news_tbl)*100)
## [1] 51.9
#Trying with k = 50
news_test_pred <- knn(train = news_train, test = news_test, cl=news_train_labels, k = 50)
(news_tbl <- table(news_test_pred, news_test_labels))
## news_test_labels
## news_test_pred no yes
## no 132 80
## yes 415 373
(Accuracy <- (news_tbl[1]+news_tbl[4])/sum(news_tbl)*100)
## [1] 50.5
#Accuracy did not really improve for the different values of k
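#A compact sketch that automates the sweep above instead of copy-pasting:
#loop over candidate values of k and report the accuracy for each
for (k in seq(10, 50, by = 10))
{
  pred <- knn(train = news_train, test = news_test, cl = news_train_labels, k = k)
  tbl <- table(pred, news_test_labels)
  cat("k =", k, "accuracy =", round(sum(diag(tbl)) / sum(tbl) * 100, 1), "\n")
}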