Anly 530 Lab3

#Data Pre-processing
library(readxl)
Wholesale <- read.csv("C:/Users/NehaKatti/Desktop/Wholesale_customers_data.csv")
str(Wholesale)

## 'data.frame':    440 obs. of  8 variables:
##  $ Channel         : int  2 2 2 1 2 2 2 2 1 2 ...
##  $ Region          : int  3 3 3 3 3 3 3 3 3 3 ...
##  $ Fresh           : int  12669 7057 6353 13265 22615 9413 12126 7579 5963 6006 ...
##  $ Milk            : int  9656 9810 8808 1196 5410 8259 3199 4956 3648 11093 ...
##  $ Grocery         : int  7561 9568 7684 4221 7198 5126 6975 9426 6192 18881 ...
##  $ Frozen          : int  214 1762 2405 6404 3915 666 480 1669 425 1159 ...
##  $ Detergents_Paper: int  2674 3293 3516 507 1777 1795 3140 3321 1716 7425 ...
##  $ Delicassen      : int  1338 1776 7844 1788 5185 1451 545 2566 750 2098 ...

summary(Wholesale)

##     Channel          Region          Fresh             Milk      
##  Min.   :1.000   Min.   :1.000   Min.   :     3   Min.   :   55  
##  1st Qu.:1.000   1st Qu.:2.000   1st Qu.:  3128   1st Qu.: 1533  
##  Median :1.000   Median :3.000   Median :  8504   Median : 3627  
##  Mean   :1.323   Mean   :2.543   Mean   : 12000   Mean   : 5796  
##  3rd Qu.:2.000   3rd Qu.:3.000   3rd Qu.: 16934   3rd Qu.: 7190  
##  Max.   :2.000   Max.   :3.000   Max.   :112151   Max.   :73498  
##     Grocery          Frozen        Detergents_Paper    Delicassen     
##  Min.   :    3   Min.   :   25.0   Min.   :    3.0   Min.   :    3.0  
##  1st Qu.: 2153   1st Qu.:  742.2   1st Qu.:  256.8   1st Qu.:  408.2  
##  Median : 4756   Median : 1526.0   Median :  816.5   Median :  965.5  
##  Mean   : 7951   Mean   : 3071.9   Mean   : 2881.5   Mean   : 1524.9  
##  3rd Qu.:10656   3rd Qu.: 3554.2   3rd Qu.: 3922.0   3rd Qu.: 1820.2  
##  Max.   :92780   Max.   :60869.0   Max.   :40827.0   Max.   :47943.0

data<-data.frame(Wholesale)
top.custs <- function (data,cols,n=5) { #Requires some data frame and the top N to remove
  idx.to.remove <-integer(0) #Initialize a vector to hold customers being removed
  for (c in cols){ # For every column in the data we passed to this function
    col.order <-order(data[,c],decreasing=T) #Sort column "c" in descending order (bigger on top)
    #Order returns the sorted index (e.g. row 15, 3, 7, 1, ...) rather than the actual values sorted.
    idx <-head(col.order, n) #Take the first n of the sorted column C to
    idx.to.remove <-union(idx.to.remove,idx) #Combine and de-duplicate the row ids that need to be   removed
  }
  return(idx.to.remove) #Return the indexes of customers to be removed
}

top.cus<-top.custs(data,cols=3:8,n=5)
length(top.cus)

## [1] 19

#Examine the customers
data[top.cus,]

##     Channel Region  Fresh  Milk Grocery Frozen Detergents_Paper Delicassen
## 182       1      3 112151 29627   18148  16745             4948       8550
## 126       1      3  76237  3473    7102  16538              778        918
## 285       1      3  68951  4411   12609   8692              751       2406
## 40        1      3  56159   555     902  10002              212       2916
## 259       1      1  56083  4563    2124   6422              730       3321
## 87        2      3  22925 73498   32114    987            20070        903
## 48        2      3  44466 54259   55571   7782            24171       6465
## 86        2      3  16117 46197   92780   1026            40827       2944
## 184       1      3  36847 43950   20170  36534              239      47943
## 62        2      3  35942 38369   59598   3254            26701       2017
## 334       2      2   8565  4980   67298    131            38102       1215
## 66        2      3     85 20959   45828     36            24231       1423
## 326       1      2  32717 16784   13626  60869             1272       5609
## 94        1      3  11314  3090    2062  35009               71       2698
## 197       1      1  30624  7209    4897  18711              763       2876
## 104       1      3  56082  3504    8906  18028             1480       2498
## 24        2      3  26373 36423   22019   5154             4337      16523
## 72        1      3  18291  1266   21042   5373             4173      14472
## 88        1      3  43265  5025    8117   6312             1579      14351

#Remove the customers and check the other data
data.r.top<-data[-c(top.cus),]
summary(data.r.top)

##     Channel          Region          Fresh            Milk      
##  Min.   :1.000   Min.   :1.000   Min.   :    3   Min.   :   55  
##  1st Qu.:1.000   1st Qu.:2.000   1st Qu.: 3067   1st Qu.: 1492  
##  Median :1.000   Median :3.000   Median : 8040   Median : 3587  
##  Mean   :1.321   Mean   :2.537   Mean   :10753   Mean   : 5112  
##  3rd Qu.:2.000   3rd Qu.:3.000   3rd Qu.:15615   3rd Qu.: 7027  
##  Max.   :2.000   Max.   :3.000   Max.   :53205   Max.   :29892  
##     Grocery          Frozen      Detergents_Paper   Delicassen  
##  Min.   :    3   Min.   :   25   Min.   :    3    Min.   :   3  
##  1st Qu.: 2146   1st Qu.:  688   1st Qu.:  256    1st Qu.: 396  
##  Median : 4602   Median : 1457   Median :  811    Median : 898  
##  Mean   : 7135   Mean   : 2599   Mean   : 2547    Mean   :1261  
##  3rd Qu.: 9965   3rd Qu.: 3242   3rd Qu.: 3843    3rd Qu.:1697  
##  Max.   :39694   Max.   :17866   Max.   :19410    Max.   :7844

##Designing the model
set.seed(76964057) #Set the seed for reproducibility
n <-kmeans(data.r.top[,-c(1,2)], centers=5) #Create 5 clusters, Remove columns 1 and 2
n$centers #Display&nbsp;cluster centers

##       Fresh      Milk   Grocery   Frozen Detergents_Paper Delicassen
## 1  5830.214 15295.048 23449.167 1936.452       10361.6429   1912.738
## 2 18649.606  3335.586  4497.848 3301.747        1046.5859   1450.566
## 3  5845.392  2337.319  2878.205 2766.596         660.2952    858.994
## 4  4238.892  7725.289 11011.747 1336.566        4733.3614   1400.530
## 5 35922.387  4851.806  5862.581 3730.677        1004.6129   1552.161

##Optimum value of k
r<-2:20 #K from 2 to 20
tri <-100 #Run the K Means algorithm 100 times
avg.totw.ss <-integer(length(r)) #Set up an empty vector to hold all of points
for(v in r){ # For each value of the range variable
  v.totw.ss <-integer(tri) #Set up an empty vector to hold the 100 tries
  for(i in 1:tri){
    n.temp <-kmeans(data.r.top,centers=v) #Run kmeans
    v.totw.ss[i] <-n.temp$tot.withinss#Store the total withinss
  }
  avg.totw.ss[v-1] <-mean(v.totw.ss) #Average the 100 total withinss
}

## Warning: did not converge in 10 iterations

plot(r,avg.totw.ss,type="b", main="Total Within SS by Various K",
     ylab="Average Total Within Sum of Squares",
     xlab="Value of K")

#Part2
wine <- read.csv("C:/Users/NehaKatti/Desktop/wine.csv")
str(wine)

## 'data.frame':    178 obs. of  14 variables:
##  $ Wine                : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ Alcohol             : num  14.2 13.2 13.2 14.4 13.2 ...
##  $ Malic.acid          : num  1.71 1.78 2.36 1.95 2.59 1.76 1.87 2.15 1.64 1.35 ...
##  $ Ash                 : num  2.43 2.14 2.67 2.5 2.87 2.45 2.45 2.61 2.17 2.27 ...
##  $ Acl                 : num  15.6 11.2 18.6 16.8 21 15.2 14.6 17.6 14 16 ...
##  $ Mg                  : int  127 100 101 113 118 112 96 121 97 98 ...
##  $ Phenols             : num  2.8 2.65 2.8 3.85 2.8 3.27 2.5 2.6 2.8 2.98 ...
##  $ Flavanoids          : num  3.06 2.76 3.24 3.49 2.69 3.39 2.52 2.51 2.98 3.15 ...
##  $ Nonflavanoid.phenols: num  0.28 0.26 0.3 0.24 0.39 0.34 0.3 0.31 0.29 0.22 ...
##  $ Proanth             : num  2.29 1.28 2.81 2.18 1.82 1.97 1.98 1.25 1.98 1.85 ...
##  $ Color.int           : num  5.64 4.38 5.68 7.8 4.32 6.75 5.25 5.05 5.2 7.22 ...
##  $ Hue                 : num  1.04 1.05 1.03 0.86 1.04 1.05 1.02 1.06 1.08 1.01 ...
##  $ OD                  : num  3.92 3.4 3.17 3.45 2.93 2.85 3.58 3.58 2.85 3.55 ...
##  $ Proline             : int  1065 1050 1185 1480 735 1450 1290 1295 1045 1045 ...

wssplot <- function(data, nc=15, seed=1234){
wss <- (nrow(data)-1)*sum(apply(data,2,var))
for (i in 2:nc){
set.seed(seed)
wss[i] <- sum(kmeans(data, centers=i)$withinss)}
plot(1:nc, wss, type="b", xlab="Number of Clusters",
ylab="Within groups sum of squares")}

df <- scale(wine[-1])
head(df)

##        Alcohol  Malic.acid        Ash        Acl         Mg   Phenols
## [1,] 1.5143408 -0.56066822  0.2313998 -1.1663032 1.90852151 0.8067217
## [2,] 0.2455968 -0.49800856 -0.8256672 -2.4838405 0.01809398 0.5670481
## [3,] 0.1963252  0.02117152  1.1062139 -0.2679823 0.08810981 0.8067217
## [4,] 1.6867914 -0.34583508  0.4865539 -0.8069748 0.92829983 2.4844372
## [5,] 0.2948684  0.22705328  1.8352256  0.4506745 1.27837900 0.8067217
## [6,] 1.4773871 -0.51591132  0.3043010 -1.2860793 0.85828399 1.5576991
##      Flavanoids Nonflavanoid.phenols    Proanth  Color.int        Hue
## [1,]  1.0319081           -0.6577078  1.2214385  0.2510088  0.3611585
## [2,]  0.7315653           -0.8184106 -0.5431887 -0.2924962  0.4049085
## [3,]  1.2121137           -0.4970050  2.1299594  0.2682629  0.3174085
## [4,]  1.4623994           -0.9791134  1.0292513  1.1827317 -0.4263410
## [5,]  0.6614853            0.2261576  0.4002753 -0.3183774  0.3611585
## [6,]  1.3622851           -0.1755994  0.6623487  0.7298108  0.4049085
##             OD     Proline
## [1,] 1.8427215  1.01015939
## [2,] 1.1103172  0.96252635
## [3,] 0.7863692  1.39122370
## [4,] 1.1807407  2.32800680
## [5,] 0.4483365 -0.03776747
## [6,] 0.3356589  2.23274072

##Start the k-Means analysis using the variable "nc" for the number of clusters
library(NbClust)
set.seed(1234)
nc <- NbClust(df, min.nc=2, max.nc = 15, method = "kmeans")

## *** : The Hubert index is a graphical method of determining the number of clusters.
##                 In the plot of Hubert index, we seek a significant knee that corresponds to a 
##                 significant increase of the value of the measure i.e the significant peak in Hubert
##                 index second differences plot. 
##

## *** : The D index is a graphical method of determining the number of clusters. 
##                 In the plot of D index, we seek a significant knee (the significant peak in Dindex
##                 second differences plot) that corresponds to a significant increase of the value of
##                 the measure. 
##  
## ******************************************************************* 
## * Among all indices:                                                
## * 2 proposed 2 as the best number of clusters 
## * 19 proposed 3 as the best number of clusters 
## * 1 proposed 14 as the best number of clusters 
## * 1 proposed 15 as the best number of clusters 
## 
##                    ***** Conclusion *****                            
##  
## * According to the majority rule, the best number of clusters is  3 
##  
##  
## *******************************************************************

print(table(nc$Best.n[1,]))

## 
##  0  1  2  3 14 15 
##  2  1  2 19  1  1

barplot(table(nc$Best.n[1,]), xlab = "Number of Clusters", ylab = "Number of Criteria", main = "Number of Clusters Chosen by 26 Criteria")

#Enter the best number of clusters based on the information in the table and barplot
n <- readline(prompt = "Enter the best number of clusters: ")

## Enter the best number of clusters:

n <- as.integer(n)

##Conduct the k-Means analysis using the best number of clusters
set.seed(1234)
fit.km <- kmeans(df, 3, nstart=25)
print(fit.km$size)

## [1] 62 65 51

print(fit.km$centers)

##      Alcohol Malic.acid        Ash        Acl          Mg     Phenols
## 1  0.8328826 -0.3029551  0.3636801 -0.6084749  0.57596208  0.88274724
## 2 -0.9234669 -0.3929331 -0.4931257  0.1701220 -0.49032869 -0.07576891
## 3  0.1644436  0.8690954  0.1863726  0.5228924 -0.07526047 -0.97657548
##    Flavanoids Nonflavanoid.phenols     Proanth  Color.int        Hue
## 1  0.97506900          -0.56050853  0.57865427  0.1705823  0.4726504
## 2  0.02075402          -0.03343924  0.05810161 -0.8993770  0.4605046
## 3 -1.21182921           0.72402116 -0.77751312  0.9388902 -1.1615122
##           OD    Proline
## 1  0.7770551  1.1220202
## 2  0.2700025 -0.7517257
## 3 -1.2887761 -0.4059428

print(aggregate(wine[-1], by=list(cluster=fit.km$cluster), mean))

##   cluster  Alcohol Malic.acid      Ash      Acl        Mg  Phenols
## 1       1 13.67677   1.997903 2.466290 17.46290 107.96774 2.847581
## 2       2 12.25092   1.897385 2.231231 20.06308  92.73846 2.247692
## 3       3 13.13412   3.307255 2.417647 21.24118  98.66667 1.683922
##   Flavanoids Nonflavanoid.phenols  Proanth Color.int       Hue       OD
## 1  3.0032258            0.2920968 1.922097  5.453548 1.0654839 3.163387
## 2  2.0500000            0.3576923 1.624154  2.973077 1.0627077 2.803385
## 3  0.8188235            0.4519608 1.145882  7.234706 0.6919608 1.696667
##     Proline
## 1 1100.2258
## 2  510.1692
## 3  619.0588

ct.km <- table(wine$Wine, fit.km$cluster)
print(ct.km)

##    
##      1  2  3
##   1 59  0  0
##   2  3 65  3
##   3  0  0 48

##Generate a plot of the clusters
library(cluster)
clusplot(df, fit.km$cluster, main='2D representation of the Cluster solution', color=TRUE, shade=TRUE,labels=2, lines=0)

# Part 3
##Set-up to train a model for classification of wines
library(rpart)
df <- data.frame(k=fit.km$cluster, df)
print(str(df))

## 'data.frame':    178 obs. of  14 variables:
##  $ k                   : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ Alcohol             : num  1.514 0.246 0.196 1.687 0.295 ...
##  $ Malic.acid          : num  -0.5607 -0.498 0.0212 -0.3458 0.2271 ...
##  $ Ash                 : num  0.231 -0.826 1.106 0.487 1.835 ...
##  $ Acl                 : num  -1.166 -2.484 -0.268 -0.807 0.451 ...
##  $ Mg                  : num  1.9085 0.0181 0.0881 0.9283 1.2784 ...
##  $ Phenols             : num  0.807 0.567 0.807 2.484 0.807 ...
##  $ Flavanoids          : num  1.032 0.732 1.212 1.462 0.661 ...
##  $ Nonflavanoid.phenols: num  -0.658 -0.818 -0.497 -0.979 0.226 ...
##  $ Proanth             : num  1.221 -0.543 2.13 1.029 0.4 ...
##  $ Color.int           : num  0.251 -0.292 0.268 1.183 -0.318 ...
##  $ Hue                 : num  0.361 0.405 0.317 -0.426 0.361 ...
##  $ OD                  : num  1.843 1.11 0.786 1.181 0.448 ...
##  $ Proline             : num  1.0102 0.9625 1.3912 2.328 -0.0378 ...
## NULL

##Randomize the dataset
rdf <- df[sample(1:nrow(df)), ]
print(head(rdf))

##     k    Alcohol Malic.acid        Ash        Acl          Mg    Phenols
## 127 2 -0.7028817 -0.7217931 -0.2789084  0.6003946 -0.96212770  0.7108523
## 155 3 -0.5181131 -0.9366262 -0.9714696  0.1512342  0.22814148 -1.3024063
## 72  2  1.0585784 -0.7396958  1.1062139  1.6484357 -0.96212770  1.0463954
## 48  1  1.1078500 -0.5875224 -0.8985684 -1.0465271  0.08810981  1.2860690
## 60  2 -0.7767891 -1.2499245 -3.6688130 -2.6635047 -0.82209603 -0.5034942
## 36  1  0.5904981 -0.4711544  0.1584986  0.3009543  0.01809398  0.6469393
##     Flavanoids Nonflavanoid.phenols    Proanth   Color.int        Hue
## 127  1.1220109            0.2261576  0.3129175 -0.48229163 -1.1700906
## 155 -1.4509256            1.3510772 -0.3335300  1.09646103 -1.6513403
## 72   0.8316795           -1.2201676  0.4876331 -0.72384942  1.7611577
## 48   1.3622851           -1.2201676  0.9593651  0.44943125 -0.2075912
## 60  -1.4609371           -0.6577078 -2.0457425 -1.34068448  0.4049085
## 36   0.9518167           -0.8184106  0.4701615  0.01807806  0.3611585
##             OD    Proline
## 127  0.3215742 -1.2539977
## 155 -1.4953517 -0.3394434
## 72   0.7722845 -1.0698166
## 48   1.0117244  0.7561165
## 60  -1.1150649 -0.7205077
## 36   1.2089101  0.5497067

train <- rdf[1:(as.integer(.8*nrow(rdf))-1), ]
test <- rdf[(as.integer(.8*nrow(rdf))):nrow(rdf), ]

##Train the classifier and plot the results
fit <- rpart(k ~ ., data=train, method="class")
library(rpart.plot)

## Warning: package 'rpart.plot' was built under R version 3.6.2

library(RColorBrewer)
library(rattle)

## Warning: package 'rattle' was built under R version 3.6.2

## Rattle: A free graphical interface for data science with R.
## Version 5.3.0 Copyright (c) 2006-2018 Togaware Pty Ltd.
## Type 'rattle()' to shake, rattle, and roll your data.

## 
## Attaching package: 'rattle'

## The following object is masked _by_ '.GlobalEnv':
## 
##     wine

fancyRpartPlot(fit)

##Now use the predict() function to see how well the model works
pred<-predict(fit, test, type="class")
print(table(pred, test$k))

##     
## pred  1  2  3
##    1 14  1  0
##    2  2  9  0
##    3  0  2  9

p<-table(pred, test$k)
Accuracy <- (sum(diag(p))/sum(p)*100)
Accuracy

## [1] 86.48649

# Part 4
##Q3
wbcd <- read.csv("C:/Users/NehaKatti/Desktop/wisc_bc_data.csv")
str(wbcd)

## 'data.frame':    569 obs. of  32 variables:
##  $ id               : int  87139402 8910251 905520 868871 9012568 906539 925291 87880 862989 89827 ...
##  $ diagnosis        : Factor w/ 2 levels "B","M": 1 1 1 1 1 1 1 2 1 1 ...
##  $ radius_mean      : num  12.3 10.6 11 11.3 15.2 ...
##  $ texture_mean     : num  12.4 18.9 16.8 13.4 13.2 ...
##  $ perimeter_mean   : num  78.8 69.3 70.9 73 97.7 ...
##  $ area_mean        : num  464 346 373 385 712 ...
##  $ smoothness_mean  : num  0.1028 0.0969 0.1077 0.1164 0.0796 ...
##  $ compactness_mean : num  0.0698 0.1147 0.078 0.1136 0.0693 ...
##  $ concavity_mean   : num  0.0399 0.0639 0.0305 0.0464 0.0339 ...
##  $ points_mean      : num  0.037 0.0264 0.0248 0.048 0.0266 ...
##  $ symmetry_mean    : num  0.196 0.192 0.171 0.177 0.172 ...
##  $ dimension_mean   : num  0.0595 0.0649 0.0634 0.0607 0.0554 ...
##  $ radius_se        : num  0.236 0.451 0.197 0.338 0.178 ...
##  $ texture_se       : num  0.666 1.197 1.387 1.343 0.412 ...
##  $ perimeter_se     : num  1.67 3.43 1.34 1.85 1.34 ...
##  $ area_se          : num  17.4 27.1 13.5 26.3 17.7 ...
##  $ smoothness_se    : num  0.00805 0.00747 0.00516 0.01127 0.00501 ...
##  $ compactness_se   : num  0.0118 0.03581 0.00936 0.03498 0.01485 ...
##  $ concavity_se     : num  0.0168 0.0335 0.0106 0.0219 0.0155 ...
##  $ points_se        : num  0.01241 0.01365 0.00748 0.01965 0.00915 ...
##  $ symmetry_se      : num  0.0192 0.035 0.0172 0.0158 0.0165 ...
##  $ dimension_se     : num  0.00225 0.00332 0.0022 0.00344 0.00177 ...
##  $ radius_worst     : num  13.5 11.9 12.4 11.9 16.2 ...
##  $ texture_worst    : num  15.6 22.9 26.4 15.8 15.7 ...
##  $ perimeter_worst  : num  87 78.3 79.9 76.5 104.5 ...
##  $ area_worst       : num  549 425 471 434 819 ...
##  $ smoothness_worst : num  0.139 0.121 0.137 0.137 0.113 ...
##  $ compactness_worst: num  0.127 0.252 0.148 0.182 0.174 ...
##  $ concavity_worst  : num  0.1242 0.1916 0.1067 0.0867 0.1362 ...
##  $ points_worst     : num  0.0939 0.0793 0.0743 0.0861 0.0818 ...
##  $ symmetry_worst   : num  0.283 0.294 0.3 0.21 0.249 ...
##  $ dimension_worst  : num  0.0677 0.0759 0.0788 0.0678 0.0677 ...

wbcd <- wbcd[-1]
table(wbcd$diagnosis)

## 
##   B   M 
## 357 212

wbcd$diagnosis <- factor(wbcd$diagnosis, levels = c("B", "M"), labels = c("Benign", "Malignant"))
round(prop.table(table(wbcd$diagnosis)) * 100, digits = 1)

## 
##    Benign Malignant 
##      62.7      37.3

normalize <- function(x) {
  return ((x - min(x)) / (max(x) - min(x)))
}
wbcd_n <- as.data.frame(lapply(wbcd[2:31], normalize))
summary(wbcd_n)

##   radius_mean      texture_mean    perimeter_mean     area_mean     
##  Min.   :0.0000   Min.   :0.0000   Min.   :0.0000   Min.   :0.0000  
##  1st Qu.:0.2233   1st Qu.:0.2185   1st Qu.:0.2168   1st Qu.:0.1174  
##  Median :0.3024   Median :0.3088   Median :0.2933   Median :0.1729  
##  Mean   :0.3382   Mean   :0.3240   Mean   :0.3329   Mean   :0.2169  
##  3rd Qu.:0.4164   3rd Qu.:0.4089   3rd Qu.:0.4168   3rd Qu.:0.2711  
##  Max.   :1.0000   Max.   :1.0000   Max.   :1.0000   Max.   :1.0000  
##  smoothness_mean  compactness_mean concavity_mean     points_mean    
##  Min.   :0.0000   Min.   :0.0000   Min.   :0.00000   Min.   :0.0000  
##  1st Qu.:0.3046   1st Qu.:0.1397   1st Qu.:0.06926   1st Qu.:0.1009  
##  Median :0.3904   Median :0.2247   Median :0.14419   Median :0.1665  
##  Mean   :0.3948   Mean   :0.2606   Mean   :0.20806   Mean   :0.2431  
##  3rd Qu.:0.4755   3rd Qu.:0.3405   3rd Qu.:0.30623   3rd Qu.:0.3678  
##  Max.   :1.0000   Max.   :1.0000   Max.   :1.00000   Max.   :1.0000  
##  symmetry_mean    dimension_mean     radius_se         texture_se    
##  Min.   :0.0000   Min.   :0.0000   Min.   :0.00000   Min.   :0.0000  
##  1st Qu.:0.2823   1st Qu.:0.1630   1st Qu.:0.04378   1st Qu.:0.1047  
##  Median :0.3697   Median :0.2439   Median :0.07702   Median :0.1653  
##  Mean   :0.3796   Mean   :0.2704   Mean   :0.10635   Mean   :0.1893  
##  3rd Qu.:0.4530   3rd Qu.:0.3404   3rd Qu.:0.13304   3rd Qu.:0.2462  
##  Max.   :1.0000   Max.   :1.0000   Max.   :1.00000   Max.   :1.0000  
##   perimeter_se        area_se        smoothness_se    compactness_se   
##  Min.   :0.00000   Min.   :0.00000   Min.   :0.0000   Min.   :0.00000  
##  1st Qu.:0.04000   1st Qu.:0.02064   1st Qu.:0.1175   1st Qu.:0.08132  
##  Median :0.07209   Median :0.03311   Median :0.1586   Median :0.13667  
##  Mean   :0.09938   Mean   :0.06264   Mean   :0.1811   Mean   :0.17444  
##  3rd Qu.:0.12251   3rd Qu.:0.07170   3rd Qu.:0.2187   3rd Qu.:0.22680  
##  Max.   :1.00000   Max.   :1.00000   Max.   :1.0000   Max.   :1.00000  
##   concavity_se       points_se       symmetry_se      dimension_se    
##  Min.   :0.00000   Min.   :0.0000   Min.   :0.0000   Min.   :0.00000  
##  1st Qu.:0.03811   1st Qu.:0.1447   1st Qu.:0.1024   1st Qu.:0.04675  
##  Median :0.06538   Median :0.2070   Median :0.1526   Median :0.07919  
##  Mean   :0.08054   Mean   :0.2235   Mean   :0.1781   Mean   :0.10019  
##  3rd Qu.:0.10619   3rd Qu.:0.2787   3rd Qu.:0.2195   3rd Qu.:0.12656  
##  Max.   :1.00000   Max.   :1.0000   Max.   :1.0000   Max.   :1.00000  
##   radius_worst    texture_worst    perimeter_worst    area_worst     
##  Min.   :0.0000   Min.   :0.0000   Min.   :0.0000   Min.   :0.00000  
##  1st Qu.:0.1807   1st Qu.:0.2415   1st Qu.:0.1678   1st Qu.:0.08113  
##  Median :0.2504   Median :0.3569   Median :0.2353   Median :0.12321  
##  Mean   :0.2967   Mean   :0.3640   Mean   :0.2831   Mean   :0.17091  
##  3rd Qu.:0.3863   3rd Qu.:0.4717   3rd Qu.:0.3735   3rd Qu.:0.22090  
##  Max.   :1.0000   Max.   :1.0000   Max.   :1.0000   Max.   :1.00000  
##  smoothness_worst compactness_worst concavity_worst    points_worst   
##  Min.   :0.0000   Min.   :0.0000    Min.   :0.00000   Min.   :0.0000  
##  1st Qu.:0.3000   1st Qu.:0.1163    1st Qu.:0.09145   1st Qu.:0.2231  
##  Median :0.3971   Median :0.1791    Median :0.18107   Median :0.3434  
##  Mean   :0.4041   Mean   :0.2202    Mean   :0.21740   Mean   :0.3938  
##  3rd Qu.:0.4942   3rd Qu.:0.3025    3rd Qu.:0.30583   3rd Qu.:0.5546  
##  Max.   :1.0000   Max.   :1.0000    Max.   :1.00000   Max.   :1.0000  
##  symmetry_worst   dimension_worst 
##  Min.   :0.0000   Min.   :0.0000  
##  1st Qu.:0.1851   1st Qu.:0.1077  
##  Median :0.2478   Median :0.1640  
##  Mean   :0.2633   Mean   :0.1896  
##  3rd Qu.:0.3182   3rd Qu.:0.2429  
##  Max.   :1.0000   Max.   :1.0000

wbcd_train <- wbcd_n[1:469, ]
wbcd_test <- wbcd_n[470:569, ]
wbcd_train_labels <- wbcd[1:469, 1]
wbcd_test_labels <- wbcd[470:569, 1]
library(class)
wbcd_test_pred <- knn(train = wbcd_train, test = wbcd_test, cl=wbcd_train_labels, k = 21)
(wbcd_tbl <- table(wbcd_test_pred, wbcd_test_labels))

##               wbcd_test_labels
## wbcd_test_pred Benign Malignant
##      Benign        61         2
##      Malignant      0        37

(Accuracy <- (wbcd_tbl[1]+wbcd_tbl[4])/sum(wbcd_tbl)*100)

## [1] 98

##Q4
news <- read.csv("C:/Users/NehaKatti/Desktop/OnlineNews.csv")
newsShort <- data.frame(news$n_tokens_title, news$n_tokens_content, news$n_unique_tokens, news$n_non_stop_words, news$num_hrefs, news$num_imgs, news$num_videos, news$average_token_length, news$num_keywords, news$kw_max_max, news$global_sentiment_polarity, news$avg_positive_polarity, news$title_subjectivity, news$title_sentiment_polarity, news$abs_title_subjectivity, news$abs_title_sentiment_polarity, news$shares)
colnames(newsShort) <- c("n_tokens_title", "n_tokens_content", "n_unique_tokens", "n_non_stop_words", "num_hrefs", "num_imgs", "num_videos", "average_token_length", "num_keywords", "kw_max_max", "global_sentiment_polarity", "avg_positive_polarity", "title_subjectivity", "title_sentiment_polarity", "abs_title_subjectivity", "abs_title_sentiment_polarity", "shares")

newsShort$popular = rep('na', nrow(newsShort))
for(i in 1:39644) {
  if(newsShort$shares[i] >= 1400) {
    newsShort$popular[i] = "yes"} 
  else {newsShort$popular[i] = "no"}
}
newsShort$shares = newsShort$popular
newsShort$shares <- as.factor(newsShort$shares)
newsShort <- newsShort[-18]
news_n <- as.data.frame(lapply(newsShort[1:16], normalize))
news_train <- news_n[1:9000, ]
news_test <- news_n[9001:10000, ]
news_train_labels <- newsShort[1:9000, 17]
news_test_labels <- newsShort[9001:10000, 17]
news_test_pred <- knn(train = news_train, test = news_test, cl=news_train_labels, k = 20)
(news_tbl <- table(news_test_pred, news_test_labels))

##               news_test_labels
## news_test_pred  no yes
##            no  190 146
##            yes 357 307

(Accuracy <- (news_tbl[1]+news_tbl[4])/sum(news_tbl)*100)

## [1] 49.7

#Classifier has an error rate of 50%, slightly higher. These errors might be caused by misclassifications.

Anly 530 Lab3

Neha

2/1/2020