Lab 3
data <- read.csv("~/Wholesale_customers_data.csv")
str(data)
## 'data.frame': 440 obs. of 8 variables:
## $ Channel : int 2 2 2 1 2 2 2 2 1 2 ...
## $ Region : int 3 3 3 3 3 3 3 3 3 3 ...
## $ Fresh : int 12669 7057 6353 13265 22615 9413 12126 7579 5963 6006 ...
## $ Milk : int 9656 9810 8808 1196 5410 8259 3199 4956 3648 11093 ...
## $ Grocery : int 7561 9568 7684 4221 7198 5126 6975 9426 6192 18881 ...
## $ Frozen : int 214 1762 2405 6404 3915 666 480 1669 425 1159 ...
## $ Detergents_Paper: int 2674 3293 3516 507 1777 1795 3140 3321 1716 7425 ...
## $ Delicassen : int 1338 1776 7844 1788 5185 1451 545 2566 750 2098 ...
summary(data)
## Channel Region Fresh Milk
## Min. :1.000 Min. :1.000 Min. : 3 Min. : 55
## 1st Qu.:1.000 1st Qu.:2.000 1st Qu.: 3128 1st Qu.: 1533
## Median :1.000 Median :3.000 Median : 8504 Median : 3627
## Mean :1.323 Mean :2.543 Mean : 12000 Mean : 5796
## 3rd Qu.:2.000 3rd Qu.:3.000 3rd Qu.: 16934 3rd Qu.: 7190
## Max. :2.000 Max. :3.000 Max. :112151 Max. :73498
## Grocery Frozen Detergents_Paper Delicassen
## Min. : 3 Min. : 25.0 Min. : 3.0 Min. : 3.0
## 1st Qu.: 2153 1st Qu.: 742.2 1st Qu.: 256.8 1st Qu.: 408.2
## Median : 4756 Median : 1526.0 Median : 816.5 Median : 965.5
## Mean : 7951 Mean : 3071.9 Mean : 2881.5 Mean : 1524.9
## 3rd Qu.:10656 3rd Qu.: 3554.2 3rd Qu.: 3922.0 3rd Qu.: 1820.2
## Max. :92780 Max. :60869.0 Max. :40827.0 Max. :47943.0
top.n.custs <- function(data, cols, n=5) { # Requires a data frame, the columns to scan, and the top n to remove
  idx.to.remove <- integer(0) # Initialize a vector to hold the customers being removed
  for (c in cols) { # For every column passed to this function
    col.order <- order(data[,c], decreasing=T) # Sort column "c" in descending order (biggest on top)
    # order() returns the sorted row indices (e.g. row 15, 3, 7, 1, ...) rather than the sorted values.
    idx <- head(col.order, n) # Take the row indices of the n largest values in column c
    idx.to.remove <- union(idx.to.remove, idx) # Combine and de-duplicate the row ids to be removed
  }
  return(idx.to.remove) # Return the indices of customers to be removed
}
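# To see how the order/head combination behaves, here is a tiny standalone
# example (illustrative only, not part of the lab data):
x <- c(10, 50, 30)
order(x, decreasing=TRUE) # returns 2 3 1: the index of the largest value comes first
head(order(x, decreasing=TRUE), 2) # row indices of the top-2 values: 2 3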
top.custs <- top.n.custs(data, cols=3:8, n=5)
length(top.custs) # How many customers will be removed?
## [1] 19
data.rm.top <- data[-c(top.custs),] # Remove those customers
set.seed(76964057) #Set the seed for reproducibility
k <- kmeans(data.rm.top[,-c(1,2)], centers=5) # Create 5 clusters, excluding the Channel and Region columns
k$centers # Display the cluster centers
## Fresh Milk Grocery Frozen Detergents_Paper Delicassen
## 1 4189.747 7645.639 11015.277 1335.145 4750.4819 1387.1205
## 2 16470.870 3026.491 4264.741 3217.306 996.5556 1319.7593
## 3 33120.163 4896.977 5579.860 3823.372 945.4651 1620.1860
## 4 5830.214 15295.048 23449.167 1936.452 10361.6429 1912.7381
## 5 5043.434 2329.683 2786.138 2689.814 652.8276 849.8414
# Our results indicate that clusters 1 and 4 are heavy in Grocery (and Detergents_Paper) spending, while clusters 2, 3, and 5 are dominated by Fresh purchases and spend the least on Detergents_Paper.
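# To judge how much weight each profile carries, we can also inspect the number
# of customers in each cluster (a quick supplementary check; exact sizes depend on the seed):
k$size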
rng <- 2:20 # K from 2 to 20
tries <- 100 # Run the K-means algorithm 100 times for each K
avg.totw.ss <- integer(length(rng)) # Set up an empty vector to hold the averages
for (v in rng) { # For each value of K in the range
  v.totw.ss <- integer(tries) # Set up an empty vector to hold the 100 tries
  for (i in 1:tries) {
    k.temp <- kmeans(data.rm.top[,-c(1,2)], centers=v) # Run kmeans, again excluding Channel and Region
    v.totw.ss[i] <- k.temp$tot.withinss # Store the total within sum of squares
  }
  avg.totw.ss[v-1] <- mean(v.totw.ss) # Average the 100 totals for this K
}
plot(rng,avg.totw.ss,type="b", main="Total Within SS by Various K",
ylab="Average Total Within Sum of Squares",
xlab="Value of K")

# The elbow plot shows that the decrease in total within sum of squares becomes much more gradual beyond K=5, so we take K=5 as the best value of K for this analysis.
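# To make the "gradual beyond K=5" reading more concrete, one can compute the
# percentage drop in the averaged total within SS from each K to the next
# (a small supplementary check, not part of the original lab):
pct.drop <- -diff(avg.totw.ss) / head(avg.totw.ss, -1) * 100
round(pct.drop, 1) # drops for K=2->3, 3->4, ...; small values indicate diminishing returns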
wine <-read.csv("~/wine.csv")
str(wine)
## 'data.frame': 178 obs. of 14 variables:
## $ Wine : int 1 1 1 1 1 1 1 1 1 1 ...
## $ Alcohol : num 14.2 13.2 13.2 14.4 13.2 ...
## $ Malic.acid : num 1.71 1.78 2.36 1.95 2.59 1.76 1.87 2.15 1.64 1.35 ...
## $ Ash : num 2.43 2.14 2.67 2.5 2.87 2.45 2.45 2.61 2.17 2.27 ...
## $ Acl : num 15.6 11.2 18.6 16.8 21 15.2 14.6 17.6 14 16 ...
## $ Mg : int 127 100 101 113 118 112 96 121 97 98 ...
## $ Phenols : num 2.8 2.65 2.8 3.85 2.8 3.27 2.5 2.6 2.8 2.98 ...
## $ Flavanoids : num 3.06 2.76 3.24 3.49 2.69 3.39 2.52 2.51 2.98 3.15 ...
## $ Nonflavanoid.phenols: num 0.28 0.26 0.3 0.24 0.39 0.34 0.3 0.31 0.29 0.22 ...
## $ Proanth : num 2.29 1.28 2.81 2.18 1.82 1.97 1.98 1.25 1.98 1.85 ...
## $ Color.int : num 5.64 4.38 5.68 7.8 4.32 6.75 5.25 5.05 5.2 7.22 ...
## $ Hue : num 1.04 1.05 1.03 0.86 1.04 1.05 1.02 1.06 1.08 1.01 ...
## $ OD : num 3.92 3.4 3.17 3.45 2.93 2.85 3.58 3.58 2.85 3.55 ...
## $ Proline : int 1065 1050 1185 1480 735 1450 1290 1295 1045 1045 ...
wssplot <- function(data, nc=15, seed=1234) {
  wss <- (nrow(data)-1)*sum(apply(data, 2, var)) # Total within SS for k=1 (all points in one cluster)
  for (i in 2:nc) {
    set.seed(seed)
    wss[i] <- sum(kmeans(data, centers=i)$withinss)
  }
  plot(1:nc, wss, type="b", xlab="Number of Clusters",
       ylab="Within groups sum of squares")
}
df <- scale(wine[-1])
wssplot(df)

library(NbClust)
set.seed(1234)
nc <- NbClust(df, min.nc=2, max.nc = 15, method = "kmeans")

## *** : The Hubert index is a graphical method of determining the number of clusters.
## In the plot of Hubert index, we seek a significant knee that corresponds to a
## significant increase of the value of the measure i.e the significant peak in Hubert
## index second differences plot.
##

## *** : The D index is a graphical method of determining the number of clusters.
## In the plot of D index, we seek a significant knee (the significant peak in Dindex
## second differences plot) that corresponds to a significant increase of the value of
## the measure.
##
## *******************************************************************
## * Among all indices:
## * 4 proposed 2 as the best number of clusters
## * 15 proposed 3 as the best number of clusters
## * 1 proposed 10 as the best number of clusters
## * 1 proposed 12 as the best number of clusters
## * 1 proposed 14 as the best number of clusters
## * 1 proposed 15 as the best number of clusters
##
## ***** Conclusion *****
##
## * According to the majority rule, the best number of clusters is 3
##
##
## *******************************************************************
barplot(table(nc$Best.n[1,]), xlab="Number of Clusters", ylab="Number of Criteria",
        main="Number of Clusters Chosen by 26 Criteria")
# We used three different views to choose the best value of K. First, the wssplot elbow curve indicated that K=3 could be the best choice, since the decrease in within-group sum of squares becomes gradual beyond that point. Second, NbClust, given minimum and maximum values of K, reported that a majority of its criteria favor K=3. Third, the bar plot of the NbClust votes confirms visually that K=3 receives the most support, so we use K=3 for our analysis.
set.seed(1234)
fit.km <- kmeans(df, 3, nstart=25)
(df_km <- table(wine$Wine, fit.km$cluster))
##
## 1 2 3
## 1 59 0 0
## 2 3 65 3
## 3 0 0 48
(Accuracy <- (sum(diag(df_km))/sum(df_km)*100))
## [1] 96.62921
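# Beyond raw accuracy, the adjusted Rand index quantifies agreement between the
# clustering and the true wine types while correcting for chance agreement
# (a supplementary check; assumes the flexclust package is installed):
library(flexclust)
randIndex(df_km) # ranges from -1 to 1; values near 1 indicate strong agreement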
# For our K-means analysis, we compared the three clusters against the actual wine type to estimate the accuracy of the model, and then plotted the clusters using clusplot.
library(cluster)
clusplot(df, fit.km$cluster, main='2D representation of the Cluster solution',
color=TRUE, shade=TRUE,
labels=2, lines=0)
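# A related diagnostic (a supplementary sketch, not part of the original lab) is
# the average silhouette width, which summarizes how well each point fits its
# assigned cluster; values closer to 1 indicate better-separated clusters:
sil <- silhouette(fit.km$cluster, dist(df))
mean(sil[, "sil_width"])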

## Part 3
df_rpart <- data.frame(k=fit.km$cluster, df) # Attach the cluster assignment as the target variable
rdf <- df_rpart[sample(1:nrow(df_rpart)), ] # Shuffle the rows (no seed is set, so the split varies between runs)
train <- rdf[1:(as.integer(.8*nrow(rdf))-1), ] # First ~80% of the shuffled rows for training
test <- rdf[(as.integer(.8*nrow(rdf))):nrow(rdf), ] # Remaining ~20% for testing
library(rpart)
fit <- rpart(k ~ ., data=train, method="class")
library(rattle)
## Rattle: A free graphical interface for data science with R.
## Version 5.1.0 Copyright (c) 2006-2017 Togaware Pty Ltd.
## Type 'rattle()' to shake, rattle, and roll your data.
fancyRpartPlot(fit)
pred <- predict(fit, test, type="class")
(wine_tbl <- table(pred, test$k)) # Confusion matrix: predicted vs. actual cluster label
##
## pred 1 2 3
## 1 11 1 0
## 2 0 12 1
## 3 0 1 11
(Accuracy <- (sum(diag(wine_tbl))/sum(wine_tbl)*100))
## [1] 91.89189
# In our model, we used the cluster label K that we predicted in Part 2 as the target variable and evaluated how well the remaining variables in the dataset can recover it.
# Our decision tree results indicate that if Proline >= 0.026 the model next checks the Phenols content: if Phenols > 0.13 the wine is classified as type 1, otherwise type 3. If Proline < 0.026 the model checks the OD content: if OD >= -0.68 the wine is type 2; if OD < -0.68 the model checks Color.int, classifying the wine as type 3 when Color.int >= -0.36 and type 2 otherwise. (Thresholds are negative because the features were standardized.) There are 3 misclassified instances, meaning the model misclassifies the wine type roughly 8% of the time. With more training data the model could likely be trained better, lowering the error on future predictions.
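# To guard against overfitting the tree (a supplementary sketch using standard
# rpart tooling), the complexity parameter table reports cross-validated error
# for each subtree size and can guide pruning:
printcp(fit) # the xerror column is the cross-validated error
fit.pruned <- prune(fit, cp=fit$cptable[which.min(fit$cptable[, "xerror"]), "CP"])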

wbcd <- read.csv("~/wisc_bc_data.csv", stringsAsFactors = FALSE)
str(wbcd)
## 'data.frame': 569 obs. of 32 variables:
## $ id : int 87139402 8910251 905520 868871 9012568 906539 925291 87880 862989 89827 ...
## $ diagnosis : chr "B" "B" "B" "B" ...
## $ radius_mean : num 12.3 10.6 11 11.3 15.2 ...
## $ texture_mean : num 12.4 18.9 16.8 13.4 13.2 ...
## $ perimeter_mean : num 78.8 69.3 70.9 73 97.7 ...
## $ area_mean : num 464 346 373 385 712 ...
## $ smoothness_mean : num 0.1028 0.0969 0.1077 0.1164 0.0796 ...
## $ compactness_mean : num 0.0698 0.1147 0.078 0.1136 0.0693 ...
## $ concavity_mean : num 0.0399 0.0639 0.0305 0.0464 0.0339 ...
## $ points_mean : num 0.037 0.0264 0.0248 0.048 0.0266 ...
## $ symmetry_mean : num 0.196 0.192 0.171 0.177 0.172 ...
## $ dimension_mean : num 0.0595 0.0649 0.0634 0.0607 0.0554 ...
## $ radius_se : num 0.236 0.451 0.197 0.338 0.178 ...
## $ texture_se : num 0.666 1.197 1.387 1.343 0.412 ...
## $ perimeter_se : num 1.67 3.43 1.34 1.85 1.34 ...
## $ area_se : num 17.4 27.1 13.5 26.3 17.7 ...
## $ smoothness_se : num 0.00805 0.00747 0.00516 0.01127 0.00501 ...
## $ compactness_se : num 0.0118 0.03581 0.00936 0.03498 0.01485 ...
## $ concavity_se : num 0.0168 0.0335 0.0106 0.0219 0.0155 ...
## $ points_se : num 0.01241 0.01365 0.00748 0.01965 0.00915 ...
## $ symmetry_se : num 0.0192 0.035 0.0172 0.0158 0.0165 ...
## $ dimension_se : num 0.00225 0.00332 0.0022 0.00344 0.00177 ...
## $ radius_worst : num 13.5 11.9 12.4 11.9 16.2 ...
## $ texture_worst : num 15.6 22.9 26.4 15.8 15.7 ...
## $ perimeter_worst : num 87 78.3 79.9 76.5 104.5 ...
## $ area_worst : num 549 425 471 434 819 ...
## $ smoothness_worst : num 0.139 0.121 0.137 0.137 0.113 ...
## $ compactness_worst: num 0.127 0.252 0.148 0.182 0.174 ...
## $ concavity_worst : num 0.1242 0.1916 0.1067 0.0867 0.1362 ...
## $ points_worst : num 0.0939 0.0793 0.0743 0.0861 0.0818 ...
## $ symmetry_worst : num 0.283 0.294 0.3 0.21 0.249 ...
## $ dimension_worst : num 0.0677 0.0759 0.0788 0.0678 0.0677 ...
wbcd <- wbcd[-1]
table(wbcd$diagnosis)
##
## B M
## 357 212
wbcd$diagnosis <- factor(wbcd$diagnosis, levels = c("B", "M"), labels = c("Benign", "Malignant"))
round(prop.table(table(wbcd$diagnosis)) * 100, digits = 1)
##
## Benign Malignant
## 62.7 37.3
normalize <- function(x) {
  return((x - min(x)) / (max(x) - min(x))) # Rescale x to the [0, 1] range
}
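# A quick sanity check that the function rescales values to the [0, 1] range:
normalize(c(1, 2, 3, 4, 5))
## [1] 0.00 0.25 0.50 0.75 1.00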
wbcd_n <- as.data.frame(lapply(wbcd[2:31], normalize))
summary(wbcd_n)
## radius_mean texture_mean perimeter_mean area_mean
## Min. :0.0000 Min. :0.0000 Min. :0.0000 Min. :0.0000
## 1st Qu.:0.2233 1st Qu.:0.2185 1st Qu.:0.2168 1st Qu.:0.1174
## Median :0.3024 Median :0.3088 Median :0.2933 Median :0.1729
## Mean :0.3382 Mean :0.3240 Mean :0.3329 Mean :0.2169
## 3rd Qu.:0.4164 3rd Qu.:0.4089 3rd Qu.:0.4168 3rd Qu.:0.2711
## Max. :1.0000 Max. :1.0000 Max. :1.0000 Max. :1.0000
## smoothness_mean compactness_mean concavity_mean points_mean
## Min. :0.0000 Min. :0.0000 Min. :0.00000 Min. :0.0000
## 1st Qu.:0.3046 1st Qu.:0.1397 1st Qu.:0.06926 1st Qu.:0.1009
## Median :0.3904 Median :0.2247 Median :0.14419 Median :0.1665
## Mean :0.3948 Mean :0.2606 Mean :0.20806 Mean :0.2431
## 3rd Qu.:0.4755 3rd Qu.:0.3405 3rd Qu.:0.30623 3rd Qu.:0.3678
## Max. :1.0000 Max. :1.0000 Max. :1.00000 Max. :1.0000
## symmetry_mean dimension_mean radius_se texture_se
## Min. :0.0000 Min. :0.0000 Min. :0.00000 Min. :0.0000
## 1st Qu.:0.2823 1st Qu.:0.1630 1st Qu.:0.04378 1st Qu.:0.1047
## Median :0.3697 Median :0.2439 Median :0.07702 Median :0.1653
## Mean :0.3796 Mean :0.2704 Mean :0.10635 Mean :0.1893
## 3rd Qu.:0.4530 3rd Qu.:0.3404 3rd Qu.:0.13304 3rd Qu.:0.2462
## Max. :1.0000 Max. :1.0000 Max. :1.00000 Max. :1.0000
## perimeter_se area_se smoothness_se compactness_se
## Min. :0.00000 Min. :0.00000 Min. :0.0000 Min. :0.00000
## 1st Qu.:0.04000 1st Qu.:0.02064 1st Qu.:0.1175 1st Qu.:0.08132
## Median :0.07209 Median :0.03311 Median :0.1586 Median :0.13667
## Mean :0.09938 Mean :0.06264 Mean :0.1811 Mean :0.17444
## 3rd Qu.:0.12251 3rd Qu.:0.07170 3rd Qu.:0.2187 3rd Qu.:0.22680
## Max. :1.00000 Max. :1.00000 Max. :1.0000 Max. :1.00000
## concavity_se points_se symmetry_se dimension_se
## Min. :0.00000 Min. :0.0000 Min. :0.0000 Min. :0.00000
## 1st Qu.:0.03811 1st Qu.:0.1447 1st Qu.:0.1024 1st Qu.:0.04675
## Median :0.06538 Median :0.2070 Median :0.1526 Median :0.07919
## Mean :0.08054 Mean :0.2235 Mean :0.1781 Mean :0.10019
## 3rd Qu.:0.10619 3rd Qu.:0.2787 3rd Qu.:0.2195 3rd Qu.:0.12656
## Max. :1.00000 Max. :1.0000 Max. :1.0000 Max. :1.00000
## radius_worst texture_worst perimeter_worst area_worst
## Min. :0.0000 Min. :0.0000 Min. :0.0000 Min. :0.00000
## 1st Qu.:0.1807 1st Qu.:0.2415 1st Qu.:0.1678 1st Qu.:0.08113
## Median :0.2504 Median :0.3569 Median :0.2353 Median :0.12321
## Mean :0.2967 Mean :0.3640 Mean :0.2831 Mean :0.17091
## 3rd Qu.:0.3863 3rd Qu.:0.4717 3rd Qu.:0.3735 3rd Qu.:0.22090
## Max. :1.0000 Max. :1.0000 Max. :1.0000 Max. :1.00000
## smoothness_worst compactness_worst concavity_worst points_worst
## Min. :0.0000 Min. :0.0000 Min. :0.00000 Min. :0.0000
## 1st Qu.:0.3000 1st Qu.:0.1163 1st Qu.:0.09145 1st Qu.:0.2231
## Median :0.3971 Median :0.1791 Median :0.18107 Median :0.3434
## Mean :0.4041 Mean :0.2202 Mean :0.21740 Mean :0.3938
## 3rd Qu.:0.4942 3rd Qu.:0.3025 3rd Qu.:0.30583 3rd Qu.:0.5546
## Max. :1.0000 Max. :1.0000 Max. :1.00000 Max. :1.0000
## symmetry_worst dimension_worst
## Min. :0.0000 Min. :0.0000
## 1st Qu.:0.1851 1st Qu.:0.1077
## Median :0.2478 Median :0.1640
## Mean :0.2633 Mean :0.1896
## 3rd Qu.:0.3182 3rd Qu.:0.2429
## Max. :1.0000 Max. :1.0000
wbcd_train <- wbcd_n[1:469, ]
wbcd_test <- wbcd_n[470:569, ]
wbcd_train_labels <- wbcd[1:469, 1]
wbcd_test_labels <- wbcd[470:569, 1]
library(class)
wbcd_test_pred <- knn(train = wbcd_train, test = wbcd_test, cl=wbcd_train_labels, k = 21)
(wbcd_tbl <- table(wbcd_test_pred, wbcd_test_labels))
## wbcd_test_labels
## wbcd_test_pred Benign Malignant
## Benign 61 2
## Malignant 0 37
(Accuracy <- sum(diag(wbcd_tbl))/sum(wbcd_tbl)*100)
## [1] 98
# Our model has an accuracy of 98%, with 2 cases of misclassification in which the tumor was predicted to be benign but was actually malignant (false negatives, the more dangerous type of error in this setting).
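# As an alternative to min-max normalization (a supplementary sketch; z-scores
# are less compressed by extreme values than min-max rescaling), the model can
# be re-fit on standardized data and the confusion matrices compared:
wbcd_z <- as.data.frame(scale(wbcd[-1])) # z-score standardize the 30 numeric columns
wbcd_z_pred <- knn(train = wbcd_z[1:469, ], test = wbcd_z[470:569, ],
                   cl = wbcd_train_labels, k = 21)
table(wbcd_z_pred, wbcd_test_labels)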
news <- read.csv("~/OnlineNewsPopularity.csv")
keep.cols <- c("n_tokens_title", "n_tokens_content", "n_unique_tokens", "n_non_stop_words",
               "num_hrefs", "num_imgs", "num_videos", "average_token_length", "num_keywords",
               "kw_max_max", "global_sentiment_polarity", "avg_positive_polarity",
               "title_subjectivity", "title_sentiment_polarity", "abs_title_subjectivity",
               "abs_title_sentiment_polarity", "shares")
newsShort <- news[, keep.cols] # Select the columns of interest, keeping their original names
newsShort$popular <- rep('na', nrow(newsShort)) # Placeholder column for the class label
for (i in 1:nrow(newsShort)) { # Label each article "yes" (popular) or "no" using a 1400-share threshold
  if (newsShort$shares[i] >= 1400) {
    newsShort$popular[i] <- "yes"
  } else {
    newsShort$popular[i] <- "no"
  }
}
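# The loop above can be replaced with a single vectorized call, which is the
# more idiomatic (and much faster) R approach and produces the same labels:
newsShort$popular <- ifelse(newsShort$shares >= 1400, "yes", "no")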
newsShort$shares <- newsShort$popular # Replace the numeric share count with the class label
newsShort$shares <- as.factor(newsShort$shares)
newsShort <- newsShort[-18]
news_n <- as.data.frame(lapply(newsShort[1:16], normalize))
# Split the data into training and test datasets (note: rows are used in file order here, without shuffling)
news_train <- news_n[1:9000, ] # First 9,000 of the first 10,000 articles for training
news_test <- news_n[9001:10000, ] # Next 1,000 articles for testing
news_train_labels <- newsShort[1:9000, 17] # Column 17 holds the "shares" class factor
news_test_labels <- newsShort[9001:10000, 17]
news_test_pred <- knn(train = news_train, test = news_test, cl=news_train_labels, k = 5)
(news_tbl <- table(news_test_pred, news_test_labels))
## news_test_labels
## news_test_pred no yes
## no 225 174
## yes 322 279
(Accuracy <- sum(diag(news_tbl))/sum(news_tbl)*100)
## [1] 50.4
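# The model performs barely above chance (50.4%). As a follow-up (a sketch, not
# part of the original lab), one could sweep several values of k to see whether
# the neighborhood size is the limiting factor:
for (k.try in c(1, 5, 11, 21, 51)) {
  pred.try <- knn(train = news_train, test = news_test, cl = news_train_labels, k = k.try)
  cat("k =", k.try, " accuracy =", mean(pred.try == news_test_labels) * 100, "%\n")
}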