lesson5.utf8.md

# Lesson 5
# DT using titanic data set for classification
# DT can work with both categorical and numeric variables so we don't need to convert all variables to numeric

# process: create forks/splits on feature that best separates and recursiveley for sub forks
# Pros: model interpretability is high as it's white box, works with both numeric and discrete data
# Cons: subject to overfitting - RF addresses that by creating an ensemble.

library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

titanic = read.csv("https://goo.gl/At238b")

str(titanic) # describe values and variable types, notice NA values in age

## 'data.frame':    1309 obs. of  14 variables:
##  $ pclass   : Factor w/ 1 level "1st": 1 1 1 1 1 1 1 1 1 1 ...
##  $ survived : int  1 1 0 0 0 1 1 0 1 0 ...
##  $ name     : Factor w/ 1307 levels "Abbing, Mr. Anthony",..: 22 24 25 26 27 31 46 47 51 55 ...
##  $ sex      : Factor w/ 2 levels "female","male": 1 2 1 2 1 2 1 2 1 2 ...
##  $ age      : num  29 0.92 2 30 25 48 63 39 53 71 ...
##  $ sibsp    : int  0 1 1 1 1 0 1 0 2 0 ...
##  $ parch    : int  0 2 2 2 2 0 0 0 0 0 ...
##  $ ticket   : Factor w/ 929 levels "110152","110413",..: 188 50 50 50 50 125 93 16 77 826 ...
##  $ fare     : num  211 152 152 152 152 ...
##  $ cabin    : Factor w/ 186 levels "A10","A11","A14",..: 44 80 80 80 80 150 146 16 62 NA ...
##  $ embarked : Factor w/ 3 levels "C","Q","S": 3 3 3 3 3 3 3 3 3 1 ...
##  $ boat     : Factor w/ 27 levels "1","10","11",..: 12 3 NA NA NA 13 2 NA 27 NA ...
##  $ body     : int  NA NA NA 135 NA NA NA NA NA 22 ...
##  $ home.dest: Factor w/ 369 levels "?Havana, Cuba",..: 309 231 231 231 231 237 162 24 22 229 ...

nrow(titanic)

## [1] 1309

head(titanic)

##   pclass survived                                            name    sex
## 1    1st        1                   Allen, Miss. Elisabeth Walton female
## 2    1st        1                  Allison, Master. Hudson Trevor   male
## 3    1st        0                    Allison, Miss. Helen Loraine female
## 4    1st        0            Allison, Mr. Hudson Joshua Creighton   male
## 5    1st        0 Allison, Mrs. Hudson J C (Bessie Waldo Daniels) female
## 6    1st        1                             Anderson, Mr. Harry   male
##     age sibsp parch ticket     fare   cabin embarked boat body
## 1 29.00     0     0  24160 211.3375      B5        S    2   NA
## 2  0.92     1     2 113781 151.5500 C22 C26        S   11   NA
## 3  2.00     1     2 113781 151.5500 C22 C26        S <NA>   NA
## 4 30.00     1     2 113781 151.5500 C22 C26        S <NA>  135
## 5 25.00     1     2 113781 151.5500 C22 C26        S <NA>   NA
## 6 48.00     0     0  19952  26.5500     E12        S    3   NA
##                         home.dest
## 1                    St Louis, MO
## 2 Montreal, PQ / Chesterville, ON
## 3 Montreal, PQ / Chesterville, ON
## 4 Montreal, PQ / Chesterville, ON
## 5 Montreal, PQ / Chesterville, ON
## 6                    New York, NY

titanic = titanic %>% filter(age > 0) %>% select(-body, -boat, -cabin, -home.dest) # remove non useful variables
titanic[!complete.cases(titanic),] # check if na values

##     pclass survived                                      name    sex  age
## 149    1st        1                       Icard, Miss. Amelie female 38.0
## 250    1st        1 Stone, Mrs. George Nelson (Martha Evelyn) female 62.0
## 985    1st        0                        Storey, Mr. Thomas   male 60.5
##     sibsp parch ticket fare embarked
## 149     0     0 113572   80     <NA>
## 250     0     0 113572   80     <NA>
## 985     0     0   3701   NA        S

titanic = na.omit(titanic) # remove na values
titanic[!complete.cases(titanic),] # verify no na values

##  [1] pclass   survived name     sex      age      sibsp    parch   
##  [8] ticket   fare     embarked
## <0 rows> (or 0-length row.names)

nrow(titanic)

## [1] 1043

# select relevant variables to train
titanic = titanic %>% select(sex, age, sibsp, parch, fare, survived, embarked)
head(titanic) # our data set is ready for ML

##      sex   age sibsp parch     fare survived embarked
## 1 female 29.00     0     0 211.3375        1        S
## 2   male  0.92     1     2 151.5500        1        S
## 3 female  2.00     1     2 151.5500        0        S
## 4   male 30.00     1     2 151.5500        0        S
## 5 female 25.00     1     2 151.5500        0        S
## 6   male 48.00     0     0  26.5500        1        S

(n = nrow(titanic))

## [1] 1043

trainIndex = sample(1:n, size = round(0.8*n), replace=FALSE)
train = titanic[trainIndex ,]
test = titanic[-trainIndex ,]

library(rpart)
library(rpart.plot)
dtModel = rpart(survived~., data = train, method = 'class') # don't need to figure imp features as DT will figure it out
dtModel # difficult to interpret so we will use  a plot

## n= 834 
## 
## node), split, n, loss, yval, (yprob)
##       * denotes terminal node
## 
##  1) root 834 332 0 (0.60191847 0.39808153)  
##    2) sex=male 528 106 0 (0.79924242 0.20075758)  
##      4) age>=14.25 483  83 0 (0.82815735 0.17184265) *
##      5) age< 14.25 45  22 1 (0.48888889 0.51111111)  
##       10) sibsp>=2.5 17   1 0 (0.94117647 0.05882353) *
##       11) sibsp< 2.5 28   6 1 (0.21428571 0.78571429) *
##    3) sex=female 306  80 1 (0.26143791 0.73856209)  
##      6) fare< 48.2021 220  77 1 (0.35000000 0.65000000)  
##       12) sibsp>=2.5 15   4 0 (0.73333333 0.26666667) *
##       13) sibsp< 2.5 205  66 1 (0.32195122 0.67804878)  
##         26) parch>=3.5 7   1 0 (0.85714286 0.14285714) *
##         27) parch< 3.5 198  60 1 (0.30303030 0.69696970) *
##      7) fare>=48.2021 86   3 1 (0.03488372 0.96511628) *

rpart.plot(dtModel)

# now apply model to test data
predictSurvivedDT = predict(dtModel, newdata = test, type="class")

# compare accuracy to eariler models
library(MLmetrics)

## 
## Attaching package: 'MLmetrics'

## The following object is masked from 'package:base':
## 
##     Recall

accuracy_DT = Accuracy(y_pred = predictSurvivedDT, y_true = test$survived) 

# Ensemble model approaches
# Average, e.g. mean of predicted values from DT + LR on the same data.
# Majority, e.g. mode/most common predicted value from DT, LR, ... on the same data.
# Bagging, e.g. RF. Building multiple models (typically of the same type) from different subsamples of the training dataset.
# Boosting, e.g. GB. Building multiple models (typically of the same type) each of which learns to fix the prediction errors of a prior model in the chain.
# Stacking, building multiple models (typically of differing types) and supervisor model and then aggregate or select one

# let's try bagging
library(randomForest)

## randomForest 4.6-14

## Type rfNews() to see new features/changes/bug fixes.

## 
## Attaching package: 'randomForest'

## The following object is masked from 'package:dplyr':
## 
##     combine

library(caret)

## Loading required package: lattice

## Loading required package: ggplot2

## Registered S3 methods overwritten by 'ggplot2':
##   method         from 
##   [.quosures     rlang
##   c.quosures     rlang
##   print.quosures rlang

## 
## Attaching package: 'ggplot2'

## The following object is masked from 'package:randomForest':
## 
##     margin

## 
## Attaching package: 'caret'

## The following objects are masked from 'package:MLmetrics':
## 
##     MAE, RMSE

rftrain = na.omit(train)
control = trainControl(method="cv", number=10) # ten folds for cross validation
tunegrid <- expand.grid(.mtry=c(1:6))
rfModel = train(as.factor(survived)~., data=rftrain, method="rf", metric='Accuracy', tuneGrid=tunegrid, trControl=control)
print(rfModel)

## Random Forest 
## 
## 834 samples
##   6 predictor
##   2 classes: '0', '1' 
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 750, 751, 751, 751, 751, 751, ... 
## Resampling results across tuning parameters:
## 
##   mtry  Accuracy   Kappa    
##   1     0.7674125  0.4879084
##   2     0.7710413  0.5146885
##   3     0.7757889  0.5266299
##   4     0.7709696  0.5172968
##   5     0.7662220  0.5079688
##   6     0.7602266  0.4952580
## 
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 3.

plot(rfModel)

# predict and observe accuracy
predictSurvivedRF = predict(rfModel, newdata = test)
accuracy_RF = Accuracy(y_pred = predictSurvivedRF, y_true = test$survived) 


# stacking to compare several models
library(caretEnsemble)

## 
## Attaching package: 'caretEnsemble'

## The following object is masked from 'package:ggplot2':
## 
##     autoplot

control = trainControl(method="cv", number=10)
algorithmList <- c('lda', 'rpart', 'glm', 'knn', 'svmRadial')
rfModels = caretList(as.factor(survived)~., data=rftrain, methodList=algorithmList, trControl=control)

## Warning in trControlCheck(x = trControl, y = target):
## trControl$savePredictions not 'all' or 'final'. Setting to 'final' so we
## can ensemble the models.

## Warning in trControlCheck(x = trControl, y = target): indexes not defined
## in trControl. Attempting to set them ourselves, so each model in the
## ensemble will have the same resampling indexes.

results = resamples(rfModels)
summary(results)

## 
## Call:
## summary.resamples(object = results)
## 
## Models: lda, rpart, glm, knn, svmRadial 
## Number of resamples: 10 
## 
## Accuracy 
##                Min.   1st Qu.    Median      Mean   3rd Qu.      Max. NA's
## lda       0.6867470 0.7597533 0.7891566 0.7769793 0.7976190 0.8192771    0
## rpart     0.7108434 0.7374139 0.8072289 0.7842226 0.8208907 0.8333333    0
## glm       0.6867470 0.7641997 0.7784710 0.7769937 0.8042169 0.8313253    0
## knn       0.5952381 0.6304862 0.6867470 0.6714429 0.7014845 0.7261905    0
## svmRadial 0.7142857 0.7612952 0.8012048 0.7950373 0.8349111 0.8554217    0
## 
## Kappa 
##                Min.   1st Qu.    Median      Mean   3rd Qu.      Max. NA's
## lda       0.3719441 0.4980242 0.5517300 0.5311399 0.5696198 0.6207737    0
## rpart     0.3879781 0.4518010 0.5933864 0.5487453 0.6259381 0.6478788    0
## glm       0.3780980 0.4919093 0.5286971 0.5294487 0.5821116 0.6442131    0
## knn       0.1233886 0.1902235 0.3314640 0.2908080 0.3714968 0.4290780    0
## svmRadial 0.3898305 0.4795758 0.5751143 0.5626519 0.6492806 0.6918317    0

dotplot(results)

(predictSurvivedStacking = predict(rfModels, newdata = test))

##        lda rpart glm knn svmRadial
##   [1,] "1" "1"   "1" "1" "1"      
##   [2,] "0" "0"   "0" "0" "0"      
##   [3,] "1" "0"   "1" "1" "0"      
##   [4,] "1" "1"   "1" "1" "1"      
##   [5,] "1" "1"   "1" "1" "1"      
##   [6,] "0" "0"   "0" "1" "0"      
##   [7,] "1" "1"   "1" "1" "1"      
##   [8,] "1" "1"   "1" "1" "1"      
##   [9,] "1" "1"   "1" "0" "1"      
##  [10,] "1" "1"   "1" "1" "1"      
##  [11,] "1" "1"   "1" "1" "1"      
##  [12,] "0" "0"   "0" "0" "0"      
##  [13,] "1" "1"   "1" "1" "0"      
##  [14,] "0" "0"   "0" "0" "0"      
##  [15,] "1" "1"   "1" "1" "1"      
##  [16,] "0" "0"   "0" "1" "0"      
##  [17,] "1" "1"   "1" "1" "1"      
##  [18,] "0" "0"   "0" "1" "0"      
##  [19,] "1" "1"   "1" "0" "1"      
##  [20,] "0" "0"   "0" "1" "0"      
##  [21,] "0" "0"   "0" "1" "0"      
##  [22,] "0" "0"   "0" "0" "0"      
##  [23,] "0" "0"   "0" "0" "0"      
##  [24,] "1" "1"   "1" "1" "1"      
##  [25,] "1" "1"   "1" "1" "1"      
##  [26,] "0" "0"   "0" "0" "0"      
##  [27,] "1" "1"   "1" "1" "1"      
##  [28,] "1" "1"   "1" "1" "1"      
##  [29,] "1" "1"   "1" "1" "0"      
##  [30,] "1" "1"   "1" "1" "1"      
##  [31,] "0" "0"   "1" "1" "0"      
##  [32,] "0" "0"   "0" "0" "0"      
##  [33,] "1" "1"   "1" "1" "1"      
##  [34,] "1" "1"   "1" "1" "1"      
##  [35,] "0" "0"   "0" "1" "0"      
##  [36,] "1" "1"   "1" "1" "1"      
##  [37,] "0" "0"   "0" "0" "0"      
##  [38,] "1" "1"   "1" "1" "1"      
##  [39,] "1" "1"   "1" "1" "1"      
##  [40,] "0" "0"   "0" "0" "0"      
##  [41,] "0" "0"   "0" "0" "0"      
##  [42,] "0" "0"   "0" "1" "0"      
##  [43,] "1" "0"   "1" "1" "0"      
##  [44,] "1" "1"   "1" "0" "1"      
##  [45,] "1" "1"   "1" "1" "1"      
##  [46,] "0" "0"   "0" "1" "0"      
##  [47,] "0" "0"   "0" "0" "0"      
##  [48,] "1" "1"   "1" "1" "1"      
##  [49,] "0" "0"   "1" "1" "0"      
##  [50,] "0" "0"   "0" "0" "0"      
##  [51,] "0" "0"   "0" "0" "0"      
##  [52,] "1" "1"   "1" "1" "1"      
##  [53,] "0" "0"   "0" "1" "0"      
##  [54,] "0" "0"   "0" "1" "0"      
##  [55,] "1" "1"   "1" "1" "1"      
##  [56,] "1" "1"   "1" "1" "1"      
##  [57,] "0" "0"   "0" "0" "0"      
##  [58,] "1" "1"   "1" "0" "1"      
##  [59,] "1" "1"   "1" "1" "1"      
##  [60,] "0" "0"   "0" "0" "0"      
##  [61,] "1" "1"   "1" "1" "1"      
##  [62,] "1" "1"   "1" "0" "1"      
##  [63,] "0" "0"   "0" "0" "0"      
##  [64,] "0" "0"   "0" "0" "0"      
##  [65,] "1" "1"   "1" "0" "1"      
##  [66,] "0" "0"   "0" "0" "0"      
##  [67,] "1" "1"   "1" "0" "1"      
##  [68,] "1" "1"   "1" "0" "1"      
##  [69,] "0" "1"   "0" "1" "1"      
##  [70,] "1" "1"   "1" "0" "1"      
##  [71,] "1" "1"   "1" "1" "1"      
##  [72,] "1" "1"   "1" "1" "1"      
##  [73,] "1" "1"   "1" "1" "1"      
##  [74,] "0" "1"   "0" "0" "1"      
##  [75,] "1" "1"   "1" "0" "1"      
##  [76,] "1" "1"   "1" "1" "1"      
##  [77,] "0" "0"   "0" "0" "0"      
##  [78,] "0" "0"   "0" "0" "0"      
##  [79,] "0" "0"   "0" "0" "0"      
##  [80,] "0" "0"   "0" "0" "0"      
##  [81,] "0" "0"   "0" "0" "0"      
##  [82,] "0" "0"   "0" "0" "0"      
##  [83,] "0" "0"   "0" "0" "0"      
##  [84,] "0" "0"   "0" "0" "0"      
##  [85,] "1" "1"   "1" "1" "1"      
##  [86,] "0" "0"   "0" "0" "0"      
##  [87,] "0" "0"   "0" "0" "0"      
##  [88,] "0" "0"   "0" "0" "0"      
##  [89,] "1" "1"   "1" "0" "1"      
##  [90,] "1" "1"   "1" "0" "1"      
##  [91,] "0" "0"   "0" "1" "0"      
##  [92,] "1" "1"   "1" "0" "1"      
##  [93,] "1" "1"   "1" "0" "1"      
##  [94,] "0" "0"   "0" "0" "0"      
##  [95,] "0" "0"   "0" "0" "0"      
##  [96,] "0" "0"   "0" "0" "0"      
##  [97,] "1" "1"   "1" "0" "1"      
##  [98,] "0" "0"   "0" "0" "0"      
##  [99,] "0" "0"   "0" "0" "0"      
## [100,] "0" "0"   "0" "0" "0"      
## [101,] "0" "0"   "0" "0" "0"      
## [102,] "0" "1"   "0" "1" "1"      
## [103,] "0" "0"   "0" "0" "0"      
## [104,] "1" "1"   "1" "0" "1"      
## [105,] "1" "1"   "1" "1" "1"      
## [106,] "0" "0"   "0" "0" "0"      
## [107,] "0" "0"   "0" "0" "0"      
## [108,] "0" "1"   "0" "1" "0"      
## [109,] "0" "0"   "0" "0" "0"      
## [110,] "0" "0"   "0" "1" "0"      
## [111,] "1" "1"   "1" "0" "1"      
## [112,] "1" "1"   "1" "0" "1"      
## [113,] "1" "1"   "1" "0" "1"      
## [114,] "1" "1"   "1" "0" "1"      
## [115,] "1" "1"   "1" "1" "1"      
## [116,] "1" "1"   "1" "0" "1"      
## [117,] "1" "1"   "1" "0" "1"      
## [118,] "0" "0"   "0" "0" "0"      
## [119,] "1" "1"   "1" "0" "1"      
## [120,] "0" "1"   "0" "0" "1"      
## [121,] "0" "0"   "0" "0" "0"      
## [122,] "0" "0"   "0" "0" "0"      
## [123,] "0" "0"   "0" "0" "0"      
## [124,] "0" "0"   "0" "0" "0"      
## [125,] "0" "0"   "0" "0" "0"      
## [126,] "1" "1"   "1" "1" "1"      
## [127,] "1" "1"   "1" "1" "1"      
## [128,] "0" "0"   "0" "0" "0"      
## [129,] "0" "0"   "0" "0" "0"      
## [130,] "1" "1"   "1" "0" "0"      
## [131,] "0" "0"   "0" "0" "0"      
## [132,] "1" "1"   "1" "0" "1"      
## [133,] "1" "1"   "1" "0" "1"      
## [134,] "1" "1"   "1" "0" "1"      
## [135,] "0" "0"   "0" "0" "0"      
## [136,] "0" "0"   "0" "0" "0"      
## [137,] "1" "1"   "0" "0" "0"      
## [138,] "0" "0"   "0" "0" "0"      
## [139,] "0" "0"   "0" "0" "0"      
## [140,] "0" "0"   "0" "0" "0"      
## [141,] "0" "0"   "0" "0" "0"      
## [142,] "0" "1"   "0" "1" "1"      
## [143,] "0" "0"   "0" "0" "0"      
## [144,] "0" "0"   "0" "0" "0"      
## [145,] "1" "1"   "1" "0" "1"      
## [146,] "0" "0"   "0" "0" "0"      
## [147,] "0" "0"   "0" "0" "0"      
## [148,] "0" "0"   "0" "0" "0"      
## [149,] "1" "1"   "1" "0" "1"      
## [150,] "0" "0"   "0" "0" "0"      
## [151,] "1" "1"   "1" "1" "1"      
## [152,] "0" "0"   "0" "0" "0"      
## [153,] "0" "0"   "0" "0" "0"      
## [154,] "0" "0"   "0" "0" "0"      
## [155,] "1" "1"   "1" "1" "0"      
## [156,] "0" "0"   "0" "0" "0"      
## [157,] "0" "0"   "0" "0" "0"      
## [158,] "0" "0"   "0" "0" "0"      
## [159,] "0" "0"   "0" "0" "0"      
## [160,] "0" "0"   "0" "0" "0"      
## [161,] "0" "0"   "0" "0" "0"      
## [162,] "0" "0"   "0" "0" "0"      
## [163,] "1" "1"   "1" "0" "1"      
## [164,] "1" "1"   "1" "0" "1"      
## [165,] "0" "0"   "0" "0" "0"      
## [166,] "0" "0"   "0" "0" "0"      
## [167,] "1" "1"   "1" "0" "0"      
## [168,] "1" "1"   "1" "0" "1"      
## [169,] "0" "0"   "0" "0" "0"      
## [170,] "0" "0"   "0" "0" "0"      
## [171,] "0" "0"   "0" "0" "0"      
## [172,] "0" "0"   "0" "0" "0"      
## [173,] "0" "1"   "0" "0" "1"      
## [174,] "0" "0"   "0" "0" "0"      
## [175,] "0" "0"   "0" "0" "0"      
## [176,] "1" "1"   "1" "0" "1"      
## [177,] "0" "0"   "0" "0" "0"      
## [178,] "0" "0"   "0" "0" "0"      
## [179,] "1" "1"   "1" "0" "1"      
## [180,] "0" "0"   "0" "0" "0"      
## [181,] "0" "0"   "0" "0" "0"      
## [182,] "0" "0"   "0" "0" "0"      
## [183,] "1" "1"   "1" "0" "1"      
## [184,] "0" "0"   "0" "0" "0"      
## [185,] "0" "0"   "0" "0" "0"      
## [186,] "0" "0"   "0" "0" "0"      
## [187,] "0" "0"   "0" "0" "0"      
## [188,] "0" "0"   "0" "1" "0"      
## [189,] "0" "0"   "0" "0" "0"      
## [190,] "1" "1"   "1" "0" "1"      
## [191,] "0" "0"   "0" "0" "0"      
## [192,] "0" "0"   "0" "0" "0"      
## [193,] "0" "0"   "0" "0" "0"      
## [194,] "0" "0"   "0" "0" "0"      
## [195,] "0" "0"   "0" "0" "0"      
## [196,] "0" "0"   "0" "0" "0"      
## [197,] "1" "1"   "1" "1" "1"      
## [198,] "0" "0"   "0" "0" "0"      
## [199,] "1" "1"   "1" "0" "1"      
## [200,] "1" "1"   "1" "1" "1"      
## [201,] "0" "0"   "0" "0" "0"      
## [202,] "0" "0"   "0" "0" "0"      
## [203,] "0" "0"   "0" "0" "0"      
## [204,] "0" "0"   "0" "0" "0"      
## [205,] "0" "0"   "0" "0" "0"      
## [206,] "0" "0"   "0" "0" "0"      
## [207,] "0" "0"   "0" "0" "0"      
## [208,] "0" "0"   "0" "0" "0"      
## [209,] "0" "0"   "0" "0" "0"

# let's aggregate by using mode value from the predictions
mode <- function(x) {
  ux <- unique(x)
  ux[which.max(tabulate(match(x, ux)))]
}
ps = c()
for (i in 1:nrow(predictSurvivedStacking))
{
  ps[i] = mode(predictSurvivedStacking[i,])
}
accuracy_Stacking = Accuracy(y_pred = ps, y_true = test$survived) 

# accuracy from our different models
df_accuracy = data.frame("model" = c("Decision Tree", "Bagging-RF", "Stacking-Mode"), "accuracy" = c(accuracy_DT, accuracy_RF, accuracy_Stacking))
df_accuracy %>% arrange(accuracy)

##           model  accuracy
## 1 Stacking-Mode 0.7942584
## 2 Decision Tree 0.8086124
## 3    Bagging-RF 0.8277512

# Clustering using k means
# can also another example of k means in lesson 8 pdf of course notes on Canvas
library(rattle)

## Rattle: A free graphical interface for data science with R.
## Version 5.2.0 Copyright (c) 2006-2018 Togaware Pty Ltd.
## Type 'rattle()' to shake, rattle, and roll your data.

## 
## Attaching package: 'rattle'

## The following object is masked from 'package:randomForest':
## 
##     importance

head(wine)

##   Type Alcohol Malic  Ash Alcalinity Magnesium Phenols Flavanoids
## 1    1   14.23  1.71 2.43       15.6       127    2.80       3.06
## 2    1   13.20  1.78 2.14       11.2       100    2.65       2.76
## 3    1   13.16  2.36 2.67       18.6       101    2.80       3.24
## 4    1   14.37  1.95 2.50       16.8       113    3.85       3.49
## 5    1   13.24  2.59 2.87       21.0       118    2.80       2.69
## 6    1   14.20  1.76 2.45       15.2       112    3.27       3.39
##   Nonflavanoids Proanthocyanins Color  Hue Dilution Proline
## 1          0.28            2.29  5.64 1.04     3.92    1065
## 2          0.26            1.28  4.38 1.05     3.40    1050
## 3          0.30            2.81  5.68 1.03     3.17    1185
## 4          0.24            2.18  7.80 0.86     3.45    1480
## 5          0.39            1.82  4.32 1.04     2.93     735
## 6          0.34            1.97  6.75 1.05     2.85    1450

wine = scale(wine[-1]) # scale before k means can run due to variance of values between variables
kmeansModel = kmeans(wine,3)
library(cluster)
clusplot(wine, kmeansModel$cluster, main='2D representation of the Cluster solution',
         color=TRUE, shade=TRUE,
         labels=2, lines=0)

lesson5.R

pmsangal

2019-06-01