Part 1 – Supervised Learning

(i) Use Logistic Regression to model the student data

mystudent <- read.table("C:/Users/Allen/OneDrive - CBS - Copenhagen Business School/Desktop/Data Science/Exam/student.txt",header=TRUE, stringsAsFactors=TRUE)

str(mystudent)
## 'data.frame':    300 obs. of  25 variables:
##  $ sex       : Factor w/ 2 levels "F","M": 1 1 1 1 1 2 2 1 2 2 ...
##  $ age       : int  18 17 15 15 16 16 16 17 15 15 ...
##  $ address   : Factor w/ 2 levels "R","U": 2 2 2 2 2 2 2 2 2 2 ...
##  $ famsize   : Factor w/ 2 levels "GT3","LE3": 1 1 2 1 1 2 2 1 2 1 ...
##  $ Pstatus   : Factor w/ 2 levels "A","T": 1 2 2 2 2 2 2 1 1 2 ...
##  $ Medu      : int  4 1 1 4 3 4 2 4 3 3 ...
##  $ Fedu      : int  4 1 1 2 3 3 2 4 2 4 ...
##  $ traveltime: int  2 1 1 1 1 1 1 2 1 1 ...
##  $ studytime : int  2 2 2 3 2 2 2 2 2 2 ...
##  $ failures  : int  0 0 3 0 0 0 0 0 0 0 ...
##  $ schoolsup : Factor w/ 2 levels "no","yes": 2 1 2 1 1 1 1 2 1 1 ...
##  $ famsup    : Factor w/ 2 levels "no","yes": 1 2 1 2 2 2 1 2 2 2 ...
##  $ paid      : Factor w/ 2 levels "no","yes": 1 1 2 2 2 2 1 1 2 2 ...
##  $ activities: Factor w/ 2 levels "no","yes": 1 1 1 2 1 2 1 1 1 2 ...
##  $ nursery   : Factor w/ 2 levels "no","yes": 2 1 2 2 2 2 2 2 2 2 ...
##  $ higher    : Factor w/ 2 levels "no","yes": 2 2 2 2 2 2 2 2 2 2 ...
##  $ romantic  : Factor w/ 2 levels "no","yes": 1 1 1 2 1 1 1 1 1 1 ...
##  $ famrel    : int  4 5 4 3 4 5 4 4 4 5 ...
##  $ freetime  : int  3 3 3 2 3 4 4 1 2 5 ...
##  $ goout     : int  4 3 2 2 2 2 4 4 2 1 ...
##  $ Dalc      : int  1 1 2 1 1 1 1 1 1 1 ...
##  $ Walc      : int  1 1 3 1 2 2 1 1 1 1 ...
##  $ health    : int  3 3 3 5 5 5 3 1 1 5 ...
##  $ absences  : int  6 4 10 2 4 10 0 6 0 0 ...
##  $ class     : Factor w/ 2 levels "High","Low": 2 2 2 1 2 1 2 2 1 1 ...

Convert categorical columns into dummy variables and the target variable ‘class’ to a factor

library(fastDummies)

MystudentX <- mystudent[,-25] 

MystudentXD <- dummy_cols(MystudentX, select_columns=c("sex","address","famsize","Pstatus","schoolsup","famsup","paid","activities","nursery","higher","romantic"),remove_selected_columns = TRUE)

class <- as.factor(mystudent$class)

mystudentHD <-  cbind(MystudentXD, class) 

str(mystudentHD)
## 'data.frame':    300 obs. of  36 variables:
##  $ age           : int  18 17 15 15 16 16 16 17 15 15 ...
##  $ Medu          : int  4 1 1 4 3 4 2 4 3 3 ...
##  $ Fedu          : int  4 1 1 2 3 3 2 4 2 4 ...
##  $ traveltime    : int  2 1 1 1 1 1 1 2 1 1 ...
##  $ studytime     : int  2 2 2 3 2 2 2 2 2 2 ...
##  $ failures      : int  0 0 3 0 0 0 0 0 0 0 ...
##  $ famrel        : int  4 5 4 3 4 5 4 4 4 5 ...
##  $ freetime      : int  3 3 3 2 3 4 4 1 2 5 ...
##  $ goout         : int  4 3 2 2 2 2 4 4 2 1 ...
##  $ Dalc          : int  1 1 2 1 1 1 1 1 1 1 ...
##  $ Walc          : int  1 1 3 1 2 2 1 1 1 1 ...
##  $ health        : int  3 3 3 5 5 5 3 1 1 5 ...
##  $ absences      : int  6 4 10 2 4 10 0 6 0 0 ...
##  $ sex_F         : int  1 1 1 1 1 0 0 1 0 0 ...
##  $ sex_M         : int  0 0 0 0 0 1 1 0 1 1 ...
##  $ address_R     : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ address_U     : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ famsize_GT3   : int  1 1 0 1 1 0 0 1 0 1 ...
##  $ famsize_LE3   : int  0 0 1 0 0 1 1 0 1 0 ...
##  $ Pstatus_A     : int  1 0 0 0 0 0 0 1 1 0 ...
##  $ Pstatus_T     : int  0 1 1 1 1 1 1 0 0 1 ...
##  $ schoolsup_no  : int  0 1 0 1 1 1 1 0 1 1 ...
##  $ schoolsup_yes : int  1 0 1 0 0 0 0 1 0 0 ...
##  $ famsup_no     : int  1 0 1 0 0 0 1 0 0 0 ...
##  $ famsup_yes    : int  0 1 0 1 1 1 0 1 1 1 ...
##  $ paid_no       : int  1 1 0 0 0 0 1 1 0 0 ...
##  $ paid_yes      : int  0 0 1 1 1 1 0 0 1 1 ...
##  $ activities_no : int  1 1 1 0 1 0 1 1 1 0 ...
##  $ activities_yes: int  0 0 0 1 0 1 0 0 0 1 ...
##  $ nursery_no    : int  0 1 0 0 0 0 0 0 0 0 ...
##  $ nursery_yes   : int  1 0 1 1 1 1 1 1 1 1 ...
##  $ higher_no     : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ higher_yes    : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ romantic_no   : int  1 1 1 0 1 1 1 1 1 1 ...
##  $ romantic_yes  : int  0 0 0 1 0 0 0 0 0 0 ...
##  $ class         : Factor w/ 2 levels "High","Low": 2 2 2 1 2 1 2 2 1 1 ...
mystudentNorm<- mystudentHD

summary(mystudentNorm)
##       age            Medu            Fedu        traveltime      studytime    
##  Min.   :15.0   Min.   :0.000   Min.   :0.00   Min.   :1.000   Min.   :1.000  
##  1st Qu.:15.0   1st Qu.:2.000   1st Qu.:2.00   1st Qu.:1.000   1st Qu.:1.000  
##  Median :16.0   Median :3.000   Median :3.00   Median :1.000   Median :2.000  
##  Mean   :16.3   Mean   :2.807   Mean   :2.54   Mean   :1.407   Mean   :1.997  
##  3rd Qu.:17.0   3rd Qu.:4.000   3rd Qu.:3.00   3rd Qu.:2.000   3rd Qu.:2.000  
##  Max.   :22.0   Max.   :4.000   Max.   :4.00   Max.   :4.000   Max.   :4.000  
##     failures        famrel         freetime         goout            Dalc      
##  Min.   :0.00   Min.   :1.000   Min.   :1.000   Min.   :1.000   Min.   :1.000  
##  1st Qu.:0.00   1st Qu.:3.750   1st Qu.:3.000   1st Qu.:2.000   1st Qu.:1.000  
##  Median :0.00   Median :4.000   Median :3.000   Median :3.000   Median :1.000  
##  Mean   :0.32   Mean   :3.933   Mean   :3.223   Mean   :3.113   Mean   :1.443  
##  3rd Qu.:0.00   3rd Qu.:5.000   3rd Qu.:4.000   3rd Qu.:4.000   3rd Qu.:2.000  
##  Max.   :3.00   Max.   :5.000   Max.   :5.000   Max.   :5.000   Max.   :5.000  
##       Walc           health         absences          sex_F       
##  Min.   :1.000   Min.   :1.000   Min.   : 0.000   Min.   :0.0000  
##  1st Qu.:1.000   1st Qu.:3.000   1st Qu.: 0.000   1st Qu.:0.0000  
##  Median :2.000   Median :4.000   Median : 4.000   Median :0.0000  
##  Mean   :2.277   Mean   :3.573   Mean   : 5.757   Mean   :0.4933  
##  3rd Qu.:3.000   3rd Qu.:5.000   3rd Qu.: 8.000   3rd Qu.:1.0000  
##  Max.   :5.000   Max.   :5.000   Max.   :75.000   Max.   :1.0000  
##      sex_M          address_R        address_U       famsize_GT3    
##  Min.   :0.0000   Min.   :0.0000   Min.   :0.0000   Min.   :0.0000  
##  1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:1.0000   1st Qu.:0.0000  
##  Median :1.0000   Median :0.0000   Median :1.0000   Median :1.0000  
##  Mean   :0.5067   Mean   :0.1767   Mean   :0.8233   Mean   :0.7133  
##  3rd Qu.:1.0000   3rd Qu.:0.0000   3rd Qu.:1.0000   3rd Qu.:1.0000  
##  Max.   :1.0000   Max.   :1.0000   Max.   :1.0000   Max.   :1.0000  
##   famsize_LE3       Pstatus_A        Pstatus_T       schoolsup_no   
##  Min.   :0.0000   Min.   :0.0000   Min.   :0.0000   Min.   :0.0000  
##  1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:1.0000   1st Qu.:1.0000  
##  Median :0.0000   Median :0.0000   Median :1.0000   Median :1.0000  
##  Mean   :0.2867   Mean   :0.1067   Mean   :0.8933   Mean   :0.8333  
##  3rd Qu.:1.0000   3rd Qu.:0.0000   3rd Qu.:1.0000   3rd Qu.:1.0000  
##  Max.   :1.0000   Max.   :1.0000   Max.   :1.0000   Max.   :1.0000  
##  schoolsup_yes      famsup_no        famsup_yes        paid_no    
##  Min.   :0.0000   Min.   :0.0000   Min.   :0.0000   Min.   :0.00  
##  1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:0.00  
##  Median :0.0000   Median :0.0000   Median :1.0000   Median :1.00  
##  Mean   :0.1667   Mean   :0.3533   Mean   :0.6467   Mean   :0.53  
##  3rd Qu.:0.0000   3rd Qu.:1.0000   3rd Qu.:1.0000   3rd Qu.:1.00  
##  Max.   :1.0000   Max.   :1.0000   Max.   :1.0000   Max.   :1.00  
##     paid_yes    activities_no    activities_yes     nursery_no    
##  Min.   :0.00   Min.   :0.0000   Min.   :0.0000   Min.   :0.0000  
##  1st Qu.:0.00   1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:0.0000  
##  Median :0.00   Median :0.0000   Median :1.0000   Median :0.0000  
##  Mean   :0.47   Mean   :0.4567   Mean   :0.5433   Mean   :0.1867  
##  3rd Qu.:1.00   3rd Qu.:1.0000   3rd Qu.:1.0000   3rd Qu.:0.0000  
##  Max.   :1.00   Max.   :1.0000   Max.   :1.0000   Max.   :1.0000  
##   nursery_yes       higher_no         higher_yes      romantic_no 
##  Min.   :0.0000   Min.   :0.00000   Min.   :0.0000   Min.   :0.0  
##  1st Qu.:1.0000   1st Qu.:0.00000   1st Qu.:1.0000   1st Qu.:0.0  
##  Median :1.0000   Median :0.00000   Median :1.0000   Median :1.0  
##  Mean   :0.8133   Mean   :0.05333   Mean   :0.9467   Mean   :0.7  
##  3rd Qu.:1.0000   3rd Qu.:0.00000   3rd Qu.:1.0000   3rd Qu.:1.0  
##  Max.   :1.0000   Max.   :1.00000   Max.   :1.0000   Max.   :1.0  
##   romantic_yes  class    
##  Min.   :0.0   High:126  
##  1st Qu.:0.0   Low :174  
##  Median :0.0             
##  Mean   :0.3             
##  3rd Qu.:1.0             
##  Max.   :1.0

Normalize numeric columns using min-max scaling (scaling to a range of 0-1)

for(i in 1:length(colnames(mystudentHD))-1) {
  if(class(mystudentHD[,i]) == "numeric" || class(mystudentHD[,i]) == "integer") {
    minimum<-min(mystudentHD[,i])
    maximum<-max(mystudentHD[,i])
    mystudentNorm[,i] <- as.vector(scale(mystudentHD[,i],center=minimum,scale=maximum-minimum)) 
  
    }
}

Build a logistic regression model using the normalized data

set.seed(100)
LRmodel <- glm(class~ ., family=binomial, mystudentNorm)

Use stepwise regression to refine the model, removing less significant variables

summary(LRmodelR)
## 
## Call:
## glm(formula = class ~ age + Medu + traveltime + failures + goout + 
##     health + absences + sex_F + famsize_GT3 + schoolsup_no + 
##     higher_no, family = binomial, data = mystudentNorm)
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept)   -0.1462     0.7223  -0.202 0.839639    
## age            1.6161     1.0081   1.603 0.108909    
## Medu          -0.8560     0.5219  -1.640 0.100968    
## traveltime     0.9560     0.6532   1.463 0.143334    
## failures       3.1946     1.0590   3.017 0.002556 ** 
## goout          1.0607     0.5177   2.049 0.040466 *  
## health         0.5637     0.3973   1.419 0.155938    
## absences       3.1961     1.7567   1.819 0.068849 .  
## sex_F          0.5667     0.2786   2.034 0.041929 *  
## famsize_GT3    0.5175     0.3019   1.714 0.086468 .  
## schoolsup_no  -1.6007     0.4326  -3.700 0.000216 ***
## higher_no      1.6403     1.1422   1.436 0.150994    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 408.18  on 299  degrees of freedom
## Residual deviance: 325.07  on 288  degrees of freedom
## AIC: 349.07
## 
## Number of Fisher Scoring iterations: 5

Display the model summary in a table format

library(knitr)
library(broom)

log_likelihood <- logLik(LRmodelR)
print(log_likelihood)
## 'log Lik.' -162.5334 (df=12)
model_results <- tidy(LRmodelR)

kable(model_results, caption = "Refined Logistic Regression Model Summary", digits = 4)
Refined Logistic Regression Model Summary
term estimate std.error statistic p.value
(Intercept) -0.1462 0.7223 -0.2024 0.8396
age 1.6161 1.0081 1.6031 0.1089
Medu -0.8560 0.5219 -1.6402 0.1010
traveltime 0.9560 0.6532 1.4635 0.1433
failures 3.1946 1.0590 3.0166 0.0026
goout 1.0607 0.5177 2.0490 0.0405
health 0.5637 0.3973 1.4189 0.1559
absences 3.1961 1.7567 1.8194 0.0688
sex_F 0.5667 0.2786 2.0342 0.0419
famsize_GT3 0.5175 0.3019 1.7143 0.0865
schoolsup_no -1.6007 0.4326 -3.7000 0.0002
higher_no 1.6403 1.1422 1.4360 0.1510
myprediction <- predict(LRmodelR, mystudentNorm[,-36], type="response")
pred_class <- ifelse(myprediction > 0.5, "Low", "High")
class_ificationtable <- table(pred_class,mystudentNorm[,36])
acctestfinalmodel_logo <- sum(diag(class_ificationtable))/sum(class_ificationtable)

print(acctestfinalmodel_logo) # accuracy is 73.67%
## [1] 0.7366667

(ii) Use Support Vector Machines to model the student data

library(e1071)

summary(mystudentNorm$class)
## High  Low 
##  126  174
classtable <- table(mystudentNorm$class)

Run SVM with the RBF kernel and find the best value of model

set.seed(100)

tunedmodelRBF <- tune.svm(class~.,data = mystudentNorm,cost=2^(-12:12),gamma=2^(-12:12), cross=10)

bestgamma <- tunedmodelRBF$best.parameters[[1]]

print(bestgamma) # the best gamma is 0.00390625
## [1] 0.00390625
bestcost <- tunedmodelRBF$best.parameters[[2]]

print(bestcost) # the best cost is 256
## [1] 256

use the best value of the parameters to build a model

set.seed(100)

finalmodelRBF  <- svm(mystudentNorm$class ~ ., data = mystudentNorm, cost = bestcost, gamma=bestgamma)

myprediction <- predict(finalmodelRBF, mystudentNorm[,-36])
classificationtable <- table(myprediction,mystudentNorm[,36])
acctestfinalmodelRBF <- sum(diag(classificationtable))/sum(classificationtable)

print(acctestfinalmodelRBF)  # acctestfinalmodelRBF is roughly 98.3%
## [1] 0.9833333

use linear kernel to build SVM model

set.seed(100)
tunedmodellinear <- tune.svm(class~.,data = mystudentNorm, cost=2^(-5:5), kernel="linear")

summary(tunedmodellinear)
## 
## Parameter tuning of 'svm':
## 
## - sampling method: 10-fold cross validation 
## 
## - best parameters:
##  cost
##    32
## 
## - best performance: 0.3433333 
## 
## - Detailed performance results:
##        cost     error dispersion
## 1   0.03125 0.3633333 0.09355793
## 2   0.06250 0.3600000 0.07166451
## 3   0.12500 0.3500000 0.08642416
## 4   0.25000 0.3466667 0.08916623
## 5   0.50000 0.3600000 0.08285045
## 6   1.00000 0.3500000 0.08351831
## 7   2.00000 0.3500000 0.08642416
## 8   4.00000 0.3466667 0.08777075
## 9   8.00000 0.3466667 0.08777075
## 10 16.00000 0.3466667 0.08777075
## 11 32.00000 0.3433333 0.09033545

use the best value to build SVM model with linear kernels

best_cost <- tunedmodellinear$best.parameters[[1]]

print(best_cost) # the best cost is 32 
## [1] 32
finalmodellinear <- svm(class~.,data=mystudentNorm,cost=best_cost,kernel="linear")
my_prediction <- predict(finalmodellinear, mystudentNorm[,-36])
classificationtable <- table(my_prediction,mystudentNorm[,36])
acctestfinalmodellinear <- sum(diag(classificationtable))/sum(classificationtable)

print(acctestfinalmodellinear) # acctestfinalmodellinear is roughly 73.67%
## [1] 0.7366667

display comparison with RBF and linear kernel

svm_results <- data.frame(
  Model = c("RBF Kernel SVM", "Linear Kernel SVM"),
  Best_Cost = c(256, 32),
  Best_Gamma = c(0.00390625, "N/A"),
  Training_Data_Size = c(300, 300),
  Test_Data_Size = c("Remaining", "Remaining"),
  Test_Accuracy = c("98.3%", "73.67%")
)

print(svm_results)
##               Model Best_Cost Best_Gamma Training_Data_Size Test_Data_Size
## 1    RBF Kernel SVM       256 0.00390625                300      Remaining
## 2 Linear Kernel SVM        32        N/A                300      Remaining
##   Test_Accuracy
## 1         98.3%
## 2        73.67%

(iii) Use Classification Trees to model the student data.

use to build a tree using corssvalidation function

library(tree)


set.seed(100)


mytree <- tree(class~.,mystudent)
plot(mytree)
text(mytree)

summary(mytree)
## 
## Classification tree:
## tree(formula = class ~ ., data = mystudent)
## Variables actually used in tree construction:
##  [1] "failures"  "schoolsup" "age"       "health"    "Walc"      "sex"      
##  [7] "Medu"      "goout"     "Fedu"      "absences"  "famrel"    "famsup"   
## [13] "freetime"  "studytime" "famsize"   "higher"   
## Number of terminal nodes:  25 
## Residual mean deviance:  0.7814 = 214.9 / 275 
## Misclassification error rate: 0.2067 = 62 / 300
print(mytree)
## node), split, n, deviance, yval, (yprob)
##       * denotes terminal node
## 
##   1) root 300 408.200 Low ( 0.4200 0.5800 )  
##     2) failures < 1.5 273 376.800 Low ( 0.4615 0.5385 )  
##       4) schoolsup: no 227 314.500 High ( 0.5154 0.4846 )  
##         8) age < 16.5 134 179.900 High ( 0.6045 0.3955 )  
##          16) health < 1.5 18  12.560 High ( 0.8889 0.1111 )  
##            32) Walc < 1.5 7   8.376 High ( 0.7143 0.2857 ) *
##            33) Walc > 1.5 11   0.000 High ( 1.0000 0.0000 ) *
##          17) health > 1.5 116 159.100 High ( 0.5603 0.4397 )  
##            34) sex: F 44  58.700 Low ( 0.3864 0.6136 )  
##              68) Medu < 3.5 28  29.100 Low ( 0.2143 0.7857 )  
##               136) goout < 3.5 18  22.910 Low ( 0.3333 0.6667 )  
##                 272) Medu < 1.5 5   0.000 Low ( 0.0000 1.0000 ) *
##                 273) Medu > 1.5 13  17.940 Low ( 0.4615 0.5385 ) *
##               137) goout > 3.5 10   0.000 Low ( 0.0000 1.0000 ) *
##              69) Medu > 3.5 16  19.870 High ( 0.6875 0.3125 )  
##               138) Fedu < 3.5 5   0.000 High ( 1.0000 0.0000 ) *
##               139) Fedu > 3.5 11  15.160 High ( 0.5455 0.4545 ) *
##            35) sex: M 72  91.660 High ( 0.6667 0.3333 )  
##              70) absences < 7.5 56  62.980 High ( 0.7500 0.2500 )  
##               140) famrel < 3.5 9  12.370 Low ( 0.4444 0.5556 ) *
##               141) famrel > 3.5 47  45.910 High ( 0.8085 0.1915 )  
##                 282) goout < 3.5 35  24.880 High ( 0.8857 0.1143 )  
##                   564) age < 15.5 19  19.560 High ( 0.7895 0.2105 ) *
##                   565) age > 15.5 16   0.000 High ( 1.0000 0.0000 ) *
##                 283) goout > 3.5 12  16.300 High ( 0.5833 0.4167 )  
##                   566) famsup: no 7   8.376 Low ( 0.2857 0.7143 ) *
##                   567) famsup: yes 5   0.000 High ( 1.0000 0.0000 ) *
##              71) absences > 7.5 16  21.170 Low ( 0.3750 0.6250 )  
##               142) freetime < 3.5 5   0.000 Low ( 0.0000 1.0000 ) *
##               143) freetime > 3.5 11  15.160 High ( 0.5455 0.4545 )  
##                 286) studytime < 1.5 5   0.000 High ( 1.0000 0.0000 ) *
##                 287) studytime > 1.5 6   5.407 Low ( 0.1667 0.8333 ) *
##         9) age > 16.5 93 124.100 Low ( 0.3871 0.6129 )  
##          18) famsize: GT3 61  74.010 Low ( 0.2951 0.7049 )  
##            36) higher: no 6   0.000 Low ( 0.0000 1.0000 ) *
##            37) higher: yes 55  69.550 Low ( 0.3273 0.6727 )  
##              74) health < 2.5 12  16.300 High ( 0.5833 0.4167 ) *
##              75) health > 2.5 43  48.900 Low ( 0.2558 0.7442 )  
##               150) absences < 10.5 34  42.810 Low ( 0.3235 0.6765 ) *
##               151) absences > 10.5 9   0.000 Low ( 0.0000 1.0000 ) *
##          19) famsize: LE3 32  43.860 High ( 0.5625 0.4375 ) *
##       5) schoolsup: yes 46  45.480 Low ( 0.1957 0.8043 )  
##        10) studytime < 1.5 9  12.370 High ( 0.5556 0.4444 ) *
##        11) studytime > 1.5 37  25.350 Low ( 0.1081 0.8919 )  
##          22) Medu < 2.5 16  17.990 Low ( 0.2500 0.7500 )  
##            44) age < 15.5 7   0.000 Low ( 0.0000 1.0000 ) *
##            45) age > 15.5 9  12.370 Low ( 0.4444 0.5556 ) *
##          23) Medu > 2.5 21   0.000 Low ( 0.0000 1.0000 ) *
##     3) failures > 1.5 27   0.000 Low ( 0.0000 1.0000 ) *

use prune tree to improve the tree

set.seed(100)
mycrossval <- cv.tree(mytree,FUN=prune.tree, K=10)
mybestsize <- mycrossval$size[which(mycrossval$dev==min(mycrossval$dev))]
myprunedtree <- prune.tree(mytree,best =mybestsize[1] )

plot(myprunedtree)
text(myprunedtree)

print(myprunedtree)
## node), split, n, deviance, yval, (yprob)
##       * denotes terminal node
## 
## 1) root 300 408.20 Low ( 0.4200 0.5800 )  
##   2) failures < 1.5 273 376.80 Low ( 0.4615 0.5385 )  
##     4) schoolsup: no 227 314.50 High ( 0.5154 0.4846 ) *
##     5) schoolsup: yes 46  45.48 Low ( 0.1957 0.8043 ) *
##   3) failures > 1.5 27   0.00 Low ( 0.0000 1.0000 ) *
print(paste("Best tree size:", mybestsize))
## [1] "Best tree size: 3"
summary(myprunedtree)
## 
## Classification tree:
## snip.tree(tree = mytree, nodes = 5:4)
## Variables actually used in tree construction:
## [1] "failures"  "schoolsup"
## Number of terminal nodes:  3 
## Residual mean deviance:  1.212 = 359.9 / 297 
## Misclassification error rate: 0.3967 = 119 / 300

test the classfication accuracy

myprediction <- predict(myprunedtree, mystudent[,-25], type='class')
classificationtable <- table(myprediction,mystudent[,25])
acctesttree <- sum(diag(classificationtable))/sum(classificationtable)

print(acctesttree) # accuracy is 60.3%
## [1] 0.6033333
myprediction <- predict(mytree, mystudent[,-25], type='class')
classificationtable <- table(myprediction,mystudent[,25])
acctesttree <- sum(diag(classificationtable))/sum(classificationtable)

print(acctesttree) # accuracy is 79.3%
## [1] 0.7933333

(iv) Use Random Forests to model the student data

set.seed(100)

library(randomForest)
## randomForest 4.7-1.2
## Type rfNews() to see new features/changes/bug fixes.
myrandomF <- randomForest(class~.,mystudent,ntree=500,mtry=4,maxnodes = 10,importance=TRUE)

importance(myrandomF)
##                  High        Low MeanDecreaseAccuracy MeanDecreaseGini
## sex         2.8443820  6.3310603            5.8859759        1.3460110
## age         2.1804090  4.0336971            4.2179420        1.6416075
## address     1.8073930  0.1099250            1.1500769        0.4481816
## famsize    -0.0143159  1.2248727            1.0018596        0.7131813
## Pstatus    -0.5154990  0.2435744           -0.1259820        0.2033943
## Medu        7.8013376  4.7482709            7.9937104        3.0204271
## Fedu        3.7912720  0.4001795            3.0400106        1.4715672
## traveltime  1.7560147 -0.9808993            0.4925705        0.8404057
## studytime   0.8572077  2.1207016            2.1385920        1.3857259
## failures   13.1205595  6.4272309           11.9105635        5.2325331
## schoolsup  10.8588638  8.4050813           11.1950042        3.2351823
## famsup      2.1265140 -0.2537696            1.1749508        0.5249845
## paid       -2.4058029  0.9456723           -0.9200125        0.3373135
## activities -1.8786053 -1.1653350           -1.9130994        0.3649614
## nursery    -1.3164991 -0.4374008           -1.2187921        0.2743834
## higher      4.6744264  2.0428411            3.9847592        0.9389124
## romantic   -0.8269225 -1.2616804           -1.6647198        0.2762145
## famrel      0.8328000  0.3217987            0.7295389        1.1525525
## freetime    1.4713316  0.6131002            1.1776744        1.5872205
## goout       4.7974577  1.6363094            4.3989186        2.8273853
## Dalc        0.7215733  1.1639531            1.2006130        0.7442742
## Walc        1.2362586  0.4002818            1.1944258        1.1152735
## health      3.8723231  0.3863323            2.5135161        1.3613627
## absences    4.2037683  1.4128226            3.4769732        2.4645220
varImpPlot(myrandomF)

print(myrandomF) # OOB is 33%
## 
## Call:
##  randomForest(formula = class ~ ., data = mystudent, ntree = 500,      mtry = 4, maxnodes = 10, importance = TRUE) 
##                Type of random forest: classification
##                      Number of trees: 500
## No. of variables tried at each split: 4
## 
##         OOB estimate of  error rate: 33%
## Confusion matrix:
##      High Low class.error
## High   53  73   0.5793651
## Low    26 148   0.1494253
summary(myrandomF)
##                 Length Class  Mode     
## call               7   -none- call     
## type               1   -none- character
## predicted        300   factor numeric  
## err.rate        1500   -none- numeric  
## confusion          6   -none- numeric  
## votes            600   matrix numeric  
## oob.times        300   -none- numeric  
## classes            2   -none- character
## importance        96   -none- numeric  
## importanceSD      72   -none- numeric  
## localImportance    0   -none- NULL     
## proximity          0   -none- NULL     
## ntree              1   -none- numeric  
## mtry               1   -none- numeric  
## forest            14   -none- list     
## y                300   factor numeric  
## test               0   -none- NULL     
## inbag              0   -none- NULL     
## terms              3   terms  call

accuracy of the initial random forest

mydata_shuffled <- mystudent[sample(nrow(mystudent)), ]


my_prediction_initial <- predict(myrandomF, mystudent[,-25], type='class')
classification_table <- table(my_prediction_initial,mystudent[,25])
acctestrandomforest <- sum(diag(classification_table))/sum(classification_table)

print(acctestrandomforest) # accuracy is 74.3%
## [1] 0.7433333
print(classification_table)
##                      
## my_prediction_initial High Low
##                  High   67  18
##                  Low    59 156

Tuning the best model of random forest

library(dplyr)
## 
## Vedhæfter pakke: 'dplyr'
## Det følgende objekt er maskeret fra 'package:randomForest':
## 
##     combine
## De følgende objekter er maskerede fra 'package:stats':
## 
##     filter, lag
## De følgende objekter er maskerede fra 'package:base':
## 
##     intersect, setdiff, setequal, union
set.seed(100)

my_final_RF <- tuneRF(
  x = select(mystudent, -class), 
  y = mystudent$class, 
  mtryStart = 5,
  ntreeTry = 500,
  trace = TRUE, 
  maxnodes = 10,
  plot = TRUE,  
  doBest = TRUE 
)
## mtry = 5  OOB error = 32% 
## Searching left ...
## mtry = 3     OOB error = 33.33% 
## -0.04166667 0.05 
## Searching right ...
## mtry = 10    OOB error = 32.67% 
## -0.02083333 0.05

importance(my_final_RF)
##            MeanDecreaseGini
## sex               1.5203400
## age               1.6609903
## address           0.4566299
## famsize           0.6700160
## Pstatus           0.1996208
## Medu              2.5725630
## Fedu              1.6372223
## traveltime        0.8247885
## studytime         1.3200032
## failures          6.0728532
## schoolsup         3.6390748
## famsup            0.6917576
## paid              0.3296157
## activities        0.3686565
## nursery           0.3089937
## higher            0.9079467
## romantic          0.3545139
## famrel            1.4105671
## freetime          1.8741188
## goout             2.6380879
## Dalc              0.6424972
## Walc              1.1177871
## health            1.4095287
## absences          2.5894864
print(my_final_RF)  
## 
## Call:
##  randomForest(x = x, y = y, mtry = res[which.min(res[, 2]), 1],      maxnodes = 10) 
##                Type of random forest: classification
##                      Number of trees: 500
## No. of variables tried at each split: 5
## 
##         OOB estimate of  error rate: 35%
## Confusion matrix:
##      High Low class.error
## High   51  75   0.5952381
## Low    30 144   0.1724138
varImpPlot(my_final_RF)

my_prediction_final <- predict(my_final_RF, mystudent[,-25], type='class')
classification_table_final <- table(my_prediction_final,mystudent[,25])
acc_testrandomforest <- sum(diag(classification_table_final))/sum(classification_table_final)

print(acc_testrandomforest) # accuracy is 75%
## [1] 0.75
print(classification_table_final)
##                    
## my_prediction_final High Low
##                High   71  20
##                Low    55 154

(v) Use k-Nearest Neighbors to model the student data.

library('FNN')

set.seed(100)
myystudent <- mystudentNorm[,36]
myxstudent <- mystudentNorm[,-36]

tune the value of K using leave-one-out crossvalidation

  bestk=0
bestaccuracy=0
accuracy <- NULL
for(auxk in 1:15){
  mycv <- knn.cv(train= myxstudent, cl= myystudent, k=auxk)
  mytable <- table (mycv, myystudent)
  accuracy[auxk] <- sum(diag(mytable))/sum(mytable)
  if(bestaccuracy< accuracy[auxk]) bestk=auxk
  if(bestaccuracy< accuracy[auxk]) bestaccuracy = accuracy[auxk]}

plot(accuracy, xlab="K", ylab="Crossvalidated Accuracy")

print(paste("The best k value is:", bestk)) # with seed=100 we get the best value of K is 3 
## [1] "The best k value is: 3"
print(accuracy)
##  [1] 0.5766667 0.5633333 0.6366667 0.6100000 0.6266667 0.5800000 0.6166667
##  [8] 0.5866667 0.5600000 0.5766667 0.5833333 0.5833333 0.5833333 0.5666667
## [15] 0.5800000

use the best K to classfify the testing dataset

myk3nn <- knn(train= myxstudent, test= myxstudent, cl= myystudent, k=3)
myaccuracytablek3nn <- table (myk3nn,myystudent)
mytestingaccuracyk3nn <- sum(diag(myaccuracytablek3nn))/sum(myaccuracytablek3nn)

print(mytestingaccuracyk3nn) # the accuracy is 82.3%
## [1] 0.8233333
print(myaccuracytablek3nn)
##       myystudent
## myk3nn High Low
##   High   95  22
##   Low    31 152

(vi) Discuss how the models built in Questions 1(i)-1(v) compare when predicting the class membership of the objects in studentadditional.txt.

accuracy of Logistic regression - based on dataset “studentadditional”

set.seed(100)

myadditional <- read.table("C:/Users/Allen/OneDrive - CBS - Copenhagen Business School/Desktop/Data Science/Exam/studentadditional.txt",header=TRUE, stringsAsFactors=TRUE)

str(myadditional)
## 'data.frame':    95 obs. of  25 variables:
##  $ sex       : Factor w/ 2 levels "F","M": 1 2 1 1 2 1 2 2 2 1 ...
##  $ age       : int  18 17 17 17 19 18 20 19 19 19 ...
##  $ address   : Factor w/ 2 levels "R","U": 2 2 2 2 2 2 2 2 1 2 ...
##  $ famsize   : Factor w/ 2 levels "GT3","LE3": 2 2 1 1 1 1 1 1 1 2 ...
##  $ Pstatus   : Factor w/ 2 levels "A","T": 1 2 2 2 2 2 1 2 2 2 ...
##  $ Medu      : int  4 4 4 3 3 2 3 4 3 1 ...
##  $ Fedu      : int  4 4 2 2 3 4 2 4 3 1 ...
##  $ traveltime: int  1 2 2 1 1 1 1 2 1 1 ...
##  $ studytime : int  2 1 3 4 2 2 1 1 2 2 ...
##  $ failures  : int  0 0 0 0 1 1 0 1 1 1 ...
##  $ schoolsup : Factor w/ 2 levels "no","yes": 1 1 1 1 1 1 1 1 1 2 ...
##  $ famsup    : Factor w/ 2 levels "no","yes": 2 1 2 2 2 2 1 2 1 2 ...
##  $ paid      : Factor w/ 2 levels "no","yes": 1 2 2 2 1 2 1 2 1 1 ...
##  $ activities: Factor w/ 2 levels "no","yes": 1 1 1 2 2 2 2 1 2 2 ...
##  $ nursery   : Factor w/ 2 levels "no","yes": 2 2 2 1 2 2 2 2 2 1 ...
##  $ higher    : Factor w/ 2 levels "no","yes": 2 2 2 2 2 2 2 2 2 2 ...
##  $ romantic  : Factor w/ 2 levels "no","yes": 2 1 1 1 2 1 1 2 2 1 ...
##  $ famrel    : int  4 4 4 5 4 4 5 4 4 4 ...
##  $ freetime  : int  2 1 3 2 4 4 5 3 5 4 ...
##  $ goout     : int  4 1 3 2 4 3 3 4 3 3 ...
##  $ Dalc      : int  1 2 1 1 1 1 1 1 1 1 ...
##  $ Walc      : int  1 2 1 2 1 1 1 1 2 3 ...
##  $ health    : int  4 5 3 5 3 3 5 4 5 3 ...
##  $ absences  : int  14 0 0 0 20 8 0 38 0 18 ...
##  $ class     : Factor w/ 2 levels "High","Low": 2 2 1 1 1 1 1 2 1 2 ...
MyadditionalX <- myadditional[,-25] 

MyadditionalXD <- dummy_cols(MyadditionalX, select_columns=c("sex","address","famsize","Pstatus","schoolsup","famsup","paid","activities","nursery","higher","romantic"),remove_selected_columns = TRUE)

class <- as.factor(myadditional$class)

myadditionalHD <-  cbind(MyadditionalXD, class) 

str(myadditionalHD)
## 'data.frame':    95 obs. of  36 variables:
##  $ age           : int  18 17 17 17 19 18 20 19 19 19 ...
##  $ Medu          : int  4 4 4 3 3 2 3 4 3 1 ...
##  $ Fedu          : int  4 4 2 2 3 4 2 4 3 1 ...
##  $ traveltime    : int  1 2 2 1 1 1 1 2 1 1 ...
##  $ studytime     : int  2 1 3 4 2 2 1 1 2 2 ...
##  $ failures      : int  0 0 0 0 1 1 0 1 1 1 ...
##  $ famrel        : int  4 4 4 5 4 4 5 4 4 4 ...
##  $ freetime      : int  2 1 3 2 4 4 5 3 5 4 ...
##  $ goout         : int  4 1 3 2 4 3 3 4 3 3 ...
##  $ Dalc          : int  1 2 1 1 1 1 1 1 1 1 ...
##  $ Walc          : int  1 2 1 2 1 1 1 1 2 3 ...
##  $ health        : int  4 5 3 5 3 3 5 4 5 3 ...
##  $ absences      : int  14 0 0 0 20 8 0 38 0 18 ...
##  $ sex_F         : int  1 0 1 1 0 1 0 0 0 1 ...
##  $ sex_M         : int  0 1 0 0 1 0 1 1 1 0 ...
##  $ address_R     : int  0 0 0 0 0 0 0 0 1 0 ...
##  $ address_U     : int  1 1 1 1 1 1 1 1 0 1 ...
##  $ famsize_GT3   : int  0 0 1 1 1 1 1 1 1 0 ...
##  $ famsize_LE3   : int  1 1 0 0 0 0 0 0 0 1 ...
##  $ Pstatus_A     : int  1 0 0 0 0 0 1 0 0 0 ...
##  $ Pstatus_T     : int  0 1 1 1 1 1 0 1 1 1 ...
##  $ schoolsup_no  : int  1 1 1 1 1 1 1 1 1 0 ...
##  $ schoolsup_yes : int  0 0 0 0 0 0 0 0 0 1 ...
##  $ famsup_no     : int  0 1 0 0 0 0 1 0 1 0 ...
##  $ famsup_yes    : int  1 0 1 1 1 1 0 1 0 1 ...
##  $ paid_no       : int  1 0 0 0 1 0 1 0 1 1 ...
##  $ paid_yes      : int  0 1 1 1 0 1 0 1 0 0 ...
##  $ activities_no : int  1 1 1 0 0 0 0 1 0 0 ...
##  $ activities_yes: int  0 0 0 1 1 1 1 0 1 1 ...
##  $ nursery_no    : int  0 0 0 1 0 0 0 0 0 1 ...
##  $ nursery_yes   : int  1 1 1 0 1 1 1 1 1 0 ...
##  $ higher_no     : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ higher_yes    : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ romantic_no   : int  0 1 1 1 0 1 1 0 0 1 ...
##  $ romantic_yes  : int  1 0 0 0 1 0 0 1 1 0 ...
##  $ class         : Factor w/ 2 levels "High","Low": 2 2 1 1 1 1 1 2 1 2 ...
myadditionalNorm<- myadditionalHD

# Normalize numeric columns using min-max scaling fra  "mystudent" (scaling to a range of 0-1)

for(i in 1:length(colnames(myadditionalHD))-1) {
  if(class(myadditionalHD[,i]) == "numeric" || class(myadditionalHD[,i]) == "integer") {
    minimum<-min(mystudentHD[,i])
    maximum<-max(mystudentHD[,i])
    myadditionalNorm[,i] <- as.vector(scale(myadditionalHD[,i],center=minimum,scale=maximum-minimum)) 
    
  }
}

str(myadditionalNorm)
## 'data.frame':    95 obs. of  36 variables:
##  $ age           : num  0.429 0.286 0.286 0.286 0.571 ...
##  $ Medu          : num  1 1 1 0.75 0.75 0.5 0.75 1 0.75 0.25 ...
##  $ Fedu          : num  1 1 0.5 0.5 0.75 1 0.5 1 0.75 0.25 ...
##  $ traveltime    : num  0 0.333 0.333 0 0 ...
##  $ studytime     : num  0.333 0 0.667 1 0.333 ...
##  $ failures      : num  0 0 0 0 0.333 ...
##  $ famrel        : num  0.75 0.75 0.75 1 0.75 0.75 1 0.75 0.75 0.75 ...
##  $ freetime      : num  0.25 0 0.5 0.25 0.75 0.75 1 0.5 1 0.75 ...
##  $ goout         : num  0.75 0 0.5 0.25 0.75 0.5 0.5 0.75 0.5 0.5 ...
##  $ Dalc          : num  0 0.25 0 0 0 0 0 0 0 0 ...
##  $ Walc          : num  0 0.25 0 0.25 0 0 0 0 0.25 0.5 ...
##  $ health        : num  0.75 1 0.5 1 0.5 0.5 1 0.75 1 0.5 ...
##  $ absences      : num  0.187 0 0 0 0.267 ...
##  $ sex_F         : num  1 0 1 1 0 1 0 0 0 1 ...
##  $ sex_M         : num  0 1 0 0 1 0 1 1 1 0 ...
##  $ address_R     : num  0 0 0 0 0 0 0 0 1 0 ...
##  $ address_U     : num  1 1 1 1 1 1 1 1 0 1 ...
##  $ famsize_GT3   : num  0 0 1 1 1 1 1 1 1 0 ...
##  $ famsize_LE3   : num  1 1 0 0 0 0 0 0 0 1 ...
##  $ Pstatus_A     : num  1 0 0 0 0 0 1 0 0 0 ...
##  $ Pstatus_T     : num  0 1 1 1 1 1 0 1 1 1 ...
##  $ schoolsup_no  : num  1 1 1 1 1 1 1 1 1 0 ...
##  $ schoolsup_yes : num  0 0 0 0 0 0 0 0 0 1 ...
##  $ famsup_no     : num  0 1 0 0 0 0 1 0 1 0 ...
##  $ famsup_yes    : num  1 0 1 1 1 1 0 1 0 1 ...
##  $ paid_no       : num  1 0 0 0 1 0 1 0 1 1 ...
##  $ paid_yes      : num  0 1 1 1 0 1 0 1 0 0 ...
##  $ activities_no : num  1 1 1 0 0 0 0 1 0 0 ...
##  $ activities_yes: num  0 0 0 1 1 1 1 0 1 1 ...
##  $ nursery_no    : num  0 0 0 1 0 0 0 0 0 1 ...
##  $ nursery_yes   : num  1 1 1 0 1 1 1 1 1 0 ...
##  $ higher_no     : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ higher_yes    : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ romantic_no   : num  0 1 1 1 0 1 1 0 0 1 ...
##  $ romantic_yes  : num  1 0 0 0 1 0 0 1 1 0 ...
##  $ class         : Factor w/ 2 levels "High","Low": 2 2 1 1 1 1 1 2 1 2 ...
# use the additional data as test dataset to calculate the accuracy of the logistic model   

predicted_log <- predict(LRmodelR,myadditionalNorm[,-36], type ='response')
predicted_log <- ifelse(predicted_log > 0.5, "Low", "High")
confusionmatrix <- table(predicted_log,myadditionalNorm[,36])
accuracy <- sum(diag(confusionmatrix))/sum(confusionmatrix)

print(accuracy) # accuracy = 63.16%
## [1] 0.6315789
predicted_logg <- predict(LRmodel,myadditionalNorm[,-36], type ='response')
predicted_logg <- ifelse(predicted_logg > 0.5, "Low", "High")
confusionmatrix <- table(predicted_logg,myadditionalNorm[,36])
accuracy <- sum(diag(confusionmatrix))/sum(confusionmatrix)

print(accuracy) # accuracy = 58.94%
## [1] 0.5894737

accuracy of odel of SVM with linear and RBF kernel - based on dataset “studentadditional”

# RBF kernel of SVM
set.seed(100)
myprediction_SVM <- predict(finalmodelRBF,myadditionalNorm[,-36])
classificationtable_SVM <- table(myprediction_SVM, myadditionalNorm[,36])
acctestfinalmodel_RBF <- sum(diag(classificationtable_SVM))/sum(classificationtable_SVM)

print(acctestfinalmodel_RBF) # accuracy is 49.47% - Indicates the possibility of overfitting
## [1] 0.4947368
# Linear kernel of SVM

myprediction_SVM <- predict(finalmodellinear,myadditionalNorm[,-36])
classificationtable_SVM <- table(myprediction_SVM, myadditionalNorm[,36])
acctestfinalmodel_RBF <- sum(diag(classificationtable_SVM))/sum(classificationtable_SVM)

print(acctestfinalmodel_RBF) # accuracy = 60%
## [1] 0.6

accuracy of final model of Classification - based on dataset “studentadditional”

myprediction_CT <- predict(myprunedtree, myadditional[,-25], type='class')
classificationtable_CT <- table(myprediction_CT,myadditional[,25])
acctesttree <- sum(diag(classificationtable_CT))/sum(classificationtable_CT)

print(classificationtable_CT)
##                
## myprediction_CT High Low
##            High   34  54
##            Low     2   5
print(acctesttree) # accuracy is 41.05%
## [1] 0.4105263
myprediction_CT <- predict(mytree, myadditional[,-25], type='class')
classificationtable_CT <- table(myprediction_CT,myadditional[,25])
acctesttree <- sum(diag(classificationtable_CT))/sum(classificationtable_CT)

print(classificationtable_CT)
##                
## myprediction_CT High Low
##            High   14  22
##            Low    22  37
print(acctesttree) # accuracy is 53.68%
## [1] 0.5368421

accuracy of model of Random forests - based on dataset “studentadditional”

set.seed(100)
myprediction_RFF <- predict(my_final_RF, myadditional[,-25], type='class')
classification_table_RFF <- table(myprediction_RFF, myadditional[,25])
acc_testrandomforest_RFF <- sum(diag(classification_table_RFF))/sum(classification_table_RFF)

print(acc_testrandomforest_RFF) # accuracy is 64.2%
## [1] 0.6421053
myprediction_RFF <- predict(myrandomF, myadditional[,-25], type='class')
classification_table_RFF <- table(myprediction_RFF, myadditional[,25])
acc_testrandomforest_RFF <- sum(diag(classification_table_RFF))/sum(classification_table_RFF)

print(acc_testrandomforest_RFF) # accuracy is 63.16%
## [1] 0.6315789

accuracy of model of KNN - based on dataset “studentadditional”

test_labels <- myadditionalNorm[,36]
test_labels2 <- myadditionalNorm[,-36]



myknn3 <- knn(train= myxstudent, test= test_labels2, cl= myystudent, k=3)
myaccuracytablek3nn <- table (myknn3,test_labels)
mytestingaccuracyk3nn <- sum(diag(myaccuracytablek3nn))/sum(myaccuracytablek3nn)

print(mytestingaccuracyk3nn) # the accuracy is 54.74%
## [1] 0.5473684
prop.table(table(mystudent$class))
## 
## High  Low 
## 0.42 0.58

Part 2 UnSupervised Learning

Use hierarchical clustering to model the wine data

library('cluster')
# (i) Use hierarchical clustering to model the wine data


mywine <- read.table("C:/Users/Allen/OneDrive - CBS - Copenhagen Business School/Desktop/Data Science/Exam/wine.txt",header=TRUE, stringsAsFactors=TRUE)

str(mywine)
## 'data.frame':    168 obs. of  13 variables:
##  $ Alcohol                    : num  13.4 13.9 13.2 13.1 14.2 ...
##  $ Malicacid                  : num  3.84 1.89 3.98 1.77 4.04 3.59 1.68 2.02 1.73 1.73 ...
##  $ Ash                        : num  2.12 2.59 2.29 2.1 2.44 2.28 2.12 2.4 2.27 2.04 ...
##  $ Alcalinityofash            : num  18.8 15 17.5 17 18.9 16 16 18.8 17.4 12.4 ...
##  $ Magnesium                  : int  90 101 103 107 111 102 101 103 108 92 ...
##  $ Totalphenols               : num  2.45 3.25 2.64 3 2.85 3.25 3.1 2.75 2.88 2.72 ...
##  $ Flavanoids                 : num  2.68 3.56 2.63 3 2.65 3.17 3.39 2.92 3.54 3.27 ...
##  $ Nonflavanoidphenols        : num  0.27 0.17 0.32 0.28 0.3 0.27 0.21 0.32 0.32 0.17 ...
##  $ Proanthocyanins            : num  1.48 1.7 1.66 2.03 1.25 2.19 2.14 2.38 2.08 2.91 ...
##  $ Colorintensity             : num  4.28 5.43 4.36 5.04 5.24 4.9 6.1 6.2 8.9 7.2 ...
##  $ Hue                        : num  0.91 0.88 0.82 0.88 0.87 1.04 0.91 1.07 1.12 1.12 ...
##  $ OD280andOD315ofdilutedwines: num  3 3.56 3 3.35 3.33 3.44 3.33 2.75 3.1 2.91 ...
##  $ Proline                    : int  1035 1095 680 885 1080 1065 985 1060 1260 1150 ...
dim(mywine)
## [1] 168  13
# normalize the data clustering using min-max normalization


min_max_normalize <- function(x) {
  (x - min(x)) / (max(x) - min(x))
}

mywineNORM <- as.data.frame(lapply(mywine, min_max_normalize))

summary(mywineNORM)
##     Alcohol         Malicacid           Ash         Alcalinityofash 
##  Min.   :0.0000   Min.   :0.0000   Min.   :0.0000   Min.   :0.0000  
##  1st Qu.:0.3421   1st Qu.:0.1714   1st Qu.:0.4532   1st Qu.:0.3402  
##  Median :0.5039   Median :0.2283   Median :0.5294   Median :0.4588  
##  Mean   :0.5102   Mean   :0.3207   Mean   :0.5334   Mean   :0.4631  
##  3rd Qu.:0.6888   3rd Qu.:0.4728   3rd Qu.:0.6310   3rd Qu.:0.5619  
##  Max.   :1.0000   Max.   :1.0000   Max.   :1.0000   Max.   :1.0000  
##    Magnesium       Totalphenols      Flavanoids     Nonflavanoidphenols
##  Min.   :0.0000   Min.   :0.0000   Min.   :0.0000   Min.   :0.0000     
##  1st Qu.:0.1957   1st Qu.:0.2483   1st Qu.:0.1582   1st Qu.:0.2642     
##  Median :0.2935   Median :0.4241   Median :0.3565   Median :0.3962     
##  Mean   :0.3175   Mean   :0.4457   Mean   :0.3465   Mean   :0.4434     
##  3rd Qu.:0.4022   3rd Qu.:0.6276   3rd Qu.:0.5206   3rd Qu.:0.6038     
##  Max.   :1.0000   Max.   :1.0000   Max.   :1.0000   Max.   :1.0000     
##  Proanthocyanins  Colorintensity        Hue         OD280andOD315ofdilutedwines
##  Min.   :0.0000   Min.   :0.0000   Min.   :0.0000   Min.   :0.0000             
##  1st Qu.:0.2500   1st Qu.:0.1529   1st Qu.:0.2337   1st Qu.:0.2134             
##  Median :0.3612   Median :0.2910   Median :0.3902   Median :0.5513             
##  Mean   :0.3699   Mean   :0.3233   Mean   :0.3804   Mean   :0.4845             
##  3rd Qu.:0.4858   3rd Qu.:0.4251   3rd Qu.:0.5142   3rd Qu.:0.6969             
##  Max.   :1.0000   Max.   :1.0000   Max.   :1.0000   Max.   :1.0000             
##     Proline      
##  Min.   :0.0000  
##  1st Qu.:0.1548  
##  Median :0.2653  
##  Mean   :0.3203  
##  3rd Qu.:0.4388  
##  Max.   :1.0000
#  perform hierarchical clustering 


myahclust <-hclust(dist(mywineNORM))
print(myahclust)
## 
## Call:
## hclust(d = dist(mywineNORM))
## 
## Cluster method   : complete 
## Distance         : euclidean 
## Number of objects: 168
par(mfrow=c(1,1))
plot(myahclust,hang=-0.5,cex=0.5)

(ii) Construct a partitioning based clustering for the wine data, with k=1,…,15.

# Experiment the clustering with k = 1...15


kmc  <- NULL
for(auxk in 1:15){
  set.seed(1)
  myKmeansauxkmultistart <- kmeans(mywineNORM, auxk, nstart=50)
  kmc[auxk] = myKmeansauxkmultistart$tot.withinss}

plot(kmc,main = "Elbow Plot for K-Means Clustering", xlab = "Number of Clusters (k)")

Part 3 Visualization

(i) Principal Component Analysis

myseeds <- read.table("C:/Users/Allen/OneDrive - CBS - Copenhagen Business School/Desktop/Data Science/Exam/seeds.txt",header=TRUE, stringsAsFactors=TRUE)

library('scatterplot3d')



# perform the Principal Componets Analysis 

myPCA <- prcomp(myseeds,center=T,scale.=T)

summary(myPCA)
## Importance of components:
##                          PC1    PC2    PC3     PC4     PC5     PC6     PC7
## Standard deviation     2.191 1.0875 0.9494 0.27962 0.16199 0.09136 0.02998
## Proportion of Variance 0.686 0.1689 0.1288 0.01117 0.00375 0.00119 0.00013
## Cumulative Proportion  0.686 0.8550 0.9838 0.99493 0.99868 0.99987 1.00000
print(myPCA)
## Standard deviations (1, .., p=7):
## [1] 2.19139592 1.08750725 0.94944124 0.27961800 0.16199138 0.09136172 0.02998071
## 
## Rotation (n x k) = (7 x 7):
##                               PC1         PC2         PC3         PC4
## area                    0.4540913  0.05103249  0.04203703 -0.17680617
## perimeter               0.4522120 -0.03151002  0.10183496 -0.25536701
## compactness             0.1359975  0.79067675 -0.42545398  0.28562678
## length_of_kernel        0.4369575 -0.16006108  0.20028936 -0.10459683
## width_of_kernel         0.4330670  0.24974555 -0.08363773 -0.32762465
## asymmetry_coefficient   0.1078413 -0.47305410 -0.86773308 -0.09202589
## length_of_kernel_groove 0.4250899 -0.24384526  0.08233860  0.83378501
##                                   PC5           PC6          PC7
## area                     0.0576795753 -0.5057718211 -0.706425617
## perimeter                0.0002751834 -0.4777759254  0.700470459
## compactness             -0.2910418482 -0.0643022741  0.070007658
## length_of_kernel        -0.7363440219  0.4317071267 -0.061795725
## width_of_kernel          0.5588419384  0.5685940173  0.011978913
## asymmetry_coefficient   -0.0561890085  0.0001657458  0.001837002
## length_of_kernel_groove  0.2330313242  0.0460436789  0.037912146
myPCA$rotation
##                               PC1         PC2         PC3         PC4
## area                    0.4540913  0.05103249  0.04203703 -0.17680617
## perimeter               0.4522120 -0.03151002  0.10183496 -0.25536701
## compactness             0.1359975  0.79067675 -0.42545398  0.28562678
## length_of_kernel        0.4369575 -0.16006108  0.20028936 -0.10459683
## width_of_kernel         0.4330670  0.24974555 -0.08363773 -0.32762465
## asymmetry_coefficient   0.1078413 -0.47305410 -0.86773308 -0.09202589
## length_of_kernel_groove 0.4250899 -0.24384526  0.08233860  0.83378501
##                                   PC5           PC6          PC7
## area                     0.0576795753 -0.5057718211 -0.706425617
## perimeter                0.0002751834 -0.4777759254  0.700470459
## compactness             -0.2910418482 -0.0643022741  0.070007658
## length_of_kernel        -0.7363440219  0.4317071267 -0.061795725
## width_of_kernel          0.5588419384  0.5685940173  0.011978913
## asymmetry_coefficient   -0.0561890085  0.0001657458  0.001837002
## length_of_kernel_groove  0.2330313242  0.0460436789  0.037912146
# plot the first two principal components to visualize how well they separate the data.


x <- myPCA$x[,1]
y <- myPCA$x[,2]
plot(x, y, xlab="PC1", ylab="PC2", main="Principal Component Analysis")
text(x, y, labels=row.names(myseeds), cex = 0.7)

biplot(myPCA, main = "Principal Component Analysis")

(ii) MultiDimensional Scaling

# normalize the data 


min_max_normalize <- function(x) {
  (x - min(x)) / (max(x) - min(x))
}

myseedNORM <- as.data.frame(lapply(myseeds, min_max_normalize))

summary(myseedNORM)
##       area          perimeter       compactness     length_of_kernel
##  Min.   :0.0000   Min.   :0.0000   Min.   :0.0000   Min.   :0.0000  
##  1st Qu.:0.2160   1st Qu.:0.2587   1st Qu.:0.3685   1st Qu.:0.2668  
##  Median :0.3381   Median :0.3831   Median :0.5145   Median :0.3977  
##  Mean   :0.3912   Mean   :0.4261   Mean   :0.5127   Mean   :0.4236  
##  3rd Qu.:0.5123   3rd Qu.:0.5574   3rd Qu.:0.6334   3rd Qu.:0.5549  
##  Max.   :1.0000   Max.   :1.0000   Max.   :1.0000   Max.   :1.0000  
##  width_of_kernel  asymmetry_coefficient length_of_kernel_groove
##  Min.   :0.0000   Min.   :0.0000        Min.   :0.0000         
##  1st Qu.:0.2776   1st Qu.:0.1926        1st Qu.:0.2288         
##  Median :0.4122   Median :0.3044        Median :0.3235         
##  Mean   :0.4352   Mean   :0.3269        Mean   :0.3916         
##  3rd Qu.:0.5615   3rd Qu.:0.4429        3rd Qu.:0.5114         
##  Max.   :1.0000   Max.   :1.0000        Max.   :1.0000
with(myseedNORM, pairs(myseedNORM))

# derive multidimensional scaling 

set.seed(123)

myMDS <- cmdscale(dist(myseedNORM), 2, eig=TRUE)

print(myMDS)
## $points
##                [,1]         [,2]
##   [1,] -0.456546396 -0.414910628
##   [2,] -0.503176365  0.277339367
##   [3,] -0.511368001 -0.293340451
##   [4,] -0.566475443  0.329847513
##   [5,] -0.811554997 -0.153334741
##   [6,]  0.094213787 -0.079831065
##   [7,] -0.107146977 -0.361750743
##   [8,] -0.240648904 -0.032561885
##   [9,]  0.044535710  0.001636321
##  [10,]  0.233970216  0.260014879
##  [11,]  0.273850721 -0.074043169
##  [12,]  0.447895015  0.349114747
##  [13,] -0.006501988  0.013271794
##  [14,] -0.141795678  0.080893741
##  [15,] -0.286917086  0.104664367
##  [16,] -0.294072664  0.098162000
##  [17,] -0.442467723  0.350896687
##  [18,]  0.222022605 -0.082399222
##  [19,]  0.047872917  0.259070587
##  [20,] -0.320140880  0.093054104
##  [21,]  0.039822897  0.170944642
##  [22,] -0.006674091  0.136677963
##  [23,] -0.084002203  0.068955490
##  [24,] -0.016248342 -0.120006955
##  [25,] -0.097702198 -0.067656132
##  [26,]  0.137459768  0.085822128
##  [27,] -0.070815933 -0.392649516
##  [28,] -0.139302441  0.020433846
##  [29,] -0.068954906 -0.286684755
##  [30,]  0.006020502 -0.159182967
##  [31,] -0.454406090  0.057557304
##  [32,] -0.452800832 -0.077382476
##  [33,] -0.319375547  0.009244558
##  [34,] -0.139253820  0.115079744
##  [35,] -0.067666931 -0.013471805
##  [36,] -0.197110505  0.199587409
##  [37,]  0.443571898 -0.108871442
##  [38,]  0.325822417  0.116423848
##  [39,]  0.044059600 -0.195197916
##  [40,] -0.253889414  0.087858465
##  [41,] -0.297460765  0.126667598
##  [42,] -0.300213347 -0.004090203
##  [43,] -0.332536606 -0.016238044
##  [44,] -0.264438712  0.323606442
##  [45,]  0.183513043  0.398048679
##  [46,] -0.185636278  0.264766554
##  [47,] -0.209071704  0.613489741
##  [48,] -0.492420780 -0.123923879
##  [49,] -0.169930267 -0.259842035
##  [50,] -0.182203986 -0.079142296
##  [51,]  0.100365970  0.375228525
##  [52,] -0.651125464 -0.064376631
##  [53,] -0.061726678 -0.090737686
##  [54,]  0.194726231  0.143053481
##  [55,] -0.458948882 -0.155502590
##  [56,] -0.516781534 -0.228032969
##  [57,] -0.191447530  0.060055998
##  [58,] -0.310278861 -0.245437795
##  [59,] -0.422164769 -0.090317160
##  [60,] -0.185166567  0.278331336
##  [61,] -0.260562957  0.477666615
##  [62,]  0.064688771  0.435755024
##  [63,] -0.079573364  0.381599298
##  [64,]  0.067713219  0.140131527
##  [65,] -0.712757291 -0.270007800
##  [66,] -0.836195297 -0.053494909
##  [67,] -0.845126594  0.134380227
##  [68,] -0.633204257  0.222406222
##  [69,] -0.352505863 -0.172821666
##  [70,] -0.545798926  0.031343311
##  [71,]  0.037022514 -0.063532691
##  [72,] -0.111462735  0.159580032
##  [73,] -0.229859013  0.397370295
##  [74,] -0.549334422  0.222814714
##  [75,] -0.154127203 -0.018137293
##  [76,] -0.203072167 -0.163468921
##  [77,] -0.144588669 -0.048847828
##  [78,] -0.491378236 -0.430962995
##  [79,]  0.664580210 -0.283233889
##  [80,]  0.477630491 -0.330207538
##  [81,]  0.537169650 -0.132285451
##  [82,]  0.936224060  0.292666612
##  [83,]  0.474369252 -0.128582241
##  [84,]  0.420682862 -0.315098800
##  [85,]  0.508544906 -0.357195539
##  [86,]  1.228250336 -0.235453644
##  [87,]  0.959260766 -0.293412131
##  [88,]  0.459788861  0.083885493
##  [89,]  0.431162687 -0.143315624
##  [90,]  0.812371120  0.118006448
##  [91,]  1.080547914 -0.038277222
##  [92,]  0.912178927 -0.157686775
##  [93,]  0.888963065  0.288952597
##  [94,]  0.912204430 -0.290958127
##  [95,]  1.288543684 -0.073951638
##  [96,]  0.922235140 -0.019453901
##  [97,]  0.825203011  0.287012274
##  [98,]  0.936559605  0.028647304
##  [99,]  0.854791995  0.153973066
## [100,]  1.119992347 -0.091116753
## [101,]  1.136404033 -0.041659580
## [102,] -0.555961678 -0.276232348
## [103,] -0.602444018 -0.004421154
## [104,] -0.746845699 -0.388069864
## [105,] -0.286441896 -0.054612688
## [106,] -0.659238353 -0.348139162
## [107,] -0.507759432 -0.230435590
## 
## $eig
##   [1]  2.821530e+01  5.373075e+00  3.591202e+00  4.574901e-01  1.481929e-01
##   [6]  4.923777e-02  5.304570e-03  3.910165e-15  2.321027e-15  2.081520e-15
##  [11]  9.597328e-16  8.106786e-16  6.933797e-16  6.264531e-16  6.141442e-16
##  [16]  5.603973e-16  5.150985e-16  4.096229e-16  3.247955e-16  3.071453e-16
##  [21]  3.039442e-16  2.863639e-16  2.789339e-16  2.758621e-16  2.325260e-16
##  [26]  2.308525e-16  2.123594e-16  2.009972e-16  1.863791e-16  1.780318e-16
##  [31]  1.663434e-16  1.333160e-16  1.247651e-16  1.208808e-16  1.167020e-16
##  [36]  1.151814e-16  1.092230e-16  1.084463e-16  1.054356e-16  1.014932e-16
##  [41]  1.010797e-16  9.547708e-17  8.879628e-17  8.632458e-17  6.299933e-17
##  [46]  5.027959e-17  3.179633e-17  2.182958e-17  2.021657e-17  1.940674e-17
##  [51]  1.756088e-17  1.575818e-17  1.320219e-17  1.083291e-17  8.422059e-18
##  [56] -2.811176e-18 -3.347959e-18 -1.126475e-17 -1.369468e-17 -1.832012e-17
##  [61] -1.919673e-17 -2.228527e-17 -2.953243e-17 -3.605631e-17 -3.954262e-17
##  [66] -4.090988e-17 -4.425734e-17 -4.430885e-17 -4.837987e-17 -4.901257e-17
##  [71] -5.347238e-17 -5.874031e-17 -6.269820e-17 -6.735390e-17 -7.376729e-17
##  [76] -7.838220e-17 -7.954472e-17 -8.920677e-17 -9.068868e-17 -1.068265e-16
##  [81] -1.076257e-16 -1.245449e-16 -1.328935e-16 -1.673374e-16 -1.748133e-16
##  [86] -1.825336e-16 -1.908819e-16 -1.979537e-16 -2.009078e-16 -2.256646e-16
##  [91] -2.333580e-16 -2.355956e-16 -2.363723e-16 -2.608230e-16 -2.628141e-16
##  [96] -2.805319e-16 -2.918840e-16 -3.039855e-16 -3.070616e-16 -3.390273e-16
## [101] -3.514740e-16 -4.021426e-16 -4.856320e-16 -5.115720e-16 -1.089989e-15
## [106] -1.704739e-15 -2.609192e-15
## 
## $x
## NULL
## 
## $ac
## [1] 0
## 
## $GOF
## [1] 0.8876467 0.8876467
summary(myMDS)
##        Length Class  Mode   
## points 214    -none- numeric
## eig    107    -none- numeric
## x        0    -none- NULL   
## ac       1    -none- numeric
## GOF      2    -none- numeric
myMDS$points
##                [,1]         [,2]
##   [1,] -0.456546396 -0.414910628
##   [2,] -0.503176365  0.277339367
##   [3,] -0.511368001 -0.293340451
##   [4,] -0.566475443  0.329847513
##   [5,] -0.811554997 -0.153334741
##   [6,]  0.094213787 -0.079831065
##   [7,] -0.107146977 -0.361750743
##   [8,] -0.240648904 -0.032561885
##   [9,]  0.044535710  0.001636321
##  [10,]  0.233970216  0.260014879
##  [11,]  0.273850721 -0.074043169
##  [12,]  0.447895015  0.349114747
##  [13,] -0.006501988  0.013271794
##  [14,] -0.141795678  0.080893741
##  [15,] -0.286917086  0.104664367
##  [16,] -0.294072664  0.098162000
##  [17,] -0.442467723  0.350896687
##  [18,]  0.222022605 -0.082399222
##  [19,]  0.047872917  0.259070587
##  [20,] -0.320140880  0.093054104
##  [21,]  0.039822897  0.170944642
##  [22,] -0.006674091  0.136677963
##  [23,] -0.084002203  0.068955490
##  [24,] -0.016248342 -0.120006955
##  [25,] -0.097702198 -0.067656132
##  [26,]  0.137459768  0.085822128
##  [27,] -0.070815933 -0.392649516
##  [28,] -0.139302441  0.020433846
##  [29,] -0.068954906 -0.286684755
##  [30,]  0.006020502 -0.159182967
##  [31,] -0.454406090  0.057557304
##  [32,] -0.452800832 -0.077382476
##  [33,] -0.319375547  0.009244558
##  [34,] -0.139253820  0.115079744
##  [35,] -0.067666931 -0.013471805
##  [36,] -0.197110505  0.199587409
##  [37,]  0.443571898 -0.108871442
##  [38,]  0.325822417  0.116423848
##  [39,]  0.044059600 -0.195197916
##  [40,] -0.253889414  0.087858465
##  [41,] -0.297460765  0.126667598
##  [42,] -0.300213347 -0.004090203
##  [43,] -0.332536606 -0.016238044
##  [44,] -0.264438712  0.323606442
##  [45,]  0.183513043  0.398048679
##  [46,] -0.185636278  0.264766554
##  [47,] -0.209071704  0.613489741
##  [48,] -0.492420780 -0.123923879
##  [49,] -0.169930267 -0.259842035
##  [50,] -0.182203986 -0.079142296
##  [51,]  0.100365970  0.375228525
##  [52,] -0.651125464 -0.064376631
##  [53,] -0.061726678 -0.090737686
##  [54,]  0.194726231  0.143053481
##  [55,] -0.458948882 -0.155502590
##  [56,] -0.516781534 -0.228032969
##  [57,] -0.191447530  0.060055998
##  [58,] -0.310278861 -0.245437795
##  [59,] -0.422164769 -0.090317160
##  [60,] -0.185166567  0.278331336
##  [61,] -0.260562957  0.477666615
##  [62,]  0.064688771  0.435755024
##  [63,] -0.079573364  0.381599298
##  [64,]  0.067713219  0.140131527
##  [65,] -0.712757291 -0.270007800
##  [66,] -0.836195297 -0.053494909
##  [67,] -0.845126594  0.134380227
##  [68,] -0.633204257  0.222406222
##  [69,] -0.352505863 -0.172821666
##  [70,] -0.545798926  0.031343311
##  [71,]  0.037022514 -0.063532691
##  [72,] -0.111462735  0.159580032
##  [73,] -0.229859013  0.397370295
##  [74,] -0.549334422  0.222814714
##  [75,] -0.154127203 -0.018137293
##  [76,] -0.203072167 -0.163468921
##  [77,] -0.144588669 -0.048847828
##  [78,] -0.491378236 -0.430962995
##  [79,]  0.664580210 -0.283233889
##  [80,]  0.477630491 -0.330207538
##  [81,]  0.537169650 -0.132285451
##  [82,]  0.936224060  0.292666612
##  [83,]  0.474369252 -0.128582241
##  [84,]  0.420682862 -0.315098800
##  [85,]  0.508544906 -0.357195539
##  [86,]  1.228250336 -0.235453644
##  [87,]  0.959260766 -0.293412131
##  [88,]  0.459788861  0.083885493
##  [89,]  0.431162687 -0.143315624
##  [90,]  0.812371120  0.118006448
##  [91,]  1.080547914 -0.038277222
##  [92,]  0.912178927 -0.157686775
##  [93,]  0.888963065  0.288952597
##  [94,]  0.912204430 -0.290958127
##  [95,]  1.288543684 -0.073951638
##  [96,]  0.922235140 -0.019453901
##  [97,]  0.825203011  0.287012274
##  [98,]  0.936559605  0.028647304
##  [99,]  0.854791995  0.153973066
## [100,]  1.119992347 -0.091116753
## [101,]  1.136404033 -0.041659580
## [102,] -0.555961678 -0.276232348
## [103,] -0.602444018 -0.004421154
## [104,] -0.746845699 -0.388069864
## [105,] -0.286441896 -0.054612688
## [106,] -0.659238353 -0.348139162
## [107,] -0.507759432 -0.230435590
myMDS$eig
##   [1]  2.821530e+01  5.373075e+00  3.591202e+00  4.574901e-01  1.481929e-01
##   [6]  4.923777e-02  5.304570e-03  3.910165e-15  2.321027e-15  2.081520e-15
##  [11]  9.597328e-16  8.106786e-16  6.933797e-16  6.264531e-16  6.141442e-16
##  [16]  5.603973e-16  5.150985e-16  4.096229e-16  3.247955e-16  3.071453e-16
##  [21]  3.039442e-16  2.863639e-16  2.789339e-16  2.758621e-16  2.325260e-16
##  [26]  2.308525e-16  2.123594e-16  2.009972e-16  1.863791e-16  1.780318e-16
##  [31]  1.663434e-16  1.333160e-16  1.247651e-16  1.208808e-16  1.167020e-16
##  [36]  1.151814e-16  1.092230e-16  1.084463e-16  1.054356e-16  1.014932e-16
##  [41]  1.010797e-16  9.547708e-17  8.879628e-17  8.632458e-17  6.299933e-17
##  [46]  5.027959e-17  3.179633e-17  2.182958e-17  2.021657e-17  1.940674e-17
##  [51]  1.756088e-17  1.575818e-17  1.320219e-17  1.083291e-17  8.422059e-18
##  [56] -2.811176e-18 -3.347959e-18 -1.126475e-17 -1.369468e-17 -1.832012e-17
##  [61] -1.919673e-17 -2.228527e-17 -2.953243e-17 -3.605631e-17 -3.954262e-17
##  [66] -4.090988e-17 -4.425734e-17 -4.430885e-17 -4.837987e-17 -4.901257e-17
##  [71] -5.347238e-17 -5.874031e-17 -6.269820e-17 -6.735390e-17 -7.376729e-17
##  [76] -7.838220e-17 -7.954472e-17 -8.920677e-17 -9.068868e-17 -1.068265e-16
##  [81] -1.076257e-16 -1.245449e-16 -1.328935e-16 -1.673374e-16 -1.748133e-16
##  [86] -1.825336e-16 -1.908819e-16 -1.979537e-16 -2.009078e-16 -2.256646e-16
##  [91] -2.333580e-16 -2.355956e-16 -2.363723e-16 -2.608230e-16 -2.628141e-16
##  [96] -2.805319e-16 -2.918840e-16 -3.039855e-16 -3.070616e-16 -3.390273e-16
## [101] -3.514740e-16 -4.021426e-16 -4.856320e-16 -5.115720e-16 -1.089989e-15
## [106] -1.704739e-15 -2.609192e-15
# plot the representavite points 

x <- myMDS$points[,1]
y <- myMDS$points[,2]
plot(x, y, xlab="Representative's Coordinate 1", ylab="Representative's Coordinate 2",main="MDS")
text(x, y, labels=row.names(myseedNORM), cex = 0.7)

Part 2 UnSupervised Learning

Use hierarchical clustering to model the wine data

library('cluster')
# (i) Use hierarchical clustering to model the wine data


mywine <- read.table("C:/Users/Allen/OneDrive - CBS - Copenhagen Business School/Desktop/Data Science/Exam/wine.txt",header=TRUE, stringsAsFactors=TRUE)

str(mywine)
## 'data.frame':    168 obs. of  13 variables:
##  $ Alcohol                    : num  13.4 13.9 13.2 13.1 14.2 ...
##  $ Malicacid                  : num  3.84 1.89 3.98 1.77 4.04 3.59 1.68 2.02 1.73 1.73 ...
##  $ Ash                        : num  2.12 2.59 2.29 2.1 2.44 2.28 2.12 2.4 2.27 2.04 ...
##  $ Alcalinityofash            : num  18.8 15 17.5 17 18.9 16 16 18.8 17.4 12.4 ...
##  $ Magnesium                  : int  90 101 103 107 111 102 101 103 108 92 ...
##  $ Totalphenols               : num  2.45 3.25 2.64 3 2.85 3.25 3.1 2.75 2.88 2.72 ...
##  $ Flavanoids                 : num  2.68 3.56 2.63 3 2.65 3.17 3.39 2.92 3.54 3.27 ...
##  $ Nonflavanoidphenols        : num  0.27 0.17 0.32 0.28 0.3 0.27 0.21 0.32 0.32 0.17 ...
##  $ Proanthocyanins            : num  1.48 1.7 1.66 2.03 1.25 2.19 2.14 2.38 2.08 2.91 ...
##  $ Colorintensity             : num  4.28 5.43 4.36 5.04 5.24 4.9 6.1 6.2 8.9 7.2 ...
##  $ Hue                        : num  0.91 0.88 0.82 0.88 0.87 1.04 0.91 1.07 1.12 1.12 ...
##  $ OD280andOD315ofdilutedwines: num  3 3.56 3 3.35 3.33 3.44 3.33 2.75 3.1 2.91 ...
##  $ Proline                    : int  1035 1095 680 885 1080 1065 985 1060 1260 1150 ...
dim(mywine)
## [1] 168  13
# normalize the data clustering using min-max normalization


min_max_normalize <- function(x) {
  (x - min(x)) / (max(x) - min(x))
}

mywineNORM <- as.data.frame(lapply(mywine, min_max_normalize))

summary(mywineNORM)
##     Alcohol         Malicacid           Ash         Alcalinityofash 
##  Min.   :0.0000   Min.   :0.0000   Min.   :0.0000   Min.   :0.0000  
##  1st Qu.:0.3421   1st Qu.:0.1714   1st Qu.:0.4532   1st Qu.:0.3402  
##  Median :0.5039   Median :0.2283   Median :0.5294   Median :0.4588  
##  Mean   :0.5102   Mean   :0.3207   Mean   :0.5334   Mean   :0.4631  
##  3rd Qu.:0.6888   3rd Qu.:0.4728   3rd Qu.:0.6310   3rd Qu.:0.5619  
##  Max.   :1.0000   Max.   :1.0000   Max.   :1.0000   Max.   :1.0000  
##    Magnesium       Totalphenols      Flavanoids     Nonflavanoidphenols
##  Min.   :0.0000   Min.   :0.0000   Min.   :0.0000   Min.   :0.0000     
##  1st Qu.:0.1957   1st Qu.:0.2483   1st Qu.:0.1582   1st Qu.:0.2642     
##  Median :0.2935   Median :0.4241   Median :0.3565   Median :0.3962     
##  Mean   :0.3175   Mean   :0.4457   Mean   :0.3465   Mean   :0.4434     
##  3rd Qu.:0.4022   3rd Qu.:0.6276   3rd Qu.:0.5206   3rd Qu.:0.6038     
##  Max.   :1.0000   Max.   :1.0000   Max.   :1.0000   Max.   :1.0000     
##  Proanthocyanins  Colorintensity        Hue         OD280andOD315ofdilutedwines
##  Min.   :0.0000   Min.   :0.0000   Min.   :0.0000   Min.   :0.0000             
##  1st Qu.:0.2500   1st Qu.:0.1529   1st Qu.:0.2337   1st Qu.:0.2134             
##  Median :0.3612   Median :0.2910   Median :0.3902   Median :0.5513             
##  Mean   :0.3699   Mean   :0.3233   Mean   :0.3804   Mean   :0.4845             
##  3rd Qu.:0.4858   3rd Qu.:0.4251   3rd Qu.:0.5142   3rd Qu.:0.6969             
##  Max.   :1.0000   Max.   :1.0000   Max.   :1.0000   Max.   :1.0000             
##     Proline      
##  Min.   :0.0000  
##  1st Qu.:0.1548  
##  Median :0.2653  
##  Mean   :0.3203  
##  3rd Qu.:0.4388  
##  Max.   :1.0000
#  perform hierarchical clustering 


myahclust <-hclust(dist(mywineNORM))
print(myahclust)
## 
## Call:
## hclust(d = dist(mywineNORM))
## 
## Cluster method   : complete 
## Distance         : euclidean 
## Number of objects: 168
par(mfrow=c(1,1))
plot(myahclust,hang=-0.5,cex=0.5)

(ii) Construct a partitioning based clustering for the wine data, with k=1,…,15.

# Experiment the clustering with k = 1...15


kmc  <- NULL
for(auxk in 1:15){
  set.seed(1)
  myKmeansauxkmultistart <- kmeans(mywineNORM, auxk, nstart=50)
  kmc[auxk] = myKmeansauxkmultistart$tot.withinss}

plot(kmc,main = "Elbow Plot for K-Means Clustering", xlab = "Number of Clusters (k)")

Part 3 Visualization

(i) Principal Component Analysis

myseeds <- read.table("C:/Users/Allen/OneDrive - CBS - Copenhagen Business School/Desktop/Data Science/Exam/seeds.txt",header=TRUE, stringsAsFactors=TRUE)

library('scatterplot3d')



# perform the Principal Componets Analysis 

myPCA <- prcomp(myseeds,center=T,scale.=T)

summary(myPCA)
## Importance of components:
##                          PC1    PC2    PC3     PC4     PC5     PC6     PC7
## Standard deviation     2.191 1.0875 0.9494 0.27962 0.16199 0.09136 0.02998
## Proportion of Variance 0.686 0.1689 0.1288 0.01117 0.00375 0.00119 0.00013
## Cumulative Proportion  0.686 0.8550 0.9838 0.99493 0.99868 0.99987 1.00000
print(myPCA)
## Standard deviations (1, .., p=7):
## [1] 2.19139592 1.08750725 0.94944124 0.27961800 0.16199138 0.09136172 0.02998071
## 
## Rotation (n x k) = (7 x 7):
##                               PC1         PC2         PC3         PC4
## area                    0.4540913  0.05103249  0.04203703 -0.17680617
## perimeter               0.4522120 -0.03151002  0.10183496 -0.25536701
## compactness             0.1359975  0.79067675 -0.42545398  0.28562678
## length_of_kernel        0.4369575 -0.16006108  0.20028936 -0.10459683
## width_of_kernel         0.4330670  0.24974555 -0.08363773 -0.32762465
## asymmetry_coefficient   0.1078413 -0.47305410 -0.86773308 -0.09202589
## length_of_kernel_groove 0.4250899 -0.24384526  0.08233860  0.83378501
##                                   PC5           PC6          PC7
## area                     0.0576795753 -0.5057718211 -0.706425617
## perimeter                0.0002751834 -0.4777759254  0.700470459
## compactness             -0.2910418482 -0.0643022741  0.070007658
## length_of_kernel        -0.7363440219  0.4317071267 -0.061795725
## width_of_kernel          0.5588419384  0.5685940173  0.011978913
## asymmetry_coefficient   -0.0561890085  0.0001657458  0.001837002
## length_of_kernel_groove  0.2330313242  0.0460436789  0.037912146
myPCA$rotation
##                               PC1         PC2         PC3         PC4
## area                    0.4540913  0.05103249  0.04203703 -0.17680617
## perimeter               0.4522120 -0.03151002  0.10183496 -0.25536701
## compactness             0.1359975  0.79067675 -0.42545398  0.28562678
## length_of_kernel        0.4369575 -0.16006108  0.20028936 -0.10459683
## width_of_kernel         0.4330670  0.24974555 -0.08363773 -0.32762465
## asymmetry_coefficient   0.1078413 -0.47305410 -0.86773308 -0.09202589
## length_of_kernel_groove 0.4250899 -0.24384526  0.08233860  0.83378501
##                                   PC5           PC6          PC7
## area                     0.0576795753 -0.5057718211 -0.706425617
## perimeter                0.0002751834 -0.4777759254  0.700470459
## compactness             -0.2910418482 -0.0643022741  0.070007658
## length_of_kernel        -0.7363440219  0.4317071267 -0.061795725
## width_of_kernel          0.5588419384  0.5685940173  0.011978913
## asymmetry_coefficient   -0.0561890085  0.0001657458  0.001837002
## length_of_kernel_groove  0.2330313242  0.0460436789  0.037912146
# plot the first two principal components to visualize how well they separate the data.


x <- myPCA$x[,1]
y <- myPCA$x[,2]
plot(x, y, xlab="PC1", ylab="PC2", main="Principal Component Analysis")
text(x, y, labels=row.names(myseeds), cex = 0.7)

biplot(myPCA, main = "Principal Component Analysis")

(ii) MultiDimensional Scaling

# normalize the data 


min_max_normalize <- function(x) {
  (x - min(x)) / (max(x) - min(x))
}

myseedNORM <- as.data.frame(lapply(myseeds, min_max_normalize))

summary(myseedNORM)
##       area          perimeter       compactness     length_of_kernel
##  Min.   :0.0000   Min.   :0.0000   Min.   :0.0000   Min.   :0.0000  
##  1st Qu.:0.2160   1st Qu.:0.2587   1st Qu.:0.3685   1st Qu.:0.2668  
##  Median :0.3381   Median :0.3831   Median :0.5145   Median :0.3977  
##  Mean   :0.3912   Mean   :0.4261   Mean   :0.5127   Mean   :0.4236  
##  3rd Qu.:0.5123   3rd Qu.:0.5574   3rd Qu.:0.6334   3rd Qu.:0.5549  
##  Max.   :1.0000   Max.   :1.0000   Max.   :1.0000   Max.   :1.0000  
##  width_of_kernel  asymmetry_coefficient length_of_kernel_groove
##  Min.   :0.0000   Min.   :0.0000        Min.   :0.0000         
##  1st Qu.:0.2776   1st Qu.:0.1926        1st Qu.:0.2288         
##  Median :0.4122   Median :0.3044        Median :0.3235         
##  Mean   :0.4352   Mean   :0.3269        Mean   :0.3916         
##  3rd Qu.:0.5615   3rd Qu.:0.4429        3rd Qu.:0.5114         
##  Max.   :1.0000   Max.   :1.0000        Max.   :1.0000
with(myseedNORM, pairs(myseedNORM))

# derive multidimensional scaling 

set.seed(123)

myMDS <- cmdscale(dist(myseedNORM), 2, eig=TRUE)

print(myMDS)
## $points
##                [,1]         [,2]
##   [1,] -0.456546396 -0.414910628
##   [2,] -0.503176365  0.277339367
##   [3,] -0.511368001 -0.293340451
##   [4,] -0.566475443  0.329847513
##   [5,] -0.811554997 -0.153334741
##   [6,]  0.094213787 -0.079831065
##   [7,] -0.107146977 -0.361750743
##   [8,] -0.240648904 -0.032561885
##   [9,]  0.044535710  0.001636321
##  [10,]  0.233970216  0.260014879
##  [11,]  0.273850721 -0.074043169
##  [12,]  0.447895015  0.349114747
##  [13,] -0.006501988  0.013271794
##  [14,] -0.141795678  0.080893741
##  [15,] -0.286917086  0.104664367
##  [16,] -0.294072664  0.098162000
##  [17,] -0.442467723  0.350896687
##  [18,]  0.222022605 -0.082399222
##  [19,]  0.047872917  0.259070587
##  [20,] -0.320140880  0.093054104
##  [21,]  0.039822897  0.170944642
##  [22,] -0.006674091  0.136677963
##  [23,] -0.084002203  0.068955490
##  [24,] -0.016248342 -0.120006955
##  [25,] -0.097702198 -0.067656132
##  [26,]  0.137459768  0.085822128
##  [27,] -0.070815933 -0.392649516
##  [28,] -0.139302441  0.020433846
##  [29,] -0.068954906 -0.286684755
##  [30,]  0.006020502 -0.159182967
##  [31,] -0.454406090  0.057557304
##  [32,] -0.452800832 -0.077382476
##  [33,] -0.319375547  0.009244558
##  [34,] -0.139253820  0.115079744
##  [35,] -0.067666931 -0.013471805
##  [36,] -0.197110505  0.199587409
##  [37,]  0.443571898 -0.108871442
##  [38,]  0.325822417  0.116423848
##  [39,]  0.044059600 -0.195197916
##  [40,] -0.253889414  0.087858465
##  [41,] -0.297460765  0.126667598
##  [42,] -0.300213347 -0.004090203
##  [43,] -0.332536606 -0.016238044
##  [44,] -0.264438712  0.323606442
##  [45,]  0.183513043  0.398048679
##  [46,] -0.185636278  0.264766554
##  [47,] -0.209071704  0.613489741
##  [48,] -0.492420780 -0.123923879
##  [49,] -0.169930267 -0.259842035
##  [50,] -0.182203986 -0.079142296
##  [51,]  0.100365970  0.375228525
##  [52,] -0.651125464 -0.064376631
##  [53,] -0.061726678 -0.090737686
##  [54,]  0.194726231  0.143053481
##  [55,] -0.458948882 -0.155502590
##  [56,] -0.516781534 -0.228032969
##  [57,] -0.191447530  0.060055998
##  [58,] -0.310278861 -0.245437795
##  [59,] -0.422164769 -0.090317160
##  [60,] -0.185166567  0.278331336
##  [61,] -0.260562957  0.477666615
##  [62,]  0.064688771  0.435755024
##  [63,] -0.079573364  0.381599298
##  [64,]  0.067713219  0.140131527
##  [65,] -0.712757291 -0.270007800
##  [66,] -0.836195297 -0.053494909
##  [67,] -0.845126594  0.134380227
##  [68,] -0.633204257  0.222406222
##  [69,] -0.352505863 -0.172821666
##  [70,] -0.545798926  0.031343311
##  [71,]  0.037022514 -0.063532691
##  [72,] -0.111462735  0.159580032
##  [73,] -0.229859013  0.397370295
##  [74,] -0.549334422  0.222814714
##  [75,] -0.154127203 -0.018137293
##  [76,] -0.203072167 -0.163468921
##  [77,] -0.144588669 -0.048847828
##  [78,] -0.491378236 -0.430962995
##  [79,]  0.664580210 -0.283233889
##  [80,]  0.477630491 -0.330207538
##  [81,]  0.537169650 -0.132285451
##  [82,]  0.936224060  0.292666612
##  [83,]  0.474369252 -0.128582241
##  [84,]  0.420682862 -0.315098800
##  [85,]  0.508544906 -0.357195539
##  [86,]  1.228250336 -0.235453644
##  [87,]  0.959260766 -0.293412131
##  [88,]  0.459788861  0.083885493
##  [89,]  0.431162687 -0.143315624
##  [90,]  0.812371120  0.118006448
##  [91,]  1.080547914 -0.038277222
##  [92,]  0.912178927 -0.157686775
##  [93,]  0.888963065  0.288952597
##  [94,]  0.912204430 -0.290958127
##  [95,]  1.288543684 -0.073951638
##  [96,]  0.922235140 -0.019453901
##  [97,]  0.825203011  0.287012274
##  [98,]  0.936559605  0.028647304
##  [99,]  0.854791995  0.153973066
## [100,]  1.119992347 -0.091116753
## [101,]  1.136404033 -0.041659580
## [102,] -0.555961678 -0.276232348
## [103,] -0.602444018 -0.004421154
## [104,] -0.746845699 -0.388069864
## [105,] -0.286441896 -0.054612688
## [106,] -0.659238353 -0.348139162
## [107,] -0.507759432 -0.230435590
## 
## $eig
##   [1]  2.821530e+01  5.373075e+00  3.591202e+00  4.574901e-01  1.481929e-01
##   [6]  4.923777e-02  5.304570e-03  3.910165e-15  2.321027e-15  2.081520e-15
##  [11]  9.597328e-16  8.106786e-16  6.933797e-16  6.264531e-16  6.141442e-16
##  [16]  5.603973e-16  5.150985e-16  4.096229e-16  3.247955e-16  3.071453e-16
##  [21]  3.039442e-16  2.863639e-16  2.789339e-16  2.758621e-16  2.325260e-16
##  [26]  2.308525e-16  2.123594e-16  2.009972e-16  1.863791e-16  1.780318e-16
##  [31]  1.663434e-16  1.333160e-16  1.247651e-16  1.208808e-16  1.167020e-16
##  [36]  1.151814e-16  1.092230e-16  1.084463e-16  1.054356e-16  1.014932e-16
##  [41]  1.010797e-16  9.547708e-17  8.879628e-17  8.632458e-17  6.299933e-17
##  [46]  5.027959e-17  3.179633e-17  2.182958e-17  2.021657e-17  1.940674e-17
##  [51]  1.756088e-17  1.575818e-17  1.320219e-17  1.083291e-17  8.422059e-18
##  [56] -2.811176e-18 -3.347959e-18 -1.126475e-17 -1.369468e-17 -1.832012e-17
##  [61] -1.919673e-17 -2.228527e-17 -2.953243e-17 -3.605631e-17 -3.954262e-17
##  [66] -4.090988e-17 -4.425734e-17 -4.430885e-17 -4.837987e-17 -4.901257e-17
##  [71] -5.347238e-17 -5.874031e-17 -6.269820e-17 -6.735390e-17 -7.376729e-17
##  [76] -7.838220e-17 -7.954472e-17 -8.920677e-17 -9.068868e-17 -1.068265e-16
##  [81] -1.076257e-16 -1.245449e-16 -1.328935e-16 -1.673374e-16 -1.748133e-16
##  [86] -1.825336e-16 -1.908819e-16 -1.979537e-16 -2.009078e-16 -2.256646e-16
##  [91] -2.333580e-16 -2.355956e-16 -2.363723e-16 -2.608230e-16 -2.628141e-16
##  [96] -2.805319e-16 -2.918840e-16 -3.039855e-16 -3.070616e-16 -3.390273e-16
## [101] -3.514740e-16 -4.021426e-16 -4.856320e-16 -5.115720e-16 -1.089989e-15
## [106] -1.704739e-15 -2.609192e-15
## 
## $x
## NULL
## 
## $ac
## [1] 0
## 
## $GOF
## [1] 0.8876467 0.8876467
summary(myMDS)
##        Length Class  Mode   
## points 214    -none- numeric
## eig    107    -none- numeric
## x        0    -none- NULL   
## ac       1    -none- numeric
## GOF      2    -none- numeric
myMDS$points
##                [,1]         [,2]
##   [1,] -0.456546396 -0.414910628
##   [2,] -0.503176365  0.277339367
##   [3,] -0.511368001 -0.293340451
##   [4,] -0.566475443  0.329847513
##   [5,] -0.811554997 -0.153334741
##   [6,]  0.094213787 -0.079831065
##   [7,] -0.107146977 -0.361750743
##   [8,] -0.240648904 -0.032561885
##   [9,]  0.044535710  0.001636321
##  [10,]  0.233970216  0.260014879
##  [11,]  0.273850721 -0.074043169
##  [12,]  0.447895015  0.349114747
##  [13,] -0.006501988  0.013271794
##  [14,] -0.141795678  0.080893741
##  [15,] -0.286917086  0.104664367
##  [16,] -0.294072664  0.098162000
##  [17,] -0.442467723  0.350896687
##  [18,]  0.222022605 -0.082399222
##  [19,]  0.047872917  0.259070587
##  [20,] -0.320140880  0.093054104
##  [21,]  0.039822897  0.170944642
##  [22,] -0.006674091  0.136677963
##  [23,] -0.084002203  0.068955490
##  [24,] -0.016248342 -0.120006955
##  [25,] -0.097702198 -0.067656132
##  [26,]  0.137459768  0.085822128
##  [27,] -0.070815933 -0.392649516
##  [28,] -0.139302441  0.020433846
##  [29,] -0.068954906 -0.286684755
##  [30,]  0.006020502 -0.159182967
##  [31,] -0.454406090  0.057557304
##  [32,] -0.452800832 -0.077382476
##  [33,] -0.319375547  0.009244558
##  [34,] -0.139253820  0.115079744
##  [35,] -0.067666931 -0.013471805
##  [36,] -0.197110505  0.199587409
##  [37,]  0.443571898 -0.108871442
##  [38,]  0.325822417  0.116423848
##  [39,]  0.044059600 -0.195197916
##  [40,] -0.253889414  0.087858465
##  [41,] -0.297460765  0.126667598
##  [42,] -0.300213347 -0.004090203
##  [43,] -0.332536606 -0.016238044
##  [44,] -0.264438712  0.323606442
##  [45,]  0.183513043  0.398048679
##  [46,] -0.185636278  0.264766554
##  [47,] -0.209071704  0.613489741
##  [48,] -0.492420780 -0.123923879
##  [49,] -0.169930267 -0.259842035
##  [50,] -0.182203986 -0.079142296
##  [51,]  0.100365970  0.375228525
##  [52,] -0.651125464 -0.064376631
##  [53,] -0.061726678 -0.090737686
##  [54,]  0.194726231  0.143053481
##  [55,] -0.458948882 -0.155502590
##  [56,] -0.516781534 -0.228032969
##  [57,] -0.191447530  0.060055998
##  [58,] -0.310278861 -0.245437795
##  [59,] -0.422164769 -0.090317160
##  [60,] -0.185166567  0.278331336
##  [61,] -0.260562957  0.477666615
##  [62,]  0.064688771  0.435755024
##  [63,] -0.079573364  0.381599298
##  [64,]  0.067713219  0.140131527
##  [65,] -0.712757291 -0.270007800
##  [66,] -0.836195297 -0.053494909
##  [67,] -0.845126594  0.134380227
##  [68,] -0.633204257  0.222406222
##  [69,] -0.352505863 -0.172821666
##  [70,] -0.545798926  0.031343311
##  [71,]  0.037022514 -0.063532691
##  [72,] -0.111462735  0.159580032
##  [73,] -0.229859013  0.397370295
##  [74,] -0.549334422  0.222814714
##  [75,] -0.154127203 -0.018137293
##  [76,] -0.203072167 -0.163468921
##  [77,] -0.144588669 -0.048847828
##  [78,] -0.491378236 -0.430962995
##  [79,]  0.664580210 -0.283233889
##  [80,]  0.477630491 -0.330207538
##  [81,]  0.537169650 -0.132285451
##  [82,]  0.936224060  0.292666612
##  [83,]  0.474369252 -0.128582241
##  [84,]  0.420682862 -0.315098800
##  [85,]  0.508544906 -0.357195539
##  [86,]  1.228250336 -0.235453644
##  [87,]  0.959260766 -0.293412131
##  [88,]  0.459788861  0.083885493
##  [89,]  0.431162687 -0.143315624
##  [90,]  0.812371120  0.118006448
##  [91,]  1.080547914 -0.038277222
##  [92,]  0.912178927 -0.157686775
##  [93,]  0.888963065  0.288952597
##  [94,]  0.912204430 -0.290958127
##  [95,]  1.288543684 -0.073951638
##  [96,]  0.922235140 -0.019453901
##  [97,]  0.825203011  0.287012274
##  [98,]  0.936559605  0.028647304
##  [99,]  0.854791995  0.153973066
## [100,]  1.119992347 -0.091116753
## [101,]  1.136404033 -0.041659580
## [102,] -0.555961678 -0.276232348
## [103,] -0.602444018 -0.004421154
## [104,] -0.746845699 -0.388069864
## [105,] -0.286441896 -0.054612688
## [106,] -0.659238353 -0.348139162
## [107,] -0.507759432 -0.230435590
myMDS$eig
##   [1]  2.821530e+01  5.373075e+00  3.591202e+00  4.574901e-01  1.481929e-01
##   [6]  4.923777e-02  5.304570e-03  3.910165e-15  2.321027e-15  2.081520e-15
##  [11]  9.597328e-16  8.106786e-16  6.933797e-16  6.264531e-16  6.141442e-16
##  [16]  5.603973e-16  5.150985e-16  4.096229e-16  3.247955e-16  3.071453e-16
##  [21]  3.039442e-16  2.863639e-16  2.789339e-16  2.758621e-16  2.325260e-16
##  [26]  2.308525e-16  2.123594e-16  2.009972e-16  1.863791e-16  1.780318e-16
##  [31]  1.663434e-16  1.333160e-16  1.247651e-16  1.208808e-16  1.167020e-16
##  [36]  1.151814e-16  1.092230e-16  1.084463e-16  1.054356e-16  1.014932e-16
##  [41]  1.010797e-16  9.547708e-17  8.879628e-17  8.632458e-17  6.299933e-17
##  [46]  5.027959e-17  3.179633e-17  2.182958e-17  2.021657e-17  1.940674e-17
##  [51]  1.756088e-17  1.575818e-17  1.320219e-17  1.083291e-17  8.422059e-18
##  [56] -2.811176e-18 -3.347959e-18 -1.126475e-17 -1.369468e-17 -1.832012e-17
##  [61] -1.919673e-17 -2.228527e-17 -2.953243e-17 -3.605631e-17 -3.954262e-17
##  [66] -4.090988e-17 -4.425734e-17 -4.430885e-17 -4.837987e-17 -4.901257e-17
##  [71] -5.347238e-17 -5.874031e-17 -6.269820e-17 -6.735390e-17 -7.376729e-17
##  [76] -7.838220e-17 -7.954472e-17 -8.920677e-17 -9.068868e-17 -1.068265e-16
##  [81] -1.076257e-16 -1.245449e-16 -1.328935e-16 -1.673374e-16 -1.748133e-16
##  [86] -1.825336e-16 -1.908819e-16 -1.979537e-16 -2.009078e-16 -2.256646e-16
##  [91] -2.333580e-16 -2.355956e-16 -2.363723e-16 -2.608230e-16 -2.628141e-16
##  [96] -2.805319e-16 -2.918840e-16 -3.039855e-16 -3.070616e-16 -3.390273e-16
## [101] -3.514740e-16 -4.021426e-16 -4.856320e-16 -5.115720e-16 -1.089989e-15
## [106] -1.704739e-15 -2.609192e-15
# plot the representavite points 

x <- myMDS$points[,1]
y <- myMDS$points[,2]
plot(x, y, xlab="Representative's Coordinate 1", ylab="Representative's Coordinate 2",main="MDS")
text(x, y, labels=row.names(myseedNORM), cex = 0.7)