Using R for Multivariate Analysis

Reading Multivariate Analysis Data into R

wine <- read.table("http://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.data",
          sep=",")
wine

Plotting Multivariate Data

A Matrix Scatterplot

wine1<-wine[2:6]
paged_table(wine1)

scatterplotMatrix(wine[2:6])

A Scatterplot with the Data Points Labelled by their Group

plot(wine$V4, wine$V5)

plot(wine$V4, wine$V5)
text(wine$V4, wine$V5, wine$V1, cex=0.7, pos=4, col="red")

A Profile Plot

makeProfilePlot <- function(mylist,names)
  {
     require(RColorBrewer)
     # find out how many variables we want to include
     numvariables <- length(mylist)
     # choose 'numvariables' random colours
     colours <- brewer.pal(numvariables,"Set1")
     # find out the minimum and maximum values of the variables:
     mymin <- 1e+20
     mymax <- 1e-20
     for (i in 1:numvariables)
     {
        vectori <- mylist[[i]]
        mini <- min(vectori)
        maxi <- max(vectori)
        if (mini < mymin) { mymin <- mini }
        if (maxi > mymax) { mymax <- maxi }
     }
     # plot the variables
     for (i in 1:numvariables)
     {
        vectori <- mylist[[i]]
        namei <- names[i]
        colouri <- colours[i]
        if (i == 1) { plot(vectori,col=colouri,type="l",ylim=c(mymin,mymax)) }
        else         { points(vectori, col=colouri,type="l")                                     }
        lastxval <- length(vectori)
        lastyval <- vectori[length(vectori)]
        text((lastxval-10),(lastyval),namei,col="black",cex=0.6)
     }
  }

names <- c("V2","V3","V4","V5","V6")
mylist <- list(wine$V2,wine$V3,wine$V4,wine$V5,wine$V6)
makeProfilePlot(mylist,names)

Calculating Summary Statistics for Multivariate Data

sapply(wine[2:14],mean)

         V2          V3          V4          V5          V6          V7 
 13.0006180   2.3363483   2.3665169  19.4949438  99.7415730   2.2951124 
         V8          V9         V10         V11         V12         V13 
  2.0292697   0.3618539   1.5908989   5.0580899   0.9574494   2.6116854 
        V14 
746.8932584

sapply(wine[2:14],sd)

         V2          V3          V4          V5          V6          V7 
  0.8118265   1.1171461   0.2743440   3.3395638  14.2824835   0.6258510 
         V8          V9         V10         V11         V12         V13 
  0.9988587   0.1244533   0.5723589   2.3182859   0.2285716   0.7099904 
        V14 
314.9074743

Means and Variances Per Group

cultivar2wine <- wine[wine$V1=="2",]

sapply(cultivar2wine[2:14],mean)

        V2         V3         V4         V5         V6         V7         V8 
 12.278732   1.932676   2.244789  20.238028  94.549296   2.258873   2.080845 
        V9        V10        V11        V12        V13        V14 
  0.363662   1.630282   3.086620   1.056282   2.785352 519.507042

sapply(cultivar2wine[2:14],sd)

         V2          V3          V4          V5          V6          V7 
  0.5379642   1.0155687   0.3154673   3.3497704  16.7534975   0.5453611 
         V8          V9         V10         V11         V12         V13 
  0.7057008   0.1239613   0.6020678   0.9249293   0.2029368   0.4965735 
        V14 
157.2112204

printMeanAndSdByGroup <- function(variables,groupvariable)
  {
     # find the names of the variables
     variablenames <- c(names(groupvariable),names(as.data.frame(variables)))
     # within each group, find the mean of each variable
     groupvariable <- groupvariable[,1] # ensures groupvariable is not a list
     means <- aggregate(as.matrix(variables) ~ groupvariable, FUN = mean)
     names(means) <- variablenames
     print(paste("Means:"))
     print(means)
     # within each group, find the standard deviation of each variable:
     sds <- aggregate(as.matrix(variables) ~ groupvariable, FUN = sd)
     names(sds) <- variablenames
     print(paste("Standard deviations:"))
     print(sds)
     # within each group, find the number of samples:
     samplesizes <- aggregate(as.matrix(variables) ~ groupvariable, FUN = length)
     names(samplesizes) <- variablenames
     print(paste("Sample sizes:"))
     print(samplesizes)
  }

printMeanAndSdByGroup(wine[2:14],wine[1])

[1] "Means:"
  V1       V2       V3       V4       V5       V6       V7        V8       V9
1  1 13.74475 2.010678 2.455593 17.03729 106.3390 2.840169 2.9823729 0.290000
2  2 12.27873 1.932676 2.244789 20.23803  94.5493 2.258873 2.0808451 0.363662
3  3 13.15375 3.333750 2.437083 21.41667  99.3125 1.678750 0.7814583 0.447500
       V10      V11       V12      V13       V14
1 1.899322 5.528305 1.0620339 3.157797 1115.7119
2 1.630282 3.086620 1.0562817 2.785352  519.5070
3 1.153542 7.396250 0.6827083 1.683542  629.8958
[1] "Standard deviations:"
  V1        V2        V3        V4       V5       V6        V7        V8
1  1 0.4621254 0.6885489 0.2271660 2.546322 10.49895 0.3389614 0.3974936
2  2 0.5379642 1.0155687 0.3154673 3.349770 16.75350 0.5453611 0.7057008
3  3 0.5302413 1.0879057 0.1846902 2.258161 10.89047 0.3569709 0.2935041
          V9       V10       V11       V12       V13      V14
1 0.07004924 0.4121092 1.2385728 0.1164826 0.3570766 221.5208
2 0.12396128 0.6020678 0.9249293 0.2029368 0.4965735 157.2112
3 0.12413959 0.4088359 2.3109421 0.1144411 0.2721114 115.0970
[1] "Sample sizes:"
  V1 V2 V3 V4 V5 V6 V7 V8 V9 V10 V11 V12 V13 V14
1  1 59 59 59 59 59 59 59 59  59  59  59  59  59
2  2 71 71 71 71 71 71 71 71  71  71  71  71  71
3  3 48 48 48 48 48 48 48 48  48  48  48  48  48

Between-groups Variance and Within-groups Variance for a Variable

calcWithinGroupsVariance <- function(variable,groupvariable)
  {
     # find out how many values the group variable can take
     groupvariable2 <- as.factor(groupvariable[[1]])
     levels <- levels(groupvariable2)
     numlevels <- length(levels)
     # get the mean and standard deviation for each group:
     numtotal <- 0
     denomtotal <- 0
     for (i in 1:numlevels)
     {
        leveli <- levels[i]
        levelidata <- variable[groupvariable==leveli,]
        levelilength <- length(levelidata)
        # get the standard deviation for group i:
        sdi <- sd(levelidata)
        numi <- (levelilength - 1)*(sdi * sdi)
        denomi <- levelilength
        numtotal <- numtotal + numi
        denomtotal <- denomtotal + denomi
     }
     # calculate the within-groups variance
     Vw <- numtotal / (denomtotal - numlevels)
     return(Vw)
  }

calcWithinGroupsVariance(wine[2],wine[1])

[1] 0.2620525

calcBetweenGroupsVariance <- function(variable,groupvariable)
  {
     # find out how many values the group variable can take
     groupvariable2 <- as.factor(groupvariable[[1]])
     levels <- levels(groupvariable2)
     numlevels <- length(levels)
     # calculate the overall grand mean:
     grandmean <- mean(wine$V2)
      
     # get the mean and standard deviation for each group:
     numtotal <- 0
     denomtotal <- 0
     for (i in 1:numlevels)
     {
        leveli <- levels[i]
        levelidata <- variable[groupvariable==leveli,]
        levelilength <- length(levelidata)
        # get the mean and standard deviation for group i:
        meani <- mean(levelidata)
        sdi <- sd(levelidata)
        numi <- levelilength * ((meani - grandmean)^2)
        denomi <- levelilength
        numtotal <- numtotal + numi
        denomtotal <- denomtotal + denomi
     }
     # calculate the between-groups variance
     Vb <- numtotal / (numlevels - 1)
     Vb <- Vb[[1]]
     return(Vb)
  }

calcBetweenGroupsVariance(wine[2], wine[1])

[1] 35.39742

35.39742/0.2620525

[1] 135.0776

calcSeparations <- function(variables,groupvariable)
  {
     # find out how many variables we have
     variables <- as.data.frame(variables)
     numvariables <- length(variables)
     # find the variable names
     variablenames <- colnames(variables)
     # calculate the separation for each variable
     for (i in 1:numvariables)
     {
        variablei <- variables[i]
        variablename <- variablenames[i]
        Vw <- calcWithinGroupsVariance(variablei, groupvariable)
        Vb <- calcBetweenGroupsVariance(variablei, groupvariable)
        sep <- Vb/Vw
        print(paste("variable",variablename,"Vw=",Vw,"Vb=",Vb,"separation=",sep))
     }
  }

calcSeparations(wine[2:14],wine[1])

[1] "variable V2 Vw= 0.262052469153907 Vb= 35.3974249602692 separation= 135.0776242428"
[1] "variable V3 Vw= 0.887546796746581 Vb= 10154.4606409588 separation= 11441.0425210043"
[1] "variable V4 Vw= 0.0660721013425184 Vb= 10065.3651082674 separation= 152339.109907954"
[1] "variable V5 Vw= 8.00681118121156 Vb= 4040.10461181253 separation= 504.583475290745"
[1] "variable V6 Vw= 180.65777316441 Vb= 671880.903309069 separation= 3719.08106438141"
[1] "variable V7 Vw= 0.191270475224227 Vb= 10218.0270550471 separation= 53421.8730991727"
[1] "variable V8 Vw= 0.274707514337437 Vb= 10777.2342568213 separation= 39231.6689363768"
[1] "variable V9 Vw= 0.0119117022132797 Vb= 14217.0422061125 separation= 1193535.73079276"
[1] "variable V10 Vw= 0.246172943795542 Vb= 11593.6224025303 separation= 47095.4371499059"
[1] "variable V11 Vw= 2.28492308133354 Vb= 5890.16197758506 separation= 2577.83818882314"
[1] "variable V12 Vw= 0.0244876469432414 Vb= 12910.854863443 separation= 527239.505427713"
[1] "variable V13 Vw= 0.160778729560982 Vb= 9636.30640975892 separation= 59935.2068278657"
[1] "variable V14 Vw= 29707.6818705169 Vb= 54112090.6081053 separation= 1821.48478780528"

Between-groups Covariance and Within-groups Covariance for Two Variables

calcWithinGroupsCovariance <- function(variable1,variable2,groupvariable)
  {
     # find out how many values the group variable can take
     groupvariable2 <- as.factor(groupvariable[[1]])
     levels <- levels(groupvariable2)
     numlevels <- length(levels)
     # get the covariance of variable 1 and variable 2 for each group:
     Covw <- 0
     for (i in 1:numlevels)
     {
        leveli <- levels[i]
        levelidata1 <- variable1[groupvariable==leveli,]
        levelidata2 <- variable2[groupvariable==leveli,]
        mean1 <- mean(levelidata1)
        mean2 <- mean(levelidata2)
        levelilength <- length(levelidata1)
        # get the covariance for this group:
        term1 <- 0
        for (j in 1:levelilength)
        {
           term1 <- term1 + ((levelidata1[j] - mean1)*(levelidata2[j] - mean2))
        }
        Cov_groupi <- term1 # covariance for this group
        Covw <- Covw + Cov_groupi
     }
     totallength <- nrow(variable1)
     Covw <- Covw / (totallength - numlevels)
     return(Covw)
  }

calcWithinGroupsCovariance(wine[8],wine[11],wine[1])

[1] 0.2866783

calcBetweenGroupsCovariance <- function(variable1,variable2,groupvariable)
  {
     # find out how many values the group variable can take
     groupvariable2 <- as.factor(groupvariable[[1]])
     levels <- levels(groupvariable2)
     numlevels <- length(levels)
     # calculate the grand means
     variable1mean <- mean(wine$V8)
     variable2mean <- mean(wine$V11)
     # calculate the between-groups covariance
     Covb <- 0
     for (i in 1:numlevels)
     {
        leveli <- levels[i]
        levelidata1 <- variable1[groupvariable==leveli,]
        levelidata2 <- variable2[groupvariable==leveli,]
        mean1 <- mean(levelidata1)
        mean2 <- mean(levelidata2)
        levelilength <- length(levelidata1)
        term1 <- (mean1 - variable1mean)*(mean2 - variable2mean)*(levelilength)
        Covb <- Covb + term1
     }
     Covb <- Covb / (numlevels - 1)
     Covb <- Covb[[1]]
     return(Covb)
  }

calcBetweenGroupsCovariance(wine[8],wine[11],wine[1])

[1] -60.41077

Calculating Correlations for Multivariate Data

cor.test(wine$V2, wine$V3)


    Pearson's product-moment correlation

data:  wine$V2 and wine$V3
t = 1.2579, df = 176, p-value = 0.2101
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
 -0.05342959  0.23817474
sample estimates:
       cor 
0.09439694

mosthighlycorrelated <- function(mydataframe,numtoreport)
  {
     # find the correlations
     cormatrix <- cor(mydataframe)
     # set the correlations on the diagonal or lower triangle to zero,
     # so they will not be reported as the highest ones:
     diag(cormatrix) <- 0
     cormatrix[lower.tri(cormatrix)] <- 0
     # flatten the matrix into a dataframe for easy sorting
     fm <- as.data.frame(as.table(cormatrix))
     # assign human-friendly names
     names(fm) <- c("First.Variable", "Second.Variable","Correlation")
     # sort and print the top n correlations
     head(fm[order(abs(fm$Correlation),decreasing=T),],n=numtoreport)
  }

mosthighlycorrelated(wine[2:14], 10)

    First.Variable Second.Variable Correlation
84              V7              V8   0.8645635
150             V8             V13   0.7871939
149             V7             V13   0.6999494
111             V8             V10   0.6526918
157             V2             V14   0.6437200
110             V7             V10   0.6124131
154            V12             V13   0.5654683
132             V3             V12  -0.5612957
118             V2             V11   0.5463642
137             V8             V12   0.5434786

Standardising Variables

standardisedconcentrations <- as.data.frame(scale(wine[2:14]))

sapply(standardisedconcentrations,mean)

           V2            V3            V4            V5            V6 
-8.591766e-16 -6.776446e-17  8.045176e-16 -7.720494e-17 -4.073935e-17 
           V7            V8            V9           V10           V11 
-1.395560e-17  6.958263e-17 -1.042186e-16 -1.221369e-16  3.649376e-17 
          V12           V13           V14 
 2.093741e-16  3.003459e-16 -1.034429e-16

sapply(standardisedconcentrations,sd)

 V2  V3  V4  V5  V6  V7  V8  V9 V10 V11 V12 V13 V14 
  1   1   1   1   1   1   1   1   1   1   1   1   1

Principal Component Analysis

standardisedconcentrations <- as.data.frame(scale(wine[2:14]))
wine.pca <- prcomp(standardisedconcentrations)
summary(wine.pca)

Importance of components:
                         PC1    PC2    PC3     PC4     PC5     PC6     PC7
Standard deviation     2.169 1.5802 1.2025 0.95863 0.92370 0.80103 0.74231
Proportion of Variance 0.362 0.1921 0.1112 0.07069 0.06563 0.04936 0.04239
Cumulative Proportion  0.362 0.5541 0.6653 0.73599 0.80162 0.85098 0.89337
                           PC8     PC9   PC10    PC11    PC12    PC13
Standard deviation     0.59034 0.53748 0.5009 0.47517 0.41082 0.32152
Proportion of Variance 0.02681 0.02222 0.0193 0.01737 0.01298 0.00795
Cumulative Proportion  0.92018 0.94240 0.9617 0.97907 0.99205 1.00000

wine.pca$sdev

 [1] 2.1692972 1.5801816 1.2025273 0.9586313 0.9237035 0.8010350 0.7423128
 [8] 0.5903367 0.5374755 0.5009017 0.4751722 0.4108165 0.3215244

sum((wine.pca$sdev)^2)

[1] 13

Deciding How Many Principal Components to Retain

screeplot(wine.pca, type="lines")

(wine.pca$sdev)^2

 [1] 4.7058503 2.4969737 1.4460720 0.9189739 0.8532282 0.6416570 0.5510283
 [8] 0.3484974 0.2888799 0.2509025 0.2257886 0.1687702 0.1033779

Loadings for the Principal Components

wine.pca$rotation[,1]

          V2           V3           V4           V5           V6           V7 
-0.144329395  0.245187580  0.002051061  0.239320405 -0.141992042 -0.394660845 
          V8           V9          V10          V11          V12          V13 
-0.422934297  0.298533103 -0.313429488  0.088616705 -0.296714564 -0.376167411 
         V14 
-0.286752227

sum((wine.pca$rotation[,1])^2)

[1] 1

calcpc <- function(variables,loadings)
  {
     # find the number of samples in the data set
     as.data.frame(variables)
     numsamples <- nrow(variables)
     # make a vector to store the component
     pc <- numeric(numsamples)
     # find the number of variables
     numvariables <- length(variables)
     # calculate the value of the component for each sample
     for (i in 1:numsamples)
     {
        valuei <- 0
        for (j in 1:numvariables)
        {
           valueij <- variables[i,j]
           loadingj <- loadings[j]
           valuei <- valuei + (valueij * loadingj)
        }
        pc[i] <- valuei
     }
     return(pc)
  }

calcpc(standardisedconcentrations, wine.pca$rotation[,1])

  [1] -3.30742097 -2.20324981 -2.50966069 -3.74649719 -1.00607049 -3.04167373
  [7] -2.44220051 -2.05364379 -2.50381135 -2.74588238 -3.46994837 -1.74981688
 [13] -2.10751729 -3.44842921 -4.30065228 -2.29870383 -2.16584568 -1.89362947
 [19] -3.53202167 -2.07865856 -3.11561376 -1.08351361 -2.52809263 -1.64036108
 [25] -1.75662066 -0.98729406 -1.77028387 -1.23194878 -2.18225047 -2.24976267
 [31] -2.49318704 -2.66987964 -1.62399801 -1.89733870 -1.40642118 -1.89847087
 [37] -1.38096669 -1.11905070 -1.49796891 -2.52268490 -2.58081526 -0.66660159
 [43] -3.06216898 -0.46090897 -2.09544094 -1.13297020 -2.71893118 -2.81340300
 [49] -2.00419725 -2.69987528 -3.20587409 -2.85091773 -3.49574328 -2.21853316
 [55] -2.14094846 -2.46238340 -2.73380617 -2.16762631 -3.13054925  0.92596992
 [61]  1.53814123  1.83108449 -0.03052074 -2.04449433  0.60796583 -0.89769555
 [67] -2.24218226 -0.18286818  0.81051865 -1.97006319  1.56779366 -1.65301884
 [73]  0.72333196 -2.55501977 -1.82741266  0.86555129 -0.36897357  1.45327752
 [79] -1.25937829 -0.37509228 -0.75992026 -1.03166776  0.49348469  2.53183508
 [85] -0.83297044 -0.78568828  0.80456258  0.55647288  1.11197430  0.55415961
 [91]  1.34548982  1.56008180  1.92711944 -0.74456561 -0.95476209 -2.53670943
 [97]  0.54242248 -1.02814946 -2.24557492 -1.40624916 -0.79547585  0.54798592
[103]  0.16072037  0.65793897 -0.39125074  1.76751314  0.36523707  1.61611371
[109] -0.08230361 -1.57383547 -1.41657326  0.27791878  1.29947929  0.45578615
[115]  0.49279573 -0.48071836  0.25217752  0.10692601  2.42616867  0.54953935
[121] -0.73754141 -1.33256273  1.17377592  0.46103449 -0.97572169  0.09653741
[127] -0.03837888  1.59266578  0.47821593  1.78779033  1.32336859  2.37779336
[133]  2.92867865  2.14077227  2.36320318  3.05522315  3.90473898  3.92539034
[139]  3.08557209  2.36779237  2.77099630  2.28012931  2.97723506  2.36851341
[145]  2.20364930  2.61823528  4.26859758  3.57256360  2.79916760  2.89150275
[151]  2.31420887  2.54265841  1.80744271  2.75238051  2.72945105  3.59472857
[157]  2.88169708  3.38261413  1.04523342  1.60538369  3.13428951  2.23385546
[163]  2.83966343  2.59019044  2.94100316  3.52010248  2.39934228  2.92084537
[169]  2.17527658  2.37423037  3.20258311  3.66757294  2.45862032  3.36104305
[175]  2.59463669  2.67030685  2.38030254  3.19973210

wine.pca$x[,1]

  [1] -3.30742097 -2.20324981 -2.50966069 -3.74649719 -1.00607049 -3.04167373
  [7] -2.44220051 -2.05364379 -2.50381135 -2.74588238 -3.46994837 -1.74981688
 [13] -2.10751729 -3.44842921 -4.30065228 -2.29870383 -2.16584568 -1.89362947
 [19] -3.53202167 -2.07865856 -3.11561376 -1.08351361 -2.52809263 -1.64036108
 [25] -1.75662066 -0.98729406 -1.77028387 -1.23194878 -2.18225047 -2.24976267
 [31] -2.49318704 -2.66987964 -1.62399801 -1.89733870 -1.40642118 -1.89847087
 [37] -1.38096669 -1.11905070 -1.49796891 -2.52268490 -2.58081526 -0.66660159
 [43] -3.06216898 -0.46090897 -2.09544094 -1.13297020 -2.71893118 -2.81340300
 [49] -2.00419725 -2.69987528 -3.20587409 -2.85091773 -3.49574328 -2.21853316
 [55] -2.14094846 -2.46238340 -2.73380617 -2.16762631 -3.13054925  0.92596992
 [61]  1.53814123  1.83108449 -0.03052074 -2.04449433  0.60796583 -0.89769555
 [67] -2.24218226 -0.18286818  0.81051865 -1.97006319  1.56779366 -1.65301884
 [73]  0.72333196 -2.55501977 -1.82741266  0.86555129 -0.36897357  1.45327752
 [79] -1.25937829 -0.37509228 -0.75992026 -1.03166776  0.49348469  2.53183508
 [85] -0.83297044 -0.78568828  0.80456258  0.55647288  1.11197430  0.55415961
 [91]  1.34548982  1.56008180  1.92711944 -0.74456561 -0.95476209 -2.53670943
 [97]  0.54242248 -1.02814946 -2.24557492 -1.40624916 -0.79547585  0.54798592
[103]  0.16072037  0.65793897 -0.39125074  1.76751314  0.36523707  1.61611371
[109] -0.08230361 -1.57383547 -1.41657326  0.27791878  1.29947929  0.45578615
[115]  0.49279573 -0.48071836  0.25217752  0.10692601  2.42616867  0.54953935
[121] -0.73754141 -1.33256273  1.17377592  0.46103449 -0.97572169  0.09653741
[127] -0.03837888  1.59266578  0.47821593  1.78779033  1.32336859  2.37779336
[133]  2.92867865  2.14077227  2.36320318  3.05522315  3.90473898  3.92539034
[139]  3.08557209  2.36779237  2.77099630  2.28012931  2.97723506  2.36851341
[145]  2.20364930  2.61823528  4.26859758  3.57256360  2.79916760  2.89150275
[151]  2.31420887  2.54265841  1.80744271  2.75238051  2.72945105  3.59472857
[157]  2.88169708  3.38261413  1.04523342  1.60538369  3.13428951  2.23385546
[163]  2.83966343  2.59019044  2.94100316  3.52010248  2.39934228  2.92084537
[169]  2.17527658  2.37423037  3.20258311  3.66757294  2.45862032  3.36104305
[175]  2.59463669  2.67030685  2.38030254  3.19973210

wine.pca$rotation[,2]

          V2           V3           V4           V5           V6           V7 
 0.483651548  0.224930935  0.316068814 -0.010590502  0.299634003  0.065039512 
          V8           V9          V10          V11          V12          V13 
-0.003359812  0.028779488  0.039301722  0.529995672 -0.279235148 -0.164496193 
         V14 
 0.364902832

sum((wine.pca$rotation[,2])^2)

[1] 1

Scatterplots of the Principal Components

plot(wine.pca$x[,1],wine.pca$x[,2])
text(wine.pca$x[,1],wine.pca$x[,2], wine$V1, cex=0.7, pos=4, col="red")

printMeanAndSdByGroup(standardisedconcentrations,wine[1])

[1] "Means:"
  V1         V2         V3         V4         V5          V6          V7
1  1  0.9166093 -0.2915199  0.3246886 -0.7359212  0.46192317  0.87090552
2  2 -0.8892116 -0.3613424 -0.4437061  0.2225094 -0.36354162 -0.05790375
3  3  0.1886265  0.8928122  0.2572190  0.5754413 -0.03004191 -0.98483874
           V8          V9        V10        V11        V12        V13
1  0.95419225 -0.57735640  0.5388633  0.2028288  0.4575567  0.7691811
2  0.05163434  0.01452785  0.0688079 -0.8503999  0.4323908  0.2446043
3 -1.24923710  0.68817813 -0.7641311  1.0085728 -1.2019916 -1.3072623
         V14
1  1.1711967
2 -0.7220731
3 -0.3715295
[1] "Standard deviations:"
  V1        V2        V3        V4        V5        V6        V7        V8
1  1 0.5692415 0.6163463 0.8280333 0.7624716 0.7350927 0.5416007 0.3979478
2  2 0.6626591 0.9090742 1.1498967 1.0030563 1.1730101 0.8713912 0.7065071
3  3 0.6531461 0.9738258 0.6732065 0.6761844 0.7625055 0.5703767 0.2938394
         V9       V10       V11       V12       V13       V14
1 0.5628555 0.7200189 0.5342623 0.5096112 0.5029315 0.7034472
2 0.9960462 1.0519061 0.3989712 0.8878480 0.6994087 0.4992299
3 0.9974790 0.7142999 0.9968323 0.5006795 0.3832607 0.3654948
[1] "Sample sizes:"
  V1 V2 V3 V4 V5 V6 V7 V8 V9 V10 V11 V12 V13 V14
1  1 59 59 59 59 59 59 59 59  59  59  59  59  59
2  2 71 71 71 71 71 71 71 71  71  71  71  71  71
3  3 48 48 48 48 48 48 48 48  48  48  48  48  48

Linear Discriminant Analysis

wine.lda <- lda(wine$V1 ~ wine$V2 + wine$V3 + wine$V4 + wine$V5 + wine$V6 + wine$V7 +
                            wine$V8 + wine$V9 + wine$V10 + wine$V11 + wine$V12 + wine$V13 +
                            wine$V14)
wine.lda

Call:
lda(wine$V1 ~ wine$V2 + wine$V3 + wine$V4 + wine$V5 + wine$V6 + 
    wine$V7 + wine$V8 + wine$V9 + wine$V10 + wine$V11 + wine$V12 + 
    wine$V13 + wine$V14)

Prior probabilities of groups:
        1         2         3 
0.3314607 0.3988764 0.2696629 

Group means:
   wine$V2  wine$V3  wine$V4  wine$V5  wine$V6  wine$V7   wine$V8  wine$V9
1 13.74475 2.010678 2.455593 17.03729 106.3390 2.840169 2.9823729 0.290000
2 12.27873 1.932676 2.244789 20.23803  94.5493 2.258873 2.0808451 0.363662
3 13.15375 3.333750 2.437083 21.41667  99.3125 1.678750 0.7814583 0.447500
  wine$V10 wine$V11  wine$V12 wine$V13  wine$V14
1 1.899322 5.528305 1.0620339 3.157797 1115.7119
2 1.630282 3.086620 1.0562817 2.785352  519.5070
3 1.153542 7.396250 0.6827083 1.683542  629.8958

Coefficients of linear discriminants:
                  LD1           LD2
wine$V2  -0.403399781  0.8717930699
wine$V3   0.165254596  0.3053797325
wine$V4  -0.369075256  2.3458497486
wine$V5   0.154797889 -0.1463807654
wine$V6  -0.002163496 -0.0004627565
wine$V7   0.618052068 -0.0322128171
wine$V8  -1.661191235 -0.4919980543
wine$V9  -1.495818440 -1.6309537953
wine$V10  0.134092628 -0.3070875776
wine$V11  0.355055710  0.2532306865
wine$V12 -0.818036073 -1.5156344987
wine$V13 -1.157559376  0.0511839665
wine$V14 -0.002691206  0.0028529846

Proportion of trace:
   LD1    LD2 
0.6875 0.3125

wine.lda$scaling[,1]

     wine$V2      wine$V3      wine$V4      wine$V5      wine$V6      wine$V7 
-0.403399781  0.165254596 -0.369075256  0.154797889 -0.002163496  0.618052068 
     wine$V8      wine$V9     wine$V10     wine$V11     wine$V12     wine$V13 
-1.661191235 -1.495818440  0.134092628  0.355055710 -0.818036073 -1.157559376 
    wine$V14 
-0.002691206

calclda <- function(variables,loadings)
  {
     # find the number of samples in the data set
     as.data.frame(variables)
     numsamples <- nrow(variables)
     # make a vector to store the discriminant function
     ld <- numeric(numsamples)
     # find the number of variables
     numvariables <- length(variables)
     # calculate the value of the discriminant function for each sample
     for (i in 1:numsamples)
     {
        valuei <- 0
        for (j in 1:numvariables)
        {
           valueij <- variables[i,j]
           loadingj <- loadings[j]
           valuei <- valuei + (valueij * loadingj)
        }
        ld[i] <- valuei
     }
     # standardise the discriminant function so that its mean value is 0:
     ld <- as.data.frame(scale(ld, center=TRUE, scale=FALSE))
     ld <- ld[[1]]
     return(ld)
  }

calclda(wine[2:14], wine.lda$scaling[,1])

  [1] -4.70024401 -4.30195811 -3.42071952 -4.20575366 -1.50998168 -4.51868934
  [7] -4.52737794 -4.14834781 -3.86082876 -3.36662444 -4.80587907 -3.42807646
 [13] -3.66610246 -5.58824635 -5.50131449 -3.18475189 -3.28936988 -2.99809262
 [19] -5.24640372 -3.13653106 -3.57747791 -1.69077135 -4.83515033 -3.09588961
 [25] -3.32164716 -2.14482223 -3.98242850 -2.68591432 -3.56309464 -3.17301573
 [31] -2.99626797 -3.56866244 -3.38506383 -3.52753750 -2.85190852 -2.79411996
 [37] -2.75808511 -2.17734477 -3.02926382 -3.27105228 -2.92065533 -2.23721062
 [43] -4.69972568 -1.23036133 -2.58203904 -2.58312049 -3.88887889 -3.44975356
 [49] -2.34223331 -3.52062596 -3.21840912 -4.38214896 -4.36311727 -3.51917293
 [55] -3.12277475 -1.80240540 -2.87378754 -3.61690518 -3.73868551  1.58618749
 [61]  0.79967216  2.38015446 -0.45917726 -0.50726885  0.39398359 -0.92256616
 [67] -1.95549377 -0.34732815  0.20371212 -0.24831914  1.17987999 -1.07718925
 [73]  0.64100179 -1.74684421 -0.34721117  1.14274222  0.18665882  0.90052500
 [79] -0.70709551 -0.59562833 -0.55761818 -1.80430417  0.23077079  2.03482711
 [85] -0.62113021 -1.03372742  0.76598781  0.35042568  0.15324508 -0.14962842
 [91]  0.48079504  1.39689016  0.91972331 -0.59102937  0.49411386 -1.62614426
 [97]  2.00044562 -1.00534818 -2.07121314 -1.63815890 -1.05894340  0.02594549
[103] -0.21887407  1.36437640 -1.12901245 -0.21263094 -0.77946884  0.61546732
[109]  0.22550192 -2.03869851  0.79274716  0.30229545 -0.50664882  0.99837397
[115] -0.21954922 -0.37131517  0.05545894 -0.09137874  1.79755252 -0.17405009
[121] -1.17870281 -3.21054390  0.62605202  0.03366613 -0.69930080 -0.72061079
[127] -0.51933512  1.17030045  0.10824791  1.12319783  2.24632419  3.28527755
[133]  4.07236441  3.86691235  3.45088333  3.71583899  3.92220510  4.85161020
[139]  3.54993389  3.76889174  2.66942250  2.32491492  3.17712883  2.88964418
[145]  3.78325562  3.04411324  4.70697017  4.85021393  4.98359184  4.86968293
[151]  4.59869190  5.67447884  5.32986123  5.03401031  4.52080087  5.09783710
[157]  5.04368277  4.86980829  5.61316558  5.67046737  5.37413513  3.09975377
[163]  3.35888137  3.04007194  4.94861303  4.54504458  5.27255844  5.13016117
[169]  4.30468082  5.08336782  4.06743571  5.74212961  4.48205140  4.29150758
[175]  4.50329623  5.04747033  4.27615505  5.53808610

wine.lda.values <- predict(wine.lda, wine[2:14])
wine.lda.values$x[,1]

          1           2           3           4           5           6 
-4.70024401 -4.30195811 -3.42071952 -4.20575366 -1.50998168 -4.51868934 
          7           8           9          10          11          12 
-4.52737794 -4.14834781 -3.86082876 -3.36662444 -4.80587907 -3.42807646 
         13          14          15          16          17          18 
-3.66610246 -5.58824635 -5.50131449 -3.18475189 -3.28936988 -2.99809262 
         19          20          21          22          23          24 
-5.24640372 -3.13653106 -3.57747791 -1.69077135 -4.83515033 -3.09588961 
         25          26          27          28          29          30 
-3.32164716 -2.14482223 -3.98242850 -2.68591432 -3.56309464 -3.17301573 
         31          32          33          34          35          36 
-2.99626797 -3.56866244 -3.38506383 -3.52753750 -2.85190852 -2.79411996 
         37          38          39          40          41          42 
-2.75808511 -2.17734477 -3.02926382 -3.27105228 -2.92065533 -2.23721062 
         43          44          45          46          47          48 
-4.69972568 -1.23036133 -2.58203904 -2.58312049 -3.88887889 -3.44975356 
         49          50          51          52          53          54 
-2.34223331 -3.52062596 -3.21840912 -4.38214896 -4.36311727 -3.51917293 
         55          56          57          58          59          60 
-3.12277475 -1.80240540 -2.87378754 -3.61690518 -3.73868551  1.58618749 
         61          62          63          64          65          66 
 0.79967216  2.38015446 -0.45917726 -0.50726885  0.39398359 -0.92256616 
         67          68          69          70          71          72 
-1.95549377 -0.34732815  0.20371212 -0.24831914  1.17987999 -1.07718925 
         73          74          75          76          77          78 
 0.64100179 -1.74684421 -0.34721117  1.14274222  0.18665882  0.90052500 
         79          80          81          82          83          84 
-0.70709551 -0.59562833 -0.55761818 -1.80430417  0.23077079  2.03482711 
         85          86          87          88          89          90 
-0.62113021 -1.03372742  0.76598781  0.35042568  0.15324508 -0.14962842 
         91          92          93          94          95          96 
 0.48079504  1.39689016  0.91972331 -0.59102937  0.49411386 -1.62614426 
         97          98          99         100         101         102 
 2.00044562 -1.00534818 -2.07121314 -1.63815890 -1.05894340  0.02594549 
        103         104         105         106         107         108 
-0.21887407  1.36437640 -1.12901245 -0.21263094 -0.77946884  0.61546732 
        109         110         111         112         113         114 
 0.22550192 -2.03869851  0.79274716  0.30229545 -0.50664882  0.99837397 
        115         116         117         118         119         120 
-0.21954922 -0.37131517  0.05545894 -0.09137874  1.79755252 -0.17405009 
        121         122         123         124         125         126 
-1.17870281 -3.21054390  0.62605202  0.03366613 -0.69930080 -0.72061079 
        127         128         129         130         131         132 
-0.51933512  1.17030045  0.10824791  1.12319783  2.24632419  3.28527755 
        133         134         135         136         137         138 
 4.07236441  3.86691235  3.45088333  3.71583899  3.92220510  4.85161020 
        139         140         141         142         143         144 
 3.54993389  3.76889174  2.66942250  2.32491492  3.17712883  2.88964418 
        145         146         147         148         149         150 
 3.78325562  3.04411324  4.70697017  4.85021393  4.98359184  4.86968293 
        151         152         153         154         155         156 
 4.59869190  5.67447884  5.32986123  5.03401031  4.52080087  5.09783710 
        157         158         159         160         161         162 
 5.04368277  4.86980829  5.61316558  5.67046737  5.37413513  3.09975377 
        163         164         165         166         167         168 
 3.35888137  3.04007194  4.94861303  4.54504458  5.27255844  5.13016117 
        169         170         171         172         173         174 
 4.30468082  5.08336782  4.06743571  5.74212961  4.48205140  4.29150758 
        175         176         177         178 
 4.50329623  5.04747033  4.27615505  5.53808610

groupStandardise <- function(variables, groupvariable)
  {
     # find out how many variables we have
     variables <- as.data.frame(variables)
     numvariables <- length(variables)
     # find the variable names
     variablenames <- colnames(variables)
     # calculate the group-standardised version of each variable
     for (i in 1:numvariables)
     {
        variablei <- variables[i]
        variablei_name <- variablenames[i]
        variablei_Vw <- calcWithinGroupsVariance(variablei, groupvariable)
        variablei_mean <- mean(wine$V2) + mean(wine$V3) + mean(wine$V4) + mean(wine$V5) + mean(wine$V6) + mean(wine$V7) + mean(wine$V8) + mean(wine$V9) + mean(wine$V10) + mean(wine$V11) + mean(wine$V12) + mean(wine$V13) + mean(wine$V14)
        variablei_new <- (variablei - variablei_mean)/(sqrt(variablei_Vw))
        data_length <- nrow(variablei)
        if (i == 1) { variables_new <- data.frame(row.names=seq(1,data_length)) }
        variables_new[`variablei_name`] <- variablei_new
     }
     return(variables_new)
  }

groupstandardisedconcentrations <- groupStandardise(wine[2:14], wine[1])

wine.lda2 <- lda(wine$V1 ~ groupstandardisedconcentrations$V2 + groupstandardisedconcentrations$V3 +
                             groupstandardisedconcentrations$V4 + groupstandardisedconcentrations$V5 +
                             groupstandardisedconcentrations$V6 + groupstandardisedconcentrations$V7 +
                             groupstandardisedconcentrations$V8 + groupstandardisedconcentrations$V9 +
                             groupstandardisedconcentrations$V10 + groupstandardisedconcentrations$V11 +
                             groupstandardisedconcentrations$V12 + groupstandardisedconcentrations$V13 +
                             groupstandardisedconcentrations$V14)
wine.lda2

Call:
lda(wine$V1 ~ groupstandardisedconcentrations$V2 + groupstandardisedconcentrations$V3 + 
    groupstandardisedconcentrations$V4 + groupstandardisedconcentrations$V5 + 
    groupstandardisedconcentrations$V6 + groupstandardisedconcentrations$V7 + 
    groupstandardisedconcentrations$V8 + groupstandardisedconcentrations$V9 + 
    groupstandardisedconcentrations$V10 + groupstandardisedconcentrations$V11 + 
    groupstandardisedconcentrations$V12 + groupstandardisedconcentrations$V13 + 
    groupstandardisedconcentrations$V14)

Prior probabilities of groups:
        1         2         3 
0.3314607 0.3988764 0.2696629 

Group means:
  groupstandardisedconcentrations$V2 groupstandardisedconcentrations$V3
1                          -1728.804                          -951.8414
2                          -1731.667                          -951.9242
3                          -1729.958                          -950.4370
  groupstandardisedconcentrations$V4 groupstandardisedconcentrations$V5
1                          -3486.869                          -311.5955
2                          -3487.689                          -310.4644
3                          -3486.941                          -310.0478
  groupstandardisedconcentrations$V6 groupstandardisedconcentrations$V7
1                          -58.95429                          -2048.492
2                          -59.83144                          -2049.821
3                          -59.47706                          -2051.148
  groupstandardisedconcentrations$V8 groupstandardisedconcentrations$V9
1                          -1709.047                          -8232.009
2                          -1710.767                          -8231.334
3                          -1713.247                          -8230.566
  groupstandardisedconcentrations$V10 groupstandardisedconcentrations$V11
1                           -1807.565                           -590.9047
2                           -1808.108                           -592.5200
3                           -1809.068                           -589.6690
  groupstandardisedconcentrations$V12 groupstandardisedconcentrations$V13
1                           -5736.485                           -2233.521
2                           -5736.522                           -2234.450
3                           -5738.909                           -2237.198
  groupstandardisedconcentrations$V14
1                            1.258849
2                           -2.200234
3                           -1.559777

Coefficients of linear discriminants:
                                            LD1          LD2
groupstandardisedconcentrations$V2  -0.20650463  0.446280119
groupstandardisedconcentrations$V3   0.15568586  0.287697336
groupstandardisedconcentrations$V4  -0.09486893  0.602988809
groupstandardisedconcentrations$V5   0.43802089 -0.414203541
groupstandardisedconcentrations$V6  -0.02907934 -0.006219863
groupstandardisedconcentrations$V7   0.27030186 -0.014088108
groupstandardisedconcentrations$V8  -0.87067265 -0.257868714
groupstandardisedconcentrations$V9  -0.16325474 -0.178003512
groupstandardisedconcentrations$V10  0.06653116 -0.152364015
groupstandardisedconcentrations$V11  0.53670086  0.382782544
groupstandardisedconcentrations$V12 -0.12801061 -0.237174509
groupstandardisedconcentrations$V13 -0.46414916  0.020523349
groupstandardisedconcentrations$V14 -0.46385409  0.491738050

Proportion of trace:
   LD1    LD2 
0.6875 0.3125

wine.lda.values <- predict(wine.lda, wine[2:14])
wine.lda.values$x[,1]

          1           2           3           4           5           6 
-4.70024401 -4.30195811 -3.42071952 -4.20575366 -1.50998168 -4.51868934 
          7           8           9          10          11          12 
-4.52737794 -4.14834781 -3.86082876 -3.36662444 -4.80587907 -3.42807646 
         13          14          15          16          17          18 
-3.66610246 -5.58824635 -5.50131449 -3.18475189 -3.28936988 -2.99809262 
         19          20          21          22          23          24 
-5.24640372 -3.13653106 -3.57747791 -1.69077135 -4.83515033 -3.09588961 
         25          26          27          28          29          30 
-3.32164716 -2.14482223 -3.98242850 -2.68591432 -3.56309464 -3.17301573 
         31          32          33          34          35          36 
-2.99626797 -3.56866244 -3.38506383 -3.52753750 -2.85190852 -2.79411996 
         37          38          39          40          41          42 
-2.75808511 -2.17734477 -3.02926382 -3.27105228 -2.92065533 -2.23721062 
         43          44          45          46          47          48 
-4.69972568 -1.23036133 -2.58203904 -2.58312049 -3.88887889 -3.44975356 
         49          50          51          52          53          54 
-2.34223331 -3.52062596 -3.21840912 -4.38214896 -4.36311727 -3.51917293 
         55          56          57          58          59          60 
-3.12277475 -1.80240540 -2.87378754 -3.61690518 -3.73868551  1.58618749 
         61          62          63          64          65          66 
 0.79967216  2.38015446 -0.45917726 -0.50726885  0.39398359 -0.92256616 
         67          68          69          70          71          72 
-1.95549377 -0.34732815  0.20371212 -0.24831914  1.17987999 -1.07718925 
         73          74          75          76          77          78 
 0.64100179 -1.74684421 -0.34721117  1.14274222  0.18665882  0.90052500 
         79          80          81          82          83          84 
-0.70709551 -0.59562833 -0.55761818 -1.80430417  0.23077079  2.03482711 
         85          86          87          88          89          90 
-0.62113021 -1.03372742  0.76598781  0.35042568  0.15324508 -0.14962842 
         91          92          93          94          95          96 
 0.48079504  1.39689016  0.91972331 -0.59102937  0.49411386 -1.62614426 
         97          98          99         100         101         102 
 2.00044562 -1.00534818 -2.07121314 -1.63815890 -1.05894340  0.02594549 
        103         104         105         106         107         108 
-0.21887407  1.36437640 -1.12901245 -0.21263094 -0.77946884  0.61546732 
        109         110         111         112         113         114 
 0.22550192 -2.03869851  0.79274716  0.30229545 -0.50664882  0.99837397 
        115         116         117         118         119         120 
-0.21954922 -0.37131517  0.05545894 -0.09137874  1.79755252 -0.17405009 
        121         122         123         124         125         126 
-1.17870281 -3.21054390  0.62605202  0.03366613 -0.69930080 -0.72061079 
        127         128         129         130         131         132 
-0.51933512  1.17030045  0.10824791  1.12319783  2.24632419  3.28527755 
        133         134         135         136         137         138 
 4.07236441  3.86691235  3.45088333  3.71583899  3.92220510  4.85161020 
        139         140         141         142         143         144 
 3.54993389  3.76889174  2.66942250  2.32491492  3.17712883  2.88964418 
        145         146         147         148         149         150 
 3.78325562  3.04411324  4.70697017  4.85021393  4.98359184  4.86968293 
        151         152         153         154         155         156 
 4.59869190  5.67447884  5.32986123  5.03401031  4.52080087  5.09783710 
        157         158         159         160         161         162 
 5.04368277  4.86980829  5.61316558  5.67046737  5.37413513  3.09975377 
        163         164         165         166         167         168 
 3.35888137  3.04007194  4.94861303  4.54504458  5.27255844  5.13016117 
        169         170         171         172         173         174 
 4.30468082  5.08336782  4.06743571  5.74212961  4.48205140  4.29150758 
        175         176         177         178 
 4.50329623  5.04747033  4.27615505  5.53808610

wine.lda.values2 <- predict(wine.lda2, groupstandardisedconcentrations)
wine.lda.values2$x[,1]

          1           2           3           4           5           6 
-4.70024401 -4.30195811 -3.42071952 -4.20575366 -1.50998168 -4.51868934 
          7           8           9          10          11          12 
-4.52737794 -4.14834781 -3.86082876 -3.36662444 -4.80587907 -3.42807646 
         13          14          15          16          17          18 
-3.66610246 -5.58824635 -5.50131449 -3.18475189 -3.28936988 -2.99809262 
         19          20          21          22          23          24 
-5.24640372 -3.13653106 -3.57747791 -1.69077135 -4.83515033 -3.09588961 
         25          26          27          28          29          30 
-3.32164716 -2.14482223 -3.98242850 -2.68591432 -3.56309464 -3.17301573 
         31          32          33          34          35          36 
-2.99626797 -3.56866244 -3.38506383 -3.52753750 -2.85190852 -2.79411996 
         37          38          39          40          41          42 
-2.75808511 -2.17734477 -3.02926382 -3.27105228 -2.92065533 -2.23721062 
         43          44          45          46          47          48 
-4.69972568 -1.23036133 -2.58203904 -2.58312049 -3.88887889 -3.44975356 
         49          50          51          52          53          54 
-2.34223331 -3.52062596 -3.21840912 -4.38214896 -4.36311727 -3.51917293 
         55          56          57          58          59          60 
-3.12277475 -1.80240540 -2.87378754 -3.61690518 -3.73868551  1.58618749 
         61          62          63          64          65          66 
 0.79967216  2.38015446 -0.45917726 -0.50726885  0.39398359 -0.92256616 
         67          68          69          70          71          72 
-1.95549377 -0.34732815  0.20371212 -0.24831914  1.17987999 -1.07718925 
         73          74          75          76          77          78 
 0.64100179 -1.74684421 -0.34721117  1.14274222  0.18665882  0.90052500 
         79          80          81          82          83          84 
-0.70709551 -0.59562833 -0.55761818 -1.80430417  0.23077079  2.03482711 
         85          86          87          88          89          90 
-0.62113021 -1.03372742  0.76598781  0.35042568  0.15324508 -0.14962842 
         91          92          93          94          95          96 
 0.48079504  1.39689016  0.91972331 -0.59102937  0.49411386 -1.62614426 
         97          98          99         100         101         102 
 2.00044562 -1.00534818 -2.07121314 -1.63815890 -1.05894340  0.02594549 
        103         104         105         106         107         108 
-0.21887407  1.36437640 -1.12901245 -0.21263094 -0.77946884  0.61546732 
        109         110         111         112         113         114 
 0.22550192 -2.03869851  0.79274716  0.30229545 -0.50664882  0.99837397 
        115         116         117         118         119         120 
-0.21954922 -0.37131517  0.05545894 -0.09137874  1.79755252 -0.17405009 
        121         122         123         124         125         126 
-1.17870281 -3.21054390  0.62605202  0.03366613 -0.69930080 -0.72061079 
        127         128         129         130         131         132 
-0.51933512  1.17030045  0.10824791  1.12319783  2.24632419  3.28527755 
        133         134         135         136         137         138 
 4.07236441  3.86691235  3.45088333  3.71583899  3.92220510  4.85161020 
        139         140         141         142         143         144 
 3.54993389  3.76889174  2.66942250  2.32491492  3.17712883  2.88964418 
        145         146         147         148         149         150 
 3.78325562  3.04411324  4.70697017  4.85021393  4.98359184  4.86968293 
        151         152         153         154         155         156 
 4.59869190  5.67447884  5.32986123  5.03401031  4.52080087  5.09783710 
        157         158         159         160         161         162 
 5.04368277  4.86980829  5.61316558  5.67046737  5.37413513  3.09975377 
        163         164         165         166         167         168 
 3.35888137  3.04007194  4.94861303  4.54504458  5.27255844  5.13016117 
        169         170         171         172         173         174 
 4.30468082  5.08336782  4.06743571  5.74212961  4.48205140  4.29150758 
        175         176         177         178 
 4.50329623  5.04747033  4.27615505  5.53808610

Separation Achieved by the Discriminant Functions

wine.lda.values <- predict(wine.lda, standardisedconcentrations)

calcSeparations(wine.lda.values$x,wine[1])

[1] "variable LD1 Vw= 0.999999999999999 Vb= 15837.082234555 separation= 15837.082234555"
[1] "variable LD2 Vw= 1 Vb= 15403.6710754822 separation= 15403.6710754822"

wine.lda

Call:
lda(wine$V1 ~ wine$V2 + wine$V3 + wine$V4 + wine$V5 + wine$V6 + 
    wine$V7 + wine$V8 + wine$V9 + wine$V10 + wine$V11 + wine$V12 + 
    wine$V13 + wine$V14)

Prior probabilities of groups:
        1         2         3 
0.3314607 0.3988764 0.2696629 

Group means:
   wine$V2  wine$V3  wine$V4  wine$V5  wine$V6  wine$V7   wine$V8  wine$V9
1 13.74475 2.010678 2.455593 17.03729 106.3390 2.840169 2.9823729 0.290000
2 12.27873 1.932676 2.244789 20.23803  94.5493 2.258873 2.0808451 0.363662
3 13.15375 3.333750 2.437083 21.41667  99.3125 1.678750 0.7814583 0.447500
  wine$V10 wine$V11  wine$V12 wine$V13  wine$V14
1 1.899322 5.528305 1.0620339 3.157797 1115.7119
2 1.630282 3.086620 1.0562817 2.785352  519.5070
3 1.153542 7.396250 0.6827083 1.683542  629.8958

Coefficients of linear discriminants:
                  LD1           LD2
wine$V2  -0.403399781  0.8717930699
wine$V3   0.165254596  0.3053797325
wine$V4  -0.369075256  2.3458497486
wine$V5   0.154797889 -0.1463807654
wine$V6  -0.002163496 -0.0004627565
wine$V7   0.618052068 -0.0322128171
wine$V8  -1.661191235 -0.4919980543
wine$V9  -1.495818440 -1.6309537953
wine$V10  0.134092628 -0.3070875776
wine$V11  0.355055710  0.2532306865
wine$V12 -0.818036073 -1.5156344987
wine$V13 -1.157559376  0.0511839665
wine$V14 -0.002691206  0.0028529846

Proportion of trace:
   LD1    LD2 
0.6875 0.3125

(wine.lda$svd)^2

[1] 794.6522 361.2410

A Stacked Histogram of the LDA Values

ldahist(data = wine.lda.values$x[,1], g=wine$V1)

ldahist(data = wine.lda.values$x[,2], g=wine$V1)

Scatterplots of the Discriminant Functions

plot(wine.lda.values$x[,1],wine.lda.values$x[,2]) 

text(wine.lda.values$x[,1],wine.lda.values$x[,2],wine$V1,cex=0.7,pos=4,col="red")

Allocation Rules and Misclassification Rate

printMeanAndSdByGroup(wine.lda.values$x,wine[1])

[1] "Means:"
  V1         LD1       LD2
1  1 -3.42248851  1.691674
2  2 -0.07972623 -2.472656
3  3  4.32473717  1.578120
[1] "Standard deviations:"
  V1       LD1       LD2
1  1 0.9394626 1.0176394
2  2 1.0839318 0.9973165
3  3 0.9404188 0.9818673
[1] "Sample sizes:"
  V1 LD1 LD2
1  1  59  59
2  2  71  71
3  3  48  48

calcAllocationRuleAccuracy <- function(ldavalue, groupvariable, cutoffpoints)
  {
     # find out how many values the group variable can take
     groupvariable2 <- as.factor(groupvariable[[1]])
     levels <- levels(groupvariable2)
     numlevels <- length(levels)
     # calculate the number of true positives and false negatives for each group
     numlevels <- length(levels)
     for (i in 1:numlevels)
     {
        leveli <- levels[i]
        levelidata <- ldavalue[groupvariable==leveli]
        # see how many of the samples from this group are classified in each group
        for (j in 1:numlevels)
        {
           levelj <- levels[j]
           if (j == 1)
           {
              cutoff1 <- cutoffpoints[1]
              cutoff2 <- "NA"
              results <- summary(levelidata <= cutoff1)
           }
           else if (j == numlevels)
           {
              cutoff1 <- cutoffpoints[(numlevels-1)]
              cutoff2 <- "NA"
              results <- summary(levelidata > cutoff1)
           }
           else
           {
              cutoff1 <- cutoffpoints[(j-1)]
              cutoff2 <- cutoffpoints[(j)]
              results <- summary(levelidata > cutoff1 & levelidata <= cutoff2)
           }
           trues <- results["TRUE"]
           trues <- trues[[1]]
           print(paste("Number of samples of group",leveli,"classified as group",levelj," : ",
              trues,"(cutoffs:",cutoff1,",",cutoff2,")"))
        }
     }
  }

calcAllocationRuleAccuracy(wine.lda.values$x[,1], wine[1], c(-1.751107, 2.122505))

[1] "Number of samples of group 1 classified as group 1  :  56 (cutoffs: -1.751107 , NA )"
[1] "Number of samples of group 1 classified as group 2  :  3 (cutoffs: -1.751107 , 2.122505 )"
[1] "Number of samples of group 1 classified as group 3  :  NA (cutoffs: 2.122505 , NA )"
[1] "Number of samples of group 2 classified as group 1  :  5 (cutoffs: -1.751107 , NA )"
[1] "Number of samples of group 2 classified as group 2  :  65 (cutoffs: -1.751107 , 2.122505 )"
[1] "Number of samples of group 2 classified as group 3  :  1 (cutoffs: 2.122505 , NA )"
[1] "Number of samples of group 3 classified as group 1  :  NA (cutoffs: -1.751107 , NA )"
[1] "Number of samples of group 3 classified as group 2  :  NA (cutoffs: -1.751107 , 2.122505 )"
[1] "Number of samples of group 3 classified as group 3  :  48 (cutoffs: 2.122505 , NA )"

Summary

In finding the best low-dimensional representation of the variation in a multivariate data set, we used the principal component analysis (PCA). The wine data set includes thirteen chemical concentrations describing wine samples from three different cultivars. Using a principal component analysis, we discovered that the first two principal components—each of which is a specific linear combination of the 13 chemical concentrations—can effectively capture most of the variance in concentration levels between the samples. On the one hand, linear discriminant analysis (LDA) aims to identify the linear combinations of the initial variables—13 chemical concentrations—that provide the best possible separation between the groups—wine cultivars—in a given data set. The wines, in this instance, come from three (3) different cultivars, so there are three groups and thirteen variables if we want to split the wines by cultivar. So, the minimum, either 2 or 13, useful discriminant functions that may distinguish the wines by cultivar are the maximum. So, utilizing 13 chemical concentration data, we can only discover a maximum of 2 appropriate discriminant functions to differentiate the wines by cultivar.

ACTIVITY 1

Marvanessa G. Dinorog

03-06-2023