wine <- read.table("http://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.data",
sep=",")
wine
wine1<-wine[2:6]
paged_table(wine1)
scatterplotMatrix(wine[2:6])
plot(wine$V4, wine$V5)
plot(wine$V4, wine$V5)
text(wine$V4, wine$V5, wine$V1, cex=0.7, pos=4, col="red")
makeProfilePlot <- function(mylist,names)
{
require(RColorBrewer)
# find out how many variables we want to include
numvariables <- length(mylist)
# choose 'numvariables' random colours
colours <- brewer.pal(numvariables,"Set1")
# find out the minimum and maximum values of the variables:
mymin <- 1e+20
mymax <- 1e-20
for (i in 1:numvariables)
{
vectori <- mylist[[i]]
mini <- min(vectori)
maxi <- max(vectori)
if (mini < mymin) { mymin <- mini }
if (maxi > mymax) { mymax <- maxi }
}
# plot the variables
for (i in 1:numvariables)
{
vectori <- mylist[[i]]
namei <- names[i]
colouri <- colours[i]
if (i == 1) { plot(vectori,col=colouri,type="l",ylim=c(mymin,mymax)) }
else { points(vectori, col=colouri,type="l") }
lastxval <- length(vectori)
lastyval <- vectori[length(vectori)]
text((lastxval-10),(lastyval),namei,col="black",cex=0.6)
}
}
names <- c("V2","V3","V4","V5","V6")
mylist <- list(wine$V2,wine$V3,wine$V4,wine$V5,wine$V6)
makeProfilePlot(mylist,names)
sapply(wine[2:14],mean)
V2 V3 V4 V5 V6 V7
13.0006180 2.3363483 2.3665169 19.4949438 99.7415730 2.2951124
V8 V9 V10 V11 V12 V13
2.0292697 0.3618539 1.5908989 5.0580899 0.9574494 2.6116854
V14
746.8932584
sapply(wine[2:14],sd)
V2 V3 V4 V5 V6 V7
0.8118265 1.1171461 0.2743440 3.3395638 14.2824835 0.6258510
V8 V9 V10 V11 V12 V13
0.9988587 0.1244533 0.5723589 2.3182859 0.2285716 0.7099904
V14
314.9074743
cultivar2wine <- wine[wine$V1=="2",]
sapply(cultivar2wine[2:14],mean)
V2 V3 V4 V5 V6 V7 V8
12.278732 1.932676 2.244789 20.238028 94.549296 2.258873 2.080845
V9 V10 V11 V12 V13 V14
0.363662 1.630282 3.086620 1.056282 2.785352 519.507042
sapply(cultivar2wine[2:14],sd)
V2 V3 V4 V5 V6 V7
0.5379642 1.0155687 0.3154673 3.3497704 16.7534975 0.5453611
V8 V9 V10 V11 V12 V13
0.7057008 0.1239613 0.6020678 0.9249293 0.2029368 0.4965735
V14
157.2112204
printMeanAndSdByGroup <- function(variables,groupvariable)
{
# find the names of the variables
variablenames <- c(names(groupvariable),names(as.data.frame(variables)))
# within each group, find the mean of each variable
groupvariable <- groupvariable[,1] # ensures groupvariable is not a list
means <- aggregate(as.matrix(variables) ~ groupvariable, FUN = mean)
names(means) <- variablenames
print(paste("Means:"))
print(means)
# within each group, find the standard deviation of each variable:
sds <- aggregate(as.matrix(variables) ~ groupvariable, FUN = sd)
names(sds) <- variablenames
print(paste("Standard deviations:"))
print(sds)
# within each group, find the number of samples:
samplesizes <- aggregate(as.matrix(variables) ~ groupvariable, FUN = length)
names(samplesizes) <- variablenames
print(paste("Sample sizes:"))
print(samplesizes)
}
printMeanAndSdByGroup(wine[2:14],wine[1])
[1] "Means:"
V1 V2 V3 V4 V5 V6 V7 V8 V9
1 1 13.74475 2.010678 2.455593 17.03729 106.3390 2.840169 2.9823729 0.290000
2 2 12.27873 1.932676 2.244789 20.23803 94.5493 2.258873 2.0808451 0.363662
3 3 13.15375 3.333750 2.437083 21.41667 99.3125 1.678750 0.7814583 0.447500
V10 V11 V12 V13 V14
1 1.899322 5.528305 1.0620339 3.157797 1115.7119
2 1.630282 3.086620 1.0562817 2.785352 519.5070
3 1.153542 7.396250 0.6827083 1.683542 629.8958
[1] "Standard deviations:"
V1 V2 V3 V4 V5 V6 V7 V8
1 1 0.4621254 0.6885489 0.2271660 2.546322 10.49895 0.3389614 0.3974936
2 2 0.5379642 1.0155687 0.3154673 3.349770 16.75350 0.5453611 0.7057008
3 3 0.5302413 1.0879057 0.1846902 2.258161 10.89047 0.3569709 0.2935041
V9 V10 V11 V12 V13 V14
1 0.07004924 0.4121092 1.2385728 0.1164826 0.3570766 221.5208
2 0.12396128 0.6020678 0.9249293 0.2029368 0.4965735 157.2112
3 0.12413959 0.4088359 2.3109421 0.1144411 0.2721114 115.0970
[1] "Sample sizes:"
V1 V2 V3 V4 V5 V6 V7 V8 V9 V10 V11 V12 V13 V14
1 1 59 59 59 59 59 59 59 59 59 59 59 59 59
2 2 71 71 71 71 71 71 71 71 71 71 71 71 71
3 3 48 48 48 48 48 48 48 48 48 48 48 48 48
calcWithinGroupsVariance <- function(variable,groupvariable)
{
# find out how many values the group variable can take
groupvariable2 <- as.factor(groupvariable[[1]])
levels <- levels(groupvariable2)
numlevels <- length(levels)
# get the mean and standard deviation for each group:
numtotal <- 0
denomtotal <- 0
for (i in 1:numlevels)
{
leveli <- levels[i]
levelidata <- variable[groupvariable==leveli,]
levelilength <- length(levelidata)
# get the standard deviation for group i:
sdi <- sd(levelidata)
numi <- (levelilength - 1)*(sdi * sdi)
denomi <- levelilength
numtotal <- numtotal + numi
denomtotal <- denomtotal + denomi
}
# calculate the within-groups variance
Vw <- numtotal / (denomtotal - numlevels)
return(Vw)
}
calcWithinGroupsVariance(wine[2],wine[1])
[1] 0.2620525
calcBetweenGroupsVariance <- function(variable,groupvariable)
{
# find out how many values the group variable can take
groupvariable2 <- as.factor(groupvariable[[1]])
levels <- levels(groupvariable2)
numlevels <- length(levels)
# calculate the overall grand mean:
grandmean <- mean(wine$V2)
# get the mean and standard deviation for each group:
numtotal <- 0
denomtotal <- 0
for (i in 1:numlevels)
{
leveli <- levels[i]
levelidata <- variable[groupvariable==leveli,]
levelilength <- length(levelidata)
# get the mean and standard deviation for group i:
meani <- mean(levelidata)
sdi <- sd(levelidata)
numi <- levelilength * ((meani - grandmean)^2)
denomi <- levelilength
numtotal <- numtotal + numi
denomtotal <- denomtotal + denomi
}
# calculate the between-groups variance
Vb <- numtotal / (numlevels - 1)
Vb <- Vb[[1]]
return(Vb)
}
calcBetweenGroupsVariance(wine[2], wine[1])
[1] 35.39742
35.39742/0.2620525
[1] 135.0776
calcSeparations <- function(variables,groupvariable)
{
# find out how many variables we have
variables <- as.data.frame(variables)
numvariables <- length(variables)
# find the variable names
variablenames <- colnames(variables)
# calculate the separation for each variable
for (i in 1:numvariables)
{
variablei <- variables[i]
variablename <- variablenames[i]
Vw <- calcWithinGroupsVariance(variablei, groupvariable)
Vb <- calcBetweenGroupsVariance(variablei, groupvariable)
sep <- Vb/Vw
print(paste("variable",variablename,"Vw=",Vw,"Vb=",Vb,"separation=",sep))
}
}
calcSeparations(wine[2:14],wine[1])
[1] "variable V2 Vw= 0.262052469153907 Vb= 35.3974249602692 separation= 135.0776242428"
[1] "variable V3 Vw= 0.887546796746581 Vb= 10154.4606409588 separation= 11441.0425210043"
[1] "variable V4 Vw= 0.0660721013425184 Vb= 10065.3651082674 separation= 152339.109907954"
[1] "variable V5 Vw= 8.00681118121156 Vb= 4040.10461181253 separation= 504.583475290745"
[1] "variable V6 Vw= 180.65777316441 Vb= 671880.903309069 separation= 3719.08106438141"
[1] "variable V7 Vw= 0.191270475224227 Vb= 10218.0270550471 separation= 53421.8730991727"
[1] "variable V8 Vw= 0.274707514337437 Vb= 10777.2342568213 separation= 39231.6689363768"
[1] "variable V9 Vw= 0.0119117022132797 Vb= 14217.0422061125 separation= 1193535.73079276"
[1] "variable V10 Vw= 0.246172943795542 Vb= 11593.6224025303 separation= 47095.4371499059"
[1] "variable V11 Vw= 2.28492308133354 Vb= 5890.16197758506 separation= 2577.83818882314"
[1] "variable V12 Vw= 0.0244876469432414 Vb= 12910.854863443 separation= 527239.505427713"
[1] "variable V13 Vw= 0.160778729560982 Vb= 9636.30640975892 separation= 59935.2068278657"
[1] "variable V14 Vw= 29707.6818705169 Vb= 54112090.6081053 separation= 1821.48478780528"
calcWithinGroupsCovariance <- function(variable1,variable2,groupvariable)
{
# find out how many values the group variable can take
groupvariable2 <- as.factor(groupvariable[[1]])
levels <- levels(groupvariable2)
numlevels <- length(levels)
# get the covariance of variable 1 and variable 2 for each group:
Covw <- 0
for (i in 1:numlevels)
{
leveli <- levels[i]
levelidata1 <- variable1[groupvariable==leveli,]
levelidata2 <- variable2[groupvariable==leveli,]
mean1 <- mean(levelidata1)
mean2 <- mean(levelidata2)
levelilength <- length(levelidata1)
# get the covariance for this group:
term1 <- 0
for (j in 1:levelilength)
{
term1 <- term1 + ((levelidata1[j] - mean1)*(levelidata2[j] - mean2))
}
Cov_groupi <- term1 # covariance for this group
Covw <- Covw + Cov_groupi
}
totallength <- nrow(variable1)
Covw <- Covw / (totallength - numlevels)
return(Covw)
}
calcWithinGroupsCovariance(wine[8],wine[11],wine[1])
[1] 0.2866783
calcBetweenGroupsCovariance <- function(variable1,variable2,groupvariable)
{
# find out how many values the group variable can take
groupvariable2 <- as.factor(groupvariable[[1]])
levels <- levels(groupvariable2)
numlevels <- length(levels)
# calculate the grand means
variable1mean <- mean(wine$V8)
variable2mean <- mean(wine$V11)
# calculate the between-groups covariance
Covb <- 0
for (i in 1:numlevels)
{
leveli <- levels[i]
levelidata1 <- variable1[groupvariable==leveli,]
levelidata2 <- variable2[groupvariable==leveli,]
mean1 <- mean(levelidata1)
mean2 <- mean(levelidata2)
levelilength <- length(levelidata1)
term1 <- (mean1 - variable1mean)*(mean2 - variable2mean)*(levelilength)
Covb <- Covb + term1
}
Covb <- Covb / (numlevels - 1)
Covb <- Covb[[1]]
return(Covb)
}
calcBetweenGroupsCovariance(wine[8],wine[11],wine[1])
[1] -60.41077
cor.test(wine$V2, wine$V3)
Pearson's product-moment correlation
data: wine$V2 and wine$V3
t = 1.2579, df = 176, p-value = 0.2101
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
-0.05342959 0.23817474
sample estimates:
cor
0.09439694
mosthighlycorrelated <- function(mydataframe,numtoreport)
{
# find the correlations
cormatrix <- cor(mydataframe)
# set the correlations on the diagonal or lower triangle to zero,
# so they will not be reported as the highest ones:
diag(cormatrix) <- 0
cormatrix[lower.tri(cormatrix)] <- 0
# flatten the matrix into a dataframe for easy sorting
fm <- as.data.frame(as.table(cormatrix))
# assign human-friendly names
names(fm) <- c("First.Variable", "Second.Variable","Correlation")
# sort and print the top n correlations
head(fm[order(abs(fm$Correlation),decreasing=T),],n=numtoreport)
}
mosthighlycorrelated(wine[2:14], 10)
First.Variable Second.Variable Correlation
84 V7 V8 0.8645635
150 V8 V13 0.7871939
149 V7 V13 0.6999494
111 V8 V10 0.6526918
157 V2 V14 0.6437200
110 V7 V10 0.6124131
154 V12 V13 0.5654683
132 V3 V12 -0.5612957
118 V2 V11 0.5463642
137 V8 V12 0.5434786
standardisedconcentrations <- as.data.frame(scale(wine[2:14]))
sapply(standardisedconcentrations,mean)
V2 V3 V4 V5 V6
-8.591766e-16 -6.776446e-17 8.045176e-16 -7.720494e-17 -4.073935e-17
V7 V8 V9 V10 V11
-1.395560e-17 6.958263e-17 -1.042186e-16 -1.221369e-16 3.649376e-17
V12 V13 V14
2.093741e-16 3.003459e-16 -1.034429e-16
sapply(standardisedconcentrations,sd)
V2 V3 V4 V5 V6 V7 V8 V9 V10 V11 V12 V13 V14
1 1 1 1 1 1 1 1 1 1 1 1 1
standardisedconcentrations <- as.data.frame(scale(wine[2:14]))
wine.pca <- prcomp(standardisedconcentrations)
summary(wine.pca)
Importance of components:
PC1 PC2 PC3 PC4 PC5 PC6 PC7
Standard deviation 2.169 1.5802 1.2025 0.95863 0.92370 0.80103 0.74231
Proportion of Variance 0.362 0.1921 0.1112 0.07069 0.06563 0.04936 0.04239
Cumulative Proportion 0.362 0.5541 0.6653 0.73599 0.80162 0.85098 0.89337
PC8 PC9 PC10 PC11 PC12 PC13
Standard deviation 0.59034 0.53748 0.5009 0.47517 0.41082 0.32152
Proportion of Variance 0.02681 0.02222 0.0193 0.01737 0.01298 0.00795
Cumulative Proportion 0.92018 0.94240 0.9617 0.97907 0.99205 1.00000
wine.pca$sdev
[1] 2.1692972 1.5801816 1.2025273 0.9586313 0.9237035 0.8010350 0.7423128
[8] 0.5903367 0.5374755 0.5009017 0.4751722 0.4108165 0.3215244
sum((wine.pca$sdev)^2)
[1] 13
screeplot(wine.pca, type="lines")
(wine.pca$sdev)^2
[1] 4.7058503 2.4969737 1.4460720 0.9189739 0.8532282 0.6416570 0.5510283
[8] 0.3484974 0.2888799 0.2509025 0.2257886 0.1687702 0.1033779
wine.pca$rotation[,1]
V2 V3 V4 V5 V6 V7
-0.144329395 0.245187580 0.002051061 0.239320405 -0.141992042 -0.394660845
V8 V9 V10 V11 V12 V13
-0.422934297 0.298533103 -0.313429488 0.088616705 -0.296714564 -0.376167411
V14
-0.286752227
sum((wine.pca$rotation[,1])^2)
[1] 1
calcpc <- function(variables,loadings)
{
# find the number of samples in the data set
as.data.frame(variables)
numsamples <- nrow(variables)
# make a vector to store the component
pc <- numeric(numsamples)
# find the number of variables
numvariables <- length(variables)
# calculate the value of the component for each sample
for (i in 1:numsamples)
{
valuei <- 0
for (j in 1:numvariables)
{
valueij <- variables[i,j]
loadingj <- loadings[j]
valuei <- valuei + (valueij * loadingj)
}
pc[i] <- valuei
}
return(pc)
}
calcpc(standardisedconcentrations, wine.pca$rotation[,1])
[1] -3.30742097 -2.20324981 -2.50966069 -3.74649719 -1.00607049 -3.04167373
[7] -2.44220051 -2.05364379 -2.50381135 -2.74588238 -3.46994837 -1.74981688
[13] -2.10751729 -3.44842921 -4.30065228 -2.29870383 -2.16584568 -1.89362947
[19] -3.53202167 -2.07865856 -3.11561376 -1.08351361 -2.52809263 -1.64036108
[25] -1.75662066 -0.98729406 -1.77028387 -1.23194878 -2.18225047 -2.24976267
[31] -2.49318704 -2.66987964 -1.62399801 -1.89733870 -1.40642118 -1.89847087
[37] -1.38096669 -1.11905070 -1.49796891 -2.52268490 -2.58081526 -0.66660159
[43] -3.06216898 -0.46090897 -2.09544094 -1.13297020 -2.71893118 -2.81340300
[49] -2.00419725 -2.69987528 -3.20587409 -2.85091773 -3.49574328 -2.21853316
[55] -2.14094846 -2.46238340 -2.73380617 -2.16762631 -3.13054925 0.92596992
[61] 1.53814123 1.83108449 -0.03052074 -2.04449433 0.60796583 -0.89769555
[67] -2.24218226 -0.18286818 0.81051865 -1.97006319 1.56779366 -1.65301884
[73] 0.72333196 -2.55501977 -1.82741266 0.86555129 -0.36897357 1.45327752
[79] -1.25937829 -0.37509228 -0.75992026 -1.03166776 0.49348469 2.53183508
[85] -0.83297044 -0.78568828 0.80456258 0.55647288 1.11197430 0.55415961
[91] 1.34548982 1.56008180 1.92711944 -0.74456561 -0.95476209 -2.53670943
[97] 0.54242248 -1.02814946 -2.24557492 -1.40624916 -0.79547585 0.54798592
[103] 0.16072037 0.65793897 -0.39125074 1.76751314 0.36523707 1.61611371
[109] -0.08230361 -1.57383547 -1.41657326 0.27791878 1.29947929 0.45578615
[115] 0.49279573 -0.48071836 0.25217752 0.10692601 2.42616867 0.54953935
[121] -0.73754141 -1.33256273 1.17377592 0.46103449 -0.97572169 0.09653741
[127] -0.03837888 1.59266578 0.47821593 1.78779033 1.32336859 2.37779336
[133] 2.92867865 2.14077227 2.36320318 3.05522315 3.90473898 3.92539034
[139] 3.08557209 2.36779237 2.77099630 2.28012931 2.97723506 2.36851341
[145] 2.20364930 2.61823528 4.26859758 3.57256360 2.79916760 2.89150275
[151] 2.31420887 2.54265841 1.80744271 2.75238051 2.72945105 3.59472857
[157] 2.88169708 3.38261413 1.04523342 1.60538369 3.13428951 2.23385546
[163] 2.83966343 2.59019044 2.94100316 3.52010248 2.39934228 2.92084537
[169] 2.17527658 2.37423037 3.20258311 3.66757294 2.45862032 3.36104305
[175] 2.59463669 2.67030685 2.38030254 3.19973210
wine.pca$x[,1]
[1] -3.30742097 -2.20324981 -2.50966069 -3.74649719 -1.00607049 -3.04167373
[7] -2.44220051 -2.05364379 -2.50381135 -2.74588238 -3.46994837 -1.74981688
[13] -2.10751729 -3.44842921 -4.30065228 -2.29870383 -2.16584568 -1.89362947
[19] -3.53202167 -2.07865856 -3.11561376 -1.08351361 -2.52809263 -1.64036108
[25] -1.75662066 -0.98729406 -1.77028387 -1.23194878 -2.18225047 -2.24976267
[31] -2.49318704 -2.66987964 -1.62399801 -1.89733870 -1.40642118 -1.89847087
[37] -1.38096669 -1.11905070 -1.49796891 -2.52268490 -2.58081526 -0.66660159
[43] -3.06216898 -0.46090897 -2.09544094 -1.13297020 -2.71893118 -2.81340300
[49] -2.00419725 -2.69987528 -3.20587409 -2.85091773 -3.49574328 -2.21853316
[55] -2.14094846 -2.46238340 -2.73380617 -2.16762631 -3.13054925 0.92596992
[61] 1.53814123 1.83108449 -0.03052074 -2.04449433 0.60796583 -0.89769555
[67] -2.24218226 -0.18286818 0.81051865 -1.97006319 1.56779366 -1.65301884
[73] 0.72333196 -2.55501977 -1.82741266 0.86555129 -0.36897357 1.45327752
[79] -1.25937829 -0.37509228 -0.75992026 -1.03166776 0.49348469 2.53183508
[85] -0.83297044 -0.78568828 0.80456258 0.55647288 1.11197430 0.55415961
[91] 1.34548982 1.56008180 1.92711944 -0.74456561 -0.95476209 -2.53670943
[97] 0.54242248 -1.02814946 -2.24557492 -1.40624916 -0.79547585 0.54798592
[103] 0.16072037 0.65793897 -0.39125074 1.76751314 0.36523707 1.61611371
[109] -0.08230361 -1.57383547 -1.41657326 0.27791878 1.29947929 0.45578615
[115] 0.49279573 -0.48071836 0.25217752 0.10692601 2.42616867 0.54953935
[121] -0.73754141 -1.33256273 1.17377592 0.46103449 -0.97572169 0.09653741
[127] -0.03837888 1.59266578 0.47821593 1.78779033 1.32336859 2.37779336
[133] 2.92867865 2.14077227 2.36320318 3.05522315 3.90473898 3.92539034
[139] 3.08557209 2.36779237 2.77099630 2.28012931 2.97723506 2.36851341
[145] 2.20364930 2.61823528 4.26859758 3.57256360 2.79916760 2.89150275
[151] 2.31420887 2.54265841 1.80744271 2.75238051 2.72945105 3.59472857
[157] 2.88169708 3.38261413 1.04523342 1.60538369 3.13428951 2.23385546
[163] 2.83966343 2.59019044 2.94100316 3.52010248 2.39934228 2.92084537
[169] 2.17527658 2.37423037 3.20258311 3.66757294 2.45862032 3.36104305
[175] 2.59463669 2.67030685 2.38030254 3.19973210
wine.pca$rotation[,2]
V2 V3 V4 V5 V6 V7
0.483651548 0.224930935 0.316068814 -0.010590502 0.299634003 0.065039512
V8 V9 V10 V11 V12 V13
-0.003359812 0.028779488 0.039301722 0.529995672 -0.279235148 -0.164496193
V14
0.364902832
sum((wine.pca$rotation[,2])^2)
[1] 1
plot(wine.pca$x[,1],wine.pca$x[,2])
text(wine.pca$x[,1],wine.pca$x[,2], wine$V1, cex=0.7, pos=4, col="red")
printMeanAndSdByGroup(standardisedconcentrations,wine[1])
[1] "Means:"
V1 V2 V3 V4 V5 V6 V7
1 1 0.9166093 -0.2915199 0.3246886 -0.7359212 0.46192317 0.87090552
2 2 -0.8892116 -0.3613424 -0.4437061 0.2225094 -0.36354162 -0.05790375
3 3 0.1886265 0.8928122 0.2572190 0.5754413 -0.03004191 -0.98483874
V8 V9 V10 V11 V12 V13
1 0.95419225 -0.57735640 0.5388633 0.2028288 0.4575567 0.7691811
2 0.05163434 0.01452785 0.0688079 -0.8503999 0.4323908 0.2446043
3 -1.24923710 0.68817813 -0.7641311 1.0085728 -1.2019916 -1.3072623
V14
1 1.1711967
2 -0.7220731
3 -0.3715295
[1] "Standard deviations:"
V1 V2 V3 V4 V5 V6 V7 V8
1 1 0.5692415 0.6163463 0.8280333 0.7624716 0.7350927 0.5416007 0.3979478
2 2 0.6626591 0.9090742 1.1498967 1.0030563 1.1730101 0.8713912 0.7065071
3 3 0.6531461 0.9738258 0.6732065 0.6761844 0.7625055 0.5703767 0.2938394
V9 V10 V11 V12 V13 V14
1 0.5628555 0.7200189 0.5342623 0.5096112 0.5029315 0.7034472
2 0.9960462 1.0519061 0.3989712 0.8878480 0.6994087 0.4992299
3 0.9974790 0.7142999 0.9968323 0.5006795 0.3832607 0.3654948
[1] "Sample sizes:"
V1 V2 V3 V4 V5 V6 V7 V8 V9 V10 V11 V12 V13 V14
1 1 59 59 59 59 59 59 59 59 59 59 59 59 59
2 2 71 71 71 71 71 71 71 71 71 71 71 71 71
3 3 48 48 48 48 48 48 48 48 48 48 48 48 48
wine.lda <- lda(wine$V1 ~ wine$V2 + wine$V3 + wine$V4 + wine$V5 + wine$V6 + wine$V7 +
wine$V8 + wine$V9 + wine$V10 + wine$V11 + wine$V12 + wine$V13 +
wine$V14)
wine.lda
Call:
lda(wine$V1 ~ wine$V2 + wine$V3 + wine$V4 + wine$V5 + wine$V6 +
wine$V7 + wine$V8 + wine$V9 + wine$V10 + wine$V11 + wine$V12 +
wine$V13 + wine$V14)
Prior probabilities of groups:
1 2 3
0.3314607 0.3988764 0.2696629
Group means:
wine$V2 wine$V3 wine$V4 wine$V5 wine$V6 wine$V7 wine$V8 wine$V9
1 13.74475 2.010678 2.455593 17.03729 106.3390 2.840169 2.9823729 0.290000
2 12.27873 1.932676 2.244789 20.23803 94.5493 2.258873 2.0808451 0.363662
3 13.15375 3.333750 2.437083 21.41667 99.3125 1.678750 0.7814583 0.447500
wine$V10 wine$V11 wine$V12 wine$V13 wine$V14
1 1.899322 5.528305 1.0620339 3.157797 1115.7119
2 1.630282 3.086620 1.0562817 2.785352 519.5070
3 1.153542 7.396250 0.6827083 1.683542 629.8958
Coefficients of linear discriminants:
LD1 LD2
wine$V2 -0.403399781 0.8717930699
wine$V3 0.165254596 0.3053797325
wine$V4 -0.369075256 2.3458497486
wine$V5 0.154797889 -0.1463807654
wine$V6 -0.002163496 -0.0004627565
wine$V7 0.618052068 -0.0322128171
wine$V8 -1.661191235 -0.4919980543
wine$V9 -1.495818440 -1.6309537953
wine$V10 0.134092628 -0.3070875776
wine$V11 0.355055710 0.2532306865
wine$V12 -0.818036073 -1.5156344987
wine$V13 -1.157559376 0.0511839665
wine$V14 -0.002691206 0.0028529846
Proportion of trace:
LD1 LD2
0.6875 0.3125
wine.lda$scaling[,1]
wine$V2 wine$V3 wine$V4 wine$V5 wine$V6 wine$V7
-0.403399781 0.165254596 -0.369075256 0.154797889 -0.002163496 0.618052068
wine$V8 wine$V9 wine$V10 wine$V11 wine$V12 wine$V13
-1.661191235 -1.495818440 0.134092628 0.355055710 -0.818036073 -1.157559376
wine$V14
-0.002691206
calclda <- function(variables,loadings)
{
# find the number of samples in the data set
as.data.frame(variables)
numsamples <- nrow(variables)
# make a vector to store the discriminant function
ld <- numeric(numsamples)
# find the number of variables
numvariables <- length(variables)
# calculate the value of the discriminant function for each sample
for (i in 1:numsamples)
{
valuei <- 0
for (j in 1:numvariables)
{
valueij <- variables[i,j]
loadingj <- loadings[j]
valuei <- valuei + (valueij * loadingj)
}
ld[i] <- valuei
}
# standardise the discriminant function so that its mean value is 0:
ld <- as.data.frame(scale(ld, center=TRUE, scale=FALSE))
ld <- ld[[1]]
return(ld)
}
calclda(wine[2:14], wine.lda$scaling[,1])
[1] -4.70024401 -4.30195811 -3.42071952 -4.20575366 -1.50998168 -4.51868934
[7] -4.52737794 -4.14834781 -3.86082876 -3.36662444 -4.80587907 -3.42807646
[13] -3.66610246 -5.58824635 -5.50131449 -3.18475189 -3.28936988 -2.99809262
[19] -5.24640372 -3.13653106 -3.57747791 -1.69077135 -4.83515033 -3.09588961
[25] -3.32164716 -2.14482223 -3.98242850 -2.68591432 -3.56309464 -3.17301573
[31] -2.99626797 -3.56866244 -3.38506383 -3.52753750 -2.85190852 -2.79411996
[37] -2.75808511 -2.17734477 -3.02926382 -3.27105228 -2.92065533 -2.23721062
[43] -4.69972568 -1.23036133 -2.58203904 -2.58312049 -3.88887889 -3.44975356
[49] -2.34223331 -3.52062596 -3.21840912 -4.38214896 -4.36311727 -3.51917293
[55] -3.12277475 -1.80240540 -2.87378754 -3.61690518 -3.73868551 1.58618749
[61] 0.79967216 2.38015446 -0.45917726 -0.50726885 0.39398359 -0.92256616
[67] -1.95549377 -0.34732815 0.20371212 -0.24831914 1.17987999 -1.07718925
[73] 0.64100179 -1.74684421 -0.34721117 1.14274222 0.18665882 0.90052500
[79] -0.70709551 -0.59562833 -0.55761818 -1.80430417 0.23077079 2.03482711
[85] -0.62113021 -1.03372742 0.76598781 0.35042568 0.15324508 -0.14962842
[91] 0.48079504 1.39689016 0.91972331 -0.59102937 0.49411386 -1.62614426
[97] 2.00044562 -1.00534818 -2.07121314 -1.63815890 -1.05894340 0.02594549
[103] -0.21887407 1.36437640 -1.12901245 -0.21263094 -0.77946884 0.61546732
[109] 0.22550192 -2.03869851 0.79274716 0.30229545 -0.50664882 0.99837397
[115] -0.21954922 -0.37131517 0.05545894 -0.09137874 1.79755252 -0.17405009
[121] -1.17870281 -3.21054390 0.62605202 0.03366613 -0.69930080 -0.72061079
[127] -0.51933512 1.17030045 0.10824791 1.12319783 2.24632419 3.28527755
[133] 4.07236441 3.86691235 3.45088333 3.71583899 3.92220510 4.85161020
[139] 3.54993389 3.76889174 2.66942250 2.32491492 3.17712883 2.88964418
[145] 3.78325562 3.04411324 4.70697017 4.85021393 4.98359184 4.86968293
[151] 4.59869190 5.67447884 5.32986123 5.03401031 4.52080087 5.09783710
[157] 5.04368277 4.86980829 5.61316558 5.67046737 5.37413513 3.09975377
[163] 3.35888137 3.04007194 4.94861303 4.54504458 5.27255844 5.13016117
[169] 4.30468082 5.08336782 4.06743571 5.74212961 4.48205140 4.29150758
[175] 4.50329623 5.04747033 4.27615505 5.53808610
wine.lda.values <- predict(wine.lda, wine[2:14])
wine.lda.values$x[,1]
1 2 3 4 5 6
-4.70024401 -4.30195811 -3.42071952 -4.20575366 -1.50998168 -4.51868934
7 8 9 10 11 12
-4.52737794 -4.14834781 -3.86082876 -3.36662444 -4.80587907 -3.42807646
13 14 15 16 17 18
-3.66610246 -5.58824635 -5.50131449 -3.18475189 -3.28936988 -2.99809262
19 20 21 22 23 24
-5.24640372 -3.13653106 -3.57747791 -1.69077135 -4.83515033 -3.09588961
25 26 27 28 29 30
-3.32164716 -2.14482223 -3.98242850 -2.68591432 -3.56309464 -3.17301573
31 32 33 34 35 36
-2.99626797 -3.56866244 -3.38506383 -3.52753750 -2.85190852 -2.79411996
37 38 39 40 41 42
-2.75808511 -2.17734477 -3.02926382 -3.27105228 -2.92065533 -2.23721062
43 44 45 46 47 48
-4.69972568 -1.23036133 -2.58203904 -2.58312049 -3.88887889 -3.44975356
49 50 51 52 53 54
-2.34223331 -3.52062596 -3.21840912 -4.38214896 -4.36311727 -3.51917293
55 56 57 58 59 60
-3.12277475 -1.80240540 -2.87378754 -3.61690518 -3.73868551 1.58618749
61 62 63 64 65 66
0.79967216 2.38015446 -0.45917726 -0.50726885 0.39398359 -0.92256616
67 68 69 70 71 72
-1.95549377 -0.34732815 0.20371212 -0.24831914 1.17987999 -1.07718925
73 74 75 76 77 78
0.64100179 -1.74684421 -0.34721117 1.14274222 0.18665882 0.90052500
79 80 81 82 83 84
-0.70709551 -0.59562833 -0.55761818 -1.80430417 0.23077079 2.03482711
85 86 87 88 89 90
-0.62113021 -1.03372742 0.76598781 0.35042568 0.15324508 -0.14962842
91 92 93 94 95 96
0.48079504 1.39689016 0.91972331 -0.59102937 0.49411386 -1.62614426
97 98 99 100 101 102
2.00044562 -1.00534818 -2.07121314 -1.63815890 -1.05894340 0.02594549
103 104 105 106 107 108
-0.21887407 1.36437640 -1.12901245 -0.21263094 -0.77946884 0.61546732
109 110 111 112 113 114
0.22550192 -2.03869851 0.79274716 0.30229545 -0.50664882 0.99837397
115 116 117 118 119 120
-0.21954922 -0.37131517 0.05545894 -0.09137874 1.79755252 -0.17405009
121 122 123 124 125 126
-1.17870281 -3.21054390 0.62605202 0.03366613 -0.69930080 -0.72061079
127 128 129 130 131 132
-0.51933512 1.17030045 0.10824791 1.12319783 2.24632419 3.28527755
133 134 135 136 137 138
4.07236441 3.86691235 3.45088333 3.71583899 3.92220510 4.85161020
139 140 141 142 143 144
3.54993389 3.76889174 2.66942250 2.32491492 3.17712883 2.88964418
145 146 147 148 149 150
3.78325562 3.04411324 4.70697017 4.85021393 4.98359184 4.86968293
151 152 153 154 155 156
4.59869190 5.67447884 5.32986123 5.03401031 4.52080087 5.09783710
157 158 159 160 161 162
5.04368277 4.86980829 5.61316558 5.67046737 5.37413513 3.09975377
163 164 165 166 167 168
3.35888137 3.04007194 4.94861303 4.54504458 5.27255844 5.13016117
169 170 171 172 173 174
4.30468082 5.08336782 4.06743571 5.74212961 4.48205140 4.29150758
175 176 177 178
4.50329623 5.04747033 4.27615505 5.53808610
groupStandardise <- function(variables, groupvariable)
{
# find out how many variables we have
variables <- as.data.frame(variables)
numvariables <- length(variables)
# find the variable names
variablenames <- colnames(variables)
# calculate the group-standardised version of each variable
for (i in 1:numvariables)
{
variablei <- variables[i]
variablei_name <- variablenames[i]
variablei_Vw <- calcWithinGroupsVariance(variablei, groupvariable)
variablei_mean <- mean(wine$V2) + mean(wine$V3) + mean(wine$V4) + mean(wine$V5) + mean(wine$V6) + mean(wine$V7) + mean(wine$V8) + mean(wine$V9) + mean(wine$V10) + mean(wine$V11) + mean(wine$V12) + mean(wine$V13) + mean(wine$V14)
variablei_new <- (variablei - variablei_mean)/(sqrt(variablei_Vw))
data_length <- nrow(variablei)
if (i == 1) { variables_new <- data.frame(row.names=seq(1,data_length)) }
variables_new[`variablei_name`] <- variablei_new
}
return(variables_new)
}
groupstandardisedconcentrations <- groupStandardise(wine[2:14], wine[1])
wine.lda2 <- lda(wine$V1 ~ groupstandardisedconcentrations$V2 + groupstandardisedconcentrations$V3 +
groupstandardisedconcentrations$V4 + groupstandardisedconcentrations$V5 +
groupstandardisedconcentrations$V6 + groupstandardisedconcentrations$V7 +
groupstandardisedconcentrations$V8 + groupstandardisedconcentrations$V9 +
groupstandardisedconcentrations$V10 + groupstandardisedconcentrations$V11 +
groupstandardisedconcentrations$V12 + groupstandardisedconcentrations$V13 +
groupstandardisedconcentrations$V14)
wine.lda2
Call:
lda(wine$V1 ~ groupstandardisedconcentrations$V2 + groupstandardisedconcentrations$V3 +
groupstandardisedconcentrations$V4 + groupstandardisedconcentrations$V5 +
groupstandardisedconcentrations$V6 + groupstandardisedconcentrations$V7 +
groupstandardisedconcentrations$V8 + groupstandardisedconcentrations$V9 +
groupstandardisedconcentrations$V10 + groupstandardisedconcentrations$V11 +
groupstandardisedconcentrations$V12 + groupstandardisedconcentrations$V13 +
groupstandardisedconcentrations$V14)
Prior probabilities of groups:
1 2 3
0.3314607 0.3988764 0.2696629
Group means:
groupstandardisedconcentrations$V2 groupstandardisedconcentrations$V3
1 -1728.804 -951.8414
2 -1731.667 -951.9242
3 -1729.958 -950.4370
groupstandardisedconcentrations$V4 groupstandardisedconcentrations$V5
1 -3486.869 -311.5955
2 -3487.689 -310.4644
3 -3486.941 -310.0478
groupstandardisedconcentrations$V6 groupstandardisedconcentrations$V7
1 -58.95429 -2048.492
2 -59.83144 -2049.821
3 -59.47706 -2051.148
groupstandardisedconcentrations$V8 groupstandardisedconcentrations$V9
1 -1709.047 -8232.009
2 -1710.767 -8231.334
3 -1713.247 -8230.566
groupstandardisedconcentrations$V10 groupstandardisedconcentrations$V11
1 -1807.565 -590.9047
2 -1808.108 -592.5200
3 -1809.068 -589.6690
groupstandardisedconcentrations$V12 groupstandardisedconcentrations$V13
1 -5736.485 -2233.521
2 -5736.522 -2234.450
3 -5738.909 -2237.198
groupstandardisedconcentrations$V14
1 1.258849
2 -2.200234
3 -1.559777
Coefficients of linear discriminants:
LD1 LD2
groupstandardisedconcentrations$V2 -0.20650463 0.446280119
groupstandardisedconcentrations$V3 0.15568586 0.287697336
groupstandardisedconcentrations$V4 -0.09486893 0.602988809
groupstandardisedconcentrations$V5 0.43802089 -0.414203541
groupstandardisedconcentrations$V6 -0.02907934 -0.006219863
groupstandardisedconcentrations$V7 0.27030186 -0.014088108
groupstandardisedconcentrations$V8 -0.87067265 -0.257868714
groupstandardisedconcentrations$V9 -0.16325474 -0.178003512
groupstandardisedconcentrations$V10 0.06653116 -0.152364015
groupstandardisedconcentrations$V11 0.53670086 0.382782544
groupstandardisedconcentrations$V12 -0.12801061 -0.237174509
groupstandardisedconcentrations$V13 -0.46414916 0.020523349
groupstandardisedconcentrations$V14 -0.46385409 0.491738050
Proportion of trace:
LD1 LD2
0.6875 0.3125
wine.lda.values <- predict(wine.lda, wine[2:14])
wine.lda.values$x[,1]
1 2 3 4 5 6
-4.70024401 -4.30195811 -3.42071952 -4.20575366 -1.50998168 -4.51868934
7 8 9 10 11 12
-4.52737794 -4.14834781 -3.86082876 -3.36662444 -4.80587907 -3.42807646
13 14 15 16 17 18
-3.66610246 -5.58824635 -5.50131449 -3.18475189 -3.28936988 -2.99809262
19 20 21 22 23 24
-5.24640372 -3.13653106 -3.57747791 -1.69077135 -4.83515033 -3.09588961
25 26 27 28 29 30
-3.32164716 -2.14482223 -3.98242850 -2.68591432 -3.56309464 -3.17301573
31 32 33 34 35 36
-2.99626797 -3.56866244 -3.38506383 -3.52753750 -2.85190852 -2.79411996
37 38 39 40 41 42
-2.75808511 -2.17734477 -3.02926382 -3.27105228 -2.92065533 -2.23721062
43 44 45 46 47 48
-4.69972568 -1.23036133 -2.58203904 -2.58312049 -3.88887889 -3.44975356
49 50 51 52 53 54
-2.34223331 -3.52062596 -3.21840912 -4.38214896 -4.36311727 -3.51917293
55 56 57 58 59 60
-3.12277475 -1.80240540 -2.87378754 -3.61690518 -3.73868551 1.58618749
61 62 63 64 65 66
0.79967216 2.38015446 -0.45917726 -0.50726885 0.39398359 -0.92256616
67 68 69 70 71 72
-1.95549377 -0.34732815 0.20371212 -0.24831914 1.17987999 -1.07718925
73 74 75 76 77 78
0.64100179 -1.74684421 -0.34721117 1.14274222 0.18665882 0.90052500
79 80 81 82 83 84
-0.70709551 -0.59562833 -0.55761818 -1.80430417 0.23077079 2.03482711
85 86 87 88 89 90
-0.62113021 -1.03372742 0.76598781 0.35042568 0.15324508 -0.14962842
91 92 93 94 95 96
0.48079504 1.39689016 0.91972331 -0.59102937 0.49411386 -1.62614426
97 98 99 100 101 102
2.00044562 -1.00534818 -2.07121314 -1.63815890 -1.05894340 0.02594549
103 104 105 106 107 108
-0.21887407 1.36437640 -1.12901245 -0.21263094 -0.77946884 0.61546732
109 110 111 112 113 114
0.22550192 -2.03869851 0.79274716 0.30229545 -0.50664882 0.99837397
115 116 117 118 119 120
-0.21954922 -0.37131517 0.05545894 -0.09137874 1.79755252 -0.17405009
121 122 123 124 125 126
-1.17870281 -3.21054390 0.62605202 0.03366613 -0.69930080 -0.72061079
127 128 129 130 131 132
-0.51933512 1.17030045 0.10824791 1.12319783 2.24632419 3.28527755
133 134 135 136 137 138
4.07236441 3.86691235 3.45088333 3.71583899 3.92220510 4.85161020
139 140 141 142 143 144
3.54993389 3.76889174 2.66942250 2.32491492 3.17712883 2.88964418
145 146 147 148 149 150
3.78325562 3.04411324 4.70697017 4.85021393 4.98359184 4.86968293
151 152 153 154 155 156
4.59869190 5.67447884 5.32986123 5.03401031 4.52080087 5.09783710
157 158 159 160 161 162
5.04368277 4.86980829 5.61316558 5.67046737 5.37413513 3.09975377
163 164 165 166 167 168
3.35888137 3.04007194 4.94861303 4.54504458 5.27255844 5.13016117
169 170 171 172 173 174
4.30468082 5.08336782 4.06743571 5.74212961 4.48205140 4.29150758
175 176 177 178
4.50329623 5.04747033 4.27615505 5.53808610
wine.lda.values2 <- predict(wine.lda2, groupstandardisedconcentrations)
wine.lda.values2$x[,1]
1 2 3 4 5 6
-4.70024401 -4.30195811 -3.42071952 -4.20575366 -1.50998168 -4.51868934
7 8 9 10 11 12
-4.52737794 -4.14834781 -3.86082876 -3.36662444 -4.80587907 -3.42807646
13 14 15 16 17 18
-3.66610246 -5.58824635 -5.50131449 -3.18475189 -3.28936988 -2.99809262
19 20 21 22 23 24
-5.24640372 -3.13653106 -3.57747791 -1.69077135 -4.83515033 -3.09588961
25 26 27 28 29 30
-3.32164716 -2.14482223 -3.98242850 -2.68591432 -3.56309464 -3.17301573
31 32 33 34 35 36
-2.99626797 -3.56866244 -3.38506383 -3.52753750 -2.85190852 -2.79411996
37 38 39 40 41 42
-2.75808511 -2.17734477 -3.02926382 -3.27105228 -2.92065533 -2.23721062
43 44 45 46 47 48
-4.69972568 -1.23036133 -2.58203904 -2.58312049 -3.88887889 -3.44975356
49 50 51 52 53 54
-2.34223331 -3.52062596 -3.21840912 -4.38214896 -4.36311727 -3.51917293
55 56 57 58 59 60
-3.12277475 -1.80240540 -2.87378754 -3.61690518 -3.73868551 1.58618749
61 62 63 64 65 66
0.79967216 2.38015446 -0.45917726 -0.50726885 0.39398359 -0.92256616
67 68 69 70 71 72
-1.95549377 -0.34732815 0.20371212 -0.24831914 1.17987999 -1.07718925
73 74 75 76 77 78
0.64100179 -1.74684421 -0.34721117 1.14274222 0.18665882 0.90052500
79 80 81 82 83 84
-0.70709551 -0.59562833 -0.55761818 -1.80430417 0.23077079 2.03482711
85 86 87 88 89 90
-0.62113021 -1.03372742 0.76598781 0.35042568 0.15324508 -0.14962842
91 92 93 94 95 96
0.48079504 1.39689016 0.91972331 -0.59102937 0.49411386 -1.62614426
97 98 99 100 101 102
2.00044562 -1.00534818 -2.07121314 -1.63815890 -1.05894340 0.02594549
103 104 105 106 107 108
-0.21887407 1.36437640 -1.12901245 -0.21263094 -0.77946884 0.61546732
109 110 111 112 113 114
0.22550192 -2.03869851 0.79274716 0.30229545 -0.50664882 0.99837397
115 116 117 118 119 120
-0.21954922 -0.37131517 0.05545894 -0.09137874 1.79755252 -0.17405009
121 122 123 124 125 126
-1.17870281 -3.21054390 0.62605202 0.03366613 -0.69930080 -0.72061079
127 128 129 130 131 132
-0.51933512 1.17030045 0.10824791 1.12319783 2.24632419 3.28527755
133 134 135 136 137 138
4.07236441 3.86691235 3.45088333 3.71583899 3.92220510 4.85161020
139 140 141 142 143 144
3.54993389 3.76889174 2.66942250 2.32491492 3.17712883 2.88964418
145 146 147 148 149 150
3.78325562 3.04411324 4.70697017 4.85021393 4.98359184 4.86968293
151 152 153 154 155 156
4.59869190 5.67447884 5.32986123 5.03401031 4.52080087 5.09783710
157 158 159 160 161 162
5.04368277 4.86980829 5.61316558 5.67046737 5.37413513 3.09975377
163 164 165 166 167 168
3.35888137 3.04007194 4.94861303 4.54504458 5.27255844 5.13016117
169 170 171 172 173 174
4.30468082 5.08336782 4.06743571 5.74212961 4.48205140 4.29150758
175 176 177 178
4.50329623 5.04747033 4.27615505 5.53808610
wine.lda.values <- predict(wine.lda, standardisedconcentrations)
calcSeparations(wine.lda.values$x,wine[1])
[1] "variable LD1 Vw= 0.999999999999999 Vb= 15837.082234555 separation= 15837.082234555"
[1] "variable LD2 Vw= 1 Vb= 15403.6710754822 separation= 15403.6710754822"
wine.lda
Call:
lda(wine$V1 ~ wine$V2 + wine$V3 + wine$V4 + wine$V5 + wine$V6 +
wine$V7 + wine$V8 + wine$V9 + wine$V10 + wine$V11 + wine$V12 +
wine$V13 + wine$V14)
Prior probabilities of groups:
1 2 3
0.3314607 0.3988764 0.2696629
Group means:
wine$V2 wine$V3 wine$V4 wine$V5 wine$V6 wine$V7 wine$V8 wine$V9
1 13.74475 2.010678 2.455593 17.03729 106.3390 2.840169 2.9823729 0.290000
2 12.27873 1.932676 2.244789 20.23803 94.5493 2.258873 2.0808451 0.363662
3 13.15375 3.333750 2.437083 21.41667 99.3125 1.678750 0.7814583 0.447500
wine$V10 wine$V11 wine$V12 wine$V13 wine$V14
1 1.899322 5.528305 1.0620339 3.157797 1115.7119
2 1.630282 3.086620 1.0562817 2.785352 519.5070
3 1.153542 7.396250 0.6827083 1.683542 629.8958
Coefficients of linear discriminants:
LD1 LD2
wine$V2 -0.403399781 0.8717930699
wine$V3 0.165254596 0.3053797325
wine$V4 -0.369075256 2.3458497486
wine$V5 0.154797889 -0.1463807654
wine$V6 -0.002163496 -0.0004627565
wine$V7 0.618052068 -0.0322128171
wine$V8 -1.661191235 -0.4919980543
wine$V9 -1.495818440 -1.6309537953
wine$V10 0.134092628 -0.3070875776
wine$V11 0.355055710 0.2532306865
wine$V12 -0.818036073 -1.5156344987
wine$V13 -1.157559376 0.0511839665
wine$V14 -0.002691206 0.0028529846
Proportion of trace:
LD1 LD2
0.6875 0.3125
(wine.lda$svd)^2
[1] 794.6522 361.2410
ldahist(data = wine.lda.values$x[,1], g=wine$V1)
ldahist(data = wine.lda.values$x[,2], g=wine$V1)
plot(wine.lda.values$x[,1],wine.lda.values$x[,2])
text(wine.lda.values$x[,1],wine.lda.values$x[,2],wine$V1,cex=0.7,pos=4,col="red")
printMeanAndSdByGroup(wine.lda.values$x,wine[1])
[1] "Means:"
V1 LD1 LD2
1 1 -3.42248851 1.691674
2 2 -0.07972623 -2.472656
3 3 4.32473717 1.578120
[1] "Standard deviations:"
V1 LD1 LD2
1 1 0.9394626 1.0176394
2 2 1.0839318 0.9973165
3 3 0.9404188 0.9818673
[1] "Sample sizes:"
V1 LD1 LD2
1 1 59 59
2 2 71 71
3 3 48 48
calcAllocationRuleAccuracy <- function(ldavalue, groupvariable, cutoffpoints)
{
# find out how many values the group variable can take
groupvariable2 <- as.factor(groupvariable[[1]])
levels <- levels(groupvariable2)
numlevels <- length(levels)
# calculate the number of true positives and false negatives for each group
numlevels <- length(levels)
for (i in 1:numlevels)
{
leveli <- levels[i]
levelidata <- ldavalue[groupvariable==leveli]
# see how many of the samples from this group are classified in each group
for (j in 1:numlevels)
{
levelj <- levels[j]
if (j == 1)
{
cutoff1 <- cutoffpoints[1]
cutoff2 <- "NA"
results <- summary(levelidata <= cutoff1)
}
else if (j == numlevels)
{
cutoff1 <- cutoffpoints[(numlevels-1)]
cutoff2 <- "NA"
results <- summary(levelidata > cutoff1)
}
else
{
cutoff1 <- cutoffpoints[(j-1)]
cutoff2 <- cutoffpoints[(j)]
results <- summary(levelidata > cutoff1 & levelidata <= cutoff2)
}
trues <- results["TRUE"]
trues <- trues[[1]]
print(paste("Number of samples of group",leveli,"classified as group",levelj," : ",
trues,"(cutoffs:",cutoff1,",",cutoff2,")"))
}
}
}
calcAllocationRuleAccuracy(wine.lda.values$x[,1], wine[1], c(-1.751107, 2.122505))
[1] "Number of samples of group 1 classified as group 1 : 56 (cutoffs: -1.751107 , NA )"
[1] "Number of samples of group 1 classified as group 2 : 3 (cutoffs: -1.751107 , 2.122505 )"
[1] "Number of samples of group 1 classified as group 3 : NA (cutoffs: 2.122505 , NA )"
[1] "Number of samples of group 2 classified as group 1 : 5 (cutoffs: -1.751107 , NA )"
[1] "Number of samples of group 2 classified as group 2 : 65 (cutoffs: -1.751107 , 2.122505 )"
[1] "Number of samples of group 2 classified as group 3 : 1 (cutoffs: 2.122505 , NA )"
[1] "Number of samples of group 3 classified as group 1 : NA (cutoffs: -1.751107 , NA )"
[1] "Number of samples of group 3 classified as group 2 : NA (cutoffs: -1.751107 , 2.122505 )"
[1] "Number of samples of group 3 classified as group 3 : 48 (cutoffs: 2.122505 , NA )"
In finding the best low-dimensional representation of the variation in a multivariate data set, we used the principal component analysis (PCA). The wine data set includes thirteen chemical concentrations describing wine samples from three different cultivars. Using a principal component analysis, we discovered that the first two principal components—each of which is a specific linear combination of the 13 chemical concentrations—can effectively capture most of the variance in concentration levels between the samples. On the one hand, linear discriminant analysis (LDA) aims to identify the linear combinations of the initial variables—13 chemical concentrations—that provide the best possible separation between the groups—wine cultivars—in a given data set. The wines, in this instance, come from three (3) different cultivars, so there are three groups and thirteen variables if we want to split the wines by cultivar. So, the minimum, either 2 or 13, useful discriminant functions that may distinguish the wines by cultivar are the maximum. So, utilizing 13 chemical concentration data, we can only discover a maximum of 2 appropriate discriminant functions to differentiate the wines by cultivar.