ArcLakeGroupSummary <- read_excel("~/Desktop/EPSRC Project /ArcLakeGroupSummary.xlsx")
dundeedata <- read_csv("~/Desktop/EPSRC Project /dundeedata.csv.xls")

colnames(dundeedata)[1]<-"GloboLakes_ID" # change the GloboLID column name to GloboLakes_ID to make the merge easier.

Data<-merge(ArcLakeGroupSummary, dundeedata, by = "GloboLakes_ID", all = TRUE )
Data<-subset(Data, Group!="NA") # The data set is back to the original 732 rows just with extra columns of information

Data$Group<-as.factor(Data$Group)

Data1<-data.frame(Data[,c("Group","PC1","PC2")])

# Stratify the entire training set into training and test sets

set.seed(1)

train.index<-createDataPartition(Data1$Group, p=0.8, list = FALSE)
train.set<-Data1[train.index, ]
test.set<-Data1[-train.index, ]

nrow(train.set)
## [1] 590
nrow(test.set)
## [1] 142

1 PCs for the training set

PC.Plot<-ggplot(train.set, aes(x=PC1,y=PC2)) +
  geom_point(aes(colour=Group))+
  labs(colour = "Class", title="PC1 vs PC2 - Training set") 

PC.Plot

2 Training Error

svmfit<-svm(Group~PC1+PC2,data = train.set, kernel="radial", cost = 1538065, gamma = 0.02650774)

svm.y<-train.set$Group
svm.predy<-predict(svmfit, train.set)

svmfit
## 
## Call:
## svm(formula = Group ~ PC1 + PC2, data = train.set, kernel = "radial", 
##     cost = 1538065, gamma = 0.02650774)
## 
## 
## Parameters:
##    SVM-Type:  C-classification 
##  SVM-Kernel:  radial 
##        cost:  1538065 
##       gamma:  0.02650774 
## 
## Number of Support Vectors:  67
mean(svm.y!=svm.predy) 
## [1] 0.01186441
table(svm.y, svm.predy)
##      svm.predy
## svm.y   1   2   3   4   5   6   7   8   9
##     1  44   0   0   0   0   0   0   0   0
##     2   0  35   0   0   0   0   0   0   0
##     3   0   0  64   0   0   0   0   0   0
##     4   0   0   0  93   4   0   0   0   0
##     5   0   0   0   1 193   0   0   0   2
##     6   0   0   0   0   0  34   0   0   0
##     7   0   0   0   0   0   0  16   0   0
##     8   0   0   0   0   0   0   0  24   0
##     9   0   0   0   0   0   0   0   0  80
length(svm.y)
## [1] 590
xgrid<-expand.grid(X1=seq(min(Data$PC1), max(Data$PC1), length.out = 150), 
                   X2=seq(min(Data$PC2), max(Data$PC2), length.out =150))

colnames(xgrid)<-c("PC1", "PC2")

group.train.set.pred<-predict(svmfit, xgrid)

xgrid<-cbind(xgrid, group.train.set.pred)

r1<-ggplot(xgrid, aes(x=PC1,y=PC2)) +
  geom_point(aes(colour=group.train.set.pred), alpha = 1/5) +
  geom_point(data = train.set[-svmfit$index,], aes(x=PC1, y=PC2, colour=Group)) +
  geom_point(data = train.set[svmfit$index,], aes(x=PC1, y=PC2, colour=Group), shape=4) + 
  labs(colour = "Class", title="Decision Surface With Training Set Observations") 

r1

3 Test Error

svmfit<-svm(Group~PC1+PC2,data = train.set, kernel="radial", cost = 1538065, gamma = 0.02650774)

svm.y<-test.set$Group
svm.predy<-predict(svmfit, test.set)

svmfit
## 
## Call:
## svm(formula = Group ~ PC1 + PC2, data = train.set, kernel = "radial", 
##     cost = 1538065, gamma = 0.02650774)
## 
## 
## Parameters:
##    SVM-Type:  C-classification 
##  SVM-Kernel:  radial 
##        cost:  1538065 
##       gamma:  0.02650774 
## 
## Number of Support Vectors:  67
mean(svm.y!=svm.predy) 
## [1] 0.007042254
correct.test<-test.set[svm.y==svm.predy, ]
wrong.test<-test.set[svm.y!=svm.predy, ]

table(svm.y, svm.predy)
##      svm.predy
## svm.y  1  2  3  4  5  6  7  8  9
##     1 11  0  0  0  0  0  0  0  0
##     2  0  8  0  0  0  0  0  0  0
##     3  0  0 15  0  0  0  0  0  0
##     4  0  0  0 24  0  0  0  0  0
##     5  0  0  0  0 47  0  0  0  1
##     6  0  0  0  0  0  8  0  0  0
##     7  0  0  0  0  0  0  3  0  0
##     8  0  0  0  0  0  0  0  5  0
##     9  0  0  0  0  0  0  0  0 20
length(svm.y)
## [1] 142

4 Test Picture

r2<-ggplot(xgrid, aes(x=PC1,y=PC2)) +
  geom_point(aes(colour=group.train.set.pred), alpha = 1/5) +
  geom_point(data = test.set, aes(x=PC1, y=PC2, colour=Group)) + 
  labs(colour = "Class", title="Decision Surface With Test Set Observations") 

r2

r2.NoTitle<-ggplot(xgrid, aes(x=PC1,y=PC2)) +
  geom_point(aes(colour=group.train.set.pred), alpha = 1/5) +
  geom_point(data = correct.test, aes(x=PC1, y=PC2, colour=Group)) + 
  geom_point(data = wrong.test, aes(x=PC1, y=PC2, colour=Group), shape = 4, size = 2.5, stroke = 1.35) + 
  labs(colour = "Class") 

r2.NoTitle

This is perfect.

This may be suspected as we now have a larger training set, its gone up from 64% overall to 80% overall.

5 QDA Training Error

qdafit<-qda(Group~PC1+PC2,data = train.set)

qda.y<-train.set$Group
qda.predy<-predict(qdafit, train.set)$class

qdafit
## Call:
## qda(Group ~ PC1 + PC2, data = train.set)
## 
## Prior probabilities of groups:
##          1          2          3          4          5          6 
## 0.07457627 0.05932203 0.10847458 0.16440678 0.33220339 0.05762712 
##          7          8          9 
## 0.02711864 0.04067797 0.13559322 
## 
## Group means:
##         PC1          PC2
## 1  23.31288  26.57413042
## 2  65.88565  25.73098342
## 3 124.65875   0.02433045
## 4 -57.01693 -21.10175362
## 5 -42.06696   6.83432008
## 6  95.87322 -24.90538611
## 7 -12.89409 -63.21542433
## 8  40.05213 -56.14663773
## 9 -18.56758  22.62217730
mean(qda.y!=qda.predy) 
## [1] 0.0440678
table(qda.y, qda.predy)
##      qda.predy
## qda.y   1   2   3   4   5   6   7   8   9
##     1  44   0   0   0   0   0   0   0   0
##     2   1  34   0   0   0   0   0   0   0
##     3   0   0  64   0   0   0   0   0   0
##     4   0   0   0  91   6   0   0   0   0
##     5   0   0   0   6 186   0   0   0   4
##     6   0   0   0   0   0  34   0   0   0
##     7   0   0   0   0   0   0  15   1   0
##     8   0   0   0   0   0   0   0  24   0
##     9   2   0   0   0   6   0   0   0  72
length(qda.y)
## [1] 590
xgrid<-expand.grid(X1=seq(min(Data$PC1), max(Data$PC1), length.out = 150), 
                   X2=seq(min(Data$PC2), max(Data$PC2), length.out =150))

colnames(xgrid)<-c("PC1", "PC2")

group.train.set.pred<-predict(qdafit, xgrid)$class

xgrid<-cbind(xgrid, group.train.set.pred)

r3<-ggplot(xgrid, aes(x=PC1,y=PC2)) +
  geom_point(aes(colour=group.train.set.pred), alpha = 1/5) +
  geom_point(data = train.set, aes(x=PC1, y=PC2, colour=Group)) +
  labs(colour = "Class", title="Decision Surface With Training Set Observations") 

r3

6 QDA Test Error

qdafit<-qda(Group~PC1+PC2,data = train.set)

qda.y<-test.set$Group
qda.predy<-predict(qdafit, test.set)$class

qdafit
## Call:
## qda(Group ~ PC1 + PC2, data = train.set)
## 
## Prior probabilities of groups:
##          1          2          3          4          5          6 
## 0.07457627 0.05932203 0.10847458 0.16440678 0.33220339 0.05762712 
##          7          8          9 
## 0.02711864 0.04067797 0.13559322 
## 
## Group means:
##         PC1          PC2
## 1  23.31288  26.57413042
## 2  65.88565  25.73098342
## 3 124.65875   0.02433045
## 4 -57.01693 -21.10175362
## 5 -42.06696   6.83432008
## 6  95.87322 -24.90538611
## 7 -12.89409 -63.21542433
## 8  40.05213 -56.14663773
## 9 -18.56758  22.62217730
mean(qda.y!=qda.predy) 
## [1] 0.03521127
correct.test<-test.set[qda.y==qda.predy, ]
wrong.test<-test.set[qda.y!=qda.predy, ]

table(qda.y, qda.predy)
##      qda.predy
## qda.y  1  2  3  4  5  6  7  8  9
##     1 10  0  0  0  0  0  0  0  1
##     2  0  8  0  0  0  0  0  0  0
##     3  0  0 15  0  0  0  0  0  0
##     4  0  0  0 24  0  0  0  0  0
##     5  0  0  0  1 46  0  0  0  1
##     6  0  0  0  0  0  8  0  0  0
##     7  0  0  0  0  0  0  3  0  0
##     8  0  0  0  0  0  0  0  5  0
##     9  1  0  0  0  1  0  0  0 18
length(qda.y)
## [1] 142
r4<-ggplot(xgrid, aes(x=PC1,y=PC2)) +
  geom_point(aes(colour=group.train.set.pred), alpha = 1/5) +
  geom_point(data = test.set, aes(x=PC1, y=PC2, colour=Group)) + 
  labs(colour = "Class", title="Decision Surface With Test Set Observations") 

r4

r4.NoTitle<-ggplot(xgrid, aes(x=PC1,y=PC2)) +
  geom_point(aes(colour=group.train.set.pred), alpha = 1/5) +
  geom_point(data = correct.test, aes(x=PC1, y=PC2, colour=Group)) +
  geom_point(data = wrong.test, aes(x=PC1, y=PC2, colour=Group), shape = 4, size = 2.5, stroke = 1.35) +  
  labs(colour = "Class") 

r4.NoTitle