ArcLakeGroupSummary <- read_excel("~/Desktop/EPSRC Project /ArcLakeGroupSummary.xlsx")
dundeedata <- read_csv("~/Desktop/EPSRC Project /dundeedata.csv.xls")
colnames(dundeedata)[1]<-"GloboLakes_ID" # change the GloboLID column name to GloboLakes_ID to make the merge easier.
Data<-merge(ArcLakeGroupSummary, dundeedata, by = "GloboLakes_ID", all = TRUE )
Data<-subset(Data, Group!="NA") # The data set is back to the original 732 rows just with extra columns of information
Data$Group<-as.factor(Data$Group)
Data1<-data.frame(Data[,c("Group","PC1","PC2")])
# Stratify the entire training set into training and test sets
set.seed(1)
train.index<-createDataPartition(Data1$Group, p=0.8, list = FALSE)
train.set<-Data1[train.index, ]
test.set<-Data1[-train.index, ]
nrow(train.set)
## [1] 590
nrow(test.set)
## [1] 142
PC.Plot<-ggplot(train.set, aes(x=PC1,y=PC2)) +
geom_point(aes(colour=Group))+
labs(colour = "Class", title="PC1 vs PC2 - Training set")
PC.Plot
svmfit<-svm(Group~PC1+PC2,data = train.set, kernel="radial", cost = 1538065, gamma = 0.02650774)
svm.y<-train.set$Group
svm.predy<-predict(svmfit, train.set)
svmfit
##
## Call:
## svm(formula = Group ~ PC1 + PC2, data = train.set, kernel = "radial",
## cost = 1538065, gamma = 0.02650774)
##
##
## Parameters:
## SVM-Type: C-classification
## SVM-Kernel: radial
## cost: 1538065
## gamma: 0.02650774
##
## Number of Support Vectors: 67
mean(svm.y!=svm.predy)
## [1] 0.01186441
table(svm.y, svm.predy)
## svm.predy
## svm.y 1 2 3 4 5 6 7 8 9
## 1 44 0 0 0 0 0 0 0 0
## 2 0 35 0 0 0 0 0 0 0
## 3 0 0 64 0 0 0 0 0 0
## 4 0 0 0 93 4 0 0 0 0
## 5 0 0 0 1 193 0 0 0 2
## 6 0 0 0 0 0 34 0 0 0
## 7 0 0 0 0 0 0 16 0 0
## 8 0 0 0 0 0 0 0 24 0
## 9 0 0 0 0 0 0 0 0 80
length(svm.y)
## [1] 590
xgrid<-expand.grid(X1=seq(min(Data$PC1), max(Data$PC1), length.out = 150),
X2=seq(min(Data$PC2), max(Data$PC2), length.out =150))
colnames(xgrid)<-c("PC1", "PC2")
group.train.set.pred<-predict(svmfit, xgrid)
xgrid<-cbind(xgrid, group.train.set.pred)
r1<-ggplot(xgrid, aes(x=PC1,y=PC2)) +
geom_point(aes(colour=group.train.set.pred), alpha = 1/5) +
geom_point(data = train.set[-svmfit$index,], aes(x=PC1, y=PC2, colour=Group)) +
geom_point(data = train.set[svmfit$index,], aes(x=PC1, y=PC2, colour=Group), shape=4) +
labs(colour = "Class", title="Decision Surface With Training Set Observations")
r1
svmfit<-svm(Group~PC1+PC2,data = train.set, kernel="radial", cost = 1538065, gamma = 0.02650774)
svm.y<-test.set$Group
svm.predy<-predict(svmfit, test.set)
svmfit
##
## Call:
## svm(formula = Group ~ PC1 + PC2, data = train.set, kernel = "radial",
## cost = 1538065, gamma = 0.02650774)
##
##
## Parameters:
## SVM-Type: C-classification
## SVM-Kernel: radial
## cost: 1538065
## gamma: 0.02650774
##
## Number of Support Vectors: 67
mean(svm.y!=svm.predy)
## [1] 0.007042254
correct.test<-test.set[svm.y==svm.predy, ]
wrong.test<-test.set[svm.y!=svm.predy, ]
table(svm.y, svm.predy)
## svm.predy
## svm.y 1 2 3 4 5 6 7 8 9
## 1 11 0 0 0 0 0 0 0 0
## 2 0 8 0 0 0 0 0 0 0
## 3 0 0 15 0 0 0 0 0 0
## 4 0 0 0 24 0 0 0 0 0
## 5 0 0 0 0 47 0 0 0 1
## 6 0 0 0 0 0 8 0 0 0
## 7 0 0 0 0 0 0 3 0 0
## 8 0 0 0 0 0 0 0 5 0
## 9 0 0 0 0 0 0 0 0 20
length(svm.y)
## [1] 142
r2<-ggplot(xgrid, aes(x=PC1,y=PC2)) +
geom_point(aes(colour=group.train.set.pred), alpha = 1/5) +
geom_point(data = test.set, aes(x=PC1, y=PC2, colour=Group)) +
labs(colour = "Class", title="Decision Surface With Test Set Observations")
r2
r2.NoTitle<-ggplot(xgrid, aes(x=PC1,y=PC2)) +
geom_point(aes(colour=group.train.set.pred), alpha = 1/5) +
geom_point(data = correct.test, aes(x=PC1, y=PC2, colour=Group)) +
geom_point(data = wrong.test, aes(x=PC1, y=PC2, colour=Group), shape = 4, size = 2.5, stroke = 1.35) +
labs(colour = "Class")
r2.NoTitle
This is perfect.
This may be suspected as we now have a larger training set, its gone up from 64% overall to 80% overall.
qdafit<-qda(Group~PC1+PC2,data = train.set)
qda.y<-train.set$Group
qda.predy<-predict(qdafit, train.set)$class
qdafit
## Call:
## qda(Group ~ PC1 + PC2, data = train.set)
##
## Prior probabilities of groups:
## 1 2 3 4 5 6
## 0.07457627 0.05932203 0.10847458 0.16440678 0.33220339 0.05762712
## 7 8 9
## 0.02711864 0.04067797 0.13559322
##
## Group means:
## PC1 PC2
## 1 23.31288 26.57413042
## 2 65.88565 25.73098342
## 3 124.65875 0.02433045
## 4 -57.01693 -21.10175362
## 5 -42.06696 6.83432008
## 6 95.87322 -24.90538611
## 7 -12.89409 -63.21542433
## 8 40.05213 -56.14663773
## 9 -18.56758 22.62217730
mean(qda.y!=qda.predy)
## [1] 0.0440678
table(qda.y, qda.predy)
## qda.predy
## qda.y 1 2 3 4 5 6 7 8 9
## 1 44 0 0 0 0 0 0 0 0
## 2 1 34 0 0 0 0 0 0 0
## 3 0 0 64 0 0 0 0 0 0
## 4 0 0 0 91 6 0 0 0 0
## 5 0 0 0 6 186 0 0 0 4
## 6 0 0 0 0 0 34 0 0 0
## 7 0 0 0 0 0 0 15 1 0
## 8 0 0 0 0 0 0 0 24 0
## 9 2 0 0 0 6 0 0 0 72
length(qda.y)
## [1] 590
xgrid<-expand.grid(X1=seq(min(Data$PC1), max(Data$PC1), length.out = 150),
X2=seq(min(Data$PC2), max(Data$PC2), length.out =150))
colnames(xgrid)<-c("PC1", "PC2")
group.train.set.pred<-predict(qdafit, xgrid)$class
xgrid<-cbind(xgrid, group.train.set.pred)
r3<-ggplot(xgrid, aes(x=PC1,y=PC2)) +
geom_point(aes(colour=group.train.set.pred), alpha = 1/5) +
geom_point(data = train.set, aes(x=PC1, y=PC2, colour=Group)) +
labs(colour = "Class", title="Decision Surface With Training Set Observations")
r3
qdafit<-qda(Group~PC1+PC2,data = train.set)
qda.y<-test.set$Group
qda.predy<-predict(qdafit, test.set)$class
qdafit
## Call:
## qda(Group ~ PC1 + PC2, data = train.set)
##
## Prior probabilities of groups:
## 1 2 3 4 5 6
## 0.07457627 0.05932203 0.10847458 0.16440678 0.33220339 0.05762712
## 7 8 9
## 0.02711864 0.04067797 0.13559322
##
## Group means:
## PC1 PC2
## 1 23.31288 26.57413042
## 2 65.88565 25.73098342
## 3 124.65875 0.02433045
## 4 -57.01693 -21.10175362
## 5 -42.06696 6.83432008
## 6 95.87322 -24.90538611
## 7 -12.89409 -63.21542433
## 8 40.05213 -56.14663773
## 9 -18.56758 22.62217730
mean(qda.y!=qda.predy)
## [1] 0.03521127
correct.test<-test.set[qda.y==qda.predy, ]
wrong.test<-test.set[qda.y!=qda.predy, ]
table(qda.y, qda.predy)
## qda.predy
## qda.y 1 2 3 4 5 6 7 8 9
## 1 10 0 0 0 0 0 0 0 1
## 2 0 8 0 0 0 0 0 0 0
## 3 0 0 15 0 0 0 0 0 0
## 4 0 0 0 24 0 0 0 0 0
## 5 0 0 0 1 46 0 0 0 1
## 6 0 0 0 0 0 8 0 0 0
## 7 0 0 0 0 0 0 3 0 0
## 8 0 0 0 0 0 0 0 5 0
## 9 1 0 0 0 1 0 0 0 18
length(qda.y)
## [1] 142
r4<-ggplot(xgrid, aes(x=PC1,y=PC2)) +
geom_point(aes(colour=group.train.set.pred), alpha = 1/5) +
geom_point(data = test.set, aes(x=PC1, y=PC2, colour=Group)) +
labs(colour = "Class", title="Decision Surface With Test Set Observations")
r4
r4.NoTitle<-ggplot(xgrid, aes(x=PC1,y=PC2)) +
geom_point(aes(colour=group.train.set.pred), alpha = 1/5) +
geom_point(data = correct.test, aes(x=PC1, y=PC2, colour=Group)) +
geom_point(data = wrong.test, aes(x=PC1, y=PC2, colour=Group), shape = 4, size = 2.5, stroke = 1.35) +
labs(colour = "Class")
r4.NoTitle