# Packages this script calls: readxl (read_excel), readr (read_csv),
# caret (createDataPartition, createFolds) and e1071 (svm).
library(readxl)
library(readr)
library(caret)
library(e1071)

# Read the two input tables. Note the second file carries a ".csv.xls"
# extension but is read with read_csv().
ArcLakeGroupSummary <- read_excel("~/Desktop/EPSRC Project /ArcLakeGroupSummary.xlsx")
dundeedata <- read_csv("~/Desktop/EPSRC Project /dundeedata.csv.xls")

# Rename the first column of dundeedata to GloboLakes_ID so that both tables
# share the key used in the merge below.
colnames(dundeedata)[1] <- "GloboLakes_ID"

# Full outer join on GloboLakes_ID.
Data <- merge(ArcLakeGroupSummary, dundeedata, by = "GloboLakes_ID", all = TRUE)

# Drop rows without a group label: both genuinely missing values and any
# literal "NA" strings. Per the original author's note, this brings the data
# set back to the original 732 rows, just with extra columns of information.
Data <- subset(Data, !is.na(Group) & Group != "NA")
Data$Group <- as.factor(Data$Group)

# Modelling table: the class label plus the two predictors PC1 and PC2.
Data1 <- data.frame(Data[, c("Group", "PC1", "PC2")])
# Stratify the entire data set into training (80%) and test sets, sampling
# within the levels of Group.
set.seed(1)
train.index <- createDataPartition(Data1$Group, p = 0.8, list = FALSE)
train.set <- Data1[train.index, ]
test.set <- Data1[-train.index, ]

# Stratify the training set into 5 folds. With list = FALSE, createFolds()
# returns one integer fold id per row, stored alongside the data so each
# cross-validation loop can split on it.
set.seed(1)
folds <- createFolds(y = factor(train.set$Group), k = 5, list = FALSE)
train.set$fold <- folds

# SVM Linear ----
# Searching for the best SVM with linear kernel changing cost.
# For each candidate cost, fit on all folds but one, predict the held-out
# fold, and accumulate the misclassification rate weighted by the share of
# training rows in that fold.
# NOTE(review): seq() spaces this grid linearly between exp(-5) and exp(20),
# so every value after the first is larger than 2.5e7. A log-spaced grid would
# be exp(seq(-5, 20, length.out = 20)) -- confirm which was intended before
# changing it, since the recorded outputs below depend on this grid.
costs <- seq(exp(-5), exp(20), length.out = 20)

# Number of folds is taken from the fold ids stored in train.set rather than
# repeated as a literal.
n.folds <- length(unique(train.set$fold))

CV.errors <- numeric(length(costs))
for (j in seq_along(costs)) {
  errors <- numeric(n.folds)
  for (i in seq_len(n.folds)) {
    valid.data <- subset(train.set, fold == i)
    train.data <- subset(train.set, fold != i)
    # NOTE(review): scale = FALSE is passed only for the linear kernel; the
    # polynomial and radial fits in this file leave svm()'s default scaling
    # on -- confirm this difference is deliberate.
    svmfit <- svm(Group ~ PC1 + PC2, data = train.data, kernel = "linear",
                  cost = costs[j], scale = FALSE)
    svm.y <- valid.data$Group
    svm.predy <- predict(svmfit, valid.data)
    ith.test.error <- mean(svm.y != svm.predy)
    errors[i] <- (nrow(valid.data) / nrow(train.set)) * ith.test.error
  }
  CV.errors[j] <- sum(errors)
}

# Smallest cross-validated error and the cost that achieves it.
min(CV.errors)
## [1] 0.03728814
costs[which.min(CV.errors)]
## [1] 0.006737947
CV.errors
## [1] 0.03728814 0.09491525 0.09322034 0.09661017 0.09491525 0.09152542
## [7] 0.09322034 0.09491525 0.09661017 0.09830508 0.09661017 0.09830508
## [13] 0.09661017 0.09661017 0.09661017 0.09661017 0.09661017 0.09322034
## [19] 0.09661017 0.09661017
# SVM Polynomial ----
# Searching for the best SVM with polynomial kernel changing cost and degree.
# matrix.errors[j, l] holds the fold-size-weighted cross-validated
# misclassification rate for costs[j] and degrees[l]; gamma is fixed at 1.
# NOTE(review): seq() spaces this grid linearly between exp(-5) and exp(20),
# so every value after the first is larger than 2.5e7. A log-spaced grid would
# be exp(seq(-5, 20, length.out = 20)) -- confirm which was intended before
# changing it, since the recorded outputs below depend on this grid.
costs <- seq(exp(-5), exp(20), length.out = 20)
degrees <- 1:5

# Number of folds is taken from the fold ids stored in train.set rather than
# repeated as a literal.
n.folds <- length(unique(train.set$fold))

matrix.errors <- matrix(NA, nrow = length(costs), ncol = length(degrees))
for (j in seq_along(costs)) {
  for (l in seq_along(degrees)) {
    CV.error <- numeric(n.folds)
    for (i in seq_len(n.folds)) {
      valid.data <- subset(train.set, fold == i)
      train.data <- subset(train.set, fold != i)
      svmfit <- svm(Group ~ PC1 + PC2, data = train.data,
                    kernel = "polynomial", cost = costs[j],
                    degree = degrees[l], gamma = 1)
      svm.y <- valid.data$Group
      svm.predy <- predict(svmfit, valid.data)
      ith.test.error <- mean(svm.y != svm.predy)
      CV.error[i] <- (nrow(valid.data) / nrow(train.set)) * ith.test.error
    }
    matrix.errors[j, l] <- sum(CV.error)
  }
}
min(matrix.errors)
## [1] 0.03050847

# Turn matrix.errors into a column vector and pair it with the parameter grid.
# expand.grid() varies its first argument fastest, which matches the
# column-major order as.vector() uses for a costs-by-degrees matrix.
xgrid <- expand.grid(X1 = costs, X2 = degrees)
colnames(xgrid) <- c("costs", "degrees")
CV.Errors <- as.vector(matrix.errors)
xgrid <- cbind(xgrid, CV.Errors)
xgrid[which.min(CV.Errors), ]
## costs degrees CV.Errors
## 42 25535010 3 0.03050847
# SVM Radial ----
# Searching for the best SVM with radial kernel changing cost and gamma.
# matrix.errors[j, l] holds the fold-size-weighted cross-validated
# misclassification rate for costs[j] and gammas[l].
# NOTE(review): both grids are spaced linearly by seq() between their
# exponential end points (for costs, every value after the first is larger
# than 2.5e7). Log-spaced grids would be exp(seq(-5, 20, length.out = 20)) and
# exp(seq(-9, -0.75, length.out = 5)) -- confirm which was intended before
# changing them, since the recorded outputs below depend on these grids.
costs <- seq(exp(-5), exp(20), length.out = 20)
gammas <- seq(exp(-9), exp(-0.75), length.out = 5)

# Number of folds is taken from the fold ids stored in train.set rather than
# repeated as a literal.
n.folds <- length(unique(train.set$fold))

matrix.errors <- matrix(NA, nrow = length(costs), ncol = length(gammas))
for (j in seq_along(costs)) {
  for (l in seq_along(gammas)) {
    CV.error <- numeric(n.folds)
    for (i in seq_len(n.folds)) {
      valid.data <- subset(train.set, fold == i)
      train.data <- subset(train.set, fold != i)
      svmfit <- svm(Group ~ PC1 + PC2, data = train.data, kernel = "radial",
                    cost = costs[j], gamma = gammas[l])
      svm.y <- valid.data$Group
      svm.predy <- predict(svmfit, valid.data)
      ith.test.error <- mean(svm.y != svm.predy)
      CV.error[i] <- (nrow(valid.data) / nrow(train.set)) * ith.test.error
    }
    matrix.errors[j, l] <- sum(CV.error)
  }
}
min(matrix.errors)
## [1] 0.02711864

# Turn matrix.errors into a column vector and pair it with the parameter grid.
# expand.grid() varies its first argument fastest, which matches the
# column-major order as.vector() uses for a costs-by-gammas matrix.
xgrid <- expand.grid(X1 = costs, X2 = gammas)
colnames(xgrid) <- c("costs", "gammas")
CV.Errors <- as.vector(matrix.errors)
xgrid <- cbind(xgrid, CV.Errors)
xgrid[which.min(CV.Errors), ]
## costs gammas CV.Errors
## 10 229815093 0.0001234098 0.02711864