1 Reading in the Data

library(readxl)  # read_excel()
library(readr)   # read_csv()
library(caret)   # createDataPartition(), createFolds(), confusionMatrix()
library(e1071)   # svm()

ArcLakeGroupSummary <- read_excel("~/Desktop/EPSRC Project /ArcLakeGroupSummary.xlsx")
dundeedata <- read_csv("~/Desktop/EPSRC Project /dundeedata.csv.xls")

colnames(dundeedata)[1] <- "GloboLakes_ID" # rename GloboLID so both data sets share the merge key

Data <- merge(ArcLakeGroupSummary, dundeedata, by = "GloboLakes_ID", all = TRUE)
Data <- subset(Data, !is.na(Group)) # back to the original 732 rows, now with extra columns of information

Data$Group <- as.factor(Data$Group)

Data1 <- data.frame(Data[, c("Group", "PC1", "PC2")])

# Stratified split of the full data set into training (80%) and test (20%) sets

set.seed(1)

train.index <- createDataPartition(Data1$Group, p = 0.8, list = FALSE)
train.set <- Data1[train.index, ]
test.set <- Data1[-train.index, ]

# Stratify the training set into 5 folds for cross-validation

set.seed(1)
folds <- createFolds(y = factor(train.set$Group), k = 5, list = FALSE)
train.set$fold <- folds
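
As a quick sanity check (not part of the original output), the class proportions can be tabulated to confirm that both the 80/20 split and the 5 folds are balanced across groups:

# Group proportions should be similar in the full data, training set and test set
round(prop.table(table(Data1$Group)), 3)
round(prop.table(table(train.set$Group)), 3)
round(prop.table(table(test.set$Group)), 3)

# Each fold should contain a similar number of lakes from every group
table(train.set$fold, train.set$Group)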

SVM Linear

  # Searching for the best SVM with linear kernel, varying the cost parameter

  costs <- seq(exp(-5), exp(20), length.out = 20)

  CV.errors <- numeric(length(costs))

  for (j in 1:length(costs)) {

    errors <- NULL

    # 5-fold cross-validation: hold out one fold, train on the remaining four
    for (i in 1:5) {
      valid.data <- subset(train.set, fold == i)
      train.data <- subset(train.set, fold != i)

      svmfit <- svm(Group ~ PC1 + PC2, data = train.data, kernel = "linear",
                    cost = costs[j], scale = FALSE)
      svm.y <- valid.data$Group
      svm.predy <- predict(svmfit, valid.data)

      # misclassification rate on the held-out fold, weighted by fold size
      ith.test.error <- mean(svm.y != svm.predy)
      errors <- c(errors, (nrow(valid.data) / nrow(train.set)) * ith.test.error)
    }

    CV.errors[j] <- sum(errors)
  }
  
  min(CV.errors)  
## [1] 0.03728814
  costs[which.min(CV.errors)]
## [1] 0.006737947
  CV.errors
##  [1] 0.03728814 0.09491525 0.09322034 0.09661017 0.09491525 0.09152542
##  [7] 0.09322034 0.09491525 0.09661017 0.09830508 0.09661017 0.09830508
## [13] 0.09661017 0.09661017 0.09661017 0.09661017 0.09661017 0.09322034
## [19] 0.09661017 0.09661017
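
As a sketch (not reported in the original analysis), the selected cost can be used to refit the linear SVM on the whole training set and to estimate its error on the held-out test set:

  # Illustrative only: refit the best linear SVM and check it on the test set
  best.cost <- costs[which.min(CV.errors)]
  svm.linear <- svm(Group ~ PC1 + PC2, data = train.set, kernel = "linear",
                    cost = best.cost, scale = FALSE)
  mean(predict(svm.linear, test.set) != test.set$Group)  # test error rate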

SVM Polynomial

    # Searching for the best SVM with polynomial kernel, varying cost and degree

    costs <- seq(exp(-5), exp(20), length.out = 20)
    degrees <- 1:5

    matrix.errors <- matrix(NA, nrow = length(costs), ncol = length(degrees))

    for (j in 1:length(costs)) {
      for (l in 1:length(degrees)) {

        CV.error <- NULL

        # 5-fold cross-validation for each (cost, degree) pair
        for (i in 1:5) {
          valid.data <- subset(train.set, fold == i)
          train.data <- subset(train.set, fold != i)

          svmfit <- svm(Group ~ PC1 + PC2, data = train.data, kernel = "polynomial",
                        cost = costs[j], degree = degrees[l], gamma = 1)
          svm.y <- valid.data$Group
          svm.predy <- predict(svmfit, valid.data)

          # fold-size-weighted misclassification rate on the held-out fold
          ith.test.error <- mean(svm.y != svm.predy)
          CV.error <- c(CV.error, (nrow(valid.data) / nrow(train.set)) * ith.test.error)
        }

        matrix.errors[j, l] <- sum(CV.error)
      }
    }
    
    min(matrix.errors)
## [1] 0.03050847
    # Flatten matrix.errors into a vector and match it against the (cost, degree) grid

    xgrid <- expand.grid(X1 = costs, X2 = degrees)
    colnames(xgrid) <- c("costs", "degrees")

    CV.Errors <- as.vector(matrix.errors)

    xgrid <- cbind(xgrid, CV.Errors)

    xgrid[which.min(CV.Errors), ]
##       costs degrees  CV.Errors
## 42 25535010       3 0.03050847
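
Note that seq(exp(-5), exp(20), length.out = 20) spaces the candidate costs linearly, so every value after the first is of the order of tens of millions. A log-spaced grid (an optional refinement, not used for the results above) would also cover small and moderate costs:

    # Optional alternative: 20 cost values evenly spaced on the log scale
    costs.log <- exp(seq(-5, 20, length.out = 20))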

SVM Radial

    # Searching for the best SVM with radial kernel, varying cost and gamma

    costs <- seq(exp(-5), exp(20), length.out = 20)

    gammas <- seq(exp(-9), exp(-0.75), length.out = 5)

    matrix.errors <- matrix(NA, nrow = length(costs), ncol = length(gammas))

    for (j in 1:length(costs)) {
      for (l in 1:length(gammas)) {

        CV.error <- NULL

        # 5-fold cross-validation for each (cost, gamma) pair
        for (i in 1:5) {
          valid.data <- subset(train.set, fold == i)
          train.data <- subset(train.set, fold != i)

          svmfit <- svm(Group ~ PC1 + PC2, data = train.data, kernel = "radial",
                        cost = costs[j], gamma = gammas[l])
          svm.y <- valid.data$Group
          svm.predy <- predict(svmfit, valid.data)

          # fold-size-weighted misclassification rate on the held-out fold
          ith.test.error <- mean(svm.y != svm.predy)
          CV.error <- c(CV.error, (nrow(valid.data) / nrow(train.set)) * ith.test.error)
        }

        matrix.errors[j, l] <- sum(CV.error)
      }
    }

    min(matrix.errors)
## [1] 0.02711864
    # Flatten matrix.errors into a vector and match it against the (cost, gamma) grid

    xgrid <- expand.grid(X1 = costs, X2 = gammas)
    colnames(xgrid) <- c("costs", "gammas")

    CV.Errors <- as.vector(matrix.errors)

    xgrid <- cbind(xgrid, CV.Errors)

    xgrid[which.min(CV.Errors), ]
##        costs       gammas  CV.Errors
## 10 229815093 0.0001234098 0.02711864
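
The radial kernel gives the lowest cross-validated error of the three kernels. As a closing sketch (not part of the original output, and assuming caret is loaded for confusionMatrix()), the winning cost/gamma pair can be refitted on the whole training set and assessed on the held-out test set:

    # Illustrative only: refit the best radial SVM and assess it on the test set
    best <- xgrid[which.min(xgrid$CV.Errors), ]
    svm.radial <- svm(Group ~ PC1 + PC2, data = train.set, kernel = "radial",
                      cost = best$costs, gamma = best$gammas)
    test.pred <- predict(svm.radial, test.set)
    mean(test.pred != test.set$Group)           # test error rate
    confusionMatrix(test.pred, test.set$Group)  # per-class breakdown (caret)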