1 Reading in the Data

# Load the two source tables: the ArcLake group summary (Excel workbook) and
# the Dundee lake data (read as CSV despite the .xls suffix in the file name).
ArcLakeGroupSummary <- read_excel(
  path = "~/Desktop/EPSRC Project /ArcLakeGroupSummary.xlsx"
)
dundeedata <- read_csv(file = "~/Desktop/EPSRC Project /dundeedata.csv.xls")

# Rename the first column of dundeedata (the GloboLID column) to
# GloboLakes_ID so that both tables share the same key name for the merge.
names(dundeedata)[1] <- "GloboLakes_ID"

# Full outer join on the lake ID: unmatched rows from either table are kept.
Data <- merge(
  x = ArcLakeGroupSummary,
  y = dundeedata,
  by = "GloboLakes_ID",
  all = TRUE
)

# Keep only rows that carry a Group value (neither a real NA nor the literal
# text "NA"). This is expected to bring the data back to the original 732 rows,
# now with the extra columns of information from dundeedata.
Data <- Data[!is.na(Data$Group) & Data$Group != "NA", , drop = FALSE]

# Group is the class label for the classifiers below, so store it as a factor.
Data$Group <- as.factor(Data$Group)

2 For Longitude + Latitude + Main Climate

  # Derive the main climate class from the Koppen-Geiger code in KG_Coding.
  # The class is identified by the leading letter of the code (A-E). Codes that
  # are missing or start with any other character are left as NA.
  #
  # NOTE: the label "Equitorial" is misspelled, but it is kept as-is because the
  # one-hot encoded column name derived from it (Main.ClimateEquitorial) is
  # referenced by the model formulas further down.
  climate.labels <- c(
    A = "Equitorial",
    B = "Arid",
    C = "Warm Temp",
    D = "Snow",
    E = "Polar"
  )

  # Vectorised lookup on the first character; as.character() guards against
  # KG_Coding being stored as a factor.
  kg.first.letter <- substr(as.character(Data$KG_Coding), 1, 1)

  # One-column character matrix sized from Data itself (not a hard-coded 732),
  # so the step keeps working if the number of rows in Data changes.
  Main.Climate <- matrix(
    data = unname(climate.labels[kg.first.letter]),
    ncol = 1,
    dimnames = list(NULL, "Main.Climate")
  )

  # Append the derived class to Data as the column Main.Climate.
  Data <- cbind(Data, Main.Climate)

2.1 Performing the One Hot Encoding

  # Restrict to the response (Group) and the predictors used in this section.
  Data3 <- data.frame(
    Data[, c("Group", "Latitude", "Longitude", "Main.Climate")]
  )

  # Drop the rows whose Main.Climate is missing (expected to be 5 rows).
  # Main.Climate only ever holds one of the five class labels or NA, so
  # testing with is.na() selects exactly the rows the "NA" comparison kept.
  Data3 <- Data3[!is.na(Data3$Main.Climate), , drop = FALSE]

  # One hot encoding: isolate the categorical column, then expand it into one
  # indicator column per climate class (the "- 1" removes the intercept so
  # that every class gets its own column).
  cat.variable <- Data3[, "Main.Climate", drop = FALSE]

  ohe.Data3 <- data.frame(
    model.matrix(~ Main.Climate - 1, data = cat.variable),
    Group = Data3$Group,
    Latitude = Data3$Latitude,
    Longitude = Data3$Longitude
  )
  datatable(Data3) # Original
  datatable(ohe.Data3) # One Hot Encoded

  # Split the full data into training (80%) and test (20%) sets,
  # stratified by Group. The seed makes the split reproducible.
  set.seed(234)

  library(caret)
  train.index <- createDataPartition(y = ohe.Data3$Group, p = 0.8, list = FALSE)
  train.set <- ohe.Data3[train.index, ]
  test.set <- ohe.Data3[-train.index, ]

  # Assign every training row to one of 5 folds (stratified by Group) and
  # store the fold id alongside the data for the cross-validation loops.
  folds <- createFolds(y = factor(train.set$Group), k = 5, list = FALSE)
  train.set$fold <- folds

2.2 SVM Linear Kernel

  # 5-fold cross-validation of a linear-kernel SVM with cost = 0.22 and no
  # feature scaling - just to test.

  # One slot per fold for that fold's weighted misclassification rate.
  CV.error <- numeric(5)

  for (i in 1:5) {
    # Fold i is held out for validation; the other four folds are used to fit.
    valid.data <- train.set[train.set$fold == i, , drop = FALSE]
    train.data <- train.set[train.set$fold != i, , drop = FALSE]

    svmfit <- svm(
      Group ~ Main.ClimateArid + Main.ClimateEquitorial + Main.ClimatePolar +
        Main.ClimateSnow + Main.ClimateWarm.Temp + Latitude + Longitude,
      data = train.data,
      kernel = "linear",
      cost = 0.22,
      scale = FALSE
    )
    svm.y <- valid.data$Group
    svm.predy <- predict(svmfit, newdata = valid.data)

    # Misclassification rate on the held-out fold, weighted by the fold's
    # share of the training set so that the sum is the overall CV error.
    ith.test.error <- mean(svm.y != svm.predy)
    CV.error[i] <- (nrow(valid.data) / nrow(train.set)) * ith.test.error
  }

  sum(CV.error)
## [1] 0.3020478

2.3 SVM Polynomial Kernel

  # 5-fold cross-validation of a polynomial-kernel SVM with degree = 1 and
  # cost = 500 - just to test.

  # One slot per fold for that fold's weighted misclassification rate.
  CV.error <- numeric(5)

  for (i in 1:5) {
    # Fold i is held out for validation; the other four folds are used to fit.
    valid.data <- train.set[train.set$fold == i, , drop = FALSE]
    train.data <- train.set[train.set$fold != i, , drop = FALSE]

    svmfit <- svm(
      Group ~ Main.ClimateArid + Main.ClimateEquitorial + Main.ClimatePolar +
        Main.ClimateSnow + Main.ClimateWarm.Temp + Latitude + Longitude,
      data = train.data,
      kernel = "polynomial",
      degree = 1,
      cost = 500
    )
    svm.y <- valid.data$Group
    svm.predy <- predict(svmfit, newdata = valid.data)

    # Misclassification rate on the held-out fold, weighted by the fold's
    # share of the training set so that the sum is the overall CV error.
    ith.test.error <- mean(svm.y != svm.predy)
    CV.error[i] <- (nrow(valid.data) / nrow(train.set)) * ith.test.error
  }

  sum(CV.error)
## [1] 0.221843

2.4 SVM Radial Kernel

  # 5-fold cross-validation of a radial-kernel SVM with cost = 700 and
  # gamma = 0.11 - just to test.

  # One slot per fold for that fold's weighted misclassification rate.
  CV.error <- numeric(5)

  for (i in 1:5) {
    # Fold i is held out for validation; the other four folds are used to fit.
    valid.data <- train.set[train.set$fold == i, , drop = FALSE]
    train.data <- train.set[train.set$fold != i, , drop = FALSE]

    svmfit <- svm(
      Group ~ Main.ClimateArid + Main.ClimateEquitorial + Main.ClimatePolar +
        Main.ClimateSnow + Main.ClimateWarm.Temp + Latitude + Longitude,
      data = train.data,
      kernel = "radial",
      cost = 700,
      gamma = 0.11
    )
    svm.y <- valid.data$Group
    svm.predy <- predict(svmfit, newdata = valid.data)

    # Misclassification rate on the held-out fold, weighted by the fold's
    # share of the training set so that the sum is the overall CV error.
    ith.test.error <- mean(svm.y != svm.predy)
    CV.error[i] <- (nrow(valid.data) / nrow(train.set)) * ith.test.error
  }

  sum(CV.error)
## [1] 0.1911263

3 For Longitude + Latitude + KG_Coding

3.1 Performing the One Hot Encoding

  # Restrict to the response (Group) and the predictors used in this section.
  Data4 <- data.frame(
    Data[, c("Group", "Latitude", "Longitude", "KG_Coding")]
  )

  # Drop the rows whose KG_Coding is missing (expected to be 5 rows): neither
  # a real NA nor the literal text "NA" is kept.
  Data4 <- Data4[
    !is.na(Data4$KG_Coding) & Data4$KG_Coding != "NA", ,
    drop = FALSE
  ]

  # One hot encoding: isolate the categorical column, then expand it into one
  # indicator column per KG code (the "- 1" removes the intercept so that
  # every code gets its own column).
  cat.variable <- Data4[, "KG_Coding", drop = FALSE]

  ohe.Data4 <- data.frame(
    model.matrix(~ KG_Coding - 1, data = cat.variable),
    Group = Data4$Group,
    Latitude = Data4$Latitude,
    Longitude = Data4$Longitude
  )
  datatable(Data4) # Original
  datatable(ohe.Data4) # One Hot Encoded

  # Split the full data into training (80%) and test (20%) sets,
  # stratified by Group. The seed makes the split reproducible.
  set.seed(234)

  library(caret)
  train.index <- createDataPartition(y = ohe.Data4$Group, p = 0.8, list = FALSE)
  train.set <- ohe.Data4[train.index, ]
  test.set <- ohe.Data4[-train.index, ]

  # Assign every training row to one of 5 folds (stratified by Group) and
  # store the fold id alongside the data for the cross-validation loops.
  folds <- createFolds(y = factor(train.set$Group), k = 5, list = FALSE)
  train.set$fold <- folds

3.2 SVM Linear Kernel

  # 5-fold cross-validation of a linear-kernel SVM with cost = 100 and no
  # feature scaling - just to test.
  #
  # The model is fitted with `Group ~ .`, i.e. on every column of the data it
  # is given. train.set also carries the bookkeeping column `fold`, which is
  # an arbitrary fold id and not a feature, so it is dropped before fitting
  # and predicting. Otherwise the fold id is silently used as a predictor.
  # Error estimates from earlier runs that included `fold` will therefore
  # differ from the value this code produces.
  cv.columns <- setdiff(names(train.set), "fold")

  # One slot per fold for that fold's weighted misclassification rate.
  CV.error <- numeric(5)

  for (i in 1:5) {
    # Fold i is held out for validation; the other four folds are used to fit.
    valid.data <- train.set[train.set$fold == i, cv.columns, drop = FALSE]
    train.data <- train.set[train.set$fold != i, cv.columns, drop = FALSE]

    svmfit <- svm(
      Group ~ .,
      data = train.data,
      kernel = "linear",
      cost = 100,
      scale = FALSE
    )
    svm.y <- valid.data$Group
    svm.predy <- predict(svmfit, newdata = valid.data)

    # Misclassification rate on the held-out fold, weighted by the fold's
    # share of the training set so that the sum is the overall CV error.
    ith.test.error <- mean(svm.y != svm.predy)
    CV.error[i] <- (nrow(valid.data) / nrow(train.set)) * ith.test.error
  }

  sum(CV.error)
## [1] 0.2525597