# Read the two input tables: the ArcLake group summary (Excel) and the Dundee
# lake data (CSV). read_excel()/read_csv() are not loaded in the visible part
# of the script (presumably readxl / readr -- confirm).
ArcLakeGroupSummary <- read_excel(
  "~/Desktop/EPSRC Project /ArcLakeGroupSummary.xlsx"
)
dundeedata <- read_csv("~/Desktop/EPSRC Project /dundeedata.csv.xls")

# Give the first column of dundeedata (the GloboLID column, per the original
# note) the same key name as the ArcLake table so the join is straightforward.
names(dundeedata)[1] <- "GloboLakes_ID"

# Full outer join on the lake identifier: rows from either table are kept.
Data <- merge(
  x = ArcLakeGroupSummary,
  y = dundeedata,
  by = "GloboLakes_ID",
  all = TRUE
)

# Keep only rows that have a Group. subset() drops rows where the condition
# evaluates to NA, so missing Group values are removed along with any literal
# "NA" strings. Per the original note this returns the data to the original
# 732 rows, now with the extra columns attached.
Data <- subset(Data, Group != "NA")

# Group is the classification response, so store it as a factor.
Data[["Group"]] <- as.factor(Data[["Group"]])
# Derive the main climate class of every lake from the FIRST letter of its
# KG_Coding value and append it to Data as the column "Main.Climate".
#
# Changes from the previous row-by-row loop:
#   * the number of rows is taken from Data instead of being hard-coded to 732,
#     so the step keeps working if the input data changes;
#   * only the leading letter of the code is inspected (the old patterns such
#     as "A.*" were unanchored and matched the letter anywhere in the code);
#   * the lookup is vectorised.
# Codes that are missing, empty, or start with any other letter give NA.
#
# NOTE: "Equitorial" is deliberately left as spelled -- the label becomes part
# of the one-hot column names that the model formulas further down refer to.
kg.main.labels <- c(
  A = "Equitorial",
  B = "Arid",
  C = "Warm Temp",
  D = "Snow",
  E = "Polar"
)

# First character of each (whitespace-trimmed) code; NA stays NA.
kg.first.letter <- substr(trimws(as.character(Data$KG_Coding)), 1, 1)

# Kept as a one-column matrix named "Main.Climate", as before, so the object
# left in the workspace has the same shape as the original.
Main.Climate <- matrix(
  unname(kg.main.labels[kg.first.letter]),
  ncol = 1,
  dimnames = list(NULL, "Main.Climate")
)

Data <- cbind(Data, Main.Climate)
# Modelling table 1: response (Group), coordinates and the main climate class.
Data3 <- data.frame(
  Data[, c("Group", "Latitude", "Longitude", "Main.Climate")]
)

# Drop the rows with no main climate class (5 rows, per the original note).
# Main.Climate only ever holds one of the five class labels or NA, so testing
# for NA removes exactly the rows the former `!= "NA"` comparison removed.
Data3 <- subset(Data3, !is.na(Main.Climate))

# One-hot encode the climate class. Removing the intercept (`- 1`) gives one
# indicator column per class; data.frame() then makes the column names
# syntactic (e.g. "Main.ClimateWarm Temp" becomes "Main.ClimateWarm.Temp").
cat.variable <- Data3["Main.Climate"]
ohe.Data3 <- data.frame(
  model.matrix(~ Main.Climate - 1, data = cat.variable),
  Group = Data3$Group,
  Latitude = Data3$Latitude,
  Longitude = Data3$Longitude
)

# Show both tables for inspection. datatable() is not loaded in the visible
# part of the script (presumably DT -- confirm).
datatable(Data3)     # Original
datatable(ohe.Data3) # One Hot Encoded
# Split the one-hot encoded data into a training set (80%) and a test set
# (20%), stratified on Group. The seed is fixed so the split and the fold
# assignment below are reproducible; the random calls must stay in this order.
set.seed(234)
library(caret)
train.index <- createDataPartition(
  y = ohe.Data3$Group,
  p = 0.8,
  list = FALSE
)
train.set <- ohe.Data3[train.index, ]
test.set <- ohe.Data3[-train.index, ]

# Assign every training row to one of 5 folds, again stratified on Group.
# With list = FALSE createFolds() returns one fold number per row, which is
# stored in the helper column `fold`.
folds <- createFolds(
  y = factor(train.set$Group),
  k = 5,
  list = FALSE
)
train.set[["fold"]] <- folds
# Trial run only: 5-fold cross-validated misclassification rate of a
# linear-kernel SVM with cost = 0.22 and no feature scaling.
# svm() is not loaded in the visible part of the script (presumably e1071 --
# confirm).
#
# Each fold's error is weighted by the share of training rows in that fold,
# so the sum of the entries is the overall misclassification rate.
CV.error <- numeric(5)
for (i in seq_len(5)) {
  # Fold i is held out for validation; the other folds are used for fitting.
  valid.data <- train.set[train.set$fold == i, ]
  train.data <- train.set[train.set$fold != i, ]

  svmfit <- svm(
    Group ~ Main.ClimateArid + Main.ClimateEquitorial + Main.ClimatePolar +
      Main.ClimateSnow + Main.ClimateWarm.Temp + Latitude + Longitude,
    data = train.data,
    kernel = "linear",
    cost = 0.22,
    scale = FALSE
  )

  svm.y <- valid.data$Group
  svm.predy <- predict(svmfit, valid.data)
  ith.test.error <- mean(svm.y != svm.predy)
  CV.error[i] <- (nrow(valid.data) / nrow(train.set)) * ith.test.error
}
sum(CV.error)
# Output recorded in the original script:
## [1] 0.3020478
# Trial run only: 5-fold cross-validated misclassification rate of a
# polynomial-kernel SVM with degree = 1 and cost = 500 (scaling left at the
# svm() default). svm() is not loaded in the visible part of the script
# (presumably e1071 -- confirm).
#
# Each fold's error is weighted by the share of training rows in that fold,
# so the sum of the entries is the overall misclassification rate.
CV.error <- numeric(5)
for (i in seq_len(5)) {
  # Fold i is held out for validation; the other folds are used for fitting.
  valid.data <- train.set[train.set$fold == i, ]
  train.data <- train.set[train.set$fold != i, ]

  svmfit <- svm(
    Group ~ Main.ClimateArid + Main.ClimateEquitorial + Main.ClimatePolar +
      Main.ClimateSnow + Main.ClimateWarm.Temp + Latitude + Longitude,
    data = train.data,
    kernel = "polynomial",
    degree = 1,
    cost = 500
  )

  svm.y <- valid.data$Group
  svm.predy <- predict(svmfit, valid.data)
  ith.test.error <- mean(svm.y != svm.predy)
  CV.error[i] <- (nrow(valid.data) / nrow(train.set)) * ith.test.error
}
sum(CV.error)
# Output recorded in the original script:
## [1] 0.221843
# Trial run only: 5-fold cross-validated misclassification rate of a
# radial-kernel SVM with cost = 700 and gamma = 0.11 (scaling left at the
# svm() default). svm() is not loaded in the visible part of the script
# (presumably e1071 -- confirm).
#
# Each fold's error is weighted by the share of training rows in that fold,
# so the sum of the entries is the overall misclassification rate.
CV.error <- numeric(5)
for (i in seq_len(5)) {
  # Fold i is held out for validation; the other folds are used for fitting.
  valid.data <- train.set[train.set$fold == i, ]
  train.data <- train.set[train.set$fold != i, ]

  svmfit <- svm(
    Group ~ Main.ClimateArid + Main.ClimateEquitorial + Main.ClimatePolar +
      Main.ClimateSnow + Main.ClimateWarm.Temp + Latitude + Longitude,
    data = train.data,
    kernel = "radial",
    cost = 700,
    gamma = 0.11
  )

  svm.y <- valid.data$Group
  svm.predy <- predict(svmfit, valid.data)
  ith.test.error <- mean(svm.y != svm.predy)
  CV.error[i] <- (nrow(valid.data) / nrow(train.set)) * ith.test.error
}
sum(CV.error)
# Output recorded in the original script:
## [1] 0.1911263
# Modelling table 2: response (Group), coordinates and the full KG_Coding
# value instead of the five-level main climate class.
Data4 <- data.frame(
  Data[, c("Group", "Latitude", "Longitude", "KG_Coding")]
)

# Drop the rows with no KG_Coding (5 rows, per the original note). subset()
# discards rows where the condition evaluates to NA, so missing codes are
# removed by this comparison along with any literal "NA" strings.
Data4 <- subset(Data4, KG_Coding != "NA")

# One-hot encode KG_Coding. Removing the intercept (`- 1`) gives one indicator
# column per distinct code.
cat.variable <- Data4["KG_Coding"]
ohe.Data4 <- data.frame(
  model.matrix(~ KG_Coding - 1, data = cat.variable),
  Group = Data4$Group,
  Latitude = Data4$Latitude,
  Longitude = Data4$Longitude
)

# Show both tables for inspection. datatable() is not loaded in the visible
# part of the script (presumably DT -- confirm).
datatable(Data4)     # Original
datatable(ohe.Data4) # One Hot Encoded
# Split the KG_Coding one-hot data into a training set (80%) and a test set
# (20%), stratified on Group. The seed is fixed so the split and the fold
# assignment below are reproducible; the random calls must stay in this order.
set.seed(234)
library(caret)
train.index <- createDataPartition(
  y = ohe.Data4$Group,
  p = 0.8,
  list = FALSE
)
train.set <- ohe.Data4[train.index, ]
test.set <- ohe.Data4[-train.index, ]

# Assign every training row to one of 5 folds, again stratified on Group.
# With list = FALSE createFolds() returns one fold number per row, which is
# stored in the helper column `fold`.
folds <- createFolds(
  y = factor(train.set$Group),
  k = 5,
  list = FALSE
)
train.set[["fold"]] <- folds
# Trial run only: 5-fold cross-validated misclassification rate of a
# linear-kernel SVM with cost = 100 and no feature scaling, using every
# one-hot KG_Coding column plus Latitude and Longitude as predictors.
# svm() is not loaded in the visible part of the script (presumably e1071 --
# confirm).
#
# FIX: train.set carries the helper column `fold` (the CV fold number), and
# the former formula `Group ~ .` silently used it as a predictor. The fold
# number is bookkeeping, not a feature, so it is now removed from the formula.
#
# Each fold's error is weighted by the share of training rows in that fold,
# so the sum of the entries is the overall misclassification rate.
CV.error <- numeric(5)
for (i in seq_len(5)) {
  # Fold i is held out for validation; the other folds are used for fitting.
  valid.data <- train.set[train.set$fold == i, ]
  train.data <- train.set[train.set$fold != i, ]

  svmfit <- svm(
    Group ~ . - fold,
    data = train.data,
    kernel = "linear",
    cost = 100,
    scale = FALSE
  )

  svm.y <- valid.data$Group
  svm.predy <- predict(svmfit, valid.data)
  ith.test.error <- mean(svm.y != svm.predy)
  CV.error[i] <- (nrow(valid.data) / nrow(train.set)) * ith.test.error
}
sum(CV.error)
# Output recorded in the original script, produced while `fold` was still
# among the predictors -- re-run to refresh:
## [1] 0.2525597