# Data Preprocessing
rm(list=ls())


# Importing the dataset
dataset =read.csv('C:/Users/asus/Desktop/Forest Cover Type/train.csv', head=T, stringsAsFactors=F, na.strings='')
#removing id,Soil_Type_7 and Soil_Type_15
dataset=dataset[,c(-1,-22,-30)]

#view(dataset)
length(boxplot(dataset$Slope)$out) #understand more
## [1] 57
#check and delete outliers
i = 1
dataset[,1:53] = lapply(dataset[,1:53],as.numeric)
vizu = round(cor(dataset),1) #why does it give a warnng message? --> In cor(dataset) : the standard deviation is zero
library(ggcorrplot)
## Warning: package 'ggcorrplot' was built under R version 3.6.3
## Loading required package: ggplot2

#checking for correlation
ggcorrplot(vizu)

#function for finding outliers
while(i <= ncol(dataset)){
  
  
  thirdQuantile = quantile(dataset[,i],0.75)
  upperOutlier = (IQR(dataset[,i]) * 3) + thirdQuantile
  #print(upperOutlier)
  x = length(which(dataset[,i]>upperOutlier))
  
  
  
  firstQuantile = quantile(dataset[,i],0.25)
  lowerOutlier =  firstQuantile -(IQR(dataset[,i])* 3)
  #print(lowerOutlier)
  y = length(which(dataset[,i] < lowerOutlier))
  
  print(paste(x+y," is the number of outliers in ", colnames(dataset)[i]))
  
  i = i+1
  
}
## [1] "0  is the number of outliers in  Elevation"
## [1] "0  is the number of outliers in  Aspect"
## [1] "0  is the number of outliers in  Slope"
## [1] "53  is the number of outliers in  Horizontal_Distance_To_Hydrology"
## [1] "49  is the number of outliers in  Vertical_Distance_To_Hydrology"
## [1] "3  is the number of outliers in  Horizontal_Distance_To_Roadways"
## [1] "7  is the number of outliers in  Hillshade_9am"
## [1] "20  is the number of outliers in  Hillshade_Noon"
## [1] "0  is the number of outliers in  Hillshade_3pm"
## [1] "132  is the number of outliers in  Horizontal_Distance_To_Fire_Points"
## [1] "3597  is the number of outliers in  Wilderness_Area1"
## [1] "499  is the number of outliers in  Wilderness_Area2"
## [1] "0  is the number of outliers in  Wilderness_Area3"
## [1] "0  is the number of outliers in  Wilderness_Area4"
## [1] "355  is the number of outliers in  Soil_Type1"
## [1] "623  is the number of outliers in  Soil_Type2"
## [1] "962  is the number of outliers in  Soil_Type3"
## [1] "843  is the number of outliers in  Soil_Type4"
## [1] "165  is the number of outliers in  Soil_Type5"
## [1] "650  is the number of outliers in  Soil_Type6"
## [1] "1  is the number of outliers in  Soil_Type8"
## [1] "10  is the number of outliers in  Soil_Type9"
## [1] "2142  is the number of outliers in  Soil_Type10"
## [1] "406  is the number of outliers in  Soil_Type11"
## [1] "227  is the number of outliers in  Soil_Type12"
## [1] "476  is the number of outliers in  Soil_Type13"
## [1] "169  is the number of outliers in  Soil_Type14"
## [1] "114  is the number of outliers in  Soil_Type16"
## [1] "612  is the number of outliers in  Soil_Type17"
## [1] "60  is the number of outliers in  Soil_Type18"
## [1] "46  is the number of outliers in  Soil_Type19"
## [1] "139  is the number of outliers in  Soil_Type20"
## [1] "16  is the number of outliers in  Soil_Type21"
## [1] "345  is the number of outliers in  Soil_Type22"
## [1] "757  is the number of outliers in  Soil_Type23"
## [1] "257  is the number of outliers in  Soil_Type24"
## [1] "1  is the number of outliers in  Soil_Type25"
## [1] "54  is the number of outliers in  Soil_Type26"
## [1] "15  is the number of outliers in  Soil_Type27"
## [1] "9  is the number of outliers in  Soil_Type28"
## [1] "1291  is the number of outliers in  Soil_Type29"
## [1] "725  is the number of outliers in  Soil_Type30"
## [1] "332  is the number of outliers in  Soil_Type31"
## [1] "690  is the number of outliers in  Soil_Type32"
## [1] "616  is the number of outliers in  Soil_Type33"
## [1] "22  is the number of outliers in  Soil_Type34"
## [1] "102  is the number of outliers in  Soil_Type35"
## [1] "10  is the number of outliers in  Soil_Type36"
## [1] "34  is the number of outliers in  Soil_Type37"
## [1] "728  is the number of outliers in  Soil_Type38"
## [1] "657  is the number of outliers in  Soil_Type39"
## [1] "459  is the number of outliers in  Soil_Type40"
## [1] "0  is the number of outliers in  Cover_Type"
thirdQuantile = quantile(dataset$Horizontal_Distance_To_Hydrology,0.75)
upperOutlier = (IQR(dataset$Horizontal_Distance_To_Hydrology) * 3) + thirdQuantile
firstQuantile = quantile(dataset$Horizontal_Distance_To_Hydrology,0.25)
lowerOutlier =  firstQuantile -(IQR(dataset$Horizontal_Distance_To_Hydrology)* 3)

removeUpper = which(dataset$Horizontal_Distance_To_Hydrology > upperOutlier)
removeLower = which(dataset$Horizontal_Distance_To_Hydrology < lowerOutlier)
#removing upper outlier for Horizontal_Distance_To_Hydrology
if(length(removeUpper)>0){
  dataset = dataset[-removeUpper,]
}
#removing lower outlier for Horizontal_Distance_To_Hydrology
if(length(removeLower)>0){
  dataset = dataset[-removeLower,]
  
}

thirdQuantile = quantile(dataset$Horizontal_Distance_To_Roadways,0.75)
upperOutlier = (IQR(dataset$Horizontal_Distance_To_Roadways) * 3) + thirdQuantile
firstQuantile = quantile(dataset$Horizontal_Distance_To_Roadways,0.25)
lowerOutlier =  firstQuantile -(IQR(dataset$Horizontal_Distance_To_Roadways)* 3)
removeUpper = which(dataset$Horizontal_Distance_To_Roadways > upperOutlier)
removeLower = which(dataset$Horizontal_Distance_To_Roadways < lowerOutlier)
#removing upper outlier for Horizontal_Distance_To_Roadways
if(length(removeUpper)>0){
  dataset = dataset[-removeUpper,]
}
#removing lower outlier for Horizontal_Distance_To_Roadways
if(length(removeLower)>0){
  dataset = dataset[-removeLower,]
  
}

thirdQuantile = quantile(dataset$Horizontal_Distance_To_Fire_Points,0.75)
upperOutlier = (IQR(dataset$Horizontal_Distance_To_Fire_Points) * 3) + thirdQuantile
firstQuantile = quantile(dataset$Horizontal_Distance_To_Fire_Points,0.25)
lowerOutlier =  firstQuantile -(IQR(dataset$Horizontal_Distance_To_Fire_Points)* 3)
removeUpper = which(dataset$Horizontal_Distance_To_Fire_Points > upperOutlier)
removeLower = which(dataset$Horizontal_Distance_To_Fire_Points < lowerOutlier)
#removing upper outlier for Horizontal_Distance_To_Fire_Points
if(length(removeUpper)>0){
  dataset = dataset[-removeUpper,]
}
#removing lower outlier for Horizontal_Distance_To_Fire_Points
if(length(removeLower)>0){
  dataset = dataset[-removeLower,]
  
}

thirdQuantile = quantile(dataset$Vertical_Distance_To_Hydrology,0.75)
upperOutlier = (IQR(dataset$Vertical_Distance_To_Hydrology) * 3) + thirdQuantile
firstQuantile = quantile(dataset$Vertical_Distance_To_Hydrology,0.25)
lowerOutlier =  firstQuantile -(IQR(dataset$Vertical_Distance_To_Hydrology)* 3)
removeUpper = which(dataset$Vertical_Distance_To_Hydrology > upperOutlier)
removeLower = which(dataset$Vertical_Distance_To_Hydrology < lowerOutlier)
#removing upper outlier for Vertical_Distance_To_Hydrology
if(length(removeUpper)>0){
  dataset = dataset[-removeUpper,]
}
#removing lower outlier for Vertical_Distance_To_Hydrology
if(length(removeLower)>0){
  dataset = dataset[-removeLower,]
  
}
#checking dimensions for cleaned data
dim(dataset)
## [1] 14888    53
#scaling cleaned dataset
dataset[,c(1:10)]= scale(dataset[,c(1:10)])
rownames(dataset) = seq.int(nrow(dataset))
#importing libraries for CART and Random forest
library(caTools)
library(rpart)
library(rpart.plot)
library(caret)
## Warning: package 'caret' was built under R version 3.6.2
## Loading required package: lattice
library(randomForest)
## Warning: package 'randomForest' was built under R version 3.6.2
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## The following object is masked from 'package:ggplot2':
## 
##     margin
#splitting dataset into test and train
set.seed(123)
splitting = sample.split(dataset$Cover_Type , SplitRatio = 0.6)
dataset_train = subset(dataset , splitting == TRUE)
dataset__test  = subset(dataset ,splitting == FALSE)
#applying CART on train data
classtree = rpart(Cover_Type ~., method = "class", data = dataset_train )
#predict full tree
predict_fulltree= predict(classtree , newdata = dataset__test, type = "class")
#confusion matrix for predicted full tree
confusion_matrix_fulltree = table(predict_fulltree, dataset__test$Cover_Type)
#finding accuracy for full tree
Accuracy_fulltree = sum(diag(confusion_matrix_fulltree))/nrow(dataset__test)

#plotting tree using prp()
prp(classtree)

#plotting tree using plot() and text()
plot(classtree)
text(classtree)

#plotting tree using rpart.plot()
rpart.plot(classtree, box.palette = 0)

names(classtree)
##  [1] "frame"               "where"               "call"               
##  [4] "terms"               "cptable"             "method"             
##  [7] "parms"               "control"             "functions"          
## [10] "numresp"             "splits"              "variable.importance"
## [13] "y"                   "ordered"
#finding CP for full tree
classtree$cptable
##           CP nsplit rel error    xerror        xstd
## 1 0.16524813      0 1.0000000 1.0149273 0.004193399
## 2 0.13827419      1 0.8347519 0.8431321 0.005551828
## 3 0.09977740      2 0.6964777 0.6971324 0.006072828
## 4 0.08498101      3 0.5967003 0.6294356 0.006169930
## 5 0.02762865      4 0.5117193 0.5193139 0.006148964
## 6 0.02396229      5 0.4840906 0.4954825 0.006115269
## 7 0.01414168      6 0.4601283 0.4717821 0.006071196
## 8 0.01361791      7 0.4459866 0.4605211 0.006046500
## 9 0.01000000      8 0.4323687 0.4416656 0.005999625
#prunning tree at lowest xerror 
prune_classtree = prune(classtree , cp =0.01715333 )
#plotting prune tree using rpart.plot()
rpart.plot(prune_classtree,box.palette = 0)

#plotting prune tree using prp()
prp(prune_classtree)

#predict test data using prune tree
predict_classification = predict(prune_classtree, newdata = dataset__test, type = "class")
#find accuracy for prune tree
confusion_matrix_class=table(dataset__test$Cover_Type, predict_classification)
confusion_matrix_class
##    predict_classification
##       1   2   3   4   5   6   7
##   1 528   0   2   0 105   3 207
##   2 298   0  81   0 407  12  29
##   3   0   0 551 182  57  74   0
##   4   0   0 106 758   0   0   0
##   5  13   0  67   0 771   3   0
##   6   0   0 411 138  64 251   0
##   7  70   0   0   0   4   0 763
Accuracy = sum(diag(confusion_matrix_class))/nrow(dataset__test)
Accuracy
## [1] 0.6082284
#classification of test data with prune tree
plot(predict_classification, main ="Classification")

#plotting random forest with nt=200
Random_tree = randomForest(factor(Cover_Type) ~. , data =dataset_train,ntree=200 )
plot(Random_tree)

#factoring Cover_Type
dataset__test$Cover_Type = as.factor(dataset__test$Cover_Type)
#predict test data with random forest
predict_randomforest = predict(Random_tree , newdata = dataset__test)
#finding accuracy for RAndom forest
confusion_matrix_RForest=table(dataset__test$Cover_Type , predict_randomforest)
confusion_matrix_RForest
##    predict_randomforest
##       1   2   3   4   5   6   7
##   1 618 118   0   0  31   7  71
##   2 158 525  17   0  83  39   5
##   3   0   0 609  67   7 181   0
##   4   0   0   8 832   0  24   0
##   5   0  25  11   0 801  17   0
##   6   0   1 114  37   7 705   0
##   7  45   1   2   0   2   0 787
Accuracy_RF = sum(diag(confusion_matrix_RForest))/nrow(dataset__test)
Accuracy_RF
## [1] 0.8189757
#summary for predicted random forest
summary(predict_randomforest)
##   1   2   3   4   5   6   7 
## 821 670 761 936 931 973 863
#plotting predicted random forest
plot(predict_randomforest, main = "Random Forest Classification")

##
#KNN
#divide test and train data based on outcome variable
library(class)
Xtrain = dataset_train[1:52]
Xtest = dataset__test[1:52]
ytrain = dataset_train[53]
ytest = dataset__test[53]
#try k values for sqrt(train)
klen=round(sqrt(nrow(Xtrain)),0)

tpredict<-numeric() #Holding variable
for(j in 1:klen){
  #Apply knn with k = i
  ypred<-knn(Xtrain,Xtest,ytrain$Cover_Type,k=j)
  #finding accurately predicted values
  tpredict<-c(tpredict,
              mean(ypred==ytest$Cover_Type))
}
#validation error
error<-1-tpredict
#plotting validation error with K values
plot(1-tpredict,type="l",ylab="Error Rate",xlab="K",xaxt="n",main="Error Rate for cover type With Varying K")
axis(1, seq(0,klen, 1),las=2, font=2,cex.axis=0.8)

#predicting results and accuracy for k=4
ypred1 = knn(Xtrain, Xtest, ytrain$Cover_Type, k=4, prob=T)
cm1=as.matrix(table(ytest$Cover_Type, ypred1)) #confusion matrix
AccuracyK4=sum(diag(cm1))/length(ytest$Cover_Type) #accuracy

#predicting results and accuracy for k=12
ypred2 = knn(Xtrain, Xtest, ytrain$Cover_Type, k=12, prob=T)
cm2=as.matrix(table(ytest$Cover_Type, ypred2))
Accuracyk12=sum(diag(cm2))/length(ytest$Cover_Type)

#predicting results and accuracy for k=16
ypred3 = knn(Xtrain, Xtest, ytrain$Cover_Type, k=16, prob=T)
cm3=as.matrix(table(ytest$Cover_Type, ypred3))
Accuracyk16=sum(diag(cm3))/length(ytest$Cover_Type)

#test data classification with k=4
plot(ypred1, main = "knn  Classification")