# Data Preprocessing
rm(list=ls())
# Importing the dataset
dataset =read.csv('C:/Users/asus/Desktop/Forest Cover Type/train.csv', head=T, stringsAsFactors=F, na.strings='')
#removing id,Soil_Type_7 and Soil_Type_15
dataset=dataset[,c(-1,-22,-30)]
#view(dataset)
length(boxplot(dataset$Slope)$out) #understand more
## [1] 57
#check and delete outliers
i = 1
dataset[,1:53] = lapply(dataset[,1:53],as.numeric)
vizu = round(cor(dataset),1) #why does it give a warnng message? --> In cor(dataset) : the standard deviation is zero
library(ggcorrplot)
## Warning: package 'ggcorrplot' was built under R version 3.6.3
## Loading required package: ggplot2

#checking for correlation
ggcorrplot(vizu)

#function for finding outliers
while(i <= ncol(dataset)){
thirdQuantile = quantile(dataset[,i],0.75)
upperOutlier = (IQR(dataset[,i]) * 3) + thirdQuantile
#print(upperOutlier)
x = length(which(dataset[,i]>upperOutlier))
firstQuantile = quantile(dataset[,i],0.25)
lowerOutlier = firstQuantile -(IQR(dataset[,i])* 3)
#print(lowerOutlier)
y = length(which(dataset[,i] < lowerOutlier))
print(paste(x+y," is the number of outliers in ", colnames(dataset)[i]))
i = i+1
}
## [1] "0 is the number of outliers in Elevation"
## [1] "0 is the number of outliers in Aspect"
## [1] "0 is the number of outliers in Slope"
## [1] "53 is the number of outliers in Horizontal_Distance_To_Hydrology"
## [1] "49 is the number of outliers in Vertical_Distance_To_Hydrology"
## [1] "3 is the number of outliers in Horizontal_Distance_To_Roadways"
## [1] "7 is the number of outliers in Hillshade_9am"
## [1] "20 is the number of outliers in Hillshade_Noon"
## [1] "0 is the number of outliers in Hillshade_3pm"
## [1] "132 is the number of outliers in Horizontal_Distance_To_Fire_Points"
## [1] "3597 is the number of outliers in Wilderness_Area1"
## [1] "499 is the number of outliers in Wilderness_Area2"
## [1] "0 is the number of outliers in Wilderness_Area3"
## [1] "0 is the number of outliers in Wilderness_Area4"
## [1] "355 is the number of outliers in Soil_Type1"
## [1] "623 is the number of outliers in Soil_Type2"
## [1] "962 is the number of outliers in Soil_Type3"
## [1] "843 is the number of outliers in Soil_Type4"
## [1] "165 is the number of outliers in Soil_Type5"
## [1] "650 is the number of outliers in Soil_Type6"
## [1] "1 is the number of outliers in Soil_Type8"
## [1] "10 is the number of outliers in Soil_Type9"
## [1] "2142 is the number of outliers in Soil_Type10"
## [1] "406 is the number of outliers in Soil_Type11"
## [1] "227 is the number of outliers in Soil_Type12"
## [1] "476 is the number of outliers in Soil_Type13"
## [1] "169 is the number of outliers in Soil_Type14"
## [1] "114 is the number of outliers in Soil_Type16"
## [1] "612 is the number of outliers in Soil_Type17"
## [1] "60 is the number of outliers in Soil_Type18"
## [1] "46 is the number of outliers in Soil_Type19"
## [1] "139 is the number of outliers in Soil_Type20"
## [1] "16 is the number of outliers in Soil_Type21"
## [1] "345 is the number of outliers in Soil_Type22"
## [1] "757 is the number of outliers in Soil_Type23"
## [1] "257 is the number of outliers in Soil_Type24"
## [1] "1 is the number of outliers in Soil_Type25"
## [1] "54 is the number of outliers in Soil_Type26"
## [1] "15 is the number of outliers in Soil_Type27"
## [1] "9 is the number of outliers in Soil_Type28"
## [1] "1291 is the number of outliers in Soil_Type29"
## [1] "725 is the number of outliers in Soil_Type30"
## [1] "332 is the number of outliers in Soil_Type31"
## [1] "690 is the number of outliers in Soil_Type32"
## [1] "616 is the number of outliers in Soil_Type33"
## [1] "22 is the number of outliers in Soil_Type34"
## [1] "102 is the number of outliers in Soil_Type35"
## [1] "10 is the number of outliers in Soil_Type36"
## [1] "34 is the number of outliers in Soil_Type37"
## [1] "728 is the number of outliers in Soil_Type38"
## [1] "657 is the number of outliers in Soil_Type39"
## [1] "459 is the number of outliers in Soil_Type40"
## [1] "0 is the number of outliers in Cover_Type"
thirdQuantile = quantile(dataset$Horizontal_Distance_To_Hydrology,0.75)
upperOutlier = (IQR(dataset$Horizontal_Distance_To_Hydrology) * 3) + thirdQuantile
firstQuantile = quantile(dataset$Horizontal_Distance_To_Hydrology,0.25)
lowerOutlier = firstQuantile -(IQR(dataset$Horizontal_Distance_To_Hydrology)* 3)
removeUpper = which(dataset$Horizontal_Distance_To_Hydrology > upperOutlier)
removeLower = which(dataset$Horizontal_Distance_To_Hydrology < lowerOutlier)
#removing upper outlier for Horizontal_Distance_To_Hydrology
if(length(removeUpper)>0){
dataset = dataset[-removeUpper,]
}
#removing lower outlier for Horizontal_Distance_To_Hydrology
if(length(removeLower)>0){
dataset = dataset[-removeLower,]
}
thirdQuantile = quantile(dataset$Horizontal_Distance_To_Roadways,0.75)
upperOutlier = (IQR(dataset$Horizontal_Distance_To_Roadways) * 3) + thirdQuantile
firstQuantile = quantile(dataset$Horizontal_Distance_To_Roadways,0.25)
lowerOutlier = firstQuantile -(IQR(dataset$Horizontal_Distance_To_Roadways)* 3)
removeUpper = which(dataset$Horizontal_Distance_To_Roadways > upperOutlier)
removeLower = which(dataset$Horizontal_Distance_To_Roadways < lowerOutlier)
#removing upper outlier for Horizontal_Distance_To_Roadways
if(length(removeUpper)>0){
dataset = dataset[-removeUpper,]
}
#removing lower outlier for Horizontal_Distance_To_Roadways
if(length(removeLower)>0){
dataset = dataset[-removeLower,]
}
thirdQuantile = quantile(dataset$Horizontal_Distance_To_Fire_Points,0.75)
upperOutlier = (IQR(dataset$Horizontal_Distance_To_Fire_Points) * 3) + thirdQuantile
firstQuantile = quantile(dataset$Horizontal_Distance_To_Fire_Points,0.25)
lowerOutlier = firstQuantile -(IQR(dataset$Horizontal_Distance_To_Fire_Points)* 3)
removeUpper = which(dataset$Horizontal_Distance_To_Fire_Points > upperOutlier)
removeLower = which(dataset$Horizontal_Distance_To_Fire_Points < lowerOutlier)
#removing upper outlier for Horizontal_Distance_To_Fire_Points
if(length(removeUpper)>0){
dataset = dataset[-removeUpper,]
}
#removing lower outlier for Horizontal_Distance_To_Fire_Points
if(length(removeLower)>0){
dataset = dataset[-removeLower,]
}
thirdQuantile = quantile(dataset$Vertical_Distance_To_Hydrology,0.75)
upperOutlier = (IQR(dataset$Vertical_Distance_To_Hydrology) * 3) + thirdQuantile
firstQuantile = quantile(dataset$Vertical_Distance_To_Hydrology,0.25)
lowerOutlier = firstQuantile -(IQR(dataset$Vertical_Distance_To_Hydrology)* 3)
removeUpper = which(dataset$Vertical_Distance_To_Hydrology > upperOutlier)
removeLower = which(dataset$Vertical_Distance_To_Hydrology < lowerOutlier)
#removing upper outlier for Vertical_Distance_To_Hydrology
if(length(removeUpper)>0){
dataset = dataset[-removeUpper,]
}
#removing lower outlier for Vertical_Distance_To_Hydrology
if(length(removeLower)>0){
dataset = dataset[-removeLower,]
}
#checking dimensions for cleaned data
dim(dataset)
## [1] 14888 53
#scaling cleaned dataset
dataset[,c(1:10)]= scale(dataset[,c(1:10)])
rownames(dataset) = seq.int(nrow(dataset))
#importing libraries for CART and Random forest
library(caTools)
library(rpart)
library(rpart.plot)
library(caret)
## Warning: package 'caret' was built under R version 3.6.2
## Loading required package: lattice
library(randomForest)
## Warning: package 'randomForest' was built under R version 3.6.2
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:ggplot2':
##
## margin
#splitting dataset into test and train
set.seed(123)
splitting = sample.split(dataset$Cover_Type , SplitRatio = 0.6)
dataset_train = subset(dataset , splitting == TRUE)
dataset__test = subset(dataset ,splitting == FALSE)
#applying CART on train data
classtree = rpart(Cover_Type ~., method = "class", data = dataset_train )
#predict full tree
predict_fulltree= predict(classtree , newdata = dataset__test, type = "class")
#confusion matrix for predicted full tree
confusion_matrix_fulltree = table(predict_fulltree, dataset__test$Cover_Type)
#finding accuracy for full tree
Accuracy_fulltree = sum(diag(confusion_matrix_fulltree))/nrow(dataset__test)
#plotting tree using prp()
prp(classtree)

#plotting tree using plot() and text()
plot(classtree)
text(classtree)

#plotting tree using rpart.plot()
rpart.plot(classtree, box.palette = 0)

names(classtree)
## [1] "frame" "where" "call"
## [4] "terms" "cptable" "method"
## [7] "parms" "control" "functions"
## [10] "numresp" "splits" "variable.importance"
## [13] "y" "ordered"
#finding CP for full tree
classtree$cptable
## CP nsplit rel error xerror xstd
## 1 0.16524813 0 1.0000000 1.0149273 0.004193399
## 2 0.13827419 1 0.8347519 0.8431321 0.005551828
## 3 0.09977740 2 0.6964777 0.6971324 0.006072828
## 4 0.08498101 3 0.5967003 0.6294356 0.006169930
## 5 0.02762865 4 0.5117193 0.5193139 0.006148964
## 6 0.02396229 5 0.4840906 0.4954825 0.006115269
## 7 0.01414168 6 0.4601283 0.4717821 0.006071196
## 8 0.01361791 7 0.4459866 0.4605211 0.006046500
## 9 0.01000000 8 0.4323687 0.4416656 0.005999625
#prunning tree at lowest xerror
prune_classtree = prune(classtree , cp =0.01715333 )
#plotting prune tree using rpart.plot()
rpart.plot(prune_classtree,box.palette = 0)

#plotting prune tree using prp()
prp(prune_classtree)

#predict test data using prune tree
predict_classification = predict(prune_classtree, newdata = dataset__test, type = "class")
#find accuracy for prune tree
confusion_matrix_class=table(dataset__test$Cover_Type, predict_classification)
confusion_matrix_class
## predict_classification
## 1 2 3 4 5 6 7
## 1 528 0 2 0 105 3 207
## 2 298 0 81 0 407 12 29
## 3 0 0 551 182 57 74 0
## 4 0 0 106 758 0 0 0
## 5 13 0 67 0 771 3 0
## 6 0 0 411 138 64 251 0
## 7 70 0 0 0 4 0 763
Accuracy = sum(diag(confusion_matrix_class))/nrow(dataset__test)
Accuracy
## [1] 0.6082284
#classification of test data with prune tree
plot(predict_classification, main ="Classification")

#plotting random forest with nt=200
Random_tree = randomForest(factor(Cover_Type) ~. , data =dataset_train,ntree=200 )
plot(Random_tree)

#factoring Cover_Type
dataset__test$Cover_Type = as.factor(dataset__test$Cover_Type)
#predict test data with random forest
predict_randomforest = predict(Random_tree , newdata = dataset__test)
#finding accuracy for RAndom forest
confusion_matrix_RForest=table(dataset__test$Cover_Type , predict_randomforest)
confusion_matrix_RForest
## predict_randomforest
## 1 2 3 4 5 6 7
## 1 618 118 0 0 31 7 71
## 2 158 525 17 0 83 39 5
## 3 0 0 609 67 7 181 0
## 4 0 0 8 832 0 24 0
## 5 0 25 11 0 801 17 0
## 6 0 1 114 37 7 705 0
## 7 45 1 2 0 2 0 787
Accuracy_RF = sum(diag(confusion_matrix_RForest))/nrow(dataset__test)
Accuracy_RF
## [1] 0.8189757
#summary for predicted random forest
summary(predict_randomforest)
## 1 2 3 4 5 6 7
## 821 670 761 936 931 973 863
#plotting predicted random forest
plot(predict_randomforest, main = "Random Forest Classification")

##
#KNN
#divide test and train data based on outcome variable
library(class)
Xtrain = dataset_train[1:52]
Xtest = dataset__test[1:52]
ytrain = dataset_train[53]
ytest = dataset__test[53]
#try k values for sqrt(train)
klen=round(sqrt(nrow(Xtrain)),0)
tpredict<-numeric() #Holding variable
for(j in 1:klen){
#Apply knn with k = i
ypred<-knn(Xtrain,Xtest,ytrain$Cover_Type,k=j)
#finding accurately predicted values
tpredict<-c(tpredict,
mean(ypred==ytest$Cover_Type))
}
#validation error
error<-1-tpredict
#plotting validation error with K values
plot(1-tpredict,type="l",ylab="Error Rate",xlab="K",xaxt="n",main="Error Rate for cover type With Varying K")
axis(1, seq(0,klen, 1),las=2, font=2,cex.axis=0.8)

#predicting results and accuracy for k=4
ypred1 = knn(Xtrain, Xtest, ytrain$Cover_Type, k=4, prob=T)
cm1=as.matrix(table(ytest$Cover_Type, ypred1)) #confusion matrix
AccuracyK4=sum(diag(cm1))/length(ytest$Cover_Type) #accuracy
#predicting results and accuracy for k=12
ypred2 = knn(Xtrain, Xtest, ytrain$Cover_Type, k=12, prob=T)
cm2=as.matrix(table(ytest$Cover_Type, ypred2))
Accuracyk12=sum(diag(cm2))/length(ytest$Cover_Type)
#predicting results and accuracy for k=16
ypred3 = knn(Xtrain, Xtest, ytrain$Cover_Type, k=16, prob=T)
cm3=as.matrix(table(ytest$Cover_Type, ypred3))
Accuracyk16=sum(diag(cm3))/length(ytest$Cover_Type)
#test data classification with k=4
plot(ypred1, main = "knn Classification")
