# K-Nearest Neighbors (KNN)
# Load the Las Vegas hotel reviews data and binarize the target:
# scores of 3 or below become class 0 (low), scores above 3 become class 1 (high)
dataset=read.csv("lasvegas.csv")
dataset$Score <- with(dataset, ifelse(Score<=3,0,1))
dataset$Score<-factor(dataset$Score)
View(dataset)
# Keep only the Nr. reviews, Nr. hotel reviews, Helpful votes and Score attributes (columns 2-5)
dataset=dataset[2:5]
View(dataset)
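# Optional sanity check (a sketch, not part of the analysis above): the
# 3-or-below threshold can leave the two classes imbalanced, which is worth
# knowing before reading the accuracy reported at the end.
table(dataset$Score)
prop.table(table(dataset$Score))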
# Splitting the data into training (2/3) and test (1/3) sets, stratified on Score
library(caTools)
set.seed(123)
split=sample.split(Y=dataset$Score,SplitRatio=2/3)
train_set=subset(x=dataset,split==TRUE)
test_set=subset(x=dataset,split==FALSE)
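# Quick check (a sketch, not in the original run): sample.split stratifies on
# Score, so the class proportions in both subsets should be close to those of
# the full data.
prop.table(table(train_set$Score))
prop.table(table(test_set$Score))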
# Feature Scaling (here each set is standardized with its own mean and sd)
train_set[-4]=scale(train_set[-4])
test_set[-4]=scale(test_set[-4])
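# Alternative scaling (a sketch, independent of the in-place scaling above):
# standardize the test set with the training set's centers and spreads, so the
# test data are transformed exactly as the training data were. The *_raw,
# train_scaled and test_alt names are introduced here for illustration only
# and are not used below.
train_raw=subset(x=dataset,split==TRUE)
test_raw=subset(x=dataset,split==FALSE)
train_scaled=scale(train_raw[-4])   # keeps the centers/sds as attributes
test_alt=test_raw
test_alt[-4]=scale(test_raw[-4],
                   center=attr(train_scaled,"scaled:center"),
                   scale=attr(train_scaled,"scaled:scale"))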
str(train_set)
## 'data.frame': 336 obs. of 4 variables:
## $ Nr..reviews : num -0.48 0.945 -0.441 -0.599 -0.309 ...
## $ Nr..hotel.reviews: num -0.508 0.24 -0.376 -0.64 -0.552 ...
## $ Helpful.votes : num -0.365 0.866 -0.346 -0.544 -0.465 ...
## $ Score : Factor w/ 2 levels "0","1": 2 1 2 2 2 1 1 1 1 1 ...
str(test_set)
## 'data.frame': 168 obs. of 4 variables:
## $ Nr..reviews : num -0.185 -0.6059 -0.2529 -0.0629 -0.5108 ...
## $ Nr..hotel.reviews: num -0.303 -0.456 -0.341 -0.189 -0.379 ...
## $ Helpful.votes : num -0.167 -0.681 -0.122 0.303 -0.479 ...
## $ Score : Factor w/ 2 levels "0","1": 2 2 1 2 1 2 1 1 1 2 ...
# Fitting the KNN classifier (k = 5) and predicting the test-set classes
library(class)
y_pred=knn(train=train_set[,-4],test=test_set[,-4],cl=train_set[,4],k=5)
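# k = 5 above is a guess; a common way to choose k (a sketch, not part of the
# original run) is leave-one-out cross-validation on the training set with
# class::knn.cv, scanning a few odd values.
for(k in c(1,3,5,7,9,11)){
  cv_pred=knn.cv(train=train_set[,-4],cl=train_set[,4],k=k)
  print(paste("k =",k,"LOOCV accuracy:",round(mean(cv_pred==train_set[,4]),3)))
}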
# Helper that prints classification metrics from a 2x2 confusion matrix CM,
# treating the class in row/column 1 ("0", the low-score class) as positive.
# Note: the indexing below assumes rows hold predictions and columns actuals;
# cm further down is built as table(actual, predicted), so read the printed
# labels with that transposition in mind.
err_metric=function(CM)
{
  TN=CM[2,2]
  TP=CM[1,1]
  FP=CM[1,2]
  FN=CM[2,1]
  precision=(TP)/(TP+FP)
  recall_score=(TP)/(TP+FN)
  f1_score=2*((precision*recall_score)/(precision+recall_score))
  accuracy_model=(TP+TN)/(TP+TN+FP+FN)
  False_positive_rate=(FP)/(FP+TN)
  False_negative_rate=(FN)/(FN+TP)
  print(paste("Precision value of the model: ",round(precision,3)))
  print(paste("Accuracy of the model: ",round(accuracy_model,3)))
  print(paste("Recall value of the model: ",round(recall_score,3)))
  print(paste("False Positive rate of the model: ",round(False_positive_rate,2)))
  print(paste("False Negative rate of the model: ",round(False_negative_rate,2)))
  print(paste("f1 score of the model: ",round(f1_score,2)))
}
# Confusion matrix: rows are the actual Score, columns the predicted class
cm=table(test_set[,4],y_pred)
cm
##    y_pred
##       0   1
##   0   3  35
##   1  15 115
err_metric(cm)
## [1] "Precision value of the model: 0.079"
## [1] "Accuracy of the model: 0.702"
## [1] "Recall value of the model: 0.167"
## [1] "False Positive rate of the model: 0.23"
## [1] "False Negative rate of the model: 0.83"
## [1] "f1 score of the model: 0.11"