Sys.setenv(JAVA_HOME='C:\\Program Files\\Java\\jre1.8.0_144')
library(kernlab)
library(kknn)
library(ggplot2)
##
## Attaching package: 'ggplot2'
## The following object is masked from 'package:kernlab':
##
## alpha
# Load the credit card data. The file has no header row, so read.table names
# the columns V1, V2, ...; V11 is the response used in the model formula below.
CC_Data <- read.table("C:\\Users\\vabhagat\\Downloads\\credit_card_data.txt", header = FALSE, stringsAsFactors = FALSE)
head(CC_Data)
set.seed(1)
# Largest neighbour count to evaluate. Defined once so the model fit, the
# result vector and the scoring loop cannot drift apart.
CC_kmax <- 30
# Trying train.kknn for leave-one-out cross-validation over k = 1..CC_kmax
CC_Model <- train.kknn(V11 ~ ., CC_Data, kmax = CC_kmax, scale = TRUE)
# Number of correctly predicted rows for each k (a count, not a proportion)
CC_Model_accry <- rep(0, CC_kmax)
# calculate prediction qualities
for (k in seq_len(CC_kmax)) {
  # Adding 0.5 and truncating turns the fitted value for each row into a 0/1
  # prediction (assumes V11 is coded 0/1 -- TODO confirm against the data file).
  CC_Model_predicted <- as.integer(fitted(CC_Model)[[k]][seq_len(nrow(CC_Data))] + 0.5)
  CC_Model_accry[k] <- sum(CC_Model_predicted == CC_Data$V11)
}
# show the number of correct predictions per k
CC_Model_accry
## [1] 533 533 533 533 557 553 554 555 554 557 557 558 557 557 558 558 558
## [18] 557 556 556 555 554 552 553 553 552 550 548 549 550
# Re-read the raw data so the split below starts from a fresh copy
CC_Data <- read.table(
  "C:\\Users\\vabhagat\\Downloads\\credit_card_data.txt",
  header = FALSE,
  stringsAsFactors = FALSE
)
head(CC_Data)
set.seed(1)
# Draw the row indices of the 70% training partition
CC_Data_smpl70 <- sample(nrow(CC_Data), round(0.7 * nrow(CC_Data)))
CC_training_70 <- CC_Data[CC_Data_smpl70, ]
# The rows that were not drawn are held out; a second random draw sends half
# of them to the validation set and the rest to the test set
temp <- CC_Data[-CC_Data_smpl70, ]
temp2 <- sample(nrow(temp), round(0.5 * nrow(temp)))
cc_validation <- temp[temp2, ]
cc_test <- temp[-temp2, ]
### Create a function that trains a model with neighbors
# Fits a kknn model on `train_data` and predicts V11 for every row of
# `valid_data`.
#
# The model used to be fitted on cc_test, which let test rows take part in
# choosing k; it is now fitted on the training partition. Accuracy figures
# recorded in this script before that change will differ after a re-run.
#
# Args:
#   neighbors:  number of nearest neighbours (passed to kknn as k).
#   train_data: data frame the model is fitted on; defaults to the global
#               CC_training_70.
#   valid_data: data frame that is scored; defaults to the global
#               cc_validation.
# Returns:
#   A data frame with one row per row of `valid_data` and two columns:
#   "actual" (the V11 value) and "predicted" (the fitted value rounded to the
#   nearest integer).
train.model <- function(neighbors, train_data = CC_training_70, valid_data = cc_validation)
{
  cc_model <- kknn(V11 ~ ., train_data, valid_data, k = neighbors, scale = TRUE)
  # round() is vectorised, so the fitted values can be rounded in one call
  cc_model_fitted_rounded <- round(cc_model$fitted.values)
  # pair each validation label with its rounded prediction
  cc_model_results <- data.frame(valid_data[, "V11"], cc_model_fitted_rounded)
  colnames(cc_model_results) <- c("actual", "predicted")
  cc_model_results
}
### Create a function that calculates the accuracy based on predictions
# Computes the fraction of rows whose "predicted" value equals "actual".
#
# Args:
#   cc_model: data frame (or matrix) with columns "actual" and "predicted".
#             "predicted" may be a plain vector or a list column of
#             length-one values.
# Returns:
#   A single number in [0, 1]; NaN when `cc_model` has no rows; NA when any
#   comparison is NA.
eval.model <- function(cc_model)
{
  # unlist() flattens list columns and leaves ordinary vectors unchanged
  actual <- unlist(cc_model[, "actual"])
  predicted <- unlist(cc_model[, "predicted"])
  # the mean of the element-wise matches is the percent of correct predictions
  mean(actual == predicted)
}
### Train three models using k values of 4, 12, and 16
cc_model_k4 <- train.model(4)
cc_model_k12 <- train.model(12)
cc_model_k16 <- train.model(16)
### Return accuracy for each model
eval.model(cc_model_k4)
## [1] 0.7244898
eval.model(cc_model_k12)
## [1] 0.8571429
eval.model(cc_model_k16)
## [1] 0.877551
### Selecting model_k16 as it has the best results.
### Re-train model_k16 using the entire training dataset and score against the test dataset
cc_model <- kknn(V11 ~ ., CC_training_70, cc_test, k = 16, scale = TRUE)
cc_model_fitted <- as.matrix(cc_model$fitted.values)
# round the fitted values to get one integer prediction per test row
cc_model_fitted_rounded <- round(cc_model_fitted[, 1])
# Pair the test-set labels with the test-set predictions. An earlier dead
# assignment paired the validation labels with these predictions and was
# overwritten straight away; it has been removed.
cc_model_results <- data.frame(cc_test[, "V11"], cc_model_fitted_rounded)
colnames(cc_model_results) <- c("actual", "predicted")
model_accuracy <- eval.model(cc_model_results)
model_accuracy
## [1] 0.8367347
# I used to work for a large healthcare firm where clustering was needed to figure out where new emergency care wards should be opened. Some of the important predictors were accident-prone areas, average driving distance, and distance from the nearest emergency clinic.
# Read the iris table and keep columns 2 through 6; the code below treats
# columns 1-4 as the numeric features and uses iris_df$Species as the label
iris_df <- read.table(
  "C:\\Users\\vabhagat\\Downloads\\iris.txt",
  header = TRUE,
  stringsAsFactors = FALSE
)
iris_df <- iris_df[, 2:6]
head(iris_df)
set.seed(33)
# Min-max rescale each of the four feature columns onto [0, 1]
iris_scaled_df <- iris_df
iris_scaled_df[, 1:4] <- lapply(iris_df[, 1:4], function(feature) {
  (feature - min(feature)) / (max(feature) - min(feature))
})
# K-means on all four scaled features for k = 2, 3, 4, 5, in that order
# (the order matters: every call consumes random numbers from the seed above)
iris_features_all <- iris_scaled_df[, 1:4]
iris_cluster_1 <- kmeans(iris_features_all, centers = 2, nstart = 20)
iris_cluster_2 <- kmeans(iris_features_all, centers = 3, nstart = 20)
iris_cluster_3 <- kmeans(iris_features_all, centers = 4, nstart = 20)
iris_cluster_4 <- kmeans(iris_features_all, centers = 5, nstart = 20)
# Total Euclidean distance between the data points and their assigned cluster
# centers for the k = 2 model. iris_cluster_1 was fitted on iris_scaled_df, so
# its centers are in the scaled feature space and the points must come from
# iris_scaled_df too (the unscaled iris_df rows were used here before).
# Indexing the centers by the cluster vector yields one center row per point.
assigned_centers <- iris_cluster_1$centers[iris_cluster_1$cluster, , drop = FALSE]
point_offsets <- as.matrix(iris_scaled_df[, 1:4]) - assigned_centers
cluster_dist <- sum(sqrt(rowSums(point_offsets^2)))
cluster_dist
## [1] 1052.231
# NOTE: the value above was recorded when the distance was measured from the
# unscaled data; re-run the script to refresh it.
# Comparing clusters with the species.
# Each table cross-tabulates the k-means cluster id (rows) against
# iris_df$Species (columns) for the models fitted on all four scaled features.
# Lines starting with "##" are output recorded from an earlier run.
# k = 2
table(iris_cluster_1$cluster, iris_df$Species)
##
## setosa versicolor virginica
## 1 50 0 0
## 2 0 50 50
# k = 3
table(iris_cluster_2$cluster, iris_df$Species)
##
## setosa versicolor virginica
## 1 50 0 0
## 2 0 47 14
## 3 0 3 36
# k = 4
table(iris_cluster_3$cluster, iris_df$Species)
##
## setosa versicolor virginica
## 1 0 27 2
## 2 50 0 0
## 3 0 23 19
## 4 0 0 29
# k = 5
table(iris_cluster_4$cluster, iris_df$Species)
##
## setosa versicolor virginica
## 1 0 27 2
## 2 22 0 0
## 3 0 0 29
## 4 0 23 19
## 5 28 0 0
# Using Sepal Length Width
# K-means on scaled columns 1-2 only, for k = 2, 3, 4, 5 (call order kept so
# the random starts are drawn in the same sequence)
iris_features_sepal <- iris_scaled_df[, 1:2]
iris_cluster_sepal_1 <- kmeans(iris_features_sepal, centers = 2, nstart = 20)
iris_cluster_sepal_2 <- kmeans(iris_features_sepal, centers = 3, nstart = 20)
iris_cluster_sepal_3 <- kmeans(iris_features_sepal, centers = 4, nstart = 20)
iris_cluster_sepal_4 <- kmeans(iris_features_sepal, centers = 5, nstart = 20)
# Comparing clusters with the species.
# Each table cross-tabulates the cluster id (rows) against iris_df$Species
# (columns) for the models fitted on scaled columns 1-2 only.
# Lines starting with "##" are output recorded from an earlier run.
# k = 2
table(iris_cluster_sepal_1$cluster, iris_df$Species)
##
## setosa versicolor virginica
## 1 50 7 1
## 2 0 43 49
# k = 3
table(iris_cluster_sepal_2$cluster, iris_df$Species)
##
## setosa versicolor virginica
## 1 1 37 16
## 2 49 0 0
## 3 0 13 34
# k = 4
table(iris_cluster_sepal_3$cluster, iris_df$Species)
##
## setosa versicolor virginica
## 1 33 1 0
## 2 0 10 30
## 3 17 6 1
## 4 0 33 19
# k = 5
table(iris_cluster_sepal_4$cluster, iris_df$Species)
##
## setosa versicolor virginica
## 1 35 1 0
## 2 0 2 16
## 3 0 19 25
## 4 1 28 9
## 5 14 0 0
# plotting
# Cluster ids are labels, not quantities: wrapping them in factor() gives a
# discrete colour per cluster instead of a continuous gradient, and labs()
# keeps the legend title short.
# NOTE(review): the points come from the built-in `iris` dataset while the
# clusters were fitted on iris_df; this assumes both hold the same rows in the
# same order -- confirm.
ggplot(iris, aes(Sepal.Length, Sepal.Width, color = factor(iris_cluster_sepal_1$cluster))) + geom_point() + labs(color = "cluster")
ggplot(iris, aes(Sepal.Length, Sepal.Width, color = factor(iris_cluster_sepal_2$cluster))) + geom_point() + labs(color = "cluster")
ggplot(iris, aes(Sepal.Length, Sepal.Width, color = factor(iris_cluster_sepal_3$cluster))) + geom_point() + labs(color = "cluster")
ggplot(iris, aes(Sepal.Length, Sepal.Width, color = factor(iris_cluster_sepal_4$cluster))) + geom_point() + labs(color = "cluster")
# Using Petal Length Width
# K-means on scaled columns 3-4 only, for k = 2, 3, 4, 5 (call order kept so
# the random starts are drawn in the same sequence)
iris_features_petal <- iris_scaled_df[, 3:4]
iris_cluster_petal_1 <- kmeans(iris_features_petal, centers = 2, nstart = 20)
iris_cluster_petal_2 <- kmeans(iris_features_petal, centers = 3, nstart = 20)
iris_cluster_petal_3 <- kmeans(iris_features_petal, centers = 4, nstart = 20)
iris_cluster_petal_4 <- kmeans(iris_features_petal, centers = 5, nstart = 20)
# Comparing clusters with the species.
# Each table cross-tabulates the cluster id (rows) against iris_df$Species
# (columns) for the models fitted on scaled columns 3-4 only.
# Lines starting with "##" are output recorded from an earlier run.
# k = 2
table(iris_cluster_petal_1$cluster, iris_df$Species)
##
## setosa versicolor virginica
## 1 0 50 50
## 2 50 0 0
# k = 3
table(iris_cluster_petal_2$cluster, iris_df$Species)
##
## setosa versicolor virginica
## 1 0 48 4
## 2 50 0 0
## 3 0 2 46
# k = 4
table(iris_cluster_petal_3$cluster, iris_df$Species)
##
## setosa versicolor virginica
## 1 0 42 0
## 2 0 0 27
## 3 50 0 0
## 4 0 8 23
# k = 5
table(iris_cluster_petal_4$cluster, iris_df$Species)
##
## setosa versicolor virginica
## 1 50 0 0
## 2 0 23 0
## 3 0 0 25
## 4 0 2 21
## 5 0 25 4
# plotting
# Cluster ids are labels, not quantities: wrapping them in factor() gives a
# discrete colour per cluster instead of a continuous gradient, and labs()
# keeps the legend title short.
# NOTE(review): the points come from the built-in `iris` dataset while the
# clusters were fitted on iris_df; this assumes both hold the same rows in the
# same order -- confirm.
ggplot(iris, aes(Petal.Length, Petal.Width, color = factor(iris_cluster_petal_1$cluster))) + geom_point() + labs(color = "cluster")
ggplot(iris, aes(Petal.Length, Petal.Width, color = factor(iris_cluster_petal_2$cluster))) + geom_point() + labs(color = "cluster")
ggplot(iris, aes(Petal.Length, Petal.Width, color = factor(iris_cluster_petal_3$cluster))) + geom_point() + labs(color = "cluster")
ggplot(iris, aes(Petal.Length, Petal.Width, color = factor(iris_cluster_petal_4$cluster))) + geom_point() + labs(color = "cluster")