Attributes include 520 WAP signal-strength readings (WAP001-WAP520) along with FLOOR, BUILDINGID, SPACEID, RELATIVEPOSITION, LONGITUDE, LATITUDE, USERID, PHONEID, and TIMESTAMP, as the inspection below confirms.
# Load libraries
library(caret)
library(readr)
library(plotly)
# Import the dataset
processed_trainingData <- read_csv("processed_trainingData.csv")
# Set up parallel processing
library(doParallel)
# Find how many cores are on your machine
detectCores()
## [1] 8
# Create Cluster with desired number of cores.
cl <- makeCluster(5)
# Register Cluster
registerDoParallel(cl)
# Confirm how many cores are now "assigned" to R and RStudio
getDoParWorkers()
## [1] 5
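Note that caret's train() will use the registered doParallel backend automatically, since the allowParallel option of trainControl() defaults to TRUE. A minimal sketch making this explicit (the object name parallelControl is illustrative):
# Cross-validation folds are trained on the cluster workers in parallel
parallelControl <- trainControl(method = "cv", number = 10, allowParallel = TRUE)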
# Inspect the dataset
# Check the structure and summary of the dataset
str(processed_trainingData)
summary(processed_trainingData)
# Check the first 10 variables with the first 5 observations
head(processed_trainingData, n=5)[1:10]
## # A tibble: 5 x 10
## FLOOR BUILDINGID SPACEID RELATIVEPOSITION WAP001 WAP002 WAP003 WAP004 WAP005
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 2 1 106 2 100 100 100 100 100
## 2 2 1 106 2 100 100 100 100 100
## 3 2 1 103 2 100 100 100 100 100
## 4 2 1 102 2 100 100 100 100 100
## 5 0 0 122 2 100 100 100 100 100
## # ... with 1 more variable: WAP006 <dbl>
# Check the last 10 variables with the first 5 observations
tail(processed_trainingData, n=5)[520:529]
## # A tibble: 5 x 10
## WAP516 WAP517 WAP518 WAP519 WAP520 LONGITUDE LATITUDE USERID PHONEID TIMESTAMP
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 100 100 100 100 100 -7485. 4864875. 18 10 1.37e9
## 2 100 100 100 100 100 -7391. 4864836. 18 10 1.37e9
## 3 100 100 100 100 100 -7517. 4864889. 18 10 1.37e9
## 4 100 100 100 100 100 -7537. 4864896. 18 10 1.37e9
## 5 100 100 100 100 100 -7536. 4864898. 18 10 1.37e9
The next preprocessing steps remove zero-variance variables, drop irrelevant variables, create a unique identifier (LOCATION), and subset the dataset by building number, re-factorizing each subset.
# Remove zero-variance columns (WAPs whose signal never varies) from the dataset
rzv_training <- processed_trainingData[, sapply(processed_trainingData, var) != 0]
str(rzv_training)
# Check if all zero variance columns have been removed
which(sapply(rzv_training, var) == 0)
## named integer(0)
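As an aside, caret provides the nearZeroVar() helper as a stricter, ready-made alternative to the manual variance filter above (a sketch; the freqCut and uniqueCut values shown are the function's defaults):
# Indices of zero- and near-zero-variance columns
nzv_cols <- nearZeroVar(processed_trainingData, freqCut = 95/5, uniqueCut = 10)
# Guard against the empty-index case: df[, -integer(0)] would drop every column
if (length(nzv_cols) > 0) nzv_training <- processed_trainingData[, -nzv_cols]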
# Remove the extra dependent variables (LONGITUDE, LATITUDE, USERID, PHONEID, TIMESTAMP)
rzv_training <- rzv_training[, -c(470:474)]
# Confirm removal: the trailing columns are now WAP variables
names(rzv_training)[465:469]
## [1] "WAP515" "WAP516" "WAP517" "WAP518" "WAP519"
# Use tidyr to create a new attribute
library(tidyr)
# Create a single unique identifier (a new LOCATION column) by combining four attributes
newDF <- unite(rzv_training, "LOCATION", c(FLOOR, BUILDINGID, SPACEID, RELATIVEPOSITION), remove = FALSE, sep ="-")
# Convert location attribute to factor
newDF$LOCATION <- as.factor(newDF$LOCATION)
# Make sure the data type has been converted
str(newDF$LOCATION)
## Factor w/ 905 levels "0-0-102-2","0-0-106-2",..: 485 485 479 477 16 483 479 475 492 478 ...
# Subset the dataset by building (BUILDINGID 0-2)
trainingBUD0 <- subset(newDF, BUILDINGID== 0)
trainingBUD1 <- subset(newDF, BUILDINGID== 1)
trainingBUD2 <- subset(newDF, BUILDINGID== 2)
# Remove FLOOR, BUILDINGID, SPACEID, RELATIVEPOSITION from these subsets
trainingBUD0[,2:5] <- NULL
trainingBUD1[,2:5] <- NULL
trainingBUD2[,2:5] <- NULL
# Factorize again after subsetting in order to drop factor levels
trainingBUD0$LOCATION <- factor(trainingBUD0$LOCATION)
trainingBUD1$LOCATION <- factor(trainingBUD1$LOCATION)
trainingBUD2$LOCATION <- factor(trainingBUD2$LOCATION)
# Check how many levels of LOCATION for building0
str(trainingBUD0$LOCATION)
## Factor w/ 259 levels "0-0-102-2","0-0-106-2",..: 16 1 4 5 3 2 9 8 7 6 ...
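As a quick sanity check, the level counts can be pulled for all three buildings at once (a small sketch reusing the subset objects defined above):
# Number of distinct LOCATION classes each per-building model must predict
sapply(list(BUD0 = trainingBUD0$LOCATION,
            BUD1 = trainingBUD1$LOCATION,
            BUD2 = trainingBUD2$LOCATION), nlevels)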
C5.0 and Random Forest both belong to the tree-based model family. Both work by splitting the sample on the field that provides the maximum information gain. Each subsample defined by the first split is then split again, usually on a different field, and the process repeats until the subsamples cannot be split any further. Finally, the lowest-level splits are re-examined, and those that do not contribute significantly to the value of the model are removed or pruned.
C5.0 is robust when processing a large number of variables and usually requires less training time.
Random Forest normally requires a longer training time because it builds a large number of relatively uncorrelated trees that protect each other from their individual errors. It is more conservative, but it also guards against the overfitting that a single decision tree is prone to.
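To make the splitting criterion concrete, the following small sketch (illustrative only, not code from either library) computes the information gain of a candidate split for a vector of class labels:
# Shannon entropy of a class-label vector
entropy <- function(y) {
  p <- prop.table(table(y))
  p <- p[p > 0]  # drop empty classes to avoid 0 * log2(0) = NaN
  -sum(p * log2(p))
}
# Information gain achieved by splitting labels y on a logical condition
info_gain <- function(y, condition) {
  child <- (sum(condition) * entropy(y[condition]) +
            sum(!condition) * entropy(y[!condition])) / length(y)
  entropy(y) - child
}
# Toy example: a perfect split recovers the full 1 bit of entropy
info_gain(factor(c("a", "a", "b", "b")), c(TRUE, TRUE, FALSE, FALSE))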
# To set seed
set.seed(520)
# 10-fold cross-validation
fitControl <- trainControl(method = "cv", number = 10)
# Define a 75%/25% train/test split of the dataset for BUILDING 0
inTraining0 <- createDataPartition(trainingBUD0$LOCATION, p = .75, list = FALSE)
training0 <- trainingBUD0[inTraining0,]
testing0 <- trainingBUD0[-inTraining0,]
# C5.0 model
C50_BUD0 <- train(LOCATION~., data = training0, method = "C5.0", trControl=fitControl)
# Testing
prediction_C50BUD0 <- predict(C50_BUD0, testing0)
# Evaluate the model
cm_C50_BUD0 <- confusionMatrix(prediction_C50BUD0, testing0$LOCATION)
postResample(prediction_C50BUD0, testing0$LOCATION)
## Accuracy Kappa
## 0.7286512 0.7275444
# Random Forest Model
rf_BUD0 <- train(LOCATION~., data = training0, method = "rf", trControl=fitControl)
# Testing
prediction_rfBUD0<- predict(rf_BUD0, testing0)
# Evaluate the model
cm_rf_BUD0 <- confusionMatrix(prediction_rfBUD0, testing0$LOCATION)
postResample(prediction_rfBUD0, testing0$LOCATION)
## Accuracy Kappa
## 0.7741421 0.7732169
# KNN Model
KNN_BUD0 <- train(LOCATION~., data = training0, method = "knn", trControl=fitControl)
# Testing
prediction_KNNBUD0 <- predict(KNN_BUD0, testing0)
# Evaluate the model
cm_KNN_BUD0 <- confusionMatrix(prediction_KNNBUD0, testing0$LOCATION)
postResample(prediction_KNNBUD0, testing0$LOCATION)
## Accuracy Kappa
## 0.5602554 0.5584560
resample_BUD0 <- resamples( list(C50 = C50_BUD0, RF = rf_BUD0, KNN = KNN_BUD0))
summary(resample_BUD0)
##
## Call:
## summary.resamples(object = resample_BUD0)
##
## Models: C50, RF, KNN
## Number of resamples: 10
##
## Accuracy
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## C50 0.6857143 0.6966973 0.7050096 0.7057123 0.7124347 0.7398990 0
## RF 0.7114914 0.7312939 0.7658117 0.7562927 0.7802988 0.7926829 0
## KNN 0.5135802 0.5300000 0.5379138 0.5411246 0.5510385 0.5760599 0
##
## Kappa
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## C50 0.6843920 0.6954157 0.7037252 0.7044562 0.7111684 0.7388030 0
## RF 0.7102582 0.7301419 0.7647806 0.7552386 0.7793579 0.7917936 0
## KNN 0.5115194 0.5280295 0.5359863 0.5391965 0.5491473 0.5743259 0
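caret also ships lattice plotting methods for resamples objects; a one-line sketch for comparing the three models visually:
# Box-and-whisker plot of Accuracy and Kappa across the 10 resamples
bwplot(resample_BUD0)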
# Define a 75%/25% train/test split of the dataset for BUILDING 1
inTraining1 <- createDataPartition(trainingBUD1$LOCATION, p = .75, list = FALSE)
training1 <- trainingBUD1[inTraining1,]
testing1 <- trainingBUD1[-inTraining1,]
# Random Forest Model
rf_BUD1 <- train(LOCATION~., data = training1, method = "rf", trControl=fitControl)
# Testing
prediction_rfBUD1<- predict(rf_BUD1, testing1)
# Evaluate the model
cm_rf_BUD1 <- confusionMatrix(prediction_rfBUD1, testing1$LOCATION)
postResample(prediction_rfBUD1, testing1$LOCATION)
## Accuracy Kappa
## 0.8595779 0.8587076
# Define a 75%/25% train/test split of the dataset for BUILDING 2
inTraining2 <- createDataPartition(trainingBUD2$LOCATION, p = .75, list = FALSE)
training2 <- trainingBUD2[inTraining2,]
testing2 <- trainingBUD2[-inTraining2,]
# Random Forest Model
rf_BUD2 <- train(LOCATION~., data = training2, method = "rf", trControl=fitControl)
# Testing
prediction_rfBUD2<- predict(rf_BUD2, testing2)
# Evaluate the model
cm_rf_BUD2 <- confusionMatrix(prediction_rfBUD2, testing2$LOCATION)
postResample(prediction_rfBUD2, testing2$LOCATION)
## Accuracy Kappa
## 0.8077601 0.8071570
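To see the Random Forest results for all three buildings side by side, the hold-out metrics can be collected into one table (a sketch reusing the objects created above):
# Per-building hold-out Accuracy and Kappa for the Random Forest models
rf_summary <- rbind(BUILDING0 = postResample(prediction_rfBUD0, testing0$LOCATION),
                    BUILDING1 = postResample(prediction_rfBUD1, testing1$LOCATION),
                    BUILDING2 = postResample(prediction_rfBUD2, testing2$LOCATION))
rf_summary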
# Stop Cluster
stopCluster(cl)