Our client is developing a system to be deployed on large industrial campuses, in shopping malls, etc., to help people navigate complex, unfamiliar interior spaces without getting lost. While GPS works fairly reliably outdoors, it generally doesn’t work indoors, so a different technology is necessary. Our client would like us to investigate the feasibility of using “wifi fingerprinting” to determine a person’s location in indoor spaces. Wifi fingerprinting uses the signals from multiple wifi hotspots within a building to determine location, analogously to how GPS uses satellite signals.
The objective of this task is to evaluate the application of machine learning techniques to the problem of indoor positioning via wifi fingerprinting. For this task we predicted FLOOR from the WAP signals (a classification problem), using three machine learning techniques: 1) k-NN, 2) C5.0 and 3) random forest. The results are summarized in a table at the bottom of this report.
The dataset contains the following attributes: 520 columns holding the signal strengths of the wireless access points (WAP001–WAP520, in dBm, where the value 100 denotes “no signal detected”), plus the reference columns LONGITUDE, LATITUDE, FLOOR, BUILDINGID, SPACEID, RELATIVEPOSITION, USERID, PHONEID and TIMESTAMP.
library(dplyr)
library(tidyr)
library(ggplot2)
library(tidyverse)
library(caret)
library(readxl)
library(rmarkdown)
training <- read_csv("C:/Users/Y.S. Kim/Desktop/Ubiqum/Wifi/Data/trainingData.csv")
valid <- read_csv("C:/Users/Y.S. Kim/Desktop/Ubiqum/Wifi/Data/validationData.csv")
No pre-processing (1)
First, the machine learning algorithms were trained without any pre-processing of the data. All 520 WAP signal columns were used as predictors, and FLOOR, after converting its data type to factor, served as the output variable. There were no missing values.
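The factor conversion itself does not appear in the extracted code; a minimal sketch of that step (applied to both the training and the validation data, so that the caret metrics later in this report operate on factors) is:

# FLOOR is the class label: convert it from integer to factor in both sets
training$FLOOR <- as.factor(training$FLOOR)
valid$FLOOR <- as.factor(valid$FLOOR)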
Modeling (1)
k-NN
# Subset the data by taking a random sample (n = 5000)
set.seed(212)
trainingdata1 <- training[sample(1:nrow(training), 5000, replace = FALSE), ]
# Split the sample into a training set and a test set (80/20)
inTraining <- createDataPartition(trainingdata1$FLOOR, p = .8, list = FALSE)
trainSet1 <- trainingdata1[inTraining, ]
testSet1 <- trainingdata1[-inTraining, ]
# Train the model (5-fold cross-validation, 1 repeat)
mod_knn1 <- train(FLOOR ~ .,
                  data = trainSet1 %>%
                    select(starts_with("WAP"), FLOOR),
                  method = "knn",
                  trControl = trainControl(method = "repeatedcv",
                                           number = 5,
                                           repeats = 1))
# Training results (cross-validation)
mod_knn1
# Test set results
pred_knn1_test <- predict(mod_knn1, newdata = testSet1)
postResample(pred_knn1_test, testSet1$FLOOR)
# Validation set results
pred_knn1_validation <- predict(mod_knn1, newdata = valid)
postResample(pred_knn1_validation, valid$FLOOR)
confusionMatrix(pred_knn1_validation, valid$FLOOR)
C5.0
# Subset the data by taking a random sample (n = 5000)
set.seed(212)
trainingdata1 <- training[sample(1:nrow(training), 5000, replace = FALSE), ]
# Split the sample into a training set and a test set (80/20)
inTraining <- createDataPartition(trainingdata1$FLOOR, p = .8, list = FALSE)
trainSet1 <- trainingdata1[inTraining, ]
testSet1 <- trainingdata1[-inTraining, ]
# Train the model (5-fold cross-validation, 1 repeat)
mod_C5_1 <- train(FLOOR ~ .,
                  data = trainSet1 %>%
                    select(starts_with("WAP"), FLOOR),
                  method = "C5.0",
                  trControl = trainControl(method = "repeatedcv",
                                           number = 5,
                                           repeats = 1))
# Training results (cross-validation)
mod_C5_1
# Test set results
pred_C5_1_test <- predict(mod_C5_1, newdata = testSet1)
postResample(pred_C5_1_test, testSet1$FLOOR)
# Validation set results
pred_C5_1_validation <- predict(mod_C5_1, newdata = valid)
postResample(pred_C5_1_validation, valid$FLOOR)
confusionMatrix(pred_C5_1_validation, valid$FLOOR)
Random forest
# Subset the data by taking a random sample (n = 5000)
set.seed(212)
trainingdata1 <- training[sample(1:nrow(training), 5000, replace = FALSE), ]
# Split the sample into a training set and a test set (80/20)
inTraining <- createDataPartition(trainingdata1$FLOOR, p = .8, list = FALSE)
trainSet1 <- trainingdata1[inTraining, ]
testSet1 <- trainingdata1[-inTraining, ]
# Train the model (5-fold cross-validation, 1 repeat)
mod_rf_1 <- train(FLOOR ~ .,
                  data = trainSet1 %>%
                    select(starts_with("WAP"), FLOOR),
                  method = "rf",
                  trControl = trainControl(method = "repeatedcv",
                                           number = 5,
                                           repeats = 1))
# Training results (cross-validation)
mod_rf_1
# Test set results
pred_rf_1_test <- predict(mod_rf_1, newdata = testSet1)
postResample(pred_rf_1_test, testSet1$FLOOR)
# Validation set results
pred_rf_1_validation <- predict(mod_rf_1, newdata = valid)
postResample(pred_rf_1_validation, valid$FLOOR)
confusionMatrix(pred_rf_1_validation, valid$FLOOR)
Pre-processing (2)
A histogram was created to visualize the distribution of the signal strengths of all 520 WAPs across the whole training dataset.
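The histogram itself is not reproduced here; it can be rebuilt with a sketch like the one below, which pivots the 520 WAP columns to long format and drops the raw "no signal" placeholder value of 100 (the bin width is an arbitrary choice):

# Distribution of all detected WAP signal strengths in the training data
training %>%
  pivot_longer(cols = starts_with("WAP"),
               names_to = "WAP", values_to = "dBm") %>%
  filter(dBm != 100) %>%
  ggplot(aes(x = dBm)) +
  geom_histogram(binwidth = 5)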
Zooming in on outliers (-30 to 0 dBm)
In the histogram above, some outliers are visible in the range -30 to 0 dBm. Filtering the data to this range yields the histogram below.
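The zoomed histogram can be reproduced analogously, restricting the data to the -30 to 0 dBm range:

# Zoom in on the suspiciously strong readings
training %>%
  pivot_longer(cols = starts_with("WAP"),
               names_to = "WAP", values_to = "dBm") %>%
  filter(between(dBm, -30, 0)) %>%
  ggplot(aes(x = dBm)) +
  geom_histogram(binwidth = 1)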
Since these outliers may distort the results of the analysis, all observations containing a reading in this range (-30 to 0 dBm) were removed from the training dataset.
# Identify observations with at least one WAP reading between -30 and 0 dBm
outlier <- training %>%
  rownames_to_column(var = "id") %>%
  pivot_longer(cols = starts_with("WAP"),
               names_to = "WAP",
               values_to = "values") %>%
  filter(between(values, -30, 0))
outlier_data <- training %>%
  rownames_to_column(var = "id") %>%
  filter(id %in% outlier$id)
# Add a row id, move it to the first column and drop the flagged rows
training2 <- training
training2$id <- seq.int(nrow(training2))
training3 <- training2[, c(ncol(training2), 1:(ncol(training2) - 1))]
training_no_outlier <- training3[!(training3$id %in% outlier_data$id), ]
The WAP values were rescaled so that the lowest signal strength (i.e. no signal detected, recorded as 100 in the raw data) equals 0 and the strongest signal has the highest value: the placeholder value 100 was first recoded to -105 dBm, and 105 was then added to every WAP reading, giving a 0–105 scale.
# Rescale to 0-105 --------------------------------------------------------
# Training set (outliers removed): recode the "no signal" placeholder (100)
# to -105 dBm, then shift all WAP readings by +105 so that no signal = 0
training_no_outlier_rs <- training_no_outlier
training_no_outlier_rs$id <- NULL
training_no_outlier_rs[, 1:520][training_no_outlier_rs[, 1:520] == 100] <- -105
training_no_outlier_rs[, 1:520] <- training_no_outlier_rs[, 1:520] + 105
# Validation set: same recoding and shift
valid_rs <- valid
valid_rs[, 1:520][valid_rs[, 1:520] == 100] <- -105
valid_rs[, 1:520] <- valid_rs[, 1:520] + 105
Modeling (2)
k-NN
# Subset the data by taking a random sample (n = 5000)
set.seed(212)
trainingdata2 <- training_no_outlier_rs[sample(1:nrow(training_no_outlier_rs), 5000, replace = FALSE), ]
# Split the sample into a training set and a test set (80/20)
inTraining <- createDataPartition(trainingdata2$FLOOR, p = .8, list = FALSE)
trainSet2 <- trainingdata2[inTraining, ]
testSet2 <- trainingdata2[-inTraining, ]
# Train the model (5-fold cross-validation, 1 repeat)
mod_knn2 <- train(FLOOR ~ .,
                  data = trainSet2 %>%
                    select(starts_with("WAP"), FLOOR),
                  method = "knn",
                  trControl = trainControl(method = "repeatedcv",
                                           number = 5,
                                           repeats = 1))
# Training results (cross-validation)
mod_knn2
# Test set results
pred_knn2_test <- predict(mod_knn2, newdata = testSet2)
postResample(pred_knn2_test, testSet2$FLOOR)
# Validation set results
pred_knn2_validation <- predict(mod_knn2, newdata = valid_rs)
postResample(pred_knn2_validation, valid_rs$FLOOR)
confusionMatrix(pred_knn2_validation, valid_rs$FLOOR)
C5.0
# Subset the data by taking a random sample (n = 5000)
set.seed(212)
trainingdata2 <- training_no_outlier_rs[sample(1:nrow(training_no_outlier_rs), 5000, replace = FALSE), ]
# Split the sample into a training set and a test set (80/20)
inTraining <- createDataPartition(trainingdata2$FLOOR, p = .8, list = FALSE)
trainSet2 <- trainingdata2[inTraining, ]
testSet2 <- trainingdata2[-inTraining, ]
# Train the model (5-fold cross-validation, 1 repeat)
mod_C5_2 <- train(FLOOR ~ .,
                  data = trainSet2 %>%
                    select(starts_with("WAP"), FLOOR),
                  method = "C5.0",
                  trControl = trainControl(method = "repeatedcv",
                                           number = 5,
                                           repeats = 1))
# Training results (cross-validation)
mod_C5_2
# Test set results
pred_C5_2_test <- predict(mod_C5_2, newdata = testSet2)
postResample(pred_C5_2_test, testSet2$FLOOR)
# Validation set results
pred_C5_2_validation <- predict(mod_C5_2, newdata = valid_rs)
postResample(pred_C5_2_validation, valid_rs$FLOOR)
confusionMatrix(pred_C5_2_validation, valid_rs$FLOOR)
Random forest
# Subset the data by taking a random sample (n = 5000)
set.seed(212)
trainingdata2 <- training_no_outlier_rs[sample(1:nrow(training_no_outlier_rs), 5000, replace = FALSE), ]
# Split the sample into a training set and a test set (80/20)
inTraining <- createDataPartition(trainingdata2$FLOOR, p = .8, list = FALSE)
trainSet2 <- trainingdata2[inTraining, ]
testSet2 <- trainingdata2[-inTraining, ]
# Train the model (5-fold cross-validation, 1 repeat)
mod_rf_2 <- train(FLOOR ~ .,
                  data = trainSet2 %>%
                    select(starts_with("WAP"), FLOOR),
                  method = "rf",
                  trControl = trainControl(method = "repeatedcv",
                                           number = 5,
                                           repeats = 1))
# Training results (cross-validation)
mod_rf_2
# Test set results
pred_rf_2_test <- predict(mod_rf_2, newdata = testSet2)
postResample(pred_rf_2_test, testSet2$FLOOR)
# Validation set results
pred_rf_2_validation <- predict(mod_rf_2, newdata = valid_rs)
postResample(pred_rf_2_validation, valid_rs$FLOOR)
confusionMatrix(pred_rf_2_validation, valid_rs$FLOOR)
Pre-processing (3)
WAP columns with zero variance show no change in signal strength across the whole dataset. Because these WAPs are not informative and cannot contribute to the model as relevant predictors, they were removed.
# The metadata columns (LONGITUDE ... TIMESTAMP) sit after the 520 WAP columns
training_factorcolumns <- training_no_outlier_rs[, 521:529]
# Keep only WAP columns with non-zero variance
training_no_outlier_rs_nzv <- training_no_outlier_rs %>%
  select(starts_with("WAP")) %>%
  select_if(function(x) var(x) != 0)
training_no_outlier_rs_nzv <- cbind(training_no_outlier_rs_nzv, training_factorcolumns)
# Record the retained column names on the training set ...
relevant_columns <- names(training_no_outlier_rs_nzv)
# ... and select the same columns on the validation set
valid_rs_nzv <- valid_rs %>%
  select(all_of(relevant_columns))
Rows with zero variance are not useful either: they indicate that the phone detected no signal from any WAP. These observations were removed, together with duplicated rows.
trainvar <- training_no_outlier_rs_nzv
# The 464 remaining WAP columns come first
waps_training <- trainvar[, 1:464]
# Remove rows with zero variance (no signal detected at all)
trainvar <- trainvar[-which(apply(waps_training, 1, var) == 0), ]
# Remove duplicated observations (identical WAP readings)
dup_tr <- duplicated(trainvar[, 1:464])
duplicates_tr <- trainvar[dup_tr, ]
noduplicates_tr <- trainvar[!dup_tr, ]
Modeling (3)
k-NN
# Subset the data by taking a random sample (n = 5000)
set.seed(212)
trainingdata3 <- noduplicates_tr[sample(1:nrow(noduplicates_tr), 5000, replace = FALSE), ]
# Split the sample into a training set and a test set (80/20)
inTraining <- createDataPartition(trainingdata3$FLOOR, p = .8, list = FALSE)
trainSet3 <- trainingdata3[inTraining, ]
testSet3 <- trainingdata3[-inTraining, ]
# Train the model (5-fold cross-validation, 1 repeat)
mod_knn3 <- train(FLOOR ~ .,
                  data = trainSet3 %>%
                    select(starts_with("WAP"), FLOOR),
                  method = "knn",
                  trControl = trainControl(method = "repeatedcv",
                                           number = 5,
                                           repeats = 1))
# Training results (cross-validation)
mod_knn3
# Test set results
pred_knn3_test <- predict(mod_knn3, newdata = testSet3)
postResample(pred_knn3_test, testSet3$FLOOR)
# Validation set results
pred_knn3_validation <- predict(mod_knn3, newdata = valid_rs_nzv)
postResample(pred_knn3_validation, valid_rs_nzv$FLOOR)
confusionMatrix(pred_knn3_validation, valid_rs_nzv$FLOOR)
C5.0
# Subset the data by taking a random sample (n = 5000)
set.seed(212)
trainingdata3 <- noduplicates_tr[sample(1:nrow(noduplicates_tr), 5000, replace = FALSE), ]
# Split the sample into a training set and a test set (80/20)
inTraining <- createDataPartition(trainingdata3$FLOOR, p = .8, list = FALSE)
trainSet3 <- trainingdata3[inTraining, ]
testSet3 <- trainingdata3[-inTraining, ]
# Train the model (5-fold cross-validation, 1 repeat)
mod_C5_3 <- train(FLOOR ~ .,
                  data = trainSet3 %>%
                    select(starts_with("WAP"), FLOOR),
                  method = "C5.0",
                  trControl = trainControl(method = "repeatedcv",
                                           number = 5,
                                           repeats = 1))
# Training results (cross-validation)
mod_C5_3
# Test set results
pred_C5_3_test <- predict(mod_C5_3, newdata = testSet3)
postResample(pred_C5_3_test, testSet3$FLOOR)
# Validation set results
pred_C5_3_validation <- predict(mod_C5_3, newdata = valid_rs_nzv)
postResample(pred_C5_3_validation, valid_rs_nzv$FLOOR)
confusionMatrix(pred_C5_3_validation, valid_rs_nzv$FLOOR)
Random forest
# Subset the data by taking a random sample (n = 5000)
set.seed(212)
trainingdata3 <- noduplicates_tr[sample(1:nrow(noduplicates_tr), 5000, replace = FALSE), ]
# Split the sample into a training set and a test set (80/20)
inTraining <- createDataPartition(trainingdata3$FLOOR, p = .8, list = FALSE)
trainSet3 <- trainingdata3[inTraining, ]
testSet3 <- trainingdata3[-inTraining, ]
# Train the model (5-fold cross-validation, 1 repeat)
mod_rf_3 <- train(FLOOR ~ .,
                  data = trainSet3 %>%
                    select(starts_with("WAP"), FLOOR),
                  method = "rf",
                  trControl = trainControl(method = "repeatedcv",
                                           number = 5,
                                           repeats = 1))
# Training results (cross-validation)
mod_rf_3
# Test set results
pred_rf_3_test <- predict(mod_rf_3, newdata = testSet3)
postResample(pred_rf_3_test, testSet3$FLOOR)
# Validation set results
pred_rf_3_validation <- predict(mod_rf_3, newdata = valid_rs_nzv)
postResample(pred_rf_3_validation, valid_rs_nzv$FLOOR)
confusionMatrix(pred_rf_3_validation, valid_rs_nzv$FLOOR)
Pre-processing (4)
Plotting the distribution of the WAP signals
The distribution of the WAP signals across the whole training dataset after pre-processing.
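As before, the plot is not reproduced here; a sketch of how it can be rebuilt from the pre-processed data (zeros, i.e. "no signal", excluded) is:

# Distribution of the rescaled WAP signals after pre-processing
noduplicates_tr %>%
  pivot_longer(cols = starts_with("WAP"),
               names_to = "WAP", values_to = "signal") %>%
  filter(signal > 0) %>%
  ggplot(aes(x = signal)) +
  geom_histogram(binwidth = 5)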
To evaluate whether the distribution of the signals is similar across different FLOORs, PHONEIDs and USERIDs, violin plots were created:
Plotting distribution of WAP values by FLOOR
Plotting distribution of WAP values by USERID
Plotting distribution of WAP values by PHONEID
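The violin plots themselves are not reproduced here; a minimal sketch of the FLOOR panel follows (the USERID and PHONEID versions only swap the x aesthetic):

# WAP signal distribution per floor (zeros, i.e. "no signal", excluded)
noduplicates_tr %>%
  pivot_longer(cols = starts_with("WAP"),
               names_to = "WAP", values_to = "signal") %>%
  filter(signal > 0) %>%
  ggplot(aes(x = as.factor(FLOOR), y = signal)) +
  geom_violin()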
The violin plots of USERID and PHONEID show that USERID 13 (= PHONEID 17) has a different WAP distribution than the other users/phones. All observations of USERID 13/PHONEID 17 were therefore removed.
# Drop all observations recorded with PHONEID 17 (= USERID 13)
noduplicates_tr <- noduplicates_tr %>%
  filter(PHONEID != 17)
The violin plots also show that the phones record different signal ranges. To put all observations on the same scale (0 to 1), each row's WAP values were min-max normalized.
# Row-wise min-max normalization: rescale each observation's WAP readings
# to the range 0-1 (this relies on every row having some signal variation;
# all-zero rows were removed from the training set above)
norm_min_max <- function(row) {
  (row - min(row)) / (max(row) - min(row))
}
# Training set
nonnumeric_tr <- noduplicates_tr %>%
  select(LATITUDE:PHONEID)
train2 <- noduplicates_tr %>%
  select(starts_with("WAP")) %>%
  apply(1, norm_min_max) %>%
  t() %>%
  as_tibble()
train3 <- cbind(train2, nonnumeric_tr)
# Validation set
nonnumeric_val <- valid_rs_nzv %>%
  select(LATITUDE:PHONEID)
valid2 <- valid_rs_nzv %>%
  select(starts_with("WAP")) %>%
  apply(1, norm_min_max) %>%
  t() %>%
  as_tibble()
valid3 <- cbind(valid2, nonnumeric_val)
Modeling (4)
k-NN
# Subset the data by taking a random sample (n = 5000)
set.seed(212)
trainingdata4 <- train3[sample(1:nrow(train3), 5000, replace = FALSE), ]
# Split the sample into a training set and a test set (80/20)
inTraining <- createDataPartition(trainingdata4$FLOOR, p = .8, list = FALSE)
trainSet4 <- trainingdata4[inTraining, ]
testSet4 <- trainingdata4[-inTraining, ]
# Train the model (5-fold cross-validation, 1 repeat)
mod_knn4 <- train(FLOOR ~ .,
                  data = trainSet4 %>%
                    select(starts_with("WAP"), FLOOR),
                  method = "knn",
                  trControl = trainControl(method = "repeatedcv",
                                           number = 5,
                                           repeats = 1))
# Training results (cross-validation)
mod_knn4
# Test set results
pred_knn4_test <- predict(mod_knn4, newdata = testSet4)
postResample(pred_knn4_test, testSet4$FLOOR)
# Validation set results
pred_knn4_validation <- predict(mod_knn4, newdata = valid3)
postResample(pred_knn4_validation, valid3$FLOOR)
confusionMatrix(pred_knn4_validation, valid3$FLOOR)
C5.0
# Subset the data by taking a random sample (n = 5000)
set.seed(212)
trainingdata4 <- train3[sample(1:nrow(train3), 5000, replace = FALSE), ]
# Split the sample into a training set and a test set (80/20)
inTraining <- createDataPartition(trainingdata4$FLOOR, p = .8, list = FALSE)
trainSet4 <- trainingdata4[inTraining, ]
testSet4 <- trainingdata4[-inTraining, ]
# Train the model (5-fold cross-validation, 1 repeat)
mod_C5_4 <- train(FLOOR ~ .,
                  data = trainSet4 %>%
                    select(starts_with("WAP"), FLOOR),
                  method = "C5.0",
                  trControl = trainControl(method = "repeatedcv",
                                           number = 5,
                                           repeats = 1))
# Training results (cross-validation)
mod_C5_4
# Test set results
pred_C5_4_test <- predict(mod_C5_4, newdata = testSet4)
postResample(pred_C5_4_test, testSet4$FLOOR)
# Validation set results
pred_C5_4_validation <- predict(mod_C5_4, newdata = valid3)
postResample(pred_C5_4_validation, valid3$FLOOR)
confusionMatrix(pred_C5_4_validation, valid3$FLOOR)
Random forest
# Subset the data by taking a random sample (n = 5000)
set.seed(212)
trainingdata4 <- train3[sample(1:nrow(train3), 5000, replace = FALSE), ]
# Split the sample into a training set and a test set (80/20)
inTraining <- createDataPartition(trainingdata4$FLOOR, p = .8, list = FALSE)
trainSet4 <- trainingdata4[inTraining, ]
testSet4 <- trainingdata4[-inTraining, ]
# Train the model (5-fold cross-validation, 1 repeat)
mod_rf_4 <- train(FLOOR ~ .,
                  data = trainSet4 %>%
                    select(starts_with("WAP"), FLOOR),
                  method = "rf",
                  trControl = trainControl(method = "repeatedcv",
                                           number = 5,
                                           repeats = 1))
# Training results (cross-validation)
mod_rf_4
# Test set results
pred_rf_4_test <- predict(mod_rf_4, newdata = testSet4)
postResample(pred_rf_4_test, testSet4$FLOOR)
# Validation set results
pred_rf_4_validation <- predict(mod_rf_4, newdata = valid3)
postResample(pred_rf_4_validation, valid3$FLOOR)
confusionMatrix(pred_rf_4_validation, valid3$FLOOR)
Plotting the errors (kNN) by LATITUDE and LONGITUDE.
library(writexl)
val_error <- read_excel("C:/Users/Y.S. Kim/Desktop/Ubiqum/Wifi/Data/validationset with error column floor.xlsx")
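The error_knn column in that file flags the validation rows where the kNN prediction differs from the true FLOOR; assuming the objects from Modeling (4) above, it could also be derived directly in R without the Excel round-trip:

# Flag validation rows misclassified by the final kNN model (hypothetical
# reconstruction of the error column read from the spreadsheet above)
val_error <- valid %>%
  mutate(error_knn = as.factor(pred_knn4_validation != FLOOR))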
ggplot(data = val_error) +
  geom_jitter(aes(x = LONGITUDE, y = LATITUDE, col = error_knn))
ggplot(data = val_error) +
  geom_jitter(aes(x = LONGITUDE, y = LATITUDE, col = error_knn)) +
  facet_wrap(~error_knn)
Accuracy and kappa
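The summary table compiles the accuracy and kappa values that postResample() printed after each model above; a sketch of how the validation results can be gathered into a single table (object names as defined in the modeling sections) is:

# Collect validation-set accuracy and kappa for all twelve models
validation_results <- rbind(
  knn_1 = postResample(pred_knn1_validation, valid$FLOOR),
  C5_1  = postResample(pred_C5_1_validation, valid$FLOOR),
  rf_1  = postResample(pred_rf_1_validation, valid$FLOOR),
  knn_2 = postResample(pred_knn2_validation, valid_rs$FLOOR),
  C5_2  = postResample(pred_C5_2_validation, valid_rs$FLOOR),
  rf_2  = postResample(pred_rf_2_validation, valid_rs$FLOOR),
  knn_3 = postResample(pred_knn3_validation, valid_rs_nzv$FLOOR),
  C5_3  = postResample(pred_C5_3_validation, valid_rs_nzv$FLOOR),
  rf_3  = postResample(pred_rf_3_validation, valid_rs_nzv$FLOOR),
  knn_4 = postResample(pred_knn4_validation, valid3$FLOOR),
  C5_4  = postResample(pred_C5_4_validation, valid3$FLOOR),
  rf_4  = postResample(pred_rf_4_validation, valid3$FLOOR)
)
validation_results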