Background

Our client is developing a system to be deployed on large industrial campuses, in shopping malls, etc., to help people navigate a complex, unfamiliar interior space without getting lost. While GPS works fairly reliably outdoors, it generally doesn't work indoors, so a different technology is necessary. Our client would like us to investigate the feasibility of using "wifi fingerprinting" to determine a person's location in indoor spaces. Wifi fingerprinting uses the signals from multiple wifi hotspots within the building to determine location, analogously to how GPS uses satellite signals.

Objective

The objective of this task is to evaluate the application of machine learning techniques to the problem of indoor locationing via wifi fingerprinting. For this task we predicted FLOOR from the WAP signals (a classification problem). Three machine learning techniques were selected: 1) k-NN, 2) C5.0 and 3) Random forest. The results are summarized in a table at the bottom of this report.

Initial exploration of dataset

The dataset contains the following attributes:
  • WAP001-WAP520: received signal strength (dBm) for each of the 520 wireless access points, with +100 denoting that no signal was detected
  • LONGITUDE and LATITUDE: coordinates of the observation
  • FLOOR: floor inside the building (the outcome variable in this task)
  • BUILDINGID, SPACEID and RELATIVEPOSITION: where on the campus the observation was captured
  • USERID and PHONEID: which user and phone captured the observation
  • TIMESTAMP: when the observation was captured

Loading the required packages
library(tidyverse)   # includes dplyr, tidyr, ggplot2 and readr (read_csv)
library(caret)
library(readxl)
library(rmarkdown)
Loading the datasets “trainingData” and “validationData”
training <- read_csv("C:/Users/Y.S. Kim/Desktop/Ubiqum/Wifi/Data/trainingData.csv")
valid <- read_csv("C:/Users/Y.S. Kim/Desktop/Ubiqum/Wifi/Data/validationData.csv")

No pre-processing (1)

First, the machine learning algorithms were trained without any pre-processing of the data. All 520 WAP signal columns were used as predictors, and FLOOR was used as the outcome variable after converting it to a factor. There were no missing values.
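
The conversion of FLOOR to a factor is mentioned above but not shown in the original code; a minimal sketch of that assumed step is included here (caret treats a factor outcome as a classification problem):

# assumed step: convert the outcome to a factor in both datasets
training$FLOOR <- as.factor(training$FLOOR)
valid$FLOOR <- as.factor(valid$FLOOR)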

Modeling (1)

k-NN

# Subsetting dataset by taking a sample (n=5000)
# seed set before sampling so both the sample and the split are reproducible
set.seed(212)
trainingdata1 <- training[sample(1:nrow(training), 5000, replace = FALSE),]

# Split data into trainset and testset (80/20)
inTraining <- createDataPartition(trainingdata1$FLOOR, p = .8, list = FALSE)
trainSet1 <- trainingdata1[inTraining,]
testSet1 <- trainingdata1[-inTraining,]

#Train model
mod_knn1 <- train(FLOOR~., 
                  data = trainSet1 %>%
                    select(starts_with("WAP"), FLOOR),
                  method = "knn",
                  trControl = trainControl(method = "repeatedcv", 
                                           number = 5, 
                                           repeats = 1))


#training results (cross-validation)
mod_knn1

#test results
pred_knn1_test <- predict(mod_knn1, newdata = testSet1)
# caret expects predictions first: postResample(pred, obs)
postResample(pred_knn1_test, testSet1$FLOOR)

#validation results
pred_knn1_validation <- predict(mod_knn1, newdata = valid)
postResample(pred_knn1_validation, valid$FLOOR)
# likewise confusionMatrix(data = predictions, reference = ground truth)
confusionMatrix(data = pred_knn1_validation, reference = as.factor(valid$FLOOR))

C5.0

# Subsetting dataset by taking a sample (n=5000)
set.seed(212)
trainingdata1 <- training[sample(1:nrow(training), 5000, replace = FALSE),]

# Split data into trainset and testset (80/20)
inTraining <- createDataPartition(trainingdata1$FLOOR, p = .8, list = FALSE)
trainSet1 <- trainingdata1[inTraining,]
testSet1 <- trainingdata1[-inTraining,]

#Train model
mod_C5_1 <- train(FLOOR~., 
                  data = trainSet1 %>%
                    select(starts_with("WAP"), FLOOR),
                  method = "C5.0",
                  trControl = trainControl(method = "repeatedcv", 
                                           number = 5, 
                                           repeats = 1))

#training results (cross-validation)
mod_C5_1

#test results
pred_C5_1_test <- predict(mod_C5_1, newdata = testSet1)
postResample(pred_C5_1_test, testSet1$FLOOR)

#validation results
pred_C5_1_validation <- predict(mod_C5_1, newdata = valid)
postResample(pred_C5_1_validation, valid$FLOOR)
confusionMatrix(data = pred_C5_1_validation, reference = as.factor(valid$FLOOR))

Random forest

# Subsetting dataset by taking a sample (n=5000)
set.seed(212)
trainingdata1 <- training[sample(1:nrow(training), 5000, replace = FALSE),]

# Split data into trainset and testset (80/20)
inTraining <- createDataPartition(trainingdata1$FLOOR, p = .8, list = FALSE)
trainSet1 <- trainingdata1[inTraining,]
testSet1 <- trainingdata1[-inTraining,]

#Train model
mod_rf_1 <- train(FLOOR~., 
                  data = trainSet1 %>%
                    select(starts_with("WAP"), FLOOR),
                  method = "rf",
                  trControl = trainControl(method = "repeatedcv", 
                                           number = 5, 
                                           repeats = 1))

#training results (cross-validation)
mod_rf_1

#test results
pred_rf_1_test <- predict(mod_rf_1, newdata = testSet1)
postResample(pred_rf_1_test, testSet1$FLOOR)

#validation results
pred_rf_1_validation <- predict(mod_rf_1, newdata = valid)
postResample(pred_rf_1_validation, valid$FLOOR)
confusionMatrix(data = pred_rf_1_validation, reference = as.factor(valid$FLOOR))

Pre-processing (2)

The following steps were applied:
  • Exploring the data with a histogram of the WAP signals
  • Exploring outliers
  • Removing outliers from the training set
  • Rescaling the WAP values

Exploration of the data

A histogram was created to visualize the distribution of the signal strength across all 520 WAPs in the whole training dataset.
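
A minimal sketch of how such a histogram can be produced (the rendered figure is not reproduced here); the no-signal placeholder (+100) is excluded so the dBm distribution is visible:

# long format: one row per (observation, WAP) pair
training %>%
  pivot_longer(cols = starts_with("WAP"), names_to = "WAP", values_to = "dBm") %>%
  filter(dBm != 100) %>%   # drop the no-signal placeholder
  ggplot(aes(x = dBm)) +
  geom_histogram(binwidth = 5)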

Zooming in on outliers (-30 to 0 dBm)

In the histogram above we can see some outliers in the range -30 to 0 dBm. Filtering to this range gives the histogram below.
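
A sketch of the zoomed-in view, filtering the same long-format data to the -30 to 0 dBm range:

training %>%
  pivot_longer(cols = starts_with("WAP"), names_to = "WAP", values_to = "dBm") %>%
  filter(between(dBm, -30, 0)) %>%
  ggplot(aes(x = dBm)) +
  geom_histogram(binwidth = 1)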

Removing outliers (-30 to 0 dBm) from the training set

Since these outliers may distort the results of the analysis, all observations containing values in this range (-30 to 0 dBm) were removed from the training dataset.

#filter out high values -30 to 0 dBm
outlier <- training %>%
  rownames_to_column(var = "id") %>%
  pivot_longer(
    cols = starts_with("WAP"),
    names_to = "WAP",
    values_to = "values"
  ) %>%
  filter(between(values, -30, 0))

# drop every observation containing at least one value in that range
training_no_outlier <- training %>%
  rownames_to_column(var = "id") %>%
  filter(!(id %in% outlier$id))

Rescaling WAP values in both the training and validation sets

The WAP values were rescaled so that no detected signal equals 0 and the strongest signal has the highest value: the no-signal placeholder (+100) was first recoded to -105 dBm, and then 105 was added to all WAP values, mapping the signals to a 0-105 scale. For example, a reading of -60 dBm becomes 45, while no signal (100 -> -105) becomes 0.

# Rescale to 0-105 --------------------------------------------------------
#training without outliers
training_no_outlier_rs <- training_no_outlier
training_no_outlier_rs$id <- NULL
# recode the no-signal placeholder (+100) to -105 dBm and shift by +105,
# restricted to the WAP columns so the metadata columns stay untouched
training_no_outlier_rs <- training_no_outlier_rs %>%
  mutate(across(starts_with("WAP"), ~ replace(.x, .x == 100, -105) + 105))

#validation
valid_rs <- valid %>%
  mutate(across(starts_with("WAP"), ~ replace(.x, .x == 100, -105) + 105))

Modeling (2)

k-NN

# Subsetting dataset by taking a sample (n=5000)
set.seed(212)
trainingdata2 <- training_no_outlier_rs[sample(1:nrow(training_no_outlier_rs), 5000, replace = FALSE),]

# Split data into trainset and testset (80/20)
inTraining <- createDataPartition(trainingdata2$FLOOR, p = .8, list = FALSE)
trainSet2 <- trainingdata2[inTraining,]
testSet2 <- trainingdata2[-inTraining,]

#Train model
mod_knn2 <- train(FLOOR~., 
                  data = trainSet2 %>%
                    select(starts_with("WAP"), FLOOR),
                  method = "knn",
                  trControl = trainControl(method = "repeatedcv", 
                                           number = 5, 
                                           repeats = 1))

#training results (cross-validation)
mod_knn2

#test results
pred_knn2_test <- predict(mod_knn2, newdata = testSet2)
postResample(pred_knn2_test, testSet2$FLOOR)

#validation results
pred_knn2_validation <- predict(mod_knn2, newdata = valid_rs)
postResample(pred_knn2_validation, valid_rs$FLOOR)
confusionMatrix(data = pred_knn2_validation, reference = as.factor(valid_rs$FLOOR))

C5.0

# Subsetting dataset by taking a sample (n=5000)
set.seed(212)
trainingdata2 <- training_no_outlier_rs[sample(1:nrow(training_no_outlier_rs), 5000, replace = FALSE),]

# Split data into trainset and testset (80/20)
inTraining <- createDataPartition(trainingdata2$FLOOR, p = .8, list = FALSE)
trainSet2 <- trainingdata2[inTraining,]
testSet2 <- trainingdata2[-inTraining,]

#Train model
mod_C5_2 <- train(FLOOR~., 
                  data = trainSet2 %>%
                    select(starts_with("WAP"), FLOOR),
                  method = "C5.0",
                  trControl = trainControl(method = "repeatedcv", 
                                           number = 5, 
                                           repeats = 1))

#training results (cross-validation)
mod_C5_2

#test results
pred_C5_2_test <- predict(mod_C5_2, newdata = testSet2)
postResample(pred_C5_2_test, testSet2$FLOOR)

#validation results
pred_C5_2_validation <- predict(mod_C5_2, newdata = valid_rs)
postResample(pred_C5_2_validation, valid_rs$FLOOR)
confusionMatrix(data = pred_C5_2_validation, reference = as.factor(valid_rs$FLOOR))

Random forest

# Subsetting dataset by taking a sample (n=5000)
set.seed(212)
trainingdata2 <- training_no_outlier_rs[sample(1:nrow(training_no_outlier_rs), 5000, replace = FALSE),]

# Split data into trainset and testset (80/20)
inTraining <- createDataPartition(trainingdata2$FLOOR, p = .8, list = FALSE)
trainSet2 <- trainingdata2[inTraining,]
testSet2 <- trainingdata2[-inTraining,]

#Train model
mod_rf_2 <- train(FLOOR~., 
                  data = trainSet2 %>%
                    select(starts_with("WAP"), FLOOR),
                  method = "rf",
                  trControl = trainControl(method = "repeatedcv", 
                                           number = 5, 
                                           repeats = 1))

#training results (cross-validation)
mod_rf_2

#test results
pred_rf_2_test <- predict(mod_rf_2, newdata = testSet2)
postResample(pred_rf_2_test, testSet2$FLOOR)

#validation results
pred_rf_2_validation <- predict(mod_rf_2, newdata = valid_rs)
postResample(pred_rf_2_validation, valid_rs$FLOOR)
confusionMatrix(data = pred_rf_2_validation, reference = as.factor(valid_rs$FLOOR))

Pre-processing (3)

The following steps were applied:
  • Removing columns with zero variance
  • Removing rows with zero variance
  • Removing duplicates

Removing columns with zero variance in both the training and validation sets

WAP columns with zero variance show no change in signal strength across the whole dataset. Because these WAPs are not informative and cannot contribute to the model as predictors, these attributes were removed.

training_factorcolumns <- training_no_outlier_rs %>%
  select(-starts_with("WAP"))

training_no_outlier_rs_nzv <- training_no_outlier_rs %>%
  select(starts_with("WAP")) %>%
  select_if(function(x) var(x) != 0)
training_no_outlier_rs_nzv <- cbind(training_no_outlier_rs_nzv, training_factorcolumns)

# record the non-zero-variance columns kept on training
relevant_columns <- names(training_no_outlier_rs_nzv)

# select the same columns on validation
valid_rs_nzv <- valid_rs %>%
  select(all_of(relevant_columns))

Removing rows with zero variance in the training set

Rows with zero variance are not useful either: they indicate that the phone detected no signal from any WAP. These observations were removed accordingly.

trainvar <- training_no_outlier_rs_nzv

#training: the remaining WAP columns (464 after removing zero-variance WAPs)
waps_training <- trainvar %>% select(starts_with("WAP"))

# keep only rows whose WAP values vary (this avoids the empty-index
# edge case of -which() when no rows match)
trainvar <- trainvar[apply(waps_training, 1, var) != 0, ]

Removing duplicates in the training set

dup_tr <- duplicated(trainvar %>% select(starts_with("WAP")))
duplicates_tr <- trainvar[dup_tr,]
noduplicates_tr <- trainvar[!dup_tr,]

Modeling (3)

k-NN

# Subsetting dataset by taking a sample (n=5000)
set.seed(212)
trainingdata3 <- noduplicates_tr[sample(1:nrow(noduplicates_tr), 5000, replace = FALSE),]

# Split data into trainset and testset (80/20)
inTraining <- createDataPartition(trainingdata3$FLOOR, p = .8, list = FALSE)
trainSet3 <- trainingdata3[inTraining,]
testSet3 <- trainingdata3[-inTraining,]

#Train model
mod_knn3 <- train(FLOOR~., 
                  data = trainSet3 %>%
                    select(starts_with("WAP"), FLOOR),
                  method = "knn",
                  trControl = trainControl(method = "repeatedcv", 
                                           number = 5, 
                                           repeats = 1))
#training results (cross-validation)
mod_knn3

#test results
pred_knn3_test <- predict(mod_knn3, newdata = testSet3)
postResample(pred_knn3_test, testSet3$FLOOR)

#validation results
pred_knn3_validation <- predict(mod_knn3, newdata = valid_rs_nzv)
postResample(pred_knn3_validation, valid_rs_nzv$FLOOR)
confusionMatrix(data = pred_knn3_validation, reference = as.factor(valid_rs_nzv$FLOOR))

C5.0

# Subsetting dataset by taking a sample (n=5000)
set.seed(212)
trainingdata3 <- noduplicates_tr[sample(1:nrow(noduplicates_tr), 5000, replace = FALSE),]

# Split data into trainset and testset (80/20)
inTraining <- createDataPartition(trainingdata3$FLOOR, p = .8, list = FALSE)
trainSet3 <- trainingdata3[inTraining,]
testSet3 <- trainingdata3[-inTraining,]

#Train model
mod_C5_3 <- train(FLOOR~., 
                  data = trainSet3 %>%
                    select(starts_with("WAP"), FLOOR),
                  method = "C5.0",
                  trControl = trainControl(method = "repeatedcv", 
                                           number = 5, 
                                           repeats = 1))
#training results (cross-validation)
mod_C5_3

#test results
pred_C5_3_test <- predict(mod_C5_3, newdata = testSet3)
postResample(pred_C5_3_test, testSet3$FLOOR)

#validation results
pred_C5_3_validation <- predict(mod_C5_3, newdata = valid_rs_nzv)
postResample(pred_C5_3_validation, valid_rs_nzv$FLOOR)
confusionMatrix(data = pred_C5_3_validation, reference = as.factor(valid_rs_nzv$FLOOR))

Random forest

# Subsetting dataset by taking a sample (n=5000)
set.seed(212)
trainingdata3 <- noduplicates_tr[sample(1:nrow(noduplicates_tr), 5000, replace = FALSE),]

# Split data into trainset and testset (80/20)
inTraining <- createDataPartition(trainingdata3$FLOOR, p = .8, list = FALSE)
trainSet3 <- trainingdata3[inTraining,]
testSet3 <- trainingdata3[-inTraining,]

#Train model
mod_rf_3 <- train(FLOOR~., 
                  data = trainSet3 %>%
                    select(starts_with("WAP"), FLOOR),
                  method = "rf",
                  trControl = trainControl(method = "repeatedcv", 
                                           number = 5, 
                                           repeats = 1))
#training results (cross-validation)
mod_rf_3

#test results
pred_rf_3_test <- predict(mod_rf_3, newdata = testSet3)
postResample(pred_rf_3_test, testSet3$FLOOR)

#validation results
pred_rf_3_validation <- predict(mod_rf_3, newdata = valid_rs_nzv)
postResample(pred_rf_3_validation, valid_rs_nzv$FLOOR)
confusionMatrix(data = pred_rf_3_validation, reference = as.factor(valid_rs_nzv$FLOOR))

Pre-processing (4)

The following steps were applied:
  • Plotting the distribution of the WAP values (violin plots)
  • Removing USERID 13
  • Normalization to 0 and 1 (min-max)

Plotting the distribution of the WAP signals

The distribution of the WAPs of the whole training dataset after pre-processing.

In order to evaluate whether the signal distributions are similar across different FLOORs, PHONEIDs and USERIDs, violin plots were created:

Plotting distribution of WAP values by FLOOR

Plotting distribution of WAP values by USERID

Plotting distribution of WAP values by PHONEID
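
The rendered violin plots are not reproduced here; a minimal sketch of how they can be built (shown for PHONEID, the FLOOR and USERID versions are analogous):

noduplicates_tr %>%
  pivot_longer(cols = starts_with("WAP"), names_to = "WAP", values_to = "signal") %>%
  filter(signal != 0) %>%   # drop no-signal values
  ggplot(aes(x = as.factor(PHONEID), y = signal)) +
  geom_violin()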

Removing USERID 13 (PHONEID 17) from the training dataset

The violin plots by USERID and PHONEID show that USERID 13 (= PHONEID 17) has a markedly different WAP distribution compared with the other users/phones. All observations of USERID 13/PHONEID 17 were therefore removed.

noduplicates_tr <- noduplicates_tr %>% 
  filter(PHONEID != 17)

Normalization to 0 and 1 (min-max) within rows

The violin plots also show that the PHONEIDs have different signal ranges. Min-max normalization was applied within each row (observation) in order to put all phones on the same scale (0 to 1).

# train
nonnumeric_tr <- noduplicates_tr %>%
  select(LATITUDE:PHONEID)

# min-max normalization of one row; guarded so an all-equal row
# (e.g. all zeros) returns zeros instead of dividing by zero
norm_min_max <- function(row) {
  rng <- max(row) - min(row)
  if (rng == 0) return(row * 0)
  (row - min(row)) / rng
}

train2 <- noduplicates_tr %>%
  select(starts_with("WAP")) %>%
  apply(1, norm_min_max) %>%
  t() %>%
  as_tibble()
train3 <- cbind(train2, nonnumeric_tr)

# validation (reusing norm_min_max defined above)
nonnumeric_val <- valid_rs_nzv %>%
  select(LATITUDE:PHONEID)

valid2 <- valid_rs_nzv %>%
  select(starts_with("WAP")) %>%
  apply(1, norm_min_max) %>%
  t() %>%
  as_tibble()
valid3 <- cbind(valid2, nonnumeric_val)

Modeling (4)

k-NN

# Subsetting dataset by taking a sample (n=5000)
set.seed(212)
trainingdata4 <- train3[sample(1:nrow(train3), 5000, replace = FALSE),]

# Split data into trainset and testset (80/20)
inTraining <- createDataPartition(trainingdata4$FLOOR, p = .8, list = FALSE)
trainSet4 <- trainingdata4[inTraining,]
testSet4 <- trainingdata4[-inTraining,]

#Train model
mod_knn4 <- train(FLOOR~., 
                  data = trainSet4 %>%
                    select(starts_with("WAP"), FLOOR),
                  method = "knn",
                  trControl = trainControl(method = "repeatedcv", 
                                           number = 5, 
                                           repeats = 1))
#training results (cross-validation)
mod_knn4

#test results
pred_knn4_test <- predict(mod_knn4, newdata = testSet4)
postResample(pred_knn4_test, testSet4$FLOOR)

#validation results
pred_knn4_validation <- predict(mod_knn4, newdata = valid3)
postResample(pred_knn4_validation, valid3$FLOOR)
confusionMatrix(data = pred_knn4_validation, reference = as.factor(valid3$FLOOR))

C5.0

# Subsetting dataset by taking a sample (n=5000)
set.seed(212)
trainingdata4 <- train3[sample(1:nrow(train3), 5000, replace = FALSE),]

# Split data into trainset and testset (80/20)
inTraining <- createDataPartition(trainingdata4$FLOOR, p = .8, list = FALSE)
trainSet4 <- trainingdata4[inTraining,]
testSet4 <- trainingdata4[-inTraining,]

#Train model
mod_C5_4 <- train(FLOOR~., 
                  data = trainSet4 %>%
                    select(starts_with("WAP"), FLOOR),
                  method = "C5.0",
                  trControl = trainControl(method = "repeatedcv", 
                                           number = 5, 
                                           repeats = 1))
#training results (cross-validation)
mod_C5_4

#test results
pred_C5_4_test <- predict(mod_C5_4, newdata = testSet4)
postResample(pred_C5_4_test, testSet4$FLOOR)

#validation results
pred_C5_4_validation <- predict(mod_C5_4, newdata = valid3)
postResample(pred_C5_4_validation, valid3$FLOOR)
confusionMatrix(data = pred_C5_4_validation, reference = as.factor(valid3$FLOOR))

Random forest

# Subsetting dataset by taking a sample (n=5000)
set.seed(212)
trainingdata4 <- train3[sample(1:nrow(train3), 5000, replace = FALSE),]

# Split data into trainset and testset (80/20)
inTraining <- createDataPartition(trainingdata4$FLOOR, p = .8, list = FALSE)
trainSet4 <- trainingdata4[inTraining,]
testSet4 <- trainingdata4[-inTraining,]

#Train model
mod_rf_4 <- train(FLOOR~., 
                  data = trainSet4 %>%
                    select(starts_with("WAP"), FLOOR),
                  method = "rf",
                  trControl = trainControl(method = "repeatedcv", 
                                           number = 5, 
                                           repeats = 1))
#training results (cross-validation)
mod_rf_4

#test results
pred_rf_4_test <- predict(mod_rf_4, newdata = testSet4)
postResample(pred_rf_4_test, testSet4$FLOOR)

#validation results
pred_rf_4_validation <- predict(mod_rf_4, newdata = valid3)
postResample(pred_rf_4_validation, valid3$FLOOR)
confusionMatrix(data = pred_rf_4_validation, reference = as.factor(valid3$FLOOR))

Plotting the errors (kNN) by LATITUDE and LONGITUDE

# read_excel() comes from readxl, loaded at the top of this report
val_error <- read_excel("C:/Users/Y.S. Kim/Desktop/Ubiqum/Wifi/Data/validationset with error column floor.xlsx")
ggplot(data = val_error) +
  geom_jitter(aes(x = LONGITUDE, y = LATITUDE, col = error_knn))

ggplot(data = val_error) +
  geom_jitter(aes(x = LONGITUDE, y = LATITUDE, col = error_knn)) +
  facet_wrap(~error_knn)
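
The error flag itself was prepared in Excel. For reference, a sketch of how an equivalent column could be derived directly in R (assuming the kNN predictions pred_knn4_validation from above; valid_rs_nzv is used because valid3 no longer carries LONGITUDE):

# TRUE where the predicted floor differs from the actual floor
val_error_r <- valid_rs_nzv %>%
  select(LONGITUDE, LATITUDE, FLOOR) %>%
  mutate(error_knn = as.factor(FLOOR) != pred_knn4_validation)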

Accuracy and kappa
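
The Accuracy and kappa values come from the postResample() calls above. A hypothetical helper (collect_metrics is not part of the original code) shows how the summary table can be assembled without hard-coding any results:

# gather Accuracy/Kappa for one model into a one-row tibble
collect_metrics <- function(pred, obs, label) {
  m <- postResample(pred, obs)
  tibble(model = label, Accuracy = m["Accuracy"], Kappa = m["Kappa"])
}

# example: the three models trained on pre-processing (4)
bind_rows(
  collect_metrics(pred_knn4_validation, as.factor(valid3$FLOOR), "kNN (4)"),
  collect_metrics(pred_C5_4_validation, as.factor(valid3$FLOOR), "C5.0 (4)"),
  collect_metrics(pred_rf_4_validation, as.factor(valid3$FLOOR), "RF (4)")
)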