Many real world applications need to know the localization of a user in the world to provide their services. Outdoor localization problem can be solved very accurately thanks to the inclusion of GPS sensors into the mobile devices. However, indoor localization is still an open problem mainly due to the loss of GPS signal in indoor environments. For this reason, we evaluated the application of machine learning techniques to this problem, replacing the GPS signal with WAPS signal.
For this task, we followed the next steps:
You can see the complete code we used in the next URL https://github.com/Nell87/WifiPositioning
After a thorough analysis, we concluded that WAPs signal can be a good resource for positioning indoor. We have the next conclusions:
WAPS: we have 520 different WAPS with signals from -104 to 0.
We had an accuracy=1 predicting bulding, an accuracy=0.918 predictin floor and the next performance predicting longitude/latitude:
Here you can obtain all the code used for this task.
1. Includes
#Load Libraries: p_load can install, load, and update packages
if(require("pacman")=="FALSE"){
install.packages("pacman")
}
pacman::p_load(gdata, caret, highcharter, doParallel,reshape2, ggplot2,ggpubr,ggfortify,
RColorBrewer, tidyr, dplyr, lubridate, ggfortify, stringr, shiny, forecast,
tidyquant,fpp2,parallel, foreach,progress,randomForest,mlr, plotly,ggfortify)
# Load Data
setwd("C:/SARA/Ubiqum/Section3/Task3")
#setwd("C:/Users/Patty/OneDrive/Documents/Ubiqum/Task 6/Input")
DataTrain <- read.csv2("trainingData.csv", header=TRUE, sep=",", stringsAsFactors=FALSE, na.strings=c("NA", "-", "?"))
DataValid<-read.csv2("validationData.csv", header=TRUE, sep=",", stringsAsFactors=FALSE, na.strings=c("NA", "-", "?"))
2. Data Cleaning
##### 2.1. Remove repeated observations ##### -------------------------
# Remove repeated rows in DataTrain
DataTrain<-distinct(DataTrain) #<- 19937 to 19300
# Remove repeated rows in DataValidation
DataValid<-distinct(DataValid) #<- No repeated rows!!
##### 2.2. Transformations #####-------------------------
# Combine datasets for speeding-up the transformations and add ID
Data_Full<-gdata::combine(DataTrain, DataValid)
Data_Full <- Data_Full %>% mutate(ID = row_number())
# Transform some variables to factor/numeric/datetime
factors<-c("FLOOR", "BUILDINGID", "SPACEID", "RELATIVEPOSITION", "USERID", "PHONEID", "source")
Data_Full[,factors] <-lapply(Data_Full[,factors], as.factor)
rm(factors)
numeric<-c("LONGITUDE", "LATITUDE")
Data_Full[,numeric]<-lapply(Data_Full[,numeric], as.numeric)
rm(numeric)
Data_Full$TIMESTAMP <- as_datetime(Data_Full$TIMESTAMP,
origin = "1970-01-01", tz="UTC")
# Change value of WAPS= 100 to WAPS=-110
WAPS<-grep("WAP", names(Data_Full), value=T)
Data_Full[,WAPS] <- sapply(Data_Full[,WAPS],function(x) ifelse(x==100,-110,x))
##### 2.3. Select Relevant WAPS ##### -------------------------
WAPS_VarTrain<-nearZeroVar(Data_Full[Data_Full$source=="DataTrain",WAPS], saveMetrics=TRUE)
WAPS_VarValid<-nearZeroVar(Data_Full[Data_Full$source=="DataValid",WAPS], saveMetrics=TRUE)
Data_Full<-Data_Full[-which(WAPS_VarTrain$zeroVar==TRUE |
WAPS_VarValid$zeroVar==TRUE)] # 529 -> 322 variables
rm(WAPS_VarTrain, WAPS_VarValid)
##### 2.4. Remove rows with no variance ##### -------------------------
# New WAPS
WAPS<-grep("WAP", names(Data_Full), value=T)
# Filter Rows with all RSSI = -110 <- From 21048 to 20972 rows
Data_Full <- Data_Full %>%
filter(apply(Data_Full[WAPS], 1, function(x)length(unique(x)))>1)
3. Feature Engineering
##### 3.1. Add HighWAP and HighRSSI ##### -------------------------
# Craete new variables HighWAP and HighRSSI
Data_Full<-Data_Full %>%
mutate(HighWAP=NA, HighRSSI=NA)
Data_Full<-Data_Full %>%
mutate(HighWAP=colnames(Data_Full[WAPS])[apply(Data_Full[WAPS],1,which.max)])
Data_Full<-Data_Full %>%
mutate(HighRSSI=apply(Data_Full[WAPS], 1, max))
# Transform to factor
Data_Full$HighWAP<-as.factor(Data_Full$HighWAP)
Data_Full$BUILDINGID<-as.factor(Data_Full$BUILDINGID)
# Remove from "max_WAP" those WAPs that never provided the best signal to any of the observations
# of the training set
outersect <- function(x, y) {
x[!x%in%y]
}
remove_WAPs<-unique(
outersect(Data_Full$HighWAP[Data_Full$source=="DataValid"],
Data_Full$HighWAP[Data_Full$source=="DataTrain"])
)
remove_WAPs
#remove WAPs 268 & 323
Data_Full<-Data_Full[-which(Data_Full$HighWAP %in% remove_WAPs),]
##### 3.2. Add a variable combining Build + Floor ##### -------------------------
# Add variable Build_Floor
Data_Full$Build_floorID<-as.factor(group_indices(Data_Full, BUILDINGID, FLOOR))
unique(Data_Full$Build_floorID) # <- 13 different floors
check_ID<-Data_Full%>%
arrange(BUILDINGID, FLOOR)%>%
distinct(BUILDINGID, FLOOR, Build_floorID)%>%
select(BUILDINGID, FLOOR, Build_floorID) #sIDs assigned sequentially. OK!!
rm(check_ID)
4. Explore Strange Things
# Are there same HighWAP in different Buildings?
WAPS_Recoloc<-Data_Full %>%
select(HighWAP, BUILDINGID, source) %>%
distinct(HighWAP, BUILDINGID, source)
RepWAPS<-WAPS_Recoloc %>% distinct(HighWAP, BUILDINGID)
RepWAPS<-sort(RepWAPS$HighWAP[duplicated(RepWAPS$HighWAP)])
RepWAPS # WAP 248 is highwap in the 3 buildings!!
# Examine WAP 248
WAP248<-Data_Full[Data_Full$HighWAP=="WAP248",313:326]
plot(LATITUDE ~ LONGITUDE, data = Data_Full, pch = 20, col = "grey")
points(LATITUDE ~ LONGITUDE, data=WAP248[WAP248$source=="DataTrain",], pch=20, col="blue")
points(LATITUDE ~ LONGITUDE, data=WAP248[WAP248$source=="DataValid",], pch=20, col="red")
5. Preparing for modelling
# Split Data before modeling
Data_FullSplit<-split(Data_Full, Data_Full$source)
list2env(Data_FullSplit, envir=.GlobalEnv)
rm(Data_FullSplit)
# Create a DataValidOriginal for analyzing performance
DataValidOrig<-DataValid
# Prepare Parallel Process
cluster <- makeCluster(detectCores() - 1) # convention to leave 1 core for OS
registerDoParallel(cluster)
# 10 fold cross validation
fitControl <- trainControl(method = "repeatedcv", number = 10, repeats = 3, allowParallel = TRUE)
6. Predicting Building
# Train dt model <-- 52.03 sec elapsed Accuracy:1 Kappa: 1
# system.time(Building_SVM01<-caret::train(BUILDINGID~HighWAP, data= DataTrain, method="svmLinear",
# trControl=fitControl))
# save(Building_SVM01, file = "Building_SVM01.rda")
load("Building_SVM01.rda")
PredictorsBuild<-predict(Building_SVM01, DataValidOrig)
ConfusionMatrix<-confusionMatrix(PredictorsBuild, DataValid$BUILDINGID) # Accuracy:1 Kappa:1
ConfusionMatrix
rm(ConfusionMatrix)
# Add building Predictions in DataValidation
Data_Full$BUILDINGID[Data_Full$source=="DataValid"]<-PredictorsBuild
7. Predicting longitude per building (1 model per building)
# Split DataTrain per Building
Buildings<-split(DataTrain, DataTrain$BUILDINGID)
names(Buildings)<-c("BUILDING0", "BUILDING1", "BUILDING2")
list2env(Buildings, envir = .GlobalEnv)
##### 7.1. Random Forest #####
# Random Forest -B0- <- # RMSE 7.01 R 0.932 MAE 4.64 TIME 52
# bestmtry_LON_BO_RF<-tuneRF(BUILDING0[WAPS], BUILDING0$LONGITUDE, ntreeTry=100, stepFactor=2,
# improve=0.05,trace=TRUE, plot=T)
# system.time(LON_BO_RF<-randomForest(LONGITUDE~. -LATITUDE -FLOOR -SPACEID -RELATIVEPOSITION -USERID -PHONEID
# -TIMESTAMP -source -HighWAP -HighRSSI -Build_floorID -ID,
# data= BUILDING0,
# importance=T,maximize=T,
# method="rf", trControl=fitControl,
# ntree=100, mtry=104,allowParalel=TRUE))
#
#
# save(LON_BO_RF, file = "LON_BO_RF.rda")
load("LON_BO_RF.rda")
predictions_LONBORF<-predict(LON_BO_RF, DataValid[DataValid$BUILDINGID==0, ])
rf_postRes_LONBORF<-postResample(predictions_LONBORF, DataValidOrig$LONGITUDE[DataValidOrig$BUILDINGID==0])
rf_postRes_LONBORF
# Random Forest -B1- <- # RMSE 9.21 R 0.96 MAE 6.59 TIME 42 sec
# bestmtry_LON_B1_RF<-tuneRF(BUILDING1[WAPS], BUILDING1$LONGITUDE, ntreeTry=100, stepFactor=2,
# improve=0.05,trace=TRUE, plot=T)
# system.time(LON_B1_RF<-randomForest(LONGITUDE~. -LATITUDE -FLOOR -SPACEID -RELATIVEPOSITION -USERID
# -PHONEID -TIMESTAMP -source -HighWAP -HighRSSI -Build_floorID -ID,
# data= BUILDING1,
# importance=T,maximize=T,
# method="rf", trControl=fitControl,
# ntree=100, mtry=104,allowParalel=TRUE))
# save(LON_B1_RF, file = "LON_B1_RF.rda")
load("LON_B1_RF.rda")
predictions_LONB1RF<-predict(LON_B1_RF, DataValid[DataValid$BUILDINGID==1, ])
rf_postRes_LONB1RF<-postResample(predictions_LONB1RF, DataValidOrig$LONGITUDE[DataValidOrig$BUILDINGID==1])
rf_postRes_LONB1RF
# Random Forest -B2- <- # RMSE 10.43 R 0.89 MAE 7.14 TIME sec 86
# bestmtry_LON_B2_RF<-tuneRF(BUILDING2[WAPS], BUILDING2$LONGITUDE, ntreeTry=100, stepFactor=2,
# improve=0.05,trace=TRUE, plot=T)
# system.time(LON_B2_RF<-randomForest(LONGITUDE~. -LATITUDE -FLOOR -SPACEID -RELATIVEPOSITION -USERID
# -PHONEID -TIMESTAMP -source -HighWAP -HighRSSI -Build_floorID -ID,
# data= BUILDING2,
# importance=T,maximize=T,
# method="rf", trControl=fitControl,
# ntree=100, mtry=52,allowParalel=TRUE))
# save(LON_B2_RF, file = "LON_B2_RF.rda")
load("LON_B2_RF.rda")
predictions_LONB2RF<-predict(LON_B2_RF, DataValid[DataValid$BUILDINGID==2, ])
rf_postRes_LONB2RF<-postResample(predictions_LONB2RF, DataValidOrig$LONGITUDE[DataValidOrig$BUILDINGID==2])
rf_postRes_LONB2RF
8. Predicting longitude using PCA
# Add dummy variable for BuildingID
DummyVar <- dummyVars("~BUILDINGID", data = Data_Full, fullRank=T)
DummyVarDF <- data.frame(predict(DummyVar, newdata = Data_Full))
Data_Full<-cbind(Data_Full, DummyVarDF)
#store WAP names in a vector. Exclude those variable names that do not have at least one number
#(i.e. "max_WAP")
WAPs<-grep("[[:digit:]]",(grep("WAP", names(Data_Full), value=T)), value=T)#312 WAPs
#PC overview with the caret package
pca1<-preProcess(Data_Full[Data_Full$source=="DataTrain", WAPs],
#78 components capture 80 percent of the variance
method=c("center", "scale", "pca"), thresh=0.8)
#78 components capture 80 percent of the variance
#singular value decomposition
# By default, prcomp already centers the variable to have mean equals to zero
training_WAPs<-Data_Full[Data_Full$source=="DataTrain", WAPs]
nrow(training_WAPs)#19,227 obs
pca2 <- prcomp(training_WAPs, scale. = T)
names(pca2)
pca2$rotation #312 PC loadings
#plot the 78 resulting pc's. Scale = 0 ensures that arrows are scaled to represent the loadings
biplot(pca2[,1:78], scale = 0)#can't see anything!
std_dev <- pca2$sdev
var <- std_dev^2 #eigenvalues
prop_var <- var/sum(var)
summary(pca2)
#Decide how many PC to keep for the modeling stage: use scree plot.
#scree plot
plot(prop_var, xlab = "Principal Component",
ylab = "Proportion of Variance Explained",
type = "b")
#cumulative scree plot
plot(cumsum(prop_var), xlab = "Principal Component",
ylab = "Cumulative Proportion of Variance Explained",
type = "b")
sum(prop_var[1:78])
# loadings (eigenvectors)
# head(pca2$rotation)
# PCs (aka scores)
# head(pca2$x)
#From the graph above we can infer that 78 components capture 80 pct
#of the variance, as already determined in "pca1".
autoplot(pca2, scale=0)
#Change training set WAPs for the 78 principal components
WAPs<-grep("[[:digit:]]",(grep("WAP", names(Data_Full), value=T)), value=T)
train_pca <- data.frame(Data_Full[Data_Full$source=="DataTrain", -which(colnames(Data_Full) %in% WAPs)],
pca2$x[,1:78])
nrow(train_pca)#19,227
#We are interested in the first 78 PCs
nrow(pca2$x)
nrow(train_pca)#19,227 obs
valid_pca <- predict(pca2, newdata = Data_Full[Data_Full$source=="DataValid",])
#select the first 78 components
valid_pca <- valid_pca[, 1:78]
valid_pca <- data.frame(Data_Full[Data_Full$source=="DataValid", -which(colnames(Data_Full) %in% WAPs)],
valid_pca)
all.equal(colnames(train_pca), colnames(valid_pca))#TRUE
##### 8.1. Random Forest ##### RMSE 8.92 R 0.994 MAE 6.04 TIME 183 sec
# system.time(LON_PCA<-randomForest(LONGITUDE~. -LATITUDE -FLOOR -BUILDINGID -SPACEID -RELATIVEPOSITION
# -USERID-PHONEID-TIMESTAMP-source-ID -HighWAP -HighRSSI -Build_floorID,
# data=train_pca,
# method="rf",
# ntree=100,
# trControl=fitControl,
# importance = T,
# maximize =T,
# allowParalel=TRUE))
# save(LON_PCA, file = "LON_PCA.rda")
load("LON_PCA.rda")
predictions_LON_PCA<-predict(LON_PCA, valid_pca)
rf_postRes_LON_PCA<-postResample(predictions_LON_PCA, valid_pca$LONGITUDE)
rf_postRes_LON_PCA
9. Predicting longitude in all buildings (the same model for every building)
# Split Data before modeling
Data_FullSplit<-split(Data_Full, Data_Full$source)
list2env(Data_FullSplit, envir=.GlobalEnv)
rm(Data_FullSplit)
##### 9.1. Random Forest ##### RMSE 8.44 R 0.995 MAE 5.66 TIME 712 sec
# VarInd<-c(WAPS, "BUILDINGID.1", "BUILDINGID.2")
# bestmtry<-tuneRF(DataTrain[VarInd], DataTrain$LONGITUDE, ntreeTry=100, stepFactor=2, improve=0.05,
# trace=TRUE, plot=T)
# system.time(LON_AllB_RF<-randomForest(LONGITUDE~. -LATITUDE -FLOOR -SPACEID -RELATIVEPOSITION -USERID -PHONEID
# -TIMESTAMP -source -HighWAP -HighRSSI -Build_floorID -BUILDINGID -ID,
# data= DataTrain,
# importance=T,maximize=T,
# method="rf", trControl=fitControl,
# ntree=100, mtry= 104, allowParalel=TRUE))
# save(LON_AllB_RF, file = "LON_AllB_RF.rda")
load("LON_AllB_RF.rda")
predictions_LON_AllBRF<-predict(LON_AllB_RF, DataValid)
rf_postRes_LON_AllBRF<-postResample(predictions_LON_AllBRF, DataValid$LONGITUDE)
rf_postRes_LON_AllBRF
# Visualizing errors
Problems_LON<-as.data.frame(cbind(predictions_LON_AllBRF, DataValidOrig$LONGITUDE))
Problems_LON<-Problems_LON %>% mutate(Error= Problems_LON[,1]-Problems_LON[,2]) %>%
mutate(ID=DataValid$ID)
Problems_LON10m<-Problems_LON %>% filter(abs(Error)>10)
boxplot(Problems_LON10m$Error)
hist(Problems_LON$Error, xlab = "Error", ylab="Frequency",
main="Error predicting Longitude")
boxplot(Problems_LON10m$Error, xlab = "Error", ylab="Frequency",
main="Error predicting Longitude >10m")
# Mean Error
MeanErrorLON<-mean(abs(Problems_LON$Error)) # mean: 5.66m median:3.84m
# Subset the sample with these errors
Indices10m<-Problems_LON10m$ID
DF_LON10m<-DataValidOrig[DataValidOrig$ID %in% Indices10m, ]
DF_LON10m_NOWAPS<-DF_LON10m %>% select(313:326)
# Plot long & lat
ggplot(DataValidOrig, aes(x = LONGITUDE, y = LATITUDE)) +
geom_point (stat="identity", position=position_dodge(),color="grey53") +
labs(x = "Longitude", y = "Latitude", title = "Errors >10m") +
facet_wrap(~FLOOR) +
geom_point(data = DF_LON10m, aes(x = LONGITUDE, y = LATITUDE), color="red")
plot(LATITUDE ~ LONGITUDE, data = DataValidOrig, pch = 20, col = "grey53")
points(LATITUDE ~ LONGITUDE, data = DF_LON10m, pch = 20, col = "red")
##### 9.2. KNN ##### RMSE 20.23 R 0.972 MAE 8.04 TIME 687 sec
# Scale Data: Calculate the pre-process parameters
# preprocessParams <- preProcess(Data_Full[,c(WAPS, "LATITUDE")], method=c("scale", "center"))
#
# # Transform the dataset using the parameters
# Data_Full_KNN <- predict(preprocessParams, Data_Full)
#
# # Split data
# Data_FullSplit<-split(Data_Full_KNN, Data_Full_KNN$source)
# names(Data_FullSplit)<-c("DataTrainKNN", "DataValidKNN")
# list2env(Data_FullSplit, envir=.GlobalEnv)
# rm(Data_FullSplit)
# system.time(LON_AllB_KNN<-caret::train(LONGITUDE~. -LATITUDE -FLOOR -BUILDINGID -SPACEID -RELATIVEPOSITION
# -USERID-PHONEID-TIMESTAMP-source-ID -HighWAP -HighRSSI -Build_floorID,
# data=DataTrainKNN,
# method="knn"))
#
# save(LON_AllB_KNN, file = "LON_AllB_KNN.rda")
# load("LON_AllB_KNN.rda")
# predictions_LON_AllBKNN<-predict(LON_AllB_KNN, DataValid)
# rf_postRes_LON_AllBKNN<-postResample(predictions_LON_AllBKNN, DataValid$LONGITUDE)
# rf_postRes_LON_AllBKNN
#
#
# ##### 9.3. SVM #####
# ParamSVM <- tune(svm, DataTrainKNN$LONGITUDE~., data = DataTrainKNN[,c(WAPS, "BUILDINGID.1","BUILDINGID.2")],
# ranges = list(gamma = 2^(-1:1), cost = 2^(2:4)))
#system.time(LON_AllB_SVM<-caret::train(LONGITUDE~. -LATITUDE -FLOOR -BUILDINGID -SPACEID -RELATIVEPOSITION
# -USERID-PHONEID-TIMESTAMP-source-ID -HighWAP -HighRSSI -Build_floorID,
# data=DataTrain,
# method="svmLinear",
# preProcess=c("center", "scale"),
# trControl=fitControl))
#
# save(LON_AllB_SVM, file = "LON_AllB_SVM.rda")
#
# load("LON_AllB_SVM.rda")
# predictions_LON_AllBSVM<-predict(LON_AllB_SVM, DataValid)
# rf_postRes_LON_AllBSVM<-postResample(predictions_LON_AllBSVM, DataValid$LONGITUDE)
# rf_postRes_LON_AllBSVM
10. Predicting latitude using PCA
##### 10.1. Random Forest ##### RMSE 8.35 R 0.986 MAE 5.57 TIME 194 sec
# system.time(LAT_PCA<-randomForest(LATITUDE~. -LONGITUDE -FLOOR -BUILDINGID -SPACEID -RELATIVEPOSITION
# -USERID-PHONEID-TIMESTAMP-source-ID -HighWAP -HighRSSI -Build_floorID,
# data=train_pca,
# method="rf",
# ntree=100,
# trControl=fitControl,
# importance = T,
# maximize =T,
# allowParalel=TRUE))
# save(LAT_PCA, file = "LAT_PCA.rda")
load("LAT_PCA.rda")
predictions_LAT_PCA<-predict(LAT_PCA, valid_pca)
rf_postRes_LAT_PCA<-postResample(predictions_LAT_PCA, valid_pca$LATITUDE)
rf_postRes_LAT_PCA
11. Predicting latitude in all buildings (the same model for every building)
#### 11. PREDICTING LATITUDE IN ALL BUILDINGS (the same model for every building) ####------------------
##### 11.1. Random Forest ##### <- # RMSE 8.11 R 0.986 MAE 5.47 TIME 646 sec
# # VarInd<-c(WAPS, "BUILDINGID.1", "BUILDINGID.2")
# bestmtry<-tuneRF(DataTrain[VarInd], DataTrain$LATITUDE, ntreeTry=100, stepFactor=2, improve=0.05,
# trace=TRUE, plot=T)
# system.time(LAT_AllB_RF<-randomForest(LATITUDE~. -LONGITUDE -FLOOR -SPACEID -RELATIVEPOSITION -USERID -PHONEID
# -TIMESTAMP -source -HighWAP -HighRSSI -Build_floorID -BUILDINGID -ID,
# data= DataTrain,
# importance=T,maximize=T,
# method="rf", trControl=fitControl,
# ntree=100, mtry= 104,allowParalel=TRUE))
# save(LAT_AllB_RF, file = "LAT_AllB_RF.rda")
load("LAT_AllB_RF.rda")
predictions_LAT_AllBRF<-predict(LAT_AllB_RF, DataValid)
rf_postRes_LAT_AllBRF<-postResample(predictions_LAT_AllBRF, DataValid$LATITUDE)
rf_postRes_LAT_AllBRF
# Visualizing errors
Problems_LAT<-as.data.frame(cbind(predictions_LAT_AllBRF, DataValidOrig$LATITUDE))
Problems_LAT<-Problems_LAT %>% mutate(Error= Problems_LAT[,1]-Problems_LAT[,2]) %>%
mutate(ID=DataValid$ID)
Problems_LAT10m<-Problems_LAT %>% filter(abs(Error)>10)
hist(Problems_LAT$Error, xlab = "Error", ylab="Frequency",
main="Error predicting Latitude" )
boxplot(Problems_LAT10m$Error, xlab = "Error", ylab="Frequency",
main="Error predicting Latitude >10m" )
# Mean Error
MeanErrorLAT<-mean(abs(Problems_LAT$Error)) # mean:5.47m median: 3.54m
# Subset the sample with these errors
Indices10m<-Problems_LAT$ID
DF_LAT10m<-DataValidOrig[DataValidOrig$ID %in% Indices10m, ]
##### 11.2. KNN #####
# system.time(LAT_AllB_KNN<-caret::train(LATITUDE~. -LONGITUDE -FLOOR -BUILDINGID -SPACEID -RELATIVEPOSITION
# -USERID-PHONEID-TIMESTAMP-source-ID -HighWAP -HighRSSI -Build_floorID,
# data=DataTrain,
# method="knn",
# preProcess=c("center", "scale"),
# trControl=fitControl))
#
# save(LAT_AllB_KNN, file = "LAT_AllB_KNN.rda")
# load("LAT_AllB_KNN.rda")
# predictions_LAT_AllBKNN<-predict(LAT_AllB_KNN, DataValid)
# rf_postRes_LAT_AllBKNN<-postResample(predictions_LAT_AllBKNN, DataValid$LATGITUDE)
# rf_postRes_LAT_AllBKNN
##### 11.3. SVM #####
# system.time(LAT_AllB_SVM<-caret::train(LATITUDE~. -LONGITUDE -FLOOR -BUILDINGID -SPACEID -RELATIVEPOSITION
# -USERID-PHONEID-TIMESTAMP-source-ID -HighWAP -HighRSSI -Build_floorID,
# data=DataTrain,
# method="svmLinear",
# preProcess=c("center", "scale"),
# trControl=fitControl))
#
# save(LAT_AllB_SVM, file = "LAT_AllB_SVM.rda")
# load("LAT_AllB_SVM.rda")
# predictions_LAT_AllBSVM<-predict(LAT_AllB_SVM, DataValid)
# rf_postRes_LAT_AllBSVM<-postResample(predictions_LAT_AllBSVM, DataValid$LATGITUDE)
# rf_postRes_LAT_AllBSVM
12. Predicting floor using HighWAP
##### 12.1. SVM ##### # <- Accuracy: 0.893 Kappa: 0.85 Time: 48
# system.time(Floor_HighWAP_SVM<-caret::train(FLOOR~HighWAP, data= DataTrain, method="svmLinear",
# trControl=fitControl))
# save(Floor_HighWAP_SVM, file = "Floor_HighWAP_SVM.rda")
load("Floor_HighWAP_SVM.rda")
predictions_Floor_HighWAPSVM<-predict(Floor_HighWAP_SVM, DataValid)
ConfusionMatrix<-confusionMatrix(predictions_Floor_HighWAPSVM, DataValid$FLOOR)
ConfusionMatrix
rm(ConfusionMatrix)EXPL
13.Predicting Build_FloorID using HighWAP
##### 13.1. SVM ##### # <- Accuracy: 0.893 Kappa: 0.880 Time: 56
# system.time(Build_floorID_HighWAP_SVM<-caret::train(Build_floorID~HighWAP, data= DataTrain, method="svmLinear",
# trControl=fitControl))
# save(Build_floorID_HighWAP_SVM, file = "Build_floorID_HighWAP_SVM.rda")
load("Build_floorID_HighWAP_SVM.rda")
predictions_Build_floorID_HighWAPSVM<-predict(Build_floorID_HighWAP_SVM, DataValid)
ConfusionMatrix<-confusionMatrix(predictions_Build_floorID_HighWAPSVM, DataValid$Build_floorID)
ConfusionMatrix
rm(ConfusionMatrix)
14. Predicting Floor using WAPS, latitude, longitude & dummy building
# Add predicted longitude & latitude to DataValid
Data_Full$LATITUDE[Data_Full$source=="DataValid"]<-predictions_LAT_AllBRF
Data_Full$LONGITUDE[Data_Full$source=="DataValid"]<-predictions_LON_AllBRF
# Split Data before modeling
Data_FullSplit<-split(Data_Full, Data_Full$source)
list2env(Data_FullSplit, envir=.GlobalEnv)
rm(Data_FullSplit)
# Random Forest <- # Accuracy 0.918 Kappa 0.88 Time 208 sec
# VarInd<-c(WAPS, "BUILDINGID.1", "BUILDINGID.2", "LATITUDE", "LONGITUDE)
# bestmtry<-tuneRF(DataTrain[VarInd], DataTrain$FLOOR, ntreeTry=100, stepFactor=2, improve=0.05,
# trace=TRUE, plot=T)
#
# system.time(FLOOR_BUILDLATLONG_RF<-randomForest(FLOOR~. -SPACEID -RELATIVEPOSITION -USERID -PHONEID
# -TIMESTAMP -source -HighWAP -HighRSSI -Build_floorID -BUILDINGID -ID,
# data= DataTrain,
# importance=T,maximize=T,
# method="rf", trControl=fitControl,
# ntree=100, mtry= 34,allowParalel=TRUE))
# save(FLOOR_BUILDLATLONG_RF, file = "FLOOR_BUILDLATLONG_RF.rda")
load("FLOOR_BUILDLATLONG_RF.rda")
predictions_Floor_BuilLatLongRF<-predict(FLOOR_BUILDLATLONG_RF, DataValid)
rf_postRes_Floor_BuilLatLongRF<-postResample(predictions_Floor_BuilLatLongRF, DataValid$FLOOR)
rf_postRes_Floor_BuilLatLongRF
15. Predicting Build_FloorID using WAPS, latitude, longitude & dummy building
#### 15. PREDICTING Build_floorID USING WAPS, LAT & LONG #### ---------------------------------------------------------------------------
# Random Forest <- # Accuracy 0.91 Kappa: 0.90 Time 155 sec
# VarInd<-c(WAPS, "LATITUDE", "LONGITUDE)
# bestmtry<-tuneRF(DataTrain[VarInd], DataTrain$Build_floorID, ntreeTry=100, stepFactor=2, improve=0.05,
# trace=TRUE, plot=T)
#
# system.time(BUILDFLOORID_LATLONG_RF<-randomForest(Build_floorID~. -SPACEID -RELATIVEPOSITION -USERID -PHONEID
# -TIMESTAMP -source -HighWAP -HighRSSI -Build_floorID -BUILDINGID -FLOOR
# -BUILDINGID.1 -BUILDINGID.2 -ID,
# data= DataTrain,
# importance=T,maximize=T,
# method="rf", trControl=fitControl,
# ntree=100, mtry= 34,allowParalel=TRUE))
# save(BUILDFLOORID_LATLONG_RF, file = "BUILDFLOORID_LATLONG_RF.rda")
load("BUILDFLOORID_LATLONG_RF.rda")
predictions_BuildFloorID_LatLongRF<-predict(BUILDFLOORID_LATLONG_RF, DataValid)
rf_postRes_BuildFloorID_LatLongRF<-postResample(predictions_BuildFloorID_LatLongRF, DataValid$Build_floorID)
rf_postRes_BuildFloorID_LatLongRF
16. Final Visualizations
# Add predicted floor
DataValid$FLOOR<-predictions_Floor_BuilLatLongRF
VarInd<-c("LATITUDE", "LONGITUDE", "FLOOR")
VisLatLon<- gdata::combine(DataValidOrig[VarInd], DataValid[VarInd])
VisLatLon$source<-ifelse(VisLatLon$source=="DataValidOrig[VarInd]", "Original", "Predicted")
VisLatLon$source<-as.factor(VisLatLon$source)
p <- plot_ly(VisLatLon, x = ~LONGITUDE, y = ~LATITUDE, z = ~FLOOR, color = ~source,
colors=c("blue","peru")) %>%
add_markers() %>%
layout(scene = list(xaxis = list(title = 'Longitude'),
yaxis = list(title = 'Latitude'),
zaxis = list(title = 'Floor')))
ggplot(DataValidOrig, aes(x = LONGITUDE, y = LATITUDE)) +
geom_point (stat="identity", position=position_dodge(),color="grey53") +
labs(x = "Longitude", y = "Latitude", title = "Predictions per Floor") +
facet_wrap(~FLOOR) +
geom_point(data = DataValid, aes(x = LONGITUDE, y = LATITUDE), color="blue")
plot(LATITUDE ~ LONGITUDE, data = DataValidOrig, pch = 20, col = "grey53")
points(LATITUDE ~ LONGITUDE, data = DataValid, pch = 20, col = "blue")
# Stop Parallel process
stopCluster(cluster)
rm(cluster)
registerDoSEQ()