Loading required packages and data:

# Attach every package used by the analysis.
pacman::p_load(
  readr, caret, dplyr, scatterplot3d, ggplot2, tidyr,
  doSNOW, parallel, ranger, e1071
)

# NOTE(review): machine-specific working directory; the CSV paths below are
# resolved relative to it.
setwd("~/Desktop/Ubiqum/R_task6_Wifi")

# Read both datasets, keeping any text columns as character vectors.
validationData <- read.csv("validationData.csv", stringsAsFactors = FALSE)
Training.data <- read.csv("trainingData.csv", stringsAsFactors = FALSE)

# Preview columns 515-529: the last WAP columns plus the location/metadata
# columns.
head(Training.data[, 515:529])
##   WAP515 WAP516 WAP517 WAP518 WAP519 WAP520 LONGITUDE LATITUDE FLOOR
## 1    100    100    100    100    100    100 -7541.264  4864921     2
## 2    100    100    100    100    100    100 -7536.621  4864934     2
## 3    100    100    100    100    100    100 -7519.152  4864950     2
## 4    100    100    100    100    100    100 -7524.570  4864934     2
## 5    100    100    100    100    100    100 -7632.144  4864982     0
## 6    100    100    100    100    100    100 -7533.896  4864939     2
##   BUILDINGID SPACEID RELATIVEPOSITION USERID PHONEID  TIMESTAMP
## 1          1     106                2      2      23 1371713733
## 2          1     106                2      2      23 1371713691
## 3          1     103                2      2      23 1371714095
## 4          1     102                2      2      23 1371713807
## 5          0     122                2     11      13 1369909710
## 6          1     105                2      2      23 1371713841
# Preview the same columns (515-529) of the validation set for comparison.
head(validationData[,515:529])
##   WAP515 WAP516 WAP517 WAP518 WAP519 WAP520 LONGITUDE LATITUDE FLOOR
## 1    100    100    100    100    100    100 -7515.917  4864890     1
## 2    100    100    100    100    100    100 -7383.867  4864840     4
## 3    100    100    100    100    100    100 -7374.302  4864847     4
## 4    100    100    100    100    100    100 -7365.825  4864843     4
## 5    100    100    100    100    100    100 -7641.499  4864922     2
## 6    100    100    100    100    100    100 -7338.807  4864825     2
##   BUILDINGID SPACEID RELATIVEPOSITION USERID PHONEID  TIMESTAMP
## 1          1       0                0      0       0 1380872703
## 2          2       0                0      0      13 1381155054
## 3          2       0                0      0      13 1381155095
## 4          2       0                0      0      13 1381155138
## 5          0       0                0      0       2 1380877774
## 6          2       0                0      0      12 1380874853

Exploring the Data

Plots Training Data

# Re-express latitude and longitude as offsets from their minimum value (so
# each axis starts at 0), rounded to one decimal place.
# NOTE(review): only Training.data is converted in this chunk; the validation
# data is not touched here.
Training.data$LATITUDE <- round(
  Training.data$LATITUDE - min(Training.data$LATITUDE),
  digits = 1
)
Training.data$LONGITUDE <- round(
  Training.data$LONGITUDE - min(Training.data$LONGITUDE),
  digits = 1
)

# Scatter plot of training locations for one building, coloured by USERID and
# split into one panel per FLOOR.
#
# builNum: building number; it is prefixed with "B." to form the BUILDINGID
#          value that rows are filtered on.
# Returns the ggplot object. Reads the global `Training.data` and emits two
# messages (building id and plot title).
plotForUSer <- function(builNum) {
  building_key <- paste0("B.", builNum)
  title_text <- paste("Building", builNum, ": Users in floors")

  message(paste("building Id:", building_key))
  message(paste("Plot title:", title_text))

  building_rows <- filter(Training.data, BUILDINGID == building_key)

  ggplot(building_rows) +
    geom_point(aes(x = LONGITUDE, y = LATITUDE, color = USERID)) +
    facet_grid(. ~ FLOOR) +
    labs(title = title_text) +
    theme_linedraw(base_size = 11, base_family = "") +
    theme(plot.title = element_text(hjust = 0.5, face = "bold"))
}

# Draw the per-user plot for each of the three buildings. The `##` lines are
# the messages echoed by each call.
plotForUSer("1")
## building Id: B.1
## Plot title: Building 1 : Users in floors

plotForUSer("2")
## building Id: B.2
## Plot title: Building 2 : Users in floors

plotForUSer("3")
## building Id: B.3
## Plot title: Building 3 : Users in floors

  • Building 1 is poorly represented by user.
  • Building 2 and Building 3 are well represented by user.
# Scatter plot of training locations for one building, coloured by
# RELATIVEPOSITION and split into one panel per FLOOR.
#
# buildingNumber: building number; it is prefixed with "B." to form the
#                 BUILDINGID value that rows are filtered on.
# Returns the ggplot object. Reads the global `Training.data` and emits two
# messages (building id and plot title).
plotForRelativePosition <- function(buildingNumber) {
  building_key <- paste0("B.", buildingNumber)
  title_text <- paste0("Building ", buildingNumber, ": Position in floors")

  message(paste("building Id:", building_key))
  message(paste("Plot title:", title_text))

  building_rows <- filter(Training.data, BUILDINGID == building_key)

  # Only two manual colours are supplied, so RELATIVEPOSITION must be discrete
  # with at most two values.
  ggplot(building_rows) +
    geom_point(aes(x = LONGITUDE, y = LATITUDE, color = RELATIVEPOSITION)) +
    facet_grid(. ~ FLOOR) +
    scale_color_manual(values = c("blue", "green")) +
    labs(title = title_text) +
    theme_linedraw(base_size = 11, base_family = "") +
    theme(plot.title = element_text(hjust = 0.5, face = "bold"))
}

# Draw the relative-position plot for each of the three buildings. The `##`
# lines are the messages echoed by each call.
plotForRelativePosition("1")
## building Id: B.1
## Plot title: Building 1: Position in floors

plotForRelativePosition("2")
## building Id: B.2
## Plot title: Building 2: Position in floors

plotForRelativePosition("3")
## building Id: B.3
## Plot title: Building 3: Position in floors

  • Relative position is poorly represented in general: there are too few records taken inside rooms.
# Scatter plot of training locations for one building, coloured by PHONEID and
# split into one panel per FLOOR.
#
# buildID: building number; it is prefixed with "B." to form the BUILDINGID
#          value that rows are filtered on.
# Returns the ggplot object. Reads the global `Training.data` and emits two
# messages (building id and plot title).
plotForPhone <- function(buildID) {
  building_key <- paste0("B.", buildID)
  title_text <- paste("Building", buildID, ": Phone ID in floors")

  message(paste("Building ID:", building_key))
  message(paste("Plot Title:", title_text))

  building_rows <- filter(Training.data, BUILDINGID == building_key)

  ggplot(building_rows) +
    geom_point(aes(x = LONGITUDE, y = LATITUDE, color = PHONEID)) +
    facet_grid(. ~ FLOOR) +
    labs(title = title_text) +
    theme_linedraw(base_size = 11, base_family = "") +
    theme(plot.title = element_text(hjust = 0.5, face = "bold"))
}

# Draw the phone-ID plot for each of the three buildings. The `##` lines are
# the messages echoed by each call.
plotForPhone("1")
## Building ID: B.1
## Plot Title: Building 1 : Phone ID in floors

plotForPhone("2")
## Building ID: B.2
## Plot Title: Building 2 : Phone ID in floors

plotForPhone("3")
## Building ID: B.3
## Plot Title: Building 3 : Phone ID in floors

  • Building 1 is again poorly represented, this time by phone ID. Its two phone IDs belong to the same two users found in the user ID plot.
# Plot the USERID column of the training data, labelled as the number of
# locations per user.
plot(
  Training.data$USERID,
  main = "Number of locations by User",
  xlab = "USER NUMBER",
  ylab = "frequency",
  col = "pink"
)

# Same plot for the PHONEID column: number of locations per phone.
plot(
  Training.data$PHONEID,
  main = "Number of locations by Phone",
  xlab = "PHONE ID NUMBER",
  ylab = "frequency",
  col = "turquoise3"
)

  • The extra activity by phone and by user comes from the two users of Building 1. They are not representative.
# Number of records per exact (LONGITUDE, LATITUDE, RELATIVEPOSITION)
# combination on floor 1 of B1.
B1 %>%
  filter(FLOOR == 1) %>%
  group_by(LONGITUDE, LATITUDE, RELATIVEPOSITION) %>%
  summarise(count = n())
## # A tibble: 67 x 4
## # Groups:   LONGITUDE, LATITUDE [?]
##    LONGITUDE LATITUDE RELATIVEPOSITION count
##        <dbl>    <dbl> <fct>            <int>
##  1    -7691. 4864928. FrontDoor           20
##  2    -7690. 4864929. FrontDoor           19
##  3    -7684. 4864932. FrontDoor           20
##  4    -7684. 4864930. FrontDoor           20
##  5    -7683. 4864932. FrontDoor           20
##  6    -7682. 4864931. FrontDoor           20
##  7    -7677. 4864934. FrontDoor           29
##  8    -7675. 4864933. FrontDoor           20
##  9    -7675. 4864934. FrontDoor           20
## 10    -7669. 4864936. FrontDoor           29
## # ... with 57 more rows
# Number of records per exact (LONGITUDE, LATITUDE, RELATIVEPOSITION)
# combination on floor 1 of B3.
B3 %>%
  filter(FLOOR == 1) %>%
  group_by(LONGITUDE, LATITUDE, RELATIVEPOSITION) %>%
  summarise(count = n())
## # A tibble: 89 x 4
## # Groups:   LONGITUDE, LATITUDE [?]
##    LONGITUDE LATITUDE RELATIVEPOSITION count
##        <dbl>    <dbl> <fct>            <int>
##  1    -7409. 4864797. FrontDoor           20
##  2    -7408. 4864812. Inside              20
##  3    -7406. 4864802. FrontDoor           20
##  4    -7405. 4864808. FrontDoor           20
##  5    -7404. 4864809. FrontDoor           20
##  6    -7399. 4864788. FrontDoor           20
##  7    -7396. 4864776. Inside              10
##  8    -7395. 4864786. FrontDoor           20
##  9    -7395. 4864837. Inside              27
## 10    -7392. 4864841. Inside              30
## # ... with 79 more rows

Counting the exact locations in Building 1 (floor 1) and Building 3 (floor 1) of the training data shows that the same location often has around 20 or more rows: the user recorded their position several times at the same spot. Hence, the training data lacks general representativity.

Plots Validation Data

We don’t have the identity of the user in the Validation Data

# Scatter plot of validation locations for one building, coloured by PHONEID
# and split into one panel per FLOOR.
#
# buildID: building number; it is prefixed with "B." to form the BUILDINGID
#          value that rows are filtered on.
# Returns the ggplot object. Reads the global `validationData` and emits two
# messages (building id and plot title).
plotForPhoneVal <- function(buildID) {
  building_key <- paste0("B.", buildID)
  title_text <- paste("Building", buildID, ": Phone ID in floors - Validation")

  message(paste("Building ID:", building_key))
  message(paste("Plot Title:", title_text))

  building_rows <- filter(validationData, BUILDINGID == building_key)

  ggplot(building_rows) +
    geom_point(aes(x = LONGITUDE, y = LATITUDE, color = PHONEID)) +
    facet_grid(. ~ FLOOR) +
    labs(title = title_text) +
    theme_linedraw(base_size = 11, base_family = "") +
    theme(plot.title = element_text(hjust = 0.5, face = "bold"))
}

# Draw the validation phone-ID plot for each of the three buildings. The `##`
# lines are the messages echoed by each call.
plotForPhoneVal("1")
## Building ID: B.1
## Plot Title: Building 1 : Phone ID in floors - Validation

plotForPhoneVal("2")
## Building ID: B.2
## Plot Title: Building 2 : Phone ID in floors - Validation

plotForPhoneVal("3")
## Building ID: B.3
## Plot Title: Building 3 : Phone ID in floors - Validation

  • Phone ID in the Validation Data is well represented in all buildings.
# Number of validation records per exact (LONGITUDE, LATITUDE,
# RELATIVEPOSITION) combination in B1_validation.
B1_validation %>%
  group_by(LONGITUDE, LATITUDE, RELATIVEPOSITION) %>%
  summarise(count = n())
## # A tibble: 487 x 4
## # Groups:   LONGITUDE, LATITUDE [?]
##    LONGITUDE LATITUDE RELATIVEPOSITION count
##        <dbl>    <dbl> <fct>            <int>
##  1      0.       183. <NA>                 1
##  2      0.       184. <NA>                 1
##  3      3.70     181. <NA>                 1
##  4      4.20     180. <NA>                 1
##  5      4.80     180. <NA>                 2
##  6      5.20     180. <NA>                 1
##  7      5.40     180. <NA>                 1
##  8      5.70     187. <NA>                 1
##  9      6.00     186. <NA>                 1
## 10      6.20     186. <NA>                 1
## # ... with 477 more rows
# Number of validation records per exact (LONGITUDE, LATITUDE,
# RELATIVEPOSITION) combination in B2_validation.
B2_validation %>%
  group_by(LONGITUDE, LATITUDE, RELATIVEPOSITION) %>%
  summarise(count = n())
## # A tibble: 288 x 4
## # Groups:   LONGITUDE, LATITUDE [?]
##    LONGITUDE LATITUDE RELATIVEPOSITION count
##        <dbl>    <dbl> <fct>            <int>
##  1      124.     123. <NA>                 1
##  2      125.     124. <NA>                 1
##  3      125.     144. <NA>                 1
##  4      126.     143. <NA>                 1
##  5      126.     127. <NA>                 1
##  6      126.     148. <NA>                 1
##  7      127.     128. <NA>                 1
##  8      127.     128. <NA>                 1
##  9      127.     129. <NA>                 1
## 10      129.     121. <NA>                 1
## # ... with 278 more rows

Counting the exact locations in Buildings 1 and 2 of the validation data shows that there is essentially one record (row) per location. Users were moving randomly (as we were told). Even though there are fewer records, the representativity is much better.

Because the locations in the training data lack representativity, I joined the training and validation data in order to get better performance from the prediction models.

Preprocessing the data

  1. Removed all columns that contain only the value 100 (WAPs without signal) from both datasets:
# Drop columns that hold a single value (for a WAP: 100 everywhere, i.e. never
# detected).
#
# Columns are removed by name / logical mask rather than with
# `df[, -which(flag)]`: when no column is flagged, `which()` returns
# integer(0) and the negative index would silently drop EVERY column.
# `vapply()` over the columns also avoids the matrix coercion that `apply()`
# performs on a data frame.

# Training data: only the 520 WAP columns are scanned.
waps_only_training <- Training.data[, 1:520]
remove_100 <- vapply(
  waps_only_training,
  function(x) length(unique(x)) == 1,
  logical(1)
)
constant_training_cols <- names(remove_100)[remove_100]
Training.data <- Training.data[
  , !(colnames(Training.data) %in% constant_training_cols),
  drop = FALSE
]

# Validation data.
# NOTE(review): as in the original code, ALL columns are scanned here (not
# just `waps_only_validation`), so constant non-WAP columns are dropped too.
# Later steps appear to rely on this; confirm before restricting the scan to
# the WAP columns.
waps_only_validation <- validationData[, 1:520]
remove_100_validation <- vapply(
  validationData,
  function(x) length(unique(x)) == 1,
  logical(1)
)
validationData <- validationData[, !remove_100_validation, drop = FALSE]
  2. Removed all columns (WAPs) that are not shared between the training and validation data (WAPs removed or added between the two datasets) and joined both datasets:
# Keep only the columns present in both datasets, then stack them.
#
# A logical mask is used instead of `df[, -which(flag == FALSE)]`: if every
# column is shared, `which()` returns integer(0) and the negative index would
# drop all columns instead of none.
in.training <- colnames(Training.data) %in% colnames(validationData)
Training.data <- Training.data[, in.training, drop = FALSE]

in.validation <- colnames(validationData) %in% colnames(Training.data)
validationData <- validationData[, in.validation, drop = FALSE]

# Both frames now carry the same set of column names, so they can be row-bound.
all.data <- rbind(Training.data, validationData)
  3. Replaced the remaining 100 values (WAPs without signal) with -105 so as not to confuse the model, and added 105 to all WAP signals. Hence, no signal = 0 and max signal = +104:
# Recode "no signal" (100) as -105 and then shift every WAP reading by +105,
# so that "no signal" ends up as 0.
#
# The replacement is restricted to the WAP columns (1:312). The previous
# `all.data[all.data == 100] <- -105` scanned the whole data frame and would
# also overwrite any non-WAP value equal to 100 (e.g. a shifted LONGITUDE or
# LATITUDE of exactly 100.0).
wap_cols <- 1:312
wap_values <- all.data[, wap_cols]
wap_values[wap_values == 100] <- -105
all.data[, wap_cols] <- wap_values + 105
  4. Removed the rows in which every WAP has no signal (originally -105, i.e. 0 after the shift):
# Drop records whose WAP readings are all identical (no WAP detected at all).
#
# Rows are removed with a logical mask: `all.data[-which(flag), ]` would drop
# EVERY row when no row is flagged, because `-integer(0)` selects nothing.
waps.only.all <- all.data[, 1:312]
delete.zv.rows <- apply(
  as.matrix(waps.only.all),
  1,
  function(x) length(unique(x)) == 1
)
all.data <- all.data[!delete.zv.rows, ]

Modeling

Building Prediction:

# Split the combined data by building.
building1 <- all.data %>% filter(BUILDINGID == "B.1")
building2 <- all.data %>% filter(BUILDINGID == "B.2")
building3 <- all.data %>% filter(BUILDINGID == "B.3")

# For each building keep only the WAP columns (1:312) that vary within that
# building, i.e. WAPs that were detected at least once there.
#
# Constant columns are removed with a logical mask and `drop = FALSE`:
# `df[, -which(flag)]` drops every column when nothing is flagged, and without
# `drop = FALSE` a single surviving column collapses to a bare vector, losing
# the column name that later steps read with colnames().
waps_only_b1 <- building1[, 1:312]
deliting_b1cols <- vapply(
  waps_only_b1, function(x) length(unique(x)) == 1, logical(1)
)
waps_only_b1 <- waps_only_b1[, !deliting_b1cols, drop = FALSE]

waps_only_b2 <- building2[, 1:312]
deliting_b2cols <- vapply(
  waps_only_b2, function(x) length(unique(x)) == 1, logical(1)
)
waps_only_b2 <- waps_only_b2[, !deliting_b2cols, drop = FALSE]

waps_only_b3 <- building3[, 1:312]
deliting_b3cols <- vapply(
  waps_only_b3, function(x) length(unique(x)) == 1, logical(1)
)
waps_only_b3 <- waps_only_b3[, !deliting_b3cols, drop = FALSE]


# Remove WAPs that appear in more than one building's set so that the three
# sets are pairwise disjoint:
#   building 1 loses the WAPs also seen in building 2, then those in building 3;
#   building 2 loses the WAPs also seen in building 3.
# NOTE(review): waps_only_b3 is never reduced, so it still contains WAPs that
# are shared with buildings 1 and 2 - it is not "unique to B.3".
#
# Logical masks with `drop = FALSE` replace `df[, -which(flag)]`, which would
# drop every column when no WAP is shared and collapse to a vector when only
# one column is left.
match.1in2 <- colnames(waps_only_b1) %in% colnames(waps_only_b2)
waps_only_b1 <- waps_only_b1[, !match.1in2, drop = FALSE]

match.1in3 <- colnames(waps_only_b1) %in% colnames(waps_only_b3)
waps_only_b1 <- waps_only_b1[, !match.1in3, drop = FALSE]

match.2in3 <- colnames(waps_only_b2) %in% colnames(waps_only_b3)
waps_only_b2 <- waps_only_b2[, !match.2in3, drop = FALSE]

Knn

  • Tried both kNN and a radial SVM model; kNN gave the better results.

  • Created a formula for adding the waps.only columns to the model:

# Build the model formula `BUILDINGID ~ WAPa + WAPb + ...` from the WAP
# columns retained for the three buildings.
waps.used <- c(
  colnames(waps_only_b1),
  colnames(waps_only_b2),
  colnames(waps_only_b3)
)
predictor_terms <- paste(waps.used, collapse = " + ")
f <- as.formula(paste("BUILDINGID", "~", predictor_terms))

# Hold out 30% of the rows, partitioned on BUILDINGID (seeded so the split is
# reproducible).
set.seed(998)
inTraining <- createDataPartition(all.data$BUILDINGID,
                                  p = .70,
                                  list = FALSE)

training <- all.data[inTraining, ]
test <- all.data[-inTraining, ]

# Start a worker pool that leaves one core free, but never asks for fewer than
# one worker (detectCores() may return 1 or NA).
cluster <- makeCluster(max(1L, detectCores() - 1L, na.rm = TRUE))
registerDoSNOW(cluster)

# 5-fold cross-validation, run in parallel on the registered backend.
fitControl <- trainControl(method = "cv",
                           number = 5,
                           allowParallel = TRUE,
                           verboseIter = TRUE)

# kNN classifier for the building, using the WAP formula `f`; predictors are
# centred and scaled after removing zero-variance columns.
knn.model.building <- train(f,
                            data = training,
                            method = "knn",
                            trControl = fitControl,
                            preProcess = c("zv", "center", "scale"))
print(knn.model.building)

# Shut the workers down and go back to the sequential backend.
# registerDoSEQ() takes no arguments; passing the cluster to it fails with an
# "unused argument" error.
stopCluster(cluster)
registerDoSEQ()
rm(cluster)

# Predict the building for the hold-out rows and compare with the truth.
# confusionMatrix() expects the predictions as `data` and the observed classes
# as `reference`; with the arguments swapped the table and the per-class
# statistics are transposed.
prediction.building <- predict(knn.model.building, test)
confusionMatrix(data = prediction.building, reference = test$BUILDINGID)

# Store the predictions next to the observations and flag misclassified rows:
# error.model is 0 when predicted and observed classes match, non-zero
# otherwise.
# NOTE(review): the difference of as.numeric() codes assumes BUILDINGID and
# the predictions are factors - confirm.
test$prediction <- prediction.building
test <- test %>%
  mutate(error.model = abs(as.numeric(prediction) - as.numeric(BUILDINGID)))
which(test$error.model != 0)

# Misclassified rows only (taken while error.model is still numeric).
df.error <- test %>% filter(error.model != 0)

# Make the flag discrete for plotting. The column must be referenced through
# `test$`; a bare `error.model` is not an object in the workspace.
test$error.model <- as.character(test$error.model)
Confusion Matrix Building Prediction

Confusion Matrix Building Prediction

  • I analyzed those two errors; these are their locations:
# Map of the hold-out locations: rows with error.model "0" are drawn in white,
# misclassified rows in black.
ggplot(
  data = test,
  mapping = aes(x = LONGITUDE, y = LATITUDE, color = error.model)
) +
  geom_point() +
  scale_color_manual(values = c("white", "black", "black")) +
  ggtitle("Errors prediction Building") +
  theme(plot.title = element_text(hjust = 0.5, face = "bold"))

Analysis of the building errors:

# Inspect the second misclassified observation.
error.2<-df.error[2,]
# For that single row, flag each column whose value differs from 0.
# NOTE(review): apply() coerces the row to a common type first; if the row
# holds non-numeric columns the comparison is done on strings - confirm this
# is acceptable.
waps.error.2<-apply(error.2,1,function(x) x!=0)
# Keep only the columns with a non-zero value (for WAPs: those with signal).
# NOTE(review): if no column were flagged FALSE, the negative index would be
# empty and every column would be dropped.
prova.error2<-error.2[,-c(which(waps.error.2==FALSE))]
# Positions of the remaining columns that belong to each building's WAP set.
A<-which(colnames(prova.error2)%in%colnames(waps_only_b1))
B<-which(colnames(prova.error2)%in%colnames(waps_only_b3)) # All predicted WAPs are in B.3
C<-which(colnames(prova.error2)%in%colnames(waps_only_b2)) # All observed WAPs are in B.2
  • All the WAPs behind the prediction belonged to B3 and were unique to B3. However, the observation is actually in Building 2.

Conclusion: some WAPs were very probably moved between the two datasets (training and validation). We know that there were 3 months between them.

Floor Prediction

Tried kNN and a radial SVM model for each building's floors. The radial SVM gave the best results: highest accuracy and Kappa.

Floor Building 3

# Floor classifier for building 3.
# Drop the columns that must not act as predictors: the model below uses
# `FLOOR ~ .`, i.e. every remaining column.
building3$TIMESTAMP <- NULL
building3$PHONEID <- NULL
building3$LONGITUDE <- NULL
building3$LATITUDE <- NULL
building3$BUILDINGID <- NULL

# 70/30 split partitioned on FLOOR (seeded so the split is reproducible).
set.seed(998)
inTraining <- createDataPartition(building3$FLOOR,
                                  p = .70,
                                  list = FALSE)
training <- building3[inTraining, ]
test <- building3[-inTraining, ]

# Create ONE worker pool. The previous code called makeCluster() twice and
# overwrote the first handle, leaving those workers running with no way to
# stop them. At least one worker is always requested.
numCores <- max(1L, detectCores() - 1L, na.rm = TRUE)
cluster <- makeCluster(numCores)
registerDoSNOW(cluster)

fitControl <- trainControl(method = "cv",
                           number = 5,
                           allowParallel = TRUE,
                           verboseIter = TRUE)

# Radial-kernel SVM on centred/scaled predictors, zero-variance columns removed.
svm.floor3 <- train(FLOOR ~ .,
                    data = training,
                    method = "svmRadial",
                    trControl = fitControl,
                    preProcess = c("zv", "center", "scale"))

print(svm.floor3)

# registerDoSEQ() takes no arguments; passing the cluster raises an
# "unused argument" error.
stopCluster(cluster)
registerDoSEQ()
rm(cluster)

# confusionMatrix() expects predictions as `data` and observed classes as
# `reference`.
prediction.floor.svm <- predict(svm.floor3, test)
confusionf3 <- confusionMatrix(data = prediction.floor.svm,
                               reference = test$FLOOR)

# Accuracy= 98,87%
# KAPPA= 0.098
# NOTE(review): a Kappa of 0.098 is implausible next to 98.87% accuracy; it is
# probably a typo - confirm against `confusionf3`.

Floor Building 2

# Floor classifier for building 2.
building2 <- all.data %>% filter(BUILDINGID == "B.2")

# Re-encode FLOOR via numeric and back to factor.
# NOTE(review): presumably done to get rid of floor levels that do not occur
# in this building - confirm.
building2$FLOOR <- as.numeric(building2$FLOOR)
building2$FLOOR <- as.factor(building2$FLOOR)

# Drop the columns that must not act as predictors: the model below uses
# `FLOOR ~ .`, i.e. every remaining column.
building2$TIMESTAMP <- NULL
building2$PHONEID <- NULL
building2$LONGITUDE <- NULL
building2$LATITUDE <- NULL
building2$BUILDINGID <- NULL

# 70/30 split partitioned on FLOOR (seeded so the split is reproducible).
set.seed(248)
inTraining <- createDataPartition(building2$FLOOR,
                                  p = .70,
                                  list = FALSE)
training.f2 <- building2[inTraining, ]
test.f2 <- building2[-inTraining, ]

# Worker pool that leaves one core free but always has at least one worker.
cluster <- makeCluster(max(1L, detectCores() - 1L, na.rm = TRUE))
registerDoSNOW(cluster)

fitControl <- trainControl(method = "cv",
                           number = 5,
                           allowParallel = TRUE,
                           verboseIter = TRUE)

# Radial-kernel SVM on centred/scaled predictors, zero-variance columns removed.
svm.floor2 <- train(FLOOR ~ .,
                    data = training.f2,
                    method = "svmRadial",
                    trControl = fitControl,
                    preProcess = c("zv", "center", "scale"))

print(svm.floor2)

# registerDoSEQ() takes no arguments; passing the cluster raises an
# "unused argument" error.
stopCluster(cluster)
registerDoSEQ()
rm(cluster)

# confusionMatrix() expects predictions as `data` and observed classes as
# `reference`.
prediction.floor2.svm <- predict(svm.floor2, test.f2)
confusion.svmf2 <- confusionMatrix(data = prediction.floor2.svm,
                                   reference = test.f2$FLOOR)

# Accuracy = 98,96%
# KAPPA= 0.986

Floor Building 1

# Floor classifier for building 1.
building1 <- all.data %>% filter(BUILDINGID == "B.1")

# Drop the columns that must not act as predictors: the model below uses
# `FLOOR ~ .`, i.e. every remaining column.
building1$TIMESTAMP <- NULL
building1$PHONEID <- NULL
building1$LONGITUDE <- NULL
building1$LATITUDE <- NULL
building1$BUILDINGID <- NULL

# Re-encode FLOOR via numeric and back to factor.
# NOTE(review): presumably done to get rid of floor levels that do not occur
# in this building - confirm.
building1$FLOOR <- as.numeric(building1$FLOOR)
building1$FLOOR <- as.factor(building1$FLOOR)

# 70/30 split partitioned on FLOOR (seeded so the split is reproducible).
set.seed(998)
inTraining <- createDataPartition(building1$FLOOR,
                                  p = .70,
                                  list = FALSE)
training_f1 <- building1[inTraining, ]
test_f1 <- building1[-inTraining, ]

# Worker pool that leaves one core free but always has at least one worker.
cluster <- makeCluster(max(1L, detectCores() - 1L, na.rm = TRUE))
registerDoSNOW(cluster)

fitControl <- trainControl(method = "cv",
                           number = 5,
                           allowParallel = TRUE,
                           verboseIter = TRUE)

# Radial-kernel SVM on centred/scaled predictors, zero-variance columns removed.
svm.floor1 <- train(FLOOR ~ .,
                    data = training_f1,
                    method = "svmRadial",
                    trControl = fitControl,
                    preProcess = c("zv", "center", "scale"))

print(svm.floor1)

# registerDoSEQ() takes no arguments; passing the cluster raises an
# "unused argument" error.
stopCluster(cluster)
registerDoSEQ()
rm(cluster)

# confusionMatrix() expects predictions as `data` and observed classes as
# `reference`.
prediction.f1.svm <- predict(svm.floor1, test_f1)
confusion_f1 <- confusionMatrix(data = prediction.f1.svm,
                                reference = test_f1$FLOOR)

# Accuracy= 98,38%
# KAPPA= 0.984

Latitude

Tried kNN and Random Forest for each prediction. Random Forest gave me better results in terms of RMSE and Rsquared.

Latitude ( Building 1)

# Latitude regression for building 1 (`build1.lat` is prepared elsewhere).
# 70/30 split partitioned on LATITUDE, seeded so the split is reproducible.
set.seed(123)
inTraining.lat1 <- createDataPartition(build1.lat$LATITUDE,
                                       p = .70,
                                       list = FALSE)

training.RF.lat1 <- build1.lat[inTraining.lat1, ]
test.RF.lat1 <- build1.lat[-inTraining.lat1, ]

# Create ONE worker pool. The previous code called makeCluster() twice and
# overwrote the first handle, leaving those workers running with no way to
# stop them. At least one worker is always requested.
numCores <- max(1L, detectCores() - 1L, na.rm = TRUE)
cluster <- makeCluster(numCores)
registerDoSNOW(cluster)

fitControl <- trainControl(method = "cv", number = 5, verboseIter = TRUE)

# Random forest (ranger) on every remaining column of the training frame.
rf.lat1 <- train(LATITUDE ~ .,
                 data = training.RF.lat1,
                 method = "ranger",
                 trControl = fitControl,
                 preProcess = c("zv", "medianImpute"))

# registerDoSEQ() takes no arguments; passing the cluster raises an
# "unused argument" error.
stopCluster(cluster)
registerDoSEQ()
rm(cluster)

# Hold-out performance (RMSE, Rsquared, MAE).
prediction.RF.lat1 <- predict(rf.lat1, test.RF.lat1)
error.rf.lat1 <- postResample(prediction.RF.lat1, test.RF.lat1$LATITUDE)
# RMSE= 2.57
# Rsquared= 0.99

Latitude ( Building 2)

# Latitude regression for building 2 (`build2.lat` is prepared elsewhere).
# 70/30 split partitioned on LATITUDE, seeded so the split is reproducible.
set.seed(123)
inTraining.lat2 <- createDataPartition(build2.lat$LATITUDE,
                                       p = .70,
                                       list = FALSE)

training.RF.lat2 <- build2.lat[inTraining.lat2, ]
test.RF.lat2 <- build2.lat[-inTraining.lat2, ]

# Create ONE worker pool. The previous code called makeCluster() twice and
# overwrote the first handle, leaving those workers running with no way to
# stop them. At least one worker is always requested.
numCores <- max(1L, detectCores() - 1L, na.rm = TRUE)
cluster <- makeCluster(numCores)
registerDoSNOW(cluster)

fitControl <- trainControl(method = "cv", number = 5, verboseIter = TRUE)

# Random forest (ranger) on every remaining column of the training frame.
rf.lat2 <- train(LATITUDE ~ .,
                 data = training.RF.lat2,
                 method = "ranger",
                 trControl = fitControl,
                 preProcess = c("zv", "medianImpute"))

# registerDoSEQ() takes no arguments; passing the cluster raises an
# "unused argument" error.
stopCluster(cluster)
registerDoSEQ()
rm(cluster)

# Hold-out performance (RMSE, Rsquared, MAE).
prediction.RF.lat2 <- predict(rf.lat2, test.RF.lat2)
error.rf.lat2 <- postResample(prediction.RF.lat2, test.RF.lat2$LATITUDE)
# RMSE= 3.67
# Rsquared =99%

Latitude ( Building 3)

# Latitude regression for building 3 (`build3.lat` is prepared elsewhere).
# 70/30 split partitioned on LATITUDE, seeded so the split is reproducible.
set.seed(123)
inTraining.lat3 <- createDataPartition(build3.lat$LATITUDE,
                                       p = .70,
                                       list = FALSE)

training.RF.lat3 <- build3.lat[inTraining.lat3, ]
test.RF.lat3 <- build3.lat[-inTraining.lat3, ]

# Create ONE worker pool. The previous code called makeCluster() twice and
# overwrote the first handle, leaving those workers running with no way to
# stop them. At least one worker is always requested.
numCores <- max(1L, detectCores() - 1L, na.rm = TRUE)
cluster <- makeCluster(numCores)
registerDoSNOW(cluster)

fitControl <- trainControl(method = "cv", number = 5, verboseIter = TRUE)

# Random forest (ranger) on every remaining column of the training frame.
rf.lat3 <- train(LATITUDE ~ .,
                 data = training.RF.lat3,
                 method = "ranger",
                 trControl = fitControl,
                 preProcess = c("zv", "medianImpute"))

# registerDoSEQ() takes no arguments; passing the cluster raises an
# "unused argument" error.
stopCluster(cluster)
registerDoSEQ()
rm(cluster)

# Hold-out performance (RMSE, Rsquared, MAE).
prediction.RF.lat3 <- predict(rf.lat3, test.RF.lat3)
error.rf.lat3 <- postResample(prediction.RF.lat3, test.RF.lat3$LATITUDE)
# RMSE = 3.40
# Rsquared= 0.98

Longitude

Tried kNN and Random Forest for each prediction. Random Forest gave me better results in terms of RMSE and Rsquared.

Longitude (Building 1)

# Longitude regression for building 1 (`build1.long` is prepared elsewhere).
# 70/30 split partitioned on LONGITUDE, seeded so the split is reproducible.
set.seed(123)
inTraining <- createDataPartition(build1.long$LONGITUDE,
                                  p = .70,
                                  list = FALSE)

training.RF.long1 <- build1.long[inTraining, ]
test.RF.long1 <- build1.long[-inTraining, ]

# Worker pool that leaves one core free but always has at least one worker.
cluster <- makeCluster(max(1L, detectCores() - 1L, na.rm = TRUE))
registerDoSNOW(cluster)

fitControl <- trainControl(method = "cv", number = 5, verboseIter = TRUE)

# Random forest (ranger) on every remaining column of the training frame.
rf.long1 <- train(LONGITUDE ~ .,
                  data = training.RF.long1,
                  method = "ranger",
                  trControl = fitControl,
                  preProcess = c("zv", "medianImpute"))

# registerDoSEQ() takes no arguments; passing the cluster raises an
# "unused argument" error.
stopCluster(cluster)
registerDoSEQ()
rm(cluster)

# Hold-out performance (RMSE, Rsquared, MAE).
prediction.RF.long1 <- predict(rf.long1, test.RF.long1)
error.rf.long1 <- postResample(prediction.RF.long1, test.RF.long1$LONGITUDE)
# RMSE= 3,14
# Rsquared= 0.98

Longitude (Building 2)

# Longitude regression for building 2 (`build2.long` is prepared elsewhere).
# 70/30 split partitioned on LONGITUDE, seeded so the split is reproducible.
set.seed(123)
inTraining <- createDataPartition(build2.long$LONGITUDE,
                                  p = .70,
                                  list = FALSE)

training.RF.long2 <- build2.long[inTraining, ]
test.RF.long2 <- build2.long[-inTraining, ]

# Worker pool that leaves one core free but always has at least one worker.
cluster <- makeCluster(max(1L, detectCores() - 1L, na.rm = TRUE))
registerDoSNOW(cluster)

fitControl <- trainControl(method = "cv", number = 5, verboseIter = TRUE)

# Random forest (ranger) on every remaining column of the training frame.
rf.long2 <- train(LONGITUDE ~ .,
                  data = training.RF.long2,
                  method = "ranger",
                  trControl = fitControl,
                  preProcess = c("zv", "medianImpute"))

# registerDoSEQ() takes no arguments; passing the cluster raises an
# "unused argument" error.
stopCluster(cluster)
registerDoSEQ()
rm(cluster)

# Hold-out performance (RMSE, Rsquared, MAE).
prediction.RF.long2 <- predict(rf.long2, test.RF.long2)
error.rf.long2 <- postResample(prediction.RF.long2, test.RF.long2$LONGITUDE)
# RMSE= 4.10
# Rsquared= 0,99

Longitude (Building 3)

# Longitude regression for building 3 (`build3.long` is prepared elsewhere).
# 70/30 split partitioned on LONGITUDE, seeded so the split is reproducible.
set.seed(123)
inTraining <- createDataPartition(build3.long$LONGITUDE,
                                  p = .70,
                                  list = FALSE)

training.RF.long3 <- build3.long[inTraining, ]
test.RF.long3 <- build3.long[-inTraining, ]

# Worker pool that leaves one core free but always has at least one worker.
cluster <- makeCluster(max(1L, detectCores() - 1L, na.rm = TRUE))
registerDoSNOW(cluster)

fitControl <- trainControl(method = "cv", number = 5, verboseIter = TRUE)

# Random forest (ranger) on every remaining column of the training frame.
rf.long3 <- train(LONGITUDE ~ .,
                  data = training.RF.long3,
                  method = "ranger",
                  trControl = fitControl,
                  preProcess = c("zv", "medianImpute"))

# registerDoSEQ() takes no arguments; passing the cluster raises an
# "unused argument" error.
stopCluster(cluster)
registerDoSEQ()
rm(cluster)

# Hold-out performance (RMSE, Rsquared, MAE).
prediction.RF.long3 <- predict(rf.long3, test.RF.long3)
error.rf.long3 <- postResample(prediction.RF.long3, test.RF.long3$LONGITUDE)
# RMSE= 5,36
# Rsquared= 0,98

Conclusions

We have seen that some WAPs were probably moved to different buildings between the creation of the two datasets. This really worsens the predictions of the models, especially for Building 2 and Building 3. Nevertheless, the predictions of the models are very accurate and quite representative. However, we have to take into account that we mixed both datasets (training and validation) to create the models, because the training data lacked representativity. Hence, it is difficult to gauge whether our models suffer from overfitting.