GeoLife Project

In this project I set the goal of predicting a single user's geolocation for a given weekday and a specific time. I gathered a geolocation dataset from the Microsoft GeoLife site: https://www.microsoft.com/en-us/download/details.aspx?id=52367&from=https%3A%2F%2Fresearch.microsoft.com%2Fen-us%2Fdownloads%2Fb16d359d-d164-469e-9fd4-daa38f2b2e13%2F

Let’s start with the GeoLife data crawler. Note: the crawler depends on the location of the extracted zip folder.

# GeoLife crawler: read every trajectory file of a small sample of users and
# stack them into one data frame, `GeoLocation`, which is also written to disk.
# NOTE: both paths below depend on where the GeoLife zip archive was extracted.
setwd("C:/GeoLife/Geolife Trajectories 1.3/Geolife Trajectories 1.3/Data")
folderList <- dir()

# Only a small sample is read: the 3rd to 5th entries of the folder listing.
# Positions beyond the end of the listing are skipped instead of turning into
# "NA" folder names.
sampleFolders <- folderList[intersect(3:5, seq_along(folderList))]

# Read all trajectory files of one user folder; returns a list of data frames,
# each tagged with the folder name in a `userID` column.
readUserTrajectories <- function(userFolder) {
  path <- paste0("./", userFolder, "/Trajectory")
  print(path)
  lapply(dir(path), function(trajectoryFile) {
    fileName <- paste0(path, "/", trajectoryFile)
    print(fileName)
    # Skip the first 6 rows of each file; fields are comma-separated.
    # NOTE(review): read.csv2() defaults to dec = ",", so values written with a
    # decimal point are not parsed as numbers here - confirm this is intended.
    userData <- read.csv2(file = fileName, skip = 6, sep = ",", header = FALSE)
    userData <- userData[, -3]
    userData["userID"] <- userFolder
    userData
  })
}

# Collect every file first and bind once: calling rbind() inside the loop
# re-copies the whole accumulated data frame for each file.
trajectories <- unlist(lapply(sampleFolders, readUserTrajectories), recursive = FALSE)
if (length(trajectories) == 0) {
  stop("No trajectory files were found in the sampled user folders", call. = FALSE)
}
GeoLocation <- do.call(rbind, trajectories)

# Remove unnecessary columns
GeoLocation <- GeoLocation[, -c(3:4)] # please validate beforehand which columns to remove
GeoLocation["weekday"] <- 0 # placeholder, filled in once dates are converted
colnames(GeoLocation) <- c("Lat", "Long", "Date", "Time", "userID", "weekday")
setwd("C:/GeoLife")
write.csv2(GeoLocation, file = "GeoLocation.csv")

We will check the dataset's header & length.

# Preview the first six rows, then report how many rows were collected.
head(GeoLocation, n = 6L)
nrow(GeoLocation)

We will check whether any NA values exist.

# Total number of missing cells across all columns of the data set.
naMask <- is.na(GeoLocation)
sum(naMask)

Since the dataset is large, we will reduce it as follows: - user: 004 - Date: on or after 2009-01-01 - In the next step we’ll focus on a single weekday: Thursday

# Reduce the data set to a single user (folder "004") and to records dated on
# or after 2009-01-01. The weekday column is left untouched here; it is filled
# in from the dates in a later step.
subsetData <- subset(GeoLocation, userID == "004")
subsetData$Date <- as.Date(subsetData$Date)
subsetData <- subset(subsetData, Date >= as.Date("2009-01-01"))

Let’s convert date into weekday.

# Derive the weekday name of every record and keep only Thursdays.
Sys.setlocale("LC_TIME", "C") # so weekday names are not written in the native language
subsetData$Date <- as.Date(subsetData$Date)
# weekdays() is vectorised: one call converts the whole column, which also
# works for an empty subset and avoids printing one line per record.
subsetData$weekday <- weekdays(subsetData$Date)
subsetData <- subset(subsetData, weekday == "Thursday")

Testing a single user (i.e., user 004) for machine learning.

Here we test whether the user's location can be predicted for a specific time & weekday. 1. We will use Random Forest 2. Training set: 90% 3. Test set: 10% (matching the `p=0.9` used in the partitioning code below)

Since the ML model cannot predict 2 target variables at once, Y1 and Y2 (Latitude & Longitude), we will split the user's locations into 100 areas & label them. In this case the ML model only needs to predict a single Y (the area, or cluster). Now we will split the user dataset into 2 datasets: the 1st (subsetData) will contain Date, Time, weekday & cluster. The 2nd (ClusterData) will contain the original Lat. & Long. data plus the k-means result. For this, we will use k-means to create 100 distinct clusters (areas).

# Cluster the user's positions into (up to) 100 areas with k-means.
# PolyLocation keeps the coordinate columns as stored in subsetData;
# ClusterData becomes the kmeans() result.
PolyLocation <- subsetData[, c("Lat", "Long")]

# Build an explicit numeric matrix so the clustering does not depend on how
# the coordinate columns were parsed (character, factor or numeric).
coordMatrix <- cbind(
  Lat = as.numeric(as.character(subsetData$Lat)),
  Long = as.numeric(as.character(subsetData$Long))
)
rownames(coordMatrix) <- rownames(subsetData)

# kmeans() stops when asked for more centres than there are distinct points,
# and its default algorithm rejects a centre count that is not below the
# number of rows, so the requested 100 centres are capped accordingly.
nCenters <- min(100L, nrow(unique(coordMatrix)), nrow(coordMatrix) - 1L)
if (nCenters < 1L) {
  stop("Not enough location records to build clusters", call. = FALSE)
}
ClusterData <- kmeans(coordMatrix, centers = nCenters)
#ClusterData$centers

Let’s align the cluster labels between the 2 datasets.

# Copy the k-means assignment of every row into both data sets. The ids are
# stored as plain (unnamed) doubles.
clusterIds <- as.numeric(ClusterData$cluster)
subsetData$Cluster <- clusterIds
PolyLocation$Cluster <- clusterIds

# subsetData drops its first two columns; PolyLocation keeps its first three.
subsetData <- subsetData[, -(1:2)]
PolyLocation <- PolyLocation[, 1:3]

Machine Learning

We will split the dataset into 2 partitions: training & test. However: 1. If the sample is too small, the user may have only a single cluster, hence ML will fail. 2. If the dataset is too big, ML will fail or PC performance will be bad. 3. To make ML easier, the dataset needs to be very “narrow”, i.e. specific.

library(caret)
# Build the modelling sample from subsetData and split it into training and
# test partitions.
samplingTest <- subsetData[, -c(3, 4)] # reduce columns: drop the 3rd and 4th
# Convert Time to a simple "HH:MM" format. strptime() and format() are
# vectorised, so the whole column is converted in one call (this also works
# when the column is empty).
samplingTest$Time <- format(
  strptime(as.character(samplingTest$Time), "%H:%M"),
  "%H:%M"
)
# Zero-padded "HH:MM" strings sort chronologically, so a plain string
# comparison selects the time window.
samplingTest <- subset(samplingTest, Time >= "10:33" & Time <= "11:37")
# 90% of the rows go to the training partition, the rest to the test partition.
Training_cv <- createDataPartition(samplingTest$Cluster, p = 0.9, list = FALSE)
Train_cv <- samplingTest[Training_cv, ]
Test_cv <- samplingTest[-Training_cv, ]

Pre-validation for ML.

# Report whether the training partition holds enough distinct clusters for
# the model to be trained.
nTrainClusters <- length(unique(Train_cv$Cluster))
validationMessage <- if (nTrainClusters == 0) {
  "Data is missing or corput"
} else if (nTrainClusters == 1) {
  "Only 1 cluster found - ML will not run"
} else {
  "Data is OK - lets run ML"
}
print(validationMessage)

Run machine learning (Random Forest) to predict the user's cluster/area for a specific weekday & time. Note: if only 1 cluster exists, ML will fail.

library(caret)
library(randomForest)
# Fit a random forest on the training partition using 3-fold cross-validation.
MyTrainControl2 <- trainControl(method = "cv", number = 3)
modFit2 <- train(
  Cluster ~ .,
  data = Train_cv,
  method = "rf",
  trControl = MyTrainControl2
)
modFit2

# Predict the test partition and keep the single row with the highest
# predicted value, alongside the cluster recorded for that test row.
pred <- predict(modFit2, newdata = Test_cv, type = "raw")
PredictionTable <- data.frame(pred, Test_cv$Cluster)
topRow <- head(order(PredictionTable$pred, decreasing = TRUE), 1)
PredictionTable <- PredictionTable[topRow, ]
PredictionTable

Now we need to remove all abnormal locations from our data. We will check the distance of each element from its cluster centroid.

library(sp) # spDistsN1() comes from sp, so it must be attached before the call below

# Centroid (columns 1 and 2 of the k-means centres) of the cluster recorded in
# PredictionTable, as a 1 x 2 matrix.
selectedCluster <- PredictionTable$Test_cv.Cluster
distFromClustcenter <- cbind(
  ClusterData$centers[selectedCluster, 1],
  ClusterData$centers[selectedCluster, 2]
)

# All stored points that belong to that cluster, as a numeric (Lat, Long) matrix.
PolyData <- subset(PolyLocation, PolyLocation$Cluster == selectedCluster)
DistElement <- cbind(
  as.numeric(as.character(PolyData$Lat)),
  as.numeric(as.character(PolyData$Long))
)

# Distance of every point from the centroid.
dis1 <- spDistsN1(DistElement, distFromClustcenter)
clusterMean <- mean(dis1)
clusterSD <- sd(dis1)

# Keep only the points whose distance lies within one standard deviation of
# the mean distance. sd() is NA for a single point; in that case no point can
# be called abnormal, so everything is kept.
withinOneSD <- if (is.na(clusterSD)) {
  rep(TRUE, length(dis1))
} else {
  dis1 <= clusterMean + clusterSD & dis1 >= clusterMean - clusterSD
}
# drop = FALSE keeps a two-column matrix even when a single point remains.
DistElement <- DistElement[withinOneSD, , drop = FALSE]

Create spatial rectangle.

library(sp)
# Wrap the retained points as one ring, put it in a Polygons object with ID 1,
# and build the SpatialPolygons object from it.
ring <- Polygon(DistElement)
ringSet <- Polygons(list(ring), 1)
sps <- SpatialPolygons(list(ringSet))
sps

Plotting the rectangle on a map.

library(leaflet)
# Bounding box of the polygon. sp::bbox() returns a 2 x 2 matrix with rows
# (x, y) and columns (min, max); it is used instead of xmin()/ymin()/xmax()/
# ymax(), which none of the packages attached in this file provide.
spsBox <- sp::bbox(sps)
# The rectangle takes its latitudes from the x row and its longitudes from the
# y row (assumes the polygon was built with latitude as the first coordinate).
map <- leaflet() %>% addTiles() %>%
  addRectangles(
    lng1 = spsBox[2, 1], lat1 = spsBox[1, 1],
    lng2 = spsBox[2, 2], lat2 = spsBox[1, 2],
    fillColor = "transparent"
  )
map

Save the map object & convert it into a PNG file.

library(htmlwidgets)
library(webshot)
# Write the widget to an HTML file in the working directory, then take a
# screenshot of that page as a PNG.
htmlFile <- "map.html"
pngFile <- "map.png"
saveWidget(map, file = htmlFile)
webshot(htmlFile, file = pngFile, cliprect = "viewport")

Send PNG as attachment via Gmail.

library(gmailr)
# Build the message step by step: recipient, sender and HTML body first ...
html_msg <- mime()
html_msg <- to(html_msg, "<receivr>@gmail.com")
html_msg <- from(html_msg, "<sender>@gmail.com")
html_msg <- html_body(html_msg, "<b>Gmailr</b>")

# ... then add the subject line and attach the PNG, and send the result.
file_attachment <- subject(
  html_msg,
  "Geolocation Prediction: userID: 4, weekday: Thursday, Time: 10:33 - 11:37"
)
file_attachment <- attach_file(file_attachment, "map.png")
send_message(file_attachment)