In this project I set a goal of predicting a single user's geolocation by weekday and specific time. I gathered a geolocation dataset from the Microsoft GeoLife site: https://www.microsoft.com/en-us/download/details.aspx?id=52367&from=https%3A%2F%2Fresearch.microsoft.com%2Fen-us%2Fdownloads%2Fb16d359d-d164-469e-9fd4-daa38f2b2e13%2F
Let’s start with the GeoLife data crawler. Note: the crawler depends on the location of the unzipped folder.
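For reference, each .plt trajectory file begins with six header lines, followed by one record per GPS fix in the form lat,long,0,altitude,days-since-1899,date,time (per the GeoLife user guide), which is why the code below skips 6 rows and drops the constant third field. A record looks roughly like this:
39.984702,116.318417,0,492,39744.1201851852,2008-10-23,02:53:04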
setwd("C:/GeoLife/Geolife Trajectories 1.3/Geolife Trajectories 1.3/Data")
GeoLocation <- data.frame()
folderList <- dir()
# Grab just a small sample: user folders 3 to 5
for (j in 3:5){
  path <- paste("./", folderList[j], "/Trajectory", sep = "")
  print(path)
  filesList <- dir(path)
  for (i in 1:length(filesList)){
    fileName <- paste(path, "/", filesList[i], sep = "")
    print(fileName)
    userData <- read.csv(file = fileName, skip = 6, header = FALSE) # skip the 6 header rows of each .plt file; read.csv (not read.csv2) so '.'-decimal coordinates parse as numeric
    userData <- userData[, -3] # drop the unused all-zeros field
    userData["userID"] <- folderList[j]
    GeoLocation <- rbind(GeoLocation, userData)
  }
}
#Remove unnecessary columns (here: the altitude and day-count fields; validate the positions before running)
GeoLocation <- GeoLocation[, -c(3:4)]
GeoLocation["weekday"] <- 0
colnames(GeoLocation) <- c("Lat", "Long", "Date", "Time", "userID", "weekday")
setwd("C:/GeoLife")
write.csv(GeoLocation, file = "GeoLocation.csv") # plain comma-separated output, matching read.csv above
We will check the dataset head and length.
head(GeoLocation)
length(GeoLocation$Lat)
We will check whether any NA values exist.
sum(is.na(GeoLocation))
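If the check returned a non-zero count, we could drop the affected rows before continuing; a minimal sketch using base R:
GeoLocation <- na.omit(GeoLocation) # drop rows containing NA (only needed if the sum above is > 0)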
Since the dataset is big enough, we will reduce it as follows:
- user: 004
- Date: from 2009-01-01 onward
- In the next step we’ll focus on one weekday: Thursday
subsetData <- subset(GeoLocation, userID == "004")
subsetData$Date <- as.Date(subsetData$Date)
subsetData <- subset(subsetData, Date >= as.Date("2009-01-01"))
Let’s convert each date into a weekday name.
Sys.setlocale("LC_TIME", "C") # ensure weekday names come out in English, not the native locale
subsetData$weekday <- weekdays(subsetData$Date) # vectorized: no loop needed
subsetData <- subset(subsetData, weekday == "Thursday")
Next we will test predicting the user's location at a specific time & weekday:
1. We will use Random Forest.
2. Training set: 90% of the sample.
3. Test set: 10%.
Since the model cannot predict two target variables at once (Y1, Y2 = Latitude & Longitude), we will split the user's locations into 100 areas and label them; the model then only needs to predict a single Y (the area, or cluster). Now we will split the user dataset into 2 datasets: the 1st (subsetData) will contain Date, Time, weekday & cluster; the 2nd (ClusterData) will contain the original Lat. & Long. data plus the k-means result. For this, we will use k-means to create 100 distinct clusters (areas).
ClusterData <- subsetData[, c("Lat", "Long")] # coordinates only, as input for k-means
PolyLocation <- subsetData[, c("Lat", "Long")] # a coordinate copy, kept for the outlier/polygon step
set.seed(1234) # k-means starts are random; fix the seed so the clusters are reproducible
ClusterData <- kmeans(ClusterData, centers = 100)
#ClusterData$centers
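As a quick sanity check on the clustering, the kmeans object reports the number of points per cluster; heavily unbalanced sizes would hint that 100 centers is too many for this subset:
head(sort(ClusterData$size, decreasing = TRUE)) # points per cluster, largest first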
Let’s align the cluster labels between the two datasets.
subsetData["Cluster"] <- 0
PolyLocation["Cluster"] <- 0
i <- 1
for (i in 1:length(ClusterData$cluster)){
subsetData$Cluster[i] <- ClusterData$cluster[i]
PolyLocation$Cluster[i] <- ClusterData$cluster[i]
i <- i + 1
}
subsetData <- subsetData[, -c(1:2)] # drop Lat & Long; keep Date, Time, userID, weekday, Cluster
PolyLocation <- PolyLocation[, c(1:3)] # keep Lat, Long, Cluster
We will split the dataset into 2 partitions: training & test. However:
1. If the sample is too small, the user may have only a single cluster, and the model will fail.
2. If the dataset is too big, training will fail or PC performance will suffer.
3. To make learning easier, the dataset needs to be very “narrow”, i.e. specific.
library(caret)
samplingTest <- subsetData
samplingTest <- samplingTest[, -c(3, 4)] # drop userID & weekday (both constant in this subset)
samplingTest$Time <- as.character(samplingTest$Time)
# Reduce each timestamp to a simple HH:MM format (vectorized: no loop needed)
samplingTest$Time <- format(strptime(samplingTest$Time, '%H:%M'), '%H:%M')
samplingTest <- subset(samplingTest, Time >= "10:33" & Time <= "11:37")
samplingTest$Cluster <- factor(samplingTest$Cluster) # factor target, so the forest does classification rather than regression
Training_cv <- createDataPartition(samplingTest$Cluster, p = 0.9, list = FALSE)
Train_cv <- samplingTest[Training_cv, ]
Test_cv <- samplingTest[-Training_cv, ]
Pre-validation for the ML step.
if (length(unique(Train_cv$Cluster)) == 0){
  print("Data is missing or corrupt")
} else if (length(unique(Train_cv$Cluster)) == 1){
  print("Only 1 cluster found - ML will not run")
} else {
  print("Data is OK - let's run ML")
}
Run machine learning (Random Forest) to predict the user's cluster/area at a specific weekday & time. Note: if only one cluster is present, training will fail.
library(caret)
library(randomForest)
MyTrainControl2 <- trainControl(method = "cv", number = 3) # 3-fold cross-validation
modFit2 <- train(Cluster ~ ., data = Train_cv, method = "rf", trControl = MyTrainControl2)
modFit2
pred <- predict(modFit2, newdata = Test_cv, type = "raw")
PredictionTable <- data.frame(pred, Test_cv$Cluster)
PredictionTable <- PredictionTable[order(PredictionTable$pred, decreasing = TRUE), ]
PredictionTable <- head(PredictionTable, 1) # keep a single prediction row for the map step
PredictionTable
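Beyond this single row, caret's confusionMatrix gives the overall test-set accuracy; both arguments are factors with the same levels, since they derive from the same column:
confusionMatrix(pred, Test_cv$Cluster)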
Now we need to remove all abnormal locations from our data. We will check the distance from the cluster centroid for each element.
library(sp) # provides spDistsN1
predCluster <- as.integer(as.character(PredictionTable$Test_cv.Cluster)) # factor -> original cluster id
distFromClustcenter <- cbind(ClusterData$centers[predCluster, 1], ClusterData$centers[predCluster, 2])
PolyData <- subset(PolyLocation, Cluster == predCluster)
DistElement <- cbind(PolyData$Lat, PolyData$Long) # already numeric thanks to read.csv above
dis1 <- spDistsN1(DistElement, distFromClustcenter) # distance of every point from the centroid
DistElement <- cbind(DistElement, dis1)
clusterMean <- mean(DistElement[, 3])
clusterSD <- sd(DistElement[, 3])
# Keep only points within one standard deviation of the mean centroid distance
DistElement <- DistElement[DistElement[, 3] <= clusterMean + clusterSD, ]
DistElement <- DistElement[DistElement[, 3] >= clusterMean - clusterSD, ]
DistElement <- DistElement[, -3] # drop the helper distance column
Create a spatial polygon from the remaining points; its bounding box gives us the rectangle to draw.
library(sp)
sps <- SpatialPolygons(list(Polygons(list(Polygon(DistElement)), 1)))
sps
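For a quick offline look at the cleaned area before building the interactive map, sp's base plot method works on the polygon object:
plot(sps, axes = TRUE)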
Plotting the rectangle on a map.
library(leaflet)
library(raster) # provides xmin/xmax/ymin/ymax for Spatial objects
map <- leaflet() %>% addTiles() %>%
  addRectangles(
    lng1 = ymin(sps), lat1 = xmin(sps), # in our coordinate matrix x = Lat and y = Long
    lng2 = ymax(sps), lat2 = xmax(sps),
    fillColor = "transparent"
  )
map
Save the map object & convert it into a PNG file.
library(htmlwidgets)
library(webshot)
saveWidget(map, file="map.html")
webshot("map.html", file = "map.png", cliprect = "viewport")
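Note: webshot renders the HTML through PhantomJS; if it is not installed yet, the package ships a one-time installer:
webshot::install_phantomjs()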
Send the PNG as an attachment via Gmail.
library(gmailr)
mime() %>%
to("<receivr>@gmail.com") %>%
from("<sender>@gmail.com") %>%
html_body("<b>Gmailr</b>") -> html_msg
html_msg %>%
subject("Geolocation Prediction: userID: 4, weekday: Thursday, Time: 10:33 - 11:37") %>%
attach_file("map.png") -> file_attachment
send_message(file_attachment)
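Note: send_message() only works after gmailr has been authorized against the Gmail API. With this (legacy) gmailr API that was typically done by pointing it at an OAuth client-secret JSON downloaded from the Google Developers Console; a sketch with a hypothetical filename:
use_secret_file("client_secret.json") # hypothetical path to your downloaded OAuth client secret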