# geospatial simple features (GDAL-backed) library
library(sf)
# machine learning libraries
library(caret)
library(randomForest)
# visualization libraries
library(ggplot2)
library(reshape)
library(factoextra)
library(viridis)
library(rattle)
# remove scientific notation
options(scipen=999)
The data below is used later to validate and classify dark objects.
The features are geocoded/georeferenced to accurate bounds, which is required both before generating dark object polygons and for validating ship detection algorithm results.
The polygon features were manually classified (supervised classification with spot checks) and carry the column mmsi_suprv with the categorical values 0, 1, 2, 7, 8, and 9 (no NULLs):
# load shapefile derived from manual ArcMap classification
dark_objects_original <- st_read("../gis_projects/gsd_shoreline_processing/calculate_geometry_dark_objects.shp")
## Reading layer `calculate_geometry_dark_objects' from data source `C:\repositories\ISR-Maritime-Analytics\analytics\gis_projects\gsd_shoreline_processing\calculate_geometry_dark_objects.shp' using driver `ESRI Shapefile'
## Simple feature collection with 727 features and 12 fields
## geometry type: POLYGON
## dimension: XY
## bbox: xmin: 53.95172 ymin: 24.41852 xmax: 56.57474 ymax: 25.77184
## epsg (SRID): 4326
## proj4string: +proj=longlat +datum=WGS84 +no_defs
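Before any edits, a quick sanity check (a minimal sketch) confirms the label distribution and the absence of NULLs:
# tabulate the supervised labels, surfacing NAs if any exist
table(dark_objects_original$mmsi_suprv, useNA = "ifany")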
# convert 7s to 9s; 7s flag objects for additional analysis and should be extracted and joined to MMSI ship data tables
dark_objects_edit1 <- dark_objects_original
dark_objects_edit1$mmsi_suprv[dark_objects_edit1$mmsi_suprv==7] <- 9
# drop the sf geometry list-column; it breaks randomForest
dark_objects_original <- st_drop_geometry(dark_objects_original)
dark_objects_edit1 <- st_drop_geometry(dark_objects_edit1)
dark_objects_edit1$sat_id <- NULL
meltData <- melt(dark_objects_edit1[ , -which(names(dark_objects_edit1) %in% c("mmsi","mmsi_suprv"))])
## Using as id variables
p <- ggplot(meltData, aes(factor(variable), value))
p + geom_boxplot() + facet_wrap(~variable, scales="free")
desc_stats_df <- dark_objects_edit1[ , -which(names(dark_objects_edit1) %in% c("mmsi","mmsi_suprv"))]
desc_stats <- data.frame(
  Min = apply(desc_stats_df, 2, min),     # minimum
  Med = apply(desc_stats_df, 2, median),  # median
  Mean = apply(desc_stats_df, 2, mean),   # mean
  SD = apply(desc_stats_df, 2, sd),       # standard deviation
  Max = apply(desc_stats_df, 2, max)      # maximum
)
round(desc_stats, 4)
The k-means clusters below show some similarity to the supervised classification, but this unsupervised method should not be applied to supervised learning in lieu of an expert-driven classification scheme.
set.seed(12)
n_center <- 4
# variables scaled so that differing units/ranges do not dominate the distance metric
km.res <- kmeans(scale(dark_objects_edit1[ ,!(colnames(dark_objects_edit1) == "mmsi_suprv")]), centers = n_center, nstart = 25)
fviz_cluster(km.res,
  data = dark_objects_edit1[ , !(colnames(dark_objects_edit1) == "mmsi_suprv")],
  palette = viridis_pal(option = "D")(n_center),
  ggtheme = theme_minimal(),
  main = "Partitioning Clustering Plot"
)
table(km.res$cluster,dark_objects_edit1$mmsi_suprv)
##
## 0 1 2 8 9
## 1 0 45 0 29 155
## 2 0 0 0 1 100
## 3 9 9 0 4 89
## 4 171 0 1 56 58
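To quantify the agreement suggested by this table, one option (a sketch, assuming the mclust package is available; it is not loaded above) is the adjusted Rand index between cluster assignments and supervised labels:
library(mclust)
# adjusted Rand index between k-means clusters and supervised labels
# (1 = perfect agreement, values near 0 = chance-level agreement)
adjustedRandIndex(km.res$cluster, dark_objects_edit1$mmsi_suprv)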
# https://beta.vu.nl/nl/Images/stageverslag-westerdijk_tcm235-912501.pdf
# To be determined - intermediate model
# Use identified mmsi values to extract estimates for targets
# use dark_objects_original
set.seed(2)
sample_size <- 1000
vessel_type_cat <- c("cargo_ship", "other", "dredging_vessel", "military_vessel", "fishing", "hsc", "passenger_ship", "harbour_vessel", "pleasure_craft", "tanker")
vessel_types <- data.frame(
  # factor() ensures a categorical response for randomForest classification
  "name" = factor(sample(vessel_type_cat, sample_size, replace = TRUE)),
  "size_meter" = runif(sample_size, min(dark_objects_original$size_meter), max(dark_objects_original$size_meter)),
  "short_axis" = runif(sample_size, min(dark_objects_original$short_axis), max(dark_objects_original$short_axis)),
  "long_axis" = runif(sample_size, min(dark_objects_original$long_axis), max(dark_objects_original$long_axis)),
  "fake_speed" = runif(sample_size, 1, 100)
)
# retrieve subset of classified AIS data
mmsi_hits <- dark_objects_original[ which(dark_objects_original$mmsi != 0), ]
mmsi_hits$fake_speed <- runif(nrow(mmsi_hits), 1, 100)
#https://rpubs.com/abdul_yunus/Kmeans_Clustering
mmsi_rf <- randomForest(name ~ ., data = vessel_types, type="classification", importance=TRUE)
pred2 = predict(mmsi_rf, newdata = mmsi_hits)
pred2
## 112 240 292 299 316
## passenger_ship passenger_ship pleasure_craft pleasure_craft hsc
## 323 393 438 440 466
## harbour_vessel military_vessel military_vessel military_vessel hsc
## 507
## pleasure_craft
## 10 Levels: cargo_ship dredging_vessel fishing harbour_vessel ... tanker
# Also need to determine the inverse relationship: AIS signals not associated with dark objects
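A minimal sketch of that inverse lookup, assuming a hypothetical ais_records data frame keyed by an mmsi column (no such table exists in this document):
# hypothetical AIS table: keep signals whose mmsi matches no dark object
ais_unmatched <- ais_records[!(ais_records$mmsi %in% dark_objects_original$mmsi), ]
nrow(ais_unmatched)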
Topics to address in model evaluation:
- Cross-validation
- Collinearity and correlation of predictors (a quick check is sketched below)
- OOB (out-of-bag) error
- High-variance predictors
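A quick sketch of the collinearity check, reusing the numeric predictor frame desc_stats_df built for the descriptive statistics above (the 0.9 cutoff is an arbitrary assumption):
# pairwise Pearson correlations among the numeric predictors
cor_mat <- cor(desc_stats_df)
round(cor_mat, 2)
# caret's heuristic: predictors recommended for removal above an |r| cutoff
caret::findCorrelation(cor_mat, cutoff = 0.9, names = TRUE)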
# clean up
# size_feet deselected - acts as a control
# azimuth also removed - acts as a false lead control
keep_cols <- c("mmsi_suprv", "size_meter", "near_shore","short_axis","long_axis","dim_ratio","near_platf","depth")
dark_objects_edit1 <- dark_objects_edit1[ names(dark_objects_edit1) %in% keep_cols ]
# seed for reproducibility
set.seed(1)
# split 75% train / 25% test
smp_siz <- floor(0.75 * nrow(dark_objects_edit1))
train_ind <- sample(seq_len(nrow(dark_objects_edit1)),size = smp_siz)
train <- dark_objects_edit1[train_ind,]
test <- dark_objects_edit1[-train_ind,]
# ensure the response/target variable is a factor (categorical response)
train$mmsi_suprv <- as.factor(train$mmsi_suprv)
# model1 with the kitchen sink
model1 <- randomForest(mmsi_suprv ~ ., data = train, type="classification", importance=TRUE)
# save models once they are validated and statistically sound
#saveRDS(model1,"model1.rds")
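The OOB (out-of-bag) error listed in the topics above can be read directly from the fitted forest; print() and plot() are standard randomForest methods:
# OOB error estimate and per-class confusion for the fitted forest
print(model1)
# OOB error rate as a function of the number of trees grown
plot(model1)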
# rfcv needs the response passed separately; drop class 2 (only a single sample in the data)
train_cv <- train
train_cv <- droplevels(train_cv[train_cv$mmsi_suprv != 2, ])
mmsi_suprv <- train_cv$mmsi_suprv
train_cv$mmsi_suprv <- NULL
# 10-fold cross-validation over a decreasing number of predictors
rf.cv1 <- rfcv(train_cv, mmsi_suprv, cv.fold = 10)
with(rf.cv1, plot(n.var, error.cv, type="b", col="red"))
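The numeric error rates behind the plot are stored on the rfcv result:
# cross-validated error rate for each number of variables tried
rf.cv1$error.cv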
model2 <- caret::train(mmsi_suprv ~ ., method = "rpart", data = train)
# save models once they are validated and statistically sound
#saveRDS(model2,"model2")
fancyRpartPlot(model2$finalModel, main="model2 rpart dendrogram")
# predictor selection analysis (model variable weight)
varImpPlot(model1, main="Model1 Variable Importance Plot randomForest", pch = 20 )
pred1 = predict(model1, newdata = test)
pred2 = predict(model2, newdata = test)
# test confusion matrix
mask.vals = factor(pred1, levels=c(0,1,2,8,9))
ref.data = factor(test$mmsi_suprv, levels=c(0,1,2,8,9))
confusionMatrix(mask.vals, ref.data)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1 2 8 9
## 0 50 0 0 4 0
## 1 0 13 0 1 0
## 2 0 0 0 0 0
## 8 0 0 0 8 1
## 9 0 0 0 10 95
##
## Overall Statistics
##
## Accuracy : 0.9121
## 95% CI : (0.8612, 0.9489)
## No Information Rate : 0.5275
## P-Value [Acc > NIR] : < 0.00000000000000022
##
## Kappa : 0.8541
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: 0 Class: 1 Class: 2 Class: 8 Class: 9
## Sensitivity 1.0000 1.00000 NA 0.34783 0.9896
## Specificity 0.9697 0.99408 1 0.99371 0.8837
## Pos Pred Value 0.9259 0.92857 NA 0.88889 0.9048
## Neg Pred Value 1.0000 1.00000 NA 0.91329 0.9870
## Prevalence 0.2747 0.07143 0 0.12637 0.5275
## Detection Rate 0.2747 0.07143 0 0.04396 0.5220
## Detection Prevalence 0.2967 0.07692 0 0.04945 0.5769
## Balanced Accuracy 0.9848 0.99704 NA 0.67077 0.9367
# test confusion matrix
mask.vals = factor(pred2, levels=c(0,1,2,8,9))
ref.data = factor(test$mmsi_suprv, levels=c(0,1,2,8,9))
confusionMatrix(mask.vals, ref.data)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1 2 8 9
## 0 49 0 0 10 4
## 1 0 13 0 1 0
## 2 0 0 0 0 0
## 8 0 0 0 0 0
## 9 1 0 0 12 92
##
## Overall Statistics
##
## Accuracy : 0.8462
## 95% CI : (0.7854, 0.8953)
## No Information Rate : 0.5275
## P-Value [Acc > NIR] : < 0.00000000000000022
##
## Kappa : 0.7415
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: 0 Class: 1 Class: 2 Class: 8 Class: 9
## Sensitivity 0.9800 1.00000 NA 0.0000 0.9583
## Specificity 0.8939 0.99408 1 1.0000 0.8488
## Pos Pred Value 0.7778 0.92857 NA NaN 0.8762
## Neg Pred Value 0.9916 1.00000 NA 0.8736 0.9481
## Prevalence 0.2747 0.07143 0 0.1264 0.5275
## Detection Rate 0.2692 0.07143 0 0.0000 0.5055
## Detection Prevalence 0.3462 0.07692 0 0.0000 0.5769
## Balanced Accuracy 0.9370 0.99704 NA 0.5000 0.9036
Model1 randomForest == 91% accuracy against the held-out test data
Model2 rpart == 85% accuracy against the held-out test data
Confusion matrix specificity is the true negative rate.
Confusion matrix sensitivity is the true positive rate.
Cross-validation error checks seem to point to poor variable selection; however, without adequate sample sizes and class distributions, that result is largely inconclusive.
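On the sample distribution point, a minimal check of class balance across the splits (class 2 is nearly absent, which explains the NA sensitivities above):
# class counts per split; heavy imbalance undermines per-class metrics
table(train$mmsi_suprv)
table(test$mmsi_suprv)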