library(sf, quietly = TRUE)
## Linking to GEOS 3.6.1, GDAL 2.2.3, PROJ 4.9.3
library(randomForest, quietly = TRUE)
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.

Prerequisites

Load data and set up training/test sets

# Read the dark-object polygons produced by the manual ArcMap classification.
# The path is relative to the working directory of the R session.
dark_objects_original <- st_read(
  dsn = "../gis_projects/gsd_shoreline_processing/calculate_geometry_dark_objects.shp"
)
## Reading layer `calculate_geometry_dark_objects' from data source `C:\repositories\ISR-Maritime-Analytics\analytics\gis_projects\gsd_shoreline_processing\calculate_geometry_dark_objects.shp' using driver `ESRI Shapefile'
## Simple feature collection with 727 features and 12 fields
## geometry type:  POLYGON
## dimension:      XY
## bbox:           xmin: 53.95172 ymin: 24.41852 xmax: 56.57474 ymax: 25.77184
## epsg (SRID):    4326
## proj4string:    +proj=longlat +datum=WGS84 +no_defs
# Work on a copy so the raw import stays untouched, and fold class 7 into
# class 9. Class 7 rows are reserved for additional analysis: they should be
# extracted and joined to the MMSI ship data tables.
dark_objects_edit1 <- dark_objects_original
dark_objects_edit1$mmsi_suprv <- replace(
  dark_objects_edit1$mmsi_suprv,
  dark_objects_edit1$mmsi_suprv == 7,
  9
)

# Split the data 75/25 into a training set and a test set.
# The seed is fixed so that the random split is reproducible.
set.seed(11)
smp_siz <- floor(0.75 * nrow(dark_objects_edit1))
train_ind <- sample(seq_len(nrow(dark_objects_edit1)), size = smp_siz)
train <- dark_objects_edit1[train_ind, ]
# Pick the test rows as the explicit complement of the training rows.
# Indexing with `-train_ind` selects zero rows when `train_ind` is empty,
# which would silently leave the test set empty for a very small input.
test_ind <- setdiff(seq_len(nrow(dark_objects_edit1)), train_ind)
test <- dark_objects_edit1[test_ind, ]

Generate a basic randomForest model to predict ship-detection output categories.

# Remove the geometry before modelling because it upsets randomForest.
# st_set_geometry(x, NULL) drops the active geometry column whatever its name;
# the guard keeps the step safe to re-run once `train` is a plain data frame.
if (inherits(train, "sf")) {
  train <- st_set_geometry(train, NULL)
}
# Ensure the response/target variable is a factor (categorical response).
train$mmsi_suprv <- as.factor(train$mmsi_suprv)
# Junk columns from ArcGIS and elsewhere. Only the response variable and the
# useful predictor variables are kept.
exclude_cols <- c(
  "area_sqkm", "size_meter", "sat_id", "series_id", "Shape_Leng",
  "Shape_Area", "mmsi", "NEAR_FID", "geometry", "OBJECTID"
)

# model1 with the kitchen sink: every non-excluded column is a predictor.
# NOTE(review): `type` does not appear among randomForest()'s documented
# arguments and is presumably swallowed by `...`; kept unchanged, confirm
# before removing.
model1 <- randomForest(
  mmsi_suprv ~ .,
  data = train[!names(train) %in% exclude_cols],
  type = "classification",
  importance = TRUE
)

# model2 remove "junk"
# tbd

# Save models once you are happy with them and they are statistically sound.
# saveRDS(model1, "model1.rds")

Check results against test data

# Predict the class of every object in the held-out test set with model1.
pred1 <- predict(object = model1, newdata = test)

Analyze model and results

# Predictor selection analysis: plot the importance of each model variable.
varImpPlot(x = model1)

# Training confusion matrix stored on the fitted model (randomForest derives
# it from the out-of-bag predictions).
model1[["confusion"]]
##     0  1  8   9 class.error
## 0 117  0 16   2   0.1333333
## 1   0 20  4  16   0.5000000
## 8  20  4 19  25   0.7205882
## 9   8 17  8 269   0.1092715
# Test confusion matrix: rows are the predicted classes, columns are the
# classes recorded in the test set. The recorded output shows a class 2 column
# with no matching row, i.e. a class the model never predicts.
table(pred1, test[["mmsi_suprv"]])
##      
## pred1  0  1  2  8  9
##     0 44  0  0  9  5
##     1  0  7  0  3  2
##     8  1  1  0  5 10
##     9  0  6  1  5 83