Image exploration and supervised classification of satellite images

Summary

This work aims to demonstrate the possibilities of image exploration and analysis of satellite images. For the work Landsat images were used that show the area of Concord and Stockton in California, USA. The Landsat program is the longest-running programm providing satellite images of earth and the latest Landsat satellite (Landsat 8) launched in 2013.

Data

The data used for this project were multiband satellite images, downloaded and saved as .tif files. Multispectral images capture image data within specific wavelength ranges across the electromagnetic spectrum. Covered spectral bands are usually:

Blue for atmosphere and deep water imaging, Green for imaging vegetation and deep water structures, Red for imaging man-made objects, soil and vegetation, Near infrared (NIR), Mid-infrared (MIR), Far-infrared (FIR) and Thermal infrared.

As reference data, the National Land Cover Database (NLCD) was used which is based primarily on a decision-tree classification of circa 2011 Landsat data. The data is used to train the classifier. The classification is made with a binary classification algorithm which gives root nodes that represent a single input variable (x) and a split point on that variable. The corresponding land use land cover names come from the classdf dataframe.

library(raster)
## Loading required package: sp
nlcd <- brick("c://Users//david//Desktop//rsdata//rs//nlcd-L1.tif")
names(nlcd) <- c("nlcd2001", "nlcd2011")
nlcd
## class      : RasterBrick 
## dimensions : 1230, 1877, 2308710, 2  (nrow, ncol, ncell, nlayers)
## resolution : 0.0002694946, 0.0002694946  (x, y)
## extent     : -121.9258, -121.42, 37.85402, 38.1855  (xmin, xmax, ymin, ymax)
## crs        : +proj=longlat +datum=WGS84 +no_defs +ellps=WGS84 +towgs84=0,0,0 
## source     : c:/Users/david/Desktop/rsdata/rs/nlcd-L1.tif 
## names      : nlcd2001, nlcd2011 
## min values :        1,        1 
## max values :        9,        9
# The class names and colors for plotting
nlcdclass <- c("Water", "Developed", "Barren", "Forest", "Shrubland", "Herbaceous", "Planted/Cultivated", "Wetlands")
classdf <- data.frame(classvalue1 = c(1,2,3,4,5,7,8,9), classnames1 = nlcdclass)
# Hex codes of colors
classcolor <- c("#5475A8", "#B50000", "#D2CDC0", "#38814E", "#AF963C", "#D1D182", "#FBF65D", "#C8E6F8")
# Now we ratify (RAT = "Raster Attribute Table") the ncld2011 (define RasterLayer as a categorical variable). This is helpful for plotting.
nlcd2011 <- nlcd[[2]]
nlcd2011 <- ratify(nlcd2011)
rat <- levels(nlcd2011)[[1]]
#
rat$landcover <- nlcdclass
levels(nlcd2011) <- rat
set.seed(99)
samp2011 <- sampleStratified(nlcd2011, size = 200, na.rm = TRUE, sp = TRUE)
samp2011
## class       : SpatialPointsDataFrame 
## features    : 1600 
## extent      : -121.9257, -121.4207, 37.85415, 38.18536  (xmin, xmax, ymin, ymax)
## crs         : +proj=longlat +datum=WGS84 +no_defs +ellps=WGS84 +towgs84=0,0,0 
## variables   : 2
## names       :    cell, nlcd2011 
## min values  :     829,        1 
## max values  : 2308569,        9
table(samp2011$nlcd2011)
## 
##   1   2   3   4   5   7   8   9 
## 200 200 200 200 200 200 200 200
library(rasterVis)
## Loading required package: lattice
## Loading required package: latticeExtra
## Loading required package: RColorBrewer
plt <- levelplot(nlcd2011, col.regions = classcolor, main = 'Distribution of Training Sites')
print(plt + layer(sp.points(samp2011, pch = 3, cex = 0.5, col = 1)))

landsat5 <- stack("c://Users//david//Desktop//rsdata//rs//centralvalley-2011LT5.tif")
names(landsat5) <- c('blue', 'green', 'red', 'NIR', 'SWIR1', 'SWIR2')
sampvals <- extract(landsat5, samp2011, df = TRUE)
sampvals <- sampvals[, -1]
sampdata <- data.frame(classvalue = samp2011@data$nlcd2011, sampvals)
library(rpart)
cart <- rpart(as.factor(classvalue)~., data=sampdata, method = 'class', minsplit = 5)
plot(cart, uniform=TRUE, main="Classification Tree")
text(cart, cex = 0.8)

pr2011 <- predict(landsat5, cart, type='class')
pr2011
## class      : RasterLayer 
## dimensions : 1230, 1877, 2308710  (nrow, ncol, ncell)
## resolution : 0.0002694946, 0.0002694946  (x, y)
## extent     : -121.9258, -121.42, 37.85402, 38.1855  (xmin, xmax, ymin, ymax)
## crs        : +proj=longlat +datum=WGS84 +no_defs +ellps=WGS84 +towgs84=0,0,0 
## source     : memory
## names      : layer 
## values     : 1, 9  (min, max)
## attributes :
##        ID value
##  from:  1     1
##   to :  8     9
pr2011 <- ratify(pr2011)
rat <- levels(pr2011)[[1]]
rat$legend <- classdf$classnames
levels(pr2011) <- rat
levelplot(pr2011, maxpixels = 1e6,
          col.regions = classcolor,
          scales=list(draw=FALSE),
          main = "Decision Tree classification of Landsat 5")

library(dismo)
set.seed(99)
j <- kfold(sampdata, k = 5, by=sampdata$classvalue)
table(j)
## j
##   1   2   3   4   5 
## 320 320 320 320 320
x <- list()
for (k in 1:5) {
    train <- sampdata[j!= k, ]
    test <- sampdata[j == k, ]
    cart <- rpart(as.factor(classvalue)~., data=train, method = 'class', minsplit = 5)
    pclass <- predict(cart, test, type='class')
    # create a data.frame using the reference and prediction
    x[[k]] <- cbind(test$classvalue, as.integer(pclass))
}
y <- do.call(rbind, x)
y <- data.frame(y)
colnames(y) <- c('observed', 'predicted')
conmat <- table(y)
# change the name of the classes
colnames(conmat) <- classdf$classnames
rownames(conmat) <- classdf$classnames
conmat
##                     predicted
## observed             Water Developed Barren Forest Shrubland Herbaceous
##   Water                172         6      1      3         3          1
##   Developed              8        93     22     10         8         37
##   Barren                 6        54     52      6         8         62
##   Forest                 0         3      2    102        49          8
##   Shrubland              0         8      1     52       105         12
##   Herbaceous             1        16     24      4        16        121
##   Planted/Cultivated     2        20      5     24        37         24
##   Wetlands              14        12      1     32        25         16
##                     predicted
## observed             Planted/Cultivated Wetlands
##   Water                               1       13
##   Developed                          20        2
##   Barren                              5        7
##   Forest                             13       23
##   Shrubland                          17        5
##   Herbaceous                         17        1
##   Planted/Cultivated                 81        7
##   Wetlands                           32       68
n <- sum(conmat)
diag <- diag(conmat)
OA <- sum(diag) / n
OA
## [1] 0.49625
# observed (true) cases per class
rowsums <- apply(conmat, 1, sum)
p <- rowsums / n
# predicted cases per class
colsums <- apply(conmat, 2, sum)
q <- colsums / n
expAccuracy <- sum(p*q)
kappa <- (OA - expAccuracy) / (1 - expAccuracy)
kappa
## [1] 0.4242857
## [1] 0.4257143
# Producer accuracy
PA <- diag / colsums
# User accuracy
UA <- diag / rowsums
outAcc <- data.frame(producerAccuracy = PA, userAccuracy = UA)
outAcc