Image exploration and supervised classification of satellite images
Summary
This work aims to demonstrate the possibilities of image exploration and analysis of satellite images. For the work Landsat images were used that show the area of Concord and Stockton in California, USA. The Landsat program is the longest-running programm providing satellite images of earth and the latest Landsat satellite (Landsat 8) launched in 2013.
Data
The data used for this project were multiband satellite images, downloaded and saved as .tif files. Multispectral images capture image data within specific wavelength ranges across the electromagnetic spectrum. Covered spectral bands are usually:
Blue for atmosphere and deep water imaging, Green for imaging vegetation and deep water structures, Red for imaging man-made objects, soil and vegetation, Near infrared (NIR), Mid-infrared (MIR), Far-infrared (FIR) and Thermal infrared.
As reference data, the National Land Cover Database (NLCD) was used which is based primarily on a decision-tree classification of circa 2011 Landsat data. The data is used to train the classifier. The classification is made with a binary classification algorithm which gives root nodes that represent a single input variable (x) and a split point on that variable. The corresponding land use land cover names come from the classdf dataframe.
library(raster)
## Loading required package: sp
nlcd <- brick("c://Users//david//Desktop//rsdata//rs//nlcd-L1.tif")
names(nlcd) <- c("nlcd2001", "nlcd2011")
nlcd
## class : RasterBrick
## dimensions : 1230, 1877, 2308710, 2 (nrow, ncol, ncell, nlayers)
## resolution : 0.0002694946, 0.0002694946 (x, y)
## extent : -121.9258, -121.42, 37.85402, 38.1855 (xmin, xmax, ymin, ymax)
## crs : +proj=longlat +datum=WGS84 +no_defs +ellps=WGS84 +towgs84=0,0,0
## source : c:/Users/david/Desktop/rsdata/rs/nlcd-L1.tif
## names : nlcd2001, nlcd2011
## min values : 1, 1
## max values : 9, 9
# The class names and colors for plotting
nlcdclass <- c("Water", "Developed", "Barren", "Forest", "Shrubland", "Herbaceous", "Planted/Cultivated", "Wetlands")
classdf <- data.frame(classvalue1 = c(1,2,3,4,5,7,8,9), classnames1 = nlcdclass)
# Hex codes of colors
classcolor <- c("#5475A8", "#B50000", "#D2CDC0", "#38814E", "#AF963C", "#D1D182", "#FBF65D", "#C8E6F8")
# Now we ratify (RAT = "Raster Attribute Table") the ncld2011 (define RasterLayer as a categorical variable). This is helpful for plotting.
nlcd2011 <- nlcd[[2]]
nlcd2011 <- ratify(nlcd2011)
rat <- levels(nlcd2011)[[1]]
#
rat$landcover <- nlcdclass
levels(nlcd2011) <- rat
set.seed(99)
samp2011 <- sampleStratified(nlcd2011, size = 200, na.rm = TRUE, sp = TRUE)
samp2011
## class : SpatialPointsDataFrame
## features : 1600
## extent : -121.9257, -121.4207, 37.85415, 38.18536 (xmin, xmax, ymin, ymax)
## crs : +proj=longlat +datum=WGS84 +no_defs +ellps=WGS84 +towgs84=0,0,0
## variables : 2
## names : cell, nlcd2011
## min values : 829, 1
## max values : 2308569, 9
table(samp2011$nlcd2011)
##
## 1 2 3 4 5 7 8 9
## 200 200 200 200 200 200 200 200
library(rasterVis)
## Loading required package: lattice
## Loading required package: latticeExtra
## Loading required package: RColorBrewer
plt <- levelplot(nlcd2011, col.regions = classcolor, main = 'Distribution of Training Sites')
print(plt + layer(sp.points(samp2011, pch = 3, cex = 0.5, col = 1)))
landsat5 <- stack("c://Users//david//Desktop//rsdata//rs//centralvalley-2011LT5.tif")
names(landsat5) <- c('blue', 'green', 'red', 'NIR', 'SWIR1', 'SWIR2')
sampvals <- extract(landsat5, samp2011, df = TRUE)
sampvals <- sampvals[, -1]
sampdata <- data.frame(classvalue = samp2011@data$nlcd2011, sampvals)
library(rpart)
cart <- rpart(as.factor(classvalue)~., data=sampdata, method = 'class', minsplit = 5)
plot(cart, uniform=TRUE, main="Classification Tree")
text(cart, cex = 0.8)
pr2011 <- predict(landsat5, cart, type='class')
pr2011
## class : RasterLayer
## dimensions : 1230, 1877, 2308710 (nrow, ncol, ncell)
## resolution : 0.0002694946, 0.0002694946 (x, y)
## extent : -121.9258, -121.42, 37.85402, 38.1855 (xmin, xmax, ymin, ymax)
## crs : +proj=longlat +datum=WGS84 +no_defs +ellps=WGS84 +towgs84=0,0,0
## source : memory
## names : layer
## values : 1, 9 (min, max)
## attributes :
## ID value
## from: 1 1
## to : 8 9
pr2011 <- ratify(pr2011)
rat <- levels(pr2011)[[1]]
rat$legend <- classdf$classnames
levels(pr2011) <- rat
levelplot(pr2011, maxpixels = 1e6,
col.regions = classcolor,
scales=list(draw=FALSE),
main = "Decision Tree classification of Landsat 5")
library(dismo)
set.seed(99)
j <- kfold(sampdata, k = 5, by=sampdata$classvalue)
table(j)
## j
## 1 2 3 4 5
## 320 320 320 320 320
x <- list()
for (k in 1:5) {
train <- sampdata[j!= k, ]
test <- sampdata[j == k, ]
cart <- rpart(as.factor(classvalue)~., data=train, method = 'class', minsplit = 5)
pclass <- predict(cart, test, type='class')
# create a data.frame using the reference and prediction
x[[k]] <- cbind(test$classvalue, as.integer(pclass))
}
y <- do.call(rbind, x)
y <- data.frame(y)
colnames(y) <- c('observed', 'predicted')
conmat <- table(y)
# change the name of the classes
colnames(conmat) <- classdf$classnames
rownames(conmat) <- classdf$classnames
conmat
## predicted
## observed Water Developed Barren Forest Shrubland Herbaceous
## Water 172 6 1 3 3 1
## Developed 8 93 22 10 8 37
## Barren 6 54 52 6 8 62
## Forest 0 3 2 102 49 8
## Shrubland 0 8 1 52 105 12
## Herbaceous 1 16 24 4 16 121
## Planted/Cultivated 2 20 5 24 37 24
## Wetlands 14 12 1 32 25 16
## predicted
## observed Planted/Cultivated Wetlands
## Water 1 13
## Developed 20 2
## Barren 5 7
## Forest 13 23
## Shrubland 17 5
## Herbaceous 17 1
## Planted/Cultivated 81 7
## Wetlands 32 68
n <- sum(conmat)
diag <- diag(conmat)
OA <- sum(diag) / n
OA
## [1] 0.49625
# observed (true) cases per class
rowsums <- apply(conmat, 1, sum)
p <- rowsums / n
# predicted cases per class
colsums <- apply(conmat, 2, sum)
q <- colsums / n
expAccuracy <- sum(p*q)
kappa <- (OA - expAccuracy) / (1 - expAccuracy)
kappa
## [1] 0.4242857
## [1] 0.4257143
# Producer accuracy
PA <- diag / colsums
# User accuracy
UA <- diag / rowsums
outAcc <- data.frame(producerAccuracy = PA, userAccuracy = UA)
outAcc