Classification

This lab will look at land cover classification, using a pair of Landsat 5 images collected in July 2008 and February 2009.

The data are here: https://www.dropbox.com/sh/za8bxeor5fdruye/wzA88i84ey

There are three data files you will need:

  1. feb_small.tif OR feb.tif
  2. jul_small.tif OR jul.tif
  3. train_polys.Rdata

The difference between the tifs is that feb.tif and jul.tif are at the native resolution of 30 m and are about 27 MB each, while feb_small.tif and jul_small.tif have been resampled to 90 m and are only about 3 MB. The larger images are obviously a lot cooler (see if you can find Neyland Stadium!). But we'll use the smaller files.

library(raster)
## Loading required package: sp
## Warning: package 'sp' was built under R version 3.0.2
feb <- brick(x = "~/Dropbox/classes/Geog515_f13/Labs/feb_small.tif")  # or feb <- brick(x='feb.tif')
## rgdal: version: 0.8-8, (SVN revision 463)
## Geospatial Data Abstraction Library extensions to R successfully loaded
## Loaded GDAL runtime: GDAL 1.10.1, released 2013/08/26
## Path to GDAL shared files: /Library/Frameworks/GDAL.framework/Versions/1.10/Resources/gdal
## Loaded PROJ.4 runtime: Rel. 4.8.0, 6 March 2012, [PJ_VERSION: 480]
## Path to PROJ.4 shared files: (autodetected)
names(feb) <- paste("feb", 1:7, sep = ".")
jul <- brick(x = "~/Dropbox/classes/Geog515_f13/Labs/jul_small.tif")  # or jul <- brick(x='jul.tif')
names(jul) <- paste("jul", 1:7, sep = ".")
plotRGB(feb, 3, 2, 1, stretch = "lin")

[Figure: February image, RGB composite of bands 3, 2, 1, linear stretch]

plotRGB(jul, 3, 2, 1, stretch = "lin")

[Figure: July image, RGB composite of bands 3, 2, 1, linear stretch]


# Here's a hint for zooming in. It's interactive, so I commented it out.
# newextent <- zoom(feb[[3]])
# That will plot band 3. Click twice on the map to define the upper left and
# lower right corners you want to zoom into. It will automatically zoom, and
# the extent of the zoom is saved as newextent. If you want to see it in RGB,
# then follow up with this command:
# plotRGB(feb, r=3, g=2, b=1, ext=newextent, stretch='lin')

Create training regions

I already did this, but here is what I did. Note that I did some zooming in, which is not shown here.

# urb1 <- drawPoly()    # downtown Knoxville
# urb2 <- drawPoly()    # lower density urban
# urb3 <- drawPoly()    # lower density urban
# ag1 <- drawPoly()
# forest1 <- drawPoly() # evergreen
# forest2 <- drawPoly() # deciduous
# forest3 <- drawPoly() # deciduous
# cloud1 <- drawPoly()
# cloud2 <- drawPoly()
# cloud3 <- drawPoly()
# water1 <- drawPoly()  # A portion of the Tennessee River in Knoxville
# water2 <- drawPoly()  # Douglas Lake
# save(studyext, urb1, urb2, urb3, ag1, forest1, forest2, forest3,
#      cloud1, cloud2, cloud3, water1, water2, file='Labs/train_polys.Rdata')

load("~/Dropbox/classes/Geog515_f13/Labs/train_polys.Rdata")
plotRGB(feb, 3, 2, 1, stretch = "lin")
plot(urb1, add = TRUE, border = "red")
plot(ag1, add = TRUE, border = "red")

[Figure: training polygons urb1 and ag1 outlined in red on the February image]

Create the training dataset.

# Create the row IDs for the training dataset
lsat.labels <- rep(NA, ncell(feb))
lsat.labels[unlist(cellFromPolygon(feb, urb1))] <- "urban1"
lsat.labels[unlist(cellFromPolygon(feb, urb2))] <- "urban2"
lsat.labels[unlist(cellFromPolygon(feb, urb3))] <- "urban2"
lsat.labels[unlist(cellFromPolygon(feb, cloud1))] <- "cloud"
lsat.labels[unlist(cellFromPolygon(feb, cloud2))] <- "cloud"
lsat.labels[unlist(cellFromPolygon(feb, cloud3))] <- "cloud"
lsat.labels[unlist(cellFromPolygon(feb, forest1))] <- "forest1"
lsat.labels[unlist(cellFromPolygon(feb, forest2))] <- "forest2"
lsat.labels[unlist(cellFromPolygon(feb, forest3))] <- "forest2"
lsat.labels[unlist(cellFromPolygon(feb, water1))] <- "water"
lsat.labels[unlist(cellFromPolygon(feb, water2))] <- "water"
lsat.labels[unlist(cellFromPolygon(feb, ag1))] <- "ag"

train.ids <- (!is.na(lsat.labels))  # Get a list of which rows are training data
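
As a quick sanity check (not part of the original lab), you can tabulate how many training pixels ended up in each class, and how many training pixels there are in total:

table(lsat.labels, useNA = "ifany")  # pixel counts per class; NA = unlabeled pixels
sum(train.ids)  # total number of training pixels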

Exploratory visualization:

There are a lot of data here, and it's hard to visualize it all. But we want to identify which features we think will be good for classification. As said in class, the red and NIR bands are often useful for anything involving vegetation. So let's look at those combinations for February and July.

all.data <- data.frame(labels = lsat.labels, data.matrix(values(feb)), data.matrix(values(jul)))
# Plot red and NIR bands in February and July
library(ggplot2)
ggplot(data = subset(all.data, train.ids)) + geom_point(aes(x = feb.3, y = feb.4, 
    color = labels), size = 1, alpha = 0.2) + guides(color = guide_legend(override.aes = list(size = 3, 
    alpha = 1)))

[Figure: training pixels, February band 3 (red) vs band 4 (NIR), colored by class]



ggplot(data = subset(all.data, train.ids)) + geom_point(aes(x = jul.3, y = jul.4, 
    color = labels), size = 1, alpha = 0.2) + guides(color = guide_legend(override.aes = list(size = 3, 
    alpha = 1)))

[Figure: training pixels, July band 3 (red) vs band 4 (NIR), colored by class]

Regression Tree classification

# tree classification
library(rpart)
tree <- rpart(labels ~ ., data = all.data, subset = train.ids, method = "class")
plot(tree, margin = 0.05)
text(tree, use.n = TRUE, cex = 0.8)

[Figure: classification tree fit to the raw bands, labeled with class counts]



pred.tree <- predict(tree, all.data, type = "class")
tree.map <- raster(feb)
values(tree.map) <- pred.tree
plot(tree.map, col = c("yellow", "white", "darkgreen", "lightgreen", "darkred", 
    "red", "darkblue"))

[Figure: land cover map predicted by the tree classifier]

# Some Diagnostic Plotting
# There are too many points to plot: so let's just take a random sample of pixels
rand.pixels <- sample(1:ncell(feb), size=10000)
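# (Optional, not in the original lab: call set.seed() before the sample() line
# above if you want the same random pixels, and hence identical figures, each
# time you run this.)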
library(gridExtra)
## Loading required package: grid
grid.arrange(
  ggplot(data=all.data[rand.pixels,]) + 
    geom_point(aes(x=jul.3, y=jul.4, color=pred.tree[rand.pixels]), size=.8)+
    guides(color=guide_legend(override.aes=list(size=3, alpha=1))),
  ggplot(data=all.data[rand.pixels,]) + 
    geom_point(aes(x=feb.3, y=feb.4, color=pred.tree[rand.pixels]), size=.8)+
    guides(color=guide_legend(override.aes=list(size=3, alpha=1))),
  ggplot(data=all.data[rand.pixels,]) + 
    geom_point(aes(x=feb.4, y=jul.4, color=pred.tree[rand.pixels]), size=.8)+
    guides(color=guide_legend(override.aes=list(size=3, alpha=1))),

  nrow=2
  )
## Warning: Removed 24 rows containing missing values (geom_point).
## Warning: Removed 24 rows containing missing values (geom_point).
## Warning: Removed 24 rows containing missing values (geom_point).

[Figure: tree predictions for a random sample of pixels, shown in three feature-space views (jul.3 vs jul.4, feb.3 vs feb.4, feb.4 vs jul.4)]

Confusion Matrix

Calculating the confusion matrix is easy enough with the table function.

conf.mat <- table(pred = pred.tree[train.ids], train = all.data[train.ids, "labels"])
conf.mat
##          train
## pred        ag cloud forest1 forest2 urban1 urban2 water
##   ag      6533     1      50      47    198    335     2
##   cloud      0    91       0       0      0      0     0
##   forest1    8     0    2098     615      0      0     3
##   forest2    9    19     465    2959      0      0    12
##   urban1   189     0       0       0    921    131     0
##   urban2     0     0       0       0      0      0     0
##   water      2     0       0       0      0      0   475

# calculate producer's accuracy.  accuracy is the value on the diagonal.
sweep(conf.mat, 2, colSums(conf.mat), "/")
##          train
## pred             ag     cloud   forest1   forest2    urban1    urban2
##   ag      0.9691440 0.0090090 0.0191351 0.0129798 0.1769437 0.7188841
##   cloud   0.0000000 0.8198198 0.0000000 0.0000000 0.0000000 0.0000000
##   forest1 0.0011868 0.0000000 0.8029085 0.1698426 0.0000000 0.0000000
##   forest2 0.0013351 0.1711712 0.1779564 0.8171776 0.0000000 0.0000000
##   urban1  0.0280374 0.0000000 0.0000000 0.0000000 0.8230563 0.2811159
##   urban2  0.0000000 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000
##   water   0.0002967 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000
##          train
## pred          water
##   ag      0.0040650
##   cloud   0.0000000
##   forest1 0.0060976
##   forest2 0.0243902
##   urban1  0.0000000
##   urban2  0.0000000
##   water   0.9654472

The producer's accuracy is read down the columns: the producer sees the training data and wants to know how accurately it is reproduced. The user's accuracy is read along the rows: the user sees a predicted map value and wants to know how likely it is to be the truth.
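
For reference (the lab does this later for the logistic results), the user's accuracy comes from normalizing each row by its total, and the overall accuracy is the sum of the diagonal over the total number of training pixels:

# User's accuracy: normalize each row of the confusion matrix by its row total
sweep(conf.mat, 1, rowSums(conf.mat), "/")
# Overall accuracy: correctly classified training pixels / all training pixels
sum(diag(conf.mat)) / sum(conf.mat)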

Note that there is a pretty sizable confusion between the urban and ag classes, and between the two forest classes. In fact, the suburban land cover type is completely worthless! This is perhaps not surprising: suburbs in America are built to mimic the agrarian environment (lots of trees and green pasture). Good books about nature and urban environments are Dolores Hayden's “Building Suburbia” and Kenneth Jackson's “Crabgrass Frontier.”

We would do better if we included either textural data (line density, for example) or ancillary data, such as population density from census data.
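
As an illustration only (not part of the lab), one simple texture-like feature is the local standard deviation of a band in a moving window, which raster's focal() can compute; the window size and band choice here are arbitrary:

# Hypothetical texture feature: local standard deviation of July band 4 (NIR)
# in a 3x3 moving window; locally heterogeneous areas get high values.
jul.tex <- focal(jul[[4]], w = matrix(1, 3, 3), fun = sd, na.rm = TRUE)
plot(jul.tex)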

Regression tree with NDVI features

That regression tree wasn't super fantastic. The class boundaries in feature space are diagonal, but regression trees can only make horizontal and vertical breaks (they split on one variable at a time). Let's try it again, but with added features: NDVI in July and February, and NDWI in July.

all.data2 <- all.data
all.data2$jul.ndvi <- with(all.data2, (jul.4-jul.3)/(jul.4+jul.3))
all.data2$feb.ndvi <- with(all.data2, (feb.4-feb.3)/(feb.4+feb.3))
all.data2$jul.ndwi <- with(all.data2, (jul.2-jul.4)/(jul.2+jul.4))  # NDWI for July (note the + in the denominator)

tree2 <- rpart(labels~., data=all.data2, subset=train.ids, method='class')
plot(tree2)

[Figure: classification tree with NDVI/NDWI features (no margin, unlabeled)]

plot(tree2, margin=.05); text(tree2, use.n=TRUE, cex=.8)

[Figure: classification tree with NDVI/NDWI features, labeled with class counts]



pred.tree2 <- predict(tree2, all.data2, type='class')
tree2.map <- raster(feb)
values(tree2.map) <- pred.tree2
plot(tree2.map, col=c('yellow', 'white', 'darkgreen', 'lightgreen', 'darkred','red','darkblue'))

[Figure: land cover map predicted by the tree with NDVI/NDWI features]


grid.arrange(
  ggplot(data=all.data2[rand.pixels,]) + 
    geom_point(aes(x=jul.3, y=jul.4, color=pred.tree2[rand.pixels]), size=.8)+
    guides(color=guide_legend(override.aes=list(size=3, alpha=1))),
  ggplot(data=all.data2[rand.pixels,]) + 
    geom_point(aes(x=feb.3, y=feb.4, color=pred.tree2[rand.pixels]), size=.8)+
    guides(color=guide_legend(override.aes=list(size=3, alpha=1))),
  ggplot(data=all.data2[rand.pixels,]) + 
    geom_point(aes(x=feb.4, y=jul.4, color=pred.tree2[rand.pixels]), size=.8)+
    guides(color=guide_legend(override.aes=list(size=3, alpha=1))),

  nrow=2
  )
## Warning: Removed 24 rows containing missing values (geom_point).
## Warning: Removed 24 rows containing missing values (geom_point).
## Warning: Removed 24 rows containing missing values (geom_point).

[Figure: NDVI-tree predictions for a random sample of pixels, shown in three feature-space views (jul.3 vs jul.4, feb.3 vs feb.4, feb.4 vs jul.4)]


table(pred=pred.tree2[train.ids], train=all.data2[train.ids,'labels'])
##          train
## pred        ag cloud forest1 forest2 urban1 urban2 water
##   ag      6689     1      50      47    264    399     2
##   cloud      0     0       0       0      0      0     0
##   forest1    4     1    2171     562      0      0     3
##   forest2   13   109     392    3012      0      0    12
##   urban1    33     0       0       0    855     67     0
##   urban2     0     0       0       0      0      0     0
##   water      2     0       0       0      0      0   475

The classifier used the February NDVI in a few of its splits.
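
If you want to confirm which features tree2 actually split on, the variables used at each node are stored in the rpart object's frame (terminal nodes show up as "<leaf>"):

# Split variables used by tree2, with the leaf rows excluded
setdiff(unique(as.character(tree2$frame$var)), "<leaf>")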

Logistic regression

We'll repeat the exercise with logistic regression. Here we have to build our own multiclass classifier: logistic regression, like many classifiers, can only compare two classes at a time. So if we want to classify three classes, say 1, 2, and 3, there are usually two options:

  1. Fit 1 vs (2 and 3), 2 vs (1 and 3), and 3 vs (1 and 2), and pick the class with the highest predicted probability ("one vs. rest"). This is what I do here.
  2. Fit 1 vs 2, 1 vs 3, and 2 vs 3, and pick the class with the most “votes” ("one vs. one").

    The reason I didn't pick option 2 is simply that with the 7 classes we have here, it would require a lot more models to be fit (21 pairwise models instead of 7), as the quick check below shows.
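
A quick check of the model-count arithmetic (not in the original lab), using base R's choose():

# Option 1 (one vs. rest) needs one model per class; option 2 (one vs. one)
# needs one model per pair of classes.
choose(7, 2)  # 21 pairwise models, versus 7 one-vs-rest models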

# Do a logistic regression for each class
classes <- levels(all.data$labels)
C <- length(classes)
logit.class <- as.data.frame(matrix(NA, nrow(all.data), C))
names(logit.class) <- classes
# We will fit a model for each class (vs. all the other classes) and
# save the predicted probability of being that class.
for (c in classes) {
    model.fit <- glm(I(labels == c) ~ ., data = all.data, subset = train.ids, 
        family = "binomial")
    logit.class[[c]] <- as.vector(predict(model.fit, newdata = all.data[, -1], 
        type = "response"))
}
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

# For each pixel (row), find which column (class) has the highest probability.
# which.max() breaks ties by taking the first class; pixels whose band values
# are all missing get NA.
pred.logit <- apply(logit.class, 1, function(x) if (all(is.na(x))) NA else which.max(x))
logit.map <- raster(jul)  # Initialize a map.  We will overwrite the values
pred.logit <- factor(pred.logit, levels = 1:7, labels = classes)
values(logit.map) <- pred.logit
plot(logit.map, col = c("yellow", "white", "darkgreen", "lightgreen", "darkred", 
    "red", "darkblue"))

[Figure: land cover map predicted by logistic regression]


table(pred = pred.logit[train.ids], train = all.data[train.ids, "labels"])
##          train
## pred        ag cloud forest1 forest2 urban1 urban2 water
##   ag      6529     0      22      59    149    329     0
##   cloud      0   111       0       0      0      0     0
##   forest1  141     0    1894     472      3      2     0
##   forest2   24     0     695    3090      0      0     0
##   urban1    39     0       0       0    957    102     0
##   urban2     8     0       2       0     10     33     0
##   water      0     0       0       0      0      0   492

Some diagnostic plotting of the results

grid.arrange(
  ggplot(data=all.data[rand.pixels,]) + 
    geom_point(aes(x=jul.3, y=jul.4, color=pred.logit[rand.pixels]), size=.8)+
    guides(color=guide_legend(override.aes=list(size=3, alpha=1))),
  ggplot(data=all.data[rand.pixels,]) + 
    geom_point(aes(x=feb.3, y=feb.4, color=pred.logit[rand.pixels]), size=.8)+
    guides(color=guide_legend(override.aes=list(size=3, alpha=1))),
  ggplot(data=all.data[rand.pixels,]) + 
    geom_point(aes(x=feb.4, y=jul.4, color=pred.logit[rand.pixels]), size=.8)+
    guides(color=guide_legend(override.aes=list(size=3, alpha=1))),

  nrow=2
  )
## Warning: Removed 24 rows containing missing values (geom_point).
## Warning: Removed 24 rows containing missing values (geom_point).
## Warning: Removed 24 rows containing missing values (geom_point).

[Figure: logistic-regression predictions for a random sample of pixels, shown in three feature-space views (jul.3 vs jul.4, feb.3 vs feb.4, feb.4 vs jul.4)]

Support Vector Machine Classification

library(e1071)
## Loading required package: class
## 
## Attaching package: 'e1071'
## 
## The following object is masked from 'package:raster':
## 
##     interpolate
# This takes a while: subset the training ids by only keeping 1/10 of them.
train2.ids <- rep(FALSE, ncell(jul))
train2.ids[seq(from = 1, to = ncell(jul), by = 10)] <- train.ids[seq(from = 1, 
    to = ncell(jul), by = 10)]

# Train and classify
fit.svm <- svm(labels ~ ., data = all.data, subset = train2.ids, kernel = "linear")
pred.svm <- predict(fit.svm, newdata = all.data[, -1])


svm.map <- raster(jul)  # Initialize a map.  We will overwrite the values
# There are some incomplete cases in all.data, so pred.svm is shorter than
# all.data.
values(svm.map)[complete.cases(all.data[, -1])] <- pred.svm
plot(svm.map, col = c("yellow", "white", "darkgreen", "lightgreen", "darkred", 
    "red", "darkblue"))

[Figure: land cover map predicted by the SVM]

# That's a pretty darn good result, apart from the continuing problems with
# suburban spaces.

conf.mat <- table(pred = pred.logit[train.ids], train = all.data[train.ids, 
    "labels"])

# producer's accuracy
diag(sweep(conf.mat, 2, colSums(conf.mat), "/"))
##      ag   cloud forest1 forest2  urban1  urban2   water 
## 0.96855 1.00000 0.72484 0.85336 0.85523 0.07082 1.00000
# User's accuracy
diag(sweep(conf.mat, 1, rowSums(conf.mat), "/"))
##      ag   cloud forest1 forest2  urban1  urban2   water 
##  0.9211  1.0000  0.7540  0.8112  0.8716  0.6226  1.0000
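
Note that the confusion matrix and accuracies just above are computed from the logistic predictions (pred.logit), not from the SVM. If you want the same diagnostics for the SVM fit, one approach (a sketch, restricted to complete cases since predict drops rows with missing predictors) is:

# Confusion matrix for the SVM on the training pixels with complete band values
ok <- train.ids & complete.cases(all.data[, -1])
pred.svm.train <- predict(fit.svm, newdata = all.data[ok, -1])
table(pred = pred.svm.train, train = all.data[ok, "labels"])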

How to improve it:

At this point, that's about as good as it's going to get without more and better data. For example, the forested landscape east of Seymour is completely misclassified, as is the terrain just south of Knoxville. If I did a better job of training the agriculture and forest classes, this would improve. Population data would probably improve the identification of suburbs quite a bit, and features that measure texture might help, too.
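
For example, here is a hedged sketch of how an ancillary layer could be brought in, assuming you had a population-density raster; popdens.tif is hypothetical and is not part of the lab data:

# Hypothetical ancillary layer: population density aligned to the Landsat grid
popdens <- raster("popdens.tif")  # not provided with the lab
popdens <- resample(popdens, feb, method = "bilinear")  # match extent and resolution
all.data3 <- data.frame(all.data, popdens = values(popdens))
tree3 <- rpart(labels ~ ., data = all.data3, subset = train.ids, method = "class")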