Script for R to analyse NDVI values in GTA

This script uses time-series maximum NDVI which generated from Google Earth Engine to determine the landuse change time of each pixel in the GTA. The maximum NDVI was generated from image composites over the growing seasons (June 1st - September 15th) from 1984 to 2016. I used Landsat 5 images for 1984 to 2011 and Landsat 8 for 2013 to 2016.

rm(list=ls())

library(rgdal)

## Loading required package: sp

## rgdal: version: 1.2-7, (SVN revision 660)
##  Geospatial Data Abstraction Library extensions to R successfully loaded
##  Loaded GDAL runtime: GDAL 2.0.1, released 2015/09/15
##  Path to GDAL shared files: C:/Users/Apple/Documents/R/win-library/3.4/rgdal/gdal
##  Loaded PROJ.4 runtime: Rel. 4.9.2, 08 September 2015, [PJ_VERSION: 492]
##  Path to PROJ.4 shared files: C:/Users/Apple/Documents/R/win-library/3.4/rgdal/proj
##  Linking to sp version: 1.2-4

library(raster)
library(rasterVis)

## Loading required package: lattice

## Loading required package: latticeExtra

## Loading required package: RColorBrewer

library(ggplot2)

## 
## Attaching package: 'ggplot2'

## The following object is masked from 'package:latticeExtra':
## 
##     layer

library(partykit)

## Warning: package 'partykit' was built under R version 3.4.4

## Loading required package: grid

## Loading required package: libcoin

## Warning: package 'libcoin' was built under R version 3.4.4

## Loading required package: mvtnorm

## Loading required package: rpart

#set working directory
setwd("D:/Graduate/GEE/Gracepolygon/result/MaxNDVI")

#import all NDVI files 
Maxndvi.list = list.files("D:/Graduate/GEE/Gracepolygon/result/MaxNDVI",
                          full.names = TRUE,
                          pattern = ".tif$")

#due to memory issue on allocating large size data, I split the matrix into two (a & b); matrix 'a' contains pixels number from 1 to 17132982, matrix 'b' contains the rest pixels.

aa = raster(Maxndvi.list[1])[,]
for (i in 2:16){
  aa = cbind(aa,raster(Maxndvi.list[i])[,])
}

bb = raster(Maxndvi.list[17])[,]
for (i in 18:32){
  bb = cbind(bb,raster(Maxndvi.list[i])[,])
}

a1 = aa[1:17132982,]
b1 = bb[1:17132982,]
a2 = aa[17132983:34265963,]
rm(aa)
b2 = bb[17132983:34265963,]
rm(bb)

a = cbind(a1,b1)
rm(a1,b1)
b = cbind(a2,b2)
rm(a2,b2)

#there are a few pixels' NDVI value larger than 1, I convert them into NA
a[a > 1] = NA #cell number 1:17132982
b[b > 1] = NA #cell number > 17132982

#add a NA column that represents the NDVI value for 2012 to matrix a and b
x = rep(NA, nrow(a))
a = cbind(a[, 1:28], x, a[, 29:32])

x = rep(NA, nrow(b))
b = cbind(b[, 1:28], x, b[, 29:32])

To smooth the noisy time series MaxNDVI, I applied a 5-point window median filter to the time series data.

#apply median filter to matrix a & b
a.med = matrix(rep(1,17132982*33),nrow = 17132982)
for (i in 1:17132982) {
  z = a[i,]
  if (sum(is.na(z)) == 33) {
    a.med[i,] = rep(NA, 33)
    } else {
       z [is.na(z)] = 0
       a.med[i,] = runmed(z, 5)}
}

b.med = matrix(rep(1,17132981*33),nrow = 17132981)
for (i in 1:17132981) {
  z = b[i,]
  if (sum(is.na(z)) == 33) {
    b.med[i,] = rep(NA, 33)
    } else {
       z [is.na(z)] = 0
       b.med[i,] = runmed(z, 5)}
}

#save the result to hard drive
save(a.med, b.med, file = "ab_med.RData")

I set five categories for land cover: 1) vegetated the entire time 2) initially vegetated then converted into residential 3) initially vegetated then converted into urban 4) residential the entire time 5) urban the entire time. For each category, I used Google Earth pro to find 40 random points that represent the land use type. Then, I extracted MaxNDVI time-series data from 1984 to 2016 for all the 200 points and developed a decision tree to identify the landcover change time.

load("ab_med.RData")

#read in x(longitude) and y(latitude) of the 200 points; 
dt = read.csv("D:/Graduate/GEE/Gracepolygon/result/MaxNDVI/ts picture points/Decision Tree2.csv")
 x = c(dt[,1], dt[,3], dt[,5], dt[,7], dt[,9])
 y = c(dt[,2], dt[,4], dt[,6], dt[,8], dt[,10])
 


#define dimension for matrix ts to store time series MaxNDVI, each row stores the NDVI value for one pixel from 1984 to 2016
 ts = matrix(1, nrow = length(x), ncol = 33)
 
 exa = raster(Maxndvi.list[1])
 
 for (i in 1:length(x)) {
   num = cellFromXY(exa,cbind(c(x[i],x[i]),c(y[i],y[i])))[1]
   num2 = num - 17132982
   if(num > 17132982) ts[i,] = b.med[num2,] else ts[i,] = a.med[num,]
 }
 
#because in most part of the area, Google Earth only have avaliable images from 2004 to 2016, so I used time series data started from 2004 when making decision tree
 ts_time = ts[, 21:33]

#example of time-series NDVI plot
par(mfrow = c(20, 1), oma = c(4, 5.5, 3, 3), mar = c(0, 0, 0, 0)) 
for(i in sample(seq(1,200), 20, replace = F)) {
  plot(ts_time[i, ], type = "o", pch = 16,
       xlim = c(1, 14) ,ylim = c(0,1), xlab = "",
       xaxt = "n", yaxt = "n")
  axis(2, at = 0.5, labels = 0.5, cex.axis = 2)
  abline(h = 0.5, lty = 2, col = "grey")
  
  num = cellFromXY(exa,cbind(c(x[i],x[i]),c(y[i],y[i])))[1]
  legend(x = 12.5, y = 0.8, bty = "n", legend = paste("pixel number", num), cex = 2)
}
axis(1, at = seq(1, 13, 2), labels = seq(2004, 2016, 2), cex.axis = 2)
mtext("Year", side = 1, line = 2.5, adj = 0.5, outer = T, cex = 2.5)
mtext("NDVI", side = 2, line = 2.5, adj = 0.5, outer = T, cex = 2.5)

#parameter for decision tree making 
#1)MD:find max difference in 'ts_time'. 
#2)FL:calculate the difference between the first three and the last three points.
#3)FLMean:calculate the mean of the first three and the last three points.

 MD = function(x) {
   z = max(abs(diff(x)))
 }
 
 FL = function(x, j = 13) {
   y1 = c(x[1], x[2], x[3])
   y2 = c(x[j - 2], x[j - 1], x[j])
   z = median(y1, na.rm = T) - median(y2, na.rm = T)
 }
 
 FLMean = function(x, j = 13) {
   y = c(x[1], x[2], x[3], x[j - 2], x[j - 1], x[j])
   z = mean(y, na.rm = T)
 }

 maxdiff = apply(ts_time, 1, MD)
 stanD = apply(ts_time, 1, sd)
 FLdiff = apply(ts_time, 1, FL)
 FLmean = apply(ts_time, 1, FLMean)

#use function 'ctree' in package 'partykit' to make a decision tree
#A:vegetated B:veg-resid C:veg-urban D:resid E:urban

n = 40
t1 = rep("A", n)
t2 = rep("B", n)
t3 = rep("C", n)
t4 = rep("D", n)
t5 = rep("E", n)
type = c(t1, t2, t3, t4, t5)
df = data.frame(FLdiff, maxdiff, stanD, FLmean, type)

myformula = type ~ FLdiff + maxdiff + stanD + FLmean 
tree = ctree(myformula, data = df)
plot(tree)

tree.simple = as.simpleparty(tree)
myfun <- function(i) c(
  as.character(i$prediction),
  paste("n =", i$n),
  format(round(i$distribution/i$n, digits = 3), nsmall = 3)
)
plot(tree.simple, tp_args = list(FUN = myfun))

#cross validation; each category's points were partitioned into four equal sized subsamples, of the four subsamples, one subsample is retained as the validation data for testing the model, and the remaining three subsamples are used as training data. The cross-validation process is then repeated 4 times
traindata = data.frame()
testdata = data.frame()
test.pre = numeric()
testd = numeric()

df = cbind(df, class = rep(c(rep(1, 10), rep(2, 10), rep(3, 10), rep(4, 10)), 5))

for (i in 1:4) {
  traindata = subset(df, class != i)
  testdata = subset(df, class == i)
  testd = c(testd, testdata$type)
  tree = ctree(myformula, data = traindata)
  test.pre = c(test.pre, predict(tree, newdata = testdata))
}

val.table = table(predicted = test.pre, observed = testd)
rownames(val.table) = c("A", "B", "C", "D", "E")
colnames(val.table) = c("A", "B", "C", "D", "E")
val.table

##          observed
## predicted  A  B  C  D  E
##         A 33  4  0  7  0
##         B  4 31  8  0  1
##         C  0  2 28  0  0
##         D  3  3  2 33  4
##         E  0  0  2  0 35

test.result = sum(diag(val.table))/200
test.result

## [1] 0.8

#apply decision tree to the whole area. 
#calculate max difference, standard deviation, mean of first three and last three points and difference between first three and last three points for the entire area. 
a.maxdiff = numeric(nrow(a.med))
for (i in 1:nrow(a.med)) {
  z = a.med[i,]
  if (sum(is.na(z)) == 33) {
    a.maxdiff[i] = NA
    } else {
       a.maxdiff[i] = max(abs(diff(z)))}
}

a.fldiff = numeric(nrow(a.med))
for (i in 1:nrow(a.med)) {
  z = a.med[i,]
  if (sum(is.na(z)) == 33) {
    a.fldiff[i] = NA
    } else {
       y1 = median(z[1:3], na.rm = T)
       y2 = median(z[31:33], na.rm = T)
       a.fldiff[i] = y1 - y2}
}

a.flmean = numeric(nrow(a.med))
for (i in 1:nrow(a.med)) {
  z = a.med[i,]
  if (sum(is.na(z)) == 33) {
    a.flmean[i] = NA
    } else {
       a.flmean[i] = mean(c(z[1:3],z[31:33]), na.rm = T)}
}

a.sd = numeric(nrow(a.med))
for (i in 1:nrow(a.med)) {
  z = a.med[i,]
  if (sum(is.na(z)) == 33) {
    a.sd[i] = NA
    } else {
       a.sd[i] = sd(z, na.rm = T)}
}

b.maxdiff = numeric(nrow(b.med))
for (i in 1:nrow(b.med)) {
  z = b.med[i,]
  if (sum(is.na(z)) == 33) {
    b.maxdiff[i] = NA
    } else {
       b.maxdiff[i] = max(abs(diff(z)))}
}

b.fldiff = numeric(nrow(b.med))
for (i in 1:nrow(b.med)) {
  z = b.med[i,]
  if (sum(is.na(z)) == 33) {
    b.fldiff[i] = NA
    } else {
       y1 = median(z[1:3], na.rm = T)
       y2 = median(z[31:33], na.rm = T)
       b.fldiff[i] = y1 - y2}
}

b.flmean = numeric(nrow(b.med))
for (i in 1:nrow(b.med)) {
  z = b.med[i,]
  if (sum(is.na(z)) == 33) {
    b.flmean[i] = NA
    } else {
       b.flmean[i] = mean(c(z[1:3],z[31:33]), na.rm = T)}
}

b.sd = numeric(nrow(b.med))
for (i in 1:nrow(b.med)) {
  z = b.med[i,]
  if (sum(is.na(z)) == 33) {
    b.sd[i] = NA
    } else {
       b.sd[i] = sd(z, na.rm = T)}
}

#the year of max difference
a.cyear = numeric(nrow(a.med))
for (i in 1:nrow(a.med)) {
  z = a.med[i,]
  if (sum(is.na(z)) == 33) {
    a.cyear[i] = NA
    } else {
       a.cyear[i] = which.max(abs(diff(z)))+1984}
}

b.cyear = numeric(nrow(b.med))
for (i in 1:nrow(b.med)) {
  z = b.med[i,]
  if (sum(is.na(z)) == 33) {
    b.cyear[i] = NA
    } else {
       b.cyear[i] = which.max(abs(diff(z)))+1984}
}

save(a.maxdiff, a.fldiff, a.flmean, a.sd, a.cyear, b.maxdiff, b.fldiff, b.flmean, b.sd, b.cyear, file = "10.RData")

#calculate the year when vegetated area converted into urban
#for pixels remain vegetated until 2016, I set the shift year to 2020; for pixels that were urban through the entire time, 1980 was set to be the shift year
load("10.RData")

a.cpoint = numeric(length(a.sd))

for (i in 1:length(a.sd)) {
 if (is.na(a.sd[i]) == T) {
   a.cpoint[i] = NA
  } else if (a.flmean[i] > 0.626 && a.sd[i] <= 0.073) {
      a.cpoint[i] = 2020
  } else if (a.flmean[i] <= 0.295 && a.maxdiff[i] <= 0.061) {
      a.cpoint[i] = 1980
  } else if (a.flmean[i] <= 0.626 && a.sd[i] <= 0.073) {
      a.cpoint[i] = 1980
  } else {
      a.cpoint[i] = a.cyear[i]
  }
}

b.cpoint = numeric(length(b.sd))

for (i in 1:length(b.sd)) {
 if (is.na(b.sd[i]) == T) {
   b.cpoint[i] = NA
  } else if (b.flmean[i] > 0.626 && b.sd[i] <= 0.073) {
      b.cpoint[i] = 2020
  } else if (b.flmean[i] <= 0.295 && b.maxdiff[i] <= 0.061) {
      b.cpoint[i] = 1980
  } else if (b.flmean[i] <= 0.626 && b.sd[i] <= 0.073) {
      b.cpoint[i] = 1980
  } else {
      b.cpoint[i] = b.cyear[i]
  }
}

save(a.cpoint, b.cpoint, file = "ChangePoint.RData")

#I reclassified NDVI value according to the shift year. I set 0 for vegetated pixel and 1 for urban pixel
load("ChangePoint.RData")

timerange = 2016 - 1984 + 1
T.a = rep(1, length(a.cpoint)*33)

for (i in 1:timerange) {
  a = i + 1983
  for (j in 1:length(a.cpoint)) {
   if (is.na(a.cpoint[j] == T)) {
    T.a[j + 17132982*(i-1)] = NA
   } else if (a.cpoint[j] > a) {
    T.a[j + 17132982*(i-1)] = 0
   } else {
    T.a[j + 17132982*(i-1)] = 1
   }
 }
}

T.b = rep(1, length(b.cpoint)*33)

for (i in 1:timerange) {
  a = i + 1983
  for (j in 1:length(b.cpoint)) {
   if (is.na(b.cpoint[j] == T)) {
    T.b[j + 17132981*(i-1)] = NA
   } else if (b.cpoint[j] > a) {
    T.b[j + 17132981*(i-1)] = 0
   } else {
    T.b[j + 17132981*(i-1)] = 1
   }
 }
}

save(T.a, T.b, file = "tatb_p40All.RData")

#export the final image
load("tatb_p40All.RData")

for(i in 1:timerange) {
  a = T.a[(1+(i-1)*17132982) : (17132982+(i-1)*17132982)]
  b = T.b[(1+(i-1)*17132981) : (17132981+(i-1)*17132981)]
  c = as.numeric(c(a, b))
  a.mat = t(matrix(c, nrow = 9319))
  a.ras = as.raster(a.mat)
  
  mypath <- file.path("D:", "Graduate", "GEE", "Gracepolygon", "result", "MaxNDVI", "pic3", paste("myplot_", i + 1983, ".jpg", sep = ""))

  jpeg(file=mypath, width = 1100, height = 600)
  plot(a.ras)
  dev.off()
}

#example of the reclassified image (with coordinate)
#year 2000
rm(a.med, b.med)
load("coordinate.RData")
load("tatb_p40All.RData")

i = 2000 - 1984 + 1
a = T.a[(1+(i-1)*17132982) : (17132982+(i-1)*17132982)]
b = T.b[(1+(i-1)*17132981) : (17132981+(i-1)*17132981)]
c = as.numeric(c(a, b))

df = data.frame(x, y, c)
coordinates(df) = ~x+y
proj4string(df) = proj

df.ras = raster(df, ncols=9319, nrows=3677)
df.ras[,] = c

pal = colorRampPalette(c("grey0", "grey100"))
plot(df.ras, col = pal(2))

#calculate the fraction of urbanized area from 1984 to 2016
timerange = 2016 - 1984 + 1
urb = rep(1, timerange)
year = seq(1984, 2016)
for(i in 1:timerange) {
   a = T.a[(1+(i-1)*17132982) : (17132982+(i-1)*17132982)]
   b = T.b[(1+(i-1)*17132981) : (17132981+(i-1)*17132981)]
   c = as.numeric(c(a, b))
   urb[i] = sum(c==1, na.rm = T)/9660913 #the pixel number of total area is 9660913
}

plot(year, urb, col = "red", pch = 16, type = "o",
     xlim = c(1983, 2017),
     ylim = c(0.2, 0.4),
     ylab = "",
     cex.axis = 2, cex.lab = 3, cex.main = 3,
     main = "Fraction of urbanized area")
mtext("urban area", side = 2, cex = 3, line = 2.5)

Script for R to analyse NDVI values in GTA

Grace (Qiuping) Li

2018-May-2