require(rgdal)
require(raster)
require(RStoolbox)
require(ggplot2)
require(reshape2)
require(knitr)
require(kableExtra)
imadir2 <- "./pcatest2"

1. PCA with R

Input image

b <- brick(file.path(imadir2,"mini1.tif"))
ggRGB(b,5,3,1,stretch="lin")

Run PCA with RStoolbox and extract eigenmatrix.
(read again for time benchmarking)

t0 <- Sys.time()
b <- brick(file.path(imadir2,"mini1.tif"))
bpca <- rasterPCA(b,spca=FALSE)
trpca <- Sys.time() -t0
trpca
Time difference of 5.875336 secs
eigenvR <- loadings(bpca$model)[]

PCA image (composite of PC1 to PC3)

ggRGB(bpca$map,1,2,3,stretch="lin")

2. PCA with OTB

For OTB the current (v7.1) implementation of otbcli_DimensionalityReduction implies an awkard sequence to cope with nodata: (otbPCAtest_mini1.scr)

C:\ALOBO\OTB-7.1.0-Win64\otbenv.bat
cd D:\OTBtests\OTBpca\pcatest2
otbcli_ManageNoData.bat -in ..\mini1.tif -out mini1MASK.tif uint8 -mode.buildmask.inv 1 -mode.buildmask.outv 0 
otbcli_ManageNoData.bat -in mini1.tif -out mini1Mask.tif uint8 -mode.buildmask.inv 1 -mode.buildmask.outv 0 
#without whitening
otbcli_DimensionalityReduction.bat -in mini1.tif -bv 0 -out mini1PCAotb.tif -method pca -method.pca.whiten false -outmatrix mini1PCAotbnowhit.eigmat.csv
otbcli_ManageNoData.bat -in mini1PCAotb.tif -out mini1PCAotbM.tif -mode apply -mode.apply.mask mini1Mask.tif -mode.apply.ndval 0

3. Compare eigenvectors

eigenvOTB <- t(read.csv(file.path(imadir2,"mini1PCAotbnowhit.eigmat.csv"),header=FALSE,sep=""))
eigenvOTB <- data.frame(eigenvOTB)
eigenvR <- read.csv(file.path(imadir2,"Rmini1eigenmat.csv"),stringsAsFactors = FALSE)[,-1]
colnames(eigenvR) <- colnames(eigenvOTB) <- paste0("PC",1:5)
rownames(eigenvR) <- rownames(eigenvOTB) <- paste0("Band_",1:5)

Eigenvectors from R:

kable(eigenvR)  %>%
  kable_styling(bootstrap_options = "striped", full_width = F, position = "left")
PC1 PC2 PC3 PC4 PC5
Band_1 0.0217211 0.2969163 0.2623071 0.2694614 0.8774705
Band_2 0.1326099 0.4015052 0.0335164 -0.8967462 0.1262185
Band_3 0.0331460 0.6279545 0.5775238 0.2428258 -0.4605176
Band_4 0.3865421 0.5127257 -0.7221213 0.2534016 -0.0450125
Band_5 0.9118275 -0.3056467 0.2740052 0.0077485 -0.0034370

Eigenvectors from OTB:

kable(eigenvOTB)  %>%
  kable_styling(bootstrap_options = "striped", full_width = F, position = "left")
PC1 PC2 PC3 PC4 PC5
Band_1 0.0225902 -0.2931714 -0.2664791 0.1835331 0.8993579
Band_2 0.1346418 -0.3977168 -0.0786326 -0.9037259 0.0280966
Band_3 0.0361656 -0.6199247 -0.5704303 0.3142812 -0.4361444
Band_4 0.3874017 -0.5230245 0.7251642 0.2244495 -0.0111637
Band_5 0.9110287 0.3130665 -0.2674920 0.0210914 -0.0043922

Difference: most values < 0.01, but note larger difference values in some elements of the eigenvectors

kable(abs(eigenvR) - abs(eigenvOTB)) %>%
  kable_styling(bootstrap_options = "striped", full_width = F, position = "left")
PC1 PC2 PC3 PC4 PC5
Band_1 -0.0008691 0.0037449 -0.0041720 0.0859282 -0.0218874
Band_2 -0.0020319 0.0037884 -0.0451162 -0.0069797 0.0981219
Band_3 -0.0030196 0.0080297 0.0070935 -0.0714554 0.0243732
Band_4 -0.0008596 -0.0102988 -0.0030430 0.0289522 0.0338488
Band_5 0.0007988 -0.0074198 0.0065132 -0.0133429 -0.0009552

4. Plot PC values

rversion <- bpca$map
otbversion <- brick(file.path(imadir2,"mini1PCAotbM.tif")) #from otbPCAtest_mini1.scr
names(rversion) <- names(otbversion) <- paste0("PC",1:5)

Extract random values and tidy up for ggplot in long format

set.seed(121)
pdata <- sampleRandom(stack(rversion,otbversion), size=1000, na.rm=TRUE,
                      rowcol=TRUE, xy=TRUE, sp=FALSE)
pdata1 <- pdata[,1:9]
pdata2 <- pdata[,c(1:4,10:14)]
colnames(pdata1)[5:9] <- colnames(pdata2)[5:9] <- paste0("PC",1:5)
pdata1lf <- melt(data.frame(pdata1),id.vars=1:4)
names(pdata1lf)[6] <- "Rversion"
pdata2lf <- melt(data.frame(pdata2),id.vars=1:4)
names(pdata2lf)[6] <- "OTBversion"
pdatalf <- data.frame(cbind(pdata1lf,OTBversion=pdata2lf$OTBversion))
#kable(head(pdatalf))  %>%
#  kable_styling(bootstrap_options = "striped", full_width = F, position = "left")

Note dispersion increases for higher PCs. I’ve found this is consistent across images

ggplot(data=pdatalf) +
    geom_point(aes(x=Rversion,y=OTBversion)) +
    theme(aspect.ratio = 1) +
    facet_wrap(~variable,scale="free",ncol=3)

5. Execution times

I often need a pca output in the form of a linearly stretched PCs image in uint8 and leaving 0 for nodata, thus I include this step in the benchmarking too. The stretching is done from the 2% and 98% quantiles to the 1:255 range. For R, this linear stretching can done by combining raster::clamp() and RStoolbox::RescalImage

writeRaster(rescaleImage(clamp(bpca$map,lower=q[,1], upper=q[,2]), 
            xmin=q[,1], xmax=q[,2],ymin=1, ymax=255),
            file.path(imadir2,"mini1PCARresc"),
            datatype="INT1U", NAflag=0,
            format="GTiff",overwrite=TRUE)

Stretching with the current OTB v7.1 implies fiddling again to cope with nodata:

otbcli_DynamicConvert -in mini1PCAotbM.tif type linear -mask mini1Mask.tif -out mini1PCAotbMresc.tif uint8 -outmin 1 -outmax 255
otbcli_ManageNoData.bat -in mini1PCAotbMresc.tif -out mini1PCAotbMrescM.tif -mode apply -mode.apply.mask mini1Mask.tif -mode.apply.ndval 0

5.1 Small test image

Test with mini1.tif (small test image 85 rows x 116 cols x 5 bands x 32 bit).
Runing on a very low-profile, consumer grade old laptop:

  • Lenovo X220 with Intel Core i5-2520M CPU @2.50 GHz
  • SSD hard-drive
  • 8 Gb of RAM

Values in the table are in seconds

extimes <- read.csv("TimeBenchmark_PCA.csv",header=FALSE,stringsAsFactors = FALSE)
extimes1 <- extimes[2:4,1:3]
names(extimes1) <- extimes1[1,]
rownames(extimes1) <- extimes1[,1]
kable(extimes1[-1,-1])  %>%
  kable_styling(bootstrap_options = "striped", full_width = F, position = "left")
R OTB
PCA 2.05 1.7
stretching 0.15 1.04

5.2 Large-size image

Execution times with a real image (BertMICA20190531v2.tif) (8282 rows x 11329 cols x 5 bands x 32 bit).
For images with a real-life size, an R alternative is using gdalUtilities::gdal_transform(), but the output cannot be clamped to the 1:255 range using gdalUtilities, thus the image has to read back into R for clamp(), which significantly increases the execution time:

gdal_translate(src_dataset=file.path(rasdirpca,"BertMICA20190531v2PCARfloat.tif"),
               dst_dataset=file.path(rasdirpca,"BertMICA20190531v2PCARresc.tif"),
               scale=mscal,dryrun=FALSE)
BertMICA20190531v2PCARresc <- brick(file.path(rasdirpca,"BertMICA20190531v2PCARresc.tif"))
a1 <- clamp(BertMICA20190531v2PCARresc,lower=1,upper=255, useValues=TRUE,
            format="GTiff",datatype="INT1U",NAflag=0,
            file=file.path(rasdirpca,"BertMICA20190531v2PCARrescG.tif"),
            overwrite=TRUE)

Note the advantage of OTB: despite the fact so much reading and writing in OTB severely increases execution time, the OTB run is performed in a reasonable time while waiting for R is unpractical. Values in the table are in minutes.

extimes2 <- extimes[7:10,1:4]
names(extimes2) <- extimes2[1,]
rownames(extimes2) <- extimes2[,1]
kable(extimes2[-1,-1]) %>%
  kable_styling(bootstrap_options = "striped", full_width = F, position = "left")
R gdal+R OTB
PCA 33.73 3.36
PCA with resampling 6.9
stretching 6.38 4.43 1.66
---
title: " Comparing PCA results from R and from OTB"
output:
  html_notebook:
   code_folding: hide
editor_options: 
  chunk_output_type: inline
---

* Agustin.Lobo@ictja.csic.es
* 20200508

```{r}
require(rgdal)
require(raster)
require(RStoolbox)
require(ggplot2)
require(reshape2)
require(knitr)
require(kableExtra)
imadir2 <- "./pcatest2"
```

## 1. PCA with R
Input image
```{r fig.height=4, fig.width=6}
b <- brick(file.path(imadir2,"mini1.tif"))
ggRGB(b,5,3,1,stretch="lin")
```
Run PCA with RStoolbox and extract eigenmatrix.  
(read again for time benchmarking)
```{r}
t0 <- Sys.time()
b <- brick(file.path(imadir2,"mini1.tif"))
bpca <- rasterPCA(b,spca=FALSE)
trpca <- Sys.time() -t0
trpca
eigenvR <- loadings(bpca$model)[]
```
PCA image (composite of PC1 to PC3)
```{r fig.height=4, fig.width=6}
ggRGB(bpca$map,1,2,3,stretch="lin")
```

## 2. PCA with OTB
For OTB the current (v7.1) implementation of otbcli_DimensionalityReduction implies an awkard sequence to cope with nodata:
(otbPCAtest_mini1.scr)

```
C:\ALOBO\OTB-7.1.0-Win64\otbenv.bat
cd D:\OTBtests\OTBpca\pcatest2
otbcli_ManageNoData.bat -in ..\mini1.tif -out mini1MASK.tif uint8 -mode.buildmask.inv 1 -mode.buildmask.outv 0 
otbcli_ManageNoData.bat -in mini1.tif -out mini1Mask.tif uint8 -mode.buildmask.inv 1 -mode.buildmask.outv 0 
#without whitening
otbcli_DimensionalityReduction.bat -in mini1.tif -bv 0 -out mini1PCAotb.tif -method pca -method.pca.whiten false -outmatrix mini1PCAotbnowhit.eigmat.csv
otbcli_ManageNoData.bat -in mini1PCAotb.tif -out mini1PCAotbM.tif -mode apply -mode.apply.mask mini1Mask.tif -mode.apply.ndval 0
```
## 3. Compare eigenvectors
```{r}
eigenvOTB <- t(read.csv(file.path(imadir2,"mini1PCAotbnowhit.eigmat.csv"),header=FALSE,sep=""))
eigenvOTB <- data.frame(eigenvOTB)
eigenvR <- read.csv(file.path(imadir2,"Rmini1eigenmat.csv"),stringsAsFactors = FALSE)[,-1]
colnames(eigenvR) <- colnames(eigenvOTB) <- paste0("PC",1:5)
rownames(eigenvR) <- rownames(eigenvOTB) <- paste0("Band_",1:5)
```
Eigenvectors from R:
```{r class.source = 'fold-hide'}
kable(eigenvR)  %>%
  kable_styling(bootstrap_options = "striped", full_width = F, position = "left")
```
Eigenvectors from OTB:
```{r class.source = 'fold-hide'}
kable(eigenvOTB)  %>%
  kable_styling(bootstrap_options = "striped", full_width = F, position = "left")
```
Difference: most values < 0.01, but note larger difference values in some elements of the eigenvectors
```{r class.source = 'fold-hide'}
kable(abs(eigenvR) - abs(eigenvOTB)) %>%
  kable_styling(bootstrap_options = "striped", full_width = F, position = "left")
```
## 4. Plot PC values
```{r}
rversion <- bpca$map
otbversion <- brick(file.path(imadir2,"mini1PCAotbM.tif")) #from otbPCAtest_mini1.scr
names(rversion) <- names(otbversion) <- paste0("PC",1:5)
```

Extract random values and tidy up for ggplot in long format
```{r}
set.seed(121)
pdata <- sampleRandom(stack(rversion,otbversion), size=1000, na.rm=TRUE,
                      rowcol=TRUE, xy=TRUE, sp=FALSE)
pdata1 <- pdata[,1:9]
pdata2 <- pdata[,c(1:4,10:14)]
colnames(pdata1)[5:9] <- colnames(pdata2)[5:9] <- paste0("PC",1:5)
pdata1lf <- melt(data.frame(pdata1),id.vars=1:4)
names(pdata1lf)[6] <- "Rversion"
pdata2lf <- melt(data.frame(pdata2),id.vars=1:4)
names(pdata2lf)[6] <- "OTBversion"
pdatalf <- data.frame(cbind(pdata1lf,OTBversion=pdata2lf$OTBversion))
#kable(head(pdatalf))  %>%
#  kable_styling(bootstrap_options = "striped", full_width = F, position = "left")
```

Note dispersion increases for higher PCs. I've found this is consistent across images

```{r}
ggplot(data=pdatalf) +
    geom_point(aes(x=Rversion,y=OTBversion)) +
    theme(aspect.ratio = 1) +
    facet_wrap(~variable,scale="free",ncol=3)
```
## 5. Execution times
I often need a pca output in the form of a linearly stretched PCs image in uint8 and leaving 0 for nodata, thus I include this step in the benchmarking too. The stretching is done from the 2% and 98% quantiles to the 1:255 range.
For R, this linear stretching can done by combining raster::clamp() and RStoolbox::RescalImage
```{r, eval = FALSE}
writeRaster(rescaleImage(clamp(bpca$map,lower=q[,1], upper=q[,2]), 
            xmin=q[,1], xmax=q[,2],ymin=1, ymax=255),
            file.path(imadir2,"mini1PCARresc"),
            datatype="INT1U", NAflag=0,
            format="GTiff",overwrite=TRUE)
```

Stretching with the current OTB v7.1 implies fiddling again to cope with nodata:
```
otbcli_DynamicConvert -in mini1PCAotbM.tif type linear -mask mini1Mask.tif -out mini1PCAotbMresc.tif uint8 -outmin 1 -outmax 255
otbcli_ManageNoData.bat -in mini1PCAotbMresc.tif -out mini1PCAotbMrescM.tif -mode apply -mode.apply.mask mini1Mask.tif -mode.apply.ndval 0

```
### 5.1 Small test image
Test with mini1.tif (small test image 85 rows x 116 cols x 5 bands x 32 bit).  
Runing on a very low-profile, consumer grade old laptop:

* Lenovo X220 with Intel Core i5-2520M CPU @2.50 GHz
* SSD hard-drive
* 8 Gb of RAM

Values in the table are in seconds
```{r}
extimes <- read.csv("TimeBenchmark_PCA.csv",header=FALSE,stringsAsFactors = FALSE)
extimes1 <- extimes[2:4,1:3]
names(extimes1) <- extimes1[1,]
rownames(extimes1) <- extimes1[,1]
kable(extimes1[-1,-1])  %>%
  kable_styling(bootstrap_options = "striped", full_width = F, position = "left")
```
### 5.2 Large-size image
Execution times with a real image (BertMICA20190531v2.tif) (8282 rows x 11329 cols x 5 bands x 32 bit).  
For images with a real-life size, an R alternative is using gdalUtilities::gdal_transform(), but the output cannot be clamped to the 1:255 range using gdalUtilities, thus the image has to read back into R for clamp(), which significantly increases the execution time:
```{r, eval = FALSE}
gdal_translate(src_dataset=file.path(rasdirpca,"BertMICA20190531v2PCARfloat.tif"),
               dst_dataset=file.path(rasdirpca,"BertMICA20190531v2PCARresc.tif"),
               scale=mscal,dryrun=FALSE)
BertMICA20190531v2PCARresc <- brick(file.path(rasdirpca,"BertMICA20190531v2PCARresc.tif"))
a1 <- clamp(BertMICA20190531v2PCARresc,lower=1,upper=255, useValues=TRUE,
            format="GTiff",datatype="INT1U",NAflag=0,
            file=file.path(rasdirpca,"BertMICA20190531v2PCARrescG.tif"),
            overwrite=TRUE)
```

Note the advantage of OTB: despite the fact so much reading and writing in OTB severely increases execution time, the OTB run is performed in a reasonable time while waiting for R is unpractical.
Values in the table are in minutes.


```{r}
extimes2 <- extimes[7:10,1:4]
names(extimes2) <- extimes2[1,]
rownames(extimes2) <- extimes2[,1]
kable(extimes2[-1,-1]) %>%
  kable_styling(bootstrap_options = "striped", full_width = F, position = "left")
```
