library(tidyverse)
library(magrittr)
library(fst)

# Load the photo directory table (PD) and the long-format EXIF table (ED).
PD <- read_fst("PhotoDir.fst")
ED <- read_fst("ExifData.fst")
# NOTE(review): the sourced script is not visible here; the two join messages
# recorded below are printed while it runs. Presumably it updates PD/ED so that
# duplicated photos are dropped -- confirm against RemoveDuplicatePhotos.R.
source("RemoveDuplicatePhotos.R")
## Joining, by = c("date_dir", "fc_dir", "photoFile", "is_in_NFS")
## Joining, by = "ID"
# Number of photos = rows of PD; ED holds one row per (photo, EXIF field).
np <- nrow(PD)
cat(paste(
  "N. of photo:", np,
  "Average n. of fields per photo:", nrow(ED) / np
))
## N. of photo: 11505 Average n. of fields per photo: 169.51890482399
# Strip leading/trailing whitespace from field names and values.
ED <- mutate(ED, field = trimws(field), value = trimws(value))

Analysis of the fields and field-value frequencies, fixing duplicate fields, excluding constants

# Summarise a long-format EXIF table.
#
# ED: data frame with (at least) the columns `ID`, `field` and `value`.
#
# Prints the number of distinct IDs and a cross-table of
# field frequency (rows) by number of distinct values per field (columns).
#
# Returns an unnamed list of two tables:
#   [[1]] one row per field: field, n.of.distinct.values, field.freq
#   [[2]] one row per (field, value) pair: field, value, comb.frq
sts <- function(ED) {

  # Distinct photo IDs present in the table
  np <- nrow(distinct(ED, ID))

  # How many rows each field contributes
  freq.by.field <- summarise(group_by(ED, field), field.freq = n())

  # How many rows each (field, value) pair contributes
  field.value.combinations <- summarise(
    group_by(ED, field, value),
    comb.frq = n()
  )

  # Distinct values per field, joined with the per-field row count
  distinct.by.field <- summarise(
    group_by(field.value.combinations, field),
    n.of.distinct.values = n()
  )
  fields <- full_join(distinct.by.field, freq.by.field)

  print(paste("Total number of photo:", np))
  # Rows: field.freq, columns: n.of.distinct.values (hence the transpose)
  print(t(table(fields$n.of.distinct.values, fields$field.freq)))

  list(fields, field.value.combinations)
}

# First pass: field statistics on the trimmed EXIF table.
l <- sts(ED)
## Joining, by = "field"
## [1] "Total number of photo: 11505"
##        
##           1   2   3   5   6   7  10  14  38  41  68  86 111 163 210 548
##   5970    1   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
##   11505 127   4   5   4   0   1   1   5   1   1   1   1   1   1   1   1
##   23010   0   1   0   0   1   0   0   0   0   0   0   0   0   0   0   0
##        
##         1483 1891 9867 11497 11505
##   5970     0    0    0     0     0
##   11505    1    2    1     1     5
##   23010    0    0    0     0     0
# Unpack the two tables returned by sts()
fields <- l[[1]]
field.value.combinations <- l[[2]]

cat("Anomalous frequencies")
## Anomalous frequencies
# Fields that do not occur exactly once per photo
af.fields <- filter(fields, field.freq != np)
print(af.fields)
## # A tibble: 3 x 3
##   field            n.of.distinct.values field.freq
##   <chr>                           <int>      <int>
## 1 Display Aperture                    1       5970
## 2 Focal Length                        6      23010
## 3 Lens                                2      23010
# ...and the values they take, with their frequencies
left_join(af.fields, field.value.combinations)
## Joining, by = "field"
## # A tibble: 9 x 5
##   field      n.of.distinct.va~ field.freq value                   comb.frq
##   <chr>                  <int>      <int> <chr>                      <int>
## 1 Display A~                 1       5970 2.7                         5970
## 2 Focal Len~                 6      23010 4.3 mm                     11370
## 3 Focal Len~                 6      23010 4.3 mm (35 mm equivale~    11370
## 4 Focal Len~                 6      23010 4.6 mm                       102
## 5 Focal Len~                 6      23010 4.6 mm (35 mm equivale~      102
## 6 Focal Len~                 6      23010 6.4 mm                        33
## 7 Focal Len~                 6      23010 6.4 mm (35 mm equivale~       33
## 8 Lens                       2      23010 4.3 - 150.5 mm             11505
## 9 Lens                       2      23010 4.3 - 150.5 mm (35 mm ~    11505
# Add "(2)" to the field name of the second occurrence of fields that appear
# twice per photo (field.freq == 2 * np).
# In the listing above each such field has, for every frequency, a short value
# and a longer one; within each (field, comb.frq) group the longest value is
# taken to identify the second occurrence.
x <- af.fields %>%
  filter(field.freq == 2 * np) %>%
  left_join(field.value.combinations) %>%
  mutate(vl = str_length(value)) %>%
  arrange(field, comb.frq, vl)
## Joining, by = "field"
x <- x %>%
  group_by(field, comb.frq) %>%
  summarise(vl = max(vl)) %>%
  left_join(x) %>%
  select(field, value)
## Joining, by = c("field", "comb.frq", "vl")
# Rename only rows whose (field, value) PAIR was flagged. Matching on `value`
# alone would also rename any other field that happens to carry the same value
# string. "\037" (unit separator) keeps the two parts of the key apart.
ED <- ED %>%
  mutate(field = ifelse(
    paste(field, value, sep = "\037") %in%
      paste(x$field, x$value, sep = "\037"),
    paste0(field, "(2)"),
    field
  ))

# Second pass: same statistics after the duplicated field names were split.
l <- sts(ED)
## Joining, by = "field"
## [1] "Total number of photo: 11505"
##        
##           1   2   3   5   7  10  14  38  41  68  86 111 163 210 548 1483
##   5970    1   0   0   0   0   0   0   0   0   0   0   0   0   0   0    0
##   11505 129   4   7   4   1   1   5   1   1   1   1   1   1   1   1    1
##        
##         1891 9867 11497 11505
##   5970     0    0     0     0
##   11505    2    1     1     5
# Unpack the two tables returned by sts()
fields <- l[[1]]
field.value.combinations <- l[[2]]

cat("Anomalous frequencies AFTER separation of duplicate field names")
## Anomalous frequencies AFTER separation of duplicate field names
# Fields that still do not occur exactly once per photo
af.fields <- filter(fields, field.freq != np)
print(af.fields)
## # A tibble: 1 x 3
##   field            n.of.distinct.values field.freq
##   <chr>                           <int>      <int>
## 1 Display Aperture                    1       5970
# ...and the values they take, with their frequencies
left_join(af.fields, field.value.combinations)
## Joining, by = "field"
## # A tibble: 1 x 5
##   field            n.of.distinct.values field.freq value comb.frq
##   <chr>                           <int>      <int> <chr>    <int>
## 1 Display Aperture                    1       5970 2.7       5970
# Print everything known about one field.
#
# xfield: a field name (character).
#
# Reads the global tables `fields` and `field.value.combinations`; prints the
# matching row of `fields` (transposed) and then the matching
# (field, value) rows. The value of the last print() call is returned.
fvl <- function(xfield) {
  print(t(filter(fields, field == xfield)))
  print(filter(field.value.combinations, field == xfield))
}
# Dump the details of every anomalous field (printing is done inside fvl)
z <- af.fields$field %>% map(fvl)
##                      [,1]              
## field                "Display Aperture"
## n.of.distinct.values "1"               
## field.freq           "5970"            
## # A tibble: 1 x 3
## # Groups:   field [1]
##   field            value comb.frq
##   <chr>            <chr>    <int>
## 1 Display Aperture 2.7       5970
# Constant fields: present once per photo and always with the same value.
# The join adds that single value (and its frequency) to each field name.
constants <- left_join(
  select(filter(fields, n.of.distinct.values == 1, field.freq == np), field),
  field.value.combinations
)
## Joining, by = "field"
# One-line listing of the constant field names
print(paste(constants$field, collapse = " - "))
## [1] "AE Setting - AEB Bracket Value - AF Area Heights - AF Area Mode - AF Area Widths - AF Area X Positions - AF Area Y Positions - AF Image Height - AF Image Width - AF Point - AF Points In Focus - Aspect Ratio - Auto Exposure Bracketing - Base ISO - Bits Per Sample - Bulb Duration - Camera ISO - Camera Model Name - Camera Type - Canon Exposure Mode - Canon Firmware Version - Canon Flash Mode - Canon Image Height - Canon Image Size - Canon Image Type - Canon Image Width - Canon Model ID - Circle Of Confusion - Color Components - Color Space - Components Configuration - Compressed Bits Per Pixel - Compression - Continuous Drive - Contrast - Control Mode - Cropped Image Height - Cropped Image Left - Cropped Image Top - Cropped Image Width - Custom Rendered - Date Stamp Mode - Digital Zoom - Digital Zoom Ratio - Drive Mode - Easy Mode - Encoding Process - Exif Byte Order - Exif Image Height - Exif Image Width - Exif Version - ExifTool Version Number - Exposure Compensation - Exposure Mode - File Source - File Type - File Type Extension - Firmware Revision - Flash - Flash Activity - Flash Bits - Flash Exposure Compensation - Flash Guide Number - Flash Output - Flashpix Version - Focal Plane Resolution Unit - Focal Plane X Resolution - Focal Plane Y Resolution - Focal Units - Focus Continuous - Focus Distance Lower - Focus Mode - Focus Range - Image Description - Image Height - Image Size - Image Stabilization - Image Width - Intelligent Contrast - Interoperability Index - Interoperability Version - Lens - Lens ID - Lens Type - Lens(2) - Macro Mode - Make - Manual Flash Output - Max Focal Length - Megapixels - Metering Mode - MIME Type - Min Aperture - Min Focal Length - My Color Mode - ND Filter - Num AF Points - Owner Name - Primary AF Point - Quality - Rating - Record Mode - Related Image Height - Related Image Width - Resolution Unit - Saturation - Scale Factor To 35 mm Equivalent - Scene Capture Type - Self Timer - Self Timer 2 - Sensing Method - Sensitivity 
## Type - Sharpness - Shooting Mode - Shot Number In Continuous Burst - Slow Shutter - Spot Metering Mode - Thumbnail Image Valid Area - Thumbnail Offset - User Comment - Valid AF Points - VRD Offset - White Balance - X Resolution - Y Cb Cr Positioning - Y Cb Cr Sub Sampling - Y Resolution - Zoom Source Width - Zoom Target Width"
# Keep a record of the constant fields before dropping them
write.csv2(constants, "fields_with_same_value_for_all_photos.csv2")

# Remove from ED every (field, value) row listed in `constants`
ED <- anti_join(ED, constants)
## Joining, by = c("field", "value")
# Third pass: statistics on the non-constant attributes only
l <- sts(ED)
## Joining, by = "field"
## [1] "Total number of photo: 11505"
##        
##         1 2 3 5 7 10 14 38 41 68 86 111 163 210 548 1483 1891 9867 11497
##   5970  1 0 0 0 0  0  0  0  0  0  0   0   0   0   0    0    0    0     0
##   11505 0 4 7 4 1  1  5  1  1  1  1   1   1   1   1    1    2    1     1
##        
##         11505
##   5970      0
##   11505     5
# Unpack the two tables returned by sts()
fields <- l[[1]]
field.value.combinations <- l[[2]]

# Reshape ED to one column per field and attach the result to the photo table
PDext <- left_join(PD, spread(ED, field, value))
## Joining, by = "ID"

Analysis of the ‘Create Date’ and ‘Orientation’ fields

Shooting days, number of photos and time required

library(ggpubr)
library(lubridate)

# Photos per shooting day: the directory date (date_dir) is listed next to the
# date part (first 10 characters) of the EXIF `Create Date`.
PDext %>%
  mutate(CD = str_sub(`Create Date`, 1, 10)) %>%
  group_by(date_dir, CD) %>%
  tally()
## # A tibble: 7 x 3
## # Groups:   date_dir [?]
##   date_dir CD             n
##   <chr>    <chr>      <int>
## 1 20180615 2018:06:15  1188
## 2 20180619 2018:06:19  2196
## 3 20180622 2018:06:22  2466
## 4 20180626 2018:06:26  2208
## 5 20180627 2018:06:27  1809
## 6 20180702 2018:07:02  1221
## 7 20180703 2018:07:03   417
# First and last shot of each day and the time elapsed between them (hours).
# The result column was misspelled `shhoting_time`; it is now `shooting_time`.
PDext %>%
  mutate(
    CD = str_sub(`Create Date`, 1, 10),
    CT = as_datetime(`Create Date`)
  ) %>%
  group_by(CD) %>%
  summarise(minT = min(CT), maxT = max(CT)) %>%
  mutate(shooting_time = difftime(maxT, minT, units = "hours"))
## # A tibble: 7 x 4
##   CD         minT                maxT                shooting_time  
##   <chr>      <dttm>              <dttm>              <time>         
## 1 2018:06:15 2018-06-15 08:52:11 2018-06-15 11:53:34 3.0230556 hours
## 2 2018:06:19 2018-06-19 08:40:03 2018-06-19 15:16:06 6.6008333 hours
## 3 2018:06:22 2018-06-22 08:48:33 2018-06-22 15:21:00 6.5408333 hours
## 4 2018:06:26 2018-06-26 08:40:48 2018-06-26 15:06:14 6.4238889 hours
## 5 2018:06:27 2018-06-27 08:45:02 2018-06-27 15:06:40 6.3605556 hours
## 6 2018:07:02 2018-07-02 09:40:53 2018-07-02 14:45:48 5.0819444 hours
## 7 2018:07:03 2018-07-03 09:12:42 2018-07-03 10:01:03 0.8058333 hours

Distinguishing A and B experimental sets

# Turn Orientation into a factor and relabel two of its levels:
# the level that uniquely starts with "Rotate" becomes "B-portrait",
# the level that uniquely starts with "Horizontal" becomes "A-landscape".
PDext$Orientation <- factor(PDext$Orientation)
levels(PDext$Orientation)[
  pmatch("Rotate", levels(PDext$Orientation))
] <- "B-portrait"
levels(PDext$Orientation)[
  pmatch("Horizontal", levels(PDext$Orientation))
] <- "A-landscape"

# Stacked bar chart of photo counts per fc_dir, filled by orientation
# (factor levels reversed before plotting).
PDext %>%
  group_by(fc_dir, Orientation) %>%
  tally() %>%
  mutate(Orientation = fct_rev(Orientation)) %>%
  ggbarplot(
    x = "fc_dir",
    y = "n",
    fill = "Orientation",
    x.text.angle = -90,
    xlab = "sample tree",
    ylab = "n.of.photo",
    main = "Distribution by sample tree (FC) or sample trees group"
  )

Time required per tree

# Flag directories whose name is longer than 4 characters.
# NOTE(review): presumably these are the "sample trees group" directories, as
# opposed to single-tree (FC) ones -- confirm against the naming convention.
PDext <- mutate(PDext, fc_group = str_length(fc_dir) > 4)

# Per tree and orientation: number of photos (np), first/last shot, elapsed
# time in seconds (te) and mean interval between consecutive photos (tebyp).
# NOTE(review): tebyp is NaN for a tree/orientation with a single photo
# (0 seconds / 0 intervals).
x <- PDext %>%
  filter(!fc_group) %>%
  mutate(CT = as_datetime(`Create Date`)) %>%
  group_by(fc_dir, Orientation) %>%
  summarise(np = n(), minT = min(CT), maxT = max(CT)) %>%
  mutate(
    te = difftime(maxT, minT, units = "sec"),
    tebyp = te / (np - 1)
  )

# Mean number of photos per orientation, with the horizontal extent
# (position +/- 0.3) of the segment that marks it in the plot.
# Positions follow the row order, so any number of orientation levels works
# (previously hard-coded as 1:2).
xmg <- x %>%
  group_by(Orientation) %>%
  summarise(ym = mean(np)) %>%
  mutate(
    x = seq_along(ym) - .3,
    xend = seq_along(ym) + .3
  )

# Panel A: photos per tree by orientation (violin + dot plot), with the
# per-orientation mean from `xmg` drawn as a red segment and labelled.
p1 <-
  x %>%
  ggplot(aes(Orientation, np, colour = Orientation)) +
    # NOTE(review): draw_quantiles expects a numeric vector of quantiles;
    # TRUE is coerced to 1. If quartile lines were intended, use
    # c(0.25, 0.5, 0.75) -- left unchanged to preserve the current figure.
    geom_violin(trim = TRUE, scale = "count", draw_quantiles = TRUE) +
    scale_y_continuous(breaks = seq(from = 10, to = 150, by = 10)) +
    geom_dotplot(binaxis = "y", stackdir = "center", stackratio = 1,
                 dotsize = .3) +
    annotate("segment", x = xmg$x, xend = xmg$xend, y = xmg$ym, yend = xmg$ym,
             colour = "red", size = 1.5) +
    annotate("text", x = xmg$x - .1, y = xmg$ym,
             label = paste("mean =", formatC(xmg$ym, digits = 0, format = "f"))) +
#    geom_segment(aes(x = 50, xend = x, y = -.3, yend = -y)) +
    ylab("number of photos per tree") +
    coord_flip() +
    theme(legend.position = "none")

# Panel B: density of the mean interval between photos of the same tree
p2 <-
  x %>%
  ggplot(aes(as.numeric(tebyp), colour = Orientation, fill = Orientation)) +
  geom_density(alpha = 0.5) +
  expand_limits(x = c(2, 8)) +
  xlab("time between photos of same tree [sec]")

# Panel C: density of the total time spent per tree, in minutes
p3 <-
  x %>%
  ggplot(aes(as.numeric(te) / 60, colour = Orientation, fill = Orientation)) +
  geom_density(alpha = 0.5) +
  expand_limits(x = c(0, 11)) +
  xlab("time spent per tree [mins]")


ggarrange(p1,                                                 # First row: violin/dot plot (A)
          ggarrange(p2, p3, ncol = 2, labels = c("B", "C"),
                    common.legend = TRUE, legend = "bottom"), # Second row: the two density plots
          nrow = 2,
          labels = "A"                                        # Label of the first-row plot
          )
## `stat_bindot()` using `bins = 30`. Pick better value with `binwidth`.

# Render the R Markdown report. The file name must be delimited by straight
# quotes: typographic quotes are not valid R string delimiters.
rmarkdown::render("PPC_EXIFdataAnalysis.Rmd")