File Restructure and Exif Extract

library(here)
library(stringr)
library(reticulate)

Lists all the files in the specified RDS folders - it works recursively down them so only the top level folder needs to be specified.

# Recursively list every file below the Kenya working-data folder,
# returning full paths.
files_moved <- list.files(
  path = "M:/biome_health_project_files/country_files/kenya/working_data/",
  full.names = TRUE,
  recursive = TRUE
)

# Persist the listing as an R object
saveRDS(object = files_moved, file = "list_files_RDS_working.rds")

Reading in a list of all of the files that have been loaded on the RDS

# Load the saved vector of file paths
exif_out <- readRDS(file = "list_files_RDS_working.rds")

Excluding any that don’t have the desired file type - here “.JPG”

# Keep only paths that end in the literal ".JPG" extension. The dot is
# escaped and the pattern anchored to the end of the string; an unescaped
# "." matches any character, so ".JPG" would also keep paths that merely
# contain "<any character>JPG" somewhere.
exif_out <- exif_out[grepl("\\.JPG$", exif_out)]

Getting the filename out of filepath - this function extracts the last section of the string after it is split by “/”

# Break each path into its "/"-separated components
# (one character vector per file)
file_split <- strsplit(x = exif_out, split = "/")

# Return the final element of `x`; for a split path this is the file name.
get_last <- function(x) {
  x[[length(x)]]
}

# Pull the file name out of every split path, then flatten the list
# into a plain vector
image_nos <- lapply(X = file_split, FUN = get_last)
img_nos <- unlist(image_nos)

Creating a dataframe with the filepath and filename as columns

# Two-column data frame: full path and bare file name
exif_out <- setNames(
  data.frame(exif_out, img_nos),
  c("filepath", "image_no")
)

Removing anything that isn’t a number from the filename - for the image files this strips the file extension and any other non-digit characters (such as a letter prefix), leaving only the image number

# Strip every non-digit character from the file name and parse what is
# left as a number
digits_only <- gsub("[^0-9]", "", exif_out$image_no)
exif_out$image_num <- as.numeric(digits_only)

# Return the folder that directly contains the file (second-to-last path
# component). When that folder's name contains "BTCF", return the folder
# one level further up instead.
get_second_last <- function(x) {
  n <- length(x)
  parent <- x[[n - 1]]
  if (!grepl("BTCF", parent)) {
    return(parent)
  }
  x[[n - 2]]
}

# Camera folder for every file, flattened to a vector and stored as a column
site_cam <- unlist(lapply(X = file_split, FUN = get_second_last))

exif_out$site_cam <- site_cam

If there are sensors with A and B folders this standardises them. There was an issue with some folders being named 100/101 BTCF rather than a/b in the Kenya CT images.

# Take the final character of each camera folder name
ab <- str_sub(exif_out$site_cam, start = -1, end = -1)

# TRUE when `x` contains no character outside A-Z / a-z
letters_only <- function(x) {
  !grepl("[^A-Za-z]", x)
}

# TRUE when `x` contains no non-digit character
numbers_only <- function(x) {
  !grepl("\\D", x)
}

# Folder names whose last character is a digit are assigned to "a"
ab[numbers_only(ab)] <- "a"

# Fold upper-case suffixes into lower case
ab[ab == "B"] <- "b"
ab[ab == "A"] <- "a"

table(ab)

## ab
##       a       b 
## 1949181  408276

exif_out$ab <- ab

Getting month from filepath - might be unique to Kenya CT.

# Any path containing "november" is a November file; everything else is
# treated as October
is_november <- grepl("november", exif_out$filepath)
exif_out$month <- ifelse(is_november, "november", "october")

# Start the new numbering from the original image number
exif_out$new_img_num <- exif_out$image_num

Creating unique file numbers - there were duplicates for files in october/november and in a/b folders. Remedy this by adding 20000 to files in november and 10000 to b files. The numbers are then padded with leading zeros so that they are 6 digits long.

October a = 000001 - 009999; October b = 010001 - 019999; November a = 020001 - 029999; November b = 030001 - 039999

# Offsets that keep image numbers unique once the october/november and a/b
# folders are combined: November files get +20000, "b" folders get +10000.
in_november <- exif_out$month == "november"
in_b_folder <- exif_out$ab == "b"

exif_out$new_img_num[in_november] <- exif_out$image_num[in_november] + 20000

exif_out$new_img_num[in_b_folder] <- exif_out$new_img_num[in_b_folder] + 10000

# Zero-pad to six digits. The number is formatted explicitly with sprintf()
# rather than padding the implicit as.character() conversion, which can
# switch to scientific notation for some values (e.g. 100000 -> "1e+05").
# Missing numbers stay NA instead of becoming the string "NA".
padded_num <- sprintf("%06.0f", exif_out$new_img_num)
padded_num[is.na(exif_out$new_img_num)] <- NA_character_
exif_out$new_img_num <- padded_num

Getting site from the site id by splitting MN_02 by the underscore and getting the first section

# Split each camera folder name on underscores
site_split <- strsplit(x = exif_out$site_cam, split = "_")

Getting the site id out of site_cam

# Return the first element of `x`
get_first <- function(x) {
  x[[1]]
}

# First underscore-separated piece of every camera folder name
site_id <- lapply(X = site_split, FUN = get_first)
site_ids <- unlist(site_id)

# Store it as the site id column
exif_out$site_id <- site_ids

Renaming the existing files so that they have unique names: YEAR_SITE_NUMBER.JPG e.g. 2018_MN_000001.JPG

This function gets the filepath but removes the last section - the part with the filename.

# Re-split the stored paths (coerced to character) on "/"
path_strings <- as.character(exif_out$filepath)
file_split <- strsplit(path_strings, "/")

# Drop the final component of a split path and glue the remaining pieces
# back together with "/", giving the containing directory.
remove_last <- function(x) {
  paste0(head(x, -1), collapse = "/")
}

# Directory part of every path, flattened to a vector
image_nos <- lapply(X = file_split, FUN = remove_last)
img_nos <- unlist(image_nos)

Creating the new filepath with the new filename

# Year used as the prefix of every renamed file
Year <- 2018

# New path: same directory, file name YEAR_SITEID_NUMBER.JPG
exif_out$filepath_image_rename <- paste0(
  img_nos, "/", Year, "_", exif_out$site_id, "_", exif_out$new_img_num, ".JPG"
)

# Record the old -> new mapping before touching any files
write.csv(exif_out, "rename_image_file.csv", row.names = FALSE)

# file.rename() overwrites an existing target where permissions allow, so
# refuse to continue if two source files would receive the same new name.
if (anyDuplicated(exif_out$filepath_image_rename) > 0) {
  stop(
    "Duplicate target paths in filepath_image_rename; ",
    "renaming would overwrite files",
    call. = FALSE
  )
}

# Rename in place and report failures instead of silently discarding them
renamed_ok <- file.rename(
  as.character(exif_out$filepath),
  exif_out$filepath_image_rename
)
if (!all(renamed_ok)) {
  warning(sum(!renamed_ok), " file(s) could not be renamed", call. = FALSE)
}

New File Structure

# Read the rename mapping back in. `new_img_num` is forced to character:
# left to read.csv's automatic type conversion, the zero-padded "000001"
# would come back as the integer 1 and the padding would be lost.
# stringsAsFactors = FALSE keeps the path columns as character vectors on
# R versions where factors were the default.
exif_out <- read.csv(
  "rename_image_file.csv",
  colClasses = c(new_img_num = "character"),
  stringsAsFactors = FALSE
)

# Site = site id with all digits removed
exif_out$site <- gsub("[[:digit:]]+", "", exif_out$site_id)

# Root folder under which the new structure is created
base_path <- "M:/biome_health_project_files/country_files/kenya/working_data"

Data type CT or AD

# Data type folder name used in the new directory structure
data_type <- "CT"

# Full destination path: base/site/site_id/data_type/year/<file name>.
# The file name is taken from the already-renamed file
# (YEAR_SITEID_NUMBER.JPG) rather than rebuilt, so it does not depend on a
# leftover global `site_id` or on `new_img_num` keeping its zero padding
# through the CSV round trip.
exif_out$new_file_structure <- paste0(
  base_path, "/", exif_out$site, "/", exif_out$site_id, "/", data_type, "/",
  Year, "/", basename(as.character(exif_out$filepath_image_rename))
)

# Destination directory for each file
exif_out$new_dir_structure <- paste(
  base_path, exif_out$site, exif_out$site_id, data_type, Year,
  sep = "/"
)

# file.rename() overwrites an existing target where permissions allow, so
# stop before anything is moved if two files map to the same destination.
if (anyDuplicated(exif_out$new_file_structure) > 0) {
  stop(
    "Duplicate destination paths in new_file_structure; ",
    "moving would overwrite files",
    call. = FALSE
  )
}

# Create a directory, including any missing parent directories
dir_creator <- function(x) {
  dir.create(x, recursive = TRUE)
}

invisible(lapply(unique(exif_out$new_dir_structure), dir_creator))

# Keep a record of original and new locations
saveRDS(exif_out, "original_filepaths_working_filepaths.RDS")
write.csv(exif_out, "original_filepaths_working_filepaths.csv")

# Move the files; as.character() guards against the column being a factor
moved_ok <- file.rename(
  as.character(exif_out$filepath_image_rename),
  exif_out$new_file_structure
)
if (!all(moved_ok)) {
  warning(sum(!moved_ok), " file(s) could not be moved", call. = FALSE)
}

Getting exif data out

import PIL.Image
import PIL.ExifTags
import pickle
import os
import sys
import pandas as pd

# Paths of the images in their new locations
df = pd.read_csv("original_filepaths_working_filepaths.csv")
allfiles = df.new_file_structure

# Pick out which exif data you're interested in
keys = ['Make', 'Model', 'DateTime', 'DateTimeDigitized', 'LightSource', 'Flash']

out_path = os.path.join('D:/Fiona/Biome_Health_Project/exif_output/exif_out_test.txt')

# Write one line per image: "<filepath>, <value>, <value>, ...".
# The output file is opened in a `with` block and written to directly, so it
# is flushed and closed at the end and sys.stdout is left untouched.
with open(out_path, "w") as out_file:
    for image in allfiles:
        filepath = str(image)
        try:
            with PIL.Image.open(image) as img:
                # _getexif() returns None when the image has no EXIF data
                raw_exif = img._getexif() or {}
        except OSError:
            # Unreadable image: log it and move on, rather than falling
            # through and reusing the previous image's data
            print('Bad file ' + filepath, file=out_file)
            continue

        # Map numeric EXIF tag ids to their names, dropping unknown tags
        exif = {
            PIL.ExifTags.TAGS[k]: v
            for k, v in raw_exif.items()
            if k in PIL.ExifTags.TAGS
        }
        keys_out = [str(exif.get(key)) for key in keys]
        print(filepath + ', ' + ', '.join(keys_out), file=out_file)