feature extraction

library(here)
library(dplyr)
library(reshape2)
library(ggplot2)
library(DescTools)
library(knitr)
library(dplyr)
library(lubridate)
library(pracma)

#####adding in appropriate date time vlaues for each of the acoustic datasets

file_name<-list.files(paste(here(),"/Data/Raw_data", sep=""), pattern = "*.csv")

f_sites<-gsub("_|.csv", "", file_name)

f_df<-data.frame(file_name, f_sites)

all_sites<-read.csv("surveyTimeAndDate_FSamend.csv")

sites_f<-merge(f_df, all_sites, by.x = "f_sites", by.y = "SiteCode")

sites_f$start_datetime<-as.POSIXct(paste(sites_f$Start_Date, sites_f$Start_Time, sep = " "), format = "%d-%m-%y %H:%M:%OS")
sites_f$end_datetime<-as.POSIXct(paste(sites_f$End_Date, sites_f$End_Time, sep = " "), format = "%d-%m-%y %H:%M:%OS")

sr<-0.524  #one record every 0.524 seconds


site_datetime<-function(site_name){
  
  df<-sites_f[sites_f$f_sites == site_name,]
  
  acoustic<-read.csv(paste(getwd(), "/Data/Raw_data/", df$file_name, sep=""))
  
  time_seq<-seq(
    from=as.POSIXct(df$start_datetime[1], format = "%Y-%m-%d %H:%M:%OS"),
    to=as.POSIXct(df$end_datetime[1], format = "%Y-%m-%d %H:%M:%OS"),
    by= sr
    #length.out = (7*24*60*60)/sr
  )  
  
  t.lub <- ymd_hms(time_seq)
  h.lub <- minute(t.lub)
  
  down_time<-which(h.lub == 29 |h.lub == 59 )    #removing times that are in the 29th or 59th minute
  
  time_seq_sub<-time_seq[-down_time]
  time_seq_sub<-time_seq_sub[1:nrow(acoustic)]
  df_sub <- acoustic
  #df_sub<-acoustic[1:length(time_seq_sub),]
  df_sub$datetime<-time_seq_sub
  df_sub$site<-site_name
  
  df_sub<-df_sub[complete.cases(df_sub),]
  
  write.csv(df_sub, paste(getwd(), "/Data/Processed_data_c/",site_name, "_acoustic_datetime_check.csv", sep=""), row.names = FALSE)
  return(df_sub)
  
}

sapply(sites_f$f_sites, site_datetime)

files<-list.files(here::here("/Data/Processed_data_c/"))

suntimes<-read.csv(here::here("Data/suntimes_summary_allsites.csv"), stringsAsFactors = FALSE)

sr_min <- as.POSIXct(strftime(suntimes$min_sunrise, format="%H:%M:%S"), format="%H:%M:%S", tz = "GMT")
sr_max <- as.POSIXct(strftime(suntimes$max_sunrise, format="%H:%M:%S"), format="%H:%M:%S", tz = "GMT")

ss_min<- as.POSIXct(strftime(suntimes$min_sunset, format="%H:%M:%S"), format="%H:%M:%S", tz = "GMT")
ss_max<-as.POSIXct(strftime(suntimes$max_sunset, format="%H:%M:%S"), format="%H:%M:%S", tz = "GMT")


for (i in 1:length(sr_min)){
  suntimes$mean_sunrise[i]<-mean(c(sr_min[i], sr_max[i]))
  suntimes$mean_sunset[i]<-mean(c(ss_min[i], ss_max[i]))
  suntimes$mean_sunrise<-as_datetime(suntimes$mean_sunrise)
  suntimes$mean_sunset<-as_datetime(suntimes$mean_sunset)
  }


suntimes$mean_sunrise <- strftime(suntimes$mean_sunrise, format="%H:%M:%OS")
suntimes$mean_sunrise <- as.POSIXct(suntimes$mean_sunrise, format="%H:%M:%OS")


suntimes$mean_sunset <- strftime(suntimes$mean_sunset, format="%H:%M:%OS")
suntimes$mean_sunset <- as.POSIXct(suntimes$mean_sunset, format="%H:%M:%OS")


suntimes$sr_mins_after_midnight<-as.numeric(difftime(suntimes$mean_sunrise, as.POSIXct('00:00:00', format = '%H:%M:%S'), units = 'min'))
suntimes$sr_mins_after_midnight<-suntimes$sr_mins_after_midnight

suntimes$ss_mins_after_midnight<-as.numeric(difftime(suntimes$mean_sunset, as.POSIXct('00:00:00', format = '%H:%M:%S'), units = 'min'))
suntimes$ss_mins_after_midnight<-suntimes$ss_mins_after_midnight

feature_extract<-function(file){

    all_data<-read.csv(paste(here::here("/Data/Processed_data_c/", file), sep=""))
    
    all_data$time_30min<-lubridate::round_date(as.POSIXct(all_data$datetime, format = "%Y-%m-%d %H:%M:%S"), "30 minutes") 
                     
    all_data_avg <- all_data %>%
    group_by(site, time_30min)%>%
    summarize(anthrop_30 = mean(anthrop), biotic_30 = mean(biotic))
    
    all_data_avg$time <- strftime(all_data_avg$time_30min, format="%H:%M:%OS")
    all_data_avg$time <- as.POSIXct(all_data_avg$time, format="%H:%M:%OS")
    
    all_data_avg$day<-format(as.Date(all_data_avg$time_30min ,format="%Y-%m-%d"), "%d")
    
    
    all_data_avg$minutes_after_midnight<-as.numeric(difftime(all_data_avg$time, as.POSIXct('00:00:00', format = '%H:%M:%S'), units = 'min'))
    
    all_data_avg<-all_data_avg[complete.cases(all_data_avg),]
    
    data_summary<-all_data_avg %>%
    group_by(site,day) %>%
    mutate(which_diff_anthro = which.max(diff(anthrop_30)), steepest_anth = max(diff(anthrop_30)), which_diff_bio = which.max(diff(biotic_30)), steepest_bio = max(diff(biotic_30)),steepest_time_anth = time_30min[unique(which_diff_anthro)],steepest_time_bio = time_30min[unique(which_diff_bio)], auc_anthro = AUC(minutes_after_midnight, anthrop_30, method = "spline"),auc_bio = AUC(minutes_after_midnight, biotic_30, method = "spline"), n_peaks_anth = ifelse( is.null(nrow(findpeaks(anthrop_30, threshold = 0.5))), NA,  nrow(findpeaks(anthrop_30, threshold = 0.5))),n_peaks_bio = ifelse( is.null(nrow(findpeaks(biotic_30, threshold = 0.5))), NA,  nrow(findpeaks(biotic_30, threshold = 0.5))), auc_an_min_bio = auc_anthro - auc_bio, peak_anth = max(anthrop_30), peak_bio = max(biotic_30), peak_anth_time = time_30min[which(anthrop_30 == peak_anth)],peak_bio_time = time_30min[which(biotic_30 == peak_bio)] )  %>%
    select(site, day, steepest_anth, steepest_bio,steepest_time_anth,steepest_time_bio ,auc_anthro, auc_bio, n_peaks_anth, n_peaks_bio, auc_an_min_bio, peak_anth, peak_bio, peak_anth_time, peak_bio_time)%>%
    distinct()
    
    print(file)
    return(data_summary)
}


fe_out<-lapply(files, feature_extract)


df<-bind_rows(fe_out, .id = "column_label")

write.csv(df, "example_feature_extraction.csv", row.names = FALSE)

Dawn/Dusk specific feature extraction

Need to make slope an absolute value - currently only pilling out steepest positive slopes i.e. slopes of rapidly increasing volume

library(forcats)

dawn_feature_extract<-function(file){

    all_data<-read.csv(paste(here::here("/Data/Processed_data_c/", file), sep=""))
    
    suntimes_sub<-suntimes[suntimes$site_id %in% all_data$site,]
    
    all_data$time_30min<-lubridate::round_date(as.POSIXct(all_data$datetime, format = "%Y-%m-%d %H:%M:%S"), "30 minutes") 
                     
    all_data_avg <- all_data %>%
    group_by(site, time_30min)%>%
    summarize(anthrop_30 = mean(anthrop), biotic_30 = mean(biotic))

    all_data_avg$time <- strftime(all_data_avg$time_30min, format="%H:%M:%OS")
    all_data_avg$time <- as.POSIXct(all_data_avg$time, format="%H:%M:%OS")
    all_data_avg$day<-format(as.Date(all_data_avg$time_30min ,format="%Y-%m-%d"), "%d")
    all_data_avg$minutes_after_midnight<-as.numeric(difftime(all_data_avg$time, as.POSIXct('00:00:00', format = '%H:%M:%S'), units = 'min'))
    all_data_avg<-all_data_avg[complete.cases(all_data_avg),]
    
    all_data_avg$site<-fct_explicit_na(all_data_avg$site, na_level = "(Missing)")
    
    all_data_avg<-all_data_avg[complete.cases(all_data_avg),]
    
    sr_data<-all_data_avg[all_data_avg$minutes_after_midnight >= (suntimes_sub$sr_mins_after_midnight - 60) & all_data_avg$minutes_after_midnight <= (suntimes_sub$sr_mins_after_midnight + 60),]
    
    dawn_summary<-sr_data %>%
    group_by(site,day) %>%
    mutate(steepest_dawn_anth = max(diff(anthrop_30)), steepest_dawn_bio = max(diff(biotic_30)), auc_dawn_anthro = AUC(minutes_after_midnight, anthrop_30, method = "spline"),auc_dawn_bio = AUC(minutes_after_midnight, biotic_30, method = "spline"), dawn_auc_an_min_bio = auc_dawn_anthro - auc_dawn_bio, peak_dawn_anth = max(anthrop_30), peak_dawn_bio = max(biotic_30) )  %>%
    select(site, day, steepest_dawn_anth, steepest_dawn_bio, auc_dawn_anthro, auc_dawn_bio, dawn_auc_an_min_bio, peak_dawn_anth, peak_dawn_bio)%>%
    distinct()
    
    print(file)
    return(dawn_summary)
}

dawn_out<-lapply(files, dawn_feature_extract)


dawn_df<-bind_rows(dawn_out, .id = "column_label")


write.csv(df, "dawn_example_feature_extraction.csv", row.names = FALSE)

dusk_feature_extract<-function(file){

    all_data<-read.csv(paste(here::here("/Data/Processed_data_c/", file), sep=""))
    
    suntimes_sub<-suntimes[suntimes$site_id %in% all_data$site,]
    
    all_data$time_30min<-lubridate::round_date(as.POSIXct(all_data$datetime, format = "%Y-%m-%d %H:%M:%S"), "30 minutes") 
                     
    all_data_avg <- all_data %>%
    group_by(site, time_30min)%>%
    summarize(anthrop_30 = mean(anthrop), biotic_30 = mean(biotic))

    all_data_avg$time <- strftime(all_data_avg$time_30min, format="%H:%M:%OS")
    all_data_avg$time <- as.POSIXct(all_data_avg$time, format="%H:%M:%OS")
    all_data_avg$day<-format(as.Date(all_data_avg$time_30min ,format="%Y-%m-%d"), "%d")
    all_data_avg$minutes_after_midnight<-as.numeric(difftime(all_data_avg$time, as.POSIXct('00:00:00', format = '%H:%M:%S'), units = 'min'))
    all_data_avg<-all_data_avg[complete.cases(all_data_avg),]
    
    ss_data<-all_data_avg[all_data_avg$minutes_after_midnight >= (suntimes_sub$ss_mins_after_midnight - 60) & all_data_avg$minutes_after_midnight <= (suntimes_sub$ss_mins_after_midnight + 60),]
    
    dusk_summary<-ss_data %>%
    group_by(site,day) %>%
    mutate(steepest_dusk_anth = max(diff(anthrop_30)), steepest_dusk_bio = max(diff(biotic_30)), auc_dusk_anthro = AUC(minutes_after_midnight, anthrop_30, method = "spline"),auc_dusk_bio = AUC(minutes_after_midnight, biotic_30, method = "spline"), dusk_auc_an_min_bio = auc_dusk_anthro - auc_dusk_bio, peak_dusk_anth = max(anthrop_30), peak_dusk_bio = max(biotic_30) )  %>%
    select(site, day, steepest_dusk_anth, steepest_dusk_bio, auc_dusk_anthro, auc_dusk_bio, dusk_auc_an_min_bio, peak_dusk_anth, peak_dusk_bio)%>%
    distinct()
    
    print(file)
    return(dusk_summary)
}

dusk_out<-lapply(files, dusk_feature_extract)


dusk_df<-bind_rows(dusk_out, .id = "column_label")

write.csv(df, "dusk_example_feature_extraction.csv", row.names = FALSE)

df_dawn<-merge(df, dawn_df, by=c("site", "day"))

df_all<-merge(df_dawn, dusk_df, by=c("site", "day"))

df_all<-df_all[, -grep("column_label", colnames(df_all))]


df_all$n_peaks_anth[is.na(df_all$n_peaks_anth)]<-0
df_all$n_peaks_bio[is.na(df_all$n_peaks_bio)]<-0

write.csv(df_all, "all_days_feature_extractions.csv", row.names = FALSE)

df_all<-read.csv("all_days_feature_extractions.csv")

test_pca<-prcomp(df_all[,c(3,4,7:13, 16:22,25:29)], center = TRUE,scale. = TRUE)

summary(test_pca)

## Importance of components:
##                           PC1    PC2     PC3     PC4    PC5     PC6
## Standard deviation     3.2471 1.8969 1.29023 1.00421 0.8956 0.83684
## Proportion of Variance 0.5021 0.1714 0.07927 0.04802 0.0382 0.03335
## Cumulative Proportion  0.5021 0.6734 0.75270 0.80072 0.8389 0.87227
##                            PC7     PC8     PC9    PC10    PC11    PC12
## Standard deviation     0.75958 0.70633 0.62936 0.57820 0.42593 0.41324
## Proportion of Variance 0.02747 0.02376 0.01886 0.01592 0.00864 0.00813
## Cumulative Proportion  0.89974 0.92350 0.94236 0.95828 0.96692 0.97505
##                           PC13    PC14    PC15    PC16    PC17    PC18
## Standard deviation     0.35278 0.32523 0.29800 0.28700 0.25279 0.24216
## Proportion of Variance 0.00593 0.00504 0.00423 0.00392 0.00304 0.00279
## Cumulative Proportion  0.98098 0.98601 0.99024 0.99416 0.99721 1.00000
##                             PC19      PC20      PC21
## Standard deviation     4.053e-10 1.336e-10 1.201e-10
## Proportion of Variance 0.000e+00 0.000e+00 0.000e+00
## Cumulative Proportion  1.000e+00 1.000e+00 1.000e+00

library(devtools)

## Warning: package 'devtools' was built under R version 3.5.2

devtools::install_github("vqv/ggbiplot")

## Skipping install of 'ggbiplot' from a github remote, the SHA1 (7325e880) has not changed since last install.
##   Use `force = TRUE` to force installation

library(ggbiplot)

## Loading required package: plyr

## -------------------------------------------------------------------------

## You have loaded plyr after dplyr - this is likely to cause problems.
## If you need functions from both plyr and dplyr, please load plyr first, then dplyr:
## library(plyr); library(dplyr)

## -------------------------------------------------------------------------

## 
## Attaching package: 'plyr'

## The following object is masked from 'package:lubridate':
## 
##     here

## The following object is masked from 'package:ggplot2':
## 
##     empty

## The following objects are masked from 'package:dplyr':
## 
##     arrange, count, desc, failwith, id, mutate, rename, summarise,
##     summarize

## The following object is masked from 'package:here':
## 
##     here

## Loading required package: scales

## Loading required package: grid

ggbiplot(test_pca)

library(factoextra)

## Warning: package 'factoextra' was built under R version 3.5.3

## Welcome! Related Books: `Practical Guide To Cluster Analysis in R` at https://goo.gl/13EFCZ

fviz_pca_var(test_pca,
             col.var = "contrib", # Color by contributions to the PC
             gradient.cols = c("#00AFBB", "#E7B800", "#FC4E07"),
             repel = TRUE     # Avoid text overlapping
             )

fviz_pca_biplot(test_pca, repel = TRUE,
                col.var = "#2E9FDF", # Variables color
                col.ind = "#696969"  # Individuals color
                )

feature extraction

Fiona Spooner

April 3, 2019

Dawn/Dusk specific feature extraction

Need to make slope an absolute value - currently only pilling out steepest positive slopes i.e. slopes of rapidly increasing volume