library(here)
library(dplyr)
library(reshape2)
library(ggplot2)
library(DescTools)
library(knitr)
library(dplyr)
library(lubridate)
library(pracma)
#####adding in appropriate date time vlaues for each of the acoustic datasets
file_name<-list.files(paste(here(),"/Data/Raw_data", sep=""), pattern = "*.csv")
f_sites<-gsub("_|.csv", "", file_name)
f_df<-data.frame(file_name, f_sites)
all_sites<-read.csv("surveyTimeAndDate_FSamend.csv")
sites_f<-merge(f_df, all_sites, by.x = "f_sites", by.y = "SiteCode")
sites_f$start_datetime<-as.POSIXct(paste(sites_f$Start_Date, sites_f$Start_Time, sep = " "), format = "%d-%m-%y %H:%M:%OS")
sites_f$end_datetime<-as.POSIXct(paste(sites_f$End_Date, sites_f$End_Time, sep = " "), format = "%d-%m-%y %H:%M:%OS")
sr<-0.524 #one record every 0.524 seconds
site_datetime<-function(site_name){
df<-sites_f[sites_f$f_sites == site_name,]
acoustic<-read.csv(paste(getwd(), "/Data/Raw_data/", df$file_name, sep=""))
time_seq<-seq(
from=as.POSIXct(df$start_datetime[1], format = "%Y-%m-%d %H:%M:%OS"),
to=as.POSIXct(df$end_datetime[1], format = "%Y-%m-%d %H:%M:%OS"),
by= sr
#length.out = (7*24*60*60)/sr
)
t.lub <- ymd_hms(time_seq)
h.lub <- minute(t.lub)
down_time<-which(h.lub == 29 |h.lub == 59 ) #removing times that are in the 29th or 59th minute
time_seq_sub<-time_seq[-down_time]
time_seq_sub<-time_seq_sub[1:nrow(acoustic)]
df_sub <- acoustic
#df_sub<-acoustic[1:length(time_seq_sub),]
df_sub$datetime<-time_seq_sub
df_sub$site<-site_name
df_sub<-df_sub[complete.cases(df_sub),]
write.csv(df_sub, paste(getwd(), "/Data/Processed_data_c/",site_name, "_acoustic_datetime_check.csv", sep=""), row.names = FALSE)
return(df_sub)
}
sapply(sites_f$f_sites, site_datetime)
files<-list.files(here::here("/Data/Processed_data_c/"))
suntimes<-read.csv(here::here("Data/suntimes_summary_allsites.csv"), stringsAsFactors = FALSE)
sr_min <- as.POSIXct(strftime(suntimes$min_sunrise, format="%H:%M:%S"), format="%H:%M:%S", tz = "GMT")
sr_max <- as.POSIXct(strftime(suntimes$max_sunrise, format="%H:%M:%S"), format="%H:%M:%S", tz = "GMT")
ss_min<- as.POSIXct(strftime(suntimes$min_sunset, format="%H:%M:%S"), format="%H:%M:%S", tz = "GMT")
ss_max<-as.POSIXct(strftime(suntimes$max_sunset, format="%H:%M:%S"), format="%H:%M:%S", tz = "GMT")
for (i in 1:length(sr_min)){
suntimes$mean_sunrise[i]<-mean(c(sr_min[i], sr_max[i]))
suntimes$mean_sunset[i]<-mean(c(ss_min[i], ss_max[i]))
suntimes$mean_sunrise<-as_datetime(suntimes$mean_sunrise)
suntimes$mean_sunset<-as_datetime(suntimes$mean_sunset)
}
suntimes$mean_sunrise <- strftime(suntimes$mean_sunrise, format="%H:%M:%OS")
suntimes$mean_sunrise <- as.POSIXct(suntimes$mean_sunrise, format="%H:%M:%OS")
suntimes$mean_sunset <- strftime(suntimes$mean_sunset, format="%H:%M:%OS")
suntimes$mean_sunset <- as.POSIXct(suntimes$mean_sunset, format="%H:%M:%OS")
suntimes$sr_mins_after_midnight<-as.numeric(difftime(suntimes$mean_sunrise, as.POSIXct('00:00:00', format = '%H:%M:%S'), units = 'min'))
suntimes$sr_mins_after_midnight<-suntimes$sr_mins_after_midnight
suntimes$ss_mins_after_midnight<-as.numeric(difftime(suntimes$mean_sunset, as.POSIXct('00:00:00', format = '%H:%M:%S'), units = 'min'))
suntimes$ss_mins_after_midnight<-suntimes$ss_mins_after_midnight
feature_extract<-function(file){
all_data<-read.csv(paste(here::here("/Data/Processed_data_c/", file), sep=""))
all_data$time_30min<-lubridate::round_date(as.POSIXct(all_data$datetime, format = "%Y-%m-%d %H:%M:%S"), "30 minutes")
all_data_avg <- all_data %>%
group_by(site, time_30min)%>%
summarize(anthrop_30 = mean(anthrop), biotic_30 = mean(biotic))
all_data_avg$time <- strftime(all_data_avg$time_30min, format="%H:%M:%OS")
all_data_avg$time <- as.POSIXct(all_data_avg$time, format="%H:%M:%OS")
all_data_avg$day<-format(as.Date(all_data_avg$time_30min ,format="%Y-%m-%d"), "%d")
all_data_avg$minutes_after_midnight<-as.numeric(difftime(all_data_avg$time, as.POSIXct('00:00:00', format = '%H:%M:%S'), units = 'min'))
all_data_avg<-all_data_avg[complete.cases(all_data_avg),]
data_summary<-all_data_avg %>%
group_by(site,day) %>%
mutate(which_diff_anthro = which.max(diff(anthrop_30)), steepest_anth = max(diff(anthrop_30)), which_diff_bio = which.max(diff(biotic_30)), steepest_bio = max(diff(biotic_30)),steepest_time_anth = time_30min[unique(which_diff_anthro)],steepest_time_bio = time_30min[unique(which_diff_bio)], auc_anthro = AUC(minutes_after_midnight, anthrop_30, method = "spline"),auc_bio = AUC(minutes_after_midnight, biotic_30, method = "spline"), n_peaks_anth = ifelse( is.null(nrow(findpeaks(anthrop_30, threshold = 0.5))), NA, nrow(findpeaks(anthrop_30, threshold = 0.5))),n_peaks_bio = ifelse( is.null(nrow(findpeaks(biotic_30, threshold = 0.5))), NA, nrow(findpeaks(biotic_30, threshold = 0.5))), auc_an_min_bio = auc_anthro - auc_bio, peak_anth = max(anthrop_30), peak_bio = max(biotic_30), peak_anth_time = time_30min[which(anthrop_30 == peak_anth)],peak_bio_time = time_30min[which(biotic_30 == peak_bio)] ) %>%
select(site, day, steepest_anth, steepest_bio,steepest_time_anth,steepest_time_bio ,auc_anthro, auc_bio, n_peaks_anth, n_peaks_bio, auc_an_min_bio, peak_anth, peak_bio, peak_anth_time, peak_bio_time)%>%
distinct()
print(file)
return(data_summary)
}
fe_out<-lapply(files, feature_extract)
df<-bind_rows(fe_out, .id = "column_label")
write.csv(df, "example_feature_extraction.csv", row.names = FALSE)
Need to make slope an absolute value - currently only pilling out steepest positive slopes i.e. slopes of rapidly increasing volume
library(forcats)
dawn_feature_extract<-function(file){
all_data<-read.csv(paste(here::here("/Data/Processed_data_c/", file), sep=""))
suntimes_sub<-suntimes[suntimes$site_id %in% all_data$site,]
all_data$time_30min<-lubridate::round_date(as.POSIXct(all_data$datetime, format = "%Y-%m-%d %H:%M:%S"), "30 minutes")
all_data_avg <- all_data %>%
group_by(site, time_30min)%>%
summarize(anthrop_30 = mean(anthrop), biotic_30 = mean(biotic))
all_data_avg$time <- strftime(all_data_avg$time_30min, format="%H:%M:%OS")
all_data_avg$time <- as.POSIXct(all_data_avg$time, format="%H:%M:%OS")
all_data_avg$day<-format(as.Date(all_data_avg$time_30min ,format="%Y-%m-%d"), "%d")
all_data_avg$minutes_after_midnight<-as.numeric(difftime(all_data_avg$time, as.POSIXct('00:00:00', format = '%H:%M:%S'), units = 'min'))
all_data_avg<-all_data_avg[complete.cases(all_data_avg),]
all_data_avg$site<-fct_explicit_na(all_data_avg$site, na_level = "(Missing)")
all_data_avg<-all_data_avg[complete.cases(all_data_avg),]
sr_data<-all_data_avg[all_data_avg$minutes_after_midnight >= (suntimes_sub$sr_mins_after_midnight - 60) & all_data_avg$minutes_after_midnight <= (suntimes_sub$sr_mins_after_midnight + 60),]
dawn_summary<-sr_data %>%
group_by(site,day) %>%
mutate(steepest_dawn_anth = max(diff(anthrop_30)), steepest_dawn_bio = max(diff(biotic_30)), auc_dawn_anthro = AUC(minutes_after_midnight, anthrop_30, method = "spline"),auc_dawn_bio = AUC(minutes_after_midnight, biotic_30, method = "spline"), dawn_auc_an_min_bio = auc_dawn_anthro - auc_dawn_bio, peak_dawn_anth = max(anthrop_30), peak_dawn_bio = max(biotic_30) ) %>%
select(site, day, steepest_dawn_anth, steepest_dawn_bio, auc_dawn_anthro, auc_dawn_bio, dawn_auc_an_min_bio, peak_dawn_anth, peak_dawn_bio)%>%
distinct()
print(file)
return(dawn_summary)
}
dawn_out<-lapply(files, dawn_feature_extract)
dawn_df<-bind_rows(dawn_out, .id = "column_label")
write.csv(df, "dawn_example_feature_extraction.csv", row.names = FALSE)
dusk_feature_extract<-function(file){
all_data<-read.csv(paste(here::here("/Data/Processed_data_c/", file), sep=""))
suntimes_sub<-suntimes[suntimes$site_id %in% all_data$site,]
all_data$time_30min<-lubridate::round_date(as.POSIXct(all_data$datetime, format = "%Y-%m-%d %H:%M:%S"), "30 minutes")
all_data_avg <- all_data %>%
group_by(site, time_30min)%>%
summarize(anthrop_30 = mean(anthrop), biotic_30 = mean(biotic))
all_data_avg$time <- strftime(all_data_avg$time_30min, format="%H:%M:%OS")
all_data_avg$time <- as.POSIXct(all_data_avg$time, format="%H:%M:%OS")
all_data_avg$day<-format(as.Date(all_data_avg$time_30min ,format="%Y-%m-%d"), "%d")
all_data_avg$minutes_after_midnight<-as.numeric(difftime(all_data_avg$time, as.POSIXct('00:00:00', format = '%H:%M:%S'), units = 'min'))
all_data_avg<-all_data_avg[complete.cases(all_data_avg),]
ss_data<-all_data_avg[all_data_avg$minutes_after_midnight >= (suntimes_sub$ss_mins_after_midnight - 60) & all_data_avg$minutes_after_midnight <= (suntimes_sub$ss_mins_after_midnight + 60),]
dusk_summary<-ss_data %>%
group_by(site,day) %>%
mutate(steepest_dusk_anth = max(diff(anthrop_30)), steepest_dusk_bio = max(diff(biotic_30)), auc_dusk_anthro = AUC(minutes_after_midnight, anthrop_30, method = "spline"),auc_dusk_bio = AUC(minutes_after_midnight, biotic_30, method = "spline"), dusk_auc_an_min_bio = auc_dusk_anthro - auc_dusk_bio, peak_dusk_anth = max(anthrop_30), peak_dusk_bio = max(biotic_30) ) %>%
select(site, day, steepest_dusk_anth, steepest_dusk_bio, auc_dusk_anthro, auc_dusk_bio, dusk_auc_an_min_bio, peak_dusk_anth, peak_dusk_bio)%>%
distinct()
print(file)
return(dusk_summary)
}
dusk_out<-lapply(files, dusk_feature_extract)
dusk_df<-bind_rows(dusk_out, .id = "column_label")
write.csv(df, "dusk_example_feature_extraction.csv", row.names = FALSE)
df_dawn<-merge(df, dawn_df, by=c("site", "day"))
df_all<-merge(df_dawn, dusk_df, by=c("site", "day"))
df_all<-df_all[, -grep("column_label", colnames(df_all))]
df_all$n_peaks_anth[is.na(df_all$n_peaks_anth)]<-0
df_all$n_peaks_bio[is.na(df_all$n_peaks_bio)]<-0
write.csv(df_all, "all_days_feature_extractions.csv", row.names = FALSE)
df_all<-read.csv("all_days_feature_extractions.csv")
test_pca<-prcomp(df_all[,c(3,4,7:13, 16:22,25:29)], center = TRUE,scale. = TRUE)
summary(test_pca)
## Importance of components:
## PC1 PC2 PC3 PC4 PC5 PC6
## Standard deviation 3.2471 1.8969 1.29023 1.00421 0.8956 0.83684
## Proportion of Variance 0.5021 0.1714 0.07927 0.04802 0.0382 0.03335
## Cumulative Proportion 0.5021 0.6734 0.75270 0.80072 0.8389 0.87227
## PC7 PC8 PC9 PC10 PC11 PC12
## Standard deviation 0.75958 0.70633 0.62936 0.57820 0.42593 0.41324
## Proportion of Variance 0.02747 0.02376 0.01886 0.01592 0.00864 0.00813
## Cumulative Proportion 0.89974 0.92350 0.94236 0.95828 0.96692 0.97505
## PC13 PC14 PC15 PC16 PC17 PC18
## Standard deviation 0.35278 0.32523 0.29800 0.28700 0.25279 0.24216
## Proportion of Variance 0.00593 0.00504 0.00423 0.00392 0.00304 0.00279
## Cumulative Proportion 0.98098 0.98601 0.99024 0.99416 0.99721 1.00000
## PC19 PC20 PC21
## Standard deviation 4.053e-10 1.336e-10 1.201e-10
## Proportion of Variance 0.000e+00 0.000e+00 0.000e+00
## Cumulative Proportion 1.000e+00 1.000e+00 1.000e+00
library(devtools)
## Warning: package 'devtools' was built under R version 3.5.2
devtools::install_github("vqv/ggbiplot")
## Skipping install of 'ggbiplot' from a github remote, the SHA1 (7325e880) has not changed since last install.
## Use `force = TRUE` to force installation
library(ggbiplot)
## Loading required package: plyr
## -------------------------------------------------------------------------
## You have loaded plyr after dplyr - this is likely to cause problems.
## If you need functions from both plyr and dplyr, please load plyr first, then dplyr:
## library(plyr); library(dplyr)
## -------------------------------------------------------------------------
##
## Attaching package: 'plyr'
## The following object is masked from 'package:lubridate':
##
## here
## The following object is masked from 'package:ggplot2':
##
## empty
## The following objects are masked from 'package:dplyr':
##
## arrange, count, desc, failwith, id, mutate, rename, summarise,
## summarize
## The following object is masked from 'package:here':
##
## here
## Loading required package: scales
## Loading required package: grid
ggbiplot(test_pca)

library(factoextra)
## Warning: package 'factoextra' was built under R version 3.5.3
## Welcome! Related Books: `Practical Guide To Cluster Analysis in R` at https://goo.gl/13EFCZ
fviz_pca_var(test_pca,
col.var = "contrib", # Color by contributions to the PC
gradient.cols = c("#00AFBB", "#E7B800", "#FC4E07"),
repel = TRUE # Avoid text overlapping
)

fviz_pca_biplot(test_pca, repel = TRUE,
col.var = "#2E9FDF", # Variables color
col.ind = "#696969" # Individuals color
)
