This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.
When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:
The first step is adding all data sets, from weather, volume, sample collection, and volume corrected data from soil flux pro. All raw datasets have been added and then first stages of analysis can begin.
library(readxl)
# Imputing Data From Soil Flux Pro, adding all variables of interest.
# In soil flux pro editing volume to correspond to edits made on system such as
# water traps, ammonia traps, changes in soil collars etc
# Imputing Data from base weather station as well as local weather stations.
# Raw Base Weather Station Data.
Raw_Base_Weather_Station <- read.table("C:/Users/Daniel/Documents/Projects/Church_Town/Base Weather Station/Base Weather Station Churchtown.20240220.txt", header = TRUE, sep = "\t")
#2d and 3d volume more documentation on calculation methods available elsewhere
Volume_and_Surface_Area <- read_xlsx("C:/Users/Daniel/Documents/Projects/Church_Town/Manure Pile/Flux_Data_Quality/Flux_Data_Review/Volume_SurfaceArea.xlsx")
#all bulk density measurements
Bulk_Density <- read_xlsx("C:/Users/Daniel/Documents/Projects/Church_Town/Manure Pile/Flux_Data_Quality/Flux_Data_Review/Average_BD.xlsx")
#averaged bulk density, from the two measurements per pile per collection period
Averaged_Bulk_Density <- read.csv("BD_Averages.csv")
# moisture percentage - based of samples collected with bulk density.
Moisture_Data <- read.csv("C:/Users/Daniel/Documents/Projects/Church_Town/Manure Pile/Flux_Data_Quality/Flux_Data_Review/Moisture_Data_Averages.csv")
# Soil Flux Pro Headers
Full_Data_Set_Headers <- read.csv("C:/Users/Daniel/Documents/Projects/Church_Town/Manure Pile/Full_Data_Set_Creation/Excel_3_12_2024_Volume_Corrected_All_Manure_Pile_Data.csv")
Full_Data_Set_Headers <- Full_Data_Set_Headers[c(1:2),]
#imported VOLUME EDITED soil flux data from experimental period
Full_Data_Set <- read.csv("C:/Users/Daniel/Documents/Projects/Church_Town/Manure Pile/Full_Data_Set_Creation/3_15_2024_Full_Data_Set_No_Duplicates.csv",skip = 1)
# Removing units to make analysis easier
Full_Data_Set <- Full_Data_Set[-(1),]
# All Raw Data Added
# Converting DOY to numeric
Full_Data_Set$DOY.initial_value <- as.double(Full_Data_Set$DOY.initial_value)
Full_Data_Set$Original_DOY_Value <- Full_Data_Set$DOY.initial_value
# Setting Port as Numeric to isolate control and experimental piles
Full_Data_Set$PORT <- as.numeric(Full_Data_Set$PORT)
Combining the data Adding bulk density to each measurement for flux per unit mass throughout the experiment.
Now that bulk density has been added to the data set we can pair the information with moisture percentage to get dry mass per volume of the pile.
Code to add volume 2d (surface area) and 3d volume in M^2, this will help us take area flux values and apply them to the whole windrow.
# adding 2d and 3d volume to volume corrected data set.
# adding volume measurements to Good_measurements.
#filtering for Control and Exp piles
Volume_and_Surface_Area_Control <- Volume_and_Surface_Area[Volume_and_Surface_Area$Pile == "C",]
Volume_and_Surface_Area_Experimental <- Volume_and_Surface_Area[Volume_and_Surface_Area$Pile == "E", ]
# function to find Doy
find_closest_DOY <- function(doy_initial, volume_surface_area_df) {
if (!is.numeric(doy_initial) || length(doy_initial) != 1) {
stop("doy_initial must be a single numeric value")
}
if (!is.data.frame(volume_surface_area_df) || nrow(volume_surface_area_df) == 0) {
return(data.frame()) # Return an empty data frame
}
# Calculate the absolute difference between the DOY values
abs_diff <- abs(volume_surface_area_df$DOY - doy_initial)
# Find the index of the row with the minimum difference
closest_index <- which.min(abs_diff)
# Return the closest row, ensuring it remains a data frame
return(volume_surface_area_df[closest_index, , drop = FALSE])
}
combined_data <- data.frame()
for (i in 1:nrow(Full_Data_Set)) {
row <- Full_Data_Set[i, ]
# Select the correct subset of Volume_SurfaceArea based on PORT value
if (row$PORT %in% c(1, 2, 3)) {
volume_surface_area_subset <- Volume_and_Surface_Area_Control
} else if (row$PORT %in% c(4, 5, 6)) {
volume_surface_area_subset <- Volume_and_Surface_Area_Experimental
} else {
next
}
# Find the closest DOY row(s) from the selected subset
closest_rows <- find_closest_DOY(row$DOY.initial_value, volume_surface_area_subset)
# Initialize closest_row as an empty data frame with the same columns as volume_surface_area_subset
closest_row <- data.frame(matrix(ncol = ncol(volume_surface_area_subset), nrow = 1))
colnames(closest_row) <- colnames(volume_surface_area_subset)
# Apply the logic to determine the closest_row
if (nrow(closest_rows) > 0) {
if (nrow(closest_rows) > 1) {
decimal_part <- row$DOY.initial_value %% 1
if (decimal_part < 0.375) {
closest_row <- closest_rows[closest_rows$Turning == "B", , drop = FALSE]
} else {
closest_row <- closest_rows[closest_rows$Turning != "B", , drop = FALSE]
}
# If both or neither are "B", choose the first one
if (nrow(closest_row) != 1) {
closest_row <- closest_rows[1, , drop = FALSE]
}
} else {
closest_row <- closest_rows
}
} else {
# Fill in NA values for the missing data
closest_row[,] <- NA
}
# Combine the row with the closest_row
combined_row <- cbind(row, closest_row)
combined_data <- rbind(combined_data, combined_row)
}
# all data points have been combined (Besides Weather, saving files)
write.csv(combined_data,"Full_Data_Set_VC_BD_SA.csv")
Now that the volume data has been added lets add the last piece of information, weather from our weather station.
The above chunk took the data from the weather station and parsed it to make it easier to add to the full weather data.
Here -9999 has been replaced with NA as I do not want a numeric value to interfere with analysis. The method for removing outliers was outlined above. The question is should this be done individually for each variable ?
lets start cleaning the N2O.
# Load necessary libraries
library(xts)
library(zoo)
library(dplyr)
# Load the dataset
Cleaned_N2O <- read_csv("Uncleaned_All_Vars.csv", col_types = cols(...1 = col_skip()))
# Remove duplicates based on the "DOY" column, keeping the first occurrence
Cleaned_N2O <- distinct(Cleaned_N2O, DOY, .keep_all = TRUE)
# Convert 'DATE_TIME.initial_value' to POSIXct format for proper time series handling
Cleaned_N2O$DATE_TIME.initial_value <- as.POSIXct(Cleaned_N2O$DATE_TIME.initial_value, format = "%Y/%m/%d %H:%M:%S")
# Create an xts object with 'FN2O_DRY' as the data and 'DATE_TIME.initial_value' as the order index
N2O_XTS <- xts(Cleaned_N2O$FN2O_DRY, order.by = Cleaned_N2O$DATE_TIME.initial_value)
calculate_rolling_stats <- function(ts, timestamp, hours = 12) {
# Define the time window: 12 hours before and after the timestamp
start_time <- timestamp - hours*3600 # Convert hours to seconds
end_time <- timestamp + hours*3600 # Convert hours to seconds
# Ensure start_time and end_time are within the bounds of the ts index
start_time <- max(start_time, start(ts))
end_time <- min(end_time, end(ts))
# Subset the time series to the window
window_ts <- ts[paste(start_time, end_time, sep = "/")]
# Calculate mean and standard deviation, handling the case where window_ts might be empty or NA
window_mean <- ifelse(length(window_ts) > 0, mean(window_ts, na.rm = TRUE), NA)
window_sd <- ifelse(length(window_ts) > 0, sd(window_ts, na.rm = TRUE), NA)
# Return the results as a list
return(list(mean = window_mean, sd = window_sd))
}
# Apply the custom rolling stats function to each timestamp in the N2O_XTS object
timestamps <- index(N2O_XTS)
results <- lapply(timestamps, function(t) calculate_rolling_stats(N2O_XTS, t))
# Optional: To convert results back into a format that can be merged with your original data,
results_df <- do.call(rbind, lapply(1:length(results), function(i) {
data.frame(timestamp = timestamps[i], mean = results[[i]]$mean, sd = results[[i]]$sd)
}))
rownames(results_df) <- NULL
# Ensure both data frames use the same time zone, for example, UTC
Cleaned_N2O$DATE_TIME.initial_value <- as.POSIXct(Cleaned_N2O$DATE_TIME.initial_value, tz = "UTC")
# When you create or convert timestamps in results_df, ensure the time zone is also set to UTC
results_df$timestamp <- as.POSIXct(results_df$timestamp, origin = "1970-01-01", tz = "UTC")
# Assuming DATE_TIME.initial_value is already converted and timezone set
Cleaned_N2O$timestamp <- Cleaned_N2O$DATE_TIME.initial_value
# Ensure the timestamp format and time zone in results_df match
results_df$timestamp <- as.POSIXct(results_df$timestamp, tz = "UTC")
Cleaned_N2O <- merge(Cleaned_N2O, results_df, by = "timestamp", all.x = TRUE)
# Now, Cleaned_N2O contains your original data along with the rolling mean and sd for each measurement
# Step 1: Determine which measurements are within three SD of the rolling mean
is_within_three_sd <- with(Cleaned_N2O, abs(FN2O_DRY - mean) <= (3 * sd))
# Step 2: Create a new column that includes measurements within three SD of the rolling mean, NA otherwise
Cleaned_N2O$Flux_N2O_Cleaned <- ifelse(is_within_three_sd, Cleaned_N2O$FN2O_DRY, NA)
write.csv(Cleaned_N2O,"Cleaned_N2O.csv")
We have cleaned data using the rolling sd from the rolling mean and 3 sd. Lets add some data such as pile area and mass to this data to get flux of the whole piles and flux per kg.
Cleaned_N2O_Mass_Corrected <- read_csv("Cleaned_N2O.csv", col_types = cols(...1 = col_skip()))
## New names:
## • `` -> `...1`
## • `Pile...54` -> `Pile...55`
## • `Pile...63` -> `Pile...64`
# time to figure out what reasonable values look like.
# units of n2o flux [umol+1m-2s-1]
# surface area is not a good way to get an idea of value lets do it by dry mass. Issues in calculations of dry mass as we did not sample bulk density and moisture that often but this is the best we have.
# Pile Dry Mass <- bulk density in lbs per 5 gallon bucket * pile volume in meters
# 0.453592 lb > kg conversion factor as BD
# 5 gallon bucket 0.0189271 in cubic meters
Cleaned_N2O_Mass_Corrected$Pile_Dry_Mass <- (((Cleaned_N2O_Mass_Corrected$Nearest_BD * (0.453592))/0.0189271)* Cleaned_N2O_Mass_Corrected$Area_3D * (Cleaned_N2O_Mass_Corrected$Nearest_Moisture/100))
n2O_umoltomg_Conversion <- 44.01/1000
Cleaned_N2O_Mass_Corrected$Flux_N2O_Cleaned_PerKg <- (Cleaned_N2O_Mass_Corrected$Flux_N2O_Cleaned*(n2O_umoltomg_Conversion) *(Cleaned_N2O_Mass_Corrected$Area_2D))/Cleaned_N2O_Mass_Corrected$Pile_Dry_Mass
Cleaned_N2O_Mass_Corrected$Flux_N2O_Cleaned_PerKg_HOUR <- Cleaned_N2O_Mass_Corrected$Flux_N2O_Cleaned_PerKg*3600
# units of cleaned flux per mass are in mg per kg per second. DOUBLE CHECK.
# Bulk Density in lbs * Kg conversion / bucket cubic meters. * 3d cubic meters * moisture percentage = pile dry mass.
# flux per m^2 * conversion from umol to ug is ug per m^2. * total m^2 / total kg of pile = flux in ug per dry kg of pile mass.
# lets figure out what values make sense and keep those. Percentage of values outside -0.1 - 0.2 which seem to be reasonable values.
write.csv(Cleaned_N2O_Mass_Corrected,"Cleaned_N2O_Mass_Corrected_N2O_Cleaned_Per_kg.csv")
non_na_values <- !is.na(Cleaned_N2O_Mass_Corrected$Flux_N2O_Cleaned_PerKg)
outside_range <- Cleaned_N2O_Mass_Corrected$Flux_N2O_Cleaned_PerKg < 0.0 | Cleaned_N2O_Mass_Corrected$Flux_N2O_Cleaned_PerKg > 0.2
outside_range_non_na <- outside_range & non_na_values
percentage_outside_non_na <- sum(outside_range_non_na) / sum(non_na_values) * 100
print("The percent of cleaned values outside of expected range ")
## [1] "The percent of cleaned values outside of expected range "
print(percentage_outside_non_na)
## [1] 8.661934
Here we start to look at relationships between the variables.
library(ggplot2)
# Creating the histogram
ggplot(Cleaned_N2O_Mass_Corrected, aes(x = Flux_N2O_Cleaned_PerKg )) +
geom_histogram(binwidth = 0.01, fill = "blue", color = "black") +
theme_minimal() +
labs(title = "Histogram of Values", x = "Value", y = "Frequency")
## Warning: Removed 4272 rows containing non-finite values (`stat_bin()`).
N2O_lm_moisture <- lm(Flux_N2O_Cleaned_PerKg_HOUR ~ SWC_1.initial_value,
data = Cleaned_N2O_Mass_Corrected,
subset = (Flux_N2O_Cleaned_PerKg_HOUR > 0) & (SWC_1.initial_value > 0.5))
N2O_lm_Temp_Pile <- lm(Flux_N2O_Cleaned_PerKg_HOUR ~ TS_1.initial_value,
data = Cleaned_N2O_Mass_Corrected,
subset = (Flux_N2O_Cleaned_PerKg_HOUR > 0) & (TS_1.initial_value > 40))
N2O_lm_TEMP_BS <- lm(Flux_N2O_Cleaned_PerKg_HOUR ~ Temperature...F..1,
data = Cleaned_N2O_Mass_Corrected,
subset = (Flux_N2O_Cleaned_PerKg_HOUR > 0) )
N2O_lm_Temp_Pile <- lm(Flux_N2O_Cleaned_PerKg_HOUR ~ TS_1.initial_value,
data = Cleaned_N2O_Mass_Corrected,
subset = (Flux_N2O_Cleaned_PerKg_HOUR > 0) & (TS_1.initial_value > 40))
N2O_lm_rain <- lm(Flux_N2O_Cleaned_PerKg_HOUR ~ Rainfall..In.,
data = Cleaned_N2O_Mass_Corrected,
subset = (Flux_N2O_Cleaned_PerKg_HOUR > 0) & (Rainfall..In. > 0.01))
ggplot(Cleaned_N2O_Mass_Corrected, aes(x = SWC_1.initial_value, y = Flux_N2O_Cleaned_PerKg_HOUR)) +
geom_point() + # Add the data points
geom_smooth(method = "lm", se = FALSE) + # Add the linear regression line without confidence interval
theme_minimal() + # Optional: nicer theme
labs(title = "Relationship Between Moisture and N2O Flux ",
x = "Moisture Percent",
y = "N2O Flux ug per KG Per Hour")
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 6264 rows containing non-finite values (`stat_smooth()`).
## Warning: Removed 6264 rows containing missing values (`geom_point()`).
subset_df <- Cleaned_N2O_Mass_Corrected %>%
filter(Pile...55 == "C")
ggplot(subset_df, aes(x = DOY, y = TS_1.initial_value)) +
geom_line(alpha = 0.3) + # Original data in the background, more transparent
geom_point() +
theme_minimal() +
labs(title = "Temperature Over Time (Smoothed)",
x = "Time",
y = "Temperature")+
geom_vline(xintercept = c( 194, 222, 249, 276), linetype = "dashed", color = "black") +
geom_vline(xintercept = c(186, 208, 234, 262, 290), linetype = "dashed", color = "orange")
## Warning: Removed 782 rows containing missing values (`geom_point()`).
Now we have cleaned N2O lets do the same to Co2 and CH4 using the same technique. While these values are cleaned there still is more work to do as some of the values do not make sense. We should also have an expected range and only keep values within that expected range. There should not be negative values for n2O given the environment.
# Load necessary libraries
library(xts)
library(zoo)
library(dplyr)
Cleaned_N2O_Cleaned_CH4 <- read_csv("Cleaned_N2O_Mass_Corrected_N2O_Cleaned_Per_kg.csv", col_types = cols(...1 = col_skip()))
## New names:
## • `` -> `...1`
# Convert 'DATE_TIME.initial_value' to POSIXct format for proper time series handling
Cleaned_N2O_Cleaned_CH4$DATE_TIME.initial_value <- as.POSIXct(Cleaned_N2O_Cleaned_CH4$DATE_TIME.initial_value, format = "%Y/%m/%d %H:%M:%S")
# Create an xts object with 'FN2O_DRY' as the data and 'DATE_TIME.initial_value' as the order index
CH4_XTS <- xts(Cleaned_N2O_Cleaned_CH4$FCH4_DRY, order.by = Cleaned_N2O_Cleaned_CH4$DATE_TIME.initial_value)
calculate_rolling_stats <- function(ts, timestamp, hours = 12) {
# Define the time window: 12 hours before and after the timestamp
start_time <- timestamp - hours*3600 # Convert hours to seconds
end_time <- timestamp + hours*3600 # Convert hours to seconds
# Ensure start_time and end_time are within the bounds of the ts index
start_time <- max(start_time, start(ts))
end_time <- min(end_time, end(ts))
# Subset the time series to the window
window_ts <- ts[paste(start_time, end_time, sep = "/")]
# Calculate mean and standard deviation, handling the case where window_ts might be empty or NA
window_mean <- ifelse(length(window_ts) > 0, mean(window_ts, na.rm = TRUE), NA)
window_sd <- ifelse(length(window_ts) > 0, sd(window_ts, na.rm = TRUE), NA)
# Return the results as a list
return(list(mean = window_mean, sd = window_sd))
}
# Apply the custom rolling stats function to each timestamp in the N2O_XTS object
timestamps <- index(CH4_XTS)
results <- lapply(timestamps, function(t) calculate_rolling_stats(CH4_XTS, t))
# Optional: To convert results back into a format that can be merged with your original data,
results_df_ch4 <- do.call(rbind, lapply(1:length(results), function(i) {
data.frame(timestamp = timestamps[i], mean = results[[i]]$mean, sd = results[[i]]$sd)
}))
rownames(results_df_ch4) <- NULL
# Ensure both data frames use the same time zone, for example, UTC
Cleaned_N2O_Cleaned_CH4$DATE_TIME.initial_value <- as.POSIXct(Cleaned_N2O_Cleaned_CH4$DATE_TIME.initial_value, tz = "UTC")
# When you create or convert timestamps in results_df, ensure the time zone is also set to UTC
results_df_ch4$timestamp <- as.POSIXct(results_df_ch4$timestamp, origin = "1970-01-01", tz = "UTC")
# Assuming DATE_TIME.initial_value is already converted and timezone set
Cleaned_N2O_Cleaned_CH4$timestamp <- Cleaned_N2O_Cleaned_CH4$DATE_TIME.initial_value
# Ensure the timestamp format and time zone in results_df match
results_df_ch4$timestamp <- as.POSIXct(results_df_ch4$timestamp, tz = "UTC")
Cleaned_N2O_Cleaned_CH4 <- merge(Cleaned_N2O_Cleaned_CH4, results_df_ch4, by = "timestamp", all.x = TRUE)
# Now, Cleaned_N2O contains your original data along with the rolling mean and sd for each measurement
# Step 1: Determine which measurements are within three SD of the rolling mean
is_within_three_sd <- with(Cleaned_N2O_Cleaned_CH4, abs(FCH4_DRY - mean.y) <= (3 * sd.y))
# Step 2: Create a new column that includes measurements within three SD of the rolling mean, NA otherwise
Cleaned_N2O_Cleaned_CH4$Flux_CH4_Cleaned <- ifelse(is_within_three_sd, Cleaned_N2O_Cleaned_CH4$FCH4_DRY, NA)
write.csv(Cleaned_N2O_Cleaned_CH4,"Cleaned_N2O_CH4.csv")
further cleaning of ch4 - removing all before clock issue was resolved and removing those that have the same linear and exponential then adding per unit of mass as before
Cleaned_N2O_Cleaned_CH4$Flux_CH4_Cleaned <- ifelse(Cleaned_N2O_Cleaned_CH4$Flux_CH4_Cleaned == Cleaned_N2O_Cleaned_CH4$FCH4_DRY.LIN, NA, Cleaned_N2O_Cleaned_CH4$Flux_CH4_Cleaned)
Cleaned_N2O_Cleaned_CH4$DATE_TIME.initial_value <- as.Date(Cleaned_N2O_Cleaned_CH4$DATE_TIME.initial_value)
# Identify rows with dates before July 28 of the respective year
indices <- which(Cleaned_N2O_Cleaned_CH4$DATE_TIME.initial_value < as.Date(sprintf("%s-07-28", format(Cleaned_N2O_Cleaned_CH4$DATE_TIME.initial_value, "%Y"))))
# Set values to NA for these rows in specified columns
Cleaned_N2O_Cleaned_CH4$Flux_CH4_Cleaned[indices] <- NA
# Pile Dry Mass <- bulk density in lbs per 5 gallon bucket * pile volume in meters
# 0.453592 lb > kg conversion factor as BD
# 5 gallon bucket 0.0189271 in cubic meters
ch4_umoltomg_Conversion <- 16.042/1000
Cleaned_N2O_Cleaned_CH4$Flux_CH4_Cleaned_perkg <- (Cleaned_N2O_Cleaned_CH4$Flux_CH4_Cleaned*(ch4_umoltomg_Conversion) *(Cleaned_N2O_Cleaned_CH4$Area_2D))/Cleaned_N2O_Cleaned_CH4$Pile_Dry_Mass
# units of cleaned flux per mass are in mg per kg per second.
# Bulk Density in lbs * Kg conversion / bucket cubic meters. * 3d cubic meters * moisture percentage = pile dry mass.
# flux per m^2 * conversion from umol to ug is ug per m^2. * total m^2 / total kg of pile = flux in ug per dry kg of pile mass.
# lets figure out what values make sense and keep those. Percentage of values outside -0.1 - 0.2 which seem to be reasonable values.
write.csv(Cleaned_N2O_Cleaned_CH4,"Cleaned_N2O_Mass_Corrected_N2O_CH4_Cleaned_Per_kg.csv")
library(ggplot2)
greater_than_zero_flux_subset <- Cleaned_N2O_Cleaned_CH4 %>% filter(Flux_N2O_Cleaned > 0 & Flux_CH4_Cleaned > 0)
lm_model_cleaned_n2o_ch4 <- lm(Flux_CH4_Cleaned ~ Flux_N2O_Cleaned ,data = greater_than_zero_flux_subset)
summary(lm_model_cleaned_n2o_ch4)
##
## Call:
## lm(formula = Flux_CH4_Cleaned ~ Flux_N2O_Cleaned, data = greater_than_zero_flux_subset)
##
## Residuals:
## Min 1Q Median 3Q Max
## -21311.5 -1443.2 -1164.0 60.8 23232.3
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1219.2547 96.6732 12.61 <2e-16 ***
## Flux_N2O_Cleaned 7.3061 0.2691 27.15 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4230 on 2469 degrees of freedom
## Multiple R-squared: 0.2299, Adjusted R-squared: 0.2296
## F-statistic: 737 on 1 and 2469 DF, p-value: < 2.2e-16
pearson_cor <- cor(Cleaned_N2O_Cleaned_CH4$Flux_CH4_Cleaned, Cleaned_N2O_Cleaned_CH4$Flux_N2O_Cleaned, method = "pearson", use = "complete.obs")
pearson_cor_go <- cor(greater_than_zero_flux_subset$Flux_CH4_Cleaned, greater_than_zero_flux_subset$Flux_N2O_Cleaned, method = "pearson")
Cleaning CO2 Values, last ghg - not going to clean h2o as the utility of h2o is in corrections ? Maybe I should and I can consider this later.
# Load necessary library
library(xts)
library(zoo)
library(dplyr)
Cleaned_N2O_Cleaned_CH4_Cleaned_CO2 <- read_csv("Cleaned_N2O_Mass_Corrected_N2O_CH4_Cleaned_Per_kg.csv", col_types = cols(...1 = col_skip()))
## New names:
## • `` -> `...1`
# Convert 'DATE_TIME.initial_value' to POSIXct format for proper time series handling
Cleaned_N2O_Cleaned_CH4_Cleaned_CO2$DATE_TIME.initial_value <- as.POSIXct(Cleaned_N2O_Cleaned_CH4_Cleaned_CO2$timestamp, format = "%Y/%m/%d %H:%M:%S")
# Create an xts object with 'FN2O_DRY' as the data and 'DATE_TIME.initial_value' as the order index
CO2_XTS <- xts(Cleaned_N2O_Cleaned_CH4_Cleaned_CO2$FCO2_DRY, order.by = Cleaned_N2O_Cleaned_CH4_Cleaned_CO2$timestamp)
calculate_rolling_stats <- function(ts, timestamp, hours = 12) {
# Define the time window: 12 hours before and after the timestamp
start_time <- timestamp - hours*3600 # Convert hours to seconds
end_time <- timestamp + hours*3600 # Convert hours to seconds
# Ensure start_time and end_time are within the bounds of the ts index
start_time <- max(start_time, start(ts))
end_time <- min(end_time, end(ts))
# Subset the time series to the window
window_ts <- ts[paste(start_time, end_time, sep = "/")]
# Calculate mean and standard deviation, handling the case where window_ts might be empty or NA
window_mean <- ifelse(length(window_ts) > 0, mean(window_ts, na.rm = TRUE), NA)
window_sd <- ifelse(length(window_ts) > 0, sd(window_ts, na.rm = TRUE), NA)
# Return the results as a list
return(list(mean = window_mean, sd = window_sd))
}
# Apply the custom rolling stats function to each timestamp in the N2O_XTS object
timestamps <- index(CO2_XTS)
results <- lapply(timestamps, function(t) calculate_rolling_stats(CO2_XTS, t))
# Optional: To convert results back into a format that can be merged with your original data,
results_df_ch4_co2 <- do.call(rbind, lapply(1:length(results), function(i) {
data.frame(timestamp = timestamps[i], mean = results[[i]]$mean, sd = results[[i]]$sd)
}))
rownames(results_df_ch4_co2) <- NULL
# Ensure both data frames use the same time zone, for example, UTC
Cleaned_N2O_Cleaned_CH4_Cleaned_CO2$DATE_TIME.initial_value <- as.POSIXct(Cleaned_N2O_Cleaned_CH4_Cleaned_CO2$timestamp, tz = "UTC")
# When you create or convert timestamps in results_df, ensure the time zone is also set to UTC
results_df_ch4_co2$timestamp <- as.POSIXct(results_df_ch4_co2$timestamp, origin = "1970-01-01", tz = "UTC")
# Ensure the timestamp format and time zone in results_df match
results_df_ch4_co2$timestamp <- as.POSIXct(results_df_ch4_co2$timestamp, tz = "UTC")
Cleaned_N2O_Cleaned_CH4_Cleaned_CO2 <- merge(Cleaned_N2O_Cleaned_CH4_Cleaned_CO2, results_df_ch4_co2, by = "timestamp", all.x = TRUE)
# Now, Cleaned_N2O contains your original data along with the rolling mean and sd for each measurement
# Step 1: Determine which measurements are within three SD of the rolling mean
is_within_three_sd <- with(Cleaned_N2O_Cleaned_CH4_Cleaned_CO2, abs(FCO2_DRY - mean) <= (3 * sd))
# Step 2: Create a new column that includes measurements within three SD of the rolling mean, NA otherwise
Cleaned_N2O_Cleaned_CH4_Cleaned_CO2$Flux_CO2_Cleaned <- ifelse(is_within_three_sd, Cleaned_N2O_Cleaned_CH4_Cleaned_CO2$FCO2_DRY, NA)
write.csv(Cleaned_N2O_Cleaned_CH4_Cleaned_CO2,"Cleaned_N2O_CH4_CO2.csv")
further cleaning of ch4 - removing all before clock issue was resolved and removing those that have the same linear and exponential then adding per unit of mass as before
Cleaned_N2O_Cleaned_CH4_Cleaned_CO2$Flux_CO2_Cleaned <- ifelse(Cleaned_N2O_Cleaned_CH4_Cleaned_CO2$Flux_CO2_Cleaned == Cleaned_N2O_Cleaned_CH4_Cleaned_CO2$FCO2_DRY.LIN, NA, Cleaned_N2O_Cleaned_CH4_Cleaned_CO2$Flux_CO2_Cleaned)
Cleaned_N2O_Cleaned_CH4_Cleaned_CO2$DATE_TIME.initial_value <- as.Date(Cleaned_N2O_Cleaned_CH4_Cleaned_CO2$DATE_TIME.initial_value)
# Identify rows with dates before July 28 of the respective year
indices <- which(Cleaned_N2O_Cleaned_CH4_Cleaned_CO2$timestamp < as.Date(sprintf("%s-07-28", format(Cleaned_N2O_Cleaned_CH4_Cleaned_CO2$timestamp, "%Y"))))
# Set values to NA for these rows in specified columns
Cleaned_N2O_Cleaned_CH4_Cleaned_CO2$Flux_CO2_Cleaned[indices] <- NA
# Pile Dry Mass <- bulk density in lbs per 5 gallon bucket * pile volume in meters
# 0.453592 lb > kg conversion factor as BD
# 5 gallon bucket 0.0189271 in cubic meters
co2_umoltomg_Conversion <- 44.01/1000
Cleaned_N2O_Cleaned_CH4_Cleaned_CO2$Flux_CO2_Cleaned_perkg <- (Cleaned_N2O_Cleaned_CH4_Cleaned_CO2$Flux_CO2_Cleaned*(co2_umoltomg_Conversion) *(Cleaned_N2O_Cleaned_CH4_Cleaned_CO2$Area_2D))/Cleaned_N2O_Cleaned_CH4_Cleaned_CO2$Pile_Dry_Mass
# units of cleaned flux per mass are in mg per kg per second.
# Bulk Density in lbs * Kg conversion / bucket cubic meters. * 3d cubic meters * moisture percentage = pile dry mass.
# flux per m^2 * conversion from umol to ug is ug per m^2. * total m^2 / total kg of pile = flux in ug per dry kg of pile mass.
# lets figure out what values make sense and keep those. Percentage of values outside -0.1 - 0.2 which seem to be reasonable values.
write.csv(Cleaned_N2O_Cleaned_CH4_Cleaned_CO2,"Cleaned_N2O_Mass_Corrected_N2O_CH4_Co2_Cleaned_Per_kg.csv")
We have preliminary cleaning done, now lets clean based on expected values. Lets remove fluxes which despite meeting the cleaning above still do not make sense. All negative ch4 fluxes should be removed as the environment would not create a negative methane flux. All negative co2 flux as well. Not touching N2O as I understand its dynamics less. Maybe I should have eliminated the negative values first as they do effect the rolling mean and sd and such.
library(dplyr)
Cleaned_N2O_Cleaned_CH4_Cleaned_CO2_negative_eliminated <- Cleaned_N2O_Cleaned_CH4_Cleaned_CO2 %>%
mutate(
Flux_CO2_Cleaned = if_else(Flux_CO2_Cleaned < 0, NA_real_, Flux_CO2_Cleaned),
Flux_CO2_Cleaned_perkg = if_else(Flux_CO2_Cleaned_perkg < 0, NA_real_, Flux_CO2_Cleaned_perkg),
Flux_CH4_Cleaned = if_else(Flux_CH4_Cleaned < 0, NA_real_, Flux_CH4_Cleaned),
Flux_CH4_Cleaned_perkg = if_else(Flux_CH4_Cleaned_perkg < 0, NA_real_, Flux_CH4_Cleaned_perkg)
)