ec <- here("data/1_prepared/ec_with_na_hainich.rds") %>%
        read_rds()

lda_data <- ec %>% 
  select(time, daytime, month, wind_speed, wind_dir, w_unrot, H, LE, bowen_ratio, co2_flux, co2_mixing_ratio, air_temperature, co2_strg, u_star, TKE, L, z_d_L, es, e, RAD_IN, RAD_OUT, LW_IN, LW_OUT, NETRAD, u_var, v_var, w_var, qc_co2_flux, flag_timelag_sf_co2, flag_spikes_hf_co2, flag_absolute_limits_hf_co2) %>% 
  mutate(across(starts_with("flag"), as_factor), daytime = as.integer(daytime))

Utility functions

# Given a recipe it plots all the predictors distributions as histograms
plot_rec_var_dist <- function(rec){
  recipe_data <- rec %>% 
    prep() %>% 
    bake(all_predictors(), new_data=NULL)
  
  recipe_data %>% 
    mutate(across(starts_with("flag"), . %>% as.logical %>% as.double),
           across(starts_with("qc"), . %>% as.character %>% as.double)) %>% 
    pivot_longer(everything(),
                 names_to = "var", values_to = "val") %>%
    ggplot() +
    geom_histogram(aes(val), bins=20) +
    facet_wrap(~var, ncol=4, scales = "free") +
    theme(axis.text.y = element_blank(), axis.ticks.y = element_blank(), strip.text = element_text(margin = margin(0.1,0,0.1,0), face="bold", size=7))
}

plot_lda_scalings <- function(lda_wkflow) {
  lda_scaling <- lda_wkflow %>%
    extract_fit_engine() %>% 
    `$`(scaling) %>% 
    as_tibble(rownames = "var") 
  
  vectors_pos <- lda_scaling %>% 
      filter(LD1 > 0) %>% 
      add_origins()
    
    vectors_neg <- lda_scaling %>% 
      filter(LD1 < 0) %>% 
      add_origins()
    
  vectors_layer <- list(
    geom_line(aes(group=var), data = vectors_pos, arrow=arrow(length = unit(.1, "inches"))),
    geom_line(aes(group=var), data = vectors_neg, arrow=arrow(ends = "first", length = unit(.1, "inches"))),
    geom_label_repel(aes(label=var), data=lda_scaling))
  
  if(ncol(lda_scaling)==3){ # 1 col is the "var" and 2 cols LD1 and LD2
    # plot 2D vectors with arrows
    
    ggplot(lda_scaling, aes(LD1, LD2)) +
      vectors_layer
  }
  else{
    # plot in 1 dimension, the y axis is the variable
    ggplot(lda_scaling, aes(LD1, var)) +
      vectors_layer
  }
}

# add an origin to the datapoint for plotting
add_origins <- function(dat){
    tibble(
      var = dat$var,
      LD1 = 0,
      LD2 = 0
    ) %>% 
      bind_rows(dat)
}

Basic recipe

lda_rec <- recipe(lda_data) %>% 
  update_role(starts_with("flag"), starts_with("qc"), new_role = "flag") %>% # mark them as flag so is not incluced in computation
  update_role(!has_role("flag"), new_role = "predictor") %>% # set all the rest as a predictor
  update_role(time, new_role = "time") %>% 
  step_naomit(everything())

plot_rec_var_dist(lda_rec)

Removing extreme values from the dataset

Remove the extreme values in the dataset by removing everything below the 2.5% and above 97.5 % percentile

lda_rec_filt <- lda_rec %>% 
  step_filter(across(c(bowen_ratio, H, L, LE, co2_flux, co2_mixing_ratio, z_d_L, co2_strg), ~between(.x, quantile(.x, .03), quantile(.x, .97))), skip=F) #Need to have proper plotting

plot_rec_var_dist(lda_rec_filt)

Transform data YeoJohnson

lda_rec_yeo <- lda_rec_filt %>% 
  step_YeoJohnson(all_numeric_predictors(), -month, -daytime)

plot_rec_var_dist(lda_rec_yeo)

# Normalize

lda_rec_norm <- lda_rec_yeo %>% 
  step_normalize(all_numeric_predictors())

plot_rec_var_dist(lda_rec_norm)

lda_rec_final <- lda_rec_norm

QC

qc_rec <- lda_rec_final %>% 
  update_role(qc_co2_flux, new_role = "outcome")

qc_model <- workflow() %>% 
  add_recipe(qc_rec) %>% 
  add_model(discrim_linear()) %>% 
  fit(lda_data)
qc_model %>% 
  extract_fit_engine() %>% 
  print_all()

## Call:
## lda(..y ~ ., data = data)
## 
## Prior probabilities of groups:
##         0         1         2 
## 0.5045668 0.3077187 0.1877145 
## 
## Group means:
##       daytime      month  wind_speed    wind_dir    w_unrot           H
## 0  0.03739062  0.1755772  0.06072370 -0.06109830  0.1051101 -0.03008967
## 1 -0.03611798 -0.1480361 -0.07357175  0.03860533 -0.1003909  0.02625310
## 2 -0.04129615 -0.2292681 -0.04261665  0.10094368 -0.1179605  0.03784300
##           LE  bowen_ratio    co2_flux co2_mixing_ratio air_temperature
## 0  0.2399436 -0.023364161 -0.05894768     -0.026030038       0.1680656
## 1 -0.2125237  0.009680416  0.11788489      0.037622032      -0.1628486
## 2 -0.2965678  0.046932635 -0.03479936      0.008293928      -0.1891921
##      co2_strg      u_star          TKE             L         z_d_L         es
## 0 -0.04095184  0.05593280  0.025444054 -0.0004403831  4.521485e-05  0.1675974
## 1  0.03123853 -0.06171042 -0.038711382 -0.0019699663 -8.003213e-03 -0.1610845
## 2  0.05886738 -0.04918313 -0.004933071  0.0044130754  1.299806e-02 -0.1864288
##            e     RAD_IN    RAD_OUT       LW_IN       LW_OUT      NETRAD
## 0  0.1412339  0.1338496  0.1136949 -0.01839958  0.002393756  0.05539825
## 1 -0.1116514 -0.1255884 -0.1089492  0.01153735 -0.011025773 -0.05513253
## 2 -0.1966001 -0.1539048 -0.1270065  0.03054404  0.011640162 -0.05852936
##          u_var       v_var       w_var
## 0  0.017278793  0.03310379  0.03157139
## 1 -0.029608762 -0.04776914 -0.04328383
## 2  0.002092884 -0.01067374 -0.01390745
## 
## Coefficients of linear discriminants:
##                           LD1         LD2
## daytime           0.129695180 -0.19248947
## month            -0.445175355  0.05055367
## wind_speed       -0.549349793  0.30692752
## wind_dir          0.017886148  0.19981295
## w_unrot          -0.112259772  0.16915265
## H                 0.270631417 -0.23292442
## LE               -0.478420171 -0.23144721
## bowen_ratio       0.014037879  0.11509519
## co2_flux         -0.060450877 -1.00188859
## co2_mixing_ratio  0.236186564 -0.12650002
## air_temperature  -0.646194210 -0.43742144
## co2_strg          0.079415432  0.06753304
## u_star           -1.276019099 -0.65723304
## TKE              -1.915141975  5.04131792
## L                 0.010013636  0.01972600
## z_d_L             0.017691686  0.21343591
## es               -0.056482053  0.42038599
## e                -0.050642205 -0.32311627
## RAD_IN           -0.002693222  0.29310522
## RAD_OUT          -0.130616012 -0.03007364
## LW_IN             0.198754119 -0.09314998
## LW_OUT            0.510812837  0.16676400
## NETRAD           -0.349568502 -0.09514977
## u_var             1.884264714 -3.10751524
## v_var             0.904836470 -1.48055899
## w_var             0.795325793  0.07162726
## 
## Proportion of trace:
##    LD1    LD2 
## 0.9693 0.0307

plot_lda(qc_model, lda_data)

plot_lda_density(qc_model, lda_data)

plot_lda_scalings(qc_model)

## Warning: ggrepel: 16 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps

## Look at extreme values

# Note convert this to a function

predictor_data <- qc_model %>%
    extract_recipe() %>% 
    bake(all_predictors(), new_data=lda_data) %>% 
    as.matrix()
  
outcome_data <- qc_model %>%
    extract_recipe()  %>% 
  bake(all_outcomes(), new_data=lda_data) 

lda_scaling <- qc_model %>%
  extract_fit_engine() %>% 
  `$`(scaling)

data_not_transf <- lda_rec_filt %>% 
  prep() %>% 
  bake(new_data=lda_data)

obs_coords <- predictor_data %*% lda_scaling %>% 
  as_tibble()
  
  
 qc_data_with_lda <- bind_cols(data_not_transf, obs_coords)

qc_data_with_lda %>% 
  slice_max(LD1, n = 50)

qc_data_with_lda %>% 
 ggcorr(label=T, nudge_x= -.3)

## Warning in ggcorr(., label = T, nudge_x = -0.3): data in column(s)
## 'time', 'qc_co2_flux', 'flag_timelag_sf_co2', 'flag_spikes_hf_co2',
## 'flag_absolute_limits_hf_co2' are not numeric and were ignored

qc_data_outl <- qc_data_with_lda %>% 
  mutate(outlier = LD1 > sort(LD1, decreasing = T)[50])

plot_with_outl <- function(data, mapping, ...){
  ggplot(data, mapping) +
    geom_point(aes(col = outlier)) +
    scale_color_manual(values = list("TRUE" = "blue", "FALSE" = "darkgreen"))
}

qc_data_outl_all <- qc_data_outl %>% 
  filter(outlier)

qc_data_outl_none <- qc_data_outl %>% 
  sample_n(50)

qc_data_outl_plot <- bind_rows(qc_data_outl_all, qc_data_outl_none)

summary(qc_data_outl_all)

##       time                        daytime         month       
##  Min.   :2016-01-02 16:30:00   Min.   :1.00   Min.   : 1.000  
##  1st Qu.:2017-01-10 10:00:00   1st Qu.:1.00   1st Qu.: 1.000  
##  Median :2017-02-01 10:30:00   Median :2.00   Median : 2.000  
##  Mean   :2017-06-06 04:30:00   Mean   :1.51   Mean   : 2.714  
##  3rd Qu.:2018-03-11 13:30:00   3rd Qu.:2.00   3rd Qu.: 3.000  
##  Max.   :2019-04-25 23:30:00   Max.   :2.00   Max.   :12.000  
##    wind_speed         wind_dir        w_unrot               H          
##  Min.   :0.06164   Min.   : 24.2   Min.   :-1.59567   Min.   :-37.271  
##  1st Qu.:1.05346   1st Qu.:148.1   1st Qu.:-0.24253   1st Qu.:  1.435  
##  Median :2.00165   Median :177.7   Median :-0.03205   Median : 46.098  
##  Mean   :2.10817   Mean   :187.5   Mean   :-0.18544   Mean   : 72.507  
##  3rd Qu.:2.79380   3rd Qu.:241.3   3rd Qu.: 0.02254   3rd Qu.:146.144  
##  Max.   :5.37995   Max.   :356.4   Max.   : 0.21551   Max.   :273.558  
##        LE          bowen_ratio          co2_flux       co2_mixing_ratio
##  Min.   :-66.19   Min.   :-45.5252   Min.   :-11.668   Min.   :343.0   
##  1st Qu.:-57.57   1st Qu.: -3.1660   1st Qu.: -5.363   1st Qu.:367.9   
##  Median :-37.31   Median : -0.4111   Median : -2.100   Median :386.8   
##  Mean   :-31.08   Mean   :  0.4960   Mean   : -2.491   Mean   :387.5   
##  3rd Qu.: -6.89   3rd Qu.:  0.1486   3rd Qu.:  0.390   3rd Qu.:404.3   
##  Max.   : 38.07   Max.   : 59.0284   Max.   :  5.601   Max.   :434.4   
##  air_temperature    co2_strg           u_star             TKE         
##  Min.   :260.1   Min.   :-4.6484   Min.   :0.09828   Min.   :0.06094  
##  1st Qu.:278.7   1st Qu.:-1.1623   1st Qu.:0.23218   1st Qu.:0.75383  
##  Median :280.7   Median : 0.3681   Median :0.43401   Median :1.54899  
##  Mean   :281.0   Mean   : 0.6054   Mean   :0.45175   Mean   :1.96344  
##  3rd Qu.:284.5   3rd Qu.: 1.9635   3rd Qu.:0.58519   3rd Qu.:2.80650  
##  Max.   :302.4   Max.   : 6.5407   Max.   :1.16203   Max.   :7.48359  
##        L                z_d_L                es               e          
##  Min.   :-3155.80   Min.   :-0.95478   Min.   : 223.2   Min.   :  58.73  
##  1st Qu.: -347.09   1st Qu.:-0.40450   1st Qu.: 902.5   1st Qu.: 285.38  
##  Median :  -58.54   Median :-0.07217   Median :1039.9   Median : 531.36  
##  Mean   :  -98.91   Mean   :-0.18269   Mean   :1202.7   Mean   : 744.14  
##  3rd Qu.:  -23.63   3rd Qu.:-0.01042   3rd Qu.:1338.1   3rd Qu.:1048.26  
##  Max.   : 5153.00   Max.   : 1.01799   Max.   :4052.2   Max.   :2996.71  
##      RAD_IN         RAD_OUT          LW_IN           LW_OUT     
##  Min.   :195.5   Min.   :280.4   Min.   :144.7   Min.   :196.5  
##  1st Qu.:290.2   1st Qu.:302.1   1st Qu.:249.9   1st Qu.:295.7  
##  Median :325.6   Median :313.9   Median :298.5   Median :306.4  
##  Mean   :344.1   Mean   :326.6   Mean   :286.0   Mean   :310.1  
##  3rd Qu.:392.4   3rd Qu.:338.2   3rd Qu.:307.4   3rd Qu.:323.0  
##  Max.   :614.3   Max.   :432.5   Max.   :394.3   Max.   :427.5  
##      NETRAD              u_var             v_var             w_var         
##  Min.   :-94.46333   Min.   :0.04339   Min.   :0.01684   Min.   :0.009243  
##  1st Qu.:-19.19000   1st Qu.:0.91915   1st Qu.:0.34031   1st Qu.:0.143815  
##  Median : -0.00383   Median :1.83685   Median :0.97067   Median :0.397365  
##  Mean   : 17.45010   Mean   :2.15313   Mean   :1.21434   Mean   :0.559412  
##  3rd Qu.: 61.09667   3rd Qu.:2.94282   3rd Qu.:1.57321   3rd Qu.:0.811808  
##  Max.   :219.10667   Max.   :7.77232   Max.   :6.34987   Max.   :2.387391  
##  qc_co2_flux flag_timelag_sf_co2 flag_spikes_hf_co2 flag_absolute_limits_hf_co2
##  0: 3        FALSE:32            FALSE:49           FALSE:49                   
##  1:29        TRUE :17            TRUE : 0           TRUE : 0                   
##  2:17                                                                          
##                                                                                
##                                                                                
##                                                                                
##       LD1             LD2          outlier       
##  Min.   :3.583   Min.   :-3.0576   Mode:logical  
##  1st Qu.:3.699   1st Qu.:-0.2695   TRUE:49       
##  Median :3.932   Median : 0.5869                 
##  Mean   :4.097   Mean   : 0.6626                 
##  3rd Qu.:4.346   3rd Qu.: 1.6296                 
##  Max.   :6.147   Max.   : 3.7091

qc_data_outl_plot %>% 
  ggpairs(columns = 1:10, lower = list(continuous = plot_with_outl), legend = c(2,1))

qc_data_outl_plot %>% 
  ggpairs(columns = 10:20, lower = list(continuous = plot_with_outl), legend = c(2,1))

qc_data_outl_plot %>% 
  ggpairs(columns = 1:10, mapping = aes(col=outlier), legend = c(2,1)) +
  scale_color_manual(values = list("darkgreen", "blue")) +
  scale_fill_manual(values = list("darkgreen", "blue")) +
  theme(axis.text.x = element_text(angle = 45))

qc_data_outl_plot %>% 
  ggpairs(columns = 10:20, mapping = aes(col=outlier), legend = c(2,1)) +
  scale_color_manual(values = list("blue", "darkgreen")) +
  scale_fill_manual(values = list("blue", "darkgreen")) +
  theme(axis.text.x = element_text(angle = 45))

qc_data_outl_plot %>% 
  ggpairs(columns = 20:ncol(qc_data_outl_plot), mapping = aes(col=outlier), legend = c(2,1)) +
  scale_color_manual(values = list("blue", "darkgreen")) +
  scale_fill_manual(values = list("blue", "darkgreen")) +
  theme(axis.text.x = element_text(angle = 45))

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

qc_data_outl_plot %>% 
  select(-time) %>% 
  mutate(across(starts_with("flag"), . %>% as.logical %>% as.double),
         across(starts_with("qc"), . %>% as.character %>% as.double)) %>% 
  pivot_longer(-outlier,
               names_to = "var", values_to = "val") %>%
  ggplot() +
  geom_histogram(aes(val, color = outlier), bins = 10, alpha = .5) +
  facet_wrap(~var, nrow = 4, scales = "free") +
    theme(axis.text.y = element_blank(), axis.ticks.y = element_blank(), strip.text = element_text(margin = margin(0.1,0,0.1,0), face="bold", size=7))

summary(data_not_transf %>% select(1:10))

##       time                        daytime          month       
##  Min.   :2016-01-01 13:00:00   Min.   :1.000   Min.   : 1.000  
##  1st Qu.:2018-01-20 13:37:30   1st Qu.:1.000   1st Qu.: 3.000  
##  Median :2018-11-02 23:15:00   Median :1.000   Median : 5.000  
##  Mean   :2018-10-21 04:19:58   Mean   :1.492   Mean   : 5.821  
##  3rd Qu.:2019-10-07 16:52:30   3rd Qu.:2.000   3rd Qu.: 9.000  
##  Max.   :2020-12-31 22:00:00   Max.   :2.000   Max.   :12.000  
##    wind_speed         wind_dir           w_unrot                 H           
##  Min.   :0.05525   Min.   :  0.0189   Min.   :-1.8696530   Min.   :-123.651  
##  1st Qu.:2.07906   1st Qu.: 81.3470   1st Qu.:-0.0478436   1st Qu.: -29.487  
##  Median :2.76481   Median :191.7728   Median : 0.0123796   Median :  -7.535  
##  Mean   :2.93437   Mean   :196.9563   Mean   :-0.0008549   Mean   :  15.166  
##  3rd Qu.:3.64171   3rd Qu.:320.3329   3rd Qu.: 0.0850359   3rd Qu.:  38.437  
##  Max.   :9.89610   Max.   :359.9997   Max.   : 1.0973364   Max.   : 275.505  
##        LE           bowen_ratio          co2_flux       
##  Min.   :-66.194   Min.   :-65.8040   Min.   :-21.1391  
##  1st Qu.: -1.102   1st Qu.: -3.1591   1st Qu.: -1.5544  
##  Median :  4.519   Median :  0.5137   Median :  0.2594  
##  Mean   : 18.882   Mean   :  0.8633   Mean   : -0.9881  
##  3rd Qu.: 23.402   3rd Qu.:  3.7868   3rd Qu.:  1.5125  
##  Max.   :208.985   Max.   : 73.9792   Max.   :  7.1877

Timelag

timelag_rec <- lda_rec_final %>% 
  update_role(flag_timelag_sf_co2, new_role = "outcome")

timelag_model <- workflow() %>% 
  add_recipe(timelag_rec) %>% 
  add_model(discrim_linear()) %>% 
  fit(lda_data)

timelag_model %>% 
  extract_fit_engine() %>% 
  print_all()

## Call:
## lda(..y ~ ., data = data)
## 
## Prior probabilities of groups:
##     FALSE      TRUE 
## 0.4900216 0.5099784 
## 
## Group means:
##          daytime      month  wind_speed   wind_dir     w_unrot          H
## FALSE  0.3105033  0.1850701 -0.02593776 -0.1307283  0.08182455  0.2374543
## TRUE  -0.2983526 -0.1778278  0.02492275  0.1256126 -0.07862256 -0.2281621
##               LE  bowen_ratio   co2_flux co2_mixing_ratio air_temperature
## FALSE  0.2029511  0.002058003 -0.4725937       0.02481925     0.007580115
## TRUE  -0.1950091 -0.001977468  0.4540999      -0.02384801    -0.008901868
##          co2_strg      u_star        TKE           L      z_d_L            es
## FALSE -0.04645027  0.10104480  0.1143038 -0.06092442 -0.1958807 -0.0006729069
## TRUE   0.04463256 -0.09709067 -0.1098308  0.05854029  0.1882154  0.0006465744
##                 e     RAD_IN    RAD_OUT       LW_IN      LW_OUT     NETRAD
## FALSE  0.02795513  0.3050546  0.1187567 -0.01930986 -0.03829352  0.2979054
## TRUE  -0.02686118 -0.2931171 -0.1141095  0.01855422  0.03679500 -0.2862477
##            u_var      v_var       w_var
## FALSE  0.1093983  0.1260633  0.09577075
## TRUE  -0.1051173 -0.1211301 -0.09202300
## 
## Coefficients of linear discriminants:
##                          LD1
## daytime          -0.23390613
## month            -0.26967457
## wind_speed        0.26361854
## wind_dir          0.14200624
## w_unrot           0.04640620
## H                 0.11730761
## LE                0.04779700
## bowen_ratio       0.03078330
## co2_flux          0.90128751
## co2_mixing_ratio  0.06199037
## air_temperature  -5.88757091
## co2_strg         -0.01839527
## u_star           -0.07230427
## TKE               3.46699063
## L                -0.02128016
## z_d_L             0.04355335
## es                6.10823084
## e                 0.06172669
## RAD_IN           -0.44635380
## RAD_OUT          -0.11204932
## LW_IN            -0.07122702
## LW_OUT            0.31299440
## NETRAD            0.31660006
## u_var            -1.78420725
## v_var            -1.41408212
## w_var            -0.65516000

plot_lda(timelag_model, lda_data)

plot_lda_scalings(timelag_model)

plot_lda_density(timelag_model, lda_data)

Absolute limites

abs_rec <- lda_rec_final %>% 
  update_role(flag_absolute_limits_hf_co2, new_role = "outcome")

abs_model <- workflow() %>% 
  add_recipe(abs_rec) %>% 
  add_model(discrim_linear()) %>% 
  fit(lda_data)

abs_model %>% 
  extract_fit_engine() %>% 
  print_all()

## Call:
## lda(..y ~ ., data = data)
## 
## Prior probabilities of groups:
##       FALSE        TRUE 
## 0.999894409 0.000105591 
## 
## Group means:
##             daytime         month    wind_speed      wind_dir       w_unrot
## FALSE -0.0001072075 -0.0001486363 -0.0000154116  6.597934e-05  1.373619e-05
## TRUE   1.0152018586  1.4075111902  0.1459401909 -6.247914e-01 -1.300749e-01
##                   H            LE   bowen_ratio      co2_flux co2_mixing_ratio
## FALSE -2.252957e-05  3.420528e-05  3.543177e-06  2.357318e-05    -0.0001310852
## TRUE   2.133438e-01 -3.239069e-01 -3.355212e-02 -2.232262e-01     1.2413115182
##       air_temperature      co2_strg       u_star           TKE             L
## FALSE   -0.0007529629  1.864993e-05 -3.56757e-05 -4.410609e-05  2.595158e-05
## TRUE    -0.6861991472 -1.766055e-01  3.37831e-01  4.176626e-01 -2.457485e-01
##               z_d_L            es             e        RAD_IN       RAD_OUT
## FALSE  3.207577e-05  7.194514e-05  4.162886e-06 -6.751382e-05 -5.357278e-05
## TRUE  -3.037415e-01 -6.812845e-01 -3.942045e-02  6.393221e-01  5.073075e-01
##              LW_IN        LW_OUT        NETRAD         u_var         v_var
## FALSE  0.000111772  0.0001166285 -5.871172e-05 -4.527251e-05 -4.657884e-05
## TRUE  -1.058425170 -1.1044135511  5.559706e-01  4.287080e-01  4.410783e-01
##               w_var
## FALSE -4.627924e-05
## TRUE   4.382413e-01
## 
## Coefficients of linear discriminants:
##                            LD1
## daytime           5.185739e-01
## month             4.846264e-01
## wind_speed       -2.734317e-01
## wind_dir         -1.858760e-01
## w_unrot          -1.664705e-01
## H                -1.539404e-01
## LE               -3.045737e-01
## bowen_ratio      -2.759583e-02
## co2_flux         -3.601355e-02
## co2_mixing_ratio  2.126024e-01
## air_temperature  -4.897063e+00
## co2_strg          4.809104e-04
## u_star           -4.657037e-01
## TKE              -1.195410e+01
## L                -6.077647e-02
## z_d_L            -4.021810e-02
## es                4.764193e+00
## e                 2.895657e-02
## RAD_IN            1.484825e-01
## RAD_OUT           3.669713e-01
## LW_IN            -2.470792e-01
## LW_OUT           -1.902084e-01
## NETRAD           -1.657484e-01
## u_var             6.339465e+00
## v_var             4.533520e+00
## w_var             2.082472e+00

plot_lda(abs_model, lda_data)

plot_lda_scalings(abs_model)

plot_lda_density(abs_model, lda_data)

Spikes

spike_rec <- lda_rec_final %>% 
  update_role(flag_spikes_hf_co2, new_role = "outcome")

spike_model <- workflow() %>% 
  add_recipe(spike_rec) %>% 
  add_model(discrim_linear()) %>% 
  fit(lda_data)

spike_model %>% 
  extract_fit_engine() %>% 
  print_all()

## Call:
## lda(..y ~ ., data = data)
## 
## Prior probabilities of groups:
##       FALSE        TRUE 
## 0.992107069 0.007892931 
## 
## Group means:
##            daytime        month   wind_speed      wind_dir       w_unrot
## FALSE -0.001370831 -0.002181825  0.003474296  8.857974e-05  0.0003339202
## TRUE   0.172307525  0.274245948 -0.436703856 -1.113409e-02 -0.0419723180
##                  H           LE   bowen_ratio     co2_flux co2_mixing_ratio
## FALSE -0.003425203 -0.004045985  0.0003263712  0.003321219      0.003622764
## TRUE   0.430533187  0.508562752 -0.0410234463 -0.417462753     -0.455365721
##       air_temperature      co2_strg       u_star          TKE             L
## FALSE     -0.01541553  1.635775e-05  0.004680517  0.002979107  0.0006198252
## TRUE       1.83309757 -2.056098e-03 -0.588320583 -0.374460821 -0.0779093341
##              z_d_L          es           e       RAD_IN     RAD_OUT      LW_IN
## FALSE  0.001281102 -0.01397215 -0.00645072 -0.005216179 -0.01213126 -0.0116838
## TRUE  -0.161028933  1.75623857  0.81082751  0.655650978  1.52484666  1.4686025
##            LW_OUT       NETRAD        u_var        v_var        w_var
## FALSE -0.01343897 -0.001143591  0.003053885  0.002333481  0.003987313
## TRUE   1.68922065  0.143744369 -0.383860017 -0.293308408 -0.501187892
## 
## Coefficients of linear discriminants:
##                            LD1
## daytime            0.008510457
## month             -0.219199289
## wind_speed         0.066455109
## wind_dir          -0.053352676
## w_unrot            0.016113864
## H                  0.022496189
## LE                -0.015205926
## bowen_ratio        0.015999905
## co2_flux           0.048865720
## co2_mixing_ratio   0.146862970
## air_temperature   14.749504185
## co2_strg          -0.006406019
## u_star            -0.680400537
## TKE               -0.718667995
## L                  0.019502685
## z_d_L             -0.046817070
## es               -14.179218470
## e                  0.041067943
## RAD_IN            -0.329657126
## RAD_OUT            0.318052118
## LW_IN              0.310653332
## LW_OUT            -0.046485967
## NETRAD             0.194304521
## u_var              0.339001580
## v_var              0.352781764
## w_var              0.603150298

plot_lda(spike_model, lda_data)

plot_lda_scalings(spike_model)

plot_lda_density(spike_model, lda_data)

Further analysis

without YeoJohnson

qc_rec2 <- lda_rec_filt %>%
  step_normalize(all_numeric_predictors()) %>% 
  update_role(qc_co2_flux, new_role = "outcome")

qc_model2 <- workflow() %>% 
  add_recipe(qc_rec2) %>% 
  add_model(discrim_linear()) %>% 
  fit(lda_data)

## Warning in lda.default(x, grouping, ...): variables are collinear

qc_model2 %>% 
  extract_fit_engine() %>% 
  print_all()

## Call:
## lda(..y ~ ., data = data)
## 
## Prior probabilities of groups:
##         0         1         2 
## 0.5045668 0.3077187 0.1877145 
## 
## Group means:
##       daytime      month  wind_speed    wind_dir    w_unrot           H
## 0  0.03739062  0.1755772  0.03940566 -0.05686120  0.1141825  0.02053649
## 1 -0.03611798 -0.1480361 -0.05025189  0.03453580 -0.1107110 -0.02064360
## 2 -0.04129615 -0.2292681 -0.02354290  0.09622575 -0.1254291 -0.02136014
##           LE bowen_ratio   co2_flux co2_mixing_ratio air_temperature
## 0  0.2336052 -0.02473110 -0.1878905     -0.018942804       0.1697668
## 1 -0.2046535  0.01078829  0.1878594      0.032406706      -0.1622974
## 2 -0.2924319  0.04879077  0.1970837     -0.002206751      -0.1902718
##      co2_strg      u_star          TKE             L         z_d_L         es
## 0 -0.04012638  0.02674573 -0.012517808 -0.0042559949 -0.0050726510  0.1605737
## 1  0.03063113 -0.03503337  0.000597652  0.0009185291 -0.0006564355 -0.1524610
## 2  0.05764429 -0.01446123  0.032667493  0.0099341574  0.0147111122 -0.1816857
##             e     RAD_IN    RAD_OUT       LW_IN       LW_OUT      NETRAD
## 0  0.09501669  0.1388321  0.1110645 -0.01805697  0.005823554  0.08686069
## 1 -0.07000648 -0.1312688 -0.1067699  0.01133946 -0.013295750 -0.08421840
## 2 -0.14063894 -0.1579859 -0.1235088  0.02994753  0.006142195 -0.09541854
##          u_var        v_var        w_var
## 0 -0.019200865 -0.002293082 -0.011246009
## 1  0.007931063 -0.009935985 -0.001006148
## 2  0.038609608  0.022451660  0.031878059
## 
## Coefficients of linear discriminants:
##                            LD1         LD2
## daytime           0.1634314549  0.04903350
## month            -0.2393819911  0.14848093
## wind_speed       -0.5473547888  0.55440541
## wind_dir         -0.0126815939  0.31975949
## w_unrot          -0.1517607737  0.31934049
## H                 0.2638747299 -0.10889328
## LE               -0.4071195085 -0.61985700
## bowen_ratio       0.0005812976  0.19971153
## co2_flux          0.1373344413 -0.68659388
## co2_mixing_ratio  0.1385970999 -0.39502716
## air_temperature  -0.2928259671  0.51812616
## co2_strg          0.0658300784  0.13895094
## u_star           -0.6047135665 -0.07396740
## TKE               0.2675578908 -0.12457343
## L                 0.0379483370  0.06677565
## z_d_L            -0.0294281420  0.20848696
## es                0.3880892827  0.21186803
## e                 0.0441184904 -0.42131289
## RAD_IN           -3.8100401325 -2.87742730
## RAD_OUT           0.0011576895  0.07822717
## LW_IN            -0.1272250929 -0.26070916
## LW_OUT            0.5770262482  0.11550707
## NETRAD            3.3568712661  2.93586359
## u_var             0.5226653262 -0.46534593
## v_var            -0.1481000231  0.31000063
## w_var             0.2834038054  0.05924503
## 
## Proportion of trace:
##    LD1    LD2 
## 0.9893 0.0107

plot_lda(qc_model2, lda_data)

plot_lda_scalings(qc_model2)

## Warning: ggrepel: 21 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps

plot_lda_density(qc_model2, lda_data)

without circular variables

data3 <- lda_data %>% 
  select(-daytime, -month, -wind_dir)

qc_rec3 <- recipe(data3) %>%
   update_role(starts_with("flag"), starts_with("qc"), new_role = "flag") %>% # mark them as flag so is not incluced in computation
  update_role(!has_role("flag"), new_role = "predictor") %>% # set all the rest as a predictor
  step_naomit(everything()) %>% 
  step_filter(across(c(bowen_ratio, H, L, LE, co2_flux, co2_mixing_ratio, z_d_L, co2_strg), ~between(.x, quantile(.x, .03), quantile(.x, .97)))) %>% 
  step_normalize(all_numeric_predictors()) %>% 
  step_YeoJohnson(all_numeric_predictors()) %>% 
  update_role(qc_co2_flux, new_role = "outcome")

qc_model3 <- workflow() %>% 
  add_recipe(qc_rec3) %>% 
  add_model(discrim_linear()) %>% 
  fit(data3)

qc_model3 %>% 
  extract_fit_engine() %>% 
  print_all()

## Call:
## lda(..y ~ ., data = data)
## 
## Prior probabilities of groups:
##         0         1         2 
## 0.5045668 0.3077187 0.1877145 
## 
## Group means:
##         time  wind_speed   w_unrot          H          LE bowen_ratio  co2_flux
## 0 1547830723 -0.08087025 0.2546777 -0.2893384 -0.09187521 -0.04448385 0.2254270
## 1 1533640748 -0.20954879 0.1033120 -0.2581023 -0.44583025 -0.01230453 0.3717765
## 2 1529885365 -0.17971416 0.0904212 -0.2519178 -0.51346434  0.02466329 0.2630386
##   co2_mixing_ratio air_temperature    co2_strg     u_star        TKE
## 0       0.03802484      0.03901146 -0.03897528 -0.1215517 -0.3273951
## 1       0.10128582     -0.27906218  0.03238869 -0.2302725 -0.3659810
## 2       0.07179396     -0.30305173  0.05966682 -0.2176282 -0.3403501
##             L       z_d_L         es          e     RAD_IN     RAD_OUT
## 0 -0.04743615 -0.05979350 -0.1591356 -0.1361718 -0.2688244  0.12455376
## 1 -0.05202928 -0.06914308 -0.4321946 -0.3701407 -0.4696090 -0.09709213
## 2 -0.04735455 -0.04764544 -0.4538453 -0.4441007 -0.4909792 -0.11476474
##         LW_IN      LW_OUT     NETRAD      u_var      v_var      w_var
## 0 -0.01826195 -0.02830419 -0.3179805 -0.3380940 -0.3173195 -0.3217014
## 1  0.01114381 -0.04319884 -0.4144137 -0.3627740 -0.3721109 -0.3739518
## 2  0.02975674 -0.02178427 -0.4185202 -0.3388659 -0.3437583 -0.3508059
## 
## Coefficients of linear discriminants:
##                            LD1           LD2
## time             -1.329814e-08 -8.577055e-09
## wind_speed       -5.885051e-01  3.972474e-01
## w_unrot          -1.388824e-01  1.172577e-01
## H                 2.939712e-01 -3.445847e-01
## LE               -6.797973e-01 -2.800534e-01
## bowen_ratio       2.339964e-02  1.287814e-01
## co2_flux         -1.159867e-01 -1.275961e+00
## co2_mixing_ratio  3.071111e-01  1.282209e-02
## air_temperature   8.147327e-01 -4.707825e-01
## co2_strg          6.838829e-02  6.439536e-02
## u_star           -1.393486e+00 -4.220639e-01
## TKE               1.237909e+00 -3.735112e+00
## L                -8.128978e-03  4.317977e-02
## z_d_L            -1.040734e-02  1.920585e-01
## es               -1.536551e+00  6.162487e-01
## e                -2.268016e-01 -3.396069e-01
## RAD_IN            5.314878e-01  8.555561e-01
## RAD_OUT          -1.122378e-01  6.441693e-02
## LW_IN             1.486691e-01  2.671867e-02
## LW_OUT            6.525541e-02 -2.031890e-01
## NETRAD           -7.505341e-01 -6.900556e-01
## u_var             6.754854e-01  9.338485e-01
## v_var            -2.988276e-01  1.983071e+00
## w_var             6.138866e-01  1.256474e+00
## 
## Proportion of trace:
##    LD1    LD2 
## 0.9722 0.0278

# plot_lda(qc_model3, data3)

# plot_lda_scalings(qc_model3)

LDA exploration

Utility functions

Basic recipe

Removing extreme values from the dataset

Transform data YeoJohnson

QC

Timelag

Absolute limites

Spikes

Further analysis

without YeoJohnson

without circular variables