ec <- here("data/1_prepared/ec_with_na_hainich.rds") %>%
read_rds()
lda_data <- ec %>%
select(time, daytime, month, wind_speed, wind_dir, w_unrot, H, LE, bowen_ratio, co2_flux, co2_mixing_ratio, air_temperature, co2_strg, u_star, TKE, L, z_d_L, es, e, RAD_IN, RAD_OUT, LW_IN, LW_OUT, NETRAD, u_var, v_var, w_var, qc_co2_flux, flag_timelag_sf_co2, flag_spikes_hf_co2, flag_absolute_limits_hf_co2) %>%
mutate(across(starts_with("flag"), as_factor), daytime = as.integer(daytime))
# Given a recipe it plots all the predictors distributions as histograms
plot_rec_var_dist <- function(rec){
recipe_data <- rec %>%
prep() %>%
bake(all_predictors(), new_data=NULL)
recipe_data %>%
mutate(across(starts_with("flag"), . %>% as.logical %>% as.double),
across(starts_with("qc"), . %>% as.character %>% as.double)) %>%
pivot_longer(everything(),
names_to = "var", values_to = "val") %>%
ggplot() +
geom_histogram(aes(val), bins=20) +
facet_wrap(~var, ncol=4, scales = "free") +
theme(axis.text.y = element_blank(), axis.ticks.y = element_blank(), strip.text = element_text(margin = margin(0.1,0,0.1,0), face="bold", size=7))
}
plot_lda_scalings <- function(lda_wkflow) {
lda_scaling <- lda_wkflow %>%
extract_fit_engine() %>%
`$`(scaling) %>%
as_tibble(rownames = "var")
vectors_pos <- lda_scaling %>%
filter(LD1 > 0) %>%
add_origins()
vectors_neg <- lda_scaling %>%
filter(LD1 < 0) %>%
add_origins()
vectors_layer <- list(
geom_line(aes(group=var), data = vectors_pos, arrow=arrow(length = unit(.1, "inches"))),
geom_line(aes(group=var), data = vectors_neg, arrow=arrow(ends = "first", length = unit(.1, "inches"))),
geom_label_repel(aes(label=var), data=lda_scaling))
if(ncol(lda_scaling)==3){ # 1 col is the "var" and 2 cols LD1 and LD2
# plot 2D vectors with arrows
ggplot(lda_scaling, aes(LD1, LD2)) +
vectors_layer
}
else{
# plot in 1 dimension, the y axis is the variable
ggplot(lda_scaling, aes(LD1, var)) +
vectors_layer
}
}
# add an origin to the datapoint for plotting
add_origins <- function(dat){
tibble(
var = dat$var,
LD1 = 0,
LD2 = 0
) %>%
bind_rows(dat)
}
lda_rec <- recipe(lda_data) %>%
update_role(starts_with("flag"), starts_with("qc"), new_role = "flag") %>% # mark them as flag so is not incluced in computation
update_role(!has_role("flag"), new_role = "predictor") %>% # set all the rest as a predictor
update_role(time, new_role = "time") %>%
step_naomit(everything())
plot_rec_var_dist(lda_rec)
Remove the extreme values in the dataset by removing everything below the 2.5% and above 97.5 % percentile
lda_rec_filt <- lda_rec %>%
step_filter(across(c(bowen_ratio, H, L, LE, co2_flux, co2_mixing_ratio, z_d_L, co2_strg), ~between(.x, quantile(.x, .03), quantile(.x, .97))), skip=F) #Need to have proper plotting
plot_rec_var_dist(lda_rec_filt)
lda_rec_yeo <- lda_rec_filt %>%
step_YeoJohnson(all_numeric_predictors(), -month, -daytime)
plot_rec_var_dist(lda_rec_yeo)
# Normalize
lda_rec_norm <- lda_rec_yeo %>%
step_normalize(all_numeric_predictors())
plot_rec_var_dist(lda_rec_norm)
lda_rec_final <- lda_rec_norm
qc_rec <- lda_rec_final %>%
update_role(qc_co2_flux, new_role = "outcome")
qc_model <- workflow() %>%
add_recipe(qc_rec) %>%
add_model(discrim_linear()) %>%
fit(lda_data)
qc_model %>%
extract_fit_engine() %>%
print_all()
## Call:
## lda(..y ~ ., data = data)
##
## Prior probabilities of groups:
## 0 1 2
## 0.5045668 0.3077187 0.1877145
##
## Group means:
## daytime month wind_speed wind_dir w_unrot H
## 0 0.03739062 0.1755772 0.06072370 -0.06109830 0.1051101 -0.03008967
## 1 -0.03611798 -0.1480361 -0.07357175 0.03860533 -0.1003909 0.02625310
## 2 -0.04129615 -0.2292681 -0.04261665 0.10094368 -0.1179605 0.03784300
## LE bowen_ratio co2_flux co2_mixing_ratio air_temperature
## 0 0.2399436 -0.023364161 -0.05894768 -0.026030038 0.1680656
## 1 -0.2125237 0.009680416 0.11788489 0.037622032 -0.1628486
## 2 -0.2965678 0.046932635 -0.03479936 0.008293928 -0.1891921
## co2_strg u_star TKE L z_d_L es
## 0 -0.04095184 0.05593280 0.025444054 -0.0004403831 4.521485e-05 0.1675974
## 1 0.03123853 -0.06171042 -0.038711382 -0.0019699663 -8.003213e-03 -0.1610845
## 2 0.05886738 -0.04918313 -0.004933071 0.0044130754 1.299806e-02 -0.1864288
## e RAD_IN RAD_OUT LW_IN LW_OUT NETRAD
## 0 0.1412339 0.1338496 0.1136949 -0.01839958 0.002393756 0.05539825
## 1 -0.1116514 -0.1255884 -0.1089492 0.01153735 -0.011025773 -0.05513253
## 2 -0.1966001 -0.1539048 -0.1270065 0.03054404 0.011640162 -0.05852936
## u_var v_var w_var
## 0 0.017278793 0.03310379 0.03157139
## 1 -0.029608762 -0.04776914 -0.04328383
## 2 0.002092884 -0.01067374 -0.01390745
##
## Coefficients of linear discriminants:
## LD1 LD2
## daytime 0.129695180 -0.19248947
## month -0.445175355 0.05055367
## wind_speed -0.549349793 0.30692752
## wind_dir 0.017886148 0.19981295
## w_unrot -0.112259772 0.16915265
## H 0.270631417 -0.23292442
## LE -0.478420171 -0.23144721
## bowen_ratio 0.014037879 0.11509519
## co2_flux -0.060450877 -1.00188859
## co2_mixing_ratio 0.236186564 -0.12650002
## air_temperature -0.646194210 -0.43742144
## co2_strg 0.079415432 0.06753304
## u_star -1.276019099 -0.65723304
## TKE -1.915141975 5.04131792
## L 0.010013636 0.01972600
## z_d_L 0.017691686 0.21343591
## es -0.056482053 0.42038599
## e -0.050642205 -0.32311627
## RAD_IN -0.002693222 0.29310522
## RAD_OUT -0.130616012 -0.03007364
## LW_IN 0.198754119 -0.09314998
## LW_OUT 0.510812837 0.16676400
## NETRAD -0.349568502 -0.09514977
## u_var 1.884264714 -3.10751524
## v_var 0.904836470 -1.48055899
## w_var 0.795325793 0.07162726
##
## Proportion of trace:
## LD1 LD2
## 0.9693 0.0307
plot_lda(qc_model, lda_data)
plot_lda_density(qc_model, lda_data)
plot_lda_scalings(qc_model)
## Warning: ggrepel: 16 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
## Look at extreme values
# Note convert this to a function
predictor_data <- qc_model %>%
extract_recipe() %>%
bake(all_predictors(), new_data=lda_data) %>%
as.matrix()
outcome_data <- qc_model %>%
extract_recipe() %>%
bake(all_outcomes(), new_data=lda_data)
lda_scaling <- qc_model %>%
extract_fit_engine() %>%
`$`(scaling)
data_not_transf <- lda_rec_filt %>%
prep() %>%
bake(new_data=lda_data)
obs_coords <- predictor_data %*% lda_scaling %>%
as_tibble()
qc_data_with_lda <- bind_cols(data_not_transf, obs_coords)
qc_data_with_lda %>%
slice_max(LD1, n = 50)
qc_data_with_lda %>%
ggcorr(label=T, nudge_x= -.3)
## Warning in ggcorr(., label = T, nudge_x = -0.3): data in column(s)
## 'time', 'qc_co2_flux', 'flag_timelag_sf_co2', 'flag_spikes_hf_co2',
## 'flag_absolute_limits_hf_co2' are not numeric and were ignored
qc_data_outl <- qc_data_with_lda %>%
mutate(outlier = LD1 > sort(LD1, decreasing = T)[50])
plot_with_outl <- function(data, mapping, ...){
ggplot(data, mapping) +
geom_point(aes(col = outlier)) +
scale_color_manual(values = list("TRUE" = "blue", "FALSE" = "darkgreen"))
}
qc_data_outl_all <- qc_data_outl %>%
filter(outlier)
qc_data_outl_none <- qc_data_outl %>%
sample_n(50)
qc_data_outl_plot <- bind_rows(qc_data_outl_all, qc_data_outl_none)
summary(qc_data_outl_all)
## time daytime month
## Min. :2016-01-02 16:30:00 Min. :1.00 Min. : 1.000
## 1st Qu.:2017-01-10 10:00:00 1st Qu.:1.00 1st Qu.: 1.000
## Median :2017-02-01 10:30:00 Median :2.00 Median : 2.000
## Mean :2017-06-06 04:30:00 Mean :1.51 Mean : 2.714
## 3rd Qu.:2018-03-11 13:30:00 3rd Qu.:2.00 3rd Qu.: 3.000
## Max. :2019-04-25 23:30:00 Max. :2.00 Max. :12.000
## wind_speed wind_dir w_unrot H
## Min. :0.06164 Min. : 24.2 Min. :-1.59567 Min. :-37.271
## 1st Qu.:1.05346 1st Qu.:148.1 1st Qu.:-0.24253 1st Qu.: 1.435
## Median :2.00165 Median :177.7 Median :-0.03205 Median : 46.098
## Mean :2.10817 Mean :187.5 Mean :-0.18544 Mean : 72.507
## 3rd Qu.:2.79380 3rd Qu.:241.3 3rd Qu.: 0.02254 3rd Qu.:146.144
## Max. :5.37995 Max. :356.4 Max. : 0.21551 Max. :273.558
## LE bowen_ratio co2_flux co2_mixing_ratio
## Min. :-66.19 Min. :-45.5252 Min. :-11.668 Min. :343.0
## 1st Qu.:-57.57 1st Qu.: -3.1660 1st Qu.: -5.363 1st Qu.:367.9
## Median :-37.31 Median : -0.4111 Median : -2.100 Median :386.8
## Mean :-31.08 Mean : 0.4960 Mean : -2.491 Mean :387.5
## 3rd Qu.: -6.89 3rd Qu.: 0.1486 3rd Qu.: 0.390 3rd Qu.:404.3
## Max. : 38.07 Max. : 59.0284 Max. : 5.601 Max. :434.4
## air_temperature co2_strg u_star TKE
## Min. :260.1 Min. :-4.6484 Min. :0.09828 Min. :0.06094
## 1st Qu.:278.7 1st Qu.:-1.1623 1st Qu.:0.23218 1st Qu.:0.75383
## Median :280.7 Median : 0.3681 Median :0.43401 Median :1.54899
## Mean :281.0 Mean : 0.6054 Mean :0.45175 Mean :1.96344
## 3rd Qu.:284.5 3rd Qu.: 1.9635 3rd Qu.:0.58519 3rd Qu.:2.80650
## Max. :302.4 Max. : 6.5407 Max. :1.16203 Max. :7.48359
## L z_d_L es e
## Min. :-3155.80 Min. :-0.95478 Min. : 223.2 Min. : 58.73
## 1st Qu.: -347.09 1st Qu.:-0.40450 1st Qu.: 902.5 1st Qu.: 285.38
## Median : -58.54 Median :-0.07217 Median :1039.9 Median : 531.36
## Mean : -98.91 Mean :-0.18269 Mean :1202.7 Mean : 744.14
## 3rd Qu.: -23.63 3rd Qu.:-0.01042 3rd Qu.:1338.1 3rd Qu.:1048.26
## Max. : 5153.00 Max. : 1.01799 Max. :4052.2 Max. :2996.71
## RAD_IN RAD_OUT LW_IN LW_OUT
## Min. :195.5 Min. :280.4 Min. :144.7 Min. :196.5
## 1st Qu.:290.2 1st Qu.:302.1 1st Qu.:249.9 1st Qu.:295.7
## Median :325.6 Median :313.9 Median :298.5 Median :306.4
## Mean :344.1 Mean :326.6 Mean :286.0 Mean :310.1
## 3rd Qu.:392.4 3rd Qu.:338.2 3rd Qu.:307.4 3rd Qu.:323.0
## Max. :614.3 Max. :432.5 Max. :394.3 Max. :427.5
## NETRAD u_var v_var w_var
## Min. :-94.46333 Min. :0.04339 Min. :0.01684 Min. :0.009243
## 1st Qu.:-19.19000 1st Qu.:0.91915 1st Qu.:0.34031 1st Qu.:0.143815
## Median : -0.00383 Median :1.83685 Median :0.97067 Median :0.397365
## Mean : 17.45010 Mean :2.15313 Mean :1.21434 Mean :0.559412
## 3rd Qu.: 61.09667 3rd Qu.:2.94282 3rd Qu.:1.57321 3rd Qu.:0.811808
## Max. :219.10667 Max. :7.77232 Max. :6.34987 Max. :2.387391
## qc_co2_flux flag_timelag_sf_co2 flag_spikes_hf_co2 flag_absolute_limits_hf_co2
## 0: 3 FALSE:32 FALSE:49 FALSE:49
## 1:29 TRUE :17 TRUE : 0 TRUE : 0
## 2:17
##
##
##
## LD1 LD2 outlier
## Min. :3.583 Min. :-3.0576 Mode:logical
## 1st Qu.:3.699 1st Qu.:-0.2695 TRUE:49
## Median :3.932 Median : 0.5869
## Mean :4.097 Mean : 0.6626
## 3rd Qu.:4.346 3rd Qu.: 1.6296
## Max. :6.147 Max. : 3.7091
qc_data_outl_plot %>%
ggpairs(columns = 1:10, lower = list(continuous = plot_with_outl), legend = c(2,1))
qc_data_outl_plot %>%
ggpairs(columns = 10:20, lower = list(continuous = plot_with_outl), legend = c(2,1))
qc_data_outl_plot %>%
ggpairs(columns = 1:10, mapping = aes(col=outlier), legend = c(2,1)) +
scale_color_manual(values = list("darkgreen", "blue")) +
scale_fill_manual(values = list("darkgreen", "blue")) +
theme(axis.text.x = element_text(angle = 45))
qc_data_outl_plot %>%
ggpairs(columns = 10:20, mapping = aes(col=outlier), legend = c(2,1)) +
scale_color_manual(values = list("blue", "darkgreen")) +
scale_fill_manual(values = list("blue", "darkgreen")) +
theme(axis.text.x = element_text(angle = 45))
qc_data_outl_plot %>%
ggpairs(columns = 20:ncol(qc_data_outl_plot), mapping = aes(col=outlier), legend = c(2,1)) +
scale_color_manual(values = list("blue", "darkgreen")) +
scale_fill_manual(values = list("blue", "darkgreen")) +
theme(axis.text.x = element_text(angle = 45))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
qc_data_outl_plot %>%
select(-time) %>%
mutate(across(starts_with("flag"), . %>% as.logical %>% as.double),
across(starts_with("qc"), . %>% as.character %>% as.double)) %>%
pivot_longer(-outlier,
names_to = "var", values_to = "val") %>%
ggplot() +
geom_histogram(aes(val, color = outlier), bins = 10, alpha = .5) +
facet_wrap(~var, nrow = 4, scales = "free") +
theme(axis.text.y = element_blank(), axis.ticks.y = element_blank(), strip.text = element_text(margin = margin(0.1,0,0.1,0), face="bold", size=7))
summary(data_not_transf %>% select(1:10))
## time daytime month
## Min. :2016-01-01 13:00:00 Min. :1.000 Min. : 1.000
## 1st Qu.:2018-01-20 13:37:30 1st Qu.:1.000 1st Qu.: 3.000
## Median :2018-11-02 23:15:00 Median :1.000 Median : 5.000
## Mean :2018-10-21 04:19:58 Mean :1.492 Mean : 5.821
## 3rd Qu.:2019-10-07 16:52:30 3rd Qu.:2.000 3rd Qu.: 9.000
## Max. :2020-12-31 22:00:00 Max. :2.000 Max. :12.000
## wind_speed wind_dir w_unrot H
## Min. :0.05525 Min. : 0.0189 Min. :-1.8696530 Min. :-123.651
## 1st Qu.:2.07906 1st Qu.: 81.3470 1st Qu.:-0.0478436 1st Qu.: -29.487
## Median :2.76481 Median :191.7728 Median : 0.0123796 Median : -7.535
## Mean :2.93437 Mean :196.9563 Mean :-0.0008549 Mean : 15.166
## 3rd Qu.:3.64171 3rd Qu.:320.3329 3rd Qu.: 0.0850359 3rd Qu.: 38.437
## Max. :9.89610 Max. :359.9997 Max. : 1.0973364 Max. : 275.505
## LE bowen_ratio co2_flux
## Min. :-66.194 Min. :-65.8040 Min. :-21.1391
## 1st Qu.: -1.102 1st Qu.: -3.1591 1st Qu.: -1.5544
## Median : 4.519 Median : 0.5137 Median : 0.2594
## Mean : 18.882 Mean : 0.8633 Mean : -0.9881
## 3rd Qu.: 23.402 3rd Qu.: 3.7868 3rd Qu.: 1.5125
## Max. :208.985 Max. : 73.9792 Max. : 7.1877
timelag_rec <- lda_rec_final %>%
update_role(flag_timelag_sf_co2, new_role = "outcome")
timelag_model <- workflow() %>%
add_recipe(timelag_rec) %>%
add_model(discrim_linear()) %>%
fit(lda_data)
timelag_model %>%
extract_fit_engine() %>%
print_all()
## Call:
## lda(..y ~ ., data = data)
##
## Prior probabilities of groups:
## FALSE TRUE
## 0.4900216 0.5099784
##
## Group means:
## daytime month wind_speed wind_dir w_unrot H
## FALSE 0.3105033 0.1850701 -0.02593776 -0.1307283 0.08182455 0.2374543
## TRUE -0.2983526 -0.1778278 0.02492275 0.1256126 -0.07862256 -0.2281621
## LE bowen_ratio co2_flux co2_mixing_ratio air_temperature
## FALSE 0.2029511 0.002058003 -0.4725937 0.02481925 0.007580115
## TRUE -0.1950091 -0.001977468 0.4540999 -0.02384801 -0.008901868
## co2_strg u_star TKE L z_d_L es
## FALSE -0.04645027 0.10104480 0.1143038 -0.06092442 -0.1958807 -0.0006729069
## TRUE 0.04463256 -0.09709067 -0.1098308 0.05854029 0.1882154 0.0006465744
## e RAD_IN RAD_OUT LW_IN LW_OUT NETRAD
## FALSE 0.02795513 0.3050546 0.1187567 -0.01930986 -0.03829352 0.2979054
## TRUE -0.02686118 -0.2931171 -0.1141095 0.01855422 0.03679500 -0.2862477
## u_var v_var w_var
## FALSE 0.1093983 0.1260633 0.09577075
## TRUE -0.1051173 -0.1211301 -0.09202300
##
## Coefficients of linear discriminants:
## LD1
## daytime -0.23390613
## month -0.26967457
## wind_speed 0.26361854
## wind_dir 0.14200624
## w_unrot 0.04640620
## H 0.11730761
## LE 0.04779700
## bowen_ratio 0.03078330
## co2_flux 0.90128751
## co2_mixing_ratio 0.06199037
## air_temperature -5.88757091
## co2_strg -0.01839527
## u_star -0.07230427
## TKE 3.46699063
## L -0.02128016
## z_d_L 0.04355335
## es 6.10823084
## e 0.06172669
## RAD_IN -0.44635380
## RAD_OUT -0.11204932
## LW_IN -0.07122702
## LW_OUT 0.31299440
## NETRAD 0.31660006
## u_var -1.78420725
## v_var -1.41408212
## w_var -0.65516000
plot_lda(timelag_model, lda_data)
plot_lda_scalings(timelag_model)
plot_lda_density(timelag_model, lda_data)
abs_rec <- lda_rec_final %>%
update_role(flag_absolute_limits_hf_co2, new_role = "outcome")
abs_model <- workflow() %>%
add_recipe(abs_rec) %>%
add_model(discrim_linear()) %>%
fit(lda_data)
abs_model %>%
extract_fit_engine() %>%
print_all()
## Call:
## lda(..y ~ ., data = data)
##
## Prior probabilities of groups:
## FALSE TRUE
## 0.999894409 0.000105591
##
## Group means:
## daytime month wind_speed wind_dir w_unrot
## FALSE -0.0001072075 -0.0001486363 -0.0000154116 6.597934e-05 1.373619e-05
## TRUE 1.0152018586 1.4075111902 0.1459401909 -6.247914e-01 -1.300749e-01
## H LE bowen_ratio co2_flux co2_mixing_ratio
## FALSE -2.252957e-05 3.420528e-05 3.543177e-06 2.357318e-05 -0.0001310852
## TRUE 2.133438e-01 -3.239069e-01 -3.355212e-02 -2.232262e-01 1.2413115182
## air_temperature co2_strg u_star TKE L
## FALSE -0.0007529629 1.864993e-05 -3.56757e-05 -4.410609e-05 2.595158e-05
## TRUE -0.6861991472 -1.766055e-01 3.37831e-01 4.176626e-01 -2.457485e-01
## z_d_L es e RAD_IN RAD_OUT
## FALSE 3.207577e-05 7.194514e-05 4.162886e-06 -6.751382e-05 -5.357278e-05
## TRUE -3.037415e-01 -6.812845e-01 -3.942045e-02 6.393221e-01 5.073075e-01
## LW_IN LW_OUT NETRAD u_var v_var
## FALSE 0.000111772 0.0001166285 -5.871172e-05 -4.527251e-05 -4.657884e-05
## TRUE -1.058425170 -1.1044135511 5.559706e-01 4.287080e-01 4.410783e-01
## w_var
## FALSE -4.627924e-05
## TRUE 4.382413e-01
##
## Coefficients of linear discriminants:
## LD1
## daytime 5.185739e-01
## month 4.846264e-01
## wind_speed -2.734317e-01
## wind_dir -1.858760e-01
## w_unrot -1.664705e-01
## H -1.539404e-01
## LE -3.045737e-01
## bowen_ratio -2.759583e-02
## co2_flux -3.601355e-02
## co2_mixing_ratio 2.126024e-01
## air_temperature -4.897063e+00
## co2_strg 4.809104e-04
## u_star -4.657037e-01
## TKE -1.195410e+01
## L -6.077647e-02
## z_d_L -4.021810e-02
## es 4.764193e+00
## e 2.895657e-02
## RAD_IN 1.484825e-01
## RAD_OUT 3.669713e-01
## LW_IN -2.470792e-01
## LW_OUT -1.902084e-01
## NETRAD -1.657484e-01
## u_var 6.339465e+00
## v_var 4.533520e+00
## w_var 2.082472e+00
plot_lda(abs_model, lda_data)
plot_lda_scalings(abs_model)
plot_lda_density(abs_model, lda_data)
spike_rec <- lda_rec_final %>%
update_role(flag_spikes_hf_co2, new_role = "outcome")
spike_model <- workflow() %>%
add_recipe(spike_rec) %>%
add_model(discrim_linear()) %>%
fit(lda_data)
spike_model %>%
extract_fit_engine() %>%
print_all()
## Call:
## lda(..y ~ ., data = data)
##
## Prior probabilities of groups:
## FALSE TRUE
## 0.992107069 0.007892931
##
## Group means:
## daytime month wind_speed wind_dir w_unrot
## FALSE -0.001370831 -0.002181825 0.003474296 8.857974e-05 0.0003339202
## TRUE 0.172307525 0.274245948 -0.436703856 -1.113409e-02 -0.0419723180
## H LE bowen_ratio co2_flux co2_mixing_ratio
## FALSE -0.003425203 -0.004045985 0.0003263712 0.003321219 0.003622764
## TRUE 0.430533187 0.508562752 -0.0410234463 -0.417462753 -0.455365721
## air_temperature co2_strg u_star TKE L
## FALSE -0.01541553 1.635775e-05 0.004680517 0.002979107 0.0006198252
## TRUE 1.83309757 -2.056098e-03 -0.588320583 -0.374460821 -0.0779093341
## z_d_L es e RAD_IN RAD_OUT LW_IN
## FALSE 0.001281102 -0.01397215 -0.00645072 -0.005216179 -0.01213126 -0.0116838
## TRUE -0.161028933 1.75623857 0.81082751 0.655650978 1.52484666 1.4686025
## LW_OUT NETRAD u_var v_var w_var
## FALSE -0.01343897 -0.001143591 0.003053885 0.002333481 0.003987313
## TRUE 1.68922065 0.143744369 -0.383860017 -0.293308408 -0.501187892
##
## Coefficients of linear discriminants:
## LD1
## daytime 0.008510457
## month -0.219199289
## wind_speed 0.066455109
## wind_dir -0.053352676
## w_unrot 0.016113864
## H 0.022496189
## LE -0.015205926
## bowen_ratio 0.015999905
## co2_flux 0.048865720
## co2_mixing_ratio 0.146862970
## air_temperature 14.749504185
## co2_strg -0.006406019
## u_star -0.680400537
## TKE -0.718667995
## L 0.019502685
## z_d_L -0.046817070
## es -14.179218470
## e 0.041067943
## RAD_IN -0.329657126
## RAD_OUT 0.318052118
## LW_IN 0.310653332
## LW_OUT -0.046485967
## NETRAD 0.194304521
## u_var 0.339001580
## v_var 0.352781764
## w_var 0.603150298
plot_lda(spike_model, lda_data)
plot_lda_scalings(spike_model)
plot_lda_density(spike_model, lda_data)
qc_rec2 <- lda_rec_filt %>%
step_normalize(all_numeric_predictors()) %>%
update_role(qc_co2_flux, new_role = "outcome")
qc_model2 <- workflow() %>%
add_recipe(qc_rec2) %>%
add_model(discrim_linear()) %>%
fit(lda_data)
## Warning in lda.default(x, grouping, ...): variables are collinear
qc_model2 %>%
extract_fit_engine() %>%
print_all()
## Call:
## lda(..y ~ ., data = data)
##
## Prior probabilities of groups:
## 0 1 2
## 0.5045668 0.3077187 0.1877145
##
## Group means:
## daytime month wind_speed wind_dir w_unrot H
## 0 0.03739062 0.1755772 0.03940566 -0.05686120 0.1141825 0.02053649
## 1 -0.03611798 -0.1480361 -0.05025189 0.03453580 -0.1107110 -0.02064360
## 2 -0.04129615 -0.2292681 -0.02354290 0.09622575 -0.1254291 -0.02136014
## LE bowen_ratio co2_flux co2_mixing_ratio air_temperature
## 0 0.2336052 -0.02473110 -0.1878905 -0.018942804 0.1697668
## 1 -0.2046535 0.01078829 0.1878594 0.032406706 -0.1622974
## 2 -0.2924319 0.04879077 0.1970837 -0.002206751 -0.1902718
## co2_strg u_star TKE L z_d_L es
## 0 -0.04012638 0.02674573 -0.012517808 -0.0042559949 -0.0050726510 0.1605737
## 1 0.03063113 -0.03503337 0.000597652 0.0009185291 -0.0006564355 -0.1524610
## 2 0.05764429 -0.01446123 0.032667493 0.0099341574 0.0147111122 -0.1816857
## e RAD_IN RAD_OUT LW_IN LW_OUT NETRAD
## 0 0.09501669 0.1388321 0.1110645 -0.01805697 0.005823554 0.08686069
## 1 -0.07000648 -0.1312688 -0.1067699 0.01133946 -0.013295750 -0.08421840
## 2 -0.14063894 -0.1579859 -0.1235088 0.02994753 0.006142195 -0.09541854
## u_var v_var w_var
## 0 -0.019200865 -0.002293082 -0.011246009
## 1 0.007931063 -0.009935985 -0.001006148
## 2 0.038609608 0.022451660 0.031878059
##
## Coefficients of linear discriminants:
## LD1 LD2
## daytime 0.1634314549 0.04903350
## month -0.2393819911 0.14848093
## wind_speed -0.5473547888 0.55440541
## wind_dir -0.0126815939 0.31975949
## w_unrot -0.1517607737 0.31934049
## H 0.2638747299 -0.10889328
## LE -0.4071195085 -0.61985700
## bowen_ratio 0.0005812976 0.19971153
## co2_flux 0.1373344413 -0.68659388
## co2_mixing_ratio 0.1385970999 -0.39502716
## air_temperature -0.2928259671 0.51812616
## co2_strg 0.0658300784 0.13895094
## u_star -0.6047135665 -0.07396740
## TKE 0.2675578908 -0.12457343
## L 0.0379483370 0.06677565
## z_d_L -0.0294281420 0.20848696
## es 0.3880892827 0.21186803
## e 0.0441184904 -0.42131289
## RAD_IN -3.8100401325 -2.87742730
## RAD_OUT 0.0011576895 0.07822717
## LW_IN -0.1272250929 -0.26070916
## LW_OUT 0.5770262482 0.11550707
## NETRAD 3.3568712661 2.93586359
## u_var 0.5226653262 -0.46534593
## v_var -0.1481000231 0.31000063
## w_var 0.2834038054 0.05924503
##
## Proportion of trace:
## LD1 LD2
## 0.9893 0.0107
plot_lda(qc_model2, lda_data)
plot_lda_scalings(qc_model2)
## Warning: ggrepel: 21 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
plot_lda_density(qc_model2, lda_data)
data3 <- lda_data %>%
select(-daytime, -month, -wind_dir)
qc_rec3 <- recipe(data3) %>%
update_role(starts_with("flag"), starts_with("qc"), new_role = "flag") %>% # mark them as flag so is not incluced in computation
update_role(!has_role("flag"), new_role = "predictor") %>% # set all the rest as a predictor
step_naomit(everything()) %>%
step_filter(across(c(bowen_ratio, H, L, LE, co2_flux, co2_mixing_ratio, z_d_L, co2_strg), ~between(.x, quantile(.x, .03), quantile(.x, .97)))) %>%
step_normalize(all_numeric_predictors()) %>%
step_YeoJohnson(all_numeric_predictors()) %>%
update_role(qc_co2_flux, new_role = "outcome")
qc_model3 <- workflow() %>%
add_recipe(qc_rec3) %>%
add_model(discrim_linear()) %>%
fit(data3)
qc_model3 %>%
extract_fit_engine() %>%
print_all()
## Call:
## lda(..y ~ ., data = data)
##
## Prior probabilities of groups:
## 0 1 2
## 0.5045668 0.3077187 0.1877145
##
## Group means:
## time wind_speed w_unrot H LE bowen_ratio co2_flux
## 0 1547830723 -0.08087025 0.2546777 -0.2893384 -0.09187521 -0.04448385 0.2254270
## 1 1533640748 -0.20954879 0.1033120 -0.2581023 -0.44583025 -0.01230453 0.3717765
## 2 1529885365 -0.17971416 0.0904212 -0.2519178 -0.51346434 0.02466329 0.2630386
## co2_mixing_ratio air_temperature co2_strg u_star TKE
## 0 0.03802484 0.03901146 -0.03897528 -0.1215517 -0.3273951
## 1 0.10128582 -0.27906218 0.03238869 -0.2302725 -0.3659810
## 2 0.07179396 -0.30305173 0.05966682 -0.2176282 -0.3403501
## L z_d_L es e RAD_IN RAD_OUT
## 0 -0.04743615 -0.05979350 -0.1591356 -0.1361718 -0.2688244 0.12455376
## 1 -0.05202928 -0.06914308 -0.4321946 -0.3701407 -0.4696090 -0.09709213
## 2 -0.04735455 -0.04764544 -0.4538453 -0.4441007 -0.4909792 -0.11476474
## LW_IN LW_OUT NETRAD u_var v_var w_var
## 0 -0.01826195 -0.02830419 -0.3179805 -0.3380940 -0.3173195 -0.3217014
## 1 0.01114381 -0.04319884 -0.4144137 -0.3627740 -0.3721109 -0.3739518
## 2 0.02975674 -0.02178427 -0.4185202 -0.3388659 -0.3437583 -0.3508059
##
## Coefficients of linear discriminants:
## LD1 LD2
## time -1.329814e-08 -8.577055e-09
## wind_speed -5.885051e-01 3.972474e-01
## w_unrot -1.388824e-01 1.172577e-01
## H 2.939712e-01 -3.445847e-01
## LE -6.797973e-01 -2.800534e-01
## bowen_ratio 2.339964e-02 1.287814e-01
## co2_flux -1.159867e-01 -1.275961e+00
## co2_mixing_ratio 3.071111e-01 1.282209e-02
## air_temperature 8.147327e-01 -4.707825e-01
## co2_strg 6.838829e-02 6.439536e-02
## u_star -1.393486e+00 -4.220639e-01
## TKE 1.237909e+00 -3.735112e+00
## L -8.128978e-03 4.317977e-02
## z_d_L -1.040734e-02 1.920585e-01
## es -1.536551e+00 6.162487e-01
## e -2.268016e-01 -3.396069e-01
## RAD_IN 5.314878e-01 8.555561e-01
## RAD_OUT -1.122378e-01 6.441693e-02
## LW_IN 1.486691e-01 2.671867e-02
## LW_OUT 6.525541e-02 -2.031890e-01
## NETRAD -7.505341e-01 -6.900556e-01
## u_var 6.754854e-01 9.338485e-01
## v_var -2.988276e-01 1.983071e+00
## w_var 6.138866e-01 1.256474e+00
##
## Proportion of trace:
## LD1 LD2
## 0.9722 0.0278
# plot_lda(qc_model3, data3)
# plot_lda_scalings(qc_model3)