Análise - Artigo Sementes

Autor

Daniel A. Silva

Definição dos Dados

Dados Brutos

A tabela a seguir mostra as primeiras 6 linhas dos dados brutos referentes aos bounding boxes das sementes e do endosperma, destacando as coordenadas em píxeis \((x_1, y_1)\) e \((x_2, y_2)\). Além disso, são apresentadas as medidas de área \((mm^2)\), largura \((mm)\) e comprimento \((mm)\) para ambas as estruturas.

Código
da_seeds_colnames <- c("seed_bbox_x1", "seed_bbox_y1", "seed_bbox_x2", 
                       "seed_bbox_y2", "endosperm_bbox_x1", "endosperm_bbox_y1", 
                       "endosperm_bbox_x2", "endosperm_bbox_y2", "area_seed", 
                       "area_endosperm", "width_seed", "width_endosperm", 
                       "height_seed", "height_endosperm")
da_seeds <- readr::read_csv(file = here::here('../data/seeds-measurements-pred-fix-v2.csv'),
                            col_names = da_seeds_colnames, show_col_types = FALSE) |> 
  tidyr::complete(area_endosperm, fill = list(area_endosperm = 0)) |> 
  tidyr::complete(width_endosperm, fill = list(width_endosperm = 0)) |> 
  tidyr::complete(height_endosperm, fill = list(height_endosperm = 0)) 
  
da_seeds |> head()
height_endosperm width_endosperm area_endosperm seed_bbox_x1 seed_bbox_y1 seed_bbox_x2 seed_bbox_y2 endosperm_bbox_x1 endosperm_bbox_y1 endosperm_bbox_x2 endosperm_bbox_y2 area_seed width_seed height_seed
0.5692358 1.127930 0.5854420 375 671 624 1244 437 811 491 918 10.760642 6.034954 2.624815
0.7695225 1.254427 0.7994058 651 1678 903 2195 727 1785 800 1904 10.334104 5.449906 2.656434
0.8643951 1.096306 0.7513459 270 351 526 926 331 508 413 612 10.534066 6.061307 2.698599
0.8643951 2.276943 1.5869217 209 1220 430 1773 269 1342 351 1558 9.980183 5.829396 2.329650
0.8854779 1.212261 0.9408075 590 397 842 989 651 565 735 680 11.644834 6.240511 2.656434
0.9065607 1.233344 0.9054154 270 1435 491 1945 331 1556 417 1673 8.988148 5.376116 2.329650

Análise Exploratória (EDA)

Descritiva

Código
da_seeds |> 
  dplyr::select(dplyr::ends_with(c("seed", "endosperm"))) |> 
  skimr::skim() |> 
  skimr::yank("numeric") |> 
  dplyr::select(-n_missing, -complete_rate)

Variable type: numeric

skim_variable mean sd p0 p25 p50 p75 p100 hist
area_seed 10.79 0.79 8.01 10.30 10.77 11.28 13.12 ▁▂▇▅▁
width_seed 5.54 0.49 2.52 5.28 5.56 5.87 6.42 ▁▁▁▇▆
height_seed 2.73 0.40 2.18 2.66 2.66 2.81 6.36 ▇▁▁▁▁
height_endosperm 2.02 0.57 0.00 2.01 2.18 2.34 3.26 ▁▁▁▇▁
width_endosperm 3.09 0.84 0.00 3.20 3.33 3.47 3.78 ▁▁▁▁▇
area_endosperm 5.30 1.72 0.00 5.36 5.97 6.26 7.08 ▁▁▁▂▇

Correlacao

Código
cor_seeds <- da_seeds |> 
  dplyr::select(dplyr::ends_with(c("seed", "endosperm"))) |> 
  dplyr::select(area_seed, area_endosperm,
                width_seed, width_endosperm,
                height_seed, height_endosperm) |> 
  dplyr::rename("Seed Area" = area_seed, 
                "Endosperm Area" = area_endosperm,
                "Seed Height" = height_seed, 
                "Seed Width" = width_seed,
                "Endosperm Height" = height_endosperm,
                "Endosperm Width" = width_endosperm) |> 
  cor()

cor_seeds |> 
  ggcorrplot::ggcorrplot(lab = TRUE, type = "upper", digits = 2,
                         legend.title = "r",
                         colors = ggsci::pal_material("grey")(9)[seq(2, 9, 3)])

Comparação de Histogramas

Os histogramas de frequência apresentados a seguir oferecem uma representação visual da distribuição dos dados brutos coletados.

Inicialmente, vamos nos concentrar na análise das contagens absolutas representadas nos Histogramas de Frequência Asoluta, examinando a distribuição de cada cluster:

Código
breaks <- seq(8, 13.2, length.out = 6)
da_seeds |> 
  ggplot2::ggplot() +
  ggplot2::aes(x = area_seed) +
  ggplot2::geom_histogram(color = "#000000", breaks = breaks) +
  ggplot2::stat_bin(ggplot2::aes(label = ggplot2::after_stat(count)), 
                    breaks = breaks, vjust=-0.5, geom = "text") +
  ggplot2::labs(x = bquote('Seed Area'~(mm^2)),
                y = 'Frequency') +
  ggplot2::scale_y_continuous(limits = c(0, 350)) +
  ggplot2::scale_x_continuous(breaks = breaks) +
  ggsci::scale_fill_material("grey") +
  ggplot2::theme_light()

Código
breaks <- seq(0, 7.1, length.out = 6)
da_seeds |> 
  ggplot2::ggplot() +
  ggplot2::aes(x = area_endosperm) +
  ggplot2::geom_histogram(color = "#000000", breaks = breaks) +
  ggplot2::stat_bin(ggplot2::aes(label = ggplot2::after_stat(count)), 
                    breaks = breaks, vjust=-0.5, geom = "text") +
  ggplot2::labs(x = bquote('Endosperm Area'~(mm^2)),
                y = 'Frequency') +
  ggplot2::scale_y_continuous(limits = c(0, 420)) +
  ggplot2::scale_x_continuous(breaks = breaks) +
  ggsci::scale_fill_material("grey") +
  ggplot2::theme_light()

Após a identificação de diferenças notáveis nas contagens, que podem ser atribuídas, em parte, à falta de adequação das predições aos clusters definidos pelo Wesley, avançamos para uma análise mais abrangente por meio dos Histogramas de Frequência Relativa. As frequências relativas expressam a porcentagem de ocorrência de cada valor em relação ao total de observações, proporcionando uma perspectiva mais normatizada das características da distribuição das predições nos clusters:

Código
breaks <- seq(8, 13.2, length.out = 6)
da_seeds |> 
  ggplot2::ggplot() +
  ggplot2::aes(x = area_seed) +
  ggplot2::geom_histogram(ggplot2::aes(y = ggplot2::after_stat(density)), 
                          breaks = breaks, color = "#000000") +
  ggplot2::stat_bin(
    ggplot2::aes(y = ggplot2::after_stat(density),
                 label = scales::percent(ggplot2::after_stat(density))), 
                    breaks = breaks, vjust=-0.5, geom = "text") +
  ggplot2::labs(x = bquote('Seed Area'~(mm^2)), y = 'Density') + 
  ggplot2::scale_y_continuous(labels = scales::percent_format(),
                              limits = c(0, 0.5)) +
  ggplot2::scale_x_continuous(breaks = breaks) +
  ggsci::scale_fill_material(palette = 'grey') +
  ggplot2::theme_light()

Código
breaks <- seq(0, 7.1, length.out = 6)
da_seeds |> 
  ggplot2::ggplot() +
  ggplot2::aes(x = area_endosperm) +
  ggplot2::geom_histogram(ggplot2::aes(y = ggplot2::after_stat(density)), 
                          breaks = breaks, color = "#000000") +
  ggplot2::stat_bin(
    ggplot2::aes(y = ggplot2::after_stat(density),
                 label = scales::percent(ggplot2::after_stat(density))), 
                    breaks = breaks, vjust=-0.5, geom = "text") +
  ggplot2::labs(x = bquote('Endosperm Area'~(mm^2)), y = 'Density') + 
  ggplot2::scale_y_continuous(labels = scales::percent_format(),
                              limits = c(0, 0.5)) +
  ggplot2::scale_x_continuous(breaks = breaks) +
  ggsci::scale_fill_material(palette = 'grey') +
  ggplot2::theme_light()

Area do endosperma / Area da semente

Código
breaks <- seq(0, 1, 0.1)
da_seeds |> 
  dplyr::mutate(ae_at = area_endosperm/area_seed) |> 
  ggplot2::ggplot() +
  ggplot2::aes(x = ae_at) +
  ggplot2::geom_histogram(color = "#000000", breaks = breaks) +
  ggplot2::stat_bin(ggplot2::aes(label = ggplot2::after_stat(count)), 
                    breaks = breaks, vjust=-0.5, geom = "text") +
  ggplot2::labs(x = 'Endosperm/Seed Area Ratio',
                y = 'Frequency') +
  ggplot2::scale_y_continuous(limits = c(0, 420)) +
  ggplot2::scale_x_continuous(breaks = breaks, labels = scales::percent_format()) +
  ggsci::scale_fill_material("grey") +
  ggplot2::theme_light()

Dispersao

Código
expand.grid(a_seed = c(seq(0.0000001, 1, length.out = 1000), 
                       seq(0.00001, 100, length.out = 1000)), 
            a_endosperm = c(seq(0.0000001, 1, length.out = 1000), 
                            seq(0.00001, 100, length.out = 1000))) |> 
  tibble::as_tibble() |> 
  dplyr::mutate(ratio = a_endosperm / a_seed) |> 
  dplyr::filter(ratio <= 1) |> 
  ggplot2::ggplot() +
  ggplot2::aes(x = a_endosperm, y = a_seed) +
  ggplot2::geom_point(ggplot2::aes(x = area_endosperm, y = area_seed),
                      data = da_seeds, size = 3, alpha = 0.5) +
  ggplot2::geom_contour(ggplot2::aes(z = ratio), size = 0.4, 
                        bins = 10, color = "#000000") +
  metR::geom_text_contour(ggplot2::aes(z = ratio),
                          label.placer = metR::label_placer_n(1), stroke = 0.2) +
  ggplot2::scale_x_continuous(limits = c(-1, 9), breaks = seq(0, 8, 1), 
                              expand = c(0, -0.5)) +
  ggplot2::scale_y_continuous(limits = c(7, 14), breaks = seq(8, 13, 1), 
                              expand = c(0, -0.5)) +
  ggplot2::labs(x = bquote("Endosperm Area"~(mm^2)), 
                y = bquote("Seed Area"~(mm^2))) +
  ggplot2::theme_light()