Análise - Artigo Sementes

Autor

Daniel A. Silva

Definição dos Dados

Dados Brutos

A tabela a seguir mostra as primeiras 6 linhas dos dados brutos referentes aos bounding boxes das sementes e do endosperma, destacando as coordenadas em píxeis \((x_1, y_1)\) e \((x_2, y_2)\). Além disso, são apresentadas as medidas de área \((mm^2)\), largura \((mm)\) e comprimento \((mm)\) para ambas as estruturas.

Código

da_seeds_colnames <- c("seed_bbox_x1", "seed_bbox_y1", "seed_bbox_x2", 
                       "seed_bbox_y2", "endosperm_bbox_x1", "endosperm_bbox_y1", 
                       "endosperm_bbox_x2", "endosperm_bbox_y2", "area_seed", 
                       "area_endosperm", "width_seed", "width_endosperm", 
                       "height_seed", "height_endosperm")
da_seeds <- readr::read_csv(file = here::here('../data/seeds-measurements-pred-fix-v2.csv'),
                            col_names = da_seeds_colnames, show_col_types = FALSE) |> 
  tidyr::complete(area_endosperm, fill = list(area_endosperm = 0)) |> 
  tidyr::complete(width_endosperm, fill = list(width_endosperm = 0)) |> 
  tidyr::complete(height_endosperm, fill = list(height_endosperm = 0)) 
  
da_seeds |> head()

height_endosperm	width_endosperm	area_endosperm	seed_bbox_x1	seed_bbox_y1	seed_bbox_x2	seed_bbox_y2	endosperm_bbox_x1	endosperm_bbox_y1	endosperm_bbox_x2	endosperm_bbox_y2	area_seed	width_seed	height_seed
0.5692358	1.127930	0.5854420	375	671	624	1244	437	811	491	918	10.760642	6.034954	2.624815
0.7695225	1.254427	0.7994058	651	1678	903	2195	727	1785	800	1904	10.334104	5.449906	2.656434
0.8643951	1.096306	0.7513459	270	351	526	926	331	508	413	612	10.534066	6.061307	2.698599
0.8643951	2.276943	1.5869217	209	1220	430	1773	269	1342	351	1558	9.980183	5.829396	2.329650
0.8854779	1.212261	0.9408075	590	397	842	989	651	565	735	680	11.644834	6.240511	2.656434
0.9065607	1.233344	0.9054154	270	1435	491	1945	331	1556	417	1673	8.988148	5.376116	2.329650

Análise Exploratória (EDA)

Descritiva

Código

da_seeds |> 
  dplyr::select(dplyr::ends_with(c("seed", "endosperm"))) |> 
  skimr::skim() |> 
  skimr::yank("numeric") |> 
  dplyr::select(-n_missing, -complete_rate)

Variable type: numeric

skim_variable	mean	sd	p0	p25	p50	p75	p100	hist
area_seed	10.79	0.79	8.01	10.30	10.77	11.28	13.12	▁▂▇▅▁
width_seed	5.54	0.49	2.52	5.28	5.56	5.87	6.42	▁▁▁▇▆
height_seed	2.73	0.40	2.18	2.66	2.66	2.81	6.36	▇▁▁▁▁
height_endosperm	2.02	0.57	0.00	2.01	2.18	2.34	3.26	▁▁▁▇▁
width_endosperm	3.09	0.84	0.00	3.20	3.33	3.47	3.78	▁▁▁▁▇
area_endosperm	5.30	1.72	0.00	5.36	5.97	6.26	7.08	▁▁▁▂▇

Correlacao

Código

cor_seeds <- da_seeds |> 
  dplyr::select(dplyr::ends_with(c("seed", "endosperm"))) |> 
  dplyr::select(area_seed, area_endosperm,
                width_seed, width_endosperm,
                height_seed, height_endosperm) |> 
  dplyr::rename("Seed Area" = area_seed, 
                "Endosperm Area" = area_endosperm,
                "Seed Height" = height_seed, 
                "Seed Width" = width_seed,
                "Endosperm Height" = height_endosperm,
                "Endosperm Width" = width_endosperm) |> 
  cor()

cor_seeds |> 
  ggcorrplot::ggcorrplot(lab = TRUE, type = "upper", digits = 2,
                         legend.title = "r",
                         colors = ggsci::pal_material("grey")(9)[seq(2, 9, 3)])

Comparação de Histogramas

Os histogramas de frequência apresentados a seguir oferecem uma representação visual da distribuição dos dados brutos coletados.

Inicialmente, vamos nos concentrar na análise das contagens absolutas representadas nos Histogramas de Frequência Asoluta, examinando a distribuição de cada cluster:

Código

breaks <- seq(8, 13.2, length.out = 6)
da_seeds |> 
  ggplot2::ggplot() +
  ggplot2::aes(x = area_seed) +
  ggplot2::geom_histogram(color = "#000000", breaks = breaks) +
  ggplot2::stat_bin(ggplot2::aes(label = ggplot2::after_stat(count)), 
                    breaks = breaks, vjust=-0.5, geom = "text") +
  ggplot2::labs(x = bquote('Seed Area'~(mm^2)),
                y = 'Frequency') +
  ggplot2::scale_y_continuous(limits = c(0, 350)) +
  ggplot2::scale_x_continuous(breaks = breaks) +
  ggsci::scale_fill_material("grey") +
  ggplot2::theme_light()

Código

breaks <- seq(0, 7.1, length.out = 6)
da_seeds |> 
  ggplot2::ggplot() +
  ggplot2::aes(x = area_endosperm) +
  ggplot2::geom_histogram(color = "#000000", breaks = breaks) +
  ggplot2::stat_bin(ggplot2::aes(label = ggplot2::after_stat(count)), 
                    breaks = breaks, vjust=-0.5, geom = "text") +
  ggplot2::labs(x = bquote('Endosperm Area'~(mm^2)),
                y = 'Frequency') +
  ggplot2::scale_y_continuous(limits = c(0, 420)) +
  ggplot2::scale_x_continuous(breaks = breaks) +
  ggsci::scale_fill_material("grey") +
  ggplot2::theme_light()

Após a identificação de diferenças notáveis nas contagens, que podem ser atribuídas, em parte, à falta de adequação das predições aos clusters definidos pelo Wesley, avançamos para uma análise mais abrangente por meio dos Histogramas de Frequência Relativa. As frequências relativas expressam a porcentagem de ocorrência de cada valor em relação ao total de observações, proporcionando uma perspectiva mais normatizada das características da distribuição das predições nos clusters:

Código

breaks <- seq(8, 13.2, length.out = 6)
da_seeds |> 
  ggplot2::ggplot() +
  ggplot2::aes(x = area_seed) +
  ggplot2::geom_histogram(ggplot2::aes(y = ggplot2::after_stat(density)), 
                          breaks = breaks, color = "#000000") +
  ggplot2::stat_bin(
    ggplot2::aes(y = ggplot2::after_stat(density),
                 label = scales::percent(ggplot2::after_stat(density))), 
                    breaks = breaks, vjust=-0.5, geom = "text") +
  ggplot2::labs(x = bquote('Seed Area'~(mm^2)), y = 'Density') + 
  ggplot2::scale_y_continuous(labels = scales::percent_format(),
                              limits = c(0, 0.5)) +
  ggplot2::scale_x_continuous(breaks = breaks) +
  ggsci::scale_fill_material(palette = 'grey') +
  ggplot2::theme_light()

Código

breaks <- seq(0, 7.1, length.out = 6)
da_seeds |> 
  ggplot2::ggplot() +
  ggplot2::aes(x = area_endosperm) +
  ggplot2::geom_histogram(ggplot2::aes(y = ggplot2::after_stat(density)), 
                          breaks = breaks, color = "#000000") +
  ggplot2::stat_bin(
    ggplot2::aes(y = ggplot2::after_stat(density),
                 label = scales::percent(ggplot2::after_stat(density))), 
                    breaks = breaks, vjust=-0.5, geom = "text") +
  ggplot2::labs(x = bquote('Endosperm Area'~(mm^2)), y = 'Density') + 
  ggplot2::scale_y_continuous(labels = scales::percent_format(),
                              limits = c(0, 0.5)) +
  ggplot2::scale_x_continuous(breaks = breaks) +
  ggsci::scale_fill_material(palette = 'grey') +
  ggplot2::theme_light()

Area do endosperma / Area da semente

Código

breaks <- seq(0, 1, 0.1)
da_seeds |> 
  dplyr::mutate(ae_at = area_endosperm/area_seed) |> 
  ggplot2::ggplot() +
  ggplot2::aes(x = ae_at) +
  ggplot2::geom_histogram(color = "#000000", breaks = breaks) +
  ggplot2::stat_bin(ggplot2::aes(label = ggplot2::after_stat(count)), 
                    breaks = breaks, vjust=-0.5, geom = "text") +
  ggplot2::labs(x = 'Endosperm/Seed Area Ratio',
                y = 'Frequency') +
  ggplot2::scale_y_continuous(limits = c(0, 420)) +
  ggplot2::scale_x_continuous(breaks = breaks, labels = scales::percent_format()) +
  ggsci::scale_fill_material("grey") +
  ggplot2::theme_light()

Dispersao

Código

expand.grid(a_seed = c(seq(0.0000001, 1, length.out = 1000), 
                       seq(0.00001, 100, length.out = 1000)), 
            a_endosperm = c(seq(0.0000001, 1, length.out = 1000), 
                            seq(0.00001, 100, length.out = 1000))) |> 
  tibble::as_tibble() |> 
  dplyr::mutate(ratio = a_endosperm / a_seed) |> 
  dplyr::filter(ratio <= 1) |> 
  ggplot2::ggplot() +
  ggplot2::aes(x = a_endosperm, y = a_seed) +
  ggplot2::geom_point(ggplot2::aes(x = area_endosperm, y = area_seed),
                      data = da_seeds, size = 3, alpha = 0.5) +
  ggplot2::geom_contour(ggplot2::aes(z = ratio), size = 0.4, 
                        bins = 10, color = "#000000") +
  metR::geom_text_contour(ggplot2::aes(z = ratio),
                          label.placer = metR::label_placer_n(1), stroke = 0.2) +
  ggplot2::scale_x_continuous(limits = c(-1, 9), breaks = seq(0, 8, 1), 
                              expand = c(0, -0.5)) +
  ggplot2::scale_y_continuous(limits = c(7, 14), breaks = seq(8, 13, 1), 
                              expand = c(0, -0.5)) +
  ggplot2::labs(x = bquote("Endosperm Area"~(mm^2)), 
                y = bquote("Seed Area"~(mm^2))) +
  ggplot2::theme_light()