Import data

# Read the palm-trait workbook into `data`, then print it for a quick preview.
data <- read_excel(path = "./data/palmtrees.xlsx")
data
## # A tibble: 2,557 × 29
##    spec_name            acc_genus acc_species palm_tribe palm_subfamily climbing
##    <chr>                <chr>     <chr>       <chr>      <chr>          <chr>   
##  1 Acanthophoenix crin… Acanthop… crinita     Areceae    Arecoideae     climbing
##  2 Acanthophoenix rous… Acanthop… rousselii   Areceae    Arecoideae     climbing
##  3 Acanthophoenix rubra Acanthop… rubra       Areceae    Arecoideae     climbing
##  4 Acoelorrhaphe wrigh… Acoelorr… wrightii    Trachycar… Coryphoideae   climbing
##  5 Acrocomia aculeata   Acrocomia aculeata    Cocoseae   Arecoideae     climbing
##  6 Acrocomia crispa     Acrocomia crispa      Cocoseae   Arecoideae     climbing
##  7 Acrocomia emensis    Acrocomia emensis     Cocoseae   Arecoideae     climbing
##  8 Acrocomia glaucesce… Acrocomia glaucescens Cocoseae   Arecoideae     climbing
##  9 Acrocomia hassleri   Acrocomia hassleri    Cocoseae   Arecoideae     climbing
## 10 Acrocomia intumesce… Acrocomia intumescens Cocoseae   Arecoideae     climbing
## # ℹ 2,547 more rows
## # ℹ 23 more variables: acaulescent <chr>, erect <chr>, stem_solitary <chr>,
## #   stem_armed <chr>, leaves_armed <chr>, max_stem_height_m <chr>,
## #   max_stem_dia_cm <chr>, understorey_canopy <chr>, max_leaf_number <chr>,
## #   max__blade__length_m <chr>, max__rachis__length_m <chr>,
## #   max__petiole_length_m <chr>, average_fruit_length_cm <chr>,
## #   min_fruit_length_cm <chr>, max_fruit_length_cm <chr>, …

Introduction

# Clean the data: coerce the measurement columns used below to numeric.
# Entries that cannot be parsed as numbers become NA (hence the warnings).
data[["max_leaf_number"]] <- as.numeric(data[["max_leaf_number"]])
## Warning: NAs introduced by coercion
data[["average_fruit_length_cm"]] <- as.numeric(data[["average_fruit_length_cm"]])
## Warning: NAs introduced by coercion
data[["max_stem_height_m"]] <- as.numeric(data[["max_stem_height_m"]])
## Warning: NAs introduced by coercion
data[["max_stem_dia_cm"]] <- as.numeric(data[["max_stem_dia_cm"]])
## Warning: NAs introduced by coercion

Questions

What is the distribution of fruit size? What is the distribution of maximum leaf number? How does the distribution of leaf number vary with fruit size?

Variation

Visualizing distributions

# Bar chart: row counts for each fruit-size category.
ggplot(data, aes(x = fruit_size_categorical)) +
  geom_bar()

# Histogram of maximum leaf number with a bin width of 1.
ggplot(data, aes(x = max_leaf_number)) +
  geom_histogram(binwidth = 1)
## Warning: Removed 1251 rows containing non-finite outside the scale range
## (`stat_bin()`).

Typical values

# Restrict to rows with fewer than 40 leaves and redraw the histogram with
# narrower bins.
data %>%
  filter(max_leaf_number < 40) %>%
  ggplot(aes(x = max_leaf_number)) +
  geom_histogram(binwidth = 0.75)

Unusual values

# Zoom the y-axis to 0-20 (without dropping data) so that sparsely populated
# bins become visible.
ggplot(data, aes(x = max_leaf_number)) +
  geom_histogram(binwidth = 1.25) +
  coord_cartesian(ylim = c(0, 20))
## Warning: Removed 1251 rows containing non-finite outside the scale range
## (`stat_bin()`).

Missing Values

# Replace non-positive leaf counts with NA instead of dropping the rows.
data2 <- mutate(
  data,
  max_leaf_number = replace(max_leaf_number, which(max_leaf_number <= 0), NA)
)

# Scatter plot of fruit length against leaf number; rows with NA in either
# variable are removed by ggplot2 with a warning.
ggplot(data2) +
  geom_point(aes(x = max_leaf_number, y = average_fruit_length_cm))
## Warning: Removed 1330 rows containing missing values or values outside the scale range
## (`geom_point()`).

Covariation

A categorical and continuous variable

# Overlaid frequency polygons of maximum leaf number, one line per fruit-size
# category. The bin width is set explicitly instead of relying on the default
# of 30 bins (which makes ggplot2 emit a "Pick better value" message); 6 is
# the same width used for the density version of this plot.
ggplot(data, aes(x = max_leaf_number, colour = fruit_size_categorical)) +
  geom_freqpoly(binwidth = 6)
## Warning: Removed 1251 rows containing non-finite outside the scale range
## (`stat_bin()`).

# Same comparison on the density scale, so each category's polygon has unit
# area and groups of different sizes can be compared.
ggplot(
  data,
  aes(
    x = max_leaf_number,
    y = after_stat(density),
    colour = fruit_size_categorical
  )
) +
  geom_freqpoly(binwidth = 6)
## Warning: Removed 1251 rows containing non-finite outside the scale range
## (`stat_bin()`).

# Boxplot of maximum leaf number for each fruit-size category.
ggplot(data) +
  geom_boxplot(aes(x = fruit_size_categorical, y = max_leaf_number))
## Warning: Removed 1251 rows containing non-finite outside the scale range
## (`stat_boxplot()`).

Two categorical variables

# Point size encodes the number of rows at each size/shape combination.
ggplot(data, aes(x = fruit_size_categorical, y = fruit_shape)) +
  geom_count()

# Same information as a heat map: tabulate the combinations, then map the
# count `n` to the tile fill.
count(data, fruit_shape, fruit_size_categorical) %>%
  ggplot(aes(x = fruit_shape, y = fruit_size_categorical, fill = n)) +
  geom_tile()

Two continuous variables

# Semi-transparent points so that overplotted regions appear darker.
ggplot(data, aes(x = max_leaf_number, y = average_fruit_length_cm)) +
  geom_point(alpha = 1 / 10)
## Warning: Removed 1330 rows containing missing values or values outside the scale range
## (`geom_point()`).

# NOTE(review): hexbin is attached here, but this chunk only calls
# geom_bin2d(); presumably it was meant for a geom_hex() variant - confirm.
library(hexbin)

# 2-D binned counts (rectangular bins) of fruit length against leaf number.
ggplot(data, aes(x = max_leaf_number, y = average_fruit_length_cm)) +
  geom_bin2d()
## `stat_bin2d()` using `bins = 30`. Pick better value `binwidth`.
## Warning: Removed 1330 rows containing non-finite outside the scale range
## (`stat_bin2d()`).

# Treat leaf number as a grouping variable: one boxplot of fruit length per
# 0.1-wide interval of leaf number, for fruits shorter than 20 cm.
data %>%
  filter(average_fruit_length_cm < 20) %>%
  ggplot(aes(x = max_leaf_number, y = average_fruit_length_cm)) +
  geom_boxplot(aes(group = cut_width(max_leaf_number, width = 0.1)))
## Warning: Removed 825 rows containing missing values or values outside the scale range
## (`stat_boxplot()`).

# Alternative binning: 12 groups holding roughly equal numbers of
# observations, so each box summarises a similar amount of data.
data %>%
  filter(average_fruit_length_cm < 20) %>%
  ggplot(aes(x = max_leaf_number, y = average_fruit_length_cm)) +
  geom_boxplot(aes(group = cut_number(max_leaf_number, n = 12)))
## Warning: Orientation is not uniquely specified when both the x and y aesthetics are
## continuous. Picking default orientation 'x'.
## Warning: Removed 825 rows containing missing values or values outside the scale range
## (`stat_boxplot()`).

Patterns and models

# Per-column summary: type information for character columns, quantiles and
# NA counts for the numeric ones.
summary(object = data)
##   spec_name          acc_genus         acc_species         palm_tribe       
##  Length:2557        Length:2557        Length:2557        Length:2557       
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##                                                                             
##  palm_subfamily       climbing         acaulescent           erect          
##  Length:2557        Length:2557        Length:2557        Length:2557       
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##                                                                             
##  stem_solitary       stem_armed        leaves_armed       max_stem_height_m
##  Length:2557        Length:2557        Length:2557        Min.   :  0.00   
##  Class :character   Class :character   Class :character   1st Qu.:  2.50   
##  Mode  :character   Mode  :character   Mode  :character   Median :  6.00   
##                                                           Mean   : 10.86   
##                                                           3rd Qu.: 15.00   
##                                                           Max.   :170.00   
##                                                           NA's   :446      
##  max_stem_dia_cm  understorey_canopy max_leaf_number max__blade__length_m
##  Min.   :  0.00   Length:2557        Min.   : 4.00   Length:2557         
##  1st Qu.:  2.00   Class :character   1st Qu.: 8.00   Class :character    
##  Median :  5.00   Mode  :character   Median :11.00   Mode  :character    
##  Mean   : 12.38                      Mean   :14.37                       
##  3rd Qu.: 17.00                      3rd Qu.:18.00                       
##  Max.   :175.00                      Max.   :75.00                       
##  NA's   :602                         NA's   :1251                        
##  max__rachis__length_m max__petiole_length_m average_fruit_length_cm
##  Length:2557           Length:2557           Min.   : 0.300         
##  Class :character      Class :character      1st Qu.: 1.050         
##  Mode  :character      Mode  :character      Median : 1.500         
##                                              Mean   : 2.196         
##                                              3rd Qu.: 2.500         
##                                              Max.   :45.000         
##                                              NA's   :505            
##  min_fruit_length_cm max_fruit_length_cm average_fruit_width_cm
##  Length:2557         Length:2557         Length:2557           
##  Class :character    Class :character    Class :character      
##  Mode  :character    Mode  :character    Mode  :character      
##                                                                
##                                                                
##                                                                
##                                                                
##  min_fruit_width_cm max_fruit_width_cm fruit_size_categorical
##  Length:2557        Length:2557        Length:2557           
##  Class :character   Class :character   Class :character      
##  Mode  :character   Mode  :character   Mode  :character      
##                                                              
##                                                              
##                                                              
##                                                              
##  fruit_shape        fruit_color_description main_fruit_colors 
##  Length:2557        Length:2557             Length:2557       
##  Class :character   Class :character        Class :character  
##  Mode  :character   Mode  :character        Mode  :character  
##                                                               
##                                                               
##                                                               
##                                                               
##  conspicuousness   
##  Length:2557       
##  Class :character  
##  Mode  :character  
##                    
##                    
##                    
## 
# Stem diameter against stem height, both on the natural-log scale.
ggplot(data) +
  geom_point(aes(x = log(max_stem_height_m), y = log(max_stem_dia_cm)))
## Warning: Removed 647 rows containing missing values or values outside the scale range
## (`geom_point()`).

There seems to be a subtle relationship between stem height and stem diameter on the log-log scale here.

library(modelr)

# Keep only rows where both stem measurements are strictly positive, so that
# their logarithms are finite.
data_clean <- filter(data, max_stem_dia_cm > 0 & max_stem_height_m > 0)

# Linear model of log stem diameter on log stem height.
mod <- lm(
  log(max_stem_dia_cm) ~ log(max_stem_height_m),
  data = data_clean
)

# Attach the model residuals as `resid`, then exponentiate them to undo the
# log transform of the response.
data2 <- add_residuals(data_clean, mod) %>%
  mutate(resid = exp(resid))

# Residual stem diameter plotted against average fruit length.
ggplot(data2, aes(x = average_fruit_length_cm, y = resid)) +
  geom_point()
## Warning: Removed 149 rows containing missing values or values outside the scale range
## (`geom_point()`).

I don’t see anything here really.

# One line per palm tribe: fruit length along the residual stem diameter.
ggplot(
  data2,
  aes(
    x = resid,
    y = average_fruit_length_cm,
    colour = palm_tribe,
    group = palm_tribe
  )
) +
  geom_line()
## Warning: Removed 4 rows containing missing values or values outside the scale range
## (`geom_line()`).

Wow, that graph is not helpful.

# Restrict to five tribes and to moderate residual / fruit-length values,
# then draw the mean fruit length within 5 residual bins as one line per
# tribe.
data2 %>%
  filter(
    resid < 15,
    average_fruit_length_cm < 20,
    palm_tribe %in% c(
      "Areceae", "Sclerospermeae", "Trachycarpeae", "Phytelepheae", "Caryoteae"
    )
  ) %>%
  ggplot(aes(x = resid, y = average_fruit_length_cm)) +
  stat_summary_bin(
    mapping = aes(group = palm_tribe, colour = palm_tribe),
    fun = mean,
    bins = 5,
    geom = "line"
  )

Conclusion

Based on the above chart, which plots mean fruit length against the residuals of stem diameter after accounting for stem height, we see that residual stem diameter has a positive effect on fruit length within each respective tribe.