The dataset to be used can be found in Infostat by navigating to File → Open sample data → Atriplex. Once located, the dataset can be selected and exported in either Excel or .txt format.

The dataset contains three categorical variables: Tamaño, Episperma and Bloque. R stored Tamaño and Episperma as character (chr) vectors rather than factors because read_excel() does not convert text columns to factors (the same effect as the stringsAsFactors = FALSE parameter of read.table()), while Bloque was read as a numeric (num) vector and is converted to a factor below. Additionally, the PG, PN and PS variables are stored as numeric (num) vectors.

# Point the session at the project folder that holds the exported dataset
setwd(dir = "E:/Desktop/ProyectoR")
# readxl supplies read_excel()
library(readxl)
# Read the workbook; the file name is resolved against the working directory
datos <- read_excel(path = "Atriplex.xlsx")

Ensure the dataset is correctly understood by applying the functions covered in previous sessions in R.

# Inspect the imported object: class, column names and column types
class(datos)
## [1] "tbl_df"     "tbl"        "data.frame"
names(datos)
## [1] "Tamaño"    "Episperma" "PG"        "PN"        "PS"        "Bloque"
str(datos)
## tibble [27 × 6] (S3: tbl_df/tbl/data.frame)
##  $ Tamaño   : chr [1:27] "chicas" "chicas" "chicas" "chicas" ...
##  $ Episperma: chr [1:27] "claro" "claro" "claro" "rojizo" ...
##  $ PG       : num [1:27] 60 73 73 93 66 60 20 26 20 87 ...
##  $ PN       : num [1:27] 47 33 60 7 33 20 0 13 7 54 ...
##  $ PS       : num [1:27] 0.003 0.003 0.0031 0.003 0.0026 0.003 0.003 0.0027 0.003 0.0033 ...
##  $ Bloque   : num [1:27] 1 2 3 1 2 3 1 2 3 1 ...
# Bloque was read as numeric; recode it as an ordered factor (1 < 2 < 3)
datos$Bloque <- factor(
  datos$Bloque,
  levels = c("1", "2", "3"),
  ordered = TRUE
)
# First three and last three rows
head(datos, n = 3)
## # A tibble: 3 × 6
##   Tamaño Episperma    PG    PN     PS Bloque
##   <chr>  <chr>     <dbl> <dbl>  <dbl> <ord> 
## 1 chicas claro        60    47 0.003  1     
## 2 chicas claro        73    33 0.003  2     
## 3 chicas claro        73    60 0.0031 3
tail(datos, n = 3)
## # A tibble: 3 × 6
##   Tamaño  Episperma    PG    PN     PS Bloque
##   <chr>   <chr>     <dbl> <dbl>  <dbl> <ord> 
## 1 grandes oscuro       53    47 0.0033 1     
## 2 grandes oscuro       40    27 0.003  2     
## 3 grandes oscuro       53    47 0.0042 3
# Put the columns of datos on the search path so they can be used by bare name
attach(datos)

The sample size will be computed and assigned to a variable named n.

# dim() returns c(rows, columns); the first entry is the number of observations
dim.datos <- dim(datos)
dim.datos
## [1] 27  6
n <- dim.datos[[1]]
n
## [1] 27

Categorical data are typically examined using tables. A table that summarizes a single categorical variable is known as a one-way table. The table() function can be used to generate one-way tables for the data.

Frequency tables for categorical variables Tamaño and Episperma

Absolute frequency

# One-way table of counts per seed size
fa.tamaño <- table(Tamaño)
fa.tamaño
## Tamaño
##   chicas  grandes medianas 
##        9        9        9
# One-way table of counts per episperm colour
fa.episperma <- table(Episperma)
fa.episperma
## Episperma
##  claro oscuro rojizo 
##      9      9      9

An alternative method to determine the number of observations.

# The counts of a one-way table add up to the number of observations
n <- sum(fa.episperma)
n
## [1] 27

Relative frequency (expressed as a percentage)

# Share of each episperm colour, as a percentage of the n observations
fr.episperma <- (fa.episperma / n) * 100
fr.episperma
## Episperma
##    claro   oscuro   rojizo 
## 33.33333 33.33333 33.33333
# Share of each seed size, as a percentage of the n observations
fr.tamaño <- (fa.tamaño / n) * 100
fr.tamaño
## Tamaño
##   chicas  grandes medianas 
## 33.33333 33.33333 33.33333

Relative frequency (expressed as a proportion) using the prop.table() function

# prop.table() divides each cell by the table total, so it is applied to the
# absolute-frequency tables, one per variable. Each result is stored under the
# name of the variable it describes.
fr.tamaño1 <- prop.table(fa.tamaño)
fr.tamaño1
## Tamaño
##    chicas   grandes  medianas 
## 0.3333333 0.3333333 0.3333333
fr.episperma1 <- prop.table(fa.episperma)
fr.episperma1
## Episperma
##     claro    oscuro    rojizo 
## 0.3333333 0.3333333 0.3333333

Cumulative absolute frequency.

# Running totals of the counts, in the order the levels appear in the table
faa.tamaño <- cumsum(fa.tamaño)
faa.tamaño
##   chicas  grandes medianas 
##        9       18       27
faa.episperma <- cumsum(fa.episperma)
faa.episperma
##  claro oscuro rojizo 
##      9     18     27

Cumulative relative frequency.

# Running totals of the percentage tables; the last entry reaches 100
fra.episperma <- cumsum(fr.episperma)
fra.episperma
##     claro    oscuro    rojizo 
##  33.33333  66.66667 100.00000
fra.tamaño <- cumsum(fr.tamaño)
fra.tamaño
##    chicas   grandes  medianas 
##  33.33333  66.66667 100.00000

Frequency tables for numerical variables PG, PN and PS

Absolute and relative frequency.

# Bin PG into four intervals with cut(), then count the observations per bin
fa.pg <- table(cut(PG, breaks = 4))
fa.pg
## 
## (12.9,34.8] (34.8,56.5] (56.5,78.2]  (78.2,100] 
##           6           3           5          13
# Same bins expressed as proportions of the total
fr.pg <- prop.table(fa.pg)
fr.pg
## 
## (12.9,34.8] (34.8,56.5] (56.5,78.2]  (78.2,100] 
##   0.2222222   0.1111111   0.1851852   0.4814815

Frequency tables will be constructed in R, utilizing custom intervals for data grouping.

# Supplying the breakpoints explicitly gives full control over the intervals
fa.pg1 <- table(
  cut(PG, breaks = c(12.9, 34.8, 56.5, 78.2, 100))
)
fa.pg1
## 
## (12.9,34.8] (34.8,56.5] (56.5,78.2]  (78.2,100] 
##           6           3           5          13

Cumulative absolute and relative frequency tables.

# Running total of the binned counts
faa.pg <- cumsum(fa.pg)
faa.pg
## (12.9,34.8] (34.8,56.5] (56.5,78.2]  (78.2,100] 
##           6           9          14          27
# Running total of the binned proportions; the last entry reaches 1
fra.pg <- cumsum(fr.pg)
fra.pg
## (12.9,34.8] (34.8,56.5] (56.5,78.2]  (78.2,100] 
##   0.2222222   0.3333333   0.5185185   1.0000000

Contingency table for Tamaño vs Episperma

# Two-way table of counts: rows are seed sizes, columns are episperm colours
fa.contingencia.cat <- table(Tamaño, Episperma)
fa.contingencia.cat
##           Episperma
## Tamaño    claro oscuro rojizo
##   chicas       3      3      3
##   grandes      3      3      3
##   medianas     3      3      3
# Each cell as a proportion of the grand total
fr.contingencia.cat <- prop.table(fa.contingencia.cat)
fr.contingencia.cat
##           Episperma
## Tamaño        claro    oscuro    rojizo
##   chicas   0.1111111 0.1111111 0.1111111
##   grandes  0.1111111 0.1111111 0.1111111
##   medianas 0.1111111 0.1111111 0.1111111

Contingency table for Tamaño vs PG

A new column named catPG is being added to the datos data frame. This column contains the variable PG divided into four equal-width categories using the cut() function.

# Bin PG into four equal-width classes and store them as a new factor column
datos$catPG <- cut(datos$PG, breaks = 4)
head(datos)
## # A tibble: 6 × 7
##   Tamaño Episperma    PG    PN     PS Bloque catPG      
##   <chr>  <chr>     <dbl> <dbl>  <dbl> <ord>  <fct>      
## 1 chicas claro        60    47 0.003  1      (56.5,78.2]
## 2 chicas claro        73    33 0.003  2      (56.5,78.2]
## 3 chicas claro        73    60 0.0031 3      (56.5,78.2]
## 4 chicas rojizo       93     7 0.003  1      (78.2,100] 
## 5 chicas rojizo       66    33 0.0026 2      (56.5,78.2]
## 6 chicas rojizo       60    20 0.003  3      (56.5,78.2]
# The copy of datos already on the search path was attached before catPG
# existed. Detach it first so that re-attaching leaves a single, current copy;
# attaching again without detaching stacks a duplicate and masks every column.
detach(datos)
attach(datos)
# Two-way table of counts: seed size by PG class
fa.contingencia.num <- table(Tamaño, catPG)
fa.contingencia.num
##           catPG
## Tamaño    (12.9,34.8] (34.8,56.5] (56.5,78.2] (78.2,100]
##   chicas             3           0           5          1
##   grandes            0           3           0          6
##   medianas           3           0           0          6
# Each cell as a proportion of the grand total
fr.contingencia.num <- prop.table(fa.contingencia.num)
fr.contingencia.num
##           catPG
## Tamaño    (12.9,34.8] (34.8,56.5] (56.5,78.2] (78.2,100]
##   chicas    0.11111111  0.00000000  0.18518519 0.03703704
##   grandes   0.00000000  0.11111111  0.00000000 0.22222222
##   medianas  0.11111111  0.00000000  0.00000000 0.22222222
# gmodels supplies CrossTable(); install it once if it is not available:
# install.packages("gmodels")
library(gmodels)
## Warning: package 'gmodels' was built under R version 4.4.3
# Cross-tabulation with counts, chi-square contributions and row/column/total
# proportions in every cell
CrossTable(x = Tamaño, y = Episperma)
## 
##  
##    Cell Contents
## |-------------------------|
## |                       N |
## | Chi-square contribution |
## |           N / Row Total |
## |           N / Col Total |
## |         N / Table Total |
## |-------------------------|
## 
##  
## Total Observations in Table:  27 
## 
##  
##              | Episperma 
##       Tamaño |     claro |    oscuro |    rojizo | Row Total | 
## -------------|-----------|-----------|-----------|-----------|
##       chicas |         3 |         3 |         3 |         9 | 
##              |     0.000 |     0.000 |     0.000 |           | 
##              |     0.333 |     0.333 |     0.333 |     0.333 | 
##              |     0.333 |     0.333 |     0.333 |           | 
##              |     0.111 |     0.111 |     0.111 |           | 
## -------------|-----------|-----------|-----------|-----------|
##      grandes |         3 |         3 |         3 |         9 | 
##              |     0.000 |     0.000 |     0.000 |           | 
##              |     0.333 |     0.333 |     0.333 |     0.333 | 
##              |     0.333 |     0.333 |     0.333 |           | 
##              |     0.111 |     0.111 |     0.111 |           | 
## -------------|-----------|-----------|-----------|-----------|
##     medianas |         3 |         3 |         3 |         9 | 
##              |     0.000 |     0.000 |     0.000 |           | 
##              |     0.333 |     0.333 |     0.333 |     0.333 | 
##              |     0.333 |     0.333 |     0.333 |           | 
##              |     0.111 |     0.111 |     0.111 |           | 
## -------------|-----------|-----------|-----------|-----------|
## Column Total |         9 |         9 |         9 |        27 | 
##              |     0.333 |     0.333 |     0.333 |           | 
## -------------|-----------|-----------|-----------|-----------|
## 
## 

Graphics

R has three graphics systems: base, lattice, and ggplot2. In this session, we’ll only cover the basic aspects of the base and lattice systems. In another session, we’ll focus on advanced graphics using ggplot2 in R.

1. Plots using R’s base graphics system

Bar charts or bar plots.

# One bar per seed size, with bar height equal to the count of lots
barplot(
  fa.tamaño,
  main = "Tamaño de semillas de Atriplex",
  xlab = "Tamaño semilla",
  ylab = "Lotes semillas",
  col = "pink"
)

Pie chart

# One slice per episperm colour; each label shows the level name and its count
pie(
  fa.episperma,
  labels = paste(names(fa.episperma), "\n", fa.episperma),
  main = "Color Episperma de \n semillas de Atriplex",
  col = rainbow(6)
)

Histogram

# Histogram of PG showing counts (freq = TRUE) rather than densities.
# TRUE is spelled out because T is an ordinary variable that can be reassigned.
# A single number for `breaks` is treated by hist() as a suggested number of
# cells, not an exact one.
hist(PG,
     freq = TRUE,
     breaks = 4,
     main = "Poder de Germinación de semillas de Atriplex",
     xlab = "Poder de Germinación (PG) en %",
     ylab = "Lotes de semillas",
     col = "blue",
     border = "yellow")

Stacked and grouped bar plots

# Rows of the contingency table (seed sizes) become the bar segments and the
# legend entries; columns (episperm colours) become the positions on the x-axis.
rownames(fa.contingencia.cat)
## [1] "chicas"   "grandes"  "medianas"
colnames(fa.contingencia.cat)
## [1] "claro"  "oscuro" "rojizo"
# Stacked version: one bar per episperm colour, sizes stacked inside it
barplot(fa.contingencia.cat,
        main = "Semillas Atriplex, Tamaño vs Episperma",
        xlab = "Color Episperma",
        ylab = "Lotes de semillas",
        col = c("blue", "red", "yellowgreen"),
        legend.text = rownames(fa.contingencia.cat),
        beside = FALSE)

# Grouped version: sizes drawn side by side within each episperm colour
barplot(fa.contingencia.cat,
        main = "Semillas Atriplex, Tamaño vs Episperma",
        xlab = "Color Episperma",
        ylab = "Lotes de semillas",
        col = c("blue", "red", "yellowgreen"),
        legend.text = rownames(fa.contingencia.cat),
        beside = TRUE)

Box plot or box-and-whisker plot.

# Plain box plot of PG
boxplot(
  PG,
  notch = FALSE,
  col = "blue",
  main = "Diagrama de caja PG",
  ylab = "Poder de Germinación (PG)"
)

# Notched box plot of PG
boxplot(
  PG,
  notch = TRUE,
  col = "blue",
  main = "Diagrama de caja PG",
  ylab = "Poder de Germinación (PG)"
)
# Mark the mean with a red dot; the single box is drawn at x = 1
points(x = 1, y = mean(PG), col = "red", pch = 19)

This R function creates a boxplot comparing the variable PS across the different levels of the variable Tamaño.

# One horizontal box of PS for each level of Tamaño
boxplot(PS ~ Tamaño, horizontal = TRUE)

Scatter plot

# Scatter plot of PS (y-axis) against PG (x-axis).
# The title spelling now matches the lattice version of the same plot.
plot(PG, PS,
     xlab = "Poder de Germinación %",
     ylab = "Peso seco en gr",
     main = "Gráfico de Dispersión PG vs PS")

This R command generates pairwise scatterplots for the variables in columns 3 to 5 of the datos data frame.

# Pairwise scatter plots of the three numeric variables (columns 3 to 5)
plot(datos[, c("PG", "PN", "PS")])

2. Graphics with R’s Lattice System

Bar chart or bar plot in lattice. Convert fa.tamaño into a data frame if it’s a vector with names.

library(lattice)
# Rebuild the one-way table as a data frame: level names in one column,
# counts (as numeric) in the other
df <- data.frame(
  Tamaño = names(fa.tamaño),
  Frecuencia = as.numeric(fa.tamaño)
)
df
##     Tamaño Frecuencia
## 1   chicas          9
## 2  grandes          9
## 3 medianas          9
# Formula is y ~ x: counts on the y-axis, seed size on the x-axis
barchart(
  Frecuencia ~ Tamaño,
  data = df,
  main = "Tamaño de semillas de Atriplex",
  xlab = "Tamaño semilla",
  ylab = "Lotes semillas",
  col = "pink"
)

In lattice, we use the formula y ~ x, which is why it’s written as Frecuencia ~ Tamaño. The formula interface of barchart() looks up its variables in a data frame, so we convert the vector fa.tamaño into a data.frame.

Pie chart in lattice?

The lattice package does not have a built-in function for pie charts. Pie charts are part of base R or packages like ggplot2, but not lattice.

Histogram in lattice

# One-sided formula: PG is the only variable; type = "count" puts counts
# (not percentages or densities) on the y-axis
histogram(
  ~ PG,
  breaks = 4,
  type = "count",
  col = "blue",
  main = "Poder de Germinación de semillas de Atriplex",
  xlab = "Poder de Germinación (PG) en %",
  ylab = "Lotes de semillas"
)

Stacked bar plot in lattice. Create a data frame with the contingency table values (if necessary).

# Long-format copy of the contingency table: one row per Tamaño x Episperma cell
df1 <- as.data.frame(fa.contingencia.cat)
df1
##     Tamaño Episperma Freq
## 1   chicas     claro    3
## 2  grandes     claro    3
## 3 medianas     claro    3
## 4   chicas    oscuro    3
## 5  grandes    oscuro    3
## 6 medianas    oscuro    3
## 7   chicas    rojizo    3
## 8  grandes    rojizo    3
## 9 medianas    rojizo    3
colnames(df1) <- c("Tamaño", "Episperma", "Frecuencia")
colnames(df1)
## [1] "Tamaño"     "Episperma"  "Frecuencia"
# `stack = TRUE` only takes effect when `groups` is given. Episperma goes on
# the x-axis and the Tamaño counts are stacked inside each bar, matching the
# layout of the base-graphics stacked bar plot. The fill colours are set
# through par.settings so that the legend drawn by auto.key uses the same ones.
barchart(Frecuencia ~ Episperma,
         groups = Tamaño,
         data = df1,
         stack = TRUE,
         main = "Semillas Atriplex, Tamaño vs Episperma",
         xlab = "Color Episperma",
         ylab = "Lotes de semillas",
         par.settings = list(
           superpose.polygon = list(col = c("blue", "red", "yellowgreen"))
         ),
         auto.key = list(space = "right", title = "Tamaño"))

Box plot in lattice.

If PG is a vector, we can create a simple data frame.

# Wrap the PG vector in a one-column data frame for the formula interface
df_box <- data.frame(PG = PG)
head(df_box, 3)
##   PG
## 1 60
## 2 73
## 3 73
# With the one-sided formula ~ PG the box is drawn horizontally, so PG runs
# along the x-axis and its label is supplied through xlab.
bwplot(~ PG, data = df_box,
       main = "Diagrama de caja PG",
       xlab = "Poder de Germinación (PG)",
       notch = TRUE,
       col = "blue")

Scatter plot in lattice

# Two-column data frame holding the variables to be plotted
df_scatter <- data.frame(
  PG = PG,
  PS = PS
)
head(df_scatter, 3)
##   PG     PS
## 1 60 0.0030
## 2 73 0.0030
## 3 73 0.0031
# Formula is y ~ x: PS on the y-axis against PG on the x-axis, filled points
xyplot(
  PS ~ PG,
  data = df_scatter,
  xlab = "Poder de Germinación %",
  ylab = "Peso seco en gr",
  main = "Gráfico de Dispersión PG vs PS",
  pch = 19,
  col = "darkblue"
)

In the next session, we’re going to work on summary measures, also known as descriptive statistics.

Bibliography

Di Rienzo J.A., Casanoves F., Balzarini M.G., Gonzalez L., Tablada M., Robledo C.W. InfoStat versión 2020. Centro de Transferencia InfoStat, Facultad de Ciencias Agropecuarias, Universidad Nacional de Córdoba, Argentina. URL http://www.infostat.com.ar