LabEcoPob: Variación Poblacional

Métodos Estadísticos

A continuación se muestran ejemplos del uso de diversas funciones y pruebas estadísticas, utilizando la variable L (longitud de semilla, en mm); para su informe deberá utilizar también la variable masa (masa de cada semilla, g).

Estadísticas descriptivas

Usando summary y table:

# summary de todas las variables
summary(semillas)

##      arbol           grupo          fruto          semilla            L        
##  Min.   :1.000   Min.   :1.00   Min.   :1.000   Min.   : 1.00   Min.   : 7.10  
##  1st Qu.:1.000   1st Qu.:2.00   1st Qu.:1.000   1st Qu.: 2.00   1st Qu.: 9.30  
##  Median :2.000   Median :3.00   Median :1.000   Median : 4.00   Median :10.10  
##  Mean   :2.103   Mean   :2.87   Mean   :1.462   Mean   : 4.65   Mean   :10.27  
##  3rd Qu.:3.000   3rd Qu.:4.00   3rd Qu.:2.000   3rd Qu.: 6.50   3rd Qu.:11.00  
##  Max.   :3.000   Max.   :5.00   Max.   :2.000   Max.   :13.00   Max.   :13.50  
##       masa          nsemillas   
##  Min.   :0.0320   Min.   : 3.0  
##  1st Qu.:0.2010   1st Qu.: 7.0  
##  Median :0.2200   Median : 8.0  
##  Mean   :0.2225   Mean   : 8.3  
##  3rd Qu.:0.2500   3rd Qu.:10.0  
##  Max.   :0.3280   Max.   :13.0

# summary escogiendo variables
variables_num <- c("L", "masa")
summary(semillas[ ,variables_num])

##        L              masa       
##  Min.   : 7.10   Min.   :0.0320  
##  1st Qu.: 9.30   1st Qu.:0.2010  
##  Median :10.10   Median :0.2200  
##  Mean   :10.27   Mean   :0.2225  
##  3rd Qu.:11.00   3rd Qu.:0.2500  
##  Max.   :13.50   Max.   :0.3280

# table para frecuencia por árbol
table(semillas$arbol)

## 
##  1  2  3 
## 60 80 83

Usando function y agrupando por árbol con sapply y split.

#función para definir estadísticos a calcular
mystats <- function(x){
  m <- mean(x)
  md <- median(x)
  n <- length(x)
  s <- sd(x)
  se <- sd(x)/sqrt(length(x))
  skew <- sum((x-m)^3/s^3)/n
  kurt <- sum((x-m)^4/s^4)/n - 3
  return(c(n=n, mean=m, median=md, stdev=s, serror=se, skew=skew, kurtosis=kurt))
}
# función sapply con split para aplicar la función a datos de cada árbol
sapply(split(semillas$L, semillas$arbol), mystats)

##                   1           2           3
## n        60.0000000 80.00000000 83.00000000
## mean     11.8165000 10.01662500  9.40481928
## median   11.9000000 10.00500000  9.50000000
## stdev     0.8889316  0.75782658  0.80802433
## serror    0.1147606  0.08472759  0.08869219
## skew     -0.2814080  0.29338334 -0.54568049
## kurtosis -0.9816538 -0.52665455  0.28381366

Histogramas con curva normal, por árbol

#histograma y curva normal para cada arbol; variable longitud
#arbol 1
arbol1 <- semillas %>% 
  filter(arbol == "1")
msa1 <- semillas %>% 
  filter(arbol == "1") %>% 
  summarise(means = mean(L), sd=sd(L))
ggplot(arbol1, aes(x=L)) +
  geom_histogram(aes(y=..density..), position="identity", binwidth=.25, color="red", fill="gray") +
  stat_function(fun = dnorm, color="red", size=1, args=list(mean=msa1$means, sd=msa1$sd)) +
  labs(x="Longitud semilla, mm", y="densidad") + 
  annotate("text", x = 11, y = 1, label = "Arbol 1")

#arbol 2
arbol2 <- semillas %>% 
  filter(arbol == "2")
msa2 <- semillas %>% 
  filter(arbol == "2") %>% 
  summarise(means = mean(L), sd=sd(L))
ggplot(arbol2, aes(x=L)) +
  geom_histogram(aes(y=..density..), position="identity", binwidth=.25, color="blue", fill="gray") +
  stat_function(fun = dnorm, color="blue", size=1, args=list(mean=msa2$means, sd=msa2$sd)) +
  labs(x="Longitud semilla, mm", y="densidad") + 
  annotate("text", x = 11, y = 1, label = "Arbol 2")

#arbol 3
arbol3 <- semillas %>% 
  filter(arbol == "3")
msa3 <- semillas %>% 
  filter(arbol == "3") %>% 
  summarise(means = mean(L), sd=sd(L))
ggplot(arbol3, aes(x=L)) +
  geom_histogram(aes(y=..density..), position="identity", binwidth=.25, color="green", fill="gray") +
  stat_function(fun = dnorm, color="green", size=1, args=list(mean=msa3$means, sd=msa3$sd)) +
  labs(x="Longitud semilla, mm", y="densidad") + 
  annotate("text", x = 11, y = 1, label = "Arbol 3")

Curvas de densidad (normales) juntas

#las tres curvas normales juntas
ggplot(semillas, aes(x=L)) +
  stat_function(fun = dnorm, args=list(mean=msa1$means, sd=msa1$sd), aes(colour = "Arbol 1"), size=1.5) + 
  stat_function(fun = dnorm, args=list(mean=msa2$means, sd=msa2$sd), aes(colour = "Arbol 2"), size=1.5) + 
  stat_function(fun = dnorm, args=list(mean=msa3$means, sd=msa3$sd), aes(colour = "Arbol 3"), size=1.5) +
  scale_colour_manual("T. populnea", values = c("red","blue","green")) +
  labs(x="Longitud semilla, mm", y="densidad")

Box & Whisker plot para cada árbol

long <- ggplot(semillas, aes(x = arbol, y=L, group=arbol)) + geom_boxplot(fill="cornflowerblue") +
  stat_summary(aes(group=arbol), fun.y=mean, colour="darkred", geom="point", shape=18, size=3) +
  labs(x = "Arbol", y = "Longitud semilla, mm")

## Warning: `fun.y` is deprecated. Use `fun` instead.

long

Pruebas de hipótesis estadísticas

Pruebas de normalidad para los datos de longitud

library(EnvStats)
#Q-Q plot
qqPlot(semillas$L, add.line = TRUE, points.col = "blue", line.col = "red")

#prueba Shapiro-Wilk
shapiro.test(semillas$L)

## 
##  Shapiro-Wilk normality test
## 
## data:  semillas$L
## W = 0.97068, p-value = 0.0001398

Prueba de homogeneidad de varianza

bartlett.test(L ~ arbol, data=semillas)

## 
##  Bartlett test of homogeneity of variances
## 
## data:  L by arbol
## Bartlett's K-squared = 1.7397, df = 2, p-value = 0.419

ANOVA para el efecto de poblaciones (árboles)

analisis <- aov(L ~ as.factor(arbol), data=semillas)
summary(analisis)

##                   Df Sum Sq Mean Sq F value Pr(>F)    
## as.factor(arbol)   2  210.8  105.38   159.3 <2e-16 ***
## Residuals        220  145.5    0.66                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Pruebas post hoc para determinar las diferencias entre árboles

library(agricolae)
posthoc <- TukeyHSD(analisis)
posthoc

##   Tukey multiple comparisons of means
##     95% family-wise confidence level
## 
## Fit: aov(formula = L ~ as.factor(arbol), data = semillas)
## 
## $`as.factor(arbol)`
##           diff        lwr        upr   p adj
## 2-1 -1.7998750 -2.1276397 -1.4721103 0.0e+00
## 3-1 -2.4116807 -2.7368969 -2.0864646 0.0e+00
## 3-2 -0.6118057 -0.9125023 -0.3111091 8.7e-06

#gráfica
par(cex = 0.6)
plot(posthoc)

Regresión para relaciones alométricas, y análisis de residuales

library(car)

## Loading required package: carData

## 
## Attaching package: 'car'

## The following object is masked from 'package:EnvStats':
## 
##     qqPlot

## The following object is masked from 'package:dplyr':
## 
##     recode

# regresión lineal simple
rls_1 <- lm(L ~ masa, data=semillas)
summary(rls_1)

## 
## Call:
## lm(formula = L ~ masa, data = semillas)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.2061 -0.7771 -0.1554  0.6326  3.4516 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   7.1641     0.3301  21.704   <2e-16 ***
## masa         13.9740     1.4485   9.647   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.065 on 221 degrees of freedom
## Multiple R-squared:  0.2963, Adjusted R-squared:  0.2931 
## F-statistic: 93.07 on 1 and 221 DF,  p-value: < 2.2e-16

plot(L ~ masa, data=semillas)
abline(rls_1)

#residuales
spreadLevelPlot(rls_1)

## 
## Suggested power transformation:  1.693059

Regresión longitud vs. cantidad de semillas

rls_3 <- lm(L ~ nsemillas, data=semillas)
summary(rls_3)

## 
## Call:
## lm(formula = L ~ nsemillas, data = semillas)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.5886 -0.8110 -0.0334  0.7390  3.0631 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 11.31793    0.28247  40.068  < 2e-16 ***
## nsemillas   -0.12587    0.03255  -3.866 0.000145 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.229 on 221 degrees of freedom
## Multiple R-squared:  0.06336,    Adjusted R-squared:  0.05912 
## F-statistic: 14.95 on 1 and 221 DF,  p-value: 0.0001453

plot(L ~ nsemillas, data=semillas)
abline(rls_3)

#residuales
spreadLevelPlot(rls_3)

## 
## Suggested power transformation:  -10.89188