Universidad Rafael Landívar
Campus San Luis Gonzaga S.J. Zacapa
Curso: Programación estadística con R
Docente: Dr. Ezequiel López Bautista
# Cargar bibliotecas
library(datasets)
library(ggplot2)
library(tidyverse)
library(viridis)
library(fBasics)
Description: The Orange dataset is a classic dataset that provides 35 observations of 3 variables. It records the age (in days) and circumference (in mm) of five orange trees. We manipulate and visualize the data and build a regression model to predict the age of an orange tree based on its circumference.
# Load the built-in Orange dataset (shipped with the 'datasets' package)
# into the workspace.
data("Orange", package = "datasets")
# Render the complete dataset as a captioned table.
knitr::kable(
  x = Orange,
  caption = "Dataset Orange - Datos completos"
)
| Tree | age | circumference |
|---|---|---|
| 1 | 118 | 30 |
| 1 | 484 | 58 |
| 1 | 664 | 87 |
| 1 | 1004 | 115 |
| 1 | 1231 | 120 |
| 1 | 1372 | 142 |
| 1 | 1582 | 145 |
| 2 | 118 | 33 |
| 2 | 484 | 69 |
| 2 | 664 | 111 |
| 2 | 1004 | 156 |
| 2 | 1231 | 172 |
| 2 | 1372 | 203 |
| 2 | 1582 | 203 |
| 3 | 118 | 30 |
| 3 | 484 | 51 |
| 3 | 664 | 75 |
| 3 | 1004 | 108 |
| 3 | 1231 | 115 |
| 3 | 1372 | 139 |
| 3 | 1582 | 140 |
| 4 | 118 | 32 |
| 4 | 484 | 62 |
| 4 | 664 | 112 |
| 4 | 1004 | 167 |
| 4 | 1231 | 179 |
| 4 | 1372 | 209 |
| 4 | 1582 | 214 |
| 5 | 118 | 30 |
| 5 | 484 | 49 |
| 5 | 664 | 81 |
| 5 | 1004 | 125 |
| 5 | 1231 | 142 |
| 5 | 1372 | 174 |
| 5 | 1582 | 177 |
Resumen de los 5 números más la media para las variables edad y circunferencia:
# Five-number summary plus the mean for the two numeric columns.
summary(subset(Orange, select = c(age, circumference)))
## age circumference
## Min. : 118.0 Min. : 30.0
## 1st Qu.: 484.0 1st Qu.: 65.5
## Median :1004.0 Median :115.0
## Mean : 922.1 Mean :115.9
## 3rd Qu.:1372.0 3rd Qu.:161.5
## Max. :1582.0 Max. :214.0
Gráfico de dispersión entre circunferencia y edad:
# Base-graphics scatter plot: circumference on the x axis, age on the y axis.
# Axis labels and title are given explicitly, so they do not depend on how
# the columns are referenced.
with(
  Orange,
  plot(
    x = circumference,
    y = age,
    pch = 19,
    col = "blue",
    xlab = "Circunferencia (mm)",
    ylab = "Edad (días)",
    main = "Dispersión: Circunferencia vs Edad"
  )
)
Interpretación: Se observa una asociación directa (positiva) entre la circunferencia y la edad de los árboles. A mayor circunferencia, mayor edad.
# Pearson product-moment correlation test between circumference and age.
# The arguments below spell out cor.test()'s defaults (two-sided test,
# Pearson method, 95 % confidence interval).
cor_result <- cor.test(
  x = Orange$circumference,
  y = Orange$age,
  alternative = "two.sided",
  method = "pearson",
  conf.level = 0.95
)
# Auto-print the test result.
cor_result
##
## Pearson's product-moment correlation
##
## data: Orange$circumference and Orange$age
## t = 12.9, df = 33, p-value = 1.931e-14
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.8342364 0.9557955
## sample estimates:
## cor
## 0.9135189
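Interpretación: El coeficiente de correlación de Pearson (r = 0.91; IC 95 %: 0.83 a 0.96) indica una asociación lineal positiva fuerte entre la circunferencia y la edad, y es estadísticamente significativo (t = 12.9, gl = 33, p < 0.001), por lo que se rechaza la hipótesis de que la correlación verdadera sea igual a cero.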
Considerando que la columna Tree representa el estrato
donde se encuentran ubicados los árboles:
# Scatter plot of age against circumference, with points coloured by the
# Tree factor.
ggplot(
  data = Orange,
  mapping = aes(x = circumference, y = age, colour = Tree)
) +
  geom_point(size = 2) +
  ggtitle("Dispersión por estrato") +
  xlab("Circunferencia (mm)") +
  ylab("Edad (días)") +
  theme_minimal()
Insertando una nueva columna con los nombres de los estratos:
# Add an Estrato column by relabelling the tree identifiers "1".."5" as
# named strata, in that order.
Orange[["Estrato"]] <- factor(
  Orange$Tree,
  levels = as.character(1:5),
  labels = c("Norte", "Sur", "Este", "Oeste", "Centro")
)
# Render the table including the new Estrato column. The dataset has 35
# rows, so head(..., n = 35) returns every row.
knitr::kable(
  x = head(Orange, n = 35L),
  caption = "Dataset Orange con columna Estrato"
)
| Tree | age | circumference | Estrato |
|---|---|---|---|
| 1 | 118 | 30 | Norte |
| 1 | 484 | 58 | Norte |
| 1 | 664 | 87 | Norte |
| 1 | 1004 | 115 | Norte |
| 1 | 1231 | 120 | Norte |
| 1 | 1372 | 142 | Norte |
| 1 | 1582 | 145 | Norte |
| 2 | 118 | 33 | Sur |
| 2 | 484 | 69 | Sur |
| 2 | 664 | 111 | Sur |
| 2 | 1004 | 156 | Sur |
| 2 | 1231 | 172 | Sur |
| 2 | 1372 | 203 | Sur |
| 2 | 1582 | 203 | Sur |
| 3 | 118 | 30 | Este |
| 3 | 484 | 51 | Este |
| 3 | 664 | 75 | Este |
| 3 | 1004 | 108 | Este |
| 3 | 1231 | 115 | Este |
| 3 | 1372 | 139 | Este |
| 3 | 1582 | 140 | Este |
| 4 | 118 | 32 | Oeste |
| 4 | 484 | 62 | Oeste |
| 4 | 664 | 112 | Oeste |
| 4 | 1004 | 167 | Oeste |
| 4 | 1231 | 179 | Oeste |
| 4 | 1372 | 209 | Oeste |
| 4 | 1582 | 214 | Oeste |
| 5 | 118 | 30 | Centro |
| 5 | 484 | 49 | Centro |
| 5 | 664 | 81 | Centro |
| 5 | 1004 | 125 | Centro |
| 5 | 1231 | 142 | Centro |
| 5 | 1372 | 174 | Centro |
| 5 | 1582 | 177 | Centro |
# Bar chart of the total (summed) circumference per tree.
#
# Each tree has several measurements. Drawing the raw rows with
# stat = "identity" and position = "dodge" places all rows of one tree at the
# same x position, so the bars overplot and only the largest value is visible.
# That does not match the "total" announced in the title. Summing the
# circumference per tree first makes each bar height the actual total.
ggplot(
  aggregate(circumference ~ Tree, data = Orange, FUN = sum),
  aes(x = factor(Tree), y = circumference, fill = Tree, color = Tree)
) +
  geom_col() +
  labs(
    title = "Circunferencia total por árbol",
    x = "Árbol",
    y = "Circunferencia (mm)"
  ) +
  theme_minimal()
# Box plot of circumference per tree with the individual observations
# overlaid as jittered points.
Orange %>%
  ggplot(aes(x = Tree, y = circumference, fill = Tree)) +
  # Hide the box layer's outlier markers: the jitter layer below already
  # draws every observation, so outliers would otherwise appear twice.
  geom_boxplot(outlier.shape = NA) +
  scale_fill_viridis(discrete = TRUE, alpha = 0.6) +
  # height = 0 restricts the jitter to the horizontal direction so the
  # plotted circumference values stay exact.
  geom_jitter(color = "black", size = 0.4, alpha = 0.9, height = 0) +
  theme_minimal() +
  theme(
    legend.position = "none",
    plot.title = element_text(size = 11)
  ) +
  ggtitle("Box plot de circunferencia por árbol con jitter") +
  xlab("Árbol") +
  ylab("Circunferencia (mm)")
# Descriptive statistics for circumference via fBasics (ci = 0.95 is the
# function's default confidence level, stated explicitly).
fBasics::basicStats(Orange$circumference, ci = 0.95)
# Shapiro-Wilk normality test on the circumference measurements.
shapiro_result <- shapiro.test(x = Orange$circumference)
# Auto-print the test result.
shapiro_result
##
## Shapiro-Wilk normality test
##
## data: Orange$circumference
## W = 0.94591, p-value = 0.08483
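Interpretación: Con W = 0.946 y un valor p = 0.085 (mayor que 0.05), no se rechaza la hipótesis nula de normalidad al nivel de significancia del 5 %; no hay evidencia suficiente para afirmar que la circunferencia se aparta de una distribución normal.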
# Single base-graphics box plot of all circumference measurements.
boxplot(
  x = Orange$circumference,
  border = "darkblue",
  col = "lightblue",
  ylab = "Circunferencia (mm)",
  main = "Box plot general de circunferencia"
)
Este análisis del dataset Orange permitió: