Introducción

Este documento presenta la solución del Taller de Análisis en R, donde se abordan ejercicios relacionados con la manipulación de vectores, visualización de datos, análisis de correlación, regresión lineal y pruebas estadísticas.

Gráficos en el Taller

En este taller se generarán gráficos que muestran distribuciones y relaciones entre variables, como histogramas, curvas de densidad, diagramas de caja y gráficos de dispersión.

El siguiente ejemplo muestra un histograma con la curva de densidad superpuesta para 963 valores simulados a partir de una distribución normal (media 5, desviación estándar 2):

# Fix the RNG seed so the simulated sample, and therefore the figure, is the
# same on every run. This is the only sampling in the file that happens
# before any set.seed() call.
set.seed(123)

# Simulate 963 brightness values from a normal distribution (mean 5, sd 2).
brillo <- rnorm(963, mean = 5, sd = 2)

# probability = TRUE puts the histogram on the density scale so that the
# kernel density curve added by lines() shares the same y-axis.
hist(brillo, probability = TRUE, main = "Histograma y Densidad de Brillo",
     col = "blue", breaks = 30, xlab = "Brillo")
lines(density(brillo), col = "red", lwd = 2)

Ejercicio 1

# Seed the generator, then build 100 standard-normal draws with three extreme
# values appended by hand.
set.seed(123)
valores <- c(rnorm(100), 50, 60, -45)

# First and third quartiles in a single quantile() call; single-bracket
# indexing keeps the "25%" / "75%" names on q1 and q3.
cuartiles <- quantile(valores, probs = c(0.25, 0.75))
q1 <- cuartiles[1]
q3 <- cuartiles[2]
iqr <- q3 - q1

# Fences located 1.5 * IQR beyond each quartile.
limite_inferior <- q1 - 1.5 * iqr
limite_superior <- q3 + 1.5 * iqr

# An observation is an outlier when it is NOT inside the closed interval
# [limite_inferior, limite_superior].
valores_atipicos <- valores[
  !(valores >= limite_inferior & valores <= limite_superior)
]

# Second-smallest outlier: arrange in ascending order and take position 2.
segundo_mas_pequeno <- valores_atipicos[order(valores_atipicos)][2]
print(segundo_mas_pequeno)

## [1] 50

# Box plot of the full sample, outliers included.
boxplot(
  valores,
  col = "lightblue",
  ylab = "Valores",
  main = "Diagrama de Caja de Valores"
)

Ejercicio 2

# Draw 963 brightness values from a normal distribution (mean 5, sd 2).
brillo <- rnorm(n = 963, mean = 5, sd = 2)

# Density-scaled histogram (freq = FALSE) with 30 requested breaks.
hist(
  brillo,
  breaks = 30,
  freq = FALSE,
  col = "blue",
  main = "Histograma y Densidad de Brillo",
  xlab = "Brillo"
)

# Overlay the kernel density estimate as a red curve.
lines(density(brillo), lwd = 2, col = "red")

Ejercicio 3

# Seeded simulation of 100 observations. The columns are filled one at a
# time, sodium first and then sugar, so the draws come off the RNG stream in
# that order.
set.seed(123)
datos <- data.frame(sodio = rnorm(100, mean = 50, sd = 10))
datos$azucar <- rnorm(100, mean = 30, sd = 5)

# Correlation between the two columns (cor() defaults to Pearson).
correlacion <- with(datos, cor(sodio, azucar))
print(correlacion)

## [1] -0.04953215

# Scatter plot of sugar against sodium with the least-squares line on top.
plot(
  azucar ~ sodio,
  data = datos,
  pch = 19,
  col = "darkgreen",
  main = "Sodio vs Azúcar",
  xlab = "Sodio",
  ylab = "Azúcar"
)
abline(lm(azucar ~ sodio, data = datos), col = "red")

Ejercicio 4

# Attach MASS and copy its UScereal data set into the workspace.
library(MASS)
data("UScereal", package = "MASS")

# Show the first six rows to inspect the available columns.
head(UScereal, n = 6L)

##                           mfr calories   protein      fat   sodium     fibre
## 100% Bran                   N 212.1212 12.121212 3.030303 393.9394 30.303030
## All-Bran                    K 212.1212 12.121212 3.030303 787.8788 27.272727
## All-Bran with Extra Fiber   K 100.0000  8.000000 0.000000 280.0000 28.000000
## Apple Cinnamon Cheerios     G 146.6667  2.666667 2.666667 240.0000  2.000000
## Apple Jacks                 K 110.0000  2.000000 0.000000 125.0000  1.000000
## Basic 4                     G 173.3333  4.000000 2.666667 280.0000  2.666667
##                              carbo   sugars shelf potassium vitamins
## 100% Bran                 15.15152 18.18182     3 848.48485 enriched
## All-Bran                  21.21212 15.15151     3 969.69697 enriched
## All-Bran with Extra Fiber 16.00000  0.00000     3 660.00000 enriched
## Apple Cinnamon Cheerios   14.00000 13.33333     1  93.33333 enriched
## Apple Jacks               11.00000 14.00000     2  30.00000 enriched
## Basic 4                   24.00000 10.66667     3 133.33333 enriched

# Fat content grouped by shelf: one box per distinct value of `shelf`.
boxplot(
  fat ~ shelf,
  data = UScereal,
  col = "lightblue",
  main = "Grasa vs Estantería"
)

# Scatter plot of sugars against sodium.
with(
  UScereal,
  plot(
    sodium, sugars,
    pch = 16,
    col = "red",
    main = "Sodio vs Azúcares",
    xlab = "Sodio",
    ylab = "Azúcares"
  )
)

Ejercicio 5

# Simulate 963 brightness readings from a normal distribution
# (mean 50, sd 15).
brillo <- rnorm(963, mean = 50, sd = 15)

# Quartiles and interquartile range; the fences sit 1.5 * IQR beyond them.
q1 <- quantile(brillo, 0.25)
q3 <- quantile(brillo, 0.75)
iqr <- q3 - q1
limite_inferior <- q1 - 1.5 * iqr
limite_superior <- q3 + 1.5 * iqr

# Keep only the observations that lie inside the fences (bounds included).
brillo_sin_atipicos <- brillo[
  brillo >= limite_inferior & brillo <= limite_superior
]
hist(brillo_sin_atipicos, main = "Histograma sin valores atípicos",
     col = "lightblue", xlab = "Brillo")

# Natural-log transform of the trimmed sample.
# NOTE(review): log() returns NaN for negative input; the trimmed values are
# expected to be positive for this mean/sd, but nothing here enforces it.
brillo_log <- log(brillo_sin_atipicos)

# par() returns the previous settings. Save them and restore them after the
# side-by-side comparison so the 1 x 2 layout does not leak into every figure
# drawn later in the session.
par_previo <- par(mfrow = c(1, 2))
hist(brillo_sin_atipicos, main = "Sin Transformar", col = "lightblue")
hist(brillo_log, main = "Transformación Logarítmica", col = "lightgreen")
par(par_previo)

Ejercicio 6

# Two independent normal samples of size 50; the generating means are 5 and 6
# and both use sd = 2.
vector1 <- rnorm(n = 50, mean = 5, sd = 2)
vector2 <- rnorm(n = 50, mean = 6, sd = 2)

# Two-sided Welch t-test. The arguments after the data restate t.test()'s
# defaults so the choice of test is visible in the code.
t_test <- t.test(
  vector1, vector2,
  alternative = "two.sided",
  var.equal = FALSE
)
print(t_test)

## 
##  Welch Two Sample t-test
## 
## data:  vector1 and vector2
## t = -1.9462, df = 97.856, p-value = 0.0545
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -1.56617685  0.01528155
## sample estimates:
## mean of x mean of y 
##  5.047986  5.823434

# Two samples of 50 binomial counts (10 trials each) with success
# probabilities 0.6 and 0.7.
binom1 <- rbinom(n = 50, size = 10, prob = 0.6)
binom2 <- rbinom(n = 50, size = 10, prob = 0.7)

# Compare the two sample means with a two-sided Welch t-test; the arguments
# after the data restate t.test()'s defaults.
t_test_binom <- t.test(
  binom1, binom2,
  alternative = "two.sided",
  var.equal = FALSE
)
print(t_test_binom)

## 
##  Welch Two Sample t-test
## 
## data:  binom1 and binom2
## t = -4.1362, df = 97.026, p-value = 7.515e-05
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -1.6574239 -0.5825761
## sample estimates:
## mean of x mean of y 
##      5.94      7.06

Ejercicio 7

# Seeded simulation of 26 rows. GDP per capita is drawn first and CO2
# emissions second, each from its own normal distribution.
set.seed(123)
datos_CO2 <- data.frame(PIB_per_capita = rnorm(26, mean = 30000, sd = 5000))
datos_CO2$emisiones_CO2 <- rnorm(26, mean = 15, sd = 3)

# Simple linear regression of emissions on GDP per capita, followed by the
# coefficient table and fit statistics.
modelo <- lm(formula = emisiones_CO2 ~ PIB_per_capita, data = datos_CO2)
summary(modelo)

## 
## Call:
## lm(formula = emisiones_CO2 ~ PIB_per_capita, data = datos_CO2)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -4.2165 -1.8721  0.0137  1.3820  4.3402 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     2.073e+01  2.876e+00   7.207  1.9e-07 ***
## PIB_per_capita -1.767e-04  9.617e-05  -1.837   0.0786 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.363 on 24 degrees of freedom
## Multiple R-squared:  0.1233, Adjusted R-squared:  0.08673 
## F-statistic: 3.374 on 1 and 24 DF,  p-value: 0.07864

# Scatter plot of emissions against GDP per capita.
with(
  datos_CO2,
  plot(
    PIB_per_capita, emisiones_CO2,
    pch = 19,
    col = "blue",
    main = "Regresión: PIB vs CO2",
    xlab = "PIB per cápita",
    ylab = "Emisiones de CO2"
  )
)

# Fitted regression line taken from the model object.
abline(modelo, col = "red")

Ejercicio 8

# Sample of 50 draws from a normal distribution (mean 10, sd 2).
datos <- rnorm(n = 50, mean = 10, sd = 2)

# Shapiro-Wilk normality test on that sample.
shapiro_test <- shapiro.test(x = datos)
print(shapiro_test)

## 
##  Shapiro-Wilk normality test
## 
## data:  datos
## W = 0.99123, p-value = 0.9705

# Power of a two-sample t-test with 30 observations per group, effect size
# d = 0.5 and a 5% significance level. `power` is left at its NULL default,
# which is what makes pwr.t.test() solve for it.
library(pwr)
pwr_test <- pwr.t.test(
  n = 30,
  d = 0.5,
  sig.level = 0.05,
  type = "two.sample"
)
print(pwr_test)

## 
##      Two-sample t test power calculation 
## 
##               n = 30
##               d = 0.5
##       sig.level = 0.05
##           power = 0.4778965
##     alternative = two.sided
## 
## NOTE: n is number in *each* group

Ejercicio 9

# 50 simulated rows: spending is uniform on [5000, 20000] and the test score
# is uniform on [50, 100]. Spending is drawn first, then the score.
educacion <- data.frame(gasto_educacion = runif(50, min = 5000, max = 20000))
educacion$puntaje_prueba <- runif(50, min = 50, max = 100)

# Simple linear regression of test score on education spending, followed by
# the coefficient table and fit statistics.
modelo_educacion <- lm(
  formula = puntaje_prueba ~ gasto_educacion,
  data = educacion
)
summary(modelo_educacion)

## 
## Call:
## lm(formula = puntaje_prueba ~ gasto_educacion, data = educacion)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -25.0386 -16.1703  -0.4602  14.5221  24.6253 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     78.7322190  7.5048956  10.491 5.16e-14 ***
## gasto_educacion -0.0003870  0.0005761  -0.672    0.505    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 16.32 on 48 degrees of freedom
## Multiple R-squared:  0.009313,   Adjusted R-squared:  -0.01133 
## F-statistic: 0.4512 on 1 and 48 DF,  p-value: 0.505

# Scatter plot of test score against education spending.
plot(
  puntaje_prueba ~ gasto_educacion,
  data = educacion,
  pch = 19,
  main = "Gasto en Educación vs Puntaje en Prueba",
  xlab = "Gasto en Educación (USD)",
  ylab = "Puntaje en Prueba"
)

# Fitted regression line taken from the model object.
abline(modelo_educacion, col = "red")

Taller de Análisis en R

Paula

`r Sys.Date()`