Este documento presenta la solución del Taller de Análisis en R, donde se abordan ejercicios relacionados con la manipulación de vectores, visualización de datos, análisis de correlación, regresión lineal y pruebas estadísticas.
En este taller se generarán gráficos que muestran distribuciones y relaciones entre variables, como histogramas, diagramas de densidad y gráficos de dispersión.
El siguiente ejemplo muestra un histograma y un gráfico de densidad de los datos generados:
# Example figure: histogram of simulated brightness values with a kernel
# density curve drawn on top.
# The generator is seeded so this figure is identical on every run; without
# a seed the sample (and therefore the plot) changed each time.
set.seed(123)
brillo <- rnorm(963, mean = 5, sd = 2)
# probability = TRUE puts the bars on the density scale, which makes them
# comparable to the curve added by lines().
hist(
  brillo,
  probability = TRUE,
  main = "Histograma y Densidad de Brillo",
  col = "blue",
  breaks = 30,
  xlab = "Brillo"
)
lines(density(brillo), col = "red", lwd = 2)
# Outlier detection with the 1.5 * IQR rule on a seeded sample: 100 standard
# normal draws plus three extreme values appended by hand.
set.seed(123)
valores <- c(rnorm(100), 50, 60, -45)

# Quartiles and the lower/upper fences derived from them.
q1 <- quantile(valores, probs = 0.25)
q3 <- quantile(valores, probs = 0.75)
iqr <- q3 - q1
limite_inferior <- q1 - 1.5 * iqr
limite_superior <- q3 + 1.5 * iqr

# Keep every observation that is NOT inside the [lower, upper] fence range.
valores_atipicos <- valores[
  !(valores >= limite_inferior & valores <= limite_superior)
]

# Second smallest of the flagged values (ascending sort, position 2).
segundo_mas_pequeno <- sort(valores_atipicos)[2]
print(segundo_mas_pequeno)
## [1] 50

boxplot(
  valores,
  col = "lightblue",
  ylab = "Valores",
  main = "Diagrama de Caja de Valores"
)
# Simulate 963 brightness values and show their distribution: a 30-break
# histogram on the density scale with a kernel density curve on top.
brillo <- rnorm(n = 963, mean = 5, sd = 2)
hist(
  brillo,
  breaks = 30,
  probability = TRUE,
  col = "blue",
  xlab = "Brillo",
  main = "Histograma y Densidad de Brillo"
)
lines(density(brillo), lwd = 2, col = "red")
# Correlation between two independently simulated variables (sodium and
# sugar), followed by a scatter plot with the least-squares line.
set.seed(123)
datos <- data.frame(
  sodio = rnorm(100, mean = 50, sd = 10),
  azucar = rnorm(100, mean = 30, sd = 5)
)

# cor() with its default method (Pearson).
correlacion <- cor(datos$sodio, datos$azucar)
print(correlacion)
## [1] -0.04953215

# The title and y-axis label carried mis-encoded accents; they are written
# here as proper UTF-8 text.
plot(
  datos$sodio, datos$azucar,
  main = "Sodio vs Azúcar",
  xlab = "Sodio", ylab = "Azúcar", pch = 19, col = "darkgreen"
)
# Fit through the formula/data interface rather than `$` inside the formula;
# the fitted line is the same, the model object just has clean term names.
abline(lm(azucar ~ sodio, data = datos), col = "red")
library(MASS)
# Load the UScereal data set from MASS into the workspace and preview its
# first six rows.
data("UScereal", package = "MASS")
head(UScereal, n = 6L)
## mfr calories protein fat sodium fibre
## 100% Bran N 212.1212 12.121212 3.030303 393.9394 30.303030
## All-Bran K 212.1212 12.121212 3.030303 787.8788 27.272727
## All-Bran with Extra Fiber K 100.0000 8.000000 0.000000 280.0000 28.000000
## Apple Cinnamon Cheerios G 146.6667 2.666667 2.666667 240.0000 2.000000
## Apple Jacks K 110.0000 2.000000 0.000000 125.0000 1.000000
## Basic 4 G 173.3333 4.000000 2.666667 280.0000 2.666667
## carbo sugars shelf potassium vitamins
## 100% Bran 15.15152 18.18182 3 848.48485 enriched
## All-Bran 21.21212 15.15151 3 969.69697 enriched
## All-Bran with Extra Fiber 16.00000 0.00000 3 660.00000 enriched
## Apple Cinnamon Cheerios 14.00000 13.33333 1 93.33333 enriched
## Apple Jacks 11.00000 14.00000 2 30.00000 enriched
## Basic 4 24.00000 10.66667 3 133.33333 enriched
# Fat content by shelf position: one box per value of `shelf`.
# The title had mis-encoded accents; it is written here as proper UTF-8.
boxplot(
  fat ~ shelf,
  data = UScereal,
  main = "Grasa vs Estantería",
  col = "lightblue"
)

# Sodium against sugars, one point per cereal (labels likewise repaired).
plot(
  UScereal$sodium, UScereal$sugars,
  main = "Sodio vs Azúcares",
  xlab = "Sodio", ylab = "Azúcares", col = "red", pch = 16
)
# Remove outliers from a simulated brightness sample with the 1.5 * IQR rule,
# then compare the cleaned data before and after a log transformation.
# No seed is set here: the draw continues the random stream of the code that
# runs before it, so adding one would change every random result after it.
brillo <- rnorm(963, mean = 50, sd = 15)

# Quartiles and fences.
q1 <- quantile(brillo, 0.25)
q3 <- quantile(brillo, 0.75)
iqr <- q3 - q1
limite_inferior <- q1 - 1.5 * iqr
limite_superior <- q3 + 1.5 * iqr

# Keep only the observations inside the fences (bounds inclusive).
brillo_sin_atipicos <- brillo[
  brillo >= limite_inferior & brillo <= limite_superior
]
# Plot titles below had mis-encoded accents; they are now proper UTF-8.
hist(
  brillo_sin_atipicos,
  main = "Histograma sin valores atípicos",
  col = "lightblue",
  xlab = "Brillo"
)

# NOTE(review): log() yields NaN for negative input and -Inf for zero; this
# presumes the retained values are positive -- confirm if the parameters
# of the simulation change.
brillo_log <- log(brillo_sin_atipicos)

# Two panels side by side. par() returns the previous settings, which are
# restored afterwards so later figures are not squeezed into a 1 x 2 layout.
parametros_previos <- par(mfrow = c(1, 2))
hist(brillo_sin_atipicos, main = "Sin Transformar", col = "lightblue")
hist(brillo_log, main = "Transformación Logarítmica", col = "lightgreen")
par(parametros_previos)
# Compare the means of two simulated normal samples (50 draws each, true
# means 5 and 6) with t.test() at its defaults; the recorded output shows
# this is a Welch two-sample test.
vector1 <- rnorm(n = 50, mean = 5, sd = 2)
vector2 <- rnorm(n = 50, mean = 6, sd = 2)
t_test <- t.test(x = vector1, y = vector2)
print(t_test)
##
## Welch Two Sample t-test
##
## data: vector1 and vector2
## t = -1.9462, df = 97.856, p-value = 0.0545
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -1.56617685 0.01528155
## sample estimates:
## mean of x mean of y
## 5.047986 5.823434
# Same two-sample comparison, this time on binomial counts: 50 draws each
# from Binomial(10, 0.6) and Binomial(10, 0.7).
binom1 <- rbinom(n = 50, size = 10, prob = 0.6)
binom2 <- rbinom(n = 50, size = 10, prob = 0.7)
t_test_binom <- t.test(x = binom1, y = binom2)
print(t_test_binom)
##
## Welch Two Sample t-test
##
## data: binom1 and binom2
## t = -4.1362, df = 97.026, p-value = 7.515e-05
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -1.6574239 -0.5825761
## sample estimates:
## mean of x mean of y
## 5.94 7.06
# Simple linear regression of simulated CO2 emissions on simulated GDP per
# capita (26 observations, both columns drawn independently after seeding).
set.seed(123)
datos_CO2 <- data.frame(
  PIB_per_capita = rnorm(n = 26, mean = 30000, sd = 5000),
  emisiones_CO2 = rnorm(n = 26, mean = 15, sd = 3)
)

# Fit and report the model; summary() shows coefficients and fit statistics.
modelo <- lm(
  formula = emisiones_CO2 ~ PIB_per_capita,
  data = datos_CO2
)
summary(modelo)
##
## Call:
## lm(formula = emisiones_CO2 ~ PIB_per_capita, data = datos_CO2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.2165 -1.8721 0.0137 1.3820 4.3402
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.073e+01 2.876e+00 7.207 1.9e-07 ***
## PIB_per_capita -1.767e-04 9.617e-05 -1.837 0.0786 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.363 on 24 degrees of freedom
## Multiple R-squared: 0.1233, Adjusted R-squared: 0.08673
## F-statistic: 3.374 on 1 and 24 DF, p-value: 0.07864
# Scatter plot of the data behind `modelo`, with the fitted regression line.
# The title and x-axis label had mis-encoded accents; they are written here
# as proper UTF-8 text.
plot(
  datos_CO2$PIB_per_capita, datos_CO2$emisiones_CO2,
  main = "Regresión: PIB vs CO2",
  xlab = "PIB per cápita", ylab = "Emisiones de CO2", pch = 19, col = "blue"
)
abline(modelo, col = "red")
# Shapiro-Wilk normality test on 50 simulated normal draws.
# Note: this assignment replaces whatever `datos` held before.
datos <- rnorm(n = 50, mean = 10, sd = 2)
shapiro_test <- shapiro.test(x = datos)
print(shapiro_test)
##
## Shapiro-Wilk normality test
##
## data: datos
## W = 0.99123, p-value = 0.9705
library(pwr)
# Power of a two-sample t test with 30 observations per group, effect size
# d = 0.5 and a 5% significance level. `power = NULL` marks power as the
# quantity to be solved for.
pwr_test <- pwr.t.test(
  n = 30,
  d = 0.5,
  sig.level = 0.05,
  power = NULL,
  type = "two.sample"
)
print(pwr_test)
##
## Two-sample t test power calculation
##
## n = 30
## d = 0.5
## sig.level = 0.05
## power = 0.4778965
## alternative = two.sided
##
## NOTE: n is number in *each* group
# Simple linear regression of a simulated test score on simulated education
# spending; both columns are independent uniform draws (50 rows).
educacion <- data.frame(
  gasto_educacion = runif(n = 50, min = 5000, max = 20000),
  puntaje_prueba = runif(n = 50, min = 50, max = 100)
)

# Fit and report the model.
modelo_educacion <- lm(
  formula = puntaje_prueba ~ gasto_educacion,
  data = educacion
)
summary(modelo_educacion)
##
## Call:
## lm(formula = puntaje_prueba ~ gasto_educacion, data = educacion)
##
## Residuals:
## Min 1Q Median 3Q Max
## -25.0386 -16.1703 -0.4602 14.5221 24.6253
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 78.7322190 7.5048956 10.491 5.16e-14 ***
## gasto_educacion -0.0003870 0.0005761 -0.672 0.505
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 16.32 on 48 degrees of freedom
## Multiple R-squared: 0.009313, Adjusted R-squared: -0.01133
## F-statistic: 0.4512 on 1 and 48 DF, p-value: 0.505
# Scatter plot of the data behind `modelo_educacion`, with the fitted line.
# The title and x-axis label had mis-encoded accents; they are written here
# as proper UTF-8 text.
plot(
  educacion$gasto_educacion, educacion$puntaje_prueba,
  main = "Gasto en Educación vs Puntaje en Prueba",
  xlab = "Gasto en Educación (USD)", ylab = "Puntaje en Prueba", pch = 19
)
abline(modelo_educacion, col = "red")