1. Carga y exploración inicial

# Read the survey workbook and pass it through clean_names() so the
# columns get the normalised names used in the rest of the analysis.
encuesta <- "encuesta_habitos.xls" %>%
  read_excel() %>%
  clean_names()
# Compact overview: one line per column with its type and first values.
glimpse(encuesta)

## Rows: 64
## Columns: 11
## $ id                    <dbl> 114, 139, 104, 19, 89, 241, 176, 41, 242, 6, 163…
## $ genero                <chr> "M", "F", "M", "M", "M", "M", "M", "F", "F", "M"…
## $ edad                  <dbl> 51, 42, 40, 50, 35, 20, 34, 38, 34, 37, 32, 33, …
## $ estrato               <dbl> 5, 6, 5, 2, 4, 2, 3, 1, 3, 1, 3, 2, 3, 1, 3, 1, …
## $ tiempo_celular        <dbl> 3.8, 2.9, 4.4, 4.4, 3.7, 2.9, 3.7, 3.6, 3.7, 2.4…
## $ tiempo_tv             <dbl> 2.1, 0.0, 2.9, 2.1, 1.3, 3.0, 1.4, 1.7, 1.2, 2.2…
## $ tiempo_internet       <dbl> 2.2, 3.6, 3.9, 2.6, 2.4, 2.0, 3.2, 1.9, 2.7, 3.4…
## $ tiempo_lectura        <dbl> 0.3, 2.1, 1.0, 0.0, 0.0, 0.7, 0.4, 0.0, 0.7, 0.1…
## $ tiempo_deporte        <dbl> 2.2, 2.7, 0.6, 0.1, 0.6, 0.6, 0.6, 0.6, 0.7, 0.1…
## $ tiempo_aire_libre     <dbl> 1.9, 1.8, 0.8, 0.0, 0.9, 2.6, 0.6, 0.0, 0.8, 1.7…
## $ tiempo_total_pantalla <dbl> 8.1, 6.5, 11.2, 9.1, 7.4, 7.9, 8.3, 7.2, 7.6, 8.…

# Preview the first six rows of the survey data.
head(encuesta, n = 6L)

## # A tibble: 6 × 11
##      id genero  edad estrato tiempo_celular tiempo_tv tiempo_internet
##   <dbl> <chr>  <dbl>   <dbl>          <dbl>     <dbl>           <dbl>
## 1   114 M         51       5            3.8       2.1             2.2
## 2   139 F         42       6            2.9       0               3.6
## 3   104 M         40       5            4.4       2.9             3.9
## 4    19 M         50       2            4.4       2.1             2.6
## 5    89 M         35       4            3.7       1.3             2.4
## 6   241 M         20       2            2.9       3               2  
## # ℹ 4 more variables: tiempo_lectura <dbl>, tiempo_deporte <dbl>,
## #   tiempo_aire_libre <dbl>, tiempo_total_pantalla <dbl>

# Count missing values per column. vapply() pins the result to one integer
# per column, so the return type cannot silently change the way sapply()'s
# can (e.g. on a data frame with zero columns).
vapply(encuesta, function(columna) sum(is.na(columna)), integer(1))

##                    id                genero                  edad 
##                     0                     0                     0 
##               estrato        tiempo_celular             tiempo_tv 
##                     0                     0                     0 
##       tiempo_internet        tiempo_lectura        tiempo_deporte 
##                     0                     0                     0 
##     tiempo_aire_libre tiempo_total_pantalla 
##                     0                     0

2. Análisis Exploratorio Adicional

2.1 Correlaciones

# Correlation matrix of the selected time-use variables. Each pair uses
# every row where both variables are observed (pairwise deletion).
num_vars <- encuesta %>%
  select(
    tiempo_internet,
    tiempo_lectura,
    tiempo_aire_libre,
    tiempo_celular,
    tiempo_total_pantalla
  )
corr_mat <- cor(num_vars, use = "pairwise.complete.obs")
print(corr_mat)

##                       tiempo_internet tiempo_lectura tiempo_aire_libre
## tiempo_internet                 1.000         0.1154            0.1087
## tiempo_lectura                  0.115         1.0000           -0.0464
## tiempo_aire_libre               0.109        -0.0464            1.0000
## tiempo_celular                  0.203        -0.0899            0.1680
## tiempo_total_pantalla           0.577        -0.1364            0.1737
##                       tiempo_celular tiempo_total_pantalla
## tiempo_internet               0.2031                 0.577
## tiempo_lectura               -0.0899                -0.136
## tiempo_aire_libre             0.1680                 0.174
## tiempo_celular                1.0000                 0.757
## tiempo_total_pantalla         0.7573                 1.000

2.2 Heatmap de correlaciones

# Colour-coded view of corr_mat with each coefficient printed in its cell.
corrplot(
  corr_mat,
  method = "color",
  title = "Mapa de calor de correlaciones",
  addCoef.col = "black",
  number.cex = 0.7,
  tl.col = "black",
  tl.srt = 45,
  mar = c(0, 0, 1, 0) # extra top margin so the title is not clipped
)

2.3 Gráficos de caja por género

# Box plots of internet time, one box per gender. The legend is hidden
# because the fill colour only repeats the x-axis grouping.
encuesta %>%
  ggplot(aes(x = genero, y = tiempo_internet, fill = genero)) +
  geom_boxplot(alpha = 0.6) +
  theme(legend.position = "none") +
  labs(title = "Distribución de tiempo en internet por género")

3. Análisis Descriptivo Numérico (Tiempo en internet)

# Location, spread and shape statistics for internet time.
media_ti <- mean(encuesta[["tiempo_internet"]], na.rm = TRUE)
mediana_ti <- median(encuesta[["tiempo_internet"]], na.rm = TRUE)
# Mode: the value with the highest frequency in the table of observed values.
moda_ti <- as.numeric(
  names(
    sort(table(encuesta[["tiempo_internet"]]), decreasing = TRUE)[1]
  )
)
sd_ti <- sd(encuesta[["tiempo_internet"]], na.rm = TRUE)
var_ti <- var(encuesta[["tiempo_internet"]], na.rm = TRUE)
rango_ti <- range(encuesta[["tiempo_internet"]], na.rm = TRUE) # c(min, max)
iqr_ti <- IQR(encuesta[["tiempo_internet"]], na.rm = TRUE)
# NOTE(review): skewness()/kurtosis() presumably come from the moments
# package, whose kurtosis is not excess kurtosis (normal = 3) - confirm.
asim_ti <- skewness(encuesta[["tiempo_internet"]], na.rm = TRUE)
curt_ti <- kurtosis(encuesta[["tiempo_internet"]], na.rm = TRUE)
# Collect everything in a two-column summary table.
tibble(
  Estadístico = c(
    "Media", "Mediana", "Moda", "SD", "Varianza",
    "Min", "Max", "IQR", "Asimetría", "Curtosis"
  ),
  Valor = c(
    media_ti, mediana_ti, moda_ti, sd_ti, var_ti,
    rango_ti[1], rango_ti[2], iqr_ti, asim_ti, curt_ti
  )
)

## # A tibble: 10 × 2
##    Estadístico  Valor
##    <chr>        <dbl>
##  1 Media        2.84 
##  2 Mediana      2.85 
##  3 Moda         2.5  
##  4 SD           0.781
##  5 Varianza     0.61 
##  6 Min          0.9  
##  7 Max          4.4  
##  8 IQR          1.05 
##  9 Asimetría   -0.222
## 10 Curtosis     2.50

4. Prueba de Normalidad y Outliers

# Shapiro-Wilk normality test for internet time. The `$` form is kept
# because the test prints its argument expression as the data label.
shapiro <- shapiro.test(encuesta$tiempo_internet)
# Points flagged as outliers by the box-plot statistics.
outliers <- boxplot.stats(encuesta[["tiempo_internet"]])[["out"]]
list(
  Shapiro = shapiro,
  Outliers = outliers
)

## $Shapiro
## 
##  Shapiro-Wilk normality test
## 
## data:  encuesta$tiempo_internet
## W = 1, p-value = 0.6
## 
## 
## $Outliers
## numeric(0)

5. Intervalo de Confianza 95% (Tiempo de lectura)

# 95% t-based confidence interval for the mean reading time.
n <- length(na.omit(encuesta$tiempo_lectura)) # non-missing observations
xbar <- mean(encuesta$tiempo_lectura, na.rm = TRUE)
s <- sd(encuesta$tiempo_lectura, na.rm = TRUE)
# Margin of error: t quantile (two-sided, n - 1 df) times the standard error.
error <- qt(0.975, df = n - 1) * s / sqrt(n)
ic <- c(xbar - error, xbar + error)
ic

## [1] 0.775 1.175

6. Comparación de Medias al 99% (Aire Libre por Género)

# Levene's test for equal variances of outdoor time between genders.
# `genero` is a character column; converting it to a factor explicitly
# avoids the "group coerced to factor" warning from leveneTest().
levene <- leveneTest(tiempo_aire_libre ~ factor(genero), data = encuesta)

## Warning in leveneTest.default(y = y, group = group, ...): group coerced to
## factor.

# Use the pooled-variance t-test only when Levene's test does not reject
# equal variances at the 5% level; otherwise t.test() applies Welch's
# correction.
var_equal <- levene$`Pr(>F)`[1] > 0.05
ttest <- t.test(
  tiempo_aire_libre ~ genero,
  data = encuesta,
  var.equal = var_equal,
  conf.level = 0.99
)
# effsize:: is spelled out because psych (also attached in this session)
# exports its own cohen.d() with a different interface; relying on attach
# order would break `ef_size$estimate` if psych were loaded last.
# conf.level now matches the 99% level used for the t-test (the default
# is 95%); the point estimate is unaffected.
ef_size <- effsize::cohen.d(
  tiempo_aire_libre ~ genero,
  data = encuesta,
  conf.level = 0.99
)
list(Levene = levene, TTest = ttest, Cohen_d = ef_size)

## $Levene
## Levene's Test for Homogeneity of Variance (center = median)
##       Df F value Pr(>F)
## group  1     0.2   0.66
##       62               
## 
## $TTest
## 
##  Two Sample t-test
## 
## data:  tiempo_aire_libre by genero
## t = 1, df = 62, p-value = 0.3
## alternative hypothesis: true difference in means between group F and group M is not equal to 0
## 99 percent confidence interval:
##  -0.324  0.751
## sample estimates:
## mean in group F mean in group M 
##            1.46            1.25 
## 
## 
## $Cohen_d
## 
## Cohen's d
## 
## d estimate: 0.266 (small)
## 95 percent confidence interval:
##  lower  upper 
## -0.240  0.772

7. Homogeneidad de Varianzas (tiempo_celular)

# F test comparing the variance of cellphone time between genders.
# The arguments after `data` restate var.test()'s defaults.
var.test(
  tiempo_celular ~ genero,
  data = encuesta,
  ratio = 1,
  alternative = "two.sided",
  conf.level = 0.95
)

## 
##  F test to compare two variances
## 
## data:  tiempo_celular by genero
## F = 1, num df = 27, denom df = 35, p-value = 0.6
## alternative hypothesis: true ratio of variances is not equal to 1
## 95 percent confidence interval:
##  0.592 2.518
## sample estimates:
## ratio of variances 
##                1.2

8. Proporciones >3h en Internet por Género

# Create a binary indicator: 1 when internet time is above 3, 0 otherwise.
encuesta <- mutate(
  encuesta,
  high_tiempo_internet = as.numeric(tiempo_internet > 3)
)

# Check the distinct values of genero and of the new indicator.
unique(encuesta[["genero"]])

## [1] "M" "F"

# Distinct values taken by the binary indicator.
unique(encuesta[["high_tiempo_internet"]])

## [1] 0 1

# Contingency table: gender in rows, binary indicator in columns.
tabla <- table(
  encuesta[["genero"]],
  encuesta[["high_tiempo_internet"]]
)

# Print it to check the row and column labels.
print(tabla)

##    
##      0  1
##   F 13 15
##   M 23 13

# Pull the inputs for prop.test() out of the contingency table.
# Row and column labels of the table.
gen_vals <- rownames(tabla)
bin_vals <- colnames(tabla)

# Successes per gender: counts in the table's second column, for the
# first two rows. Group sizes are the row totals.
x <- unname(tabla[gen_vals[1:2], bin_vals[2]])
n <- rowSums(tabla)

# Two-sample test of equal proportions, no continuity correction, 99% CI.
prop.test(x = x, n = n, correct = FALSE, conf.level = 0.99)

## 
##  2-sample test for equality of proportions without continuity correction
## 
## data:  x out of n
## X-squared = 2, df = 1, p-value = 0.2
## alternative hypothesis: two.sided
## 99 percent confidence interval:
##  -0.144  0.493
## sample estimates:
## prop 1 prop 2 
##  0.536  0.361

9. Regresión Lineal

# Simple linear regression of total screen time on internet time.
modelo <- lm(
  formula = tiempo_total_pantalla ~ tiempo_internet,
  data = encuesta
)
summary(modelo)

## 
## Call:
## lm(formula = tiempo_total_pantalla ~ tiempo_internet, data = encuesta)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -3.009 -0.739  0.001  0.671  4.231 
## 
## Coefficients:
##                 Estimate Std. Error t value      Pr(>|t|)    
## (Intercept)        5.189      0.635    8.17 0.00000000002 ***
## tiempo_internet    1.200      0.216    5.56 0.00000060908 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.34 on 62 degrees of freedom
## Multiple R-squared:  0.333,  Adjusted R-squared:  0.322 
## F-statistic: 30.9 on 1 and 62 DF,  p-value: 0.000000609

# Scatter plot of the two variables with the fitted least-squares line.
encuesta %>%
  ggplot(aes(x = tiempo_internet, y = tiempo_total_pantalla)) +
  geom_point() +
  geom_smooth(method = "lm") +
  labs(title = "Regresión lineal")

## `geom_smooth()` using formula = 'y ~ x'

# Residuals against fitted values, with a dashed reference line at zero.
ggplot(augment(modelo), aes(x = .fitted, y = .resid)) +
  geom_point() +
  geom_hline(yintercept = 0, linetype = "dashed") +
  labs(title = "Residuos vs Ajustados")

10. Análisis de Poder Estadístico

# Power of the two-sample t-test on outdoor time by gender at alpha = 0.01,
# using the observed effect size.
# pwr.t.test() reads `n` as the number of observations in *each* group, so
# passing the total sample size overstates the power. The two gender groups
# also differ in size, so pwr.t2n.test() is used with the actual counts.
# NOTE: the rendered output below (n = 64 per group) predates this fix;
# re-knit the report to refresh it.
d <- abs(ef_size$estimate)
n_por_genero <- table(encuesta$genero[!is.na(encuesta$tiempo_aire_libre)])
stopifnot(length(n_por_genero) == 2L)
pwr.t2n.test(
  n1 = n_por_genero[[1]],
  n2 = n_por_genero[[2]],
  d = d,
  sig.level = 0.01
)

## 
##      Two-sample t test power calculation 
## 
##               n = 64
##               d = 0.266
##       sig.level = 0.01
##           power = 0.138
##     alternative = two.sided
## 
## NOTE: n is number in *each* group

11. Reproducibilidad y Sesión

# Record the R version, platform and package versions used for this report.
utils::sessionInfo()

## R version 4.4.3 (2025-02-28 ucrt)
## Platform: x86_64-w64-mingw32/x64
## Running under: Windows 11 x64 (build 26100)
## 
## Matrix products: default
## 
## 
## locale:
## [1] LC_COLLATE=English_United States.utf8 
## [2] LC_CTYPE=English_United States.utf8   
## [3] LC_MONETARY=English_United States.utf8
## [4] LC_NUMERIC=C                          
## [5] LC_TIME=English_United States.utf8    
## 
## time zone: America/Bogota
## tzcode source: internal
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] effsize_0.8.1  corrplot_0.95  pwr_1.3-0      psych_2.5.3    broom_1.0.8   
##  [6] car_3.1-3      carData_3.0-5  moments_0.14.1 janitor_2.2.1  readxl_1.4.5  
## [11] ggplot2_3.5.2  dplyr_1.1.4   
## 
## loaded via a namespace (and not attached):
##  [1] utf8_1.2.5         sass_0.4.10        generics_0.1.4     tidyr_1.3.1       
##  [5] stringi_1.8.7      lattice_0.22-6     digest_0.6.37      magrittr_2.0.3    
##  [9] evaluate_1.0.3     grid_4.4.3         timechange_0.3.0   RColorBrewer_1.1-3
## [13] fastmap_1.2.0      Matrix_1.7-2       cellranger_1.1.0   jsonlite_2.0.0    
## [17] backports_1.5.0    Formula_1.2-5      mgcv_1.9-1         purrr_1.0.4       
## [21] scales_1.4.0       jquerylib_0.1.4    abind_1.4-8        mnormt_2.1.1      
## [25] cli_3.6.5          rlang_1.1.6        splines_4.4.3      withr_3.0.2       
## [29] cachem_1.1.0       yaml_2.3.10        tools_4.4.3        parallel_4.4.3    
## [33] vctrs_0.6.5        R6_2.6.1           lifecycle_1.0.4    lubridate_1.9.4   
## [37] snakecase_0.11.1   stringr_1.5.1      pkgconfig_2.0.3    pillar_1.10.2     
## [41] bslib_0.9.0        gtable_0.3.6       glue_1.8.0         xfun_0.52         
## [45] tibble_3.2.1       tidyselect_1.2.1   knitr_1.50         farver_2.1.2      
## [49] nlme_3.1-167       htmltools_0.5.8.1  labeling_0.4.3     rmarkdown_2.29    
## [53] compiler_4.4.3

Taller 3 – Hábitos Diarios en el Uso del Tiempo Libre

Andersson Sánchez, Cristian Pedraza, David Correa, Juan Sánchez

10 de mayo de 2025

1. Carga y exploración inicial

2. Análisis Exploratorio Adicional

2.1 Correlaciones

2.2 Heatmap de correlaciones

2.3 Gráficos de caja por género

3. Análisis Descriptivo Numérico (Tiempo en internet)

4. Prueba de Normalidad y Outliers

5. Intervalo de Confianza 95% (Tiempo de lectura)

6. Comparación de Medias al 99% (Aire Libre por Género)

7. Homogeneidad de Varianzas (tiempo_celular)

8. Proporciones >3h en Internet por Género

9. Regresión Lineal

10. Análisis de Poder Estadístico

11. Reproducibilidad y Sesión