1 1. Base de datos seleccionada
2 2. Indicadores estadísticos
3 3. Relación lineal entre variables cuantitativas
4 4. Gráficos
5 5. Diagrama de cajas por variable cualitativa binaria
6 6. Comparaciones de medias por grupo (opcional)
7 7. Modelo lineal simple (opcional)

1 1. Base de datos seleccionada

# Load the student-performance CSV, keeping text columns as character vectors.
data <- read.csv(
  file = "student_performance_300.csv",
  stringsAsFactors = FALSE
)

# Show each column's type together with its first few values.
str(data)

## 'data.frame':    300 obs. of  10 variables:
##  $ Student_ID                : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ Gender                    : chr  "Male" "Female" "Male" "Male" ...
##  $ Study_Hours_per_Week      : num  10.2 13.2 8.4 14.2 16.9 8.9 8.7 14.4 14.7 8.4 ...
##  $ Attendance_Rate           : num  87 95.5 96.1 96.9 91.4 73.6 100 73.5 88 77.5 ...
##  $ Past_Exam_Scores          : int  88 76 76 83 66 60 57 46 95 57 ...
##  $ Parental_Education_Level  : chr  "Low" "High" "Medium" "Medium" ...
##  $ Internet_Access_at_Home   : chr  "No" "Yes" "Yes" "Yes" ...
##  $ Extracurricular_Activities: chr  "No" "No" "Yes" "No" ...
##  $ Final_Exam_Score          : int  76 67 64 74 74 56 44 60 80 54 ...
##  $ Pass_Fail                 : chr  "Pass" "Pass" "Pass" "Pass" ...

# Preview the first rows of the data set.
head(data)

# Number of rows (observations) and columns (variables), then report both.
n_obs <- dim(data)[1]
n_vars <- dim(data)[2]
cat("Observaciones:", n_obs, "\nVariables:", n_vars, "\n")

## Observaciones: 300 
## Variables: 10

Variables cualitativas (categóricas): Gender, Parental_Education_Level, Internet_Access_at_Home, Extracurricular_Activities, Pass_Fail.
Variables cuantitativas: Study_Hours_per_Week, Attendance_Rate, Past_Exam_Scores, Final_Exam_Score.

Este dataset tiene 300 observaciones (≥ 100 ✅).

2 2. Indicadores estadísticos

# Names of the quantitative columns; reused by the correlation and
# group-summary steps further down.
num_vars <- c(
  "Study_Hours_per_Week",
  "Attendance_Rate",
  "Past_Exam_Scores",
  "Final_Exam_Score"
)

# Per-column summary table produced by psych::describe().
desc <- psych::describe(data[num_vars])
desc

3 3. Relación lineal entre variables cuantitativas

# Pearson correlation matrix of the quantitative columns; each coefficient
# uses the rows that are complete for that particular pair of columns.
cor_mat <- cor(
  data[num_vars],
  method = "pearson",
  use = "pairwise.complete.obs"
)
cor_mat

##                      Study_Hours_per_Week Attendance_Rate Past_Exam_Scores
## Study_Hours_per_Week           1.00000000    -0.066741036      0.112167475
## Attendance_Rate               -0.06674104     1.000000000      0.005742008
## Past_Exam_Scores               0.11216747     0.005742008      1.000000000
## Final_Exam_Score               0.55467932    -0.022037577      0.691077196
##                      Final_Exam_Score
## Study_Hours_per_Week       0.55467932
## Attendance_Rate           -0.02203758
## Past_Exam_Scores           0.69107720
## Final_Exam_Score           1.00000000

# Reshape the correlation matrix into long form: one row per (var1, var2, r),
# sorted by decreasing absolute correlation.
#
# stringsAsFactors = FALSE keeps the variable names as character vectors.
# With the default factor columns, `var1 < var2` evaluates to NA for every
# row ("'<' not meaningful for factors"), so filter() discards all rows and
# cor_df ends up empty.
cor_df <- as.data.frame(as.table(cor_mat), stringsAsFactors = FALSE) %>%
  dplyr::rename(var1 = Var1, var2 = Var2, r = Freq) %>%
  # String comparison keeps each unordered pair once and drops the diagonal.
  dplyr::filter(var1 < var2) %>%
  dplyr::mutate(abs_r = abs(r)) %>%
  dplyr::arrange(dplyr::desc(abs_r))

# Keep only the pairs whose absolute correlation exceeds 0.55.
pares_fuertes <- cor_df %>%
  dplyr::filter(abs_r > 0.55)
pares_fuertes

# Extract the column names explicitly as character strings.
# best_pair is the first row of cor_df, i.e. the pair with the largest |r|
# given that cor_df is sorted by decreasing abs_r.
# NOTE(review): the "subscript out of bounds" errors recorded below show that
# best_pair had zero rows when this ran, i.e. cor_df was empty at this point.
best_pair <- cor_df %>% dplyr::slice(1)
x_name <- as.character(best_pair$var1[[1]])

## Error in `[[.default`(best_pair$var1, 1): subscript out of bounds

y_name <- as.character(best_pair$var2[[1]])

## Error in `[[.default`(best_pair$var2, 1): subscript out of bounds

# Correlation of the selected pair, rounded to three decimals for display.
r_val  <- round(best_pair$r[[1]], 3)

## Error in best_pair$r[[1]]: subscript out of bounds

cat("Par con mayor |r|:", x_name, "vs", y_name, "-> r =", r_val, "\n")

## Error: object 'x_name' not found

4 4. Gráficos

# Histogram of the final exam score, using bins 5 units wide.
ggplot(data, aes(x = Final_Exam_Score)) +
  geom_histogram(binwidth = 5, color = "black") +
  ggtitle("Histograma: Final_Exam_Score") +
  xlab("Final_Exam_Score") +
  ylab("Frecuencia")

# Histogram of weekly study hours, using bins 2 units wide.
ggplot(data, aes(x = Study_Hours_per_Week)) +
  geom_histogram(binwidth = 2, color = "black") +
  ggtitle("Histograma: Study_Hours_per_Week") +
  xlab("Study_Hours_per_Week") +
  ylab("Frecuencia")

# Scatter plot of the selected pair of variables with a fitted straight line.
# The columns are looked up by name through the .data pronoun because x_name
# and y_name hold the column names as strings.
ggplot(data, aes(x = .data[[x_name]], y = .data[[y_name]])) +
  geom_point(alpha = 0.7) +
  geom_smooth(method = "lm", se = FALSE) +
  ggtitle(paste("Dispersión:", x_name, "vs", y_name, "(r =", r_val, ")")) +
  xlab(x_name) +
  ylab(y_name)

## Error: object 'x_name' not found

5 5. Diagrama de cajas por variable cualitativa binaria

# Box plot of the final exam score for each Pass_Fail group. The fill legend
# is hidden because it would only repeat the x-axis categories.
ggplot(data, aes(x = Pass_Fail, y = Final_Exam_Score, fill = Pass_Fail)) +
  geom_boxplot(alpha = 0.85) +
  ggtitle("Boxplot: Final_Exam_Score por Pass/Fail") +
  xlab("Pass_Fail") +
  ylab("Final_Exam_Score") +
  guides(fill = "none")

6 6. Comparaciones de medias por grupo (opcional)

# Mean and standard deviation of every quantitative column per Pass_Fail
# group; result columns are named "<column>_media" and "<column>_sd".
data %>%
  dplyr::group_by(Pass_Fail) %>%
  dplyr::summarise(
    dplyr::across(
      dplyr::all_of(num_vars),
      list(media = mean, sd = sd),
      .names = "{.col}_{.fn}"
    )
  )

7 7. Modelo lineal simple (opcional)

# Build the model formula "<y_name> ~ <x_name>" from the column names held
# as strings in y_name and x_name.
# NOTE(review): the errors recorded below cascade from x_name / y_name not
# being defined when this ran.
f <- as.formula(paste(y_name, "~", x_name))

## Error: object 'y_name' not found

# Fit the simple linear regression on the full data set.
m <- lm(f, data = data)

## Error in eval(mf, parent.frame()): object 'f' not found

# Tidy summary of the fitted model via broom::tidy().
broom::tidy(m)

## Error: object 'm' not found

# One-row model-level summary via broom::glance().
broom::glance(m)

## Error: object 'm' not found

Primer Avance del Proyecto de Estadística - Dataset Sintético (v2)

Francesco Marmolejo y Nicolás Garzón

09 September 2025