# Read the student-performance CSV into `data`; text columns are kept as
# character vectors rather than being converted to factors.
data <- read.csv(
  file = "student_performance_300.csv",
  stringsAsFactors = FALSE
)

# Print a compact overview: dimensions, column names, types, first values.
str(object = data)
## 'data.frame': 300 obs. of 10 variables:
## $ Student_ID : int 1 2 3 4 5 6 7 8 9 10 ...
## $ Gender : chr "Male" "Female" "Male" "Male" ...
## $ Study_Hours_per_Week : num 10.2 13.2 8.4 14.2 16.9 8.9 8.7 14.4 14.7 8.4 ...
## $ Attendance_Rate : num 87 95.5 96.1 96.9 91.4 73.6 100 73.5 88 77.5 ...
## $ Past_Exam_Scores : int 88 76 76 83 66 60 57 46 95 57 ...
## $ Parental_Education_Level : chr "Low" "High" "Medium" "Medium" ...
## $ Internet_Access_at_Home : chr "No" "Yes" "Yes" "Yes" ...
## $ Extracurricular_Activities: chr "No" "No" "Yes" "No" ...
## $ Final_Exam_Score : int 76 67 64 74 74 56 44 60 80 54 ...
## $ Pass_Fail : chr "Pass" "Pass" "Pass" "Pass" ...
# Show the first rows of the data set.
head(x = data)

# Store the number of rows (observations) and columns (variables), then
# report both on the console.
n_obs <- dim(data)[1L]
n_vars <- dim(data)[2L]
cat(
  "Observaciones:", n_obs,
  "\nVariables:", n_vars,
  "\n"
)
## Observaciones: 300
## Variables: 10
Variables cualitativas (categóricas):
Gender, Parental_Education_Level,
Internet_Access_at_Home,
Extracurricular_Activities, Pass_Fail.
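Variables cuantitativas:
Study_Hours_per_Week, Attendance_Rate,
Past_Exam_Scores, Final_Exam_Score.
Student_ID es únicamente un identificador de cada estudiante, por lo que no se incluye en el análisis.
Este dataset tiene 300 observaciones (≥ 100 ✅).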
# Names of the numeric columns used for the descriptive statistics and the
# correlation analysis.
num_vars <- c(
  "Study_Hours_per_Week",
  "Attendance_Rate",
  "Past_Exam_Scores",
  "Final_Exam_Score"
)

# Descriptive statistics for the numeric columns (psych::describe).
desc <- psych::describe(data[num_vars])
desc

# Pearson correlation matrix; each coefficient uses every row where both
# variables of that pair are present.
cor_mat <- cor(
  x = data[num_vars],
  method = "pearson",
  use = "pairwise.complete.obs"
)
cor_mat
## Study_Hours_per_Week Attendance_Rate Past_Exam_Scores
## Study_Hours_per_Week 1.00000000 -0.066741036 0.112167475
## Attendance_Rate -0.06674104 1.000000000 0.005742008
## Past_Exam_Scores 0.11216747 0.005742008 1.000000000
## Final_Exam_Score 0.55467932 -0.022037577 0.691077196
## Final_Exam_Score
## Study_Hours_per_Week 0.55467932
## Attendance_Rate -0.02203758
## Past_Exam_Scores 0.69107720
## Final_Exam_Score 1.00000000
# Build a long table with one row per unordered pair of variables in
# `cor_mat`, sorted by absolute correlation (largest first).
#
# The pairs are taken from the strict upper triangle of the matrix, so
# `var1` is always the variable that comes first in the matrix and `var2`
# the one that comes later; the diagonal and mirrored duplicates are
# excluded.
#
# Why not `as.data.frame(as.table(cor_mat))` + `filter(var1 < var2)`:
# that route produces factor columns, and `<` is not defined for unordered
# factors (it returns NA with a warning). filter() treats NA as FALSE, so
# every row was dropped and all later `[[1]]` lookups failed with
# "subscript out of bounds".
pair_idx <- which(upper.tri(cor_mat), arr.ind = TRUE)
cor_df <- data.frame(
  var1 = rownames(cor_mat)[pair_idx[, "row"]],
  var2 = colnames(cor_mat)[pair_idx[, "col"]],
  r = cor_mat[pair_idx],
  stringsAsFactors = FALSE
) %>%
  dplyr::mutate(abs_r = abs(r)) %>%
  dplyr::arrange(dplyr::desc(abs_r))

# Pairs whose absolute correlation exceeds 0.55.
pares_fuertes <- dplyr::filter(cor_df, abs_r > 0.55)
pares_fuertes

# Stop with a readable message instead of an obscure subscript error when
# there is no pair to pick (fewer than two variables in `cor_mat`).
if (nrow(cor_df) == 0L) {
  stop(
    "No hay pares de variables en cor_mat; ",
    "se necesitan al menos dos variables numéricas.",
    call. = FALSE
  )
}

# Take the top row (largest |r|) and extract the column names as explicit
# character strings, plus the coefficient rounded to three decimals.
best_pair <- cor_df %>% dplyr::slice(1)
x_name <- as.character(best_pair$var1[[1]])
y_name <- as.character(best_pair$var2[[1]])
r_val <- round(best_pair$r[[1]], 3)
cat("Par con mayor |r|:", x_name, "vs", y_name, "-> r =", r_val, "\n")
# Histogram of Final_Exam_Score using bins 5 units wide, bars outlined in
# black.
ggplot(data = data, mapping = aes(x = Final_Exam_Score)) +
  geom_histogram(color = "black", binwidth = 5) +
  labs(
    x = "Final_Exam_Score",
    y = "Frecuencia",
    title = "Histograma: Final_Exam_Score"
  )
# Histogram of Study_Hours_per_Week using bins 2 units wide, bars outlined
# in black.
ggplot(data = data, mapping = aes(x = Study_Hours_per_Week)) +
  geom_histogram(color = "black", binwidth = 2) +
  labs(
    x = "Study_Hours_per_Week",
    y = "Frecuencia",
    title = "Histograma: Study_Hours_per_Week"
  )
# Scatter plot of the pair with the largest |r|, with a fitted straight
# line (no confidence band). The columns are looked up by name through
# `.data[[...]]` because the names are held in character variables.
# Prerequisite: `x_name`, `y_name` and `r_val` must already exist; without
# them this step fails with "object 'x_name' not found".
ggplot(
  data = data,
  mapping = aes(x = .data[[x_name]], y = .data[[y_name]])
) +
  geom_point(alpha = 0.7) +
  geom_smooth(se = FALSE, method = "lm") +
  labs(
    x = x_name,
    y = y_name,
    title = paste("Dispersión:", x_name, "vs", y_name, "(r =", r_val, ")")
  )
# Box plot of Final_Exam_Score for each Pass_Fail category. Boxes are
# filled by category; the fill legend is hidden because the x axis already
# shows the same information.
ggplot(
  data = data,
  mapping = aes(x = Pass_Fail, y = Final_Exam_Score, fill = Pass_Fail)
) +
  geom_boxplot(alpha = 0.85) +
  guides(fill = "none") +
  labs(
    x = "Pass_Fail",
    y = "Final_Exam_Score",
    title = "Boxplot: Final_Exam_Score por Pass/Fail"
  )
# Mean and standard deviation of every numeric variable in `num_vars`,
# computed separately for each Pass_Fail category. Output columns are named
# "<variable>_media" and "<variable>_sd".
dplyr::summarise(
  dplyr::group_by(data, Pass_Fail),
  dplyr::across(
    dplyr::all_of(num_vars),
    list(media = mean, sd = sd),
    .names = "{.col}_{.fn}"
  )
)
# Simple linear regression of `y_name` on `x_name`.
# Prerequisite: `x_name` and `y_name` must already hold column names as
# character strings; without them the formula cannot be built and every
# later step here fails in turn ("object 'y_name' not found", ...).

# Assemble the model formula "<y_name> ~ <x_name>" from the stored names.
f <- as.formula(paste(y_name, x_name, sep = " ~ "))

# Fit the model by ordinary least squares.
m <- lm(formula = f, data = data)

# Coefficient table of the fitted model.
broom::tidy(m)

# One-row summary of model-level statistics.
broom::glance(m)