# Load necessary libraries and read the data
library(ggplot2)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
df <- read.csv('./Downloads/students_dropout_and_academic_success.csv')
# Display summary of the dataset
summary(df)
## Marital_status Application_mode Application_order Course
## Min. :1.000 Min. : 1.00 Min. :0.000 Min. : 33
## 1st Qu.:1.000 1st Qu.: 1.00 1st Qu.:1.000 1st Qu.:9085
## Median :1.000 Median :17.00 Median :1.000 Median :9238
## Mean :1.179 Mean :18.67 Mean :1.728 Mean :8857
## 3rd Qu.:1.000 3rd Qu.:39.00 3rd Qu.:2.000 3rd Qu.:9556
## Max. :6.000 Max. :57.00 Max. :9.000 Max. :9991
## Daytime_evening_attendance. Previous_qualification
## Min. :0.0000 Min. : 1.000
## 1st Qu.:1.0000 1st Qu.: 1.000
## Median :1.0000 Median : 1.000
## Mean :0.8908 Mean : 4.578
## 3rd Qu.:1.0000 3rd Qu.: 1.000
## Max. :1.0000 Max. :43.000
## Previous_qualification_grade Nacionality Mothers_qualification
## Min. : 95.0 Min. : 1.000 Min. : 1.00
## 1st Qu.:125.0 1st Qu.: 1.000 1st Qu.: 2.00
## Median :133.1 Median : 1.000 Median :19.00
## Mean :132.6 Mean : 1.873 Mean :19.56
## 3rd Qu.:140.0 3rd Qu.: 1.000 3rd Qu.:37.00
## Max. :190.0 Max. :109.000 Max. :44.00
## Fathers_qualification Mothers_occupation Fathers_occupation Admission_grade
## Min. : 1.00 Min. : 0.00 Min. : 0.00 Min. : 95.0
## 1st Qu.: 3.00 1st Qu.: 4.00 1st Qu.: 4.00 1st Qu.:117.9
## Median :19.00 Median : 5.00 Median : 7.00 Median :126.1
## Mean :22.28 Mean : 10.96 Mean : 11.03 Mean :127.0
## 3rd Qu.:37.00 3rd Qu.: 9.00 3rd Qu.: 9.00 3rd Qu.:134.8
## Max. :44.00 Max. :194.00 Max. :195.00 Max. :190.0
## Displaced Educational_special_needs Debtor
## Min. :0.0000 Min. :0.00000 Min. :0.0000
## 1st Qu.:0.0000 1st Qu.:0.00000 1st Qu.:0.0000
## Median :1.0000 Median :0.00000 Median :0.0000
## Mean :0.5484 Mean :0.01153 Mean :0.1137
## 3rd Qu.:1.0000 3rd Qu.:0.00000 3rd Qu.:0.0000
## Max. :1.0000 Max. :1.00000 Max. :1.0000
## Tuition_fees_uptodate Gender Scholarship_holder Age_at_enrollment
## Min. :0.0000 Min. :0.0000 Min. :0.0000 Min. :17.00
## 1st Qu.:1.0000 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:19.00
## Median :1.0000 Median :0.0000 Median :0.0000 Median :20.00
## Mean :0.8807 Mean :0.3517 Mean :0.2484 Mean :23.27
## 3rd Qu.:1.0000 3rd Qu.:1.0000 3rd Qu.:0.0000 3rd Qu.:25.00
## Max. :1.0000 Max. :1.0000 Max. :1.0000 Max. :70.00
## International Curricular_units_1st_sem_credited
## Min. :0.00000 Min. : 0.00
## 1st Qu.:0.00000 1st Qu.: 0.00
## Median :0.00000 Median : 0.00
## Mean :0.02486 Mean : 0.71
## 3rd Qu.:0.00000 3rd Qu.: 0.00
## Max. :1.00000 Max. :20.00
## Curricular_units_1st_sem_enrolled Curricular_units_1st_sem_evaluations
## Min. : 0.000 Min. : 0.000
## 1st Qu.: 5.000 1st Qu.: 6.000
## Median : 6.000 Median : 8.000
## Mean : 6.271 Mean : 8.299
## 3rd Qu.: 7.000 3rd Qu.:10.000
## Max. :26.000 Max. :45.000
## Curricular_units_1st_sem_approved Curricular_units1st_sem_grade
## Min. : 0.000 Min. : 0.00
## 1st Qu.: 3.000 1st Qu.:11.00
## Median : 5.000 Median :12.29
## Mean : 4.707 Mean :10.64
## 3rd Qu.: 6.000 3rd Qu.:13.40
## Max. :26.000 Max. :18.88
## Curricular_units_1st_sem_without_evaluations Curricular_units_2nd_sem_credited
## Min. : 0.0000 Min. : 0.0000
## 1st Qu.: 0.0000 1st Qu.: 0.0000
## Median : 0.0000 Median : 0.0000
## Mean : 0.1377 Mean : 0.5418
## 3rd Qu.: 0.0000 3rd Qu.: 0.0000
## Max. :12.0000 Max. :19.0000
## Curricular_units_2nd_sem_enrolled Curricular_units_2nd_sem_evaluations
## Min. : 0.000 Min. : 0.000
## 1st Qu.: 5.000 1st Qu.: 6.000
## Median : 6.000 Median : 8.000
## Mean : 6.232 Mean : 8.063
## 3rd Qu.: 7.000 3rd Qu.:10.000
## Max. :23.000 Max. :33.000
## Curricular_units_2nd_sem_approved Curricular_units_2nd_sem_grade
## Min. : 0.000 Min. : 0.00
## 1st Qu.: 2.000 1st Qu.:10.75
## Median : 5.000 Median :12.20
## Mean : 4.436 Mean :10.23
## 3rd Qu.: 6.000 3rd Qu.:13.33
## Max. :20.000 Max. :18.57
## Curricular_units_2nd_sem_without_evaluations Unemployment_rate
## Min. : 0.0000 Min. : 7.60
## 1st Qu.: 0.0000 1st Qu.: 9.40
## Median : 0.0000 Median :11.10
## Mean : 0.1503 Mean :11.57
## 3rd Qu.: 0.0000 3rd Qu.:13.90
## Max. :12.0000 Max. :16.20
## Inflation_rate GDP Target
## Min. :-0.800 Min. :-4.060000 Length:4424
## 1st Qu.: 0.300 1st Qu.:-1.700000 Class :character
## Median : 1.400 Median : 0.320000 Mode :character
## Mean : 1.228 Mean : 0.001969
## 3rd Qu.: 2.600 3rd Qu.: 1.790000
## Max. : 3.700 Max. : 3.510000
set1 <- df%>%
mutate(Age_Group = cut(Age_at_enrollment, breaks = c(0, 18, 25, 40, Inf), labels = c("Young", "Young Adult", "Adult", "Senior"))) %>%
group_by(Age_Group) %>%
summarise(Avg_Admission_Grade = mean(Admission_grade, na.rm = TRUE))
# Visualization for Set 1
ggplot(set1, aes(x = Age_Group, y = Avg_Admission_Grade)) +
geom_bar(stat = "identity", fill = "skyblue") +
labs(title = "Average Admission Grade by Age Group",
x = "Age Group", y = "Average Admission Grade")
boxplot(Admission_grade ~ Fathers_qualification, data = df, main = "Admission Grade by Father's Qualification", ylab = "Admission Grade", xlab = "Father's Qualification")
# Calculate the average admission grade by school type
avg_admission_grade <- tapply(df$Admission_grade, df$Daytime_evening_attendance, mean)
# Create a bar graph with a custom y-axis scale
barplot(avg_admission_grade, main = "Average Admission Grade by School Type",
ylab = "Average Admission Grade", xlab = "School Type", col = c("violet", "blue"),
ylim = c(0, 140))
# Set 1: Relationship between Age and Admission Grade
cor_set1 <- cor(df$Age_at_enrollment, df$Admission_grade, use = "complete.obs")
# Set 2: Relationship between Gender, Displacement, and Admission Grade
cor_set2 <- cor(cbind( df$Admission_grade, df$Fathers_qualification), use = "complete.obs")
# Set 3: Relationship between Previous Qualification and Admission Grade
cor_set3 <- cor(df$Previous_qualification, df$Daytime_evening_attendance, use = "complete.obs")
# Display correlation coefficients
cat("Set 1 Correlation Coefficient:", cor_set1, "\n")
## Set 1 Correlation Coefficient: -0.02991536
cat("Set 2 Correlation Coefficient:", cor_set2, "\n")
## Set 2 Correlation Coefficient: 1 -0.0519716 -0.0519716 1
cat("Set 3 Correlation Coefficient:", cor_set3, "\n")
## Set 3 Correlation Coefficient: -0.0718706
Null Hypothesis (H0): There is no significant difference in the admission grades between students attending daytime and evening classes.
Alternative Hypothesis (H1): There is a significant difference in the admission grades between students attending daytime and evening classes.
# Subset the data
daytime_grades <- df$Admission_grade[df$Daytime_evening_attendance. == 1]
evening_grades <- df$Admission_grade[df$Daytime_evening_attendance. == 0]
# Perform Fisher's test
fisher_test_result <- var.test(daytime_grades, evening_grades)
print(fisher_test_result)
##
## F test to compare two variances
##
## data: daytime_grades and evening_grades
## F = 0.71705, num df = 3940, denom df = 482, p-value = 3.283e-07
## alternative hypothesis: true ratio of variances is not equal to 1
## 95 percent confidence interval:
## 0.6249765 0.8168656
## sample estimates:
## ratio of variances
## 0.7170487
sd_daytime <- sd(daytime_grades)
sd_evening<- sd(evening_grades)
# Calculate coefficient of variation
cv_daytime <- sd_daytime / mean(daytime_grades)
cv_evening <- sd_evening / mean(evening_grades)
# Combine the data into a data frame
cv_data <- data.frame(
Group = c("Daytime", "Evening"),
Coefficient_of_Variation = c(cv_daytime, cv_evening)
)
# Create a bar plot for coefficient of variation
ggplot(cv_data, aes(x = Group, y = Coefficient_of_Variation, fill = Group)) +
geom_bar(stat = "identity", position = "dodge") +
labs(title = "Comparison of Coefficient of Variation (Variance) by Attendance Type",
x = "Attendance Type",
y = "Coefficient of Variation (CV)") +
theme_minimal()
The p-value is very small (less than the typical significance level of 0.05), suggesting that you reject the null hypothesis. The alternative hypothesis, in this case, is that the true ratio of variances is not equal to 1. The 95 percent confidence interval for the ratio of variances does not include 1, further supporting the evidence against equal variances.
Null Hypothesis (H0): There is no correlation between the unemployment rate and students’ enrollment status.
Alternative Hypothesis (H1): There is a significant correlation between the economic factors and students’ enrollment status.
# Fit ANOVA model
model <- aov(Unemployment_rate ~ Target, data = df)
# Conduct the F-test
anova_result <- anova(model)
# Print the results
print(anova_result)
## Analysis of Variance Table
##
## Response: Unemployment_rate
## Df Sum Sq Mean Sq F value Pr(>F)
## Target 2 83.9 41.933 5.9225 0.0027 **
## Residuals 4421 31302.2 7.080
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
library(ggplot2)
# Calculate means and confidence intervals
means_ci <- tapply(df$Unemployment_rate, df$Target, function(x) mean(x, na.rm = TRUE))
# Combine the data into a data frame
plot_data <- data.frame(
Target = names(means_ci),
Mean = unname(means_ci)
)
# Create a bar plot with error bars
ggplot(plot_data, aes(x = Target, y = Mean, fill = Target)) +
geom_bar(stat = "identity", position = "dodge") +
geom_errorbar(aes(ymin = Mean - sd(df$Unemployment_rate)/sqrt(length(df$Unemployment_rate)),
ymax = Mean + sd(df$Unemployment_rate)/sqrt(length(df$Unemployment_rate))),
position = position_dodge(width = 0.8),
width = 0.25) +
labs(title = "Mean Unemployment Rate by Target",
x = "Target",
y = "Mean Unemployment Rate") +
theme_minimal()
The results of the Analysis of Variance (ANOVA) table indicate that there is a significant difference in the mean Unemployment_rate across different levels of the Target variable (enrollment status). The p-value associated with the F-statistic is 0.0027, which is less than the commonly used significance level of 0.05.
Therefore, based on the p-value, you would reject the null hypothesis and conclude that there is a statistically significant difference in the mean Unemployment_rate among different enrollment statuses.
Null Hypothesis (H0): There is no significant relationship between the age at enrollment and the admission grade of students.
Alternative Hypothesis (H1): There is a significant relationship between the age at enrollment and the admission grade of students. Specifically, students enrolling at a younger age tend to have higher admission grades compared to those enrolling at an older age.
model <- aov(Age_at_enrollment ~ Admission_grade, data = df)
# Conduct the F-test
anova_result <- anova(model)
# Print the results
print(anova_result)
## Analysis of Variance Table
##
## Response: Age_at_enrollment
## Df Sum Sq Mean Sq F value Pr(>F)
## Admission_grade 1 228 227.897 3.9609 0.04663 *
## Residuals 4422 254426 57.536
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
library(ggplot2)
# Create a grouped boxplot
ggplot(df, aes(x = Admission_grade, y = Age_at_enrollment)) +
geom_boxplot(fill = "lightblue") +
labs(title = "Grouped Boxplot of Age at Enrollment by Admission Grade",
x = "Admission Grade",
y = "Age at Enrollment") +
theme_minimal()
## Warning: Continuous x aesthetic
## ℹ did you forget `aes(group = ...)`?
The p-value (0.04663) is less than 0.05, indicating that there is a significant relationship between Age_at_enrollment and Admission_grade. In other words, the admission grade is associated with a significant difference in the age at enrollment.
# Fit linear regression model
lm_model <- lm(Admission_grade ~ Previous_qualification_grade + Age_at_enrollment + Gender, data = df)
summary(lm_model)
##
## Call:
## lm(formula = Admission_grade ~ Previous_qualification_grade +
## Age_at_enrollment + Gender, data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -50.480 -5.722 -0.376 5.928 64.823
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 40.04020 1.94174 20.621 <2e-16 ***
## Previous_qualification_grade 0.64279 0.01352 47.550 <2e-16 ***
## Age_at_enrollment 0.05831 0.02373 2.457 0.0141 *
## Gender 0.96408 0.37517 2.570 0.0102 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 11.78 on 4420 degrees of freedom
## Multiple R-squared: 0.3391, Adjusted R-squared: 0.3387
## F-statistic: 756 on 3 and 4420 DF, p-value: < 2.2e-16
# Diagnostic plots for the extended model
par(mfrow = c(2, 2))
plot(lm_model)
The overall model is really good at predicting Admission_grade, and the chances of this happening by random chance are extremely low (less than 0.001). Individual Predictor Significance:
Each of the three things we looked at (Previous_qualification_grade, Age_at_enrollment, and Gender) is important on its own in predicting Admission_grade.
When we look at Previous_qualification_grade, for every increase in your previous qualification grade, your Admission_grade tends to go up by about 0.64 points. Similarly, being older at the time of enrollment or being male is associated with a higher Admission_grade.
The model we built explains about 33.87% of the differences we see in Admission_grade. So, while we can’t explain everything, we’re capturing a good chunk of what’s going on.
Overall, this linear regression model suggests that Previous_qualification_grade, Age_at_enrollment, and Gender are important predictors of Admission_grade in your dataset.
library(nnet)
# Fit multinomial logistic regression model with 'Dropout' as the reference category
multinom_model1 <- multinom(Target ~ Debtor + Inflation_rate + Displaced, data = df, reference = "Dropout")
## # weights: 15 (8 variable)
## initial value 4860.260765
## iter 10 value 4386.677429
## final value 4361.181102
## converged
# Display the summary of the model
summary(multinom_model1)
## Call:
## multinom(formula = Target ~ Debtor + Inflation_rate + Displaced,
## data = df, reference = "Dropout")
##
## Coefficients:
## (Intercept) Debtor Inflation_rate Displaced
## Enrolled -0.5317555 -0.7681265 -0.04445352 0.2544974
## Graduate 0.4661843 -1.7305607 -0.05506359 0.4468940
##
## Std. Errors:
## (Intercept) Debtor Inflation_rate Displaced
## Enrolled 0.07952178 0.1295423 0.03239461 0.08969533
## Graduate 0.06236364 0.1210153 0.02542534 0.07072001
##
## Residual Deviance: 8722.362
## AIC: 8738.362
library(coefplot)
# Create a coefficient plot with explicit reference label
coefplot(
multinom_model1,
type = "stacked",
col = c("blue", "red"),
main = "Multinomial Logistic Regression Coefficients",
labels = c("Enrolled", "Graduate", "Reference: Dropout")
)
Debtor:
Enrolled: Being a debtor decreases the likelihood of dropout. This suggests that students who have some form of financial obligation (debt) are less likely to drop out during their enrollment. Graduate: Conversely, for graduates, being a debtor increases the likelihood of dropout. This could imply that financial constraints or debt become more challenging to manage post-graduation. Inflation Rate:
The inflation rate has a minimal impact on the likelihood of dropout for both enrolled and graduate students. This suggests that changes in the inflation rate don’t significantly influence dropout rates in the given context. Displaced:
Being displaced increases the likelihood of dropout for both enrolled and graduate students. This indicates that students who have experienced displacement (perhaps due to economic or social factors) are at a higher risk of dropping out, regardless of their enrollment status.
# Fit multinomial logistic regression model with 'Dropout' as the reference category
multinom_model <- multinom(Target ~ Unemployment_rate + Age_at_enrollment, data = df, reference = "Dropout")
## # weights: 12 (6 variable)
## initial value 4860.260765
## iter 10 value 4367.499665
## final value 4367.474262
## converged
# Display the summary of the model
summary(multinom_model)
## Call:
## multinom(formula = Target ~ Unemployment_rate + Age_at_enrollment,
## data = df, reference = "Dropout")
##
## Coefficients:
## (Intercept) Unemployment_rate Age_at_enrollment
## Enrolled 1.303313 -0.044851691 -0.05725080
## Graduate 2.073845 0.008668169 -0.07358384
##
## Std. Errors:
## (Intercept) Unemployment_rate Age_at_enrollment
## Enrolled 0.2453154 0.01704149 0.006290958
## Graduate 0.1910319 0.01315273 0.004861413
##
## Residual Deviance: 8734.949
## AIC: 8746.949
library(effects)
## Loading required package: carData
## lattice theme set by effectsTheme()
## See ?effectsTheme for details.
# Create an effects object
effect_plot <- allEffects(multinom_model)
# Plot the effects for a specific variable
plot(effect_plot, "Unemployment_rate")
plot(effect_plot,'Age_at_enrollment')
Likelihood of Enrolling in College (Enrolled): The intercept suggests that, compared to not going to college (Dropout), the starting likelihood of considering going to college is approximately 1.303 times higher for individuals in the “Enrolled” group. An increase in age at enrollment is associated with a decrease in the likelihood of considering going to college for individuals in the “Enrolled” group. Unemployment rate does not seem to have a statistically significant impact on the likelihood of considering going to college for individuals in the “Enrolled” group. Likelihood of Graduating from College (Graduate):
The intercept suggests that, compared to not going to college (Dropout), the starting likelihood of considering graduating from college is approximately 2.074 times higher for individuals in the “Graduate” group. An increase in age at enrollment is associated with a decrease in the likelihood of considering graduating from college for individuals in the “Graduate” group. Unemployment rate has a small but statistically significant positive impact on the likelihood of considering graduating from college for individuals in the “Graduate” group.
1.Academic Performance and Attendance:
Attendance type significantly influences admission grades, indicating potential differences in learning environments between daytime and evening classes.
Economic Factors and Enrollment Decisions: Economic factors, particularly the unemployment rate, play a crucial role in students’ enrollment decisions, emphasizing the need for targeted support during economic fluctuations.
Age at Enrollment and Academic Success: Younger students at the time of enrollment tend to achieve higher admission grades, emphasizing the importance of early engagement in academic pursuits.
Multinomial Logistic Regression Insights: 2.Debtor Status: Being a debtor influences dropout likelihood, with contrasting effects for enrolled and graduate students.
3.Inflation Rate: Inflation rate has minimal impact on dropout likelihood for both groups.
4.Displacement: Displacement significantly increases dropout risk, regardless of enrollment status.
1.Tailor support services based on class schedules to address
potential academic challenges.
2.Provide economic support and guidance during periods of higher
unemployment.
3.Implement targeted interventions for older students at
enrollment.
4.Offer financial literacy programs, considering debtor status
implications.
5.Establish support programs for students who have experienced
displacement.
6.Provide decision-making support for those considering college or
graduation.