library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.2 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.4.3 ✔ tibble 3.2.1
## ✔ lubridate 1.9.2 ✔ tidyr 1.3.0
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
Calculating summary statistics to get valuable insights from the data
# Load the student dropout / academic-success data set and print summary
# statistics for its first 15 columns.
data <- read.csv('./Downloads/students_dropout_and_academic_success.csv')
first_15_columns <- data[, seq_len(15)]
print(summary(first_15_columns))
## Marital_status Application_mode Application_order Course
## Min. :1.000 Min. : 1.00 Min. :0.000 Min. : 33
## 1st Qu.:1.000 1st Qu.: 1.00 1st Qu.:1.000 1st Qu.:9085
## Median :1.000 Median :17.00 Median :1.000 Median :9238
## Mean :1.179 Mean :18.67 Mean :1.728 Mean :8857
## 3rd Qu.:1.000 3rd Qu.:39.00 3rd Qu.:2.000 3rd Qu.:9556
## Max. :6.000 Max. :57.00 Max. :9.000 Max. :9991
## Daytime.evening.attendance. Previous_qualification
## Min. :0.0000 Min. : 1.000
## 1st Qu.:1.0000 1st Qu.: 1.000
## Median :1.0000 Median : 1.000
## Mean :0.8908 Mean : 4.578
## 3rd Qu.:1.0000 3rd Qu.: 1.000
## Max. :1.0000 Max. :43.000
## Previous_qualification..grade. Nacionality Mother.s_qualification
## Min. : 95.0 Min. : 1.000 Min. : 1.00
## 1st Qu.:125.0 1st Qu.: 1.000 1st Qu.: 2.00
## Median :133.1 Median : 1.000 Median :19.00
## Mean :132.6 Mean : 1.873 Mean :19.56
## 3rd Qu.:140.0 3rd Qu.: 1.000 3rd Qu.:37.00
## Max. :190.0 Max. :109.000 Max. :44.00
## Father.s_qualification Mother.s_occupation Father.s_occupation Admission_grade
## Min. : 1.00 Min. : 0.00 Min. : 0.00 Min. : 95.0
## 1st Qu.: 3.00 1st Qu.: 4.00 1st Qu.: 4.00 1st Qu.:117.9
## Median :19.00 Median : 5.00 Median : 7.00 Median :126.1
## Mean :22.28 Mean : 10.96 Mean : 11.03 Mean :127.0
## 3rd Qu.:37.00 3rd Qu.: 9.00 3rd Qu.: 9.00 3rd Qu.:134.8
## Max. :44.00 Max. :194.00 Max. :195.00 Max. :190.0
## Displaced Educational_special_needs
## Min. :0.0000 Min. :0.00000
## 1st Qu.:0.0000 1st Qu.:0.00000
## Median :1.0000 Median :0.00000
## Mean :0.5484 Mean :0.01153
## 3rd Qu.:1.0000 3rd Qu.:0.00000
## Max. :1.0000 Max. :1.00000
1. Are there any trends in the choice of courses (Course) based on different application modes (Application_mode)? Do certain application modes attract more applicants to specific courses?
2. Is there a relationship between the level of previous qualification (Previous_qualification) and the admission grades (Admission_grade)? Do individuals with higher previous qualifications tend to have higher admission grades?
3. Do the qualifications and occupations of an applicant’s parents (e.g., “Mother.s_qualification,” “Father.s_qualification,” “Mother.s_occupation,” “Father.s_occupation”) have any influence on their admission or educational outcomes? Are there trends in parental backgrounds and student success?
4.Are there specific data transformation or preprocessing steps recommended in the documentation to make the data suitable for analysis? How do these transformations affect the variables you plan to use?
5. How does the presence of educational special needs (e.g., “Educational_special_needs”) affect admission outcomes? Are applicants with special needs admitted at different rates or with different admission grades compared to those without special needs?
‘Early Intervention’: The primary goal could be to identify students who are at risk of dropping out early in their academic journey. By doing so, educational institutions can implement methods to improve graduation rates or enhance student outcomes.
This data set consists of 37 attributes, all of which are numerical except the Target attribute, which is categorical. The data set includes information known at the time of student enrollment – academic path, demographics, and socio-economic factors.
The problem is formulated as a three category classification task (dropout, enrolled, and graduate) at the end of the normal duration of the course.
1.Calculate the average admission grade for each course category
# Mean admission grade for every course code; missing grades are ignored.
library(dplyr)
course_avg_grade <- summarise(
  group_by(data, Course),
  Avg_Admission_Grade = mean(Admission_grade, na.rm = TRUE)
)
print(course_avg_grade)
## # A tibble: 17 × 2
## Course Avg_Admission_Grade
## <int> <dbl>
## 1 33 119.
## 2 171 136.
## 3 8014 122.
## 4 9003 133.
## 5 9070 129.
## 6 9085 131.
## 7 9119 126.
## 8 9130 132.
## 9 9147 123.
## 10 9238 126.
## 11 9254 122.
## 12 9500 127.
## 13 9556 123.
## 14 9670 121.
## 15 9773 128.
## 16 9853 123.
## 17 9991 130.
2. Create a contingency table to examine the relationship between “Marital_status” and “Educational_special_needs.”
# Cross-tabulate the marital-status codes (rows) against the
# educational-special-needs flag (columns).
cross_table <- table(
  data[["Marital_status"]],
  data[["Educational_special_needs"]]
)
print(cross_table)
##
## 0 1
## 1 3869 50
## 2 378 1
## 3 4 0
## 4 91 0
## 5 25 0
## 6 6 0
3.Calculate the 90th percentile of admission grades to identify the top-performing applicants.
# 90th percentile of admission grades, used as the cut-off for the
# top-performing applicants.
# The column is named `Admission_grade`. The former `data$Admission.grade`
# matched no column, so `$` returned NULL and quantile() silently produced NA.
# Fail loudly if the column is ever missing instead of reporting NA again.
stopifnot("Admission_grade" %in% names(data))
percentile_90 <- quantile(data$Admission_grade, probs = 0.9, na.rm = TRUE)
print(percentile_90)
## 90%
## NA
Load the necessary libraries, then create a smoothed scatter plot for Age vs. Admission Grade.
library(ggplot2)
library(gridExtra)
##
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
##
## combine
# Age at enrollment vs. admission grade: a linear-model trend line is added
# first, then the individual observations as translucent grey points.
smoothed_age_grade <- ggplot(
  data = data,
  mapping = aes(x = Age_at_enrollment, y = Admission_grade)
) +
  geom_smooth(method = "lm", colour = "blue") +
  geom_point(colour = "gray", alpha = 0.5) +
  labs(
    title = "Smoothed Scatter Plot: Age vs. Admission Grade",
    x = "Age_at_enrollment",
    y = "Admission Grade"
  )
Create histogram for age frequency
# Histogram of age at enrollment (ggplot2's default binning is used).
hist_age <- ggplot(data = data, mapping = aes(x = Age_at_enrollment)) +
  geom_histogram(fill = "green", alpha = 0.7) +
  ggtitle("Distribution of Age") +
  xlab("Age") +
  ylab("Frequency")
Create a scatter plot for Admission Grades vs. Previous Qualification
# Scatter plot of admission grade (x) against the previous-qualification
# code (y), drawn with slightly translucent points.
scatterplot <- data %>%
  ggplot(mapping = aes(x = Admission_grade, y = Previous_qualification)) +
  geom_point(alpha = 0.7) +
  labs(
    title = "Scatter Plot: Admission Grades vs. Previous Qualification",
    x = "Admission Grade",
    y = "Previous Qualification"
  )
Create a box plot to compare ages by Marital Status
# Box plot of age at enrollment for each marital-status category.
# Marital_status is stored as integer codes (1-6). Left numeric, ggplot2 treats
# x as continuous, draws a single box, drops the fill aesthetic, and the named
# colours passed to scale_fill_manual() never match a data value. Converting the
# codes to a labelled factor gives one box per category with a matching colour.
# NOTE(review): the code -> label mapping follows the UCI data dictionary for
# this data set; confirm it against the documentation shipped with the CSV.
marital_colours <- c(
  "Single" = "blue",
  "Married" = "red",
  "Widower" = "purple",
  "Divorced" = "orange",
  "Facto union" = "darkgreen",
  "Legally separated" = "brown"
)
boxplot_age <- data %>%
  mutate(
    Marital_status = factor(
      Marital_status,
      levels = 1:6,
      labels = names(marital_colours)
    )
  ) %>%
  ggplot(
    aes(x = Marital_status, y = Age_at_enrollment, fill = Marital_status)
  ) +
  geom_boxplot() +
  labs(
    title = "Age Distribution by Marital Status",
    x = "Marital Status",
    y = "Age",
    fill = "Marital Status"
  ) +
  scale_fill_manual(values = marital_colours)
Arrange the plots in a grid
# Draw the four plots together on one page, two per row.
grid.arrange(
  grobs = list(smoothed_age_grade, hist_age, scatterplot, boxplot_age),
  ncol = 2
)
## `geom_smooth()` using formula = 'y ~ x'
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Continuous x aesthetic
## ℹ did you forget `aes(group = ...)`?
## Warning: The following aesthetics were dropped during statistical transformation: fill
## ℹ This can happen when ggplot fails to infer the correct grouping structure in
## the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
## variable into a factor?
Create a bar plot to compare the counts of individuals with and without
educational special needs (Educational_special_needs) by their gender
(Gender).
# Stacked bar chart: number of students per Gender value, with each bar split
# by the educational-special-needs flag (converted to a factor for the fill).
barplot_special_needs <- ggplot(
  data = data,
  mapping = aes(x = Gender, fill = factor(Educational_special_needs))
) +
  geom_bar() +
  labs(
    title = "Educational Special Needs by Gender",
    x = "Gender",
    y = "Count"
  )
print(barplot_special_needs)