library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Load the student dropout / academic success dataset.
# NOTE(review): the path is relative to the working directory — confirm it
# resolves from wherever this script is run.
data <- read.csv("./Downloads/students_dropout_and_academic_success.csv")

# Mean Admission Grade for each Marital Status group (NA grades are ignored)
grouped_marital_status <- data %>%
  group_by(Marital_status) %>%
  summarize(Mean_Admission_Grade = mean(Admission_grade, na.rm = TRUE))

# Mean Previous Qualification Grade for each Application Mode group
# (NA grades are ignored)
grouped_application_mode <- data %>%
  group_by(Application_mode) %>%
  summarize(
    Mean_Previous_Qualification_Grade =
      mean(Previous_qualification_grade, na.rm = TRUE)
  )

# Mean GDP for each Nacionality group. na.rm = TRUE matches the other
# summaries: without it, one missing GDP value would turn the whole group
# mean into NA.
grouped_Nacionality <- data %>%
  group_by(Nacionality) %>%
  summarize(Mean_GDP = mean(GDP, na.rm = TRUE))

# Show the per-marital-status means
print(grouped_marital_status)
## # A tibble: 6 × 2
## Marital_status Mean_Admission_Grade
## <int> <dbl>
## 1 1 127.
## 2 2 128.
## 3 3 151.
## 4 4 125.
## 5 5 128.
## 6 6 115.
# Show the mean Previous Qualification Grade for each Application Mode group
print(grouped_application_mode)
## # A tibble: 18 × 2
## Application_mode Mean_Previous_Qualification_Grade
## <int> <dbl>
## 1 1 134.
## 2 2 130
## 3 5 133.
## 4 7 133.
## 5 10 148.
## 6 15 139.
## 7 16 136.
## 8 17 131.
## 9 18 131.
## 10 26 133.
## 11 27 130
## 12 39 131.
## 13 42 130.
## 14 43 131.
## 15 44 140.
## 16 51 131.
## 17 53 138.
## 18 57 133.
# Show the mean GDP for each Nacionality group
print(grouped_Nacionality)
## # A tibble: 21 × 2
## Nacionality Mean_GDP
## <int> <dbl>
## 1 1 -0.0141
## 2 2 -0.69
## 3 6 1.21
## 4 11 0.97
## 5 13 1.74
## 6 14 -4.06
## 7 17 -0.92
## 8 21 1.17
## 9 22 -0.492
## 10 24 -1.85
## # ℹ 11 more rows
These group-by operations and calculations offer valuable insights into the dataset, helping to identify patterns or relationships between variables. These insights can inform educational policies, support mechanisms, and resource allocation strategies.
# Overall mean and standard deviation of Admission Grade (NA values ignored)
overall_mean_grade <- mean(data$Admission_grade, na.rm = TRUE)
grade_sd <- sd(data$Admission_grade, na.rm = TRUE)

# For each marital-status group:
#   Z_Score     - distance of the group mean from the overall mean, in SDs
#   Probability - standard normal CDF evaluated at that z-score
#   Tag         - "Anomaly" for the group(s) with the smallest probability,
#                 "Normal" for every other group
grouped_marital_status <- grouped_marital_status %>%
  mutate(
    Z_Score = (Mean_Admission_Grade - overall_mean_grade) / grade_sd,
    Probability = pnorm(Z_Score),
    Marital_Status_Anomaly_Tag = ifelse(
      Probability == min(Probability), "Anomaly", "Normal"
    )
  )

# Attach each group's tag to the matching rows of the original data
data <- inner_join(
  data,
  select(grouped_marital_status, Marital_status, Marital_Status_Anomaly_Tag),
  by = "Marital_status"
)

print(grouped_marital_status)
## # A tibble: 6 × 5
## Marital_status Mean_Admission_Grade Z_Score Probability
## <int> <dbl> <dbl> <dbl>
## 1 1 127. -0.00449 0.498
## 2 2 128. 0.0668 0.527
## 3 3 151. 1.64 0.949
## 4 4 125. -0.121 0.452
## 5 5 128. 0.0730 0.529
## 6 6 115. -0.852 0.197
## # ℹ 1 more variable: Marital_Status_Anomaly_Tag <chr>
# Overall mean and standard deviation of Previous Qualification Grade
# (NA values ignored)
overall_mean_previous_qualification_grade <-
  mean(data$Previous_qualification_grade, na.rm = TRUE)
sd_previous_qualification_grade <-
  sd(data$Previous_qualification_grade, na.rm = TRUE)

# For each application-mode group:
#   Z_score1     - distance of the group mean from the overall mean, in SDs
#   Probability1 - standard normal CDF evaluated at that z-score
#   Tag          - "Anomaly" for the group(s) with the smallest probability,
#                  "Normal" for every other group
grouped_application_mode <- grouped_application_mode %>%
  mutate(
    Z_score1 = (Mean_Previous_Qualification_Grade -
      overall_mean_previous_qualification_grade) /
      sd_previous_qualification_grade,
    Probability1 = pnorm(Z_score1),
    Application_Mode_Anomaly_Tag = ifelse(
      Probability1 == min(Probability1), "Anomaly", "Normal"
    )
  )

# Joining results back to the original data
data <- data %>%
  inner_join(
    grouped_application_mode %>%
      select(Application_mode, Application_Mode_Anomaly_Tag),
    by = "Application_mode"
  )

print(grouped_application_mode)
## # A tibble: 18 × 5
## Application_mode Mean_Previous_Qualification_Grade Z_score1 Probability1
## <int> <dbl> <dbl> <dbl>
## 1 1 134. 0.0872 0.535
## 2 2 130 -0.198 0.421
## 3 5 133. 0.0388 0.515
## 4 7 133. 0.0170 0.507
## 5 10 148. 1.19 0.883
## 6 15 139. 0.467 0.680
## 7 16 136. 0.253 0.600
## 8 17 131. -0.145 0.442
## 9 18 131. -0.131 0.448
## 10 26 133. 0.0369 0.515
## 11 27 130 -0.198 0.421
## 12 39 131. -0.138 0.445
## 13 42 130. -0.232 0.408
## 14 43 131. -0.159 0.437
## 15 44 140. 0.575 0.717
## 16 51 131. -0.106 0.458
## 17 53 138. 0.430 0.666
## 18 57 133. 0.0369 0.515
## # ℹ 1 more variable: Application_Mode_Anomaly_Tag <chr>
# Overall mean and standard deviation of GDP (NA values ignored)
overall_mean_GDP <- mean(data$GDP, na.rm = TRUE)
sd_GDP <- sd(data$GDP, na.rm = TRUE)

# For each Nacionality group:
#   Z_score2     - distance of the group's mean GDP from the overall mean, in SDs
#   Probability2 - standard normal CDF evaluated at that z-score
#   Tag          - "Anomaly" for the group(s) with the smallest probability,
#                  "Normal" for every other group
grouped_Nacionality <- grouped_Nacionality %>%
  mutate(
    Z_score2 = (Mean_GDP - overall_mean_GDP) / sd_GDP,
    Probability2 = pnorm(Z_score2),
    Nacionality_Anomaly_Tag = ifelse(
      Probability2 == min(Probability2), "Anomaly", "Normal"
    )
  )

# Attach each group's tag to the matching rows of the original data
data <- left_join(
  data,
  select(grouped_Nacionality, Nacionality, Nacionality_Anomaly_Tag),
  by = "Nacionality"
)

print(grouped_Nacionality)
## # A tibble: 21 × 5
## Nacionality Mean_GDP Z_score2 Probability2 Nacionality_Anomaly_Tag
## <int> <dbl> <dbl> <dbl> <chr>
## 1 1 -0.0141 -0.00709 0.497 Normal
## 2 2 -0.69 -0.305 0.380 Normal
## 3 6 1.21 0.534 0.703 Normal
## 4 11 0.97 0.426 0.665 Normal
## 5 13 1.74 0.766 0.778 Normal
## 6 14 -4.06 -1.79 0.0368 Anomaly
## 7 17 -0.92 -0.406 0.342 Normal
## 8 21 1.17 0.515 0.697 Normal
## 9 22 -0.492 -0.218 0.414 Normal
## 10 24 -1.85 -0.818 0.207 Normal
## # ℹ 11 more rows
By obtaining these probabilities, we can understand which groups deviate significantly from the overall means, which can assist in risk assessment and resource allocation. For example, groups with lower GDP means may face higher economic risks and require additional support.
It can also inform policy decisions and resource allocation strategies aimed at addressing economic disparities.
Anomalies can also be detected, though they might have very little impact on the data.
The rarity of certain groups in student admissions and academic success data can be explained by variations in the admission grade, previous qualification grade, and GDP.
The groups with higher mean values for admission grade, previous qualification grade, and GDP are more common, while groups with lower mean values are rarer.
Statistical Comparison: Perform statistical tests to compare the mean values of admission grade, previous qualification grade, and GDP between the common and rare groups (Marital Status, Application Mode, and Nationality). Specifically, compare the mean values of these variables for groups labeled as "Anomaly" and "Normal."
Testable Predictions: If variations in admission grade influence rarity, then groups labeled as "Anomaly" should have significantly lower mean admission grades compared to "Normal" groups. If variations in previous qualification grade influence rarity, then groups labeled as "Anomaly" should have significantly lower mean previous qualification grades compared to "Normal" groups. If variations in GDP influence rarity, then groups labeled as "Anomaly" should have significantly lower mean GDP values compared to "Normal" groups.
library(ggplot2)
# Box plot of Admission Grade for each Marital Status; boxes are filled by
# the group's anomaly tag (red = "Anomaly", blue = "Normal")
ggplot(
  data,
  aes(
    x = as.factor(Marital_status),
    y = Admission_grade,
    fill = Marital_Status_Anomaly_Tag
  )
) +
  geom_boxplot() +
  scale_fill_manual(values = c("Normal" = "blue", "Anomaly" = "red")) +
  xlab("Marital Status") +
  ylab("Admission Grade") +
  theme_minimal()
The boxes in the plot show the interquartile range (IQR), which measures the spread of data within each marital status group. A wider IQR indicates greater variability in admission grades within that group.
If significant differences or anomalies are observed, it may prompt further research and analysis to understand the underlying factors contributing to these variations. This can lead to valuable insights for educational research and policy development.
# Bar chart of the number of rows per Application Mode; bars are filled by
# the group's anomaly tag (red = "Anomaly", green = "Normal")
ggplot(
  data,
  aes(
    x = as.factor(Application_mode),
    fill = Application_Mode_Anomaly_Tag
  )
) +
  geom_bar() +
  scale_fill_manual(values = c("Normal" = "green", "Anomaly" = "red")) +
  xlab("Application Mode") +
  ylab("Count") +
  theme_minimal() +
  # Applied after theme_minimal() so the rotated x-axis labels are kept
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
The bar chart provides a clear visualization of the counts within each application mode and helps identify any patterns or outliers related to the application mode.
# Cross-tabulate two categorical variables: Marital_status against Gender
contingency_table <- table(data$Marital_status, data$Gender)

# Long format: one row per (Marital_status, Gender) pair with its count
combinations_df <- as.data.frame(contingency_table)
names(combinations_df) <- c("Marital_status", "Gender", "Frequency")

# Order the pairs from most to least frequent
sorted_combinations <- arrange(combinations_df, desc(Frequency))

# The five most frequent pairs
most_common <- head(sorted_combinations, n = 5)
print("Most Common Combinations:")
## [1] "Most Common Combinations:"
print(most_common)
## Marital_status Gender Frequency
## 1 1 0 2552
## 2 1 1 1367
## 3 2 0 221
## 4 2 1 158
## 5 4 0 69
# Least common combinations: the last five rows of the table sorted by
# descending frequency
least_common <- tail(sorted_combinations, 5)
# print() displays escape sequences literally, so a leading "\n" would show up
# as the two characters \n in the header instead of a blank line
print("Least Common Combinations:")
## [1] "Least Common Combinations:"
print(least_common)
## Marital_status Gender Frequency
## 8 5 1 7
## 9 6 0 5
## 10 3 0 3
## 11 3 1 1
## 12 6 1 1
From the table obtained, we can say that younger individuals/students prefer to stay single; thus, their combinations have a higher frequency than the other combinations.
From the least_common table, we can see that students with the “legally separated” status appear in the least frequent combinations; this might be because younger individuals are more likely to stay single, and it is very rare for them to already be legally separated.
* This insight can be valuable for understanding the demographics of your data.
* Knowing the most common combinations allows you to focus your analysis on those groups, potentially uncovering patterns or trends specific to them. Conversely, studying the least common combinations might reveal insights about rare or underrepresented groups.
* Identifying the least common combinations can raise questions about data representation and potential bias. It’s important to consider whether the dataset adequately represents all combinations or if certain groups are underrepresented.
# Categorical columns whose value combinations are checked
categorical_columns <- c("Marital_status", "Fathers_qualification")

# Every possible combination of the values observed in those columns.
# `data[categorical_columns]` always yields a data frame, even when only one
# column is listed (`data[, cols]` would collapse to a plain vector).
all_combinations <- expand.grid(lapply(data[categorical_columns], unique))

# Combinations that never occur in the data. The join keys are given
# explicitly so dplyr does not have to guess them and emit a
# "Joining with `by = ...`" message.
missing_combinations <- anti_join(
  all_combinations,
  data[categorical_columns],
  by = categorical_columns
)

print("Missing Combinations:")
## [1] "Missing Combinations:"
print(missing_combinations)
## Marital_status Fathers_qualification
## 1 4 12
## 2 3 12
## 3 5 12
## 4 6 12
## 5 3 3
## 6 6 3
## 7 3 38
## 8 6 38
## 9 3 19
## 10 6 19
## 11 4 5
## 12 3 5
## 13 5 5
## 14 6 5
## 15 4 4
## 16 3 4
## 17 5 4
## 18 6 4
## 19 3 34
## 20 5 34
## 21 6 34
## 22 4 2
## 23 3 2
## 24 6 2
## 25 4 39
## 26 3 39
## 27 5 39
## 28 6 39
## 29 2 11
## 30 4 11
## 31 3 11
## 32 5 11
## 33 6 11
## 34 2 9
## 35 4 9
## 36 3 9
## 37 5 9
## 38 6 9
## 39 4 36
## 40 3 36
## 41 5 36
## 42 6 36
## 43 2 26
## 44 4 26
## 45 3 26
## 46 5 26
## 47 6 26
## 48 2 40
## 49 4 40
## 50 3 40
## 51 5 40
## 52 6 40
## 53 4 14
## 54 3 14
## 55 5 14
## 56 6 14
## 57 2 20
## 58 4 20
## 59 3 20
## 60 5 20
## 61 6 20
## 62 4 35
## 63 3 35
## 64 5 35
## 65 6 35
## 66 4 41
## 67 3 41
## 68 5 41
## 69 6 41
## 70 2 22
## 71 3 22
## 72 5 22
## 73 6 22
## 74 2 13
## 75 4 13
## 76 3 13
## 77 5 13
## 78 6 13
## 79 2 29
## 80 4 29
## 81 3 29
## 82 5 29
## 83 6 29
## 84 2 43
## 85 4 43
## 86 3 43
## 87 5 43
## 88 6 43
## 89 2 18
## 90 4 18
## 91 3 18
## 92 5 18
## 93 6 18
## 94 2 42
## 95 4 42
## 96 3 42
## 97 5 42
## 98 6 42
## 99 2 10
## 100 4 10
## 101 3 10
## 102 5 10
## 103 6 10
## 104 2 6
## 105 4 6
## 106 3 6
## 107 5 6
## 108 6 6
## 109 2 30
## 110 4 30
## 111 3 30
## 112 5 30
## 113 6 30
## 114 1 25
## 115 4 25
## 116 3 25
## 117 5 25
## 118 6 25
## 119 2 44
## 120 4 44
## 121 3 44
## 122 5 44
## 123 6 44
## 124 2 33
## 125 4 33
## 126 3 33
## 127 5 33
## 128 6 33
## 129 2 27
## 130 4 27
## 131 3 27
## 132 5 27
## 133 6 27
## 134 1 31
## 135 4 31
## 136 3 31
## 137 5 31
## 138 6 31
The output displays a list of combinations of “Marital_status” and “Fathers_qualification” that do not exist in the dataset. These combinations represent unique pairs of marital status and fathers’ qualifications that have not been observed in the data.
Identifying missing combinations is crucial for assessing the completeness of your dataset. It helps you recognize which specific combinations of categorical variables are absent, which could be important for understanding the data’s representativeness.