Question 1

Performing group_by and calculating the overall means

library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

# Load the student dropout / academic-success dataset.
# NOTE(review): the path is relative to the current working directory — confirm
# it resolves wherever this document is knitted.
data <- read.csv('./Downloads/students_dropout_and_academic_success.csv')

# Mean Admission Grade per Marital Status group (missing grades are ignored)
grouped_marital_status <- data %>%
  group_by(Marital_status) %>%
  summarize(Mean_Admission_Grade = mean(Admission_grade, na.rm = TRUE))

# Mean Previous Qualification Grade per Application Mode group
grouped_application_mode <- data %>%
  group_by(Application_mode) %>%
  summarize(Mean_Previous_Qualification_Grade = mean(Previous_qualification_grade, na.rm = TRUE))

# Mean GDP per Nacionality group. na.rm = TRUE matches the two summaries above
# and the overall GDP mean computed later in this script, so a single missing
# GDP value cannot turn a whole group's mean into NA.
grouped_Nacionality <- data %>%
  group_by(Nacionality) %>%
  summarize(Mean_GDP = mean(GDP, na.rm = TRUE))

print(grouped_marital_status)

## # A tibble: 6 × 2
##   Marital_status Mean_Admission_Grade
##            <int>                <dbl>
## 1              1                 127.
## 2              2                 128.
## 3              3                 151.
## 4              4                 125.
## 5              5                 128.
## 6              6                 115.

print(grouped_application_mode)

## # A tibble: 18 × 2
##    Application_mode Mean_Previous_Qualification_Grade
##               <int>                             <dbl>
##  1                1                              134.
##  2                2                              130 
##  3                5                              133.
##  4                7                              133.
##  5               10                              148.
##  6               15                              139.
##  7               16                              136.
##  8               17                              131.
##  9               18                              131.
## 10               26                              133.
## 11               27                              130 
## 12               39                              131.
## 13               42                              130.
## 14               43                              131.
## 15               44                              140.
## 16               51                              131.
## 17               53                              138.
## 18               57                              133.

print(grouped_Nacionality)

## # A tibble: 21 × 2
##    Nacionality Mean_GDP
##          <int>    <dbl>
##  1           1  -0.0141
##  2           2  -0.69  
##  3           6   1.21  
##  4          11   0.97  
##  5          13   1.74  
##  6          14  -4.06  
##  7          17  -0.92  
##  8          21   1.17  
##  9          22  -0.492 
## 10          24  -1.85  
## # ℹ 11 more rows

These group-by operations and calculations offer valuable insights into the dataset, helping to identify patterns or relationships between variables. These insights can inform educational policies, support mechanisms, and resource allocation strategies.

Calculating probability and finding the anomalies

# Overall mean and standard deviation of Admission Grade across all students
overall_mean_grade <- mean(data$Admission_grade, na.rm = TRUE)
grade_sd <- sd(data$Admission_grade, na.rm = TRUE)

# For each marital-status group: z-score of the group mean against the overall
# mean, its lower-tail normal probability, and an anomaly tag.
# NOTE(review): the z-score divides by the student-level standard deviation,
# not by the standard error of the group mean — confirm this is intended.
# The tag marks whichever group has the smallest probability, so the lowest
# group is flagged regardless of how extreme it actually is.
grouped_marital_status <- grouped_marital_status %>%
  mutate(
    Z_Score = (Mean_Admission_Grade - overall_mean_grade) / grade_sd,
    Probability = pnorm(Z_Score),
    Marital_Status_Anomaly_Tag = ifelse(Probability == min(Probability), "Anomaly", "Normal")
  )

# Joining results back to the original data.
# Any tag column left over from a previous run is dropped first, so re-running
# this chunk does not create Marital_Status_Anomaly_Tag.x / .y duplicates.
# left_join keeps every student row even if a key were to go unmatched.
data <- data %>%
  select(-any_of("Marital_Status_Anomaly_Tag")) %>%
  left_join(
    grouped_marital_status %>% select(Marital_status, Marital_Status_Anomaly_Tag),
    by = "Marital_status"
  )
print(grouped_marital_status)

## # A tibble: 6 × 5
##   Marital_status Mean_Admission_Grade  Z_Score Probability
##            <int>                <dbl>    <dbl>       <dbl>
## 1              1                 127. -0.00449       0.498
## 2              2                 128.  0.0668        0.527
## 3              3                 151.  1.64          0.949
## 4              4                 125. -0.121         0.452
## 5              5                 128.  0.0730        0.529
## 6              6                 115. -0.852         0.197
## # ℹ 1 more variable: Marital_Status_Anomaly_Tag <chr>

# Overall mean and standard deviation of Previous Qualification Grade
overall_mean_previous_qualification_grade <- mean(data$Previous_qualification_grade, na.rm = TRUE)
sd_previous_qualification_grade <- sd(data$Previous_qualification_grade, na.rm = TRUE)

# For each application mode: z-score of the group mean against the overall
# mean, its lower-tail normal probability, and an anomaly tag for the group(s)
# with the smallest probability.
# NOTE(review): the divisor is the student-level standard deviation, not the
# standard error of the group mean — confirm this is intended.
grouped_application_mode <- grouped_application_mode %>%
  mutate(
    Z_score1 = (Mean_Previous_Qualification_Grade - overall_mean_previous_qualification_grade) /
      sd_previous_qualification_grade,
    Probability1 = pnorm(Z_score1),
    Application_Mode_Anomaly_Tag = ifelse(Probability1 == min(Probability1), 'Anomaly', 'Normal')
  )

# Attach the tag to every student row. A tag column left over from a previous
# run is dropped first so the chunk can be re-run without producing .x / .y
# duplicates; left_join keeps every student row.
data <- data %>%
  select(-any_of("Application_Mode_Anomaly_Tag")) %>%
  left_join(
    grouped_application_mode %>% select(Application_mode, Application_Mode_Anomaly_Tag),
    by = "Application_mode"
  )
print(grouped_application_mode)

## # A tibble: 18 × 5
##    Application_mode Mean_Previous_Qualification_Grade Z_score1 Probability1
##               <int>                             <dbl>    <dbl>        <dbl>
##  1                1                              134.   0.0872        0.535
##  2                2                              130   -0.198         0.421
##  3                5                              133.   0.0388        0.515
##  4                7                              133.   0.0170        0.507
##  5               10                              148.   1.19          0.883
##  6               15                              139.   0.467         0.680
##  7               16                              136.   0.253         0.600
##  8               17                              131.  -0.145         0.442
##  9               18                              131.  -0.131         0.448
## 10               26                              133.   0.0369        0.515
## 11               27                              130   -0.198         0.421
## 12               39                              131.  -0.138         0.445
## 13               42                              130.  -0.232         0.408
## 14               43                              131.  -0.159         0.437
## 15               44                              140.   0.575         0.717
## 16               51                              131.  -0.106         0.458
## 17               53                              138.   0.430         0.666
## 18               57                              133.   0.0369        0.515
## # ℹ 1 more variable: Application_Mode_Anomaly_Tag <chr>

# Overall mean and standard deviation of GDP across all student rows
overall_mean_GDP <- mean(data$GDP, na.rm = TRUE)
sd_GDP <- sd(data$GDP, na.rm = TRUE)

# For each nationality: z-score of the group's mean GDP against the overall
# mean, its lower-tail normal probability, and an anomaly tag for the group(s)
# with the smallest probability.
# NOTE(review): the divisor is the row-level standard deviation, not the
# standard error of the group mean — confirm this is intended.
grouped_Nacionality <- grouped_Nacionality %>%
  mutate(
    Z_score2 = (Mean_GDP - overall_mean_GDP) / sd_GDP,
    Probability2 = pnorm(Z_score2),
    Nacionality_Anomaly_Tag = ifelse(Probability2 == min(Probability2), 'Anomaly', 'Normal')
  )

# Joining results back to the original data.
# A tag column left over from a previous run is dropped first so re-running the
# chunk does not create Nacionality_Anomaly_Tag.x / .y duplicates.
data <- data %>%
  select(-any_of("Nacionality_Anomaly_Tag")) %>%
  left_join(
    grouped_Nacionality %>% select(Nacionality, Nacionality_Anomaly_Tag),
    by = 'Nacionality'
  )
print(grouped_Nacionality)

## # A tibble: 21 × 5
##    Nacionality Mean_GDP Z_score2 Probability2 Nacionality_Anomaly_Tag
##          <int>    <dbl>    <dbl>        <dbl> <chr>                  
##  1           1  -0.0141 -0.00709       0.497  Normal                 
##  2           2  -0.69   -0.305         0.380  Normal                 
##  3           6   1.21    0.534         0.703  Normal                 
##  4          11   0.97    0.426         0.665  Normal                 
##  5          13   1.74    0.766         0.778  Normal                 
##  6          14  -4.06   -1.79          0.0368 Anomaly                
##  7          17  -0.92   -0.406         0.342  Normal                 
##  8          21   1.17    0.515         0.697  Normal                 
##  9          22  -0.492  -0.218         0.414  Normal                 
## 10          24  -1.85   -0.818         0.207  Normal                 
## # ℹ 11 more rows

By obtaining these probabilities, we can see which groups deviate most from the overall means, which can assist in risk assessment and resource allocation. For example, groups with lower GDP means may face higher economic risks and require additional support.

It can also inform policy decisions and resource allocation strategies aimed at addressing economic disparities.

This approach can also detect anomalies that might have very little impact on the data.

Conclusions

Grouped by Marital Status:

Marital status groups 3 and 5 have the highest mean admission grade, suggesting that students in these groups tend to have higher admission grades on average.
Marital status group 6 has the lowest mean admission grade, indicating that students in this group typically have lower admission grades.
Marital status group 6 stands out as an anomaly because it has a significantly lower probability compared to other groups. This suggests that it’s an unusual or rare group in the context of admission grades.
Group 6 might represent a unique subset of students with distinct characteristics that warrant further investigation. Possible reasons for this anomaly could be explored to better understand why these students have lower admission grades.

Grouped by Application Mode:

Application mode 10 has the highest mean previous qualification grade (about 148), indicating that students who applied through this mode tend to have higher previous qualification grades.
Application mode 42 has the lowest mean previous qualification grade, suggesting that students who applied through this mode generally have lower previous qualification grades.
Application mode 42 is identified as an anomaly due to its significantly lower probability. This indicates that this mode is quite different from the others in terms of previous qualification grades.
Investigating further, mode 42 might be an unconventional or unique application mode that attracts students with distinct qualifications.

Grouped by Nationality:

Nationality 13 has the highest mean GDP, indicating that students from this nationality tend to come from regions with higher economic development.
Nationality 14 stands out as an anomaly because it has a very low probability compared to other nationalities, suggesting that it’s significantly different in terms of GDP.
Nationality 14 may represent a particular region or group of students from a less economically developed area. This anomaly could be explored further to understand the context behind the low GDP.

Hypothesis

The rarity of certain groups in student admissions and academic success data can be explained by variations in the admission grade, previous qualification grade, and GDP.

Testable Hypothesis

The groups with higher mean values for admission grade, previous qualification grade, and GDP are more common, while groups with lower mean values are rarer.

Testing Methods

  Statistical Comparison: Perform statistical tests to compare the mean values of admission grade, previous qualification grade, and GDP between the common and rare groups (Marital Status, Application Mode, and Nationality). Specifically, compare the mean values of these variables for groups labeled as "Anomaly" and "Normal."

Testable Predictions:

  If variations in admission grade influence rarity, then groups labeled as "Anomaly" should have significantly lower mean admission grades compared to "Normal" groups.

  If variations in previous qualification grade influence rarity, then groups labeled as "Anomaly" should have significantly lower mean previous qualification grades compared to "Normal" groups.

  If variations in GDP influence rarity, then groups labeled as "Anomaly" should have significantly lower mean GDP values compared to "Normal" groups.

Visualizing

library(ggplot2)
# Box plot of Admission Grade for each Marital Status, filled by anomaly tag
ggplot(
  data = data,
  mapping = aes(
    x = as.factor(Marital_status),
    y = Admission_grade,
    fill = Marital_Status_Anomaly_Tag
  )
) +
  geom_boxplot() +
  scale_fill_manual(values = c("Normal" = "blue", "Anomaly" = "red")) +
  labs(x = "Marital Status", y = "Admission Grade") +
  theme_minimal()

The boxes in the plot show the interquartile range (IQR), which measures the spread of data within each marital status group. A wider IQR indicates greater variability in admission grades within that group.

If significant differences or anomalies are observed, it may prompt further research and analysis to understand the underlying factors contributing to these variations. This can lead to valuable insights for educational research and policy development.

# Bar chart of student counts per Application Mode, filled by anomaly tag
ggplot(
  data = data,
  mapping = aes(
    x = as.factor(Application_mode),
    fill = Application_Mode_Anomaly_Tag
  )
) +
  geom_bar() +
  scale_fill_manual(values = c("Normal" = "green", "Anomaly" = "red")) +
  labs(x = "Application Mode", y = "Count") +
  # theme() must come after theme_minimal(), which would otherwise reset it
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

It provides a clear visualization of the counts within each category and helps identify any patterns or outliers related to the application mode.

Question 2 - Pick 2-3 categorical variables for which you know all possible combinations.

Which combinations are the most/least common, and why might that be?

# Cross-tabulate Marital_status against Gender
contingency_table <- table(data$Marital_status, data$Gender)

# Flatten the table to one row per combination with readable column names
combinations_df <- setNames(
  as.data.frame(contingency_table),
  c("Marital_status", "Gender", "Frequency")
)

# Order the combinations from most to least frequent
sorted_combinations <- arrange(combinations_df, desc(Frequency))

# The five most frequent combinations sit at the top of the sorted table
most_common <- head(sorted_combinations, n = 5)
print("Most Common Combinations:")

## [1] "Most Common Combinations:"

print(most_common)

##   Marital_status Gender Frequency
## 1              1      0      2552
## 2              1      1      1367
## 3              2      0       221
## 4              2      1       158
## 5              4      0        69

# Least common combinations: the last five rows of the frequency-sorted table
least_common <- tail(sorted_combinations, 5)
# print() displays escape sequences literally, so the header carries no "\n";
# this also matches the other header prints in this script.
print("Least Common Combinations:")

## [1] "Least Common Combinations:"

print(least_common)

##    Marital_status Gender Frequency
## 8               5      1         7
## 9               6      0         5
## 10              3      0         3
## 11              3      1         1
## 12              6      1         1

From the table obtained, we can say that younger individuals/students tend to stay single; thus their combinations have a higher frequency than the other combinations.

From the least_common table we can see that students with “legally separated” status are among the least common combinations. This might be because younger individuals are more likely to stay single, and it is very rare for them to already have a status of legal separation.

* This insight can be valuable for understanding the demographics of your data.

* Knowing the most common combinations allows you to focus your analysis on those groups, potentially uncovering patterns or trends specific to them. Conversely, studying the least common combinations might reveal insights about rare or underrepresented groups.

* Identifying the least common combinations can raise questions about data representation and potential bias. It’s important to consider whether the dataset adequately represents all combinations or if certain groups are underrepresented.

# Categorical columns whose value combinations are checked
categorical_columns <- c("Marital_status", "Fathers_qualification")

# Every possible pairing of the values observed in those columns.
# List-style indexing, data[categorical_columns], always yields a data frame,
# whereas data[, categorical_columns] collapses to a bare vector when only one
# column is listed, which would make lapply() iterate over individual values.
all_combinations <- expand.grid(lapply(data[categorical_columns], unique))

# Combinations that never occur in the data. The join keys are left implicit
# (all shared columns), so dplyr reports the columns it joined on.
missing_combinations <- anti_join(all_combinations, data[categorical_columns])

## Joining with `by = join_by(Marital_status, Fathers_qualification)`

print("Missing Combinations:")

## [1] "Missing Combinations:"

print(missing_combinations)

##     Marital_status Fathers_qualification
## 1                4                    12
## 2                3                    12
## 3                5                    12
## 4                6                    12
## 5                3                     3
## 6                6                     3
## 7                3                    38
## 8                6                    38
## 9                3                    19
## 10               6                    19
## 11               4                     5
## 12               3                     5
## 13               5                     5
## 14               6                     5
## 15               4                     4
## 16               3                     4
## 17               5                     4
## 18               6                     4
## 19               3                    34
## 20               5                    34
## 21               6                    34
## 22               4                     2
## 23               3                     2
## 24               6                     2
## 25               4                    39
## 26               3                    39
## 27               5                    39
## 28               6                    39
## 29               2                    11
## 30               4                    11
## 31               3                    11
## 32               5                    11
## 33               6                    11
## 34               2                     9
## 35               4                     9
## 36               3                     9
## 37               5                     9
## 38               6                     9
## 39               4                    36
## 40               3                    36
## 41               5                    36
## 42               6                    36
## 43               2                    26
## 44               4                    26
## 45               3                    26
## 46               5                    26
## 47               6                    26
## 48               2                    40
## 49               4                    40
## 50               3                    40
## 51               5                    40
## 52               6                    40
## 53               4                    14
## 54               3                    14
## 55               5                    14
## 56               6                    14
## 57               2                    20
## 58               4                    20
## 59               3                    20
## 60               5                    20
## 61               6                    20
## 62               4                    35
## 63               3                    35
## 64               5                    35
## 65               6                    35
## 66               4                    41
## 67               3                    41
## 68               5                    41
## 69               6                    41
## 70               2                    22
## 71               3                    22
## 72               5                    22
## 73               6                    22
## 74               2                    13
## 75               4                    13
## 76               3                    13
## 77               5                    13
## 78               6                    13
## 79               2                    29
## 80               4                    29
## 81               3                    29
## 82               5                    29
## 83               6                    29
## 84               2                    43
## 85               4                    43
## 86               3                    43
## 87               5                    43
## 88               6                    43
## 89               2                    18
## 90               4                    18
## 91               3                    18
## 92               5                    18
## 93               6                    18
## 94               2                    42
## 95               4                    42
## 96               3                    42
## 97               5                    42
## 98               6                    42
## 99               2                    10
## 100              4                    10
## 101              3                    10
## 102              5                    10
## 103              6                    10
## 104              2                     6
## 105              4                     6
## 106              3                     6
## 107              5                     6
## 108              6                     6
## 109              2                    30
## 110              4                    30
## 111              3                    30
## 112              5                    30
## 113              6                    30
## 114              1                    25
## 115              4                    25
## 116              3                    25
## 117              5                    25
## 118              6                    25
## 119              2                    44
## 120              4                    44
## 121              3                    44
## 122              5                    44
## 123              6                    44
## 124              2                    33
## 125              4                    33
## 126              3                    33
## 127              5                    33
## 128              6                    33
## 129              2                    27
## 130              4                    27
## 131              3                    27
## 132              5                    27
## 133              6                    27
## 134              1                    31
## 135              4                    31
## 136              3                    31
## 137              5                    31
## 138              6                    31

The output displays a list of combinations of “Marital_status” and “Fathers_qualification” that do not exist in my dataset. These combinations represent unique pairs of marital status and fathers’ qualifications that have not been observed among the data.

Identifying missing combinations is crucial for assessing the completeness of your dataset. It helps you recognize which specific combinations of categorical variables are absent, which could be important for understanding the data’s representativeness.

NikhilBharadwaj_DD3

2023-09-11