# Required packages
library(tidyverse)
## Warning: package 'ggplot2' was built under R version 4.3.1
## Warning: package 'dplyr' was built under R version 4.3.1
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.4     ✔ tibble    3.2.1
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(GGally)
## Registered S3 method overwritten by 'GGally':
##   method from   
##   +.gg   ggplot2
library(RColorBrewer)
library(lubridate)

# Load Datasets
# Every input CSV is read from the same directory, so the location is kept in
# one variable and only has to be changed in a single place.
data_dir <- "~/Downloads/archive (1)"

course <- read.csv(file.path(data_dir, "courses.csv"))
assessment <- read.csv(file.path(data_dir, "assessments.csv"))
vle <- read.csv(file.path(data_dir, "vle.csv"))
info_stu <- read.csv(file.path(data_dir, "studentInfo.csv"))
reg_stu <- read.csv(file.path(data_dir, "studentRegistration.csv"))
as_stu <- read.csv(file.path(data_dir, "studentAssessment.csv"))
vle_stu <- read.csv(file.path(data_dir, "studentVle.csv"))

# Path of the student information file used for the analysis below
file_path <- file.path(data_dir, "studentInfo.csv")

# Read the CSV file into a data frame, decoding it as ISO-8859-1
studentInfo <- read.csv(file_path, sep = ",", fileEncoding = "ISO-8859-1")

# Print the first few rows of the data frame
head(studentInfo)
##   code_module code_presentation id_student gender               region
## 1         AAA             2013J      11391      M  East Anglian Region
## 2         AAA             2013J      28400      F             Scotland
## 3         AAA             2013J      30268      F North Western Region
## 4         AAA             2013J      31604      F    South East Region
## 5         AAA             2013J      32885      F West Midlands Region
## 6         AAA             2013J      38053      M                Wales
##       highest_education imd_band age_band num_of_prev_attempts studied_credits
## 1      HE Qualification  90-100%     55<=                    0             240
## 2      HE Qualification   20-30%    35-55                    0              60
## 3 A Level or Equivalent   30-40%    35-55                    0              60
## 4 A Level or Equivalent   50-60%    35-55                    0              60
## 5    Lower Than A Level   50-60%     0-35                    0              60
## 6 A Level or Equivalent   80-90%    35-55                    0              60
##   disability final_result
## 1          N         Pass
## 2          N         Pass
## 3          Y    Withdrawn
## 4          N         Pass
## 5          N         Pass
## 6          N         Pass
# Collect the distinct student ids and report how many there are
unique_students <- unique(studentInfo[["id_student"]])
num_unique_students <- length(unique_students)
print(num_unique_students)
## [1] 28785
# Derive a binary outcome: 'Withdrawn' and 'Fail' become 'Failed', every other
# value of 'final_result' becomes 'Passed'
studentInfo <- studentInfo %>%
  mutate(
    final_result_2 = ifelse(
      final_result %in% c("Withdrawn", "Fail"),
      "Failed",
      "Passed"
    )
  )

# Print the modified data frame
#print(studentInfo)
# List the distinct module codes present in the data
unique_code_modules <- unique(studentInfo[["code_module"]])
print(unique_code_modules)
## [1] "AAA" "BBB" "CCC" "DDD" "EEE" "FFF" "GGG"
# Number of rows per module, plus each module's share of all rows (in percent)
module_summary <- studentInfo %>%
  group_by(code_module) %>%
  summarise(Count = n()) %>%
  mutate(Percentage = round(Count / sum(Count) * 100, 2))

# Bar chart: one bar per module, bar height = row count, and the percentage
# written in the middle of each bar
ggplot(module_summary, aes(x = code_module, y = Count, fill = code_module)) +
  geom_col() +
  geom_text(aes(label = Percentage), position = position_stack(vjust = 0.5)) +
  labs(
    title = "Distribution - Enrolled Module",
    x = "Enrolled Module",
    y = "Quantity [Students]",
    fill = "Enrolled Module"
  ) +
  theme_minimal()

# Count rows per module/result pair. `.groups = "drop_last"` keeps the table
# grouped by 'code_module' only, so each percentage below is the result's share
# *within its module* (the shares of one module add up to 100).
result_summary <- studentInfo %>%
  group_by(code_module, final_result) %>%
  summarise(Count = n(), .groups = "drop_last") %>%
  mutate(Percentage = round(100 * (Count / sum(Count)), 2)) %>%
  # Hand back an ungrouped table so later verbs do not silently run per module
  ungroup()

# Stacked bar chart with 'final_result' on the x-axis, one segment per module
# and the percentage written inside each segment.
# NOTE(review): the shares are per module, so the stack for one result adds up
# several within-module percentages and can exceed 100 - confirm this is the
# intended reading of the chart.
ggplot(result_summary, aes(x = final_result, y = Percentage, fill = code_module)) +
  geom_col(position = "stack") +
  geom_text(
    aes(label = paste0(Percentage, "%")),
    position = position_stack(vjust = 0.5)
  ) +
  theme_minimal() +
  labs(
    title = "Results by Enrolled Module",
    x = "Result",
    y = "Percentage [%]",
    fill = "Enrolled Module"
  )

# Count rows per module / binary-result pair. `.groups = "drop"` returns an
# ungrouped table directly, so every percentage is a share of all rows.
result_summary <- studentInfo %>%
  group_by(code_module, final_result_2) %>%
  summarise(Count = n(), .groups = "drop") %>%
  mutate(Percentage = round(100 * (Count / sum(Count)), 2))

# Stacked bar chart with 'final_result_2' on the x-axis, one segment per module
# and the percentage written inside each segment
ggplot(result_summary, aes(x = final_result_2, y = Percentage, fill = code_module)) +
  geom_col(position = "stack") +
  geom_text(
    aes(label = paste0(Percentage, "%")),
    position = position_stack(vjust = 0.5)
  ) +
  theme_minimal() +
  labs(
    title = "Results by Enrolled Module",
    x = "Result",
    y = "Percentage [%]",
    fill = "Enrolled Module"
  )

# List the distinct presentation periods present in the data
unique_presentations <- unique(studentInfo[["code_presentation"]])
print(unique_presentations)
## [1] "2013J" "2014J" "2013B" "2014B"
# Number of rows per presentation period, plus each period's share of all rows.
# summarise() over a single grouping variable already returns an ungrouped
# table, so no explicit ungroup() is needed before computing the shares.
period_summary <- studentInfo %>%
  group_by(code_presentation) %>%
  summarise(Count = n()) %>%
  mutate(Percentage = round(Count / sum(Count) * 100, 2))

# Bar chart: one bar per presentation period, bar height = row count, and the
# percentage written in the middle of each bar
ggplot(
  period_summary,
  aes(x = code_presentation, y = Count, fill = code_presentation)
) +
  geom_col() +
  geom_text(aes(label = Percentage), position = position_stack(vjust = 0.5)) +
  labs(
    title = "Distribution - Enrolled Period",
    x = "Enrolled Period",
    y = "Quantity [Students]",
    fill = "Enrolled Period"
  ) +
  theme_minimal()

# Count rows per presentation/result pair. `.groups = "drop"` returns an
# ungrouped table directly, so every percentage is a share of all rows.
result_summary <- studentInfo %>%
  group_by(code_presentation, final_result) %>%
  summarise(Count = n(), .groups = "drop") %>%
  mutate(Percentage = round(100 * (Count / sum(Count)), 2))

# Stacked bar chart with 'final_result' on the x-axis, one segment per
# presentation period and the percentage written inside each segment
ggplot(
  result_summary,
  aes(x = final_result, y = Percentage, fill = code_presentation)
) +
  geom_col(position = "stack") +
  geom_text(
    aes(label = paste0(Percentage, "%")),
    position = position_stack(vjust = 0.5)
  ) +
  theme_minimal() +
  labs(
    title = "Results by Enrolled Period",
    x = "Result",
    y = "Percentage [%]",
    fill = "Enrolled Period"
  )

# Count rows per presentation / binary-result pair. `.groups = "drop"` returns
# an ungrouped table directly, so every percentage is a share of all rows.
result_summary <- studentInfo %>%
  group_by(code_presentation, final_result_2) %>%
  summarise(Count = n(), .groups = "drop") %>%
  mutate(Percentage = round(100 * (Count / sum(Count)), 2))

# Stacked bar chart with 'final_result_2' on the x-axis, one segment per
# presentation period and the percentage written inside each segment
ggplot(
  result_summary,
  aes(x = final_result_2, y = Percentage, fill = code_presentation)
) +
  geom_col(position = "stack") +
  geom_text(
    aes(label = paste0(Percentage, "%")),
    position = position_stack(vjust = 0.5)
  ) +
  theme_minimal() +
  labs(
    title = "Results by Enrolled Period",
    x = "Result",
    y = "Percentage [%]",
    fill = "Enrolled Period"
  )

# List the distinct values of the 'gender' column
unique_genders <- unique(studentInfo[["gender"]])
print(unique_genders)
## [1] "M" "F"
# Count each student once per gender (distinct id/gender pairs), then express
# each gender's count as a share of the total. summarise() over one grouping
# variable already returns an ungrouped table.
gender_summary <- studentInfo %>%
  distinct(id_student, gender) %>%
  group_by(gender) %>%
  summarise(Count = n()) %>%
  mutate(Percentage = round(Count / sum(Count) * 100, 2))

# Bar chart: one bar per gender, bar height = student count, and the count
# written in the middle of each bar
ggplot(gender_summary, aes(x = gender, y = Count, fill = gender)) +
  geom_col() +
  geom_text(aes(label = Count), position = position_stack(vjust = 0.5)) +
  labs(
    title = "Distribution - Gender",
    x = "Gender",
    y = "Quantity [Students]",
    fill = "Gender"
  ) +
  theme_minimal()

# Count rows per gender/result pair. `.groups = "drop"` returns an ungrouped
# table directly, so every percentage is a share of all rows.
result_summary <- studentInfo %>%
  group_by(gender, final_result) %>%
  summarise(Count = n(), .groups = "drop") %>%
  mutate(Percentage = round(100 * (Count / sum(Count)), 2))

# Stacked bar chart with 'final_result' on the x-axis, one segment per gender
# and the percentage written inside each segment
ggplot(result_summary, aes(x = final_result, y = Percentage, fill = gender)) +
  geom_col(position = "stack") +
  geom_text(
    aes(label = paste0(Percentage, "%")),
    position = position_stack(vjust = 0.5)
  ) +
  theme_minimal() +
  labs(
    title = "Results by Gender",
    x = "Result",
    y = "Percentage [%]",
    fill = "Gender"
  )

# Count rows per gender / binary-result pair. `.groups = "drop"` returns an
# ungrouped table directly, so every percentage is a share of all rows.
result_summary <- studentInfo %>%
  group_by(gender, final_result_2) %>%
  summarise(Count = n(), .groups = "drop") %>%
  mutate(Percentage = round(100 * (Count / sum(Count)), 2))

# Stacked bar chart with 'final_result_2' on the x-axis, one segment per gender
# and the percentage written inside each segment
ggplot(result_summary, aes(x = final_result_2, y = Percentage, fill = gender)) +
  geom_col(position = "stack") +
  geom_text(
    aes(label = paste0(Percentage, "%")),
    position = position_stack(vjust = 0.5)
  ) +
  theme_minimal() +
  labs(
    title = "Results by Gender",
    x = "Result",
    y = "Percentage [%]",
    fill = "Gender"
  )

# Distinct regions present in the data, and how many there are
unique_regions <- unique(studentInfo[["region"]])
num_unique_regions <- length(unique_regions)
print(num_unique_regions)
## [1] 13
# Students per region: each student/region pair is counted once, regions are
# sorted from most to fewest students, and each count is also expressed as a
# share of the total
regiao <- studentInfo %>%
  select(Region = region, id_student) %>%
  distinct() %>%
  group_by(Region) %>%
  summarise(Count = n()) %>%
  arrange(desc(Count)) %>%
  mutate(Percentage = round(Count / sum(Count) * 100, 2))

# Bar chart: one bar per region with its percentage written inside the bar;
# x-axis labels are rotated 45 degrees
plot <- ggplot(
  regiao,
  aes(x = Region, y = Count, text = paste0(Percentage, "%"), fill = Region)
) +
  geom_col() +
  geom_text(
    aes(label = paste0(Percentage, "%")),
    position = position_stack(vjust = 0.5),
    size = 3
  ) +
  labs(
    title = "Distribution - Locality",
    x = "Region",
    y = "Quantity [Students]"
  ) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

# Print the plot
print(plot)

# Count rows per region/result pair, then regroup by result so that each
# percentage is the region's share *within that result* (one stacked bar adds
# up to 100). `.groups = "drop"` makes the intermediate grouping explicit.
df <- studentInfo %>%
  group_by(region, final_result) %>%
  summarise(Count = n(), .groups = "drop") %>%
  group_by(final_result) %>%
  mutate(Percentage = round(100 * (Count / sum(Count)), 2)) %>%
  ungroup()

# Stacked bar chart with 'final_result' on the x-axis and one segment per
# region; the legend title comes from labs(fill = ...)
plot <- ggplot(df, aes(x = final_result, y = Percentage, fill = region)) +
  geom_col(position = "stack") +
  geom_text(
    aes(label = paste0(Percentage, "%")),
    position = position_stack(vjust = 0.5),
    size = 3
  ) +
  labs(
    title = "Results by Locality",
    x = "Result",
    y = "Percentage [%]",
    fill = "Region"
  ) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

# Show the plot
print(plot)

# Count rows per region / binary-result pair, then regroup by result so that
# each percentage is the region's share *within that result* (one stacked bar
# adds up to 100). `.groups = "drop"` makes the intermediate grouping explicit.
df <- studentInfo %>%
  group_by(region, final_result_2) %>%
  summarise(Count = n(), .groups = "drop") %>%
  group_by(final_result_2) %>%
  mutate(Percentage = round(100 * (Count / sum(Count)), 2)) %>%
  ungroup()

# Stacked bar chart with 'final_result_2' on the x-axis and one segment per
# region
plot <- ggplot(df, aes(x = final_result_2, y = Percentage, fill = region)) +
  geom_col(position = "stack") +
  geom_text(
    aes(label = paste0(Percentage, "%")),
    position = position_stack(vjust = 0.5),
    size = 3
  ) +
  labs(
    title = "Results by Locality",
    x = "Result",
    y = "Percentage [%]",
    fill = "Region"
  ) +
  theme_minimal()

# Show the plot
print(plot)

# List the distinct values of the 'highest_education' column
unique_values <- unique(studentInfo[["highest_education"]])
print(unique_values)
## [1] "HE Qualification"            "A Level or Equivalent"      
## [3] "Lower Than A Level"          "Post Graduate Qualification"
## [5] "No Formal quals"
# Rows per education level, sorted from most to fewest, with each count also
# expressed as a share of all rows
education_distribution <- studentInfo %>%
  group_by(Education_Level = highest_education) %>%
  summarise(Count = n()) %>%
  arrange(desc(Count)) %>%
  mutate(Percentage = round(Count / sum(Count) * 100, 2))

# Bar chart: one bar per education level with its percentage written inside
# the bar; x-axis labels are rotated 45 degrees
plot <- ggplot(
  education_distribution,
  aes(
    x = Education_Level,
    y = Count,
    text = paste0(Percentage, "%"),
    fill = Education_Level
  )
) +
  geom_col() +
  geom_text(
    aes(label = paste0(Percentage, "%")),
    position = position_stack(vjust = 0.5),
    size = 3
  ) +
  labs(
    title = "Distribution - Education Level",
    x = "Education Level",
    y = "Quantity [Students]",
    fill = "Education Level"
  ) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

# Print the plot
print(plot)

# Count rows per education-level/result pair. `.groups = "drop"` returns an
# ungrouped table directly, so every percentage is a share of all rows.
result_by_education <- studentInfo %>%
  group_by(Education_Level = highest_education, Result = final_result) %>%
  summarise(Count = n(), .groups = "drop") %>%
  mutate(Percentage = round(100 * (Count / sum(Count)), 2))

# Stacked bar chart with the result on the x-axis, one segment per education
# level and the percentage written inside each segment
plot <- ggplot(
  result_by_education,
  aes(
    x = Result,
    y = Percentage,
    text = paste0(Percentage, "%"),
    fill = Education_Level
  )
) +
  geom_col(position = "stack") +
  geom_text(
    aes(label = paste0(Percentage, "%")),
    position = position_stack(vjust = 0.5),
    size = 3
  ) +
  labs(
    title = "Results by Education Level",
    x = "Result",
    y = "Percentage [%]",
    fill = "Education Level"
  ) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

# Print the plot
print(plot)

# Count rows per education-level / binary-result pair. `.groups = "drop"`
# returns an ungrouped table directly, so every percentage is a share of all
# rows.
result_by_education <- studentInfo %>%
  group_by(Education_Level = highest_education, Result = final_result_2) %>%
  summarise(Count = n(), .groups = "drop") %>%
  mutate(Percentage = round(100 * (Count / sum(Count)), 2))

# Stacked bar chart with the binary result on the x-axis, one segment per
# education level and the percentage written inside each segment
plot <- ggplot(
  result_by_education,
  aes(
    x = Result,
    y = Percentage,
    text = paste0(Percentage, "%"),
    fill = Education_Level
  )
) +
  geom_col(position = "stack") +
  geom_text(
    aes(label = paste0(Percentage, "%")),
    position = position_stack(vjust = 0.5),
    size = 3
  ) +
  labs(
    title = "Results by Education Level",
    x = "Result",
    y = "Percentage [%]",
    fill = "Education Level"
  ) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

# Print the plot
print(plot)

# List the distinct values of the 'imd_band' column; the recorded output shows
# that one of them is the empty string ""
unique_values <- unique(studentInfo[["imd_band"]])
print(unique_values)
##  [1] "90-100%" "20-30%"  "30-40%"  "50-60%"  "80-90%"  "70-80%"  ""       
##  [8] "60-70%"  "40-50%"  "10-20"   "0-10%"
# Return the most frequent value (the mode) of a vector.
#
# Arguments:
#   x      A vector.
#   na.rm  If TRUE (the default), missing values are dropped before counting,
#          so NA can never be reported as the mode. Pass FALSE to let NA
#          compete like any other value.
#
# Returns a length-one vector of the same type as `x`. Ties are resolved in
# favour of the value that appears first in `x`. When nothing is left to count
# (empty input, or only NAs with na.rm = TRUE) the result is NA.
mode_func <- function(x, na.rm = TRUE) {
  if (na.rm) {
    x <- x[!is.na(x)]
  }
  if (length(x) == 0) {
    # Indexing with NA yields a single NA of x's own type
    return(x[NA_integer_])
  }
  unique_values <- unique(x)
  # match() maps every element to the position of its value in unique_values,
  # tabulate() counts those positions, and which.max() picks the first maximum
  unique_values[which.max(tabulate(match(x, unique_values)))]
}

# Fill missing IMD bands with the most frequent band of the student's region.
# The raw column marks a missing band with the empty string "" rather than NA
# (see the unique values printed above), so blanks are converted to NA first;
# testing the raw column with is.na() never matches and fills nothing.
studentInfo <- studentInfo %>%
  mutate(imd_band_2 = na_if(imd_band, "")) %>%
  group_by(region) %>%
  mutate(
    imd_band_2 = ifelse(
      is.na(imd_band_2),
      # Take the mode over the observed bands only, so "missing" itself can
      # never be chosen as the replacement value
      mode_func(imd_band_2[!is.na(imd_band_2)]),
      imd_band_2
    )
  ) %>%
  ungroup()
# Rows per raw IMD band (the unfilled 'imd_band' column), sorted by band, with
# each count also expressed as a share of all rows
imd_distribution <- studentInfo %>%
  group_by(IMD_Band = imd_band) %>%
  summarise(Count = n()) %>%
  arrange(IMD_Band) %>%
  mutate(Percentage = round(Count / sum(Count) * 100, 2))

# Bar chart: one bar per IMD band with its percentage written inside the bar;
# x-axis labels are rotated 45 degrees
plot <- ggplot(
  imd_distribution,
  aes(x = IMD_Band, y = Count, text = paste0(Percentage, "%"), fill = IMD_Band)
) +
  geom_col() +
  geom_text(
    aes(label = paste0(Percentage, "%")),
    position = position_stack(vjust = 0.5),
    size = 3
  ) +
  labs(
    title = "Distribution - IMD Band",
    x = "IMD Band",
    y = "Quantity [Students]",
    fill = "IMD Band"
  ) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

# Print the Plot
print(plot)

# Count rows per raw-IMD-band/result pair. `.groups = "drop"` returns an
# ungrouped table directly, so every percentage is a share of all rows.
df_results <- studentInfo %>%
  group_by(Index = imd_band, Result = final_result) %>%
  summarise(Count = n(), .groups = "drop") %>%
  mutate(Percentage = round(100 * (Count / sum(Count)), 2))

# Stacked bar chart with the result on the x-axis, one segment per IMD band
# and the percentage written inside each segment
plot <- ggplot(
  df_results,
  aes(x = Result, y = Percentage, text = paste0(Percentage, "%"), fill = Index)
) +
  geom_col(position = "stack") +
  geom_text(
    aes(label = paste0(Percentage, "%")),
    position = position_stack(vjust = 0.5),
    size = 3
  ) +
  labs(
    title = "Results by IMD Index",
    x = "Result",
    y = "Percentage [%]",
    fill = "IMD Index"
  ) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

# Print the plot
print(plot)

# Count rows per filled-IMD-band / binary-result pair. `.groups = "drop"`
# returns an ungrouped table directly, so every percentage is a share of all
# rows.
df_results <- studentInfo %>%
  group_by(Index = imd_band_2, Result = final_result_2) %>%
  summarise(Count = n(), .groups = "drop") %>%
  mutate(Percentage = round(100 * (Count / sum(Count)), 2))

# Stacked bar chart with the binary result on the x-axis and one segment per
# IMD band. The y aesthetic is 'Percentage', so the axis is labelled as a
# percentage rather than as a student count.
plot <- ggplot(
  df_results,
  aes(x = Result, y = Percentage, text = paste0(Percentage, "%"), fill = Index)
) +
  geom_col(position = "stack") +
  geom_text(
    aes(label = paste0(Percentage, "%")),
    position = position_stack(vjust = 0.5),
    size = 3
  ) +
  labs(
    title = "Results by Index",
    x = "Result",
    y = "Percentage [%]",
    fill = "IMD Index"
  ) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

# Print the plot
print(plot)

# List the distinct values of the 'age_band' column
unique_age_band <- unique(studentInfo[["age_band"]])
print(unique_age_band)
## [1] "55<="  "35-55" "0-35"
# Students per age band: each age-band/student pair is counted once, bands are
# sorted, and each count is also expressed as a share of the total
age_distribution <- studentInfo %>%
  select(age_band, id_student) %>%
  distinct() %>%
  group_by(Age_Group = age_band) %>%
  summarise(Count = n()) %>%
  arrange(Age_Group) %>%
  mutate(Percentage = round(Count / sum(Count) * 100, 2))

# Bar chart: one bar per age band with the student count written just above
# the bar; x-axis labels are rotated 45 degrees
plot <- ggplot(
  age_distribution,
  aes(x = Age_Group, y = Count, text = as.character(Count), fill = Age_Group)
) +
  geom_col() +
  geom_text(aes(label = as.character(Count)), vjust = -0.5, size = 3) +
  labs(
    title = "Distribution - Age",
    x = "Age Group",
    y = "Quantity [Students]",
    fill = "Age Group"
  ) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

# Print the plot
print(plot)

# Count rows per age-band/result pair. `.groups = "drop"` returns an ungrouped
# table directly, so every percentage is a share of all rows.
result_by_age <- studentInfo %>%
  group_by(Age_Group = age_band, Result = final_result) %>%
  summarise(Count = n(), .groups = "drop") %>%
  mutate(Percentage = round(100 * (Count / sum(Count)), 2))

# Stacked bar chart with the result on the x-axis, one segment per age band
# and the percentage written inside each segment
plot <- ggplot(
  result_by_age,
  aes(
    x = Result,
    y = Percentage,
    text = paste0(Percentage, "%"),
    fill = Age_Group
  )
) +
  geom_col(position = "stack") +
  geom_text(
    aes(label = paste0(Percentage, "%")),
    position = position_stack(vjust = 0.5),
    size = 3
  ) +
  labs(
    title = "Results by Age Group",
    x = "Result",
    y = "Percentage [%]",
    fill = "Age Group"
  ) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

# Print the plot
print(plot)

# Count rows per age-band / binary-result pair. `.groups = "drop_last"` keeps
# the table grouped by 'age_band' only, so each percentage below is the
# result's share *within its age band* (the shares of one band add up to 100).
df <- studentInfo %>%
  group_by(age_band, final_result_2) %>%
  summarise(Qtde = n(), .groups = "drop_last") %>%
  mutate(Percentual = round(100 * (Qtde / sum(Qtde)), 2)) %>%
  # Hand back an ungrouped table so later verbs do not silently run per band
  ungroup()

# Stacked bar chart with 'final_result_2' on the x-axis and one segment per
# age band.
# NOTE(review): the shares are per age band, so the stack for one result adds
# up several within-band percentages and can exceed 100 - confirm this is the
# intended reading of the chart.
plot <- ggplot(
  df,
  aes(
    x = final_result_2,
    y = Percentual,
    text = paste0(Percentual, "%"),
    fill = age_band
  )
) +
  geom_col(position = "stack") +
  geom_text(
    aes(label = paste0(Percentual, "%")),
    position = position_stack(vjust = 0.5),
    size = 3
  ) +
  labs(
    title = "Results by Age",
    x = "Result",
    y = "Percentage [%]",
    fill = "Age"
  ) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

# Print the Plot
print(plot)

# List the distinct values of the 'num_of_prev_attempts' column
unique_prev_attempts <- unique(studentInfo$num_of_prev_attempts)
print(unique_prev_attempts)
## [1] 0 1 2 4 3 5 6
# Rows per number of previous attempts, sorted by that number, with each count
# also expressed as a share of all rows
tentat <- studentInfo %>%
  group_by(Tentativas = num_of_prev_attempts) %>%
  summarise(Qtde = n()) %>%
  arrange(Tentativas) %>%
  mutate(Percentual = round(100 * (Qtde / sum(Qtde)), 2))

# Bar chart: one bar per number of previous attempts with its percentage
# written just above the bar. The bar height is a count of student rows, so
# the y-axis is labelled in students, not in attempts.
plot <- ggplot(
  tentat,
  aes(
    x = Tentativas,
    y = Qtde,
    text = paste0(Percentual, "%"),
    fill = factor(Tentativas)
  )
) +
  geom_col() +
  geom_text(aes(label = paste0(Percentual, "%")), vjust = -0.5, size = 3) +
  labs(
    title = "Distribution - Previous Attempts",
    x = "Attempts",
    y = "Quantity [Students]",
    fill = "Number of previous attempts"
  ) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

# Print the Plot
print(plot)

# Count rows per previous-attempts/result pair. `.groups = "drop_last"` keeps
# the table grouped by 'num_of_prev_attempts' only, so each percentage below is
# the result's share *within that number of attempts* (the shares of one
# attempt count add up to 100).
df <- studentInfo %>%
  group_by(num_of_prev_attempts, final_result) %>%
  summarise(Qtde = n(), .groups = "drop_last") %>%
  mutate(Percentual = round(100 * (Qtde / sum(Qtde)), 2)) %>%
  # Hand back an ungrouped table so later verbs do not silently run per group
  ungroup()

# Stacked bar chart with 'final_result' on the x-axis and one segment per
# number of previous attempts. The y aesthetic is 'Percentual', so the axis is
# labelled as a percentage.
# NOTE(review): the shares are per attempt count, so the stack for one result
# adds up several within-group percentages and can exceed 100 - confirm this
# is the intended reading of the chart.
fig <- ggplot(
  df,
  aes(
    x = final_result,
    y = Percentual,
    text = paste0(Percentual, "%"),
    fill = factor(num_of_prev_attempts)
  )
) +
  geom_col(position = "stack") +
  geom_text(
    aes(label = paste0(Percentual, "%")),
    position = position_stack(vjust = 0.5),
    size = 3
  ) +
  labs(
    title = "Results by Previous Attempts",
    x = "Result",
    y = "Percentage [%]",
    fill = "Number of previous attempts"
  ) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

# Print the plot
print(fig)

# Count rows per previous-attempts / binary-result pair. `.groups = "drop"`
# returns an ungrouped table directly, so every percentage is a share of all
# rows.
df <- studentInfo %>%
  group_by(Attempts = num_of_prev_attempts, Result = final_result_2) %>%
  summarise(Count = n(), .groups = "drop") %>%
  mutate(Percentage = round(100 * (Count / sum(Count)), 2))

# Stacked bar chart with the binary result on the x-axis, one segment per
# number of previous attempts and the percentage written inside each segment
ggplot(df, aes(x = Result, y = Percentage, fill = as.factor(Attempts))) +
  geom_col(position = "stack") +
  geom_text(
    aes(label = paste0(Percentage, "%")),
    position = position_stack(vjust = 0.5),
    size = 3
  ) +
  labs(
    title = "Results by Previous Attempts",
    x = "Result",
    y = "Percentage [%]",
    fill = "Number of previous attempts"
  ) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

# Largest value of 'studied_credits'
max_studied_credits <- max(studentInfo[["studied_credits"]])
print(max_studied_credits)
## [1] 655
# Smallest value of 'studied_credits'
min_studied_credits <- min(studentInfo[["studied_credits"]])
print(min_studied_credits)
## [1] 30
# Every distinct value of 'studied_credits', in ascending order
unique_sorted_studied_credits <- sort(unique(studentInfo[["studied_credits"]]))
print(unique_sorted_studied_credits)
##  [1]  30  40  45  50  55  60  65  70  75  80  85  90  95 100 105 110 115 120 130
## [20] 135 140 145 150 155 160 165 170 175 180 190 195 200 205 210 215 220 225 235
## [39] 240 250 255 270 280 300 310 315 325 330 345 355 360 370 390 400 420 430 480
## [58] 540 585 630 655
# Bin 'studied_credits' into five labelled intervals. cut() closes intervals on
# the right by default, so a value equal to a break falls into the lower bin
# (100 credits belongs to '1.Up to 100 credits').
studentInfo[["cred_bin"]] <- cut(
  studentInfo[["studied_credits"]],
  breaks = c(-Inf, 100, 200, 300, 400, Inf),
  labels = c(
    "1.Up to 100 credits",
    "2.100 to 200 credits",
    "3.200 to 300 credits",
    "4.300 to 400 credits",
    "5.Above 400 credits"
  )
)

# Print the Head of the Data Frame to Check the Results
head(studentInfo)
## # A tibble: 6 × 15
##   code_module code_presentation id_student gender region       highest_education
##   <chr>       <chr>                  <int> <chr>  <chr>        <chr>            
## 1 AAA         2013J                  11391 M      East Anglia… HE Qualification 
## 2 AAA         2013J                  28400 F      Scotland     HE Qualification 
## 3 AAA         2013J                  30268 F      North Weste… A Level or Equiv…
## 4 AAA         2013J                  31604 F      South East … A Level or Equiv…
## 5 AAA         2013J                  32885 F      West Midlan… Lower Than A Lev…
## 6 AAA         2013J                  38053 M      Wales        A Level or Equiv…
## # ℹ 9 more variables: imd_band <chr>, age_band <chr>,
## #   num_of_prev_attempts <int>, studied_credits <int>, disability <chr>,
## #   final_result <chr>, final_result_2 <chr>, imd_band_2 <chr>, cred_bin <fct>
# Load necessary libraries
library(ggplot2)

# Rows per studied-credits interval, computed from the real 'cred_bin' column
# of studentInfo instead of hard-coded placeholder values
cred <- studentInfo %>%
  group_by(cred_bin) %>%
  summarise(Qtde = n())

# Bar chart: one bar per credit interval, bar height = row count.
# The x aesthetic is the credit interval and the y aesthetic is the count, so
# the axis titles are assigned accordingly.
ggplot(cred, aes(x = cred_bin, y = Qtde, fill = cred_bin)) +
  geom_col() +
  theme_minimal() +
  labs(
    title = "Distribution - Studied Credits",
    x = "Credits",
    y = "Quantity [Students]",
    fill = "Credit Intervals"
  ) +
  # Rotate x-axis labels for better readability
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

# Count rows per credit-interval/result pair. `.groups = "drop_last"` keeps the
# table grouped by 'cred_bin' only, so each percentage below is the result's
# share *within its credit interval* (the shares of one interval add up to
# 100). The table is then ungrouped and sorted.
df <- studentInfo %>%
  group_by(cred_bin, final_result) %>%
  summarise(Qtde = n(), .groups = "drop_last") %>%
  mutate(Percentual = round(100 * (Qtde / sum(Qtde)), 2)) %>%
  ungroup() %>%
  arrange(cred_bin, final_result)

# Stacked bar chart with 'final_result' on the x-axis and one segment per
# credit interval; the title is centred and the legend title is removed.
# NOTE(review): the shares are per credit interval, so the stack for one result
# adds up several within-interval percentages and can exceed 100 - confirm
# this is the intended reading of the chart.
fig <- ggplot(
  df,
  aes(
    x = final_result,
    y = Percentual,
    text = paste0(Percentual, "%"),
    fill = cred_bin
  )
) +
  geom_col(position = "stack") +
  geom_text(
    aes(label = paste0(Percentual, "%")),
    position = position_stack(vjust = 0.5),
    size = 3
  ) +
  labs(
    title = "Results by Studied Credits",
    x = "Result",
    y = "Percentage [%]",
    fill = "Credit Intervals"
  ) +
  theme_minimal() +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),
    plot.title = element_text(hjust = 0.5),
    legend.position = "right",
    legend.title = element_blank()  # Remove legend title
  )

# Print the plot
print(fig)

# Count rows per credit-interval / binary-result pair. `.groups = "drop_last"`
# keeps the table grouped by 'cred_bin' only, so each percentage below is the
# result's share *within its credit interval* (the shares of one interval add
# up to 100). The table is then ungrouped and sorted.
df <- studentInfo %>%
  group_by(cred_bin, final_result_2) %>%
  summarise(Qtde = n(), .groups = "drop_last") %>%
  mutate(Percentual = round(100 * (Qtde / sum(Qtde)), 2)) %>%
  ungroup() %>%
  arrange(cred_bin, final_result_2)

# Stacked bar chart with 'final_result_2' on the x-axis and one segment per
# credit interval; the title is centred and the legend title is removed.
# NOTE(review): the shares are per credit interval, so the stack for one result
# adds up several within-interval percentages and can exceed 100 - confirm
# this is the intended reading of the chart.
fig <- ggplot(
  df,
  aes(
    x = final_result_2,
    y = Percentual,
    text = paste0(Percentual, "%"),
    fill = cred_bin
  )
) +
  geom_col(position = "stack") +
  geom_text(
    aes(label = paste0(Percentual, "%")),
    position = position_stack(vjust = 0.5),
    size = 3
  ) +
  labs(
    title = "Results by Studied Credits",
    x = "Result",
    y = "Percentage [%]",
    fill = "Credit Intervals"
  ) +
  theme_minimal() +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),
    plot.title = element_text(hjust = 0.5),
    legend.position = "right",
    legend.title = element_blank()  # Remove legend title
  )

# Print the plot
print(fig)

# List the distinct values of the 'disability' column
unique_disability <- unique(studentInfo[["disability"]])
print(unique_disability)
## [1] "N" "Y"
# Students per disability flag: each flag/student pair is counted once, the
# rows are sorted by flag, and each count is also expressed as a share of the
# total
defci <- studentInfo %>%
  distinct(disability, id_student) %>%
  group_by(disability) %>%
  summarise(Qtde = n()) %>%
  arrange(disability) %>%
  mutate(Percentual = round(Qtde / sum(Qtde) * 100, 2))

# Bar chart: one bar per disability flag with the student count written just
# above the bar
fig <- ggplot(
  defci,
  aes(x = disability, y = Qtde, text = paste0(Qtde), fill = disability)
) +
  geom_col() +
  geom_text(aes(label = Qtde), vjust = -0.5, size = 3) +
  labs(
    title = "Distribution - Disability",
    x = "Disability",
    y = "Quantities [Students]",
    fill = "Disability"
  ) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

# Centre the title, keep the legend on the right and drop its title
fig <- fig +
  theme(
    plot.title = element_text(hjust = 0.5),
    legend.position = "right",
    legend.title = element_blank()
  )

# Print the plot
print(fig)

# Count rows per disability/result pair. `.groups = "drop_last"` keeps the
# table grouped by 'Deficiencia' only, so each percentage below is the result's
# share *within that disability flag* (the shares of one flag add up to 100).
df <- studentInfo %>%
  group_by(Deficiencia = disability, Resultado = final_result) %>%
  summarise(Qtde = n(), .groups = "drop_last") %>%
  mutate(Percentual = round(100 * (Qtde / sum(Qtde)), 2)) %>%
  # Hand back an ungrouped table so later verbs do not silently run per flag
  ungroup()

# Stacked bar chart with the result on the x-axis and one segment per
# disability flag. The flag is mapped to `fill` so the bars themselves are
# coloured; `color` would only colour the bar outlines and the text labels.
# NOTE(review): the shares are per disability flag, so the stack for one
# result adds up two within-flag percentages and can exceed 100 - confirm this
# is the intended reading of the chart.
fig <- ggplot(
  df,
  aes(
    x = Resultado,
    y = Percentual,
    fill = Deficiencia,
    text = paste0(Percentual, "%")
  )
) +
  geom_col(position = "stack") +
  geom_text(
    aes(label = paste0(Percentual, "%")),
    position = position_stack(vjust = 0.5),
    size = 3
  ) +
  labs(
    title = "Results by Disability",
    x = "Result",
    y = "Percentage [%]",
    fill = "Disability"
  ) +
  theme_minimal() +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),
    plot.title = element_text(hjust = 0.5),
    legend.position = "right",
    legend.title = element_blank()  # Remove legend title
  )

# Print the plot
print(fig)