# Required packages
library(tidyverse)
## Warning: package 'ggplot2' was built under R version 4.3.1
## Warning: package 'dplyr' was built under R version 4.3.1
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.4.4 ✔ tibble 3.2.1
## ✔ lubridate 1.9.2 ✔ tidyr 1.3.0
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(GGally)
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
library(RColorBrewer)
library(lubridate)
# Load datasets ----
# Every CSV lives in the same download folder. The folder is named once here so
# the script can be pointed at another copy of the data by editing one line.
data_dir <- "~/Downloads/archive (1)"

course <- read.csv(file.path(data_dir, "courses.csv"))
assessment <- read.csv(file.path(data_dir, "assessments.csv"))
vle <- read.csv(file.path(data_dir, "vle.csv"))
info_stu <- read.csv(file.path(data_dir, "studentInfo.csv"))
reg_stu <- read.csv(file.path(data_dir, "studentRegistration.csv"))
as_stu <- read.csv(file.path(data_dir, "studentAssessment.csv"))
vle_stu <- read.csv(file.path(data_dir, "studentVle.csv"))

# Path of the student-information table that the rest of the script analyses
file_path <- file.path(data_dir, "studentInfo.csv")
# Read it again as comma-separated text with an explicit ISO-8859-1 encoding.
# NOTE(review): 'info_stu' above is read from the same file with default
# settings; both objects are kept in case later code uses either one.
studentInfo <- read.csv(file_path, sep = ",", fileEncoding = "ISO-8859-1")
# Preview the first rows
head(studentInfo)
## code_module code_presentation id_student gender region
## 1 AAA 2013J 11391 M East Anglian Region
## 2 AAA 2013J 28400 F Scotland
## 3 AAA 2013J 30268 F North Western Region
## 4 AAA 2013J 31604 F South East Region
## 5 AAA 2013J 32885 F West Midlands Region
## 6 AAA 2013J 38053 M Wales
## highest_education imd_band age_band num_of_prev_attempts studied_credits
## 1 HE Qualification 90-100% 55<= 0 240
## 2 HE Qualification 20-30% 35-55 0 60
## 3 A Level or Equivalent 30-40% 35-55 0 60
## 4 A Level or Equivalent 50-60% 35-55 0 60
## 5 Lower Than A Level 50-60% 0-35 0 60
## 6 A Level or Equivalent 80-90% 35-55 0 60
## disability final_result
## 1 N Pass
## 2 N Pass
## 3 Y Withdrawn
## 4 N Pass
## 5 N Pass
## 6 N Pass
# Distinct values of the 'id_student' column
unique_students <- studentInfo %>%
  pull(id_student) %>%
  unique()
# Number of distinct student identifiers
num_unique_students <- length(unique_students)
# Show the count
print(num_unique_students)
## [1] 28785
# Derive a binary outcome column from 'final_result':
# 'Withdrawn' and 'Fail' map to 'Failed'; every other value maps to 'Passed'
studentInfo$final_result_2 <- ifelse(
  studentInfo$final_result %in% c('Withdrawn', 'Fail'),
  'Failed',
  'Passed'
)
# Uncomment to inspect the modified data frame
#print(studentInfo)
# Distinct module codes, in order of first appearance
unique_code_modules <- unique(studentInfo[["code_module"]])
# Show them
print(unique_code_modules)
## [1] "AAA" "BBB" "CCC" "DDD" "EEE" "FFF" "GGG"
# Rows per module, plus each module's share of all rows (percent, 2 decimals)
module_summary <- studentInfo %>%
  group_by(code_module) %>%
  summarise(Count = n()) %>%
  mutate(Percentage = round(Count / sum(Count) * 100, 2))
# Bar chart: one bar per module, bar height = row count,
# the percentage share printed in the middle of each bar
ggplot(module_summary, aes(x = code_module, y = Count, fill = code_module)) +
  geom_col() +
  geom_text(
    aes(label = Percentage),
    position = position_stack(vjust = 0.5)
  ) +
  labs(
    title = "Distribution - Enrolled Module",
    x = "Enrolled Module",
    y = "Quantity [Students]",
    fill = "Enrolled Module"
  ) +
  theme_minimal()

# Rows per (module, final result) pair.
# `.groups = "drop"` removes the grouping before the percentage is computed, so
# Percentage is each pair's share of ALL rows (matching the other "Results by"
# summaries in this script) and the stacked bars add up to 100% overall.
result_summary <- studentInfo %>%
  group_by(code_module, final_result) %>%
  summarise(Count = n(), .groups = "drop") %>%
  mutate(Percentage = round(100 * (Count / sum(Count)), 2))
# Stacked bar chart: 'final_result' on the x-axis, 'Percentage' on the y-axis,
# one coloured segment per module, each segment labelled with its percentage
ggplot(result_summary, aes(x = final_result, y = Percentage, fill = code_module, label = paste0(Percentage, "%"))) +
  geom_bar(stat = "identity", position = "stack") +
  geom_text(position = position_stack(vjust = 0.5)) +
  theme_minimal() +
  labs(title = "Results by Enrolled Module",
       x = "Result",
       y = "Percentage [%]",
       fill = "Enrolled Module")

# Rows per (module, binary result) pair; grouping is removed before the
# percentage is computed, so Percentage is the pair's share of all rows
result_summary <- studentInfo %>%
  group_by(code_module, final_result_2) %>%
  summarise(Count = n()) %>%
  ungroup() %>%
  mutate(Percentage = round(Count / sum(Count) * 100, 2))
## `summarise()` has grouped output by 'code_module'. You can override using the
## `.groups` argument.
# Stacked bar chart: binary result on the x-axis, percentage on the y-axis,
# one segment per module, each labelled with its percentage
ggplot(result_summary, aes(x = final_result_2, y = Percentage, fill = code_module)) +
  geom_col(position = "stack") +
  geom_text(
    aes(label = paste0(Percentage, "%")),
    position = position_stack(vjust = 0.5)
  ) +
  labs(
    title = 'Results by Enrolled Module',
    x = 'Result',
    y = 'Percentage [%]',
    fill = 'Enrolled Module'
  ) +
  theme_minimal()

# Distinct presentation codes, in order of first appearance
unique_presentations <- unique(studentInfo[["code_presentation"]])
# Show them
print(unique_presentations)
## [1] "2013J" "2014J" "2013B" "2014B"
# Rows per presentation period, plus each period's share of all rows.
# summarise() over a single grouping column already returns ungrouped data.
period_summary <- studentInfo %>%
  group_by(code_presentation) %>%
  summarise(Count = n()) %>%
  mutate(Percentage = round(Count / sum(Count) * 100, 2))
# Bar chart: one bar per presentation period, bar height = row count,
# the percentage share printed in the middle of each bar
ggplot(period_summary, aes(x = code_presentation, y = Count, fill = code_presentation)) +
  geom_col() +
  geom_text(
    aes(label = Percentage),
    position = position_stack(vjust = 0.5)
  ) +
  labs(
    title = 'Distribution - Enrolled Period',
    x = 'Enrolled Period',
    y = 'Quantity [Students]',
    fill = 'Enrolled Period'
  ) +
  theme_minimal()

# Rows per (presentation period, final result) pair; Percentage is the pair's
# share of all rows because grouping is removed first
result_summary <- studentInfo %>%
  group_by(code_presentation, final_result) %>%
  summarise(Count = n()) %>%
  ungroup() %>%
  mutate(Percentage = round(Count / sum(Count) * 100, 2))
## `summarise()` has grouped output by 'code_presentation'. You can override using
## the `.groups` argument.
# Stacked bar chart: final result on the x-axis, percentage on the y-axis,
# one segment per presentation period, each labelled with its percentage
ggplot(result_summary, aes(x = final_result, y = Percentage, fill = code_presentation)) +
  geom_col(position = "stack") +
  geom_text(
    aes(label = paste0(Percentage, "%")),
    position = position_stack(vjust = 0.5)
  ) +
  labs(
    title = 'Results by Enrolled Period',
    x = 'Result',
    y = 'Percentage [%]',
    fill = 'Enrolled Period'
  ) +
  theme_minimal()

# Rows per (presentation period, binary result) pair; Percentage is the pair's
# share of all rows because grouping is removed first
result_summary <- studentInfo %>%
  group_by(code_presentation, final_result_2) %>%
  summarise(Count = n()) %>%
  ungroup() %>%
  mutate(Percentage = round(Count / sum(Count) * 100, 2))
## `summarise()` has grouped output by 'code_presentation'. You can override using
## the `.groups` argument.
# Stacked bar chart: binary result on the x-axis, percentage on the y-axis,
# one segment per presentation period, each labelled with its percentage
ggplot(result_summary, aes(x = final_result_2, y = Percentage, fill = code_presentation)) +
  geom_col(position = "stack") +
  geom_text(
    aes(label = paste0(Percentage, "%")),
    position = position_stack(vjust = 0.5)
  ) +
  labs(
    title = 'Results by Enrolled Period',
    x = 'Result',
    y = 'Percentage [%]',
    fill = 'Enrolled Period'
  ) +
  theme_minimal()

# Distinct values of the 'gender' column, in order of first appearance
unique_genders <- unique(studentInfo[["gender"]])
# Show them
print(unique_genders)
## [1] "M" "F"
# Distinct (student, gender) pairs per gender, so a student appearing in
# several rows is counted once per gender; Percentage is the share of all pairs.
# summarise() over a single grouping column already returns ungrouped data.
gender_summary <- studentInfo %>%
  distinct(id_student, gender) %>%
  group_by(gender) %>%
  summarise(Count = n()) %>%
  mutate(Percentage = round(Count / sum(Count) * 100, 2))
# Bar chart: one bar per gender, bar height = count,
# the count itself printed in the middle of each bar
ggplot(gender_summary, aes(x = gender, y = Count, fill = gender)) +
  geom_col() +
  geom_text(
    aes(label = Count),
    position = position_stack(vjust = 0.5)
  ) +
  labs(
    title = 'Distribution - Gender',
    x = 'Gender',
    y = 'Quantity [Students]',
    fill = 'Gender'
  ) +
  theme_minimal()

# Rows per (gender, final result) pair; Percentage is the pair's share of all
# rows because grouping is removed first
result_summary <- studentInfo %>%
  group_by(gender, final_result) %>%
  summarise(Count = n()) %>%
  ungroup() %>%
  mutate(Percentage = round(Count / sum(Count) * 100, 2))
## `summarise()` has grouped output by 'gender'. You can override using the
## `.groups` argument.
# Stacked bar chart: final result on the x-axis, percentage on the y-axis,
# one segment per gender, each labelled with its percentage
ggplot(result_summary, aes(x = final_result, y = Percentage, fill = gender)) +
  geom_col(position = "stack") +
  geom_text(
    aes(label = paste0(Percentage, "%")),
    position = position_stack(vjust = 0.5)
  ) +
  labs(
    title = 'Results by Gender',
    x = 'Result',
    y = 'Percentage [%]',
    fill = 'Gender'
  ) +
  theme_minimal()

# Rows per (gender, binary result) pair; Percentage is the pair's share of all
# rows because grouping is removed first
result_summary <- studentInfo %>%
  group_by(gender, final_result_2) %>%
  summarise(Count = n()) %>%
  ungroup() %>%
  mutate(Percentage = round(Count / sum(Count) * 100, 2))
## `summarise()` has grouped output by 'gender'. You can override using the
## `.groups` argument.
# Stacked bar chart: binary result on the x-axis, percentage on the y-axis,
# one segment per gender, each labelled with its percentage
ggplot(result_summary, aes(x = final_result_2, y = Percentage, fill = gender)) +
  geom_col(position = "stack") +
  geom_text(
    aes(label = paste0(Percentage, "%")),
    position = position_stack(vjust = 0.5)
  ) +
  labs(
    title = 'Results by Gender',
    x = 'Result',
    y = 'Percentage [%]',
    fill = 'Gender'
  ) +
  theme_minimal()

# Distinct values of the 'region' column
unique_regions <- unique(studentInfo[["region"]])
# How many distinct regions there are
num_unique_regions <- length(unique_regions)
# Show the count
print(num_unique_regions)
## [1] 13
# Build 'regiao': distinct (region, student) pairs per region, largest first,
# with each region's share of all pairs in percent.
# 'id_student' is temporarily carried under the name 'Count' and is replaced by
# the real count in summarise().
regiao <- studentInfo %>%
  select(Region = region, Count = id_student) %>%
  distinct() %>%
  group_by(Region) %>%
  summarise(Count = n()) %>%
  arrange(desc(Count)) %>%
  mutate(Percentage = round(Count / sum(Count) * 100, 2))
# Bar chart: one bar per region, labelled with its percentage share.
# The 'text' aesthetic is not drawn by these layers; it is kept as in the
# original (presumably for interactive tooltips - TODO confirm).
plot <- ggplot(
  regiao,
  aes(
    x = Region,
    y = Count,
    fill = Region,
    text = paste0(Percentage, "%"),
    label = paste0(Percentage, "%")
  )
) +
  geom_col() +
  geom_text(position = position_stack(vjust = 0.5), size = 3) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  labs(
    title = "Distribution - Locality",
    x = "Region",
    y = "Quantity [Students]"
  )
# Render the chart
print(plot)

# Rows per (region, final result) pair. The data is then regrouped by result,
# so Percentage is each region's share WITHIN that result and every stacked
# bar adds up to roughly 100%.
df <- studentInfo %>%
  group_by(region, final_result) %>%
  summarise(Count = n()) %>%
  group_by(final_result) %>%
  mutate(Percentage = round(Count / sum(Count) * 100, 2)) %>%
  ungroup()
## `summarise()` has grouped output by 'region'. You can override using the
## `.groups` argument.
# Stacked bar chart: final result on the x-axis, one segment per region
plot <- ggplot(df, aes(x = final_result, y = Percentage, fill = region)) +
  geom_col(position = "stack") +
  geom_text(
    aes(label = paste0(Percentage, "%")),
    position = position_stack(vjust = 0.5),
    size = 3
  ) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  labs(
    title = "Results by Locality",
    x = "Result",
    y = "Percentage [%]",
    fill = "Region"
  ) +
  guides(fill = guide_legend(title = "Region"))
# Render the chart
print(plot)

# Rows per (region, binary result) pair. The data is then regrouped by result,
# so Percentage is each region's share WITHIN that result.
df <- studentInfo %>%
  group_by(region, final_result_2) %>%
  summarise(Count = n()) %>%
  group_by(final_result_2) %>%
  mutate(Percentage = round(Count / sum(Count) * 100, 2)) %>%
  ungroup()
## `summarise()` has grouped output by 'region'. You can override using the
## `.groups` argument.
# Stacked bar chart: binary result on the x-axis, one segment per region
plot <- ggplot(df, aes(x = final_result_2, y = Percentage, fill = region)) +
  geom_col(position = "stack") +
  geom_text(
    aes(label = paste0(Percentage, "%")),
    position = position_stack(vjust = 0.5),
    size = 3
  ) +
  theme_minimal() +
  labs(
    title = "Results by Locality",
    x = "Result",
    y = "Percentage [%]",
    fill = "Region"
  )
# Render the chart
print(plot)

# Distinct values of the 'highest_education' column
unique_values <- unique(studentInfo[["highest_education"]])
# Show them
print(unique_values)
## [1] "HE Qualification" "A Level or Equivalent"
## [3] "Lower Than A Level" "Post Graduate Qualification"
## [5] "No Formal quals"
# Rows per education level, largest first, with each level's share of all rows
education_distribution <- studentInfo %>%
  group_by(Education_Level = highest_education) %>%
  summarise(Count = n()) %>%
  arrange(desc(Count)) %>%
  mutate(Percentage = round(Count / sum(Count) * 100, 2))
# Bar chart: one bar per education level, labelled with its percentage share.
# The 'text' aesthetic is not drawn by these layers; kept as in the original.
plot <- ggplot(
  education_distribution,
  aes(
    x = Education_Level,
    y = Count,
    fill = Education_Level,
    text = paste0(Percentage, "%"),
    label = paste0(Percentage, "%")
  )
) +
  geom_col() +
  geom_text(position = position_stack(vjust = 0.5), size = 3) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  labs(
    title = "Distribution - Education Level",
    x = "Education Level",
    y = "Quantity [Students]",
    fill = "Education Level"
  )
# Render the chart
print(plot)

# Rows per (education level, final result) pair; Percentage is the pair's
# share of all rows because grouping is removed first
result_by_education <- studentInfo %>%
  group_by(Education_Level = highest_education, Result = final_result) %>%
  summarise(Count = n()) %>%
  ungroup() %>%
  mutate(Percentage = round(Count / sum(Count) * 100, 2))
## `summarise()` has grouped output by 'Education_Level'. You can override using
## the `.groups` argument.
# Stacked bar chart: final result on the x-axis, one segment per education level
plot <- ggplot(
  result_by_education,
  aes(
    x = Result,
    y = Percentage,
    fill = Education_Level,
    text = paste0(Percentage, "%"),
    label = paste0(Percentage, "%")
  )
) +
  geom_col(position = "stack") +
  geom_text(position = position_stack(vjust = 0.5), size = 3) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  labs(
    title = "Results by Education Level",
    x = "Result",
    y = "Percentage [%]",
    fill = "Education Level"
  )
# Render the chart
print(plot)

# Rows per (education level, binary result) pair; Percentage is the pair's
# share of all rows because grouping is removed first
result_by_education <- studentInfo %>%
  group_by(Education_Level = highest_education, Result = final_result_2) %>%
  summarise(Count = n()) %>%
  ungroup() %>%
  mutate(Percentage = round(Count / sum(Count) * 100, 2))
## `summarise()` has grouped output by 'Education_Level'. You can override using
## the `.groups` argument.
# Stacked bar chart: binary result on the x-axis, one segment per education level
plot <- ggplot(
  result_by_education,
  aes(
    x = Result,
    y = Percentage,
    fill = Education_Level,
    text = paste0(Percentage, "%"),
    label = paste0(Percentage, "%")
  )
) +
  geom_col(position = "stack") +
  geom_text(position = position_stack(vjust = 0.5), size = 3) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  labs(
    title = "Results by Education Level",
    x = "Result",
    y = "Percentage [%]",
    fill = "Education Level"
  )
# Render the chart
print(plot)

# Distinct values of the 'imd_band' column
unique_values <- unique(studentInfo[["imd_band"]])
# Show them on the console
print(unique_values)
## [1] "90-100%" "20-30%" "30-40%" "50-60%" "80-90%" "70-80%" ""
## [8] "60-70%" "40-50%" "10-20" "0-10%"
# Most frequent value of a vector.
#   x: a vector; NA is treated as an ordinary value and can be the result.
# Returns a length-one vector holding the value that occurs most often; when
# several values tie, the one that appears first in `x` wins.
mode_func <- function(x) {
  candidates <- unique(x)
  # Position of every element of x within 'candidates', then a tally per position
  occurrences <- tabulate(match(x, candidates))
  candidates[which.max(occurrences)]
}
# Fill missing IMD bands with the most common band of the student's region.
# Missing bands arrive from the CSV as empty strings, not NA (the unique values
# printed above include ""), so blanks are converted to NA first; otherwise
# is.na() never matches and nothing is imputed.
studentInfo <- studentInfo %>%
  mutate(imd_band_2 = na_if(imd_band, "")) %>%
  group_by(region) %>%
  mutate(
    imd_band_2 = ifelse(
      is.na(imd_band_2),
      # Mode of the region's known bands only, so "missing" can never be
      # chosen as the fill value; stays NA if a region has no known band
      mode_func(imd_band_2[!is.na(imd_band_2)]),
      imd_band_2
    )
  ) %>%
  ungroup()
# Rows per IMD band, sorted by band, with each band's share of all rows.
# This uses the raw 'imd_band' column, so the blank band seen in the unique
# values above is counted as its own category.
imd_distribution <- studentInfo %>%
  group_by(IMD_Band = imd_band) %>%
  summarise(Count = n()) %>%
  arrange(IMD_Band) %>%
  mutate(Percentage = round(Count / sum(Count) * 100, 2))
# Bar chart: one bar per IMD band, labelled with its percentage share.
# The 'text' aesthetic is not drawn by these layers; kept as in the original.
plot <- ggplot(
  imd_distribution,
  aes(
    x = IMD_Band,
    y = Count,
    fill = IMD_Band,
    text = paste0(Percentage, "%"),
    label = paste0(Percentage, "%")
  )
) +
  geom_col() +
  geom_text(position = position_stack(vjust = 0.5), size = 3) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  labs(
    title = "Distribution - IMD Band",
    x = "IMD Band",
    y = "Quantity [Students]",
    fill = "IMD Band"
  )
# Render the chart
print(plot)

# Rows per (raw IMD band, final result) pair; Percentage is the pair's share
# of all rows because grouping is removed first
df_results <- studentInfo %>%
  group_by(Index = imd_band, Result = final_result) %>%
  summarise(Count = n()) %>%
  ungroup() %>%
  mutate(Percentage = round(Count / sum(Count) * 100, 2))
## `summarise()` has grouped output by 'Index'. You can override using the
## `.groups` argument.
# Stacked bar chart: final result on the x-axis, one segment per IMD band
plot <- ggplot(
  df_results,
  aes(
    x = Result,
    y = Percentage,
    fill = Index,
    text = paste0(Percentage, "%"),
    label = paste0(Percentage, "%")
  )
) +
  geom_col(position = "stack") +
  geom_text(position = position_stack(vjust = 0.5), size = 3) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  labs(
    title = "Results by IMD Index",
    x = "Result",
    y = "Percentage [%]",
    fill = "IMD Index"
  )
# Render the chart
print(plot)

# Rows per (filled IMD band, binary result) pair; grouping is removed before
# the percentage is computed, so Percentage is the pair's share of all rows
df_results <- studentInfo %>%
  group_by(Index = imd_band_2, Result = final_result_2) %>%
  summarise(Count = n()) %>%
  ungroup()
## `summarise()` has grouped output by 'Index'. You can override using the
## `.groups` argument.
df_results$Percentage <- round(100 * (df_results$Count / sum(df_results$Count)), 2)
# Stacked bar chart: binary result on the x-axis, one segment per IMD band.
# The y-axis shows 'Percentage', so it is labelled as a percentage (the
# previous label claimed it was a student count).
plot <- ggplot(df_results, aes(x = Result, y = Percentage, text = paste0(Percentage, "%"), fill = Index)) +
  geom_bar(stat = "identity", position = "stack") +
  geom_text(aes(label = paste0(Percentage, "%")), position = position_stack(vjust = 0.5), size = 3) +
  labs(
    title = "Results by Index",
    x = "Result",
    y = "Percentage [%]",
    fill = "IMD Index"
  ) +
  theme_minimal() +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1)
  )
# Render the chart
print(plot)

# Distinct values of the 'age_band' column
unique_age_band <- unique(studentInfo[["age_band"]])
# Show them
print(unique_age_band)
## [1] "55<=" "35-55" "0-35"
# Distinct (age band, student) pairs per age band, sorted by band, with each
# band's share of all pairs in percent
age_distribution <- studentInfo %>%
  distinct(age_band, id_student) %>%
  group_by(Age_Group = age_band) %>%
  summarise(Count = n()) %>%
  arrange(Age_Group) %>%
  mutate(Percentage = round(Count / sum(Count) * 100, 2))
# Bar chart: one bar per age band with the count printed just above the bar.
# The 'text' aesthetic is not drawn by these layers; kept as in the original.
plot <- ggplot(
  age_distribution,
  aes(
    x = Age_Group,
    y = Count,
    fill = Age_Group,
    text = as.character(Count),
    label = as.character(Count)
  )
) +
  geom_col() +
  geom_text(vjust = -0.5, size = 3) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  labs(
    title = "Distribution - Age",
    x = "Age Group",
    y = "Quantity [Students]",
    fill = "Age Group"
  )
# Render the chart
print(plot)

# Rows per (age band, final result) pair; Percentage is the pair's share of
# all rows because grouping is removed first
result_by_age <- studentInfo %>%
  group_by(Age_Group = age_band, Result = final_result) %>%
  summarise(Count = n()) %>%
  ungroup() %>%
  mutate(Percentage = round(Count / sum(Count) * 100, 2))
## `summarise()` has grouped output by 'Age_Group'. You can override using the
## `.groups` argument.
# Stacked bar chart: final result on the x-axis, one segment per age band
plot <- ggplot(
  result_by_age,
  aes(
    x = Result,
    y = Percentage,
    fill = Age_Group,
    text = paste0(Percentage, "%"),
    label = paste0(Percentage, "%")
  )
) +
  geom_col(position = "stack") +
  geom_text(position = position_stack(vjust = 0.5), size = 3) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  labs(
    title = "Results by Age Group",
    x = "Result",
    y = "Percentage [%]",
    fill = "Age Group"
  )
# Render the chart
print(plot)

# Rows per (age band, binary result) pair.
# `.groups = "drop"` removes the grouping before the percentage is computed, so
# Percentual is each pair's share of ALL rows - the same definition used by the
# age-band / final_result summary - and 'df' is not left grouped.
df <- studentInfo %>%
  group_by(age_band, final_result_2) %>%
  summarise(Qtde = n(), .groups = "drop") %>%
  mutate(Percentual = round(100 * (Qtde / sum(Qtde)), 2))
# Stacked bar chart: binary result on the x-axis, one segment per age band,
# each segment labelled with its percentage
plot <- ggplot(df, aes(x = final_result_2, y = Percentual, text = paste0(Percentual, "%"), fill = age_band)) +
  geom_bar(stat = "identity", position = "stack") +
  geom_text(aes(label = paste0(Percentual, "%")), position = position_stack(vjust = 0.5), size = 3) +
  labs(
    title = "Results by Age",
    x = "Result",
    y = "Percentage [%]",
    fill = "Age"
  ) +
  theme_minimal() +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1)
  )
# Render the chart
print(plot)

# Distinct values of the 'num_of_prev_attempts' column
unique_prev_attempts <- unique(studentInfo[["num_of_prev_attempts"]])
# Show them
print(unique_prev_attempts)
## [1] 0 1 2 4 3 5 6
# Rows per number of previous attempts, sorted ascending, with each value's
# share of all rows in percent
tentat <- studentInfo %>%
  group_by(Tentativas = num_of_prev_attempts) %>%
  summarise(Qtde = n()) %>%
  arrange(Tentativas) %>%
  mutate(Percentual = round(Qtde / sum(Qtde) * 100, 2))
# Bar chart: one bar per attempt count with the percentage printed just above.
# The 'text' aesthetic is not drawn by these layers; kept as in the original.
plot <- ggplot(
  tentat,
  aes(
    x = Tentativas,
    y = Qtde,
    fill = factor(Tentativas),
    text = paste0(Percentual, "%"),
    label = paste0(Percentual, "%")
  )
) +
  geom_col() +
  geom_text(vjust = -0.5, size = 3) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  labs(
    title = "Distribution - Previous Attempts",
    x = "Attempts",
    y = "Quantity [attempts]",
    fill = "Number of previous attempts"
  )
# Render the chart
print(plot)

# Rows per (number of previous attempts, final result) pair.
# `.groups = "drop"` removes the grouping before the percentage is computed, so
# Percentual is each pair's share of ALL rows - the same definition used by the
# attempts / final_result_2 summary - and 'df' is not left grouped.
df <- studentInfo %>%
  group_by(num_of_prev_attempts, final_result) %>%
  summarise(Qtde = n(), .groups = "drop") %>%
  mutate(Percentual = round(100 * (Qtde / sum(Qtde)), 2))
# Stacked bar chart: final result on the x-axis, one segment per attempt count.
# The y-axis shows 'Percentual', so it is labelled as a percentage.
fig <- ggplot(df, aes(x = final_result, y = Percentual, text = paste0(Percentual, "%"), fill = factor(num_of_prev_attempts))) +
  geom_bar(stat = "identity", position = "stack") +
  geom_text(aes(label = paste0(Percentual, "%")), position = position_stack(vjust = 0.5), size = 3) +
  labs(
    title = 'Results by Previous Attempts',
    x = 'Result',
    y = 'Percentage [%]',
    fill = 'Number of previous attempts'
  ) +
  theme_minimal() +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1)
  )
# Render the chart
print(fig)

# Rows per (number of previous attempts, binary result) pair; Percentage is
# the pair's share of all rows because grouping is removed first
df <- studentInfo %>%
  group_by(Attempts = num_of_prev_attempts, Result = final_result_2) %>%
  summarise(Count = n()) %>%
  ungroup() %>%
  mutate(Percentage = round(Count / sum(Count) * 100, 2))
## `summarise()` has grouped output by 'Attempts'. You can override using the
## `.groups` argument.
# Stacked bar chart: binary result on the x-axis, one segment per attempt count
ggplot(df, aes(x = Result, y = Percentage, fill = as.factor(Attempts))) +
  geom_col(position = "stack") +
  geom_text(
    aes(label = paste0(Percentage, "%")),
    position = position_stack(vjust = 0.5),
    size = 3
  ) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  labs(
    title = 'Results by Previous Attempts',
    x = 'Result',
    y = 'Percentage [%]',
    fill = 'Number of previous attempts'
  )

# Largest value in the 'studied_credits' column
max_studied_credits <- max(studentInfo$studied_credits)
# Show it
print(max_studied_credits)
## [1] 655
# Smallest value in the 'studied_credits' column
min_studied_credits <- min(studentInfo$studied_credits)
# Show it
print(min_studied_credits)
## [1] 30
# Every distinct 'studied_credits' value, in ascending order
unique_sorted_studied_credits <- studentInfo$studied_credits %>%
  unique() %>%
  sort()
# Show the sorted values
print(unique_sorted_studied_credits)
## [1] 30 40 45 50 55 60 65 70 75 80 85 90 95 100 105 110 115 120 130
## [20] 135 140 145 150 155 160 165 170 175 180 190 195 200 205 210 215 220 225 235
## [39] 240 250 255 270 280 300 310 315 325 330 345 355 360 370 390 400 420 430 480
## [58] 540 585 630 655
# Add 'cred_bin': 'studied_credits' cut into five labelled intervals.
# cut() uses right-closed intervals by default, so exactly 100 credits falls
# in the first bin, exactly 200 in the second, and so on.
studentInfo <- studentInfo %>%
  mutate(
    cred_bin = cut(
      studied_credits,
      breaks = c(-Inf, 100, 200, 300, 400, Inf),
      labels = c(
        '1.Up to 100 credits',
        '2.100 to 200 credits',
        '3.200 to 300 credits',
        '4.300 to 400 credits',
        '5.Above 400 credits'
      )
    )
  )
# Preview the first rows to check the new column
head(studentInfo)
## # A tibble: 6 × 15
## code_module code_presentation id_student gender region highest_education
## <chr> <chr> <int> <chr> <chr> <chr>
## 1 AAA 2013J 11391 M East Anglia… HE Qualification
## 2 AAA 2013J 28400 F Scotland HE Qualification
## 3 AAA 2013J 30268 F North Weste… A Level or Equiv…
## 4 AAA 2013J 31604 F South East … A Level or Equiv…
## 5 AAA 2013J 32885 F West Midlan… Lower Than A Lev…
## 6 AAA 2013J 38053 M Wales A Level or Equiv…
## # ℹ 9 more variables: imd_band <chr>, age_band <chr>,
## # num_of_prev_attempts <int>, studied_credits <int>, disability <chr>,
## # final_result <chr>, final_result_2 <chr>, imd_band_2 <chr>, cred_bin <fct>
# Rows of 'studentInfo' per credit interval ('cred_bin').
# The counts come from the data itself; the earlier version plotted
# hand-typed placeholder bins ("bin1".."bin4") with made-up quantities.
cred <- studentInfo %>%
  group_by(cred_bin) %>%
  summarise(Qtde = n())
# ggplot2 is already attached through library(tidyverse); this call is a no-op
library(ggplot2)
# Bar chart: one bar per credit interval, bar height = row count.
# Axis titles now match the mapped variables (they were swapped before).
ggplot(cred, aes(x = cred_bin, y = Qtde, fill = cred_bin)) +
  geom_bar(stat = "identity") +
  theme_minimal() +
  labs(title = 'Distribution - Studied Credits',
       x = 'Credits',
       y = 'Quantity [Students]',
       fill = 'Credit Intervals') +
  # Rotate x-axis labels for better readability
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

# Rows per (credit interval, final result) pair.
# `.groups = "drop"` removes the grouping before the percentage is computed, so
# Percentual is each pair's share of ALL rows (the definition used by the other
# "Results by" summaries in this script) and 'df' is not left grouped.
df <- studentInfo %>%
  group_by(cred_bin, final_result) %>%
  summarise(Qtde = n(), .groups = "drop") %>%
  mutate(Percentual = round(100 * (Qtde / sum(Qtde)), 2)) %>%
  arrange(cred_bin, final_result)
# Stacked bar chart: final result on the x-axis, one segment per credit interval
fig <- ggplot(df, aes(x = final_result, y = Percentual, text = paste0(Percentual, "%"), fill = cred_bin)) +
  geom_bar(stat = "identity", position = "stack") +
  geom_text(aes(label = paste0(Percentual, "%")), position = position_stack(vjust = 0.5), size = 3) +
  labs(
    title = 'Results by Studied Credits',
    x = 'Result',
    y = 'Percentage [%]',
    fill = 'Credit Intervals'
  ) +
  theme_minimal() +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1)
  )
# Layout tweaks: centre the title, legend on the right
fig <- fig +
  theme(
    plot.title = element_text(hjust = 0.5),
    legend.position = "right",
    legend.title = element_blank() # Remove legend title
  )
# Render the chart
print(fig)

# Rows per (credit interval, binary result) pair.
# `.groups = "drop"` removes the grouping before the percentage is computed, so
# Percentual is each pair's share of ALL rows (the definition used by the other
# "Results by" summaries in this script) and 'df' is not left grouped.
df <- studentInfo %>%
  group_by(cred_bin, final_result_2) %>%
  summarise(Qtde = n(), .groups = "drop") %>%
  mutate(Percentual = round(100 * (Qtde / sum(Qtde)), 2)) %>%
  arrange(cred_bin, final_result_2)
# Stacked bar chart: binary result on the x-axis, one segment per credit interval
fig <- ggplot(df, aes(x = final_result_2, y = Percentual, text = paste0(Percentual, "%"), fill = cred_bin)) +
  geom_bar(stat = "identity", position = "stack") +
  geom_text(aes(label = paste0(Percentual, "%")), position = position_stack(vjust = 0.5), size = 3) +
  labs(
    title = 'Results by Studied Credits',
    x = 'Result',
    y = 'Percentage [%]',
    fill = 'Credit Intervals'
  ) +
  theme_minimal() +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1)
  )
# Layout tweaks: centre the title, legend on the right
fig <- fig +
  theme(
    plot.title = element_text(hjust = 0.5),
    legend.position = "right",
    legend.title = element_blank() # Remove legend title
  )
# Render the chart
print(fig)

# Distinct values of the 'disability' column
unique_disability <- unique(studentInfo[["disability"]])
print(unique_disability)
## [1] "N" "Y"
# Distinct (disability flag, student) pairs per flag, sorted by flag, with
# each flag's share of all pairs in percent
defci <- studentInfo %>%
  distinct(disability, id_student) %>%
  group_by(disability) %>%
  summarise(Qtde = n()) %>%
  arrange(disability) %>%
  mutate(Percentual = round(Qtde / sum(Qtde) * 100, 2))
# Bar chart: one bar per flag with the count printed just above the bar.
# The 'text' aesthetic is not drawn by these layers; kept as in the original.
fig <- ggplot(
  defci,
  aes(
    x = disability,
    y = Qtde,
    fill = disability,
    text = paste0(Qtde),
    label = Qtde
  )
) +
  geom_col() +
  geom_text(vjust = -0.5, size = 3) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  labs(
    title = 'Distribution - Disability',
    x = 'Disability',
    y = 'Quantities [Students]',
    fill = 'Disability'
  ) +
  # Layout tweaks: centred title, legend on the right without a title
  theme(
    plot.title = element_text(hjust = 0.5),
    legend.position = "right",
    legend.title = element_blank()
  )
# Render the chart
print(fig)

# Rows per (disability flag, final result) pair.
# `.groups = "drop"` removes the grouping before the percentage is computed, so
# Percentual is each pair's share of ALL rows (the definition used by the other
# "Results by" summaries in this script) and 'df' is not left grouped.
df <- studentInfo %>%
  group_by(Deficiencia = disability, Resultado = final_result) %>%
  summarise(Qtde = n(), .groups = "drop") %>%
  mutate(Percentual = round(100 * (Qtde / sum(Qtde)), 2))
# Stacked bar chart: final result on the x-axis, one segment per disability flag.
# The flag is mapped to 'fill' (as in every other chart here); mapping it to
# 'color' only tinted the bar outlines and the text labels.
fig <- ggplot(df, aes(x = Resultado, y = Percentual, fill = Deficiencia, text = paste0(Percentual, "%"))) +
  geom_bar(stat = "identity", position = "stack") +
  geom_text(aes(label = paste0(Percentual, "%")), position = position_stack(vjust = 0.5), size = 3) +
  labs(
    title = 'Results by Disability',
    x = 'Result',
    y = 'Percentage [%]',
    fill = 'Disability'
  ) +
  theme_minimal() +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1)
  )
# Layout tweaks: centre the title, legend on the right
fig <- fig +
  theme(
    plot.title = element_text(hjust = 0.5),
    legend.position = "right",
    legend.title = element_blank() # Remove legend title
  )
# Render the chart
print(fig)
