Load libraries and CSV file
# Global chunk defaults: echo the code, but hide messages and warnings
knitr::opts_chunk$set(
  echo = TRUE,
  message = FALSE,
  warning = FALSE
)
library(tidyverse)
library(ggplot2)
library(readr)
library(janitor)
library(scales)
library(DT)
# Read the CSV into a tibble; show_col_types = FALSE hides the column-spec message
education_data <- read_csv(
  file = "education_career_success.csv",
  show_col_types = FALSE
)
# Interactive preview: five rows per page with horizontal scrolling
education_data %>%
  datatable(options = list(scrollX = TRUE, pageLength = 5))
# Quick sanity check on the first six rows
education_data %>% head(n = 6L)
# Column names and types
education_data %>% glimpse()
## Rows: 5,000
## Columns: 20
## $ Student_ID <chr> "S00001", "S00002", "S00003", "S00004", "S00005"…
## $ Age <dbl> 24, 21, 28, 25, 22, 24, 27, 20, 24, 28, 28, 25, …
## $ Gender <chr> "Male", "Other", "Female", "Male", "Male", "Male…
## $ High_School_GPA <dbl> 3.58, 2.52, 3.42, 2.43, 2.08, 2.40, 2.36, 2.68, …
## $ SAT_Score <dbl> 1052, 1211, 1193, 1497, 1012, 1600, 1011, 1074, …
## $ University_Ranking <dbl> 291, 112, 715, 170, 599, 631, 610, 240, 337, 138…
## $ University_GPA <dbl> 3.96, 3.63, 2.63, 2.81, 2.48, 3.78, 3.83, 2.84, …
## $ Field_of_Study <chr> "Arts", "Law", "Medicine", "Computer Science", "…
## $ Internships_Completed <dbl> 3, 4, 4, 3, 4, 2, 0, 1, 2, 1, 2, 2, 2, 0, 1, 3, …
## $ Projects_Completed <dbl> 7, 7, 8, 9, 6, 3, 1, 5, 3, 5, 7, 2, 0, 4, 2, 5, …
## $ Certifications <dbl> 2, 3, 1, 1, 4, 2, 3, 5, 0, 3, 5, 3, 5, 3, 3, 2, …
## $ Soft_Skills_Score <dbl> 9, 8, 1, 10, 10, 2, 3, 5, 5, 10, 8, 2, 2, 8, 1, …
## $ Networking_Score <dbl> 8, 1, 9, 6, 9, 2, 3, 1, 5, 2, 1, 9, 9, 6, 8, 9, …
## $ Job_Offers <dbl> 5, 4, 0, 1, 4, 1, 2, 2, 2, 0, 5, 5, 2, 2, 1, 3, …
## $ Starting_Salary <dbl> 27200, 25000, 42400, 57400, 47600, 68400, 55500,…
## $ Career_Satisfaction <dbl> 4, 1, 9, 7, 9, 9, 7, 2, 2, 4, 9, 7, 9, 4, 9, 7, …
## $ Years_to_Promotion <dbl> 5, 1, 3, 5, 5, 2, 4, 3, 2, 2, 1, 4, 4, 3, 3, 4, …
## $ Current_Job_Level <chr> "Entry", "Mid", "Entry", "Mid", "Entry", "Entry"…
## $ Work_Life_Balance <dbl> 7, 7, 7, 5, 2, 8, 3, 3, 2, 2, 2, 6, 8, 3, 6, 3, …
## $ Entrepreneurship <chr> "No", "No", "No", "No", "No", "Yes", "No", "No",…
Initial data inspection
# Per-column summary: quartiles and mean for numeric columns,
# length/class/mode for character columns
education_data %>% summary()
## Student_ID Age Gender High_School_GPA
## Length:5000 Min. :18.00 Length:5000 Min. :2.000
## Class :character 1st Qu.:20.00 Class :character 1st Qu.:2.500
## Mode :character Median :23.00 Mode :character Median :2.990
## Mean :23.44 Mean :2.997
## 3rd Qu.:26.00 3rd Qu.:3.500
## Max. :29.00 Max. :4.000
## SAT_Score University_Ranking University_GPA Field_of_Study
## Min. : 900 Min. : 1.0 Min. :2.00 Length:5000
## 1st Qu.:1076 1st Qu.: 256.0 1st Qu.:2.52 Class :character
## Median :1257 Median : 501.5 Median :3.03 Mode :character
## Mean :1254 Mean : 504.3 Mean :3.02
## 3rd Qu.:1432 3rd Qu.: 759.0 3rd Qu.:3.51
## Max. :1600 Max. :1000.0 Max. :4.00
## Internships_Completed Projects_Completed Certifications Soft_Skills_Score
## Min. :0.000 Min. :0.000 Min. :0.000 Min. : 1.000
## 1st Qu.:1.000 1st Qu.:2.000 1st Qu.:1.000 1st Qu.: 3.000
## Median :2.000 Median :5.000 Median :3.000 Median : 6.000
## Mean :1.982 Mean :4.563 Mean :2.512 Mean : 5.546
## 3rd Qu.:3.000 3rd Qu.:7.000 3rd Qu.:4.000 3rd Qu.: 8.000
## Max. :4.000 Max. :9.000 Max. :5.000 Max. :10.000
## Networking_Score Job_Offers Starting_Salary Career_Satisfaction
## Min. : 1.000 Min. :0.000 Min. : 25000 Min. : 1.000
## 1st Qu.: 3.000 1st Qu.:1.000 1st Qu.: 40200 1st Qu.: 3.000
## Median : 6.000 Median :2.000 Median : 50300 Median : 6.000
## Mean : 5.538 Mean :2.489 Mean : 50564 Mean : 5.578
## 3rd Qu.: 8.000 3rd Qu.:4.000 3rd Qu.: 60500 3rd Qu.: 8.000
## Max. :10.000 Max. :5.000 Max. :101000 Max. :10.000
## Years_to_Promotion Current_Job_Level Work_Life_Balance Entrepreneurship
## Min. :1.000 Length:5000 Min. : 1.000 Length:5000
## 1st Qu.:2.000 Class :character 1st Qu.: 3.000 Class :character
## Median :3.000 Mode :character Median : 6.000 Mode :character
## Mean :3.016 Mean : 5.482
## 3rd Qu.:4.000 3rd Qu.: 8.000
## Max. :5.000 Max. :10.000
# Count the NA entries in each column
education_data %>%
  is.na() %>%
  colSums()
## Student_ID Age Gender
## 0 0 0
## High_School_GPA SAT_Score University_Ranking
## 0 0 0
## University_GPA Field_of_Study Internships_Completed
## 0 0 0
## Projects_Completed Certifications Soft_Skills_Score
## 0 0 0
## Networking_Score Job_Offers Starting_Salary
## 0 0 0
## Career_Satisfaction Years_to_Promotion Current_Job_Level
## 0 0 0
## Work_Life_Balance Entrepreneurship
## 0 0
# Interactive, paginated view of the data (five rows per page)
education_data %>%
  datatable(options = list(scrollX = TRUE, pageLength = 5))
Clean and preprocess data
# Standardise column names to snake_case (e.g., High_School_GPA → high_school_gpa)
education_data <- education_data %>% clean_names()
# Confirm the renamed columns
names(education_data)
## [1] "student_id" "age" "gender"
## [4] "high_school_gpa" "sat_score" "university_ranking"
## [7] "university_gpa" "field_of_study" "internships_completed"
## [10] "projects_completed" "certifications" "soft_skills_score"
## [13] "networking_score" "job_offers" "starting_salary"
## [16] "career_satisfaction" "years_to_promotion" "current_job_level"
## [19] "work_life_balance" "entrepreneurship"
# Recode the categorical text columns as factors, then drop incomplete rows
education_data <- education_data %>%
  mutate(
    # Unordered categories: levels are taken from the values present
    across(c(gender, field_of_study), factor),
    # Explicit level order, "No" first
    entrepreneurship = factor(entrepreneurship, levels = c("No", "Yes")),
    # Ordered factor running from Entry up to Executive
    current_job_level = ordered(
      current_job_level,
      levels = c("Entry", "Mid", "Senior", "Executive")
    )
  ) %>%
  # Optional: remove rows containing NA. Values not listed in the explicit
  # levels above become NA during conversion, so they are removed here too.
  drop_na()
# Inspect the structure of the cleaned data
education_data %>% glimpse()
## Rows: 5,000
## Columns: 20
## $ student_id <chr> "S00001", "S00002", "S00003", "S00004", "S00005"…
## $ age <dbl> 24, 21, 28, 25, 22, 24, 27, 20, 24, 28, 28, 25, …
## $ gender <fct> Male, Other, Female, Male, Male, Male, Male, Mal…
## $ high_school_gpa <dbl> 3.58, 2.52, 3.42, 2.43, 2.08, 2.40, 2.36, 2.68, …
## $ sat_score <dbl> 1052, 1211, 1193, 1497, 1012, 1600, 1011, 1074, …
## $ university_ranking <dbl> 291, 112, 715, 170, 599, 631, 610, 240, 337, 138…
## $ university_gpa <dbl> 3.96, 3.63, 2.63, 2.81, 2.48, 3.78, 3.83, 2.84, …
## $ field_of_study <fct> Arts, Law, Medicine, Computer Science, Engineeri…
## $ internships_completed <dbl> 3, 4, 4, 3, 4, 2, 0, 1, 2, 1, 2, 2, 2, 0, 1, 3, …
## $ projects_completed <dbl> 7, 7, 8, 9, 6, 3, 1, 5, 3, 5, 7, 2, 0, 4, 2, 5, …
## $ certifications <dbl> 2, 3, 1, 1, 4, 2, 3, 5, 0, 3, 5, 3, 5, 3, 3, 2, …
## $ soft_skills_score <dbl> 9, 8, 1, 10, 10, 2, 3, 5, 5, 10, 8, 2, 2, 8, 1, …
## $ networking_score <dbl> 8, 1, 9, 6, 9, 2, 3, 1, 5, 2, 1, 9, 9, 6, 8, 9, …
## $ job_offers <dbl> 5, 4, 0, 1, 4, 1, 2, 2, 2, 0, 5, 5, 2, 2, 1, 3, …
## $ starting_salary <dbl> 27200, 25000, 42400, 57400, 47600, 68400, 55500,…
## $ career_satisfaction <dbl> 4, 1, 9, 7, 9, 9, 7, 2, 2, 4, 9, 7, 9, 4, 9, 7, …
## $ years_to_promotion <dbl> 5, 1, 3, 5, 5, 2, 4, 3, 2, 2, 1, 4, 4, 3, 3, 4, …
## $ current_job_level <ord> Entry, Mid, Entry, Mid, Entry, Entry, Mid, Entry…
## $ work_life_balance <dbl> 7, 7, 7, 5, 2, 8, 3, 3, 2, 2, 2, 6, 8, 3, 6, 3, …
## $ entrepreneurship <fct> No, No, No, No, No, Yes, No, No, No, No, No, Yes…
# Interactive, paginated view of the cleaned data (five rows per page)
education_data %>%
  datatable(options = list(scrollX = TRUE, pageLength = 5))
Distribution of Starting Salaries
- The distribution is slightly right-skewed, with most salaries
clustered around $40,000–$60,000.
# Histogram of starting salaries (30 bins) with comma-formatted x-axis labels
education_data %>%
  ggplot(aes(x = starting_salary)) +
  geom_histogram(fill = "skyblue", colour = "black", bins = 30) +
  scale_x_continuous(labels = comma) +
  ggtitle("Distribution of Starting Salaries") +
  xlab("Starting Salary") +
  ylab("Frequency") +
  theme_minimal()

Starting Salary by Gender
- Male students tend to have a slightly higher starting salary on
average. This finding prompts further analysis of other influencing
variables.
# One boxplot of starting salary per gender, filled by gender
ggplot(
  data = education_data,
  mapping = aes(x = gender, y = starting_salary, fill = gender)
) +
  geom_boxplot() +
  scale_y_continuous(labels = comma) + # comma-separated salary labels
  ggtitle("Starting Salary by Gender") +
  xlab("Gender") +
  ylab("Starting Salary") +
  theme_minimal()

GPA vs Salary
- This plot suggests a weak-to-moderate positive correlation between
university GPA and starting salary.
# Scatter plot of university GPA against starting salary with a linear fit
education_data %>%
  ggplot(aes(x = university_gpa, y = starting_salary)) +
  geom_point(alpha = 0.6) + # semi-transparent points to reduce overplotting
  geom_smooth(method = "lm", colour = "red") + # linear-model trend line
  scale_y_continuous(labels = comma) + # comma-separated salary labels
  labs(
    title = "University GPA vs Starting Salary",
    x = "University GPA",
    y = "Starting Salary"
  ) +
  theme_minimal()

Field of Study vs Job Level
- This visualisation highlights how certain fields, like Engineering
or Business, may lead more often to higher job levels early in
careers.
# Stacked bars scaled to 1, showing the share of each job level within a field
education_data %>%
  ggplot(aes(x = field_of_study, fill = current_job_level)) +
  geom_bar(position = "fill") + # "fill" normalises each bar to proportions
  ggtitle("Job Level Distribution by Field of Study") +
  xlab("Field of Study") +
  ylab("Proportion") +
  theme_minimal()

Internship and Career Success
- Starting salaries appear to rise with the number of internships
completed; the plot shows an association, not a proven causal effect.
# Boxplots of starting salary grouped by the number of internships completed.
# The count is converted to a factor so that each value gets its own box.
ggplot(
  education_data,
  aes(x = factor(internships_completed), y = starting_salary)
) +
  geom_boxplot(fill = "lightgreen") +
  scale_y_continuous(labels = comma) + # show 40,000 rather than 4e+04
  labs(
    title = "Impact of Internships Completed on Starting Salary",
    x = "Number of Internships Completed",
    y = "Starting Salary"
  ) +
  theme_minimal() # same theme as the other plots in this report
