Education and Career Success

Load libraries and CSV file

# Default chunk options for the whole document: show the code (echo = TRUE)
# and hide messages and warnings in the rendered output.
knitr::opts_chunk$set(echo = TRUE, message = FALSE, warning = FALSE)
library(tidyverse)  # core tidyverse packages (dplyr, tidyr, ggplot2, readr, ...)
library(ggplot2)    # plotting; also attached by tidyverse, kept explicit here
library(readr)      # read_csv(); also attached by tidyverse, kept explicit here
library(janitor)    # clean_names()
library(scales)     # comma() axis-label formatter
library(DT)         # datatable() interactive tables

# Read the CSV into a tibble; show_col_types = FALSE silences the
# column-specification message that read_csv() would otherwise emit.
education_data <- read_csv(
  file = "education_career_success.csv",
  show_col_types = FALSE
)

# Interactive preview: horizontally scrollable, 5 rows per page
education_data %>%
  datatable(options = list(scrollX = TRUE, pageLength = 5))

# First 6 rows as a quick sanity check
head(education_data, n = 6)

# Column names and types
glimpse(education_data)

## Rows: 5,000
## Columns: 20
## $ Student_ID            <chr> "S00001", "S00002", "S00003", "S00004", "S00005"…
## $ Age                   <dbl> 24, 21, 28, 25, 22, 24, 27, 20, 24, 28, 28, 25, …
## $ Gender                <chr> "Male", "Other", "Female", "Male", "Male", "Male…
## $ High_School_GPA       <dbl> 3.58, 2.52, 3.42, 2.43, 2.08, 2.40, 2.36, 2.68, …
## $ SAT_Score             <dbl> 1052, 1211, 1193, 1497, 1012, 1600, 1011, 1074, …
## $ University_Ranking    <dbl> 291, 112, 715, 170, 599, 631, 610, 240, 337, 138…
## $ University_GPA        <dbl> 3.96, 3.63, 2.63, 2.81, 2.48, 3.78, 3.83, 2.84, …
## $ Field_of_Study        <chr> "Arts", "Law", "Medicine", "Computer Science", "…
## $ Internships_Completed <dbl> 3, 4, 4, 3, 4, 2, 0, 1, 2, 1, 2, 2, 2, 0, 1, 3, …
## $ Projects_Completed    <dbl> 7, 7, 8, 9, 6, 3, 1, 5, 3, 5, 7, 2, 0, 4, 2, 5, …
## $ Certifications        <dbl> 2, 3, 1, 1, 4, 2, 3, 5, 0, 3, 5, 3, 5, 3, 3, 2, …
## $ Soft_Skills_Score     <dbl> 9, 8, 1, 10, 10, 2, 3, 5, 5, 10, 8, 2, 2, 8, 1, …
## $ Networking_Score      <dbl> 8, 1, 9, 6, 9, 2, 3, 1, 5, 2, 1, 9, 9, 6, 8, 9, …
## $ Job_Offers            <dbl> 5, 4, 0, 1, 4, 1, 2, 2, 2, 0, 5, 5, 2, 2, 1, 3, …
## $ Starting_Salary       <dbl> 27200, 25000, 42400, 57400, 47600, 68400, 55500,…
## $ Career_Satisfaction   <dbl> 4, 1, 9, 7, 9, 9, 7, 2, 2, 4, 9, 7, 9, 4, 9, 7, …
## $ Years_to_Promotion    <dbl> 5, 1, 3, 5, 5, 2, 4, 3, 2, 2, 1, 4, 4, 3, 3, 4, …
## $ Current_Job_Level     <chr> "Entry", "Mid", "Entry", "Mid", "Entry", "Entry"…
## $ Work_Life_Balance     <dbl> 7, 7, 7, 5, 2, 8, 3, 3, 2, 2, 2, 6, 8, 3, 6, 3, …
## $ Entrepreneurship      <chr> "No", "No", "No", "No", "No", "Yes", "No", "No",…

Initial data inspection

# Per-column summary of the whole data set: quantiles and mean for numeric
# columns, length/class/mode for character columns.
education_data %>%
  summary()

##   Student_ID             Age           Gender          High_School_GPA
##  Length:5000        Min.   :18.00   Length:5000        Min.   :2.000  
##  Class :character   1st Qu.:20.00   Class :character   1st Qu.:2.500  
##  Mode  :character   Median :23.00   Mode  :character   Median :2.990  
##                     Mean   :23.44                      Mean   :2.997  
##                     3rd Qu.:26.00                      3rd Qu.:3.500  
##                     Max.   :29.00                      Max.   :4.000  
##    SAT_Score    University_Ranking University_GPA Field_of_Study    
##  Min.   : 900   Min.   :   1.0     Min.   :2.00   Length:5000       
##  1st Qu.:1076   1st Qu.: 256.0     1st Qu.:2.52   Class :character  
##  Median :1257   Median : 501.5     Median :3.03   Mode  :character  
##  Mean   :1254   Mean   : 504.3     Mean   :3.02                     
##  3rd Qu.:1432   3rd Qu.: 759.0     3rd Qu.:3.51                     
##  Max.   :1600   Max.   :1000.0     Max.   :4.00                     
##  Internships_Completed Projects_Completed Certifications  Soft_Skills_Score
##  Min.   :0.000         Min.   :0.000      Min.   :0.000   Min.   : 1.000   
##  1st Qu.:1.000         1st Qu.:2.000      1st Qu.:1.000   1st Qu.: 3.000   
##  Median :2.000         Median :5.000      Median :3.000   Median : 6.000   
##  Mean   :1.982         Mean   :4.563      Mean   :2.512   Mean   : 5.546   
##  3rd Qu.:3.000         3rd Qu.:7.000      3rd Qu.:4.000   3rd Qu.: 8.000   
##  Max.   :4.000         Max.   :9.000      Max.   :5.000   Max.   :10.000   
##  Networking_Score   Job_Offers    Starting_Salary  Career_Satisfaction
##  Min.   : 1.000   Min.   :0.000   Min.   : 25000   Min.   : 1.000     
##  1st Qu.: 3.000   1st Qu.:1.000   1st Qu.: 40200   1st Qu.: 3.000     
##  Median : 6.000   Median :2.000   Median : 50300   Median : 6.000     
##  Mean   : 5.538   Mean   :2.489   Mean   : 50564   Mean   : 5.578     
##  3rd Qu.: 8.000   3rd Qu.:4.000   3rd Qu.: 60500   3rd Qu.: 8.000     
##  Max.   :10.000   Max.   :5.000   Max.   :101000   Max.   :10.000     
##  Years_to_Promotion Current_Job_Level  Work_Life_Balance Entrepreneurship  
##  Min.   :1.000      Length:5000        Min.   : 1.000    Length:5000       
##  1st Qu.:2.000      Class :character   1st Qu.: 3.000    Class :character  
##  Median :3.000      Mode  :character   Median : 6.000    Mode  :character  
##  Mean   :3.016                         Mean   : 5.482                      
##  3rd Qu.:4.000                         3rd Qu.: 8.000                      
##  Max.   :5.000                         Max.   :10.000

# Count missing values in each column: build the logical NA matrix, then
# sum it column-wise.
education_data %>%
  is.na() %>%
  colSums()

##            Student_ID                   Age                Gender 
##                     0                     0                     0 
##       High_School_GPA             SAT_Score    University_Ranking 
##                     0                     0                     0 
##        University_GPA        Field_of_Study Internships_Completed 
##                     0                     0                     0 
##    Projects_Completed        Certifications     Soft_Skills_Score 
##                     0                     0                     0 
##      Networking_Score            Job_Offers       Starting_Salary 
##                     0                     0                     0 
##   Career_Satisfaction    Years_to_Promotion     Current_Job_Level 
##                     0                     0                     0 
##     Work_Life_Balance      Entrepreneurship 
##                     0                     0

# Interactive preview of the raw data: horizontally scrollable, 5 rows per page
education_data %>%
  datatable(options = list(scrollX = TRUE, pageLength = 5))

Clean and preprocess data

# Normalise column names to snake_case (e.g., High_School_GPA → high_school_gpa)
education_data <- education_data %>%
  clean_names()

# List the column names to confirm the renaming
names(education_data)

##  [1] "student_id"            "age"                   "gender"               
##  [4] "high_school_gpa"       "sat_score"             "university_ranking"   
##  [7] "university_gpa"        "field_of_study"        "internships_completed"
## [10] "projects_completed"    "certifications"        "soft_skills_score"    
## [13] "networking_score"      "job_offers"            "starting_salary"      
## [16] "career_satisfaction"   "years_to_promotion"    "current_job_level"    
## [19] "work_life_balance"     "entrepreneurship"

# Convert categorical character columns to factors.
#
# factor() with an explicit `levels` argument silently turns every value that
# is not listed into NA, and the drop_na() call further down would then
# discard those rows without notice. The two fixed-level columns are therefore
# checked first so that unexpected categories are reported instead of
# vanishing. (The document-wide chunk options hide warnings and messages in
# the knitted report; they are visible when the code is run interactively.)
expected_levels <- list(
  entrepreneurship = c("No", "Yes"),
  current_job_level = c("Entry", "Mid", "Senior", "Executive")
)

for (col_name in names(expected_levels)) {
  observed <- unique(education_data[[col_name]])
  unexpected <- setdiff(
    observed[!is.na(observed)],
    expected_levels[[col_name]]
  )
  if (length(unexpected) > 0) {
    warning(
      "Unexpected values in `", col_name, "` will become NA: ",
      paste(unexpected, collapse = ", "),
      call. = FALSE
    )
  }
}

education_data <- education_data %>%
  mutate(
    gender = factor(gender),
    field_of_study = factor(field_of_study),
    entrepreneurship = factor(
      entrepreneurship,
      levels = expected_levels$entrepreneurship
    ),
    # Ordered factor: Entry < Mid < Senior < Executive
    current_job_level = factor(
      current_job_level,
      ordered = TRUE,
      levels = expected_levels$current_job_level
    )
  )

# Optional: remove rows with missing values, reporting how many were removed
rows_before <- nrow(education_data)
education_data <- drop_na(education_data)
rows_dropped <- rows_before - nrow(education_data)
if (rows_dropped > 0) {
  message("drop_na() removed ", rows_dropped, " row(s) containing NA.")
}

# Preview the cleaned data
glimpse(education_data)

## Rows: 5,000
## Columns: 20
## $ student_id            <chr> "S00001", "S00002", "S00003", "S00004", "S00005"…
## $ age                   <dbl> 24, 21, 28, 25, 22, 24, 27, 20, 24, 28, 28, 25, …
## $ gender                <fct> Male, Other, Female, Male, Male, Male, Male, Mal…
## $ high_school_gpa       <dbl> 3.58, 2.52, 3.42, 2.43, 2.08, 2.40, 2.36, 2.68, …
## $ sat_score             <dbl> 1052, 1211, 1193, 1497, 1012, 1600, 1011, 1074, …
## $ university_ranking    <dbl> 291, 112, 715, 170, 599, 631, 610, 240, 337, 138…
## $ university_gpa        <dbl> 3.96, 3.63, 2.63, 2.81, 2.48, 3.78, 3.83, 2.84, …
## $ field_of_study        <fct> Arts, Law, Medicine, Computer Science, Engineeri…
## $ internships_completed <dbl> 3, 4, 4, 3, 4, 2, 0, 1, 2, 1, 2, 2, 2, 0, 1, 3, …
## $ projects_completed    <dbl> 7, 7, 8, 9, 6, 3, 1, 5, 3, 5, 7, 2, 0, 4, 2, 5, …
## $ certifications        <dbl> 2, 3, 1, 1, 4, 2, 3, 5, 0, 3, 5, 3, 5, 3, 3, 2, …
## $ soft_skills_score     <dbl> 9, 8, 1, 10, 10, 2, 3, 5, 5, 10, 8, 2, 2, 8, 1, …
## $ networking_score      <dbl> 8, 1, 9, 6, 9, 2, 3, 1, 5, 2, 1, 9, 9, 6, 8, 9, …
## $ job_offers            <dbl> 5, 4, 0, 1, 4, 1, 2, 2, 2, 0, 5, 5, 2, 2, 1, 3, …
## $ starting_salary       <dbl> 27200, 25000, 42400, 57400, 47600, 68400, 55500,…
## $ career_satisfaction   <dbl> 4, 1, 9, 7, 9, 9, 7, 2, 2, 4, 9, 7, 9, 4, 9, 7, …
## $ years_to_promotion    <dbl> 5, 1, 3, 5, 5, 2, 4, 3, 2, 2, 1, 4, 4, 3, 3, 4, …
## $ current_job_level     <ord> Entry, Mid, Entry, Mid, Entry, Entry, Mid, Entry…
## $ work_life_balance     <dbl> 7, 7, 7, 5, 2, 8, 3, 3, 2, 2, 2, 6, 8, 3, 6, 3, …
## $ entrepreneurship      <fct> No, No, No, No, No, Yes, No, No, No, No, No, Yes…

# Interactive preview of the cleaned data: horizontally scrollable,
# 5 rows per page
education_data %>%
  datatable(options = list(scrollX = TRUE, pageLength = 5))

Distribution of Starting Salaries

The distribution is slightly right-skewed, with most salaries clustered around $40,000–$60,000.

# Histogram of starting salaries (30 bins)
education_data %>%
  ggplot(mapping = aes(x = starting_salary)) +
  geom_histogram(bins = 30, fill = "skyblue", colour = "black") +
  # Comma-separated thousands on the x-axis
  scale_x_continuous(labels = scales::comma) +
  labs(
    title = "Distribution of Starting Salaries",
    x = "Starting Salary",
    y = "Frequency"
  ) +
  theme_minimal()

Starting Salary by Gender

Male students tend to have a slightly higher starting salary on average. This finding prompts further analysis of other influencing variables.

# Boxplot of starting salary for each gender, boxes coloured by gender
education_data %>%
  ggplot(mapping = aes(x = gender, y = starting_salary, fill = gender)) +
  geom_boxplot() +
  # Comma-separated thousands on the y-axis
  scale_y_continuous(labels = scales::comma) +
  labs(
    title = "Starting Salary by Gender",
    x = "Gender",
    y = "Starting Salary"
  ) +
  theme_minimal()

GPA vs Salary

This plot suggests a weak-to-moderate positive correlation between university GPA and starting salary.

# Scatter plot of university GPA against starting salary with a red
# linear-model trend line
education_data %>%
  ggplot(mapping = aes(x = university_gpa, y = starting_salary)) +
  geom_point(alpha = 0.6) +
  geom_smooth(method = "lm", colour = "red") +
  # Comma-separated thousands on the y-axis
  scale_y_continuous(labels = scales::comma) +
  labs(
    title = "University GPA vs Starting Salary",
    x = "University GPA",
    y = "Starting Salary"
  ) +
  theme_minimal()

Field of Study vs Job Level

This visualisation highlights how certain fields, such as Engineering or Business, may more often lead to higher job levels early in a career.

# Stacked bars scaled to proportions (position = "fill"): share of each
# current job level within every field of study
education_data %>%
  ggplot(mapping = aes(x = field_of_study, fill = current_job_level)) +
  geom_bar(position = "fill") +
  labs(
    title = "Job Level Distribution by Field of Study",
    x = "Field of Study",
    y = "Proportion"
  ) +
  theme_minimal()

Internship and Career Success

Internships appear to positively influence starting salaries, with those completing more internships earning higher pay.

# Boxplot of starting salary for each number of internships completed.
# The internship count is converted to a factor so that every distinct count
# gets its own box on a discrete x-axis.
ggplot(
  education_data,
  aes(x = as.factor(internships_completed), y = starting_salary)
) +
  geom_boxplot(fill = "lightgreen") +
  # Show numbers like 40,000 instead of 4e+04 on the y-axis
  scale_y_continuous(labels = comma) +
  labs(
    title = "Impact of Internships Completed on Starting Salary",
    x = "Number of Internships Completed",
    y = "Starting Salary"
  ) +
  # Same theme as the other plots in this report, for a consistent look
  theme_minimal()

References

Dataset: [Kaggle - Education and Career Success](https://www.kaggle.com/datasets/adilshamim8/education-and-career-success)

Education and Career Success

Deepthi Bommashettihalli Lakshmipathi

Load libraries and CSV file

Initial data inspection

Clean and preprocess data

Distribution of Starting Salaries

Starting Salary by Gender

GPA vs Salary

Field of Study vs Job Level

Internship and Career Success

References