Load libraries and CSV file

# Global chunk options: show the code in the rendered report, but hide
# messages and warnings emitted while the chunks run
knitr::opts_chunk$set(
  echo = TRUE,
  message = FALSE,
  warning = FALSE
)
library(tidyverse)
library(ggplot2)
library(readr)
library(janitor)
library(scales)
library(DT)

# Read the dataset from the working directory; show_col_types = FALSE keeps
# readr from printing the guessed column specification
education_data <- read_csv(
  file = "education_career_success.csv",
  show_col_types = FALSE
)

# Interactive preview: 5 rows per page, horizontal scrolling for wide data
education_data %>%
  datatable(options = list(scrollX = TRUE, pageLength = 5))
# First 6 rows as a quick sanity check
education_data %>%
  head(n = 6)
# Column names, types, and leading values
glimpse(education_data)
## Rows: 5,000
## Columns: 20
## $ Student_ID            <chr> "S00001", "S00002", "S00003", "S00004", "S00005"…
## $ Age                   <dbl> 24, 21, 28, 25, 22, 24, 27, 20, 24, 28, 28, 25, …
## $ Gender                <chr> "Male", "Other", "Female", "Male", "Male", "Male…
## $ High_School_GPA       <dbl> 3.58, 2.52, 3.42, 2.43, 2.08, 2.40, 2.36, 2.68, …
## $ SAT_Score             <dbl> 1052, 1211, 1193, 1497, 1012, 1600, 1011, 1074, …
## $ University_Ranking    <dbl> 291, 112, 715, 170, 599, 631, 610, 240, 337, 138…
## $ University_GPA        <dbl> 3.96, 3.63, 2.63, 2.81, 2.48, 3.78, 3.83, 2.84, …
## $ Field_of_Study        <chr> "Arts", "Law", "Medicine", "Computer Science", "…
## $ Internships_Completed <dbl> 3, 4, 4, 3, 4, 2, 0, 1, 2, 1, 2, 2, 2, 0, 1, 3, …
## $ Projects_Completed    <dbl> 7, 7, 8, 9, 6, 3, 1, 5, 3, 5, 7, 2, 0, 4, 2, 5, …
## $ Certifications        <dbl> 2, 3, 1, 1, 4, 2, 3, 5, 0, 3, 5, 3, 5, 3, 3, 2, …
## $ Soft_Skills_Score     <dbl> 9, 8, 1, 10, 10, 2, 3, 5, 5, 10, 8, 2, 2, 8, 1, …
## $ Networking_Score      <dbl> 8, 1, 9, 6, 9, 2, 3, 1, 5, 2, 1, 9, 9, 6, 8, 9, …
## $ Job_Offers            <dbl> 5, 4, 0, 1, 4, 1, 2, 2, 2, 0, 5, 5, 2, 2, 1, 3, …
## $ Starting_Salary       <dbl> 27200, 25000, 42400, 57400, 47600, 68400, 55500,…
## $ Career_Satisfaction   <dbl> 4, 1, 9, 7, 9, 9, 7, 2, 2, 4, 9, 7, 9, 4, 9, 7, …
## $ Years_to_Promotion    <dbl> 5, 1, 3, 5, 5, 2, 4, 3, 2, 2, 1, 4, 4, 3, 3, 4, …
## $ Current_Job_Level     <chr> "Entry", "Mid", "Entry", "Mid", "Entry", "Entry"…
## $ Work_Life_Balance     <dbl> 7, 7, 7, 5, 2, 8, 3, 3, 2, 2, 2, 6, 8, 3, 6, 3, …
## $ Entrepreneurship      <chr> "No", "No", "No", "No", "No", "Yes", "No", "No",…

Initial data inspection

# Per-column summary of the whole table: quartiles and mean for numeric
# columns, length/class/mode for character columns
education_data %>%
  summary()
##   Student_ID             Age           Gender          High_School_GPA
##  Length:5000        Min.   :18.00   Length:5000        Min.   :2.000  
##  Class :character   1st Qu.:20.00   Class :character   1st Qu.:2.500  
##  Mode  :character   Median :23.00   Mode  :character   Median :2.990  
##                     Mean   :23.44                      Mean   :2.997  
##                     3rd Qu.:26.00                      3rd Qu.:3.500  
##                     Max.   :29.00                      Max.   :4.000  
##    SAT_Score    University_Ranking University_GPA Field_of_Study    
##  Min.   : 900   Min.   :   1.0     Min.   :2.00   Length:5000       
##  1st Qu.:1076   1st Qu.: 256.0     1st Qu.:2.52   Class :character  
##  Median :1257   Median : 501.5     Median :3.03   Mode  :character  
##  Mean   :1254   Mean   : 504.3     Mean   :3.02                     
##  3rd Qu.:1432   3rd Qu.: 759.0     3rd Qu.:3.51                     
##  Max.   :1600   Max.   :1000.0     Max.   :4.00                     
##  Internships_Completed Projects_Completed Certifications  Soft_Skills_Score
##  Min.   :0.000         Min.   :0.000      Min.   :0.000   Min.   : 1.000   
##  1st Qu.:1.000         1st Qu.:2.000      1st Qu.:1.000   1st Qu.: 3.000   
##  Median :2.000         Median :5.000      Median :3.000   Median : 6.000   
##  Mean   :1.982         Mean   :4.563      Mean   :2.512   Mean   : 5.546   
##  3rd Qu.:3.000         3rd Qu.:7.000      3rd Qu.:4.000   3rd Qu.: 8.000   
##  Max.   :4.000         Max.   :9.000      Max.   :5.000   Max.   :10.000   
##  Networking_Score   Job_Offers    Starting_Salary  Career_Satisfaction
##  Min.   : 1.000   Min.   :0.000   Min.   : 25000   Min.   : 1.000     
##  1st Qu.: 3.000   1st Qu.:1.000   1st Qu.: 40200   1st Qu.: 3.000     
##  Median : 6.000   Median :2.000   Median : 50300   Median : 6.000     
##  Mean   : 5.538   Mean   :2.489   Mean   : 50564   Mean   : 5.578     
##  3rd Qu.: 8.000   3rd Qu.:4.000   3rd Qu.: 60500   3rd Qu.: 8.000     
##  Max.   :10.000   Max.   :5.000   Max.   :101000   Max.   :10.000     
##  Years_to_Promotion Current_Job_Level  Work_Life_Balance Entrepreneurship  
##  Min.   :1.000      Length:5000        Min.   : 1.000    Length:5000       
##  1st Qu.:2.000      Class :character   1st Qu.: 3.000    Class :character  
##  Median :3.000      Mode  :character   Median : 6.000    Mode  :character  
##  Mean   :3.016                         Mean   : 5.482                      
##  3rd Qu.:4.000                         3rd Qu.: 8.000                      
##  Max.   :5.000                         Max.   :10.000
# Count the NA entries in each column
education_data %>%
  is.na() %>%
  colSums()
##            Student_ID                   Age                Gender 
##                     0                     0                     0 
##       High_School_GPA             SAT_Score    University_Ranking 
##                     0                     0                     0 
##        University_GPA        Field_of_Study Internships_Completed 
##                     0                     0                     0 
##    Projects_Completed        Certifications     Soft_Skills_Score 
##                     0                     0                     0 
##      Networking_Score            Job_Offers       Starting_Salary 
##                     0                     0                     0 
##   Career_Satisfaction    Years_to_Promotion     Current_Job_Level 
##                     0                     0                     0 
##     Work_Life_Balance      Entrepreneurship 
##                     0                     0
# Interactive preview: 5 rows per page, horizontal scrolling for wide data
education_data %>%
  datatable(options = list(scrollX = TRUE, pageLength = 5))

Clean and preprocess data

# Standardise column names to snake_case
# (e.g., High_School_GPA -> high_school_gpa)
education_data <- education_data %>%
  clean_names()

# List the column names to confirm the renaming
names(education_data)
##  [1] "student_id"            "age"                   "gender"               
##  [4] "high_school_gpa"       "sat_score"             "university_ranking"   
##  [7] "university_gpa"        "field_of_study"        "internships_completed"
## [10] "projects_completed"    "certifications"        "soft_skills_score"    
## [13] "networking_score"      "job_offers"            "starting_salary"      
## [16] "career_satisfaction"   "years_to_promotion"    "current_job_level"    
## [19] "work_life_balance"     "entrepreneurship"
# Recode the categorical columns as factors, then discard incomplete rows.
# NOTE(review): values outside the explicit `levels` below become NA during
# conversion, so drop_na() would also remove any such rows - confirm that
# is intended.
education_data <- education_data %>%
  mutate(
    across(c(gender, field_of_study), factor),
    entrepreneurship = factor(entrepreneurship, levels = c("No", "Yes")),
    # Ordered factor so job levels compare as Entry < Mid < Senior < Executive
    current_job_level = ordered(
      current_job_level,
      levels = c("Entry", "Mid", "Senior", "Executive")
    )
  ) %>%
  drop_na()

# Inspect the cleaned data
glimpse(education_data)
## Rows: 5,000
## Columns: 20
## $ student_id            <chr> "S00001", "S00002", "S00003", "S00004", "S00005"…
## $ age                   <dbl> 24, 21, 28, 25, 22, 24, 27, 20, 24, 28, 28, 25, …
## $ gender                <fct> Male, Other, Female, Male, Male, Male, Male, Mal…
## $ high_school_gpa       <dbl> 3.58, 2.52, 3.42, 2.43, 2.08, 2.40, 2.36, 2.68, …
## $ sat_score             <dbl> 1052, 1211, 1193, 1497, 1012, 1600, 1011, 1074, …
## $ university_ranking    <dbl> 291, 112, 715, 170, 599, 631, 610, 240, 337, 138…
## $ university_gpa        <dbl> 3.96, 3.63, 2.63, 2.81, 2.48, 3.78, 3.83, 2.84, …
## $ field_of_study        <fct> Arts, Law, Medicine, Computer Science, Engineeri…
## $ internships_completed <dbl> 3, 4, 4, 3, 4, 2, 0, 1, 2, 1, 2, 2, 2, 0, 1, 3, …
## $ projects_completed    <dbl> 7, 7, 8, 9, 6, 3, 1, 5, 3, 5, 7, 2, 0, 4, 2, 5, …
## $ certifications        <dbl> 2, 3, 1, 1, 4, 2, 3, 5, 0, 3, 5, 3, 5, 3, 3, 2, …
## $ soft_skills_score     <dbl> 9, 8, 1, 10, 10, 2, 3, 5, 5, 10, 8, 2, 2, 8, 1, …
## $ networking_score      <dbl> 8, 1, 9, 6, 9, 2, 3, 1, 5, 2, 1, 9, 9, 6, 8, 9, …
## $ job_offers            <dbl> 5, 4, 0, 1, 4, 1, 2, 2, 2, 0, 5, 5, 2, 2, 1, 3, …
## $ starting_salary       <dbl> 27200, 25000, 42400, 57400, 47600, 68400, 55500,…
## $ career_satisfaction   <dbl> 4, 1, 9, 7, 9, 9, 7, 2, 2, 4, 9, 7, 9, 4, 9, 7, …
## $ years_to_promotion    <dbl> 5, 1, 3, 5, 5, 2, 4, 3, 2, 2, 1, 4, 4, 3, 3, 4, …
## $ current_job_level     <ord> Entry, Mid, Entry, Mid, Entry, Entry, Mid, Entry…
## $ work_life_balance     <dbl> 7, 7, 7, 5, 2, 8, 3, 3, 2, 2, 2, 6, 8, 3, 6, 3, …
## $ entrepreneurship      <fct> No, No, No, No, No, Yes, No, No, No, No, No, Yes…
# Interactive preview of the cleaned data: 5 rows per page, horizontal
# scrolling for wide data
education_data %>%
  datatable(options = list(scrollX = TRUE, pageLength = 5))

Distribution of Starting Salaries

# Histogram of starting salaries (30 bins); x-axis labels use thousands
# separators instead of scientific notation
education_data %>%
  ggplot(mapping = aes(x = starting_salary)) +
  geom_histogram(bins = 30, colour = "black", fill = "skyblue") +
  scale_x_continuous(labels = scales::comma) +
  labs(
    x = "Starting Salary",
    y = "Frequency",
    title = "Distribution of Starting Salaries"
  ) +
  theme_minimal()

Starting Salary by Gender

# Boxplot of starting salary for each gender category.
# `fill = gender` colours the boxes, but the same categories already label
# the x-axis, so the fill legend is redundant and is suppressed.
ggplot(education_data, aes(x = gender, y = starting_salary, fill = gender)) +
  geom_boxplot(show.legend = FALSE) +
  scale_y_continuous(labels = comma) +  # Format y-axis with commas
  labs(
    title = "Starting Salary by Gender",
    y = "Starting Salary",
    x = "Gender"
  ) +
  theme_minimal()

GPA vs Salary

# Scatter plot of university GPA against starting salary, overlaid with a
# linear-model fit drawn in red; points are semi-transparent to reduce
# overplotting
education_data %>%
  ggplot(mapping = aes(x = university_gpa, y = starting_salary)) +
  geom_point(alpha = 0.6) +
  geom_smooth(method = "lm", colour = "red") +
  scale_y_continuous(labels = scales::comma) +  # thousands separators
  labs(
    x = "University GPA",
    y = "Starting Salary",
    title = "University GPA vs Starting Salary"
  ) +
  theme_minimal()

Field of Study vs Job Level

# Stacked bars scaled to full height (position = "fill"), so each field of
# study shows the share of each job level rather than raw counts
education_data %>%
  ggplot(mapping = aes(x = field_of_study, fill = current_job_level)) +
  geom_bar(position = "fill") +
  labs(
    x = "Field of Study",
    y = "Proportion",
    title = "Job Level Distribution by Field of Study"
  ) +
  theme_minimal()

Internship and Career Success

# Boxplot of starting salary grouped by number of internships completed.
# The internship count is converted to a factor so each distinct count gets
# its own box instead of being treated as a continuous axis.
ggplot(education_data, aes(x = as.factor(internships_completed), y = starting_salary)) +
  geom_boxplot(fill = "lightgreen") +
  scale_y_continuous(labels = comma) +   # This will show numbers like 40,000 instead of 4e+04
  labs(title = "Impact of Internships Completed on Starting Salary",
       x = "Number of Internships Completed",
       y = "Starting Salary") +
  theme_minimal()  # match the theme used by the other plots in this report

References