## # A tibble: 6 × 20
## Student_ID Age Gender High_School_GPA SAT_Score University_Ranking
## <chr> <dbl> <chr> <dbl> <dbl> <dbl>
## 1 S00001 24 Male 3.58 1052 291
## 2 S00002 21 Other 2.52 1211 112
## 3 S00003 28 Female 3.42 1193 715
## 4 S00004 25 Male 2.43 1497 170
## 5 S00005 22 Male 2.08 1012 599
## 6 S00006 24 Male 2.4 1600 631
## # ℹ 14 more variables: University_GPA <dbl>, Field_of_Study <chr>,
## # Internships_Completed <dbl>, Projects_Completed <dbl>,
## # Certifications <dbl>, Soft_Skills_Score <dbl>, Networking_Score <dbl>,
## # Job_Offers <dbl>, Starting_Salary <dbl>, Career_Satisfaction <dbl>,
## # Years_to_Promotion <dbl>, Current_Job_Level <chr>, Work_Life_Balance <dbl>,
## # Entrepreneurship <chr>
## Rows: 5,000
## Columns: 20
## $ Student_ID <chr> "S00001", "S00002", "S00003", "S00004", "S00005"…
## $ Age <dbl> 24, 21, 28, 25, 22, 24, 27, 20, 24, 28, 28, 25, …
## $ Gender <chr> "Male", "Other", "Female", "Male", "Male", "Male…
## $ High_School_GPA <dbl> 3.58, 2.52, 3.42, 2.43, 2.08, 2.40, 2.36, 2.68, …
## $ SAT_Score <dbl> 1052, 1211, 1193, 1497, 1012, 1600, 1011, 1074, …
## $ University_Ranking <dbl> 291, 112, 715, 170, 599, 631, 610, 240, 337, 138…
## $ University_GPA <dbl> 3.96, 3.63, 2.63, 2.81, 2.48, 3.78, 3.83, 2.84, …
## $ Field_of_Study <chr> "Arts", "Law", "Medicine", "Computer Science", "…
## $ Internships_Completed <dbl> 3, 4, 4, 3, 4, 2, 0, 1, 2, 1, 2, 2, 2, 0, 1, 3, …
## $ Projects_Completed <dbl> 7, 7, 8, 9, 6, 3, 1, 5, 3, 5, 7, 2, 0, 4, 2, 5, …
## $ Certifications <dbl> 2, 3, 1, 1, 4, 2, 3, 5, 0, 3, 5, 3, 5, 3, 3, 2, …
## $ Soft_Skills_Score <dbl> 9, 8, 1, 10, 10, 2, 3, 5, 5, 10, 8, 2, 2, 8, 1, …
## $ Networking_Score <dbl> 8, 1, 9, 6, 9, 2, 3, 1, 5, 2, 1, 9, 9, 6, 8, 9, …
## $ Job_Offers <dbl> 5, 4, 0, 1, 4, 1, 2, 2, 2, 0, 5, 5, 2, 2, 1, 3, …
## $ Starting_Salary <dbl> 27200, 25000, 42400, 57400, 47600, 68400, 55500,…
## $ Career_Satisfaction <dbl> 4, 1, 9, 7, 9, 9, 7, 2, 2, 4, 9, 7, 9, 4, 9, 7, …
## $ Years_to_Promotion <dbl> 5, 1, 3, 5, 5, 2, 4, 3, 2, 2, 1, 4, 4, 3, 3, 4, …
## $ Current_Job_Level <chr> "Entry", "Mid", "Entry", "Mid", "Entry", "Entry"…
## $ Work_Life_Balance <dbl> 7, 7, 7, 5, 2, 8, 3, 3, 2, 2, 2, 6, 8, 3, 6, 3, …
## $ Entrepreneurship <chr> "No", "No", "No", "No", "No", "Yes", "No", "No",…
# Summary statistics for every column: numeric columns report min, quartiles,
# mean, and max; character columns only report length, class, and mode
summary(education_data)
## Student_ID Age Gender High_School_GPA
## Length:5000 Min. :18.00 Length:5000 Min. :2.000
## Class :character 1st Qu.:20.00 Class :character 1st Qu.:2.500
## Mode :character Median :23.00 Mode :character Median :2.990
## Mean :23.44 Mean :2.997
## 3rd Qu.:26.00 3rd Qu.:3.500
## Max. :29.00 Max. :4.000
## SAT_Score University_Ranking University_GPA Field_of_Study
## Min. : 900 Min. : 1.0 Min. :2.00 Length:5000
## 1st Qu.:1076 1st Qu.: 256.0 1st Qu.:2.52 Class :character
## Median :1257 Median : 501.5 Median :3.03 Mode :character
## Mean :1254 Mean : 504.3 Mean :3.02
## 3rd Qu.:1432 3rd Qu.: 759.0 3rd Qu.:3.51
## Max. :1600 Max. :1000.0 Max. :4.00
## Internships_Completed Projects_Completed Certifications Soft_Skills_Score
## Min. :0.000 Min. :0.000 Min. :0.000 Min. : 1.000
## 1st Qu.:1.000 1st Qu.:2.000 1st Qu.:1.000 1st Qu.: 3.000
## Median :2.000 Median :5.000 Median :3.000 Median : 6.000
## Mean :1.982 Mean :4.563 Mean :2.512 Mean : 5.546
## 3rd Qu.:3.000 3rd Qu.:7.000 3rd Qu.:4.000 3rd Qu.: 8.000
## Max. :4.000 Max. :9.000 Max. :5.000 Max. :10.000
## Networking_Score Job_Offers Starting_Salary Career_Satisfaction
## Min. : 1.000 Min. :0.000 Min. : 25000 Min. : 1.000
## 1st Qu.: 3.000 1st Qu.:1.000 1st Qu.: 40200 1st Qu.: 3.000
## Median : 6.000 Median :2.000 Median : 50300 Median : 6.000
## Mean : 5.538 Mean :2.489 Mean : 50564 Mean : 5.578
## 3rd Qu.: 8.000 3rd Qu.:4.000 3rd Qu.: 60500 3rd Qu.: 8.000
## Max. :10.000 Max. :5.000 Max. :101000 Max. :10.000
## Years_to_Promotion Current_Job_Level Work_Life_Balance Entrepreneurship
## Min. :1.000 Length:5000 Min. : 1.000 Length:5000
## 1st Qu.:2.000 Class :character 1st Qu.: 3.000 Class :character
## Median :3.000 Mode :character Median : 6.000 Mode :character
## Mean :3.016 Mean : 5.482
## 3rd Qu.:4.000 3rd Qu.: 8.000
## Max. :5.000 Max. :10.000
# Count the NA entries in each column (named vector, one element per column)
vapply(
  education_data,
  function(column) sum(is.na(column)),
  numeric(1)
)
## Student_ID Age Gender
## 0 0 0
## High_School_GPA SAT_Score University_Ranking
## 0 0 0
## University_GPA Field_of_Study Internships_Completed
## 0 0 0
## Projects_Completed Certifications Soft_Skills_Score
## 0 0 0
## Networking_Score Job_Offers Starting_Salary
## 0 0 0
## Career_Satisfaction Years_to_Promotion Current_Job_Level
## 0 0 0
## Work_Life_Balance Entrepreneurship
## 0 0
# Interactive preview of the raw data: 5 rows per page, scrollX enabled
datatable(
  data = education_data,
  options = list(scrollX = TRUE, pageLength = 5)
)
# Normalise column names to snake_case (e.g., High_School_GPA → high_school_gpa)
education_data <- education_data %>%
  clean_names()

# Print the column names to confirm the renaming
names(education_data)
## [1] "student_id" "age" "gender"
## [4] "high_school_gpa" "sat_score" "university_ranking"
## [7] "university_gpa" "field_of_study" "internships_completed"
## [10] "projects_completed" "certifications" "soft_skills_score"
## [13] "networking_score" "job_offers" "starting_salary"
## [16] "career_satisfaction" "years_to_promotion" "current_job_level"
## [19] "work_life_balance" "entrepreneurship"
# Convert the categorical character columns to factors.
# `student_id` is left as character by this step.
job_levels <- c("Entry", "Mid", "Senior", "Executive")
yes_no_levels <- c("No", "Yes")

# factor() silently turns any value outside `levels` into NA, and drop_na()
# below would then silently discard those rows. Stop early if the data
# contain a label that the explicit level sets do not cover.
unexpected_job_levels <- setdiff(
  as.character(education_data$current_job_level),
  c(job_levels, NA)
)
unexpected_yes_no <- setdiff(
  as.character(education_data$entrepreneurship),
  c(yes_no_levels, NA)
)
if (length(unexpected_job_levels) > 0 || length(unexpected_yes_no) > 0) {
  stop(
    "Unexpected category labels: ",
    paste(c(unexpected_job_levels, unexpected_yes_no), collapse = ", "),
    call. = FALSE
  )
}

education_data <- education_data %>%
  mutate(
    gender = factor(gender),
    field_of_study = factor(field_of_study),
    entrepreneurship = factor(entrepreneurship, levels = yes_no_levels),
    # Ordered factor: Entry < Mid < Senior < Executive
    current_job_level = factor(
      current_job_level,
      levels = job_levels,
      ordered = TRUE
    )
  )

# Optional: remove rows with missing values, reporting how many were dropped
rows_before <- nrow(education_data)
education_data <- drop_na(education_data)
rows_dropped <- rows_before - nrow(education_data)
if (rows_dropped > 0) {
  message("drop_na() removed ", rows_dropped, " row(s) with missing values")
}

# Preview the cleaned data
glimpse(education_data)
## Rows: 5,000
## Columns: 20
## $ student_id <chr> "S00001", "S00002", "S00003", "S00004", "S00005"…
## $ age <dbl> 24, 21, 28, 25, 22, 24, 27, 20, 24, 28, 28, 25, …
## $ gender <fct> Male, Other, Female, Male, Male, Male, Male, Mal…
## $ high_school_gpa <dbl> 3.58, 2.52, 3.42, 2.43, 2.08, 2.40, 2.36, 2.68, …
## $ sat_score <dbl> 1052, 1211, 1193, 1497, 1012, 1600, 1011, 1074, …
## $ university_ranking <dbl> 291, 112, 715, 170, 599, 631, 610, 240, 337, 138…
## $ university_gpa <dbl> 3.96, 3.63, 2.63, 2.81, 2.48, 3.78, 3.83, 2.84, …
## $ field_of_study <fct> Arts, Law, Medicine, Computer Science, Engineeri…
## $ internships_completed <dbl> 3, 4, 4, 3, 4, 2, 0, 1, 2, 1, 2, 2, 2, 0, 1, 3, …
## $ projects_completed <dbl> 7, 7, 8, 9, 6, 3, 1, 5, 3, 5, 7, 2, 0, 4, 2, 5, …
## $ certifications <dbl> 2, 3, 1, 1, 4, 2, 3, 5, 0, 3, 5, 3, 5, 3, 3, 2, …
## $ soft_skills_score <dbl> 9, 8, 1, 10, 10, 2, 3, 5, 5, 10, 8, 2, 2, 8, 1, …
## $ networking_score <dbl> 8, 1, 9, 6, 9, 2, 3, 1, 5, 2, 1, 9, 9, 6, 8, 9, …
## $ job_offers <dbl> 5, 4, 0, 1, 4, 1, 2, 2, 2, 0, 5, 5, 2, 2, 1, 3, …
## $ starting_salary <dbl> 27200, 25000, 42400, 57400, 47600, 68400, 55500,…
## $ career_satisfaction <dbl> 4, 1, 9, 7, 9, 9, 7, 2, 2, 4, 9, 7, 9, 4, 9, 7, …
## $ years_to_promotion <dbl> 5, 1, 3, 5, 5, 2, 4, 3, 2, 2, 1, 4, 4, 3, 3, 4, …
## $ current_job_level <ord> Entry, Mid, Entry, Mid, Entry, Entry, Mid, Entry…
## $ work_life_balance <dbl> 7, 7, 7, 5, 2, 8, 3, 3, 2, 2, 2, 6, 8, 3, 6, 3, …
## $ entrepreneurship <fct> No, No, No, No, No, Yes, No, No, No, No, No, Yes…
# Interactive preview of the cleaned data: 5 rows per page, scrollX enabled
education_data %>%
  datatable(options = list(scrollX = TRUE, pageLength = 5))
# Box plot of starting salary for each gender category
ggplot(education_data, aes(x = gender, y = starting_salary, fill = gender)) +
  # The fill colour repeats the x-axis grouping, so its legend is redundant
  geom_boxplot(show.legend = FALSE) +
  scale_y_continuous(labels = comma) + # Format y-axis with commas
  labs(
    title = "Starting Salary by Gender",
    x = "Gender",
    y = "Starting Salary"
  ) +
  theme_minimal()
# Scatter plot of university GPA against starting salary with a linear fit
ggplot(education_data, aes(x = university_gpa, y = starting_salary)) +
  geom_point(alpha = 0.6) +
  # Stating the formula explicitly avoids the "`geom_smooth()` using formula"
  # message that would otherwise be printed into the rendered document
  geom_smooth(method = "lm", formula = y ~ x, colour = "red") +
  scale_y_continuous(labels = comma) + # Format y-axis labels with commas
  labs(
    title = "University GPA vs Starting Salary",
    x = "University GPA",
    y = "Starting Salary"
  ) +
  theme_minimal()
# Bars normalised to 1: share of each job level within every field of study
education_data %>%
  ggplot(aes(x = field_of_study, fill = current_job_level)) +
  geom_bar(position = "fill") +
  ggtitle("Job Level Distribution by Field of Study") +
  xlab("Field of Study") +
  ylab("Proportion") +
  theme_minimal()
# Box plot of starting salary for each count of completed internships.
# The count is converted to a factor so each value gets its own box.
ggplot(
  education_data,
  aes(x = factor(internships_completed), y = starting_salary)
) +
  geom_boxplot(fill = "lightgreen") +
  scale_y_continuous(labels = comma) + # Show 40,000 instead of 4e+04
  labs(
    title = "Impact of Internships Completed on Starting Salary",
    x = "Number of Internships Completed",
    y = "Starting Salary"
  ) +
  # Same theme as the other plots in this report
  theme_minimal()
This analysis confirms that multiple educational and experiential factors shape early career outcomes. Higher university GPA and internship experience consistently correlate with higher starting salaries and faster promotions. Additionally, certain fields of study offer clearer paths to senior roles, while soft skills and networking abilities significantly enhance job satisfaction. However, observed gender-based salary gaps suggest systemic issues that require deeper institutional analysis. These insights are valuable for students, educators, and employers striving to align educational pathways with meaningful career success.
This analysis of education and career data reveals that academic performance, practical experience, and field of study play important roles in shaping early career success. Higher GPAs and internship experience are consistently linked with better starting salaries and quicker progression to higher job levels. Certain fields such as Engineering and Business provide clearer paths to advancement, while soft skills and networking also significantly contribute to job satisfaction. Notably, observed gender differences in salary highlight the need for further investigation into systemic disparities. These findings are valuable for students, educators, and employers in making informed decisions about education and career planning.