Structure of dataset
# Compact overview of the data set: one line per column with its type and
# first few values
utils::str(education_career_success)
## 'data.frame': 5000 obs. of 20 variables:
## $ Student_ID : chr "S00001" "S00002" "S00003" "S00004" ...
## $ Age : int 24 21 28 25 22 24 27 20 24 28 ...
## $ Gender : chr "Male" "Other" "Female" "Male" ...
## $ High_School_GPA : num 3.58 2.52 3.42 2.43 2.08 2.4 2.36 2.68 2.84 3.02 ...
## $ SAT_Score : int 1052 1211 1193 1497 1012 1600 1011 1074 1201 1415 ...
## $ University_Ranking : int 291 112 715 170 599 631 610 240 337 138 ...
## $ University_GPA : num 3.96 3.63 2.63 2.81 2.48 3.78 3.83 2.84 3.31 2.33 ...
## $ Field_of_Study : chr "Arts" "Law" "Medicine" "Computer Science" ...
## $ Internships_Completed: int 3 4 4 3 4 2 0 1 2 1 ...
## $ Projects_Completed : int 7 7 8 9 6 3 1 5 3 5 ...
## $ Certifications : int 2 3 1 1 4 2 3 5 0 3 ...
## $ Soft_Skills_Score : int 9 8 1 10 10 2 3 5 5 10 ...
## $ Networking_Score : int 8 1 9 6 9 2 3 1 5 2 ...
## $ Job_Offers : int 5 4 0 1 4 1 2 2 2 0 ...
## $ Starting_Salary : num 27200 25000 42400 57400 47600 68400 55500 38000 68900 58900 ...
## $ Career_Satisfaction : int 4 1 9 7 9 9 7 2 2 4 ...
## $ Years_to_Promotion : int 5 1 3 5 5 2 4 3 2 2 ...
## $ Current_Job_Level : chr "Entry" "Mid" "Entry" "Mid" ...
## $ Work_Life_Balance : int 7 7 7 5 2 8 3 3 2 2 ...
## $ Entrepreneurship : chr "No" "No" "No" "No" ...
Variable List of dataset
# knitr renders R objects (data frames, tables) as cleanly formatted tables,
# which keeps this wide data set readable in the report.
library(knitr)
# Build an index/name lookup so each variable gets a running number.
# seq_along() is used instead of 1:length(): for a data set without columns
# 1:length() would produce c(1, 0) and data.frame() would stop on the length
# mismatch, while seq_along() produces an empty index.
kable(
  data.frame(
    Index = seq_along(names(education_career_success)),
    Variable = names(education_career_success)
  )
)
| 1 |
Student_ID |
| 2 |
Age |
| 3 |
Gender |
| 4 |
High_School_GPA |
| 5 |
SAT_Score |
| 6 |
University_Ranking |
| 7 |
University_GPA |
| 8 |
Field_of_Study |
| 9 |
Internships_Completed |
| 10 |
Projects_Completed |
| 11 |
Certifications |
| 12 |
Soft_Skills_Score |
| 13 |
Networking_Score |
| 14 |
Job_Offers |
| 15 |
Starting_Salary |
| 16 |
Career_Satisfaction |
| 17 |
Years_to_Promotion |
| 18 |
Current_Job_Level |
| 19 |
Work_Life_Balance |
| 20 |
Entrepreneurship |
Printing first 15 rows
library(knitr)
# Render only the leading 15 observations as a formatted table
kable(
  head(education_career_success, n = 15)
)
| S00001 |
24 |
Male |
3.58 |
1052 |
291 |
3.96 |
Arts |
3 |
7 |
2 |
9 |
8 |
5 |
27200 |
4 |
5 |
Entry |
7 |
No |
| S00002 |
21 |
Other |
2.52 |
1211 |
112 |
3.63 |
Law |
4 |
7 |
3 |
8 |
1 |
4 |
25000 |
1 |
1 |
Mid |
7 |
No |
| S00003 |
28 |
Female |
3.42 |
1193 |
715 |
2.63 |
Medicine |
4 |
8 |
1 |
1 |
9 |
0 |
42400 |
9 |
3 |
Entry |
7 |
No |
| S00004 |
25 |
Male |
2.43 |
1497 |
170 |
2.81 |
Computer Science |
3 |
9 |
1 |
10 |
6 |
1 |
57400 |
7 |
5 |
Mid |
5 |
No |
| S00005 |
22 |
Male |
2.08 |
1012 |
599 |
2.48 |
Engineering |
4 |
6 |
4 |
10 |
9 |
4 |
47600 |
9 |
5 |
Entry |
2 |
No |
| S00006 |
24 |
Male |
2.40 |
1600 |
631 |
3.78 |
Law |
2 |
3 |
2 |
2 |
2 |
1 |
68400 |
9 |
2 |
Entry |
8 |
Yes |
| S00007 |
27 |
Male |
2.36 |
1011 |
610 |
3.83 |
Computer Science |
0 |
1 |
3 |
3 |
3 |
2 |
55500 |
7 |
4 |
Mid |
3 |
No |
| S00008 |
20 |
Male |
2.68 |
1074 |
240 |
2.84 |
Computer Science |
1 |
5 |
5 |
5 |
1 |
2 |
38000 |
2 |
3 |
Entry |
3 |
No |
| S00009 |
24 |
Male |
2.84 |
1201 |
337 |
3.31 |
Business |
2 |
3 |
0 |
5 |
5 |
2 |
68900 |
2 |
2 |
Entry |
2 |
No |
| S00010 |
28 |
Male |
3.02 |
1415 |
138 |
2.33 |
Computer Science |
1 |
5 |
3 |
10 |
2 |
0 |
58900 |
4 |
2 |
Senior |
2 |
No |
| S00011 |
28 |
Female |
2.95 |
1120 |
594 |
2.87 |
Mathematics |
2 |
7 |
5 |
8 |
1 |
5 |
26300 |
9 |
1 |
Entry |
2 |
No |
| S00012 |
25 |
Female |
2.54 |
1070 |
236 |
3.26 |
Law |
2 |
2 |
3 |
2 |
9 |
5 |
35100 |
7 |
4 |
Mid |
6 |
Yes |
| S00013 |
22 |
Female |
2.06 |
1217 |
648 |
2.77 |
Engineering |
2 |
0 |
5 |
2 |
9 |
2 |
42600 |
9 |
4 |
Senior |
8 |
No |
| S00014 |
21 |
Male |
3.21 |
1112 |
794 |
2.72 |
Arts |
0 |
4 |
3 |
8 |
6 |
2 |
76500 |
4 |
3 |
Entry |
3 |
No |
| S00015 |
25 |
Male |
2.79 |
1152 |
3 |
2.00 |
Business |
1 |
2 |
3 |
1 |
8 |
1 |
61100 |
9 |
3 |
Entry |
6 |
Yes |
User defined function using any of the variables from the data
set
# check_salary: label one or more salaries relative to the 50,000 mark.
#
# Args:
#   Starting_Salary: a salary value, or a vector of salary values.
# Returns:
#   A character vector with one label per input value:
#   "Salary is Above 50K" where the salary is strictly greater than 50000,
#   "Salary is Below 50K" otherwise (a salary of exactly 50000 therefore
#   receives the "Below" label). NA salaries yield NA.
#
# ifelse() is used instead of if/else so the function can be applied to a
# whole column (e.g. education_career_success$Starting_Salary); if () only
# accepts a single condition value.
check_salary <- function(Starting_Salary) {
  ifelse(
    Starting_Salary > 50000,
    "Salary is Above 50K",
    "Salary is Below 50K"
  )
}
# Test the function
check_salary(49999)
## [1] "Salary is Below 50K"
# Output: "Salary is Below 50K"
Use data manipulation techniques and filter rows based on any
logical criteria that exist in your dataset.
library("tidyverse")
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.0.4
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# conflicted turns ambiguous function names into explicit choices
library(conflicted)
library(knitr)
# Declare dplyr::filter() as the preferred filter() whenever several packages
# export a function of that name
conflicts_prefer(dplyr::filter)
## [conflicted] Will prefer dplyr::filter over any other package.
# Subset education_career_success to the rows where both conditions hold:
# Starting_Salary of at least 70,000 and University_GPA of at least 3.5
high_salary_gpa_data <- education_career_success %>%
  filter(Starting_Salary >= 70000 & University_GPA >= 3.5)
kable(head(high_salary_gpa_data))
| S00017 |
20 |
Female |
3.73 |
1539 |
116 |
3.78 |
Law |
3 |
2 |
3 |
2 |
5 |
3 |
97500 |
8 |
5 |
Mid |
9 |
No |
| S00058 |
27 |
Male |
3.06 |
1096 |
695 |
3.77 |
Computer Science |
3 |
4 |
0 |
9 |
3 |
0 |
82700 |
10 |
5 |
Entry |
5 |
No |
| S00064 |
24 |
Male |
2.91 |
1071 |
306 |
3.86 |
Law |
2 |
3 |
4 |
7 |
2 |
2 |
75700 |
6 |
5 |
Entry |
4 |
Yes |
| S00116 |
28 |
Female |
2.10 |
916 |
78 |
3.79 |
Medicine |
2 |
3 |
0 |
4 |
4 |
4 |
89100 |
9 |
5 |
Senior |
3 |
Yes |
| S00129 |
24 |
Female |
2.62 |
1039 |
274 |
3.60 |
Arts |
0 |
9 |
0 |
1 |
6 |
4 |
96700 |
1 |
1 |
Entry |
5 |
Yes |
| S00198 |
18 |
Female |
3.11 |
1547 |
785 |
3.67 |
Medicine |
4 |
9 |
2 |
4 |
5 |
1 |
75000 |
10 |
5 |
Mid |
2 |
Yes |
Identify the dependent & independent variables and use reshaping
techniques and create a new data frame by joining those variables from
your dataset.
library("tidyverse")
library("knitr")
# Split the data into separate data frames
# Dependent variable: Starting_Salary
dependant_salary_data <- select(education_career_success, Starting_Salary)
# Independent variables: University_GPA, Field_of_Study, Age
independent_salary_data <- select(
  education_career_success,
  University_GPA, Field_of_Study, Age
)
# Row-wise combination with rbind(), converted to a data frame
# NOTE(review): the three named NA arguments are appended as extra rows of the
# single Starting_Salary column, not as new columns - confirm this is intended
combined_data_by_rows <- as.data.frame(
  rbind(dependant_salary_data, University_GPA = NA, Field_of_Study = NA, Age = NA)
)
# Column-wise combination with cbind(), converted to a data frame:
# Starting_Salary first, followed by the independent variables
combined_data_by_columns <- as.data.frame(
  cbind(dependant_salary_data, independent_salary_data)
)
# View the results
kable(head(combined_data_by_rows))
| 27200 |
| 25000 |
| 42400 |
| 57400 |
| 47600 |
| 68400 |
kable(head(combined_data_by_columns))
| 27200 |
3.96 |
Arts |
24 |
| 25000 |
3.63 |
Law |
21 |
| 42400 |
2.63 |
Medicine |
28 |
| 57400 |
2.81 |
Computer Science |
25 |
| 47600 |
2.48 |
Engineering |
22 |
| 68400 |
3.78 |
Law |
24 |
Remove missing values in your dataset
library(knitr)
# Count the NA entries in every column
colSums(
  is.na(education_career_success)
)
## Student_ID Age Gender
## 0 0 0
## High_School_GPA SAT_Score University_Ranking
## 0 0 0
## University_GPA Field_of_Study Internships_Completed
## 0 0 0
## Projects_Completed Certifications Soft_Skills_Score
## 0 0 0
## Networking_Score Job_Offers Starting_Salary
## 0 0 0
## Career_Satisfaction Years_to_Promotion Current_Job_Level
## 0 0 0
## Work_Life_Balance Entrepreneurship
## 0 0
# Remove rows with missing values
missing_values_cleanup <- na.omit(education_career_success)
# Identify omitted rows: the rows holding an NA in at least one column.
# They are selected directly instead of anti-joining the cleaned data back
# onto the original, which avoids an implicit join on every column and the
# "Joining with `by = ...`" message that comes with it.
omitted_data <- education_career_success %>%
  filter(if_any(everything(), is.na))
# View the omitted data
kable(head(omitted_data))
Identify and remove duplicated data from your dataset.
library(knitr)
# Collect the rows that exactly repeat an earlier row (all columns compared)
# and display them
duplicated_rows <- subset(
  education_career_success,
  duplicated(education_career_success)
)
kable(head(duplicated_rows))
# Drop the repeated rows with distinct() and display the result
library(dplyr)
education_career_success_unique <- distinct(education_career_success)
kable(head(education_career_success_unique))
| S00001 |
24 |
Male |
3.58 |
1052 |
291 |
3.96 |
Arts |
3 |
7 |
2 |
9 |
8 |
5 |
27200 |
4 |
5 |
Entry |
7 |
No |
| S00002 |
21 |
Other |
2.52 |
1211 |
112 |
3.63 |
Law |
4 |
7 |
3 |
8 |
1 |
4 |
25000 |
1 |
1 |
Mid |
7 |
No |
| S00003 |
28 |
Female |
3.42 |
1193 |
715 |
2.63 |
Medicine |
4 |
8 |
1 |
1 |
9 |
0 |
42400 |
9 |
3 |
Entry |
7 |
No |
| S00004 |
25 |
Male |
2.43 |
1497 |
170 |
2.81 |
Computer Science |
3 |
9 |
1 |
10 |
6 |
1 |
57400 |
7 |
5 |
Mid |
5 |
No |
| S00005 |
22 |
Male |
2.08 |
1012 |
599 |
2.48 |
Engineering |
4 |
6 |
4 |
10 |
9 |
4 |
47600 |
9 |
5 |
Entry |
2 |
No |
| S00006 |
24 |
Male |
2.40 |
1600 |
631 |
3.78 |
Law |
2 |
3 |
2 |
2 |
2 |
1 |
68400 |
9 |
2 |
Entry |
8 |
Yes |
Reorder multiple rows in descending order
library(knitr)
library(dplyr)
# Sort from highest to lowest Starting_Salary; ties are ordered by Age and
# then by High_School_GPA, both also highest first
education_career_success_desc_order <- arrange(
  education_career_success,
  desc(Starting_Salary),
  desc(Age),
  desc(High_School_GPA)
)
# Print the sorted data set
kable(head(education_career_success_desc_order))
| S01810 |
28 |
Other |
2.34 |
1089 |
24 |
3.01 |
Arts |
0 |
6 |
2 |
3 |
8 |
1 |
101000 |
6 |
4 |
Entry |
8 |
No |
| S04539 |
21 |
Male |
2.38 |
1450 |
943 |
3.93 |
Law |
1 |
1 |
3 |
3 |
4 |
0 |
100600 |
6 |
5 |
Entry |
8 |
No |
| S03831 |
20 |
Male |
3.23 |
1238 |
234 |
2.82 |
Arts |
2 |
7 |
3 |
5 |
3 |
5 |
98900 |
8 |
1 |
Entry |
2 |
Yes |
| S03504 |
25 |
Male |
3.99 |
982 |
459 |
2.70 |
Engineering |
4 |
6 |
2 |
10 |
2 |
0 |
98200 |
4 |
4 |
Senior |
5 |
No |
| S00017 |
20 |
Female |
3.73 |
1539 |
116 |
3.78 |
Law |
3 |
2 |
3 |
2 |
5 |
3 |
97500 |
8 |
5 |
Mid |
9 |
No |
| S01341 |
25 |
Female |
2.97 |
900 |
835 |
3.07 |
Law |
3 |
7 |
4 |
2 |
9 |
5 |
96900 |
7 |
3 |
Entry |
2 |
No |
Rename some of the column names in your dataset.
library(dplyr)
library(knitr)
# Rename three columns (new name = old name):
#   Student_ID -> ID, Starting_Salary -> Salary, High_School_GPA -> GPA
education_career_success_column_renamed <- rename(
  education_career_success,
  ID = Student_ID,
  Salary = Starting_Salary,
  GPA = High_School_GPA
)
# View the data set with renamed columns
kable(head(education_career_success_column_renamed))
| S00001 |
24 |
Male |
3.58 |
1052 |
291 |
3.96 |
Arts |
3 |
7 |
2 |
9 |
8 |
5 |
27200 |
4 |
5 |
Entry |
7 |
No |
| S00002 |
21 |
Other |
2.52 |
1211 |
112 |
3.63 |
Law |
4 |
7 |
3 |
8 |
1 |
4 |
25000 |
1 |
1 |
Mid |
7 |
No |
| S00003 |
28 |
Female |
3.42 |
1193 |
715 |
2.63 |
Medicine |
4 |
8 |
1 |
1 |
9 |
0 |
42400 |
9 |
3 |
Entry |
7 |
No |
| S00004 |
25 |
Male |
2.43 |
1497 |
170 |
2.81 |
Computer Science |
3 |
9 |
1 |
10 |
6 |
1 |
57400 |
7 |
5 |
Mid |
5 |
No |
| S00005 |
22 |
Male |
2.08 |
1012 |
599 |
2.48 |
Engineering |
4 |
6 |
4 |
10 |
9 |
4 |
47600 |
9 |
5 |
Entry |
2 |
No |
| S00006 |
24 |
Male |
2.40 |
1600 |
631 |
3.78 |
Law |
2 |
3 |
2 |
2 |
2 |
1 |
68400 |
9 |
2 |
Entry |
8 |
Yes |
Add new variables in your data frame by using a mathematical
function
library(dplyr)
library(knitr)
# Append Double_Salary (twice the Starting_Salary) as the last column.
# The result is assigned back to education_career_success, so the new column
# is part of the data set from here on.
education_career_success <- mutate(
  education_career_success,
  Double_Salary = 2 * Starting_Salary
)
# View the updated data set
kable(head(education_career_success))
| S00001 |
24 |
Male |
3.58 |
1052 |
291 |
3.96 |
Arts |
3 |
7 |
2 |
9 |
8 |
5 |
27200 |
4 |
5 |
Entry |
7 |
No |
54400 |
| S00002 |
21 |
Other |
2.52 |
1211 |
112 |
3.63 |
Law |
4 |
7 |
3 |
8 |
1 |
4 |
25000 |
1 |
1 |
Mid |
7 |
No |
50000 |
| S00003 |
28 |
Female |
3.42 |
1193 |
715 |
2.63 |
Medicine |
4 |
8 |
1 |
1 |
9 |
0 |
42400 |
9 |
3 |
Entry |
7 |
No |
84800 |
| S00004 |
25 |
Male |
2.43 |
1497 |
170 |
2.81 |
Computer Science |
3 |
9 |
1 |
10 |
6 |
1 |
57400 |
7 |
5 |
Mid |
5 |
No |
114800 |
| S00005 |
22 |
Male |
2.08 |
1012 |
599 |
2.48 |
Engineering |
4 |
6 |
4 |
10 |
9 |
4 |
47600 |
9 |
5 |
Entry |
2 |
No |
95200 |
| S00006 |
24 |
Male |
2.40 |
1600 |
631 |
3.78 |
Law |
2 |
3 |
2 |
2 |
2 |
1 |
68400 |
9 |
2 |
Entry |
8 |
Yes |
136800 |
Create a training set using a random number generator engine.
library(dplyr)
# Fix the random number generator seed so the same rows are drawn on each run
set.seed(12345)
# Draw a random 50% of the rows as the training set
training_set <- sample_frac(education_career_success, size = 0.5)
# View the training set
kable(head(training_set))
| S00051 |
19 |
Male |
3.66 |
1376 |
624 |
3.14 |
Computer Science |
2 |
0 |
2 |
7 |
5 |
1 |
42000 |
6 |
5 |
Senior |
2 |
No |
84000 |
| S00720 |
22 |
Male |
3.32 |
1265 |
671 |
2.44 |
Business |
4 |
0 |
1 |
8 |
7 |
4 |
25000 |
4 |
2 |
Entry |
8 |
No |
50000 |
| S00730 |
23 |
Male |
2.85 |
980 |
690 |
2.48 |
Law |
4 |
6 |
4 |
7 |
8 |
4 |
32300 |
3 |
5 |
Entry |
5 |
No |
64600 |
| S02712 |
28 |
Female |
3.35 |
1127 |
504 |
2.25 |
Medicine |
2 |
6 |
3 |
10 |
6 |
0 |
25500 |
5 |
2 |
Entry |
8 |
No |
51000 |
| S04922 |
27 |
Male |
3.96 |
1386 |
777 |
3.61 |
Mathematics |
1 |
7 |
5 |
9 |
1 |
2 |
55800 |
4 |
1 |
Entry |
1 |
No |
111600 |
| S00605 |
28 |
Female |
3.21 |
1065 |
784 |
3.15 |
Engineering |
4 |
3 |
3 |
6 |
3 |
5 |
81600 |
8 |
2 |
Mid |
8 |
No |
163200 |
Print the summary statistics of your dataset.
# Per-column summary: quartiles and mean for numeric columns, length/class/mode
# for character columns
summary(object = education_career_success)
## Student_ID Age Gender High_School_GPA
## Length:5000 Min. :18.00 Length:5000 Min. :2.000
## Class :character 1st Qu.:20.00 Class :character 1st Qu.:2.500
## Mode :character Median :23.00 Mode :character Median :2.990
## Mean :23.44 Mean :2.997
## 3rd Qu.:26.00 3rd Qu.:3.500
## Max. :29.00 Max. :4.000
## SAT_Score University_Ranking University_GPA Field_of_Study
## Min. : 900 Min. : 1.0 Min. :2.00 Length:5000
## 1st Qu.:1076 1st Qu.: 256.0 1st Qu.:2.52 Class :character
## Median :1257 Median : 501.5 Median :3.03 Mode :character
## Mean :1254 Mean : 504.3 Mean :3.02
## 3rd Qu.:1432 3rd Qu.: 759.0 3rd Qu.:3.51
## Max. :1600 Max. :1000.0 Max. :4.00
## Internships_Completed Projects_Completed Certifications Soft_Skills_Score
## Min. :0.000 Min. :0.000 Min. :0.000 Min. : 1.000
## 1st Qu.:1.000 1st Qu.:2.000 1st Qu.:1.000 1st Qu.: 3.000
## Median :2.000 Median :5.000 Median :3.000 Median : 6.000
## Mean :1.982 Mean :4.563 Mean :2.512 Mean : 5.546
## 3rd Qu.:3.000 3rd Qu.:7.000 3rd Qu.:4.000 3rd Qu.: 8.000
## Max. :4.000 Max. :9.000 Max. :5.000 Max. :10.000
## Networking_Score Job_Offers Starting_Salary Career_Satisfaction
## Min. : 1.000 Min. :0.000 Min. : 25000 Min. : 1.000
## 1st Qu.: 3.000 1st Qu.:1.000 1st Qu.: 40200 1st Qu.: 3.000
## Median : 6.000 Median :2.000 Median : 50300 Median : 6.000
## Mean : 5.538 Mean :2.489 Mean : 50564 Mean : 5.578
## 3rd Qu.: 8.000 3rd Qu.:4.000 3rd Qu.: 60500 3rd Qu.: 8.000
## Max. :10.000 Max. :5.000 Max. :101000 Max. :10.000
## Years_to_Promotion Current_Job_Level Work_Life_Balance Entrepreneurship
## Min. :1.000 Length:5000 Min. : 1.000 Length:5000
## 1st Qu.:2.000 Class :character 1st Qu.: 3.000 Class :character
## Median :3.000 Mode :character Median : 6.000 Mode :character
## Mean :3.016 Mean : 5.482
## 3rd Qu.:4.000 3rd Qu.: 8.000
## Max. :5.000 Max. :10.000
## Double_Salary
## Min. : 50000
## 1st Qu.: 80400
## Median :100600
## Mean :101127
## 3rd Qu.:121000
## Max. :202000
Plot a scatter plot for any 2 variables
# Load ggplot2
library(ggplot2)
# Scatter plot with University_Ranking on the x axis and Starting_Salary on
# the y axis. A single point layer is drawn: the earlier second geom_point()
# repeated the same points in red directly on top of the blue ones, which only
# blended the colours and doubled the overplotting.
ggplot(
  education_career_success,
  aes(x = University_Ranking, y = Starting_Salary)
) +
  geom_point(size = 1, color = "blue", shape = 10, alpha = 0.3)

Plot a bar plot for any 2 variables
# Load libraries
library(ggplot2)
library(dplyr)
# Aggregate the data: mean Age for each Field_of_Study
education_career_success_agg <- education_career_success %>%
  group_by(Field_of_Study) %>%
  summarise(mean_Age = mean(Age))
# NOTE(review): `duplicates` (rows whose Field_of_Study already occurred in an
# earlier row) is not used by the plot below - confirm whether it is needed
duplicates <- education_career_success[duplicated(education_career_success$Field_of_Study), ]
# Bar plot of the aggregated data. geom_col() draws bar heights straight from
# the y values (the idiomatic form of geom_bar(stat = "identity")). The title
# and y label say "Mean Age" because the bars show the per-field mean, not
# individual ages. theme_minimal() gives a clean look.
ggplot(education_career_success_agg, aes(x = Field_of_Study, y = mean_Age)) +
  geom_col(fill = "red") +
  labs(
    title = "Mean Age by Field of Study",
    x = "Field of Study",
    y = "Mean Age"
  ) +
  theme_minimal()

Correlation between any 2 variables by applying a least-squares linear
regression model
library(knitr)
# Pearson correlation coefficient between Starting_Salary and University_GPA
Education_Career_Success_Coefficient <- with(
  education_career_success,
  cor(Starting_Salary, University_GPA, method = "pearson")
)
# Display the coefficient as a table
kable(head(Education_Career_Success_Coefficient))