Structure of dataset

# Compact overview of the data set: its dimensions plus, for every column,
# the storage type and the first few values.
utils::str(education_career_success)
## 'data.frame':    5000 obs. of  20 variables:
##  $ Student_ID           : chr  "S00001" "S00002" "S00003" "S00004" ...
##  $ Age                  : int  24 21 28 25 22 24 27 20 24 28 ...
##  $ Gender               : chr  "Male" "Other" "Female" "Male" ...
##  $ High_School_GPA      : num  3.58 2.52 3.42 2.43 2.08 2.4 2.36 2.68 2.84 3.02 ...
##  $ SAT_Score            : int  1052 1211 1193 1497 1012 1600 1011 1074 1201 1415 ...
##  $ University_Ranking   : int  291 112 715 170 599 631 610 240 337 138 ...
##  $ University_GPA       : num  3.96 3.63 2.63 2.81 2.48 3.78 3.83 2.84 3.31 2.33 ...
##  $ Field_of_Study       : chr  "Arts" "Law" "Medicine" "Computer Science" ...
##  $ Internships_Completed: int  3 4 4 3 4 2 0 1 2 1 ...
##  $ Projects_Completed   : int  7 7 8 9 6 3 1 5 3 5 ...
##  $ Certifications       : int  2 3 1 1 4 2 3 5 0 3 ...
##  $ Soft_Skills_Score    : int  9 8 1 10 10 2 3 5 5 10 ...
##  $ Networking_Score     : int  8 1 9 6 9 2 3 1 5 2 ...
##  $ Job_Offers           : int  5 4 0 1 4 1 2 2 2 0 ...
##  $ Starting_Salary      : num  27200 25000 42400 57400 47600 68400 55500 38000 68900 58900 ...
##  $ Career_Satisfaction  : int  4 1 9 7 9 9 7 2 2 4 ...
##  $ Years_to_Promotion   : int  5 1 3 5 5 2 4 3 2 2 ...
##  $ Current_Job_Level    : chr  "Entry" "Mid" "Entry" "Mid" ...
##  $ Work_Life_Balance    : int  7 7 7 5 2 8 3 3 2 2 ...
##  $ Entrepreneurship     : chr  "No" "No" "No" "No" ...

Variable List of dataset

# The knitr package prints R objects (e.g. data frames, tables) in a more
# readable format. It produces nicely formatted tables for reports.
library(knitr)

# kable() is knitr's table generator; with 20 columns the data does not
# display well without it. The Index column numbers the variables in their
# original column order. seq_along() is used instead of 1:length() so the
# index is still correct (empty) when the data frame has no columns.
kable(
  data.frame(
    Index = seq_along(names(education_career_success)),
    Variable = names(education_career_success)
  )
)
Index Variable
1 Student_ID
2 Age
3 Gender
4 High_School_GPA
5 SAT_Score
6 University_Ranking
7 University_GPA
8 Field_of_Study
9 Internships_Completed
10 Projects_Completed
11 Certifications
12 Soft_Skills_Score
13 Networking_Score
14 Job_Offers
15 Starting_Salary
16 Career_Satisfaction
17 Years_to_Promotion
18 Current_Job_Level
19 Work_Life_Balance
20 Entrepreneurship

Printing first 15 rows

# knitr supplies kable() for table rendering.
library(knitr)
# Render the first 15 observations of the data set as a formatted table.
kable(head(education_career_success, n = 15L))
Student_ID Age Gender High_School_GPA SAT_Score University_Ranking University_GPA Field_of_Study Internships_Completed Projects_Completed Certifications Soft_Skills_Score Networking_Score Job_Offers Starting_Salary Career_Satisfaction Years_to_Promotion Current_Job_Level Work_Life_Balance Entrepreneurship
S00001 24 Male 3.58 1052 291 3.96 Arts 3 7 2 9 8 5 27200 4 5 Entry 7 No
S00002 21 Other 2.52 1211 112 3.63 Law 4 7 3 8 1 4 25000 1 1 Mid 7 No
S00003 28 Female 3.42 1193 715 2.63 Medicine 4 8 1 1 9 0 42400 9 3 Entry 7 No
S00004 25 Male 2.43 1497 170 2.81 Computer Science 3 9 1 10 6 1 57400 7 5 Mid 5 No
S00005 22 Male 2.08 1012 599 2.48 Engineering 4 6 4 10 9 4 47600 9 5 Entry 2 No
S00006 24 Male 2.40 1600 631 3.78 Law 2 3 2 2 2 1 68400 9 2 Entry 8 Yes
S00007 27 Male 2.36 1011 610 3.83 Computer Science 0 1 3 3 3 2 55500 7 4 Mid 3 No
S00008 20 Male 2.68 1074 240 2.84 Computer Science 1 5 5 5 1 2 38000 2 3 Entry 3 No
S00009 24 Male 2.84 1201 337 3.31 Business 2 3 0 5 5 2 68900 2 2 Entry 2 No
S00010 28 Male 3.02 1415 138 2.33 Computer Science 1 5 3 10 2 0 58900 4 2 Senior 2 No
S00011 28 Female 2.95 1120 594 2.87 Mathematics 2 7 5 8 1 5 26300 9 1 Entry 2 No
S00012 25 Female 2.54 1070 236 3.26 Law 2 2 3 2 9 5 35100 7 4 Mid 6 Yes
S00013 22 Female 2.06 1217 648 2.77 Engineering 2 0 5 2 9 2 42600 9 4 Senior 8 No
S00014 21 Male 3.21 1112 794 2.72 Arts 0 4 3 8 6 2 76500 4 3 Entry 3 No
S00015 25 Male 2.79 1152 3 2.00 Business 1 2 3 1 8 1 61100 9 3 Entry 6 Yes

User defined function using any of the variables from the data set

# check_salary: label starting salaries relative to the 50,000 threshold.
#
# Starting_Salary: numeric vector of salaries (a single value or a whole
#   column such as education_career_success$Starting_Salary).
# Returns a character vector of the same length as the input:
#   "Salary is Above 50K" where the salary is strictly greater than 50,000,
#   "Salary is Below 50K" otherwise (exactly 50,000 therefore counts as
#   "Below"), and NA where the salary itself is missing.
check_salary <- function(Starting_Salary) {
  # Build the result element-wise so the function also works on whole
  # columns; a plain if () only accepts one non-missing condition.
  salary_label <- rep("Salary is Below 50K", length(Starting_Salary))
  # which() drops NA comparisons, so missing salaries are not labelled here.
  salary_label[which(Starting_Salary > 50000)] <- "Salary is Above 50K"
  salary_label[is.na(Starting_Salary)] <- NA_character_
  salary_label
}

# Test the function
check_salary(49999)
## [1] "Salary is Below 50K"
# Output: "Salary is Below 50K"

Use data manipulation techniques and filter rows based on any logical criteria that exist in your dataset.

library("tidyverse")
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.0.4     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# conflicted manages clashes between functions of the same name that are
# exported by different packages.
library("conflicted")
library("knitr")
# Always resolve filter() to dplyr::filter rather than any other filter().
conflicts_prefer(dplyr::filter)
## [conflicted] Will prefer dplyr::filter over any other package.

# Subset the data set: keep only the rows where Starting_Salary is greater
# than or equal to 70,000 AND University_GPA is greater than or equal to 3.5.
high_salary_gpa_data <- education_career_success %>%
  filter(Starting_Salary >= 70000 & University_GPA >= 3.5)

kable(head(high_salary_gpa_data))
Student_ID Age Gender High_School_GPA SAT_Score University_Ranking University_GPA Field_of_Study Internships_Completed Projects_Completed Certifications Soft_Skills_Score Networking_Score Job_Offers Starting_Salary Career_Satisfaction Years_to_Promotion Current_Job_Level Work_Life_Balance Entrepreneurship
S00017 20 Female 3.73 1539 116 3.78 Law 3 2 3 2 5 3 97500 8 5 Mid 9 No
S00058 27 Male 3.06 1096 695 3.77 Computer Science 3 4 0 9 3 0 82700 10 5 Entry 5 No
S00064 24 Male 2.91 1071 306 3.86 Law 2 3 4 7 2 2 75700 6 5 Entry 4 Yes
S00116 28 Female 2.10 916 78 3.79 Medicine 2 3 0 4 4 4 89100 9 5 Senior 3 Yes
S00129 24 Female 2.62 1039 274 3.60 Arts 0 9 0 1 6 4 96700 1 1 Entry 5 Yes
S00198 18 Female 3.11 1547 785 3.67 Medicine 4 9 2 4 5 1 75000 10 5 Mid 2 Yes

Identify the dependent & independent variables and use reshaping techniques and create a new data frame by joining those variables from your dataset.

library("tidyverse")
library("knitr")
# Creating separate data frames
# Dependent variable: Starting_Salary
dependant_salary_data <- education_career_success %>%
  select(Starting_Salary)

# Independent variables: University_GPA, Field_of_Study, Age
independent_salary_data <- education_career_success %>%
  select(University_GPA, Field_of_Study, Age)

# Combine by rows. The two data frames share no columns, so bind_rows()
# stacks them and fills the columns a row does not have with NA. The earlier
# rbind(..., University_GPA = NA, ...) call only appended three NA rows to
# the salary column and never included the independent variables.
combined_data_by_rows <- bind_rows(
  dependant_salary_data,
  independent_salary_data
)

# Combine by columns: both data frames come from the same rows in the same
# order, so they are placed side by side.
combined_data_by_columns <- bind_cols(
  dependant_salary_data,
  independent_salary_data
)

# View the results
kable(head(combined_data_by_rows))
Starting_Salary University_GPA Field_of_Study Age
27200 NA NA NA
25000 NA NA NA
42400 NA NA NA
57400 NA NA NA
47600 NA NA NA
68400 NA NA NA
kable(head(combined_data_by_columns))
Starting_Salary University_GPA Field_of_Study Age
27200 3.96 Arts 24
25000 3.63 Law 21
42400 2.63 Medicine 28
57400 2.81 Computer Science 25
47600 2.48 Engineering 22
68400 3.78 Law 24

Remove missing values in your dataset

library("knitr")
# Count the missing values in every column (0 means the column is complete)
education_career_success %>%
  is.na() %>%
  colSums()
##            Student_ID                   Age                Gender 
##                     0                     0                     0 
##       High_School_GPA             SAT_Score    University_Ranking 
##                     0                     0                     0 
##        University_GPA        Field_of_Study Internships_Completed 
##                     0                     0                     0 
##    Projects_Completed        Certifications     Soft_Skills_Score 
##                     0                     0                     0 
##      Networking_Score            Job_Offers       Starting_Salary 
##                     0                     0                     0 
##   Career_Satisfaction    Years_to_Promotion     Current_Job_Level 
##                     0                     0                     0 
##     Work_Life_Balance      Entrepreneurship 
##                     0                     0
# Drop every row that contains at least one NA
missing_values_cleanup <- na.omit(education_career_success)

# Rows of the original data that have no match in the cleaned data, i.e. the
# rows na.omit() dropped (no `by` is given, so all shared columns are matched)
omitted_data <- education_career_success %>%
  anti_join(missing_values_cleanup)
## Joining with `by = join_by(Student_ID, Age, Gender, High_School_GPA, SAT_Score,
## University_Ranking, University_GPA, Field_of_Study, Internships_Completed,
## Projects_Completed, Certifications, Soft_Skills_Score, Networking_Score,
## Job_Offers, Starting_Salary, Career_Satisfaction, Years_to_Promotion,
## Current_Job_Level, Work_Life_Balance, Entrepreneurship)`
# Show the omitted rows
kable(head(omitted_data))
Student_ID Age Gender High_School_GPA SAT_Score University_Ranking University_GPA Field_of_Study Internships_Completed Projects_Completed Certifications Soft_Skills_Score Networking_Score Job_Offers Starting_Salary Career_Satisfaction Years_to_Promotion Current_Job_Level Work_Life_Balance Entrepreneurship

Identify and remove duplicated data from your dataset.

library("knitr")
# Collect the rows that repeat an earlier row across all columns, then
# preview them
duplicated_rows <- subset(
  education_career_success,
  duplicated(education_career_success)
)
kable(head(duplicated_rows))
Student_ID Age Gender High_School_GPA SAT_Score University_Ranking University_GPA Field_of_Study Internships_Completed Projects_Completed Certifications Soft_Skills_Score Networking_Score Job_Offers Starting_Salary Career_Satisfaction Years_to_Promotion Current_Job_Level Work_Life_Balance Entrepreneurship
# Keep one copy of each distinct row with dplyr's distinct(), then preview it
library(dplyr)
education_career_success_unique <- distinct(education_career_success)
kable(head(education_career_success_unique))
Student_ID Age Gender High_School_GPA SAT_Score University_Ranking University_GPA Field_of_Study Internships_Completed Projects_Completed Certifications Soft_Skills_Score Networking_Score Job_Offers Starting_Salary Career_Satisfaction Years_to_Promotion Current_Job_Level Work_Life_Balance Entrepreneurship
S00001 24 Male 3.58 1052 291 3.96 Arts 3 7 2 9 8 5 27200 4 5 Entry 7 No
S00002 21 Other 2.52 1211 112 3.63 Law 4 7 3 8 1 4 25000 1 1 Mid 7 No
S00003 28 Female 3.42 1193 715 2.63 Medicine 4 8 1 1 9 0 42400 9 3 Entry 7 No
S00004 25 Male 2.43 1497 170 2.81 Computer Science 3 9 1 10 6 1 57400 7 5 Mid 5 No
S00005 22 Male 2.08 1012 599 2.48 Engineering 4 6 4 10 9 4 47600 9 5 Entry 2 No
S00006 24 Male 2.40 1600 631 3.78 Law 2 3 2 2 2 1 68400 9 2 Entry 8 Yes

Reorder multiple rows in descending order

library("knitr")
library("dplyr")
# Sort from highest to lowest Starting_Salary; ties are broken by Age and
# then by High_School_GPA, both also highest first
education_career_success_desc_order <- arrange(
  education_career_success,
  desc(Starting_Salary),
  desc(Age),
  desc(High_School_GPA)
)

# Show the top of the sorted data set
kable(head(education_career_success_desc_order))
Student_ID Age Gender High_School_GPA SAT_Score University_Ranking University_GPA Field_of_Study Internships_Completed Projects_Completed Certifications Soft_Skills_Score Networking_Score Job_Offers Starting_Salary Career_Satisfaction Years_to_Promotion Current_Job_Level Work_Life_Balance Entrepreneurship
S01810 28 Other 2.34 1089 24 3.01 Arts 0 6 2 3 8 1 101000 6 4 Entry 8 No
S04539 21 Male 2.38 1450 943 3.93 Law 1 1 3 3 4 0 100600 6 5 Entry 8 No
S03831 20 Male 3.23 1238 234 2.82 Arts 2 7 3 5 3 5 98900 8 1 Entry 2 Yes
S03504 25 Male 3.99 982 459 2.70 Engineering 4 6 2 10 2 0 98200 4 4 Senior 5 No
S00017 20 Female 3.73 1539 116 3.78 Law 3 2 3 2 5 3 97500 8 5 Mid 9 No
S01341 25 Female 2.97 900 835 3.07 Law 3 7 4 2 9 5 96900 7 3 Entry 2 No

Rename some of the column names in your dataset.

library("dplyr")
library("knitr")
# Rename three columns (new name = old name):
#   Student_ID -> ID, Starting_Salary -> Salary, High_School_GPA -> GPA
education_career_success_column_renamed <- rename(
  education_career_success,
  ID = Student_ID,
  Salary = Starting_Salary,
  GPA = High_School_GPA
)

# Preview the data set with the renamed columns
kable(head(education_career_success_column_renamed))
ID Age Gender GPA SAT_Score University_Ranking University_GPA Field_of_Study Internships_Completed Projects_Completed Certifications Soft_Skills_Score Networking_Score Job_Offers Salary Career_Satisfaction Years_to_Promotion Current_Job_Level Work_Life_Balance Entrepreneurship
S00001 24 Male 3.58 1052 291 3.96 Arts 3 7 2 9 8 5 27200 4 5 Entry 7 No
S00002 21 Other 2.52 1211 112 3.63 Law 4 7 3 8 1 4 25000 1 1 Mid 7 No
S00003 28 Female 3.42 1193 715 2.63 Medicine 4 8 1 1 9 0 42400 9 3 Entry 7 No
S00004 25 Male 2.43 1497 170 2.81 Computer Science 3 9 1 10 6 1 57400 7 5 Mid 5 No
S00005 22 Male 2.08 1012 599 2.48 Engineering 4 6 4 10 9 4 47600 9 5 Entry 2 No
S00006 24 Male 2.40 1600 631 3.78 Law 2 3 2 2 2 1 68400 9 2 Entry 8 Yes

Add new variables in your data frame by using a mathematical function

library("dplyr")
library("knitr")

# Append a new column, Double_Salary (twice the Starting_Salary), as the last
# column. The data set is overwritten, so later sections see this column too.
education_career_success <- mutate(
  education_career_success,
  Double_Salary = 2 * Starting_Salary
)

# Preview the updated data set
kable(head(education_career_success))
Student_ID Age Gender High_School_GPA SAT_Score University_Ranking University_GPA Field_of_Study Internships_Completed Projects_Completed Certifications Soft_Skills_Score Networking_Score Job_Offers Starting_Salary Career_Satisfaction Years_to_Promotion Current_Job_Level Work_Life_Balance Entrepreneurship Double_Salary
S00001 24 Male 3.58 1052 291 3.96 Arts 3 7 2 9 8 5 27200 4 5 Entry 7 No 54400
S00002 21 Other 2.52 1211 112 3.63 Law 4 7 3 8 1 4 25000 1 1 Mid 7 No 50000
S00003 28 Female 3.42 1193 715 2.63 Medicine 4 8 1 1 9 0 42400 9 3 Entry 7 No 84800
S00004 25 Male 2.43 1497 170 2.81 Computer Science 3 9 1 10 6 1 57400 7 5 Mid 5 No 114800
S00005 22 Male 2.08 1012 599 2.48 Engineering 4 6 4 10 9 4 47600 9 5 Entry 2 No 95200
S00006 24 Male 2.40 1600 631 3.78 Law 2 3 2 2 2 1 68400 9 2 Entry 8 Yes 136800

Create a training set using a random number generator engine.

library(dplyr)

# Fix the random seed so the same rows are drawn on every run
set.seed(12345)

# Draw a random 50% of the rows of the data set as the training set
training_set <- sample_frac(education_career_success, size = 0.5)

# Preview the training set
kable(head(training_set))
Student_ID Age Gender High_School_GPA SAT_Score University_Ranking University_GPA Field_of_Study Internships_Completed Projects_Completed Certifications Soft_Skills_Score Networking_Score Job_Offers Starting_Salary Career_Satisfaction Years_to_Promotion Current_Job_Level Work_Life_Balance Entrepreneurship Double_Salary
S00051 19 Male 3.66 1376 624 3.14 Computer Science 2 0 2 7 5 1 42000 6 5 Senior 2 No 84000
S00720 22 Male 3.32 1265 671 2.44 Business 4 0 1 8 7 4 25000 4 2 Entry 8 No 50000
S00730 23 Male 2.85 980 690 2.48 Law 4 6 4 7 8 4 32300 3 5 Entry 5 No 64600
S02712 28 Female 3.35 1127 504 2.25 Medicine 2 6 3 10 6 0 25500 5 2 Entry 8 No 51000
S04922 27 Male 3.96 1386 777 3.61 Mathematics 1 7 5 9 1 2 55800 4 1 Entry 1 No 111600
S00605 28 Female 3.21 1065 784 3.15 Engineering 4 3 3 6 3 5 81600 8 2 Mid 8 No 163200

Use any of the numerical variables from the dataset and compute the following statistics: mean, median, mode, and range

# Calculate the mean of Starting_Salary
mean_salary <- mean(education_career_success$Starting_Salary)
print(paste("Mean of Starting Salary:", mean_salary))
## [1] "Mean of Starting Salary: 50563.54"
# Calculate the median of Starting_Salary
median_salary <- median(education_career_success$Starting_Salary)
print(paste("Median of Starting Salary:", median_salary))
## [1] "Median of Starting Salary: 50300"
# Calculate the range of Starting_Salary as a single number (max - min)
range_Salary <- range(education_career_success$Starting_Salary) # c(min, max)
range_diff <- diff(range_Salary)
print(paste("Range of Starting Salary:", range_diff))
## [1] "Range of Starting Salary: 76000"
# Calculate the mode of Starting_Salary: tabulate how often each value occurs
# and take the most frequent one. which.max() returns the first maximum, so
# if several values tie, the smallest of them is reported.
Frequency_mode_salary_table <- table(education_career_success$Starting_Salary)
mode_salary <- as.numeric(
  names(Frequency_mode_salary_table)[which.max(Frequency_mode_salary_table)]
)
mode_frequency <- max(Frequency_mode_salary_table) # Frequency of the mode

print(paste("Mode of Starting Salary:", mode_salary))
## [1] "Mode of Starting Salary: 25000"
print(paste("Frequency of Starting Salary's Mode Value:", mode_frequency))
## [1] "Frequency of Starting Salary's Mode Value: 240"

Plot a scatter plot for any 2 variables

# Load ggplot2 for plotting
library(ggplot2)

# Scatter plot with University_Ranking on the x axis and Starting_Salary on
# the y axis.
# NOTE(review): the two point layers are identical apart from colour, so every
# observation is drawn twice (blue, then red on top) - confirm this is intended.
ggplot(
  education_career_success,
  aes(x = University_Ranking, y = Starting_Salary)
) +
  geom_point(size = 1, color = "blue", shape = 10, alpha = 0.3) +
  geom_point(size = 1, color = "red", shape = 10, alpha = 0.3)

Plot a bar plot for any 2 variables

# Calling libraries
library(ggplot2)
library(dplyr)

# Aggregate the data to calculate the mean age for each field of study
education_career_success_agg <- education_career_success %>%
  group_by(Field_of_Study) %>%
  summarise(mean_Age = mean(Age))

# Bar plot of the aggregated data: one bar per field of study, bar height is
# the mean age. geom_col() draws the pre-computed values as they are, which is
# what geom_bar(stat = "identity") did.
ggplot(education_career_success_agg, aes(x = Field_of_Study, y = mean_Age)) +
  geom_col(fill = "red") +
  labs(
    title = "Mean Age by Field of Study", # Title of the plot
    x = "Field of Study",                 # Label for the x-axis
    y = "Mean Age"                        # Label for the y-axis
  ) +
  theme_minimal() # Minimal theme for a clean and modern look

Correlation between any two variables by applying a least-squares linear regression model

library("knitr")

# Pearson correlation coefficient between Starting_Salary and University_GPA.
# NOTE(review): only the correlation is computed here; no least-squares model
# (e.g. lm()) is fitted in this section - confirm whether one is required.
Education_Career_Success_Coefficient <- with(
  education_career_success,
  cor(Starting_Salary, University_GPA, method = "pearson")
)

# Display the coefficient as a one-cell table
kable(head(Education_Career_Success_Coefficient))
x
0.0010225