library(readr)
# Location of the input CSV. The default is the original author's local path;
# set the EDUCATION_CSV environment variable to run the script on another
# machine without editing the code.
education_path <- Sys.getenv(
  "EDUCATION_CSV",
  unset = "C:/Users/KimCS/Downloads/education.csv"
)
# Read the survey data into a tibble (column types are guessed by readr)
education <- read_csv(education_path)
## Rows: 5000 Columns: 20
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (5): Student_ID, Gender, Field_of_Study, Current_Job_Level, Entrepreneu...
## dbl (15): Age, High_School_GPA, SAT_Score, University_Ranking, University_GP...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Open the spreadsheet-style viewer only when a person is at the console;
# the viewer has no use in a non-interactive run (Rscript, knitting, CI).
if (interactive()) {
  View(education)
}
# Task: Print the structure of your dataset
# str() lists each column with its type and the first few values
str(education)
## spc_tbl_ [5,000 × 20] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ Student_ID : chr [1:5000] "S00001" "S00002" "S00003" "S00004" ...
## $ Age : num [1:5000] 24 21 28 25 22 24 27 20 24 28 ...
## $ Gender : chr [1:5000] "Male" "Other" "Female" "Male" ...
## $ High_School_GPA : num [1:5000] 3.58 2.52 3.42 2.43 2.08 2.4 2.36 2.68 2.84 3.02 ...
## $ SAT_Score : num [1:5000] 1052 1211 1193 1497 1012 ...
## $ University_Ranking : num [1:5000] 291 112 715 170 599 631 610 240 337 138 ...
## $ University_GPA : num [1:5000] 3.96 3.63 2.63 2.81 2.48 3.78 3.83 2.84 3.31 2.33 ...
## $ Field_of_Study : chr [1:5000] "Arts" "Law" "Medicine" "Computer Science" ...
## $ Internships_Completed: num [1:5000] 3 4 4 3 4 2 0 1 2 1 ...
## $ Projects_Completed : num [1:5000] 7 7 8 9 6 3 1 5 3 5 ...
## $ Certifications : num [1:5000] 2 3 1 1 4 2 3 5 0 3 ...
## $ Soft_Skills_Score : num [1:5000] 9 8 1 10 10 2 3 5 5 10 ...
## $ Networking_Score : num [1:5000] 8 1 9 6 9 2 3 1 5 2 ...
## $ Job_Offers : num [1:5000] 5 4 0 1 4 1 2 2 2 0 ...
## $ Starting_Salary : num [1:5000] 27200 25000 42400 57400 47600 68400 55500 38000 68900 58900 ...
## $ Career_Satisfaction : num [1:5000] 4 1 9 7 9 9 7 2 2 4 ...
## $ Years_to_Promotion : num [1:5000] 5 1 3 5 5 2 4 3 2 2 ...
## $ Current_Job_Level : chr [1:5000] "Entry" "Mid" "Entry" "Mid" ...
## $ Work_Life_Balance : num [1:5000] 7 7 7 5 2 8 3 3 2 2 ...
## $ Entrepreneurship : chr [1:5000] "No" "No" "No" "No" ...
## - attr(*, "spec")=
## .. cols(
## .. Student_ID = col_character(),
## .. Age = col_double(),
## .. Gender = col_character(),
## .. High_School_GPA = col_double(),
## .. SAT_Score = col_double(),
## .. University_Ranking = col_double(),
## .. University_GPA = col_double(),
## .. Field_of_Study = col_character(),
## .. Internships_Completed = col_double(),
## .. Projects_Completed = col_double(),
## .. Certifications = col_double(),
## .. Soft_Skills_Score = col_double(),
## .. Networking_Score = col_double(),
## .. Job_Offers = col_double(),
## .. Starting_Salary = col_double(),
## .. Career_Satisfaction = col_double(),
## .. Years_to_Promotion = col_double(),
## .. Current_Job_Level = col_character(),
## .. Work_Life_Balance = col_double(),
## .. Entrepreneurship = col_character()
## .. )
## - attr(*, "problems")=<externalptr>
# Task: List the variables in your dataset
# names() returns the column names of the data frame as a character vector
names(education)
## [1] "Student_ID" "Age" "Gender"
## [4] "High_School_GPA" "SAT_Score" "University_Ranking"
## [7] "University_GPA" "Field_of_Study" "Internships_Completed"
## [10] "Projects_Completed" "Certifications" "Soft_Skills_Score"
## [13] "Networking_Score" "Job_Offers" "Starting_Salary"
## [16] "Career_Satisfaction" "Years_to_Promotion" "Current_Job_Level"
## [19] "Work_Life_Balance" "Entrepreneurship"
# Task: Print the top 15 rows of your dataset
# Show the first 15 records of the data frame
head(education, n = 15L)
## # A tibble: 15 × 20
## Student_ID Age Gender High_School_GPA SAT_Score University_Ranking
## <chr> <dbl> <chr> <dbl> <dbl> <dbl>
## 1 S00001 24 Male 3.58 1052 291
## 2 S00002 21 Other 2.52 1211 112
## 3 S00003 28 Female 3.42 1193 715
## 4 S00004 25 Male 2.43 1497 170
## 5 S00005 22 Male 2.08 1012 599
## 6 S00006 24 Male 2.4 1600 631
## 7 S00007 27 Male 2.36 1011 610
## 8 S00008 20 Male 2.68 1074 240
## 9 S00009 24 Male 2.84 1201 337
## 10 S00010 28 Male 3.02 1415 138
## 11 S00011 28 Female 2.95 1120 594
## 12 S00012 25 Female 2.54 1070 236
## 13 S00013 22 Female 2.06 1217 648
## 14 S00014 21 Male 3.21 1112 794
## 15 S00015 25 Male 2.79 1152 3
## # ℹ 14 more variables: University_GPA <dbl>, Field_of_Study <chr>,
## # Internships_Completed <dbl>, Projects_Completed <dbl>,
## # Certifications <dbl>, Soft_Skills_Score <dbl>, Networking_Score <dbl>,
## # Job_Offers <dbl>, Starting_Salary <dbl>, Career_Satisfaction <dbl>,
## # Years_to_Promotion <dbl>, Current_Job_Level <chr>, Work_Life_Balance <dbl>,
## # Entrepreneurship <chr>
# Task: Write a user defined function using any of the variables from the data set
# Labels returned by pass_or_fail(). String literals are already character,
# so no as.character() conversion is needed.
pass_term <- "Pass"
fail_term <- "Fail"

#' Classify GPA values as "Pass" or "Fail".
#'
#' Vectorised: accepts a single GPA or a whole column, so it can be used
#' directly on a data frame column or inside dplyr::mutate().
#'
#' @param gpa Numeric vector of GPA values.
#' @param threshold Minimum GPA that counts as a pass (default 2.5).
#' @return Character vector the same length as `gpa`: `pass_term` where
#'   `gpa >= threshold`, `fail_term` where it is below, and `NA` where the
#'   GPA itself is missing.
pass_or_fail <- function(gpa, threshold = 2.5) {
  # Start from NA so missing GPAs stay missing instead of raising an error
  outcome <- rep(NA_character_, length(gpa))
  known <- !is.na(gpa)
  outcome[known & gpa >= threshold] <- pass_term
  outcome[known & gpa < threshold] <- fail_term
  outcome
}
# Task: use data manipulation techniques and filter rows based on any logical criteria that exist in your dataset.
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ purrr 1.0.2
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# Check the helper on the first student's university GPA
pass_or_fail(education$University_GPA[1])
## [1] "Pass"
# Filter rows on a logical criterion: keep only the students whose
# university GPA is at or above the 2.5 pass mark
education_passed <- education %>%
  filter(University_GPA >= 2.5)
# Task: Identify the dependent & independent variables and use reshaping techniques and create a new data frame by joining those variables from your dataset.
library(dplyr)
library(tidyr)
# Keep the student id plus the three variables of interest
df_selected <- select(
  education,
  Student_ID, University_GPA, Internships_Completed, Job_Offers
)
# Wide -> long: stack University_GPA and Internships_Completed into
# name/value pairs, one row per student per variable
df_long <- pivot_longer(
  df_selected,
  cols = c(University_GPA, Internships_Completed),
  names_to = "Variable",
  values_to = "Value"
)
# Long -> wide: spread the name/value pairs back into one column per variable
df_wide <- pivot_wider(
  df_long,
  names_from = Variable,
  values_from = Value
)
# Task: Remove missing values in your dataset
# Count the missing values in each column before cleaning
na_flags <- is.na(education)
colSums(na_flags)
## Student_ID Age Gender
## 0 0 0
## High_School_GPA SAT_Score University_Ranking
## 0 0 0
## University_GPA Field_of_Study Internships_Completed
## 0 0 0
## Projects_Completed Certifications Soft_Skills_Score
## 0 0 0
## Networking_Score Job_Offers Starting_Salary
## 0 0 0
## Career_Satisfaction Years_to_Promotion Current_Job_Level
## 0 0 0
## Work_Life_Balance Entrepreneurship
## 0 0
# Drop every row with a missing value in ANY column. Later steps use columns
# such as Starting_Salary and University_Ranking, so cleaning only
# University_GPA and Job_Offers would not protect them.
education_clean <- education %>%
  drop_na()
# to verify that missing values are removed
colSums(is.na(education_clean))
## Student_ID Age Gender
## 0 0 0
## High_School_GPA SAT_Score University_Ranking
## 0 0 0
## University_GPA Field_of_Study Internships_Completed
## 0 0 0
## Projects_Completed Certifications Soft_Skills_Score
## 0 0 0
## Networking_Score Job_Offers Starting_Salary
## 0 0 0
## Career_Satisfaction Years_to_Promotion Current_Job_Level
## 0 0 0
## Work_Life_Balance Entrepreneurship
## 0 0
# Task Identify and remove duplicated data from your dataset.
# Flag every row that exactly repeats an earlier row
duplicate_rows <- duplicated(education)
# a. number of duplicate rows
sum(duplicate_rows)
## [1] 0
# b. the duplicate rows themselves
education[duplicate_rows, ]
## # A tibble: 0 × 20
## # ℹ 20 variables: Student_ID <chr>, Age <dbl>, Gender <chr>,
## # High_School_GPA <dbl>, SAT_Score <dbl>, University_Ranking <dbl>,
## # University_GPA <dbl>, Field_of_Study <chr>, Internships_Completed <dbl>,
## # Projects_Completed <dbl>, Certifications <dbl>, Soft_Skills_Score <dbl>,
## # Networking_Score <dbl>, Job_Offers <dbl>, Starting_Salary <dbl>,
## # Career_Satisfaction <dbl>, Years_to_Promotion <dbl>,
## # Current_Job_Level <chr>, Work_Life_Balance <dbl>, Entrepreneurship <chr>
# Remove exact duplicate rows; the count above is 0, so every row is kept here
education_unique <- distinct(education)
# Repeated values inside a single column are not duplicate records.
# This counts how many University_GPA values repeat an earlier value.
sum(duplicated(education$University_GPA))
## [1] 4799
# Task reorder multiple rows in descending order
library(dplyr)
# Highest University_GPA first; ties are ordered by Job_Offers, highest first
education_sorted <- arrange(
  education,
  desc(University_GPA),
  desc(Job_Offers)
)
# Show the first rows of the sorted data
head(education_sorted)
## # A tibble: 6 × 20
## Student_ID Age Gender High_School_GPA SAT_Score University_Ranking
## <chr> <dbl> <chr> <dbl> <dbl> <dbl>
## 1 S00114 20 Male 2.9 963 819
## 2 S03117 20 Male 3.85 1243 281
## 3 S01553 26 Female 2.11 1322 322
## 4 S02276 23 Male 3.8 1238 644
## 5 S04363 24 Female 3.34 1117 644
## 6 S04670 18 Female 2.48 1051 244
## # ℹ 14 more variables: University_GPA <dbl>, Field_of_Study <chr>,
## # Internships_Completed <dbl>, Projects_Completed <dbl>,
## # Certifications <dbl>, Soft_Skills_Score <dbl>, Networking_Score <dbl>,
## # Job_Offers <dbl>, Starting_Salary <dbl>, Career_Satisfaction <dbl>,
## # Years_to_Promotion <dbl>, Current_Job_Level <chr>, Work_Life_Balance <dbl>,
## # Entrepreneurship <chr>
# Task: rename some of the columns in your dataset
library(dplyr)
# Shorter names for three columns (new_name = old_name)
education_renamed <- rename(
  education,
  GPA = University_GPA,
  Internships = Internships_Completed,
  Offers = Job_Offers
)
# List the column names after renaming
names(education_renamed)
## [1] "Student_ID" "Age" "Gender"
## [4] "High_School_GPA" "SAT_Score" "University_Ranking"
## [7] "GPA" "Field_of_Study" "Internships"
## [10] "Projects_Completed" "Certifications" "Soft_Skills_Score"
## [13] "Networking_Score" "Offers" "Starting_Salary"
## [16] "Career_Satisfaction" "Years_to_Promotion" "Current_Job_Level"
## [19] "Work_Life_Balance" "Entrepreneurship"
# Task add new variables in your data frame by using a mathematical function
# Add a column Double_Salary holding Starting_Salary multiplied by 2
library(dplyr)
education_career_success <- mutate(
  education_clean,
  Double_Salary = 2 * Starting_Salary
)
# Task Create a training set using a random number generator engine.
# using set.seed()
library(dplyr)
# Fix the seed so every random draw below is reproducible between runs
set.seed(123)
# Demonstration draw: 5 distinct numbers from 1 to 10
random_numbers <- sample(1:10, 5)
# Training set: a random 80% of the cleaned rows; the remaining 20% are held
# out as a test set
n_rows <- nrow(education_clean)
train_rows <- sample(seq_len(n_rows), size = floor(0.8 * n_rows))
education_train <- education_clean[train_rows, ]
# setdiff() instead of negative indexing, which would select nothing if
# train_rows were empty
education_test <- education_clean[setdiff(seq_len(n_rows), train_rows), ]
print(random_numbers)
## [1] 3 10 2 8 6
# Task: Print the summary statistics of your dataset.
# summary() reports min, quartiles, mean and max for numeric columns and
# length/class/mode for character columns
summary(education)
## Student_ID Age Gender High_School_GPA
## Length:5000 Min. :18.00 Length:5000 Min. :2.000
## Class :character 1st Qu.:20.00 Class :character 1st Qu.:2.500
## Mode :character Median :23.00 Mode :character Median :2.990
## Mean :23.44 Mean :2.997
## 3rd Qu.:26.00 3rd Qu.:3.500
## Max. :29.00 Max. :4.000
## SAT_Score University_Ranking University_GPA Field_of_Study
## Min. : 900 Min. : 1.0 Min. :2.00 Length:5000
## 1st Qu.:1076 1st Qu.: 256.0 1st Qu.:2.52 Class :character
## Median :1257 Median : 501.5 Median :3.03 Mode :character
## Mean :1254 Mean : 504.3 Mean :3.02
## 3rd Qu.:1432 3rd Qu.: 759.0 3rd Qu.:3.51
## Max. :1600 Max. :1000.0 Max. :4.00
## Internships_Completed Projects_Completed Certifications Soft_Skills_Score
## Min. :0.000 Min. :0.000 Min. :0.000 Min. : 1.000
## 1st Qu.:1.000 1st Qu.:2.000 1st Qu.:1.000 1st Qu.: 3.000
## Median :2.000 Median :5.000 Median :3.000 Median : 6.000
## Mean :1.982 Mean :4.563 Mean :2.512 Mean : 5.546
## 3rd Qu.:3.000 3rd Qu.:7.000 3rd Qu.:4.000 3rd Qu.: 8.000
## Max. :4.000 Max. :9.000 Max. :5.000 Max. :10.000
## Networking_Score Job_Offers Starting_Salary Career_Satisfaction
## Min. : 1.000 Min. :0.000 Min. : 25000 Min. : 1.000
## 1st Qu.: 3.000 1st Qu.:1.000 1st Qu.: 40200 1st Qu.: 3.000
## Median : 6.000 Median :2.000 Median : 50300 Median : 6.000
## Mean : 5.538 Mean :2.489 Mean : 50564 Mean : 5.578
## 3rd Qu.: 8.000 3rd Qu.:4.000 3rd Qu.: 60500 3rd Qu.: 8.000
## Max. :10.000 Max. :5.000 Max. :101000 Max. :10.000
## Years_to_Promotion Current_Job_Level Work_Life_Balance Entrepreneurship
## Min. :1.000 Length:5000 Min. : 1.000 Length:5000
## 1st Qu.:2.000 Class :character 1st Qu.: 3.000 Class :character
## Median :3.000 Mode :character Median : 6.000 Mode :character
## Mean :3.016 Mean : 5.482
## 3rd Qu.:4.000 3rd Qu.: 8.000
## Max. :5.000 Max. :10.000
# Use any of the numerical variables from the dataset and perform the following statistical functions
# Median
# Take the median of a real dataset column (University_GPA) rather than a
# hand-typed sample; the result agrees with the Median row of summary(education)
gpa <- education$University_GPA
median_gpa <- median(gpa, na.rm = TRUE)
print(median_gpa)
## [1] 3.03
# Use any of the numerical variables from the dataset and perform the following statistical functions
# Mode
# Small sample with two modes (2 and 3 each occur twice)
data <- c(1, 2, 2, 3, 3, 4)
# Tabulate once and pick the value(s) with the highest count. The names and
# the mask must come from the SAME table: indexing a frequency-sorted vector
# with an unsorted mask returned the wrong values (3 and 1).
value_counts <- table(data)
mode_data <- as.numeric(names(value_counts)[value_counts == max(value_counts)])
print(mode_data)
## [1] 2 3
# Use any of the numerical variables from the dataset and perform the following statistical functions
# Range
# Smallest and largest University_GPA in the dataset (not a hand-typed sample);
# the result agrees with the Min/Max rows of summary(education)
gpa <- education$University_GPA
range_gpa <- range(gpa, na.rm = TRUE)
print(range_gpa)
## [1] 2 4
# Task: Plot a scatter plot for any 2 variables in your dataset.
library(ggplot2)
# Scatter plot: University_Ranking on x, Starting_Salary on y
ggplot(
  data = education_clean,
  mapping = aes(x = University_Ranking, y = Starting_Salary)
) +
  geom_point(size = 1, color = "purple", shape = 10, alpha = 0.3) +
  # NOTE(review): this second layer redraws the same points in orange on top
  # of the purple ones - confirm that both layers are intended
  geom_point(size = 1, color = "orange", shape = 10, alpha = 0.3)

# Task Plot a bar plot for any 2 variables in your dataset
library(ggplot2)
# Bar chart: number of students (y) for each Projects_Completed value (x)
ggplot(education_clean, mapping = aes(x = Projects_Completed)) +
  geom_bar() +
  ggtitle("Projects Completed Frequency") +
  xlab("Projects Completed") +
  ylab("Frequency")

# Task Find the correlation between any 2 variables by applying least square linear regression model.
library("knitr")
# Least-squares linear regression of Starting_Salary on University_GPA
salary_gpa_model <- lm(Starting_Salary ~ University_GPA, data = education_clean)
# Pearson correlation coefficient between the same two variables. For a
# one-predictor linear model, r squared equals the model's R-squared.
Education_Career_Success_Coefficient <- cor(
  education_clean$Starting_Salary,
  education_clean$University_GPA,
  method = "pearson"
)
# Report the fitted line and the correlation in one table
regression_summary <- data.frame(
  Intercept = coef(salary_gpa_model)[["(Intercept)"]],
  Slope = coef(salary_gpa_model)[["University_GPA"]],
  Pearson_r = Education_Career_Success_Coefficient,
  R_squared = summary(salary_gpa_model)$r.squared
)
kable(regression_summary)