Import data set for education_career_success.csv
library(readr)
#Read the CSV file into a tibble.
#NOTE(review): the path is absolute and machine-specific, so this line only runs where that file exists.
education_career_success <- read_csv(file = "C:\\Users\\Lenovo ThinkPad X1\\Downloads\\archive (2)\\education_career_success.csv")
## Rows: 5000 Columns: 20
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (5): Student_ID, Gender, Field_of_Study, Current_Job_Level, Entrepreneu...
## dbl (15): Age, High_School_GPA, SAT_Score, University_Ranking, University_GP...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
View structure of the dataset.
#Give the imported data a shorter working name.
edusuccess <- education_career_success
#Show each column's type and first few values.
str(object = edusuccess)
## spc_tbl_ [5,000 × 20] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ Student_ID : chr [1:5000] "S00001" "S00002" "S00003" "S00004" ...
## $ Age : num [1:5000] 24 21 28 25 22 24 27 20 24 28 ...
## $ Gender : chr [1:5000] "Male" "Other" "Female" "Male" ...
## $ High_School_GPA : num [1:5000] 3.58 2.52 3.42 2.43 2.08 2.4 2.36 2.68 2.84 3.02 ...
## $ SAT_Score : num [1:5000] 1052 1211 1193 1497 1012 ...
## $ University_Ranking : num [1:5000] 291 112 715 170 599 631 610 240 337 138 ...
## $ University_GPA : num [1:5000] 3.96 3.63 2.63 2.81 2.48 3.78 3.83 2.84 3.31 2.33 ...
## $ Field_of_Study : chr [1:5000] "Arts" "Law" "Medicine" "Computer Science" ...
## $ Internships_Completed: num [1:5000] 3 4 4 3 4 2 0 1 2 1 ...
## $ Projects_Completed : num [1:5000] 7 7 8 9 6 3 1 5 3 5 ...
## $ Certifications : num [1:5000] 2 3 1 1 4 2 3 5 0 3 ...
## $ Soft_Skills_Score : num [1:5000] 9 8 1 10 10 2 3 5 5 10 ...
## $ Networking_Score : num [1:5000] 8 1 9 6 9 2 3 1 5 2 ...
## $ Job_Offers : num [1:5000] 5 4 0 1 4 1 2 2 2 0 ...
## $ Starting_Salary : num [1:5000] 27200 25000 42400 57400 47600 68400 55500 38000 68900 58900 ...
## $ Career_Satisfaction : num [1:5000] 4 1 9 7 9 9 7 2 2 4 ...
## $ Years_to_Promotion : num [1:5000] 5 1 3 5 5 2 4 3 2 2 ...
## $ Current_Job_Level : chr [1:5000] "Entry" "Mid" "Entry" "Mid" ...
## $ Work_Life_Balance : num [1:5000] 7 7 7 5 2 8 3 3 2 2 ...
## $ Entrepreneurship : chr [1:5000] "No" "No" "No" "No" ...
## - attr(*, "spec")=
## .. cols(
## .. Student_ID = col_character(),
## .. Age = col_double(),
## .. Gender = col_character(),
## .. High_School_GPA = col_double(),
## .. SAT_Score = col_double(),
## .. University_Ranking = col_double(),
## .. University_GPA = col_double(),
## .. Field_of_Study = col_character(),
## .. Internships_Completed = col_double(),
## .. Projects_Completed = col_double(),
## .. Certifications = col_double(),
## .. Soft_Skills_Score = col_double(),
## .. Networking_Score = col_double(),
## .. Job_Offers = col_double(),
## .. Starting_Salary = col_double(),
## .. Career_Satisfaction = col_double(),
## .. Years_to_Promotion = col_double(),
## .. Current_Job_Level = col_character(),
## .. Work_Life_Balance = col_double(),
## .. Entrepreneurship = col_character()
## .. )
## - attr(*, "problems")=<externalptr>
List the variables in your dataset.
#names() returns the variable (column) names; list() would only wrap the whole tibble in a list.
names(edusuccess)
## [1] "Student_ID" "Age" "Gender"
## [4] "High_School_GPA" "SAT_Score" "University_Ranking"
## [7] "University_GPA" "Field_of_Study" "Internships_Completed"
## [10] "Projects_Completed" "Certifications" "Soft_Skills_Score"
## [13] "Networking_Score" "Job_Offers" "Starting_Salary"
## [16] "Career_Satisfaction" "Years_to_Promotion" "Current_Job_Level"
## [19] "Work_Life_Balance" "Entrepreneurship"
Print the top 15 rows of your dataset.
#Show the first 15 rows.
head(edusuccess, n = 15L)
## # A tibble: 15 × 20
## Student_ID Age Gender High_School_GPA SAT_Score University_Ranking
## <chr> <dbl> <chr> <dbl> <dbl> <dbl>
## 1 S00001 24 Male 3.58 1052 291
## 2 S00002 21 Other 2.52 1211 112
## 3 S00003 28 Female 3.42 1193 715
## 4 S00004 25 Male 2.43 1497 170
## 5 S00005 22 Male 2.08 1012 599
## 6 S00006 24 Male 2.4 1600 631
## 7 S00007 27 Male 2.36 1011 610
## 8 S00008 20 Male 2.68 1074 240
## 9 S00009 24 Male 2.84 1201 337
## 10 S00010 28 Male 3.02 1415 138
## 11 S00011 28 Female 2.95 1120 594
## 12 S00012 25 Female 2.54 1070 236
## 13 S00013 22 Female 2.06 1217 648
## 14 S00014 21 Male 3.21 1112 794
## 15 S00015 25 Male 2.79 1152 3
## # ℹ 14 more variables: University_GPA <dbl>, Field_of_Study <chr>,
## # Internships_Completed <dbl>, Projects_Completed <dbl>,
## # Certifications <dbl>, Soft_Skills_Score <dbl>, Networking_Score <dbl>,
## # Job_Offers <dbl>, Starting_Salary <dbl>, Career_Satisfaction <dbl>,
## # Years_to_Promotion <dbl>, Current_Job_Level <chr>, Work_Life_Balance <dbl>,
## # Entrepreneurship <chr>
Write a user defined function using any of the variables from the dataset.
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ purrr 1.0.2
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
#Keep only the student ID and the two GPA columns in a separate data frame.
studentgpa <- select(
  edusuccess,
  Student_ID, High_School_GPA, University_GPA
)
#Create user defined function.
#' Label students as co-op eligible based on their university GPA.
#'
#' @param University_GPA Numeric vector of university GPAs.
#' @param threshold Single number the GPA must exceed (strictly) to be
#'   eligible. Defaults to 3.3, the cut-off that used to be hard-coded.
#' @return A vector the same length as `University_GPA` holding
#'   "Co-op Eligible" or "Not Co-op Eligible"; `NA` inputs stay `NA`.
coopeligibility <- function(University_GPA, threshold = 3.3) {
  #A vector threshold would be silently recycled against the GPAs.
  stopifnot(is.numeric(threshold), length(threshold) == 1)
  ifelse(University_GPA > threshold, "Co-op Eligible", "Not Co-op Eligible")
}
#Apply the function to every student and store the label as a new column.
studentgpa <- studentgpa %>%
  mutate(Coop_Students = coopeligibility(University_GPA))
#Show the first rows as a formatted table.
library(knitr)
studentgpa %>%
  head() %>%
  kable()
| Student_ID | High_School_GPA | University_GPA | Coop_Students |
|---|---|---|---|
| S00001 | 3.58 | 3.96 | Co-op Eligible |
| S00002 | 2.52 | 3.63 | Co-op Eligible |
| S00003 | 3.42 | 2.63 | Not Co-op Eligible |
| S00004 | 2.43 | 2.81 | Not Co-op Eligible |
| S00005 | 2.08 | 2.48 | Not Co-op Eligible |
| S00006 | 2.40 | 3.78 | Co-op Eligible |
Use data manipulation techniques and filter rows based on any logical criteria that exist in your dataset.
#Copy the eligibility label onto the main data frame.
edusuccess[["Coop_Students"]] <- studentgpa[["Coop_Students"]]
#Keep only the co-op eligible Medicine students.
medcoopstudents <- edusuccess %>%
  filter(Field_of_Study == "Medicine" & Coop_Students == "Co-op Eligible")
#Show the first rows as a formatted table.
library(knitr)
medcoopstudents %>%
  head() %>%
  kable()
| Student_ID | Age | Gender | High_School_GPA | SAT_Score | University_Ranking | University_GPA | Field_of_Study | Internships_Completed | Projects_Completed | Certifications | Soft_Skills_Score | Networking_Score | Job_Offers | Starting_Salary | Career_Satisfaction | Years_to_Promotion | Current_Job_Level | Work_Life_Balance | Entrepreneurship | Coop_Students |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| S00024 | 19 | Female | 2.88 | 1245 | 201 | 3.36 | Medicine | 3 | 0 | 3 | 9 | 10 | 2 | 48100 | 3 | 4 | Mid | 5 | No | Co-op Eligible |
| S00026 | 22 | Female | 3.65 | 1142 | 342 | 3.69 | Medicine | 2 | 2 | 1 | 2 | 8 | 4 | 48100 | 3 | 2 | Entry | 5 | No | Co-op Eligible |
| S00029 | 27 | Male | 3.73 | 1340 | 216 | 3.33 | Medicine | 1 | 0 | 3 | 1 | 9 | 3 | 50300 | 8 | 2 | Mid | 6 | No | Co-op Eligible |
| S00043 | 26 | Female | 2.36 | 1355 | 98 | 3.75 | Medicine | 2 | 4 | 3 | 3 | 8 | 0 | 38400 | 10 | 4 | Senior | 7 | No | Co-op Eligible |
| S00081 | 29 | Male | 3.03 | 1546 | 767 | 3.96 | Medicine | 3 | 4 | 3 | 2 | 3 | 2 | 51200 | 3 | 3 | Entry | 5 | No | Co-op Eligible |
| S00116 | 28 | Female | 2.10 | 916 | 78 | 3.79 | Medicine | 2 | 3 | 0 | 4 | 4 | 4 | 89100 | 9 | 5 | Senior | 3 | Yes | Co-op Eligible |
Identify the dependent and independent variables and use reshaping techniques and create a new data frame by joining those variables from your dataset.
#Identify the variables; Dependent = Starting_Salary, Independent = Field_of_Study, Certifications, University_GPA, Soft_Skills_Score
#Create data frames from the same working data frame so their rows stay aligned.
#Select the dependent variable (Starting_Salary)
dependent <- edusuccess %>% select(Starting_Salary)
#Select the independent variables (Field_of_Study, Certifications, University_GPA, Soft_Skills_Score)
independent <- edusuccess %>% select(Field_of_Study, Certifications, University_GPA, Soft_Skills_Score)
#Combine rows using rbind.
#NOTE(review): this appears to append four placeholder NA rows to the salary column rather than stacking real observations, and the result is not used below - confirm intent.
combined_data_by_rows <- as.data.frame(rbind(dependent, Field_of_Study = NA, Certifications = NA, University_GPA = NA, Soft_Skills_Score = NA))
#Combine columns using cbind: the dependent variable first, then the independent variables.
salarynew <- as.data.frame(cbind(dependent, independent))
#View.
kable(head(salarynew))
| Starting_Salary | Field_of_Study | Certifications | University_GPA | Soft_Skills_Score |
|---|---|---|---|---|
| 27200 | Arts | 2 | 3.96 | 9 |
| 25000 | Law | 3 | 3.63 | 8 |
| 42400 | Medicine | 1 | 2.63 | 1 |
| 57400 | Computer Science | 1 | 2.81 | 10 |
| 47600 | Engineering | 4 | 2.48 | 10 |
| 68400 | Law | 2 | 3.78 | 2 |
Remove missing values in your dataset.
library("knitr")
#Count the missing values in every column.
edusuccess %>%
  is.na() %>%
  colSums()
## Student_ID Age Gender
## 0 0 0
## High_School_GPA SAT_Score University_Ranking
## 0 0 0
## University_GPA Field_of_Study Internships_Completed
## 0 0 0
## Projects_Completed Certifications Soft_Skills_Score
## 0 0 0
## Networking_Score Job_Offers Starting_Salary
## 0 0 0
## Career_Satisfaction Years_to_Promotion Current_Job_Level
## 0 0 0
## Work_Life_Balance Entrepreneurship Coop_Students
## 0 0 0
#No missing values were found.
#Remove rows with missing values, if any.
edusuccess <- edusuccess %>% drop_na()
#Confirm the number of non-missing values left in every column.
colSums(!is.na(edusuccess))
## Student_ID Age Gender
## 5000 5000 5000
## High_School_GPA SAT_Score University_Ranking
## 5000 5000 5000
## University_GPA Field_of_Study Internships_Completed
## 5000 5000 5000
## Projects_Completed Certifications Soft_Skills_Score
## 5000 5000 5000
## Networking_Score Job_Offers Starting_Salary
## 5000 5000 5000
## Career_Satisfaction Years_to_Promotion Current_Job_Level
## 5000 5000 5000
## Work_Life_Balance Entrepreneurship Coop_Students
## 5000 5000 5000
#Show the first rows as a formatted table.
edusuccess %>%
  head() %>%
  kable()
| Student_ID | Age | Gender | High_School_GPA | SAT_Score | University_Ranking | University_GPA | Field_of_Study | Internships_Completed | Projects_Completed | Certifications | Soft_Skills_Score | Networking_Score | Job_Offers | Starting_Salary | Career_Satisfaction | Years_to_Promotion | Current_Job_Level | Work_Life_Balance | Entrepreneurship | Coop_Students |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| S00001 | 24 | Male | 3.58 | 1052 | 291 | 3.96 | Arts | 3 | 7 | 2 | 9 | 8 | 5 | 27200 | 4 | 5 | Entry | 7 | No | Co-op Eligible |
| S00002 | 21 | Other | 2.52 | 1211 | 112 | 3.63 | Law | 4 | 7 | 3 | 8 | 1 | 4 | 25000 | 1 | 1 | Mid | 7 | No | Co-op Eligible |
| S00003 | 28 | Female | 3.42 | 1193 | 715 | 2.63 | Medicine | 4 | 8 | 1 | 1 | 9 | 0 | 42400 | 9 | 3 | Entry | 7 | No | Not Co-op Eligible |
| S00004 | 25 | Male | 2.43 | 1497 | 170 | 2.81 | Computer Science | 3 | 9 | 1 | 10 | 6 | 1 | 57400 | 7 | 5 | Mid | 5 | No | Not Co-op Eligible |
| S00005 | 22 | Male | 2.08 | 1012 | 599 | 2.48 | Engineering | 4 | 6 | 4 | 10 | 9 | 4 | 47600 | 9 | 5 | Entry | 2 | No | Not Co-op Eligible |
| S00006 | 24 | Male | 2.40 | 1600 | 631 | 3.78 | Law | 2 | 3 | 2 | 2 | 2 | 1 | 68400 | 9 | 2 | Entry | 8 | Yes | Co-op Eligible |
Identify and remove duplicated data from your dataset.
#Identify duplicates (rows that exactly repeat an earlier row).
duplicates <- edusuccess[duplicated(edusuccess), ]
#Remove duplicates, keeping the first occurrence of each row.
edusuccess <- edusuccess[!duplicated(edusuccess), ]
#View duplicates.
library(knitr)
kable(head(duplicates))
| Student_ID | Age | Gender | High_School_GPA | SAT_Score | University_Ranking | University_GPA | Field_of_Study | Internships_Completed | Projects_Completed | Certifications | Soft_Skills_Score | Networking_Score | Job_Offers | Starting_Salary | Career_Satisfaction | Years_to_Promotion | Current_Job_Level | Work_Life_Balance | Entrepreneurship | Coop_Students |
|---|
#No duplicates are found.
Reorder multiple rows in descending order.
#Sort the rows from highest to lowest Starting_Salary.
edusuccessbysalary <- arrange(edusuccess, desc(Starting_Salary))
library(knitr)
edusuccessbysalary %>%
  head() %>%
  kable()
| Student_ID | Age | Gender | High_School_GPA | SAT_Score | University_Ranking | University_GPA | Field_of_Study | Internships_Completed | Projects_Completed | Certifications | Soft_Skills_Score | Networking_Score | Job_Offers | Starting_Salary | Career_Satisfaction | Years_to_Promotion | Current_Job_Level | Work_Life_Balance | Entrepreneurship | Coop_Students |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| S01810 | 28 | Other | 2.34 | 1089 | 24 | 3.01 | Arts | 0 | 6 | 2 | 3 | 8 | 1 | 101000 | 6 | 4 | Entry | 8 | No | Not Co-op Eligible |
| S04539 | 21 | Male | 2.38 | 1450 | 943 | 3.93 | Law | 1 | 1 | 3 | 3 | 4 | 0 | 100600 | 6 | 5 | Entry | 8 | No | Co-op Eligible |
| S03831 | 20 | Male | 3.23 | 1238 | 234 | 2.82 | Arts | 2 | 7 | 3 | 5 | 3 | 5 | 98900 | 8 | 1 | Entry | 2 | Yes | Not Co-op Eligible |
| S03504 | 25 | Male | 3.99 | 982 | 459 | 2.70 | Engineering | 4 | 6 | 2 | 10 | 2 | 0 | 98200 | 4 | 4 | Senior | 5 | No | Not Co-op Eligible |
| S00017 | 20 | Female | 3.73 | 1539 | 116 | 3.78 | Law | 3 | 2 | 3 | 2 | 5 | 3 | 97500 | 8 | 5 | Mid | 9 | No | Co-op Eligible |
| S01341 | 25 | Female | 2.97 | 900 | 835 | 3.07 | Law | 3 | 7 | 4 | 2 | 9 | 5 | 96900 | 7 | 3 | Entry | 2 | No | Not Co-op Eligible |
Rename some of the column names in your dataset.
#Rename three columns (new_name = old_name).
edusuccess <- edusuccess %>%
  rename(
    Student_Gender = Gender,
    Student_Age = Age,
    Completed_Internships = Internships_Completed
  )
library(knitr)
edusuccess %>%
  head() %>%
  kable()
| Student_ID | Student_Age | Student_Gender | High_School_GPA | SAT_Score | University_Ranking | University_GPA | Field_of_Study | Completed_Internships | Projects_Completed | Certifications | Soft_Skills_Score | Networking_Score | Job_Offers | Starting_Salary | Career_Satisfaction | Years_to_Promotion | Current_Job_Level | Work_Life_Balance | Entrepreneurship | Coop_Students |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| S00001 | 24 | Male | 3.58 | 1052 | 291 | 3.96 | Arts | 3 | 7 | 2 | 9 | 8 | 5 | 27200 | 4 | 5 | Entry | 7 | No | Co-op Eligible |
| S00002 | 21 | Other | 2.52 | 1211 | 112 | 3.63 | Law | 4 | 7 | 3 | 8 | 1 | 4 | 25000 | 1 | 1 | Mid | 7 | No | Co-op Eligible |
| S00003 | 28 | Female | 3.42 | 1193 | 715 | 2.63 | Medicine | 4 | 8 | 1 | 1 | 9 | 0 | 42400 | 9 | 3 | Entry | 7 | No | Not Co-op Eligible |
| S00004 | 25 | Male | 2.43 | 1497 | 170 | 2.81 | Computer Science | 3 | 9 | 1 | 10 | 6 | 1 | 57400 | 7 | 5 | Mid | 5 | No | Not Co-op Eligible |
| S00005 | 22 | Male | 2.08 | 1012 | 599 | 2.48 | Engineering | 4 | 6 | 4 | 10 | 9 | 4 | 47600 | 9 | 5 | Entry | 2 | No | Not Co-op Eligible |
| S00006 | 24 | Male | 2.40 | 1600 | 631 | 3.78 | Law | 2 | 3 | 2 | 2 | 2 | 1 | 68400 | 9 | 2 | Entry | 8 | Yes | Co-op Eligible |
Add new variables in your data frame by using a mathematical function.
#Add Monthly_Salary as Starting_Salary divided by 12.
edusuccess$Monthly_Salary <- edusuccess$Starting_Salary / 12
#Show the first rows as a formatted table.
library(knitr)
edusuccess %>%
  head() %>%
  kable()
| Student_ID | Student_Age | Student_Gender | High_School_GPA | SAT_Score | University_Ranking | University_GPA | Field_of_Study | Completed_Internships | Projects_Completed | Certifications | Soft_Skills_Score | Networking_Score | Job_Offers | Starting_Salary | Career_Satisfaction | Years_to_Promotion | Current_Job_Level | Work_Life_Balance | Entrepreneurship | Coop_Students | Monthly_Salary |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| S00001 | 24 | Male | 3.58 | 1052 | 291 | 3.96 | Arts | 3 | 7 | 2 | 9 | 8 | 5 | 27200 | 4 | 5 | Entry | 7 | No | Co-op Eligible | 2266.667 |
| S00002 | 21 | Other | 2.52 | 1211 | 112 | 3.63 | Law | 4 | 7 | 3 | 8 | 1 | 4 | 25000 | 1 | 1 | Mid | 7 | No | Co-op Eligible | 2083.333 |
| S00003 | 28 | Female | 3.42 | 1193 | 715 | 2.63 | Medicine | 4 | 8 | 1 | 1 | 9 | 0 | 42400 | 9 | 3 | Entry | 7 | No | Not Co-op Eligible | 3533.333 |
| S00004 | 25 | Male | 2.43 | 1497 | 170 | 2.81 | Computer Science | 3 | 9 | 1 | 10 | 6 | 1 | 57400 | 7 | 5 | Mid | 5 | No | Not Co-op Eligible | 4783.333 |
| S00005 | 22 | Male | 2.08 | 1012 | 599 | 2.48 | Engineering | 4 | 6 | 4 | 10 | 9 | 4 | 47600 | 9 | 5 | Entry | 2 | No | Not Co-op Eligible | 3966.667 |
| S00006 | 24 | Male | 2.40 | 1600 | 631 | 3.78 | Law | 2 | 3 | 2 | 2 | 2 | 1 | 68400 | 9 | 2 | Entry | 8 | Yes | Co-op Eligible | 5700.000 |
Create a training set using a random number generator engine.
#Initiate random number generator engine so the samples are reproducible.
set.seed(1234)
#Extract 5 random rows without replacement and keep them instead of only printing them.
training_sample <- edusuccess %>% sample_n(5, replace = FALSE)
training_sample
## # A tibble: 5 × 22
## Student_ID Student_Age Student_Gender High_School_GPA SAT_Score
## <chr> <dbl> <chr> <dbl> <dbl>
## 1 S01004 24 Female 3.01 1000
## 2 S00623 26 Female 3.23 1405
## 3 S02693 25 Male 3.7 943
## 4 S00934 29 Female 2.91 1442
## 5 S04496 18 Male 3.9 1534
## # ℹ 17 more variables: University_Ranking <dbl>, University_GPA <dbl>,
## # Field_of_Study <chr>, Completed_Internships <dbl>,
## # Projects_Completed <dbl>, Certifications <dbl>, Soft_Skills_Score <dbl>,
## # Networking_Score <dbl>, Job_Offers <dbl>, Starting_Salary <dbl>,
## # Career_Satisfaction <dbl>, Years_to_Promotion <dbl>,
## # Current_Job_Level <chr>, Work_Life_Balance <dbl>, Entrepreneurship <chr>,
## # Coop_Students <chr>, Monthly_Salary <dbl>
#Extract 5% of rows, randomly without replacement, and store them as the training set.
training_set <- edusuccess %>% sample_frac(0.05, replace = FALSE)
training_set
## # A tibble: 250 × 22
## Student_ID Student_Age Student_Gender High_School_GPA SAT_Score
## <chr> <dbl> <chr> <dbl> <dbl>
## 1 S02948 18 Female 2.7 1067
## 2 S02146 27 Male 3.43 1431
## 3 S03175 26 Female 3.41 956
## 4 S02774 24 Male 2.09 1033
## 5 S02374 18 Male 2.36 1124
## 6 S01103 21 Male 3.12 1179
## 7 S04046 21 Male 3.62 1300
## 8 S04366 19 Male 2.69 1248
## 9 S03454 27 Male 2.43 1022
## 10 S02232 25 Male 3.52 1045
## # ℹ 240 more rows
## # ℹ 17 more variables: University_Ranking <dbl>, University_GPA <dbl>,
## # Field_of_Study <chr>, Completed_Internships <dbl>,
## # Projects_Completed <dbl>, Certifications <dbl>, Soft_Skills_Score <dbl>,
## # Networking_Score <dbl>, Job_Offers <dbl>, Starting_Salary <dbl>,
## # Career_Satisfaction <dbl>, Years_to_Promotion <dbl>,
## # Current_Job_Level <chr>, Work_Life_Balance <dbl>, Entrepreneurship <chr>, …
Print the summary statistics of your dataset.
#Summarise every column of the data frame.
edusuccess %>%
  summary()
## Student_ID Student_Age Student_Gender High_School_GPA
## Length:5000 Min. :18.00 Length:5000 Min. :2.000
## Class :character 1st Qu.:20.00 Class :character 1st Qu.:2.500
## Mode :character Median :23.00 Mode :character Median :2.990
## Mean :23.44 Mean :2.997
## 3rd Qu.:26.00 3rd Qu.:3.500
## Max. :29.00 Max. :4.000
## SAT_Score University_Ranking University_GPA Field_of_Study
## Min. : 900 Min. : 1.0 Min. :2.00 Length:5000
## 1st Qu.:1076 1st Qu.: 256.0 1st Qu.:2.52 Class :character
## Median :1257 Median : 501.5 Median :3.03 Mode :character
## Mean :1254 Mean : 504.3 Mean :3.02
## 3rd Qu.:1432 3rd Qu.: 759.0 3rd Qu.:3.51
## Max. :1600 Max. :1000.0 Max. :4.00
## Completed_Internships Projects_Completed Certifications Soft_Skills_Score
## Min. :0.000 Min. :0.000 Min. :0.000 Min. : 1.000
## 1st Qu.:1.000 1st Qu.:2.000 1st Qu.:1.000 1st Qu.: 3.000
## Median :2.000 Median :5.000 Median :3.000 Median : 6.000
## Mean :1.982 Mean :4.563 Mean :2.512 Mean : 5.546
## 3rd Qu.:3.000 3rd Qu.:7.000 3rd Qu.:4.000 3rd Qu.: 8.000
## Max. :4.000 Max. :9.000 Max. :5.000 Max. :10.000
## Networking_Score Job_Offers Starting_Salary Career_Satisfaction
## Min. : 1.000 Min. :0.000 Min. : 25000 Min. : 1.000
## 1st Qu.: 3.000 1st Qu.:1.000 1st Qu.: 40200 1st Qu.: 3.000
## Median : 6.000 Median :2.000 Median : 50300 Median : 6.000
## Mean : 5.538 Mean :2.489 Mean : 50564 Mean : 5.578
## 3rd Qu.: 8.000 3rd Qu.:4.000 3rd Qu.: 60500 3rd Qu.: 8.000
## Max. :10.000 Max. :5.000 Max. :101000 Max. :10.000
## Years_to_Promotion Current_Job_Level Work_Life_Balance Entrepreneurship
## Min. :1.000 Length:5000 Min. : 1.000 Length:5000
## 1st Qu.:2.000 Class :character 1st Qu.: 3.000 Class :character
## Median :3.000 Mode :character Median : 6.000 Mode :character
## Mean :3.016 Mean : 5.482
## 3rd Qu.:4.000 3rd Qu.: 8.000
## Max. :5.000 Max. :10.000
## Coop_Students Monthly_Salary
## Length:5000 Min. :2083
## Class :character 1st Qu.:3350
## Mode :character Median :4192
## Mean :4214
## 3rd Qu.:5042
## Max. :8417
Use any of the numerical variables from the dataset and perform the following statistical functions: Mean, Median, Mode, Range.
#Calculate mean for Starting_Salary (using the working data frame, not the raw import).
mean_salary <- mean(edusuccess$Starting_Salary)
print(paste("Average Starting Salary:", mean_salary))
## [1] "Average Starting Salary: 50563.54"
#Calculate median for Starting_Salary.
median_salary <- median(edusuccess$Starting_Salary)
print(paste("Median of Starting Salary:", median_salary))
## [1] "Median of Starting Salary: 50300"
#Calculate mode for Starting_Salary: tabulate the values and take the most frequent one.
Frequency_mode_salary_table <- table(edusuccess$Starting_Salary)
#which.max() returns the first maximum, so ties resolve to the smallest salary.
mode_salary <- as.numeric(names(Frequency_mode_salary_table)[which.max(Frequency_mode_salary_table)])
#Frequency of the mode.
mode_frequency <- max(Frequency_mode_salary_table)
print(paste("Mode of Starting Salary:", mode_salary))
## [1] "Mode of Starting Salary: 25000"
#Get the range (min and max) of Starting_Salary.
range_Salary <- range(edusuccess$Starting_Salary)
#Calculate the difference (max - min).
range_diff <- range_Salary[2] - range_Salary[1]
print(paste("Range of Starting Salary:", range_diff))
## [1] "Range of Starting Salary: 76000"
Plot a scatter plot for any 2 variables in your dataset.
library(ggplot2)
#Scatter plot of Starting_Salary (y) against University_GPA (x).
ggplot(data = edusuccess, mapping = aes(x = University_GPA, y = Starting_Salary)) +
  geom_point(shape = 20, size = 1, color = "blue")
Plot a bar plot for any 2 variables in your dataset.
#Plot a bar plot of the average Starting_Salary for each Field_of_Study.
#stat = "summary" draws one bar per field at the mean salary; stat = "identity" stacked every student's salary into a per-field total.
ggplot(edusuccess, aes(x = Field_of_Study, y = Starting_Salary)) +
  geom_bar(stat = "summary", fun = "mean", fill = "limegreen") +
  labs(
    title = "Average Starting Salary by Field of Study",
    x = "Field of Study",
    y = "Average Starting Salary"
  )
Find the correlation between any 2 variables by applying a least-squares linear regression model.
library("knitr")
#Fit a least-squares linear regression of Starting_Salary on University_GPA.
salary_model <- lm(Starting_Salary ~ University_GPA, data = edusuccess)
#For a one-predictor model, Pearson's r is the square root of R-squared carrying the sign of the slope.
salary_slope <- unname(coef(salary_model)["University_GPA"])
edusuccesscorr <- sign(salary_slope) * sqrt(summary(salary_model)$r.squared)
#Print correlation coefficient.
kable(head(edusuccesscorr))
| x |
|---|
| 0.0010225 |