R programming analysis for COMP4033

Import data set for education_career_success.csv

library(readr)
#Read the CSV file into a tibble with readr.
#NOTE(review): the path is absolute and specific to one machine; a project-relative path would make the script portable.
education_career_success <- read_csv("C:\\Users\\Lenovo ThinkPad X1\\Downloads\\archive (2)\\education_career_success.csv")
## Rows: 5000 Columns: 20
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (5): Student_ID, Gender, Field_of_Study, Current_Job_Level, Entrepreneu...
## dbl (15): Age, High_School_GPA, SAT_Score, University_Ranking, University_GP...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

View structure of the dataset.

#Copy the imported data to a shorter working name.
edusuccess <- education_career_success

#Show each column's type and its first few values.
str(edusuccess)
## spc_tbl_ [5,000 × 20] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ Student_ID           : chr [1:5000] "S00001" "S00002" "S00003" "S00004" ...
##  $ Age                  : num [1:5000] 24 21 28 25 22 24 27 20 24 28 ...
##  $ Gender               : chr [1:5000] "Male" "Other" "Female" "Male" ...
##  $ High_School_GPA      : num [1:5000] 3.58 2.52 3.42 2.43 2.08 2.4 2.36 2.68 2.84 3.02 ...
##  $ SAT_Score            : num [1:5000] 1052 1211 1193 1497 1012 ...
##  $ University_Ranking   : num [1:5000] 291 112 715 170 599 631 610 240 337 138 ...
##  $ University_GPA       : num [1:5000] 3.96 3.63 2.63 2.81 2.48 3.78 3.83 2.84 3.31 2.33 ...
##  $ Field_of_Study       : chr [1:5000] "Arts" "Law" "Medicine" "Computer Science" ...
##  $ Internships_Completed: num [1:5000] 3 4 4 3 4 2 0 1 2 1 ...
##  $ Projects_Completed   : num [1:5000] 7 7 8 9 6 3 1 5 3 5 ...
##  $ Certifications       : num [1:5000] 2 3 1 1 4 2 3 5 0 3 ...
##  $ Soft_Skills_Score    : num [1:5000] 9 8 1 10 10 2 3 5 5 10 ...
##  $ Networking_Score     : num [1:5000] 8 1 9 6 9 2 3 1 5 2 ...
##  $ Job_Offers           : num [1:5000] 5 4 0 1 4 1 2 2 2 0 ...
##  $ Starting_Salary      : num [1:5000] 27200 25000 42400 57400 47600 68400 55500 38000 68900 58900 ...
##  $ Career_Satisfaction  : num [1:5000] 4 1 9 7 9 9 7 2 2 4 ...
##  $ Years_to_Promotion   : num [1:5000] 5 1 3 5 5 2 4 3 2 2 ...
##  $ Current_Job_Level    : chr [1:5000] "Entry" "Mid" "Entry" "Mid" ...
##  $ Work_Life_Balance    : num [1:5000] 7 7 7 5 2 8 3 3 2 2 ...
##  $ Entrepreneurship     : chr [1:5000] "No" "No" "No" "No" ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   Student_ID = col_character(),
##   ..   Age = col_double(),
##   ..   Gender = col_character(),
##   ..   High_School_GPA = col_double(),
##   ..   SAT_Score = col_double(),
##   ..   University_Ranking = col_double(),
##   ..   University_GPA = col_double(),
##   ..   Field_of_Study = col_character(),
##   ..   Internships_Completed = col_double(),
##   ..   Projects_Completed = col_double(),
##   ..   Certifications = col_double(),
##   ..   Soft_Skills_Score = col_double(),
##   ..   Networking_Score = col_double(),
##   ..   Job_Offers = col_double(),
##   ..   Starting_Salary = col_double(),
##   ..   Career_Satisfaction = col_double(),
##   ..   Years_to_Promotion = col_double(),
##   ..   Current_Job_Level = col_character(),
##   ..   Work_Life_Balance = col_double(),
##   ..   Entrepreneurship = col_character()
##   .. )
##  - attr(*, "problems")=<externalptr>

List the variables in your dataset.

#Print the dataset wrapped in a one-element list.
#NOTE(review): list() does not list the variable names; names(edusuccess) would - confirm intent.
list(edusuccess)
## [[1]]
## # A tibble: 5,000 × 20
##    Student_ID   Age Gender High_School_GPA SAT_Score University_Ranking
##    <chr>      <dbl> <chr>            <dbl>     <dbl>              <dbl>
##  1 S00001        24 Male              3.58      1052                291
##  2 S00002        21 Other             2.52      1211                112
##  3 S00003        28 Female            3.42      1193                715
##  4 S00004        25 Male              2.43      1497                170
##  5 S00005        22 Male              2.08      1012                599
##  6 S00006        24 Male              2.4       1600                631
##  7 S00007        27 Male              2.36      1011                610
##  8 S00008        20 Male              2.68      1074                240
##  9 S00009        24 Male              2.84      1201                337
## 10 S00010        28 Male              3.02      1415                138
## # ℹ 4,990 more rows
## # ℹ 14 more variables: University_GPA <dbl>, Field_of_Study <chr>,
## #   Internships_Completed <dbl>, Projects_Completed <dbl>,
## #   Certifications <dbl>, Soft_Skills_Score <dbl>, Networking_Score <dbl>,
## #   Job_Offers <dbl>, Starting_Salary <dbl>, Career_Satisfaction <dbl>,
## #   Years_to_Promotion <dbl>, Current_Job_Level <chr>, Work_Life_Balance <dbl>,
## #   Entrepreneurship <chr>

Print the top 15 rows of your dataset.

#Show the first 15 rows.
head(edusuccess, n = 15)
## # A tibble: 15 × 20
##    Student_ID   Age Gender High_School_GPA SAT_Score University_Ranking
##    <chr>      <dbl> <chr>            <dbl>     <dbl>              <dbl>
##  1 S00001        24 Male              3.58      1052                291
##  2 S00002        21 Other             2.52      1211                112
##  3 S00003        28 Female            3.42      1193                715
##  4 S00004        25 Male              2.43      1497                170
##  5 S00005        22 Male              2.08      1012                599
##  6 S00006        24 Male              2.4       1600                631
##  7 S00007        27 Male              2.36      1011                610
##  8 S00008        20 Male              2.68      1074                240
##  9 S00009        24 Male              2.84      1201                337
## 10 S00010        28 Male              3.02      1415                138
## 11 S00011        28 Female            2.95      1120                594
## 12 S00012        25 Female            2.54      1070                236
## 13 S00013        22 Female            2.06      1217                648
## 14 S00014        21 Male              3.21      1112                794
## 15 S00015        25 Male              2.79      1152                  3
## # ℹ 14 more variables: University_GPA <dbl>, Field_of_Study <chr>,
## #   Internships_Completed <dbl>, Projects_Completed <dbl>,
## #   Certifications <dbl>, Soft_Skills_Score <dbl>, Networking_Score <dbl>,
## #   Job_Offers <dbl>, Starting_Salary <dbl>, Career_Satisfaction <dbl>,
## #   Years_to_Promotion <dbl>, Current_Job_Level <chr>, Work_Life_Balance <dbl>,
## #   Entrepreneurship <chr>

Write a user defined function using any of the variables from the dataset.

library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ purrr     1.0.2
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
#Build a smaller data frame holding only the student ID and the two GPA columns.
studentgpa <- edusuccess %>%
  select(Student_ID, High_School_GPA, University_GPA)

#Create user defined function.
#Label each university GPA as co-op eligible or not.
#  University_GPA: numeric vector of GPAs.
#  threshold: GPA that must be exceeded to be eligible (defaults to 3.3).
#Returns one label per GPA; a GPA equal to the threshold is not eligible
#because the comparison is strict, and a missing GPA stays missing.
coopeligibility <- function(University_GPA, threshold = 3.3) {
  ifelse(University_GPA > threshold, "Co-op Eligible", "Not Co-op Eligible")
}
  
#Apply the function to every student's university GPA and store the labels.
studentgpa$Coop_Students <- coopeligibility(studentgpa$University_GPA)

#Show the first six rows as a formatted table.
library(knitr)
kable(head(studentgpa))
Student_ID High_School_GPA University_GPA Coop_Students
S00001 3.58 3.96 Co-op Eligible
S00002 2.52 3.63 Co-op Eligible
S00003 3.42 2.63 Not Co-op Eligible
S00004 2.43 2.81 Not Co-op Eligible
S00005 2.08 2.48 Not Co-op Eligible
S00006 2.40 3.78 Co-op Eligible

Use data manipulation techniques and filter rows based on any logical criteria that exist in your dataset.

#Copy the eligibility label into the main data frame.
edusuccess$Coop_Students <- studentgpa$Coop_Students

#Keep only Medicine students who are co-op eligible.
medcoopstudents <- edusuccess %>%
  filter(
    Field_of_Study == "Medicine",
    Coop_Students == "Co-op Eligible"
  )

#View.
library(knitr)
kable(head(medcoopstudents))
Student_ID Age Gender High_School_GPA SAT_Score University_Ranking University_GPA Field_of_Study Internships_Completed Projects_Completed Certifications Soft_Skills_Score Networking_Score Job_Offers Starting_Salary Career_Satisfaction Years_to_Promotion Current_Job_Level Work_Life_Balance Entrepreneurship Coop_Students
S00024 19 Female 2.88 1245 201 3.36 Medicine 3 0 3 9 10 2 48100 3 4 Mid 5 No Co-op Eligible
S00026 22 Female 3.65 1142 342 3.69 Medicine 2 2 1 2 8 4 48100 3 2 Entry 5 No Co-op Eligible
S00029 27 Male 3.73 1340 216 3.33 Medicine 1 0 3 1 9 3 50300 8 2 Mid 6 No Co-op Eligible
S00043 26 Female 2.36 1355 98 3.75 Medicine 2 4 3 3 8 0 38400 10 4 Senior 7 No Co-op Eligible
S00081 29 Male 3.03 1546 767 3.96 Medicine 3 4 3 2 3 2 51200 3 3 Entry 5 No Co-op Eligible
S00116 28 Female 2.10 916 78 3.79 Medicine 2 3 0 4 4 4 89100 9 5 Senior 3 Yes Co-op Eligible

Identify the dependent and independent variables and use reshaping techniques and create a new data frame by joining those variables from your dataset.

#Identify the variables; Dependent = Starting_Salary, Independent = Field_of_Study, Certifications, University_GPA, Soft_Skills_Score

#Create data frames.
#Both selections read from edusuccess so the rows of the two pieces line up.
#Select the dependent variable (Starting_Salary)
dependent <- edusuccess %>% select(Starting_Salary)

#Select the independent variables (Field_of_Study, Certifications, University_GPA, Soft_Skills_Score)
independent <- edusuccess %>%
  select(Field_of_Study, Certifications, University_GPA, Soft_Skills_Score)

#Combine rows using rbind.
#NOTE(review): each named NA is added as an extra row below the salaries, and the result is not used later - confirm intent.
combined_data_by_rows <- as.data.frame(rbind(dependent, Field_of_Study = NA, Certifications = NA, University_GPA = NA, Soft_Skills_Score = NA))

#Combine columns using cbind: the dependent variable first, then the independent variables.
salarynew <- as.data.frame(cbind(dependent, independent))

#View.
kable(head(salarynew))
Starting_Salary Field_of_Study Certifications University_GPA Soft_Skills_Score
27200 Arts 2 3.96 9
25000 Law 3 3.63 8
42400 Medicine 1 2.63 1
57400 Computer Science 1 2.81 10
47600 Engineering 4 2.48 10
68400 Law 2 3.78 2

Remove missing values in your dataset.

library("knitr")

#Count the missing values in each column.
edusuccess %>%
  is.na() %>%
  colSums()
##            Student_ID                   Age                Gender 
##                     0                     0                     0 
##       High_School_GPA             SAT_Score    University_Ranking 
##                     0                     0                     0 
##        University_GPA        Field_of_Study Internships_Completed 
##                     0                     0                     0 
##    Projects_Completed        Certifications     Soft_Skills_Score 
##                     0                     0                     0 
##      Networking_Score            Job_Offers       Starting_Salary 
##                     0                     0                     0 
##   Career_Satisfaction    Years_to_Promotion     Current_Job_Level 
##                     0                     0                     0 
##     Work_Life_Balance      Entrepreneurship         Coop_Students 
##                     0                     0                     0
#No missing values were found.

#Remove rows that contain missing values, if any (drop_na() comes from tidyr, attached with the tidyverse).
edusuccess <- edusuccess %>% drop_na()

#Confirm the number of non-missing values remaining in each column.
colSums(!is.na(edusuccess))
##            Student_ID                   Age                Gender 
##                  5000                  5000                  5000 
##       High_School_GPA             SAT_Score    University_Ranking 
##                  5000                  5000                  5000 
##        University_GPA        Field_of_Study Internships_Completed 
##                  5000                  5000                  5000 
##    Projects_Completed        Certifications     Soft_Skills_Score 
##                  5000                  5000                  5000 
##      Networking_Score            Job_Offers       Starting_Salary 
##                  5000                  5000                  5000 
##   Career_Satisfaction    Years_to_Promotion     Current_Job_Level 
##                  5000                  5000                  5000 
##     Work_Life_Balance      Entrepreneurship         Coop_Students 
##                  5000                  5000                  5000
#View the first six rows.
kable(head(edusuccess, n = 6))
Student_ID Age Gender High_School_GPA SAT_Score University_Ranking University_GPA Field_of_Study Internships_Completed Projects_Completed Certifications Soft_Skills_Score Networking_Score Job_Offers Starting_Salary Career_Satisfaction Years_to_Promotion Current_Job_Level Work_Life_Balance Entrepreneurship Coop_Students
S00001 24 Male 3.58 1052 291 3.96 Arts 3 7 2 9 8 5 27200 4 5 Entry 7 No Co-op Eligible
S00002 21 Other 2.52 1211 112 3.63 Law 4 7 3 8 1 4 25000 1 1 Mid 7 No Co-op Eligible
S00003 28 Female 3.42 1193 715 2.63 Medicine 4 8 1 1 9 0 42400 9 3 Entry 7 No Not Co-op Eligible
S00004 25 Male 2.43 1497 170 2.81 Computer Science 3 9 1 10 6 1 57400 7 5 Mid 5 No Not Co-op Eligible
S00005 22 Male 2.08 1012 599 2.48 Engineering 4 6 4 10 9 4 47600 9 5 Entry 2 No Not Co-op Eligible
S00006 24 Male 2.40 1600 631 3.78 Law 2 3 2 2 2 1 68400 9 2 Entry 8 Yes Co-op Eligible

Identify and remove duplicated data from your dataset.

#Identify duplicates: rows that repeat an earlier row in every column.
duplicates <- edusuccess[duplicated(edusuccess),]

#Remove the duplicated rows, keeping the first occurrence of each.
edusuccess <- edusuccess[!duplicated(edusuccess), ]

#View duplicates.
library(knitr)
kable(head(duplicates))
Student_ID Age Gender High_School_GPA SAT_Score University_Ranking University_GPA Field_of_Study Internships_Completed Projects_Completed Certifications Soft_Skills_Score Networking_Score Job_Offers Starting_Salary Career_Satisfaction Years_to_Promotion Current_Job_Level Work_Life_Balance Entrepreneurship Coop_Students
#No duplicates are found.

Reorder multiple rows in descending order.

#Sort the rows from highest to lowest Starting_Salary.
edusuccessbysalary <- arrange(edusuccess, desc(Starting_Salary))

#View the first six rows of the sorted data.
library(knitr)
kable(head(edusuccessbysalary))
Student_ID Age Gender High_School_GPA SAT_Score University_Ranking University_GPA Field_of_Study Internships_Completed Projects_Completed Certifications Soft_Skills_Score Networking_Score Job_Offers Starting_Salary Career_Satisfaction Years_to_Promotion Current_Job_Level Work_Life_Balance Entrepreneurship Coop_Students
S01810 28 Other 2.34 1089 24 3.01 Arts 0 6 2 3 8 1 101000 6 4 Entry 8 No Not Co-op Eligible
S04539 21 Male 2.38 1450 943 3.93 Law 1 1 3 3 4 0 100600 6 5 Entry 8 No Co-op Eligible
S03831 20 Male 3.23 1238 234 2.82 Arts 2 7 3 5 3 5 98900 8 1 Entry 2 Yes Not Co-op Eligible
S03504 25 Male 3.99 982 459 2.70 Engineering 4 6 2 10 2 0 98200 4 4 Senior 5 No Not Co-op Eligible
S00017 20 Female 3.73 1539 116 3.78 Law 3 2 3 2 5 3 97500 8 5 Mid 9 No Co-op Eligible
S01341 25 Female 2.97 900 835 3.07 Law 3 7 4 2 9 5 96900 7 3 Entry 2 No Not Co-op Eligible

Rename some of the column names in your dataset.

#Map each old column name to its replacement.
new_names <- c(
  Gender = "Student_Gender",
  Age = "Student_Age",
  Internships_Completed = "Completed_Internships"
)

#Swap in the new name wherever an old name is present; other columns are untouched.
to_rename <- names(edusuccess) %in% names(new_names)
names(edusuccess)[to_rename] <- unname(new_names[names(edusuccess)[to_rename]])

#View the renamed columns.
library(knitr)
kable(head(edusuccess))
Student_ID Student_Age Student_Gender High_School_GPA SAT_Score University_Ranking University_GPA Field_of_Study Completed_Internships Projects_Completed Certifications Soft_Skills_Score Networking_Score Job_Offers Starting_Salary Career_Satisfaction Years_to_Promotion Current_Job_Level Work_Life_Balance Entrepreneurship Coop_Students
S00001 24 Male 3.58 1052 291 3.96 Arts 3 7 2 9 8 5 27200 4 5 Entry 7 No Co-op Eligible
S00002 21 Other 2.52 1211 112 3.63 Law 4 7 3 8 1 4 25000 1 1 Mid 7 No Co-op Eligible
S00003 28 Female 3.42 1193 715 2.63 Medicine 4 8 1 1 9 0 42400 9 3 Entry 7 No Not Co-op Eligible
S00004 25 Male 2.43 1497 170 2.81 Computer Science 3 9 1 10 6 1 57400 7 5 Mid 5 No Not Co-op Eligible
S00005 22 Male 2.08 1012 599 2.48 Engineering 4 6 4 10 9 4 47600 9 5 Entry 2 No Not Co-op Eligible
S00006 24 Male 2.40 1600 631 3.78 Law 2 3 2 2 2 1 68400 9 2 Entry 8 Yes Co-op Eligible

Add new variables in your data frame by using a mathematical function.

#Add Monthly_Salary, the starting salary divided by 12.
edusuccess <- mutate(edusuccess, Monthly_Salary = Starting_Salary / 12)

#View.
library(knitr)
kable(head(edusuccess))
Student_ID Student_Age Student_Gender High_School_GPA SAT_Score University_Ranking University_GPA Field_of_Study Completed_Internships Projects_Completed Certifications Soft_Skills_Score Networking_Score Job_Offers Starting_Salary Career_Satisfaction Years_to_Promotion Current_Job_Level Work_Life_Balance Entrepreneurship Coop_Students Monthly_Salary
S00001 24 Male 3.58 1052 291 3.96 Arts 3 7 2 9 8 5 27200 4 5 Entry 7 No Co-op Eligible 2266.667
S00002 21 Other 2.52 1211 112 3.63 Law 4 7 3 8 1 4 25000 1 1 Mid 7 No Co-op Eligible 2083.333
S00003 28 Female 3.42 1193 715 2.63 Medicine 4 8 1 1 9 0 42400 9 3 Entry 7 No Not Co-op Eligible 3533.333
S00004 25 Male 2.43 1497 170 2.81 Computer Science 3 9 1 10 6 1 57400 7 5 Mid 5 No Not Co-op Eligible 4783.333
S00005 22 Male 2.08 1012 599 2.48 Engineering 4 6 4 10 9 4 47600 9 5 Entry 2 No Not Co-op Eligible 3966.667
S00006 24 Male 2.40 1600 631 3.78 Law 2 3 2 2 2 1 68400 9 2 Entry 8 Yes Co-op Eligible 5700.000

Create a training set using a random number generator engine.

#Fix the random seed so the samples drawn below are reproducible.
set.seed(1234)

#Draw 5 rows at random, without replacement.
sample_n(edusuccess, 5, replace = FALSE)
## # A tibble: 5 × 22
##   Student_ID Student_Age Student_Gender High_School_GPA SAT_Score
##   <chr>            <dbl> <chr>                    <dbl>     <dbl>
## 1 S01004              24 Female                    3.01      1000
## 2 S00623              26 Female                    3.23      1405
## 3 S02693              25 Male                      3.7        943
## 4 S00934              29 Female                    2.91      1442
## 5 S04496              18 Male                      3.9       1534
## # ℹ 17 more variables: University_Ranking <dbl>, University_GPA <dbl>,
## #   Field_of_Study <chr>, Completed_Internships <dbl>,
## #   Projects_Completed <dbl>, Certifications <dbl>, Soft_Skills_Score <dbl>,
## #   Networking_Score <dbl>, Job_Offers <dbl>, Starting_Salary <dbl>,
## #   Career_Satisfaction <dbl>, Years_to_Promotion <dbl>,
## #   Current_Job_Level <chr>, Work_Life_Balance <dbl>, Entrepreneurship <chr>,
## #   Coop_Students <chr>, Monthly_Salary <dbl>
#Extract 5% of rows, randomly without replacement, and keep them as the training set.
training_set <- edusuccess %>% sample_frac(0.05, replace = FALSE)

#Print the training set.
training_set
## # A tibble: 250 × 22
##    Student_ID Student_Age Student_Gender High_School_GPA SAT_Score
##    <chr>            <dbl> <chr>                    <dbl>     <dbl>
##  1 S02948              18 Female                    2.7       1067
##  2 S02146              27 Male                      3.43      1431
##  3 S03175              26 Female                    3.41       956
##  4 S02774              24 Male                      2.09      1033
##  5 S02374              18 Male                      2.36      1124
##  6 S01103              21 Male                      3.12      1179
##  7 S04046              21 Male                      3.62      1300
##  8 S04366              19 Male                      2.69      1248
##  9 S03454              27 Male                      2.43      1022
## 10 S02232              25 Male                      3.52      1045
## # ℹ 240 more rows
## # ℹ 17 more variables: University_Ranking <dbl>, University_GPA <dbl>,
## #   Field_of_Study <chr>, Completed_Internships <dbl>,
## #   Projects_Completed <dbl>, Certifications <dbl>, Soft_Skills_Score <dbl>,
## #   Networking_Score <dbl>, Job_Offers <dbl>, Starting_Salary <dbl>,
## #   Career_Satisfaction <dbl>, Years_to_Promotion <dbl>,
## #   Current_Job_Level <chr>, Work_Life_Balance <dbl>, Entrepreneurship <chr>, …

Print the summary statistics of your dataset.

#Summarise every column: quartiles and mean for numeric columns, length/class/mode for character columns.
edusuccess %>% summary()
##   Student_ID         Student_Age    Student_Gender     High_School_GPA
##  Length:5000        Min.   :18.00   Length:5000        Min.   :2.000  
##  Class :character   1st Qu.:20.00   Class :character   1st Qu.:2.500  
##  Mode  :character   Median :23.00   Mode  :character   Median :2.990  
##                     Mean   :23.44                      Mean   :2.997  
##                     3rd Qu.:26.00                      3rd Qu.:3.500  
##                     Max.   :29.00                      Max.   :4.000  
##    SAT_Score    University_Ranking University_GPA Field_of_Study    
##  Min.   : 900   Min.   :   1.0     Min.   :2.00   Length:5000       
##  1st Qu.:1076   1st Qu.: 256.0     1st Qu.:2.52   Class :character  
##  Median :1257   Median : 501.5     Median :3.03   Mode  :character  
##  Mean   :1254   Mean   : 504.3     Mean   :3.02                     
##  3rd Qu.:1432   3rd Qu.: 759.0     3rd Qu.:3.51                     
##  Max.   :1600   Max.   :1000.0     Max.   :4.00                     
##  Completed_Internships Projects_Completed Certifications  Soft_Skills_Score
##  Min.   :0.000         Min.   :0.000      Min.   :0.000   Min.   : 1.000   
##  1st Qu.:1.000         1st Qu.:2.000      1st Qu.:1.000   1st Qu.: 3.000   
##  Median :2.000         Median :5.000      Median :3.000   Median : 6.000   
##  Mean   :1.982         Mean   :4.563      Mean   :2.512   Mean   : 5.546   
##  3rd Qu.:3.000         3rd Qu.:7.000      3rd Qu.:4.000   3rd Qu.: 8.000   
##  Max.   :4.000         Max.   :9.000      Max.   :5.000   Max.   :10.000   
##  Networking_Score   Job_Offers    Starting_Salary  Career_Satisfaction
##  Min.   : 1.000   Min.   :0.000   Min.   : 25000   Min.   : 1.000     
##  1st Qu.: 3.000   1st Qu.:1.000   1st Qu.: 40200   1st Qu.: 3.000     
##  Median : 6.000   Median :2.000   Median : 50300   Median : 6.000     
##  Mean   : 5.538   Mean   :2.489   Mean   : 50564   Mean   : 5.578     
##  3rd Qu.: 8.000   3rd Qu.:4.000   3rd Qu.: 60500   3rd Qu.: 8.000     
##  Max.   :10.000   Max.   :5.000   Max.   :101000   Max.   :10.000     
##  Years_to_Promotion Current_Job_Level  Work_Life_Balance Entrepreneurship  
##  Min.   :1.000      Length:5000        Min.   : 1.000    Length:5000       
##  1st Qu.:2.000      Class :character   1st Qu.: 3.000    Class :character  
##  Median :3.000      Mode  :character   Median : 6.000    Mode  :character  
##  Mean   :3.016                         Mean   : 5.482                      
##  3rd Qu.:4.000                         3rd Qu.: 8.000                      
##  Max.   :5.000                         Max.   :10.000                      
##  Coop_Students      Monthly_Salary
##  Length:5000        Min.   :2083  
##  Class :character   1st Qu.:3350  
##  Mode  :character   Median :4192  
##                     Mean   :4214  
##                     3rd Qu.:5042  
##                     Max.   :8417

Use any of the numerical variables from the dataset and perform the following statistical functions: Mean, Median, Mode, Range.

#The statistics below read from edusuccess, the working data frame used by the rest of the analysis.
#Calculate mean for Starting_Salary.
mean_salary <- mean(edusuccess$Starting_Salary)
print(paste("Average Starting Salary:", mean_salary))
## [1] "Average Starting Salary: 50563.54"
#Calculate median for Starting_Salary.
median_salary <- median(edusuccess$Starting_Salary)
print(paste("Median of Starting Salary:", median_salary))
## [1] "Median of Starting Salary: 50300"
#Calculate mode for Starting_Salary: tabulate each distinct salary and take the most frequent one.
Frequency_mode_salary_table <- table(edusuccess$Starting_Salary)
mode_salary <- as.numeric(names(Frequency_mode_salary_table)[which.max(Frequency_mode_salary_table)])
#Frequency of the mode (how many rows have that salary).
mode_frequency <- max(Frequency_mode_salary_table)
print(paste("Mode of Starting Salary:", mode_salary))
## [1] "Mode of Starting Salary: 25000"
#Calculate the range for Starting_Salary.
#Get the range (min and max).
range_Salary <- range(edusuccess$Starting_Salary)
#Calculate the difference (max - min).
range_diff <- range_Salary[2] - range_Salary[1]
print(paste("Range of Starting Salary:", range_diff))
## [1] "Range of Starting Salary: 76000"

Plot a scatter plot for any 2 variables in your dataset.

library(ggplot2)

# Scatter plot of Starting_Salary (y) against University_GPA (x), drawn with small blue points.
ggplot(edusuccess, aes(x = University_GPA, y = Starting_Salary)) +
  geom_point(size = 1, color = "blue", shape = 20)

Plot a bar plot for any 2 variables in your dataset.

#Plot a bar plot of the average Starting_Salary for each Field_of_Study.
#stat = "summary" with fun = "mean" draws one bar per field at the group mean,
#instead of stacking every student's salary into a per-field total.
ggplot(edusuccess, aes(x = Field_of_Study, y = Starting_Salary)) +
  geom_bar(stat = "summary", fun = "mean", fill = "limegreen") +
  labs(
    title = "Average Starting Salary by Field of Study",
    x = "Field of Study",
    y = "Average Starting Salary"
  )

Find the correlation between any 2 variables by applying a least squares linear regression model.

library("knitr")

#Fit the least-squares linear regression of Starting_Salary on University_GPA.
#The fitted model is stored so it can be inspected with summary(salary_gpa_model).
salary_gpa_model <- lm(Starting_Salary ~ University_GPA, data = edusuccess)

#Calculate the Pearson correlation coefficient for Starting_Salary and University_GPA.
edusuccesscorr <- cor(edusuccess$Starting_Salary, edusuccess$University_GPA, method = "pearson")

#Print correlation coefficient.
kable(head(edusuccesscorr))
x
0.0010225