R Practice V2

R programming analysis for COMP4033

Import the data set from education_career_success.csv.

library(readr)
#Read the CSV from its local Windows path into a tibble.
education_career_success <- read_csv(
  file = "C:\\Users\\Lenovo ThinkPad X1\\Downloads\\archive (2)\\education_career_success.csv"
)

## Rows: 5000 Columns: 20
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (5): Student_ID, Gender, Field_of_Study, Current_Job_Level, Entrepreneu...
## dbl (15): Age, High_School_GPA, SAT_Score, University_Ranking, University_GP...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

View structure of the dataset.

#Work on a shorter-named copy; later steps modify edusuccess, not the raw import.
edusuccess <- education_career_success

#Show each column's type together with a preview of its first values.
str(edusuccess)

## spc_tbl_ [5,000 × 20] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ Student_ID           : chr [1:5000] "S00001" "S00002" "S00003" "S00004" ...
##  $ Age                  : num [1:5000] 24 21 28 25 22 24 27 20 24 28 ...
##  $ Gender               : chr [1:5000] "Male" "Other" "Female" "Male" ...
##  $ High_School_GPA      : num [1:5000] 3.58 2.52 3.42 2.43 2.08 2.4 2.36 2.68 2.84 3.02 ...
##  $ SAT_Score            : num [1:5000] 1052 1211 1193 1497 1012 ...
##  $ University_Ranking   : num [1:5000] 291 112 715 170 599 631 610 240 337 138 ...
##  $ University_GPA       : num [1:5000] 3.96 3.63 2.63 2.81 2.48 3.78 3.83 2.84 3.31 2.33 ...
##  $ Field_of_Study       : chr [1:5000] "Arts" "Law" "Medicine" "Computer Science" ...
##  $ Internships_Completed: num [1:5000] 3 4 4 3 4 2 0 1 2 1 ...
##  $ Projects_Completed   : num [1:5000] 7 7 8 9 6 3 1 5 3 5 ...
##  $ Certifications       : num [1:5000] 2 3 1 1 4 2 3 5 0 3 ...
##  $ Soft_Skills_Score    : num [1:5000] 9 8 1 10 10 2 3 5 5 10 ...
##  $ Networking_Score     : num [1:5000] 8 1 9 6 9 2 3 1 5 2 ...
##  $ Job_Offers           : num [1:5000] 5 4 0 1 4 1 2 2 2 0 ...
##  $ Starting_Salary      : num [1:5000] 27200 25000 42400 57400 47600 68400 55500 38000 68900 58900 ...
##  $ Career_Satisfaction  : num [1:5000] 4 1 9 7 9 9 7 2 2 4 ...
##  $ Years_to_Promotion   : num [1:5000] 5 1 3 5 5 2 4 3 2 2 ...
##  $ Current_Job_Level    : chr [1:5000] "Entry" "Mid" "Entry" "Mid" ...
##  $ Work_Life_Balance    : num [1:5000] 7 7 7 5 2 8 3 3 2 2 ...
##  $ Entrepreneurship     : chr [1:5000] "No" "No" "No" "No" ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   Student_ID = col_character(),
##   ..   Age = col_double(),
##   ..   Gender = col_character(),
##   ..   High_School_GPA = col_double(),
##   ..   SAT_Score = col_double(),
##   ..   University_Ranking = col_double(),
##   ..   University_GPA = col_double(),
##   ..   Field_of_Study = col_character(),
##   ..   Internships_Completed = col_double(),
##   ..   Projects_Completed = col_double(),
##   ..   Certifications = col_double(),
##   ..   Soft_Skills_Score = col_double(),
##   ..   Networking_Score = col_double(),
##   ..   Job_Offers = col_double(),
##   ..   Starting_Salary = col_double(),
##   ..   Career_Satisfaction = col_double(),
##   ..   Years_to_Promotion = col_double(),
##   ..   Current_Job_Level = col_character(),
##   ..   Work_Life_Balance = col_double(),
##   ..   Entrepreneurship = col_character()
##   .. )
##  - attr(*, "problems")=<externalptr>

List the variables in your dataset.

#names() returns only the column (variable) names; list() would instead wrap
#the whole tibble in a one-element list and print the data itself.
names(edusuccess)

##  [1] "Student_ID"            "Age"                   "Gender"
##  [4] "High_School_GPA"       "SAT_Score"             "University_Ranking"
##  [7] "University_GPA"        "Field_of_Study"        "Internships_Completed"
## [10] "Projects_Completed"    "Certifications"        "Soft_Skills_Score"
## [13] "Networking_Score"      "Job_Offers"            "Starting_Salary"
## [16] "Career_Satisfaction"   "Years_to_Promotion"    "Current_Job_Level"
## [19] "Work_Life_Balance"     "Entrepreneurship"

Print the top 15 rows of your dataset.

#Show the first 15 observations.
head(x = edusuccess, n = 15)

## # A tibble: 15 × 20
##    Student_ID   Age Gender High_School_GPA SAT_Score University_Ranking
##    <chr>      <dbl> <chr>            <dbl>     <dbl>              <dbl>
##  1 S00001        24 Male              3.58      1052                291
##  2 S00002        21 Other             2.52      1211                112
##  3 S00003        28 Female            3.42      1193                715
##  4 S00004        25 Male              2.43      1497                170
##  5 S00005        22 Male              2.08      1012                599
##  6 S00006        24 Male              2.4       1600                631
##  7 S00007        27 Male              2.36      1011                610
##  8 S00008        20 Male              2.68      1074                240
##  9 S00009        24 Male              2.84      1201                337
## 10 S00010        28 Male              3.02      1415                138
## 11 S00011        28 Female            2.95      1120                594
## 12 S00012        25 Female            2.54      1070                236
## 13 S00013        22 Female            2.06      1217                648
## 14 S00014        21 Male              3.21      1112                794
## 15 S00015        25 Male              2.79      1152                  3
## # ℹ 14 more variables: University_GPA <dbl>, Field_of_Study <chr>,
## #   Internships_Completed <dbl>, Projects_Completed <dbl>,
## #   Certifications <dbl>, Soft_Skills_Score <dbl>, Networking_Score <dbl>,
## #   Job_Offers <dbl>, Starting_Salary <dbl>, Career_Satisfaction <dbl>,
## #   Years_to_Promotion <dbl>, Current_Job_Level <chr>, Work_Life_Balance <dbl>,
## #   Entrepreneurship <chr>

Write a user defined function using any of the variables from the dataset.

library(tidyverse)

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ purrr     1.0.2
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

#Build a GPA-only data frame: the student ID plus both GPA columns.
studentgpa <- select(edusuccess, Student_ID, High_School_GPA, University_GPA)

#Create user defined function.
#Labels each university GPA by co-op eligibility.
#  University_GPA: numeric vector of GPAs.
#  threshold: GPA that must be exceeded to be eligible (default 3.3).
#Returns one label per GPA. The comparison is strict, so a GPA exactly equal
#to the threshold is labelled "Not Co-op Eligible"; NA GPAs stay NA.
coopeligibility <- function(University_GPA, threshold = 3.3) {
  ifelse(University_GPA > threshold, "Co-op Eligible", "Not Co-op Eligible")
}
  
#Test the user defined function.
#Adds one eligibility label per student as a new column.
studentgpa$Coop_Students <- coopeligibility(studentgpa$University_GPA)

#View.
library(knitr)
kable(head(studentgpa))

Student_ID	High_School_GPA	University_GPA	Coop_Students
S00001	3.58	3.96	Co-op Eligible
S00002	2.52	3.63	Co-op Eligible
S00003	3.42	2.63	Not Co-op Eligible
S00004	2.43	2.81	Not Co-op Eligible
S00005	2.08	2.48	Not Co-op Eligible
S00006	2.40	3.78	Co-op Eligible

Use data manipulation techniques and filter rows based on any logical criteria that exist in your dataset.

#Add Coop_Students to the main data frame.
#Computed from edusuccess's own University_GPA so the labels cannot drift out
#of line with the rows (copying the column from studentgpa relied on both
#data frames sharing the same row order).
edusuccess$Coop_Students <- coopeligibility(edusuccess$University_GPA)

#Filter by Field_of_Study and Coop_Students.
#Both conditions must hold: Medicine students who are co-op eligible.
medcoopstudents <- edusuccess %>%
  filter(Field_of_Study == "Medicine", Coop_Students == "Co-op Eligible")

#View.
library(knitr)
kable(head(medcoopstudents))

Student_ID	Age	Gender	High_School_GPA	SAT_Score	University_Ranking	University_GPA	Field_of_Study	Internships_Completed	Projects_Completed	Certifications	Soft_Skills_Score	Networking_Score	Job_Offers	Starting_Salary	Career_Satisfaction	Years_to_Promotion	Current_Job_Level	Work_Life_Balance	Entrepreneurship	Coop_Students
S00024	19	Female	2.88	1245	201	3.36	Medicine	3	0	3	9	10	2	48100	3	4	Mid	5	No	Co-op Eligible
S00026	22	Female	3.65	1142	342	3.69	Medicine	2	2	1	2	8	4	48100	3	2	Entry	5	No	Co-op Eligible
S00029	27	Male	3.73	1340	216	3.33	Medicine	1	0	3	1	9	3	50300	8	2	Mid	6	No	Co-op Eligible
S00043	26	Female	2.36	1355	98	3.75	Medicine	2	4	3	3	8	0	38400	10	4	Senior	7	No	Co-op Eligible
S00081	29	Male	3.03	1546	767	3.96	Medicine	3	4	3	2	3	2	51200	3	3	Entry	5	No	Co-op Eligible
S00116	28	Female	2.10	916	78	3.79	Medicine	2	3	0	4	4	4	89100	9	5	Senior	3	Yes	Co-op Eligible

Identify the dependent and independent variables and use reshaping techniques and create a new data frame by joining those variables from your dataset.

#Identify the variables; Dependent = Starting_Salary, Independent = Field_of_Study, Certifications, University_GPA, Soft_Skills_Score

#Create data frames.
#Select the dependent variable (Starting_Salary)
dependent <- edusuccess %>% select(Starting_Salary)

#Select the independent variables (Field_of_Study, Certifications, University_GPA, Soft_Skills_Score)
#Taken from edusuccess (not the raw import) so both pieces come from the same
#data frame and stay row-aligned for the cbind below.
independent <- edusuccess %>%
  select(Field_of_Study, Certifications, University_GPA, Soft_Skills_Score)

#Combine rows using rbind.
#NOTE(review): this appears to append four NA rows (one per named argument) to
#the one-column dependent frame rather than stacking the independent data, and
#the result is not used elsewhere in the visible document - confirm intent.
combined_data_by_rows <- as.data.frame(rbind(dependent, Field_of_Study = NA, Certifications = NA, University_GPA = NA, Soft_Skills_Score = NA))

#Combine columns using cbind.
#Places the dependent column first, followed by the four independent columns.
salarynew <- as.data.frame(cbind(dependent, independent))

#View.
kable(head(salarynew))

Starting_Salary	Field_of_Study	Certifications	University_GPA	Soft_Skills_Score
27200	Arts	2	3.96	9
25000	Law	3	3.63	8
42400	Medicine	1	2.63	1
57400	Computer Science	1	2.81	10
47600	Engineering	4	2.48	10
68400	Law	2	3.78	2

Remove missing values in your dataset.

library("knitr")

#Count the NA entries in each column.
edusuccess %>% is.na() %>% colSums()

##            Student_ID                   Age                Gender 
##                     0                     0                     0 
##       High_School_GPA             SAT_Score    University_Ranking 
##                     0                     0                     0 
##        University_GPA        Field_of_Study Internships_Completed 
##                     0                     0                     0 
##    Projects_Completed        Certifications     Soft_Skills_Score 
##                     0                     0                     0 
##      Networking_Score            Job_Offers       Starting_Salary 
##                     0                     0                     0 
##   Career_Satisfaction    Years_to_Promotion     Current_Job_Level 
##                     0                     0                     0 
##     Work_Life_Balance      Entrepreneurship         Coop_Students 
##                     0                     0                     0

#No missing values were found.

#Remove missing values, if any.
#drop_na() with no columns named drops every row that has an NA in any column.
edusuccess <- edusuccess %>% drop_na()

#Count the non-missing values left in each column.
colSums(!is.na(edusuccess))

##            Student_ID                   Age                Gender 
##                  5000                  5000                  5000 
##       High_School_GPA             SAT_Score    University_Ranking 
##                  5000                  5000                  5000 
##        University_GPA        Field_of_Study Internships_Completed 
##                  5000                  5000                  5000 
##    Projects_Completed        Certifications     Soft_Skills_Score 
##                  5000                  5000                  5000 
##      Networking_Score            Job_Offers       Starting_Salary 
##                  5000                  5000                  5000 
##   Career_Satisfaction    Years_to_Promotion     Current_Job_Level 
##                  5000                  5000                  5000 
##     Work_Life_Balance      Entrepreneurship         Coop_Students 
##                  5000                  5000                  5000

#Render the first six rows as a table.
edusuccess %>% head() %>% kable()

Student_ID	Age	Gender	High_School_GPA	SAT_Score	University_Ranking	University_GPA	Field_of_Study	Internships_Completed	Projects_Completed	Certifications	Soft_Skills_Score	Networking_Score	Job_Offers	Starting_Salary	Career_Satisfaction	Years_to_Promotion	Current_Job_Level	Work_Life_Balance	Entrepreneurship	Coop_Students
S00001	24	Male	3.58	1052	291	3.96	Arts	3	7	2	9	8	5	27200	4	5	Entry	7	No	Co-op Eligible
S00002	21	Other	2.52	1211	112	3.63	Law	4	7	3	8	1	4	25000	1	1	Mid	7	No	Co-op Eligible
S00003	28	Female	3.42	1193	715	2.63	Medicine	4	8	1	1	9	0	42400	9	3	Entry	7	No	Not Co-op Eligible
S00004	25	Male	2.43	1497	170	2.81	Computer Science	3	9	1	10	6	1	57400	7	5	Mid	5	No	Not Co-op Eligible
S00005	22	Male	2.08	1012	599	2.48	Engineering	4	6	4	10	9	4	47600	9	5	Entry	2	No	Not Co-op Eligible
S00006	24	Male	2.40	1600	631	3.78	Law	2	3	2	2	2	1	68400	9	2	Entry	8	Yes	Co-op Eligible

Identify and remove duplicated data from your dataset.

#Identify duplicates.
#duplicated() flags every row that repeats an earlier row exactly.
duplicates <- edusuccess[duplicated(edusuccess),]

#Remove duplicates, keeping the first occurrence of each row.
edusuccess <- edusuccess[!duplicated(edusuccess), ]

#View duplicates.
library(knitr)
kable(head(duplicates))

Student_ID	Age	Gender	High_School_GPA	SAT_Score	University_Ranking	University_GPA	Field_of_Study	Internships_Completed	Projects_Completed	Certifications	Soft_Skills_Score	Networking_Score	Job_Offers	Starting_Salary	Career_Satisfaction	Years_to_Promotion	Current_Job_Level	Work_Life_Balance	Entrepreneurship	Coop_Students

#No duplicates are found.

Reorder multiple rows in descending order.

#Sort the rows so the highest Starting_Salary comes first.
edusuccessbysalary <- arrange(edusuccess, desc(Starting_Salary))

library(knitr)
edusuccessbysalary %>% head() %>% kable()

Student_ID	Age	Gender	High_School_GPA	SAT_Score	University_Ranking	University_GPA	Field_of_Study	Internships_Completed	Projects_Completed	Certifications	Soft_Skills_Score	Networking_Score	Job_Offers	Starting_Salary	Career_Satisfaction	Years_to_Promotion	Current_Job_Level	Work_Life_Balance	Entrepreneurship	Coop_Students
S01810	28	Other	2.34	1089	24	3.01	Arts	0	6	2	3	8	1	101000	6	4	Entry	8	No	Not Co-op Eligible
S04539	21	Male	2.38	1450	943	3.93	Law	1	1	3	3	4	0	100600	6	5	Entry	8	No	Co-op Eligible
S03831	20	Male	3.23	1238	234	2.82	Arts	2	7	3	5	3	5	98900	8	1	Entry	2	Yes	Not Co-op Eligible
S03504	25	Male	3.99	982	459	2.70	Engineering	4	6	2	10	2	0	98200	4	4	Senior	5	No	Not Co-op Eligible
S00017	20	Female	3.73	1539	116	3.78	Law	3	2	3	2	5	3	97500	8	5	Mid	9	No	Co-op Eligible
S01341	25	Female	2.97	900	835	3.07	Law	3	7	4	2	9	5	96900	7	3	Entry	2	No	Not Co-op Eligible

Rename some of the column names in your dataset.

#Swap three column names for new ones; a name that is absent is simply left alone.
names(edusuccess) <- names(edusuccess) %>%
  replace(. == "Gender", "Student_Gender") %>%
  replace(. == "Age", "Student_Age") %>%
  replace(. == "Internships_Completed", "Completed_Internships")

library(knitr)
edusuccess %>% head() %>% kable()

Student_ID	Student_Age	Student_Gender	High_School_GPA	SAT_Score	University_Ranking	University_GPA	Field_of_Study	Completed_Internships	Projects_Completed	Certifications	Soft_Skills_Score	Networking_Score	Job_Offers	Starting_Salary	Career_Satisfaction	Years_to_Promotion	Current_Job_Level	Work_Life_Balance	Entrepreneurship	Coop_Students
S00001	24	Male	3.58	1052	291	3.96	Arts	3	7	2	9	8	5	27200	4	5	Entry	7	No	Co-op Eligible
S00002	21	Other	2.52	1211	112	3.63	Law	4	7	3	8	1	4	25000	1	1	Mid	7	No	Co-op Eligible
S00003	28	Female	3.42	1193	715	2.63	Medicine	4	8	1	1	9	0	42400	9	3	Entry	7	No	Not Co-op Eligible
S00004	25	Male	2.43	1497	170	2.81	Computer Science	3	9	1	10	6	1	57400	7	5	Mid	5	No	Not Co-op Eligible
S00005	22	Male	2.08	1012	599	2.48	Engineering	4	6	4	10	9	4	47600	9	5	Entry	2	No	Not Co-op Eligible
S00006	24	Male	2.40	1600	631	3.78	Law	2	3	2	2	2	1	68400	9	2	Entry	8	Yes	Co-op Eligible

Add new variables in your data frame by using a mathematical function.

#Append Monthly_Salary: the starting salary divided by 12.
edusuccess$Monthly_Salary <- edusuccess$Starting_Salary / 12

#Show the first rows, including the new column.
library(knitr)
edusuccess %>% head() %>% kable()

Student_ID	Student_Age	Student_Gender	High_School_GPA	SAT_Score	University_Ranking	University_GPA	Field_of_Study	Completed_Internships	Projects_Completed	Certifications	Soft_Skills_Score	Networking_Score	Job_Offers	Starting_Salary	Career_Satisfaction	Years_to_Promotion	Current_Job_Level	Work_Life_Balance	Entrepreneurship	Coop_Students	Monthly_Salary
S00001	24	Male	3.58	1052	291	3.96	Arts	3	7	2	9	8	5	27200	4	5	Entry	7	No	Co-op Eligible	2266.667
S00002	21	Other	2.52	1211	112	3.63	Law	4	7	3	8	1	4	25000	1	1	Mid	7	No	Co-op Eligible	2083.333
S00003	28	Female	3.42	1193	715	2.63	Medicine	4	8	1	1	9	0	42400	9	3	Entry	7	No	Not Co-op Eligible	3533.333
S00004	25	Male	2.43	1497	170	2.81	Computer Science	3	9	1	10	6	1	57400	7	5	Mid	5	No	Not Co-op Eligible	4783.333
S00005	22	Male	2.08	1012	599	2.48	Engineering	4	6	4	10	9	4	47600	9	5	Entry	2	No	Not Co-op Eligible	3966.667
S00006	24	Male	2.40	1600	631	3.78	Law	2	3	2	2	2	1	68400	9	2	Entry	8	Yes	Co-op Eligible	5700.000

Create a training set using a random number generator engine.

#Seed the random number generator so the draws below are reproducible.
set.seed(1234)

#Draw 5 rows at random; no row can be picked twice.
sample_n(edusuccess, size = 5, replace = FALSE)

## # A tibble: 5 × 22
##   Student_ID Student_Age Student_Gender High_School_GPA SAT_Score
##   <chr>            <dbl> <chr>                    <dbl>     <dbl>
## 1 S01004              24 Female                    3.01      1000
## 2 S00623              26 Female                    3.23      1405
## 3 S02693              25 Male                      3.7        943
## 4 S00934              29 Female                    2.91      1442
## 5 S04496              18 Male                      3.9       1534
## # ℹ 17 more variables: University_Ranking <dbl>, University_GPA <dbl>,
## #   Field_of_Study <chr>, Completed_Internships <dbl>,
## #   Projects_Completed <dbl>, Certifications <dbl>, Soft_Skills_Score <dbl>,
## #   Networking_Score <dbl>, Job_Offers <dbl>, Starting_Salary <dbl>,
## #   Career_Satisfaction <dbl>, Years_to_Promotion <dbl>,
## #   Current_Job_Level <chr>, Work_Life_Balance <dbl>, Entrepreneurship <chr>,
## #   Coop_Students <chr>, Monthly_Salary <dbl>

#Extract 5% of rows, randomly without replacement.
#Stored as training_set so the sample can be reused rather than only printed.
training_set <- edusuccess %>% sample_frac(0.05, replace = FALSE)
training_set

## # A tibble: 250 × 22
##    Student_ID Student_Age Student_Gender High_School_GPA SAT_Score
##    <chr>            <dbl> <chr>                    <dbl>     <dbl>
##  1 S02948              18 Female                    2.7       1067
##  2 S02146              27 Male                      3.43      1431
##  3 S03175              26 Female                    3.41       956
##  4 S02774              24 Male                      2.09      1033
##  5 S02374              18 Male                      2.36      1124
##  6 S01103              21 Male                      3.12      1179
##  7 S04046              21 Male                      3.62      1300
##  8 S04366              19 Male                      2.69      1248
##  9 S03454              27 Male                      2.43      1022
## 10 S02232              25 Male                      3.52      1045
## # ℹ 240 more rows
## # ℹ 17 more variables: University_Ranking <dbl>, University_GPA <dbl>,
## #   Field_of_Study <chr>, Completed_Internships <dbl>,
## #   Projects_Completed <dbl>, Certifications <dbl>, Soft_Skills_Score <dbl>,
## #   Networking_Score <dbl>, Job_Offers <dbl>, Starting_Salary <dbl>,
## #   Career_Satisfaction <dbl>, Years_to_Promotion <dbl>,
## #   Current_Job_Level <chr>, Work_Life_Balance <dbl>, Entrepreneurship <chr>, …

Print the summary statistics of your dataset.

#Print a per-column summary of the data frame.
edusuccess %>% summary()

##   Student_ID         Student_Age    Student_Gender     High_School_GPA
##  Length:5000        Min.   :18.00   Length:5000        Min.   :2.000  
##  Class :character   1st Qu.:20.00   Class :character   1st Qu.:2.500  
##  Mode  :character   Median :23.00   Mode  :character   Median :2.990  
##                     Mean   :23.44                      Mean   :2.997  
##                     3rd Qu.:26.00                      3rd Qu.:3.500  
##                     Max.   :29.00                      Max.   :4.000  
##    SAT_Score    University_Ranking University_GPA Field_of_Study    
##  Min.   : 900   Min.   :   1.0     Min.   :2.00   Length:5000       
##  1st Qu.:1076   1st Qu.: 256.0     1st Qu.:2.52   Class :character  
##  Median :1257   Median : 501.5     Median :3.03   Mode  :character  
##  Mean   :1254   Mean   : 504.3     Mean   :3.02                     
##  3rd Qu.:1432   3rd Qu.: 759.0     3rd Qu.:3.51                     
##  Max.   :1600   Max.   :1000.0     Max.   :4.00                     
##  Completed_Internships Projects_Completed Certifications  Soft_Skills_Score
##  Min.   :0.000         Min.   :0.000      Min.   :0.000   Min.   : 1.000   
##  1st Qu.:1.000         1st Qu.:2.000      1st Qu.:1.000   1st Qu.: 3.000   
##  Median :2.000         Median :5.000      Median :3.000   Median : 6.000   
##  Mean   :1.982         Mean   :4.563      Mean   :2.512   Mean   : 5.546   
##  3rd Qu.:3.000         3rd Qu.:7.000      3rd Qu.:4.000   3rd Qu.: 8.000   
##  Max.   :4.000         Max.   :9.000      Max.   :5.000   Max.   :10.000   
##  Networking_Score   Job_Offers    Starting_Salary  Career_Satisfaction
##  Min.   : 1.000   Min.   :0.000   Min.   : 25000   Min.   : 1.000     
##  1st Qu.: 3.000   1st Qu.:1.000   1st Qu.: 40200   1st Qu.: 3.000     
##  Median : 6.000   Median :2.000   Median : 50300   Median : 6.000     
##  Mean   : 5.538   Mean   :2.489   Mean   : 50564   Mean   : 5.578     
##  3rd Qu.: 8.000   3rd Qu.:4.000   3rd Qu.: 60500   3rd Qu.: 8.000     
##  Max.   :10.000   Max.   :5.000   Max.   :101000   Max.   :10.000     
##  Years_to_Promotion Current_Job_Level  Work_Life_Balance Entrepreneurship  
##  Min.   :1.000      Length:5000        Min.   : 1.000    Length:5000       
##  1st Qu.:2.000      Class :character   1st Qu.: 3.000    Class :character  
##  Median :3.000      Mode  :character   Median : 6.000    Mode  :character  
##  Mean   :3.016                         Mean   : 5.482                      
##  3rd Qu.:4.000                         3rd Qu.: 8.000                      
##  Max.   :5.000                         Max.   :10.000                      
##  Coop_Students      Monthly_Salary
##  Length:5000        Min.   :2083  
##  Class :character   1st Qu.:3350  
##  Mode  :character   Median :4192  
##                     Mean   :4214  
##                     3rd Qu.:5042  
##                     Max.   :8417

Use any of the numerical variables from the dataset and perform the following statistical functions: Mean, Median, Mode, Range.

#Average of Starting_Salary, read from the raw import.
mean_salary <- with(education_career_success, mean(Starting_Salary))
print(paste("Average Starting Salary:", mean_salary))

## [1] "Average Starting Salary: 50563.54"

#Middle value of Starting_Salary, read from the raw import.
median_salary <- with(education_career_success, median(Starting_Salary))
print(paste("Median of Starting Salary:", median_salary))

## [1] "Median of Starting Salary: 50300"

# Tabulate how often each distinct Starting_Salary value occurs.
Frequency_mode_salary_table <- table(education_career_success$Starting_Salary)
# Frequency of the mode: the largest count in the table.
mode_frequency <- max(Frequency_mode_salary_table)
# The mode is the salary labelling the first entry that reaches that count.
mode_salary <- as.numeric(names(which.max(Frequency_mode_salary_table)))
print(paste("Mode of Starting Salary:", mode_salary))

## [1] "Mode of Starting Salary: 25000"

# Smallest and largest Starting_Salary, in that order.
range_Salary <- c(
  min(education_career_success$Starting_Salary),
  max(education_career_success$Starting_Salary)
)
# Spread between the two (max - min).
range_diff <- diff(range_Salary)
print(paste("Range of Starting Salary:", range_diff))

## [1] "Range of Starting Salary: 76000"

Plot a scatter plot for any 2 variables in your dataset.

library(ggplot2)

# Scatter plot with University_GPA on the x axis and Starting_Salary on the y axis.
ggplot(data = edusuccess, mapping = aes(x = University_GPA, y = Starting_Salary)) +
  geom_point(shape = 20, color = "blue", size = 1)

Plot a bar plot for any 2 variables in your dataset.

#Plot the bar plot for Field_of_Study and Starting_Salary.
#stat = "summary" with fun = "mean" draws one bar per field at its average
#salary. stat = "identity" would stack every row, so each bar's height would be
#the field's salary total and grow with the number of students in that field.
ggplot(edusuccess, aes(x = Field_of_Study, y = Starting_Salary)) +
  geom_bar(stat = "summary", fun = "mean", fill = "limegreen") +
  labs(
    title = "Average Starting Salary by Field of Study",
    x = "Field of Study",
    y = "Average Starting Salary"
  )

Find the correlation between any 2 variables by applying a least squares linear regression model.

library("knitr")

#Fit a least squares linear regression of Starting_Salary on University_GPA.
salary_gpa_model <- lm(Starting_Salary ~ University_GPA, data = edusuccess)

#Calculate the correlation coefficient for Starting_Salary and University_GPA.
#For a one-predictor least squares fit, Pearson's r equals the square root of
#R-squared carrying the sign of the fitted slope.
salary_gpa_slope <- coef(salary_gpa_model)[["University_GPA"]]
edusuccesscorr <- sign(salary_gpa_slope) *
  sqrt(summary(salary_gpa_model)$r.squared)

#Print correlation coefficient.
kable(head(edusuccesscorr))

x
0.0010225

R Practice V2

2025-02-24

R programming analysis for COMP4033