Assignment_1_Education Career Success

Structure of dataset

# Compact overview of the data set: dimensions, column names, types, first values
utils::str(education_career_success)

## 'data.frame':    5000 obs. of  20 variables:
##  $ Student_ID           : chr  "S00001" "S00002" "S00003" "S00004" ...
##  $ Age                  : int  24 21 28 25 22 24 27 20 24 28 ...
##  $ Gender               : chr  "Male" "Other" "Female" "Male" ...
##  $ High_School_GPA      : num  3.58 2.52 3.42 2.43 2.08 2.4 2.36 2.68 2.84 3.02 ...
##  $ SAT_Score            : int  1052 1211 1193 1497 1012 1600 1011 1074 1201 1415 ...
##  $ University_Ranking   : int  291 112 715 170 599 631 610 240 337 138 ...
##  $ University_GPA       : num  3.96 3.63 2.63 2.81 2.48 3.78 3.83 2.84 3.31 2.33 ...
##  $ Field_of_Study       : chr  "Arts" "Law" "Medicine" "Computer Science" ...
##  $ Internships_Completed: int  3 4 4 3 4 2 0 1 2 1 ...
##  $ Projects_Completed   : int  7 7 8 9 6 3 1 5 3 5 ...
##  $ Certifications       : int  2 3 1 1 4 2 3 5 0 3 ...
##  $ Soft_Skills_Score    : int  9 8 1 10 10 2 3 5 5 10 ...
##  $ Networking_Score     : int  8 1 9 6 9 2 3 1 5 2 ...
##  $ Job_Offers           : int  5 4 0 1 4 1 2 2 2 0 ...
##  $ Starting_Salary      : num  27200 25000 42400 57400 47600 68400 55500 38000 68900 58900 ...
##  $ Career_Satisfaction  : int  4 1 9 7 9 9 7 2 2 4 ...
##  $ Years_to_Promotion   : int  5 1 3 5 5 2 4 3 2 2 ...
##  $ Current_Job_Level    : chr  "Entry" "Mid" "Entry" "Mid" ...
##  $ Work_Life_Balance    : int  7 7 7 5 2 8 3 3 2 2 ...
##  $ Entrepreneurship     : chr  "No" "No" "No" "No" ...

Variable List of dataset

# knitr provides kable(), which renders R objects such as data frames as
# formatted tables in the knitted report.
library(knitr)

# List every variable name next to its column position. kable() is used because
# the 20 names do not display well as plain console output.
# seq_along() builds the index so it always has one entry per name;
# 1:length(x) would yield c(1, 0) if there were no columns.
kable(
  data.frame(
    Index = seq_along(names(education_career_success)),
    Variable = names(education_career_success)
  )
)

Index	Variable
1	Student_ID
2	Age
3	Gender
4	High_School_GPA
5	SAT_Score
6	University_Ranking
7	University_GPA
8	Field_of_Study
9	Internships_Completed
10	Projects_Completed
11	Certifications
12	Soft_Skills_Score
13	Networking_Score
14	Job_Offers
15	Starting_Salary
16	Career_Satisfaction
17	Years_to_Promotion
18	Current_Job_Level
19	Work_Life_Balance
20	Entrepreneurship

Printing first 15 rows

library(knitr)
# Render the first 15 observations as a formatted table
kable(head(education_career_success, n = 15))

Student_ID	Age	Gender	High_School_GPA	SAT_Score	University_Ranking	University_GPA	Field_of_Study	Internships_Completed	Projects_Completed	Certifications	Soft_Skills_Score	Networking_Score	Job_Offers	Starting_Salary	Career_Satisfaction	Years_to_Promotion	Current_Job_Level	Work_Life_Balance	Entrepreneurship
S00001	24	Male	3.58	1052	291	3.96	Arts	3	7	2	9	8	5	27200	4	5	Entry	7	No
S00002	21	Other	2.52	1211	112	3.63	Law	4	7	3	8	1	4	25000	1	1	Mid	7	No
S00003	28	Female	3.42	1193	715	2.63	Medicine	4	8	1	1	9	0	42400	9	3	Entry	7	No
S00004	25	Male	2.43	1497	170	2.81	Computer Science	3	9	1	10	6	1	57400	7	5	Mid	5	No
S00005	22	Male	2.08	1012	599	2.48	Engineering	4	6	4	10	9	4	47600	9	5	Entry	2	No
S00006	24	Male	2.40	1600	631	3.78	Law	2	3	2	2	2	1	68400	9	2	Entry	8	Yes
S00007	27	Male	2.36	1011	610	3.83	Computer Science	0	1	3	3	3	2	55500	7	4	Mid	3	No
S00008	20	Male	2.68	1074	240	2.84	Computer Science	1	5	5	5	1	2	38000	2	3	Entry	3	No
S00009	24	Male	2.84	1201	337	3.31	Business	2	3	0	5	5	2	68900	2	2	Entry	2	No
S00010	28	Male	3.02	1415	138	2.33	Computer Science	1	5	3	10	2	0	58900	4	2	Senior	2	No
S00011	28	Female	2.95	1120	594	2.87	Mathematics	2	7	5	8	1	5	26300	9	1	Entry	2	No
S00012	25	Female	2.54	1070	236	3.26	Law	2	2	3	2	9	5	35100	7	4	Mid	6	Yes
S00013	22	Female	2.06	1217	648	2.77	Engineering	2	0	5	2	9	2	42600	9	4	Senior	8	No
S00014	21	Male	3.21	1112	794	2.72	Arts	0	4	3	8	6	2	76500	4	3	Entry	3	No
S00015	25	Male	2.79	1152	3	2.00	Business	1	2	3	1	8	1	61100	9	3	Entry	6	Yes

User defined function using any of the variables from the data set

# check_salary: label starting salaries relative to the 50,000 threshold.
#
# Args:
#   Starting_Salary: numeric salary value(s); a single number or a whole
#     vector/column.
# Returns:
#   One label per input element: "Salary is Above 50K" when the salary is
#   strictly greater than 50000, otherwise "Salary is Below 50K" (a salary of
#   exactly 50000 therefore gets the "Below" label). NA salaries yield NA.
check_salary <- function(Starting_Salary) {
  # ifelse() works element by element, so the function can be applied to a
  # full salary column; a plain if () only accepts a single TRUE/FALSE.
  ifelse(
    Starting_Salary > 50000,
    "Salary is Above 50K",
    "Salary is Below 50K"
  )
}

# Try the function on a salary just under the 50,000 threshold
check_salary(Starting_Salary = 49999)

## [1] "Salary is Below 50K"

# Output: "Salary is Below 50K"

Use data manipulation techniques and filter rows based on any logical criteria that exist in your dataset.

library("tidyverse")

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.0.4     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

# conflicted manages function-name conflicts between attached packages
# (the tidyverse startup message reports that dplyr::filter() masks stats::filter()).
library("conflicted")
library("knitr")
# Declare that an unqualified filter() always means dplyr::filter()
conflicts_prefer(dplyr::filter)

## [conflicted] Will prefer dplyr::filter over any other package.

# Subset education_career_success to the rows where Starting_Salary is at
# least 70,000 and University_GPA is at least 3.5 (both conditions must hold).
high_salary_gpa_data <- education_career_success %>%
  filter(Starting_Salary >= 70000 & University_GPA >= 3.5)

# Preview the first rows of the filtered data
kable(head(high_salary_gpa_data))

Student_ID	Age	Gender	High_School_GPA	SAT_Score	University_Ranking	University_GPA	Field_of_Study	Internships_Completed	Projects_Completed	Certifications	Soft_Skills_Score	Networking_Score	Job_Offers	Starting_Salary	Career_Satisfaction	Years_to_Promotion	Current_Job_Level	Work_Life_Balance	Entrepreneurship
S00017	20	Female	3.73	1539	116	3.78	Law	3	2	3	2	5	3	97500	8	5	Mid	9	No
S00058	27	Male	3.06	1096	695	3.77	Computer Science	3	4	0	9	3	0	82700	10	5	Entry	5	No
S00064	24	Male	2.91	1071	306	3.86	Law	2	3	4	7	2	2	75700	6	5	Entry	4	Yes
S00116	28	Female	2.10	916	78	3.79	Medicine	2	3	0	4	4	4	89100	9	5	Senior	3	Yes
S00129	24	Female	2.62	1039	274	3.60	Arts	0	9	0	1	6	4	96700	1	1	Entry	5	Yes
S00198	18	Female	3.11	1547	785	3.67	Medicine	4	9	2	4	5	1	75000	10	5	Mid	2	Yes

Identify the dependent & independent variables and use reshaping techniques and create a new data frame by joining those variables from your dataset.

library("tidyverse")
library("knitr")
# Build one data frame per role.
# Dependent variable: Starting_Salary
dependant_salary_data <- select(education_career_success, Starting_Salary)

# Independent variables: University_GPA, Field_of_Study, Age
independent_salary_data <- select(
  education_career_success,
  University_GPA, Field_of_Study, Age
)

# Row-wise combination with rbind(), stored as combined_data_by_rows.
# NOTE(review): the extra arguments are scalar NAs, so this looks like it
# appends NA rows to Starting_Salary rather than adding columns - confirm intent.
combined_data_by_rows <- as.data.frame(
  rbind(dependant_salary_data, University_GPA = NA, Field_of_Study = NA, Age = NA)
)

# Column-wise combination with cbind(), stored as combined_data_by_columns
combined_data_by_columns <- as.data.frame(
  cbind(dependant_salary_data, independent_salary_data)
)

# View the row-wise result
kable(head(combined_data_by_rows))

Starting_Salary
27200
25000
42400
57400
47600
68400

# View the column-wise result (first 6 rows)
kable(head(combined_data_by_columns, n = 6))

Starting_Salary	University_GPA	Field_of_Study	Age
27200	3.96	Arts	24
25000	3.63	Law	21
42400	2.63	Medicine	28
57400	2.81	Computer Science	25
47600	2.48	Engineering	22
68400	3.78	Law	24

Remove missing values in your dataset

library("knitr")
# Count the missing values in each column
education_career_success %>%
  is.na() %>%
  colSums()

##            Student_ID                   Age                Gender 
##                     0                     0                     0 
##       High_School_GPA             SAT_Score    University_Ranking 
##                     0                     0                     0 
##        University_GPA        Field_of_Study Internships_Completed 
##                     0                     0                     0 
##    Projects_Completed        Certifications     Soft_Skills_Score 
##                     0                     0                     0 
##      Networking_Score            Job_Offers       Starting_Salary 
##                     0                     0                     0 
##   Career_Satisfaction    Years_to_Promotion     Current_Job_Level 
##                     0                     0                     0 
##     Work_Life_Balance      Entrepreneurship 
##                     0                     0

# Drop every row that contains at least one missing value
missing_values_cleanup <- education_career_success %>% na.omit()

# Rows present in the original data but absent after the cleanup,
# i.e. the rows that na.omit() removed
omitted_data <- education_career_success %>%
  anti_join(missing_values_cleanup)

## Joining with `by = join_by(Student_ID, Age, Gender, High_School_GPA, SAT_Score,
## University_Ranking, University_GPA, Field_of_Study, Internships_Completed,
## Projects_Completed, Certifications, Soft_Skills_Score, Networking_Score,
## Job_Offers, Starting_Salary, Career_Satisfaction, Years_to_Promotion,
## Current_Job_Level, Work_Life_Balance, Entrepreneurship)`

# Preview the omitted rows (first 6, if any)
kable(head(omitted_data, n = 6))

Student_ID	Age	Gender	High_School_GPA	SAT_Score	University_Ranking	University_GPA	Field_of_Study	Internships_Completed	Projects_Completed	Certifications	Soft_Skills_Score	Networking_Score	Job_Offers	Starting_Salary	Career_Satisfaction	Years_to_Promotion	Current_Job_Level	Work_Life_Balance	Entrepreneurship

Identify and remove duplicated data from your dataset.

library("knitr")
# Keep only the rows that repeat an earlier row across all columns,
# then preview them
duplicated_rows <- education_career_success[
  which(duplicated(education_career_success)), ,
  drop = FALSE
]
kable(head(duplicated_rows))

Student_ID	Age	Gender	High_School_GPA	SAT_Score	University_Ranking	University_GPA	Field_of_Study	Internships_Completed	Projects_Completed	Certifications	Soft_Skills_Score	Networking_Score	Job_Offers	Starting_Salary	Career_Satisfaction	Years_to_Promotion	Current_Job_Level	Work_Life_Balance	Entrepreneurship

# Remove duplicate rows with distinct() and preview the result
library(dplyr)
education_career_success_unique <- distinct(education_career_success)
kable(head(education_career_success_unique))

Student_ID	Age	Gender	High_School_GPA	SAT_Score	University_Ranking	University_GPA	Field_of_Study	Internships_Completed	Projects_Completed	Certifications	Soft_Skills_Score	Networking_Score	Job_Offers	Starting_Salary	Career_Satisfaction	Years_to_Promotion	Current_Job_Level	Work_Life_Balance	Entrepreneurship
S00001	24	Male	3.58	1052	291	3.96	Arts	3	7	2	9	8	5	27200	4	5	Entry	7	No
S00002	21	Other	2.52	1211	112	3.63	Law	4	7	3	8	1	4	25000	1	1	Mid	7	No
S00003	28	Female	3.42	1193	715	2.63	Medicine	4	8	1	1	9	0	42400	9	3	Entry	7	No
S00004	25	Male	2.43	1497	170	2.81	Computer Science	3	9	1	10	6	1	57400	7	5	Mid	5	No
S00005	22	Male	2.08	1012	599	2.48	Engineering	4	6	4	10	9	4	47600	9	5	Entry	2	No
S00006	24	Male	2.40	1600	631	3.78	Law	2	3	2	2	2	1	68400	9	2	Entry	8	Yes

Reorder multiple rows in descending order

library("knitr")
library("dplyr")
# Sort from highest to lowest Starting_Salary; ties are ordered by Age and
# then by High_School_GPA, both also descending
education_career_success_desc_order <- education_career_success %>%
  arrange(
    desc(Starting_Salary),
    desc(Age),
    desc(High_School_GPA)
  )

# Preview the sorted data set
kable(head(education_career_success_desc_order))

Student_ID	Age	Gender	High_School_GPA	SAT_Score	University_Ranking	University_GPA	Field_of_Study	Internships_Completed	Projects_Completed	Certifications	Soft_Skills_Score	Networking_Score	Job_Offers	Starting_Salary	Career_Satisfaction	Years_to_Promotion	Current_Job_Level	Work_Life_Balance	Entrepreneurship
S01810	28	Other	2.34	1089	24	3.01	Arts	0	6	2	3	8	1	101000	6	4	Entry	8	No
S04539	21	Male	2.38	1450	943	3.93	Law	1	1	3	3	4	0	100600	6	5	Entry	8	No
S03831	20	Male	3.23	1238	234	2.82	Arts	2	7	3	5	3	5	98900	8	1	Entry	2	Yes
S03504	25	Male	3.99	982	459	2.70	Engineering	4	6	2	10	2	0	98200	4	4	Senior	5	No
S00017	20	Female	3.73	1539	116	3.78	Law	3	2	3	2	5	3	97500	8	5	Mid	9	No
S01341	25	Female	2.97	900	835	3.07	Law	3	7	4	2	9	5	96900	7	3	Entry	2	No

Rename some of the column names in your dataset.

library("dplyr")
library("knitr")
# Shorten three column names:
#   Student_ID -> ID, Starting_Salary -> Salary, High_School_GPA -> GPA
education_career_success_column_renamed <- education_career_success %>%
  rename(
    ID = Student_ID,
    Salary = Starting_Salary,
    GPA = High_School_GPA
  )

# Preview the data set with the renamed columns
kable(head(education_career_success_column_renamed))

ID	Age	Gender	GPA	SAT_Score	University_Ranking	University_GPA	Field_of_Study	Internships_Completed	Projects_Completed	Certifications	Soft_Skills_Score	Networking_Score	Job_Offers	Salary	Career_Satisfaction	Years_to_Promotion	Current_Job_Level	Work_Life_Balance	Entrepreneurship
S00001	24	Male	3.58	1052	291	3.96	Arts	3	7	2	9	8	5	27200	4	5	Entry	7	No
S00002	21	Other	2.52	1211	112	3.63	Law	4	7	3	8	1	4	25000	1	1	Mid	7	No
S00003	28	Female	3.42	1193	715	2.63	Medicine	4	8	1	1	9	0	42400	9	3	Entry	7	No
S00004	25	Male	2.43	1497	170	2.81	Computer Science	3	9	1	10	6	1	57400	7	5	Mid	5	No
S00005	22	Male	2.08	1012	599	2.48	Engineering	4	6	4	10	9	4	47600	9	5	Entry	2	No
S00006	24	Male	2.40	1600	631	3.78	Law	2	3	2	2	2	1	68400	9	2	Entry	8	Yes

Add new variables in your data frame by using a mathematical function

library("dplyr")
library("knitr")

# Append Double_Salary (twice the Starting_Salary) as the last column.
# The result is written back to education_career_success itself.
education_career_success <- education_career_success %>%
  mutate(Double_Salary = 2 * Starting_Salary)

# Preview the updated data set
kable(head(education_career_success))

Student_ID	Age	Gender	High_School_GPA	SAT_Score	University_Ranking	University_GPA	Field_of_Study	Internships_Completed	Projects_Completed	Certifications	Soft_Skills_Score	Networking_Score	Job_Offers	Starting_Salary	Career_Satisfaction	Years_to_Promotion	Current_Job_Level	Work_Life_Balance	Entrepreneurship	Double_Salary
S00001	24	Male	3.58	1052	291	3.96	Arts	3	7	2	9	8	5	27200	4	5	Entry	7	No	54400
S00002	21	Other	2.52	1211	112	3.63	Law	4	7	3	8	1	4	25000	1	1	Mid	7	No	50000
S00003	28	Female	3.42	1193	715	2.63	Medicine	4	8	1	1	9	0	42400	9	3	Entry	7	No	84800
S00004	25	Male	2.43	1497	170	2.81	Computer Science	3	9	1	10	6	1	57400	7	5	Mid	5	No	114800
S00005	22	Male	2.08	1012	599	2.48	Engineering	4	6	4	10	9	4	47600	9	5	Entry	2	No	95200
S00006	24	Male	2.40	1600	631	3.78	Law	2	3	2	2	2	1	68400	9	2	Entry	8	Yes	136800

Create a training set using a random number generator engine.

library(dplyr)

# Fix the seed so the random draw below is reproducible
set.seed(12345)

# Draw a random 50% of the rows as the training set
training_set <- sample_frac(education_career_success, size = 0.5)

# Preview the training set
kable(head(training_set))

Student_ID	Age	Gender	High_School_GPA	SAT_Score	University_Ranking	University_GPA	Field_of_Study	Internships_Completed	Projects_Completed	Certifications	Soft_Skills_Score	Networking_Score	Job_Offers	Starting_Salary	Career_Satisfaction	Years_to_Promotion	Current_Job_Level	Work_Life_Balance	Entrepreneurship	Double_Salary
S00051	19	Male	3.66	1376	624	3.14	Computer Science	2	0	2	7	5	1	42000	6	5	Senior	2	No	84000
S00720	22	Male	3.32	1265	671	2.44	Business	4	0	1	8	7	4	25000	4	2	Entry	8	No	50000
S00730	23	Male	2.85	980	690	2.48	Law	4	6	4	7	8	4	32300	3	5	Entry	5	No	64600
S02712	28	Female	3.35	1127	504	2.25	Medicine	2	6	3	10	6	0	25500	5	2	Entry	8	No	51000
S04922	27	Male	3.96	1386	777	3.61	Mathematics	1	7	5	9	1	2	55800	4	1	Entry	1	No	111600
S00605	28	Female	3.21	1065	784	3.15	Engineering	4	3	3	6	3	5	81600	8	2	Mid	8	No	163200

Print the summary statistics of your dataset.

# Per-column summary statistics for the whole data set
education_career_success %>% summary()

##   Student_ID             Age           Gender          High_School_GPA
##  Length:5000        Min.   :18.00   Length:5000        Min.   :2.000  
##  Class :character   1st Qu.:20.00   Class :character   1st Qu.:2.500  
##  Mode  :character   Median :23.00   Mode  :character   Median :2.990  
##                     Mean   :23.44                      Mean   :2.997  
##                     3rd Qu.:26.00                      3rd Qu.:3.500  
##                     Max.   :29.00                      Max.   :4.000  
##    SAT_Score    University_Ranking University_GPA Field_of_Study    
##  Min.   : 900   Min.   :   1.0     Min.   :2.00   Length:5000       
##  1st Qu.:1076   1st Qu.: 256.0     1st Qu.:2.52   Class :character  
##  Median :1257   Median : 501.5     Median :3.03   Mode  :character  
##  Mean   :1254   Mean   : 504.3     Mean   :3.02                     
##  3rd Qu.:1432   3rd Qu.: 759.0     3rd Qu.:3.51                     
##  Max.   :1600   Max.   :1000.0     Max.   :4.00                     
##  Internships_Completed Projects_Completed Certifications  Soft_Skills_Score
##  Min.   :0.000         Min.   :0.000      Min.   :0.000   Min.   : 1.000   
##  1st Qu.:1.000         1st Qu.:2.000      1st Qu.:1.000   1st Qu.: 3.000   
##  Median :2.000         Median :5.000      Median :3.000   Median : 6.000   
##  Mean   :1.982         Mean   :4.563      Mean   :2.512   Mean   : 5.546   
##  3rd Qu.:3.000         3rd Qu.:7.000      3rd Qu.:4.000   3rd Qu.: 8.000   
##  Max.   :4.000         Max.   :9.000      Max.   :5.000   Max.   :10.000   
##  Networking_Score   Job_Offers    Starting_Salary  Career_Satisfaction
##  Min.   : 1.000   Min.   :0.000   Min.   : 25000   Min.   : 1.000     
##  1st Qu.: 3.000   1st Qu.:1.000   1st Qu.: 40200   1st Qu.: 3.000     
##  Median : 6.000   Median :2.000   Median : 50300   Median : 6.000     
##  Mean   : 5.538   Mean   :2.489   Mean   : 50564   Mean   : 5.578     
##  3rd Qu.: 8.000   3rd Qu.:4.000   3rd Qu.: 60500   3rd Qu.: 8.000     
##  Max.   :10.000   Max.   :5.000   Max.   :101000   Max.   :10.000     
##  Years_to_Promotion Current_Job_Level  Work_Life_Balance Entrepreneurship  
##  Min.   :1.000      Length:5000        Min.   : 1.000    Length:5000       
##  1st Qu.:2.000      Class :character   1st Qu.: 3.000    Class :character  
##  Median :3.000      Mode  :character   Median : 6.000    Mode  :character  
##  Mean   :3.016                         Mean   : 5.482                      
##  3rd Qu.:4.000                         3rd Qu.: 8.000                      
##  Max.   :5.000                         Max.   :10.000                      
##  Double_Salary   
##  Min.   : 50000  
##  1st Qu.: 80400  
##  Median :100600  
##  Mean   :101127  
##  3rd Qu.:121000  
##  Max.   :202000

Use any of the numerical variables from the dataset and perform the following statistical functions: Mean, Median, Mode, Range

# Mean of Starting_Salary
mean_salary <- mean(education_career_success[["Starting_Salary"]])
print(paste("Mean of Starting Salary:", mean_salary))

## [1] "Mean of Starting Salary: 50563.54"

# Median of Starting_Salary
median_salary <- median(education_career_success[["Starting_Salary"]])
print(paste("Median of Starting Salary:", median_salary))

## [1] "Median of Starting Salary: 50300"

# Range of Starting_Salary, reported as a single spread value
# range() returns c(min, max); diff() of that pair is max - min
range_Salary <- range(education_career_success[["Starting_Salary"]])
range_diff <- diff(range_Salary)
print(paste("Range of Starting Salary:", range_diff))

## [1] "Range of Starting Salary: 76000"

# Calculate mode for Starting_Salary
# table() counts how often each distinct salary occurs; which.max() then picks
# the first salary with the highest count (any tied salaries are not reported).
Frequency_mode_salary_table <- table(education_career_success$Starting_Salary)
mode_salary <- as.numeric(names(Frequency_mode_salary_table)[which.max(Frequency_mode_salary_table)])
mode_frequency <- max(Frequency_mode_salary_table)  # Frequency of the mode

print(paste("Mode of Starting Salary:", mode_salary))

## [1] "Mode of Starting Salary: 25000"

print(paste("Frequency of Starting Salary's Mode Value:", mode_frequency))

## [1] "Frequency of Starting Salary's Mode Value: 240"

Plot a scatter plot for any 2 variables

# Load ggplot2 for plotting
library(ggplot2)

# Scatter plot with University_Ranking on the x axis and Starting_Salary on
# the y axis.
# NOTE(review): the same points are drawn twice (blue layer, then red on top) -
# confirm the second layer is intentional.
ggplot(
  education_career_success,
  aes(x = University_Ranking, y = Starting_Salary)
) +
  geom_point(size = 1, color = "blue", shape = 10, alpha = 0.3) +
  geom_point(size = 1, color = "red", shape = 10, alpha = 0.3)

Plot a bar plot for any 2 variables

# Load the plotting and data-manipulation libraries
library(ggplot2)
library(dplyr)

# Mean age per field of study (one row per Field_of_Study)
education_career_success_agg <- education_career_success %>%
  group_by(Field_of_Study) %>%
  summarise(mean_Age = mean(Age))
# Rows whose Field_of_Study value already appeared in an earlier row.
# NOTE(review): not used by the plot below - confirm whether it is still needed.
duplicates <- education_career_success[duplicated(education_career_success$Field_of_Study), ]

# Bar plot of the aggregated data: one red bar per field, bar height = mean_Age.
# labs() sets the title and axis labels; theme_minimal() applies a minimal theme.
ggplot(education_career_success_agg, aes(x = Field_of_Study, y = mean_Age)) +
  geom_col(fill = "red") +
  labs(title = "Age by Field of Study", x = "Field of Study", y = "Age") +
  theme_minimal()

Correlation between any 2 variables by applying a least-squares linear regression model

library("knitr")

# Pearson correlation coefficient between Starting_Salary and University_GPA
# NOTE(review): only the correlation is computed here; no regression model is
# fitted in this chunk - confirm that this covers the section's requirement.
Education_Career_Success_Coefficient <- cor(
  x = education_career_success$Starting_Salary,
  y = education_career_success$University_GPA,
  method = "pearson"
)

# Print the coefficient as a one-cell table
kable(head(Education_Career_Success_Coefficient))

x
0.0010225

Assignment_1_Education Career Success

COMP4033 Group-3

2025-02-21

Structure of dataset

Variable List of dataset

Printing first 15 rows

User defined function using any of the variables from the data set

Use data manipulation techniques and filter rows based on any logical criteria that exist in your dataset.

Identify the dependent & independent variables and use reshaping techniques and create a new data frame by joining those variables from your dataset.

Remove missing values in your dataset

Identify and remove duplicated data from your dataset.

Reorder multiple rows in descending order

Rename some of the column names in your dataset.

Add new variables in your data frame by using a mathematical function

Create a training set using a random number generator engine.

Print the summary statistics of your dataset.

Use any of the numerical variables from the dataset and perform the following statistical functions: Mean, Median, Mode, Range

Plot a scatter plot for any 2 variables

Plot a bar plot for any 2 variables

Correlation between any 2 variables by applying a least-squares linear regression model