library(readxl)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyr)
library(ggplot2)
Education1 <- (read_xlsx("C:/Users/rouna/Downloads/education.xlsx"))
str(Education1)
## tibble [5,000 × 20] (S3: tbl_df/tbl/data.frame)
## $ Student_ID : chr [1:5000] "S00001" "S00002" "S00003" "S00004" ...
## $ Age : num [1:5000] 24 21 28 25 22 24 27 20 24 28 ...
## $ Gender : chr [1:5000] "Male" "Other" "Female" "Male" ...
## $ High_School_GPA : num [1:5000] 3.58 2.52 3.42 2.43 2.08 2.4 2.36 2.68 2.84 3.02 ...
## $ SAT_Score : num [1:5000] 1052 1211 1193 1497 1012 ...
## $ University_Ranking : num [1:5000] 291 112 715 170 599 631 610 240 337 138 ...
## $ University_GPA : num [1:5000] 3.96 3.63 2.63 2.81 2.48 3.78 3.83 2.84 3.31 2.33 ...
## $ Field_of_Study : chr [1:5000] "Arts" "Law" "Medicine" "Computer Science" ...
## $ Internships_Completed: num [1:5000] 3 4 4 3 4 2 0 1 2 1 ...
## $ Projects_Completed : num [1:5000] 7 7 8 9 6 3 1 5 3 5 ...
## $ Certifications : num [1:5000] 2 3 1 1 4 2 3 5 0 3 ...
## $ Soft_Skills_Score : num [1:5000] 9 8 1 10 10 2 3 5 5 10 ...
## $ Networking_Score : num [1:5000] 8 1 9 6 9 2 3 1 5 2 ...
## $ Job_Offers : num [1:5000] 5 4 0 1 4 1 2 2 2 0 ...
## $ Starting_Salary : num [1:5000] 27200 25000 42400 57400 47600 68400 55500 38000 68900 58900 ...
## $ Career_Satisfaction : num [1:5000] 4 1 9 7 9 9 7 2 2 4 ...
## $ Years_to_Promotion : num [1:5000] 5 1 3 5 5 2 4 3 2 2 ...
## $ Current_Job_Level : chr [1:5000] "Entry" "Mid" "Entry" "Mid" ...
## $ Work_Life_Balance : num [1:5000] 7 7 7 5 2 8 3 3 2 2 ...
## $ Entrepreneurship : chr [1:5000] "No" "No" "No" "No" ...
#List variables in the Dataset
names(Education1)
## [1] "Student_ID" "Age" "Gender"
## [4] "High_School_GPA" "SAT_Score" "University_Ranking"
## [7] "University_GPA" "Field_of_Study" "Internships_Completed"
## [10] "Projects_Completed" "Certifications" "Soft_Skills_Score"
## [13] "Networking_Score" "Job_Offers" "Starting_Salary"
## [16] "Career_Satisfaction" "Years_to_Promotion" "Current_Job_Level"
## [19] "Work_Life_Balance" "Entrepreneurship"
#Print the first 15 rows
head(Education1,15)
## # A tibble: 15 × 20
## Student_ID Age Gender High_School_GPA SAT_Score University_Ranking
## <chr> <dbl> <chr> <dbl> <dbl> <dbl>
## 1 S00001 24 Male 3.58 1052 291
## 2 S00002 21 Other 2.52 1211 112
## 3 S00003 28 Female 3.42 1193 715
## 4 S00004 25 Male 2.43 1497 170
## 5 S00005 22 Male 2.08 1012 599
## 6 S00006 24 Male 2.4 1600 631
## 7 S00007 27 Male 2.36 1011 610
## 8 S00008 20 Male 2.68 1074 240
## 9 S00009 24 Male 2.84 1201 337
## 10 S00010 28 Male 3.02 1415 138
## 11 S00011 28 Female 2.95 1120 594
## 12 S00012 25 Female 2.54 1070 236
## 13 S00013 22 Female 2.06 1217 648
## 14 S00014 21 Male 3.21 1112 794
## 15 S00015 25 Male 2.79 1152 3
## # ℹ 14 more variables: University_GPA <dbl>, Field_of_Study <chr>,
## # Internships_Completed <dbl>, Projects_Completed <dbl>,
## # Certifications <dbl>, Soft_Skills_Score <dbl>, Networking_Score <dbl>,
## # Job_Offers <dbl>, Starting_Salary <dbl>, Career_Satisfaction <dbl>,
## # Years_to_Promotion <dbl>, Current_Job_Level <chr>, Work_Life_Balance <dbl>,
## # Entrepreneurship <chr>
#Write a user defined function using any of the variables from the data set
categorize_score <- function(score) {
if (score >= 90) {
return("Excellent")
} else if (score >= 75) {
return("Good")
} else if (score >= 50) {
return("Average")
} else {
return("Needs Improvement")
}
}
if("Score" %in% colnames(Education1)) {Education1$Category <- sapply(Education1$Score, categorize_score)}
categorize_score(92)
## [1] "Excellent"
#Use data manipulation techniques and filter rows based on any logical criteria that exist in your dataset.
filtered_Education1 <- Education1 %>% filter("SAT_Score">60)
head(filtered_Education1)
## # A tibble: 6 × 20
## Student_ID Age Gender High_School_GPA SAT_Score University_Ranking
## <chr> <dbl> <chr> <dbl> <dbl> <dbl>
## 1 S00001 24 Male 3.58 1052 291
## 2 S00002 21 Other 2.52 1211 112
## 3 S00003 28 Female 3.42 1193 715
## 4 S00004 25 Male 2.43 1497 170
## 5 S00005 22 Male 2.08 1012 599
## 6 S00006 24 Male 2.4 1600 631
## # ℹ 14 more variables: University_GPA <dbl>, Field_of_Study <chr>,
## # Internships_Completed <dbl>, Projects_Completed <dbl>,
## # Certifications <dbl>, Soft_Skills_Score <dbl>, Networking_Score <dbl>,
## # Job_Offers <dbl>, Starting_Salary <dbl>, Career_Satisfaction <dbl>,
## # Years_to_Promotion <dbl>, Current_Job_Level <chr>, Work_Life_Balance <dbl>,
## # Entrepreneurship <chr>
#Identify the dependent & independent variables and use reshaping techniques and create a new data frame by joining those variables from your dataset.
new_education <- Education1 %>% select(SAT_Score,Age,University_GPA)
long_education <- pivot_longer(Education1, cols = c("SAT_Score","Age"),names_to = "Variable", values_to = "Value")
head(long_education)
## # A tibble: 6 × 20
## Student_ID Gender High_School_GPA University_Ranking University_GPA
## <chr> <chr> <dbl> <dbl> <dbl>
## 1 S00001 Male 3.58 291 3.96
## 2 S00001 Male 3.58 291 3.96
## 3 S00002 Other 2.52 112 3.63
## 4 S00002 Other 2.52 112 3.63
## 5 S00003 Female 3.42 715 2.63
## 6 S00003 Female 3.42 715 2.63
## # ℹ 15 more variables: Field_of_Study <chr>, Internships_Completed <dbl>,
## # Projects_Completed <dbl>, Certifications <dbl>, Soft_Skills_Score <dbl>,
## # Networking_Score <dbl>, Job_Offers <dbl>, Starting_Salary <dbl>,
## # Career_Satisfaction <dbl>, Years_to_Promotion <dbl>,
## # Current_Job_Level <chr>, Work_Life_Balance <dbl>, Entrepreneurship <chr>,
## # Variable <chr>, Value <dbl>
#Remove Missing Values
Education1_clean <- na.omit(Education1)
head(Education1_clean)
## # A tibble: 6 × 20
## Student_ID Age Gender High_School_GPA SAT_Score University_Ranking
## <chr> <dbl> <chr> <dbl> <dbl> <dbl>
## 1 S00001 24 Male 3.58 1052 291
## 2 S00002 21 Other 2.52 1211 112
## 3 S00003 28 Female 3.42 1193 715
## 4 S00004 25 Male 2.43 1497 170
## 5 S00005 22 Male 2.08 1012 599
## 6 S00006 24 Male 2.4 1600 631
## # ℹ 14 more variables: University_GPA <dbl>, Field_of_Study <chr>,
## # Internships_Completed <dbl>, Projects_Completed <dbl>,
## # Certifications <dbl>, Soft_Skills_Score <dbl>, Networking_Score <dbl>,
## # Job_Offers <dbl>, Starting_Salary <dbl>, Career_Satisfaction <dbl>,
## # Years_to_Promotion <dbl>, Current_Job_Level <chr>, Work_Life_Balance <dbl>,
## # Entrepreneurship <chr>
#Remove Duplicate Rows
Education1_no_duplicates <- Education1 %>% distinct()
head(Education1_no_duplicates)
## # A tibble: 6 × 20
## Student_ID Age Gender High_School_GPA SAT_Score University_Ranking
## <chr> <dbl> <chr> <dbl> <dbl> <dbl>
## 1 S00001 24 Male 3.58 1052 291
## 2 S00002 21 Other 2.52 1211 112
## 3 S00003 28 Female 3.42 1193 715
## 4 S00004 25 Male 2.43 1497 170
## 5 S00005 22 Male 2.08 1012 599
## 6 S00006 24 Male 2.4 1600 631
## # ℹ 14 more variables: University_GPA <dbl>, Field_of_Study <chr>,
## # Internships_Completed <dbl>, Projects_Completed <dbl>,
## # Certifications <dbl>, Soft_Skills_Score <dbl>, Networking_Score <dbl>,
## # Job_Offers <dbl>, Starting_Salary <dbl>, Career_Satisfaction <dbl>,
## # Years_to_Promotion <dbl>, Current_Job_Level <chr>, Work_Life_Balance <dbl>,
## # Entrepreneurship <chr>
#Reorder Multiple Rows in Descending Order (e.g., by Score)
Education1_Sorted <- Education1 %>% arrange(desc(High_School_GPA))
head(Education1_Sorted)
## # A tibble: 6 × 20
## Student_ID Age Gender High_School_GPA SAT_Score University_Ranking
## <chr> <dbl> <chr> <dbl> <dbl> <dbl>
## 1 S01046 22 Male 4 1260 828
## 2 S01092 28 Female 4 1242 562
## 3 S01628 26 Male 4 1087 484
## 4 S02004 20 Female 4 945 969
## 5 S02075 19 Female 4 1239 16
## 6 S02650 18 Female 4 941 430
## # ℹ 14 more variables: University_GPA <dbl>, Field_of_Study <chr>,
## # Internships_Completed <dbl>, Projects_Completed <dbl>,
## # Certifications <dbl>, Soft_Skills_Score <dbl>, Networking_Score <dbl>,
## # Job_Offers <dbl>, Starting_Salary <dbl>, Career_Satisfaction <dbl>,
## # Years_to_Promotion <dbl>, Current_Job_Level <chr>, Work_Life_Balance <dbl>,
## # Entrepreneurship <chr>
#Rename Columns
Education1_Renamed <- Education1 %>% rename(ID=Student_ID)
head(Education1_Renamed)
## # A tibble: 6 × 20
## ID Age Gender High_School_GPA SAT_Score University_Ranking University_GPA
## <chr> <dbl> <chr> <dbl> <dbl> <dbl> <dbl>
## 1 S000… 24 Male 3.58 1052 291 3.96
## 2 S000… 21 Other 2.52 1211 112 3.63
## 3 S000… 28 Female 3.42 1193 715 2.63
## 4 S000… 25 Male 2.43 1497 170 2.81
## 5 S000… 22 Male 2.08 1012 599 2.48
## 6 S000… 24 Male 2.4 1600 631 3.78
## # ℹ 13 more variables: Field_of_Study <chr>, Internships_Completed <dbl>,
## # Projects_Completed <dbl>, Certifications <dbl>, Soft_Skills_Score <dbl>,
## # Networking_Score <dbl>, Job_Offers <dbl>, Starting_Salary <dbl>,
## # Career_Satisfaction <dbl>, Years_to_Promotion <dbl>,
## # Current_Job_Level <chr>, Work_Life_Balance <dbl>, Entrepreneurship <chr>
#Add New Variable
Education1_Renamed <- Education1 %>% mutate(Doubled_Score = SAT_Score*2)
View(Education1_Renamed)
#Create Training Set (80% of data)
set.seed(234)
train_indices <- sample(1:nrow(Education1_Renamed),0.8*nrow(Education1_Renamed))
train_set <- Education1_Renamed[train_indices,]
View(train_set)
#Print the summary statistics of your dataset.
summary(Education1)
## Student_ID Age Gender High_School_GPA
## Length:5000 Min. :18.00 Length:5000 Min. :2.000
## Class :character 1st Qu.:20.00 Class :character 1st Qu.:2.500
## Mode :character Median :23.00 Mode :character Median :2.990
## Mean :23.44 Mean :2.997
## 3rd Qu.:26.00 3rd Qu.:3.500
## Max. :29.00 Max. :4.000
## SAT_Score University_Ranking University_GPA Field_of_Study
## Min. : 900 Min. : 1.0 Min. :2.00 Length:5000
## 1st Qu.:1076 1st Qu.: 256.0 1st Qu.:2.52 Class :character
## Median :1257 Median : 501.5 Median :3.03 Mode :character
## Mean :1254 Mean : 504.3 Mean :3.02
## 3rd Qu.:1432 3rd Qu.: 759.0 3rd Qu.:3.51
## Max. :1600 Max. :1000.0 Max. :4.00
## Internships_Completed Projects_Completed Certifications Soft_Skills_Score
## Min. :0.000 Min. :0.000 Min. :0.000 Min. : 1.000
## 1st Qu.:1.000 1st Qu.:2.000 1st Qu.:1.000 1st Qu.: 3.000
## Median :2.000 Median :5.000 Median :3.000 Median : 6.000
## Mean :1.982 Mean :4.563 Mean :2.512 Mean : 5.546
## 3rd Qu.:3.000 3rd Qu.:7.000 3rd Qu.:4.000 3rd Qu.: 8.000
## Max. :4.000 Max. :9.000 Max. :5.000 Max. :10.000
## Networking_Score Job_Offers Starting_Salary Career_Satisfaction
## Min. : 1.000 Min. :0.000 Min. : 25000 Min. : 1.000
## 1st Qu.: 3.000 1st Qu.:1.000 1st Qu.: 40200 1st Qu.: 3.000
## Median : 6.000 Median :2.000 Median : 50300 Median : 6.000
## Mean : 5.538 Mean :2.489 Mean : 50564 Mean : 5.578
## 3rd Qu.: 8.000 3rd Qu.:4.000 3rd Qu.: 60500 3rd Qu.: 8.000
## Max. :10.000 Max. :5.000 Max. :101000 Max. :10.000
## Years_to_Promotion Current_Job_Level Work_Life_Balance Entrepreneurship
## Min. :1.000 Length:5000 Min. : 1.000 Length:5000
## 1st Qu.:2.000 Class :character 1st Qu.: 3.000 Class :character
## Median :3.000 Mode :character Median : 6.000 Mode :character
## Mean :3.016 Mean : 5.482
## 3rd Qu.:4.000 3rd Qu.: 8.000
## Max. :5.000 Max. :10.000
#Mean, Median, Mode, Range.
mean_value <- mean(Education1$SAT_Score, na.rm = TRUE)
median_value <- median(Education1$SAT_Score, na.rm = TRUE)
mode_value <- as.numeric(names(sort(table(Education1$SAT_Score), decreasing = TRUE)[1]))
range_value <- range(Education1$SAT_Score, na.rm = TRUE)
print(paste("Mean:", mean_value))
## [1] "Mean: 1253.832"
print(paste("Median:", median_value))
## [1] "Median: 1257"
print(paste("Mode:", mode_value))
## [1] "Mode: 1319"
print(paste("Range:", paste(range_value, collapse = " - ")))
## [1] "Range: 900 - 1600"
#Scatter plot
ggplot(Education1,aes(x=University_Ranking,y=SAT_Score))+geom_point(size=0.5,color = "blue",shape = 10,alpha= 0.3)+labs(title = "Scatter Plot: University Ranking vs SAT SCORE")
#Plot a bar plot for any 2 variables in your dataset
Education1_MeanAge <- Education1 %>%group_by(Field_of_Study) %>%summarise(mean_Age = mean(Age))
ggplot(Education1_MeanAge, aes(x = Field_of_Study, y = mean_Age)) +geom_bar(stat = "identity", fill = "red") +
labs(
title = "Age by Field of Study",x = "Field of Study",y = "Age")
#Find the correlation between any 2 variables by applying least square
linear regression model
Education1_Coefficient <- cor(Education1$Starting_Salary, Education1$University_Ranking, method = "pearson")
Education1_Coefficient1 <- cor(Education1$Starting_Salary, Education1$University_GPA, method = "pearson")
print(head(Education1_Coefficient))
## [1] 0.02136836
print(head(Education1_Coefficient1))
## [1] 0.001022485