library(readxl)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(tidyr)
library(ggplot2)
Education1 <- (read_xlsx("C:/Users/rouna/Downloads/education.xlsx"))
str(Education1)
## tibble [5,000 × 20] (S3: tbl_df/tbl/data.frame)
##  $ Student_ID           : chr [1:5000] "S00001" "S00002" "S00003" "S00004" ...
##  $ Age                  : num [1:5000] 24 21 28 25 22 24 27 20 24 28 ...
##  $ Gender               : chr [1:5000] "Male" "Other" "Female" "Male" ...
##  $ High_School_GPA      : num [1:5000] 3.58 2.52 3.42 2.43 2.08 2.4 2.36 2.68 2.84 3.02 ...
##  $ SAT_Score            : num [1:5000] 1052 1211 1193 1497 1012 ...
##  $ University_Ranking   : num [1:5000] 291 112 715 170 599 631 610 240 337 138 ...
##  $ University_GPA       : num [1:5000] 3.96 3.63 2.63 2.81 2.48 3.78 3.83 2.84 3.31 2.33 ...
##  $ Field_of_Study       : chr [1:5000] "Arts" "Law" "Medicine" "Computer Science" ...
##  $ Internships_Completed: num [1:5000] 3 4 4 3 4 2 0 1 2 1 ...
##  $ Projects_Completed   : num [1:5000] 7 7 8 9 6 3 1 5 3 5 ...
##  $ Certifications       : num [1:5000] 2 3 1 1 4 2 3 5 0 3 ...
##  $ Soft_Skills_Score    : num [1:5000] 9 8 1 10 10 2 3 5 5 10 ...
##  $ Networking_Score     : num [1:5000] 8 1 9 6 9 2 3 1 5 2 ...
##  $ Job_Offers           : num [1:5000] 5 4 0 1 4 1 2 2 2 0 ...
##  $ Starting_Salary      : num [1:5000] 27200 25000 42400 57400 47600 68400 55500 38000 68900 58900 ...
##  $ Career_Satisfaction  : num [1:5000] 4 1 9 7 9 9 7 2 2 4 ...
##  $ Years_to_Promotion   : num [1:5000] 5 1 3 5 5 2 4 3 2 2 ...
##  $ Current_Job_Level    : chr [1:5000] "Entry" "Mid" "Entry" "Mid" ...
##  $ Work_Life_Balance    : num [1:5000] 7 7 7 5 2 8 3 3 2 2 ...
##  $ Entrepreneurship     : chr [1:5000] "No" "No" "No" "No" ...

#List variables in the Dataset

names(Education1)
##  [1] "Student_ID"            "Age"                   "Gender"               
##  [4] "High_School_GPA"       "SAT_Score"             "University_Ranking"   
##  [7] "University_GPA"        "Field_of_Study"        "Internships_Completed"
## [10] "Projects_Completed"    "Certifications"        "Soft_Skills_Score"    
## [13] "Networking_Score"      "Job_Offers"            "Starting_Salary"      
## [16] "Career_Satisfaction"   "Years_to_Promotion"    "Current_Job_Level"    
## [19] "Work_Life_Balance"     "Entrepreneurship"

#Print the first 15 rows

head(Education1,15)
## # A tibble: 15 × 20
##    Student_ID   Age Gender High_School_GPA SAT_Score University_Ranking
##    <chr>      <dbl> <chr>            <dbl>     <dbl>              <dbl>
##  1 S00001        24 Male              3.58      1052                291
##  2 S00002        21 Other             2.52      1211                112
##  3 S00003        28 Female            3.42      1193                715
##  4 S00004        25 Male              2.43      1497                170
##  5 S00005        22 Male              2.08      1012                599
##  6 S00006        24 Male              2.4       1600                631
##  7 S00007        27 Male              2.36      1011                610
##  8 S00008        20 Male              2.68      1074                240
##  9 S00009        24 Male              2.84      1201                337
## 10 S00010        28 Male              3.02      1415                138
## 11 S00011        28 Female            2.95      1120                594
## 12 S00012        25 Female            2.54      1070                236
## 13 S00013        22 Female            2.06      1217                648
## 14 S00014        21 Male              3.21      1112                794
## 15 S00015        25 Male              2.79      1152                  3
## # ℹ 14 more variables: University_GPA <dbl>, Field_of_Study <chr>,
## #   Internships_Completed <dbl>, Projects_Completed <dbl>,
## #   Certifications <dbl>, Soft_Skills_Score <dbl>, Networking_Score <dbl>,
## #   Job_Offers <dbl>, Starting_Salary <dbl>, Career_Satisfaction <dbl>,
## #   Years_to_Promotion <dbl>, Current_Job_Level <chr>, Work_Life_Balance <dbl>,
## #   Entrepreneurship <chr>

#Write a user defined function using any of the variables from the data set

categorize_score <- function(score) {
  if (score >= 90) {
    return("Excellent")
  } else if (score >= 75) {
    return("Good")
  } else if (score >= 50) {
    return("Average")
  } else {
    return("Needs Improvement")
  }
}

if("Score" %in% colnames(Education1)) {Education1$Category <- sapply(Education1$Score, categorize_score)}

categorize_score(92)
## [1] "Excellent"

#Use data manipulation techniques and filter rows based on any logical criteria that exist in your dataset.

filtered_Education1 <- Education1 %>% filter("SAT_Score">60)
head(filtered_Education1)
## # A tibble: 6 × 20
##   Student_ID   Age Gender High_School_GPA SAT_Score University_Ranking
##   <chr>      <dbl> <chr>            <dbl>     <dbl>              <dbl>
## 1 S00001        24 Male              3.58      1052                291
## 2 S00002        21 Other             2.52      1211                112
## 3 S00003        28 Female            3.42      1193                715
## 4 S00004        25 Male              2.43      1497                170
## 5 S00005        22 Male              2.08      1012                599
## 6 S00006        24 Male              2.4       1600                631
## # ℹ 14 more variables: University_GPA <dbl>, Field_of_Study <chr>,
## #   Internships_Completed <dbl>, Projects_Completed <dbl>,
## #   Certifications <dbl>, Soft_Skills_Score <dbl>, Networking_Score <dbl>,
## #   Job_Offers <dbl>, Starting_Salary <dbl>, Career_Satisfaction <dbl>,
## #   Years_to_Promotion <dbl>, Current_Job_Level <chr>, Work_Life_Balance <dbl>,
## #   Entrepreneurship <chr>

#Identify the dependent & independent variables and use reshaping techniques and create a new data frame by joining those variables from your dataset.

new_education <- Education1 %>% select(SAT_Score,Age,University_GPA)
long_education <- pivot_longer(Education1, cols = c("SAT_Score","Age"),names_to = "Variable", values_to = "Value")
head(long_education)
## # A tibble: 6 × 20
##   Student_ID Gender High_School_GPA University_Ranking University_GPA
##   <chr>      <chr>            <dbl>              <dbl>          <dbl>
## 1 S00001     Male              3.58                291           3.96
## 2 S00001     Male              3.58                291           3.96
## 3 S00002     Other             2.52                112           3.63
## 4 S00002     Other             2.52                112           3.63
## 5 S00003     Female            3.42                715           2.63
## 6 S00003     Female            3.42                715           2.63
## # ℹ 15 more variables: Field_of_Study <chr>, Internships_Completed <dbl>,
## #   Projects_Completed <dbl>, Certifications <dbl>, Soft_Skills_Score <dbl>,
## #   Networking_Score <dbl>, Job_Offers <dbl>, Starting_Salary <dbl>,
## #   Career_Satisfaction <dbl>, Years_to_Promotion <dbl>,
## #   Current_Job_Level <chr>, Work_Life_Balance <dbl>, Entrepreneurship <chr>,
## #   Variable <chr>, Value <dbl>

#Remove Missing Values

Education1_clean <- na.omit(Education1)
head(Education1_clean)
## # A tibble: 6 × 20
##   Student_ID   Age Gender High_School_GPA SAT_Score University_Ranking
##   <chr>      <dbl> <chr>            <dbl>     <dbl>              <dbl>
## 1 S00001        24 Male              3.58      1052                291
## 2 S00002        21 Other             2.52      1211                112
## 3 S00003        28 Female            3.42      1193                715
## 4 S00004        25 Male              2.43      1497                170
## 5 S00005        22 Male              2.08      1012                599
## 6 S00006        24 Male              2.4       1600                631
## # ℹ 14 more variables: University_GPA <dbl>, Field_of_Study <chr>,
## #   Internships_Completed <dbl>, Projects_Completed <dbl>,
## #   Certifications <dbl>, Soft_Skills_Score <dbl>, Networking_Score <dbl>,
## #   Job_Offers <dbl>, Starting_Salary <dbl>, Career_Satisfaction <dbl>,
## #   Years_to_Promotion <dbl>, Current_Job_Level <chr>, Work_Life_Balance <dbl>,
## #   Entrepreneurship <chr>

#Remove Duplicate Rows

Education1_no_duplicates <- Education1 %>% distinct()
head(Education1_no_duplicates)
## # A tibble: 6 × 20
##   Student_ID   Age Gender High_School_GPA SAT_Score University_Ranking
##   <chr>      <dbl> <chr>            <dbl>     <dbl>              <dbl>
## 1 S00001        24 Male              3.58      1052                291
## 2 S00002        21 Other             2.52      1211                112
## 3 S00003        28 Female            3.42      1193                715
## 4 S00004        25 Male              2.43      1497                170
## 5 S00005        22 Male              2.08      1012                599
## 6 S00006        24 Male              2.4       1600                631
## # ℹ 14 more variables: University_GPA <dbl>, Field_of_Study <chr>,
## #   Internships_Completed <dbl>, Projects_Completed <dbl>,
## #   Certifications <dbl>, Soft_Skills_Score <dbl>, Networking_Score <dbl>,
## #   Job_Offers <dbl>, Starting_Salary <dbl>, Career_Satisfaction <dbl>,
## #   Years_to_Promotion <dbl>, Current_Job_Level <chr>, Work_Life_Balance <dbl>,
## #   Entrepreneurship <chr>

#Reorder Multiple Rows in Descending Order (e.g., by Score)

Education1_Sorted <- Education1 %>% arrange(desc(High_School_GPA))
head(Education1_Sorted)
## # A tibble: 6 × 20
##   Student_ID   Age Gender High_School_GPA SAT_Score University_Ranking
##   <chr>      <dbl> <chr>            <dbl>     <dbl>              <dbl>
## 1 S01046        22 Male                 4      1260                828
## 2 S01092        28 Female               4      1242                562
## 3 S01628        26 Male                 4      1087                484
## 4 S02004        20 Female               4       945                969
## 5 S02075        19 Female               4      1239                 16
## 6 S02650        18 Female               4       941                430
## # ℹ 14 more variables: University_GPA <dbl>, Field_of_Study <chr>,
## #   Internships_Completed <dbl>, Projects_Completed <dbl>,
## #   Certifications <dbl>, Soft_Skills_Score <dbl>, Networking_Score <dbl>,
## #   Job_Offers <dbl>, Starting_Salary <dbl>, Career_Satisfaction <dbl>,
## #   Years_to_Promotion <dbl>, Current_Job_Level <chr>, Work_Life_Balance <dbl>,
## #   Entrepreneurship <chr>

#Rename Columns

Education1_Renamed <- Education1 %>% rename(ID=Student_ID)
head(Education1_Renamed)
## # A tibble: 6 × 20
##   ID      Age Gender High_School_GPA SAT_Score University_Ranking University_GPA
##   <chr> <dbl> <chr>            <dbl>     <dbl>              <dbl>          <dbl>
## 1 S000…    24 Male              3.58      1052                291           3.96
## 2 S000…    21 Other             2.52      1211                112           3.63
## 3 S000…    28 Female            3.42      1193                715           2.63
## 4 S000…    25 Male              2.43      1497                170           2.81
## 5 S000…    22 Male              2.08      1012                599           2.48
## 6 S000…    24 Male              2.4       1600                631           3.78
## # ℹ 13 more variables: Field_of_Study <chr>, Internships_Completed <dbl>,
## #   Projects_Completed <dbl>, Certifications <dbl>, Soft_Skills_Score <dbl>,
## #   Networking_Score <dbl>, Job_Offers <dbl>, Starting_Salary <dbl>,
## #   Career_Satisfaction <dbl>, Years_to_Promotion <dbl>,
## #   Current_Job_Level <chr>, Work_Life_Balance <dbl>, Entrepreneurship <chr>

#Add New Variable

Education1_Renamed <- Education1 %>% mutate(Doubled_Score = SAT_Score*2)
View(Education1_Renamed)

#Create Training Set (80% of data)

set.seed(234)
train_indices <- sample(1:nrow(Education1_Renamed),0.8*nrow(Education1_Renamed))
train_set <- Education1_Renamed[train_indices,]
View(train_set)

#Print the summary statistics of your dataset.

summary(Education1)
##   Student_ID             Age           Gender          High_School_GPA
##  Length:5000        Min.   :18.00   Length:5000        Min.   :2.000  
##  Class :character   1st Qu.:20.00   Class :character   1st Qu.:2.500  
##  Mode  :character   Median :23.00   Mode  :character   Median :2.990  
##                     Mean   :23.44                      Mean   :2.997  
##                     3rd Qu.:26.00                      3rd Qu.:3.500  
##                     Max.   :29.00                      Max.   :4.000  
##    SAT_Score    University_Ranking University_GPA Field_of_Study    
##  Min.   : 900   Min.   :   1.0     Min.   :2.00   Length:5000       
##  1st Qu.:1076   1st Qu.: 256.0     1st Qu.:2.52   Class :character  
##  Median :1257   Median : 501.5     Median :3.03   Mode  :character  
##  Mean   :1254   Mean   : 504.3     Mean   :3.02                     
##  3rd Qu.:1432   3rd Qu.: 759.0     3rd Qu.:3.51                     
##  Max.   :1600   Max.   :1000.0     Max.   :4.00                     
##  Internships_Completed Projects_Completed Certifications  Soft_Skills_Score
##  Min.   :0.000         Min.   :0.000      Min.   :0.000   Min.   : 1.000   
##  1st Qu.:1.000         1st Qu.:2.000      1st Qu.:1.000   1st Qu.: 3.000   
##  Median :2.000         Median :5.000      Median :3.000   Median : 6.000   
##  Mean   :1.982         Mean   :4.563      Mean   :2.512   Mean   : 5.546   
##  3rd Qu.:3.000         3rd Qu.:7.000      3rd Qu.:4.000   3rd Qu.: 8.000   
##  Max.   :4.000         Max.   :9.000      Max.   :5.000   Max.   :10.000   
##  Networking_Score   Job_Offers    Starting_Salary  Career_Satisfaction
##  Min.   : 1.000   Min.   :0.000   Min.   : 25000   Min.   : 1.000     
##  1st Qu.: 3.000   1st Qu.:1.000   1st Qu.: 40200   1st Qu.: 3.000     
##  Median : 6.000   Median :2.000   Median : 50300   Median : 6.000     
##  Mean   : 5.538   Mean   :2.489   Mean   : 50564   Mean   : 5.578     
##  3rd Qu.: 8.000   3rd Qu.:4.000   3rd Qu.: 60500   3rd Qu.: 8.000     
##  Max.   :10.000   Max.   :5.000   Max.   :101000   Max.   :10.000     
##  Years_to_Promotion Current_Job_Level  Work_Life_Balance Entrepreneurship  
##  Min.   :1.000      Length:5000        Min.   : 1.000    Length:5000       
##  1st Qu.:2.000      Class :character   1st Qu.: 3.000    Class :character  
##  Median :3.000      Mode  :character   Median : 6.000    Mode  :character  
##  Mean   :3.016                         Mean   : 5.482                      
##  3rd Qu.:4.000                         3rd Qu.: 8.000                      
##  Max.   :5.000                         Max.   :10.000

#Mean, Median, Mode, Range.

mean_value <- mean(Education1$SAT_Score, na.rm = TRUE)
median_value <- median(Education1$SAT_Score, na.rm = TRUE)
mode_value <- as.numeric(names(sort(table(Education1$SAT_Score), decreasing = TRUE)[1]))
range_value <- range(Education1$SAT_Score, na.rm = TRUE)

print(paste("Mean:", mean_value))
## [1] "Mean: 1253.832"
print(paste("Median:", median_value))
## [1] "Median: 1257"
print(paste("Mode:", mode_value))
## [1] "Mode: 1319"
print(paste("Range:", paste(range_value, collapse = " - ")))
## [1] "Range: 900 - 1600"

#Scatter plot

ggplot(Education1,aes(x=University_Ranking,y=SAT_Score))+geom_point(size=0.5,color = "blue",shape = 10,alpha= 0.3)+labs(title = "Scatter Plot: University Ranking vs SAT SCORE")

#Plot a bar plot for any 2 variables in your dataset

Education1_MeanAge <- Education1 %>%group_by(Field_of_Study) %>%summarise(mean_Age = mean(Age))

ggplot(Education1_MeanAge, aes(x = Field_of_Study, y = mean_Age)) +geom_bar(stat = "identity", fill = "red") +
  labs(
    title = "Age by Field of Study",x = "Field of Study",y = "Age")

#Find the correlation between any 2 variables by applying least square linear regression model

Education1_Coefficient <- cor(Education1$Starting_Salary, Education1$University_Ranking, method = "pearson")

Education1_Coefficient1 <- cor(Education1$Starting_Salary, Education1$University_GPA, method = "pearson")

print(head(Education1_Coefficient))
## [1] 0.02136836
print(head(Education1_Coefficient1))
## [1] 0.001022485