library(readr)
  # Load the education CSV into a tibble named `education`.
  # NOTE(review): absolute, machine-specific path; the script only runs where
  # this exact file exists.
  education <- readr::read_csv(file = "C:/Users/KimCS/Downloads/education.csv")
## Rows: 5000 Columns: 20
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (5): Student_ID, Gender, Field_of_Study, Current_Job_Level, Entrepreneu...
## dbl (15): Age, High_School_GPA, SAT_Score, University_Ranking, University_GP...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
  # Open the spreadsheet-style viewer only in interactive sessions: View()
  # needs a data viewer and can fail when the script is run non-interactively
  # (for example with Rscript).
  if (interactive()) {
    View(education)
  }
# Task: Print the structure of your dataset
str(education)
## spc_tbl_ [5,000 × 20] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ Student_ID           : chr [1:5000] "S00001" "S00002" "S00003" "S00004" ...
##  $ Age                  : num [1:5000] 24 21 28 25 22 24 27 20 24 28 ...
##  $ Gender               : chr [1:5000] "Male" "Other" "Female" "Male" ...
##  $ High_School_GPA      : num [1:5000] 3.58 2.52 3.42 2.43 2.08 2.4 2.36 2.68 2.84 3.02 ...
##  $ SAT_Score            : num [1:5000] 1052 1211 1193 1497 1012 ...
##  $ University_Ranking   : num [1:5000] 291 112 715 170 599 631 610 240 337 138 ...
##  $ University_GPA       : num [1:5000] 3.96 3.63 2.63 2.81 2.48 3.78 3.83 2.84 3.31 2.33 ...
##  $ Field_of_Study       : chr [1:5000] "Arts" "Law" "Medicine" "Computer Science" ...
##  $ Internships_Completed: num [1:5000] 3 4 4 3 4 2 0 1 2 1 ...
##  $ Projects_Completed   : num [1:5000] 7 7 8 9 6 3 1 5 3 5 ...
##  $ Certifications       : num [1:5000] 2 3 1 1 4 2 3 5 0 3 ...
##  $ Soft_Skills_Score    : num [1:5000] 9 8 1 10 10 2 3 5 5 10 ...
##  $ Networking_Score     : num [1:5000] 8 1 9 6 9 2 3 1 5 2 ...
##  $ Job_Offers           : num [1:5000] 5 4 0 1 4 1 2 2 2 0 ...
##  $ Starting_Salary      : num [1:5000] 27200 25000 42400 57400 47600 68400 55500 38000 68900 58900 ...
##  $ Career_Satisfaction  : num [1:5000] 4 1 9 7 9 9 7 2 2 4 ...
##  $ Years_to_Promotion   : num [1:5000] 5 1 3 5 5 2 4 3 2 2 ...
##  $ Current_Job_Level    : chr [1:5000] "Entry" "Mid" "Entry" "Mid" ...
##  $ Work_Life_Balance    : num [1:5000] 7 7 7 5 2 8 3 3 2 2 ...
##  $ Entrepreneurship     : chr [1:5000] "No" "No" "No" "No" ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   Student_ID = col_character(),
##   ..   Age = col_double(),
##   ..   Gender = col_character(),
##   ..   High_School_GPA = col_double(),
##   ..   SAT_Score = col_double(),
##   ..   University_Ranking = col_double(),
##   ..   University_GPA = col_double(),
##   ..   Field_of_Study = col_character(),
##   ..   Internships_Completed = col_double(),
##   ..   Projects_Completed = col_double(),
##   ..   Certifications = col_double(),
##   ..   Soft_Skills_Score = col_double(),
##   ..   Networking_Score = col_double(),
##   ..   Job_Offers = col_double(),
##   ..   Starting_Salary = col_double(),
##   ..   Career_Satisfaction = col_double(),
##   ..   Years_to_Promotion = col_double(),
##   ..   Current_Job_Level = col_character(),
##   ..   Work_Life_Balance = col_double(),
##   ..   Entrepreneurship = col_character()
##   .. )
##  - attr(*, "problems")=<externalptr>
# Task: List the variables (column names) in your dataset
colnames(education)
##  [1] "Student_ID"            "Age"                   "Gender"               
##  [4] "High_School_GPA"       "SAT_Score"             "University_Ranking"   
##  [7] "University_GPA"        "Field_of_Study"        "Internships_Completed"
## [10] "Projects_Completed"    "Certifications"        "Soft_Skills_Score"    
## [13] "Networking_Score"      "Job_Offers"            "Starting_Salary"      
## [16] "Career_Satisfaction"   "Years_to_Promotion"    "Current_Job_Level"    
## [19] "Work_Life_Balance"     "Entrepreneurship"
# Task: Print the first 15 rows of your dataset
head(x = education, n = 15L)
## # A tibble: 15 × 20
##    Student_ID   Age Gender High_School_GPA SAT_Score University_Ranking
##    <chr>      <dbl> <chr>            <dbl>     <dbl>              <dbl>
##  1 S00001        24 Male              3.58      1052                291
##  2 S00002        21 Other             2.52      1211                112
##  3 S00003        28 Female            3.42      1193                715
##  4 S00004        25 Male              2.43      1497                170
##  5 S00005        22 Male              2.08      1012                599
##  6 S00006        24 Male              2.4       1600                631
##  7 S00007        27 Male              2.36      1011                610
##  8 S00008        20 Male              2.68      1074                240
##  9 S00009        24 Male              2.84      1201                337
## 10 S00010        28 Male              3.02      1415                138
## 11 S00011        28 Female            2.95      1120                594
## 12 S00012        25 Female            2.54      1070                236
## 13 S00013        22 Female            2.06      1217                648
## 14 S00014        21 Male              3.21      1112                794
## 15 S00015        25 Male              2.79      1152                  3
## # ℹ 14 more variables: University_GPA <dbl>, Field_of_Study <chr>,
## #   Internships_Completed <dbl>, Projects_Completed <dbl>,
## #   Certifications <dbl>, Soft_Skills_Score <dbl>, Networking_Score <dbl>,
## #   Job_Offers <dbl>, Starting_Salary <dbl>, Career_Satisfaction <dbl>,
## #   Years_to_Promotion <dbl>, Current_Job_Level <chr>, Work_Life_Balance <dbl>,
## #   Entrepreneurship <chr>
# Task: Write a user defined function using any of the variables from the data set

# Labels returned by pass_or_fail().
pass_term <- "Pass"
fail_term <- "Fail"

# Classify one or more GPA values as passing or failing.
#
# Args:
#   gpa:        Vector of GPA values (any length, may contain NA).
#   threshold:  Lowest GPA that still counts as a pass (default 2.5).
#   pass_label: Label for values >= threshold (default: pass_term).
#   fail_label: Label for values <  threshold (default: fail_term).
#
# Returns:
#   Character vector with one element per element of `gpa`: pass_label,
#   fail_label, or NA where the GPA is missing.
pass_or_fail <- function(gpa,
                         threshold = 2.5,
                         pass_label = pass_term,
                         fail_label = fail_term) {
  # A plain `if` looks at a single TRUE/FALSE value, so it fails on NA and
  # cannot classify a whole column; logical indexing handles every element.
  result <- rep(NA_character_, length(gpa))
  passed <- gpa >= threshold
  # which() drops NA comparisons, leaving those positions as NA.
  result[which(passed)] <- pass_label
  result[which(!passed)] <- fail_label
  result
}
# Task: use data manipulation techniques and filter rows based on any logical criteria that exist in your dataset. 
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ purrr     1.0.2
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# Filter rows on a logical criterion: keep only the students whose
# University_GPA is at least 2.5 (the pass mark used by pass_or_fail()).
education_passed <- education %>%
  filter(University_GPA >= 2.5)
# Classify the first student's University_GPA with the user-defined function.
pass_or_fail(education$University_GPA[1])
## [1] "Pass"
# Task: Identify the dependent & independent variables and use reshaping techniques and create a new data frame by joining those variables from your dataset.

library(dplyr)
library(tidyr)

# Keep the student identifier plus the three columns of interest.
df_selected <- select(
  education,
  Student_ID, University_GPA, Internships_Completed, Job_Offers
)

# Wide -> long: stack University_GPA and Internships_Completed into
# Variable/Value pairs (a layout suited to visualization and statistical
# modeling).
df_long <- pivot_longer(
  df_selected,
  cols = c(University_GPA, Internships_Completed),
  names_to = "Variable",
  values_to = "Value"
)

# Long -> wide: spread the Variable/Value pairs back into one column per
# variable (a layout suited to machine learning and regression models).
df_wide <- pivot_wider(
  df_long,
  names_from = Variable,
  values_from = Value
)
# Task: Remove missing values in your dataset
# Count the missing values in each column first.
education %>%
  is.na() %>%
  colSums()
##            Student_ID                   Age                Gender 
##                     0                     0                     0 
##       High_School_GPA             SAT_Score    University_Ranking 
##                     0                     0                     0 
##        University_GPA        Field_of_Study Internships_Completed 
##                     0                     0                     0 
##    Projects_Completed        Certifications     Soft_Skills_Score 
##                     0                     0                     0 
##      Networking_Score            Job_Offers       Starting_Salary 
##                     0                     0                     0 
##   Career_Satisfaction    Years_to_Promotion     Current_Job_Level 
##                     0                     0                     0 
##     Work_Life_Balance      Entrepreneurship 
##                     0                     0
# Drop every row that has a missing value in any column. Restricting drop_na()
# to University_GPA and Job_Offers would leave NAs in the other columns that
# later steps read (e.g. Starting_Salary, University_Ranking), and the
# all-column check below would not be a valid verification.
education_clean <- education %>% drop_na()
# Verify that no missing values remain in any column.
colSums(is.na(education_clean))
##            Student_ID                   Age                Gender 
##                     0                     0                     0 
##       High_School_GPA             SAT_Score    University_Ranking 
##                     0                     0                     0 
##        University_GPA        Field_of_Study Internships_Completed 
##                     0                     0                     0 
##    Projects_Completed        Certifications     Soft_Skills_Score 
##                     0                     0                     0 
##      Networking_Score            Job_Offers       Starting_Salary 
##                     0                     0                     0 
##   Career_Satisfaction    Years_to_Promotion     Current_Job_Level 
##                     0                     0                     0 
##     Work_Life_Balance      Entrepreneurship 
##                     0                     0
# Task: Identify and remove duplicated data from your dataset.
  # Count the rows that are exact repeats of an earlier row.
education %>%
  duplicated() %>%
  sum()
## [1] 0
  # Remove duplicated rows, keeping the first occurrence of each, so the
  # "remove" half of the task is actually carried out.
  education_unique <- education %>% distinct()
  # Display the rows that are flagged as duplicates.
  education[duplicated(education), ]
## # A tibble: 0 × 20
## # ℹ 20 variables: Student_ID <chr>, Age <dbl>, Gender <chr>,
## #   High_School_GPA <dbl>, SAT_Score <dbl>, University_Ranking <dbl>,
## #   University_GPA <dbl>, Field_of_Study <chr>, Internships_Completed <dbl>,
## #   Projects_Completed <dbl>, Certifications <dbl>, Soft_Skills_Score <dbl>,
## #   Networking_Score <dbl>, Job_Offers <dbl>, Starting_Salary <dbl>,
## #   Career_Satisfaction <dbl>, Years_to_Promotion <dbl>,
## #   Current_Job_Level <chr>, Work_Life_Balance <dbl>, Entrepreneurship <chr>
# Count the University_GPA values that repeat an earlier value
# (number of values minus number of distinct values).
# NOTE(review): different students can share a GPA, so this counts repeated
# values, not duplicated records.
  length(education$University_GPA) - length(unique(education$University_GPA))
## [1] 4799
# Task: Reorder rows in descending order on multiple columns
library(dplyr)

# Sort by University_GPA (highest first); ties are ordered by Job_Offers
# (highest first).
education_sorted <- arrange(
  education,
  desc(University_GPA),
  desc(Job_Offers)
)

# Preview the first rows of the sorted data.
head(education_sorted)
## # A tibble: 6 × 20
##   Student_ID   Age Gender High_School_GPA SAT_Score University_Ranking
##   <chr>      <dbl> <chr>            <dbl>     <dbl>              <dbl>
## 1 S00114        20 Male              2.9        963                819
## 2 S03117        20 Male              3.85      1243                281
## 3 S01553        26 Female            2.11      1322                322
## 4 S02276        23 Male              3.8       1238                644
## 5 S04363        24 Female            3.34      1117                644
## 6 S04670        18 Female            2.48      1051                244
## # ℹ 14 more variables: University_GPA <dbl>, Field_of_Study <chr>,
## #   Internships_Completed <dbl>, Projects_Completed <dbl>,
## #   Certifications <dbl>, Soft_Skills_Score <dbl>, Networking_Score <dbl>,
## #   Job_Offers <dbl>, Starting_Salary <dbl>, Career_Satisfaction <dbl>,
## #   Years_to_Promotion <dbl>, Current_Job_Level <chr>, Work_Life_Balance <dbl>,
## #   Entrepreneurship <chr>
# Task: Rename some of the columns in your dataset
library(dplyr)

# Shorten three column names (new_name = old_name).
education_renamed <- rename(
  education,
  GPA = University_GPA,
  Internships = Internships_Completed,
  Offers = Job_Offers
)

# Show the column names after renaming.
names(education_renamed)
##  [1] "Student_ID"          "Age"                 "Gender"             
##  [4] "High_School_GPA"     "SAT_Score"           "University_Ranking" 
##  [7] "GPA"                 "Field_of_Study"      "Internships"        
## [10] "Projects_Completed"  "Certifications"      "Soft_Skills_Score"  
## [13] "Networking_Score"    "Offers"              "Starting_Salary"    
## [16] "Career_Satisfaction" "Years_to_Promotion"  "Current_Job_Level"  
## [19] "Work_Life_Balance"   "Entrepreneurship"
# Task: Add a new variable to your data frame by using a mathematical function
# Double_Salary holds Starting_Salary multiplied by 2.
library(dplyr)
education_career_success <- mutate(
  education_clean,
  Double_Salary = Starting_Salary * 2
)
# Task: Create a training set using a random number generator engine.
library(dplyr)

# Fix the seed so every random draw below is reproducible.
set.seed(123)

# Demonstration draw: five distinct numbers from 1 to 10.
random_numbers <- sample(1:10, 5)
print(random_numbers)
## [1]  3 10  2  8  6

# Build the training set itself: randomly assign 80% of the cleaned rows to
# training and keep the remaining rows for testing. This is drawn after
# random_numbers so that draw (and its recorded output) is unchanged.
n_rows <- nrow(education_clean)
train_index <- sample.int(n_rows, size = floor(0.8 * n_rows))
# setdiff() rather than negative indexing: an empty train_index would make
# `-train_index` select no rows at all instead of all of them.
test_index <- setdiff(seq_len(n_rows), train_index)
training_set <- education_clean[train_index, ]
testing_set <- education_clean[test_index, ]
# Task: Print the summary statistics of your dataset.
summary(object = education)
##   Student_ID             Age           Gender          High_School_GPA
##  Length:5000        Min.   :18.00   Length:5000        Min.   :2.000  
##  Class :character   1st Qu.:20.00   Class :character   1st Qu.:2.500  
##  Mode  :character   Median :23.00   Mode  :character   Median :2.990  
##                     Mean   :23.44                      Mean   :2.997  
##                     3rd Qu.:26.00                      3rd Qu.:3.500  
##                     Max.   :29.00                      Max.   :4.000  
##    SAT_Score    University_Ranking University_GPA Field_of_Study    
##  Min.   : 900   Min.   :   1.0     Min.   :2.00   Length:5000       
##  1st Qu.:1076   1st Qu.: 256.0     1st Qu.:2.52   Class :character  
##  Median :1257   Median : 501.5     Median :3.03   Mode  :character  
##  Mean   :1254   Mean   : 504.3     Mean   :3.02                     
##  3rd Qu.:1432   3rd Qu.: 759.0     3rd Qu.:3.51                     
##  Max.   :1600   Max.   :1000.0     Max.   :4.00                     
##  Internships_Completed Projects_Completed Certifications  Soft_Skills_Score
##  Min.   :0.000         Min.   :0.000      Min.   :0.000   Min.   : 1.000   
##  1st Qu.:1.000         1st Qu.:2.000      1st Qu.:1.000   1st Qu.: 3.000   
##  Median :2.000         Median :5.000      Median :3.000   Median : 6.000   
##  Mean   :1.982         Mean   :4.563      Mean   :2.512   Mean   : 5.546   
##  3rd Qu.:3.000         3rd Qu.:7.000      3rd Qu.:4.000   3rd Qu.: 8.000   
##  Max.   :4.000         Max.   :9.000      Max.   :5.000   Max.   :10.000   
##  Networking_Score   Job_Offers    Starting_Salary  Career_Satisfaction
##  Min.   : 1.000   Min.   :0.000   Min.   : 25000   Min.   : 1.000     
##  1st Qu.: 3.000   1st Qu.:1.000   1st Qu.: 40200   1st Qu.: 3.000     
##  Median : 6.000   Median :2.000   Median : 50300   Median : 6.000     
##  Mean   : 5.538   Mean   :2.489   Mean   : 50564   Mean   : 5.578     
##  3rd Qu.: 8.000   3rd Qu.:4.000   3rd Qu.: 60500   3rd Qu.: 8.000     
##  Max.   :10.000   Max.   :5.000   Max.   :101000   Max.   :10.000     
##  Years_to_Promotion Current_Job_Level  Work_Life_Balance Entrepreneurship  
##  Min.   :1.000      Length:5000        Min.   : 1.000    Length:5000       
##  1st Qu.:2.000      Class :character   1st Qu.: 3.000    Class :character  
##  Median :3.000      Mode  :character   Median : 6.000    Mode  :character  
##  Mean   :3.016                         Mean   : 5.482                      
##  3rd Qu.:4.000                         3rd Qu.: 8.000                      
##  Max.   :5.000                         Max.   :10.000
# Use any of the numerical variables from the dataset and perform the following statistical functions
# Median: the middle value of the sorted data (mean of the two middle values
# when the count is even).
# NOTE(review): `gpa` is a hand-typed sample, not a column of `education`;
# confirm whether education$University_GPA was intended.
gpa <- c(3.5, 3.8, 4.0, 2.9, 3.6, 3.2, 3.9, 2.8)
median_gpa <- stats::median(x = gpa)
print(median_gpa)
## [1] 3.55
# Use any of the numerical variables from the dataset and perform the following statistical functions
# Mode: the value(s) that occur most often.
# NOTE(review): `data` is a hand-typed sample, not a column of `education`.
data <- c(1, 2, 2, 3, 3, 4)
# Tabulate once, then pick the names whose count equals the maximum. Names and
# mask must come from the same (unsorted) table: indexing frequency-sorted
# names with a mask built from the unsorted table misaligns them and reports
# 3 and 1 instead of the true modes.
value_counts <- table(data)
is_mode <- as.vector(value_counts == max(value_counts))
mode_data <- as.numeric(names(value_counts)[is_mode])
print(mode_data)
## [1] 2 3
# Use any of the numerical variables from the dataset and perform the following statistical functions
# Range: the smallest and largest value, in that order.
# NOTE(review): `gpa` is redefined here as a hand-typed five-value sample, not
# a column of `education`.
gpa <- c(3.5, 3.8, 4.0, 2.9, 3.6)
range_gpa <- c(min(gpa), max(gpa))
print(range_gpa)
## [1] 2.9 4.0
# Task: Plot a scatter plot for any 2 variables in your dataset.
library(ggplot2)
# University_Ranking on the x axis, Starting_Salary on the y axis.
# NOTE(review): both point layers draw the same data and differ only in
# colour, so every point is plotted twice; confirm the second layer is
# intended.
ggplot(education_clean, aes(x = University_Ranking, y = Starting_Salary)) +
  geom_point(size = 1, color = "purple", shape = 10, alpha = 0.3) +
  geom_point(size = 1, color = "orange", shape = 10, alpha = 0.3)

# Task: Plot a bar plot for any 2 variables in your dataset
library(ggplot2)
# One bar per Projects_Completed value.
# NOTE(review): only one variable is mapped; the task asks for two.
ggplot(education_clean, aes(Projects_Completed)) +
  geom_bar() +
  ggtitle("Projects Completed Frequency") +
  xlab("Projects Completed") +
  ylab("Frequency")

# Task: Find the correlation between any 2 variables by applying least square linear regression model.
library("knitr")

# Fit the least-squares line Starting_Salary ~ University_GPA; the task asks
# for a regression model, which cor() alone does not provide.
salary_gpa_model <- lm(Starting_Salary ~ University_GPA, data = education_clean)
# Coefficients and R-squared of the fitted line.
summary(salary_gpa_model)

# Pearson correlation coefficient between the same two variables.
Education_Career_Success_Coefficient <- cor(
  education_clean$Starting_Salary,
  education_clean$University_GPA,
  method = "pearson"
)

# Print the correlation coefficient as a table.
kable(head(Education_Career_Success_Coefficient))
# The recorded kable output is kept as comments: left bare, R would try to
# evaluate `x` as code.
## x
## 0.0010225