Exploratory Data Analysis

# Loading the packages being used 
library(tidyverse)

## Warning: package 'ggplot2' was built under R version 4.4.3

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   4.0.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.1.0     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(corrplot)

## corrplot 0.95 loaded

# Read the raw data set from the working directory
df <- read.csv(file = "education.csv")

# Basic data review

# Preview the first rows to check that the columns parsed as expected
df %>% head()

##   Student_ID Age Gender High_School_GPA SAT_Score University_GPA
## 1       S001  22   Male             3.8      1450            3.6
## 2       S002  24 Female             3.6      1380            3.4
## 3       S003  21   Male             3.9      1520            3.8
## 4       S004  23 Female             3.5      1300            3.2
## 5       S005  25   Male             3.7      1420            3.5
## 6       S006  22 Female             3.4      1250            3.1
##     Field_of_Study Internships_Completed Projects_Completed Certifications
## 1 Computer Science                     3                  7              2
## 2         Business                     2                  5              3
## 3      Engineering                     4                  9              4
## 4       Psychology                     1                  3              1
## 5         Medicine                     2                  6              2
## 6        Education                     1                  4              1
##   Soft_Skills_Score Networking_Score Job_Offers Starting_Salary
## 1                 8                7          3           85000
## 2                 7                6          2           65000
## 3                 9                8          4          120000
## 4                 6                5          1           48000
## 5                 8                7          3           95000
## 6                 7                6          1           42000
##   Career_Satisfaction Years_to_Promotion Current_Job_Level Work_Life_Balance
## 1                   8                  2               Mid                 7
## 2                   7                  3               Mid                 6
## 3                   9                  1            Senior                 6
## 4                   6                  4             Entry                 8
## 5                   8                  2               Mid                 5
## 6                   7                  5             Entry                 9
##   Entrepreneurship
## 1               No
## 2               No
## 3               No
## 4               No
## 5               No
## 6               No

# Inspect column types and a few sample values for every variable
df %>% str()

## 'data.frame':    400 obs. of  19 variables:
##  $ Student_ID           : chr  "S001" "S002" "S003" "S004" ...
##  $ Age                  : int  22 24 21 23 25 22 24 23 21 25 ...
##  $ Gender               : chr  "Male" "Female" "Male" "Female" ...
##  $ High_School_GPA      : num  3.8 3.6 3.9 3.5 3.7 3.4 3.2 3.8 3.6 3.9 ...
##  $ SAT_Score            : int  1450 1380 1520 1300 1420 1250 1180 1480 1350 1550 ...
##  $ University_GPA       : num  3.6 3.4 3.8 3.2 3.5 3.1 2.9 3.7 3.3 3.9 ...
##  $ Field_of_Study       : chr  "Computer Science" "Business" "Engineering" "Psychology" ...
##  $ Internships_Completed: int  3 2 4 1 2 1 0 3 2 4 ...
##  $ Projects_Completed   : int  7 5 9 3 6 4 2 8 5 9 ...
##  $ Certifications       : int  2 3 4 1 2 1 0 3 2 5 ...
##  $ Soft_Skills_Score    : int  8 7 9 6 8 7 6 9 7 10 ...
##  $ Networking_Score     : int  7 6 8 5 7 6 4 8 7 9 ...
##  $ Job_Offers           : int  3 2 4 1 3 1 0 3 2 5 ...
##  $ Starting_Salary      : int  85000 65000 120000 48000 95000 42000 35000 110000 72000 140000 ...
##  $ Career_Satisfaction  : int  8 7 9 6 8 7 5 9 7 10 ...
##  $ Years_to_Promotion   : int  2 3 1 4 2 5 5 2 3 1 ...
##  $ Current_Job_Level    : chr  "Mid" "Mid" "Senior" "Entry" ...
##  $ Work_Life_Balance    : int  7 6 6 8 5 9 7 6 6 5 ...
##  $ Entrepreneurship     : chr  "No" "No" "No" "No" ...

# Five-number summary and mean for numeric columns; length/class for text columns
df %>% summary()

##   Student_ID             Age        Gender          High_School_GPA
##  Length:400         Min.   :21   Length:400         Min.   :3.200  
##  Class :character   1st Qu.:22   Class :character   1st Qu.:3.500  
##  Mode  :character   Median :23   Mode  :character   Median :3.600  
##                     Mean   :23                      Mean   :3.624  
##                     3rd Qu.:24                      3rd Qu.:3.800  
##                     Max.   :25                      Max.   :3.900  
##    SAT_Score    University_GPA  Field_of_Study     Internships_Completed
##  Min.   :1160   Min.   :2.800   Length:400         Min.   :0.0          
##  1st Qu.:1300   1st Qu.:3.200   Class :character   1st Qu.:2.0          
##  Median :1380   Median :3.450   Mode  :character   Median :3.0          
##  Mean   :1389   Mean   :3.441                      Mean   :2.5          
##  3rd Qu.:1480   3rd Qu.:3.700                      3rd Qu.:3.0          
##  Max.   :1580   Max.   :4.000                      Max.   :4.0          
##  Projects_Completed Certifications  Soft_Skills_Score Networking_Score
##  Min.   :2.000      Min.   :0.000   Min.   : 5.000    Min.   :4.000   
##  1st Qu.:5.000      1st Qu.:2.000   1st Qu.: 7.000    1st Qu.:6.000   
##  Median :6.000      Median :3.000   Median : 8.000    Median :7.000   
##  Mean   :6.235      Mean   :2.625   Mean   : 7.933    Mean   :6.933   
##  3rd Qu.:8.000      3rd Qu.:3.000   3rd Qu.: 9.000    3rd Qu.:8.000   
##  Max.   :9.000      Max.   :5.000   Max.   :10.000    Max.   :9.000   
##    Job_Offers   Starting_Salary  Career_Satisfaction Years_to_Promotion
##  Min.   :0.00   Min.   : 31000   Min.   : 5.000      Min.   :1.000     
##  1st Qu.:2.00   1st Qu.: 68000   1st Qu.: 7.000      1st Qu.:2.000     
##  Median :3.00   Median : 84000   Median : 8.000      Median :3.000     
##  Mean   :2.74   Mean   : 87562   Mean   : 7.793      Mean   :2.915     
##  3rd Qu.:4.00   3rd Qu.:105250   3rd Qu.: 9.000      3rd Qu.:4.000     
##  Max.   :5.00   Max.   :152000   Max.   :10.000      Max.   :5.000     
##  Current_Job_Level  Work_Life_Balance Entrepreneurship  
##  Length:400         Min.   :5.000     Length:400        
##  Class :character   1st Qu.:6.000     Class :character  
##  Mode  :character   Median :6.000     Mode  :character  
##                     Mean   :6.412                       
##                     3rd Qu.:7.000                       
##                     Max.   :9.000

# Looking for missing data

# Flag every NA cell, then total the flags column by column
df %>%
  is.na() %>%
  colSums()

##            Student_ID                   Age                Gender 
##                     0                     0                     0 
##       High_School_GPA             SAT_Score        University_GPA 
##                     0                     0                     0 
##        Field_of_Study Internships_Completed    Projects_Completed 
##                     0                     0                     0 
##        Certifications     Soft_Skills_Score      Networking_Score 
##                     0                     0                     0 
##            Job_Offers       Starting_Salary   Career_Satisfaction 
##                     0                     0                     0 
##    Years_to_Promotion     Current_Job_Level     Work_Life_Balance 
##                     0                     0                     0 
##      Entrepreneurship 
##                     0

# Keep only the numeric columns for the distribution plots
numeric_vars <- select(df, where(is.numeric))

# Draw one histogram per numeric variable, titled and labelled with its name
iwalk(numeric_vars, \(values, var_name) {
  hist(values,
       main = paste("Histogram of", var_name),
       xlab = var_name,
       col = "blue")
})

# Getting a distribution of the categorical variables
categorical_vars <- df %>% select(where(is.character), where(is.factor))

# Bar plots of each categorical variable
for (v in names(categorical_vars)) {
  values <- categorical_vars[[v]]

  # Skip identifier-like columns in which no value repeats (presumably
  # Student_ID here): the plot would be one bar of height 1 per row and
  # carries no distributional information.
  if (length(values) > 1 && anyDuplicated(values) == 0) {
    next
  }

  barplot(table(values),
          main = paste("Barplot of", v),
          las = 2,
          col = "green")
}

# Pairwise Pearson correlations between the numeric variables, computed on
# rows that have no missing value in any of them
cor_matrix <- numeric_vars %>%
  cor(use = "complete.obs")
cor_matrix

##                              Age High_School_GPA  SAT_Score University_GPA
## Age                    1.0000000       0.1855206  0.1774442      0.1754347
## High_School_GPA        0.1855206       1.0000000  0.9823962      0.9832059
## SAT_Score              0.1774442       0.9823962  1.0000000      0.9790502
## University_GPA         0.1754347       0.9832059  0.9790502      1.0000000
## Internships_Completed  0.2416667       0.9498906  0.9420335      0.9463967
## Projects_Completed     0.2046952       0.9597748  0.9541977      0.9604439
## Certifications         0.2215611       0.9368276  0.9355410      0.9381131
## Soft_Skills_Score      0.1750785       0.9654635  0.9641083      0.9704988
## Networking_Score       0.1705440       0.9667206  0.9641504      0.9703584
## Job_Offers             0.2389824       0.9663614  0.9656408      0.9653985
## Starting_Salary        0.2023658       0.9505302  0.9521848      0.9563597
## Career_Satisfaction    0.2449988       0.9638152  0.9673655      0.9669182
## Years_to_Promotion    -0.1742754      -0.9606054 -0.9560272     -0.9625342
## Work_Life_Balance     -0.1332548      -0.8045875 -0.8002966     -0.8295063
##                       Internships_Completed Projects_Completed Certifications
## Age                               0.2416667          0.2046952      0.2215611
## High_School_GPA                   0.9498906          0.9597748      0.9368276
## SAT_Score                         0.9420335          0.9541977      0.9355410
## University_GPA                    0.9463967          0.9604439      0.9381131
## Internships_Completed             1.0000000          0.9602065      0.9735683
## Projects_Completed                0.9602065          1.0000000      0.9441304
## Certifications                    0.9735683          0.9441304      1.0000000
## Soft_Skills_Score                 0.9346498          0.9426525      0.9351402
## Networking_Score                  0.9355302          0.9429274      0.9354322
## Job_Offers                        0.9663770          0.9666481      0.9710175
## Starting_Salary                   0.9376562          0.9429756      0.9675312
## Career_Satisfaction               0.9619426          0.9641340      0.9614832
## Years_to_Promotion               -0.9400309         -0.9345922     -0.9253936
## Work_Life_Balance                -0.7982597         -0.7875148     -0.8194059
##                       Soft_Skills_Score Networking_Score Job_Offers
## Age                           0.1750785        0.1705440  0.2389824
## High_School_GPA               0.9654635        0.9667206  0.9663614
## SAT_Score                     0.9641083        0.9641504  0.9656408
## University_GPA                0.9704988        0.9703584  0.9653985
## Internships_Completed         0.9346498        0.9355302  0.9663770
## Projects_Completed            0.9426525        0.9429274  0.9666481
## Certifications                0.9351402        0.9354322  0.9710175
## Soft_Skills_Score             1.0000000        0.9985574  0.9567323
## Networking_Score              0.9985574        1.0000000  0.9567738
## Job_Offers                    0.9567323        0.9567738  1.0000000
## Starting_Salary               0.9533963        0.9530115  0.9740865
## Career_Satisfaction           0.9674911        0.9674662  0.9866779
## Years_to_Promotion           -0.9435139       -0.9436244 -0.9377211
## Work_Life_Balance            -0.8262683       -0.8259259 -0.8215705
##                       Starting_Salary Career_Satisfaction Years_to_Promotion
## Age                         0.2023658           0.2449988         -0.1742754
## High_School_GPA             0.9505302           0.9638152         -0.9606054
## SAT_Score                   0.9521848           0.9673655         -0.9560272
## University_GPA              0.9563597           0.9669182         -0.9625342
## Internships_Completed       0.9376562           0.9619426         -0.9400309
## Projects_Completed          0.9429756           0.9641340         -0.9345922
## Certifications              0.9675312           0.9614832         -0.9253936
## Soft_Skills_Score           0.9533963           0.9674911         -0.9435139
## Networking_Score            0.9530115           0.9674662         -0.9436244
## Job_Offers                  0.9740865           0.9866779         -0.9377211
## Starting_Salary             1.0000000           0.9671809         -0.9331127
## Career_Satisfaction         0.9671809           1.0000000         -0.9425471
## Years_to_Promotion         -0.9331127          -0.9425471          1.0000000
## Work_Life_Balance          -0.8525322          -0.8144879          0.8324602
##                       Work_Life_Balance
## Age                          -0.1332548
## High_School_GPA              -0.8045875
## SAT_Score                    -0.8002966
## University_GPA               -0.8295063
## Internships_Completed        -0.7982597
## Projects_Completed           -0.7875148
## Certifications               -0.8194059
## Soft_Skills_Score            -0.8262683
## Networking_Score             -0.8259259
## Job_Offers                   -0.8215705
## Starting_Salary              -0.8525322
## Career_Satisfaction          -0.8144879
## Years_to_Promotion            0.8324602
## Work_Life_Balance             1.0000000

# Heatmap of correlations: colored tiles, upper triangle only
corrplot(
  cor_matrix,
  type = "upper",
  method = "color",
  tl.col = "black", # label color
  tl.cex = 0.7      # label text size
)

# Correlation table
# Creating a correlation matrix using the numeric columns
# NOTE(review): this recomputes the same matrix already stored in
# `cor_matrix` from `numeric_vars` earlier in the script — confirm whether
# the repetition is intentional.
numeric_df <- select(df, where(is.numeric))
cor_matrix <- numeric_df %>%
  cor(use = "complete.obs")
cor_matrix

##                              Age High_School_GPA  SAT_Score University_GPA
## Age                    1.0000000       0.1855206  0.1774442      0.1754347
## High_School_GPA        0.1855206       1.0000000  0.9823962      0.9832059
## SAT_Score              0.1774442       0.9823962  1.0000000      0.9790502
## University_GPA         0.1754347       0.9832059  0.9790502      1.0000000
## Internships_Completed  0.2416667       0.9498906  0.9420335      0.9463967
## Projects_Completed     0.2046952       0.9597748  0.9541977      0.9604439
## Certifications         0.2215611       0.9368276  0.9355410      0.9381131
## Soft_Skills_Score      0.1750785       0.9654635  0.9641083      0.9704988
## Networking_Score       0.1705440       0.9667206  0.9641504      0.9703584
## Job_Offers             0.2389824       0.9663614  0.9656408      0.9653985
## Starting_Salary        0.2023658       0.9505302  0.9521848      0.9563597
## Career_Satisfaction    0.2449988       0.9638152  0.9673655      0.9669182
## Years_to_Promotion    -0.1742754      -0.9606054 -0.9560272     -0.9625342
## Work_Life_Balance     -0.1332548      -0.8045875 -0.8002966     -0.8295063
##                       Internships_Completed Projects_Completed Certifications
## Age                               0.2416667          0.2046952      0.2215611
## High_School_GPA                   0.9498906          0.9597748      0.9368276
## SAT_Score                         0.9420335          0.9541977      0.9355410
## University_GPA                    0.9463967          0.9604439      0.9381131
## Internships_Completed             1.0000000          0.9602065      0.9735683
## Projects_Completed                0.9602065          1.0000000      0.9441304
## Certifications                    0.9735683          0.9441304      1.0000000
## Soft_Skills_Score                 0.9346498          0.9426525      0.9351402
## Networking_Score                  0.9355302          0.9429274      0.9354322
## Job_Offers                        0.9663770          0.9666481      0.9710175
## Starting_Salary                   0.9376562          0.9429756      0.9675312
## Career_Satisfaction               0.9619426          0.9641340      0.9614832
## Years_to_Promotion               -0.9400309         -0.9345922     -0.9253936
## Work_Life_Balance                -0.7982597         -0.7875148     -0.8194059
##                       Soft_Skills_Score Networking_Score Job_Offers
## Age                           0.1750785        0.1705440  0.2389824
## High_School_GPA               0.9654635        0.9667206  0.9663614
## SAT_Score                     0.9641083        0.9641504  0.9656408
## University_GPA                0.9704988        0.9703584  0.9653985
## Internships_Completed         0.9346498        0.9355302  0.9663770
## Projects_Completed            0.9426525        0.9429274  0.9666481
## Certifications                0.9351402        0.9354322  0.9710175
## Soft_Skills_Score             1.0000000        0.9985574  0.9567323
## Networking_Score              0.9985574        1.0000000  0.9567738
## Job_Offers                    0.9567323        0.9567738  1.0000000
## Starting_Salary               0.9533963        0.9530115  0.9740865
## Career_Satisfaction           0.9674911        0.9674662  0.9866779
## Years_to_Promotion           -0.9435139       -0.9436244 -0.9377211
## Work_Life_Balance            -0.8262683       -0.8259259 -0.8215705
##                       Starting_Salary Career_Satisfaction Years_to_Promotion
## Age                         0.2023658           0.2449988         -0.1742754
## High_School_GPA             0.9505302           0.9638152         -0.9606054
## SAT_Score                   0.9521848           0.9673655         -0.9560272
## University_GPA              0.9563597           0.9669182         -0.9625342
## Internships_Completed       0.9376562           0.9619426         -0.9400309
## Projects_Completed          0.9429756           0.9641340         -0.9345922
## Certifications              0.9675312           0.9614832         -0.9253936
## Soft_Skills_Score           0.9533963           0.9674911         -0.9435139
## Networking_Score            0.9530115           0.9674662         -0.9436244
## Job_Offers                  0.9740865           0.9866779         -0.9377211
## Starting_Salary             1.0000000           0.9671809         -0.9331127
## Career_Satisfaction         0.9671809           1.0000000         -0.9425471
## Years_to_Promotion         -0.9331127          -0.9425471          1.0000000
## Work_Life_Balance          -0.8525322          -0.8144879          0.8324602
##                       Work_Life_Balance
## Age                          -0.1332548
## High_School_GPA              -0.8045875
## SAT_Score                    -0.8002966
## University_GPA               -0.8295063
## Internships_Completed        -0.7982597
## Projects_Completed           -0.7875148
## Certifications               -0.8194059
## Soft_Skills_Score            -0.8262683
## Networking_Score             -0.8259259
## Job_Offers                   -0.8215705
## Starting_Salary              -0.8525322
## Career_Satisfaction          -0.8144879
## Years_to_Promotion            0.8324602
## Work_Life_Balance             1.0000000

# Checking for outliers
#' Count IQR-rule outliers in a numeric vector
#'
#' A value is counted as an outlier when it lies below `Q1 - k * IQR` or
#' above `Q3 + k * IQR`, where Q1 and Q3 are the first and third quartiles.
#' Missing values are ignored both when computing the quartiles and when
#' counting.
#'
#' @param x A numeric vector.
#' @param k Fence multiplier applied to the IQR. Defaults to 1.5, the value
#'   that was previously hard-coded.
#' @return An integer: the number of elements of `x` outside the fences.
check_outliers_iqr <- function(x, k = 1.5) {
  # Both quartiles in one call; names = FALSE drops the "25%"/"75%" labels
  quartiles <- quantile(x, probs = c(0.25, 0.75), na.rm = TRUE, names = FALSE)
  fence_width <- k * (quartiles[2] - quartiles[1])

  lower <- quartiles[1] - fence_width
  upper <- quartiles[2] + fence_width

  sum(x < lower | x > upper, na.rm = TRUE)
}

# Count IQR outliers in every numeric column. vapply() guarantees a named
# integer vector with exactly one count per column; sapply() would silently
# return a list if there were no columns or a call returned something else.
iqr_outliers <- vapply(numeric_vars, check_outliers_iqr, integer(1))
iqr_outliers

##                   Age       High_School_GPA             SAT_Score 
##                     0                     0                     0 
##        University_GPA Internships_Completed    Projects_Completed 
##                     0                    10                     0 
##        Certifications     Soft_Skills_Score      Networking_Score 
##                    60                     0                     0 
##            Job_Offers       Starting_Salary   Career_Satisfaction 
##                     0                     0                     0 
##    Years_to_Promotion     Work_Life_Balance 
##                     0                     1

# One boxplot per numeric variable so points beyond the whiskers are visible
for (var_name in colnames(numeric_vars)) {
  column_values <- numeric_vars[[var_name]]
  boxplot(
    column_values,
    main = paste("Boxplot for", var_name),
    col = "purple"
  )
}

Data Cleaning

# Build the cleaned data in a new object so the raw `df` stays untouched:
# drop exact duplicate rows, then drop any row with a missing value
df_clean <- df %>%
  distinct() %>%
  drop_na()

# Report how many rows survived both filters
rows_remaining <- nrow(df_clean)
print(paste("Rows remaining:", rows_remaining))

## [1] "Rows remaining: 400"

# Trim surrounding whitespace and apply title case to the three category
# columns, then store every character column as a factor (this also converts
# Student_ID and Entrepreneurship, which are not trimmed or re-cased)
df_clean <- df_clean %>%
  mutate(
    across(
      c(Gender, Field_of_Study, Current_Job_Level),
      \(x) str_to_title(str_trim(x))
    )
  ) %>%
  mutate(across(where(is.character), as.factor))

# Compare row counts before and after cleaning to see whether rows were lost
original_rows <- nrow(df)
print(paste("Original size:", original_rows))

## [1] "Original size: 400"

cleaned_rows <- nrow(df_clean)
print(paste("Cleaned size:", cleaned_rows))

## [1] "Cleaned size: 400"

# Preview the first rows of the cleaned data
df_clean %>% head()

##   Student_ID Age Gender High_School_GPA SAT_Score University_GPA
## 1       S001  22   Male             3.8      1450            3.6
## 2       S002  24 Female             3.6      1380            3.4
## 3       S003  21   Male             3.9      1520            3.8
## 4       S004  23 Female             3.5      1300            3.2
## 5       S005  25   Male             3.7      1420            3.5
## 6       S006  22 Female             3.4      1250            3.1
##     Field_of_Study Internships_Completed Projects_Completed Certifications
## 1 Computer Science                     3                  7              2
## 2         Business                     2                  5              3
## 3      Engineering                     4                  9              4
## 4       Psychology                     1                  3              1
## 5         Medicine                     2                  6              2
## 6        Education                     1                  4              1
##   Soft_Skills_Score Networking_Score Job_Offers Starting_Salary
## 1                 8                7          3           85000
## 2                 7                6          2           65000
## 3                 9                8          4          120000
## 4                 6                5          1           48000
## 5                 8                7          3           95000
## 6                 7                6          1           42000
##   Career_Satisfaction Years_to_Promotion Current_Job_Level Work_Life_Balance
## 1                   8                  2               Mid                 7
## 2                   7                  3               Mid                 6
## 3                   9                  1            Senior                 6
## 4                   6                  4             Entry                 8
## 5                   8                  2               Mid                 5
## 6                   7                  5             Entry                 9
##   Entrepreneurship
## 1               No
## 2               No
## 3               No
## 4               No
## 5               No
## 6               No

Correlation Analysis

# Numeric columns of the cleaned data
numeric_clean <- select(df_clean, where(is.numeric))

# Pairwise correlations on rows without missing values
cor_matrix_clean <- numeric_clean %>%
  cor(use = "complete.obs")

# Plot the upper triangle as circles to assess correlation strength
corrplot(
  cor_matrix_clean,
  type = "upper",
  method = "circle",
  tl.col = "black",
  tl.cex = 0.7,       # Text size for labels
  mar = c(0, 0, 1, 0), # Top margin of 1 so the title has room
  title = "Correlation Matrix (Cleaned Data)"
)

Multiple Regression

# Drop 'Student_ID' (an identifier, not a predictor) and 'Entrepreneurship'
# (noted as containing only "No" values) before modelling
regression_data <- df_clean %>%
  select(-c(Student_ID, Entrepreneurship))

# NOTE(review): the correlation matrix printed earlier shows most numeric
# predictors correlated above 0.9 with one another (Soft_Skills_Score vs
# Networking_Score is 0.9986), so individual coefficients and p-values are
# likely unstable — consider checking variance inflation factors.
# NOTE(review): Job_Offers, Career_Satisfaction, Years_to_Promotion,
# Current_Job_Level and Work_Life_Balance look like outcomes measured after
# the starting salary was set — confirm they are legitimate predictors.

# Fit a linear model of 'Starting_Salary' on every remaining column
model <- lm(Starting_Salary ~ ., data = regression_data)

# Print coefficients, standard errors and overall fit statistics
summary(model)

## 
## Call:
## lm(formula = Starting_Salary ~ ., data = regression_data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -14867.5  -2131.1    500.8   2714.5  11487.6 
## 
## Coefficients:
##                                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                    -54132.132  22620.611  -2.393 0.017203 *  
## Age                              -192.139    173.337  -1.108 0.268373    
## GenderMale                       2798.936    536.037   5.222 2.95e-07 ***
## High_School_GPA                 14515.761   7798.634   1.861 0.063485 .  
## SAT_Score                          -8.104     12.420  -0.652 0.514501    
## University_GPA                  17624.337   4952.313   3.559 0.000421 ***
## Field_of_StudyBusiness          -2063.666   1165.348  -1.771 0.077401 .  
## Field_of_StudyComputer Science  -1510.870   1465.025  -1.031 0.303072    
## Field_of_StudyEducation        -10046.886   2516.937  -3.992 7.90e-05 ***
## Field_of_StudyEngineering         492.392   1268.139   0.388 0.698031    
## Field_of_StudyFinance           -3162.269   2205.429  -1.434 0.152451    
## Field_of_StudyLaw                -382.910   1269.608  -0.302 0.763127    
## Field_of_StudyMarketing         -4597.972   1252.977  -3.670 0.000278 ***
## Field_of_StudyMedicine           2459.117   1460.547   1.684 0.093077 .  
## Field_of_StudyNursing           -5390.689   2571.705  -2.096 0.036742 *  
## Field_of_StudyPsychology        -2495.910   1083.455  -2.304 0.021791 *  
## Internships_Completed           -7034.511   1701.710  -4.134 4.41e-05 ***
## Projects_Completed               2013.939    560.517   3.593 0.000371 ***
## Certifications                   7123.502   1373.055   5.188 3.50e-07 ***
## Soft_Skills_Score                 149.387   3006.388   0.050 0.960396    
## Networking_Score                  663.752   3030.437   0.219 0.826748    
## Job_Offers                       4917.857   1343.152   3.661 0.000287 ***
## Career_Satisfaction              3382.041   1191.763   2.838 0.004790 ** 
## Years_to_Promotion              -1551.687    684.536  -2.267 0.023976 *  
## Current_Job_LevelMid            -1616.182   1434.958  -1.126 0.260766    
## Current_Job_LevelSenior          6549.151   2554.189   2.564 0.010736 *  
## Work_Life_Balance               -1659.997    646.090  -2.569 0.010578 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3922 on 373 degrees of freedom
## Multiple R-squared:  0.9837, Adjusted R-squared:  0.9825 
## F-statistic: 864.8 on 26 and 373 DF,  p-value: < 2.2e-16

Model Diagnostics

# Create diagnostic plots to check model reliability.
# plot(model) draws several diagnostic plots; arrange them in a 2 x 2 grid,
# then restore the previous graphics settings so that later plots drawn on
# the same device are not squeezed into a quarter of the panel.
old_par <- par(mfrow = c(2, 2))
plot(model)
par(old_par)

Model Evaluation

# Pull both R-squared figures from a single summary() call
model_summary <- summary(model)
r_squared <- model_summary$r.squared
adj_r_squared <- model_summary$adj.r.squared

cat("R-squared:", round(r_squared, 4), "\n")

## R-squared: 0.9837

cat("Adjusted R-squared:", round(adj_r_squared, 4), "\n")

## Adjusted R-squared: 0.9825

cat("\n")

# Observed salaries and the model's predictions for the same rows it was
# fitted on, so every error figure below is an in-sample error.
# NOTE(review): consider a hold-out set or cross-validation for an honest
# estimate. `residuals` also shares its name with stats::residuals().
actual <- regression_data[["Starting_Salary"]]
predictions <- predict(model, newdata = regression_data)
residuals <- actual - predictions

# RMSE (Root Mean Squared Error): square root of the mean squared residual
squared_errors <- residuals^2
rmse <- sqrt(mean(squared_errors))
cat("RMSE:", round(rmse, 2), "\n")

## RMSE: 3787.36

# MAE (Mean Absolute Error): mean size of the residuals, ignoring sign
absolute_errors <- abs(residuals)
mae <- mean(absolute_errors)
cat("MAE:", round(mae, 2), "\n")

## MAE: 2842.17

cat("\n")

# MAPE (Mean Absolute Percentage Error): mean absolute residual as a share
# of the observed salary, expressed in percent
relative_errors <- abs(residuals / actual)
mape <- 100 * mean(relative_errors)
cat("MAPE:", round(mape, 2), "%\n")

## MAPE: 3.76 %

# Scatter the residuals against the fitted values
plot(
  x = predictions,
  y = residuals,
  pch = 16,
  col = "blue",
  xlab = "Fitted Values",
  ylab = "Residuals",
  main = "Residuals vs Fitted Values"
)
# Dashed red reference line at zero residual
abline(h = 0, lty = 2, lwd = 2, col = "red")

# Histogram showing how the residuals are distributed
hist(
  x = residuals,
  breaks = 30,
  col = "lightblue",
  xlab = "Residuals",
  main = "Distribution of Residuals"
)

# Header for the plain-text performance summary
cat("Model Performance Summary\n")

## Model Performance Summary

# NOTE(review): this sentence reports the adjusted R-squared; the plain share
# of variance explained is `r_squared` — confirm which figure is intended.
pct_variance <- round(adj_r_squared * 100, 2)
cat("The model explains", pct_variance, "% of variance in Starting Salary\n")

## The model explains 98.25 % of variance in Starting Salary

mae_rounded <- round(mae, 2)
cat("Average prediction error (MAE):", mae_rounded, "dollars\n")

## Average prediction error (MAE): 2842.17 dollars

rmse_rounded <- round(rmse, 2)
cat("Typical prediction error (RMSE):", rmse_rounded, "dollars\n")

## Typical prediction error (RMSE): 3787.36 dollars

mape_rounded <- round(mape, 2)
cat("Average percentage error:", mape_rounded, "%\n")

## Average percentage error: 3.76 %

Final Project

Ursula Podosenin, Stefan Huber, Ali Ahmed

2025-12-03

Exploratory Data Analysis

Data Cleaning

Correlation Analysis

Multiple Regression

Model Diagnostics

Model Evaluation