Exploratory Data Analysis
# Loading the packages being used
library(tidyverse)
## Warning: package 'ggplot2' was built under R version 4.4.3
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 4.0.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.1.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(corrplot)
## corrplot 0.95 loaded
# Read the raw education records from the CSV file in the working directory
df <- read.csv(file = "education.csv")
# Basic data review
# Preview the first six records
head(df, n = 6L)
## Student_ID Age Gender High_School_GPA SAT_Score University_GPA
## 1 S001 22 Male 3.8 1450 3.6
## 2 S002 24 Female 3.6 1380 3.4
## 3 S003 21 Male 3.9 1520 3.8
## 4 S004 23 Female 3.5 1300 3.2
## 5 S005 25 Male 3.7 1420 3.5
## 6 S006 22 Female 3.4 1250 3.1
## Field_of_Study Internships_Completed Projects_Completed Certifications
## 1 Computer Science 3 7 2
## 2 Business 2 5 3
## 3 Engineering 4 9 4
## 4 Psychology 1 3 1
## 5 Medicine 2 6 2
## 6 Education 1 4 1
## Soft_Skills_Score Networking_Score Job_Offers Starting_Salary
## 1 8 7 3 85000
## 2 7 6 2 65000
## 3 9 8 4 120000
## 4 6 5 1 48000
## 5 8 7 3 95000
## 6 7 6 1 42000
## Career_Satisfaction Years_to_Promotion Current_Job_Level Work_Life_Balance
## 1 8 2 Mid 7
## 2 7 3 Mid 6
## 3 9 1 Senior 6
## 4 6 4 Entry 8
## 5 8 2 Mid 5
## 6 7 5 Entry 9
## Entrepreneurship
## 1 No
## 2 No
## 3 No
## 4 No
## 5 No
## 6 No
# Show the column names, storage types and first few values of each column
str(object = df)
## 'data.frame': 400 obs. of 19 variables:
## $ Student_ID : chr "S001" "S002" "S003" "S004" ...
## $ Age : int 22 24 21 23 25 22 24 23 21 25 ...
## $ Gender : chr "Male" "Female" "Male" "Female" ...
## $ High_School_GPA : num 3.8 3.6 3.9 3.5 3.7 3.4 3.2 3.8 3.6 3.9 ...
## $ SAT_Score : int 1450 1380 1520 1300 1420 1250 1180 1480 1350 1550 ...
## $ University_GPA : num 3.6 3.4 3.8 3.2 3.5 3.1 2.9 3.7 3.3 3.9 ...
## $ Field_of_Study : chr "Computer Science" "Business" "Engineering" "Psychology" ...
## $ Internships_Completed: int 3 2 4 1 2 1 0 3 2 4 ...
## $ Projects_Completed : int 7 5 9 3 6 4 2 8 5 9 ...
## $ Certifications : int 2 3 4 1 2 1 0 3 2 5 ...
## $ Soft_Skills_Score : int 8 7 9 6 8 7 6 9 7 10 ...
## $ Networking_Score : int 7 6 8 5 7 6 4 8 7 9 ...
## $ Job_Offers : int 3 2 4 1 3 1 0 3 2 5 ...
## $ Starting_Salary : int 85000 65000 120000 48000 95000 42000 35000 110000 72000 140000 ...
## $ Career_Satisfaction : int 8 7 9 6 8 7 5 9 7 10 ...
## $ Years_to_Promotion : int 2 3 1 4 2 5 5 2 3 1 ...
## $ Current_Job_Level : chr "Mid" "Mid" "Senior" "Entry" ...
## $ Work_Life_Balance : int 7 6 6 8 5 9 7 6 6 5 ...
## $ Entrepreneurship : chr "No" "No" "No" "No" ...
# Per-column summary: quartiles and mean for numeric columns, length and
# class for character columns
df %>% summary()
## Student_ID Age Gender High_School_GPA
## Length:400 Min. :21 Length:400 Min. :3.200
## Class :character 1st Qu.:22 Class :character 1st Qu.:3.500
## Mode :character Median :23 Mode :character Median :3.600
## Mean :23 Mean :3.624
## 3rd Qu.:24 3rd Qu.:3.800
## Max. :25 Max. :3.900
## SAT_Score University_GPA Field_of_Study Internships_Completed
## Min. :1160 Min. :2.800 Length:400 Min. :0.0
## 1st Qu.:1300 1st Qu.:3.200 Class :character 1st Qu.:2.0
## Median :1380 Median :3.450 Mode :character Median :3.0
## Mean :1389 Mean :3.441 Mean :2.5
## 3rd Qu.:1480 3rd Qu.:3.700 3rd Qu.:3.0
## Max. :1580 Max. :4.000 Max. :4.0
## Projects_Completed Certifications Soft_Skills_Score Networking_Score
## Min. :2.000 Min. :0.000 Min. : 5.000 Min. :4.000
## 1st Qu.:5.000 1st Qu.:2.000 1st Qu.: 7.000 1st Qu.:6.000
## Median :6.000 Median :3.000 Median : 8.000 Median :7.000
## Mean :6.235 Mean :2.625 Mean : 7.933 Mean :6.933
## 3rd Qu.:8.000 3rd Qu.:3.000 3rd Qu.: 9.000 3rd Qu.:8.000
## Max. :9.000 Max. :5.000 Max. :10.000 Max. :9.000
## Job_Offers Starting_Salary Career_Satisfaction Years_to_Promotion
## Min. :0.00 Min. : 31000 Min. : 5.000 Min. :1.000
## 1st Qu.:2.00 1st Qu.: 68000 1st Qu.: 7.000 1st Qu.:2.000
## Median :3.00 Median : 84000 Median : 8.000 Median :3.000
## Mean :2.74 Mean : 87562 Mean : 7.793 Mean :2.915
## 3rd Qu.:4.00 3rd Qu.:105250 3rd Qu.: 9.000 3rd Qu.:4.000
## Max. :5.00 Max. :152000 Max. :10.000 Max. :5.000
## Current_Job_Level Work_Life_Balance Entrepreneurship
## Length:400 Min. :5.000 Length:400
## Class :character 1st Qu.:6.000 Class :character
## Mode :character Median :6.000 Mode :character
## Mean :6.412
## 3rd Qu.:7.000
## Max. :9.000
# Looking for missing data
# Flag every NA cell, then total the flags column by column
df %>%
  is.na() %>%
  colSums()
## Student_ID Age Gender
## 0 0 0
## High_School_GPA SAT_Score University_GPA
## 0 0 0
## Field_of_Study Internships_Completed Projects_Completed
## 0 0 0
## Certifications Soft_Skills_Score Networking_Score
## 0 0 0
## Job_Offers Starting_Salary Career_Satisfaction
## 0 0 0
## Years_to_Promotion Current_Job_Level Work_Life_Balance
## 0 0 0
## Entrepreneurship
## 0
# Distribution of the numeric variables: keep only the numeric columns
numeric_vars <- select(df, where(is.numeric))
# Draw one histogram per numeric column, titled and labelled with its name
iwalk(numeric_vars, function(values, var_name) {
  hist(
    values,
    col = "blue",
    xlab = var_name,
    main = paste("Histogram of", var_name)
  )
})














# Distribution of the categorical variables: character columns first, then
# any factor columns
categorical_vars <- select(df, where(is.character), where(is.factor))
# Draw one bar plot of value counts per categorical column
walk(names(categorical_vars), function(var_name) {
  level_counts <- table(categorical_vars[[var_name]])
  barplot(
    level_counts,
    col = "green",
    las = 2, # rotate axis labels so they are perpendicular to the axis
    main = paste("Barplot of", var_name)
  )
})





# Pairwise correlations between all numeric columns, computed on the rows
# that have no missing values
cor_matrix <- cor(x = numeric_vars, use = "complete.obs")
print(cor_matrix)
## Age High_School_GPA SAT_Score University_GPA
## Age 1.0000000 0.1855206 0.1774442 0.1754347
## High_School_GPA 0.1855206 1.0000000 0.9823962 0.9832059
## SAT_Score 0.1774442 0.9823962 1.0000000 0.9790502
## University_GPA 0.1754347 0.9832059 0.9790502 1.0000000
## Internships_Completed 0.2416667 0.9498906 0.9420335 0.9463967
## Projects_Completed 0.2046952 0.9597748 0.9541977 0.9604439
## Certifications 0.2215611 0.9368276 0.9355410 0.9381131
## Soft_Skills_Score 0.1750785 0.9654635 0.9641083 0.9704988
## Networking_Score 0.1705440 0.9667206 0.9641504 0.9703584
## Job_Offers 0.2389824 0.9663614 0.9656408 0.9653985
## Starting_Salary 0.2023658 0.9505302 0.9521848 0.9563597
## Career_Satisfaction 0.2449988 0.9638152 0.9673655 0.9669182
## Years_to_Promotion -0.1742754 -0.9606054 -0.9560272 -0.9625342
## Work_Life_Balance -0.1332548 -0.8045875 -0.8002966 -0.8295063
## Internships_Completed Projects_Completed Certifications
## Age 0.2416667 0.2046952 0.2215611
## High_School_GPA 0.9498906 0.9597748 0.9368276
## SAT_Score 0.9420335 0.9541977 0.9355410
## University_GPA 0.9463967 0.9604439 0.9381131
## Internships_Completed 1.0000000 0.9602065 0.9735683
## Projects_Completed 0.9602065 1.0000000 0.9441304
## Certifications 0.9735683 0.9441304 1.0000000
## Soft_Skills_Score 0.9346498 0.9426525 0.9351402
## Networking_Score 0.9355302 0.9429274 0.9354322
## Job_Offers 0.9663770 0.9666481 0.9710175
## Starting_Salary 0.9376562 0.9429756 0.9675312
## Career_Satisfaction 0.9619426 0.9641340 0.9614832
## Years_to_Promotion -0.9400309 -0.9345922 -0.9253936
## Work_Life_Balance -0.7982597 -0.7875148 -0.8194059
## Soft_Skills_Score Networking_Score Job_Offers
## Age 0.1750785 0.1705440 0.2389824
## High_School_GPA 0.9654635 0.9667206 0.9663614
## SAT_Score 0.9641083 0.9641504 0.9656408
## University_GPA 0.9704988 0.9703584 0.9653985
## Internships_Completed 0.9346498 0.9355302 0.9663770
## Projects_Completed 0.9426525 0.9429274 0.9666481
## Certifications 0.9351402 0.9354322 0.9710175
## Soft_Skills_Score 1.0000000 0.9985574 0.9567323
## Networking_Score 0.9985574 1.0000000 0.9567738
## Job_Offers 0.9567323 0.9567738 1.0000000
## Starting_Salary 0.9533963 0.9530115 0.9740865
## Career_Satisfaction 0.9674911 0.9674662 0.9866779
## Years_to_Promotion -0.9435139 -0.9436244 -0.9377211
## Work_Life_Balance -0.8262683 -0.8259259 -0.8215705
## Starting_Salary Career_Satisfaction Years_to_Promotion
## Age 0.2023658 0.2449988 -0.1742754
## High_School_GPA 0.9505302 0.9638152 -0.9606054
## SAT_Score 0.9521848 0.9673655 -0.9560272
## University_GPA 0.9563597 0.9669182 -0.9625342
## Internships_Completed 0.9376562 0.9619426 -0.9400309
## Projects_Completed 0.9429756 0.9641340 -0.9345922
## Certifications 0.9675312 0.9614832 -0.9253936
## Soft_Skills_Score 0.9533963 0.9674911 -0.9435139
## Networking_Score 0.9530115 0.9674662 -0.9436244
## Job_Offers 0.9740865 0.9866779 -0.9377211
## Starting_Salary 1.0000000 0.9671809 -0.9331127
## Career_Satisfaction 0.9671809 1.0000000 -0.9425471
## Years_to_Promotion -0.9331127 -0.9425471 1.0000000
## Work_Life_Balance -0.8525322 -0.8144879 0.8324602
## Work_Life_Balance
## Age -0.1332548
## High_School_GPA -0.8045875
## SAT_Score -0.8002966
## University_GPA -0.8295063
## Internships_Completed -0.7982597
## Projects_Completed -0.7875148
## Certifications -0.8194059
## Soft_Skills_Score -0.8262683
## Networking_Score -0.8259259
## Job_Offers -0.8215705
## Starting_Salary -0.8525322
## Career_Satisfaction -0.8144879
## Years_to_Promotion 0.8324602
## Work_Life_Balance 1.0000000
# Colour-coded heatmap of the correlation matrix (upper triangle only)
corrplot(
  corr = cor_matrix,
  type = "upper",
  method = "color",
  tl.col = "black", # label colour
  tl.cex = 0.7      # label text size
)

# Correlation table
# Creating a correlation matrix using numeric columns (the same column
# selection as numeric_vars, so this reproduces the matrix printed above)
numeric_df <- select(df, where(is.numeric))
cor_matrix <- cor(x = numeric_df, use = "complete.obs")
print(cor_matrix)
## Age High_School_GPA SAT_Score University_GPA
## Age 1.0000000 0.1855206 0.1774442 0.1754347
## High_School_GPA 0.1855206 1.0000000 0.9823962 0.9832059
## SAT_Score 0.1774442 0.9823962 1.0000000 0.9790502
## University_GPA 0.1754347 0.9832059 0.9790502 1.0000000
## Internships_Completed 0.2416667 0.9498906 0.9420335 0.9463967
## Projects_Completed 0.2046952 0.9597748 0.9541977 0.9604439
## Certifications 0.2215611 0.9368276 0.9355410 0.9381131
## Soft_Skills_Score 0.1750785 0.9654635 0.9641083 0.9704988
## Networking_Score 0.1705440 0.9667206 0.9641504 0.9703584
## Job_Offers 0.2389824 0.9663614 0.9656408 0.9653985
## Starting_Salary 0.2023658 0.9505302 0.9521848 0.9563597
## Career_Satisfaction 0.2449988 0.9638152 0.9673655 0.9669182
## Years_to_Promotion -0.1742754 -0.9606054 -0.9560272 -0.9625342
## Work_Life_Balance -0.1332548 -0.8045875 -0.8002966 -0.8295063
## Internships_Completed Projects_Completed Certifications
## Age 0.2416667 0.2046952 0.2215611
## High_School_GPA 0.9498906 0.9597748 0.9368276
## SAT_Score 0.9420335 0.9541977 0.9355410
## University_GPA 0.9463967 0.9604439 0.9381131
## Internships_Completed 1.0000000 0.9602065 0.9735683
## Projects_Completed 0.9602065 1.0000000 0.9441304
## Certifications 0.9735683 0.9441304 1.0000000
## Soft_Skills_Score 0.9346498 0.9426525 0.9351402
## Networking_Score 0.9355302 0.9429274 0.9354322
## Job_Offers 0.9663770 0.9666481 0.9710175
## Starting_Salary 0.9376562 0.9429756 0.9675312
## Career_Satisfaction 0.9619426 0.9641340 0.9614832
## Years_to_Promotion -0.9400309 -0.9345922 -0.9253936
## Work_Life_Balance -0.7982597 -0.7875148 -0.8194059
## Soft_Skills_Score Networking_Score Job_Offers
## Age 0.1750785 0.1705440 0.2389824
## High_School_GPA 0.9654635 0.9667206 0.9663614
## SAT_Score 0.9641083 0.9641504 0.9656408
## University_GPA 0.9704988 0.9703584 0.9653985
## Internships_Completed 0.9346498 0.9355302 0.9663770
## Projects_Completed 0.9426525 0.9429274 0.9666481
## Certifications 0.9351402 0.9354322 0.9710175
## Soft_Skills_Score 1.0000000 0.9985574 0.9567323
## Networking_Score 0.9985574 1.0000000 0.9567738
## Job_Offers 0.9567323 0.9567738 1.0000000
## Starting_Salary 0.9533963 0.9530115 0.9740865
## Career_Satisfaction 0.9674911 0.9674662 0.9866779
## Years_to_Promotion -0.9435139 -0.9436244 -0.9377211
## Work_Life_Balance -0.8262683 -0.8259259 -0.8215705
## Starting_Salary Career_Satisfaction Years_to_Promotion
## Age 0.2023658 0.2449988 -0.1742754
## High_School_GPA 0.9505302 0.9638152 -0.9606054
## SAT_Score 0.9521848 0.9673655 -0.9560272
## University_GPA 0.9563597 0.9669182 -0.9625342
## Internships_Completed 0.9376562 0.9619426 -0.9400309
## Projects_Completed 0.9429756 0.9641340 -0.9345922
## Certifications 0.9675312 0.9614832 -0.9253936
## Soft_Skills_Score 0.9533963 0.9674911 -0.9435139
## Networking_Score 0.9530115 0.9674662 -0.9436244
## Job_Offers 0.9740865 0.9866779 -0.9377211
## Starting_Salary 1.0000000 0.9671809 -0.9331127
## Career_Satisfaction 0.9671809 1.0000000 -0.9425471
## Years_to_Promotion -0.9331127 -0.9425471 1.0000000
## Work_Life_Balance -0.8525322 -0.8144879 0.8324602
## Work_Life_Balance
## Age -0.1332548
## High_School_GPA -0.8045875
## SAT_Score -0.8002966
## University_GPA -0.8295063
## Internships_Completed -0.7982597
## Projects_Completed -0.7875148
## Certifications -0.8194059
## Soft_Skills_Score -0.8262683
## Networking_Score -0.8259259
## Job_Offers -0.8215705
## Starting_Salary -0.8525322
## Career_Satisfaction -0.8144879
## Years_to_Promotion 0.8324602
## Work_Life_Balance 1.0000000
# Checking for outliers
#' Count IQR-rule outliers in a numeric vector
#'
#' A value is counted as an outlier when it lies more than `k` interquartile
#' ranges below the first quartile or above the third quartile.
#'
#' @param x A numeric vector. Missing values are ignored both when computing
#'   the quartiles and when counting.
#' @param k A single non-negative number: the fence multiplier applied to the
#'   interquartile range. Defaults to `1.5`.
#' @return A single integer, the number of elements of `x` outside the fences.
check_outliers_iqr <- function(x, k = 1.5) {
  stopifnot(is.numeric(k), length(k) == 1L, !is.na(k), k >= 0)

  # Both quartiles in one call; names are dropped because only values are used
  quartiles <- quantile(x, probs = c(0.25, 0.75), na.rm = TRUE, names = FALSE)
  fence_width <- k * (quartiles[2] - quartiles[1])
  lower <- quartiles[1] - fence_width
  upper <- quartiles[2] + fence_width

  sum(x < lower | x > upper, na.rm = TRUE)
}
# Outlier count per numeric column. vapply() pins the result to one integer
# per column, so the result is always a named integer vector (sapply() would
# silently return an empty list if there were no numeric columns).
iqr_outliers <- vapply(numeric_vars, check_outliers_iqr, integer(1))
iqr_outliers
## Age High_School_GPA SAT_Score
## 0 0 0
## University_GPA Internships_Completed Projects_Completed
## 0 10 0
## Certifications Soft_Skills_Score Networking_Score
## 60 0 0
## Job_Offers Starting_Salary Career_Satisfaction
## 0 0 0
## Years_to_Promotion Work_Life_Balance
## 0 1
# Visualizing outliers: one box plot per numeric column
for (idx in seq_along(numeric_vars)) {
  boxplot(
    numeric_vars[[idx]],
    col = "purple",
    main = paste("Boxplot for", names(numeric_vars)[idx])
  )
}














Data Cleaning
# New dataframe for cleaning: start from the raw data, remove exact duplicate
# rows, then drop every row that contains a missing value in any column
df_clean <- drop_na(distinct(df))
# Report how many rows survived de-duplication and NA removal
print(sprintf("Rows remaining: %d", nrow(df_clean)))
## [1] "Rows remaining: 400"
# Standardize text: trim surrounding whitespace and apply title case to the
# three free-text category columns so equivalent labels compare equal
# Then convert every remaining character column to a factor
df_clean <- df_clean %>%
  mutate(
    across(
      c(Gender, Field_of_Study, Current_Job_Level),
      function(txt) str_to_title(str_trim(txt))
    )
  ) %>%
  mutate(across(where(is.character), as.factor))
# Compare row counts before and after cleaning to confirm no data was lost
print(sprintf("Original size: %d", nrow(df)))
## [1] "Original size: 400"
print(sprintf("Cleaned size: %d", nrow(df_clean)))
## [1] "Cleaned size: 400"
# Preview the first six rows of the cleaned data
head(df_clean, n = 6L)
## Student_ID Age Gender High_School_GPA SAT_Score University_GPA
## 1 S001 22 Male 3.8 1450 3.6
## 2 S002 24 Female 3.6 1380 3.4
## 3 S003 21 Male 3.9 1520 3.8
## 4 S004 23 Female 3.5 1300 3.2
## 5 S005 25 Male 3.7 1420 3.5
## 6 S006 22 Female 3.4 1250 3.1
## Field_of_Study Internships_Completed Projects_Completed Certifications
## 1 Computer Science 3 7 2
## 2 Business 2 5 3
## 3 Engineering 4 9 4
## 4 Psychology 1 3 1
## 5 Medicine 2 6 2
## 6 Education 1 4 1
## Soft_Skills_Score Networking_Score Job_Offers Starting_Salary
## 1 8 7 3 85000
## 2 7 6 2 65000
## 3 9 8 4 120000
## 4 6 5 1 48000
## 5 8 7 3 95000
## 6 7 6 1 42000
## Career_Satisfaction Years_to_Promotion Current_Job_Level Work_Life_Balance
## 1 8 2 Mid 7
## 2 7 3 Mid 6
## 3 9 1 Senior 6
## 4 6 4 Entry 8
## 5 8 2 Mid 5
## 6 7 5 Entry 9
## Entrepreneurship
## 1 No
## 2 No
## 3 No
## 4 No
## 5 No
## 6 No
Correlation Analysis
# Relationships between the numeric variables of the cleaned data
numeric_clean <- select(df_clean, where(is.numeric))
# Pairwise correlations over rows without missing values
cor_matrix_clean <- cor(x = numeric_clean, use = "complete.obs")
# Plot the upper triangle of the matrix to assess correlation strength
corrplot(
  corr = cor_matrix_clean,
  type = "upper",
  method = "circle",
  title = "Correlation Matrix (Cleaned Data)",
  mar = c(0, 0, 1, 0), # leave room at the top for the title
  tl.col = "black",
  tl.cex = 0.7         # text size for labels
)

Multiple Regression
# Build the modelling table: drop 'Student_ID' (an identifier, not a
# predictor) and 'Entrepreneurship' (noted as containing only "No" values,
# so it carries no information for the model)
regression_data <- select(df_clean, -c(Student_ID, Entrepreneurship))
# Linear model predicting 'Starting_Salary' from every remaining column
model <- lm(Starting_Salary ~ ., data = regression_data)
# Coefficient table, residual summary and fit statistics
summary(model)
##
## Call:
## lm(formula = Starting_Salary ~ ., data = regression_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -14867.5 -2131.1 500.8 2714.5 11487.6
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -54132.132 22620.611 -2.393 0.017203 *
## Age -192.139 173.337 -1.108 0.268373
## GenderMale 2798.936 536.037 5.222 2.95e-07 ***
## High_School_GPA 14515.761 7798.634 1.861 0.063485 .
## SAT_Score -8.104 12.420 -0.652 0.514501
## University_GPA 17624.337 4952.313 3.559 0.000421 ***
## Field_of_StudyBusiness -2063.666 1165.348 -1.771 0.077401 .
## Field_of_StudyComputer Science -1510.870 1465.025 -1.031 0.303072
## Field_of_StudyEducation -10046.886 2516.937 -3.992 7.90e-05 ***
## Field_of_StudyEngineering 492.392 1268.139 0.388 0.698031
## Field_of_StudyFinance -3162.269 2205.429 -1.434 0.152451
## Field_of_StudyLaw -382.910 1269.608 -0.302 0.763127
## Field_of_StudyMarketing -4597.972 1252.977 -3.670 0.000278 ***
## Field_of_StudyMedicine 2459.117 1460.547 1.684 0.093077 .
## Field_of_StudyNursing -5390.689 2571.705 -2.096 0.036742 *
## Field_of_StudyPsychology -2495.910 1083.455 -2.304 0.021791 *
## Internships_Completed -7034.511 1701.710 -4.134 4.41e-05 ***
## Projects_Completed 2013.939 560.517 3.593 0.000371 ***
## Certifications 7123.502 1373.055 5.188 3.50e-07 ***
## Soft_Skills_Score 149.387 3006.388 0.050 0.960396
## Networking_Score 663.752 3030.437 0.219 0.826748
## Job_Offers 4917.857 1343.152 3.661 0.000287 ***
## Career_Satisfaction 3382.041 1191.763 2.838 0.004790 **
## Years_to_Promotion -1551.687 684.536 -2.267 0.023976 *
## Current_Job_LevelMid -1616.182 1434.958 -1.126 0.260766
## Current_Job_LevelSenior 6549.151 2554.189 2.564 0.010736 *
## Work_Life_Balance -1659.997 646.090 -2.569 0.010578 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3922 on 373 degrees of freedom
## Multiple R-squared: 0.9837, Adjusted R-squared: 0.9825
## F-statistic: 864.8 on 26 and 373 DF, p-value: < 2.2e-16
Model Diagnostics
# Create diagnostic plots to check model reliability.
# The plots are laid out on a 2 x 2 grid; par() returns the previous
# settings, which are restored afterwards so that later plots in the same
# session are not squeezed into the grid.
old_par <- par(mfrow = c(2, 2))
plot(model)
par(old_par)

Model Evaluation
# Extract the plain and adjusted R-squared from the model summary
r_squared <- summary(model)[["r.squared"]]
adj_r_squared <- summary(model)[["adj.r.squared"]]
cat("R-squared:", round(r_squared, digits = 4), "\n", sep = " ")
## R-squared: 0.9837
cat("Adjusted R-squared:", round(adj_r_squared, digits = 4), "\n", sep = " ")
## Adjusted R-squared: 0.9825
writeLines("")
# Predictions and residuals, computed on the same data the model was fitted
# on (so the error metrics below are in-sample)
predictions <- predict(model, newdata = regression_data)
actual <- regression_data[["Starting_Salary"]]
residuals <- actual - predictions
# RMSE (Root Mean Squared Error): square root of the mean squared residual
rmse <- (residuals^2) %>%
  mean() %>%
  sqrt()
cat("RMSE:", round(rmse, digits = 2), "\n", sep = " ")
## RMSE: 3787.36
# MAE (Mean Absolute Error): mean size of the residuals, in salary units
mae <- residuals %>%
  abs() %>%
  mean()
cat("MAE:", round(mae, digits = 2), "\n", sep = " ")
## MAE: 2842.17
writeLines("")
# MAPE (Mean Absolute Percentage Error): mean residual size relative to the
# actual value, as a percentage (not finite if any actual value is 0)
mape <- 100 * mean(abs(residuals / actual))
cat("MAPE:", round(mape, digits = 2), "%\n", sep = " ")
## MAPE: 3.76 %
# Scatter the residuals against the fitted values
plot(
  x = predictions,
  y = residuals,
  pch = 16,
  col = "blue",
  xlab = "Fitted Values",
  ylab = "Residuals",
  main = "Residuals vs Fitted Values"
)
# Dashed reference line at zero residual
abline(h = 0, lty = 2, lwd = 2, col = "red")

# Distribution of the residuals, using roughly 30 bins
hist(
  x = residuals,
  breaks = 30,
  col = "lightblue",
  xlab = "Residuals",
  main = "Distribution of Residuals"
)

# Plain-language recap of the fit statistics computed above
writeLines("Model Performance Summary")
## Model Performance Summary
cat(
  "The model explains",
  round(adj_r_squared * 100, digits = 2),
  "% of variance in Starting Salary\n",
  sep = " "
)
## The model explains 98.25 % of variance in Starting Salary
cat(
  "Average prediction error (MAE):",
  round(mae, digits = 2),
  "dollars\n",
  sep = " "
)
## Average prediction error (MAE): 2842.17 dollars
cat(
  "Typical prediction error (RMSE):",
  round(rmse, digits = 2),
  "dollars\n",
  sep = " "
)
## Typical prediction error (RMSE): 3787.36 dollars
cat("Average percentage error:", round(mape, digits = 2), "%\n", sep = " ")
## Average percentage error: 3.76 %