# import Data
student_behavior <- read.csv("Downloads/Student Attitude and Behavior (Final).csv")
head(student_behavior)
## Student_ID Gender Height_cm Weight_kg Eleventh_Grade_GPA Twelfth_Grade_GPA
## 1 1 Male 100 58 3.224490 2.723404
## 2 2 Female 90 40 2.857143 3.404255
## 3 3 Male 159 78 2.897959 2.595745
## 4 4 Female 147 20 2.857143 2.510638
## 5 5 Male 170 54 1.632653 2.765957
## 6 6 Female 139 33 3.673469 3.191489
## College_GPA Certification_Course Hobbies Daily_Study_Time_mins
## 1 3.20 No Video Games 188.01559
## 2 2.80 No Cinema 136.77480
## 3 2.20 Yes Cinema 124.78446
## 4 2.32 Yes Reading books 11.09966
## 5 1.20 No Video Games 30.00000
## 6 2.80 Yes Cinema 64.15472
## Preferred_Study_Time Salary_Expectation Likes_Degree Social_Media_Use_mins
## 1 Morning 40000 No 95
## 2 Morning 15000 Yes 65
## 3 No preference 13000 Yes 140
## 4 No preference 1500000 No 90
## 5 Morning 50000 Yes 115
## 6 Night 20000 Yes 40
## Commute_Time_mins Stress_Level Financial_Status Part_Time_Job Average_GPA
## 1 13.84624 2 2 No 3.049298
## 2 49.56766 2 2 No 3.020466
## 3 22.67896 1 2 No 2.564568
## 4 88.31212 2 3 No 2.562594
## 5 145.19791 3 3 No 1.866204
## 6 75.79959 2 3 No 3.221653
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
glimpse(student_behavior)
## Rows: 235
## Columns: 19
## $ Student_ID <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 1…
## $ Gender <chr> "Male", "Female", "Male", "Female", "Male", "Fem…
## $ Height_cm <dbl> 100, 90, 159, 147, 170, 139, 165, 152, 190, 150,…
## $ Weight_kg <dbl> 58, 40, 78, 20, 54, 33, 50, 43, 85, 84, 50, 51, …
## $ Eleventh_Grade_GPA <dbl> 3.224490, 2.857143, 2.897959, 2.857143, 1.632653…
## $ Twelfth_Grade_GPA <dbl> 2.723404, 3.404255, 2.595745, 2.510638, 2.765957…
## $ College_GPA <dbl> 3.20, 2.80, 2.20, 2.32, 1.20, 2.80, 0.12, 3.00, …
## $ Certification_Course <chr> "No", "No", "Yes", "Yes", "No", "Yes", "Yes", "N…
## $ Hobbies <chr> "Video Games", "Cinema", "Cinema", "Reading book…
## $ Daily_Study_Time_mins <dbl> 188.01559, 136.77480, 124.78446, 11.09966, 30.00…
## $ Preferred_Study_Time <chr> "Morning", "Morning", "No preference", "No prefe…
## $ Salary_Expectation <int> 40000, 15000, 13000, 1500000, 50000, 20000, 1500…
## $ Likes_Degree <chr> "No", "Yes", "Yes", "No", "Yes", "Yes", "Yes", "…
## $ Social_Media_Use_mins <int> 95, 65, 140, 90, 115, 40, 90, 75, 110, 45, 80, 9…
## $ Commute_Time_mins <dbl> 13.846239, 49.567665, 22.678964, 88.312123, 145.…
## $ Stress_Level <int> 2, 2, 1, 2, 3, 2, 3, 3, 2, 3, 3, 2, 2, 3, 3, 3, …
## $ Financial_Status <int> 2, 2, 2, 3, 3, 3, 3, 3, 2, 3, 2, 2, 3, 1, 3, 2, …
## $ Part_Time_Job <chr> "No", "No", "No", "No", "No", "No", "No", "No", …
## $ Average_GPA <dbl> 3.049298, 3.020466, 2.564568, 2.562594, 1.866204…
colnames(student_behavior)
## [1] "Student_ID" "Gender" "Height_cm"
## [4] "Weight_kg" "Eleventh_Grade_GPA" "Twelfth_Grade_GPA"
## [7] "College_GPA" "Certification_Course" "Hobbies"
## [10] "Daily_Study_Time_mins" "Preferred_Study_Time" "Salary_Expectation"
## [13] "Likes_Degree" "Social_Media_Use_mins" "Commute_Time_mins"
## [16] "Stress_Level" "Financial_Status" "Part_Time_Job"
## [19] "Average_GPA"
essentials = student_behavior %>%
select(Twelfth_Grade_GPA:Financial_Status, Certification_Course:Commute_Time_mins, Eleventh_Grade_GPA)
head(essentials)
## Twelfth_Grade_GPA College_GPA Certification_Course Hobbies
## 1 2.723404 3.20 No Video Games
## 2 3.404255 2.80 No Cinema
## 3 2.595745 2.20 Yes Cinema
## 4 2.510638 2.32 Yes Reading books
## 5 2.765957 1.20 No Video Games
## 6 3.191489 2.80 Yes Cinema
## Daily_Study_Time_mins Preferred_Study_Time Salary_Expectation Likes_Degree
## 1 188.01559 Morning 40000 No
## 2 136.77480 Morning 15000 Yes
## 3 124.78446 No preference 13000 Yes
## 4 11.09966 No preference 1500000 No
## 5 30.00000 Morning 50000 Yes
## 6 64.15472 Night 20000 Yes
## Social_Media_Use_mins Commute_Time_mins Stress_Level Financial_Status
## 1 95 13.84624 2 2
## 2 65 49.56766 2 2
## 3 140 22.67896 1 2
## 4 90 88.31212 2 3
## 5 115 145.19791 3 3
## 6 40 75.79959 2 3
## Eleventh_Grade_GPA
## 1 3.224490
## 2 2.857143
## 3 2.897959
## 4 2.857143
## 5 1.632653
## 6 3.673469
assessment_df = essentials %>% select(Eleventh_Grade_GPA, Twelfth_Grade_GPA, College_GPA, Salary_Expectation, Financial_Status, Social_Media_Use_mins, Daily_Study_Time_mins, Commute_Time_mins, Likes_Degree)
head(assessment_df)
## Eleventh_Grade_GPA Twelfth_Grade_GPA College_GPA Salary_Expectation
## 1 3.224490 2.723404 3.20 40000
## 2 2.857143 3.404255 2.80 15000
## 3 2.897959 2.595745 2.20 13000
## 4 2.857143 2.510638 2.32 1500000
## 5 1.632653 2.765957 1.20 50000
## 6 3.673469 3.191489 2.80 20000
## Financial_Status Social_Media_Use_mins Daily_Study_Time_mins
## 1 2 95 188.01559
## 2 2 65 136.77480
## 3 2 140 124.78446
## 4 3 90 11.09966
## 5 3 115 30.00000
## 6 3 40 64.15472
## Commute_Time_mins Likes_Degree
## 1 13.84624 No
## 2 49.56766 Yes
## 3 22.67896 Yes
## 4 88.31212 No
## 5 145.19791 Yes
## 6 75.79959 Yes
library(patchwork)
library(ggplot2)
# Ensure `df` is a data frame
density_plot <- assessment_df %>% ggplot( aes(x = Salary_Expectation)) +
geom_density(fill = "lightblue")
# Render the plot
density_plot
# Values are dramatically different :
v = c(mean(assessment_df$Salary_Expectation), median(assessment_df$Salary_Expectation))
names(v) = c("mean", "median")
v = c(v, v["mean"] - v["median"])
names(v) = c("mean", "median", "difference")
v
## mean median difference
## 32481.68 20000.00 12481.68
# Load necessary library
library(ggplot2)
# Create a box plot
assessment_df %>% ggplot(aes(y = Salary_Expectation)) +
geom_boxplot(fill = "lightblue") +
labs(title = "Box Plot of Salary Expectation", y = "Salary Expectation") +
theme_minimal()
Take out Outliers :
quartiles <- quantile(assessment_df$Salary_Expectation, probs = c(0.25, 0.5, 0.75), na.rm = TRUE)
quartiles
## 25% 50% 75%
## 15000 20000 25000
above_threshold = quartiles["75%"] + IQR(assessment_df$Salary_Expectation)*1.5
# You are an outlier if you make above : $40,000s!
above_threshold
## 75%
## 40000
above_outlier = assessment_df %>% filter(Salary_Expectation >= above_threshold); head(above_outlier)
## Eleventh_Grade_GPA Twelfth_Grade_GPA College_GPA Salary_Expectation
## 1 3.224490 2.723404 3.20 40000
## 2 2.857143 2.510638 2.32 1500000
## 3 1.632653 2.765957 1.20 50000
## 4 3.653061 2.965957 2.80 60000
## 5 3.224490 2.765957 3.00 40000
## 6 2.204082 2.170213 2.64 50000
## Financial_Status Social_Media_Use_mins Daily_Study_Time_mins
## 1 2 95 188.01559
## 2 3 90 11.09966
## 3 3 115 30.00000
## 4 3 0 182.05841
## 5 2 175 152.04155
## 6 3 0 135.63961
## Commute_Time_mins Likes_Degree
## 1 13.84624 No
## 2 88.31212 No
## 3 145.19791 Yes
## 4 10.40952 Yes
## 5 36.56810 Yes
## 6 63.03645 Yes
typical = assessment_df %>% filter(Salary_Expectation < above_threshold); head(typical)
## Eleventh_Grade_GPA Twelfth_Grade_GPA College_GPA Salary_Expectation
## 1 2.857143 3.404255 2.80 15000
## 2 2.897959 2.595745 2.20 13000
## 3 3.673469 3.191489 2.80 20000
## 4 2.857143 2.680851 0.12 15000
## 5 2.514286 2.624255 3.00 25000
## 6 3.600000 2.872340 2.40 20000
## Financial_Status Social_Media_Use_mins Daily_Study_Time_mins
## 1 2 65 136.77480
## 2 2 140 124.78446
## 3 3 40 64.15472
## 4 3 90 25.89300
## 5 3 75 172.69474
## 6 2 110 127.67657
## Commute_Time_mins Likes_Degree
## 1 49.56766 Yes
## 2 22.67896 Yes
## 3 75.79959 Yes
## 4 115.63578 Yes
## 5 22.13955 Yes
## 6 42.43369 Yes
# % contribution to observations : nrow()
cat(round(nrow(above_outlier)/nrow(assessment_df), 2),
round(1-nrow(above_outlier)/nrow(assessment_df), 2))
## 0.1 0.9
# Create a box plot
typical %>% ggplot(aes(y = Salary_Expectation)) +
geom_boxplot(fill = "lightblue") +
labs(title = "Box Plot of Salary Expectation", y = "Salary Expectation") +
theme_minimal()
# Compute density : typical person
density_data <- density(typical$Salary_Expectation)
# Function to identify local maxima
is_local_max <- function(y) {
diff_sign <- diff(sign(diff(y)))
c(FALSE, diff_sign == -2, FALSE)
}
# Apply function to density values
local_max_indices <- is_local_max(density_data$y)
# X and Y coordinates of local maxima
local_max_x <- density_data$x[local_max_indices]
local_max_y <- density_data$y[local_max_indices]
# Data frame of local maxima
peaks <- data.frame(x = local_max_x, y = local_max_y)
library(ggplot2)
# Compute density
density_data <- density(typical$Salary_Expectation)
# Function to identify local maxima
is_local_max <- function(y) {
diff_sign <- diff(sign(diff(y)))
c(FALSE, diff_sign == -2, FALSE)
}
# Apply function to density values
local_max_indices <- is_local_max(density_data$y)
# X and Y coordinates of local maxima
local_max_x <- density_data$x[local_max_indices]
local_max_y <- density_data$y[local_max_indices]
# Data frame of local maxima
peaks <- data.frame(x = local_max_x, y = local_max_y)
# Create density plot and annotate peaks
ggplot(data = typical, aes(x = Salary_Expectation)) +
geom_density(fill = "blue", alpha = 0.4) + # Density plot
geom_point(data = peaks, aes(x = x, y = y), color = "red", size = 3) + # Add points at peaks
geom_text(data = peaks, aes(x = x, y = y, label = paste0("(", round(x, 1), ", ", round(y, 3), ")")),
vjust = -1, hjust = 0.5, color = "black", size = 4) + # Add labels at peaks
labs(title = "Salary Expectations Density Plot",
x = "Salary Expectations",
y = "Density") +
theme_minimal() # Clean theme
There must clearly be subgroups, it appears that therea are 2 major subgroups and 6 others. Although Values appear to be low.
Most people have $15-20k expected salary
Y values dont mean much, its the integral which indicates the probability given range.
# Calculate density
density_data <- density(assessment_df$Salary_Expectation)
# Define a function for the density
density_function <- approxfun(density_data$x, density_data$y)
# Integrate the density function from 15,000 to 20,000
result_1 <- integrate(density_function, lower = 10000, upper = 25005)
# Print the result
result_1
## 0.6658827 with absolute error < 1e-04
result_2 <- integrate(density_function, lower = 10000, upper = 30010)
result_2
## 0.7463738 with absolute error < 0.00012
summary(assessment_df)
## Eleventh_Grade_GPA Twelfth_Grade_GPA College_GPA Salary_Expectation
## Min. :0.302 Min. :1.915 Min. :0.040 Min. : 0
## 1st Qu.:2.857 1st Qu.:2.553 1st Qu.:2.400 1st Qu.: 15000
## Median :3.265 Median :2.936 Median :2.800 Median : 20000
## Mean :3.137 Mean :2.927 Mean :2.826 Mean : 32482
## 3rd Qu.:3.520 3rd Qu.:3.234 3rd Qu.:3.200 3rd Qu.: 25000
## Max. :4.000 Max. :4.000 Max. :4.000 Max. :1500000
## Financial_Status Social_Media_Use_mins Daily_Study_Time_mins Commute_Time_mins
## Min. :1.000 Min. : 0.00 Min. : 0.00 Min. : 0.00
## 1st Qu.:2.000 1st Qu.: 35.00 1st Qu.: 91.49 1st Qu.: 20.57
## Median :3.000 Median : 60.00 Median :132.58 Median : 40.17
## Mean :2.523 Mean : 66.04 Mean :123.44 Mean : 46.06
## 3rd Qu.:3.000 3rd Qu.: 90.00 3rd Qu.:163.94 3rd Qu.: 65.44
## Max. :4.000 Max. :180.00 Max. :299.12 Max. :177.92
## Likes_Degree
## Length:235
## Class :character
## Mode :character
##
##
##
model = lm(Salary_Expectation ~ Twelfth_Grade_GPA + College_GPA, data = assessment_df)
model
##
## Call:
## lm(formula = Salary_Expectation ~ Twelfth_Grade_GPA + College_GPA,
## data = assessment_df)
##
## Coefficients:
## (Intercept) Twelfth_Grade_GPA College_GPA
## 108644 -12124 -14392
# Load required libraries
library(plotly)
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
# Create a grid of predictor values
twelfth_gpa <- seq(min(assessment_df$Twelfth_Grade_GPA), max(assessment_df$Twelfth_Grade_GPA), length = 200)
college_gpa <- seq(min(assessment_df$College_GPA), max(assessment_df$College_GPA), length = 200)
# Expand grid to create combinations of twelfth and college GPA
grid <- expand.grid(Twelfth_Grade_GPA = twelfth_gpa, College_GPA = college_gpa)
# Predict Salary_Expectation based on the grid
grid$Predicted_Salary <- predict(model, newdata = grid)
# Convert grid to 3D plot-friendly frmat
fig <- plot_ly(x = ~grid$Twelfth_Grade_GPA,
y = ~grid$College_GPA,
z = ~grid$Predicted_Salary,
type = "mesh3d")
# Add labels and title
fig <- fig %>%
layout(scene = list(
xaxis = list(title = 'Twelfth Grade GPA'),
yaxis = list(title = 'College GPA'),
zaxis = list(title = 'Salary Expectation')
))
# Show the plot
fig
# Load ggplot2 for visualization
library(ggplot2)
# Create the histogram
ggplot(typical, aes(x = Salary_Expectation, fill = as.factor(Financial_Status))) +
geom_histogram(binwidth = 5000, position = "dodge", alpha = 0.7) +
scale_fill_brewer(palette = "Set2", name = "Financial Status") +
labs(
title = "Distribution of Salary Expectation by Financial Status",
x = "Salary Expectation",
y = "Count"
) +
theme_minimal()
# Calculate percentages for each financial status in a single operation
percentages <- typical %>%
group_by(Financial_Status) %>%
summarise(Percentage = round(n() / nrow(typical) * 100, 2)) %>%
pull(Percentage)
# Print the result
status_SampleSize <- c(percentages)
names(status_SampleSize) = c("Lvl 1", "Lvl 2", "Lvl 3", "Lvl 4")
status_SampleSize
## Lvl 1 Lvl 2 Lvl 3 Lvl 4
## 5.21 36.97 55.92 1.90
status_SampleSize[2] + status_SampleSize[3]
## Lvl 2
## 92.89
no_outliers = assessment_df %>% filter(Salary_Expectation < 99000)
max(no_outliers$Salary_Expectation)
## [1] 60000
snipped_model = lm(Salary_Expectation ~ Twelfth_Grade_GPA + College_GPA, data = no_outliers)
snipped_model
##
## Call:
## lm(formula = Salary_Expectation ~ Twelfth_Grade_GPA + College_GPA,
## data = no_outliers)
##
## Coefficients:
## (Intercept) Twelfth_Grade_GPA College_GPA
## 23881 -2953 1610
# Load required libraries
library(plotly)
# Create a grid of predictor values
twelfth_gpa <- seq(min(no_outliers$Twelfth_Grade_GPA), max(no_outliers$Twelfth_Grade_GPA), length = 200)
college_gpa <- seq(min(no_outliers$College_GPA), max(no_outliers$College_GPA), length = 200)
# Expand grid to create combinations of Twelfth and College GPA
grid <- expand.grid(Twelfth_Grade_GPA = twelfth_gpa, College_GPA = college_gpa)
# Predict Salary_Expectation based on the grid
grid$Predicted_Salary <- predict(snipped_model, newdata = grid)
# Create the 3D plot with the regression plane
fig <- plot_ly()
# Add the regression plane
fig <- fig %>%
add_trace(
x = ~grid$Twelfth_Grade_GPA,
y = ~grid$College_GPA,
z = ~grid$Predicted_Salary,
type = "mesh3d",
opacity = 0.5,
name = "Regression Plane"
)
# Add the actual data points
fig <- fig %>%
add_trace(
x = ~no_outliers$Twelfth_Grade_GPA,
y = ~no_outliers$College_GPA,
z = ~no_outliers$Salary_Expectation,
type = "scatter3d",
mode = "markers",
marker = list(size = 4, color = "red"),
name = "Data Points"
)
# Add labels and title
fig <- fig %>%
layout(
scene = list(
xaxis = list(title = 'Twelfth Grade GPA'),
yaxis = list(title = 'College GPA'),
zaxis = list(title = 'Salary Expectation')
),
title = "3D Plane with Data Points"
)
# Show the plot
fig
model
##
## Call:
## lm(formula = Salary_Expectation ~ Twelfth_Grade_GPA + College_GPA,
## data = assessment_df)
##
## Coefficients:
## (Intercept) Twelfth_Grade_GPA College_GPA
## 108644 -12124 -14392
snipped_model
##
## Call:
## lm(formula = Salary_Expectation ~ Twelfth_Grade_GPA + College_GPA,
## data = no_outliers)
##
## Coefficients:
## (Intercept) Twelfth_Grade_GPA College_GPA
## 23881 -2953 1610
plot_3d_plane <- function(data, model, grid_size = 200) {
# Load required library
library(plotly)
# Extract predictors from the model formula
predictors <- all.vars(formula(model))[-1]
if (length(predictors) != 2) {
stop("The model must have exactly two predictors for this 3D plot.")
}
# Generate a grid of values for the predictors
grid <- expand.grid(
setNames(
list(
seq(min(data[[predictors[1]]]), max(data[[predictors[1]]]), length = grid_size),
seq(min(data[[predictors[2]]]), max(data[[predictors[2]]]), length = grid_size)
),
predictors
)
)
# Predict values for the grid
grid$Predicted_Salary <- predict(model, newdata = grid)
# Create the 3D plot
fig <- plot_ly()
# Add the regression plane
fig <- fig %>%
add_trace(
x = ~grid[[predictors[1]]],
y = ~grid[[predictors[2]]],
z = ~grid$Predicted_Salary,
type = "mesh3d",
opacity = 0.5,
name = "Regression Plane"
)
# Add the actual data points
fig <- fig %>%
add_trace(
x = ~data[[predictors[1]]],
y = ~data[[predictors[2]]],
z = ~data[[all.vars(formula(model))[1]]],
type = "scatter3d",
mode = "markers",
marker = list(size = 4, color = "red"),
name = "Data Points"
)
# Add labels and title
fig <- fig %>%
layout(
scene = list(
xaxis = list(title = predictors[1]),
yaxis = list(title = predictors[2]),
zaxis = list(title = all.vars(formula(model))[1])
),
title = "3D Plane with Data Points"
)
# Return the plot
return(fig)
}
df = typical %>% filter(Likes_Degree == "Yes")
df_2 = typical %>% filter(Likes_Degree != "Yes")
mean(df$Salary_Expectation) - mean(df_2$Salary_Expectation)
## [1] 3086.596
plot_3d_plane(assessment_df, model)
?approxfun