# import Data 
student_behavior <- read.csv("Downloads/Student Attitude and Behavior (Final).csv")
head(student_behavior)
##   Student_ID Gender Height_cm Weight_kg Eleventh_Grade_GPA Twelfth_Grade_GPA
## 1          1   Male       100        58           3.224490          2.723404
## 2          2 Female        90        40           2.857143          3.404255
## 3          3   Male       159        78           2.897959          2.595745
## 4          4 Female       147        20           2.857143          2.510638
## 5          5   Male       170        54           1.632653          2.765957
## 6          6 Female       139        33           3.673469          3.191489
##   College_GPA Certification_Course       Hobbies Daily_Study_Time_mins
## 1        3.20                   No   Video Games             188.01559
## 2        2.80                   No        Cinema             136.77480
## 3        2.20                  Yes        Cinema             124.78446
## 4        2.32                  Yes Reading books              11.09966
## 5        1.20                   No   Video Games              30.00000
## 6        2.80                  Yes        Cinema              64.15472
##   Preferred_Study_Time Salary_Expectation Likes_Degree Social_Media_Use_mins
## 1              Morning              40000           No                    95
## 2              Morning              15000          Yes                    65
## 3        No preference              13000          Yes                   140
## 4        No preference            1500000           No                    90
## 5              Morning              50000          Yes                   115
## 6                Night              20000          Yes                    40
##   Commute_Time_mins Stress_Level Financial_Status Part_Time_Job Average_GPA
## 1          13.84624            2                2            No    3.049298
## 2          49.56766            2                2            No    3.020466
## 3          22.67896            1                2            No    2.564568
## 4          88.31212            2                3            No    2.562594
## 5         145.19791            3                3            No    1.866204
## 6          75.79959            2                3            No    3.221653
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
glimpse(student_behavior)
## Rows: 235
## Columns: 19
## $ Student_ID            <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 1…
## $ Gender                <chr> "Male", "Female", "Male", "Female", "Male", "Fem…
## $ Height_cm             <dbl> 100, 90, 159, 147, 170, 139, 165, 152, 190, 150,…
## $ Weight_kg             <dbl> 58, 40, 78, 20, 54, 33, 50, 43, 85, 84, 50, 51, …
## $ Eleventh_Grade_GPA    <dbl> 3.224490, 2.857143, 2.897959, 2.857143, 1.632653…
## $ Twelfth_Grade_GPA     <dbl> 2.723404, 3.404255, 2.595745, 2.510638, 2.765957…
## $ College_GPA           <dbl> 3.20, 2.80, 2.20, 2.32, 1.20, 2.80, 0.12, 3.00, …
## $ Certification_Course  <chr> "No", "No", "Yes", "Yes", "No", "Yes", "Yes", "N…
## $ Hobbies               <chr> "Video Games", "Cinema", "Cinema", "Reading book…
## $ Daily_Study_Time_mins <dbl> 188.01559, 136.77480, 124.78446, 11.09966, 30.00…
## $ Preferred_Study_Time  <chr> "Morning", "Morning", "No preference", "No prefe…
## $ Salary_Expectation    <int> 40000, 15000, 13000, 1500000, 50000, 20000, 1500…
## $ Likes_Degree          <chr> "No", "Yes", "Yes", "No", "Yes", "Yes", "Yes", "…
## $ Social_Media_Use_mins <int> 95, 65, 140, 90, 115, 40, 90, 75, 110, 45, 80, 9…
## $ Commute_Time_mins     <dbl> 13.846239, 49.567665, 22.678964, 88.312123, 145.…
## $ Stress_Level          <int> 2, 2, 1, 2, 3, 2, 3, 3, 2, 3, 3, 2, 2, 3, 3, 3, …
## $ Financial_Status      <int> 2, 2, 2, 3, 3, 3, 3, 3, 2, 3, 2, 2, 3, 1, 3, 2, …
## $ Part_Time_Job         <chr> "No", "No", "No", "No", "No", "No", "No", "No", …
## $ Average_GPA           <dbl> 3.049298, 3.020466, 2.564568, 2.562594, 1.866204…
colnames(student_behavior)
##  [1] "Student_ID"            "Gender"                "Height_cm"            
##  [4] "Weight_kg"             "Eleventh_Grade_GPA"    "Twelfth_Grade_GPA"    
##  [7] "College_GPA"           "Certification_Course"  "Hobbies"              
## [10] "Daily_Study_Time_mins" "Preferred_Study_Time"  "Salary_Expectation"   
## [13] "Likes_Degree"          "Social_Media_Use_mins" "Commute_Time_mins"    
## [16] "Stress_Level"          "Financial_Status"      "Part_Time_Job"        
## [19] "Average_GPA"
essentials = student_behavior %>% 
  select(Twelfth_Grade_GPA:Financial_Status,  Certification_Course:Commute_Time_mins, Eleventh_Grade_GPA)

head(essentials)
##   Twelfth_Grade_GPA College_GPA Certification_Course       Hobbies
## 1          2.723404        3.20                   No   Video Games
## 2          3.404255        2.80                   No        Cinema
## 3          2.595745        2.20                  Yes        Cinema
## 4          2.510638        2.32                  Yes Reading books
## 5          2.765957        1.20                   No   Video Games
## 6          3.191489        2.80                  Yes        Cinema
##   Daily_Study_Time_mins Preferred_Study_Time Salary_Expectation Likes_Degree
## 1             188.01559              Morning              40000           No
## 2             136.77480              Morning              15000          Yes
## 3             124.78446        No preference              13000          Yes
## 4              11.09966        No preference            1500000           No
## 5              30.00000              Morning              50000          Yes
## 6              64.15472                Night              20000          Yes
##   Social_Media_Use_mins Commute_Time_mins Stress_Level Financial_Status
## 1                    95          13.84624            2                2
## 2                    65          49.56766            2                2
## 3                   140          22.67896            1                2
## 4                    90          88.31212            2                3
## 5                   115         145.19791            3                3
## 6                    40          75.79959            2                3
##   Eleventh_Grade_GPA
## 1           3.224490
## 2           2.857143
## 3           2.897959
## 4           2.857143
## 5           1.632653
## 6           3.673469
assessment_df = essentials %>% select(Eleventh_Grade_GPA, Twelfth_Grade_GPA, College_GPA, Salary_Expectation, Financial_Status, Social_Media_Use_mins, Daily_Study_Time_mins, Commute_Time_mins, Likes_Degree)
head(assessment_df)
##   Eleventh_Grade_GPA Twelfth_Grade_GPA College_GPA Salary_Expectation
## 1           3.224490          2.723404        3.20              40000
## 2           2.857143          3.404255        2.80              15000
## 3           2.897959          2.595745        2.20              13000
## 4           2.857143          2.510638        2.32            1500000
## 5           1.632653          2.765957        1.20              50000
## 6           3.673469          3.191489        2.80              20000
##   Financial_Status Social_Media_Use_mins Daily_Study_Time_mins
## 1                2                    95             188.01559
## 2                2                    65             136.77480
## 3                2                   140             124.78446
## 4                3                    90              11.09966
## 5                3                   115              30.00000
## 6                3                    40              64.15472
##   Commute_Time_mins Likes_Degree
## 1          13.84624           No
## 2          49.56766          Yes
## 3          22.67896          Yes
## 4          88.31212           No
## 5         145.19791          Yes
## 6          75.79959          Yes
library(patchwork)
library(ggplot2)
# Ensure `df` is a data frame
density_plot <- assessment_df %>% ggplot( aes(x = Salary_Expectation)) +
  geom_density(fill = "lightblue")

# Render the plot
density_plot

# Values are dramatically different : 
v = c(mean(assessment_df$Salary_Expectation), median(assessment_df$Salary_Expectation))
names(v) = c("mean", "median")
v = c(v,  v["mean"] - v["median"])
names(v) = c("mean", "median", "difference")
v
##       mean     median difference 
##   32481.68   20000.00   12481.68
# Load necessary library
library(ggplot2)

# Create a box plot
assessment_df %>% ggplot(aes(y = Salary_Expectation)) +
  geom_boxplot(fill = "lightblue") +
  labs(title = "Box Plot of Salary Expectation", y = "Salary Expectation") +
  theme_minimal()

Take out Outliers :

quartiles <- quantile(assessment_df$Salary_Expectation, probs = c(0.25, 0.5, 0.75), na.rm = TRUE)
quartiles
##   25%   50%   75% 
## 15000 20000 25000
above_threshold = quartiles["75%"] + IQR(assessment_df$Salary_Expectation)*1.5

# You are an outlier if you make above : $40,000s!
above_threshold
##   75% 
## 40000
above_outlier =  assessment_df %>% filter(Salary_Expectation >= above_threshold); head(above_outlier)
##   Eleventh_Grade_GPA Twelfth_Grade_GPA College_GPA Salary_Expectation
## 1           3.224490          2.723404        3.20              40000
## 2           2.857143          2.510638        2.32            1500000
## 3           1.632653          2.765957        1.20              50000
## 4           3.653061          2.965957        2.80              60000
## 5           3.224490          2.765957        3.00              40000
## 6           2.204082          2.170213        2.64              50000
##   Financial_Status Social_Media_Use_mins Daily_Study_Time_mins
## 1                2                    95             188.01559
## 2                3                    90              11.09966
## 3                3                   115              30.00000
## 4                3                     0             182.05841
## 5                2                   175             152.04155
## 6                3                     0             135.63961
##   Commute_Time_mins Likes_Degree
## 1          13.84624           No
## 2          88.31212           No
## 3         145.19791          Yes
## 4          10.40952          Yes
## 5          36.56810          Yes
## 6          63.03645          Yes
typical = assessment_df %>% filter(Salary_Expectation < above_threshold); head(typical)
##   Eleventh_Grade_GPA Twelfth_Grade_GPA College_GPA Salary_Expectation
## 1           2.857143          3.404255        2.80              15000
## 2           2.897959          2.595745        2.20              13000
## 3           3.673469          3.191489        2.80              20000
## 4           2.857143          2.680851        0.12              15000
## 5           2.514286          2.624255        3.00              25000
## 6           3.600000          2.872340        2.40              20000
##   Financial_Status Social_Media_Use_mins Daily_Study_Time_mins
## 1                2                    65             136.77480
## 2                2                   140             124.78446
## 3                3                    40              64.15472
## 4                3                    90              25.89300
## 5                3                    75             172.69474
## 6                2                   110             127.67657
##   Commute_Time_mins Likes_Degree
## 1          49.56766          Yes
## 2          22.67896          Yes
## 3          75.79959          Yes
## 4         115.63578          Yes
## 5          22.13955          Yes
## 6          42.43369          Yes
# % contribution to observations : nrow()
cat(round(nrow(above_outlier)/nrow(assessment_df), 2),
    round(1-nrow(above_outlier)/nrow(assessment_df), 2))
## 0.1 0.9
# Create a box plot
typical %>% ggplot(aes(y = Salary_Expectation)) +
  geom_boxplot(fill = "lightblue") +
  labs(title = "Box Plot of Salary Expectation", y = "Salary Expectation") +
  theme_minimal()

# Compute density : typical person 
density_data <- density(typical$Salary_Expectation)

# Function to identify local maxima
is_local_max <- function(y) {
  diff_sign <- diff(sign(diff(y)))
  c(FALSE, diff_sign == -2, FALSE)
}

# Apply function to density values
local_max_indices <- is_local_max(density_data$y)


# X and Y coordinates of local maxima
local_max_x <- density_data$x[local_max_indices]
local_max_y <- density_data$y[local_max_indices]

# Data frame of local maxima
peaks <- data.frame(x = local_max_x, y = local_max_y)
library(ggplot2)

# Compute density
density_data <- density(typical$Salary_Expectation)

# Function to identify local maxima
is_local_max <- function(y) {
  diff_sign <- diff(sign(diff(y)))
  c(FALSE, diff_sign == -2, FALSE)
}

# Apply function to density values
local_max_indices <- is_local_max(density_data$y)

# X and Y coordinates of local maxima
local_max_x <- density_data$x[local_max_indices]
local_max_y <- density_data$y[local_max_indices]

# Data frame of local maxima
peaks <- data.frame(x = local_max_x, y = local_max_y)

# Create density plot and annotate peaks
ggplot(data = typical, aes(x = Salary_Expectation)) +
  geom_density(fill = "blue", alpha = 0.4) +  # Density plot
  geom_point(data = peaks, aes(x = x, y = y), color = "red", size = 3) +  # Add points at peaks
  geom_text(data = peaks, aes(x = x, y = y, label = paste0("(", round(x, 1), ", ", round(y, 3), ")")), 
            vjust = -1, hjust = 0.5, color = "black", size = 4) +  # Add labels at peaks
  labs(title = "Salary Expectations Density Plot", 
       x = "Salary Expectations", 
       y = "Density") +
  theme_minimal()  # Clean theme

There must clearly be subgroups, it appears that therea are 2 major subgroups and 6 others. Although Values appear to be low.

# Calculate density
density_data <- density(assessment_df$Salary_Expectation)

# Define a function for the density
density_function <- approxfun(density_data$x, density_data$y)

# Integrate the density function from 15,000 to 20,000
result_1 <- integrate(density_function, lower = 10000, upper = 25005)

# Print the result
result_1
## 0.6658827 with absolute error < 1e-04
result_2 <- integrate(density_function, lower = 10000, upper = 30010)
result_2
## 0.7463738 with absolute error < 0.00012
summary(assessment_df)
##  Eleventh_Grade_GPA Twelfth_Grade_GPA  College_GPA    Salary_Expectation
##  Min.   :0.302      Min.   :1.915     Min.   :0.040   Min.   :      0   
##  1st Qu.:2.857      1st Qu.:2.553     1st Qu.:2.400   1st Qu.:  15000   
##  Median :3.265      Median :2.936     Median :2.800   Median :  20000   
##  Mean   :3.137      Mean   :2.927     Mean   :2.826   Mean   :  32482   
##  3rd Qu.:3.520      3rd Qu.:3.234     3rd Qu.:3.200   3rd Qu.:  25000   
##  Max.   :4.000      Max.   :4.000     Max.   :4.000   Max.   :1500000   
##  Financial_Status Social_Media_Use_mins Daily_Study_Time_mins Commute_Time_mins
##  Min.   :1.000    Min.   :  0.00        Min.   :  0.00        Min.   :  0.00   
##  1st Qu.:2.000    1st Qu.: 35.00        1st Qu.: 91.49        1st Qu.: 20.57   
##  Median :3.000    Median : 60.00        Median :132.58        Median : 40.17   
##  Mean   :2.523    Mean   : 66.04        Mean   :123.44        Mean   : 46.06   
##  3rd Qu.:3.000    3rd Qu.: 90.00        3rd Qu.:163.94        3rd Qu.: 65.44   
##  Max.   :4.000    Max.   :180.00        Max.   :299.12        Max.   :177.92   
##  Likes_Degree      
##  Length:235        
##  Class :character  
##  Mode  :character  
##                    
##                    
## 
model = lm(Salary_Expectation ~ Twelfth_Grade_GPA + College_GPA, data = assessment_df)

model
## 
## Call:
## lm(formula = Salary_Expectation ~ Twelfth_Grade_GPA + College_GPA, 
##     data = assessment_df)
## 
## Coefficients:
##       (Intercept)  Twelfth_Grade_GPA        College_GPA  
##            108644             -12124             -14392
# Load ggplot2 for visualization
library(ggplot2)

# Create the histogram
ggplot(typical, aes(x = Salary_Expectation, fill = as.factor(Financial_Status))) +
  geom_histogram(binwidth = 5000, position = "dodge", alpha = 0.7) +
  scale_fill_brewer(palette = "Set2", name = "Financial Status") +
  labs(
    title = "Distribution of Salary Expectation by Financial Status",
    x = "Salary Expectation",
    y = "Count"
  ) +
  theme_minimal()

# Calculate percentages for each financial status in a single operation
percentages <- typical %>%
  group_by(Financial_Status) %>%
  summarise(Percentage = round(n() / nrow(typical) * 100, 2)) %>%
  pull(Percentage)

# Print the result
status_SampleSize <- c(percentages)
names(status_SampleSize) = c("Lvl 1", "Lvl 2", "Lvl 3", "Lvl 4")
status_SampleSize
## Lvl 1 Lvl 2 Lvl 3 Lvl 4 
##  5.21 36.97 55.92  1.90
status_SampleSize[2] + status_SampleSize[3]
## Lvl 2 
## 92.89
no_outliers = assessment_df %>% filter(Salary_Expectation < 99000)
max(no_outliers$Salary_Expectation)
## [1] 60000
snipped_model = lm(Salary_Expectation ~ Twelfth_Grade_GPA + College_GPA, data = no_outliers)

snipped_model
## 
## Call:
## lm(formula = Salary_Expectation ~ Twelfth_Grade_GPA + College_GPA, 
##     data = no_outliers)
## 
## Coefficients:
##       (Intercept)  Twelfth_Grade_GPA        College_GPA  
##             23881              -2953               1610
# Load required libraries
library(plotly)

# Create a grid of predictor values
twelfth_gpa <- seq(min(no_outliers$Twelfth_Grade_GPA), max(no_outliers$Twelfth_Grade_GPA), length = 200)
college_gpa <- seq(min(no_outliers$College_GPA), max(no_outliers$College_GPA), length = 200)

# Expand grid to create combinations of Twelfth and College GPA
grid <- expand.grid(Twelfth_Grade_GPA = twelfth_gpa, College_GPA = college_gpa)

# Predict Salary_Expectation based on the grid
grid$Predicted_Salary <- predict(snipped_model, newdata = grid)

# Create the 3D plot with the regression plane
fig <- plot_ly()

# Add the regression plane
fig <- fig %>%
  add_trace(
    x = ~grid$Twelfth_Grade_GPA,
    y = ~grid$College_GPA,
    z = ~grid$Predicted_Salary,
    type = "mesh3d",
    opacity = 0.5,
    name = "Regression Plane"
  )

# Add the actual data points
fig <- fig %>%
  add_trace(
    x = ~no_outliers$Twelfth_Grade_GPA,
    y = ~no_outliers$College_GPA,
    z = ~no_outliers$Salary_Expectation,
    type = "scatter3d",
    mode = "markers",
    marker = list(size = 4, color = "red"),
    name = "Data Points"
  )

# Add labels and title
fig <- fig %>%
  layout(
    scene = list(
      xaxis = list(title = 'Twelfth Grade GPA'),
      yaxis = list(title = 'College GPA'),
      zaxis = list(title = 'Salary Expectation')
    ),
    title = "3D Plane with Data Points"
  )

# Show the plot
fig
model
## 
## Call:
## lm(formula = Salary_Expectation ~ Twelfth_Grade_GPA + College_GPA, 
##     data = assessment_df)
## 
## Coefficients:
##       (Intercept)  Twelfth_Grade_GPA        College_GPA  
##            108644             -12124             -14392
snipped_model
## 
## Call:
## lm(formula = Salary_Expectation ~ Twelfth_Grade_GPA + College_GPA, 
##     data = no_outliers)
## 
## Coefficients:
##       (Intercept)  Twelfth_Grade_GPA        College_GPA  
##             23881              -2953               1610
plot_3d_plane <- function(data, model, grid_size = 200) {
  # Load required library
  library(plotly)
  
  # Extract predictors from the model formula
  predictors <- all.vars(formula(model))[-1]
  
  if (length(predictors) != 2) {
    stop("The model must have exactly two predictors for this 3D plot.")
  }
  
  # Generate a grid of values for the predictors
  grid <- expand.grid(
    setNames(
      list(
        seq(min(data[[predictors[1]]]), max(data[[predictors[1]]]), length = grid_size),
        seq(min(data[[predictors[2]]]), max(data[[predictors[2]]]), length = grid_size)
      ),
      predictors
    )
  )
  
  # Predict values for the grid
  grid$Predicted_Salary <- predict(model, newdata = grid)
  
  # Create the 3D plot
  fig <- plot_ly()
  
  # Add the regression plane
  fig <- fig %>%
    add_trace(
      x = ~grid[[predictors[1]]],
      y = ~grid[[predictors[2]]],
      z = ~grid$Predicted_Salary,
      type = "mesh3d",
      opacity = 0.5,
      name = "Regression Plane"
    )
  
  # Add the actual data points
  fig <- fig %>%
    add_trace(
      x = ~data[[predictors[1]]],
      y = ~data[[predictors[2]]],
      z = ~data[[all.vars(formula(model))[1]]],
      type = "scatter3d",
      mode = "markers",
      marker = list(size = 4, color = "red"),
      name = "Data Points"
    )
  
  # Add labels and title
  fig <- fig %>%
    layout(
      scene = list(
        xaxis = list(title = predictors[1]),
        yaxis = list(title = predictors[2]),
        zaxis = list(title = all.vars(formula(model))[1])
      ),
      title = "3D Plane with Data Points"
    )
  
  # Return the plot
  return(fig)
}
df = typical %>% filter(Likes_Degree == "Yes") 
df_2 = typical %>% filter(Likes_Degree != "Yes") 
mean(df$Salary_Expectation) - mean(df_2$Salary_Expectation)
## [1] 3086.596
plot_3d_plane(assessment_df, model)
?approxfun