── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr 1.1.2 ✔ readr 2.1.4
✔ forcats 1.0.0 ✔ stringr 1.5.0
✔ ggplot2 3.4.4 ✔ tibble 3.2.1
✔ lubridate 1.9.2 ✔ tidyr 1.3.0
✔ purrr 1.0.1
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag() masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
── Attaching packages ────────────────────────────────────── tidymodels 1.1.1 ──
✔ broom 1.0.5 ✔ rsample 1.2.0
✔ dials 1.2.0 ✔ tune 1.1.2
✔ infer 1.0.5 ✔ workflows 1.1.3
✔ modeldata 1.2.0 ✔ workflowsets 1.0.1
✔ parsnip 1.1.1 ✔ yardstick 1.2.0
✔ recipes 1.0.8
── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
✖ scales::discard() masks purrr::discard()
✖ dplyr::filter() masks stats::filter()
✖ recipes::fixed() masks stringr::fixed()
✖ dplyr::lag() masks stats::lag()
✖ yardstick::spec() masks readr::spec()
✖ recipes::step() masks stats::step()
• Learn how to get started at https://www.tidymodels.org/start/
Loading required package: lattice
Attaching package: 'caret'
The following objects are masked from 'package:yardstick':
precision, recall, sensitivity, specificity
The following object is masked from 'package:purrr':
lift
Attaching package: 'plotly'
The following object is masked from 'package:ggplot2':
last_plot
The following object is masked from 'package:stats':
filter
The following object is masked from 'package:graphics':
layout
corrplot 0.92 loaded
Importing Dataset
# Set the working directory to the location where the CSV file is storedsetwd("C:\\MSBA\\MGT_665 - Solving Problems with Machine Learning\\Final Project")# Read the CSV file into Rplacement_data <-read.csv("Placement_Data_Full_Class.csv", header =TRUE)# View the first few rows of the data to ensure it has been imported correctlyhead(placement_data)
sl_no gender ssc_p ssc_b hsc_p hsc_b hsc_s degree_p degree_t workex
1 1 M 67.00 Others 91.00 Others Commerce 58.00 Sci&Tech No
2 2 M 79.33 Central 78.33 Others Science 77.48 Sci&Tech Yes
3 3 M 65.00 Central 68.00 Central Arts 64.00 Comm&Mgmt No
4 4 M 56.00 Central 52.00 Central Science 52.00 Sci&Tech No
5 5 M 85.80 Central 73.60 Central Commerce 73.30 Comm&Mgmt No
6 6 M 55.00 Others 49.80 Others Science 67.25 Sci&Tech Yes
etest_p specialisation mba_p status salary
1 55.0 Mkt&HR 58.80 Placed 270000
2 86.5 Mkt&Fin 66.28 Placed 200000
3 75.0 Mkt&Fin 57.80 Placed 250000
4 66.0 Mkt&HR 59.43 Not Placed NA
5 96.8 Mkt&Fin 55.50 Placed 425000
6 55.0 Mkt&Fin 51.58 Not Placed NA
Exploratory Data Analysis (EDA) & Data Pre-processing
# Check for missing values in each columnmissing_values <-colSums(is.na(placement_data))# Display the sum of missing values in each columnprint(missing_values)
We have 67 NAs in the missing salary column. This is because 67 students didn’t get placed. This is normal and hence no further investigation is required
# Summary statisticssummary(placement_data)
sl_no gender ssc_p ssc_b
Min. : 1.0 Length:215 Min. :40.89 Length:215
1st Qu.: 54.5 Class :character 1st Qu.:60.60 Class :character
Median :108.0 Mode :character Median :67.00 Mode :character
Mean :108.0 Mean :67.30
3rd Qu.:161.5 3rd Qu.:75.70
Max. :215.0 Max. :89.40
hsc_p hsc_b hsc_s degree_p
Min. :37.00 Length:215 Length:215 Min. :50.00
1st Qu.:60.90 Class :character Class :character 1st Qu.:61.00
Median :65.00 Mode :character Mode :character Median :66.00
Mean :66.33 Mean :66.37
3rd Qu.:73.00 3rd Qu.:72.00
Max. :97.70 Max. :91.00
degree_t workex etest_p specialisation
Length:215 Length:215 Min. :50.0 Length:215
Class :character Class :character 1st Qu.:60.0 Class :character
Mode :character Mode :character Median :71.0 Mode :character
Mean :72.1
3rd Qu.:83.5
Max. :98.0
mba_p status salary
Min. :51.21 Length:215 Min. :200000
1st Qu.:57.95 Class :character 1st Qu.:240000
Median :62.00 Mode :character Median :265000
Mean :62.28 Mean :288655
3rd Qu.:66.25 3rd Qu.:300000
Max. :77.89 Max. :940000
NA's :67
table(placement_data$status)
Not Placed Placed
67 148
table(placement_data$gender)
F M
76 139
table(placement_data$ssc_b)
Central Others
116 99
table(placement_data$hsc_s)
Arts Commerce Science
11 113 91
table(placement_data$degree_t)
Comm&Mgmt Others Sci&Tech
145 11 59
table(placement_data$specialisation)
Mkt&Fin Mkt&HR
120 95
ggplot(placement_data, aes(x = ssc_p,)) +geom_histogram(fill ="lightblue", color ="black") +labs(title ="Distribution of Secondary Education Percentage", x ="Secondary Education %age", y ="Count")
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
ggplot(placement_data, aes(x = hsc_p,)) +geom_histogram(fill ="lightblue", color ="black") +labs(title ="Distribution of Higher Secondary Education Percentage", x ="Higher Secondary Education %age", y ="Count")
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
ggplot(placement_data, aes(x = degree_p,)) +geom_histogram(fill ="lightblue", color ="black") +labs(title ="Distribution of Undergrad Percentage", x ="Undergrad %age", y ="Count")
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
ggplot(placement_data, aes(x = etest_p,)) +geom_histogram(fill ="lightblue", color ="black") +labs(title ="Distribution of Employability Test Percentage", x ="Emp Test %age", y ="Count")
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
ggplot(placement_data, aes(x = mba_p,)) +geom_histogram(fill ="lightblue", color ="black") +labs(title ="Distribution of MBA Percentage", x ="MBA %age", y ="Count")
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
ggplot(placement_data, aes(x = status, fill = status)) +geom_bar(fill ="lightblue", color ="black") +labs(title ="Distribution of Placement Status", x ="Placement Status", y ="Count") +geom_text(stat ='count', aes(label =after_stat(count)), position =position_stack(vjust =0.5))
ggplot(placement_data, aes(x = gender, fill = gender)) +geom_bar(fill ="lightblue", color ="black") +labs(title ="Distribution of Gender", x ="Gender", y ="Count") +geom_text(stat ='count', aes(label =after_stat(count)), position =position_stack(vjust =0.5))
ggplot(placement_data, aes(x = ssc_b, fill = ssc_b)) +geom_bar(fill ="lightblue", color ="black") +labs(title ="Distribution of Board of Education", x ="Board of Education", y ="Count") +geom_text(stat ='count', aes(label =after_stat(count)), position =position_stack(vjust =0.5))
ggplot(placement_data, aes(x = hsc_s, fill = hsc_s)) +geom_bar(fill ="lightblue", color ="black") +labs(title ="Distribution of Specialization in Higher Secondary Education", x ="HSE Specialization", y ="Count") +geom_text(stat ='count', aes(label =after_stat(count)), position =position_stack(vjust =0.5))
ggplot(placement_data, aes(x = degree_t, fill = degree_t)) +geom_bar(fill ="lightblue", color ="black") +labs(title ="Distribution of Specialization in Undergraduate Course", x ="UG Specialization", y ="Count") +geom_text(stat ='count', aes(label =after_stat(count)), position =position_stack(vjust =0.5))
ggplot(placement_data, aes(x = specialisation, fill = specialisation)) +geom_bar(fill ="lightblue", color ="black") +labs(title ="Distribution of Specialization in Graduate Course (MBA)", x ="MBA Specialization", y ="Count") +geom_text(stat ='count', aes(label =after_stat(count)), position =position_stack(vjust =0.5))
Correlation between numerical variables
# Select only the numerical columns for the correlation matrixnumerical_data <- placement_data %>%select(-sl_no) %>%select_if(is.numeric)# Calculate the correlation matrixcorrelation_matrix <-cor(numerical_data, use ="pairwise.complete.obs")# Visualize the correlation matrix as a heatmap with values and color codingcorrplot(correlation_matrix, method ="color", type ="upper", order ="hclust", tl.cex =0.8, tl.col ="black", col =colorRampPalette(c("skyblue", "red"))(100),addCoef.col ="black", number.cex =0.7)
Mutate variables - Converting categorical data into factors
Splitting the data into training (80%) and testing (20%)
set.seed(221023)#clean model dataplacement_data_model <- placement_data %>%select(-salary)%>%mutate(status =as.factor(make.names(status)))#split into two datasetssplit <-createDataPartition(placement_data_model$status,p =0.8, list =FALSE)train_data <- placement_data_model[split,]test_data <- placement_data_model[-split,]
#store the ID variable in its original formaty_test <- test_data$sl_noy_train <- train_data$sl_no#removing sl_no and salary from the data to be processedtest_data <- test_data %>%select (-sl_no)train_data <- train_data %>%select (-sl_no)#center and scale our datapreProcess_range_model <-preProcess(train_data, method=c("center", "scale"))train_data <-predict(preProcess_range_model, newdata = train_data)test_data <-predict(preProcess_range_model, newdata = test_data)
Applied 5-fold Cross-validation 10 times
train.control <-trainControl(method ="repeatedcv", number =10,repeats =5,classProbs = T)
Modeling
Training the model on 4 algorithms - Logistic Regression, Decision Tree, K-nearest neighbor and SVM linear model
# Logistic Regression modellogistic_model <-train(status ~ ., data = train_data, method ="glm", trControl = train.control)# Decision Tree classifier modeldecision_tree_model <-train(status ~ ., data = train_data, method ="rpart", trControl = train.control)# KNN modelknn_model <-train(status ~ ., data = train_data, method ="knn", trControl = train.control)# SVM linear modelsvm_linear_model <-train(status ~ ., data = train_data, method ="svmLinear", trControl = train.control)
Predicting on the basis 4 trained models
# Predict using the test datasetlogistic_predictions <-predict(logistic_model, newdata = test_data)decision_tree_predictions <-predict(decision_tree_model, newdata = test_data)knn_predictions <-predict(knn_model, newdata = test_data)svm_linear_predictions <-predict(svm_linear_model, newdata = test_data)
Evaluating models
# Evaluation of Logistic Regression modellogistic_predictions <-predict(logistic_model, newdata = test_data)logistic_conf_matrix <-confusionMatrix(logistic_predictions, test_data$status)logistic_accuracy <- logistic_conf_matrix$overall['Accuracy']logistic_precision <- logistic_conf_matrix$byClass['Pos Pred Value']logistic_recall <- logistic_conf_matrix$byClass['Sensitivity']logistic_f1 <- logistic_conf_matrix$byClass['F1']# Evaluation of Decision Tree classifier modeldecision_tree_predictions <-predict(decision_tree_model, newdata = test_data)decision_tree_conf_matrix <-confusionMatrix(decision_tree_predictions, test_data$status)decision_tree_accuracy <- decision_tree_conf_matrix$overall['Accuracy']decision_tree_precision <- decision_tree_conf_matrix$byClass['Pos Pred Value']decision_tree_recall <- decision_tree_conf_matrix$byClass['Sensitivity']decision_tree_f1 <- decision_tree_conf_matrix$byClass['F1']# Evaluation of KNN modelknn_predictions <-predict(knn_model, newdata = test_data)knn_conf_matrix <-confusionMatrix(knn_predictions, test_data$status)knn_accuracy <- knn_conf_matrix$overall['Accuracy']knn_precision <- knn_conf_matrix$byClass['Pos Pred Value']knn_recall <- knn_conf_matrix$byClass['Sensitivity']knn_f1 <- knn_conf_matrix$byClass['F1']# Evaluation of SVM linear modelsvm_linear_predictions <-predict(svm_linear_model, newdata = test_data)svm_linear_conf_matrix <-confusionMatrix(svm_linear_predictions, test_data$status)svm_linear_accuracy <- svm_linear_conf_matrix$overall['Accuracy']svm_linear_precision <- svm_linear_conf_matrix$byClass['Pos Pred Value']svm_linear_recall <- svm_linear_conf_matrix$byClass['Sensitivity']svm_linear_f1 <- svm_linear_conf_matrix$byClass['F1']# Displaying the resultsprint("Logistic Regression Model:")