Divorce Prediction Analysis in India

Author

Aarya Kshetri

First step, we load essential libraries for analysis and modeling and load the dataset. We can use some basic functions to investigate the structure and values of the dataset.

# Load libraries
library(tidyverse)
library(plotly)
library(corrplot)
library(caret)
library(rpart)
library(rpart.plot)
library(pROC)
library(randomForest)
library(e1071)
library(nnet)
library(UBL)

# Load the dataset
data <- read_csv("marriage_divorce_india_with_id.csv")

# Check the dataset
glimpse(data)

Rows: 1,200
Columns: 11
$ `Unique ID`                    <chr> "MD1", "MD2", "MD3", "MD4", "MD5", "MD6…
$ `Marriage Duration (Years)`    <dbl> 39, 29, 15, 8, 21, 39, 19, 23, 11, 11, …
$ `Age at Marriage`              <dbl> 29, 34, 34, 27, 34, 32, 29, 33, 25, 34,…
$ `Marriage Type`                <chr> "Love", "Arranged", "Love", "Arranged",…
$ `Education Level`              <chr> "Secondary", "No Education", "No Educat…
$ `Income Level (INR per month)` <dbl> 113464, 18682, 159455, 63160, 28666, 63…
$ `Caste/Religion`               <chr> "Hindu", "Jain", "Muslim", "Jain", "Jai…
$ `Urban/Rural`                  <chr> "Rural", "Rural", "Urban", "Urban", "Ur…
$ `Family Involvement`           <chr> "Moderate", "Moderate", "Moderate", "Hi…
$ Children                       <dbl> 2, 0, 4, 1, 1, 1, 0, 3, 2, 0, 1, 0, 0, …
$ `Divorce Status`               <chr> "No", "Yes", "Yes", "Yes", "Yes", "No",…

colSums(is.na(data))

                   Unique ID    Marriage Duration (Years) 
                           0                            0 
             Age at Marriage                Marriage Type 
                           0                            0 
             Education Level Income Level (INR per month) 
                           0                            0 
              Caste/Religion                  Urban/Rural 
                           0                            0 
          Family Involvement                     Children 
                           0                            0 
              Divorce Status 
                           0

summary(data)

  Unique ID         Marriage Duration (Years) Age at Marriage
 Length:1200        Min.   : 1.00             Min.   :18.00  
 Class :character   1st Qu.:10.00             1st Qu.:22.00  
 Mode  :character   Median :22.00             Median :26.00  
                    Mean   :20.55             Mean   :26.05  
                    3rd Qu.:30.00             3rd Qu.:30.00  
                    Max.   :39.00             Max.   :34.00  
 Marriage Type      Education Level    Income Level (INR per month)
 Length:1200        Length:1200        Min.   :  5287              
 Class :character   Class :character   1st Qu.: 54522              
 Mode  :character   Mode  :character   Median :101889              
                                       Mean   :102353              
                                       3rd Qu.:150569              
                                       Max.   :199999              
 Caste/Religion     Urban/Rural        Family Involvement    Children    
 Length:1200        Length:1200        Length:1200        Min.   :0.000  
 Class :character   Class :character   Class :character   1st Qu.:1.000  
 Mode  :character   Mode  :character   Mode  :character   Median :2.000  
                                                          Mean   :1.886  
                                                          3rd Qu.:3.000  
                                                          Max.   :4.000  
 Divorce Status    
 Length:1200       
 Class :character  
 Mode  :character

# Check values in categorical variables
unique(data$`Education Level`)

[1] "Secondary"    "No Education" "Postgraduate" "Graduate"     "Primary"

unique(data$`Family Involvement`)

[1] "Moderate" "High"     "Low"

unique(data$`Caste/Religion`)

[1] "Hindu"     "Jain"      "Muslim"    "Christian" "Other"     "Sikh"

unique(data$`Urban/Rural`)

[1] "Rural" "Urban"

Data Preprocessing

Next, we introduce new variables such as Income Bracket and Age Group at Marraige, as part of feature engineering based on the existing variables. We also change the categorical variables to factor class.

df <- data %>%
  mutate(
    `Marriage Type` = factor(`Marriage Type`),
    `Education Level` = factor(`Education Level`, ordered = TRUE, 
                               levels = c("No Education", "Primary", "Secondary", "Graduate", "Postgraduate")),
    `Caste/Religion` = factor(`Caste/Religion`),
    `Urban/Rural` = factor(`Urban/Rural`),
    `Family Involvement` = factor(`Family Involvement`, ordered = TRUE, levels = c("Low", "Moderate", "High")),
    `Divorce Status` = factor(`Divorce Status`),
    `Income Bracket` = case_when(
      `Income Level (INR per month)` < 30000 ~ "Low (<30k)",
      `Income Level (INR per month)` >= 30000 & `Income Level (INR per month)` <= 100000 ~ "Middle (30k-100k)",
      `Income Level (INR per month)` > 100000 ~ "High (>100k)"
    ),
    `Age Group at Marriage` = cut(
      `Age at Marriage`,
      breaks = c(15, 25, 30, 40, Inf),
      labels = c("15-24", "25-29", "30-39", "40+"),
      include.lowest = TRUE
    )
  )

colSums(is.na(df))

                   Unique ID    Marriage Duration (Years) 
                           0                            0 
             Age at Marriage                Marriage Type 
                           0                            0 
             Education Level Income Level (INR per month) 
                           0                            0 
              Caste/Religion                  Urban/Rural 
                           0                            0 
          Family Involvement                     Children 
                           0                            0 
              Divorce Status               Income Bracket 
                           0                            0 
       Age Group at Marriage 
                           0

Visualization of Divorce Rates

Multiple bar charts to show the divorce rates by different dimensions to get a better perspective of divorce prevalence in the Indian society.

Overall Divorce Rate

divorce_counts <- df %>%
  count(`Divorce Status`) %>%
  mutate(percentage = n / sum(n) * 100)

plot_ly(
  data = divorce_counts,
  x = ~`Divorce Status`,
  y = ~n,
  type = 'bar',
  text = ~paste0(round(percentage, 1), "%"),
  textposition = 'outside',
  marker = list(color = c("#FF9999", "#66B2FF"))
) %>%
  layout(title = "Overall Divorce Rate in Indian Marriages",
         xaxis = list(title = "Divorce Status"),
         yaxis = list(title = "Count"))

Divorce by Marriage Type

divorce_by_type <- df %>%
  group_by(`Marriage Type`) %>%
  summarise(
    total_marriages = n(),
    total_divorces = sum(`Divorce Status` == "Yes"),
    divorce_rate = (total_divorces / total_marriages) * 100
  )

plot_ly(
  data = divorce_by_type,
  x = ~`Marriage Type`,
  y = ~divorce_rate,
  type = 'bar',
  text = ~paste0(round(divorce_rate, 1), "%"),
  textposition = 'outside',
  marker = list(color = c("#FF9999", "#66B2FF"))
) %>%
  layout(title = "Divorce Rate by Marriage Type",
         xaxis = list(title = "Marriage Type"),
         yaxis = list(title = "Divorce Rate (%)"))

Divorce by Family Involvement

divorce_by_involvement <- data %>%
  group_by(`Family Involvement`) %>%
  summarise(
    total_marriages = n(),
    total_divorces = sum(`Divorce Status` == "Yes"),
    divorce_rate = (total_divorces / total_marriages) * 100
  ) %>%
  mutate(`Family Involvement` = factor(`Family Involvement`, 
                                       levels = c("Low", "Moderate", "High"))) %>%
  arrange(`Family Involvement`)

fig <- plot_ly(
  data = divorce_by_involvement,
  x = ~`Family Involvement`,
  y = ~divorce_rate,
  type = 'bar',
  text = ~paste0(round(divorce_rate, 1), "%"),
  textposition = 'outside',
  marker = list(color = c("#FF9999", "#66B2FF", "#8899FF"))
) %>%
  layout(
    title = "Divorce Rate by Family Involvement",
    xaxis = list(title = "Family Involvement"),
    yaxis = list(title = "Divorce Rate (%)")
  )

fig

Divorce by Religion

divorce_by_religion <- data %>%
  group_by(`Caste/Religion`) %>%
  summarise(
    total_marriages = n(),
    total_divorces = sum(`Divorce Status` == "Yes"),
    divorce_rate = (total_divorces / total_marriages) * 100
  ) %>%
  arrange(desc(divorce_rate)) 

fig <- plot_ly(
  data = divorce_by_religion,
  x = ~`Caste/Religion`,
  y = ~divorce_rate,
  type = 'bar',
  text = ~paste0(round(divorce_rate, 1), "%"),
  textposition = 'outside',
  marker = list(color = 'rgba(54, 162, 235, 0.7)')
) %>%
  layout(
    title = "Divorce Rate by Religion",
    xaxis = list(title = "Religion", tickangle = -45),  # Tilt labels for readability
    yaxis = list(title = "Divorce Rate (%)")
  )

fig

Divorce by Education Level

divorce_by_education <- data %>%
  group_by(`Education Level`) %>%
  summarise(
    total_marriages = n(),
    total_divorces = sum(`Divorce Status` == "Yes"),
    divorce_rate = (total_divorces / total_marriages) * 100
  ) %>%
  mutate(`Education Level` = factor(`Education Level`, ordered = TRUE, 
                                    levels = c("No Education", "Primary", "Secondary", "Graduate", "Postgraduate"))) %>%
  arrange(`Education Level`)


fig <- plot_ly(
  data = divorce_by_education,
  x = ~`Education Level`,
  y = ~divorce_rate,
  type = 'bar',
  text = ~paste0(round(divorce_rate, 1), "%"),
  textposition = 'outside',
  marker = list(color = 'rgba(255, 99, 132, 0.7)')
) %>%
  layout(
    title = "Divorce Rate by Education Level",
    xaxis = list(title = "Education Level", tickangle = -45),  # Tilt labels for readability
    yaxis = list(title = "Divorce Rate (%)")
  )


fig

Divorce by Income Level

divorce_by_income <- df %>%
  group_by(`Income Bracket`) %>%
  summarise(
    total_marriages = n(),
    total_divorces = sum(`Divorce Status` == "Yes"),
    divorce_rate = (total_divorces / total_marriages) * 100
  ) %>%
  mutate(`Income Bracket` = factor(`Income Bracket`, ordered = TRUE, 
                                   levels = c("Low (<30k)", "Middle (30k-100k)", "High (>100k)"))) %>%
  arrange(`Income Bracket`)

fig <- plot_ly(
  data = divorce_by_income,
  x = ~`Income Bracket`,
  y = ~divorce_rate,
  type = 'bar',
  text = ~paste0(round(divorce_rate, 1), "%"),
  textposition = 'outside',
  marker = list(color = 'rgba(75, 192, 192, 0.7)')
) %>%
  layout(
    title = "Divorce Rate by Income Bracket",
    xaxis = list(title = "Income Bracket"),
    yaxis = list(title = "Divorce Rate (%)")
  )

fig

Divorce by Urbanization Level

divorce_by_urbanization <- data %>%
  group_by(`Urban/Rural`) %>%
  summarise(
    total_marriages = n(),
    total_divorces = sum(`Divorce Status` == "Yes"),
    divorce_rate = (total_divorces / total_marriages) * 100
  )

fig <- plot_ly(
  data = divorce_by_urbanization,
  x = ~`Urban/Rural`,
  y = ~divorce_rate,
  type = 'bar',
  text = ~paste0(round(divorce_rate, 1), "%"),
  textposition = 'outside',
  marker = list(color = 'rgba(75, 192, 192, 0.7)')
) %>%
  layout(
    title = "Divorce Rate by Urban/Rural",
    xaxis = list(title = "Urban/Rural"),
    yaxis = list(title = "Divorce Rate (%)")
  )

fig

Model Training and Evaluation

Using 5 machine learning models: Logistic Regression, Decision Tree, Random Forest, SVM and Neural Network which are equipped to handle binary target values like Yes/No, which is the case for Divorce which is either Divorce = Yes or Divorce = No.

Data Preparation

# Convert Divorce Status to Binary (0/1)
df <- df %>%
  mutate(Divorce_Binary = ifelse(`Divorce Status` == "Yes", 1, 0))

df_clean <- df %>%
  select(-c(`Unique ID`, `Income Level (INR per month)`, `Age Group at Marriage`, 
            `Marriage Duration (Years)`, `Divorce Status`))

# Split dataset (70-30)
set.seed(523)
train_index <- createDataPartition(df_clean$Divorce_Binary, p = 0.7, list = FALSE)
train_data <- df_clean[train_index, ]
test_data <- df_clean[-train_index, ]

colnames(train_data) <- make.names(colnames(train_data))  
colnames(test_data) <- make.names(colnames(test_data))

# Ensure proper factor levels
train_data$Divorce_Binary <- factor(train_data$Divorce_Binary, levels = c(0, 1))
test_data$Divorce_Binary <- factor(test_data$Divorce_Binary, levels = c(0, 1))

Model Evaluation Function

evaluate_model <- function(predictions, test_data) {
  if (is.factor(predictions)) {
    predictions <- as.numeric(as.character(predictions))  # Convert factor to numeric
  } else if (is.matrix(predictions) && ncol(predictions) == 2) {
    predictions <- ifelse(predictions[, 2] > 0.5, 1, 0)  # Use second column (class 1)
  } else if (is.numeric(predictions) || is.matrix(predictions)) {
    predictions <- ifelse(predictions > 0.5, 1, 0)  # Handle nnet's single probability output
  }
  
  # Confusion matrix
  cm <- confusionMatrix(factor(predictions, levels = c(0, 1)), 
                        factor(test_data$Divorce_Binary, levels = c(0, 1)))
  
  # Extract performance metrics
  list(
    Accuracy = cm$overall["Accuracy"],
    Sensitivity = cm$byClass["Sensitivity"],  # True Positive Rate
    Specificity = cm$byClass["Specificity"]   # True Negative Rate
  )
}

Train Models

# 1. Logistic Regression

log_model <- glm(Divorce_Binary ~ ., data = train_data, family = binomial)
log_predictions <- predict(log_model, newdata = test_data, type = "response")  # Returns probabilities
log_results <- evaluate_model(log_predictions, test_data)

#2. Decision Tree

tree_model <- train(
  x = train_data[, !names(train_data) %in% "Divorce_Binary"], 
  y = train_data$Divorce_Binary, 
  method = "rpart"
)
tree_predictions <- predict(tree_model, newdata = test_data)
tree_results <- evaluate_model(tree_predictions, test_data)

#3. Random Forest

rf_model <- randomForest(Divorce_Binary ~ ., data = train_data, ntree = 100)
rf_predictions <- predict(rf_model, newdata = test_data, type = "prob") 
rf_results <- evaluate_model(rf_predictions, test_data)

#4. SVM

svm_model <- svm(Divorce_Binary ~ ., data = train_data, probability = TRUE)
svm_predictions <- predict(svm_model, newdata = test_data, probability = TRUE)
svm_results <- evaluate_model(attr(svm_predictions, "probabilities"), test_data)

#5. Neural Network

nn_model <- nnet(Divorce_Binary ~ ., data = train_data, size = 5, maxit = 1000, trace = FALSE)
nn_predictions <- predict(nn_model, newdata = test_data, type = "raw")
nn_results <- evaluate_model(nn_predictions, test_data)

Model Results Comparison

# Compare the results of all the models
results_df <- data.frame(
  Model = c("Logistic Regression", "Decision Tree", "Random Forest", "SVM", "Neural Network"),
  Accuracy = c(log_results$Accuracy, tree_results$Accuracy, rf_results$Accuracy, svm_results$Accuracy, nn_results$Accuracy),
  Sensitivity = c(log_results$Sensitivity, tree_results$Sensitivity, rf_results$Sensitivity, svm_results$Sensitivity, nn_results$Sensitivity),
  Specificity = c(log_results$Specificity, tree_results$Specificity, rf_results$Specificity, svm_results$Specificity, nn_results$Specificity)
)

results_df

                Model  Accuracy Sensitivity Specificity
1 Logistic Regression 0.6805556   1.0000000  0.00000000
2       Decision Tree 0.6805556   1.0000000  0.00000000
3       Random Forest 0.6638889   0.9469388  0.06086957
4                 SVM 0.6805556   1.0000000  0.00000000
5      Neural Network 0.6805556   1.0000000  0.00000000

Most of the models have Sensitivity = 1 and Specificity = 0. This is problematic and may be an instance of over-fitting for the majority class which is Divorce = No. As we saw in the visual before, around 70% of the data is skewed towards No Divorce which makes it difficult for the model to predict Divorce cases.

One of the simple ways to fix this is using upSample, which creates synthetic datasets based on the existing ones to balance the overall datasets.

Using upSample to fix the Class Imbalance Issue

#Using upSample to fix the class imbalance issue

train_data_balanced <- upSample(x = train_data[, -which(names(train_data) == "Divorce_Binary")], 
                                y = train_data$Divorce_Binary)

# Rename the target column properly
colnames(train_data_balanced)[ncol(train_data_balanced)] <- "Divorce_Binary"

# Check new class distribution
table(train_data_balanced$Divorce_Binary)


  0   1 
581 581

Balanced Model Training

# 1. Logistic Regression

log_model <- glm(Divorce_Binary ~ ., data = train_data_balanced, family = binomial)
log_predictions <- predict(log_model, newdata = test_data, type = "response")  # Returns probabilities
log_results <- evaluate_model(log_predictions, test_data)

# 2. Decision Tree

tree_model <- train(Divorce_Binary ~ ., data = train_data_balanced, method = "rpart")
tree_predictions <- predict(tree_model, newdata = test_data)  # Factor output
tree_results <- evaluate_model(tree_predictions, test_data)

# 3. Random Forest

rf_model <- randomForest(Divorce_Binary ~ ., data = train_data_balanced, ntree = 100)
rf_predictions <- predict(rf_model, newdata = test_data, type = "prob")  # Probability matrix
rf_results <- evaluate_model(rf_predictions, test_data)

# 4. SVM

svm_model <- svm(Divorce_Binary ~ ., data = train_data_balanced, probability = TRUE)
svm_predictions <- predict(svm_model, newdata = test_data, probability = TRUE)
svm_results <- evaluate_model(attr(svm_predictions, "probabilities"), test_data) 

# 5. Neural Network

nn_model <- nnet(Divorce_Binary ~ ., data = train_data_balanced, size = 5, maxit = 1000, trace = FALSE)
nn_predictions <- predict(nn_model, newdata = test_data, type = "raw")  # Probability output
nn_results <- evaluate_model(nn_predictions, test_data)

Model Results Comparison after Retraining

results_df <- data.frame(
  Model = c("Logistic Regression", "Decision Tree", "Random Forest", "SVM", "Neural Network"),
  Accuracy = c(log_results$Accuracy, tree_results$Accuracy, rf_results$Accuracy, 
               svm_results$Accuracy, nn_results$Accuracy),
  Sensitivity = c(log_results$Sensitivity, tree_results$Sensitivity, rf_results$Sensitivity, 
                  svm_results$Sensitivity, nn_results$Sensitivity),
  Specificity = c(log_results$Specificity, tree_results$Specificity, rf_results$Specificity, 
                  svm_results$Specificity, nn_results$Specificity)
)
results_df

                Model  Accuracy Sensitivity Specificity
1 Logistic Regression 0.4861111   0.4693878   0.5217391
2       Decision Tree 0.4611111   0.4367347   0.5130435
3       Random Forest 0.5694444   0.7265306   0.2347826
4                 SVM 0.5166667   0.5510204   0.4434783
5      Neural Network 0.3611111   0.1142857   0.8869565

Although the accuracy has decreased,there seems to be more balance in terms of Sensitivity and Specificity.

Comparison of ROC of all the models

log_roc <- roc(test_data$Divorce_Binary, log_predictions)
tree_roc <- roc(test_data$Divorce_Binary, as.numeric(tree_predictions))
rf_roc <- roc(test_data$Divorce_Binary, rf_predictions[,2])
svm_roc <- roc(test_data$Divorce_Binary, attr(svm_predictions, "probabilities")[,2])
nn_roc <- roc(test_data$Divorce_Binary, nn_predictions)

plot(log_roc, col = "blue", lwd = 2, main = "ROC Curve Comparison for Divorce Prediction")
lines(tree_roc, col = "green", lwd = 2)
lines(rf_roc, col = "red", lwd = 2)
lines(svm_roc, col = "purple", lwd = 2)
lines(nn_roc, col = "orange", lwd = 2)

# Add Legend
legend("bottomright", legend = c("Logistic Regression", "Decision Tree", "Random Forest", "SVM", "Neural Network"), 
       col = c("blue", "green", "red", "purple", "orange"), lwd = 2)

There is no significant difference between the models in terms of ROC.

Comparison of AUC of all the models

# Compute AUC for each model
log_auc <- auc(log_roc)
tree_auc <- auc(tree_roc)
rf_auc <- auc(rf_roc)
svm_auc <- auc(svm_roc)
nn_auc <- auc(nn_roc)

# Create a dataframe for comparison
auc_results <- data.frame(
  Model = c("Logistic Regression", "Decision Tree", "Random Forest", "SVM", "Neural Network"),
  AUC = c(log_auc, tree_auc, rf_auc, svm_auc, nn_auc)
)

# Print AUC results
print(auc_results)

                Model       AUC
1 Logistic Regression 0.4890683
2       Decision Tree 0.4748891
3       Random Forest 0.5309139
4                 SVM 0.4952085
5      Neural Network 0.4985626

Random Forest model is the highest in terms of AUC but not by a lot.

Conclusion

Neither of the models yielded satisfactory results, indicating a need for further research such as hyper-parameterized testing or perhaps, other models. It may also be a good approach to examine the data for validity and whether it represents the population as a whole or not.

Data Source: https://www.kaggle.com/code/oktayrdeki/divorce-status-prediction-eda-ml