Introduction

While my previous papers about K-means clustering and dimension reduction are focused on grouping and explaining. In this paper adopts an association rule technique and discovers the characterization between GDP and mental health.

Method

Pre-processing

library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.5.2
## Warning: package 'readr' was built under R version 4.5.2
## Warning: package 'forcats' was built under R version 4.5.2
## Warning: package 'lubridate' was built under R version 4.5.2
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.6
## ✔ forcats   1.0.1     ✔ stringr   1.5.2
## ✔ ggplot2   4.0.0     ✔ tibble    3.3.0
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.1.0     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(dplyr)
library(arules)
## Warning: package 'arules' was built under R version 4.5.2
## Loading required package: Matrix
## 
## Attaching package: 'Matrix'
## 
## The following objects are masked from 'package:tidyr':
## 
##     expand, pack, unpack
## 
## 
## Attaching package: 'arules'
## 
## The following object is masked from 'package:dplyr':
## 
##     recode
## 
## The following objects are masked from 'package:base':
## 
##     abbreviate, write
library(arulesViz)
## Warning: package 'arulesViz' was built under R version 4.5.2
library(stringr)

gdp<-read.csv("GDP.csv")

mental<-read.csv("Mental Health dataset1.csv")

Variable categorization

Cre <-function(x){
  x<- str_trim(tolower(as.character(x)))
  ifelse(x == "yes", 1,
          ifelse(x =="no", 0, NA))
}

mental <- mental %>%
  mutate(
    Treatment = Cre(Treatment),
    SocialWeakness =Cre(SocialWeakness),
    IncreasingStress = Cre(IncreasingStress),
    MentalHealthHistory = Cre(MentalHealthHistory)
  )

Aggreation to country level

mental_country <- mental %>%
  group_by(Country)%>%
  summarise(
    n=n(),
    treatment_rate=mean(Treatment, na.rm =TRUE),
   socialWeakness_rate=mean(SocialWeakness, na.rm =T),
    increasingstress_rate=mean(IncreasingStress, na.rm =T),
    mentalhistory_rate=mean(MentalHealthHistory, na.rm = T)
  ) %>%
  ungroup() %>%
  filter(n >=10)

GDP data processing

gdp_clean<- gdp %>%
  filter(!is.na(X), nchar(X) == 3)

gdp_cl <- gdp_clean %>%
  transmute(
    country = X.2,
    gdp_pc = as.numeric(gsub(",", "", X.3))
  )
## Warning: There was 1 warning in `transmute()`.
## ℹ In argument: `gdp_pc = as.numeric(gsub(",", "", X.3))`.
## Caused by warning:
## ! NAs introduced by coercion
head(gdp_cl)
##          country   gdp_pc
## 1  United States 28750956
## 2          China 18743803
## 3        Germany  4685593
## 4          Japan  4027598
## 5          India  3909892
## 6 United Kingdom  3686033
summary(gdp_cl$gdp_pc)
##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max.      NA's 
##        62     10651     44458   2003108    347034 110982661         8
normal_country <- function(x){
  x %>%
  str_to_lower() %>%
  str_trim() %>%
  str_replace_all("&", "and")%>%
  str_replace_all("[^a-z]", " ")%>%
  str_squish()
}
gdp_country <- gdp %>%
  filter(!is.na(X), nchar(as.character(X))==3) %>%
  transmute(
    Country = as.character(X.2),
    gdp_pc = as.numeric(gsub(",","", as.character(X.3)))
  )%>%
  filter(!is.na(gdp_pc))%>%
  mutate(country_key= normal_country(Country))
## Warning: There was 1 warning in `transmute()`.
## ℹ In argument: `gdp_pc = as.numeric(gsub(",", "", as.character(X.3)))`.
## Caused by warning:
## ! NAs introduced by coercion
mental_country<-mental_country%>%
  mutate(country_key=normal_country(Country))

df<-mental_country %>%
  left_join(gdp_country %>% select(country_key, gdp_pc), by="country_key")%>%
  filter(!is.na(gdp_pc))

Association rules

df<- df %>%
  mutate(
    GDP_High = ifelse(gdp_pc >= median(gdp_pc), "High_GDP", "Low_GDP"),
    Stress_High =ifelse(increasingstress_rate>= median(increasingstress_rate),"High_stress", "Low_Stress"),
    Treatment_High =ifelse(treatment_rate>= median(treatment_rate),"High_Treatment", "Low_Treatment"),
    socialWeakness_High=ifelse(socialWeakness_rate>=median(socialWeakness_rate),"High_Socialweakness", "Low_SocialWeakness")
  )

Categorization

item_col<- c("GDP_High", "Stress_High", "Treatment_High", "socialWeakness_High")

transactions_list<-df %>%
  mutate(rowid=row_number())%>%
  select(rowid,all_of(item_col))%>%
  pivot_longer(cols=all_of(item_col), names_to ="var", values_to ="item")%>%
  group_by(rowid)%>%
  
 summarise(items =list(as.character(item)), .groups = "drop")%>%
  pull(items)

tr<- as(transactions_list, "transactions")

summary(tr)
## transactions as itemMatrix in sparse format with
##  32 rows (elements/itemsets/transactions) and
##  7 columns (items) and a density of 0.5714286 
## 
## most frequent items:
##      High_Treatment High_Socialweakness            High_GDP         High_stress 
##                  32                  17                  16                  16 
##             Low_GDP             (Other) 
##                  16                  31 
## 
## element (itemset/transaction) length distribution:
## sizes
##  4 
## 32 
## 
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##       4       4       4       4       4       4 
## 
## includes extended item information - examples:
##                labels
## 1            High_GDP
## 2 High_Socialweakness
## 3         High_stress
rules<- apriori(
  tr, 
  parameter = list(
    supp = 0.2,
    conf = 0.6,
    minlen = 2
  )
)
## Apriori
## 
## Parameter specification:
##  confidence minval smax arem  aval originalSupport maxtime support minlen
##         0.6    0.1    1 none FALSE            TRUE       5     0.2      2
##  maxlen target  ext
##      10  rules TRUE
## 
## Algorithmic control:
##  filter tree heap memopt load sort verbose
##     0.1 TRUE TRUE  FALSE TRUE    2    TRUE
## 
## Absolute minimum support count: 6 
## 
## set item appearances ...[0 item(s)] done [0.00s].
## set transactions ...[7 item(s), 32 transaction(s)] done [0.00s].
## sorting and recoding items ... [7 item(s)] done [0.00s].
## creating transaction tree ... done [0.00s].
## checking subsets of size 1 2 3 4 done [0.00s].
## writing ... [37 rule(s)] done [0.00s].
## creating S4 object  ... done [0.00s].
summary(rules)
## set of 37 rules
## 
## rule length distribution (lhs + rhs):sizes
##  2  3  4 
## 14 19  4 
## 
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    2.00    2.00    3.00    2.73    3.00    4.00 
## 
## summary of quality measures:
##     support         confidence        coverage           lift      
##  Min.   :0.2188   Min.   :0.6250   Min.   :0.2188   Min.   :1.000  
##  1st Qu.:0.2500   1st Qu.:0.6250   1st Qu.:0.3125   1st Qu.:1.000  
##  Median :0.3125   Median :0.7000   Median :0.5000   Median :1.250  
##  Mean   :0.3226   Mean   :0.7968   Mean   :0.4155   Mean   :1.184  
##  3rd Qu.:0.3438   3rd Qu.:1.0000   3rd Qu.:0.5000   3rd Qu.:1.294  
##  Max.   :0.5312   Max.   :1.0000   Max.   :0.5312   Max.   :1.556  
##      count      
##  Min.   : 7.00  
##  1st Qu.: 8.00  
##  Median :10.00  
##  Mean   :10.32  
##  3rd Qu.:11.00  
##  Max.   :17.00  
## 
## mining info:
##  data ntransactions support confidence
##    tr            32     0.2        0.6
##                                                                      call
##  apriori(data = tr, parameter = list(supp = 0.2, conf = 0.6, minlen = 2))
rules_lift<-sort(rules,by="lift", decreasing = T)
inspect(head(rules_lift,10))
##      lhs                       rhs                   support confidence coverage     lift count
## [1]  {High_Socialweakness,                                                                     
##       High_stress}          => {High_GDP}            0.21875  0.7777778  0.28125 1.555556     7
## [2]  {High_Socialweakness,                                                                     
##       High_stress,                                                                             
##       High_Treatment}       => {High_GDP}            0.21875  0.7777778  0.28125 1.555556     7
## [3]  {Low_SocialWeakness}   => {Low_GDP}             0.31250  0.6666667  0.46875 1.333333    10
## [4]  {Low_GDP}              => {Low_SocialWeakness}  0.31250  0.6250000  0.50000 1.333333    10
## [5]  {High_Treatment,                                                                          
##       Low_SocialWeakness}   => {Low_GDP}             0.31250  0.6666667  0.46875 1.333333    10
## [6]  {High_Treatment,                                                                          
##       Low_GDP}              => {Low_SocialWeakness}  0.31250  0.6250000  0.50000 1.333333    10
## [7]  {High_GDP,                                                                                
##       High_stress}          => {High_Socialweakness} 0.21875  0.7000000  0.31250 1.317647     7
## [8]  {High_GDP,                                                                                
##       High_stress,                                                                             
##       High_Treatment}       => {High_Socialweakness} 0.21875  0.7000000  0.31250 1.317647     7
## [9]  {High_GDP}             => {High_Socialweakness} 0.34375  0.6875000  0.50000 1.294118    11
## [10] {High_Socialweakness}  => {High_GDP}            0.34375  0.6470588  0.53125 1.294118    11
plot(rules_lift, measure = c("support", "confidence"),shading = "lift")
## To reduce overplotting, jitter is added! Use jitter = 0 to prevent jitter.

#Result In this paper adopts association rules with relatively high confidence and lift values mining for exploring GDP and mental health Data. Each country was considered by composed categorical items. Capita and mental health indicators were discretized by binary categories based on their median. The Apriori generated association rules model finds meaningful patterns.

The first pattern is regular strong rules indicating that high GDP countries tend to be related with high treatment experience rate. This interruption is able to be explained because economically developed countries provide a lot of mental treatment opportunities.

The second pattern is low GDP countries that more frequently tend to have high stress or social weakness. This pattern is similar to a low GDP country real-world problem. They get much stress from status however they are not able to access mental treatment properly. Overall, association rules provide an explainable standard beside clustering that focuses on grouping or dimension reduction. Association rules remain. Therefore it is more useful.