This R Markdown document accompanies the manuscript “The Effects of Prison Experience and Social Support on Mental Health Among Inmates in Brazil” by Luis Anunciação, PhD, and colleagues. It provides a fully reproducible analysis pipeline for data preparation, descriptive statistics, psychometric processing, and hierarchical multiple regression modeling. In particular, it includes a replication of SPSS-style stepwise and blockwise regression procedures using R, with clear annotation and transparent code. While certain analytic steps were included to match legacy outputs (e.g., stepwise regression), the overall structure prioritizes clarity, reproducibility, and theoretical coherence.

Last update: April 5, 2025

Packages

pacman::p_load(tidyverse, mirt, psych, janitor,  arsenal, summarytools, correlation, olsrr,jtools, interactions)

Data

library(haven) 
df <- read_sav("https://osf.io/download/69mev/")
backup <- df

Data processing

Clean data

df = clean_names(df)
df = remove_empty(df)

## value for "which" not specified, defaulting to c("rows", "cols")

Pessoal do nosso grupo, sempre que um filho da puta mandar uma base no spss, removam os atributos.

df <- df %>% 
  mutate_all(function(x) { attributes(x) <- NULL; x })

Fix it

Quantidade de participantes

df = df %>% 
  filter(sujeito != '')

df %>% arrange(sujeito)

Idade

df %>% select(idade) %>% DataExplorer::profile_missing()

df = df %>%
  mutate(age = replace_na(idade, mean(idade, na.rm = TRUE)))

df %>% select(idade, age)

sexo

df %>% select(genero) %>% DataExplorer::plot_missing()

df = df %>% mutate(gender = "male")
df %>% count(genero, gender)

Tempo preso

qplot(df$temp_preso_dias)

## Warning: `qplot()` was deprecated in ggplot2 3.4.0.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## Warning: Removed 15 rows containing non-finite outside the scale range
## (`stat_bin()`).

arsenal::tableby(~temp_preso_dias, data = df) %>% summary(text = T, digits = 2) %>% as.data.frame()

df = df %>%
  mutate(days_in_prision = replace_na(temp_preso_dias,
                                        mean(temp_preso_dias, na.rm = TRUE)))
df %>% select(temp_preso_dias, days_in_prision)

df = df %>%
  mutate(months_in_prision = round(days_in_prision/30,1))

df %>%
  select(days_in_prision, months_in_prision) %>%
  arrange(days_in_prision)

df = df %>%
  mutate(age_in_days = age * 365) %>% #converte a idade em dias
  mutate(years_in_prision = days_in_prision/365) %>% #converte o tempo preso em anos
  #select(idade_c,temp_preso_dias_c,temp_preso_ano) %>% 
  mutate(age_when_arrested = age-years_in_prision) %>% #verifica a idade que a pessoa foi presa
  filter(age_when_arrested > 17) #se a pessoa tinha menos de 18 anos, não usa == 242

Estado civil

df = df %>%
  mutate(marital_status = case_when(
    estado_civil == 1 ~ "single",
    estado_civil == 2 ~ "married",
    estado_civil == 3 ~ "serious",
    estado_civil == 4 ~ "casual",
    estado_civil == 5 ~ "widower",
    estado_civil == 6 ~ "divorced",
    TRUE ~ NA_character_
  )) 
count(df, marital_status)

Children

df = df %>%
  mutate(children = case_when(
    filhos == 1 ~ "yes",
    filhos == 2 ~ "no",
    TRUE ~ NA_character_
  ))

df = df %>%
  rename(how_many_children = quant_filhos
  )

Escolaridade

df %>% count(escolaridade)

df = df %>%
  mutate(education = case_when(
    escolaridade == 1 ~ "1 no formal education",
    escolaridade == 2 ~ "2 Elementary - incomplete",
    escolaridade == 3 ~ "3 Elementary - commplete",
    escolaridade == 4 ~ "4 High school - incomplete",
    escolaridade == 5 ~ "5 High school - complete",
    escolaridade == 6 ~ "6 Undergrad - incomplete or complete",
    escolaridade == 7 ~ "6 Undergrad - incomplete or complete",
    TRUE ~ NA_character_
  )) 
count(df, education)

##Escolaridade shorter

df %>% count(escolaridade)

df = df %>%
  mutate(education_macro = case_when(
    escolaridade == 1 ~ "1 no formal education",
    escolaridade == 2 ~ "2 Elementary",
    escolaridade == 3 ~ "2 Elementary",
    escolaridade == 4 ~ "3 High school",
    escolaridade == 5 ~ "3 High school",
    escolaridade == 6 ~ "4 Undergrad",
    escolaridade == 7 ~ "4 Undergrad",
    TRUE ~ NA_character_
  )) 
count(df, education_macro)

Religião

df = df %>%
  mutate(religion = case_when(
  religiao == "1" ~ "catholic",
  religiao == "2" ~ "spiritist",
  religiao == "3" ~ "evangelical",
  religiao == "4" ~ "african religion",
  religiao == "5" ~ "buddhist",
  religiao == "6" ~ "adventist",
  religiao == "7" ~ "atheist or agnostic",
  religiao == "8" ~ "spirituality",
  religiao == "9" ~ "atheist or agnostic",
  religiao == "10" ~ "atheist or agnostic",
  religiao == "crista" ~ "catholic",
  religiao == "protestante" ~ "evangelical",
  TRUE ~ NA_character_
  ))
count(df, religiao)

count(df, religion)

Etnia

df = df %>%
  mutate(race = case_when(
  etnia == 1 ~ "black",
  etnia == 2 ~ "white",
  etnia == 3 ~ "mixed black",
  etnia == 4 ~ "indigenous",
  etnia == 5 ~ "mixed asian",
  TRUE ~ NA_character_
  ))
count(df, race)

drugs

df = df %>%
  mutate(drugs = case_when(
  drogas == 1 ~ "yes",
  drogas == 2 ~ "no",
  TRUE ~ NA_character_
  ))
count(df, drugs)

relationship

df = df %>%
  mutate(relationship = case_when(
    estado_civil == 1 ~ "no",
    estado_civil == 2 ~ "yes",
    estado_civil == 3 ~ "yes",
    estado_civil == 4 ~ "yes",
    estado_civil == 5 ~ "yes",
    estado_civil == 6 ~ "no",
    TRUE ~ NA_character_
  ))

tabyl(df, estado_civil,relationship)

Scores

Self Report Questionnaire (SRQ)

This is a dependent variable in our study – this questionnaire is about mental health The higher the worse

df %>%
  select(srq_1:srq_20) %>%
  mutate_all(., as.factor) %>%
  DataExplorer::plot_bar()

df %>%
  select(sep_1:sep_12) %>% DataExplorer::profile_missing()

Experiences in prision (SEP)

This is an idependent variable in our study – the higher the better

df %>%
  select(sep_1:sep_12) %>%
  mutate_all(., as.factor) %>%
  DataExplorer::plot_bar()

df = df %>% mutate_at(vars(sep_1:sep_12), ~if_else(!. %in% C(1:5),NA, .))

df %>%
  select(sep_1:sep_12) %>% DataExplorer::profile_missing()

Social support (EPUS)

This is an idependent variable – the higher the better

df %>%
  select(epsus_1:epsus_36) %>%
  mutate_all(., as.factor) %>%
  DataExplorer::plot_bar()

df = df %>% mutate_at(vars(epsus_1:epsus_36), ~if_else(!. %in% C(0:3),NA, .))

df %>%
  select(epsus_1:epsus_36) %>%
  mutate_all(., as.factor) %>%
  DataExplorer::plot_bar()

df %>%
  select(epsus_1:epsus_36) %>% DataExplorer::profile_missing()

Totals

SRQ

df = df %>% 
 mutate(mhealth_total = rowSums(select(., srq_1:srq_20), na.rm=T)) %>%
  mutate(mhealth_mean = rowMeans(select(., srq_1:srq_20), na.rm=T))

SEP

df = df %>% 
 mutate(sep_total = rowSums(select(., sep_1:sep_12), na.rm=T)) %>%
  mutate(sep_mean = rowMeans(select(., sep_1:sep_12), na.rm=T))

EPSUS

For this scale, we will do a factor analysis and get only items > 0.4 We are doing that because the first analysis was on spss and this was the result

fit <- df %>%  select(epsus_1:epsus_36) %>%
  fa(., nfactors = 5,fm = "pa",  rotate = "Promax")

unclass(fit$loadings) %>%
  as.data.frame() %>%
  mutate_all(., ~if_else(. < 0.4,NA,.)) %>%
  mutate(not_loading = rowSums(is.na(.))) %>%
  filter(not_loading < 5) %>%
  rownames_to_column("item") %>% pull(item)

##  [1] "epsus_1"  "epsus_4"  "epsus_5"  "epsus_6"  "epsus_7"  "epsus_9" 
##  [7] "epsus_12" "epsus_13" "epsus_16" "epsus_17" "epsus_18" "epsus_19"
## [13] "epsus_20" "epsus_22" "epsus_23" "epsus_24" "epsus_25" "epsus_26"
## [19] "epsus_27" "epsus_28" "epsus_29" "epsus_30" "epsus_31" "epsus_32"
## [25] "epsus_33" "epsus_34" "epsus_35" "epsus_36"

df %>% select(.,"epsus_1","epsus_4","epsus_5","epsus_6","epsus_7","epsus_9","epsus_12","epsus_13","epsus_16","epsus_17","epsus_18","epsus_19","epsus_20","epsus_22","epsus_23","epsus_24","epsus_25","epsus_26","epsus_27","epsus_28","epsus_29","epsus_30","epsus_31","epsus_32","epsus_33","epsus_34","epsus_35","epsus_36")%>% describe()

df = df %>% 
  mutate(epsus_total = rowSums(select(.,"epsus_1","epsus_4","epsus_5","epsus_6","epsus_7","epsus_9","epsus_12","epsus_13","epsus_16","epsus_17","epsus_18","epsus_19","epsus_20","epsus_22","epsus_23","epsus_24","epsus_25","epsus_26","epsus_27","epsus_28","epsus_29","epsus_30","epsus_31","epsus_32","epsus_33","epsus_34","epsus_35","epsus_36"), na.rm=T),
         epsus_mean = rowMeans(select(.,"epsus_1","epsus_4","epsus_5","epsus_6","epsus_7","epsus_9","epsus_12","epsus_13","epsus_16","epsus_17","epsus_18","epsus_19","epsus_20","epsus_22","epsus_23","epsus_24","epsus_25","epsus_26","epsus_27","epsus_28","epsus_29","epsus_30","epsus_31","epsus_32","epsus_33","epsus_34","epsus_35","epsus_36"), na.rm=T))

df %>% select(epsus_total) %>% DataExplorer::profile_missing()

Save df

write.csv(df, "df.csv", row.names = F)

Data analysis

Table one

df %>% 
  select(age, gender, marital_status, race, children, how_many_children, 
         education_macro, months_in_prision, religion) %>%
  tableby( ~ ., data= . ,) %>% summary(text = T, digits = 2) %>%
  as.data.frame()

All scales

df %>%
  select(mhealth_total, sep_total, epsus_total) %>%
  tableby(~., data = ., 
          control = tableby.control(numeric.stats=c("N", "meansd", "median"),
                               cat.stats=c("countpct"))) %>% 
  summary(text = T, digits = 2 ) %>%
  as.data.frame()

df %>%
  select(mhealth_total, sep_total, epsus_total) %>%
  rename("Mental Health" = mhealth_total,
         "Perceived Social Support" = epsus_total,
         "Experience in Prison" = sep_total) %>%
  scale() %>% as.data.frame() %>%
  pivot_longer(everything()) %>%
  ggplot(., aes(x = value, fill = name)) + geom_density(alpha=0.3) +
  scale_x_continuous(breaks=seq(-3,3,1), limits=c(-4,3))+
  labs(x = "Z - score", y = "Density", fill = "") + 
  theme_bw() + theme(legend.position = "bottom")

## Warning: Removed 3 rows containing non-finite outside the scale range
## (`stat_density()`).

Reliability

Self-Reporting Questionnaire (SRQ-20)

df %>%
  select(srq_1:srq_20) %>% alpha()

## 
## Reliability analysis   
## Call: alpha(x = .)
## 
##   raw_alpha std.alpha G6(smc) average_r S/N  ase mean   sd median_r
##       0.89      0.89     0.9      0.28 7.9 0.01  0.5 0.26     0.28
## 
##     95% confidence boundaries 
##          lower alpha upper
## Feldt     0.87  0.89  0.91
## Duhachek  0.87  0.89  0.91
## 
##  Reliability if an item is dropped:
##        raw_alpha std.alpha G6(smc) average_r S/N alpha se  var.r med.r
## srq_1       0.88      0.88    0.90      0.28 7.5    0.011 0.0083  0.28
## srq_2       0.88      0.88    0.90      0.28 7.5    0.011 0.0086  0.28
## srq_3       0.88      0.88    0.90      0.29 7.6    0.011 0.0080  0.28
## srq_4       0.88      0.88    0.90      0.28 7.5    0.011 0.0085  0.27
## srq_5       0.89      0.89    0.90      0.29 7.8    0.011 0.0082  0.28
## srq_6       0.88      0.88    0.90      0.28 7.4    0.011 0.0083  0.28
## srq_7       0.88      0.88    0.90      0.28 7.4    0.011 0.0079  0.28
## srq_8       0.88      0.88    0.90      0.28 7.5    0.011 0.0085  0.28
## srq_9       0.89      0.89    0.90      0.29 7.7    0.011 0.0077  0.28
## srq_10      0.88      0.88    0.90      0.28 7.6    0.011 0.0087  0.28
## srq_11      0.88      0.88    0.89      0.27 7.2    0.011 0.0078  0.27
## srq_12      0.88      0.88    0.90      0.28 7.5    0.011 0.0081  0.28
## srq_13      0.88      0.88    0.90      0.28 7.5    0.011 0.0086  0.28
## srq_14      0.89      0.89    0.90      0.29 7.8    0.011 0.0077  0.28
## srq_15      0.88      0.88    0.90      0.28 7.5    0.011 0.0079  0.28
## srq_16      0.88      0.88    0.90      0.28 7.4    0.011 0.0077  0.28
## srq_17      0.88      0.88    0.90      0.28 7.4    0.011 0.0082  0.28
## srq_18      0.88      0.88    0.90      0.28 7.3    0.011 0.0080  0.28
## srq_19      0.88      0.88    0.90      0.28 7.5    0.011 0.0079  0.28
## srq_20      0.88      0.88    0.90      0.28 7.5    0.011 0.0083  0.28
## 
##  Item statistics 
##          n raw.r std.r r.cor r.drop mean   sd
## srq_1  232  0.59  0.58  0.55   0.51 0.64 0.48
## srq_2  232  0.56  0.55  0.51   0.48 0.66 0.47
## srq_3  229  0.51  0.52  0.49   0.45 0.82 0.39
## srq_4  229  0.59  0.58  0.54   0.51 0.42 0.49
## srq_5  229  0.46  0.45  0.41   0.38 0.33 0.47
## srq_6  230  0.59  0.61  0.58   0.54 0.78 0.41
## srq_7  228  0.59  0.60  0.58   0.54 0.47 0.50
## srq_8  229  0.56  0.56  0.53   0.50 0.46 0.50
## srq_9  231  0.45  0.47  0.43   0.40 0.88 0.32
## srq_10 231  0.52  0.54  0.50   0.46 0.51 0.50
## srq_11 227  0.70  0.69  0.68   0.64 0.51 0.50
## srq_12 218  0.55  0.54  0.51   0.48 0.38 0.49
## srq_13 217  0.57  0.57  0.54   0.51 0.27 0.45
## srq_14 217  0.43  0.46  0.41   0.38 0.14 0.35
## srq_15 215  0.57  0.57  0.55   0.50 0.35 0.48
## srq_16 214  0.60  0.61  0.59   0.55 0.27 0.45
## srq_17 218  0.61  0.61  0.59   0.55 0.29 0.46
## srq_18 216  0.65  0.64  0.62   0.58 0.46 0.50
## srq_19 218  0.58  0.57  0.55   0.51 0.56 0.50
## srq_20 214  0.58  0.58  0.55   0.52 0.58 0.49
## 
## Non missing response frequency for each item
##           0    1 miss
## srq_1  0.36 0.64 0.04
## srq_2  0.34 0.66 0.04
## srq_3  0.18 0.82 0.05
## srq_4  0.58 0.42 0.05
## srq_5  0.67 0.33 0.05
## srq_6  0.22 0.78 0.05
## srq_7  0.53 0.47 0.06
## srq_8  0.54 0.46 0.05
## srq_9  0.12 0.88 0.05
## srq_10 0.49 0.51 0.05
## srq_11 0.49 0.51 0.06
## srq_12 0.62 0.38 0.10
## srq_13 0.73 0.27 0.10
## srq_14 0.86 0.14 0.10
## srq_15 0.65 0.35 0.11
## srq_16 0.73 0.27 0.12
## srq_17 0.71 0.29 0.10
## srq_18 0.54 0.46 0.11
## srq_19 0.44 0.56 0.10
## srq_20 0.42 0.58 0.12

Scale of Experience in Prison (SEP)

df %>%
  select(sep_1:sep_12) %>% alpha()

## 
## Reliability analysis   
## Call: alpha(x = .)
## 
##   raw_alpha std.alpha G6(smc) average_r S/N   ase mean   sd median_r
##       0.85      0.86    0.87      0.34 6.1 0.014  3.6 0.71     0.34
## 
##     95% confidence boundaries 
##          lower alpha upper
## Feldt     0.83  0.85  0.88
## Duhachek  0.83  0.85  0.88
## 
##  Reliability if an item is dropped:
##        raw_alpha std.alpha G6(smc) average_r S/N alpha se  var.r med.r
## sep_1       0.85      0.85    0.87      0.34 5.8    0.014 0.0108  0.35
## sep_2       0.84      0.84    0.85      0.33 5.4    0.015 0.0098  0.32
## sep_3       0.83      0.84    0.85      0.32 5.2    0.016 0.0096  0.30
## sep_4       0.84      0.85    0.86      0.34 5.6    0.015 0.0116  0.35
## sep_5       0.84      0.84    0.85      0.32 5.2    0.016 0.0101  0.32
## sep_6       0.84      0.84    0.86      0.33 5.4    0.015 0.0109  0.33
## sep_7       0.85      0.85    0.87      0.35 5.9    0.014 0.0100  0.35
## sep_8       0.85      0.85    0.86      0.34 5.7    0.015 0.0119  0.35
## sep_9       0.84      0.85    0.86      0.34 5.6    0.015 0.0114  0.35
## sep_10      0.84      0.85    0.86      0.33 5.5    0.015 0.0113  0.33
## sep_11      0.85      0.86    0.87      0.35 5.9    0.014 0.0107  0.35
## sep_12      0.85      0.85    0.86      0.34 5.7    0.015 0.0120  0.35
## 
##  Item statistics 
##          n raw.r std.r r.cor r.drop mean   sd
## sep_1  235  0.58  0.56  0.51   0.46  3.0 1.32
## sep_2  233  0.67  0.68  0.66   0.61  3.8 1.06
## sep_3  233  0.72  0.73  0.71   0.66  3.7 1.05
## sep_4  236  0.63  0.63  0.59   0.54  3.6 1.12
## sep_5  236  0.71  0.72  0.71   0.64  3.9 1.11
## sep_6  235  0.68  0.68  0.65   0.59  3.7 1.22
## sep_7  232  0.51  0.54  0.48   0.43  4.3 0.91
## sep_8  232  0.60  0.58  0.53   0.49  2.7 1.30
## sep_9  233  0.60  0.61  0.56   0.51  3.9 1.08
## sep_10 235  0.63  0.65  0.62   0.56  4.2 0.96
## sep_11 235  0.55  0.53  0.47   0.43  2.6 1.28
## sep_12 237  0.60  0.59  0.53   0.48  3.8 1.32
## 
## Non missing response frequency for each item
##           1    2    3    4    5 miss
## sep_1  0.21 0.11 0.23 0.33 0.11 0.03
## sep_2  0.05 0.08 0.13 0.48 0.27 0.04
## sep_3  0.05 0.09 0.17 0.48 0.21 0.04
## sep_4  0.07 0.10 0.16 0.48 0.18 0.02
## sep_5  0.08 0.05 0.07 0.51 0.30 0.02
## sep_6  0.09 0.11 0.07 0.48 0.25 0.03
## sep_7  0.03 0.03 0.03 0.41 0.50 0.04
## sep_8  0.22 0.27 0.17 0.25 0.09 0.04
## sep_9  0.05 0.08 0.12 0.46 0.29 0.04
## sep_10 0.03 0.04 0.03 0.43 0.46 0.03
## sep_11 0.26 0.23 0.22 0.22 0.08 0.03
## sep_12 0.12 0.07 0.08 0.37 0.37 0.02

Epus

df %>% select(.,"epsus_1","epsus_4","epsus_5","epsus_6","epsus_7","epsus_9","epsus_12","epsus_13","epsus_16","epsus_17","epsus_18","epsus_19","epsus_20","epsus_22","epsus_23","epsus_24","epsus_25","epsus_26","epsus_27","epsus_28","epsus_29","epsus_30","epsus_31","epsus_32","epsus_33","epsus_34","epsus_35","epsus_36")  %>% alpha()

## 
## Reliability analysis   
## Call: alpha(x = .)
## 
##   raw_alpha std.alpha G6(smc) average_r S/N    ase mean   sd median_r
##       0.95      0.95    0.97      0.42  20 0.0045  1.7 0.71     0.42
## 
##     95% confidence boundaries 
##          lower alpha upper
## Feldt     0.94  0.95  0.96
## Duhachek  0.94  0.95  0.96
## 
##  Reliability if an item is dropped:
##          raw_alpha std.alpha G6(smc) average_r S/N alpha se  var.r med.r
## epsus_1       0.95      0.95    0.97      0.42  20   0.0045 0.0098  0.42
## epsus_4       0.95      0.95    0.96      0.42  20   0.0045 0.0106  0.42
## epsus_5       0.95      0.95    0.96      0.42  20   0.0045 0.0102  0.42
## epsus_6       0.95      0.95    0.96      0.42  20   0.0045 0.0105  0.42
## epsus_7       0.95      0.95    0.96      0.42  20   0.0045 0.0103  0.42
## epsus_9       0.95      0.95    0.96      0.42  19   0.0046 0.0105  0.42
## epsus_12      0.95      0.95    0.96      0.42  19   0.0046 0.0101  0.42
## epsus_13      0.95      0.95    0.96      0.41  19   0.0047 0.0108  0.41
## epsus_16      0.95      0.95    0.96      0.42  19   0.0046 0.0109  0.42
## epsus_17      0.95      0.95    0.96      0.41  19   0.0047 0.0099  0.41
## epsus_18      0.95      0.95    0.96      0.41  19   0.0047 0.0103  0.41
## epsus_19      0.95      0.95    0.96      0.41  19   0.0047 0.0107  0.41
## epsus_20      0.95      0.95    0.96      0.42  19   0.0046 0.0106  0.42
## epsus_22      0.95      0.95    0.96      0.41  19   0.0046 0.0103  0.42
## epsus_23      0.95      0.95    0.96      0.42  20   0.0045 0.0100  0.42
## epsus_24      0.95      0.95    0.96      0.41  19   0.0047 0.0102  0.41
## epsus_25      0.95      0.95    0.96      0.41  19   0.0047 0.0105  0.41
## epsus_26      0.95      0.95    0.96      0.41  19   0.0047 0.0106  0.41
## epsus_27      0.95      0.95    0.96      0.41  19   0.0047 0.0105  0.42
## epsus_28      0.95      0.95    0.96      0.41  19   0.0047 0.0104  0.42
## epsus_29      0.95      0.95    0.96      0.42  19   0.0046 0.0106  0.42
## epsus_30      0.95      0.95    0.96      0.42  19   0.0046 0.0103  0.42
## epsus_31      0.95      0.95    0.96      0.42  19   0.0046 0.0107  0.42
## epsus_32      0.95      0.95    0.96      0.41  19   0.0047 0.0105  0.42
## epsus_33      0.95      0.95    0.96      0.42  19   0.0046 0.0105  0.42
## epsus_34      0.95      0.95    0.96      0.41  19   0.0047 0.0102  0.41
## epsus_35      0.95      0.95    0.96      0.42  19   0.0046 0.0105  0.42
## epsus_36      0.95      0.95    0.96      0.41  19   0.0046 0.0105  0.42
## 
##  Item statistics 
##            n raw.r std.r r.cor r.drop mean   sd
## epsus_1  231  0.51  0.50  0.48   0.45  1.8 1.06
## epsus_4  228  0.58  0.58  0.56   0.54  1.5 1.06
## epsus_5  224  0.54  0.54  0.52   0.50  1.5 1.07
## epsus_6  229  0.59  0.60  0.58   0.55  1.8 0.99
## epsus_7  228  0.56  0.56  0.55   0.52  1.7 0.94
## epsus_9  228  0.66  0.65  0.64   0.62  2.0 1.09
## epsus_12 223  0.63  0.62  0.61   0.59  1.7 1.24
## epsus_13 225  0.71  0.71  0.70   0.68  1.7 1.01
## epsus_16 219  0.64  0.64  0.63   0.61  1.6 1.00
## epsus_17 224  0.72  0.71  0.71   0.69  1.5 1.17
## epsus_18 227  0.73  0.73  0.72   0.70  1.6 1.15
## epsus_19 226  0.73  0.72  0.72   0.69  1.8 1.06
## epsus_20 227  0.67  0.67  0.66   0.64  2.0 0.98
## epsus_22 226  0.69  0.69  0.68   0.66  2.1 1.07
## epsus_23 222  0.53  0.53  0.51   0.49  1.5 1.06
## epsus_24 227  0.77  0.77  0.77   0.75  1.9 1.08
## epsus_25 223  0.74  0.74  0.73   0.71  2.0 1.05
## epsus_26 227  0.74  0.75  0.74   0.71  1.7 0.96
## epsus_27 224  0.72  0.71  0.70   0.68  1.2 1.12
## epsus_28 224  0.72  0.72  0.71   0.69  1.7 0.99
## epsus_29 226  0.66  0.65  0.64   0.62  1.5 1.07
## epsus_30 228  0.65  0.65  0.64   0.62  1.9 1.08
## epsus_31 226  0.64  0.64  0.62   0.60  1.4 1.04
## epsus_32 223  0.70  0.71  0.70   0.68  1.9 0.99
## epsus_33 230  0.64  0.64  0.63   0.60  2.2 0.98
## epsus_34 229  0.74  0.74  0.73   0.70  2.0 1.05
## epsus_35 230  0.66  0.65  0.64   0.63  1.6 1.10
## epsus_36 229  0.69  0.68  0.67   0.65  1.8 1.12
## 
## Non missing response frequency for each item
##             0    1    2    3 miss
## epsus_1  0.12 0.34 0.19 0.35 0.05
## epsus_4  0.20 0.35 0.22 0.23 0.06
## epsus_5  0.21 0.33 0.23 0.24 0.07
## epsus_6  0.09 0.31 0.27 0.32 0.05
## epsus_7  0.09 0.35 0.31 0.26 0.06
## epsus_9  0.12 0.23 0.19 0.45 0.06
## epsus_12 0.26 0.17 0.17 0.41 0.08
## epsus_13 0.12 0.33 0.26 0.28 0.07
## epsus_16 0.15 0.32 0.30 0.23 0.10
## epsus_17 0.28 0.23 0.22 0.28 0.07
## epsus_18 0.23 0.28 0.18 0.31 0.06
## epsus_19 0.15 0.26 0.28 0.31 0.07
## epsus_20 0.08 0.26 0.28 0.38 0.06
## epsus_22 0.11 0.21 0.19 0.50 0.07
## epsus_23 0.21 0.36 0.21 0.23 0.08
## epsus_24 0.14 0.24 0.25 0.37 0.06
## epsus_25 0.12 0.22 0.26 0.41 0.08
## epsus_26 0.11 0.33 0.32 0.23 0.06
## epsus_27 0.35 0.26 0.21 0.19 0.07
## epsus_28 0.12 0.33 0.28 0.27 0.07
## epsus_29 0.22 0.29 0.27 0.23 0.07
## epsus_30 0.15 0.19 0.27 0.39 0.06
## epsus_31 0.23 0.31 0.27 0.19 0.07
## epsus_32 0.11 0.24 0.32 0.33 0.08
## epsus_33 0.07 0.20 0.23 0.50 0.05
## epsus_34 0.11 0.20 0.23 0.46 0.05
## epsus_35 0.21 0.24 0.27 0.28 0.05
## epsus_36 0.17 0.22 0.23 0.38 0.05

Correlation

df %>%
  select(mhealth_total, sep_total, epsus_total,age) %>%
  correlation(., )

Table 2 - Mimics SPSS

– Future Luis – don’t fucking use SPSS neither let students work in SPSS.

df_step = df %>% select(mhealth_total, age, race, how_many_children, months_in_prision, relationship, religion, education_macro, epsus_total, sep_total) %>%
  na.omit()
df_step

write.csv(df_step, "df_step.csv", row.names = F)

Stepwise

This repository contains R code and anonymized data used to replicate SPSS-style hierarchical and stepwise regression analyses. While these methods are widely used in applied research, they are also frequently misunderstood. This script provides reproducible, annotated R implementations of both blockwise (hierarchical) and data-driven (stepwise) regression models using lm(), olsrr, and jtools.

Note: The use of stepwise regression is not recommended for inference purposes. It is included here for the sole purpose of replicating legacy SPSS outputs. For theory-driven modeling, the hierarchical approach is preferred.

I did not use this in the article

#https://zia207.quarto.pub/stepwise-regression.html
library(caret)
# Set seed for reproducibility
set.seed(123)
# Set up repeated k-fold cross-validation
train.control <- trainControl(method = "cv", number = 10)
# Train the model
model.caret.leaps <-  train(mhealth_total ~ ., data = df_step,
        method = "leapBackward", 
        tuneGrid = data.frame(nvmax = 1:5),
        trControl = train.control
  )
model.caret.leaps$results
summary(model.caret.leaps$finalModel)
#coef(model.caret.leaps$finalModel)

Create models (steps and full)

# Step 1: variable less malleable
model1 <- lm(mhealth_total ~ age + race + relationship + religion + how_many_children + months_in_prision, data = df_step) 

# Step 2: variables more malleable now are included
model2 <- lm(mhealth_total ~ age + race + relationship + religion + how_many_children + months_in_prision +
                           education_macro + epsus_total + sep_total, data = df_step) #block 2

Model 1 fit

“As shown in the Table, variables in the first step of the equation accounted for approximately 15% of the variance in prisoners mental health distress, but did not make a significant contribution ΔR2 =.15, ΔadjR2 = .06, F(13,127) = 1.71, p = 0.07.”

# Compare models
summary(model1)

## 
## Call:
## lm(formula = mhealth_total ~ age + race + relationship + religion + 
##     how_many_children + months_in_prision, data = df_step)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -9.1521 -3.9499  0.0962  3.3234 11.3009 
## 
## Coefficients:
##                             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                 15.07315    2.80402   5.376 3.53e-07 ***
## age                         -0.20389    0.05989  -3.404 0.000888 ***
## raceindigenous               0.19053    2.30221   0.083 0.934174    
## racemixed asian              0.64178    3.04223   0.211 0.833258    
## racemixed black             -1.53758    1.23201  -1.248 0.214316    
## racewhite                   -0.31635    1.25222  -0.253 0.800963    
## relationshipyes             -0.03066    0.87945  -0.035 0.972244    
## religionatheist or agnostic -1.98261    3.30868  -0.599 0.550097    
## religioncatholic             0.50078    2.24284   0.223 0.823676    
## religionevangelical          0.11311    2.18130   0.052 0.958725    
## religionspiritist            1.91545    2.79299   0.686 0.494085    
## religionspirituality         0.22600    2.54619   0.089 0.929411    
## how_many_children            0.94824    0.31892   2.973 0.003526 ** 
## months_in_prision           -0.01343    0.01577  -0.851 0.396183    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.96 on 127 degrees of freedom
## Multiple R-squared:  0.1493, Adjusted R-squared:  0.06218 
## F-statistic: 1.714 on 13 and 127 DF,  p-value: 0.0654

summ(model1)

Observations	141
Dependent variable	mhealth_total
Type	OLS linear regression

F(13,127)	1.71
R²	0.15
Adj. R²	0.06

	Est.	S.E.	t val.	p
(Intercept)	15.07	2.80	5.38	0.00
age	-0.20	0.06	-3.40	0.00
raceindigenous	0.19	2.30	0.08	0.93
racemixed asian	0.64	3.04	0.21	0.83
racemixed black	-1.54	1.23	-1.25	0.21
racewhite	-0.32	1.25	-0.25	0.80
relationshipyes	-0.03	0.88	-0.03	0.97
religionatheist or agnostic	-1.98	3.31	-0.60	0.55
religioncatholic	0.50	2.24	0.22	0.82
religionevangelical	0.11	2.18	0.05	0.96
religionspiritist	1.92	2.79	0.69	0.49
religionspirituality	0.23	2.55	0.09	0.93
how_many_children	0.95	0.32	2.97	0.00
months_in_prision	-0.01	0.02	-0.85	0.40
Standard errors: OLS

#summ(model1, scale = T) #get R2 and adj R2

Model 2 fit

In SPSS, the statistics presented is related to the full model. Therefore, F is taken from here.

“The overall (new) full model was significant F(18,122) = 2.44, p < .001.”

#summary(model2) #full model
summ(model2) #get R2 and adj R2

Observations	141
Dependent variable	mhealth_total
Type	OLS linear regression

F(18,122)	2.44
R²	0.26
Adj. R²	0.16

	Est.	S.E.	t val.	p
(Intercept)	23.26	4.52	5.14	0.00
age	-0.21	0.06	-3.52	0.00
raceindigenous	-1.30	2.25	-0.58	0.56
racemixed asian	1.41	2.98	0.47	0.64
racemixed black	-1.25	1.18	-1.06	0.29
racewhite	0.11	1.21	0.09	0.93
relationshipyes	0.01	0.86	0.02	0.99
religionatheist or agnostic	-1.39	3.18	-0.44	0.66
religioncatholic	1.02	2.19	0.47	0.64
religionevangelical	0.40	2.11	0.19	0.85
religionspiritist	2.32	2.72	0.85	0.39
religionspirituality	0.55	2.50	0.22	0.83
how_many_children	0.82	0.31	2.63	0.01
months_in_prision	-0.01	0.02	-0.40	0.69
education_macro2 Elementary	-0.37	2.95	-0.13	0.90
education_macro3 High school	-1.81	2.98	-0.61	0.55
education_macro4 Undergrad	-4.15	3.65	-1.14	0.26
epsus_total	-0.04	0.02	-1.62	0.11
sep_total	-0.14	0.05	-2.59	0.01
Standard errors: OLS

#summ(model2, scale = T, digits=3) #get R2 and adj R2
bind_cols(
parameters::standardize_parameters(model2, method = "basic", ci = 0.95) #to mimics spss
,
parameters::standard_error(model2, method = "basic")[-1]
)

#https://www.rdocumentation.org/packages/parameters/versions/0.22.0/topics/standardize_parameters
#sjPlot::tab_model(model2, show.std=T)

#library(lm.beta)
#std_model <- lm.beta(model2)
#summary(std_model)

Model comparison

However, after controlling for the variables in Step 1, the set of malleable predictors entered on Step 2 accounted for a total of 26% of the variance in mental health distress, ΔR2 = .115, ΔadjR2 = .09 F(5,122) = 3.83, p = 0.003.

summary(model2)$r.squared - summary(model1)$r.squared

## [1] 0.1155131

summary(model2)$adj.r.squared - summary(model1)$adj.r.squared

## [1] 0.09412095

“Degrees of freedom for the F-change test in SPSS correspond to the number of predictors added (df1) and the residual degrees of freedom of the final model (df2), consistent with the logic of partial F-tests.” Here, then: df1 is 127-122 = 5 and df 122

# Model comparison: ANOVA for R² change
performance::compare_performance(model1, model2)

#performance::check_model(model1)
anova(model1, model2)

Reference levels

table(df_step$education_macro)

## 
## 1 no formal education          2 Elementary         3 High school 
##                     3                    69                    63 
##           4 Undergrad 
##                     6

Stepwise algorithm in SPSS

library(olsrr)
stepwise_f <- ols_step_both_p(model2, 
                              prem = 0.05, 
                              prem_method = "f", 
                              pout = 0.10)
stepwise_f$model

## 
## Call:
## lm(formula = paste(response, "~", paste(preds, collapse = " + ")), 
##     data = l)
## 
## Coefficients:
##       (Intercept)          sep_total                age  how_many_children  
##           21.2133            -0.1664            -0.1974             0.8952

summary(stepwise_f$model)

## 
## Call:
## lm(formula = paste(response, "~", paste(preds, collapse = " + ")), 
##     data = l)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -8.8627 -3.6343  0.2928  3.6217 11.9063 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       21.21333    2.60849   8.132 2.25e-13 ***
## sep_total         -0.16641    0.04654  -3.576 0.000483 ***
## age               -0.19736    0.05321  -3.709 0.000302 ***
## how_many_children  0.89522    0.29279   3.058 0.002684 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.672 on 137 degrees of freedom
## Multiple R-squared:  0.1858, Adjusted R-squared:  0.168 
## F-statistic: 10.42 on 3 and 137 DF,  p-value: 3.204e-06

Setepwise algor. spss package

library(StepReg)

## Warning: pacote 'StepReg' foi compilado no R versão 4.4.3

res <- stepwise(formula = mhealth_total ~ age + race + relationship + religion + 
                                  how_many_children + months_in_prision + 
                                  education_macro + epsus_total + sep_total,
                data = df_step,
                type = "linear",
                #include = c("qsec"),
                strategy = "bidirection",
                metric = c("AIC", "AICc", "BIC", "CP", "HQ", "adjRsq", "SL", "SBC", "IC(3/2)", "IC(1)"))
summary(res$bidirection$adjRsq)

## 
## Call:
## lm(formula = mhealth_total ~ 1 + sep_total + age + how_many_children + 
##     education_macro + epsus_total, data = data, weights = NULL)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -9.5639 -3.6385  0.4705  3.4907 11.7140 
## 
## Coefficients:
##                              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                  22.37429    3.74833   5.969 2.04e-08 ***
## sep_total                    -0.13767    0.04987  -2.760  0.00659 ** 
## age                          -0.19507    0.05360  -3.639  0.00039 ***
## how_many_children             0.76534    0.29587   2.587  0.01076 *  
## education_macro2 Elementary   0.27412    2.74202   0.100  0.92052    
## education_macro3 High school -1.23612    2.73374  -0.452  0.65188    
## education_macro4 Undergrad   -3.11011    3.28214  -0.948  0.34506    
## epsus_total                  -0.03502    0.02100  -1.668  0.09769 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.608 on 133 degrees of freedom
## Multiple R-squared:  0.231,  Adjusted R-squared:  0.1905 
## F-statistic: 5.707 on 7 and 133 DF,  p-value: 8.737e-06

Stepwise algorithm in R (better)

# Stepwise selection starting from model1 and using model2 as the upper scope
stepwise_model <- step(model1, 
                       scope = list(lower = model1, upper = model2),
                       direction = "both", trace = TRUE)

## Start:  AIC=464.85
## mhealth_total ~ age + race + relationship + religion + how_many_children + 
##     months_in_prision
## 
##                   Df Sum of Sq    RSS    AIC
## + sep_total        1    253.66 2870.8 454.92
## + epsus_total      1    167.91 2956.5 459.07
## <none>                         3124.4 464.85
## + education_macro  3    102.04 3022.4 466.17
## 
## Step:  AIC=454.92
## mhealth_total ~ age + race + relationship + religion + how_many_children + 
##     months_in_prision + sep_total
## 
##                   Df Sum of Sq    RSS    AIC
## + epsus_total      1    57.616 2813.2 454.06
## <none>                         2870.8 454.92
## + education_macro  3   112.692 2758.1 455.27
## - sep_total        1   253.656 3124.4 464.85
## 
## Step:  AIC=454.06
## mhealth_total ~ age + race + relationship + religion + how_many_children + 
##     months_in_prision + sep_total + epsus_total
## 
##                   Df Sum of Sq    RSS    AIC
## <none>                         2813.2 454.06
## + education_macro  3   112.968 2700.2 454.28
## - epsus_total      1    57.616 2870.8 454.92
## - sep_total        1   143.367 2956.5 459.07

summary(stepwise_model)

## 
## Call:
## lm(formula = mhealth_total ~ age + race + relationship + religion + 
##     how_many_children + months_in_prision + sep_total + epsus_total, 
##     data = df_step)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -9.6448 -3.7843  0.1079  3.4332  9.7171 
## 
## Coefficients:
##                              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                 21.436561   3.335735   6.426 2.49e-09 ***
## age                         -0.200403   0.057310  -3.497 0.000652 ***
## raceindigenous              -1.115902   2.231623  -0.500 0.617926    
## racemixed asian              0.219159   2.914557   0.075 0.940180    
## racemixed black             -1.525888   1.180194  -1.293 0.198426    
## racewhite                   -0.311623   1.197787  -0.260 0.795164    
## relationshipyes             -0.005039   0.842236  -0.006 0.995236    
## religionatheist or agnostic -1.712456   3.202589  -0.535 0.593800    
## religioncatholic             1.248438   2.177722   0.573 0.567487    
## religionevangelical          0.712220   2.099666   0.339 0.735023    
## religionspiritist            2.722446   2.700469   1.008 0.315336    
## religionspirituality         1.129867   2.478328   0.456 0.649254    
## how_many_children            0.933177   0.305057   3.059 0.002717 ** 
## months_in_prision           -0.003877   0.015305  -0.253 0.800414    
## sep_total                   -0.133198   0.052774  -2.524 0.012857 *  
## epsus_total                 -0.035762   0.022351  -1.600 0.112116    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.744 on 125 degrees of freedom
## Multiple R-squared:  0.234,  Adjusted R-squared:  0.1421 
## F-statistic: 2.546 on 15 and 125 DF,  p-value: 0.002458

summ(stepwise_model)

Observations	141
Dependent variable	mhealth_total
Type	OLS linear regression

F(15,125)	2.55
R²	0.23
Adj. R²	0.14

	Est.	S.E.	t val.	p
(Intercept)	21.44	3.34	6.43	0.00
age	-0.20	0.06	-3.50	0.00
raceindigenous	-1.12	2.23	-0.50	0.62
racemixed asian	0.22	2.91	0.08	0.94
racemixed black	-1.53	1.18	-1.29	0.20
racewhite	-0.31	1.20	-0.26	0.80
relationshipyes	-0.01	0.84	-0.01	1.00
religionatheist or agnostic	-1.71	3.20	-0.53	0.59
religioncatholic	1.25	2.18	0.57	0.57
religionevangelical	0.71	2.10	0.34	0.74
religionspiritist	2.72	2.70	1.01	0.32
religionspirituality	1.13	2.48	0.46	0.65
how_many_children	0.93	0.31	3.06	0.00
months_in_prision	-0.00	0.02	-0.25	0.80
sep_total	-0.13	0.05	-2.52	0.01
epsus_total	-0.04	0.02	-1.60	0.11
Standard errors: OLS

parameters::standardize_parameters(stepwise_model)

ols_test_normality(model2)

## Warning in ks.test.default(y, "pnorm", mean(y), sd(y)): não devem existir
## empates no teste de Kolmogorov-Smirnov de apenas uma amostra

## -----------------------------------------------
##        Test             Statistic       pvalue  
## -----------------------------------------------
## Shapiro-Wilk              0.9906         0.4634 
## Kolmogorov-Smirnov        0.0501         0.8710 
## Cramer-von Mises          10.343         0.0000 
## Anderson-Darling          0.4238         0.3146 
## -----------------------------------------------

! Done