R Markdown

Introduction



Title of Research

Assignment 2.2

This focuses on conducting basic statistical analysis using methods from the Basic Data Analysis and Intermediate Data Analysis groups.

Specifically, this project aims to look at:

The Jamaican Labour force: Examining the Relationship between Age, Marital Status and Employment Status

- Clear the workspace first to begin

rm(list = ls())  #The R code rm(list = ls()) is used to clear the workspace of all objects and variables that were previously loaded into R

Set the working directory using setwd function

getwd()
## [1] "/cloud/project"
setwd("/cloud/project")  

#Used to set the working directory in R to a specific file path on your computer, specifically where Econ 2010 project is. 

Load the necessary libraries

library(haven)
library(ggplot2)
library(ggpubr) #customization of plots
library(jtools)  #summarizing and visualizing regression models
library(sjPlot)  #creating visualizations and summaries of statistical models
library(huxtable)  #creating tables and table summaries
## 
## Attaching package: 'huxtable'
## The following object is masked from 'package:sjPlot':
## 
##     font_size
## The following object is masked from 'package:ggpubr':
## 
##     font
## The following object is masked from 'package:ggplot2':
## 
##     theme_grey
library(rstatix)  #various functions for analyses, including t-tests, correlation analyses, and effect size
## 
## Attaching package: 'rstatix'
## The following object is masked from 'package:stats':
## 
##     filter
library(gtsummary)  #beautiful, publication-ready tables
## 
## Attaching package: 'gtsummary'
## The following object is masked from 'package:huxtable':
## 
##     as_flextable
library(kableExtra)  #formatting tables in RMarkdown documents
## 
## Attaching package: 'kableExtra'
## The following object is masked from 'package:huxtable':
## 
##     add_footnote
library(rempsyc)  #conducting repeated measures analyses
## Suggested APA citation: Thériault, R. (2022). rempsyc: Convenience functions for psychology 
## (R package version 0.1.1) [Computer software]. https://rempsyc.remi-theriault.com
library(nortest)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.1     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ lubridate 1.9.2     ✔ tibble    3.2.1
## ✔ purrr     1.0.1     ✔ tidyr     1.3.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::add_rownames()  masks huxtable::add_rownames()
## ✖ dplyr::filter()        masks rstatix::filter(), stats::filter()
## ✖ dplyr::group_rows()    masks kableExtra::group_rows()
## ✖ dplyr::lag()           masks stats::lag()
## ✖ huxtable::theme_grey() masks ggplot2::theme_grey()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(broom)
library(knitr)
library(vcd)
## Loading required package: grid
## 
## Attaching package: 'vcd'
## 
## The following object is masked from 'package:huxtable':
## 
##     odds
library(magrittr)
## 
## Attaching package: 'magrittr'
## 
## The following object is masked from 'package:purrr':
## 
##     set_names
## 
## The following object is masked from 'package:tidyr':
## 
##     extract
library(corrplot)
## corrplot 0.92 loaded
library(psych)
## 
## Attaching package: 'psych'
## 
## The following objects are masked from 'package:ggplot2':
## 
##     %+%, alpha
library(sjmisc)
## 
## Attaching package: 'sjmisc'
## 
## The following object is masked from 'package:purrr':
## 
##     is_empty
## 
## The following object is masked from 'package:tidyr':
## 
##     replace_na
## 
## The following object is masked from 'package:tibble':
## 
##     add_case
## 
## The following objects are masked from 'package:huxtable':
## 
##     add_columns, add_rows, print_html, print_md
## 
## The following objects are masked from 'package:jtools':
## 
##     %nin%, center
library(sjlabelled)
## 
## Attaching package: 'sjlabelled'
## 
## The following object is masked from 'package:forcats':
## 
##     as_factor
## 
## The following object is masked from 'package:dplyr':
## 
##     as_label
## 
## The following object is masked from 'package:huxtable':
## 
##     set_label
## 
## The following object is masked from 'package:ggplot2':
## 
##     as_label
## 
## The following objects are masked from 'package:haven':
## 
##     as_factor, read_sas, read_spss, read_stata, write_sas, zap_labels
library(texreg)
## Version:  1.38.6
## Date:     2022-04-06
## Author:   Philip Leifeld (University of Essex)
## 
## Consider submitting praise using the praise or praise_interactive functions.
## Please cite the JSS article in your publications -- see citation("texreg").
## 
## Attaching package: 'texreg'
## 
## The following object is masked from 'package:magrittr':
## 
##     extract
## 
## The following object is masked from 'package:tidyr':
## 
##     extract

Set random assigned seed value

set.seed(644)  #generate the same sequence of random numbers every code is run 

Load the data set

Labour.Data <- read_sav("2020_LFSa_JULY.sav") # Import the labour.sav data set and load as the first data frame 

A 90% sample of the data is taken

Labour2 <- sample_frac(Labour.Data, size = 0.90, replace = FALSE) #In this code, we are using the sample_frac function from the dplyr package to create a random sample of 90% of the observations in the Labour.Data dataset

Data Cleaning

Set new dataframe with variables of interest

Lab3 <- Labour2[,c("AGE", "SEX", "Q323", "UNION_STAT", "MARITAL_STAT", "Q38M_occupation", "Q117")]

change the variable names for ease

names(Lab3)[1:3] <- c("Age", "Sex", "Employ_Stat")
names(Lab3)[4:7] <- c("Union Status", "Marital_Stat", "Occupation", "Educhieve")

#changing the names of variables in a dataset in R, you can use the names() function

Filter the missing or NA values

Lab3_filtered <- Lab3[complete.cases(Lab3),] # This function removes any rows in a data frame that contain missing or NA values

Filter out other values which may alter inferences [99 - Not stated, 96 - other (not specified)]

Lab3_filtered <- Lab3 %>%
  mutate_at(vars(Educhieve,Marital_Stat, Employ_Stat), ~ ifelse(. %in% c(96, 99), NA, .)) %>% 
  na.omit()

View the cleaned dataset

head(Lab3_filtered)
AgeSexEmploy_StatUnion StatusMarital_StatOccupationEduchieve
42132231
23134227
36213218
35162231
53132258
54262231
summary(Lab3_filtered)
##       Age             Sex        Employ_Stat     Union Status    Marital_Stat  
##  Min.   :16.00   Min.   :1.00   Min.   :1.000   Min.   :1.000   Min.   :1.000  
##  1st Qu.:32.00   1st Qu.:1.00   1st Qu.:3.000   1st Qu.:2.000   1st Qu.:1.000  
##  Median :44.00   Median :1.00   Median :3.000   Median :3.000   Median :2.000  
##  Mean   :43.91   Mean   :1.44   Mean   :4.034   Mean   :2.735   Mean   :1.827  
##  3rd Qu.:55.00   3rd Qu.:2.00   3rd Qu.:6.000   3rd Qu.:4.000   3rd Qu.:2.000  
##  Max.   :92.00   Max.   :2.00   Max.   :6.000   Max.   :4.000   Max.   :5.000  
##    Occupation      Educhieve    
##  Min.   :1.000   Min.   :1.000  
##  1st Qu.:2.000   1st Qu.:1.000  
##  Median :3.000   Median :1.000  
##  Mean   :3.595   Mean   :2.687  
##  3rd Qu.:5.000   3rd Qu.:4.000  
##  Max.   :8.000   Max.   :8.000
mean(Lab3_filtered)
## Warning in mean.default(Lab3_filtered): argument is not numeric or logical:
## returning NA
## [1] NA

Data Analysis

Goal 1

Is there a difference in age groups based on whether respondents are employed in the public or private sector.

• Test to be conducted: Independent sample t test

• Data to be used.

Variable Variable Name Type of Variable Dependent Variable Age AGE Ratio Independent Variable Employ_Stat Nominal

For this analysis, we create a new variable with age groups

Set age as numeric

Lab3_filtered$Age <- as.numeric(Lab3_filtered$Age)

Null hypothesis: There is no significant difference in the mean age of respondents employed in the public and private sector.

Alternative hypothesis: There is a significant difference in the mean age of respondents employed in the public and private sector.

The assumption is that N1 + N2 > 40 & has no outliers.

Since the sample size is very large, the central limit theorem states that sample mean sampling distribution will be approximately normal irrespective of the population distribution. However, we can still do a test to show.

Conduct Anderson Darling test for normality

ad.test(Lab3_filtered$Age)
## 
##  Anderson-Darling normality test
## 
## data:  Lab3_filtered$Age
## A = 34.219, p-value < 2.2e-16

This result indicates that the p-value is less than the significance level of 0.05, suggesting strong evidence against the null hypothesis of normality. Therefore, the data may not be normally distributed, so it violates our assumption.

Plot a histogram to check normality

hist(Lab3_filtered$Age)

curve(dnorm(x, mean = mean(Lab3_filtered$Age), sd = sd(Lab3_filtered$Age)), # Add a normal curve to the histogram for comparison
      col = "blue", lwd = 2, add = TRUE)

hist(Lab3_filtered$Age, freq = FALSE, main = "Histogram of Age with Normal Distribution Overlay")
curve(dnorm(x, mean = mean(Lab3_filtered$Age), sd = sd(Lab3_filtered$Age)), 
      add = TRUE, col = "blue")

The histogram shows that the data reasonably follows the bell curve.

Probability Plot of Age

ggqqplot(Lab3_filtered$Age, 
         ylab = "Age", 
         main = "Probability Plot for Age")

ks.test(Lab3_filtered$Age, "pnorm", mean(Lab3_filtered$Age), sd(Lab3_filtered$Age))
## Warning in ks.test.default(Lab3_filtered$Age, "pnorm", mean(Lab3_filtered$Age),
## : ties should not be present for the Kolmogorov-Smirnov test
## 
##  Asymptotic one-sample Kolmogorov-Smirnov test
## 
## data:  Lab3_filtered$Age
## D = 0.057186, p-value < 2.2e-16
## alternative hypothesis: two-sided

The output of the Kolmogorov-Smirnov test for normality of the Age variable indicates a very small p-value (< 2.2e-16), which means that we can reject the null hypothesis of normality. This suggests that the Age variable is not normally distributed.

# Conduct Welch t-test

Recode employment status as public (1) or private (0)

Lab3_filtered$Public_Private <- ifelse(Lab3_filtered$Employ_Stat %in% c(1, 2), 1, 0)
# This is to set up the grouping variable for the Welch t - test 
# Conduct variance test
var_test <- var.test(Age ~ Public_Private, data = Lab3_filtered)
p_value <- var_test$p.value

# Check if variances are equal
if (p_value > 0.05) {
  var_equal <- TRUE
} else {
  var_equal <- FALSE
}

The variance test was conducted to determine if the variances of the Age variable were equal between the Public and Private groups. A p-value of 0 was obtained, indicating that the variances were not equal. Therefore, the Welch’s t-test was used to compare the mean Age values between the two groups, as it does not assume equal variances.

Since the normality assumption is violated and variances are not equal, we can use a robust version of the t-test that does not assume normality, such as Welch’s t-test.

Perform a Welch t-test

Assuming you have two groups, with variable X and grouping variable Group

welch_t <- t.test(Age ~ Public_Private, data = Lab3_filtered, var.equal = FALSE)

# View the results
welch_t
## 
##  Welch Two Sample t-test
## 
## data:  Age by Public_Private
## t = 7.4431, df = 1398.1, p-value = 1.712e-13
## alternative hypothesis: true difference in means between group 0 and group 1 is not equal to 0
## 95 percent confidence interval:
##  2.288721 3.926869
## sample estimates:
## mean in group 0 mean in group 1 
##        44.28884        41.18104
welch_tidy <- tidy(welch_t)

welch_tidy %>%
  kable(format = "html", align = "c") %>%
  kable_styling(full_width = F)
estimate estimate1 estimate2 statistic p.value parameter conf.low conf.high method alternative
3.107795 44.28884 41.18104 7.443093 0 1398.124 2.288721 3.926869 Welch Two Sample t-test two.sided
ggboxplot(Lab3_filtered, x = "Public_Private", y = "Age",
          color = "Public_Private", palette = "jco",
          add = "jitter") +
  stat_compare_means(aes(label = ..p.format..),
                     label.y = 85, label.x = 1.5)

The Welch Two-Sample t-test was used to compare the mean age between two groups, Public and Private. The test showed a statistically significant difference between the mean age of the two groups, with a t-value of 7.4431 and a p-value of 1.712e-13. The 95% confidence interval for the difference in means between the two groups ranged from 2.288721 to 3.926869. These results suggest that there is a significant difference in the mean age between the Public and Private groups, with the Public group having a higher mean age of 44.28884 compared to the Private group with a mean age of 41.18104. This finding may have important implications for understanding differences in age distributions between Public and Private groups.

Goal 2:

To determine whether older individuals’ higher chance of being married when compared to younger individuals.

Null Hypothesis: There is no significant difference in the proportion of married individuals between older and younger age groups.

Alternative Hypothesis: The proportion of married individuals is significantly higher in older age groups compared to younger age groups.

We are going to test using this format :

(categorical variable with two groups: “older” and “younger”) and “marital_status” (categorical variable with two groups: “married” and “not married”).

Create two groups based on age

YoungMar <- Lab3_filtered$Marital_Stat[Lab3_filtered$Age < 45]
OldMar <- Lab3_filtered$Marital_Stat[Lab3_filtered$Age >= 45]

Calculate the proportion of married individuals in each group

propor1 <- sum(YoungMar == "1") / length(YoungMar)
propor2 <- sum(OldMar == "1") / length(OldMar)

Conduct a two-sample proportion test

propor_test <- prop.test(x = c(sum(YoungMar == "1"), sum(OldMar == "1")),
                  n = c(length(YoungMar), length(OldMar)))
# Print the test results
propor_test
## 
##  2-sample test for equality of proportions with continuity correction
## 
## data:  c(sum(YoungMar == "1"), sum(OldMar == "1")) out of c(length(YoungMar), length(OldMar))
## X-squared = 532.47, df = 1, p-value < 2.2e-16
## alternative hypothesis: two.sided
## 95 percent confidence interval:
##  -0.2488038 -0.2103394
## sample estimates:
##    prop 1    prop 2 
## 0.1393631 0.3689347

Make data more presentable

prop_table <- matrix(c(sum(YoungMar == "1"), sum(OldMar == "1"), 
                       length(YoungMar) - sum(YoungMar == "1"), length(OldMar) - sum(OldMar == "1")), 
                     nrow = 2, ncol = 2, dimnames = list(c("Young", "Old"), c("Married", "Not Married")))

kable(prop_table, format = "markdown", caption = "Comparison of Proportions by Age Group and Marital Status") %>%
  kable_styling(bootstrap_options = "striped", full_width = F) %>%
  add_header_above(c(" " = 1, "Marital Status" = 2)) %>%
  add_footnote("Source: Lab3_filtered data")
## Warning in kable_styling(., bootstrap_options = "striped", full_width = F):
## Please specify format in kable. kableExtra can customize either HTML or LaTeX
## outputs. See https://haozhu233.github.io/kableExtra/ for details.
## Warning in add_header_above(., c(` ` = 1, `Marital Status` = 2)): Please
## specify format in kable. kableExtra can customize either HTML or LaTeX outputs.
## See https://haozhu233.github.io/kableExtra/ for details.
Comparison of Proportions by Age Group and Marital Status
Married Not Married
Young 547 3378
Old 1361 2328

Note: a Source: Lab3_filtered data Output

# Create a data frame for the plot
Lab3_filtered3 <- data.frame(Group = c("YoungMar", "OldMar"),
                 Proportion = c(propor1, propor2))
ggplot(Lab3_filtered3, aes(x = Group, y = Proportion)) +
  geom_bar(stat = "identity", fill = "steelblue") +
  labs(title = "Proportion of married individuals by age group",
       subtitle = paste0("Two-sample proportion test: p-value = ", round(propor_test$p.value, 4),
                         ", 95% CI [", round(propor_test$conf.int[1], 4), ", ", round(propor_test$conf.int[2], 4), "]"))

Goal #3: Examine the relationship between employment status (public/private) and martial status.

Test to be conducted: (logistic regression – include control variables – age, gender, etc)

• Data to be used

Variable    Variable Name   Type of Variable

Dependent Variable Employment Status Q323 Nominal Independent Variable Marital Status MARITAL_STAT Nominal

Logistic regression can be used to model the relationship between a binary outcome variable and one or more predictor variables, which can be categorical or continuous

In this case, employment status is binary (Public or Private) and Marital status is categorical.

Control variables : Age - numerical, Sex - Binary

We recode the variables as numeric since they are nominal or integers

Binary Variable for Employment Status - Public_Private Marital_Stat - Marital Status

compute correlation matrix

cor_matrix <- cor(Lab3_filtered[, c("Public_Private", "Marital_Stat", "Age", "Sex", "Occupation", "Educhieve")])

In this matrix, we can see that there is a small negative correlation between “Public_Private” and “Marital_Stat” (-0.087), which means that people who work in public versus private sectors are slightly less likely to be married.

There is also a small positive correlation between “Sex” and “Public_Private” (0.143), which means that men are slightly more likely to work in the private sector than women.

cordata <- Lab3_filtered

my_corr <- cordata %>% 
  select(Public_Private, Marital_Stat, Age, Sex, Occupation, Educhieve) %>% 
  cor_test(method = "pearson") %>% 
  as_huxtable() %>% 
  set_all_padding(row = everywhere, col = everywhere, value = 6) %>% 
  set_bold(1, everywhere) %>% 
  set_top_border(1, everywhere, value = 0.8) %>% 
  set_bottom_border(1, everywhere, value = 0.4) %>% 
  set_caption("Correlation matrix: Public/Private, Marital Status, Age, and Sex")

my_corr
Correlation matrix: Public/Private, Marital Status, Age, and Sex
var1var2corstatisticpconf.lowconf.highmethod
Public_PrivatePublic_Private1    5.86e+090        1      1     Pearson
Public_PrivateMarital_Stat-0.087-7.65    2.22e-14 -0.11   -0.065 Pearson
Public_PrivateAge-0.071-6.24    4.53e-10 -0.0937 -0.049 Pearson
Public_PrivateSex0.14 12.6     3.11e-36 0.121  0.165 Pearson
Public_PrivateOccupation-0.24 -21.6     1.55e-100-0.261  -0.219 Pearson
Public_PrivateEduchieve0.41 39.7     4.19e-3130.395  0.432 Pearson
Marital_StatPublic_Private-0.087-7.65    2.22e-14 -0.11   -0.065 Pearson
Marital_StatMarital_Stat1    Inf       0        1      1     Pearson
Marital_StatAge-0.06 -5.24    1.68e-07 -0.0823 -0.0375Pearson
Marital_StatSex0.03 2.61    0.0091   0.007430.0523Pearson
Marital_StatOccupation0.0443.85    0.000117 0.0217 0.0665Pearson
Marital_StatEduchieve-0.067-5.89    4.08e-09 -0.0897 -0.0449Pearson
AgePublic_Private-0.071-6.24    4.53e-10 -0.0937 -0.049 Pearson
AgeMarital_Stat-0.06 -5.24    1.68e-07 -0.0823 -0.0375Pearson
AgeAge1    5.86e+090        1      1     Pearson
AgeSex-0.05 -4.37    1.24e-05 -0.0724 -0.0276Pearson
AgeOccupation0.1  9.07    1.53e-19 0.0811 0.126 Pearson
AgeEduchieve-0.27 -24.5     2.65e-127-0.291  -0.249 Pearson
SexPublic_Private0.14 12.6     3.11e-36 0.121  0.165 Pearson
SexMarital_Stat0.03 2.61    0.0091   0.007430.0523Pearson
SexAge-0.05 -4.37    1.24e-05 -0.0724 -0.0276Pearson
SexSex1    Inf       0        1      1     Pearson
SexOccupation-0.28 -25.7     2.31e-139-0.303  -0.261 Pearson
SexEduchieve0.23 20.5     6.02e-91 0.207  0.25  Pearson
OccupationPublic_Private-0.24 -21.6     1.55e-100-0.261  -0.219 Pearson
OccupationMarital_Stat0.0443.85    0.000117 0.0217 0.0665Pearson
OccupationAge0.1  9.07    1.53e-19 0.0811 0.126 Pearson
OccupationSex-0.28 -25.7     2.31e-139-0.303  -0.261 Pearson
OccupationOccupation1    Inf       0        1      1     Pearson
OccupationEduchieve-0.54 -55.5     0        -0.552  -0.52  Pearson
EduchievePublic_Private0.41 39.7     4.19e-3130.395  0.432 Pearson
EduchieveMarital_Stat-0.067-5.89    4.08e-09 -0.0897 -0.0449Pearson
EduchieveAge-0.27 -24.5     2.65e-127-0.291  -0.249 Pearson
EduchieveSex0.23 20.5     6.02e-91 0.207  0.25  Pearson
EduchieveOccupation-0.54 -55.5     0        -0.552  -0.52  Pearson
EduchieveEduchieve1    Inf       0        1      1     Pearson
corrplot(cor_matrix, type = "upper", tl.col = "black", tl.srt = 45, method = "circle", addCoef.col = "black")

ggtitle("Correlation Matrix")
## $title
## [1] "Correlation Matrix"
## 
## attr(,"class")
## [1] "labels"

The strength of the correlation is indicated by the color of the square, with warmer colors indicating stronger positive correlations and cooler colors indicating stronger negative correlations

png("correlation_matrix.png", width=1200, height=1200)
pairs.panels(my_corr,
             smooth = TRUE,
             scale = FALSE,
             density = TRUE,
             ellipses = TRUE,
             method = "pearson",
             pch = 21,
             lm = TRUE,
             cor = TRUE,
             jiggle = FALSE,
             factor = 2,
             hist.col = 4,
             stars = TRUE,
             ci = FALSE)

Logistic Regression

LogitModelr <- glm(Public_Private ~ Marital_Stat + Age + Sex + Occupation + Educhieve,
                   data = Lab3_filtered, family = "binomial")

Model Summary

summ (LogitModelr , exp = TRUE ) 
Observations 7614
Dependent variable Public_Private
Type Generalized linear model
Family binomial
Link logit
𝛘²(5) 1167.44
Pseudo-R² (Cragg-Uhler) 0.27
Pseudo-R² (McFadden) 0.21
AIC 4532.20
BIC 4573.83
exp(Est.) 2.5% 97.5% z val. p
(Intercept) 0.03 0.02 0.06 -11.67 0.00
Marital_Stat 0.67 0.58 0.77 -5.66 0.00
Age 1.00 1.00 1.01 0.70 0.48
Sex 1.53 1.30 1.79 5.15 0.00
Occupation 0.98 0.93 1.03 -0.85 0.40
Educhieve 1.47 1.42 1.52 22.36 0.00
Standard errors: MLE

The model has a Pseudo-R² (McFadden) value of 0.21, which indicates that the model explains 21% of the variation in the dependent variable. The predictors Marital_Stat, Sex, and Educhieve have statistically significant effects on the probability of being in the Public or Private employment sector, as their p-values are less than 0.05. The odds ratio of Sex is 1.53, which means that holding all other predictors constant, the odds of being in the Public employment sector for males (coded as 1) are 1.53 times higher than the odds for females (coded as 0). The odds ratio of Educhieve is 1.47, which means that holding all other predictors constant, a one-unit increase in Educhieve is associated with a 47% increase in the odds of being in the Public employment sector. The predictors Age and Occupation are not statistically significant, as their p-values are greater than 0.05. The AIC value of the model is 4532.20, which can be used to compare it to other models to determine which model fits the data better. A lower AIC value indicates a better fit. The coefficients of the model suggest that Marital_Stat, Sex, and Educhieve are significant predictors of Public_Private. Age and Occupation were not found to be significant predictors.

Multiple Regression Analysis

Test for Assumptions

Linearity:

Plot each independent variable against the dependent variable to check for linearity

plot(Lab3_filtered$Marital_Stat, Lab3_filtered$Public_Private)

plot(Lab3_filtered$Educhieve, Lab3_filtered$Public_Private)

plot(Lab3_filtered$Sex, Lab3_filtered$Public_Private)

Independence of errors:

plot(predict(LogitModelr), residuals(LogitModelr))

Normality of errors:

hist(residuals(LogitModelr))

Equal variance (homoscedasticity):

Plot the residuals against the predicted values to check for equal variance

plot(predict(LogitModelr), residuals(LogitModelr), pch = 20)
abline(h = 0, col = "red")

For linearity: We look for a linear relationship between the two variables, meaning that the points on the scatter plot should form a roughly straight line. In this case, they do.

For the other assumptions, they seem to observe the tenets of a multiple regression.

Categorical variables when used in regression must be coded as 0/1 type variables.

We therefore create new variables that we will code for use

# code the educhieve variable as a binary categorical variable in R
Lab3_filtered$Educhieve_binary <- ifelse(Lab3_filtered$Educhieve > 1, 1, 0)
# coded as 1 if the educhieve variable is greater than 0, and 0 otherwise. This represents years of secondary education 

Code the Marital_Stat variable as a 0/1 binary variable where 1 represents married and 0 represents not married

Lab3_filtered$Married <- ifelse(Lab3_filtered$Marital_Stat == 1, 1, 0)

Create a binary variable for Sex that is 0/1

Lab3_filtered$Sex_binary <- ifelse(Lab3_filtered$Sex == 2, 0, 1)

The multiple regression analysis will allow us to explore the relationship between employment status and marital status while controlling for the effects of sex and education. The dependent variable is employment status, which is a nominal variable that could take values such as privately or publicly employed. The independent variable of interest is marital status, which is also nominal and could take values such as married, divorced, widowed, or never married. The control variables are sex, which is binary (male or female), and education achieved, which is a continuous variable indicating the highest academic exam passed. .

The multiple regression model will estimate the effect of marital status on employment status, while holding constant the effects of sex and education. The regression model will test whether there is a significant relationship between employment status and marital status, after controlling for sex and education. If the regression model finds a significant relationship between employment status and marital status, we can conclude that marital status is a predictor of employment status, even after controlling for sex and educhieve.

Null Hypothesis (H0): There is no significant relationship between employment status and marital status after controlling for sex and educational achievement.

Alternative Hypothesis (H1): There is a significant relationship between employment status and marital status after controlling for sex and educational achievement.

cor_matrix2 <- cor(Lab3_filtered[, c("Public_Private", "Married", "Sex_binary", "Educhieve_binary")])

print(cor_matrix2)
##                  Public_Private      Married   Sex_binary Educhieve_binary
## Public_Private        1.0000000  0.102951634 -0.143327912       0.32870091
## Married               0.1029516  1.000000000 -0.009171946       0.02255395
## Sex_binary           -0.1433279 -0.009171946  1.000000000      -0.22272016
## Educhieve_binary      0.3287009  0.022553946 -0.222720157       1.00000000

Run linear model

linmod2 <- lm(Public_Private ~  Married + Sex_binary + Educhieve_binary, data = Lab3_filtered )

Output Results vs 1

summary (linmod2)
## 
## Call:
## lm(formula = Public_Private ~ Married + Sex_binary + Educhieve_binary, 
##     data = Lab3_filtered)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -0.3417 -0.2209 -0.0565 -0.0079  0.9921 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       0.056504   0.006737   8.388  < 2e-16 ***
## Married           0.072296   0.008151   8.870  < 2e-16 ***
## Sex_binary       -0.048600   0.007298  -6.660 2.94e-11 ***
## Educhieve_binary  0.212951   0.007564  28.154  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3081 on 7610 degrees of freedom
## Multiple R-squared:  0.1223, Adjusted R-squared:  0.1219 
## F-statistic: 353.4 on 3 and 7610 DF,  p-value: < 2.2e-16
plotreg (linmod2)
## Model: bars denote standard errors (95%).

R Code: Diagnostic Plots

par(mfrow =c(2 ,2)) # Create a 2x2 grid
plot (linmod2)