R Markdown

Introduction

Title of Research

Assignment 2.2

This focuses on conducting basic statistical analysis using methods from the Basic Data Analysis and Intermediate Data Analysis groups.

Specifically, this project aims to look at:

The Jamaican Labour force: Examining the Relationship between Age, Marital Status and Employment Status

- Clear the workspace first to begin

rm(list = ls())  #The R code rm(list = ls()) is used to clear the workspace of all objects and variables that were previously loaded into R

Set the working directory using setwd function

getwd()

## [1] "/cloud/project"

setwd("/cloud/project")  

#Used to set the working directory in R to a specific file path on your computer, specifically where Econ 2010 project is.

Load the necessary libraries

library(haven)
library(ggplot2)
library(ggpubr) #customization of plots
library(jtools)  #summarizing and visualizing regression models
library(sjPlot)  #creating visualizations and summaries of statistical models
library(huxtable)  #creating tables and table summaries

## 
## Attaching package: 'huxtable'

## The following object is masked from 'package:sjPlot':
## 
##     font_size

## The following object is masked from 'package:ggpubr':
## 
##     font

## The following object is masked from 'package:ggplot2':
## 
##     theme_grey

library(rstatix)  #various functions for analyses, including t-tests, correlation analyses, and effect size

## 
## Attaching package: 'rstatix'

## The following object is masked from 'package:stats':
## 
##     filter

library(gtsummary)  #beautiful, publication-ready tables

## 
## Attaching package: 'gtsummary'

## The following object is masked from 'package:huxtable':
## 
##     as_flextable

library(kableExtra)  #formatting tables in RMarkdown documents

## 
## Attaching package: 'kableExtra'

## The following object is masked from 'package:huxtable':
## 
##     add_footnote

library(rempsyc)  #conducting repeated measures analyses

## Suggested APA citation: Thériault, R. (2022). rempsyc: Convenience functions for psychology 
## (R package version 0.1.1) [Computer software]. https://rempsyc.remi-theriault.com

library(nortest)
library(tidyverse)

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.1     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ lubridate 1.9.2     ✔ tibble    3.2.1
## ✔ purrr     1.0.1     ✔ tidyr     1.3.0

## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::add_rownames()  masks huxtable::add_rownames()
## ✖ dplyr::filter()        masks rstatix::filter(), stats::filter()
## ✖ dplyr::group_rows()    masks kableExtra::group_rows()
## ✖ dplyr::lag()           masks stats::lag()
## ✖ huxtable::theme_grey() masks ggplot2::theme_grey()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(broom)
library(knitr)
library(vcd)

## Loading required package: grid
## 
## Attaching package: 'vcd'
## 
## The following object is masked from 'package:huxtable':
## 
##     odds

library(magrittr)

## 
## Attaching package: 'magrittr'
## 
## The following object is masked from 'package:purrr':
## 
##     set_names
## 
## The following object is masked from 'package:tidyr':
## 
##     extract

library(corrplot)

## corrplot 0.92 loaded

library(psych)

## 
## Attaching package: 'psych'
## 
## The following objects are masked from 'package:ggplot2':
## 
##     %+%, alpha

library(sjmisc)

## 
## Attaching package: 'sjmisc'
## 
## The following object is masked from 'package:purrr':
## 
##     is_empty
## 
## The following object is masked from 'package:tidyr':
## 
##     replace_na
## 
## The following object is masked from 'package:tibble':
## 
##     add_case
## 
## The following objects are masked from 'package:huxtable':
## 
##     add_columns, add_rows, print_html, print_md
## 
## The following objects are masked from 'package:jtools':
## 
##     %nin%, center

library(sjlabelled)

## 
## Attaching package: 'sjlabelled'
## 
## The following object is masked from 'package:forcats':
## 
##     as_factor
## 
## The following object is masked from 'package:dplyr':
## 
##     as_label
## 
## The following object is masked from 'package:huxtable':
## 
##     set_label
## 
## The following object is masked from 'package:ggplot2':
## 
##     as_label
## 
## The following objects are masked from 'package:haven':
## 
##     as_factor, read_sas, read_spss, read_stata, write_sas, zap_labels

library(texreg)

## Version:  1.38.6
## Date:     2022-04-06
## Author:   Philip Leifeld (University of Essex)
## 
## Consider submitting praise using the praise or praise_interactive functions.
## Please cite the JSS article in your publications -- see citation("texreg").
## 
## Attaching package: 'texreg'
## 
## The following object is masked from 'package:magrittr':
## 
##     extract
## 
## The following object is masked from 'package:tidyr':
## 
##     extract

Set random assigned seed value

set.seed(644)  #generate the same sequence of random numbers every code is run

Load the data set

Labour.Data <- read_sav("2020_LFSa_JULY.sav") # Import the labour.sav data set and load as the first data frame

A 90% sample of the data is taken

Labour2 <- sample_frac(Labour.Data, size = 0.90, replace = FALSE) #In this code, we are using the sample_frac function from the dplyr package to create a random sample of 90% of the observations in the Labour.Data dataset

Data Cleaning

Set new dataframe with variables of interest

Lab3 <- Labour2[,c("AGE", "SEX", "Q323", "UNION_STAT", "MARITAL_STAT", "Q38M_occupation", "Q117")]

change the variable names for ease

names(Lab3)[1:3] <- c("Age", "Sex", "Employ_Stat")
names(Lab3)[4:7] <- c("Union Status", "Marital_Stat", "Occupation", "Educhieve")

#changing the names of variables in a dataset in R, you can use the names() function

Filter the missing or NA values

Lab3_filtered <- Lab3[complete.cases(Lab3),] # This function removes any rows in a data frame that contain missing or NA values

Filter out other values which may alter inferences [99 - Not stated, 96 - other (not specified)]

Lab3_filtered <- Lab3 %>%
  mutate_at(vars(Educhieve,Marital_Stat, Employ_Stat), ~ ifelse(. %in% c(96, 99), NA, .)) %>% 
  na.omit()

View the cleaned dataset

head(Lab3_filtered)

Age	Sex	Employ_Stat	Union Status	Marital_Stat	Occupation	Educhieve
42	1	3	2	2	3	1
23	1	3	4	2	2	7
36	2	1	3	2	1	8
35	1	6	2	2	3	1
53	1	3	2	2	5	8
54	2	6	2	2	3	1

summary(Lab3_filtered)

##       Age             Sex        Employ_Stat     Union Status    Marital_Stat  
##  Min.   :16.00   Min.   :1.00   Min.   :1.000   Min.   :1.000   Min.   :1.000  
##  1st Qu.:32.00   1st Qu.:1.00   1st Qu.:3.000   1st Qu.:2.000   1st Qu.:1.000  
##  Median :44.00   Median :1.00   Median :3.000   Median :3.000   Median :2.000  
##  Mean   :43.91   Mean   :1.44   Mean   :4.034   Mean   :2.735   Mean   :1.827  
##  3rd Qu.:55.00   3rd Qu.:2.00   3rd Qu.:6.000   3rd Qu.:4.000   3rd Qu.:2.000  
##  Max.   :92.00   Max.   :2.00   Max.   :6.000   Max.   :4.000   Max.   :5.000  
##    Occupation      Educhieve    
##  Min.   :1.000   Min.   :1.000  
##  1st Qu.:2.000   1st Qu.:1.000  
##  Median :3.000   Median :1.000  
##  Mean   :3.595   Mean   :2.687  
##  3rd Qu.:5.000   3rd Qu.:4.000  
##  Max.   :8.000   Max.   :8.000

mean(Lab3_filtered)

## Warning in mean.default(Lab3_filtered): argument is not numeric or logical:
## returning NA

## [1] NA

Data Analysis

Goal 1

Is there a difference in age groups based on whether respondents are employed in the public or private sector.

• Test to be conducted: Independent sample t test

• Data to be used.

Variable Variable Name Type of Variable Dependent Variable Age AGE Ratio Independent Variable Employ_Stat Nominal

For this analysis, we create a new variable with age groups

Set age as numeric

Lab3_filtered$Age <- as.numeric(Lab3_filtered$Age)

Null hypothesis: There is no significant difference in the mean age of respondents employed in the public and private sector.

Alternative hypothesis: There is a significant difference in the mean age of respondents employed in the public and private sector.

The assumption is that N1 + N2 > 40 & has no outliers.

Since the sample size is very large, the central limit theorem states that sample mean sampling distribution will be approximately normal irrespective of the population distribution. However, we can still do a test to show.

Conduct Anderson Darling test for normality

ad.test(Lab3_filtered$Age)

## 
##  Anderson-Darling normality test
## 
## data:  Lab3_filtered$Age
## A = 34.219, p-value < 2.2e-16

This result indicates that the p-value is less than the significance level of 0.05, suggesting strong evidence against the null hypothesis of normality. Therefore, the data may not be normally distributed, so it violates our assumption.

Plot a histogram to check normality

hist(Lab3_filtered$Age)

curve(dnorm(x, mean = mean(Lab3_filtered$Age), sd = sd(Lab3_filtered$Age)), # Add a normal curve to the histogram for comparison
      col = "blue", lwd = 2, add = TRUE)

hist(Lab3_filtered$Age, freq = FALSE, main = "Histogram of Age with Normal Distribution Overlay")
curve(dnorm(x, mean = mean(Lab3_filtered$Age), sd = sd(Lab3_filtered$Age)), 
      add = TRUE, col = "blue")

The histogram shows that the data reasonably follows the bell curve.

Probability Plot of Age

ggqqplot(Lab3_filtered$Age, 
         ylab = "Age", 
         main = "Probability Plot for Age")

ks.test(Lab3_filtered$Age, "pnorm", mean(Lab3_filtered$Age), sd(Lab3_filtered$Age))

## Warning in ks.test.default(Lab3_filtered$Age, "pnorm", mean(Lab3_filtered$Age),
## : ties should not be present for the Kolmogorov-Smirnov test

## 
##  Asymptotic one-sample Kolmogorov-Smirnov test
## 
## data:  Lab3_filtered$Age
## D = 0.057186, p-value < 2.2e-16
## alternative hypothesis: two-sided

The output of the Kolmogorov-Smirnov test for normality of the Age variable indicates a very small p-value (< 2.2e-16), which means that we can reject the null hypothesis of normality. This suggests that the Age variable is not normally distributed.

# Conduct Welch t-test

Recode employment status as public (1) or private (0)

Lab3_filtered$Public_Private <- ifelse(Lab3_filtered$Employ_Stat %in% c(1, 2), 1, 0)
# This is to set up the grouping variable for the Welch t - test

# Conduct variance test
var_test <- var.test(Age ~ Public_Private, data = Lab3_filtered)
p_value <- var_test$p.value

# Check if variances are equal
if (p_value > 0.05) {
  var_equal <- TRUE
} else {
  var_equal <- FALSE
}

The variance test was conducted to determine if the variances of the Age variable were equal between the Public and Private groups. A p-value of 0 was obtained, indicating that the variances were not equal. Therefore, the Welch’s t-test was used to compare the mean Age values between the two groups, as it does not assume equal variances.

Since the normality assumption is violated and variances are not equal, we can use a robust version of the t-test that does not assume normality, such as Welch’s t-test.

Perform a Welch t-test

Assuming you have two groups, with variable X and grouping variable Group

welch_t <- t.test(Age ~ Public_Private, data = Lab3_filtered, var.equal = FALSE)

# View the results
welch_t

## 
##  Welch Two Sample t-test
## 
## data:  Age by Public_Private
## t = 7.4431, df = 1398.1, p-value = 1.712e-13
## alternative hypothesis: true difference in means between group 0 and group 1 is not equal to 0
## 95 percent confidence interval:
##  2.288721 3.926869
## sample estimates:
## mean in group 0 mean in group 1 
##        44.28884        41.18104

welch_tidy <- tidy(welch_t)

welch_tidy %>%
  kable(format = "html", align = "c") %>%
  kable_styling(full_width = F)

estimate	estimate1	estimate2	statistic	p.value	parameter	conf.low	conf.high	method	alternative
3.107795	44.28884	41.18104	7.443093	0	1398.124	2.288721	3.926869	Welch Two Sample t-test	two.sided

ggboxplot(Lab3_filtered, x = "Public_Private", y = "Age",
          color = "Public_Private", palette = "jco",
          add = "jitter") +
  stat_compare_means(aes(label = ..p.format..),
                     label.y = 85, label.x = 1.5)

The Welch Two-Sample t-test was used to compare the mean age between two groups, Public and Private. The test showed a statistically significant difference between the mean age of the two groups, with a t-value of 7.4431 and a p-value of 1.712e-13. The 95% confidence interval for the difference in means between the two groups ranged from 2.288721 to 3.926869. These results suggest that there is a significant difference in the mean age between the Public and Private groups, with the Public group having a higher mean age of 44.28884 compared to the Private group with a mean age of 41.18104. This finding may have important implications for understanding differences in age distributions between Public and Private groups.

Goal 2:

To determine whether older individuals’ higher chance of being married when compared to younger individuals.

Null Hypothesis: There is no significant difference in the proportion of married individuals between older and younger age groups.

Alternative Hypothesis: The proportion of married individuals is significantly higher in older age groups compared to younger age groups.

We are going to test using this format :

(categorical variable with two groups: “older” and “younger”) and “marital_status” (categorical variable with two groups: “married” and “not married”).

Create two groups based on age

YoungMar <- Lab3_filtered$Marital_Stat[Lab3_filtered$Age < 45]
OldMar <- Lab3_filtered$Marital_Stat[Lab3_filtered$Age >= 45]

Calculate the proportion of married individuals in each group

propor1 <- sum(YoungMar == "1") / length(YoungMar)
propor2 <- sum(OldMar == "1") / length(OldMar)

Conduct a two-sample proportion test

propor_test <- prop.test(x = c(sum(YoungMar == "1"), sum(OldMar == "1")),
                  n = c(length(YoungMar), length(OldMar)))
# Print the test results
propor_test

## 
##  2-sample test for equality of proportions with continuity correction
## 
## data:  c(sum(YoungMar == "1"), sum(OldMar == "1")) out of c(length(YoungMar), length(OldMar))
## X-squared = 532.47, df = 1, p-value < 2.2e-16
## alternative hypothesis: two.sided
## 95 percent confidence interval:
##  -0.2488038 -0.2103394
## sample estimates:
##    prop 1    prop 2 
## 0.1393631 0.3689347

Make data more presentable

prop_table <- matrix(c(sum(YoungMar == "1"), sum(OldMar == "1"), 
                       length(YoungMar) - sum(YoungMar == "1"), length(OldMar) - sum(OldMar == "1")), 
                     nrow = 2, ncol = 2, dimnames = list(c("Young", "Old"), c("Married", "Not Married")))

kable(prop_table, format = "markdown", caption = "Comparison of Proportions by Age Group and Marital Status") %>%
  kable_styling(bootstrap_options = "striped", full_width = F) %>%
  add_header_above(c(" " = 1, "Marital Status" = 2)) %>%
  add_footnote("Source: Lab3_filtered data")

## Warning in kable_styling(., bootstrap_options = "striped", full_width = F):
## Please specify format in kable. kableExtra can customize either HTML or LaTeX
## outputs. See https://haozhu233.github.io/kableExtra/ for details.

## Warning in add_header_above(., c(` ` = 1, `Marital Status` = 2)): Please
## specify format in kable. kableExtra can customize either HTML or LaTeX outputs.
## See https://haozhu233.github.io/kableExtra/ for details.

Comparison of Proportions by Age Group and Marital Status
	Married	Not Married
Young	547	3378
Old	1361	2328

Note: ^a Source: Lab3_filtered data Output

# Create a data frame for the plot
Lab3_filtered3 <- data.frame(Group = c("YoungMar", "OldMar"),
                 Proportion = c(propor1, propor2))

ggplot(Lab3_filtered3, aes(x = Group, y = Proportion)) +
  geom_bar(stat = "identity", fill = "steelblue") +
  labs(title = "Proportion of married individuals by age group",
       subtitle = paste0("Two-sample proportion test: p-value = ", round(propor_test$p.value, 4),
                         ", 95% CI [", round(propor_test$conf.int[1], 4), ", ", round(propor_test$conf.int[2], 4), "]"))

Goal #3: Examine the relationship between employment status (public/private) and martial status.

Test to be conducted: (logistic regression – include control variables – age, gender, etc)

• Data to be used

Variable    Variable Name   Type of Variable

Dependent Variable Employment Status Q323 Nominal Independent Variable Marital Status MARITAL_STAT Nominal

Logistic regression can be used to model the relationship between a binary outcome variable and one or more predictor variables, which can be categorical or continuous

In this case, employment status is binary (Public or Private) and Marital status is categorical.

Control variables : Age - numerical, Sex - Binary

We recode the variables as numeric since they are nominal or integers

Binary Variable for Employment Status - Public_Private Marital_Stat - Marital Status

compute correlation matrix

cor_matrix <- cor(Lab3_filtered[, c("Public_Private", "Marital_Stat", "Age", "Sex", "Occupation", "Educhieve")])

In this matrix, we can see that there is a small negative correlation between “Public_Private” and “Marital_Stat” (-0.087), which means that people who work in public versus private sectors are slightly less likely to be married.

There is also a small positive correlation between “Sex” and “Public_Private” (0.143), which means that men are slightly more likely to work in the private sector than women.

cordata <- Lab3_filtered

my_corr <- cordata %>% 
  select(Public_Private, Marital_Stat, Age, Sex, Occupation, Educhieve) %>% 
  cor_test(method = "pearson") %>% 
  as_huxtable() %>% 
  set_all_padding(row = everywhere, col = everywhere, value = 6) %>% 
  set_bold(1, everywhere) %>% 
  set_top_border(1, everywhere, value = 0.8) %>% 
  set_bottom_border(1, everywhere, value = 0.4) %>% 
  set_caption("Correlation matrix: Public/Private, Marital Status, Age, and Sex")

my_corr

Correlation matrix: Public/Private, Marital Status, Age, and Sex
var1	var2	cor	statistic	p	conf.low	conf.high	method
Public_Private	Public_Private	1	5.86e+09	0	1	1	Pearson
Public_Private	Marital_Stat	-0.087	-7.65	2.22e-14	-0.11	-0.065	Pearson
Public_Private	Age	-0.071	-6.24	4.53e-10	-0.0937	-0.049	Pearson
Public_Private	Sex	0.14	12.6	3.11e-36	0.121	0.165	Pearson
Public_Private	Occupation	-0.24	-21.6	1.55e-100	-0.261	-0.219	Pearson
Public_Private	Educhieve	0.41	39.7	4.19e-313	0.395	0.432	Pearson
Marital_Stat	Public_Private	-0.087	-7.65	2.22e-14	-0.11	-0.065	Pearson
Marital_Stat	Marital_Stat	1	Inf	0	1	1	Pearson
Marital_Stat	Age	-0.06	-5.24	1.68e-07	-0.0823	-0.0375	Pearson
Marital_Stat	Sex	0.03	2.61	0.0091	0.00743	0.0523	Pearson
Marital_Stat	Occupation	0.044	3.85	0.000117	0.0217	0.0665	Pearson
Marital_Stat	Educhieve	-0.067	-5.89	4.08e-09	-0.0897	-0.0449	Pearson
Age	Public_Private	-0.071	-6.24	4.53e-10	-0.0937	-0.049	Pearson
Age	Marital_Stat	-0.06	-5.24	1.68e-07	-0.0823	-0.0375	Pearson
Age	Age	1	5.86e+09	0	1	1	Pearson
Age	Sex	-0.05	-4.37	1.24e-05	-0.0724	-0.0276	Pearson
Age	Occupation	0.1	9.07	1.53e-19	0.0811	0.126	Pearson
Age	Educhieve	-0.27	-24.5	2.65e-127	-0.291	-0.249	Pearson
Sex	Public_Private	0.14	12.6	3.11e-36	0.121	0.165	Pearson
Sex	Marital_Stat	0.03	2.61	0.0091	0.00743	0.0523	Pearson
Sex	Age	-0.05	-4.37	1.24e-05	-0.0724	-0.0276	Pearson
Sex	Sex	1	Inf	0	1	1	Pearson
Sex	Occupation	-0.28	-25.7	2.31e-139	-0.303	-0.261	Pearson
Sex	Educhieve	0.23	20.5	6.02e-91	0.207	0.25	Pearson
Occupation	Public_Private	-0.24	-21.6	1.55e-100	-0.261	-0.219	Pearson
Occupation	Marital_Stat	0.044	3.85	0.000117	0.0217	0.0665	Pearson
Occupation	Age	0.1	9.07	1.53e-19	0.0811	0.126	Pearson
Occupation	Sex	-0.28	-25.7	2.31e-139	-0.303	-0.261	Pearson
Occupation	Occupation	1	Inf	0	1	1	Pearson
Occupation	Educhieve	-0.54	-55.5	0	-0.552	-0.52	Pearson
Educhieve	Public_Private	0.41	39.7	4.19e-313	0.395	0.432	Pearson
Educhieve	Marital_Stat	-0.067	-5.89	4.08e-09	-0.0897	-0.0449	Pearson
Educhieve	Age	-0.27	-24.5	2.65e-127	-0.291	-0.249	Pearson
Educhieve	Sex	0.23	20.5	6.02e-91	0.207	0.25	Pearson
Educhieve	Occupation	-0.54	-55.5	0	-0.552	-0.52	Pearson
Educhieve	Educhieve	1	Inf	0	1	1	Pearson

corrplot(cor_matrix, type = "upper", tl.col = "black", tl.srt = 45, method = "circle", addCoef.col = "black")

ggtitle("Correlation Matrix")

## $title
## [1] "Correlation Matrix"
## 
## attr(,"class")
## [1] "labels"

The strength of the correlation is indicated by the color of the square, with warmer colors indicating stronger positive correlations and cooler colors indicating stronger negative correlations

png("correlation_matrix.png", width=1200, height=1200)
pairs.panels(my_corr,
             smooth = TRUE,
             scale = FALSE,
             density = TRUE,
             ellipses = TRUE,
             method = "pearson",
             pch = 21,
             lm = TRUE,
             cor = TRUE,
             jiggle = FALSE,
             factor = 2,
             hist.col = 4,
             stars = TRUE,
             ci = FALSE)

Logistic Regression

LogitModelr <- glm(Public_Private ~ Marital_Stat + Age + Sex + Occupation + Educhieve,
                   data = Lab3_filtered, family = "binomial")

Model Summary

summ (LogitModelr , exp = TRUE )

Observations	7614
Dependent variable	Public_Private
Type	Generalized linear model
Family	binomial
Link	logit

𝛘²(5)	1167.44
Pseudo-R² (Cragg-Uhler)	0.27
Pseudo-R² (McFadden)	0.21
AIC	4532.20
BIC	4573.83

	exp(Est.)	2.5%	97.5%	z val.	p
(Intercept)	0.03	0.02	0.06	-11.67	0.00
Marital_Stat	0.67	0.58	0.77	-5.66	0.00
Age	1.00	1.00	1.01	0.70	0.48
Sex	1.53	1.30	1.79	5.15	0.00
Occupation	0.98	0.93	1.03	-0.85	0.40
Educhieve	1.47	1.42	1.52	22.36	0.00
Standard errors: MLE

The model has a Pseudo-R² (McFadden) value of 0.21, which indicates that the model explains 21% of the variation in the dependent variable. The predictors Marital_Stat, Sex, and Educhieve have statistically significant effects on the probability of being in the Public or Private employment sector, as their p-values are less than 0.05. The odds ratio of Sex is 1.53, which means that holding all other predictors constant, the odds of being in the Public employment sector for males (coded as 1) are 1.53 times higher than the odds for females (coded as 0). The odds ratio of Educhieve is 1.47, which means that holding all other predictors constant, a one-unit increase in Educhieve is associated with a 47% increase in the odds of being in the Public employment sector. The predictors Age and Occupation are not statistically significant, as their p-values are greater than 0.05. The AIC value of the model is 4532.20, which can be used to compare it to other models to determine which model fits the data better. A lower AIC value indicates a better fit. The coefficients of the model suggest that Marital_Stat, Sex, and Educhieve are significant predictors of Public_Private. Age and Occupation were not found to be significant predictors.

Multiple Regression Analysis

Test for Assumptions

Linearity:

Plot each independent variable against the dependent variable to check for linearity

plot(Lab3_filtered$Marital_Stat, Lab3_filtered$Public_Private)

plot(Lab3_filtered$Educhieve, Lab3_filtered$Public_Private)

plot(Lab3_filtered$Sex, Lab3_filtered$Public_Private)

Independence of errors:

plot(predict(LogitModelr), residuals(LogitModelr))

Normality of errors:

hist(residuals(LogitModelr))

Equal variance (homoscedasticity):

Plot the residuals against the predicted values to check for equal variance

plot(predict(LogitModelr), residuals(LogitModelr), pch = 20)
abline(h = 0, col = "red")

For linearity: We look for a linear relationship between the two variables, meaning that the points on the scatter plot should form a roughly straight line. In this case, they do.

For the other assumptions, they seem to observe the tenets of a multiple regression.

Categorical variables when used in regression must be coded as 0/1 type variables.

We therefore create new variables that we will code for use

# code the educhieve variable as a binary categorical variable in R
Lab3_filtered$Educhieve_binary <- ifelse(Lab3_filtered$Educhieve > 1, 1, 0)
# coded as 1 if the educhieve variable is greater than 0, and 0 otherwise. This represents years of secondary education

Code the Marital_Stat variable as a 0/1 binary variable where 1 represents married and 0 represents not married

Lab3_filtered$Married <- ifelse(Lab3_filtered$Marital_Stat == 1, 1, 0)

Create a binary variable for Sex that is 0/1

Lab3_filtered$Sex_binary <- ifelse(Lab3_filtered$Sex == 2, 0, 1)

The multiple regression analysis will allow us to explore the relationship between employment status and marital status while controlling for the effects of sex and education. The dependent variable is employment status, which is a nominal variable that could take values such as privately or publicly employed. The independent variable of interest is marital status, which is also nominal and could take values such as married, divorced, widowed, or never married. The control variables are sex, which is binary (male or female), and education achieved, which is a continuous variable indicating the highest academic exam passed. .

The multiple regression model will estimate the effect of marital status on employment status, while holding constant the effects of sex and education. The regression model will test whether there is a significant relationship between employment status and marital status, after controlling for sex and education. If the regression model finds a significant relationship between employment status and marital status, we can conclude that marital status is a predictor of employment status, even after controlling for sex and educhieve.

Null Hypothesis (H0): There is no significant relationship between employment status and marital status after controlling for sex and educational achievement.

Alternative Hypothesis (H1): There is a significant relationship between employment status and marital status after controlling for sex and educational achievement.

cor_matrix2 <- cor(Lab3_filtered[, c("Public_Private", "Married", "Sex_binary", "Educhieve_binary")])

print(cor_matrix2)

##                  Public_Private      Married   Sex_binary Educhieve_binary
## Public_Private        1.0000000  0.102951634 -0.143327912       0.32870091
## Married               0.1029516  1.000000000 -0.009171946       0.02255395
## Sex_binary           -0.1433279 -0.009171946  1.000000000      -0.22272016
## Educhieve_binary      0.3287009  0.022553946 -0.222720157       1.00000000

Run linear model

linmod2 <- lm(Public_Private ~  Married + Sex_binary + Educhieve_binary, data = Lab3_filtered )

Output Results vs 1

summary (linmod2)

## 
## Call:
## lm(formula = Public_Private ~ Married + Sex_binary + Educhieve_binary, 
##     data = Lab3_filtered)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -0.3417 -0.2209 -0.0565 -0.0079  0.9921 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       0.056504   0.006737   8.388  < 2e-16 ***
## Married           0.072296   0.008151   8.870  < 2e-16 ***
## Sex_binary       -0.048600   0.007298  -6.660 2.94e-11 ***
## Educhieve_binary  0.212951   0.007564  28.154  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3081 on 7610 degrees of freedom
## Multiple R-squared:  0.1223, Adjusted R-squared:  0.1219 
## F-statistic: 353.4 on 3 and 7610 DF,  p-value: < 2.2e-16

plotreg (linmod2)

## Model: bars denote standard errors (95%).

R Code: Diagnostic Plots

par(mfrow =c(2 ,2)) # Create a 2x2 grid
plot (linmod2)

New Assignment 2.2

Devin McIntosh

2023-04-07