Background

This is the code for all the analyses and figures in the book R for Introductory Statistics in the Social Sciences by Mark A. Perkins, Ph.D., published by Springer Nature and edited by Faith Su. Use this as a guide to reproduce all the content in the book. Some simulated numbers may vary.

Getting Started

Be sure to chunk your code if you are using R Markdown.

Example chunk:

```{r example-chunk, message=FALSE, warning=FALSE, echo=TRUE}
# YOUR AWESOME CODE GOES IN THIS SPACE
```

Load your libraries. Be sure to install your packages first!

# You will get a lot of output. I’ve suppressed it to prevent overcrowding

packages <- c(
  "ggplot2", "psych", "vcd", "gridExtra", "MASS", "dplyr", "tidyr", "scales",
  "datasets", "AER", "DiagrammeR", "magick", "patchwork", "effectsize",
  "tidyverse", "ez", "rstatix", "car", "apaTables", "nlme", "dunn.test",
  "bannerCommenter", "grid", "mosaicData", "effsize"
)
lapply(packages, library, character.only = TRUE)
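
If a package in this list is not yet installed, library() will fail. A minimal sketch that installs only the missing ones (run once, before the lapply() call above):

# Install any packages from the list that are not yet installed
missing <- packages[!packages %in% rownames(installed.packages())]
if (length(missing) > 0) install.packages(missing)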

Figure 3.1: How our students scored compared to the mean and each other as coded in R

library(ggplot2)

# Create the data
data <- data.frame(
  Name = c("Malcolm", "Josie", "Liam", "Jackson", "Xavier"),
  Score = c(10, 12, 12, 15, 23)
)

# Calculate the mean
mean_score <- mean(data$Score)

# Create the plot object (so we can save it later)
p <- ggplot(data, aes(x = Name, y = Score)) +
  geom_line(group = 1, color = "blue", size = 1) +  # Line connecting the points
  geom_point(aes(color = Score), size = 4) +        # Points representing the actual data
  geom_hline(yintercept = mean_score, linetype = "dashed", color = "red") +  # Mean line
  labs(title = "Line Plot with Mean in the Middle", 
       x = "Name", 
       y = "Score") +
  scale_color_gradient(low = "lightblue", high = "darkblue") +  # Color scale for points
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))  # Rotate x-axis labels

# Define folder and file name
folder <- "R figures 2"
if(!dir.exists(folder)) dir.create(folder)
file_path <- file.path(folder, "figure3.1.tif")

# Save as high-resolution TIFF (6x4 inches, 300 dpi)
tiff(filename = file_path, width = 6*300, height = 4*300, res = 300)
print(p)
dev.off()
## png 
##   2
p

This is how skewness, one indicator of normality, is calculated in R using the psych package.

# Load necessary library
library(psych)

# Load the mtcars dataset
data(mtcars)

# Select the 'mpg' variable (miles per gallon)
mpg_data <- mtcars$mpg

# Calculate skewness using the psych package
psych_skew <- skew(mpg_data)

# Calculate unstandardized skewness (raw third moment)
N <- length(mpg_data)
mean_mpg <- mean(mpg_data)
raw_skew <- (1 / N) * sum((mpg_data - mean_mpg)^3)

# Calculate standardized skewness
sd_mpg <- sd(mpg_data)
std_skew <- raw_skew / (sd_mpg^3)

# Print results for comparison
cat("Psych skewness: ", psych_skew, "\n")
## Psych skewness:  0.610655
cat("Raw skewness (unstandardized): ", raw_skew, "\n")
## Raw skewness (unstandardized):  133.6867
cat("Standardized skewness: ", std_skew, "\n")
## Standardized skewness:  0.610655
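
Skewness is a descriptive index; for a formal test of normality on the same variable, base R's shapiro.test() can serve as a quick complement (a minimal sketch; the Shapiro-Wilk test is applied more formally in later chapters):

# Formal Shapiro-Wilk normality test on the same variable (base R)
shapiro.test(mpg_data)
# p < .05 would suggest a departure from normality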

This compares the psych package formula for kurtosis to manual forms. Since the psych package excess kurtosis and the manual excess kurtosis are equal, we have identified how the psych package calculates kurtosis, as shown in Chapter 2.

# Load necessary library
library(psych)

# Load the mtcars dataset
data(mtcars)

# Select the 'mpg' variable (miles per gallon)
mpg_data <- mtcars$mpg

# Calculate the sample size (n)
n <- length(mpg_data)

# Calculate the mean (x̄) and standard deviation (σ) of the mpg data
mean_mpg <- mean(mpg_data)
sd_mpg <- sd(mpg_data)

# Calculate the fourth central moment (m4)
m4 <- sum(((mpg_data - mean_mpg) / sd_mpg)^4) / n  # Population m4

# Calculate excess kurtosis (manual)
manual_kurtosis <- m4 - 3

# Calculate kurtosis using psych package for comparison
psych_kurtosis <- kurtosi(mpg_data)

# Output the results
cat("Manual Excess Kurtosis (simplified): ", manual_kurtosis, "\n")
## Manual Excess Kurtosis (simplified):  -0.372766
cat("Psych Package Excess Kurtosis: ", psych_kurtosis, "\n")
## Psych Package Excess Kurtosis:  -0.372766
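
A common rule of thumb standardizes skewness and kurtosis by their approximate standard errors; z-like values beyond roughly ±2 flag possible non-normality. A minimal sketch, assuming the large-sample approximations SE(skew) ≈ sqrt(6/n) and SE(kurtosis) ≈ sqrt(24/n):

# z-like statistics for skewness and excess kurtosis
se_skew <- sqrt(6 / n)    # approximate standard error of skewness
se_kurt <- sqrt(24 / n)   # approximate standard error of excess kurtosis
skew(mpg_data) / se_skew
kurtosi(mpg_data) / se_kurt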

Figure 3.2: Normal distribution of 2nd grade students’ heights.

library(ggplot2)

# Set seed for reproducibility
set.seed(123)

# Simulate heights for 500 second graders
n <- 500
mean_height <- 50
sd_height <- 3
heights <- round(rnorm(n, mean = mean_height, sd = sd_height))
heights <- pmax(40, pmin(heights, 60))  # Clip to 40–60 inches

# Create the plot object
p2 <- ggplot(data.frame(heights), aes(x = heights)) +
  geom_histogram(aes(y = ..count..), 
                 binwidth = 1, fill = "dodgerblue4", color = "black", boundary = 40) +
  geom_density(fill = "red", alpha = 0.2, adjust = 1.5) +
  labs(title = "Normal Distribution (Bell Curve) of Second Graders' Heights", 
       x = "Height (inches)", y = "Number of Kids") +
  theme_minimal() +
  theme(axis.text = element_text(size = 12), axis.title = element_text(size = 14)) +
  scale_x_continuous(breaks = seq(40, 60, by = 1))

# Define folder and file path
folder <- "R figures 2"
if(!dir.exists(folder)) dir.create(folder)
file_path <- file.path(folder, "figure3.2.tif")

# Save as high-resolution TIFF (6x4 inches, 300 dpi)
tiff(filename = file_path, width = 6*300, height = 4*300, res = 300)
print(p2)
dev.off()
## png 
##   2
p2

Figure 3.3: Normal, positively skewed, and negatively skewed distributions

# Load required libraries
library(ggplot2)
library(gridExtra)
library(MASS)  # For generating skewed distributions

# Set up the number of data points
n <- 1000

# Create data for the distributions
normal_data <- rnorm(n, mean = 5, sd = 1)  # Normal distribution with mean = 5
positively_skewed_data <- rbeta(n, shape1 = 2, shape2 = 5)  # Beta distribution (positively skewed)
negatively_skewed_data <- rbeta(n, shape1 = 5, shape2 = 2)  # Beta distribution (negatively skewed)

# Function to calculate the peak of the density curve for mode label placement
get_peak_y <- function(data) {
  density_data <- density(data, bw = 1)  # Larger bandwidth for "fatter" curves
  peak_y <- max(density_data$y)
  return(peak_y)
}

# Custom Labeling function to avoid overlap
adjust_label_y <- function(peak_y, label_number, offset_factor = 0.1) {
  # Spacing the labels further apart by adding an offset factor for each label
  vertical_offset <- peak_y - (offset_factor * label_number)
  return(vertical_offset)
}

# Normal Distribution Plot
normal_mean <- mean(normal_data)
normal_median <- median(normal_data)
normal_mode <- normal_mean  # For a normal distribution the mode equals the mean
normal_peak_y <- get_peak_y(normal_data)

normal_plot <- ggplot(data.frame(x = normal_data), aes(x = x)) +
  geom_density(fill = "skyblue", color = "black", adjust = 1.5) +
  geom_vline(aes(xintercept = normal_mean), color = "red", linetype = "dashed") +
  geom_vline(aes(xintercept = normal_median), color = "blue", linetype = "dotted") +
  geom_vline(aes(xintercept = normal_mode), color = "black", linetype = "solid") +
  annotate("text", x = normal_mean, y = normal_peak_y * 0.9, label = "Mean", color = "red", hjust = -0.2) +
  annotate("text", x = normal_median, y = normal_peak_y * 0.75, label = "Median", color = "blue", hjust = -0.2) +
  annotate("text", x = normal_mode, y = normal_peak_y * 0.5, label = "Mode", color = "black", hjust = -0.2) +
  ggtitle("Normal Distribution") + theme_minimal()

# Positively Skewed Distribution Plot
positively_skewed_mean <- mean(positively_skewed_data)
positively_skewed_median <- median(positively_skewed_data)
positively_skewed_mode <- positively_skewed_mean - (positively_skewed_mean * 0.19)  # Mode estimate for skewed
positively_skewed_peak_y <- get_peak_y(positively_skewed_data)

positively_skewed_plot <- ggplot(data.frame(x = positively_skewed_data), aes(x = x)) +
  geom_density(fill = "lightgreen", color = "black", adjust = 1.5) +
  geom_vline(aes(xintercept = positively_skewed_mean), color = "red", linetype = "dashed") +
  geom_vline(aes(xintercept = positively_skewed_median), color = "blue", linetype = "dotted") +
  geom_vline(aes(xintercept = positively_skewed_mode), color = "black", linetype = "solid") +
  annotate("text", x = positively_skewed_mean, y = positively_skewed_peak_y * 3, label = "Mean", color = "red", hjust = -0.2) +
  annotate("text", x = positively_skewed_median, y = positively_skewed_peak_y * 2, label = "Median", color = "blue", hjust = -0.2) +
  annotate("text", x = positively_skewed_mode, y = positively_skewed_peak_y * 1, label = "Mode", color = "black", hjust = -0.2) +
  ggtitle("Positively Skewed Distribution") + theme_minimal()

# Negatively Skewed Distribution Plot
negatively_skewed_mean <- mean(negatively_skewed_data)
negatively_skewed_median <- median(negatively_skewed_data)
negatively_skewed_mode <- negatively_skewed_mean + (negatively_skewed_mean * 0.09)  # Adjusting mode for skewed distribution
negatively_skewed_peak_y <- get_peak_y(negatively_skewed_data)

negatively_skewed_plot <- ggplot(data.frame(x = negatively_skewed_data), aes(x = x)) +
  geom_density(fill = "orange", color = "black", adjust = 1) +
  geom_vline(aes(xintercept = negatively_skewed_mean), color = "red", linetype = "dashed") +
  geom_vline(aes(xintercept = negatively_skewed_median), color = "blue", linetype = "dotted") +
  geom_vline(aes(xintercept = negatively_skewed_mode), color = "black", linetype = "solid") +
  annotate("text", x = negatively_skewed_mean, y = negatively_skewed_peak_y * 3, label = "Mean", color = "red", hjust = -0.2) +
  annotate("text", x = negatively_skewed_median, y = negatively_skewed_peak_y * 2, label = "Median", color = "blue", hjust = -0.2) +
  annotate("text", x = negatively_skewed_mode, y = negatively_skewed_peak_y * 1, label = "Mode", color = "black", hjust = -0.2) +
  ggtitle("Negatively Skewed Distribution") + theme_minimal()

# Arrange the plots in a two-column grid
grid.arrange(normal_plot, positively_skewed_plot, negatively_skewed_plot, ncol = 2)

# Define folder and file path
folder <- "R figures 2"
if(!dir.exists(folder)) dir.create(folder)
file_path <- file.path(folder, "figure3.3.tif")

# Save as high-resolution TIFF
tiff(filename = file_path, width = 12*300, height = 8*300, res = 300)  # Wider and taller to fit multiple plots
grid.arrange(normal_plot, positively_skewed_plot, negatively_skewed_plot, ncol = 2)
dev.off()
## png 
##   2

Figure 3.4: Mesokurtic, platykurtic and leptokurtic distributions

library(ggplot2)
library(gridExtra)
library(MASS)

# Set up the number of data points
n <- 1000

# Create data for the distributions
mesokurtic_data <- rnorm(n, mean = 5, sd = 1)      # Normal distribution (Mesokurtic)
platykurtic_data <- runif(n, min = 0, max = 10)   # Uniform distribution (Platykurtic)
leptokurtic_data <- rt(n, df = 3)                 # T-distribution with 3 degrees of freedom (Leptokurtic)

# Function to calculate the peak of the density curve
get_peak_y <- function(data) {
  density_data <- density(data, bw = 1)
  max(density_data$y)
}

# Mesokurtic Plot
mesokurtic_plot <- ggplot(data.frame(x = mesokurtic_data), aes(x = x)) +
  geom_density(fill = "skyblue", color = "black", adjust = 1.5) +
  geom_vline(aes(xintercept = mean(mesokurtic_data)), color = "red", linetype = "dashed") +
  geom_vline(aes(xintercept = median(mesokurtic_data)), color = "blue", linetype = "dotted") +
  geom_vline(aes(xintercept = mean(mesokurtic_data)), color = "black", linetype = "solid") +
  ggtitle("Mesokurtic Distribution") + theme_minimal()

# Platykurtic Plot
platykurtic_plot <- ggplot(data.frame(x = platykurtic_data), aes(x = x)) +
  geom_density(fill = "lightgreen", color = "black", adjust = 1.5) +
  geom_vline(aes(xintercept = mean(platykurtic_data)), color = "red", linetype = "dashed") +
  geom_vline(aes(xintercept = median(platykurtic_data)), color = "blue", linetype = "dotted") +
  geom_vline(aes(xintercept = mean(platykurtic_data)), color = "black", linetype = "solid") +
  ggtitle("Platykurtic Distribution") + theme_minimal()

# Leptokurtic Plot
leptokurtic_plot <- ggplot(data.frame(x = leptokurtic_data), aes(x = x)) +
  geom_density(fill = "orange", color = "black", adjust = 1) +
  geom_vline(aes(xintercept = mean(leptokurtic_data)), color = "red", linetype = "dashed") +
  geom_vline(aes(xintercept = median(leptokurtic_data)), color = "blue", linetype = "dotted") +
  geom_vline(aes(xintercept = mean(leptokurtic_data)), color = "black", linetype = "solid") +
  ggtitle("Leptokurtic Distribution") + theme_minimal()

# Define folder and file path
folder <- "R figures 2"
if(!dir.exists(folder)) dir.create(folder)
file_path <- file.path(folder, "figure3.4.tif")

# Save as high-resolution TIFF
tiff(filename = file_path, width = 12*300, height = 8*300, res = 300)
grid.arrange(mesokurtic_plot, platykurtic_plot, leptokurtic_plot, ncol = 2)
dev.off()
## png 
##   2
grid.arrange(mesokurtic_plot, platykurtic_plot, leptokurtic_plot, ncol = 2)

Figure 3.5: The normal distribution given percentile and Z score (number of standard deviations)

The code chunk for this figure is not echoed in this document; only its saved-file output appears below.

## png 
##   2
## Figure saved to: R figures 2/figure3.5.tif

Figure 3.6: Distributions of other standardized scores

library(ggplot2)
library(gridExtra)

# Define parameters for each test
tests <- list(
  IQ = list(mean = 100, sd = 15, x_range = c(40, 160)),
  T_score = list(mean = 50, sd = 10, x_range = c(10, 90)),
  Stanine = list(mean = 5, sd = 2, x_range = c(0, 10))
)

# Function to create bell curve data
create_bell_curve <- function(mean, sd, x_range, test_name) {
  x <- seq(x_range[1], x_range[2], length.out = 1000)
  y <- dnorm(x, mean = mean, sd = sd)
  data.frame(x = x, y = y, test = test_name)
}

# Generate data for all tests
bell_curves <- do.call(rbind, lapply(names(tests), function(test_name) {
  params <- tests[[test_name]]
  create_bell_curve(params$mean, params$sd, params$x_range, test_name)
}))

# Create individual plots with consistent scaling
plots <- lapply(unique(bell_curves$test), function(test_name) {
  params <- tests[[test_name]]
  bell_curves_scaled <- bell_curves[bell_curves$test == test_name, ]
  bell_curves_scaled$y <- bell_curves_scaled$y / max(bell_curves_scaled$y)  # Scale to max=1
  
  ggplot(bell_curves_scaled, aes(x = x, y = y)) +
    geom_line(color = "blue", size = 1) +
    ggtitle(paste(test_name, "Distribution")) +
    theme_minimal() +
    labs(x = "Score", y = "Density") +
    scale_x_continuous(
      breaks = c(seq(params$mean - 3 * params$sd, params$mean + 3 * params$sd, by = params$sd), params$mean),
      labels = function(x) ifelse(x == params$mean, paste0(x, " (Mean)"), x)
    )
})

# Define folder and file path
folder <- "R figures 2"
if(!dir.exists(folder)) dir.create(folder)
file_path <- file.path(folder, "figure3.6.tif")

# Save as high-resolution TIFF
tiff(filename = file_path, width = 12*300, height = 8*300, res = 300)  # 12x8 inches
grid.arrange(grobs = plots, ncol = 2)
dev.off()
## png 
##   2
grid.arrange(grobs = plots, ncol = 2)

It is time to calculate descriptive statistics. We will be using the CPS85 dataset for this example.

# Create a dataset of descriptives called "descriptives" using the CPS85 dataset
descriptives <- CPS85 %>%
  dplyr::select(wage, educ, exper, age)%>%
  psych::describe()


descriptives
##       vars   n  mean    sd median trimmed   mad min  max range  skew kurtosis
## wage     1 534  9.02  5.14   7.78    8.28  4.12   1 44.5  43.5  1.69     4.90
## educ     2 534 13.02  2.62  12.00   13.05  1.48   2 18.0  16.0 -0.20     0.81
## exper    3 534 17.82 12.38  15.00   16.75 11.86   0 55.0  55.0  0.68    -0.40
## age      4 534 36.83 11.73  35.00   36.07 11.86  18 64.0  46.0  0.55    -0.60
##         se
## wage  0.22
## educ  0.11
## exper 0.54
## age   0.51

Now it is time to run descriptives by categorical variables

# See unique categories for the CPS85 dataset
unique(CPS85$sector)
## [1] const    sales    clerical service  manuf    prof     other    manag   
## Levels: clerical const manag manuf other prof sales service
# Count how many unique sectors there are
n_distinct(CPS85$sector)
## [1] 8
# Get descriptive statistics by sector
descriptives_sector <- psych::describeBy(CPS85$wage, group = CPS85$sector, mat = TRUE)

# Convert the matrix into a data frame
descriptives_sector<- as.data.frame(descriptives_sector)

write.csv(descriptives_sector, "descriptives_sector.csv")

descriptives_sector
##     item   group1 vars   n      mean       sd median   trimmed      mad  min
## X11    1 clerical    1  97  7.422577 2.699018  7.500  7.299873 3.335850 3.00
## X12    2    const    1  20  9.502000 3.343877  9.750  9.499375 3.647196 3.75
## X13    3    manag    1  55 12.704000 7.572513 10.620 11.966222 6.493788 1.00
## X14    4    manuf    1  68  8.036029 4.117607  6.750  7.511786 3.335850 3.00
## X15    5    other    1  68  8.500588 4.601049  6.940  7.869464 3.261720 2.85
## X16    6     prof    1 105 11.947429 5.523833 10.610 11.360706 5.352186 4.35
## X17    7    sales    1  38  7.592632 4.232272  5.725  7.157813 2.891070 3.35
## X18    8  service    1  83  6.537470 3.673278  5.500  5.937612 2.816940 1.75
##       max range        skew    kurtosis        se
## X11 15.03 12.03  0.39715109 -0.71992219 0.2740438
## X12 15.00 11.25 -0.02255889 -1.05839301 0.7477136
## X13 44.50 43.50  1.47308721  3.67335914 1.0210774
## X14 22.20 19.20  1.26924002  1.53036996 0.4993332
## X15 26.00 23.15  1.46528476  2.30867798 0.5579592
## X16 24.98 20.63  0.77278554 -0.36441619 0.5390709
## X17 19.98 16.63  0.97785906  0.01196841 0.6865651
## X18 25.00 23.25  2.04386695  6.36667089 0.4031946
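
The same by-group descriptives can also be sketched with dplyr alone, which is convenient when only a few statistics are needed (an alternative to describeBy(), not a replacement for it):

# dplyr alternative: selected descriptives of wage by sector
CPS85 %>%
  group_by(sector) %>%
  summarise(
    n      = n(),
    mean   = mean(wage),
    sd     = sd(wage),
    median = median(wage),
    .groups = "drop"
  )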

We will generate histograms overall and then by category (Figures 4.5 and 4.6 in book).

# Load libraries
library(ggplot2)
library(mosaicData)  # CPS85 comes from the mosaicData package

# Create folder for figures
folder <- "R figures 2"
if(!dir.exists(folder)) dir.create(folder)

# --- Figure 4.5: Overall Wage Distribution ---
fig4_5 <- ggplot(CPS85, aes(x = wage)) +
  geom_histogram(aes(y = ..density..), binwidth = 1, fill = "skyblue", 
                 color = "black", alpha = 0.7) +
  geom_density(color = "red", fill = "red", alpha = 0.3) +
  theme_minimal() +
  labs(title = "Overall Wage Distribution", x = "Wage", y = "Density")

# Save Figure 4.5
tiff4_5 <- file.path(folder, "Figure4.5.tif")
ggsave(filename = tiff4_5, plot = fig4_5, device = "tiff", dpi = 300, width = 8, height = 6)
cat("Figure 4.5 saved as TIFF at:", tiff4_5, "\n")
## Figure 4.5 saved as TIFF at: R figures 2/Figure4.5.tif
# --- Figure 4.6: Wage Distribution by Sector ---
fig4_6 <- ggplot(CPS85, aes(x = wage)) +
  geom_histogram(aes(y = ..density..), binwidth = 1, fill = "skyblue",
                 color = "black", alpha = 0.7) +
  geom_density(color = "red", fill = "red", alpha = 0.3) +
  facet_wrap(~ sector) +
  theme_minimal() +
  labs(title = "Wage Distribution by Sector", x = "Wage", y = "Density")

# Save Figure 4.6
tiff4_6 <- file.path(folder, "Figure4.6.tif")
ggsave(filename = tiff4_6, plot = fig4_6, device = "tiff", dpi = 300, width = 10, height = 6)
cat("Figure 4.6 saved as TIFF at:", tiff4_6, "\n")
## Figure 4.6 saved as TIFF at: R figures 2/Figure4.6.tif
fig4_5

fig4_6

Calculating frequencies in R

library(dplyr)
library(tidyr)

## Single tables
married_table<- table(CPS85$married)

count_table <- table(CPS85$married, CPS85$union)

married_prop<- prop.table(table(CPS85$married))

married_union_prop<- prop.table((table(CPS85$married, CPS85$union)))

# Combine into one table

df_count <- as.data.frame(count_table)

df_count <- df_count %>%
  rename(Married = Var1, Union = Var2, Count = Freq)

df_count <- df_count %>%
  group_by(Union) %>%
  mutate(Proportion = Count / sum(Count))

df_total <- df_count %>%
  group_by(Married) %>%
  summarise(Count = sum(Count), Proportion = sum(Proportion)) %>%
  mutate(Union = "Total")

final_table <- bind_rows(df_count, df_total)

married_table
## 
## Married  Single 
##     350     184
count_table
##          
##           Not Union
##   Married 278    72
##   Single  160    24
married_prop
## 
##   Married    Single 
## 0.6554307 0.3445693
married_union_prop
##          
##                  Not      Union
##   Married 0.52059925 0.13483146
##   Single  0.29962547 0.04494382
df_count
## # A tibble: 4 × 4
## # Groups:   Union [2]
##   Married Union Count Proportion
##   <fct>   <fct> <int>      <dbl>
## 1 Married Not     278      0.635
## 2 Single  Not     160      0.365
## 3 Married Union    72      0.75 
## 4 Single  Union    24      0.25
df_total
## # A tibble: 2 × 4
##   Married Count Proportion Union
##   <fct>   <int>      <dbl> <chr>
## 1 Married   350      1.38  Total
## 2 Single    184      0.615 Total
final_table
## # A tibble: 6 × 4
## # Groups:   Union [3]
##   Married Union Count Proportion
##   <fct>   <chr> <int>      <dbl>
## 1 Married Not     278      0.635
## 2 Single  Not     160      0.365
## 3 Married Union    72      0.75 
## 4 Single  Union    24      0.25 
## 5 Married Total   350      1.38 
## 6 Single  Total   184      0.615
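
Base R offers shortcuts for the same counts-with-totals view. A minimal sketch using addmargins() for totals and prop.table(..., margin = 2) for the same within-union proportions computed in df_count above:

# Counts with row and column totals
addmargins(count_table)

# Proportions within each union column (matches df_count$Proportion)
prop.table(count_table, margin = 2)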

This is a plot of proportions by category (Figure 4.7 in book)

# Load libraries
library(ggplot2)
library(dplyr)
library(tidyr)
library(scales)

# Create folder for figures
folder <- "R figures 2"
if(!dir.exists(folder)) dir.create(folder)

# --- Prepare frequency table ---
frequency_table <- CPS85 %>%
  dplyr::select(race, sex, hispanic, south, married, union, sector) %>%
  pivot_longer(cols = everything(), names_to = "Variable", values_to = "Category") %>%
  group_by(Variable, Category) %>%
  summarise(Frequency = n(), .groups = "drop") %>%
  group_by(Variable) %>%
  mutate(Proportion = Frequency / sum(Frequency)) %>%
  arrange(Variable, desc(Frequency))

# Save frequency table to CSV
write.csv(frequency_table, "frequencytable.csv", row.names = FALSE)

# --- Create Figure 4.7 ---
fig4_7 <- ggplot(frequency_table, aes(x = Category, y = Frequency, fill = Category)) +
  geom_bar(stat = "identity", show.legend = FALSE) + 
  geom_text(
    aes(label = percent(Proportion, accuracy = 0.1)),
    position = position_stack(vjust = 0.5),
    size = 3, color = "black"
  ) +
  facet_wrap(~Variable, scales = "free", ncol = 2) +
  theme_minimal() +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),
    strip.text = element_text(size = 10, face = "bold")
  ) +
  labs(
    title = "Frequency and Proportion of Categories by Variable",
    x = "Category",
    y = "Frequency"
  )

# Save Figure 4.7 as TIFF (300 dpi)
tiff4_7 <- file.path(folder, "Figure4.7.tif")
ggsave(filename = tiff4_7, plot = fig4_7, device = "tiff", dpi = 300, width = 10, height = 8)
cat("Figure 4.7 saved as TIFF at:", tiff4_7, "\n")
## Figure 4.7 saved as TIFF at: R figures 2/Figure4.7.tif
fig4_7

Figure 5.1: A hypothetical experimental study on reading to establish causality

library(DiagrammeR)
library(DiagrammeRsvg)
library(rsvg)
library(magick)

# Create the diagram
diagram <- grViz("
digraph experiment {
  graph [layout = dot, rankdir = LR]

  Population [label = 'Population', shape = rectangle, style = filled, fillcolor = lightblue, fontcolor = black, width = 1]
  Sample [label = 'Randomly\\nSample\\nPopulation', shape = rectangle, style = filled, fillcolor = palegreen, fontcolor = black, width = 1]
  Split [label = 'Randomly\\nSplit into\\nGroups', shape = rectangle, style = filled, fillcolor = palegreen, fontcolor = black, width = 1]

  PreA [label = 'Pre-Test:\\nGroup A', shape = rectangle, style = filled, fillcolor = mistyrose, fontcolor = black, width = 1]
  ImplementA [label = 'Implement\\nCurriculum A', shape = rectangle, style = filled, fillcolor = mistyrose, fontcolor = black, width = 1]
  PostA [label = 'Post-Test:\\nGroup A', shape = rectangle, style = filled, fillcolor = mistyrose, fontcolor = black, width = 1]
  FollowUpA [label = 'Follow-Up\\nTest:\\nGroup A', shape = rectangle, style = filled, fillcolor = mistyrose, fontcolor = black, width = 1]

  PreB [label = 'Pre-Test:\\nGroup B', shape = rectangle, style = filled, fillcolor = lemonchiffon, fontcolor = black, width = 1]
  ImplementB [label = 'Implement\\nCurriculum B', shape = rectangle, style = filled, fillcolor = lemonchiffon, fontcolor = black, width = 1]
  PostB [label = 'Post-Test:\\nGroup B', shape = rectangle, style = filled, fillcolor = lemonchiffon, fontcolor = black, width = 1]
  FollowUpB [label = 'Follow-Up\\nTest:\\nGroup B', shape = rectangle, style = filled, fillcolor = lemonchiffon, fontcolor = black, width = 1]

  Analyze [label = 'Run\\nStatistical\\nTests', shape = rectangle, style = filled, fillcolor = lightblue, fontcolor = black, width = 1]

  Population -> Sample
  Sample -> Split
  Split -> PreA
  PreA -> ImplementA
  ImplementA -> PostA
  PostA -> FollowUpA
  FollowUpA -> Analyze
  Split -> PreB
  PreB -> ImplementB
  ImplementB -> PostB
  PostB -> FollowUpB
  FollowUpB -> Analyze

  {rank = same; PreA; PreB}
  {rank = same; ImplementA; ImplementB}
  {rank = same; PostA; PostB}
  {rank = same; FollowUpA; FollowUpB}
}
")

svg_text <- export_svg(diagram)
temp_svg <- tempfile(fileext = ".svg")
writeLines(svg_text, temp_svg)

folder <- "R figures 2"
if(!dir.exists(folder)) dir.create(folder)
tiff_path <- file.path(folder, "figure5.1.tif")

img <- image_read(temp_svg)
image_write(img, path = tiff_path, format = "tiff", density = "300x300")

cat("Figure 5.1 saved as TIFF at:", tiff_path, "\n")
## Figure 5.1 saved as TIFF at: R figures 2/figure5.1.tif
diagram

Figure 5.2: Probabilistic and non-probabilistic sampling methods

library(DiagrammeR)
library(DiagrammeRsvg)
library(magick)

# Create folder for output
folder <- "R figures 2"
if(!dir.exists(folder)) dir.create(folder)

# --- Create Diagrams ---

# Probabilistic Sampling Diagram
prob_diagram <- grViz("
digraph probabilistic_sampling {
  graph [rankdir=TB]
  node [shape=rectangle, style=filled, fontname=Helvetica, fontsize=10]
  probabilistic [label='Probabilistic Sampling\n(High External Validity)', shape=ellipse, fillcolor=palegreen, width=2.5]
  true_random [label='True Random\nSampling\n(Highest External\nValidity)']
  stratified_random [label='Stratified Random\nSampling']
  systematic [label='Systematic Sampling']
  cluster [label='Cluster Sampling']
  probabilistic -> true_random
  probabilistic -> stratified_random
  probabilistic -> systematic
  probabilistic -> cluster
}
")

# Non-Probabilistic Sampling Diagram
non_prob_diagram <- grViz("
digraph non_probabilistic_sampling {
  graph [rankdir=TB]
  node [shape=rectangle, style=filled, fontname=Helvetica, fontsize=10]
  non_probabilistic [label='Non-Probabilistic Sampling\n(Low External Validity)', shape=ellipse, fillcolor=lightcoral, width=2.5]
  convenience [label='Convenience Sampling']
  voluntary [label='Voluntary Sampling']
  purposive [label='Purposive Sampling']
  non_probabilistic -> convenience
  non_probabilistic -> voluntary
  non_probabilistic -> purposive
}
")

# --- Export SVGs to temporary files ---
prob_svg <- export_svg(prob_diagram)
prob_file <- file.path(folder, "probabilistic_sampling.svg")
writeLines(prob_svg, prob_file)

non_prob_svg <- export_svg(non_prob_diagram)
non_prob_file <- file.path(folder, "non_probabilistic_sampling.svg")
writeLines(non_prob_svg, non_prob_file)

# --- Read SVGs into magick images ---
prob_image <- image_read(prob_file)
non_prob_image <- image_read(non_prob_file)

# --- STACKED VERSION ---

# Make images same width
max_width <- max(image_info(prob_image)$width, image_info(non_prob_image)$width)
prob_image <- image_extent(prob_image, geometry = paste0(max_width, "x"), gravity = "center")
non_prob_image <- image_extent(non_prob_image, geometry = paste0(max_width, "x"), gravity = "center")

# Optional spacing
prob_image <- image_border(prob_image, color = "white", geometry = "0x10")
non_prob_image <- image_border(non_prob_image, color = "white", geometry = "0x10")

# Stack vertically
combined_stack <- image_append(c(prob_image, non_prob_image), stack = TRUE)



# Make images same height
max_height <- max(image_info(prob_image)$height, image_info(non_prob_image)$height)
prob_image <- image_extent(prob_image, geometry = paste0("x", max_height), gravity = "center")
non_prob_image <- image_extent(non_prob_image, geometry = paste0("x", max_height), gravity = "center")

# Optional spacing
prob_image <- image_border(prob_image, color = "white", geometry = "10x0")
non_prob_image <- image_border(non_prob_image, color = "white", geometry = "10x0")

# Combine side-by-side
combined_side <- image_append(c(prob_image, non_prob_image), stack = FALSE)

# --- Save stacked TIFF at 300 dpi ---
tiff_stack_path <- file.path(folder, "figure5.2_stack.tif")
image_write(combined_stack, path = tiff_stack_path, format = "tiff", density = "300x300")

# --- Save side-by-side TIFF at 300 dpi ---
tiff_side_path <- file.path(folder, "figure5.2_side_by_side.tif")
image_write(combined_side, path = tiff_side_path, format = "tiff", density = "300x300")

# --- Print paths ---
cat("Figure 5.2 saved as TIFFs:\nStacked:", tiff_stack_path, "\nSide-by-side:", tiff_side_path, "\n")
## Figure 5.2 saved as TIFFs:
## Stacked: R figures 2/figure5.2_stack.tif 
## Side-by-side: R figures 2/figure5.2_side_by_side.tif
prob_diagram
non_prob_diagram

Figure 5.3: Factor analysis chart

library(DiagrammeR)

# Create folder for output
folder <- "R figures 2"
if(!dir.exists(folder)) dir.create(folder)

# --- Create the diagram ---
diagram_5_3<- grViz("
digraph internal_structure {
  graph [rankdir = TB, nodesep=0.4]

  node [shape = rectangle, style = filled, fillcolor = lightblue, fontname = Helvetica]
  Scale [label = 'Entire Scale', shape = box, fillcolor = lightcoral]

  node [shape = ellipse, fillcolor = lightgoldenrodyellow]
  Factor1 [label = 'Factor 1']
  Factor2 [label = 'Factor 2']

  node [shape = rectangle, fillcolor = lightgray]
  Item1_F1 [label = 'Item 1']
  Item2_F1 [label = 'Item 2']
  Item3_F1 [label = 'Item 3']

  Item1_F2 [label = 'Item 4']
  Item2_F2 [label = 'Item 5']
  Item3_F2 [label = 'Item 6']

  # Define edges
  Scale -> Factor1
  Scale -> Factor2

  Factor1 -> Item1_F1
  Factor1 -> Item2_F1
  Factor1 -> Item3_F1

  Factor2 -> Item1_F2
  Factor2 -> Item2_F2
  Factor2 -> Item3_F2
}
")

# --- Export SVG to temporary file ---
svg_5_3 <- export_svg(diagram_5_3)
svg_file <- file.path(folder, "figure5.3.svg")
writeLines(svg_5_3, svg_file)

# --- Read SVG into magick image ---
img_5_3 <- image_read(svg_file)

# --- Save as TIFF at 300 dpi ---
tiff_path <- file.path(folder, "figure5.3.tif")
image_write(img_5_3, path = tiff_path, format = "tiff", density = "300x300")

# --- Print path ---
cat("Figure 5.3 saved as TIFF at:", tiff_path, "\n")
## Figure 5.3 saved as TIFF at: R figures 2/figure5.3.tif
diagram_5_3

One-tailed and two-tailed tests (Figure 5.4 in book)

# Load necessary libraries
library(ggplot2)
library(patchwork)
library(gridExtra)

# Define the critical values for two-tailed test
alpha <- 0.05  # significance level
z_critical_two_tailed <- qnorm(1 - alpha / 2)  # Two-tailed critical value (±1.96 for alpha = 0.05)

# Generate a sequence of Z-scores
z_scores <- seq(-4, 4, length.out = 1000)

# Compute the normal distribution density values for the null hypothesis (normal distribution)
z_density_null <- dnorm(z_scores)

# Compute the normal distribution density values for the alternative hypothesis (shifted to the right)
z_density_alt <- dnorm(z_scores, mean = 2)  # Shift the distribution to the right

# Create a data frame for plotting
data_null <- data.frame(z_scores, z_density_null)
data_alt <- data.frame(z_scores, z_density_alt)

# Plot for two-tailed test with shifted distribution
plot1 <- ggplot() +
  # Plot the null hypothesis curve (standard normal distribution)
  geom_line(data = data_null, aes(x = z_scores, y = z_density_null), color = "black") +
  # Plot the alternative hypothesis curve (shifted normal distribution)
  geom_line(data = data_alt, aes(x = z_scores, y = z_density_alt), color = "blue") +
  # Shade the rejection region for the two-tailed test (both left and right tails)
  geom_ribbon(data = subset(data_null, z_scores <= -z_critical_two_tailed), 
              aes(x = z_scores, ymin = 0, ymax = z_density_null), fill = "red", alpha = 0.5) +  # Left tail
  geom_ribbon(data = subset(data_null, z_scores >= z_critical_two_tailed), 
              aes(x = z_scores, ymin = 0, ymax = z_density_null), fill = "red", alpha = 0.5) +  # Right tail
  # Add vertical lines for critical values
  geom_vline(xintercept = c(-z_critical_two_tailed, z_critical_two_tailed), color = "black", linetype = "dashed") +
  # Labels and annotations
  annotate("text", x = -z_critical_two_tailed, y = 0.07, label = "Two-Tailed\nRejection Region", color = "black", 
           hjust = 1.2, size = 3) +  # Move text up and decrease font size
  annotate("text", x = z_critical_two_tailed, y = 0.07, label = "Two-Tailed\nRejection Region", color = "black", 
           hjust = -0.2, size = 3) +
  labs(title = "P1: Two-Tailed, Martians > Earthlings", x = "Z-Score", y = "Density") +
  theme_minimal()


# Define the significance level (alpha) and critical values
alpha <- 0.05  
z_critical_two_tailed <- qnorm(1 - alpha / 2)  # Two-tailed critical value (~±1.96 for alpha = 0.05)

# Generate a sequence of Z-scores
z_scores <- seq(-4, 4, length.out = 1000)

# Compute density values for null and alternative hypotheses
z_density_null <- dnorm(z_scores)  # Standard normal distribution
z_density_alt <- dnorm(z_scores, mean = -2)  # Shifted left

# Create data frames for plotting
data_null <- data.frame(z_scores, z_density_null)
data_alt <- data.frame(z_scores, z_density_alt)

# Create the plot
plot2<- ggplot() +
  # Plot the null hypothesis curve
  geom_line(data = data_null, aes(x = z_scores, y = z_density_null), color = "black") +
  # Plot the alternative hypothesis curve
  geom_line(data = data_alt, aes(x = z_scores, y = z_density_alt), color = "blue") +
  # Shade the left tail rejection region
  geom_ribbon(data = subset(data_null, z_scores <= -z_critical_two_tailed), 
              aes(x = z_scores, ymin = 0, ymax = z_density_null), fill = "red", alpha = 0.5) +
  # Shade the right tail rejection region
  geom_ribbon(data = subset(data_null, z_scores >= z_critical_two_tailed), 
              aes(x = z_scores, ymin = 0, ymax = z_density_null), fill = "red", alpha = 0.5) +
  # Add vertical dashed lines for critical values
  geom_vline(xintercept = c(-z_critical_two_tailed, z_critical_two_tailed), color = "black", linetype = "dashed") +
  # Add annotations for rejection regions
  annotate("text", x = -z_critical_two_tailed, y = 0.07, label = "Two-Tailed\nRejection Region", color = "black", 
           hjust = 1.2, size = 3) +  # Move text up and decrease font size
  annotate("text", x = z_critical_two_tailed, y = 0.07, label = "Two-Tailed\nRejection Region", color = "black", 
           hjust = -0.2, size = 3) +
  # Labels and styling
  labs(title = "P2: Two-Tailed, Martians < Earthlings", x = "Z-Score", y = "Density") +
  theme_minimal()


# Define the critical value for one-tailed test
alpha <- 0.05  # significance level
z_critical_one_tailed <- qnorm(1 - alpha)  # One-tailed critical value (z_critical = 1.645 for alpha = 0.05)

# Generate a sequence of Z-scores
z_scores <- seq(-4, 4, length.out = 1000)

# Compute the normal distribution density values for the null hypothesis (normal distribution)
z_density_null <- dnorm(z_scores)

# Compute the normal distribution density values for the alternative hypothesis (shifted to the right)
z_density_alt <- dnorm(z_scores, mean = 2)  # Shift the distribution to the right

# Create a data frame for plotting
data_null <- data.frame(z_scores, z_density_null)
data_alt <- data.frame(z_scores, z_density_alt)

# Plot for one-tailed test with shifted distribution to the right
plot3 <- ggplot() +
  # Plot the null hypothesis curve (standard normal distribution)
  geom_line(data = data_null, aes(x = z_scores, y = z_density_null), color = "black") +
  # Plot the alternative hypothesis curve (shifted normal distribution to the right)
  geom_line(data = data_alt, aes(x = z_scores, y = z_density_alt), color = "blue") +
  # Shade the rejection region for the one-tailed test (right tail)
  geom_ribbon(data = subset(data_null, z_scores >= z_critical_one_tailed), 
              aes(x = z_scores, ymin = 0, ymax = z_density_null), fill = "blue", alpha = 0.5) +  # Right tail
  # Add vertical line for the one-tailed critical value
  geom_vline(xintercept = z_critical_one_tailed, color = "black", linetype = "dashed") +
  # Labels and annotations
  annotate("text", x = z_critical_one_tailed, y = 0.07, label = "One-Tailed\nRejection Region", color = "black", hjust = -0.2, size = 3) +
  labs(title = "P3: One-Tailed, Martians > Earthlings", x = "Z-Score", y = "Density") +
  theme_minimal()

# Define the critical values for two-tailed test
alpha <- 0.05  # significance level
z_critical_two_tailed <- qnorm(1 - alpha / 2)  # Two-tailed critical value (±1.96 for alpha = 0.05)

# Generate a sequence of Z-scores
z_scores <- seq(-4, 4, length.out = 1000)

# Compute the normal distribution density values for the null hypothesis (normal distribution)
z_density_null <- dnorm(z_scores)

# Compute the normal distribution density values for the alternative hypothesis (shifted to the right)
# Decrease the shift to increase the overlap
z_density_alt <- dnorm(z_scores, mean = 0.3)  # Slightly shifted distribution

# Create a data frame for plotting
data_null <- data.frame(z_scores, z_density_null)
data_alt <- data.frame(z_scores, z_density_alt)

# Plot for more overlap between null and alternative distributions
plot4 <- ggplot() +
  # Plot the null hypothesis curve (standard normal distribution)
  geom_line(data = data_null, aes(x = z_scores, y = z_density_null), color = "black", alpha = 0.6) +  # Slight transparency for overlap
  # Plot the alternative hypothesis curve (shifted normal distribution)
  geom_line(data = data_alt, aes(x = z_scores, y = z_density_alt), color = "blue", alpha = 0.6) +  # Slight transparency for overlap
  # Add vertical lines for critical values
  geom_vline(xintercept = c(-z_critical_two_tailed, z_critical_two_tailed), color = "black", linetype = "dashed") +
  # Add a label indicating no statistical significance
  annotate("text", x = 0, y = 0.15, label = "Null Hypothesis Accepted", color = "black", size = 4, fontface = "bold") +  # Centered annotation
  labs(title = "P4: Earthlings = Martians", x = "Z-Score", y = "Density") +
  theme_minimal() +
  theme(
    plot.title = element_text(hjust = 0.5),  # Center the title
    axis.title.x = element_text(size = 12),
    axis.title.y = element_text(size = 12)
  )


# Create a list of plots
plots <- list(plot1, plot2, plot3, plot4)

# Use grid.arrange to arrange the plots in a 2x2 grid with specified heights
grid.arrange(grobs = plots, ncol = 2, nrow = 2, heights = c(1, 1), widths = c(.5, .5))

combined_plot <- grid.arrange(plot1, plot2, plot3, plot4, ncol = 2, nrow = 2)

# --- Save the combined plot as TIFF at 300 dpi ---
folder <- "R figures 2"
if(!dir.exists(folder)) dir.create(folder)
tiff_path <- file.path(folder, "figure5.4.tif")

# Use ggsave for high-res TIFF export
ggsave(
  filename = tiff_path,
  plot = combined_plot,
  device = "tiff",
  dpi = 300,
  width = 10,    # width in inches
  height = 8     # height in inches
)

cat("Figure 5.4 saved as TIFF at:", tiff_path, "\n")
## Figure 5.4 saved as TIFF at: R figures 2/figure5.4.tif
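
The critical values and shaded tail areas in these plots come directly from qnorm() and pnorm(). A minimal sketch, using a hypothetical observed z of 2.2 for illustration:

# Critical values at alpha = .05
alpha <- 0.05
qnorm(1 - alpha)      # one-tailed cutoff, ~1.645
qnorm(1 - alpha / 2)  # two-tailed cutoff, ~1.96

# p-values for a hypothetical observed z-score
z_obs <- 2.2
pnorm(z_obs, lower.tail = FALSE)           # one-tailed p
2 * pnorm(abs(z_obs), lower.tail = FALSE)  # two-tailed p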

Chi-square test of race and union membership

library(summarytools)
library(dplyr)
library(ggplot2)
library(vcd)

# Select relevant columns from the dataset
chidata <- CPS85 %>%
  dplyr::select(race, union)
# Generate contingency table
chitable <- table(chidata$race, chidata$union)

# Run chi-square test
chi_test <- chisq.test(chitable)
chi_test
## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  chitable
## X-squared = 3.4442, df = 1, p-value = 0.06348
# Convert the expected counts matrix to a data frame
expected <- as.data.frame(chi_test$expected)

# Extract row names and make them a column
expected$race <- rownames(expected)

# Move 'race' column to the front (if desired)
expected <- expected %>%
  dplyr::select(race, everything())

# Add 'type' column to indicate that these are expected counts
expected <- expected %>%
  mutate(type = "Expected")

# Convert the contingency table to a data frame
chitable_df <- as.data.frame(chitable)%>%
  rename(race = Var1, union = Var2)

observed <- chitable_df %>%
  dplyr::select(race, union, Freq) %>%
  pivot_wider(names_from = union, values_from = Freq)%>%
  mutate(type = "Observed")

overalltable<- rbind(observed, expected)

print("contengency table")
## [1] "contengency table"
chitable
##     
##      Not Union
##   NW  49    18
##   W  389    78
print("Expected counts")
## [1] "Expected counts"
chi_test$expected
##     
##            Not    Union
##   NW  54.95506 12.04494
##   W  383.04494 83.95506
overalltable
## # A tibble: 4 × 4
##   race    Not Union type    
## * <fct> <dbl> <dbl> <chr>   
## 1 NW     49    18   Observed
## 2 W     389    78   Observed
## 3 NW     55.0  12.0 Expected
## 4 W     383.   84.0 Expected
print("Chi-square test results:")
## [1] "Chi-square test results:"
print(chi_test)
## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  chitable
## X-squared = 3.4442, df = 1, p-value = 0.06348
# Calculate Cramér's V or Phi (for 2x2)
print("Effect size metrics, this uses phi because it is 2x2")
## [1] "Effect size metrics, this uses phi because it is 2x2"
assocstats(chitable)
##                     X^2 df P(> X^2)
## Likelihood Ratio 3.7469  1 0.052906
## Pearson          4.1045  1 0.042770
## 
## Phi-Coefficient   : 0.088 
## Contingency Coeff.: 0.087 
## Cramer's V        : 0.088
write.csv(overalltable, "chi.csv")
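
The phi coefficient reported by assocstats() can be checked by hand: for a 2x2 table, phi = sqrt(X^2 / N), using the uncorrected Pearson chi-square. A minimal verification:

# Manual phi from the uncorrected Pearson chi-square
chi_uncorrected <- chisq.test(chitable, correct = FALSE)
sqrt(unname(chi_uncorrected$statistic) / sum(chitable))  # ~0.088, as above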

Generate a plot of the counts and proportions given categories (Figure 6.1 in book)

# Load necessary libraries
library(ggplot2)
library(dplyr)
library(scales)

# Create folder for figures if it doesn't exist
folder <- "R figures 2"
if(!dir.exists(folder)) dir.create(folder)

# Prepare the data
chitable_plot <- as.data.frame(chitable) %>%
  rename(Race = Var1, Union = Var2) %>%
  group_by(Race) %>%
  mutate(Proportion = Freq / sum(Freq))

# Create the ggplot
fig6_1 <- ggplot(chitable_plot, aes(x = Race, y = Freq, fill = Union)) +
  geom_bar(stat = "identity", position = "dodge") +
  geom_text(aes(label = percent(Proportion)), 
            position = position_dodge(width = 0.8), 
            vjust = -0.5, size = 3) +
  geom_text(aes(label = Freq), 
            position = position_dodge(width = 0.8), 
            vjust = 1.5, size = 3, color = "white") +
  labs(title = "Contingency Table: Race vs Union",
       x = "Race",
       y = "Count") +
  scale_fill_brewer(palette = "Set1") +
  theme_minimal()

# Save as TIFF (300 dpi)
tiff6_1 <- file.path(folder, "Figure6.1.tif")
ggsave(filename = tiff6_1, plot = fig6_1, device = "tiff", dpi = 300, width = 8, height = 6)

cat("Figure 6.1 saved as TIFF at:", tiff6_1, "\n")
## Figure 6.1 saved as TIFF at: R figures 2/Figure6.1.tif
fig6_1

Independent samples t-test bell curves and box plots (Figure 7.1 in book)

# Load necessary libraries
library(ggplot2)
library(gridExtra)

# Simulated data for non-significant difference (Earthlings vs Martians)
nonsig_data <- data.frame(
  ACT_Score = c(rnorm(100, mean = 23, sd = 4), rnorm(100, mean = 22, sd = 4)),
  Group = rep(c("Earthlings", "Martians"), each = 100)
)

# Simulated data for significant difference (Earthlings vs Martians)
sig_data <- data.frame(
  HS_GPA = c(rnorm(100, mean = 3.5, sd = 0.3), rnorm(100, mean = 2.5, sd = 0.3)),
  Group = rep(c("Earthlings", "Martians"), each = 100)
)

# Boxplot non-significant difference (ACT scores)
p1 <- ggplot(nonsig_data, aes(x = Group, y = ACT_Score, fill = Group)) +
  geom_boxplot(alpha = 0.6) +
  ggtitle("Non-Significant Difference (Earthlings vs Martians)") +
  ylab("ACT Score") +
  xlab("Group") +
  theme_minimal()

# Boxplot significant difference (HS GPA)
p2 <- ggplot(sig_data, aes(x = Group, y = HS_GPA, fill = Group)) +
  geom_boxplot(alpha = 0.6) +
  ggtitle("Significant Difference (Earthlings vs Martians)") +
  ylab("HS GPA") +
  xlab("Group") +
  theme_minimal()

# Density plot non-significant difference (ACT scores)
p3 <- ggplot(nonsig_data, aes(x = ACT_Score, fill = Group)) +
  geom_density(alpha = 0.6) +
  ggtitle("Non-Significant Difference (ACT Scores - Earthlings vs Martians)") +
  xlab("ACT Score") +
  theme_minimal()

# Density plot significant difference (HS GPA)
p4 <- ggplot(sig_data, aes(x = HS_GPA, fill = Group)) +
  geom_density(alpha = 0.6) +
  ggtitle("Significant Difference (HS GPA - Earthlings vs Martians)") +
  xlab("HS GPA") +
  theme_minimal()

# Arrange boxplot and density plot for non-significant difference
g1<-grid.arrange(p1, p3, ncol = 1)

# Arrange boxplot and density plot for significant difference
g2<-grid.arrange(p2, p4, ncol = 1)

grid.arrange(g1, g2, ncol = 2)

final_plot <- grid.arrange(g1, g2, ncol = 2)

# --- Save as TIFF 300 dpi ---
folder <- "R figures 2"
if(!dir.exists(folder)) dir.create(folder)
tiff_path <- file.path(folder, "figure7.1.tif")

ggsave(
  filename = tiff_path,
  plot = final_plot,
  device = "tiff",
  dpi = 300,
  width = 12,   # adjust width as needed
  height = 6    # adjust height as needed
)

cat("Figure 7.1 saved as TIFF at:", tiff_path, "\n")
## Figure 7.1 saved as TIFF at: R figures 2/figure7.1.tif

Independent samples t-test (Figure 7.2 in book)

# Independent samples t test descriptive statistics for table 7.1
descriptives_south <- psych::describeBy(CPS85$age, group = CPS85$married, mat = TRUE)

# Convert the matrix into a data frame
descriptives_south<- as.data.frame(descriptives_south)

write.csv(descriptives_south, "descriptives_south.csv")

descriptives_south
##     item  group1 vars   n     mean       sd median  trimmed     mad min max
## X11    1 Married    1 350 39.20286 11.06721     37 38.67500 11.8608  20  64
## X12    2  Single    1 184 32.32609 11.65169     29 30.70946 10.3782  18  64
##     range      skew   kurtosis        se
## X11    44 0.4124848 -0.7844761 0.5915671
## X12    46 1.0927423  0.5295034 0.8589742
# Run normality tests on age by marital status using the Shapiro-Wilk test and plots
CPS85$married <- as.factor(CPS85$married)

shapiro.test(CPS85$age[CPS85$married == "Married"])
## 
##  Shapiro-Wilk normality test
## 
## data:  CPS85$age[CPS85$married == "Married"]
## W = 0.96039, p-value = 3.999e-08
shapiro.test(CPS85$age[CPS85$married == "Single"])
## 
##  Shapiro-Wilk normality test
## 
## data:  CPS85$age[CPS85$married == "Single"]
## W = 0.89074, p-value = 2.348e-10
# Load necessary libraries
library(ggplot2)
library(dplyr)

# Create folder for figures if it doesn't exist
folder <- "R figures 2"
if(!dir.exists(folder)) dir.create(folder)

# Ensure 'married' is a factor
CPS85$married <- as.factor(CPS85$married)

# Create the plot
fig7_2 <- ggplot(CPS85, aes(x = age, fill = married)) +
  geom_histogram(aes(y = ..density..), bins = 30, alpha = 0.5, position = "identity", color = "black") +
  geom_density(alpha = 0.7) +
  facet_wrap(~married) +
  theme_minimal() +
  labs(title = "Age Distribution by Marital Status",
       x = "Age",
       y = "Density") +
  scale_fill_brewer(palette = "Set2")

# Save as TIFF (300 dpi)
tiff7_2 <- file.path(folder, "Figure7.2.tif")
ggsave(filename = tiff7_2, plot = fig7_2, device = "tiff", dpi = 300, width = 8, height = 6)

cat("Figure 7.2 saved as TIFF at:", tiff7_2, "\n")
## Figure 7.2 saved as TIFF at: R figures 2/Figure7.2.tif
fig7_2

Check Levene’s test for homogeneity of variance, then run the t-test and Cohen’s d

#Levene's test of equality of variance
leveneTest(age ~ married, data = CPS85)
## Levene's Test for Homogeneity of Variance (center = median)
##        Df F value Pr(>F)
## group   1  0.1425  0.706
##       532
#Run t-test in R assuming equality of variance
t_test_result <- t.test(age ~ married, data = CPS85, var.equal = TRUE)
t_test_result
## 
##  Two Sample t-test
## 
## data:  age by married
## t = 6.6999, df = 532, p-value = 5.323e-11
## alternative hypothesis: true difference in means between group Married and group Single is not equal to 0
## 95 percent confidence interval:
##  4.860477 8.893063
## sample estimates:
## mean in group Married  mean in group Single 
##              39.20286              32.32609
#Run t-test in R if variances not equal
t_test_result2 <- t.test(age ~ married, data = CPS85, var.equal = FALSE)
t_test_result2
## 
##  Welch Two Sample t-test
## 
## data:  age by married
## t = 6.5934, df = 355.79, p-value = 1.555e-10
## alternative hypothesis: true difference in means between group Married and group Single is not equal to 0
## 95 percent confidence interval:
##  4.825607 8.927933
## sample estimates:
## mean in group Married  mean in group Single 
##              39.20286              32.32609
# Run Cohen's d
cohen_d_result <- CPS85 %>%
  rstatix::cohens_d(age ~ married)

# View the result
cohen_d_result
## # A tibble: 1 × 7
##   .y.   group1  group2 effsize    n1    n2 magnitude
## * <chr> <chr>   <chr>    <dbl> <int> <int> <ord>    
## 1 age   Married Single   0.605   350   184 moderate
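
To see where the 0.605 comes from: with its default var.equal = FALSE, rstatix::cohens_d divides the mean difference by the square root of the average of the two group variances. A minimal manual check, assuming that averaging convention (the classic pooled-SD formula gives a very similar value, about 0.61):

# Manual Cohen's d matching the rstatix var.equal = FALSE convention
m <- tapply(CPS85$age, CPS85$married, mean)
v <- tapply(CPS85$age, CPS85$married, var)
unname((m["Married"] - m["Single"]) / sqrt((v["Married"] + v["Single"]) / 2))  # ~0.605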

Mann-Whitney U: the non-parametric alternative to the t-test, with effect size

# Run Mann-Whitney U test (Wilcoxon rank-sum test)
mann_whitney_result <- wilcox.test(age ~ married, data = CPS85, exact = FALSE)
mann_whitney_result
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  age by married
## W = 44398, p-value = 5.971e-13
## alternative hypothesis: true location shift is not equal to 0
# Calculating effect size for U
# Compute Mann-Whitney U statistic manually (SPSS-style)

compute_U <- function(group1, group2) {
  combined_data <- c(group1, group2)
  ranks <- rank(combined_data)
  
  rank_sum1 <- sum(ranks[1:length(group1)])
  rank_sum2 <- sum(ranks[(length(group1)+1):length(combined_data)])
  
  U1 <- rank_sum1 - length(group1)*(length(group1) + 1)/2
  U2 <- rank_sum2 - length(group2)*(length(group2) + 1)/2
  
  return(min(U1, U2))  # Smaller U value matches SPSS
}

# Define groups correctly for Mann-Whitney U
group1 <- CPS85$age[CPS85$married == "Married"]
group2 <- CPS85$age[CPS85$married == "Single"]

# Compute U statistic manually
U_value <- compute_U(group1, group2)
print(paste("Mann-Whitney U (Custom Calculation):", U_value))
## [1] "Mann-Whitney U (Custom Calculation): 20002.5"
# Compute effect size (r = Z / sqrt(n))
# Get sample sizes
n1 <- length(group1)
n2 <- length(group2)
n <- n1 + n2

# Approximate Z from the Mann-Whitney U test
Z <- qnorm(1 - mann_whitney_result$p.value / 2)  # Two-tailed assumption

# Compute effect size
effect_size_r <- abs(Z) / sqrt(n)
print(paste("Effect size (r):", round(effect_size_r, 3)))
## [1] "Effect size (r): 0.312"

One-way ANOVA boxplots and bell curves (Figure 7.3 in book)

library(ggplot2)
library(gridExtra)
library(psych)
set.seed(123)

# Simulated data for non-significant difference (Earthlings vs Venusians vs Martians)
nonsig_data <- data.frame(
  ACT_Score = c(rnorm(100, mean = 23, sd = 4), rnorm(100, mean = 24, sd = 4), rnorm(100, mean = 22, sd = 4)),
  Group = rep(c("Earthlings", "Venusians", "Martians"), each = 100)
)

# Simulated data for significant difference (Earthlings vs Martians vs Venusians)
sig_data <- data.frame(
  HS_GPA = c(rnorm(100, mean = 3.5, sd = 0.3), rnorm(100, mean = 3.0, sd = 0.3), rnorm(100, mean = 2.5, sd = 0.3)),
  Group = rep(c("Earthlings", "Venusians", "Martians"), each = 100)
)

# Boxplot non-significant difference (ACT scores)
p1 <- ggplot(nonsig_data, aes(x = Group, y = ACT_Score, fill = Group)) +
  geom_boxplot(alpha = 0.6) +
  ggtitle("Non-Sig. Dif. (ACT)") +
  ylab("ACT Score") +
  xlab("Group") +
  theme_minimal()

# Boxplot significant difference (HS GPA)
p2 <- ggplot(sig_data, aes(x = Group, y = HS_GPA, fill = Group)) +
  geom_boxplot(alpha = 0.6) +
  ggtitle("Sig. Dif. (HS GPA)") +
  ylab("HS GPA") +
  xlab("Group") +
  theme_minimal()

# Density plot non-significant difference (ACT scores)
p3 <- ggplot(nonsig_data, aes(x = ACT_Score, fill = Group)) +
  geom_density(alpha = 0.6) +
  ggtitle("Non-Sign. Dif. (ACT Scores)") +
  xlab("ACT Score") +
  theme_minimal()

# Density plot significant difference (HS GPA)
p4 <- ggplot(sig_data, aes(x = HS_GPA, fill = Group)) +
  geom_density(alpha = 0.6) +
  ggtitle("Sig. Dif. (HS GPA)") +
  xlab("HS GPA") +
  theme_minimal()

# Arrange boxplot and density plot for non-significant difference
g1<- grid.arrange(p1, p3, ncol = 1)

# Arrange boxplot and density plot for significant difference
g2<- grid.arrange(p2, p4, ncol = 1)

grid.arrange(g1, g2, ncol =2)

final_plot <- grid.arrange(g1, g2, ncol = 2)

# --- Save as TIFF 300 dpi ---
folder <- "R figures 2"
if(!dir.exists(folder)) dir.create(folder)
tiff_path <- file.path(folder, "figure7.3.tif")

ggsave(
  filename = tiff_path,
  plot = final_plot,
  device = "tiff",
  dpi = 300,
  width = 12,  # adjust width as needed
  height = 6   # adjust height as needed
)

cat("Figure 7.3 saved as TIFF at:", tiff_path, "\n")
## Figure 7.3 saved as TIFF at: R figures 2/figure7.3.tif

One-way ANOVA on wages among managers, professionals, and sales workers

Statistical assumptions and descriptives

Figure 7.4 as a Q-Q plot

# Prepare the dataset
anovadata<- CPS85%>%
filter(sector %in% c("manag", "prof", "sales"))%>%
dplyr::select(wage, sector)
anovadata$sector<- as.character(anovadata$sector)

# Get descriptive statistics by profession
descriptives_profession <- psych::describeBy(anovadata$wage, group = anovadata$sector, mat = TRUE)

# Convert the matrix into a data frame
descriptives_profession<- as.data.frame(descriptives_profession)

write.csv(descriptives_profession, "descriptives_profession.csv")
descriptives_profession
##     item group1 vars   n      mean       sd median   trimmed      mad  min
## X11    1  manag    1  55 12.704000 7.572513 10.620 11.966222 6.493788 1.00
## X12    2   prof    1 105 11.947429 5.523833 10.610 11.360706 5.352186 4.35
## X13    3  sales    1  38  7.592632 4.232272  5.725  7.157813 2.891070 3.35
##       max range      skew    kurtosis        se
## X11 44.50 43.50 1.4730872  3.67335914 1.0210774
## X12 24.98 20.63 0.7727855 -0.36441619 0.5390709
## X13 19.98 16.63 0.9778591  0.01196841 0.6865651
# Shapiro-Wilk test for normality
aov_model <- aov(wage ~ sector, data = anovadata)
shapiro.test(residuals(aov_model))
## 
##  Shapiro-Wilk normality test
## 
## data:  residuals(aov_model)
## W = 0.91874, p-value = 5.393e-09
# QQ Plot for normality
qqnorm(residuals(aov_model))
qqline(residuals(aov_model), col = "red")

# Levene's test for homogeneity of variance
library(car)
leveneTest(wage ~ sector, data = anovadata)
## Levene's Test for Homogeneity of Variance (center = median)
##        Df F value Pr(>F)  
## group   2  3.2421 0.0412 *
##       195                 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Bartlett's test of homogeneity of variance
bartlett.test(wage ~ sector, data = anovadata)
## 
##  Bartlett test of homogeneity of variances
## 
## data:  wage by sector
## Bartlett's K-squared = 15.317, df = 2, p-value = 0.000472
# Create folder for figures if it doesn't exist
folder <- "R figures 2"
if(!dir.exists(folder)) dir.create(folder)

tiff7_4 <- file.path(folder, "Figure7.4.tif")

# Save Q-Q plot to TIFF
tiff(filename = tiff7_4, width = 6, height = 6, units = "in", res = 300)
qqnorm(residuals(aov_model), main = "Q-Q Plot of Differences")
qqline(residuals(aov_model), col = "red", lwd = 2)
dev.off()
## png 
##   2
cat("Figure 7.4 saved as TIFF at:", tiff7_4, "\n")
## Figure 7.4 saved as TIFF at: R figures 2/Figure7.4.tif

One-Way ANOVA

Omnibus

Post-hoc

Cohen’s d effect size on significant pairwise tests

# Be sure to load the library
library(apaTables)
library(dplyr)
library(tidyverse)
library(rstatix)

# Run ANOVA
anova_model <- aov(wage~sector, data = anovadata)

# Export the APA table to your folder
apa.aov.table(anova_model, filename = "anova_results.doc")
## 
## 
## ANOVA results using wage as the dependent variable
##  
## 
##    Predictor      SS  df      MS      F    p partial_eta2 CI_90_partial_eta2
##  (Intercept) 8876.54   1 8876.54 249.68 .000                                
##       sector  674.63   2  337.31   9.49 .000          .09         [.03, .15]
##        Error 6932.59 195   35.55                                            
## 
## Note: Values in square brackets indicate the bounds of the 90% confidence interval for partial eta-squared
summary(anova_model)
##              Df Sum Sq Mean Sq F value   Pr(>F)    
## sector        2    675   337.3   9.488 0.000117 ***
## Residuals   195   6933    35.6                     
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Run post-hoc tests
# Tukey if Equal Variances are Assumed
TukeyHSD(anova_model)
##   Tukey multiple comparisons of means
##     95% family-wise confidence level
## 
## Fit: aov(formula = wage ~ sector, data = anovadata)
## 
## $sector
##                   diff       lwr       upr     p adj
## prof-manag  -0.7565714 -3.100521  1.587379 0.7265306
## sales-manag -5.1113684 -8.081890 -2.140847 0.0002059
## sales-prof  -4.3547970 -7.020710 -1.688884 0.0004545
# Games-Howell if equal variances are not assumed
anovadata %>%
  games_howell_test(wage ~ sector)
## # A tibble: 3 × 8
##   .y.   group1 group2 estimate conf.low conf.high      p.adj p.adj.signif
## * <chr> <chr>  <chr>     <dbl>    <dbl>     <dbl>      <dbl> <chr>       
## 1 wage  manag  prof     -0.757    -3.51      2.00 0.79       ns          
## 2 wage  manag  sales    -5.11     -8.04     -2.18 0.000222   ***         
## 3 wage  prof   sales    -4.35     -6.44     -2.27 0.00000942 ****
# Run Cohen's d effect size
# Subset your data for Cohen's d
cohen_data_sales_manag<- anovadata%>%
  filter(sector %in% c("sales", "manag"))
cohen_data_sales_prof<- anovadata%>%
  filter(sector %in% c("sales", "prof"))

# Run Cohen's d on subset data
 rstatix::cohens_d(cohen_data_sales_manag, wage ~ sector)
## # A tibble: 1 × 7
##   .y.   group1 group2 effsize    n1    n2 magnitude
## * <chr> <chr>  <chr>    <dbl> <int> <int> <ord>    
## 1 wage  manag  sales    0.833    55    38 large
 rstatix::cohens_d(cohen_data_sales_prof, wage ~ sector)
## # A tibble: 1 × 7
##   .y.   group1 group2 effsize    n1    n2 magnitude
## * <chr> <chr>  <chr>    <dbl> <int> <int> <ord>    
## 1 wage  prof   sales    0.885   105    38 large

Kruskal-Wallis non-parametric ANOVA test

Including a Dunn post-hoc test

library(dunn.test)

# Run the omnibus KW test
kwtest<-kruskal.test(wage ~ sector, data = anovadata)
print(kwtest)
## 
##  Kruskal-Wallis rank sum test
## 
## data:  wage by sector
## Kruskal-Wallis chi-squared = 22.533, df = 2, p-value = 1.28e-05
# Create ranks and summarize by group
wage_ranks <- anovadata %>%
  mutate(Rank = rank(wage)) %>%
  group_by(sector) %>%
  summarise(N = n(), Mean_Rank = mean(Rank), Sum_Rank = sum(Rank))
wage_ranks
## # A tibble: 3 × 4
##   sector     N Mean_Rank Sum_Rank
##   <chr>  <int>     <dbl>    <dbl>
## 1 manag     55     110.     6032.
## 2 prof     105     109.    11395 
## 3 sales     38      59.9    2274.
# Run the post-hoc Dunn test
dt<- dunn.test(anovadata$wage, anovadata$sector, kw = TRUE, label = TRUE)
##   Kruskal-Wallis rank sum test
## 
## data: x and group
## Kruskal-Wallis chi-squared = 22.5328, df = 2, p-value = 0
## 
## 
##                            Comparison of x by group                            
##                                 (No adjustment)                                
## Col Mean-|
## Row Mean |      manag       prof
## ---------+----------------------
##     prof |   0.119529
##          |     0.4524
##          |
##    sales |   4.121490   4.487320
##          |    0.0000*    0.0000*
## 
## alpha = 0.05
## Reject Ho if p <= alpha/2
dt
## $chi2
## [1] 22.53284
## 
## $Z
## [1] 0.1195295 4.1214907 4.4873207
## 
## $P
## [1] 4.524279e-01 1.882143e-05 3.606224e-06
## 
## $P.adjusted
## [1] 4.524279e-01 1.882143e-05 3.606224e-06
## 
## $comparisons
## [1] "manag - prof"  "manag - sales" "prof - sales"
# Run eta-squared for omnibus effect size
H <- kwtest$statistic
k <- length(unique(anovadata$sector))
N <- nrow(anovadata)

eta_squared <- (H - k + 1) / (N - k)

# Rename the output explicitly
names(eta_squared) <- "eta_squared"
eta_squared
## eta_squared 
##   0.1052966
# Rank biserial correlation for pairwise tests
# Z-values from Dunn's test results
z_manag_sales <- 4.1215
z_prof_sales <- 4.4873

# The total number of observations is used in the denominator here
n <- nrow(anovadata)  # Total number of observations

# Rank-biserial correlation formula
r_manag_sales <- z_manag_sales / sqrt(n)
r_prof_sales <- z_prof_sales / sqrt(n)

# Print effect sizes
r_manag_sales
## [1] 0.2929023
r_prof_sales
## [1] 0.3188985
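
The denominator above is the total N; some texts instead use only the combined n of the two groups in each comparison. A sketch of that variant, with group sizes taken from the descriptives above:

# Sketch: rank-biserial r using only the two compared groups' combined n
n_manag_sales <- 55 + 38
n_prof_sales <- 105 + 38
z_manag_sales / sqrt(n_manag_sales)
z_prof_sales / sqrt(n_prof_sales)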

Dependent samples t-test from book (with Figure 8.1)

## png 
##   2
## TableGrob (1 x 2) "arrange": 2 grobs
##   z     cells    name           grob
## 1 1 (1-1,1-1) arrange gtable[layout]
## 2 2 (1-1,2-2) arrange gtable[layout]
## 
##  Paired t-test
## 
## data:  data$Pretest and data$Posttest
## t = -21.456, df = 19, p-value = 8.846e-15
## alternative hypothesis: true mean difference is not equal to 0
## 95 percent confidence interval:
##  -15.1462 -12.4538
## sample estimates:
## mean difference 
##           -13.8
##    vars  n  mean   sd median trimmed  mad min max range skew kurtosis   se
## X1    1 20 73.15 7.73     73   73.06 8.15  59  87    28 0.02    -0.79 1.73
##    vars  n  mean   sd median trimmed mad min max range  skew kurtosis   se
## X1    1 20 86.95 7.85   86.5   87.25 8.9  70  98    28 -0.23    -0.95 1.76
##    vars  n mean   sd median trimmed  mad min max range skew kurtosis   se
## X1    1 20 13.8 2.88     14   13.62 2.22   9  20    11 0.37    -0.57 0.64
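
The code for this chunk is suppressed in the book, so only the output appears above. A minimal sketch that produces this style of analysis (the Pretest/Posttest names come from the output; the simulated scores are an assumption, so exact numbers will differ):

# Sketch: simulate 20 paired scores, run the paired t-test, describe each variable
library(psych)
set.seed(1)
data <- data.frame(Pretest = round(rnorm(20, mean = 73, sd = 8)))
data$Posttest <- data$Pretest + round(rnorm(20, mean = 14, sd = 3))
data$Difference <- data$Posttest - data$Pretest

t.test(data$Pretest, data$Posttest, paired = TRUE)
psych::describe(data$Pretest)
psych::describe(data$Posttest)
psych::describe(data$Difference)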

Pre, Post, Post-Post reading

# Creating the data frame of three groups with the same scores
data <- data.frame(
  Student = factor(rep(1:5, each=3)),  # 5 students, 3 conditions
  Condition = factor(rep(c("Pre", "Post", "Post-Post"), times=5)),
  Score = c(80, 75, 67, 90, 85, 85, 78, 72, 74, 85, 80, 78, 88, 83, 91)
)

# 1. Run repeated measures ANOVA with ezANOVA (handling assumptions like sphericity)
library(ez)
anova_result <- ezANOVA(data = data,
                        dv = .(Score),
                        wid = .(Student),
                        within = .(Condition),
                        detailed = TRUE)
anova_result
## $ANOVA
##        Effect DFn DFd         SSn       SSd          F            p p<.05
## 1 (Intercept)   1   4 97768.06667 482.93333 809.785202 9.074958e-06     *
## 2   Condition   2   8    90.13333  89.86667   4.011869 6.213045e-02      
##         ges
## 1 0.9941754
## 2 0.1359614
## 
## $`Mauchly's Test for Sphericity`
##      Effect          W           p p<.05
## 2 Condition 0.01730226 0.002275905     *
## 
## $`Sphericity Corrections`
##      Effect       GGe     p[GG] p[GG]<.05       HFe     p[HF] p[HF]<.05
## 2 Condition 0.5043633 0.1150838           0.5087521 0.1144435
# 2. Normality Test for Each Condition
library(dplyr)
data %>%
  group_by(Condition) %>%
  summarise(shapiro_p = shapiro.test(Score)$p.value)
## # A tibble: 3 × 2
##   Condition shapiro_p
##   <fct>         <dbl>
## 1 Post          0.716
## 2 Post-Post     0.977
## 3 Pre           0.641
# 3. Mauchly's Test of Sphericity
# Using ezANOVA, sphericity testing is automatically handled and included in the output.
# If needed, you can manually extract the sphericity correction from the results.
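# For example (a sketch): the relevant tables can be pulled straight
# from the ezANOVA result by their list element names
anova_result$`Mauchly's Test for Sphericity`
anova_result$`Sphericity Corrections`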

# 4. Post-Hoc Pairwise Comparison if ANOVA is significant
# Note: row 1 of $ANOVA is the intercept; the Condition effect is row 2
if (anova_result$ANOVA$p[2] < 0.05) {
  pairwise_result <- pairwise.t.test(data$Score, data$Condition, paired = TRUE)
  print(pairwise_result)
}
## 
##  Pairwise comparisons using paired t tests 
## 
## data:  data$Score and data$Condition 
## 
##           Post    Post-Post
## Post-Post 1.00    -        
## Pre       3.9e-05 0.23     
## 
## P value adjustment method: holm

Dependent Samples t-test in R

Here we move to the ‘datasets’ library, so we load it first

You may need to: install.packages(‘datasets’)

library(datasets)
library(psych)
library(effectsize)
library(tidyverse)

# This is the default data set. 
# It is not necessary to rename it, but I do it to emphasize its original form
chick<- ChickWeight

# Here we pivot the data so that we keep only time points 4 and 21 in our set
chickdata <- chick%>%
  filter(Time %in% c(4, 21)) %>%  # Select only two time points for comparison
  pivot_wider(names_from = Time, values_from = weight, names_prefix = "Day_") %>%
  drop_na()%>%
  mutate(Difference = Day_21 - Day_4)

# Compute descriptive statistics
desc_day_4 <- psych::describe(chickdata$Day_4)
desc_day_21 <- psych::describe(chickdata$Day_21)
desc_difference <- psych::describe(chickdata$Difference)

as.data.frame(desc_day_4)
##    vars  n     mean       sd median  trimmed    mad min max range       skew
## X1    1 45 60.15556 4.279738     61 60.32432 4.4478  48  69    21 -0.4278585
##      kurtosis        se
## X1 0.06485024 0.6379857
as.data.frame(desc_day_21)
##    vars  n     mean       sd median  trimmed     mad min max range       skew
## X1    1 45 218.6889 71.51027    205 218.6216 75.6126  74 373   299 0.08910638
##      kurtosis       se
## X1 -0.7680967 10.66012
as.data.frame(desc_difference)
##    vars  n     mean       sd median  trimmed     mad min max range    skew
## X1    1 45 158.5333 69.83865    149 158.2973 71.1648  16 309   293 0.10106
##      kurtosis       se
## X1 -0.7835153 10.41093
desc_day_4$Variable <- "Day_4"
desc_day_21$Variable <- "Day_21"
desc_difference$Variable <- "Difference"

# Combine all into one dataset
desc_combined <- rbind(desc_day_4, desc_day_21, desc_difference)%>%
  dplyr::select(Variable, mean, median, sd, skew, kurtosis)



write.csv(desc_combined, "dependent_desc.csv")

# We now want to check the assumption of normality
# Plot a histogram of the Difference variable
hist(chickdata$Difference, main = "Histogram of Difference", 
     xlab = "Difference (Day_21 - Day_4)", col = "lightblue", 
     border = "black")

# Q-Q plot to check for normality
qqnorm(chickdata$Difference)
qqline(chickdata$Difference, col = "red")

Figures 8.2 and 8.3 in book

# Create the folder if it doesn't exist
folder <- "R figures 2"
if (!dir.exists(folder)) dir.create(folder)


# Figure 8.2 – Histogram of Difference
tiff8_2 <- file.path(folder, "Figure8.2.tif")

tiff(filename = tiff8_2, width = 6, height = 4, units = "in", res = 300)
hist(chickdata$Difference,
     main = "Histogram of Difference",
     xlab = "Difference (Day_21 - Day_4)",
     col = "lightblue",
     border = "black")
dev.off()
## png 
##   2
cat("Figure 8.2 saved as TIFF at:", tiff8_2, "\n")
## Figure 8.2 saved as TIFF at: R figures 2/Figure8.2.tif
# Also show histogram in console
hist(chickdata$Difference,
     main = "Histogram of Difference",
     xlab = "Difference (Day_21 - Day_4)",
     col = "lightblue",
     border = "black")

# Shapiro-Wilk test for normality
shapiro_test <- shapiro.test(chickdata$Difference)
shapiro_test
## 
##  Shapiro-Wilk normality test
## 
## data:  chickdata$Difference
## W = 0.98718, p-value = 0.8939
# Paired t-test

paired_result <- t.test(chickdata$Day_4, chickdata$Day_21, paired = TRUE)
paired_result
## 
##  Paired t-test
## 
## data:  chickdata$Day_4 and chickdata$Day_21
## t = -15.228, df = 44, p-value < 2.2e-16
## alternative hypothesis: true mean difference is not equal to 0
## 95 percent confidence interval:
##  -179.5152 -137.5515
## sample estimates:
## mean difference 
##       -158.5333
# Calculate Cohen's d manually

mean_dif <- mean(chickdata$Difference)
sd_dif <- sd(chickdata$Difference)
cohen_d <- mean_dif / sd_dif
cohen_d
## [1] 2.269994
# Calculate Cohen's d using effsize package

chickdata_long <- chickdata %>%
  dplyr::select(Day_4, Day_21) %>%
  tidyr::pivot_longer(cols = c(Day_4, Day_21), 
                      names_to = "Day", 
                      values_to = "Weight")

chickdata_long$Day <- as.factor(chickdata_long$Day)

cohen_d_result <- effectsize::cohens_d(chickdata_long$Weight, chickdata_long$Day, paired = TRUE)
cohen_d_result
## Cohen's d |       95% CI
## ------------------------
## 2.27      | [1.71, 2.82]
# Figure 8.3 – Q-Q Plot of Differences

tiff8_3 <- file.path(folder, "Figure8.3.tif")

tiff(filename = tiff8_3, width = 6, height = 6, units = "in", res = 300)
qqnorm(chickdata$Difference, main = "Q-Q Plot of Differences")
qqline(chickdata$Difference, col = "red", lwd = 2)
dev.off()
## png 
##   2
cat("Figure 8.3 saved as TIFF at:", tiff8_3, "\n")
## Figure 8.3 saved as TIFF at: R figures 2/Figure8.3.tif
# Show Q-Q plot in R plot window
qqnorm(chickdata$Difference, main = "Q-Q Plot of Differences")
qqline(chickdata$Difference, col = "red", lwd = 2)

Paired (dependent) samples t-test in R

Start with the Shapiro-Wilk test

Then run the t-test

Finish with effect size

# Perform Shapiro-Wilk test for normality on the Difference variable
shapiro_test <- shapiro.test(chickdata$Difference)
shapiro_test
## 
##  Shapiro-Wilk normality test
## 
## data:  chickdata$Difference
## W = 0.98718, p-value = 0.8939
# Run the t-test
paired_result <- t.test(chickdata$Day_4, chickdata$Day_21, paired = TRUE)
paired_result
## 
##  Paired t-test
## 
## data:  chickdata$Day_4 and chickdata$Day_21
## t = -15.228, df = 44, p-value < 2.2e-16
## alternative hypothesis: true mean difference is not equal to 0
## 95 percent confidence interval:
##  -179.5152 -137.5515
## sample estimates:
## mean difference 
##       -158.5333
# Calculate Cohen's d manually
mean_dif<- mean(chickdata$Difference)
sd_dif<- sd(chickdata$Difference)
cohen_d<- mean_dif/sd_dif
cohen_d
## [1] 2.269994
# Use the effsize package
library(effsize)
cohen_d2 <- cohen.d(chickdata$Day_21, chickdata$Day_4, paired = TRUE)
cohen_d2
## 
## Cohen's d
## 
## d estimate: 2.45338 (large)
## 95 percent confidence interval:
##    lower    upper 
## 1.812256 3.094504

Non-parametric Wilcoxon signed-rank test

Followed by effect size

# Load necessary libraries
library(tidyr)
library(rstatix)
library(dplyr)

# Filter for Day 4 and Day 21
chick_subset <- ChickWeight[ChickWeight$Time %in% c(4, 21), ]

# Reshape data to wide format
chick_wide <- chick_subset %>%
  pivot_wider(names_from = Time, values_from = weight) %>%
  na.omit() %>%
  dplyr::select('4', '21')

# Rename columns for clarity
colnames(chick_wide) <- c("Four", "TwentyOne")

# Perform Wilcoxon Signed-Rank Test
test_result <- wilcox.test(chick_wide$Four, chick_wide$TwentyOne, paired = TRUE)

# Print the Wilcoxon Signed-Rank Test result
print(test_result)
## 
##  Wilcoxon signed rank test with continuity correction
## 
## data:  chick_wide$Four and chick_wide$TwentyOne
## V = 0, p-value = 5.355e-09
## alternative hypothesis: true location shift is not equal to 0
# Extract necessary values
Z <- qnorm(test_result$p.value / 2)  # Get Z-score (divide p-value by 2 for two-tailed test)
n <- nrow(chick_wide)  # Number of pairs

# Calculate effect size r
effect_size_r <- abs(Z) / sqrt(n)

# Print effect size
print(paste("Z:", round(Z, 3)))
## [1] "Z: -5.836"
print(paste("Effect size (r):", round(effect_size_r, 3)))
## [1] "Effect size (r): 0.87"

Repeated measures ANOVA on Chick weight (With Figures 8.4 and 8.5)

Data set-up and munging

Checking assumptions

Running the omnibus test

Running pairwise post-hoc tests

Effect size calculations

library(car)
library(tidyverse)
library(dplyr)
library(psych)
library(gridExtra)
library(rstatix)
library(nlme)
library(purrr)

chickdata3 <- ChickWeight%>%
  filter(Time %in% c(4, 12, 21)) %>%  # Select three time points for comparison
  pivot_wider(names_from = Time, values_from = weight, names_prefix = "Day_") %>%
  drop_na()%>%
  dplyr::select(Chick, Day_4, Day_12, Day_21)%>%
  mutate(Difference21_4 = Day_21 - Day_4)%>%
  mutate(Difference12_4 = Day_12- Day_4)%>%
  mutate(Difference21_12 = Day_21- Day_12)

# Compute descriptive statistics
desc_day_4 <- psych::describe(chickdata3$Day_4)
desc_day_12 <- psych::describe(chickdata3$Day_12)
desc_day_21 <- psych::describe(chickdata3$Day_21)
desc_difference21_4 <- psych::describe(chickdata3$Difference21_4)
desc_difference12_4 <- psych::describe(chickdata3$Difference12_4)
desc_difference21_12 <- psych::describe(chickdata3$Difference21_12)


as.data.frame(desc_day_4)
##    vars  n     mean       sd median  trimmed    mad min max range       skew
## X1    1 45 60.15556 4.279738     61 60.32432 4.4478  48  69    21 -0.4278585
##      kurtosis        se
## X1 0.06485024 0.6379857
as.data.frame(desc_day_12)
##    vars  n     mean       sd median  trimmed     mad min max range      skew
## X1    1 45 132.7778 32.12397    136 132.5676 28.1694  70 217   147 0.1011454
##       kurtosis       se
## X1 -0.02882665 4.788759
as.data.frame(desc_day_21)
##    vars  n     mean       sd median  trimmed     mad min max range       skew
## X1    1 45 218.6889 71.51027    205 218.6216 75.6126  74 373   299 0.08910638
##      kurtosis       se
## X1 -0.7680967 10.66012
as.data.frame(desc_difference21_4)
##    vars  n     mean       sd median  trimmed     mad min max range    skew
## X1    1 45 158.5333 69.83865    149 158.2973 71.1648  16 309   293 0.10106
##      kurtosis       se
## X1 -0.7835153 10.41093
as.data.frame(desc_difference12_4)
##    vars  n     mean       sd median  trimmed     mad min max range      skew
## X1    1 45 72.62222 29.65505     75 72.32432 25.2042  12 155   143 0.1679009
##    kurtosis       se
## X1 0.203552 4.420714
as.data.frame(desc_difference21_12)
##    vars  n     mean       sd median trimmed     mad min max range     skew
## X1    1 45 85.91111 51.30161     85      85 53.3736   4 177   173 0.192361
##     kurtosis       se
## X1 -1.034653 7.647593
desc_day_4$Variable <- "Day_4"
desc_day_12$Variable<- "Day_12"
desc_day_21$Variable <- "Day_21"
desc_difference21_4$Variable <- "Difference_21_4"
desc_difference12_4$Variable<- "Difference_12_4"
desc_difference21_12$Variable<- "Difference_21_12"

# Combine all into one dataset
desc_combined <- rbind(desc_day_4, desc_day_12, desc_day_21, desc_difference21_4, desc_difference12_4, desc_difference21_12)%>% 
  mutate(upper = mean+(se*2))%>%
  mutate(lower = mean-(se*2))%>%
  dplyr::select(Variable, mean, upper, lower, se, sd, skew, kurtosis)

write.csv(desc_combined, "repeated_desc.csv")

# We now want to check the assumption of normality
# Plot a histogram of the Difference variable
hist_21_4 <- ggplot(chickdata3, aes(x = Difference21_4)) +
  geom_histogram(fill = "lightblue", color = "black", bins = 10) +
  ggtitle("Histogram of Difference 21 and 4") +
  xlab("Difference (Day_21 - Day_4)") +
  theme_minimal()

hist_21_12 <- ggplot(chickdata3, aes(x = Difference21_12)) +
  geom_histogram(fill = "lightblue", color = "black", bins = 10) +
  ggtitle("Histogram of Difference 21 and 12") +
  xlab("Difference (Day_21 - Day_12)") +
  theme_minimal()

hist_12_4 <- ggplot(chickdata3, aes(x = Difference12_4)) +
  geom_histogram(fill = "lightblue", color = "black", bins = 10) +
  ggtitle("Histogram of Difference 12 and 4") +
  xlab("Difference (Day_12 - Day_4)") +
  theme_minimal()

# Create Q-Q plots
qq_21_4 <- ggplot(chickdata3, aes(sample = Difference21_4)) +
  stat_qq() + stat_qq_line(color = "red") +
  ggtitle("Q-Q Plot: Difference Day 21-4") +
  theme_minimal()

qq_21_12 <- ggplot(chickdata3, aes(sample = Difference21_12)) +
  stat_qq() + stat_qq_line(color = "red") +
  ggtitle("Q-Q Plot: Difference Day 21-12") +
  theme_minimal()

qq_12_4 <- ggplot(chickdata3, aes(sample = Difference12_4)) +
  stat_qq() + stat_qq_line(color = "red") +
  ggtitle("Q-Q Plot: Difference Day 12-4") +
  theme_minimal()

# Arrange plots in a grid
grid.arrange(hist_21_4, qq_21_4,
             hist_21_12, qq_21_12,
             hist_12_4, qq_12_4,
             ncol = 2)

library(gridExtra)

# Create folder if needed
folder <- "R figures 2"
if (!dir.exists(folder)) dir.create(folder)

# Arrange plots in a grid
combined_8_4 <- grid.arrange(hist_21_4, qq_21_4,
                             hist_21_12, qq_21_12,
                             hist_12_4, qq_12_4,
                             ncol = 2)

# Save TIFF at 300 DPI
tiff(file.path(folder, "Figure8.4.tif"), width = 8, height = 10, units = "in", res = 300)
grid.arrange(hist_21_4, qq_21_4,
             hist_21_12, qq_21_12,
             hist_12_4, qq_12_4,
             ncol = 2)
dev.off()
## png 
##   2
# Show in console
combined_8_4
## TableGrob (3 x 2) "arrange": 6 grobs
##   z     cells    name           grob
## 1 1 (1-1,1-1) arrange gtable[layout]
## 2 2 (1-1,2-2) arrange gtable[layout]
## 3 3 (2-2,1-1) arrange gtable[layout]
## 4 4 (2-2,2-2) arrange gtable[layout]
## 5 5 (3-3,1-1) arrange gtable[layout]
## 6 6 (3-3,2-2) arrange gtable[layout]
# Perform Shapiro-Wilk test for normality on the Difference variable
shapiro_test21_4 <- shapiro.test(chickdata3$Difference21_4)
shapiro_test21_4
## 
##  Shapiro-Wilk normality test
## 
## data:  chickdata3$Difference21_4
## W = 0.98718, p-value = 0.8939
# Perform Shapiro-Wilk test for normality on the Difference variable
shapiro_test21_12 <- shapiro.test(chickdata3$Difference21_12)
shapiro_test21_12
## 
##  Shapiro-Wilk normality test
## 
## data:  chickdata3$Difference21_12
## W = 0.95597, p-value = 0.0857
# Perform Shapiro-Wilk test for normality on the Difference variable
shapiro_test12_4 <- shapiro.test(chickdata3$Difference12_4)
shapiro_test12_4
## 
##  Shapiro-Wilk normality test
## 
## data:  chickdata3$Difference12_4
## W = 0.97458, p-value = 0.4195
# Convert to long format and add a Chick ID
chickdata3_long <- chickdata3 %>%
  dplyr::select(Chick, Day_4, Day_12, Day_21) %>%  
  pivot_longer(cols = c(Day_4, Day_12, Day_21),
               names_to = "Time", 
               values_to = "Weight") %>%
  mutate(Time = factor(Time, levels = c("Day_4", "Day_12", "Day_21")))


# Greenhouse-Geisser correction is applied automatically by this ANOVA test; the "ges" column reports generalized eta-squared.
anova_lm <- anova_test(dv = Weight, wid = Chick, within = Time, data = chickdata3_long)
anova_lm
## ANOVA Table (type III tests)
## 
## $ANOVA
##   Effect DFn DFd       F        p p<.05   ges
## 1   Time   2  88 202.706 1.14e-33     * 0.676
## 
## $`Mauchly's Test for Sphericity`
##   Effect     W        p p<.05
## 1   Time 0.315 1.65e-11     *
## 
## $`Sphericity Corrections`
##   Effect   GGe      DF[GG]    p[GG] p[GG]<.05 HFe     DF[HF]    p[HF] p[HF]<.05
## 1   Time 0.594 1.19, 52.23 5.31e-21         * 0.6 1.2, 52.84 3.22e-21         *
get_anova_table(anova_lm)
## ANOVA Table (type III tests)
## 
##   Effect  DFn   DFd       F        p p<.05   ges
## 1   Time 1.19 52.23 202.706 5.31e-21     * 0.676
# Plot the Difference
figure8_5 <- ggplot(chickdata3_long, aes(x = Time, y = Weight, fill = Time)) +
  geom_boxplot() +
  theme_minimal() +
  labs(title = "Boxplot of Weight by Time",
       x = "Time",
       y = "Weight")

# Save at 300 DPI
tiff(file.path(folder, "Figure8.5.tif"), width = 6, height = 4, units = "in", res = 300)
print(figure8_5)
dev.off()
## png 
##   2
# Show in console
figure8_5

# Post-hoc with a Bonferroni correction
# Perform pairwise t-tests with Bonferroni correction
pwc <- chickdata3_long %>%
  pairwise_t_test(Weight ~ Time, paired = TRUE, p.adjust.method = "bonferroni")

pwc
## # A tibble: 3 × 10
##   .y.   group1 group2    n1    n2 statistic    df        p    p.adj p.adj.signif
## * <chr> <chr>  <chr>  <int> <int>     <dbl> <dbl>    <dbl>    <dbl> <chr>       
## 1 Weig… Day_4  Day_12    45    45     -16.4    44 2.17e-20 6.51e-20 ****        
## 2 Weig… Day_4  Day_21    45    45     -15.2    44 3.75e-19 1.12e-18 ****        
## 3 Weig… Day_12 Day_21    45    45     -11.2    44 1.64e-14 4.92e-14 ****
# Cohen's d effect size into a table
testdata_prepost_d <- chickdata3_long %>%
  filter(Time %in% c("Day_4", "Day_12")) %>%
  mutate(Time = as.character(Time)) %>%
  rstatix::cohens_d(Weight ~ Time) %>%
  mutate(Model = "Day 4 to Day 12")

testdata_prepost_Post_d <- chickdata3_long %>%
  filter(Time %in% c("Day_4", "Day_21")) %>%
  mutate(Time = as.character(Time)) %>%
  rstatix::cohens_d(Weight ~ Time) %>%
  mutate(Model = "Day 4 to Day 21")

testdata_PostPost_Post_d <- chickdata3_long %>%
  filter(Time %in% c("Day_12", "Day_21")) %>%
  mutate(Time = as.character(Time)) %>%
  rstatix::cohens_d(Weight ~ Time) %>%
  mutate(Model = "Day 12 to Day 21")


effectsize <- rbind(testdata_prepost_d, testdata_prepost_Post_d, testdata_PostPost_Post_d)

effectsize
## # A tibble: 3 × 8
##   .y.    group1 group2 effsize    n1    n2 magnitude Model           
##   <chr>  <chr>  <chr>    <dbl> <int> <int> <ord>     <chr>           
## 1 Weight Day_12 Day_4     3.36    23    23 large     Day 4 to Day 12 
## 2 Weight Day_21 Day_4     2.63    22    23 large     Day 4 to Day 21
## 3 Weight Day_12 Day_21   -1.55    45    45 large     Day 12 to Day 21
write.csv(effectsize, "repeated_pair_cohen.csv")

Non-parametric repeated measures Freidman Test

Setting up and munging the data

Running the ombibus Freidman test

Kendall W effect size for Freidman

Wilcoxon post-hoc

Effict size of paired test

library(dplyr)
library(stats)
library(tidyverse)
library(ggpubr)
library(rstatix)
library(purrr)

# Set up the dataset
chickdata4 <- ChickWeight %>%
  filter(Time %in% c(4, 12, 21)) %>%
  dplyr::select(Chick, Time, weight) %>%
  group_by(Chick) %>%
  filter(n() == 3) %>%
  ungroup() %>%
  mutate(
    Time = as.factor(Time))

chickdata4$Chick <- factor(chickdata4$Chick, ordered = FALSE)

# Run the omnibus test
friedtest<- friedman_test(chickdata4, weight ~ Time | Chick)

friedtest
## # A tibble: 1 × 6
##   .y.        n statistic    df        p method       
## * <chr>  <int>     <dbl> <dbl>    <dbl> <chr>        
## 1 weight    45        90     2 2.86e-20 Friedman test
# Get the omnibus effect size
friedman_effsize_result <- friedman_effsize(chickdata4, weight ~ Time | Chick)

friedman_effsize_result
## # A tibble: 1 × 5
##   .y.        n effsize method    magnitude
## * <chr>  <int>   <dbl> <chr>     <ord>    
## 1 weight    45       1 Kendall W large
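
Kendall's W can also be verified by hand from the Friedman chi-square reported above, since W = chi-square / (n * (k - 1)). A sketch:

# Sketch: Kendall's W from the Friedman chi-square, W = chi2 / (n * (k - 1))
n_chicks <- 45
k_times <- 3
friedtest$statistic / (n_chicks * (k_times - 1))
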
# Post-hoc Wilcoxon signed-rank tests
chickdata_wide <- chickdata4 %>%
  pivot_wider(names_from = Time, values_from = weight, names_prefix = "Day")

library(tibble)

# Define pairwise comparisons
comparisons <- list(
  c("Day4", "Day12"),
  c("Day4", "Day21"),
  c("Day12", "Day21")
)

# Function to perform Wilcoxon test and compute statistics
perform_test <- function(pair) {
  test <- wilcox.test(chickdata_wide[[pair[1]]], chickdata_wide[[pair[2]]], paired = TRUE)
  
  # Extract the test statistic (V)
  V_stat <- test$statistic
  
  # Sample size (n) for effect size and Z-statistic calculations
  n <- sum(!is.na(chickdata_wide[[pair[1]]]) & !is.na(chickdata_wide[[pair[2]]]))  # Correct for NAs
  
  # Expected value of V
  mu_V <- n * (n + 1) / 4
  
  # Standard deviation of V
  sigma_V <- sqrt((n * (n + 1) * (2 * n + 1)) / 24)
  
  # Compute the Z-statistic from the normal approximation to V
  Zstat <- (V_stat - mu_V) / sigma_V
  
  # Flip negative Z so it is reported as a magnitude (equivalent to taking abs(Z))
  if (V_stat < mu_V) {
    Zstat <- -Zstat
  }
  
  # Compute effect size (r)
  r_effect_size <- abs(Zstat) / sqrt(n)  # You can choose to use absolute or raw Z value
  
  # Return a tibble with results
  tibble(
    Group1 = pair[1],
    Group2 = pair[2],
    n = n,
    V = V_stat,
    Z = Zstat,  # Z reported as a positive magnitude
    p = test$p.value,
    r = r_effect_size
  )
}

# Apply function to all comparisons and store results in a table
wilcoxon_results <- purrr::map_dfr(comparisons, perform_test)

# Print results
print(wilcoxon_results)
## # A tibble: 3 × 7
##   Group1 Group2     n     V     Z             p     r
##   <chr>  <chr>  <int> <dbl> <dbl>         <dbl> <dbl>
## 1 Day4   Day12     45     0  5.84 0.00000000534 0.871
## 2 Day4   Day21     45     0  5.84 0.00000000535 0.871
## 3 Day12  Day21     45     0  5.84 0.00000000535 0.871
# Save results to CSV
write.csv(wilcoxon_results, "wilcoxon_repeated.csv")
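
These three post-hoc p-values are unadjusted; if you want a multiplicity correction, one option (a sketch) is:

# Sketch: Bonferroni-adjust the three post-hoc p-values
wilcoxon_results$p_adjusted <- p.adjust(wilcoxon_results$p, method = "bonferroni")
wilcoxon_results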

Multiple regression analysis

Regression analysis of human height and shoe size with Figure 9.1

# Load necessary library
library(ggplot2)

# Sample data for height and shoe size
height <- c(128,170, 175, 180, 165, 160, 185, 178, 182, 173, 168, 
            180, 169, 177, 165, 174, 186, 178, 165, 172, 180)

shoe_size <- c(0,9, 10, 11, 8, 7, 12, 10, 11, 9, 8, 
               11, 8, 10, 8, 9, 12, 11, 8, 9, 10)

# Create a data frame
height_shoe_size <- data.frame(Height = height, Shoe_Size = shoe_size)

# Fit the linear model
model <- lm(Height ~ Shoe_Size, data = height_shoe_size)

# Get the model's coefficients (intercept and slope)
intercept <- coef(model)[1]
slope <- coef(model)[2]

# Define file path
file_path <- file.path(folder, "Figure9.1.tif")

# Calculate the intercept at x = 0
y_intercept_at_x_0 <- intercept  # When x = 0, the y-intercept is the model's intercept
cat("The y-axis intercept when x = 0 is:", y_intercept_at_x_0, "cm\n")
## The y-axis intercept when x = 0 is: 128.1263 cm
# Check if the point exists in your dataset
subset(height_shoe_size, Height == 160 & Shoe_Size == 70)
## [1] Height    Shoe_Size
## <0 rows> (or 0-length row.names)
# Create the scatterplot with the regression line, highlight the specific point (with tolerance), and add more x-axis ticks
p1<- ggplot(height_shoe_size, aes(x = Shoe_Size, y = Height)) +
  geom_point(color = "blue") +  # Scatterplot points
  geom_smooth(method = "lm", se = FALSE, color = "red") +  # Fit line with intercept
  # Highlight the specific point (height = 160, shoe size = 70) with a tolerance
  geom_point(data = subset(height_shoe_size, abs(Height - 160) < 1 & abs(Shoe_Size - 70) < 1), 
             aes(x = Shoe_Size, y = Height), 
             color = "black", shape = 1, size = 5, stroke = 2) +  # Circle around the point
  labs(title = "Scatterplot of Height vs Shoe Size",
       x = "Shoe Size (US Men)",
       y = "Height (cm)") +
  theme_minimal() +
  ylim(120, max(height_shoe_size$Height) + 1) +  # Set y-axis limit
  scale_x_continuous(limits = c(0, max(height_shoe_size$Shoe_Size) + 1),
                     breaks = seq(0, max(height_shoe_size$Shoe_Size) + 1, by = 1))  # Set x-axis limits and ticks

p1

folder <- "R figures 2"
if (!dir.exists(folder)) dir.create(folder)

# Save as TIFF
tiff(file.path(folder, "Figure9.1.tif"), width = 10, height = 5, units = "in", res = 300)
grid.arrange(p1)
dev.off()
## png 
##   2
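
To read a predicted height off the fitted regression line for a given shoe size, one option (a sketch, not from the book) is predict():

# Sketch: predicted height (cm) for a US men's size 10 shoe
predict(model, newdata = data.frame(Shoe_Size = 10))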

Regression magnitude with Figure 9.2

# Load required libraries
library(ggplot2)
library(gridExtra)

# Set a seed for reproducibility
set.seed(42)

# Create a function to generate correlated data
generate_data <- function(r) {
  x <- rnorm(100)
  y <- r * x + sqrt(1 - r^2) * rnorm(100)
  data.frame(x, y)
}

file_path <- file.path(folder, "Figure9.2.tif")

# Create the plots for each correlation
correlations <- c(0.2, -0.2, 0.5, -0.5, 0.9, -0.9, 0)
labels <- c("Weak Positive", "Weak Negative", "Moderate Positive", "Moderate Negative", "Strong Positive", "Strong Negative", "No Correlation")


plots <- lapply(1:7, function(i) {
  data <- generate_data(correlations[i])
  ggplot(data, aes(x, y)) + 
    geom_point() + 
    geom_smooth(method = "lm", se = FALSE, color = "blue") +
    ggtitle(paste(labels[i], " (r =", correlations[i], ")")) + 
    theme_minimal() +
    theme(axis.title.x = element_blank(), axis.title.y = element_blank())
})

# Arrange the plots in a grid
fig<- grid.arrange(grobs = plots, ncol = 3)

folder <- "R figures 2"
if (!dir.exists(folder)) dir.create(folder)

# Save as TIFF
tiff(file.path(folder, "Figure9.2.tif"), width = 10, height = 5, units = "in", res = 300)
grid.arrange(fig)
dev.off()
## png 
##   2
fig
## TableGrob (3 x 3) "arrange": 7 grobs
##   z     cells    name           grob
## 1 1 (1-1,1-1) arrange gtable[layout]
## 2 2 (1-1,2-2) arrange gtable[layout]
## 3 3 (1-1,3-3) arrange gtable[layout]
## 4 4 (2-2,1-1) arrange gtable[layout]
## 5 5 (2-2,2-2) arrange gtable[layout]
## 6 6 (2-2,3-3) arrange gtable[layout]
## 7 7 (3-3,1-1) arrange gtable[layout]

Bivariate correlation analysis with scatterplot and Figures 9.3 and 9.4

What is the relationship between age and experience?

library(ggplot2)
library(ggpubr)
library(Hmisc)
library(bannerCommenter)

data <- CPS85

# Homoscedasticity Check (Residuals vs Fitted Plot)
model <- lm(exper ~ age, data = data)
plot(model, which = 1)  
title("Residuals vs Fitted Plot")

file_path <- file.path(folder, "Figure9.3.tif")

# Normality Check (Q-Q plots)
# Q-Q plot for Age
qq1<-ggqqplot(data$age, main = "Q-Q Plot for Age", ylab = "Quantiles")

# Q-Q plot for Experience
qq2<- ggqqplot(data$exper, main = "Q-Q Plot for Experience", ylab = "Quantiles")

fig9_3<- grid.arrange(qq1, qq2, nrow = 1)

folder <- "R figures 2"
if (!dir.exists(folder)) dir.create(folder)

# Save as TIFF
tiff(file.path(folder, "Figure9.3.tif"), width = 10, height = 5, units = "in", res = 300)
grid.arrange(fig9_3)
dev.off()
## png 
##   2

fig9_3
## TableGrob (1 x 2) "arrange": 2 grobs
##   z     cells    name           grob
## 1 1 (1-1,1-1) arrange gtable[layout]
## 2 2 (1-1,2-2) arrange gtable[layout]
# Shapiro-Wilk test for normality
shapiro_test_age <- shapiro.test(data$age)
shapiro_test_exper <- shapiro.test(data$exper)

# Linear model for Residuals vs. Fitted plot
banner("Normality test for age and experience shows violation of the assumption")
## 
## ###############################################################################
## ##  Normality test for age and experience shows violation of the assumption  ##
## ###############################################################################
shapiro_test_age
## 
##  Shapiro-Wilk normality test
## 
## data:  data$age
## W = 0.9523, p-value = 4.117e-12
shapiro_test_exper
## 
##  Shapiro-Wilk normality test
## 
## data:  data$exper
## W = 0.94006, p-value = 7.603e-14
# Compute the Spearman correlation
cor_test_result <- cor.test(data$age, data$exper, method = "spearman")

# Scatterplot with linear fit (Linearity check); we use a spearman correlation due to violation of normality
fig9_4<- ggplot(data, aes(x = age, y = exper)) +
  geom_point(alpha = 0.6) +  # Scatterplot with transparency
  geom_smooth(method = "lm", se = FALSE, color = "blue") +  # Linear fit line
  annotate("text", x = min(data$age), y = max(data$exper), 
           label = paste("Spearman ρ =", format.pval(cor_test_result$estimate)), 
           hjust = 0, vjust = 1) +  # Manually add p-value label
  labs(title = "Scatterplot of Age vs Experience (Spearman ρ reported)",
       x = "Age",
       y = "Experience") +
  theme_minimal()

folder <- "R figures 2"
if (!dir.exists(folder)) dir.create(folder)

# Save as TIFF
tiff(file.path(folder, "Figure9.4.tif"), width = 10, height = 5, units = "in", res = 300)
grid.arrange(fig9_4)
dev.off()
## png 
##   2

banner("We run a spearman correlation since normality is violated")
## 
## #################################################################
## ##  We run a spearman correlation since normality is violated  ##
## #################################################################
# Correlation results using Spearman since normality is violated
cor_results <- rcorr(as.matrix(data[, c("age", "exper")]), type = "spearman")
print(cor_results)
##        age exper
## age   1.00  0.97
## exper 0.97  1.00
## 
## n= 534 
## 
## 
## P
##       age exper
## age        0   
## exper  0
banner("model results")
## 
## #################################################################
## ##                        model results                        ##
## #################################################################
model
## 
## Call:
## lm(formula = exper ~ age, data = data)
## 
## Coefficients:
## (Intercept)          age  
##     -20.206        1.032
fig9_4

Multiple regression analysis on height with Figure 9.5

# Load necessary library
library(ggplot2)
library(gridExtra)
library(apaTables)
library(lmtest)


# Create a data frame with all variables
multheight <- data.frame(
  height = c(170, 175, 180, 165, 160, 185, 178, 182, 173, 168, 
             180, 169, 177, 165, 174, 186, 178, 165, 172, 180),
  
  shoe_size = c(9, 10, 11, 8, 7, 12, 10, 11, 9, 8, 
                11, 8, 10, 8, 9, 12, 11, 8, 9, 10),
  
  biom = c(160, 162, 155, 167, 170, 158, 165, 172, 168, 160, 
           163, 155, 174, 165, 170, 168, 167, 160, 162, 159),
  
  biof = c(175, 180, 185, 170, 168, 177, 180, 172, 185, 173, 
           169, 178, 174, 181, 179, 176, 182, 173, 170, 177)
)

# View the first few rows
head(multheight)
##   height shoe_size biom biof
## 1    170         9  160  175
## 2    175        10  162  180
## 3    180        11  155  185
## 4    165         8  167  170
## 5    160         7  170  168
## 6    185        12  158  177
# Run multiple regression analysis
model <- lm(height ~ biom + biof, data = multheight)

# Plot to check the linearity assumption for biom
p1<- ggplot(multheight, aes(x = biom, y = height)) + 
  geom_point() + 
  geom_smooth(method = "lm", color = "blue") + 
  labs(title = "Height vs. Bio. Mother", x = "Bio. Mother Height", y = "Child Height")

# Plot to check the linearity assumption for biof
p2<- ggplot(multheight, aes(x = biof, y = height)) + 
  geom_point() + 
  geom_smooth(method = "lm", color = "blue") + 
  labs(title = "Height vs.Bio. Father", x = "Bio Father Height", y = "Child Height")

# Residuals vs fitted values plot to check for independence of errors
p3<-ggplot(data = multheight, aes(x = model$fitted.values, y = residuals(model))) + 
  geom_point() + 
  geom_hline(yintercept = 0, color = "red") + 
  labs(title = "Residuals vs Fitted", x = "Fitted", y = "Residuals")

# Residuals vs fitted values plot to check homoscedasticity
p4<-ggplot(data = multheight, aes(x = model$fitted.values, y = residuals(model))) + 
  geom_point() + 
  geom_hline(yintercept = 0, color = "red") + 
  labs(title = "Residuals vs Fitted", x = "Fitted", y = "Residuals") +
  theme_minimal()

# Histogram of residuals to check normality
p5<-ggplot(data = multheight, aes(x = residuals(model))) + 
  geom_histogram(bins = 10, fill = "skyblue", color = "black") + 
  labs(title = "Histogram of Residuals", x = "Residuals", y = "Frequency") +
  theme_minimal()

# Q-Q plot to check normality
p6<-  ggplot(data = multheight, aes(sample = residuals(model))) + 
  geom_qq() + 
  geom_qq_line(color = "red") + 
  labs(title = "Q-Q Plot of Residuals") +
  theme_minimal()

fig9_5<- grid.arrange(p1, p2, p3, p4, p5, p6, ncol = 2)

fig9_5
## TableGrob (3 x 2) "arrange": 6 grobs
##   z     cells    name           grob
## 1 1 (1-1,1-1) arrange gtable[layout]
## 2 2 (1-1,2-2) arrange gtable[layout]
## 3 3 (2-2,1-1) arrange gtable[layout]
## 4 4 (2-2,2-2) arrange gtable[layout]
## 5 5 (3-3,1-1) arrange gtable[layout]
## 6 6 (3-3,2-2) arrange gtable[layout]
folder <- "R figures 2"
if (!dir.exists(folder)) dir.create(folder)

# Save as TIFF
tiff(file.path(folder, "Figure9.5.tif"), width = 10, height = 5, units = "in", res = 300)
grid.arrange(fig9_5)
dev.off()
## png 
##   2
# Check for multicollinearity using VIF
library(car)
vif(model)
##     biom     biof 
## 1.043446 1.043446
# Durbin-Watson test to check for autocorrelation in residuals
dwtest(model)
## 
##  Durbin-Watson test
## 
## data:  model
## DW = 2.3376, p-value = 0.7756
## alternative hypothesis: true autocorrelation is greater than 0
# BP test of homoscedasticity
bptest(model)
## 
##  studentized Breusch-Pagan test
## 
## data:  model
## BP = 2.0091, df = 2, p-value = 0.3662
# Summarize the model
summary(model)
## 
## Call:
## lm(formula = height ~ biom + biof, data = multheight)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -11.1899  -5.1111   0.0648   3.7485  11.6584 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)
## (Intercept) 87.03025   86.40189   1.007    0.328
## biom         0.08133    0.31480   0.258    0.799
## biof         0.41846    0.33782   1.239    0.232
## 
## Residual standard error: 7.345 on 17 degrees of freedom
## Multiple R-squared:  0.08279,    Adjusted R-squared:  -0.02512 
## F-statistic: 0.7672 on 2 and 17 DF,  p-value: 0.4797
# Create the APA-style regression table
apa.reg.table(model, filename = "multreg.doc")
## 
## 
## Regression results using height as the criterion
##  
## 
##    Predictor     b         b_95%_CI beta   beta_95%_CI sr2  sr2_95%_CI   r
##  (Intercept) 87.03 [-95.26, 269.32]                                       
##         biom  0.08    [-0.58, 0.75] 0.06 [-0.44, 0.56] .00 [-.05, .05] .00
##         biof  0.42    [-0.29, 1.13] 0.29 [-0.21, 0.79] .08 [-.15, .31] .28
##                                                                           
##                                                                           
##                                                                           
##              Fit
##                 
##                 
##                 
##        R2 = .083
##  95% CI[.00,.30]
##                 
## 
## Note. A significant b-weight indicates the beta-weight and semi-partial correlation are also significant.
## b represents unstandardized regression weights. beta indicates the standardized regression weights. 
## sr2 represents the semi-partial correlation squared. r represents the zero-order correlation.
## Square brackets are used to enclose the lower and upper limits of a confidence interval.
## * indicates p < .05. ** indicates p < .01.
## 

How do age and marital status predict wage?

The first model is untransformed. In the second model, the dependent variable is log-transformed. The Shapiro-Wilk test shows the transformed variable still deviates from normality, but the Q-Q plot shows values close to the line, so it is treated as approximately normal. The rest of the tests indicate the assumptions are met.

Install the bannerCommenter package for this

library(ggplot2)
library(lmtest)
library(car)
library(bannerCommenter)
library(gridExtra)
library(dplyr)
library(psych)

# Select relevant data
data <- CPS85 %>%
  dplyr::select(wage, age, married)

# Overall descriptive statistics
desc_wage <- psych::describe(data$wage)%>%
  as.data.frame()%>%
  mutate(Variable = "Overall_Wage", group1 = "Overall")%>%
  dplyr::select(Variable, group1, n, mean, sd, skew, kurtosis)

desc_age <- psych::describe(data$age) %>%
  as.data.frame() %>%
  mutate(Variable = "Overall_Age", group1 = "Overall")%>%
  dplyr::select(Variable, group1, n, mean, sd, skew, kurtosis)

# Descriptive statistics by marital status
desc_wage_married <- psych::describeBy(data$wage, group = data$married, mat = TRUE)%>%
  as.data.frame()%>%
  mutate(Variable = "Married_Wage")%>%
  dplyr::select(Variable, group1, n, mean, sd, skew, kurtosis)

desc_age_married <- psych::describeBy(data$age, group = data$married, mat = TRUE)%>%
  as.data.frame()%>%
  mutate(Variable = "Married_Age")%>%
  dplyr::select(Variable, group1, n, mean, sd, skew, kurtosis)

# Combine all results
final_results <- rbind(desc_wage, desc_age, desc_wage_married, desc_age_married)

# Save to CSV
write.csv(final_results, "mult_reg_desc.csv", row.names = FALSE)

final_results
##          Variable  group1   n      mean        sd      skew   kurtosis
## X1   Overall_Wage Overall 534  9.024064  5.139097 1.6877621  4.9042487
## X13   Overall_Age Overall 534 36.833333 11.726573 0.5452207 -0.5956149
## X11  Married_Wage Married 350  9.398486  4.925121 1.1719052  1.2475098
## X12  Married_Wage  Single 184  8.311848  5.466575 2.5133272 10.3119826
## X111  Married_Age Married 350 39.202857 11.067208 0.4124848 -0.7844761
## X121  Married_Age  Single 184 32.326087 11.651693 1.0927423  0.5295034

Statistical Assumptions in the Regression Model and Figure 9.6


banner("Section 1:", "Model 1 Before Transformation to DV", emph = TRUE)
## 
## ###########################################################################
## ###########################################################################
## ###                                                                     ###
## ###                              SECTION 1:                             ###
## ###                 MODEL 1 BEFORE TRANSFORMATION TO DV                 ###
## ###                                                                     ###
## ###########################################################################
## ###########################################################################
# Check normality for first model
model1<- lm(wage~age + married, data = data)

# Check linearity assumption
p1 <- ggplot(data, aes(x = age, y = wage)) + 
  geom_point() + 
  geom_smooth(method = "lm", col = "blue") + 
  ggtitle("Wage vs Age Pre Transf")

# Check normality of residuals
p2 <- ggplot(data.frame(resid = residuals(model1)), aes(sample = resid)) +
  stat_qq() +
  stat_qq_line(color = "red") +
  labs(title = "Q-Q Plot of Residuals Pre Transf") +
  theme_minimal()

# Residuals vs. Fitted plot for Homoscedasticity
p3 <- ggplot(data.frame(Fitted = fitted(model1), Residuals = residuals(model1)), 
             aes(x = Fitted, y = Residuals)) + 
  geom_point() + 
  geom_smooth(method = "lm", col = "blue", se = FALSE) + 
  labs(title = "Residuals vs. Fitted Pre Tansf") + 
  theme_minimal()

# Histogram of Residuals
p4 <- ggplot(data.frame(resid = residuals(model1)), aes(x = resid)) +
  geom_histogram(binwidth = 1, fill = "blue", color = "black", alpha = 0.7) +
  labs(title = "Histogram of Residuals Pre Transf") +
  theme_minimal()

p5<- ggplot(data, aes(x = married, y = wage)) +
  geom_jitter() +  # Adds some noise to prevent overplotting
  labs(title = "Wage vs. Married Status Pre Transf")

p6<- ggplot(data, aes(x = married, y = wage)) +
  geom_boxplot() +
  labs(title = "Boxplot of Wage by Marital S. Pre Trans")

# Arrange the plots using grid.arrange
fig9_6<- grid.arrange(p1, p2, p3, p4, p5, p6, ncol = 2)

folder <- "R figures 2"
if (!dir.exists(folder)) dir.create(folder)

# Save as TIFF
tiff(file.path(folder, "Figure9.6.tif"), width = 10, height = 5, units = "in", res = 300)
grid.arrange(fig9_6)
dev.off()
## png 
##   2
banner("Homoscedasticity Test Model 1 (Untransformed)")
## 
## #################################################################
## ##        Homoscedasticity Test Model 1 (Untransformed)        ##
## #################################################################
# Breusch-Pagan test (p > 0.05 suggests homoscedasticity)
bptest(model1)  
## 
##  studentized Breusch-Pagan test
## 
## data:  model1
## BP = 1.6572, df = 2, p-value = 0.4367
banner("Independence  of errors")
## 
## #################################################################
## ##                   Independence  of errors                   ##
## #################################################################
# Durbin-Watson test of independence of errors
dwtest(model1)
## 
##  Durbin-Watson test
## 
## data:  model1
## DW = 1.7941, p-value = 0.008593
## alternative hypothesis: true autocorrelation is greater than 0
banner("VIF for Model 1 (Untransformed)")
## 
## #################################################################
## ##               VIF for Model 1 (Untransformed)               ##
## #################################################################
# Multicollinearity
vif(model1)
##      age  married 
## 1.084377 1.084377
banner("Multiple regression results untransformed data")
## 
## ##################################################################
## ##        Multiple regression results untransformed data        ##
## ##################################################################
# Print the model
summary(model1)
## 
## Call:
## lm(formula = wage ~ age + married, data = data)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -8.596 -3.587 -1.115  2.124 36.990 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    6.62427    0.80953   8.183 2.06e-15 ***
## age            0.07077    0.01946   3.636 0.000304 ***
## marriedSingle -0.60000    0.47981  -1.250 0.211674    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 5.06 on 531 degrees of freedom
## Multiple R-squared:  0.03416,    Adjusted R-squared:  0.03052 
## F-statistic: 9.391 on 2 and 531 DF,  p-value: 9.821e-05

Multiple Regression after transforming the data to a logarithm with Figure 9.7


banner("Section 2:", "Model 2 after Logarithm Transformation to DV", emph = TRUE)
## 
## ############################################################################
## ############################################################################
## ###                                                                      ###
## ###                              SECTION 2:                              ###
## ###             MODEL 2 AFTER LOGARITHM TRANSFORMATION TO DV             ###
## ###                                                                      ###
## ############################################################################
## ############################################################################
# Transform dependent variable and run model

data$log_wage <- log(data$wage)

# Build the model
model2 <- lm(log_wage ~ age + married, data = data)

# Check linearity assumption
p1 <- ggplot(data, aes(x = age, y = log_wage)) + 
  geom_point() + 
  geom_smooth(method = "lm", col = "red") + 
  ggtitle("Wage vs Age After Transf")

# Check normality of residuals
p2 <- ggplot(data.frame(resid = residuals(model2)), aes(sample = resid)) +
  stat_qq() +
  stat_qq_line(color = "blue") +
  labs(title = "Q-Q Plot of Residuals After Transf") +
  theme_minimal()

# Residuals vs. Fitted plot for Homoscedasticity
p3 <- ggplot(data.frame(Fitted = fitted(model2), Residuals = residuals(model2)), 
             aes(x = Fitted, y = Residuals)) + 
  geom_point() + 
  geom_smooth(method = "lm", col = "red", se = FALSE) + 
  labs(title = "Residuals vs. Fitted After Tansf") + 
  theme_minimal()

# Histogram of Residuals
p4 <- ggplot(data.frame(resid = residuals(model2)), aes(x = resid)) +
  geom_histogram(binwidth = 1, fill = "red", color = "black", alpha = 0.7) +
  labs(title = "Histogram of Residuals After Transf") +
  theme_minimal()

p5<- ggplot(data, aes(x = married, y = log_wage)) +
  geom_jitter() +  # Adds some noise to prevent overplotting
  labs(title = "Wage vs. Married Status After Transf")

p6<- ggplot(data, aes(x = married, y = log_wage)) +
  geom_boxplot() +
  labs(title = "Boxplot of Wage by Marital S. After Trans")

# Arrange the plots using grid.arrange
fig9_7<- grid.arrange(p1, p2, p3, p4, p5, p6, ncol = 2)

folder <- "R figures 2"
if (!dir.exists(folder)) dir.create(folder)

# Save as TIFF
tiff(file.path(folder, "Figure9.7.tif"), width = 10, height = 5, units = "in", res = 300)
grid.arrange(fig9_7)
dev.off()
## png 
##   2
banner("Homoscedasticity Test Model 2 (Transformed)")
## 
## #################################################################
## ##         Homoscedasticity Test Model 2 (Transformed)         ##
## #################################################################
# Breusch-Pagan test (p > 0.05 suggests homoscedasticity)
bptest(model2)  
## 
##  studentized Breusch-Pagan test
## 
## data:  model2
## BP = 3.6038, df = 2, p-value = 0.165
banner("Independence  of errors (Transformed)")
## 
## #################################################################
## ##            Independence  of errors (Transformed)            ##
## #################################################################
# Durbin-Watson test of independence of errors
dwtest(model2)
## 
##  Durbin-Watson test
## 
## data:  model2
## DW = 1.8208, p-value = 0.01909
## alternative hypothesis: true autocorrelation is greater than 0
banner("VIF for Model 2 (Transformed)")
## 
## #################################################################
## ##                VIF for Model 2 (Transformed)                ##
## #################################################################
# Multicollinearity
vif(model2)
##      age  married 
## 1.084377 1.084377
banner("Multiple regression results transformed data")
## 
## ##################################################################
## ##         Multiple regression results transformed data         ##
## ##################################################################
# Print the model
summary(model2)
## 
## Call:
## lm(formula = log_wage ~ age + married, data = data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.13299 -0.37452  0.00341  0.33601  1.92367 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    1.804337   0.082607  21.842  < 2e-16 ***
## age            0.007825   0.001986   3.940 9.24e-05 ***
## marriedSingle -0.096846   0.048962  -1.978   0.0484 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.5163 on 531 degrees of freedom
## Multiple R-squared:  0.04632,    Adjusted R-squared:  0.04273 
## F-statistic:  12.9 on 2 and 531 DF,  p-value: 3.4e-06
apa.reg.table(model2, filename = "mrexample.doc")
## 
## 
## Regression results using log_wage as the criterion
##  
## 
##      Predictor      b       b_95%_CI sr2  sr2_95%_CI             Fit
##    (Intercept) 1.80**   [1.64, 1.97]                                
##            age 0.01**   [0.00, 0.01] .03  [.00, .06]                
##  marriedSingle -0.10* [-0.19, -0.00] .01 [-.01, .02]                
##                                                          R2 = .046**
##                                                      95% CI[.02,.08]
##                                                                     
## 
## Note. A significant b-weight indicates the semi-partial correlation is also significant.
## b represents unstandardized regression weights. 
## sr2 represents the semi-partial correlation squared.
## Square brackets are used to enclose the lower and upper limits of a confidence interval.
## * indicates p < .05. ** indicates p < .01.
## 
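
Because the dependent variable is logged, the slopes are easier to read as approximate percent changes in wage. A sketch (not part of the book's output):

# Sketch: express the log-wage coefficients as approximate percent changes
(exp(coef(model2)) - 1) * 100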

Logistic regression predicting Arrested status using high school GPA and college GPA as predictors

First we set up our dataset and run descriptive statistics on all the variables.

library(ggplot2)
library(car)
library(dplyr)
library(gridExtra)
library(psych)
library(tibble)

set.seed(123)

# Simulate dataset
n <- 20
data <- data.frame(
  High_School_GPA = round(runif(n, 2.5, 4.0), 2),
  College_GPA = round(runif(n, 2.5, 4.0), 2),
  Arrested = sample(0:1, n, replace = TRUE, prob = c(0.5, 0.5))
)

# Model (optional)
model <- glm(Arrested ~ High_School_GPA + College_GPA, data = data, family = binomial)

# Descriptive statistics for the entire dataset
overall_descriptives <- psych::describe(data[, c("High_School_GPA", "College_GPA")])
overall_descriptives_df <- overall_descriptives %>%
  rownames_to_column(var = "Statistic")

# Descriptive statistics by Arrested group
group_descriptives <- psych::describeBy(data[, c("High_School_GPA", "College_GPA")], group = data$Arrested, mat = TRUE)
# mat = TRUE ensures you get a single data frame rather than a list
group_descriptives_df <- group_descriptives %>%
  rownames_to_column(var = "Statistic") %>%
  rename(Group = group1)  # rename group column if needed

# Combine
combined_table <- bind_rows(
  overall_descriptives_df,
  group_descriptives_df
)

# Write CSV
write.csv(combined_table, "logistic_desc2.csv", row.names = FALSE)

Logistic Regression: Generate Plots for Assumptions

fig9_8<- grid.arrange(
  ggplot(data, aes(x = High_School_GPA)) + 
    geom_histogram(binwidth = 0.2, fill = "blue", alpha = 0.6) + 
    labs(title = "Distribution of High School GPA"),

  ggplot(data, aes(x = College_GPA)) + 
    geom_histogram(binwidth = 0.2, fill = "red", alpha = 0.6) + 
    labs(title = "Distribution of College GPA"),

  ggplot(data, aes(y = High_School_GPA)) +
    geom_boxplot(fill = "blue", alpha = 0.6) +
    labs(title = "Boxplot of High School GPA"),

  ggplot(data, aes(y = College_GPA)) +
    geom_boxplot(fill = "red", alpha = 0.6) +
    labs(title = "Boxplot of College GPA"),

  ggplot(data, aes(x = High_School_GPA, y = College_GPA, color = as.factor(Arrested))) +
    geom_jitter(width = 0.1, height = 0.1) +
    labs(title = "Scatterplot of GPAs by Arrested Status") +
    scale_color_manual(values = c("blue", "red"), labels = c("No", "Yes")),
  
  # Check residuals
ggplot(data, aes(x = fitted(model), y = residuals(model, type = "pearson"))) +
  geom_point() +
  geom_smooth(method = "lm", color = "blue") +
  ggtitle("Residuals vs Fitted Values"),

  ncol = 2
)

folder <- "R figures 2"
if (!dir.exists(folder)) dir.create(folder)

# Save as TIFF
tiff(file.path(folder, "Figure9.8.tif"), width = 10, height = 5, units = "in", res = 300)
grid.arrange(fig9_8)
dev.off()
## png 
##   2
fig9_8
## TableGrob (3 x 2) "arrange": 6 grobs
##   z     cells    name           grob
## 1 1 (1-1,1-1) arrange gtable[layout]
## 2 2 (1-1,2-2) arrange gtable[layout]
## 3 3 (2-2,1-1) arrange gtable[layout]
## 4 4 (2-2,2-2) arrange gtable[layout]
## 5 5 (3-3,1-1) arrange gtable[layout]
## 6 6 (3-3,2-2) arrange gtable[layout]

Log-transform continuous predictors for Box-Tidwell test for logistic regression

data$log_High_School_GPA <- log(data$High_School_GPA)
data$log_College_GPA <- log(data$College_GPA)

# Box-Tidwell test: Adding interaction terms between original and log-transformed predictors
box_tidwell_model <- glm(Arrested ~ High_School_GPA + College_GPA + 
                           High_School_GPA:log_High_School_GPA + College_GPA:log_College_GPA, 
                         data = data, family = binomial)

# Check the summary of the Box-Tidwell model
summary(box_tidwell_model)
## 
## Call:
## glm(formula = Arrested ~ High_School_GPA + College_GPA + High_School_GPA:log_High_School_GPA + 
##     College_GPA:log_College_GPA, family = binomial, data = data)
## 
## Coefficients:
##                                     Estimate Std. Error z value Pr(>|z|)
## (Intercept)                           -14.68     164.56  -0.089    0.929
## High_School_GPA                       -34.09      58.00  -0.588    0.557
## College_GPA                            36.80     103.04   0.357    0.721
## High_School_GPA:log_High_School_GPA    16.75      26.82   0.625    0.532
## College_GPA:log_College_GPA           -14.93      47.34  -0.315    0.752
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 22.493  on 19  degrees of freedom
## Residual deviance: 11.580  on 15  degrees of freedom
## AIC: 21.58
## 
## Number of Fisher Scoring iterations: 6
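
Both interaction terms are non-significant (p = .532 and p = .752), so the linearity-of-the-logit assumption is tenable. A small sketch (not in the original code) that pulls just those rows from the coefficient table:

# Keep only the GPA-by-log(GPA) interaction rows; these are the Box-Tidwell tests
bt_coefs <- summary(box_tidwell_model)$coefficients
bt_coefs[grep(":", rownames(bt_coefs)), "Pr(>|z|)", drop = FALSE]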

Run the logistic regression model

library(bannerCommenter)

# Fit logistic regression model
model <- glm(Arrested ~ High_School_GPA + College_GPA, data = data, family = binomial)

# Create a null model to test overall model fit
model_null <- glm(Arrested ~ 1, data = data, family = binomial)

banner("Overall Model Test")
## 
## ##################################################################
## ##                      Overall Model Test                      ##
## ##################################################################
# Overall Model Results
anova(model_null, model, test = "Chisq")
## Analysis of Deviance Table
## 
## Model 1: Arrested ~ 1
## Model 2: Arrested ~ High_School_GPA + College_GPA
##   Resid. Df Resid. Dev Df Deviance Pr(>Chi)   
## 1        19     22.493                        
## 2        17     12.073  2   10.421  0.00546 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
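
The two-predictor model fits significantly better than the intercept-only model, chi-square(2) = 10.42, p = .005. One extra that falls straight out of the deviances above (not computed in the original code) is McFadden's pseudo R-squared:

# McFadden's pseudo R-squared: 1 - (residual deviance / null deviance)
1 - (deviance(model) / deviance(model_null))
# From the table above: 1 - 12.073/22.493, roughly 0.46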
banner("Variance Inflation for Multicolinarity")
## 
## ##################################################################
## ##            Variance Inflation for Multicolinarity            ##
## ##################################################################
# Check multicollinearity
vif(model)
## High_School_GPA     College_GPA 
##        1.041778        1.041778
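
Both VIFs are near 1, so multicollinearity is not a concern. As a sanity check (not in the book's code), with exactly two predictors both VIFs reduce to 1/(1 - r^2), where r is the correlation between the predictors:

# With two predictors, VIF = 1 / (1 - r^2) for both
r <- cor(data$High_School_GPA, data$College_GPA)
1 / (1 - r^2)  # should reproduce the 1.0418 shown above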
banner("Model results")
## 
## #################################################################
## ##                        Model results                        ##
## #################################################################
# Model Results
summary(model)
## 
## Call:
## glm(formula = Arrested ~ High_School_GPA + College_GPA, family = binomial, 
##     data = data)
## 
## Coefficients:
##                 Estimate Std. Error z value Pr(>|z|)  
## (Intercept)      -20.428      9.774  -2.090   0.0366 *
## High_School_GPA    2.331      1.790   1.302   0.1929  
## College_GPA        4.338      2.157   2.012   0.0443 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 22.493  on 19  degrees of freedom
## Residual deviance: 12.073  on 17  degrees of freedom
## AIC: 18.073
## 
## Number of Fisher Scoring iterations: 6
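
The coefficients above are on the log-odds scale. To turn a specific student profile into a predicted probability, predict() with type = "response" applies the inverse logit; a sketch using a hypothetical student (the GPA values are illustrative only):

# Predicted probability of arrest for a hypothetical student
new_student <- data.frame(High_School_GPA = 3.0, College_GPA = 3.5)
predict(model, newdata = new_student, type = "response")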
banner("Model Odds")
## 
## ##################################################################
## ##                          Model Odds                          ##
## ##################################################################
# Odds ratios (exponentiated logistic regression coefficients)
exp(coef(model))
##     (Intercept) High_School_GPA     College_GPA 
##    1.343519e-09    1.028845e+01    7.657855e+01
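
A common companion to the odds ratios above (not shown in the book's output) is a confidence interval on the odds scale. With MASS loaded, confint() on a glm object uses profile likelihood:

# Exponentiate the profile-likelihood CIs to get odds-ratio CIs
exp(confint(model))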