```{r example-chunk, message=FALSE, warning=FALSE, echo=TRUE}
# YOUR AWESOME CODE GOES IN THIS SPACE
```
# Load all required packages in one pass
packages <- c(
  "ggplot2", "psych", "vcd", "gridExtra", "MASS", "dplyr", "tidyr", "scales",
  "datasets", "AER", "DiagrammeR", "magick", "patchwork", "effectsize",
  "tidyverse", "ez", "rstatix", "car", "apaTables", "nlme", "dunn.test",
  "bannerCommenter", "grid", "mosaicData", "effsize"
)
invisible(lapply(packages, library, character.only = TRUE))
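# Optional convenience step (a sketch, not part of the original workflow):
# install any of the packages above that are missing before loading them.
to_install <- setdiff(packages, rownames(installed.packages()))
if (length(to_install) > 0) install.packages(to_install)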
library(ggplot2)
# Create the data
data <- data.frame(
  Name = c("Malcolm", "Josie", "Liam", "Jackson", "Xavier"),
  Score = c(10, 12, 12, 15, 23)
)
# Calculate the mean
mean_score <- mean(data$Score)
# Create the plot object (so we can save it later)
p <- ggplot(data, aes(x = Name, y = Score)) +
  geom_line(group = 1, color = "blue", linewidth = 1) + # Line connecting the points
  geom_point(aes(color = Score), size = 4) + # Points representing the actual data
  geom_hline(yintercept = mean_score, linetype = "dashed", color = "red") + # Mean line
  labs(title = "Line Plot with Mean in the Middle",
       x = "Name",
       y = "Score") +
  scale_color_gradient(low = "lightblue", high = "darkblue") + # Color scale for points
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) # Rotate x-axis labels
# Define folder and file name
folder <- "R figures 2"
if(!dir.exists(folder)) dir.create(folder)
file_path <- file.path(folder, "figure3.1.tif")
# Save as high-resolution TIFF (6x4 inches, 300 dpi)
tiff(filename = file_path, width = 6*300, height = 4*300, res = 300)
print(p)
dev.off()
## png
## 2
p
# Load necessary library
library(psych)
# Load the mtcars dataset
data(mtcars)
# Select the 'mpg' variable (miles per gallon)
mpg_data <- mtcars$mpg
# Calculate skewness using the psych package
psych_skew <- skew(mpg_data)
# Calculate the unstandardized skewness (the third central moment)
N <- length(mpg_data)
mean_mpg <- mean(mpg_data)
raw_skew <- (1 / N) * sum((mpg_data - mean_mpg)^3)
# Calculate standardized skewness
sd_mpg <- sd(mpg_data)
std_skew <- raw_skew / (sd_mpg^3)
# Print results for comparison
cat("Psych skewness: ", psych_skew, "\n")
## Psych skewness: 0.610655
cat("Raw skewness (unstandardized): ", raw_skew, "\n")
## Raw skewness (unstandardized): 133.6867
cat("Standardized skewness: ", std_skew, "\n")
## Standardized skewness: 0.610655
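# Aside (a sketch): different programs apply different small-sample corrections
# to skewness. psych::skew() exposes these through its type argument (see
# ?psych::skew): type 1 uses population moments (g1), type 2 applies the
# SPSS/Excel-style bias correction (G1), and type 3 (the default) divides by
# the sample SD, matching the manual calculation above.
sapply(1:3, function(t) psych::skew(mpg_data, type = t))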
# Load necessary library
library(psych)
# Load the mtcars dataset
data(mtcars)
# Select the 'mpg' variable (miles per gallon)
mpg_data <- mtcars$mpg
# Calculate the sample size (n)
n <- length(mpg_data)
# Calculate the mean (x̄) and sample standard deviation (s) of the mpg data
mean_mpg <- mean(mpg_data)
sd_mpg <- sd(mpg_data)
# Calculate the standardized fourth moment (m4), using the sample SD
m4 <- sum(((mpg_data - mean_mpg) / sd_mpg)^4) / n
# Calculate excess kurtosis (manual)
manual_kurtosis <- m4 - 3
# Calculate kurtosis using psych package for comparison
psych_kurtosis <- kurtosi(mpg_data)
# Output the results
cat("Manual Excess Kurtosis (simplified): ", manual_kurtosis, "\n")
## Manual Excess Kurtosis (simplified): -0.372766
cat("Psych Package Excess Kurtosis: ", psych_kurtosis, "\n")
## Psych Package Excess Kurtosis: -0.372766
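# Rough rule-of-thumb check (a sketch, using the standard large-sample SE
# formulas for skewness and kurtosis): |z| values much larger than ~2 suggest
# a meaningful departure from normality.
n_obs <- length(mpg_data)
se_skew <- sqrt(6 * n_obs * (n_obs - 1) / ((n_obs - 2) * (n_obs + 1) * (n_obs + 3)))
se_kurt <- 2 * se_skew * sqrt((n_obs^2 - 1) / ((n_obs - 3) * (n_obs + 5)))
c(z_skew = psych::skew(mpg_data) / se_skew, z_kurtosis = psych_kurtosis / se_kurt)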
library(ggplot2)
# Set seed for reproducibility
set.seed(123)
# Simulate heights for 500 second graders
n <- 500
mean_height <- 50
sd_height <- 3
heights <- round(rnorm(n, mean = mean_height, sd = sd_height))
heights <- pmax(40, pmin(heights, 60)) # Clip to 40–60 inches
# Create the plot object
p2 <- ggplot(data.frame(heights), aes(x = heights)) +
  geom_histogram(aes(y = after_stat(count)),
                 binwidth = 1, fill = "dodgerblue4", color = "black", boundary = 40) +
  # Scale the density curve to the count axis so it is visible over the bars
  geom_density(aes(y = after_stat(count)), fill = "red", alpha = 0.2, adjust = 1.5) +
  labs(title = "Normal Distribution (Bell Curve) of Second Graders' Heights",
       x = "Height (inches)", y = "Number of Kids") +
  theme_minimal() +
  theme(axis.text = element_text(size = 12), axis.title = element_text(size = 14)) +
  scale_x_continuous(breaks = seq(40, 60, by = 1))
# Define folder and file path
folder <- "R figures 2"
if(!dir.exists(folder)) dir.create(folder)
file_path <- file.path(folder, "figure3.2.tif")
# Save as high-resolution TIFF (6x4 inches, 300 dpi)
tiff(filename = file_path, width = 6*300, height = 4*300, res = 300)
print(p2)
dev.off()
## png
## 2
p2
# Load required libraries
library(ggplot2)
library(gridExtra)
library(MASS) # Note: the skewed distributions below actually use base-R rbeta()
# Set up the number of data points
n <- 1000
# Create data for the distributions
normal_data <- rnorm(n, mean = 5, sd = 1) # Normal distribution with mean = 5
positively_skewed_data <- rbeta(n, shape1 = 2, shape2 = 5) # Beta distribution (positively skewed)
negatively_skewed_data <- rbeta(n, shape1 = 5, shape2 = 2) # Beta distribution (negatively skewed)
# Function to calculate the peak of the density curve for mode label placement
get_peak_y <- function(data) {
  density_data <- density(data, bw = 1) # Larger bandwidth for "fatter" curves
  peak_y <- max(density_data$y)
  return(peak_y)
}
# Custom labeling function to avoid overlap (defined for convenience; the plots below place labels manually)
adjust_label_y <- function(peak_y, label_number, offset_factor = 0.1) {
  # Space the labels apart by adding an offset for each label
  vertical_offset <- peak_y - (offset_factor * label_number)
  return(vertical_offset)
}
# Normal Distribution Plot
normal_mean <- mean(normal_data)
normal_median <- median(normal_data)
normal_mode <- normal_mean # For a theoretical normal distribution, the mode equals the mean
normal_peak_y <- get_peak_y(normal_data)
normal_plot <- ggplot(data.frame(x = normal_data), aes(x = x)) +
geom_density(fill = "skyblue", color = "black", adjust = 1.5) +
geom_vline(aes(xintercept = normal_mean), color = "red", linetype = "dashed") +
geom_vline(aes(xintercept = normal_median), color = "blue", linetype = "dotted") +
geom_vline(aes(xintercept = normal_mode), color = "black", linetype = "solid") +
annotate("text", x = normal_mean, y = normal_peak_y * 0.9, label = "Mean", color = "red", hjust = -0.2) +
annotate("text", x = normal_median, y = normal_peak_y * 0.75, label = "Median", color = "blue", hjust = -0.2) +
annotate("text", x = normal_mode, y = normal_peak_y * 0.5, label = "Mode", color = "black", hjust = -0.2) +
ggtitle("Normal Distribution") + theme_minimal()
# Positively Skewed Distribution Plot
positively_skewed_mean <- mean(positively_skewed_data)
positively_skewed_median <- median(positively_skewed_data)
positively_skewed_mode <- positively_skewed_mean - (positively_skewed_mean * 0.19) # Mode estimate for skewed
positively_skewed_peak_y <- get_peak_y(positively_skewed_data)
positively_skewed_plot <- ggplot(data.frame(x = positively_skewed_data), aes(x = x)) +
geom_density(fill = "lightgreen", color = "black", adjust = 1.5) +
geom_vline(aes(xintercept = positively_skewed_mean), color = "red", linetype = "dashed") +
geom_vline(aes(xintercept = positively_skewed_median), color = "blue", linetype = "dotted") +
geom_vline(aes(xintercept = positively_skewed_mode), color = "black", linetype = "solid") +
annotate("text", x = positively_skewed_mean, y = positively_skewed_peak_y * 3, label = "Mean", color = "red", hjust = -0.2) +
annotate("text", x = positively_skewed_median, y = positively_skewed_peak_y * 2, label = "Median", color = "blue", hjust = -0.2) +
annotate("text", x = positively_skewed_mode, y = positively_skewed_peak_y * 1, label = "Mode", color = "black", hjust = -0.2) +
ggtitle("Positively Skewed Distribution") + theme_minimal()
# Negatively Skewed Distribution Plot
negatively_skewed_mean <- mean(negatively_skewed_data)
negatively_skewed_median <- median(negatively_skewed_data)
negatively_skewed_mode <- negatively_skewed_mean + (negatively_skewed_mean * 0.09) # Adjusting mode for skewed distribution
negatively_skewed_peak_y <- get_peak_y(negatively_skewed_data)
negatively_skewed_plot <- ggplot(data.frame(x = negatively_skewed_data), aes(x = x)) +
geom_density(fill = "orange", color = "black", adjust = 1) +
geom_vline(aes(xintercept = negatively_skewed_mean), color = "red", linetype = "dashed") +
geom_vline(aes(xintercept = negatively_skewed_median), color = "blue", linetype = "dotted") +
geom_vline(aes(xintercept = negatively_skewed_mode), color = "black", linetype = "solid") +
annotate("text", x = negatively_skewed_mean, y = negatively_skewed_peak_y * 3, label = "Mean", color = "red", hjust = -0.2) +
annotate("text", x = negatively_skewed_median, y = negatively_skewed_peak_y * 2, label = "Median", color = "blue", hjust = -0.2) +
annotate("text", x = negatively_skewed_mode, y = negatively_skewed_peak_y * 1, label = "Mode", color = "black", hjust = -0.2) +
ggtitle("Negatively Skewed Distribution") + theme_minimal()
# Arrange the plots in a vertical grid (stacked)
grid.arrange(normal_plot, positively_skewed_plot, negatively_skewed_plot, ncol = 2)
# Define folder and file path
folder <- "R figures 2"
if(!dir.exists(folder)) dir.create(folder)
file_path <- file.path(folder, "figure3.3.tif")
# Save as high-resolution TIFF
tiff(filename = file_path, width = 12*300, height = 8*300, res = 300) # Wider and taller to fit multiple plots
grid.arrange(normal_plot, positively_skewed_plot, negatively_skewed_plot, ncol = 2)
dev.off()
## png
## 2
library(ggplot2)
library(gridExtra)
library(MASS)
# Set up the number of data points
n <- 1000
# Create data for the distributions
mesokurtic_data <- rnorm(n, mean = 5, sd = 1) # Normal distribution (Mesokurtic)
platykurtic_data <- runif(n, min = 0, max = 10) # Uniform distribution (Platykurtic)
leptokurtic_data <- rt(n, df = 3) # T-distribution with 3 degrees of freedom (Leptokurtic)
# Function to calculate the peak of the density curve
get_peak_y <- function(data) {
  density_data <- density(data, bw = 1)
  max(density_data$y)
}
# Mesokurtic Plot
mesokurtic_plot <- ggplot(data.frame(x = mesokurtic_data), aes(x = x)) +
geom_density(fill = "skyblue", color = "black", adjust = 1.5) +
geom_vline(aes(xintercept = mean(mesokurtic_data)), color = "red", linetype = "dashed") +
geom_vline(aes(xintercept = median(mesokurtic_data)), color = "blue", linetype = "dotted") +
geom_vline(aes(xintercept = mean(mesokurtic_data)), color = "black", linetype = "solid") +
ggtitle("Mesokurtic Distribution") + theme_minimal()
# Platykurtic Plot
platykurtic_plot <- ggplot(data.frame(x = platykurtic_data), aes(x = x)) +
geom_density(fill = "lightgreen", color = "black", adjust = 1.5) +
geom_vline(aes(xintercept = mean(platykurtic_data)), color = "red", linetype = "dashed") +
geom_vline(aes(xintercept = median(platykurtic_data)), color = "blue", linetype = "dotted") +
geom_vline(aes(xintercept = mean(platykurtic_data)), color = "black", linetype = "solid") +
ggtitle("Platykurtic Distribution") + theme_minimal()
# Leptokurtic Plot
leptokurtic_plot <- ggplot(data.frame(x = leptokurtic_data), aes(x = x)) +
geom_density(fill = "orange", color = "black", adjust = 1) +
geom_vline(aes(xintercept = mean(leptokurtic_data)), color = "red", linetype = "dashed") +
geom_vline(aes(xintercept = median(leptokurtic_data)), color = "blue", linetype = "dotted") +
geom_vline(aes(xintercept = mean(leptokurtic_data)), color = "black", linetype = "solid") +
ggtitle("Leptokurtic Distribution") + theme_minimal()
# Define folder and file path
folder <- "R figures 2"
if(!dir.exists(folder)) dir.create(folder)
file_path <- file.path(folder, "figure3.4.tif")
# Save as high-resolution TIFF
tiff(filename = file_path, width = 12*300, height = 8*300, res = 300)
grid.arrange(mesokurtic_plot, platykurtic_plot, leptokurtic_plot, ncol = 2)
dev.off()
## png
## 2
grid.arrange(mesokurtic_plot, platykurtic_plot, leptokurtic_plot, ncol = 2)
# Figure 3.5: The normal distribution given percentile and Z score (number of
# standard deviations). The generating code chunk is not shown; the figure was
# saved to "R figures 2/figure3.5.tif".
library(ggplot2)
library(gridExtra)
# Define parameters for each test
tests <- list(
IQ = list(mean = 100, sd = 15, x_range = c(40, 160)),
T_score = list(mean = 50, sd = 10, x_range = c(10, 90)),
Stanine = list(mean = 5, sd = 2, x_range = c(0, 10))
)
# Function to create bell curve data
create_bell_curve <- function(mean, sd, x_range, test_name) {
  x <- seq(x_range[1], x_range[2], length.out = 1000)
  y <- dnorm(x, mean = mean, sd = sd)
  data.frame(x = x, y = y, test = test_name)
}
# Generate data for all tests
bell_curves <- do.call(rbind, lapply(names(tests), function(test_name) {
  params <- tests[[test_name]]
  create_bell_curve(params$mean, params$sd, params$x_range, test_name)
}))
# Create individual plots with consistent scaling
plots <- lapply(unique(bell_curves$test), function(test_name) {
params <- tests[[test_name]]
bell_curves_scaled <- bell_curves[bell_curves$test == test_name, ]
bell_curves_scaled$y <- bell_curves_scaled$y / max(bell_curves_scaled$y) # Scale to max=1
ggplot(bell_curves_scaled, aes(x = x, y = y)) +
geom_line(color = "blue", size = 1) +
ggtitle(paste(test_name, "Distribution")) +
theme_minimal() +
labs(x = "Score", y = "Density") +
scale_x_continuous(
breaks = c(seq(params$mean - 3 * params$sd, params$mean + 3 * params$sd, by = params$sd), params$mean),
labels = function(x) ifelse(x == params$mean, paste0(x, " (Mean)"), x)
)
})
# Define folder and file path
folder <- "R figures 2"
if(!dir.exists(folder)) dir.create(folder)
file_path <- file.path(folder, "figure3.6.tif")
# Save as high-resolution TIFF
tiff(filename = file_path, width = 12*300, height = 8*300, res = 300) # 12x8 inches
grid.arrange(grobs = plots, ncol = 2)
dev.off()
## png
## 2
grid.arrange(grobs = plots, ncol = 2)
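# A small helper (hypothetical, for illustration) that converts a z-score to
# each of the scales plotted above, using the means and SDs defined in `tests`.
z_to_scales <- function(z) {
  c(IQ = 100 + 15 * z, # IQ scale: mean 100, SD 15
    T_score = 50 + 10 * z, # T scale: mean 50, SD 10
    Stanine = max(1, min(9, round(5 + 2 * z)))) # stanines are bounded at 1-9
}
z_to_scales(1) # scores one standard deviation above the mean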
# Create a dataset of descriptives called "descriptives" using the CPS85 dataset
descriptives <- CPS85 %>%
dplyr::select(wage, educ, exper, age)%>%
psych::describe()
descriptives
## vars n mean sd median trimmed mad min max range skew kurtosis
## wage 1 534 9.02 5.14 7.78 8.28 4.12 1 44.5 43.5 1.69 4.90
## educ 2 534 13.02 2.62 12.00 13.05 1.48 2 18.0 16.0 -0.20 0.81
## exper 3 534 17.82 12.38 15.00 16.75 11.86 0 55.0 55.0 0.68 -0.40
## age 4 534 36.83 11.73 35.00 36.07 11.86 18 64.0 46.0 0.55 -0.60
## se
## wage 0.22
## educ 0.11
## exper 0.54
## age 0.51
# See unique categories for the CPS85 dataset
unique(CPS85$sector)
## [1] const sales clerical service manuf prof other manag
## Levels: clerical const manag manuf other prof sales service
# Count how many unique sectors there are
n_distinct(CPS85$sector)
## [1] 8
# Get descriptive statistics by sector
descriptives_sector <- psych::describeBy(CPS85$wage, group = CPS85$sector, mat = TRUE)
# Convert the matrix into a data frame
descriptives_sector<- as.data.frame(descriptives_sector)
write.csv(descriptives_sector, "descriptives_sector.csv")
descriptives_sector
## item group1 vars n mean sd median trimmed mad min
## X11 1 clerical 1 97 7.422577 2.699018 7.500 7.299873 3.335850 3.00
## X12 2 const 1 20 9.502000 3.343877 9.750 9.499375 3.647196 3.75
## X13 3 manag 1 55 12.704000 7.572513 10.620 11.966222 6.493788 1.00
## X14 4 manuf 1 68 8.036029 4.117607 6.750 7.511786 3.335850 3.00
## X15 5 other 1 68 8.500588 4.601049 6.940 7.869464 3.261720 2.85
## X16 6 prof 1 105 11.947429 5.523833 10.610 11.360706 5.352186 4.35
## X17 7 sales 1 38 7.592632 4.232272 5.725 7.157813 2.891070 3.35
## X18 8 service 1 83 6.537470 3.673278 5.500 5.937612 2.816940 1.75
## max range skew kurtosis se
## X11 15.03 12.03 0.39715109 -0.71992219 0.2740438
## X12 15.00 11.25 -0.02255889 -1.05839301 0.7477136
## X13 44.50 43.50 1.47308721 3.67335914 1.0210774
## X14 22.20 19.20 1.26924002 1.53036996 0.4993332
## X15 26.00 23.15 1.46528476 2.30867798 0.5579592
## X16 24.98 20.63 0.77278554 -0.36441619 0.5390709
## X17 19.98 16.63 0.97785906 0.01196841 0.6865651
## X18 25.00 23.25 2.04386695 6.36667089 0.4031946
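# A tidyverse cross-check (a sketch) of the by-sector summary that
# psych::describeBy() produced above.
CPS85 %>%
  group_by(sector) %>%
  summarise(n = n(), mean = mean(wage), sd = sd(wage), median = median(wage))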
# Load libraries
library(ggplot2)
library(mosaicData) # CPS85 comes from the mosaicData package; make sure it is loaded
# Create the folder "R figures 2"
folder <- "R figures 2"
if(!dir.exists(folder)) dir.create(folder)
# --- Figure 4.5: Overall Wage Distribution ---
fig4_5 <- ggplot(CPS85, aes(x = wage)) +
geom_histogram(aes(y = after_stat(density)), binwidth = 1, fill = "skyblue",
color = "black", alpha = 0.7) +
geom_density(color = "red", fill = "red", alpha = 0.3) +
theme_minimal() +
labs(title = "Overall Wage Distribution", x = "Wage", y = "Density")
# Save Figure 4.5
tiff4_5 <- file.path(folder, "Figure4.5.tif")
ggsave(filename = tiff4_5, plot = fig4_5, device = "tiff", dpi = 300, width = 8, height = 6)
cat("Figure 4.5 saved as TIFF at:", tiff4_5, "\n")
## Figure 4.5 saved as TIFF at: R figures 2/Figure4.5.tif
# --- Figure 4.6: Wage Distribution by Sector ---
fig4_6 <- ggplot(CPS85, aes(x = wage)) +
geom_histogram(aes(y = after_stat(density)), binwidth = 1, fill = "skyblue",
color = "black", alpha = 0.7) +
geom_density(color = "red", fill = "red", alpha = 0.3) +
facet_wrap(~ sector) +
theme_minimal() +
labs(title = "Wage Distribution by Sector", x = "Wage", y = "Density")
# Save Figure 4.6
tiff4_6 <- file.path(folder, "Figure4.6.tif")
ggsave(filename = tiff4_6, plot = fig4_6, device = "tiff", dpi = 300, width = 10, height = 6)
cat("Figure 4.6 saved as TIFF at:", tiff4_6, "\n")
## Figure 4.6 saved as TIFF at: R figures 2/Figure4.6.tif
fig4_5
fig4_6
library(dplyr)
library(tidyr)
## Single tables
married_table <- table(CPS85$married)
count_table <- table(CPS85$married, CPS85$union)
married_prop <- prop.table(table(CPS85$married))
married_union_prop <- prop.table(table(CPS85$married, CPS85$union))
# Create into one table
df_count <- as.data.frame(count_table)
df_count <- df_count %>%
rename(Married = Var1, Union = Var2, Count = Freq)
df_count <- df_count %>%
group_by(Union) %>%
mutate(Proportion = Count / sum(Count))
# Totals by marital status; recompute the proportion over the full sample
df_total <- df_count %>%
group_by(Married) %>%
summarise(Count = sum(Count), .groups = "drop") %>%
mutate(Proportion = Count / sum(Count), Union = "Total")
final_table <- bind_rows(df_count, df_total)
married_table
##
## Married Single
## 350 184
count_table
##
## Not Union
## Married 278 72
## Single 160 24
married_prop
##
## Married Single
## 0.6554307 0.3445693
married_union_prop
##
## Not Union
## Married 0.52059925 0.13483146
## Single 0.29962547 0.04494382
df_count
## # A tibble: 4 × 4
## # Groups: Union [2]
## Married Union Count Proportion
## <fct> <fct> <int> <dbl>
## 1 Married Not 278 0.635
## 2 Single Not 160 0.365
## 3 Married Union 72 0.75
## 4 Single Union 24 0.25
df_total
## # A tibble: 2 × 4
## Married Count Proportion Union
## <fct> <int> <dbl> <chr>
## 1 Married 350 0.655 Total
## 2 Single 184 0.345 Total
final_table
## # A tibble: 6 × 4
## # Groups: Union [3]
## Married Union Count Proportion
## <fct> <chr> <int> <dbl>
## 1 Married Not 278 0.635
## 2 Single Not 160 0.365
## 3 Married Union 72 0.75
## 4 Single Union 24 0.25
## 5 Married Total 350 0.655
## 6 Single Total 184 0.345
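# Base-R cross-check (a sketch): addmargins() appends row and column totals to
# the contingency table directly, without the dplyr reshaping above.
addmargins(count_table)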
# Load libraries
library(ggplot2)
library(dplyr)
library(tidyr)
library(scales)
# Create the folder "R figures 2"
folder <- "R figures 2"
if(!dir.exists(folder)) dir.create(folder)
# --- Prepare frequency table ---
frequency_table <- CPS85 %>%
dplyr::select(race, sex, hispanic, south, married, union, sector) %>%
pivot_longer(cols = everything(), names_to = "Variable", values_to = "Category") %>%
group_by(Variable, Category) %>%
summarise(Frequency = n(), .groups = "drop") %>%
group_by(Variable) %>%
mutate(Proportion = Frequency / sum(Frequency)) %>%
arrange(Variable, desc(Frequency))
# Save frequency table to CSV
write.csv(frequency_table, "frequencytable.csv", row.names = FALSE)
# --- Create Figure 4.7 ---
fig4_7 <- ggplot(frequency_table, aes(x = Category, y = Frequency, fill = Category)) +
geom_bar(stat = "identity", show.legend = FALSE) +
geom_text(
aes(label = percent(Proportion, accuracy = 0.1)),
position = position_stack(vjust = 0.5),
size = 3, color = "black"
) +
facet_wrap(~Variable, scales = "free", ncol = 2) +
theme_minimal() +
theme(
axis.text.x = element_text(angle = 45, hjust = 1),
strip.text = element_text(size = 10, face = "bold")
) +
labs(
title = "Frequency and Proportion of Categories by Variable",
x = "Category",
y = "Frequency"
)
# Save Figure 4.7 as TIFF (300 dpi)
tiff4_7 <- file.path(folder, "Figure4.7.tif")
ggsave(filename = tiff4_7, plot = fig4_7, device = "tiff", dpi = 300, width = 10, height = 8)
cat("Figure 4.7 saved as TIFF at:", tiff4_7, "\n")
## Figure 4.7 saved as TIFF at: R figures 2/Figure4.7.tif
fig4_7
library(DiagrammeR)
library(DiagrammeRsvg)
library(rsvg)
library(magick)
# Create the diagram
diagram <- grViz("
digraph experiment {
graph [layout = dot, rankdir = LR]
Population [label = 'Population', shape = rectangle, style = filled, fillcolor = lightblue, fontcolor = black, width = 1]
Sample [label = 'Randomly\\nSample\\nPopulation', shape = rectangle, style = filled, fillcolor = palegreen, fontcolor = black, width = 1]
Split [label = 'Randomly\\nSplit into\\nGroups', shape = rectangle, style = filled, fillcolor = palegreen, fontcolor = black, width = 1]
PreA [label = 'Pre-Test:\\nGroup A', shape = rectangle, style = filled, fillcolor = mistyrose, fontcolor = black, width = 1]
ImplementA [label = 'Implement\\nCurriculum A', shape = rectangle, style = filled, fillcolor = mistyrose, fontcolor = black, width = 1]
PostA [label = 'Post-Test:\\nGroup A', shape = rectangle, style = filled, fillcolor = mistyrose, fontcolor = black, width = 1]
FollowUpA [label = 'Follow-Up\\nTest:\\nGroup A', shape = rectangle, style = filled, fillcolor = mistyrose, fontcolor = black, width = 1]
PreB [label = 'Pre-Test:\\nGroup B', shape = rectangle, style = filled, fillcolor = lemonchiffon, fontcolor = black, width = 1]
ImplementB [label = 'Implement\\nCurriculum B', shape = rectangle, style = filled, fillcolor = lemonchiffon, fontcolor = black, width = 1]
PostB [label = 'Post-Test:\\nGroup B', shape = rectangle, style = filled, fillcolor = lemonchiffon, fontcolor = black, width = 1]
FollowUpB [label = 'Follow-Up\\nTest:\\nGroup B', shape = rectangle, style = filled, fillcolor = lemonchiffon, fontcolor = black, width = 1]
Analyze [label = 'Run\\nStatistical\\nTests', shape = rectangle, style = filled, fillcolor = lightblue, fontcolor = black, width = 1]
Population -> Sample
Sample -> Split
Split -> PreA
PreA -> ImplementA
ImplementA -> PostA
PostA -> FollowUpA
FollowUpA -> Analyze
Split -> PreB
PreB -> ImplementB
ImplementB -> PostB
PostB -> FollowUpB
FollowUpB -> Analyze
{rank = same; PreA; PreB}
{rank = same; ImplementA; ImplementB}
{rank = same; PostA; PostB}
{rank = same; FollowUpA; FollowUpB}
}
")
svg_text <- export_svg(diagram)
temp_svg <- tempfile(fileext = ".svg")
writeLines(svg_text, temp_svg)
folder <- "R figures 2"
if(!dir.exists(folder)) dir.create(folder)
tiff_path <- file.path(folder, "figure5.1.tif")
img <- image_read(temp_svg)
image_write(img, path = tiff_path, format = "tiff", density = "300x300")
cat("Figure 5.1 saved as TIFF at:", tiff_path, "\n")
## Figure 5.1 saved as TIFF at: R figures 2/figure5.1.tif
diagram
library(DiagrammeR)
library(DiagrammeRsvg)
library(magick)
# Create folder for output
folder <- "R figures 2"
if(!dir.exists(folder)) dir.create(folder)
# --- Create Diagrams ---
# Probabilistic Sampling Diagram
prob_diagram <- grViz("
digraph probabilistic_sampling {
graph [rankdir=TB]
node [shape=rectangle, style=filled, fontname=Helvetica, fontsize=10]
probabilistic [label='Probabilistic Sampling\n(High External Validity)', shape=ellipse, fillcolor=palegreen, width=2.5]
true_random [label='True Random\nSampling\n(Highest External\nValidity)']
stratified_random [label='Stratified Random\nSampling']
systematic [label='Systematic Sampling']
cluster [label='Cluster Sampling']
probabilistic -> true_random
probabilistic -> stratified_random
probabilistic -> systematic
probabilistic -> cluster
}
")
# Non-Probabilistic Sampling Diagram
non_prob_diagram <- grViz("
digraph non_probabilistic_sampling {
graph [rankdir=TB]
node [shape=rectangle, style=filled, fontname=Helvetica, fontsize=10]
non_probabilistic [label='Non-Probabilistic Sampling\n(Low External Validity)', shape=ellipse, fillcolor=lightcoral, width=2.5]
convenience [label='Convenience Sampling']
voluntary [label='Voluntary Sampling']
purposive [label='Purposive Sampling']
non_probabilistic -> convenience
non_probabilistic -> voluntary
non_probabilistic -> purposive
}
")
# --- Export SVGs to temporary files ---
prob_svg <- export_svg(prob_diagram)
prob_file <- file.path(folder, "probabilistic_sampling.svg")
writeLines(prob_svg, prob_file)
non_prob_svg <- export_svg(non_prob_diagram)
non_prob_file <- file.path(folder, "non_probabilistic_sampling.svg")
writeLines(non_prob_svg, non_prob_file)
# --- Read SVGs into magick images ---
prob_image <- image_read(prob_file)
non_prob_image <- image_read(non_prob_file)
# --- STACKED VERSION ---
# Make images same width
max_width <- max(image_info(prob_image)$width, image_info(non_prob_image)$width)
prob_image <- image_extent(prob_image, geometry = paste0(max_width, "x"), gravity = "center")
non_prob_image <- image_extent(non_prob_image, geometry = paste0(max_width, "x"), gravity = "center")
# Optional spacing
prob_image <- image_border(prob_image, color = "white", geometry = "0x10")
non_prob_image <- image_border(non_prob_image, color = "white", geometry = "0x10")
# Stack vertically
combined_stack <- image_append(c(prob_image, non_prob_image), stack = TRUE)
# Make images same height
max_height <- max(image_info(prob_image)$height, image_info(non_prob_image)$height)
prob_image <- image_extent(prob_image, geometry = paste0("x", max_height), gravity = "center")
non_prob_image <- image_extent(non_prob_image, geometry = paste0("x", max_height), gravity = "center")
# Optional spacing
prob_image <- image_border(prob_image, color = "white", geometry = "10x0")
non_prob_image <- image_border(non_prob_image, color = "white", geometry = "10x0")
# Combine side-by-side
combined_side <- image_append(c(prob_image, non_prob_image), stack = FALSE)
# --- Save stacked TIFF at 300 dpi ---
tiff_stack_path <- file.path(folder, "figure5.2_stack.tif")
image_write(combined_stack, path = tiff_stack_path, format = "tiff", density = "300x300")
# --- Save side-by-side TIFF at 300 dpi ---
tiff_side_path <- file.path(folder, "figure5.2_side_by_side.tif")
image_write(combined_side, path = tiff_side_path, format = "tiff", density = "300x300")
# --- Print paths ---
cat("Figure 5.2 saved as TIFFs:\nStacked:", tiff_stack_path, "\nSide-by-side:", tiff_side_path, "\n")
## Figure 5.2 saved as TIFFs:
## Stacked: R figures 2/figure5.2_stack.tif
## Side-by-side: R figures 2/figure5.2_side_by_side.tif
prob_diagram
non_prob_diagram
library(DiagrammeR)
# Create folder for output
folder <- "R figures 2"
if(!dir.exists(folder)) dir.create(folder)
# --- Create the diagram ---
diagram_5_3 <- grViz("
digraph internal_structure {
graph [rankdir = TB, nodesep=0.4]
node [shape = rectangle, style = filled, fillcolor = lightblue, fontname = Helvetica]
Scale [label = 'Entire Scale', shape = box, fillcolor = lightcoral]
node [shape = ellipse, fillcolor = lightgoldenrodyellow]
Factor1 [label = 'Factor 1']
Factor2 [label = 'Factor 2']
node [shape = rectangle, fillcolor = lightgray]
Item1_F1 [label = 'Item 1']
Item2_F1 [label = 'Item 2']
Item3_F1 [label = 'Item 3']
Item1_F2 [label = 'Item 4']
Item2_F2 [label = 'Item 5']
Item3_F2 [label = 'Item 6']
# Define edges
Scale -> Factor1
Scale -> Factor2
Factor1 -> Item1_F1
Factor1 -> Item2_F1
Factor1 -> Item3_F1
Factor2 -> Item1_F2
Factor2 -> Item2_F2
Factor2 -> Item3_F2
}
")
# --- Export SVG to temporary file ---
svg_5_3 <- export_svg(diagram_5_3)
svg_file <- file.path(folder, "figure5.3.svg")
writeLines(svg_5_3, svg_file)
# --- Read SVG into magick image ---
img_5_3 <- image_read(svg_file)
# --- Save as TIFF at 300 dpi ---
tiff_path <- file.path(folder, "figure5.3.tif")
image_write(img_5_3, path = tiff_path, format = "tiff", density = "300x300")
# --- Print path ---
cat("Figure 5.3 saved as TIFF at:", tiff_path, "\n")
## Figure 5.3 saved as TIFF at: R figures 2/figure5.3.tif
diagram_5_3
# Load necessary libraries
library(ggplot2)
library(patchwork)
library(gridExtra)
# Define the critical values for two-tailed test
alpha <- 0.05 # significance level
z_critical_two_tailed <- qnorm(1 - alpha / 2) # Two-tailed critical value (±1.96 for alpha = 0.05)
# Generate a sequence of Z-scores
z_scores <- seq(-4, 4, length.out = 1000)
# Compute the normal distribution density values for the null hypothesis (normal distribution)
z_density_null <- dnorm(z_scores)
# Compute the normal distribution density values for the alternative hypothesis (shifted to the right)
z_density_alt <- dnorm(z_scores, mean = 2) # Shift the distribution to the right
# Create a data frame for plotting
data_null <- data.frame(z_scores, z_density_null)
data_alt <- data.frame(z_scores, z_density_alt)
# Plot for two-tailed test with shifted distribution
plot1 <- ggplot() +
# Plot the null hypothesis curve (standard normal distribution)
geom_line(data = data_null, aes(x = z_scores, y = z_density_null), color = "black") +
# Plot the alternative hypothesis curve (shifted normal distribution)
geom_line(data = data_alt, aes(x = z_scores, y = z_density_alt), color = "blue") +
# Shade the rejection region for the two-tailed test (both left and right tails)
geom_ribbon(data = subset(data_null, z_scores <= -z_critical_two_tailed),
aes(x = z_scores, ymin = 0, ymax = z_density_null), fill = "red", alpha = 0.5) + # Left tail
geom_ribbon(data = subset(data_null, z_scores >= z_critical_two_tailed),
aes(x = z_scores, ymin = 0, ymax = z_density_null), fill = "red", alpha = 0.5) + # Right tail
# Add vertical lines for critical values
geom_vline(xintercept = c(-z_critical_two_tailed, z_critical_two_tailed), color = "black", linetype = "dashed") +
# Labels and annotations
annotate("text", x = -z_critical_two_tailed, y = 0.07, label = "Two-Tailed\nRejection Region", color = "black",
hjust = 1.2, size = 3) + # Move text up and decrease font size
annotate("text", x = z_critical_two_tailed, y = 0.07, label = "Two-Tailed\nRejection Region", color = "black",
hjust = -0.2, size = 3) +
labs(title = "P1: Two-Tailed, Martians > Earthlings", x = "Z-Score", y = "Density") +
theme_minimal()
# Define the significance level (alpha) and critical values
alpha <- 0.05
z_critical_two_tailed <- qnorm(1 - alpha / 2) # Two-tailed critical value (~±1.96 for alpha = 0.05)
# Generate a sequence of Z-scores
z_scores <- seq(-4, 4, length.out = 1000)
# Compute density values for null and alternative hypotheses
z_density_null <- dnorm(z_scores) # Standard normal distribution
z_density_alt <- dnorm(z_scores, mean = -2) # Shifted left
# Create data frames for plotting
data_null <- data.frame(z_scores, z_density_null)
data_alt <- data.frame(z_scores, z_density_alt)
# Create the plot
plot2<- ggplot() +
# Plot the null hypothesis curve
geom_line(data = data_null, aes(x = z_scores, y = z_density_null), color = "black") +
# Plot the alternative hypothesis curve
geom_line(data = data_alt, aes(x = z_scores, y = z_density_alt), color = "blue") +
# Shade the left tail rejection region
geom_ribbon(data = subset(data_null, z_scores <= -z_critical_two_tailed),
aes(x = z_scores, ymin = 0, ymax = z_density_null), fill = "red", alpha = 0.5) +
# Shade the right tail rejection region
geom_ribbon(data = subset(data_null, z_scores >= z_critical_two_tailed),
aes(x = z_scores, ymin = 0, ymax = z_density_null), fill = "red", alpha = 0.5) +
# Add vertical dashed lines for critical values
geom_vline(xintercept = c(-z_critical_two_tailed, z_critical_two_tailed), color = "black", linetype = "dashed") +
# Add annotations for rejection regions
annotate("text", x = -z_critical_two_tailed, y = 0.07, label = "Two-Tailed\nRejection Region", color = "black",
hjust = 1.2, size = 3) + # Move text up and decrease font size
annotate("text", x = z_critical_two_tailed, y = 0.07, label = "Two-Tailed\nRejection Region", color = "black",
hjust = -0.2, size = 3) +
# Labels and styling
labs(title = "P2: Two-Tailed, Martians < Earthlings", x = "Z-Score", y = "Density") +
theme_minimal()
# Define the critical value for one-tailed test
alpha <- 0.05 # significance level
z_critical_one_tailed <- qnorm(1 - alpha) # One-tailed critical value (z_critical = 1.645 for alpha = 0.05)
# Generate a sequence of Z-scores
z_scores <- seq(-4, 4, length.out = 1000)
# Compute the normal distribution density values for the null hypothesis (normal distribution)
z_density_null <- dnorm(z_scores)
# Compute the normal distribution density values for the alternative hypothesis (shifted to the right)
z_density_alt <- dnorm(z_scores, mean = 2) # Shift the distribution to the right
# Create a data frame for plotting
data_null <- data.frame(z_scores, z_density_null)
data_alt <- data.frame(z_scores, z_density_alt)
# Plot for one-tailed test with shifted distribution to the right
plot3 <- ggplot() +
# Plot the null hypothesis curve (standard normal distribution)
geom_line(data = data_null, aes(x = z_scores, y = z_density_null), color = "black") +
# Plot the alternative hypothesis curve (shifted normal distribution to the right)
geom_line(data = data_alt, aes(x = z_scores, y = z_density_alt), color = "blue") +
# Shade the rejection region for the one-tailed test (right tail)
geom_ribbon(data = subset(data_null, z_scores >= z_critical_one_tailed),
aes(x = z_scores, ymin = 0, ymax = z_density_null), fill = "blue", alpha = 0.5) + # Right tail
# Add vertical line for the one-tailed critical value
geom_vline(xintercept = z_critical_one_tailed, color = "black", linetype = "dashed") +
# Labels and annotations
annotate("text", x = z_critical_one_tailed, y = 0.07, label = "One-Tailed\nRejection Region", color = "black", hjust = -0.2, size = 3) +
labs(title = "P3: One-Tailed, Martians > Earthlings", x = "Z-Score", y = "Density") +
theme_minimal()
# Define the critical values for two-tailed test
alpha <- 0.05 # significance level
z_critical_two_tailed <- qnorm(1 - alpha / 2) # Two-tailed critical value (±1.96 for alpha = 0.05)
# Generate a sequence of Z-scores
z_scores <- seq(-4, 4, length.out = 1000)
# Compute the normal distribution density values for the null hypothesis (normal distribution)
z_density_null <- dnorm(z_scores)
# Compute the normal distribution density values for the alternative hypothesis (shifted to the right)
# Decrease the shift to increase the overlap
z_density_alt <- dnorm(z_scores, mean = 0.3) # Slightly shifted distribution
# Create a data frame for plotting
data_null <- data.frame(z_scores, z_density_null)
data_alt <- data.frame(z_scores, z_density_alt)
# Plot for more overlap between null and alternative distributions
plot4 <- ggplot() +
# Plot the null hypothesis curve (standard normal distribution)
geom_line(data = data_null, aes(x = z_scores, y = z_density_null), color = "black", alpha = 0.6) + # Slight transparency for overlap
# Plot the alternative hypothesis curve (shifted normal distribution)
geom_line(data = data_alt, aes(x = z_scores, y = z_density_alt), color = "blue", alpha = 0.6) + # Slight transparency for overlap
# Add vertical lines for critical values
geom_vline(xintercept = c(-z_critical_two_tailed, z_critical_two_tailed), color = "black", linetype = "dashed") +
# Add a label indicating no statistically significant difference
annotate("text", x = 0, y = 0.15, label = "Fail to Reject Null Hypothesis", color = "black", size = 4, fontface = "bold") + # Centered annotation
labs(title = "P4: Earthlings = Martians", x = "Z-Score", y = "Density") +
theme_minimal() +
theme(
plot.title = element_text(hjust = 0.5), # Center the title
axis.title.x = element_text(size = 12),
axis.title.y = element_text(size = 12)
)
# Create a list of plots
plots <- list(plot1, plot2, plot3, plot4)
# Use grid.arrange to lay the four plots out in a 2 x 2 grid
grid.arrange(grobs = plots, ncol = 2, nrow = 2, heights = c(1, 1), widths = c(.5, .5))
combined_plot <- grid.arrange(plot1, plot2, plot3, plot4, ncol = 2, nrow = 2)
# --- Save the combined plot as TIFF at 300 dpi ---
folder <- "R figures 2"
if(!dir.exists(folder)) dir.create(folder)
tiff_path <- file.path(folder, "figure5.4.tif")
# Use ggsave for high-res TIFF export
ggsave(
filename = tiff_path,
plot = combined_plot,
device = "tiff",
dpi = 300,
width = 10, # width in inches
height = 8 # height in inches
)
cat("Figure 5.4 saved as TIFF at:", tiff_path, "\n")
## Figure 5.4 saved as TIFF at: R figures 2/figure5.4.tif
library(summarytools)
library(dplyr)
library(ggplot2)
library(vcd)
# Select relevant columns from the dataset
chidata <- CPS85 %>%
dplyr::select(race, union)
# Generate contingency table
chitable <- table(chidata$race, chidata$union)
# Run chi-square test
chi_test <- chisq.test(chitable)
chi_test
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: chitable
## X-squared = 3.4442, df = 1, p-value = 0.06348
# Convert the expected counts matrix to a data frame
expected <- as.data.frame(chi_test$expected)
# Extract row names and make them a column
expected$race <- rownames(expected)
# Move 'race' column to the front (if desired)
expected <- expected %>%
dplyr::select(race, everything())
# Add 'type' column to indicate that these are expected counts
expected <- expected %>%
mutate(type = "Expected")
# Convert the contingency table to a data frame
chitable_df <- as.data.frame(chitable) %>%
  rename(race = Var1, union = Var2)
observed <- chitable_df %>%
  dplyr::select(race, union, Freq) %>%
  pivot_wider(names_from = union, values_from = Freq) %>%
  mutate(type = "Observed")
overalltable <- rbind(observed, expected)
print("contengency table")
## [1] "contengency table"
chitable
##
## Not Union
## NW 49 18
## W 389 78
print("Expected counts")
## [1] "Expected counts"
chi_test$expected
##
## Not Union
## NW 54.95506 12.04494
## W 383.04494 83.95506
overalltable
## # A tibble: 4 × 4
## race Not Union type
## * <fct> <dbl> <dbl> <chr>
## 1 NW 49 18 Observed
## 2 W 389 78 Observed
## 3 NW 55.0 12.0 Expected
## 4 W 383. 84.0 Expected
print("Chi-square test results:")
## [1] "Chi-square test results:"
print(chi_test)
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: chitable
## X-squared = 3.4442, df = 1, p-value = 0.06348
# Calculate Cramér's V or Phi (for 2x2)
print("Effect size metrics, this uses phi because it is 2x2")
## [1] "Effect size metrics, this uses phi because it is 2x2"
assocstats(chitable)
## X^2 df P(> X^2)
## Likelihood Ratio 3.7469 1 0.052906
## Pearson 4.1045 1 0.042770
##
## Phi-Coefficient : 0.088
## Contingency Coeff.: 0.087
## Cramer's V : 0.088
write.csv(overalltable, "chi.csv")
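# Manual phi coefficient (a sketch): phi = sqrt(X^2 / n), using the
# uncorrected Pearson chi-square; this reproduces the 0.088 that assocstats()
# reports above.
chi_uncorrected <- chisq.test(chitable, correct = FALSE)
sqrt(unname(chi_uncorrected$statistic) / sum(chitable))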
# Load necessary libraries
library(ggplot2)
library(dplyr)
library(scales)
# Create the folder "R figures 2" if it doesn't exist
folder <- "R figures 2"
if(!dir.exists(folder)) dir.create(folder)
# Prepare the data
chitable_plot <- as.data.frame(chitable) %>%
rename(Race = Var1, Union = Var2) %>%
group_by(Race) %>%
mutate(Proportion = Freq / sum(Freq))
# Create the ggplot
fig6_1 <- ggplot(chitable_plot, aes(x = Race, y = Freq, fill = Union)) +
geom_bar(stat = "identity", position = "dodge") +
geom_text(aes(label = percent(Proportion)),
position = position_dodge(width = 0.8),
vjust = -0.5, size = 3) +
geom_text(aes(label = Freq),
position = position_dodge(width = 0.8),
vjust = 1.5, size = 3, color = "white") +
labs(title = "Contingency Table: Race vs Union",
x = "Race",
y = "Count") +
scale_fill_brewer(palette = "Set1") +
theme_minimal()
# Save as TIFF (300 dpi)
tiff6_1 <- file.path(folder, "Figure6.1.tif")
ggsave(filename = tiff6_1, plot = fig6_1, device = "tiff", dpi = 300, width = 8, height = 6)
cat("Figure 6.1 saved as TIFF at:", tiff6_1, "\n")
## Figure 6.1 saved as TIFF at: R figures 2/Figure6.1.tif
fig6_1
# Load necessary libraries
library(ggplot2)
library(gridExtra)
# Set a seed so the simulated data are reproducible
set.seed(123)
# Simulated data for non-significant difference (Earthlings vs Martians)
nonsig_data <- data.frame(
  ACT_Score = c(rnorm(100, mean = 23, sd = 4), rnorm(100, mean = 22, sd = 4)),
  Group = rep(c("Earthlings", "Martians"), each = 100)
)
# Simulated data for significant difference (Earthlings vs Martians)
sig_data <- data.frame(
  HS_GPA = c(rnorm(100, mean = 3.5, sd = 0.3), rnorm(100, mean = 2.5, sd = 0.3)),
  Group = rep(c("Earthlings", "Martians"), each = 100)
)
# Boxplot non-significant difference (ACT scores)
p1 <- ggplot(nonsig_data, aes(x = Group, y = ACT_Score, fill = Group)) +
geom_boxplot(alpha = 0.6) +
ggtitle("Non-Significant Difference (Earthlings vs Martians)") +
ylab("ACT Score") +
xlab("Group") +
theme_minimal()
# Boxplot significant difference (HS GPA)
p2 <- ggplot(sig_data, aes(x = Group, y = HS_GPA, fill = Group)) +
geom_boxplot(alpha = 0.6) +
ggtitle("Significant Difference (Earthlings vs Martians)") +
ylab("HS GPA") +
xlab("Group") +
theme_minimal()
# Density plot non-significant difference (ACT scores)
p3 <- ggplot(nonsig_data, aes(x = ACT_Score, fill = Group)) +
geom_density(alpha = 0.6) +
ggtitle("Non-Significant Difference (ACT Scores - Earthlings vs Martians)") +
xlab("ACT Score") +
theme_minimal()
# Density plot significant difference (HS GPA)
p4 <- ggplot(sig_data, aes(x = HS_GPA, fill = Group)) +
geom_density(alpha = 0.6) +
ggtitle("Significant Difference (HS GPA - Earthlings vs Martians)") +
xlab("HS GPA") +
theme_minimal()
# Arrange boxplot and density plot for non-significant difference
g1 <- grid.arrange(p1, p3, ncol = 1)
# Arrange boxplot and density plot for significant difference
g2 <- grid.arrange(p2, p4, ncol = 1)
grid.arrange(g1, g2, ncol = 2)
final_plot <- grid.arrange(g1, g2, ncol = 2)
# --- Save as TIFF 300 dpi ---
folder <- "R figures 2"
if(!dir.exists(folder)) dir.create(folder)
tiff_path <- file.path(folder, "figure7.1.tif")
ggsave(
filename = tiff_path,
plot = final_plot,
device = "tiff",
dpi = 300,
width = 12, # adjust width as needed
height = 6 # adjust height as needed
)
cat("Figure 7.1 saved as TIFF at:", tiff_path, "\n")
## Figure 7.1 saved as TIFF at: R figures 2/figure7.1.tif
# Independent-samples t-test descriptive statistics for Table 7.1
descriptives_married <- psych::describeBy(CPS85$age, group = CPS85$married, mat = TRUE)
# Convert the matrix into a data frame
descriptives_married <- as.data.frame(descriptives_married)
write.csv(descriptives_married, "descriptives_married.csv")
descriptives_married
## item group1 vars n mean sd median trimmed mad min max
## X11 1 Married 1 350 39.20286 11.06721 37 38.67500 11.8608 20 64
## X12 2 Single 1 184 32.32609 11.65169 29 30.70946 10.3782 18 64
## range skew kurtosis se
## X11 44 0.4124848 -0.7844761 0.5915671
## X12 46 1.0927423 0.5295034 0.8589742
# Run normality tests on age by marital status using Shapiro-Wilk tests (plots follow)
CPS85$married <- as.factor(CPS85$married)
shapiro.test(CPS85$age[CPS85$married == "Married"])
##
## Shapiro-Wilk normality test
##
## data: CPS85$age[CPS85$married == "Married"]
## W = 0.96039, p-value = 3.999e-08
shapiro.test(CPS85$age[CPS85$married == "Single"])
##
## Shapiro-Wilk normality test
##
## data: CPS85$age[CPS85$married == "Single"]
## W = 0.89074, p-value = 2.348e-10
# Load necessary libraries
library(ggplot2)
library(dplyr)
# Create the folder "R figures 2" if it doesn't exist
folder <- "R figures 2"
if(!dir.exists(folder)) dir.create(folder)
# Ensure 'married' is a factor
CPS85$married <- as.factor(CPS85$married)
# Create the plot
fig7_2 <- ggplot(CPS85, aes(x = age, fill = married)) +
geom_histogram(aes(y = after_stat(density)), bins = 30, alpha = 0.5, position = "identity", color = "black") +
geom_density(alpha = 0.7) +
facet_wrap(~married) +
theme_minimal() +
labs(title = "Age Distribution by Marital Status",
x = "Age",
y = "Density") +
scale_fill_brewer(palette = "Set2")
# Save as TIFF (300 dpi)
tiff7_2 <- file.path(folder, "Figure7.2.tif")
ggsave(filename = tiff7_2, plot = fig7_2, device = "tiff", dpi = 300, width = 8, height = 6)
cat("Figure 7.2 saved as TIFF at:", tiff7_2, "\n")
## Figure 7.2 saved as TIFF at: R figures 2/Figure7.2.tif
fig7_2
# Levene's test of equality of variance
leveneTest(age ~ married, data = CPS85)
## Levene's Test for Homogeneity of Variance (center = median)
## Df F value Pr(>F)
## group 1 0.1425 0.706
## 532
# Run t-test in R assuming equality of variance
t_test_result <- t.test(age ~ married, data = CPS85, var.equal = TRUE)
t_test_result
##
## Two Sample t-test
##
## data: age by married
## t = 6.6999, df = 532, p-value = 5.323e-11
## alternative hypothesis: true difference in means between group Married and group Single is not equal to 0
## 95 percent confidence interval:
## 4.860477 8.893063
## sample estimates:
## mean in group Married mean in group Single
## 39.20286 32.32609
# Run t-test in R if variances are not equal
t_test_result2 <- t.test(age ~ married, data = CPS85, var.equal = FALSE)
t_test_result2
##
## Welch Two Sample t-test
##
## data: age by married
## t = 6.5934, df = 355.79, p-value = 1.555e-10
## alternative hypothesis: true difference in means between group Married and group Single is not equal to 0
## 95 percent confidence interval:
## 4.825607 8.927933
## sample estimates:
## mean in group Married mean in group Single
## 39.20286 32.32609
# Run Cohen's d
cohen_d_result <- CPS85 %>%
rstatix::cohens_d(age ~ married)
# View the result
cohen_d_result
## # A tibble: 1 × 7
## .y. group1 group2 effsize n1 n2 magnitude
## * <chr> <chr> <chr> <dbl> <int> <int> <ord>
## 1 age Married Single 0.605 350 184 moderate
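# Manual Cohen's d (a sketch). The classic pooled-SD formula is shown first;
# note that the 0.605 from rstatix::cohens_d() above matches the version that
# divides by the square root of the average of the two group variances.
m <- tapply(CPS85$age, CPS85$married, mean)
v <- tapply(CPS85$age, CPS85$married, var)
ns <- tapply(CPS85$age, CPS85$married, length)
sp <- sqrt(((ns[1] - 1) * v[1] + (ns[2] - 1) * v[2]) / (sum(ns) - 2)) # pooled SD
c(pooled = unname((m[1] - m[2]) / sp),
  average_variance = unname((m[1] - m[2]) / sqrt((v[1] + v[2]) / 2)))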
# Run Mann-Whitney U test (Wilcoxon rank-sum test)
mann_whitney_result <- wilcox.test(age ~ married, data = CPS85, exact = FALSE)
mann_whitney_result
##
## Wilcoxon rank sum test with continuity correction
##
## data: age by married
## W = 44398, p-value = 5.971e-13
## alternative hypothesis: true location shift is not equal to 0
# Calculating effect size for U
# Compute Mann-Whitney U statistic manually (SPSS-style)
compute_U <- function(group1, group2) {
  combined_data <- c(group1, group2)
  ranks <- rank(combined_data)
  rank_sum1 <- sum(ranks[1:length(group1)])
  rank_sum2 <- sum(ranks[(length(group1) + 1):length(combined_data)])
  U1 <- rank_sum1 - length(group1) * (length(group1) + 1) / 2
  U2 <- rank_sum2 - length(group2) * (length(group2) + 1) / 2
  return(min(U1, U2)) # Smaller U value matches SPSS
}
# Define groups correctly for Mann-Whitney U
group1 <- CPS85$age[CPS85$married == "Married"]
group2 <- CPS85$age[CPS85$married == "Single"]
# Compute U statistic manually
U_value <- compute_U(group1, group2)
print(paste("Mann-Whitney U (Custom Calculation):", U_value))
## [1] "Mann-Whitney U (Custom Calculation): 20002.5"
# Compute effect size (r = Z / sqrt(n))
# Get sample sizes
n1 <- length(group1)
n2 <- length(group2)
n <- n1 + n2
# Approximate Z from the Mann-Whitney U test
Z <- qnorm(1 - mann_whitney_result$p.value / 2) # Two-tailed assumption
# Compute effect size
effect_size_r <- abs(Z) / sqrt(n)
print(paste("Effect size (r):", round(effect_size_r, 3)))
## [1] "Effect size (r): 0.312"
library(ggplot2)
library(gridExtra)
library(psych)
set.seed(123)
# Simulated data for non-significant difference (Earthlings vs Venusians vs Martians)
nonsig_data <- data.frame(
ACT_Score = c(rnorm(100, mean = 23, sd = 4), rnorm(100, mean = 24, sd = 4), rnorm(100, mean = 22, sd = 4)),
Group = rep(c("Earthlings", "Venusians", "Martians"), each = 100)
)
# Simulated data for significant difference (Earthlings vs Martians vs Venusians)
sig_data <- data.frame(
HS_GPA = c(rnorm(100, mean = 3.5, sd = 0.3), rnorm(100, mean = 3.0, sd = 0.3), rnorm(100, mean = 2.5, sd = 0.3)),
Group = rep(c("Earthlings", "Venusians", "Martians"), each = 100)
)
# Boxplot non-significant difference (ACT scores)
p1 <- ggplot(nonsig_data, aes(x = Group, y = ACT_Score, fill = Group)) +
geom_boxplot(alpha = 0.6) +
ggtitle("Non-Sig. Dif. (ACT)") +
ylab("ACT Score") +
xlab("Group") +
theme_minimal()
# Boxplot significant difference (HS GPA)
p2 <- ggplot(sig_data, aes(x = Group, y = HS_GPA, fill = Group)) +
geom_boxplot(alpha = 0.6) +
ggtitle("Sig. Dif. (HS GPA)") +
ylab("HS GPA") +
xlab("Group") +
theme_minimal()
# Density plot non-significant difference (ACT scores)
p3 <- ggplot(nonsig_data, aes(x = ACT_Score, fill = Group)) +
geom_density(alpha = 0.6) +
ggtitle("Non-Sign. Dif. (ACT Scores)") +
xlab("ACT Score") +
theme_minimal()
# Density plot significant difference (HS GPA)
p4 <- ggplot(sig_data, aes(x = HS_GPA, fill = Group)) +
geom_density(alpha = 0.6) +
ggtitle("Sig. Dif. (HS GPA)") +
xlab("HS GPA") +
theme_minimal()
# Arrange boxplot and density plot for non-significant difference
g1 <- grid.arrange(p1, p3, ncol = 1)
# Arrange boxplot and density plot for significant difference
g2 <- grid.arrange(p2, p4, ncol = 1)
grid.arrange(g1, g2, ncol = 2)
final_plot <- grid.arrange(g1, g2, ncol = 2)
# --- Save as TIFF 300 dpi ---
folder <- "R figures 2"
if(!dir.exists(folder)) dir.create(folder)
tiff_path <- file.path(folder, "figure7.3.tif")
ggsave(
filename = tiff_path,
plot = final_plot,
device = "tiff",
dpi = 300,
width = 12, # adjust width as needed
height = 6 # adjust height as needed
)
cat("Figure 7.3 saved as TIFF at:", tiff_path, "\n")
## Figure 7.3 saved as TIFF at: R figures 2/figure7.3.tif
# Prepare the dataset
anovadata <- CPS85 %>%
  filter(sector %in% c("manag", "prof", "sales")) %>%
  dplyr::select(wage, sector)
anovadata$sector <- as.character(anovadata$sector)
# Get descriptive statistics by profession
descriptives_profession <- psych::describeBy(anovadata$wage, group = anovadata$sector, mat = TRUE)
# Convert the matrix into a data frame
descriptives_profession<- as.data.frame(descriptives_profession)
write.csv(descriptives_profession, "descriptives_profession.csv")
descriptives_profession
## item group1 vars n mean sd median trimmed mad min
## X11 1 manag 1 55 12.704000 7.572513 10.620 11.966222 6.493788 1.00
## X12 2 prof 1 105 11.947429 5.523833 10.610 11.360706 5.352186 4.35
## X13 3 sales 1 38 7.592632 4.232272 5.725 7.157813 2.891070 3.35
## max range skew kurtosis se
## X11 44.50 43.50 1.4730872 3.67335914 1.0210774
## X12 24.98 20.63 0.7727855 -0.36441619 0.5390709
## X13 19.98 16.63 0.9778591 0.01196841 0.6865651
# Shapiro-Wilk test for normality of the ANOVA residuals
aov_model <- aov(wage ~ sector, data = anovadata)
shapiro.test(residuals(aov_model))
##
## Shapiro-Wilk normality test
##
## data: residuals(aov_model)
## W = 0.91874, p-value = 5.393e-09
# QQ Plot for normality
qqnorm(residuals(aov_model))
qqline(residuals(aov_model), col = "red")
# Levene's test for homogeneity of variance
library(car)
leveneTest(wage ~ sector, data = anovadata)
## Levene's Test for Homogeneity of Variance (center = median)
## Df F value Pr(>F)
## group 2 3.2421 0.0412 *
## 195
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Bartlett's test of homogeneity of variance
bartlett.test(wage ~ sector, data = anovadata)
##
## Bartlett test of homogeneity of variances
##
## data: wage by sector
## Bartlett's K-squared = 15.317, df = 2, p-value = 0.000472
# Create the folder "R figures 2" if it doesn't exist
folder <- "R figures 2"
if(!dir.exists(folder)) dir.create(folder)
tiff7_4 <- file.path(folder, "Figure7.4.tif")
# Save Q-Q plot to TIFF
tiff(filename = tiff7_4, width = 6, height = 6, units = "in", res = 300)
qqnorm(residuals(aov_model), main = "Q-Q Plot of Residuals")
qqline(residuals(aov_model), col = "red", lwd = 2)
dev.off()
## png
## 2
cat("Figure 7.4 saved as TIFF at:", tiff7_4, "\n")
## Figure 7.4 saved as TIFF at: R figures 2/Figure7.4.tif
# Be sure to load the library
library(apaTables)
library(dplyr)
library(tidyverse)
library(rstatix)
# Run ANOVA
anova_model <- aov(wage~sector, data = anovadata)
# Export the APA table to your folder
apa.aov.table(anova_model, filename = "anova_results.doc")
##
##
## ANOVA results using wage as the dependent variable
##
##
## Predictor SS df MS F p partial_eta2 CI_90_partial_eta2
## (Intercept) 8876.54 1 8876.54 249.68 .000
## sector 674.63 2 337.31 9.49 .000 .09 [.03, .15]
## Error 6932.59 195 35.55
##
## Note: Values in square brackets indicate the bounds of the 90% confidence interval for partial eta-squared
summary(anova_model)
## Df Sum Sq Mean Sq F value Pr(>F)
## sector 2 675 337.3 9.488 0.000117 ***
## Residuals 195 6933 35.6
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Run post-hoc tests
# Tukey if Equal Variances are Assumed
TukeyHSD(anova_model)
## Tukey multiple comparisons of means
## 95% family-wise confidence level
##
## Fit: aov(formula = wage ~ sector, data = anovadata)
##
## $sector
## diff lwr upr p adj
## prof-manag -0.7565714 -3.100521 1.587379 0.7265306
## sales-manag -5.1113684 -8.081890 -2.140847 0.0002059
## sales-prof -4.3547970 -7.020710 -1.688884 0.0004545
# Games-Howell if equal variances are not assumed
anovadata %>%
  games_howell_test(wage ~ sector)
## # A tibble: 3 × 8
## .y. group1 group2 estimate conf.low conf.high p.adj p.adj.signif
## * <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <chr>
## 1 wage manag prof -0.757 -3.51 2.00 0.79 ns
## 2 wage manag sales -5.11 -8.04 -2.18 0.000222 ***
## 3 wage prof sales -4.35 -6.44 -2.27 0.00000942 ****
# Run Cohen's d effect sizes for the significant pairwise contrasts
# Subset the data for each comparison
cohen_data_sales_manag <- anovadata %>%
  filter(sector %in% c("sales", "manag"))
cohen_data_sales_prof <- anovadata %>%
  filter(sector %in% c("sales", "prof"))
# Run cohen's d on subset data
rstatix::cohens_d(cohen_data_sales_manag, wage ~ sector)
## # A tibble: 1 × 7
## .y. group1 group2 effsize n1 n2 magnitude
## * <chr> <chr> <chr> <dbl> <int> <int> <ord>
## 1 wage manag sales 0.833 55 38 large
rstatix::cohens_d(cohen_data_sales_prof, wage ~ sector)
## # A tibble: 1 × 7
## .y. group1 group2 effsize n1 n2 magnitude
## * <chr> <chr> <chr> <dbl> <int> <int> <ord>
## 1 wage prof sales 0.885 105 38 large
library(dunn.test)
# Run the omnibus KW test
kwtest<-kruskal.test(wage ~ sector, data = anovadata)
print(kwtest)
##
## Kruskal-Wallis rank sum test
##
## data: wage by sector
## Kruskal-Wallis chi-squared = 22.533, df = 2, p-value = 1.28e-05
# Create ranks and summarize by group
wage_ranks <- anovadata %>%
  mutate(Rank = rank(wage)) %>%
  group_by(sector) %>%
  summarise(N = n(), Mean_Rank = mean(Rank), Sum_Rank = sum(Rank))
wage_ranks
## # A tibble: 3 × 4
## sector N Mean_Rank Sum_Rank
## <chr> <int> <dbl> <dbl>
## 1 manag 55 110. 6032.
## 2 prof 105 109. 11395
## 3 sales 38 59.9 2274.
# Run the post-hoc Dunn test
dt <- dunn.test(anovadata$wage, anovadata$sector, kw = TRUE, label = TRUE)
## Kruskal-Wallis rank sum test
##
## data: x and group
## Kruskal-Wallis chi-squared = 22.5328, df = 2, p-value = 0
##
##
## Comparison of x by group
## (No adjustment)
## Col Mean-|
## Row Mean | manag prof
## ---------+----------------------
## prof | 0.119529
## | 0.4524
## |
## sales | 4.121490 4.487320
## | 0.0000* 0.0000*
##
## alpha = 0.05
## Reject Ho if p <= alpha/2
dt
## $chi2
## [1] 22.53284
##
## $Z
## [1] 0.1195295 4.1214907 4.4873207
##
## $P
## [1] 4.524279e-01 1.882143e-05 3.606224e-06
##
## $P.adjusted
## [1] 4.524279e-01 1.882143e-05 3.606224e-06
##
## $comparisons
## [1] "manag - prof" "manag - sales" "prof - sales"
# Run eta-squared for omnibus effect size
H <- kwtest$statistic
k <- length(unique(anovadata$sector))
N <- nrow(anovadata)
eta_squared <- (H - k + 1) / (N - k)
# Rename the output explicitly
names(eta_squared) <- "eta_squared"
eta_squared
## eta_squared
## 0.1052966
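# Cross-check (a sketch): rstatix::kruskal_effsize() reports the same H-based
# eta-squared, (H - k + 1) / (N - k).
anovadata %>%
  rstatix::kruskal_effsize(wage ~ sector)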
# Effect size r (r = Z / sqrt(n)) for the pairwise tests
# Z-values from Dunn's test results
z_manag_sales <- 4.1215
z_prof_sales <- 4.4873
# As a simplification, the total sample size is used for every comparison
n <- nrow(anovadata) # Total number of observations
# Effect size r formula
r_manag_sales <- z_manag_sales / sqrt(n)
r_prof_sales <- z_prof_sales / sqrt(n)
# Print effect sizes
r_manag_sales
## [1] 0.2929023
r_prof_sales
## [1] 0.3188985
# --- Paired-samples t-test example. The generating code chunk (simulated
# Pretest/Posttest scores for 20 students, a two-panel figure, and
# psych::describe() calls) is not shown; its recoverable output follows. ---
##
## Paired t-test
##
## data: data$Pretest and data$Posttest
## t = -21.456, df = 19, p-value = 8.846e-15
## alternative hypothesis: true mean difference is not equal to 0
## 95 percent confidence interval:
## -15.1462 -12.4538
## sample estimates:
## mean difference
## -13.8
# Descriptive statistics for the Pretest scores:
## vars n mean sd median trimmed mad min max range skew kurtosis se
## X1 1 20 73.15 7.73 73 73.06 8.15 59 87 28 0.02 -0.79 1.73
# Descriptive statistics for the Posttest scores:
## vars n mean sd median trimmed mad min max range skew kurtosis se
## X1 1 20 86.95 7.85 86.5 87.25 8.9 70 98 28 -0.23 -0.95 1.76
# Descriptive statistics for the Pretest-Posttest difference:
## vars n mean sd median trimmed mad min max range skew kurtosis se
## X1 1 20 13.8 2.88 14 13.62 2.22 9 20 11 0.37 -0.57 0.64
# Create a long-format data frame: 5 students each measured in 3 conditions
data <- data.frame(
  Student = factor(rep(1:5, each = 3)), # 5 students, 3 conditions each
  Condition = factor(rep(c("Pre", "Post", "Post-Post"), times = 5)),
  Score = c(80, 75, 67, 90, 85, 85, 78, 72, 74, 85, 80, 78, 88, 83, 91)
)
# 1. Run repeated measures ANOVA with ezANOVA (handling assumptions like sphericity)
library(ez)
anova_result <- ezANOVA(data = data,
                        dv = .(Score),
                        wid = .(Student),
                        within = .(Condition),
                        detailed = TRUE)
anova_result
## $ANOVA
## Effect DFn DFd SSn SSd F p p<.05
## 1 (Intercept) 1 4 97768.06667 482.93333 809.785202 9.074958e-06 *
## 2 Condition 2 8 90.13333 89.86667 4.011869 6.213045e-02
## ges
## 1 0.9941754
## 2 0.1359614
##
## $`Mauchly's Test for Sphericity`
## Effect W p p<.05
## 2 Condition 0.01730226 0.002275905 *
##
## $`Sphericity Corrections`
## Effect GGe p[GG] p[GG]<.05 HFe p[HF] p[HF]<.05
## 2 Condition 0.5043633 0.1150838 0.5087521 0.1144435
# 2. Normality Test for Each Condition
library(dplyr)
data %>%
group_by(Condition) %>%
summarise(shapiro_p = shapiro.test(Score)$p.value)
## # A tibble: 3 × 2
## Condition shapiro_p
## <fct> <dbl>
## 1 Post 0.716
## 2 Post-Post 0.977
## 3 Pre 0.641
# 3. Mauchly's Test of Sphericity
# Using ezANOVA, sphericity testing is automatically handled and included in the output.
# If needed, you can manually extract the sphericity correction from the results.
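# A minimal sketch of that manual extraction: pull the Greenhouse-Geisser
# epsilon and apply it to the Condition degrees of freedom
gge <- anova_result$`Sphericity Corrections`$GGe
df1_corr <- anova_result$ANOVA$DFn[2] * gge # corrected numerator df
df2_corr <- anova_result$ANOVA$DFd[2] * gge # corrected denominator df
c(GGe = gge, df1 = df1_corr, df2 = df2_corr)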
# 4. Post-hoc pairwise comparisons if the ANOVA is significant
# Row 2 of the ANOVA table is the Condition effect (row 1 is the Intercept)
if (anova_result$ANOVA$p[2] < 0.05) {
pairwise_result <- pairwise.t.test(data$Score, data$Condition, paired = TRUE)
print(pairwise_result)
}
# Note: Condition has p = .062 here, so the guarded block is skipped; the
# pairwise table below is retained for illustration
##
## Pairwise comparisons using paired t tests
##
## data: data$Score and data$Condition
##
## Post Post-Post
## Post-Post 1.00 -
## Pre 3.9e-05 0.23
##
## P value adjustment method: holm
library(datasets)
library(psych)
library(effectsize)
library(tidyverse)
# ChickWeight is a built-in data set.
# Renaming it is not necessary, but doing so emphasizes its original form
chick<- ChickWeight
# Filter to time points 4 and 21, then pivot to wide format for a paired comparison
chickdata <- chick %>%
filter(Time %in% c(4, 21)) %>% # Select only two time points for comparison
pivot_wider(names_from = Time, values_from = weight, names_prefix = "Day_") %>%
drop_na()%>%
mutate(Difference = Day_21 - Day_4)
# Compute descriptive statistics
desc_day_4 <- psych::describe(chickdata$Day_4)
desc_day_21 <- psych::describe(chickdata$Day_21)
desc_difference <- psych::describe(chickdata$Difference)
as.data.frame(desc_day_4)
## vars n mean sd median trimmed mad min max range skew
## X1 1 45 60.15556 4.279738 61 60.32432 4.4478 48 69 21 -0.4278585
## kurtosis se
## X1 0.06485024 0.6379857
as.data.frame(desc_day_21)
## vars n mean sd median trimmed mad min max range skew
## X1 1 45 218.6889 71.51027 205 218.6216 75.6126 74 373 299 0.08910638
## kurtosis se
## X1 -0.7680967 10.66012
as.data.frame(desc_difference)
## vars n mean sd median trimmed mad min max range skew
## X1 1 45 158.5333 69.83865 149 158.2973 71.1648 16 309 293 0.10106
## kurtosis se
## X1 -0.7835153 10.41093
desc_day_4$Variable <- "Day_4"
desc_day_21$Variable <- "Day_21"
desc_difference$Variable <- "Difference"
# Combine all into one dataset
desc_combined <- rbind(desc_day_4, desc_day_21, desc_difference)%>%
dplyr::select(Variable, mean, median, sd, skew, kurtosis)
write.csv(desc_combined, "dependent_desc.csv")
# We now want to check the assumption of normality
# Plot a histogram of the Difference variable
hist(chickdata$Difference, main = "Histogram of Difference",
xlab = "Difference (Day_21 - Day_4)", col = "lightblue",
border = "black")
# Q-Q plot to check for normality
qqnorm(chickdata$Difference)
qqline(chickdata$Difference, col = "red")
# Create the folder if it doesn't exist
folder <- "R figures 2"
if (!dir.exists(folder)) dir.create(folder)
# Figure 8.2 – Histogram of Difference
tiff8_2 <- file.path(folder, "Figure8.2.tif")
tiff(filename = tiff8_2, width = 6, height = 4, units = "in", res = 300)
hist(chickdata$Difference,
main = "Histogram of Difference",
xlab = "Difference (Day_21 - Day_4)",
col = "lightblue",
border = "black")
dev.off()
## png
## 2
cat("Figure 8.2 saved as TIFF at:", tiff8_2, "\n")
## Figure 8.2 saved as TIFF at: R figures 2/Figure8.2.tif
# Also show histogram in console
hist(chickdata$Difference,
main = "Histogram of Difference",
xlab = "Difference (Day_21 - Day_4)",
col = "lightblue",
border = "black")
# Shapiro-Wilk test for normality
shapiro_test <- shapiro.test(chickdata$Difference)
shapiro_test
##
## Shapiro-Wilk normality test
##
## data: chickdata$Difference
## W = 0.98718, p-value = 0.8939
# Paired t-test
paired_result <- t.test(chickdata$Day_4, chickdata$Day_21, paired = TRUE)
paired_result
##
## Paired t-test
##
## data: chickdata$Day_4 and chickdata$Day_21
## t = -15.228, df = 44, p-value < 2.2e-16
## alternative hypothesis: true mean difference is not equal to 0
## 95 percent confidence interval:
## -179.5152 -137.5515
## sample estimates:
## mean difference
## -158.5333
# Calculate Cohen's d manually
mean_dif <- mean(chickdata$Difference)
sd_dif <- sd(chickdata$Difference)
cohen_d <- mean_dif / sd_dif
cohen_d
## [1] 2.269994
# Calculate Cohen's d using the effectsize package
chickdata_long <- chickdata %>%
dplyr::select(Day_4, Day_21) %>%
tidyr::pivot_longer(cols = c(Day_4, Day_21),
names_to = "Day",
values_to = "Weight")
chickdata_long$Day <- as.factor(chickdata_long$Day)
cohen_d_result <- effectsize::cohens_d(chickdata_long$Weight, chickdata_long$Day, paired = TRUE)
cohen_d_result
## Cohen's d | 95% CI
## ------------------------
## 2.27 | [1.71, 2.82]
# Figure 8.3 – Q-Q Plot of Differences
tiff8_3 <- file.path(folder, "Figure8.3.tif")
tiff(filename = tiff8_3, width = 6, height = 6, units = "in", res = 300)
qqnorm(chickdata$Difference, main = "Q-Q Plot of Differences")
qqline(chickdata$Difference, col = "red", lwd = 2)
dev.off()
## png
## 2
cat("Figure 8.3 saved as TIFF at:", tiff8_3, "\n")
## Figure 8.3 saved as TIFF at: R figures 2/Figure8.3.tif
# Show Q-Q plot in R plot window
qqnorm(chickdata$Difference, main = "Q-Q Plot of Differences")
qqline(chickdata$Difference, col = "red", lwd = 2)
# Compute Cohen's d again, this time with the effsize package
library(effsize)
cohen_d2 <- cohen.d(chickdata$Day_21, chickdata$Day_4, paired = TRUE)
cohen_d2
##
## Cohen's d
##
## d estimate: 2.45338 (large)
## 95 percent confidence interval:
## lower upper
## 1.812256 3.094504
# Load necessary libraries
library(tidyr)
library(rstatix)
library(dplyr)
# Filter for Day 4 and Day 21
chick_subset <- ChickWeight[ChickWeight$Time %in% c(4, 21), ]
# Reshape data to wide format
chick_wide <- chick_subset %>%
pivot_wider(names_from = Time, values_from = weight) %>%
na.omit() %>%
dplyr::select('4', '21')
# Rename columns for clarity
colnames(chick_wide) <- c("Four", "TwentyOne")
# Perform Wilcoxon Signed-Rank Test
test_result <- wilcox.test(chick_wide$Four, chick_wide$TwentyOne, paired = TRUE)
# Print the Wilcoxon Signed-Rank Test result
print(test_result)
##
## Wilcoxon signed rank test with continuity correction
##
## data: chick_wide$Four and chick_wide$TwentyOne
## V = 0, p-value = 5.355e-09
## alternative hypothesis: true location shift is not equal to 0
# Extract necessary values
Z <- qnorm(test_result$p.value / 2) # Back out Z from the two-tailed p-value (lower-tail quantile)
n <- nrow(chick_wide) # Number of pairs
# Calculate effect size r
effect_size_r <- abs(Z) / sqrt(n)
# Print effect size
print(paste("Z:", round(Z, 3)))
## [1] "Z: -5.836"
print(paste("Effect size (r):", round(effect_size_r, 3)))
## [1] "Effect size (r): 0.87"
library(car)
library(tidyverse)
library(dplyr)
library(psych)
library(gridExtra)
library(rstatix)
library(nlme)
library(dplyr)
library(purrr)
chickdata3 <- ChickWeight %>%
filter(Time %in% c(4, 12, 21)) %>% # Select three time points for comparison
pivot_wider(names_from = Time, values_from = weight, names_prefix = "Day_") %>%
drop_na()%>%
dplyr::select(Chick, Day_4, Day_12, Day_21)%>%
mutate(Difference21_4 = Day_21 - Day_4)%>%
mutate(Difference12_4 = Day_12- Day_4)%>%
mutate(Difference21_12 = Day_21- Day_12)
# Compute descriptive statistics
desc_day_4 <- psych::describe(chickdata3$Day_4)
desc_day_12 <- psych::describe(chickdata3$Day_12)
desc_day_21 <- psych::describe(chickdata3$Day_21)
desc_difference21_4 <- psych::describe(chickdata3$Difference21_4)
desc_difference12_4 <- psych::describe(chickdata3$Difference12_4)
desc_difference21_12 <- psych::describe(chickdata3$Difference21_12)
as.data.frame(desc_day_4)
## vars n mean sd median trimmed mad min max range skew
## X1 1 45 60.15556 4.279738 61 60.32432 4.4478 48 69 21 -0.4278585
## kurtosis se
## X1 0.06485024 0.6379857
as.data.frame(desc_day_12)
## vars n mean sd median trimmed mad min max range skew
## X1 1 45 132.7778 32.12397 136 132.5676 28.1694 70 217 147 0.1011454
## kurtosis se
## X1 -0.02882665 4.788759
as.data.frame(desc_day_21)
## vars n mean sd median trimmed mad min max range skew
## X1 1 45 218.6889 71.51027 205 218.6216 75.6126 74 373 299 0.08910638
## kurtosis se
## X1 -0.7680967 10.66012
as.data.frame(desc_difference21_4)
## vars n mean sd median trimmed mad min max range skew
## X1 1 45 158.5333 69.83865 149 158.2973 71.1648 16 309 293 0.10106
## kurtosis se
## X1 -0.7835153 10.41093
as.data.frame(desc_difference12_4)
## vars n mean sd median trimmed mad min max range skew
## X1 1 45 72.62222 29.65505 75 72.32432 25.2042 12 155 143 0.1679009
## kurtosis se
## X1 0.203552 4.420714
as.data.frame(desc_difference21_12)
## vars n mean sd median trimmed mad min max range skew
## X1 1 45 85.91111 51.30161 85 85 53.3736 4 177 173 0.192361
## kurtosis se
## X1 -1.034653 7.647593
desc_day_4$Variable <- "Day_4"
desc_day_12$Variable<- "Day_12"
desc_day_21$Variable <- "Day_21"
desc_difference21_4$Variable <- "Difference_21_4"
desc_difference12_4$Variable<- "Difference_12_4"
desc_difference21_12$Variable<- "Difference_21_12"
# Combine all into one dataset
desc_combined <- rbind(desc_day_4, desc_day_12, desc_day_21, desc_difference21_4, desc_difference12_4, desc_difference21_12)%>%
mutate(upper = mean+(se*2))%>%
mutate(lower = mean-(se*2))%>%
dplyr::select(Variable, mean, upper, lower, se, sd, skew, kurtosis)
write.csv(desc_combined, "repeated_desc.csv")
# We now want to check the assumption of normality
# Plot histograms of the three difference variables
hist_21_4 <- ggplot(chickdata3, aes(x = Difference21_4)) +
geom_histogram(fill = "lightblue", color = "black", bins = 10) +
ggtitle("Histogram of Difference 21 and 4") +
xlab("Difference (Day_21 - Day_4)") +
theme_minimal()
hist_21_12 <- ggplot(chickdata3, aes(x = Difference21_12)) +
geom_histogram(fill = "lightblue", color = "black", bins = 10) +
ggtitle("Histogram of Difference 21 and 12") +
xlab("Difference (Day_21 - Day_12)") +
theme_minimal()
hist_12_4 <- ggplot(chickdata3, aes(x = Difference12_4)) +
geom_histogram(fill = "lightblue", color = "black", bins = 10) +
ggtitle("Histogram of Difference 12 and 4") +
xlab("Difference (Day_12 - Day_4)") +
theme_minimal()
# Create Q-Q plots
qq_21_4 <- ggplot(chickdata3, aes(sample = Difference21_4)) +
stat_qq() + stat_qq_line(color = "red") +
ggtitle("Q-Q Plot: Difference Day 21-4") +
theme_minimal()
qq_21_12 <- ggplot(chickdata3, aes(sample = Difference21_12)) +
stat_qq() + stat_qq_line(color = "red") +
ggtitle("Q-Q Plot: Difference Day 21-12") +
theme_minimal()
qq_12_4 <- ggplot(chickdata3, aes(sample = Difference12_4)) +
stat_qq() + stat_qq_line(color = "red") +
ggtitle("Q-Q Plot: Difference Day 12-4") +
theme_minimal()
# Arrange plots in a grid
grid.arrange(hist_21_4, qq_21_4,
hist_21_12, qq_21_12,
hist_12_4, qq_12_4,
ncol = 2)
library(gridExtra)
# Create folder if needed
folder <- "R figures 2"
if (!dir.exists(folder)) dir.create(folder)
# Arrange plots in a grid
combined_8_4 <- grid.arrange(hist_21_4, qq_21_4,
hist_21_12, qq_21_12,
hist_12_4, qq_12_4,
ncol = 2)
# Save TIFF at 300 DPI
tiff(file.path(folder, "Figure8.4.tif"), width = 8, height = 10, units = "in", res = 300)
grid.arrange(hist_21_4, qq_21_4,
hist_21_12, qq_21_12,
hist_12_4, qq_12_4,
ncol = 2)
dev.off()
## png
## 2
# Show in console
combined_8_4
## TableGrob (3 x 2) "arrange": 6 grobs
## z cells name grob
## 1 1 (1-1,1-1) arrange gtable[layout]
## 2 2 (1-1,2-2) arrange gtable[layout]
## 3 3 (2-2,1-1) arrange gtable[layout]
## 4 4 (2-2,2-2) arrange gtable[layout]
## 5 5 (3-3,1-1) arrange gtable[layout]
## 6 6 (3-3,2-2) arrange gtable[layout]
# Shapiro-Wilk test for normality on Difference21_4
shapiro_test21_4 <- shapiro.test(chickdata3$Difference21_4)
shapiro_test21_4
##
## Shapiro-Wilk normality test
##
## data: chickdata3$Difference21_4
## W = 0.98718, p-value = 0.8939
# Shapiro-Wilk test for normality on Difference21_12
shapiro_test21_12 <- shapiro.test(chickdata3$Difference21_12)
shapiro_test21_12
##
## Shapiro-Wilk normality test
##
## data: chickdata3$Difference21_12
## W = 0.95597, p-value = 0.0857
# Shapiro-Wilk test for normality on Difference12_4
shapiro_test12_4 <- shapiro.test(chickdata3$Difference12_4)
shapiro_test12_4
##
## Shapiro-Wilk normality test
##
## data: chickdata3$Difference12_4
## W = 0.97458, p-value = 0.4195
# Convert to long format, keeping the Chick ID as the within-subjects identifier
chickdata3_long <- chickdata3 %>%
dplyr::select(Chick, Day_4, Day_12, Day_21) %>%
pivot_longer(cols = c(Day_4, Day_12, Day_21),
names_to = "Time",
values_to = "Weight") %>%
mutate(Time = factor(Time, levels = c("Day_4", "Day_12", "Day_21")))
# The Greenhouse-Geisser correction is applied automatically by anova_test();
# "ges" in the output is the generalized eta squared effect size
anova_lm <- anova_test(dv = Weight, wid = Chick, within = Time, data = chickdata3_long)
anova_lm
## ANOVA Table (type III tests)
##
## $ANOVA
## Effect DFn DFd F p p<.05 ges
## 1 Time 2 88 202.706 1.14e-33 * 0.676
##
## $`Mauchly's Test for Sphericity`
## Effect W p p<.05
## 1 Time 0.315 1.65e-11 *
##
## $`Sphericity Corrections`
## Effect GGe DF[GG] p[GG] p[GG]<.05 HFe DF[HF] p[HF] p[HF]<.05
## 1 Time 0.594 1.19, 52.23 5.31e-21 * 0.6 1.2, 52.84 3.22e-21 *
get_anova_table(anova_lm)
## ANOVA Table (type III tests)
##
## Effect DFn DFd F p p<.05 ges
## 1 Time 1.19 52.23 202.706 5.31e-21 * 0.676
# Boxplot of Weight at each time point
figure8_5 <- ggplot(chickdata3_long, aes(x = Time, y = Weight, fill = Time)) +
geom_boxplot() +
theme_minimal() +
labs(title = "Boxplot of Weight by Time",
x = "Time",
y = "Weight")
# Save at 300 DPI
tiff(file.path(folder, "Figure8.5.tif"), width = 6, height = 4, units = "in", res = 300)
print(figure8_5)
dev.off()
## png
## 2
# Show in console
figure8_5
# Post-hoc pairwise t-tests with a Bonferroni correction
pwc <- chickdata3_long %>%
pairwise_t_test(Weight ~ Time, paired = TRUE, p.adjust.method = "bonferroni")
pwc
## # A tibble: 3 × 10
## .y. group1 group2 n1 n2 statistic df p p.adj p.adj.signif
## * <chr> <chr> <chr> <int> <int> <dbl> <dbl> <dbl> <dbl> <chr>
## 1 Weig… Day_4 Day_12 45 45 -16.4 44 2.17e-20 6.51e-20 ****
## 2 Weig… Day_4 Day_21 45 45 -15.2 44 3.75e-19 1.12e-18 ****
## 3 Weig… Day_12 Day_21 45 45 -11.2 44 1.64e-14 4.92e-14 ****
# Cohen's d effect sizes for each pairwise comparison, collected into a table
# Note: filter with %in%, not ==; == against a length-2 vector is recycled
# row by row and silently drops observations
d_4_12 <- chickdata3_long %>%
filter(Time %in% c("Day_4", "Day_12")) %>%
mutate(Time = as.character(Time)) %>%
rstatix::cohens_d(Weight ~ Time) %>%
mutate(Model = "Day 4 to Day 12")
d_4_21 <- chickdata3_long %>%
filter(Time %in% c("Day_4", "Day_21")) %>%
mutate(Time = as.character(Time)) %>%
rstatix::cohens_d(Weight ~ Time) %>%
mutate(Model = "Day 4 to Day 21")
d_12_21 <- chickdata3_long %>%
filter(Time %in% c("Day_12", "Day_21")) %>%
mutate(Time = as.character(Time)) %>%
rstatix::cohens_d(Weight ~ Time) %>%
mutate(Model = "Day 12 to Day 21")
effect_sizes <- rbind(d_4_12, d_4_21, d_12_21)
effect_sizes
## # A tibble: 3 × 8
## .y. group1 group2 effsize n1 n2 magnitude Model
## <chr> <chr> <chr> <dbl> <int> <int> <ord> <chr>
## 1 Weight Day_12 Day_4 3.36 23 23 large Day 4 to Day 12
## 2 Weight Day_21 Day_4 2.63 22 23 large Day 4 to Day 12
## 3 Weight Day_12 Day_21 -1.55 45 45 large Day 12 to Day 21
write.csv(effect_sizes, "repeated_pair_cohen.csv")
library(dplyr)
library(stats)
library(tidyverse)
library(ggpubr)
library(rstatix)
library(purrr)
# Set up the dataset
chickdata4 <- ChickWeight %>%
filter(Time %in% c(4, 12, 21)) %>%
dplyr::select(Chick, Time, weight) %>%
group_by(Chick) %>%
filter(n() == 3) %>%
ungroup() %>%
mutate(
Time = as.factor(Time))
chickdata4$Chick <- factor(chickdata4$Chick, ordered = FALSE)
# Run the omnibus test
friedtest<- friedman_test(chickdata4, weight ~ Time | Chick)
friedtest
## # A tibble: 1 × 6
## .y. n statistic df p method
## * <chr> <int> <dbl> <dbl> <dbl> <chr>
## 1 weight 45 90 2 2.86e-20 Friedman test
# Get the omnibus effect size
friedman_effsize_result <- friedman_effsize(chickdata4, weight ~ Time | Chick)
friedman_effsize_result
## # A tibble: 1 × 5
## .y. n effsize method magnitude
## * <chr> <int> <dbl> <chr> <ord>
## 1 weight 45 1 Kendall W large
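# Optional cross-check: Kendall's W recovered by hand from the Friedman
# chi-square, W = chi^2 / (n * (k - 1)); here 90 / (45 * 2) = 1
friedtest$statistic / (friedtest$n * friedtest$df)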
# Post-hoc Wilcoxon signed-rank tests
chickdata_wide <- chickdata4 %>%
pivot_wider(names_from = Time, values_from = weight, names_prefix = "Day")
library(dplyr)
library(purrr)
library(tibble)
# Define pairwise comparisons
comparisons <- list(
c("Day4", "Day12"),
c("Day4", "Day21"),
c("Day12", "Day21")
)
# Function to perform Wilcoxon test and compute statistics
perform_test <- function(pair) {
test <- wilcox.test(chickdata_wide[[pair[1]]], chickdata_wide[[pair[2]]], paired = TRUE)
# Extract the test statistic (V)
V_stat <- test$statistic
# Sample size (n) for effect size and Z-statistic calculations
n <- sum(!is.na(chickdata_wide[[pair[1]]]) & !is.na(chickdata_wide[[pair[2]]])) # Correct for NAs
# Expected value of V
mu_V <- n * (n + 1) / 4
# Standard deviation of V
sigma_V <- sqrt((n * (n + 1) * (2 * n + 1)) / 24)
# Compute the Z-statistic from V (normal approximation, no continuity correction)
Zstat <- (V_stat - mu_V) / sigma_V
# V = 0 in every comparison here (weights only increase), so the raw Z is
# negative; flip the sign so Z reports the magnitude of the shift
if (V_stat < mu_V) {
Zstat <- -Zstat
}
# Compute the effect size r from the absolute Z
r_effect_size <- abs(Zstat) / sqrt(n)
# Return a tibble with results
tibble(
Group1 = pair[1],
Group2 = pair[2],
n = n,
V = V_stat,
Z = Zstat, # Sign flipped above, so Z is reported as a magnitude
p = test$p.value,
r = r_effect_size
)
}
# Apply function to all comparisons and store results in a table
wilcoxon_results <- purrr::map_dfr(comparisons, perform_test)
# Print results
print(wilcoxon_results)
## # A tibble: 3 × 7
## Group1 Group2 n V Z p r
## <chr> <chr> <int> <dbl> <dbl> <dbl> <dbl>
## 1 Day4 Day12 45 0 5.84 0.00000000534 0.871
## 2 Day4 Day21 45 0 5.84 0.00000000535 0.871
## 3 Day12 Day21 45 0 5.84 0.00000000535 0.871
# Save results to CSV
write.csv(wilcoxon_results, "wilcoxon_repeated.csv")
# Load necessary library
library(ggplot2)
# Sample data for height and shoe size
# (the first pair, height 128 / size 0, anchors the data at x = 0 so the
# intercept discussion below has a visible reference point)
height <- c(128, 170, 175, 180, 165, 160, 185, 178, 182, 173, 168,
180, 169, 177, 165, 174, 186, 178, 165, 172, 180)
shoe_size <- c(0, 9, 10, 11, 8, 7, 12, 10, 11, 9, 8,
11, 8, 10, 8, 9, 12, 11, 8, 9, 10)
# Create a data frame
height_shoe_size <- data.frame(Height = height, Shoe_Size = shoe_size)
# Fit the linear model
model <- lm(Height ~ Shoe_Size, data = height_shoe_size)
# Get the model's coefficients (intercept and slope)
intercept <- coef(model)[1]
slope <- coef(model)[2]
# Define file path
file_path <- file.path(folder, "Figure9.1.tif")
# The fitted value at x = 0 is, by definition, the model's intercept
y_intercept_at_x_0 <- intercept
cat("The y-axis intercept when x = 0 is:", y_intercept_at_x_0, "cm\n")
## The y-axis intercept when x = 0 is: 128.1263 cm
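# Equivalent check via predict(): the fitted height at Shoe_Size = 0 is, by
# construction, the model's intercept
predict(model, newdata = data.frame(Shoe_Size = 0))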
# Check whether the point (Height = 160, Shoe_Size = 70) exists in the data
# (it does not, so the highlight layer in the plot below matches no rows)
subset(height_shoe_size, Height == 160 & Shoe_Size == 70)
## [1] Height Shoe_Size
## <0 rows> (or 0-length row.names)
# Create the scatterplot with the regression line, a (no-op) highlight layer
# for the point checked above, and extra x-axis ticks
p1<- ggplot(height_shoe_size, aes(x = Shoe_Size, y = Height)) +
geom_point(color = "blue") + # Scatterplot points
geom_smooth(method = "lm", se = FALSE, color = "red") + # Fit line with intercept
# Highlight the specific point (height = 160, shoe size = 70) with a tolerance
geom_point(data = subset(height_shoe_size, abs(Height - 160) < 1 & abs(Shoe_Size - 70) < 1),
aes(x = Shoe_Size, y = Height),
color = "black", shape = 1, size = 5, stroke = 2) + # Circle around the point
labs(title = "Scatterplot of Height vs Shoe Size",
x = "Shoe Size (US Men)",
y = "Height (cm)") +
theme_minimal() +
ylim(120, max(height_shoe_size$Height) + 1) + # Set y-axis limit
scale_x_continuous(limits = c(0, max(height_shoe_size$Shoe_Size) + 1),
breaks = seq(0, max(height_shoe_size$Shoe_Size) + 1, by = 1)) # x-axis limits and ticks in one scale (avoids the xlim/scale_x conflict)
p1
folder <- "R figures 2"
if (!dir.exists(folder)) dir.create(folder)
# Save as TIFF
tiff(file.path(folder, "Figure9.1.tif"), width = 10, height = 5, units = "in", res = 300)
grid.arrange(p1)
dev.off()
## png
## 2
# Load required libraries
library(ggplot2)
library(gridExtra)
# Set a seed for reproducibility
set.seed(42)
# Create a function to generate correlated data
generate_data <- function(r) {
x <- rnorm(100)
y <- r * x + sqrt(1 - r^2) * rnorm(100)
data.frame(x, y)
}
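# Quick sanity check of the generator: the sample correlation should land
# near the target r, up to sampling noise
d_check <- generate_data(0.9)
cor(d_check$x, d_check$y)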
file_path <- file.path(folder, "Figure9.2.tif")
# Create the plots for each correlation
correlations <- c(0.2, -0.2, 0.5, -0.5, 0.9, -0.9, 0)
labels <- c("Weak Positive", "Weak Negative", "Moderate Positive", "Moderate Negative", "Strong Positive", "Strong Negative", "No Correlation")
plots <- lapply(1:7, function(i) {
data <- generate_data(correlations[i])
ggplot(data, aes(x, y)) +
geom_point() +
geom_smooth(method = "lm", se = FALSE, color = "blue") +
ggtitle(paste(labels[i], " (r =", correlations[i], ")")) +
theme_minimal() +
theme(axis.title.x = element_blank(), axis.title.y = element_blank())
})
# Arrange the plots in a grid
fig<- grid.arrange(grobs = plots, ncol = 3)
folder <- "R figures 2"
if (!dir.exists(folder)) dir.create(folder)
# Save as TIFF
tiff(file.path(folder, "Figure9.2.tif"), width = 10, height = 5, units = "in", res = 300)
grid.arrange(fig)
dev.off()
## png
## 2
fig
## TableGrob (3 x 3) "arrange": 7 grobs
## z cells name grob
## 1 1 (1-1,1-1) arrange gtable[layout]
## 2 2 (1-1,2-2) arrange gtable[layout]
## 3 3 (1-1,3-3) arrange gtable[layout]
## 4 4 (2-2,1-1) arrange gtable[layout]
## 5 5 (2-2,2-2) arrange gtable[layout]
## 6 6 (2-2,3-3) arrange gtable[layout]
## 7 7 (3-3,1-1) arrange gtable[layout]
library(ggplot2)
library(ggpubr)
library(Hmisc)
library(bannerCommenter)
data <- CPS85
# Homoscedasticity Check (Residuals vs Fitted Plot)
model <- lm(exper ~ age, data = data)
plot(model, which = 1)
title("Residuals vs Fitted Plot")
file_path <- file.path(folder, "Figure9.3.tif")
# Normality Check (Q-Q plots)
# Q-Q plot for Age
qq1<-ggqqplot(data$age, main = "Q-Q Plot for Age", ylab = "Quantiles")
# Q-Q plot for Experience
qq2<- ggqqplot(data$exper, main = "Q-Q Plot for Experience", ylab = "Quantiles")
fig9_3<- grid.arrange(qq1, qq2, nrow = 1)
folder <- "R figures 2"
if (!dir.exists(folder)) dir.create(folder)
# Save as TIFF
tiff(file.path(folder, "Figure9.3.tif"), width = 10, height = 5, units = "in", res = 300)
grid.arrange(fig9_3)
fig9_3
## TableGrob (1 x 2) "arrange": 2 grobs
## z cells name grob
## 1 1 (1-1,1-1) arrange gtable[layout]
## 2 2 (1-1,2-2) arrange gtable[layout]
dev.off()
## png
## 2
# Shapiro-Wilk test for normality
shapiro_test_age <- shapiro.test(data$age)
shapiro_test_exper <- shapiro.test(data$exper)
# Linear model for Residuals vs. Fitted plot
banner("Normality test for age and experience shows violation of the assumption")
##
## ###############################################################################
## ## Normality test for age and experience shows violation of the assumption ##
## ###############################################################################
shapiro_test_age
##
## Shapiro-Wilk normality test
##
## data: data$age
## W = 0.9523, p-value = 4.117e-12
shapiro_test_exper
##
## Shapiro-Wilk normality test
##
## data: data$exper
## W = 0.94006, p-value = 7.603e-14
# Compute the Spearman correlation
cor_test_result <- cor.test(data$age, data$exper, method = "spearman")
# Scatterplot with linear fit (linearity check); we use a Spearman correlation
# due to the violation of normality
fig9_4<- ggplot(data, aes(x = age, y = exper)) +
geom_point(alpha = 0.6) + # Scatterplot with transparency
geom_smooth(method = "lm", se = FALSE, color = "blue") + # Linear fit line
annotate("text", x = min(data$age), y = max(data$exper),
label = paste("Spearman ρ =", round(cor_test_result$estimate, 2)),
hjust = 0, vjust = 1) + # Add the correlation coefficient as a text label
labs(title = "Scatterplot of Age vs Experience (Spearman ρ reported)",
x = "Age",
y = "Experience") +
theme_minimal()
folder <- "R figures 2"
if (!dir.exists(folder)) dir.create(folder)
# Save as TIFF
tiff(file.path(folder, "Figure9.4.tif"), width = 10, height = 5, units = "in", res = 300)
grid.arrange(fig9_4)
dev.off() # Close the TIFF device so later plots are not captured by it
banner("We run a spearman correlation since normality is violated")
##
## #################################################################
## ## We run a spearman correlation since normality is violated ##
## #################################################################
# Correlation results using spearman since normality is violated
cor_results <- rcorr(as.matrix(data[, c("age", "exper")]), type = "spearman")
print(cor_results)
## age exper
## age 1.00 0.97
## exper 0.97 1.00
##
## n= 534
##
##
## P
## age exper
## age 0
## exper 0
banner("model results")
##
## #################################################################
## ## model results ##
## #################################################################
model
##
## Call:
## lm(formula = exper ~ age, data = data)
##
## Coefficients:
## (Intercept) age
## -20.206 1.032
fig9_4
# Load necessary library
library(ggplot2)
library(gridExtra)
library(apaTables)
library(lmtest)
# Create a data frame with all variables
multheight <- data.frame(
height = c(170, 175, 180, 165, 160, 185, 178, 182, 173, 168,
180, 169, 177, 165, 174, 186, 178, 165, 172, 180),
shoe_size = c(9, 10, 11, 8, 7, 12, 10, 11, 9, 8,
11, 8, 10, 8, 9, 12, 11, 8, 9, 10),
biom= c(160, 162, 155, 167, 170, 158, 165, 172, 168, 160,
163, 155, 174, 165, 170, 168, 167, 160, 162, 159),
biof= c(175, 180, 185, 170, 168, 177, 180, 172, 185, 173,
169, 178, 174, 181, 179, 176, 182, 173, 170, 177)
)
# View the first few rows
head(multheight)
## height shoe_size biom biof
## 1 170 9 160 175
## 2 175 10 162 180
## 3 180 11 155 185
## 4 165 8 167 170
## 5 160 7 170 168
## 6 185 12 158 177
# Run multiple regression analysis
model <- lm(height ~ biom + biof, data = multheight)
# Plot to check the linearity assumption for biom
p1<- ggplot(multheight, aes(x = biom, y = height)) +
geom_point() +
geom_smooth(method = "lm", color = "blue") +
labs(title = "Height vs. Bio. Mother", x = "Bio. Mother Height", y = "Child Height")
# Plot to check the linearity assumption for biof
p2<- ggplot(multheight, aes(x = biof, y = height)) +
geom_point() +
geom_smooth(method = "lm", color = "blue") +
labs(title = "Height vs.Bio. Father", x = "Bio Father Height", y = "Child Height")
# Residuals vs fitted values plot to check for independence of errors
p3<-ggplot(data = multheight, aes(x = model$fitted.values, y = residuals(model))) +
geom_point() +
geom_hline(yintercept = 0, color = "red") +
labs(title = "Residuals vs Fitted", x = "Fitted", y = "Residuals")
# Residuals vs fitted values plot to check homoscedasticity
p4<-ggplot(data = multheight, aes(x = model$fitted.values, y = residuals(model))) +
geom_point() +
geom_hline(yintercept = 0, color = "red") +
labs(title = "Residuals vs Fitted", x = "Fitted", y = "Residuals") +
theme_minimal()
# Histogram of residuals to check normality
p5<-ggplot(data = multheight, aes(x = residuals(model))) +
geom_histogram(bins = 10, fill = "skyblue", color = "black") +
labs(title = "Histogram of Residuals", x = "Residuals", y = "Frequency") +
theme_minimal()
# Q-Q plot to check normality
p6<- ggplot(data = multheight, aes(sample = residuals(model))) +
geom_qq() +
geom_qq_line(color = "red") +
labs(title = "Q-Q Plot of Residuals") +
theme_minimal()
fig9_5<- grid.arrange(p1, p2, p3, p4, p5, p6, ncol = 2)
fig9_5
## TableGrob (3 x 2) "arrange": 6 grobs
## z cells name grob
## 1 1 (1-1,1-1) arrange gtable[layout]
## 2 2 (1-1,2-2) arrange gtable[layout]
## 3 3 (2-2,1-1) arrange gtable[layout]
## 4 4 (2-2,2-2) arrange gtable[layout]
## 5 5 (3-3,1-1) arrange gtable[layout]
## 6 6 (3-3,2-2) arrange gtable[layout]
folder <- "R figures 2"
if (!dir.exists(folder)) dir.create(folder)
# Save as TIFF
tiff(file.path(folder, "Figure9.5.tif"), width = 10, height = 5, units = "in", res = 300)
grid.arrange(fig9_5)
dev.off()
## png
## 2
# Check for multicollinearity using VIF
library(car)
vif(model)
## biom biof
## 1.043446 1.043446
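# Optional cross-check: VIF by hand is 1 / (1 - R^2) from regressing each
# predictor on the remaining predictors; this should match vif(model) above
r2_biom <- summary(lm(biom ~ biof, data = multheight))$r.squared
1 / (1 - r2_biom) # ~1.04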
# Durbin-Watson test to check for autocorrelation in residuals
dwtest(model)
##
## Durbin-Watson test
##
## data: model
## DW = 2.3376, p-value = 0.7756
## alternative hypothesis: true autocorrelation is greater than 0
# Breusch-Pagan test for homoscedasticity
bptest(model)
##
## studentized Breusch-Pagan test
##
## data: model
## BP = 2.0091, df = 2, p-value = 0.3662
# Summarize the model
summary(model)
##
## Call:
## lm(formula = height ~ biom + biof, data = multheight)
##
## Residuals:
## Min 1Q Median 3Q Max
## -11.1899 -5.1111 0.0648 3.7485 11.6584
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 87.03025 86.40189 1.007 0.328
## biom 0.08133 0.31480 0.258 0.799
## biof 0.41846 0.33782 1.239 0.232
##
## Residual standard error: 7.345 on 17 degrees of freedom
## Multiple R-squared: 0.08279, Adjusted R-squared: -0.02512
## F-statistic: 0.7672 on 2 and 17 DF, p-value: 0.4797
# Create the APA-style regression table
apa.reg.table(model, filename = "multreg.doc")
##
##
## Regression results using height as the criterion
##
##
## Predictor b b_95%_CI beta beta_95%_CI sr2 sr2_95%_CI r
## (Intercept) 87.03 [-95.26, 269.32]
## biom 0.08 [-0.58, 0.75] 0.06 [-0.44, 0.56] .00 [-.05, .05] .00
## biof 0.42 [-0.29, 1.13] 0.29 [-0.21, 0.79] .08 [-.15, .31] .28
##
##
##
## Fit
##
##
##
## R2 = .083
## 95% CI[.00,.30]
##
##
## Note. A significant b-weight indicates the beta-weight and semi-partial correlation are also significant.
## b represents unstandardized regression weights. beta indicates the standardized regression weights.
## sr2 represents the semi-partial correlation squared. r represents the zero-order correlation.
## Square brackets are used to enclose the lower and upper limits of a confidence interval.
## * indicates p < .05. ** indicates p < .01.
##
banner("Section 1:", "Model 1 Before Transformation to DV", emph = TRUE)
##
## ###########################################################################
## ###########################################################################
## ### ###
## ### SECTION 1: ###
## ### MODEL 1 BEFORE TRANSFORMATION TO DV ###
## ### ###
## ###########################################################################
## ###########################################################################
# Fit the first model (untransformed wage); data here is still CPS85
model1 <- lm(wage ~ age + married, data = data)
# Check linearity assumption
p1 <- ggplot(data, aes(x = age, y = wage)) +
geom_point() +
geom_smooth(method = "lm", col = "blue") +
ggtitle("Wage vs Age Pre Transf")
# Check normality of residuals
p2 <- ggplot(data.frame(resid = residuals(model1)), aes(sample = resid)) +
stat_qq() +
stat_qq_line(color = "red") +
labs(title = "Q-Q Plot of Residuals Pre Transf") +
theme_minimal()
# Residuals vs. Fitted plot for Homoscedasticity
p3 <- ggplot(data.frame(Fitted = fitted(model1), Residuals = residuals(model1)),
aes(x = Fitted, y = Residuals)) +
geom_point() +
geom_smooth(method = "lm", col = "blue", se = FALSE) +
labs(title = "Residuals vs. Fitted Pre Tansf") +
theme_minimal()
# Histogram of Residuals
p4 <- ggplot(data.frame(resid = residuals(model1)), aes(x = resid)) +
geom_histogram(binwidth = 1, fill = "blue", color = "black", alpha = 0.7) +
labs(title = "Histogram of Residuals Pre Transf") +
theme_minimal()
p5<- ggplot(data, aes(x = married, y = wage)) +
geom_jitter() + # Adds some noise to prevent overplotting
labs(title = "Wage vs. Married Status Pre Transf")
p6<- ggplot(data, aes(x = married, y = wage)) +
geom_boxplot() +
labs(title = "Boxplot of Wage by Marital S. Pre Trans")
# Arrange the plots using grid.arrange
fig9_6<- grid.arrange(p1, p2, p3, p4, p5, p6, ncol = 2)
folder <- "R figures 2"
if (!dir.exists(folder)) dir.create(folder)
# Save as TIFF
tiff(file.path(folder, "Figure9.6.tif"), width = 10, height = 5, units = "in", res = 300)
grid.arrange(fig9_6)
dev.off()
## png
## 2
banner("Homoscedasticity Test Model 1 (Untransformed)")
##
## #################################################################
## ## Homoscedasticity Test Model 1 (Untransformed) ##
## #################################################################
# Breusch-Pagan test (p > 0.05 suggests homoscedasticity)
bptest(model1)
##
## studentized Breusch-Pagan test
##
## data: model1
## BP = 1.6572, df = 2, p-value = 0.4367
banner("Independence of errors")
##
## #################################################################
## ## Independence of errors ##
## #################################################################
# Durbin-Watson test of independence of errors
dwtest(model1)
##
## Durbin-Watson test
##
## data: model1
## DW = 1.7941, p-value = 0.008593
## alternative hypothesis: true autocorrelation is greater than 0
banner("VIF for Model 1 (Untransformed)")
##
## #################################################################
## ## VIF for Model 1 (Untransformed) ##
## #################################################################
# Multicollinearity check
vif(model1)
## age married
## 1.084377 1.084377
banner("Multiple regression results untransformed data")
##
## ##################################################################
## ## Multiple regression results untransformed data ##
## ##################################################################
# Print the model
summary(model1)
##
## Call:
## lm(formula = wage ~ age + married, data = data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -8.596 -3.587 -1.115 2.124 36.990
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.62427 0.80953 8.183 2.06e-15 ***
## age 0.07077 0.01946 3.636 0.000304 ***
## marriedSingle -0.60000 0.47981 -1.250 0.211674
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5.06 on 531 degrees of freedom
## Multiple R-squared: 0.03416, Adjusted R-squared: 0.03052
## F-statistic: 9.391 on 2 and 531 DF, p-value: 9.821e-05
banner("Section 2:", "Model 2 after Logarithm Transformation to DV", emph = TRUE)
##
## ############################################################################
## ############################################################################
## ### ###
## ### SECTION 2: ###
## ### MODEL 2 AFTER LOGARITHM TRANSFORMATION TO DV ###
## ### ###
## ############################################################################
## ############################################################################
# Transform dependent variable and run model
data$log_wage <- log(data$wage)
# Build the model
model2 <- lm(log_wage ~ age + married, data = data)
# Check linearity assumption
p1 <- ggplot(data, aes(x = age, y = log_wage)) +
geom_point() +
geom_smooth(method = "lm", col = "red") +
ggtitle("Wage vs Age After Transf")
# Check normality of residuals
p2 <- ggplot(data.frame(resid = residuals(model2)), aes(sample = resid)) +
stat_qq() +
stat_qq_line(color = "blue") +
labs(title = "Q-Q Plot of Residuals After Transf") +
theme_minimal()
# Residuals vs. Fitted plot for Homoscedasticity
p3 <- ggplot(data.frame(Fitted = fitted(model2), Residuals = residuals(model2)),
aes(x = Fitted, y = Residuals)) +
geom_point() +
geom_smooth(method = "lm", col = "red", se = FALSE) +
labs(title = "Residuals vs. Fitted After Tansf") +
theme_minimal()
# Histogram of Residuals
p4 <- ggplot(data.frame(resid = residuals(model2)), aes(x = resid)) +
geom_histogram(binwidth = 1, fill = "red", color = "black", alpha = 0.7) +
labs(title = "Histogram of Residuals After Transf") +
theme_minimal()
p5<- ggplot(data, aes(x = married, y = log_wage)) +
geom_jitter() + # Adds some noise to prevent overplotting
labs(title = "Wage vs. Married Status After Transf")
p6<- ggplot(data, aes(x = married, y = log_wage)) +
geom_boxplot() +
labs(title = "Boxplot of Wage by Marital S. After Trans")
# Arrange the plots using grid.arrange
fig9_7<- grid.arrange(p1, p2, p3, p4, p5, p6, ncol = 2)
folder <- "R figures 2"
if (!dir.exists(folder)) dir.create(folder)
# Save as TIFF
tiff(file.path(folder, "Figure9.7.tif"), width = 10, height = 5, units = "in", res = 300)
grid.arrange(fig9_7)
dev.off()
## png
## 2
banner("Homoscedasticity Test Model 2 (Transformed)")
##
## #################################################################
## ## Homoscedasticity Test Model 2 (Transformed) ##
## #################################################################
# Breusch-Pagan test (p > 0.05 suggests homoscedasticity)
bptest(model2)
##
## studentized Breusch-Pagan test
##
## data: model2
## BP = 3.6038, df = 2, p-value = 0.165
banner("Independence of errors (Transformed)")
##
## #################################################################
## ## Independence of errors (Transformed) ##
## #################################################################
# Durbin-Watson test of independence of errors
dwtest(model2)
##
## Durbin-Watson test
##
## data: model2
## DW = 1.8208, p-value = 0.01909
## alternative hypothesis: true autocorrelation is greater than 0
banner("VIF for Model 2 (Transformed)")
##
## #################################################################
## ## VIF for Model 2 (Transformed) ##
## #################################################################
# Multicollinearity check
vif(model2)
## age married
## 1.084377 1.084377
banner("Multiple regression results transformed data")
##
## ##################################################################
## ## Multiple regression results transformed data ##
## ##################################################################
# Print the model
summary(model2)
##
## Call:
## lm(formula = log_wage ~ age + married, data = data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.13299 -0.37452 0.00341 0.33601 1.92367
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.804337 0.082607 21.842 < 2e-16 ***
## age 0.007825 0.001986 3.940 9.24e-05 ***
## marriedSingle -0.096846 0.048962 -1.978 0.0484 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.5163 on 531 degrees of freedom
## Multiple R-squared: 0.04632, Adjusted R-squared: 0.04273
## F-statistic: 12.9 on 2 and 531 DF, p-value: 3.4e-06
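# Interpretation sketch: with a log-transformed DV, 100 * (exp(b) - 1) gives
# the approximate percent change in wage per one-unit change in the predictor
b <- coef(model2)
100 * (exp(b["age"]) - 1) # % wage change per additional year of age
100 * (exp(b["marriedSingle"]) - 1) # % wage difference, Single vs. Married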
apa.reg.table(model2, filename = "mrexample.doc")
##
##
## Regression results using log_wage as the criterion
##
##
## Predictor b b_95%_CI sr2 sr2_95%_CI Fit
## (Intercept) 1.80** [1.64, 1.97]
## age 0.01** [0.00, 0.01] .03 [.00, .06]
## marriedSingle -0.10* [-0.19, -0.00] .01 [-.01, .02]
## R2 = .046**
## 95% CI[.02,.08]
##
##
## Note. A significant b-weight indicates the semi-partial correlation is also significant.
## b represents unstandardized regression weights.
## sr2 represents the semi-partial correlation squared.
## Square brackets are used to enclose the lower and upper limits of a confidence interval.
## * indicates p < .05. ** indicates p < .01.
##
library(ggplot2)
library(car)
library(dplyr)
library(gridExtra)
library(psych)
library(tibble)
set.seed(123)
# Simulate dataset
n <- 20
data <- data.frame(
High_School_GPA = round(runif(n, 2.5, 4.0), 2),
College_GPA = round(runif(n, 2.5, 4.0), 2),
Arrested = sample(0:1, n, replace = TRUE, prob = c(0.5, 0.5))
)
# Fit the logistic model (used below for the residuals-vs-fitted panel)
model <- glm(Arrested ~ High_School_GPA + College_GPA, data = data, family = binomial)
# Descriptive statistics for the entire dataset
overall_descriptives <- psych::describe(data[, c("High_School_GPA", "College_GPA")])
overall_descriptives_df <- overall_descriptives %>%
rownames_to_column(var = "Statistic")
# Descriptive statistics by Arrested group
group_descriptives <- psych::describeBy(data[, c("High_School_GPA", "College_GPA")], group = data$Arrested, mat = TRUE)
# mat = TRUE ensures you get a single data frame rather than a list
group_descriptives_df <- group_descriptives %>%
rownames_to_column(var = "Statistic") %>%
rename(Group = group1) # rename group column if needed
# Combine
combined_table <- bind_rows(
overall_descriptives_df,
group_descriptives_df
)
# Write CSV
write.csv(combined_table, "logistic_desc2.csv", row.names = FALSE)
fig9_8<- grid.arrange(
ggplot(data, aes(x = High_School_GPA)) +
geom_histogram(binwidth = 0.2, fill = "blue", alpha = 0.6) +
labs(title = "Distribution of High School GPA"),
ggplot(data, aes(x = College_GPA)) +
geom_histogram(binwidth = 0.2, fill = "red", alpha = 0.6) +
labs(title = "Distribution of College GPA"),
ggplot(data, aes(y = High_School_GPA)) +
geom_boxplot(fill = "blue", alpha = 0.6) +
labs(title = "Boxplot of High School GPA"),
ggplot(data, aes(y = College_GPA)) +
geom_boxplot(fill = "red", alpha = 0.6) +
labs(title = "Boxplot of College GPA"),
ggplot(data, aes(x = High_School_GPA, y = College_GPA, color = as.factor(Arrested))) +
geom_jitter(width = 0.1, height = 0.1) +
labs(title = "Scatterplot of GPAs by Arrested Status") +
scale_color_manual(values = c("blue", "red"), labels = c("No", "Yes")),
# Check residuals
ggplot(data, aes(x = fitted(model), y = residuals(model, type = "pearson"))) +
geom_point() +
geom_smooth(method = "lm", color = "blue") +
ggtitle("Residuals vs Fitted Values"),
ncol = 2
)
folder <- "R figures 2"
if (!dir.exists(folder)) dir.create(folder)
# Save as TIFF
tiff(file.path(folder, "Figure9.8.tif"), width = 10, height = 5, units = "in", res = 300)
grid.arrange(fig9_8)
dev.off()
## png
## 2
fig9_8
## TableGrob (3 x 2) "arrange": 6 grobs
## z cells name grob
## 1 1 (1-1,1-1) arrange gtable[layout]
## 2 2 (1-1,2-2) arrange gtable[layout]
## 3 3 (2-2,1-1) arrange gtable[layout]
## 4 4 (2-2,2-2) arrange gtable[layout]
## 5 5 (3-3,1-1) arrange gtable[layout]
## 6 6 (3-3,2-2) arrange gtable[layout]
data$log_High_School_GPA <- log(data$High_School_GPA)
data$log_College_GPA <- log(data$College_GPA)
# Box-Tidwell test: Adding interaction terms between original and log-transformed predictors
box_tidwell_model <- glm(Arrested ~ High_School_GPA + College_GPA +
High_School_GPA:log_High_School_GPA + College_GPA:log_College_GPA,
data = data, family = binomial)
# Check the summary of the Box-Tidwell model
summary(box_tidwell_model)
##
## Call:
## glm(formula = Arrested ~ High_School_GPA + College_GPA + High_School_GPA:log_High_School_GPA +
## College_GPA:log_College_GPA, family = binomial, data = data)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -14.68 164.56 -0.089 0.929
## High_School_GPA -34.09 58.00 -0.588 0.557
## College_GPA 36.80 103.04 0.357 0.721
## High_School_GPA:log_High_School_GPA 16.75 26.82 0.625 0.532
## College_GPA:log_College_GPA -14.93 47.34 -0.315 0.752
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 22.493 on 19 degrees of freedom
## Residual deviance: 11.580 on 15 degrees of freedom
## AIC: 21.58
##
## Number of Fisher Scoring iterations: 6
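# Interpretation: neither GPA-by-log(GPA) interaction term is significant
# (p = .53 and .75), so the linearity-of-the-logit assumption appears tenable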
library(bannerCommenter)
# Fit logistic regression model
model <- glm(Arrested ~ High_School_GPA + College_GPA, data = data, family = binomial)
# Create a null model to test overall model fit
model_null <- glm(Arrested ~ 1, data = data, family = binomial)
banner("Overall Model Test")
##
## ##################################################################
## ## Overall Model Test ##
## ##################################################################
# Overall Model Results
anova(model_null, model, test = "Chisq")
## Analysis of Deviance Table
##
## Model 1: Arrested ~ 1
## Model 2: Arrested ~ High_School_GPA + College_GPA
## Resid. Df Resid. Dev Df Deviance Pr(>Chi)
## 1 19 22.493
## 2 17 12.073 2 10.421 0.00546 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
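# Optional fit index: McFadden's pseudo R-squared from the residual and null
# deviances reported above
1 - (model$deviance / model_null$deviance) # ~0.46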
banner("Variance Inflation for Multicolinarity")
##
## ##################################################################
## ## Variance Inflation for Multicolinarity ##
## ##################################################################
# Check multicollinearity
vif(model)
## High_School_GPA College_GPA
## 1.041778 1.041778
banner("Model results")
##
## #################################################################
## ## Model results ##
## #################################################################
# Model Results
summary(model)
##
## Call:
## glm(formula = Arrested ~ High_School_GPA + College_GPA, family = binomial,
## data = data)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -20.428 9.774 -2.090 0.0366 *
## High_School_GPA 2.331 1.790 1.302 0.1929
## College_GPA 4.338 2.157 2.012 0.0443 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 22.493 on 19 degrees of freedom
## Residual deviance: 12.073 on 17 degrees of freedom
## AIC: 18.073
##
## Number of Fisher Scoring iterations: 6
banner("Model Odds")
##
## ##################################################################
## ## Model Odds ##
## ##################################################################
# Odds ratios (exponentiated coefficients), not log odds
exp(coef(model))
## (Intercept) High_School_GPA College_GPA
## 1.343519e-09 1.028845e+01 7.657855e+01
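# A minimal follow-up: 95% profile-likelihood confidence intervals on the
# odds-ratio scale (confint() profiles the likelihood for glm objects)
exp(confint(model))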