#Load the libraries

library(tidyverse)  # Also attaches dplyr and ggplot2

#Figure 2.1: How our students scored compared to the mean and each other as coded in R

# Create the data
data <- data.frame(
  Name = c("Malcolm", "Josie", "Liam", "Jackson", "Xavier"),
  Score = c(10, 12, 12, 15, 23))

# Calculate the mean
mean_score <- mean(data$Score)
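# By hand: (10 + 12 + 12 + 15 + 23) / 5 = 72 / 5 = 14.4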

# Plot the data
ggplot(data, aes(x = Name, y = Score)) +
  geom_line(group = 1, color = "blue", linewidth = 1) +  # Line connecting the points
  geom_point(aes(color = Score), size = 4) +  # Points representing the actual data
  geom_hline(yintercept = mean_score, linetype = "dashed", color = "red") +  # Mean line
  labs(title = "Student Scores Compared to the Mean", 
       x = "Name", 
       y = "Score") +
  scale_color_gradient(low = "lightblue", high = "darkblue") +  # Color scale for points
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))  # Rotate x-axis labels

#This is how skewness, one index of normality, is calculated in R using the psych package.

# Load necessary library
library(psych)

# Load the mtcars dataset
data(mtcars)

# Select the 'mpg' variable (miles per gallon)
mpg_data <- mtcars$mpg

# Calculate skewness using the psych package
psych_skew <- skew(mpg_data)

# Calculate unstandardized skewness (raw third moment)
N <- length(mpg_data)
mean_mpg <- mean(mpg_data)
raw_skew <- (1 / N) * sum((mpg_data - mean_mpg)^3)

# Calculate standardized skewness
sd_mpg <- sd(mpg_data)
std_skew <- raw_skew / (sd_mpg^3)

# Print results for comparison
cat("Psych skewness: ", psych_skew, "\n")
## Psych skewness:  0.610655
cat("Raw skewness (unstandardized): ", raw_skew, "\n")
## Raw skewness (unstandardized):  133.6867
cat("Standardized skewness: ", std_skew, "\n")
## Standardized skewness:  0.610655
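# The match confirms that psych::skew() simply divides the raw third moment by
# sd(x)^3, i.e., the standardized skewness computed above.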

#This compares the psych package formula for kurtosis to manual calculations. Because the psych package excess kurtosis and the manual excess kurtosis are equal, we have identified how the psych package calculates kurtosis, as shown in chapter 2 of Perkins.

# Load necessary library
library(psych)

# Load the mtcars dataset
data(mtcars)

# Select the 'mpg' variable (miles per gallon)
mpg_data <- mtcars$mpg

# Calculate the sample size (n)
n <- length(mpg_data)

# Calculate the mean (x̄) and sample standard deviation (s) of the mpg data
mean_mpg <- mean(mpg_data)
sd_mpg <- sd(mpg_data)

# Calculate the fourth standardized moment (m4)
m4 <- sum(((mpg_data - mean_mpg) / sd_mpg)^4) / n  # Average fourth power of the standardized deviations

# Calculate excess kurtosis (manual)
manual_kurtosis <- m4 - 3

# Calculate kurtosis using psych package for comparison
psych_kurtosis <- kurtosi(mpg_data)

# Output the results
cat("Manual Excess Kurtosis (simplified): ", manual_kurtosis, "\n")
## Manual Excess Kurtosis (simplified):  -0.372766
cat("Psych Package Excess Kurtosis: ", psych_kurtosis, "\n")
## Psych Package Excess Kurtosis:  -0.372766

#Figure 2.2: Normal distribution of 2nd grade students’ heights.

# Load necessary libraries
library(ggplot2)

# Set seed for reproducibility
set.seed(123)

# Simulate heights for 500 second graders based on a normal distribution
n <- 500
mean_height <- 50
sd_height <- 3
heights <- round(rnorm(n, mean = mean_height, sd = sd_height))  # Normally distributed heights

# Clip values to stay within 40 to 60 inches
heights <- pmax(40, pmin(heights, 60))
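# Illustration (added): pmin()/pmax() clamp values into the 40-60 range
pmax(40, pmin(c(35, 52, 65), 60))  # returns 40 52 60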

# Plot the bell curve with both histogram and density
ggplot(data.frame(heights), aes(x = heights)) +
  geom_histogram(aes(y = after_stat(count)), binwidth = 1, fill = "dodgerblue4", color = "black", boundary = 40) +  # Histogram for the counts
  # Overlay a density curve, rescaled from density to counts (n * binwidth = 500 * 1)
  geom_density(aes(y = after_stat(density) * 500), fill = "red", alpha = 0.2, adjust = 1.5) +
  scale_x_continuous(breaks = seq(40, 60, by = 1)) +  # More tick marks on the x-axis
  labs(title = "Normal Distribution (Bell Curve) of Second Graders' Heights", x = "Height (inches)", y = "Number of Kids") +
  theme_minimal() +
  theme(axis.text = element_text(size = 12), axis.title = element_text(size = 14))

#Figure 2.3: Normal, positively skewed, and negatively skewed distributions

# Load required libraries
library(ggplot2)
library(gridExtra)
library(MASS)  # Loaded in the original; rnorm() and rbeta() below are from base R

# Set up the number of data points
n <- 1000

# Create data for the distributions
normal_data <- rnorm(n, mean = 5, sd = 1)  # Normal distribution with mean = 5
positively_skewed_data <- rbeta(n, shape1 = 2, shape2 = 5)  # Beta distribution (positively skewed)
negatively_skewed_data <- rbeta(n, shape1 = 5, shape2 = 2)  # Beta distribution (negatively skewed)

# Function to calculate the peak of the density curve for mode label placement
get_peak_y <- function(data) {
  density_data <- density(data, bw = 1)  # Larger bandwidth for "fatter" curves
  peak_y <- max(density_data$y)
  return(peak_y)
}

# Custom labeling function to avoid overlap (defined here but not used below;
# the annotate() calls place labels with fixed multipliers instead)
adjust_label_y <- function(peak_y, label_number, offset_factor = 0.1) {
  # Spacing the labels further apart by adding an offset factor for each label
  vertical_offset <- peak_y - (offset_factor * label_number)
  return(vertical_offset)
}

# Normal Distribution Plot
normal_mean <- mean(normal_data)
normal_median <- median(normal_data)
normal_mode <- normal_mean  # For a normal distribution, the theoretical mode equals the mean
normal_peak_y <- get_peak_y(normal_data)

normal_plot <- ggplot(data.frame(x = normal_data), aes(x = x)) +
  geom_density(fill = "skyblue", color = "black", adjust = 1.5) +
  geom_vline(aes(xintercept = normal_mean), color = "red", linetype = "dashed") +
  geom_vline(aes(xintercept = normal_median), color = "blue", linetype = "dotted") +
  geom_vline(aes(xintercept = normal_mode), color = "black", linetype = "solid") +
  annotate("text", x = normal_mean, y = normal_peak_y * 0.9, label = "Mean", color = "red", hjust = -0.2) +
  annotate("text", x = normal_median, y = normal_peak_y * 0.75, label = "Median", color = "blue", hjust = -0.2) +
  annotate("text", x = normal_mode, y = normal_peak_y * 0.5, label = "Mode", color = "black", hjust = -0.2) +
  ggtitle("Normal Distribution") + theme_minimal()

# Positively Skewed Distribution Plot
positively_skewed_mean <- mean(positively_skewed_data)
positively_skewed_median <- median(positively_skewed_data)
positively_skewed_mode <- positively_skewed_mean - (positively_skewed_mean * 0.19)  # Rough heuristic: the mode sits below the mean under positive skew
positively_skewed_peak_y <- get_peak_y(positively_skewed_data)

positively_skewed_plot <- ggplot(data.frame(x = positively_skewed_data), aes(x = x)) +
  geom_density(fill = "lightgreen", color = "black", adjust = 1.5) +
  geom_vline(aes(xintercept = positively_skewed_mean), color = "red", linetype = "dashed") +
  geom_vline(aes(xintercept = positively_skewed_median), color = "blue", linetype = "dotted") +
  geom_vline(aes(xintercept = positively_skewed_mode), color = "black", linetype = "solid") +
  annotate("text", x = positively_skewed_mean, y = positively_skewed_peak_y * 3, label = "Mean", color = "red", hjust = -0.2) +
  annotate("text", x = positively_skewed_median, y = positively_skewed_peak_y * 2, label = "Median", color = "blue", hjust = -0.2) +
  annotate("text", x = positively_skewed_mode, y = positively_skewed_peak_y * 1, label = "Mode", color = "black", hjust = -0.2) +
  ggtitle("Positively Skewed Distribution") + theme_minimal()

# Negatively Skewed Distribution Plot
negatively_skewed_mean <- mean(negatively_skewed_data)
negatively_skewed_median <- median(negatively_skewed_data)
negatively_skewed_mode <- negatively_skewed_mean + (negatively_skewed_mean * 0.09)  # Rough heuristic: the mode sits above the mean under negative skew
negatively_skewed_peak_y <- get_peak_y(negatively_skewed_data)

negatively_skewed_plot <- ggplot(data.frame(x = negatively_skewed_data), aes(x = x)) +
  geom_density(fill = "orange", color = "black", adjust = 1) +
  geom_vline(aes(xintercept = negatively_skewed_mean), color = "red", linetype = "dashed") +
  geom_vline(aes(xintercept = negatively_skewed_median), color = "blue", linetype = "dotted") +
  geom_vline(aes(xintercept = negatively_skewed_mode), color = "black", linetype = "solid") +
  annotate("text", x = negatively_skewed_mean, y = negatively_skewed_peak_y * 3, label = "Mean", color = "red", hjust = -0.2) +
  annotate("text", x = negatively_skewed_median, y = negatively_skewed_peak_y * 2, label = "Median", color = "blue", hjust = -0.2) +
  annotate("text", x = negatively_skewed_mode, y = negatively_skewed_peak_y * 1, label = "Mode", color = "black", hjust = -0.2) +
  ggtitle("Negatively Skewed Distribution") + theme_minimal()

# Arrange the plots in a 2-column grid
grid.arrange(normal_plot, positively_skewed_plot, negatively_skewed_plot, ncol = 2)

#Figure 2.4: Mesokurtic, platykurtic and leptokurtic distributions

# Load required libraries
library(ggplot2)
library(gridExtra)
library(MASS)  # Loaded in the original; runif() and rt() below are from base R

# Set up the number of data points
n <- 1000

# Create data for the distributions
mesokurtic_data <- rnorm(n, mean = 5, sd = 1)  # Normal distribution (Mesokurtic)
platykurtic_data <- runif(n, min = 0, max = 10)  # Uniform distribution (Platykurtic)
leptokurtic_data <- rt(n, df = 3)  # T-distribution with 3 degrees of freedom (Leptokurtic)

# Function to calculate the peak of the density curve for mode label placement
get_peak_y <- function(data) {
  density_data <- density(data, bw = 1)  # Larger bandwidth for "fatter" curves
  peak_y <- max(density_data$y)
  return(peak_y)
}

# Mesokurtic Distribution Plot (Normal Distribution)
mesokurtic_mean <- mean(mesokurtic_data)
mesokurtic_median <- median(mesokurtic_data)
mesokurtic_mode <- mesokurtic_mean  # Mode is exactly the mean for normal distribution
mesokurtic_peak_y <- get_peak_y(mesokurtic_data)

mesokurtic_plot <- ggplot(data.frame(x = mesokurtic_data), aes(x = x)) +
  geom_density(fill = "skyblue", color = "black", adjust = 1.5) +
  geom_vline(aes(xintercept = mesokurtic_mean), color = "red", linetype = "dashed") +
  geom_vline(aes(xintercept = mesokurtic_median), color = "blue", linetype = "dotted") +
  geom_vline(aes(xintercept = mesokurtic_mode), color = "black", linetype = "solid") +
  ggtitle("Mesokurtic Distribution") + theme_minimal()

# Platykurtic Distribution Plot (Uniform Distribution)
platykurtic_mean <- mean(platykurtic_data)
platykurtic_median <- median(platykurtic_data)
platykurtic_mode <- platykurtic_mean  # A uniform distribution has no single mode; the mean stands in here
platykurtic_peak_y <- get_peak_y(platykurtic_data)

platykurtic_plot <- ggplot(data.frame(x = platykurtic_data), aes(x = x)) +
  geom_density(fill = "lightgreen", color = "black", adjust = 1.5) +
  geom_vline(aes(xintercept = platykurtic_mean), color = "red", linetype = "dashed") +
  geom_vline(aes(xintercept = platykurtic_median), color = "blue", linetype = "dotted") +
  geom_vline(aes(xintercept = platykurtic_mode), color = "black", linetype = "solid") +
  ggtitle("Platykurtic Distribution") + theme_minimal()

# Leptokurtic Distribution Plot (T-distribution with 3 degrees of freedom)
leptokurtic_mean <- mean(leptokurtic_data)
leptokurtic_median <- median(leptokurtic_data)
leptokurtic_mode <- leptokurtic_mean  # Mode is approximately the mean for symmetric distributions
leptokurtic_peak_y <- get_peak_y(leptokurtic_data)

leptokurtic_plot <- ggplot(data.frame(x = leptokurtic_data), aes(x = x)) +
  geom_density(fill = "orange", color = "black", adjust = 1) +
  geom_vline(aes(xintercept = leptokurtic_mean), color = "red", linetype = "dashed") +
  geom_vline(aes(xintercept = leptokurtic_median), color = "blue", linetype = "dotted") +
  geom_vline(aes(xintercept = leptokurtic_mode), color = "black", linetype = "solid") +
  ggtitle("Leptokurtic Distribution") + theme_minimal()

# Arrange the plots in a 2-column grid
grid.arrange(mesokurtic_plot, platykurtic_plot, leptokurtic_plot, ncol = 2)

#Figure 2.5: The normal distribution given percentile and Z score (number of standard deviations)

#Figure 2.6: Distributions of other standardized scores

# Load necessary libraries
library(ggplot2)
library(gridExtra)

# Define parameters for each test
tests <- list(
  IQ = list(mean = 100, sd = 15, x_range = c(40, 160)),
  T_score = list(mean = 50, sd = 10, x_range = c(10, 90)),
  Stanine = list(mean = 5, sd = 2, x_range = c(0, 10))
)

# Function to create bell curve data
create_bell_curve <- function(mean, sd, x_range, test_name) {
  x <- seq(x_range[1], x_range[2], length.out = 1000)
  y <- dnorm(x, mean = mean, sd = sd)
  data.frame(x = x, y = y, test = test_name)
}

# Generate data for all tests
bell_curves <- do.call(rbind, lapply(names(tests), function(test_name) {
  params <- tests[[test_name]]
  create_bell_curve(params$mean, params$sd, params$x_range, test_name)
}))

# Maximum density value across all distributions (kept for reference; each
# curve below is rescaled to its own peak instead)
max_density <- max(bell_curves$y)

# Create individual plots with consistent density scaling
plots <- lapply(unique(bell_curves$test), function(test_name) {
  # Extract test parameters
  params <- tests[[test_name]]
  
  # Adjust density values to be comparable across distributions
  bell_curves_scaled <- bell_curves[bell_curves$test == test_name, ]
  bell_curves_scaled$y <- bell_curves_scaled$y / max(bell_curves_scaled$y) # Scale the y-values
  
  ggplot(bell_curves_scaled, aes(x = x, y = y)) +
    geom_line(color = "blue", linewidth = 1) +
    ggtitle(paste(test_name, "Distribution")) +
    theme_minimal() +
    labs(x = "Score", y = "Scaled Density") +
    scale_x_continuous(
      breaks = seq(params$mean - 3 * params$sd, params$mean + 3 * params$sd, by = params$sd),
      labels = function(x) ifelse(x == params$mean, paste0(x, " (Mean)"), x)
    ) # Customize x-axis ticks, labeling the mean
})

# Arrange plots in a grid
grid.arrange(grobs = plots, ncol = 1)

#Figure 4.1: A hypothetical experimental study on reading to establish causality

library(DiagrammeR)
library(DiagrammeRsvg)
library(rsvg)

diagram <- grViz("
digraph experiment {
  graph [layout = dot, rankdir = LR]

  # Nodes for general steps
  Population [label = 'Population', shape = rectangle, style = filled, fillcolor = lightblue, fontcolor = black, width = 1]
  Sample [label = 'Randomly\\nSample\\nPopulation', shape = rectangle, style = filled, fillcolor = palegreen, fontcolor = black, width = 1]
  Split [label = 'Randomly\\nSplit into\\nGroups', shape = rectangle, style = filled, fillcolor = palegreen, fontcolor = black, width = 1]

  # Nodes for Group A (Curriculum A)
  PreA [label = 'Pre-Test:\\nGroup A', shape = rectangle, style = filled, fillcolor = mistyrose, fontcolor = black, width = 1]
  ImplementA [label = 'Implement\\nCurriculum A', shape = rectangle, style = filled, fillcolor = mistyrose, fontcolor = black, width = 1]
  PostA [label = 'Post-Test:\\nGroup A', shape = rectangle, style = filled, fillcolor = mistyrose, fontcolor = black, width = 1]
  FollowUpA [label = 'Follow-Up\\nTest:\\nGroup A', shape = rectangle, style = filled, fillcolor = mistyrose, fontcolor = black, width = 1]

  # Nodes for Group B (Curriculum B)
  PreB [label = 'Pre-Test:\\nGroup B', shape = rectangle, style = filled, fillcolor = lemonchiffon, fontcolor = black, width = 1]
  ImplementB [label = 'Implement\\nCurriculum B', shape = rectangle, style = filled, fillcolor = lemonchiffon, fontcolor = black, width = 1]
  PostB [label = 'Post-Test:\\nGroup B', shape = rectangle, style = filled, fillcolor = lemonchiffon, fontcolor = black, width = 1]
  FollowUpB [label = 'Follow-Up\\nTest:\\nGroup B', shape = rectangle, style = filled, fillcolor = lemonchiffon, fontcolor = black, width = 1]

  # Final analysis node
  Analyze [label = 'Run\\nStatistical\\nTests', shape = rectangle, style = filled, fillcolor = lightblue, fontcolor = black, width = 1]

  # Edges for general steps
  Population -> Sample
  Sample -> Split

  # Edges for Group A
  Split -> PreA
  PreA -> ImplementA
  ImplementA -> PostA
  PostA -> FollowUpA
  FollowUpA -> Analyze

  # Edges for Group B
  Split -> PreB
  PreB -> ImplementB
  ImplementB -> PostB
  PostB -> FollowUpB
  FollowUpB -> Analyze

  # Alignment for lanes
  {rank = same; PreA; PreB}
  {rank = same; ImplementA; ImplementB}
  {rank = same; PostA; PostB}
  {rank = same; FollowUpA; FollowUpB}
}
")

# Convert to SVG
svg <- export_svg(diagram)

rsvg_png(charToRaw(svg), "diagram.png")

#Figure 4.2: Probabilistic and non-probabilistic sampling methods

# Load required libraries
library(DiagrammeR)
library(rsvg)
library(magick)

# Create the Probabilistic Sampling Diagram
prob_diagram <- grViz("
digraph probabilistic_sampling {
  graph [rankdir=TB]  // Top to bottom layout for Probabilistic Sampling
  
  node [shape=rectangle, style=filled, fontname=Helvetica, fontsize=10]

  // Main category
  probabilistic [label='Probabilistic Sampling\n(High External Validity)', shape=ellipse, fillcolor=palegreen, width=2.5]
  
  // Sampling strategies
  true_random [label='True Random\nSampling\n(Highest External\nValidity)']
  stratified_random [label='Stratified Random\nSampling']
  systematic [label='Systematic Sampling']
  cluster [label='Cluster Sampling']

  // Connections
  probabilistic -> true_random
  probabilistic -> stratified_random
  probabilistic -> systematic
  probabilistic -> cluster
}
")

# Create the Non-Probabilistic Sampling Diagram
non_prob_diagram <- grViz("
digraph non_probabilistic_sampling {
  graph [rankdir=TB]  // Top to bottom layout for Non-Probabilistic Sampling
  
  node [shape=rectangle, style=filled, fontname=Helvetica, fontsize=10]

  // Main category
  non_probabilistic [label='Non-Probabilistic Sampling\n(Low External Validity)', shape=ellipse, fillcolor=lightcoral, width=2.5]
  
  // Sampling strategies
  convenience [label='Convenience Sampling']
  voluntary [label='Voluntary Sampling']
  purposive [label='Purposive Sampling']

  // Connections
  non_probabilistic -> convenience
  non_probabilistic -> voluntary
  non_probabilistic -> purposive
}
")

# Export the diagrams to SVG
prob_svg <- export_svg(prob_diagram)
non_prob_svg <- export_svg(non_prob_diagram)

# Convert SVG to PNG with rsvg
rsvg_png(charToRaw(prob_svg), "probabilistic_sampling.png", width = 1920, height = 1080)
rsvg_png(charToRaw(non_prob_svg), "non_probabilistic_sampling.png", width = 1920, height = 1080)

# Read the PNGs with magick
prob_image <- image_read("probabilistic_sampling.png")
non_prob_image <- image_read("non_probabilistic_sampling.png")

# Ensure both images have the same width
max_width <- max(image_info(prob_image)$width, image_info(non_prob_image)$width)
prob_image <- image_extent(prob_image, geometry = paste0(max_width, "x"), gravity = "center")
non_prob_image <- image_extent(non_prob_image, geometry = paste0(max_width, "x"), gravity = "center")

# Add spacing between diagrams (optional)
prob_image <- image_border(prob_image, color = "white", geometry = "0x10")
non_prob_image <- image_border(non_prob_image, color = "white", geometry = "0x10")

# Stack the diagrams vertically
combined_image <- image_append(c(prob_image, non_prob_image), stack = TRUE)

# Save the final combined image
image_write(combined_image, "combined_sampling_highres.png", density = 300)


### Side-by-side image
# Load required libraries
library(DiagrammeR)
library(rsvg)
library(magick)

# Create the Probabilistic Sampling Diagram
prob_diagram <- grViz("
digraph probabilistic_sampling {
  graph [rankdir=TB]  // Top to bottom layout for Probabilistic Sampling
  
  node [shape=rectangle, style=filled, fontname=Helvetica, fontsize=10]

  // Main category
  probabilistic [label='Probabilistic Sampling\n(High External Validity)', shape=ellipse, fillcolor=palegreen, width=2.5]
  
  // Sampling strategies
  true_random [label='True Random\nSampling\n(Highest External\nValidity)']
  stratified_random [label='Stratified Random\nSampling']
  systematic [label='Systematic Sampling']
  cluster [label='Cluster Sampling']

  // Connections
  probabilistic -> true_random
  probabilistic -> stratified_random
  probabilistic -> systematic
  probabilistic -> cluster
}
")

# Create the Non-Probabilistic Sampling Diagram
non_prob_diagram <- grViz("
digraph non_probabilistic_sampling {
  graph [rankdir=TB]  // Top to bottom layout for Non-Probabilistic Sampling
  
  node [shape=rectangle, style=filled, fontname=Helvetica, fontsize=10]

  // Main category
  non_probabilistic [label='Non-Probabilistic Sampling\n(Low External Validity)', shape=ellipse, fillcolor=lightcoral, width=2.5]
  
  // Sampling strategies
  convenience [label='Convenience Sampling']
  voluntary [label='Voluntary Sampling']
  purposive [label='Purposive Sampling']

  // Connections
  non_probabilistic -> convenience
  non_probabilistic -> voluntary
  non_probabilistic -> purposive
}
")

# Export the diagrams to SVG
prob_svg <- export_svg(prob_diagram)
non_prob_svg <- export_svg(non_prob_diagram)

# Convert SVG to PNG with rsvg
rsvg_png(charToRaw(prob_svg), "probabilistic_sampling_side.png", width = 1920, height = 1080)
rsvg_png(charToRaw(non_prob_svg), "non_probabilistic_sampling_side.png", width = 1920, height = 1080)

# Read the PNGs with magick
prob_image <- image_read("probabilistic_sampling_side.png")
non_prob_image <- image_read("non_probabilistic_sampling_side.png")

# Ensure both images have the same height
max_height <- max(image_info(prob_image)$height, image_info(non_prob_image)$height)
prob_image <- image_extent(prob_image, geometry = paste0("x", max_height), gravity = "center")
non_prob_image <- image_extent(non_prob_image, geometry = paste0("x", max_height), gravity = "center")

# Add spacing between diagrams (optional)
prob_image <- image_border(prob_image, color = "white", geometry = "10x0")
non_prob_image <- image_border(non_prob_image, color = "white", geometry = "10x0")

# Combine images side-by-side
combined_image_side <- image_append(c(prob_image, non_prob_image), stack = FALSE)

# Save the final combined image
image_write(combined_image_side, "combined_sampling_side_by_side.png", density = 300)

#Figure 4.3: Factor analysis chart

library(DiagrammeR)

grViz("
digraph internal_structure {
  graph [rankdir = TB, nodesep=0.4]

  node [shape = rectangle, style = filled, fillcolor = lightblue, fontname = Helvetica]
  Scale [label = 'Entire Scale', shape = box, fillcolor = lightcoral]

  node [shape = ellipse, fillcolor = lightgoldenrodyellow]
  Factor1 [label = 'Factor 1']
  Factor2 [label = 'Factor 2']

  node [shape = rectangle, fillcolor = lightgray]
  Item1_F1 [label = 'Item 1']
  Item2_F1 [label = 'Item 2']
  Item3_F1 [label = 'Item 3']

  Item1_F2 [label = 'Item 4']
  Item2_F2 [label = 'Item 5']
  Item3_F2 [label = 'Item 6']

  # Define edges
  Scale -> Factor1
  Scale -> Factor2

  Factor1 -> Item1_F1
  Factor1 -> Item2_F1
  Factor1 -> Item3_F1

  Factor2 -> Item1_F2
  Factor2 -> Item2_F2
  Factor2 -> Item3_F2
}
")

#One-tailed and two-tailed tests

# Load necessary libraries
library(ggplot2)
library(patchwork)
library(gridExtra)

# Define the critical values for two-tailed test
alpha <- 0.05  # significance level
z_critical_two_tailed <- qnorm(1 - alpha / 2)  # Two-tailed critical value (±1.96 for alpha = 0.05)

# Generate a sequence of Z-scores
z_scores <- seq(-4, 4, length.out = 1000)

# Compute the normal distribution density values for the null hypothesis (normal distribution)
z_density_null <- dnorm(z_scores)

# Compute the normal distribution density values for the alternative hypothesis (shifted to the right)
z_density_alt <- dnorm(z_scores, mean = 2)  # Shift the distribution to the right

# Create a data frame for plotting
data_null <- data.frame(z_scores, z_density_null)
data_alt <- data.frame(z_scores, z_density_alt)

# Plot for two-tailed test with shifted distribution
plot1 <- ggplot() +
  # Plot the null hypothesis curve (standard normal distribution)
  geom_line(data = data_null, aes(x = z_scores, y = z_density_null), color = "black") +
  # Plot the alternative hypothesis curve (shifted normal distribution)
  geom_line(data = data_alt, aes(x = z_scores, y = z_density_alt), color = "blue") +
  # Shade the rejection region for the two-tailed test (both left and right tails)
  geom_ribbon(data = subset(data_null, z_scores <= -z_critical_two_tailed), 
              aes(x = z_scores, ymin = 0, ymax = z_density_null), fill = "red", alpha = 0.5) +  # Left tail
  geom_ribbon(data = subset(data_null, z_scores >= z_critical_two_tailed), 
              aes(x = z_scores, ymin = 0, ymax = z_density_null), fill = "red", alpha = 0.5) +  # Right tail
  # Add vertical lines for critical values
  geom_vline(xintercept = c(-z_critical_two_tailed, z_critical_two_tailed), color = "black", linetype = "dashed") +
  # Labels and annotations
  annotate("text", x = -z_critical_two_tailed, y = 0.07, label = "Two-Tailed\nRejection Region", color = "black", 
           hjust = 1.2, size = 3) +  # Move text up and decrease font size
  annotate("text", x = z_critical_two_tailed, y = 0.07, label = "Two-Tailed\nRejection Region", color = "black", 
           hjust = -0.2, size = 3) +
  labs(title = "P1: Two-Tailed, Martians > Earthlings", x = "Z-Score", y = "Density") +
  theme_minimal()


# Define the significance level (alpha) and critical values
alpha <- 0.05  
z_critical_two_tailed <- qnorm(1 - alpha / 2)  # Two-tailed critical value (~±1.96 for alpha = 0.05)

# Generate a sequence of Z-scores
z_scores <- seq(-4, 4, length.out = 1000)

# Compute density values for null and alternative hypotheses
z_density_null <- dnorm(z_scores)  # Standard normal distribution
z_density_alt <- dnorm(z_scores, mean = -2)  # Shifted left

# Create data frames for plotting
data_null <- data.frame(z_scores, z_density_null)
data_alt <- data.frame(z_scores, z_density_alt)

# Create the plot
plot2 <- ggplot() +
  # Plot the null hypothesis curve
  geom_line(data = data_null, aes(x = z_scores, y = z_density_null), color = "black") +
  # Plot the alternative hypothesis curve
  geom_line(data = data_alt, aes(x = z_scores, y = z_density_alt), color = "blue") +
  # Shade the left tail rejection region
  geom_ribbon(data = subset(data_null, z_scores <= -z_critical_two_tailed), 
              aes(x = z_scores, ymin = 0, ymax = z_density_null), fill = "red", alpha = 0.5) +
  # Shade the right tail rejection region
  geom_ribbon(data = subset(data_null, z_scores >= z_critical_two_tailed), 
              aes(x = z_scores, ymin = 0, ymax = z_density_null), fill = "red", alpha = 0.5) +
  # Add vertical dashed lines for critical values
  geom_vline(xintercept = c(-z_critical_two_tailed, z_critical_two_tailed), color = "black", linetype = "dashed") +
  # Add annotations for rejection regions
  annotate("text", x = -z_critical_two_tailed, y = 0.07, label = "Two-Tailed\nRejection Region", color = "black", 
           hjust = 1.2, size = 3) +  # Move text up and decrease font size
  annotate("text", x = z_critical_two_tailed, y = 0.07, label = "Two-Tailed\nRejection Region", color = "black", 
           hjust = -0.2, size = 3) +
  # Labels and styling
  labs(title = "P2: Two-Tailed, Martians < Earthlings", x = "Z-Score", y = "Density") +
  theme_minimal()


# Define the critical value for one-tailed test
alpha <- 0.05  # significance level
z_critical_one_tailed <- qnorm(1 - alpha)  # One-tailed critical value (z_critical = 1.645 for alpha = 0.05)

# Generate a sequence of Z-scores
z_scores <- seq(-4, 4, length.out = 1000)

# Compute the normal distribution density values for the null hypothesis (normal distribution)
z_density_null <- dnorm(z_scores)

# Compute the normal distribution density values for the alternative hypothesis (shifted to the right)
z_density_alt <- dnorm(z_scores, mean = 2)  # Shift the distribution to the right

# Create a data frame for plotting
data_null <- data.frame(z_scores, z_density_null)
data_alt <- data.frame(z_scores, z_density_alt)

# Plot for one-tailed test with shifted distribution to the right
plot3 <- ggplot() +
  # Plot the null hypothesis curve (standard normal distribution)
  geom_line(data = data_null, aes(x = z_scores, y = z_density_null), color = "black") +
  # Plot the alternative hypothesis curve (shifted normal distribution to the right)
  geom_line(data = data_alt, aes(x = z_scores, y = z_density_alt), color = "blue") +
  # Shade the rejection region for the one-tailed test (right tail)
  geom_ribbon(data = subset(data_null, z_scores >= z_critical_one_tailed), 
              aes(x = z_scores, ymin = 0, ymax = z_density_null), fill = "blue", alpha = 0.5) +  # Right tail
  # Add vertical line for the one-tailed critical value
  geom_vline(xintercept = z_critical_one_tailed, color = "black", linetype = "dashed") +
  # Labels and annotations
  annotate("text", x = z_critical_one_tailed, y = 0.07, label = "One-Tailed\nRejection Region", color = "black", hjust = -0.2, size = 3) +
  labs(title = "P3: One-Tailed, Martians > Earthlings", x = "Z-Score", y = "Density") +
  theme_minimal()

# Define the critical values for two-tailed test
alpha <- 0.05  # significance level
z_critical_two_tailed <- qnorm(1 - alpha / 2)  # Two-tailed critical value (±1.96 for alpha = 0.05)

# Generate a sequence of Z-scores
z_scores <- seq(-4, 4, length.out = 1000)

# Compute the normal distribution density values for the null hypothesis (normal distribution)
z_density_null <- dnorm(z_scores)

# Compute the normal distribution density values for the alternative hypothesis (shifted to the right)
# Decrease the shift to increase the overlap
z_density_alt <- dnorm(z_scores, mean = 0.3)  # Slightly shifted distribution

# Create a data frame for plotting
data_null <- data.frame(z_scores, z_density_null)
data_alt <- data.frame(z_scores, z_density_alt)

# Plot for more overlap between null and alternative distributions
plot4 <- ggplot() +
  # Plot the null hypothesis curve (standard normal distribution)
  geom_line(data = data_null, aes(x = z_scores, y = z_density_null), color = "black", alpha = 0.6) +  # Slight transparency for overlap
  # Plot the alternative hypothesis curve (shifted normal distribution)
  geom_line(data = data_alt, aes(x = z_scores, y = z_density_alt), color = "blue", alpha = 0.6) +  # Slight transparency for overlap
  # Add vertical lines for critical values
  geom_vline(xintercept = c(-z_critical_two_tailed, z_critical_two_tailed), color = "black", linetype = "dashed") +
  # Add a label indicating no statistical significance
  annotate("text", x = 0, y = 0.15, label = "Null Hypothesis Accepted", color = "black", size = 4, fontface = "bold") +  # Centered annotation
  labs(title = "P4: Earthlings = Martians", x = "Z-Score", y = "Density") +
  theme_minimal() +
  theme(
    plot.title = element_text(hjust = 0.5),  # Center the title
    axis.title.x = element_text(size = 12),
    axis.title.y = element_text(size = 12)
  )


# Create a list of plots
plots <- list(plot1, plot2, plot3, plot4)

# Use grid.arrange to arrange the plots with specified heights
grid.arrange(grobs = plots, ncol = 2, nrow = 2, heights = c(1, 1), widths = c(.5, .5))

#Independent samples t-test bell curves and box plots

# Load necessary libraries
library(ggplot2)
library(gridExtra)

set.seed(123)  # Make the simulated groups reproducible

# Simulated data for non-significant difference (Earthlings vs Martians)
nonsig_data <- data.frame(
  ACT_Score = c(rnorm(100, mean = 23, sd = 4), rnorm(100, mean = 22, sd = 4)),
  Group = rep(c("Earthlings", "Martians"), each = 100)
)

# Simulated data for significant difference (Earthlings vs Martians)
sig_data <- data.frame(
  HS_GPA = c(rnorm(100, mean = 3.5, sd = 0.3), rnorm(100, mean = 2.5, sd = 0.3)),
  Group = rep(c("Earthlings", "Martians"), each = 100)
)

# Boxplot non-significant difference (ACT scores)
p1 <- ggplot(nonsig_data, aes(x = Group, y = ACT_Score, fill = Group)) +
  geom_boxplot(alpha = 0.6) +
  ggtitle("Non-Significant Difference (Earthlings vs Martians)") +
  ylab("ACT Score") +
  xlab("Group") +
  theme_minimal()

# Boxplot significant difference (HS GPA)
p2 <- ggplot(sig_data, aes(x = Group, y = HS_GPA, fill = Group)) +
  geom_boxplot(alpha = 0.6) +
  ggtitle("Significant Difference (Earthlings vs Martians)") +
  ylab("HS GPA") +
  xlab("Group") +
  theme_minimal()

# Density plot non-significant difference (ACT scores)
p3 <- ggplot(nonsig_data, aes(x = ACT_Score, fill = Group)) +
  geom_density(alpha = 0.6) +
  ggtitle("Non-Significant Difference (ACT Scores - Earthlings vs Martians)") +
  xlab("ACT Score") +
  theme_minimal()

# Density plot significant difference (HS GPA)
p4 <- ggplot(sig_data, aes(x = HS_GPA, fill = Group)) +
  geom_density(alpha = 0.6) +
  ggtitle("Significant Difference (HS GPA - Earthlings vs Martians)") +
  xlab("HS GPA") +
  theme_minimal()

# Arrange boxplot and density plot for non-significant difference
grid.arrange(p1, p3, ncol = 1)

# Arrange boxplot and density plot for significant difference
grid.arrange(p2, p4, ncol = 1)

# One-Way ANOVA boxplots and bell curves

library(ggplot2)
library(gridExtra)
library(psych)
set.seed(123)

# Simulated data for non-significant difference (Earthlings vs Venusians vs Martians)
nonsig_data <- data.frame(
  ACT_Score = c(rnorm(100, mean = 23, sd = 4), rnorm(100, mean = 24, sd = 4), rnorm(100, mean = 22, sd = 4)),
  Group = rep(c("Earthlings", "Venusians", "Martians"), each = 100)
)

# Simulated data for significant difference (Earthlings vs Martians vs Venusians)
sig_data <- data.frame(
  HS_GPA = c(rnorm(100, mean = 3.5, sd = 0.3), rnorm(100, mean = 3.0, sd = 0.3), rnorm(100, mean = 2.5, sd = 0.3)),
  Group = rep(c("Earthlings", "Venusians", "Martians"), each = 100)
)

# Boxplot non-significant difference (ACT scores)
p1 <- ggplot(nonsig_data, aes(x = Group, y = ACT_Score, fill = Group)) +
  geom_boxplot(alpha = 0.6) +
  ggtitle("Non-Significant Difference (Earthlings, Venusians, Martians)") +
  ylab("ACT Score") +
  xlab("Group") +
  theme_minimal()

# Boxplot significant difference (HS GPA)
p2 <- ggplot(sig_data, aes(x = Group, y = HS_GPA, fill = Group)) +
  geom_boxplot(alpha = 0.6) +
  ggtitle("Significant Difference (Earthlings, Venusians, Martians)") +
  ylab("HS GPA") +
  xlab("Group") +
  theme_minimal()

# Density plot non-significant difference (ACT scores)
p3 <- ggplot(nonsig_data, aes(x = ACT_Score, fill = Group)) +
  geom_density(alpha = 0.6) +
  ggtitle("Non-Significant Difference (ACT Scores - Earthlings, Venusians, Martians)") +
  xlab("ACT Score") +
  theme_minimal()

# Density plot significant difference (HS GPA)
p4 <- ggplot(sig_data, aes(x = HS_GPA, fill = Group)) +
  geom_density(alpha = 0.6) +
  ggtitle("Significant Difference (HS GPA - Earthlings, Venusians, Martians)") +
  xlab("HS GPA") +
  theme_minimal()

# Arrange boxplot and density plot for non-significant difference
grid.arrange(p1, p3, ncol = 1)

# Arrange boxplot and density plot for significant difference
grid.arrange(p2, p4, ncol = 1)

# Dependent samples t-test from book
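
# The output below is reproduced from the book. A minimal sketch that would
# regenerate it, assuming a (hypothetical) data frame `data` with Pretest and
# Posttest columns for the 20 students; it is left commented because that data
# frame is not defined in this script:
#   library(psych)
#   t.test(data$Pretest, data$Posttest, paired = TRUE)   # Paired t-test
#   describe(data$Pretest)                               # First describe() block below
#   describe(data$Posttest)                              # Second block
#   describe(data$Posttest - data$Pretest)               # Third block (gain scores)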

## 
##  Paired t-test
## 
## data:  data$Pretest and data$Posttest
## t = -21.456, df = 19, p-value = 8.846e-15
## alternative hypothesis: true mean difference is not equal to 0
## 95 percent confidence interval:
##  -15.1462 -12.4538
## sample estimates:
## mean difference 
##           -13.8
##    vars  n  mean   sd median trimmed  mad min max range skew kurtosis   se
## X1    1 20 73.15 7.73     73   73.06 8.15  59  87    28 0.02    -0.79 1.73
##    vars  n  mean   sd median trimmed mad min max range  skew kurtosis   se
## X1    1 20 86.95 7.85   86.5   87.25 8.9  70  98    28 -0.23    -0.95 1.76
##    vars  n mean   sd median trimmed  mad min max range skew kurtosis   se
## X1    1 20 13.8 2.88     14   13.62 2.22   9  20    11 0.37    -0.57 0.64

#Pre, Post, Post-Post reading scores (repeated measures ANOVA)

# Creating the data frame of three groups with the same scores
data <- data.frame(
  Student = factor(rep(1:5, each=3)),  # 5 students, 3 conditions
  Condition = factor(rep(c("Pre", "Post", "Post-Post"), times=5)),
  Score = c(80, 75, 67, 90, 85, 85, 78, 72, 74, 85, 80, 78, 88, 83, 91)
)

# 1. Run repeated measures ANOVA with ezANOVA (handling assumptions like sphericity)
library(ez)
anova_result <- ezANOVA(data = data,
                        dv = .(Score),
                        wid = .(Student),
                        within = .(Condition),
                        detailed = TRUE)
anova_result
## $ANOVA
##        Effect DFn DFd         SSn       SSd          F            p p<.05
## 1 (Intercept)   1   4 97768.06667 482.93333 809.785202 9.074958e-06     *
## 2   Condition   2   8    90.13333  89.86667   4.011869 6.213045e-02      
##         ges
## 1 0.9941754
## 2 0.1359614
## 
## $`Mauchly's Test for Sphericity`
##      Effect          W           p p<.05
## 2 Condition 0.01730226 0.002275905     *
## 
## $`Sphericity Corrections`
##      Effect       GGe     p[GG] p[GG]<.05       HFe     p[HF] p[HF]<.05
## 2 Condition 0.5043633 0.1150838           0.5087521 0.1144435
# 2. Normality Test for Each Condition
library(dplyr)
data %>%
  group_by(Condition) %>%
  summarise(shapiro_p = shapiro.test(Score)$p.value)
## # A tibble: 3 × 2
##   Condition shapiro_p
##   <fct>         <dbl>
## 1 Post          0.716
## 2 Post-Post     0.977
## 3 Pre           0.641
# 3. Mauchly's Test of Sphericity
# Using ezANOVA, sphericity testing is automatically handled and included in the output.
# If needed, you can manually extract the sphericity correction from the results.
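# For example, the Greenhouse-Geisser-corrected p-value can be extracted
# directly (element and column names match the ezANOVA output above):
anova_result$`Sphericity Corrections`$`p[GG]`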

# 4. Post-Hoc Pairwise Comparison if the ANOVA is significant
if (anova_result$ANOVA$p[2] < 0.05) {  # Row 2 of the ANOVA table is the Condition effect (row 1 is the intercept)
  pairwise_result <- pairwise.t.test(data$Score, data$Condition, paired = TRUE)
  print(pairwise_result)
}
## 
##  Pairwise comparisons using paired t tests 
## 
## data:  data$Score and data$Condition 
## 
##           Post    Post-Post
## Post-Post 1.00    -        
## Pre       3.9e-05 0.23     
## 
## P value adjustment method: holm

#Regression analysis of human height and shoe size

# Load necessary library
library(ggplot2)

# Sample data for height and shoe size (the first pair, height 128 / size 0,
# pins the fitted intercept near 128 cm in the plot below)
height <- c(128,170, 175, 180, 165, 160, 185, 178, 182, 173, 168, 
            180, 169, 177, 165, 174, 186, 178, 165, 172, 180)

shoe_size <- c(0,9, 10, 11, 8, 7, 12, 10, 11, 9, 8, 
               11, 8, 10, 8, 9, 12, 11, 8, 9, 10)

# Create a data frame
height_shoe_size <- data.frame(Height = height, Shoe_Size = shoe_size)

# Fit the linear model
model <- lm(Height ~ Shoe_Size, data = height_shoe_size)

# Get the model's coefficients (intercept and slope)
intercept <- coef(model)[1]
slope <- coef(model)[2]

# Calculate the intercept at x = 0
y_intercept_at_x_0 <- intercept  # When x = 0, the y-intercept is the model's intercept
cat("The y-axis intercept when x = 0 is:", y_intercept_at_x_0, "cm\n")
## The y-axis intercept when x = 0 is: 128.1263 cm
# Check whether a given point exists in the dataset (height 160 cm, US men's size 7)
subset(height_shoe_size, Height == 160 & Shoe_Size == 7)
##   Height Shoe_Size
## 6    160         7
# Create the scatterplot with the regression line, highlight the specific
# point (with a small tolerance), and add more x-axis ticks
ggplot(height_shoe_size, aes(x = Shoe_Size, y = Height)) +
  geom_point(color = "blue") +  # Scatterplot points
  geom_smooth(method = "lm", se = FALSE, color = "red") +  # Fit line with intercept
  # Highlight the specific point (height = 160, shoe size = 7) with a tolerance
  geom_point(data = subset(height_shoe_size, abs(Height - 160) < 1 & abs(Shoe_Size - 7) < 1), 
             aes(x = Shoe_Size, y = Height), 
             color = "black", shape = 1, size = 5, stroke = 2) +  # Circle around the point
  labs(title = "Scatterplot of Height vs Shoe Size",
       x = "Shoe Size (US Men)",
       y = "Height (cm)") +
  theme_minimal() +
  ylim(120, max(height_shoe_size$Height) + 1) +  # Set y-axis limit
  scale_x_continuous(limits = c(0, max(height_shoe_size$Shoe_Size) + 1),
                     breaks = seq(0, max(height_shoe_size$Shoe_Size) + 1, by = 1))  # x-axis limits with a tick at every size

# Load required libraries
library(ggplot2)
library(gridExtra)

# Set a seed for reproducibility
set.seed(42)

# Create a function to generate correlated data
generate_data <- function(r) {
  x <- rnorm(100)
  y <- r * x + sqrt(1 - r^2) * rnorm(100)
  data.frame(x, y)
}
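# Quick illustrative check (added): the sample correlation of generated data
# lands near the requested r; left commented so it does not advance the RNG
# stream used for the plots below.
#   demo <- generate_data(0.9); cor(demo$x, demo$y)  # roughly 0.9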

# Create the plots for each correlation
correlations <- c(0.2, -0.2, 0.5, -0.5, 0.9, -0.9, 0)
labels <- c("Weak Positive", "Weak Negative", "Moderate Positive", "Moderate Negative", "Strong Positive", "Strong Negative", "No Correlation")

plots <- lapply(1:7, function(i) {
  data <- generate_data(correlations[i])
  ggplot(data, aes(x, y)) + 
    geom_point() + 
    geom_smooth(method = "lm", se = FALSE, color = "blue") +
    ggtitle(paste0(labels[i], " (r = ", correlations[i], ")")) + 
    theme_minimal() +
    theme(axis.title.x = element_blank(), axis.title.y = element_blank())
})

# Arrange the plots in a grid
grid.arrange(grobs = plots, ncol = 3)

# Load necessary libraries
library(ggplot2)
library(gridExtra)
library(apaTables)
library(lmtest)

# Create a data frame with all variables
multheight <- data.frame(
  height = c(170, 175, 180, 165, 160, 185, 178, 182, 173, 168, 
             180, 169, 177, 165, 174, 186, 178, 165, 172, 180),
  
  shoe_size = c(9, 10, 11, 8, 7, 12, 10, 11, 9, 8, 
                11, 8, 10, 8, 9, 12, 11, 8, 9, 10),
  
  biom = c(160, 162, 155, 167, 170, 158, 165, 172, 168, 160, 
           163, 155, 174, 165, 170, 168, 167, 160, 162, 159),
  
  biof = c(175, 180, 185, 170, 168, 177, 180, 172, 185, 173, 
           169, 178, 174, 181, 179, 176, 182, 173, 170, 177)
)

# View the first few rows
head(multheight)
##   height shoe_size biom biof
## 1    170         9  160  175
## 2    175        10  162  180
## 3    180        11  155  185
## 4    165         8  167  170
## 5    160         7  170  168
## 6    185        12  158  177
# Run multiple regression analysis
model <- lm(height ~ biom + biof, data = multheight)

# Plot to check the linearity assumption for biom
p1 <- ggplot(multheight, aes(x = biom, y = height)) + 
  geom_point() + 
  geom_smooth(method = "lm", color = "blue") + 
  labs(title = "Height vs. Biom (Biological Mother)", x = "Biological Mother Height", y = "Child Height")

# Plot to check the linearity assumption for biof
p2 <- ggplot(multheight, aes(x = biof, y = height)) + 
  geom_point() + 
  geom_smooth(method = "lm", color = "blue") + 
  labs(title = "Height vs. Biof (Biological Father)", x = "Biological Father Height", y = "Child Height")

# Residuals vs fitted values plot to check for independence of errors
p3 <- ggplot(data = multheight, aes(x = model$fitted.values, y = residuals(model))) + 
  geom_point() + 
  geom_hline(yintercept = 0, color = "red") + 
  labs(title = "Residuals vs Fitted Values", x = "Fitted Values", y = "Residuals")

# Residuals vs fitted values plot to check homoscedasticity
p4 <- ggplot(data = multheight, aes(x = model$fitted.values, y = residuals(model))) + 
  geom_point() + 
  geom_hline(yintercept = 0, color = "red") + 
  labs(title = "Residuals vs Fitted Values (Homoscedasticity)", x = "Fitted Values", y = "Residuals") +
  theme_minimal()

# Histogram of residuals to check normality
p5 <- ggplot(data = multheight, aes(x = residuals(model))) + 
  geom_histogram(bins = 10, fill = "skyblue", color = "black") + 
  labs(title = "Histogram of Residuals", x = "Residuals", y = "Frequency") +
  theme_minimal()

# Q-Q plot to check normality
p6 <- ggplot(data = multheight, aes(sample = residuals(model))) + 
  geom_qq() + 
  geom_qq_line(color = "red") + 
  labs(title = "Q-Q Plot of Residuals") +
  theme_minimal()

grid.arrange(p1, p2, p3, p4, p5, p6, ncol = 2)

# Check for multicollinearity using VIF
library(car)
vif(model)
##     biom     biof 
## 1.043446 1.043446
# Durbin-Watson test to check for autocorrelation in residuals
dwtest(model)
## 
##  Durbin-Watson test
## 
## data:  model
## DW = 2.3376, p-value = 0.7756
## alternative hypothesis: true autocorrelation is greater than 0
# Breusch-Pagan test for homoscedasticity
bptest(model)
## 
##  studentized Breusch-Pagan test
## 
## data:  model
## BP = 2.0091, df = 2, p-value = 0.3662
# Summarize the model
summary(model)
## 
## Call:
## lm(formula = height ~ biom + biof, data = multheight)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -11.1899  -5.1111   0.0648   3.7485  11.6584 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)
## (Intercept) 87.03025   86.40189   1.007    0.328
## biom         0.08133    0.31480   0.258    0.799
## biof         0.41846    0.33782   1.239    0.232
## 
## Residual standard error: 7.345 on 17 degrees of freedom
## Multiple R-squared:  0.08279,    Adjusted R-squared:  -0.02512 
## F-statistic: 0.7672 on 2 and 17 DF,  p-value: 0.4797
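# Added illustration: the 95% CIs for the b-weights in the APA table below can
# also be obtained directly with confint()
confint(model)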
# Create the APA-style regression table
apa.reg.table(model, filename = "multreg.doc")
## 
## 
## Regression results using height as the criterion
##  
## 
##    Predictor     b         b_95%_CI beta   beta_95%_CI sr2  sr2_95%_CI   r
##  (Intercept) 87.03 [-95.26, 269.32]                                       
##         biom  0.08    [-0.58, 0.75] 0.06 [-0.44, 0.56] .00 [-.05, .05] .00
##         biof  0.42    [-0.29, 1.13] 0.29 [-0.21, 0.79] .08 [-.15, .31] .28
##                                                                           
##                                                                           
##                                                                           
##              Fit
##                 
##                 
##                 
##        R2 = .083
##  95% CI[.00,.30]
##                 
## 
## Note. A significant b-weight indicates the beta-weight and semi-partial correlation are also significant.
## b represents unstandardized regression weights. beta indicates the standardized regression weights. 
## sr2 represents the semi-partial correlation squared. r represents the zero-order correlation.
## Square brackets are used to enclose the lower and upper limits of a confidence interval.
## * indicates p < .05. ** indicates p < .01.
## 

#Logistic regression predicting doctorate attainment using HS GPA and College GPA as predictors

# Check assumptions
library(ggplot2)
library(car)
library(dplyr)
library(gridExtra)

# Simulate dataset
n <- 20
data <- data.frame(
  High_School_GPA = round(runif(n, 2.5, 4.0), 2),
  College_GPA = round(runif(n, 2.5, 4.0), 2),
  Doctorate = sample(0:1, n, replace = TRUE, prob = c(0.5, 0.5))  # Equal chance of 0 and 1
)


# Fit the logistic regression model up front so its residuals can be plotted below
model <- glm(Doctorate ~ High_School_GPA + College_GPA, data = data, family = binomial)

grid.arrange(
  ggplot(data, aes(x = High_School_GPA)) + 
    geom_histogram(binwidth = 0.2, fill = "blue", alpha = 0.6) + 
    labs(title = "Distribution of High School GPA"),

  ggplot(data, aes(x = College_GPA)) + 
    geom_histogram(binwidth = 0.2, fill = "red", alpha = 0.6) + 
    labs(title = "Distribution of College GPA"),

  ggplot(data, aes(y = High_School_GPA)) +
    geom_boxplot(fill = "blue", alpha = 0.6) +
    labs(title = "Boxplot of High School GPA"),

  ggplot(data, aes(y = College_GPA)) +
    geom_boxplot(fill = "red", alpha = 0.6) +
    labs(title = "Boxplot of College GPA"),

  ggplot(data, aes(x = High_School_GPA, y = College_GPA, color = as.factor(Doctorate))) +
    geom_jitter(width = 0.1, height = 0.1) +
    labs(title = "Scatterplot of GPAs by Doctorate Status") +
    scale_color_manual(values = c("blue", "red"), labels = c("No", "Yes")),
  
  # Check residuals
ggplot(data, aes(x = fitted(model), y = residuals(model, type = "pearson"))) +
  geom_point() +
  geom_smooth(method = "lm", color = "blue") +
  ggtitle("Residuals vs Fitted Values"),

  ncol = 2
)

# Log-transform continuous predictors for Box-Tidwell test
data$log_High_School_GPA <- log(data$High_School_GPA)
data$log_College_GPA <- log(data$College_GPA)


# Box-Tidwell test: Adding interaction terms between original and log-transformed predictors
box_tidwell_model <- glm(Doctorate ~ High_School_GPA + College_GPA + 
                           High_School_GPA:log_High_School_GPA + College_GPA:log_College_GPA, 
                         data = data, family = binomial)

# Check the summary of the Box-Tidwell model
summary(box_tidwell_model)
## 
## Call:
## glm(formula = Doctorate ~ High_School_GPA + College_GPA + High_School_GPA:log_High_School_GPA + 
##     College_GPA:log_College_GPA, family = binomial, data = data)
## 
## Coefficients:
##                                     Estimate Std. Error z value Pr(>|z|)  
## (Intercept)                          -318.09     187.62  -1.695    0.090 .
## High_School_GPA                        40.05      46.47   0.862    0.389  
## College_GPA                           173.92     111.04   1.566    0.117  
## High_School_GPA:log_High_School_GPA   -19.14      21.43  -0.893    0.372  
## College_GPA:log_College_GPA           -78.97      50.45  -1.565    0.118  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 27.726  on 19  degrees of freedom
## Residual deviance: 21.933  on 15  degrees of freedom
## AIC: 31.933
## 
## Number of Fisher Scoring iterations: 4
# Check multicollinearity
vif(box_tidwell_model)
##                     High_School_GPA                         College_GPA 
##                            1986.623                            2797.653 
## High_School_GPA:log_High_School_GPA         College_GPA:log_College_GPA 
##                            1991.941                            2790.169
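# Note (added): VIFs this large are expected here because each Box-Tidwell
# interaction term is built from its own main effect; compare the much smaller
# VIFs of the main-effects model checked below.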
# Fit logistic regression model
model <- glm(Doctorate ~ High_School_GPA + College_GPA, data = data, family = binomial)
summary(model)
## 
## Call:
## glm(formula = Doctorate ~ High_School_GPA + College_GPA, family = binomial, 
##     data = data)
## 
## Coefficients:
##                 Estimate Std. Error z value Pr(>|z|)
## (Intercept)       5.1370     6.1004   0.842    0.400
## High_School_GPA  -1.2857     0.9839  -1.307    0.191
## College_GPA      -0.2914     1.8128  -0.161    0.872
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 27.726  on 19  degrees of freedom
## Residual deviance: 25.628  on 17  degrees of freedom
## AIC: 31.628
## 
## Number of Fisher Scoring iterations: 4
# Fit logistic regression model without Box-Tidwell terms first
model <- glm(Doctorate ~ High_School_GPA + College_GPA, data = data, family = binomial)

model_null <- glm(Doctorate ~ 1, data = data, family = binomial)

anova(model_null, model, test = "Chisq")
## Analysis of Deviance Table
## 
## Model 1: Doctorate ~ 1
## Model 2: Doctorate ~ High_School_GPA + College_GPA
##   Resid. Df Resid. Dev Df Deviance Pr(>Chi)
## 1        19     27.726                     
## 2        17     25.628  2   2.0976   0.3504
summary(model)
## 
## Call:
## glm(formula = Doctorate ~ High_School_GPA + College_GPA, family = binomial, 
##     data = data)
## 
## Coefficients:
##                 Estimate Std. Error z value Pr(>|z|)
## (Intercept)       5.1370     6.1004   0.842    0.400
## High_School_GPA  -1.2857     0.9839  -1.307    0.191
## College_GPA      -0.2914     1.8128  -0.161    0.872
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 27.726  on 19  degrees of freedom
## Residual deviance: 25.628  on 17  degrees of freedom
## AIC: 31.628
## 
## Number of Fisher Scoring iterations: 4
# Check multicollinearity
vif(model)
## High_School_GPA     College_GPA 
##        1.056721        1.056721
# APA-style interpretation
exp(coef(model))  # Convert log-odds to odds ratios
##     (Intercept) High_School_GPA     College_GPA 
##     170.2024177       0.2764435       0.7472186
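# Added illustration: the model can also return a predicted probability of
# earning a doctorate for a hypothetical student
predict(model, newdata = data.frame(High_School_GPA = 3.8, College_GPA = 3.9), type = "response")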
write.csv(data, "logisticdata.csv")
#Scatterplot illustrating y = a + bx + e (intercept, prediction line, and error)

library(ggplot2)

set.seed(42)
n <- 20
x <- seq(1, 10, length.out = n)
a <- 2
b <- 1.5
e <- rnorm(n, mean = 0, sd = 1)
y <- a + b * x + e

model <- lm(y ~ x)
predicted_y <- predict(model)

df <- data.frame(x, y, predicted_y)

intercept <- coef(model)[1]

x_pred_annotate <- 9
y_pred_annotate <- predict(model, newdata = data.frame(x = x_pred_annotate))

error_mid <- (df$predicted_y[1] + df$y[1]) / 2

p <- ggplot(df, aes(x, y)) +
  geom_point(color = "blue", size = 3) +
  geom_smooth(method = "lm", color = "red", se = FALSE) +
  geom_segment(aes(x = x, xend = x, y = predicted_y, yend = y), 
               arrow = arrow(length = unit(0.2, "cm")), color = "black") +  # Residual (error) arrows from prediction to observation
  geom_vline(xintercept = 0, color = "black", linewidth = 1) +
  scale_x_continuous(limits = c(-1, max(x) + 1), breaks = seq(-1, max(x) + 1, by = 1)) +  # X-axis ticks at increments of 1
  labs(title = "Scatterplot with y = a + bx + e, where y = # of interviews and x = # of shoes",
       x = "Number of Shoes", 
       y = "Number of Interviews") +
  theme_minimal() +
  theme(axis.line = element_line(color = "black", linewidth = 1)) +
  annotate("segment", x = -0.5, xend = 0, 
           y = intercept + 0.5, yend = intercept, 
           arrow = arrow(length = unit(0.2, "cm")), color = "black") +
  annotate("text", x = -0.6, y = intercept + 0.9, 
           label = "a, or intercept", hjust = .5, vjust = 0.5, color = "black") +
  annotate("segment", x = x_pred_annotate - 1, xend = x_pred_annotate - .2,  # Shift left
         y = y_pred_annotate + 1, yend = y_pred_annotate,  # Arrow pointing down
         arrow = arrow(length = unit(0.2, "cm")), 
         size = .5,  # Thicker arrow
         color = "red") +
  annotate("text", x = x_pred_annotate - 0.3, y = y_pred_annotate + 1.2, 
           label = "prediction line, or bx", hjust = 1, color = "red") +  # Shift left
  annotate("segment", x = df$x[1] + 0.5, xend = df$x[1], 
           y = error_mid + 0.5, yend = error_mid, 
           arrow = arrow(length = unit(0.2, "cm")), color = "black") +
  annotate("text", x = df$x[1] + 0.6, y = error_mid + 0.1, 
           label = "error, or e", hjust = 0, color = "black")

print(p)