# Load the libraries (tidyverse already includes dplyr and ggplot2)
library(tidyverse)
#Figure 2.1: How our students scored compared to the mean and to each other, as coded in R
# Create the data
data <- data.frame(
Name = c("Malcolm", "Josie", "Liam", "Jackson", "Xavier"),
Score = c(10, 12, 12, 15, 23))
# Calculate the mean
mean_score <- mean(data$Score)
# Plot the data
ggplot(data, aes(x = Name, y = Score)) +
geom_line(group = 1, color = "blue", linewidth = 1) + # Line connecting the points
geom_point(aes(color = Score), size = 4) + # Points representing the actual data
geom_hline(yintercept = mean_score, linetype = "dashed", color = "red") + # Mean line
labs(title = "Line Plot with Mean in the Middle",
x = "Name",
y = "Score") +
scale_color_gradient(low = "lightblue", high = "darkblue") + # Color scale for points
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) # Rotate x-axis labels
#This is how skewness, used to assess normality, is calculated in R using the psych package.
# Load necessary library
library(psych)
# Load the mtcars dataset
data(mtcars)
# Select the 'mpg' variable (miles per gallon)
mpg_data <- mtcars$mpg
# Calculate skewness using the psych package
psych_skew <- skew(mpg_data)
# Calculate unstandardized skewness (raw third moment)
N <- length(mpg_data)
mean_mpg <- mean(mpg_data)
raw_skew <- (1 / N) * sum((mpg_data - mean_mpg)^3)
# Calculate standardized skewness
sd_mpg <- sd(mpg_data)
std_skew <- raw_skew / (sd_mpg^3)
# Print results for comparison
cat("Psych skewness: ", psych_skew, "\n")
## Psych skewness: 0.610655
cat("Raw skewness (unstandardized): ", raw_skew, "\n")
## Raw skewness (unstandardized): 133.6867
cat("Standardized skewness: ", std_skew, "\n")
## Standardized skewness: 0.610655
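# Quick cross-check (an illustrative sketch, not from the text): psych::skew()
# exposes a type argument; type = 3 is its default and matches std_skew above.
cat("Psych skewness (type = 3): ", skew(mpg_data, type = 3), "\n")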
# This compares the psych package's kurtosis formula with manual forms. Because the psych excess kurtosis and the manual excess kurtosis are equal, we have identified how the psych package calculates kurtosis, as shown in Chapter 2 of Perkins.
# Load necessary library
library(psych)
# Load the mtcars dataset
data(mtcars)
# Select the 'mpg' variable (miles per gallon)
mpg_data <- mtcars$mpg
# Calculate the sample size (n)
n <- length(mpg_data)
# Calculate the mean (x̄) and standard deviation (σ) of the mpg data
mean_mpg <- mean(mpg_data)
sd_mpg <- sd(mpg_data)
# Calculate the fourth central moment (m4)
m4 <- sum(((mpg_data - mean_mpg) / sd_mpg)^4) / n # Fourth standardized moment (denominator n, sample sd)
# Calculate excess kurtosis (manual)
manual_kurtosis <- m4 - 3
# Calculate kurtosis using psych package for comparison
psych_kurtosis <- kurtosi(mpg_data)
# Output the results
cat("Manual Excess Kurtosis (simplified): ", manual_kurtosis, "\n")
## Manual Excess Kurtosis (simplified): -0.372766
cat("Psych Package Excess Kurtosis: ", psych_kurtosis, "\n")
## Psych Package Excess Kurtosis: -0.372766
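# For reference, a small-sample-corrected (type 2) excess kurtosis can also be
# computed by hand. This is a sketch of the standard bias correction, not a
# formula from Perkins:
m2_raw <- sum((mpg_data - mean_mpg)^2) / n # Biased second moment
m4_raw <- sum((mpg_data - mean_mpg)^4) / n # Biased fourth moment
g2 <- m4_raw / m2_raw^2 - 3 # Type 1 (biased) excess kurtosis
G2 <- ((n + 1) * g2 + 6) * (n - 1) / ((n - 2) * (n - 3)) # Type 2 correction
cat("Sample-corrected excess kurtosis (type 2): ", G2, "\n")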
#Figure 2.2: Normal distribution of 2nd grade students’ heights.
# Load necessary libraries
library(ggplot2)
# Set seed for reproducibility
set.seed(123)
# Simulate heights for 500 second graders based on a normal distribution
n <- 500
mean_height <- 50
sd_height <- 3
heights <- round(rnorm(n, mean = mean_height, sd = sd_height)) # Normally distributed heights
# Clip values to stay within 40 to 60 inches
heights <- pmax(40, pmin(heights, 60))
# Plot the bell curve with both histogram and density
ggplot(data.frame(heights), aes(x = heights)) +
geom_histogram(aes(y = after_stat(count)), binwidth = 1, fill = "dodgerblue4", color = "black", boundary = 40) + # Histogram of counts
theme_minimal() +
labs(title = "Normal Distribution (Bell Curve) of Second Graders' Heights", x = "Height (inches)", y = "Number of Kids") +
theme(axis.text = element_text(size = 12), axis.title = element_text(size = 14)) +
# Overlay a density curve rescaled from density to counts (density * n * binwidth, with binwidth = 1)
geom_density(aes(y = after_stat(density) * n), fill = "red", alpha = 0.2, adjust = 1.5) +
scale_x_continuous(breaks = seq(40, 60, by = 1)) # More tick marks on the x-axis
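# Optional follow-up (illustrative, not part of Figure 2.2): a Shapiro-Wilk test
# of the simulated heights. Note that the rounding and clipping above can nudge
# the result away from perfect normality.
shapiro.test(heights)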
#Figure 2.3: Normal, positive and negative distributions
# Load required libraries
library(ggplot2)
library(gridExtra)
library(MASS) # For generating skewed distributions
# Set up the number of data points
n <- 1000
# Create data for the distributions
normal_data <- rnorm(n, mean = 5, sd = 1) # Normal distribution with mean = 5
positively_skewed_data <- rbeta(n, shape1 = 2, shape2 = 5) # Beta distribution (positively skewed)
negatively_skewed_data <- rbeta(n, shape1 = 5, shape2 = 2) # Beta distribution (negatively skewed)
# Function to calculate the peak of the density curve for mode label placement
get_peak_y <- function(data) {
density_data <- density(data, bw = 1) # Larger bandwidth for "fatter" curves
peak_y <- max(density_data$y)
return(peak_y)
}
# Custom Labeling function to avoid overlap
adjust_label_y <- function(peak_y, label_number, offset_factor = 0.1) {
# Spacing the labels further apart by adding an offset factor for each label
vertical_offset <- peak_y - (offset_factor * label_number)
return(vertical_offset)
}
# Normal Distribution Plot
normal_mean <- mean(normal_data)
normal_median <- median(normal_data)
normal_mode <- normal_mean # For a sampled normal distribution, the mean approximates the mode
normal_peak_y <- get_peak_y(normal_data)
normal_plot <- ggplot(data.frame(x = normal_data), aes(x = x)) +
geom_density(fill = "skyblue", color = "black", adjust = 1.5) +
geom_vline(aes(xintercept = normal_mean), color = "red", linetype = "dashed") +
geom_vline(aes(xintercept = normal_median), color = "blue", linetype = "dotted") +
geom_vline(aes(xintercept = normal_mode), color = "black", linetype = "solid") +
annotate("text", x = normal_mean, y = normal_peak_y * 0.9, label = "Mean", color = "red", hjust = -0.2) +
annotate("text", x = normal_median, y = normal_peak_y * 0.75, label = "Median", color = "blue", hjust = -0.2) +
annotate("text", x = normal_mode, y = normal_peak_y * 0.5, label = "Mode", color = "black", hjust = -0.2) +
ggtitle("Normal Distribution") + theme_minimal()
# Positively Skewed Distribution Plot
positively_skewed_mean <- mean(positively_skewed_data)
positively_skewed_median <- median(positively_skewed_data)
positively_skewed_mode <- positively_skewed_mean - (positively_skewed_mean * 0.19) # Mode estimate for skewed
positively_skewed_peak_y <- get_peak_y(positively_skewed_data)
positively_skewed_plot <- ggplot(data.frame(x = positively_skewed_data), aes(x = x)) +
geom_density(fill = "lightgreen", color = "black", adjust = 1.5) +
geom_vline(aes(xintercept = positively_skewed_mean), color = "red", linetype = "dashed") +
geom_vline(aes(xintercept = positively_skewed_median), color = "blue", linetype = "dotted") +
geom_vline(aes(xintercept = positively_skewed_mode), color = "black", linetype = "solid") +
annotate("text", x = positively_skewed_mean, y = positively_skewed_peak_y * 3, label = "Mean", color = "red", hjust = -0.2) +
annotate("text", x = positively_skewed_median, y = positively_skewed_peak_y * 2, label = "Median", color = "blue", hjust = -0.2) +
annotate("text", x = positively_skewed_mode, y = positively_skewed_peak_y * 1, label = "Mode", color = "black", hjust = -0.2) +
ggtitle("Positively Skewed Distribution") + theme_minimal()
# Negatively Skewed Distribution Plot
negatively_skewed_mean <- mean(negatively_skewed_data)
negatively_skewed_median <- median(negatively_skewed_data)
negatively_skewed_mode <- negatively_skewed_mean + (negatively_skewed_mean * 0.09) # Adjusting mode for skewed distribution
negatively_skewed_peak_y <- get_peak_y(negatively_skewed_data)
negatively_skewed_plot <- ggplot(data.frame(x = negatively_skewed_data), aes(x = x)) +
geom_density(fill = "orange", color = "black", adjust = 1) +
geom_vline(aes(xintercept = negatively_skewed_mean), color = "red", linetype = "dashed") +
geom_vline(aes(xintercept = negatively_skewed_median), color = "blue", linetype = "dotted") +
geom_vline(aes(xintercept = negatively_skewed_mode), color = "black", linetype = "solid") +
annotate("text", x = negatively_skewed_mean, y = negatively_skewed_peak_y * 3, label = "Mean", color = "red", hjust = -0.2) +
annotate("text", x = negatively_skewed_median, y = negatively_skewed_peak_y * 2, label = "Median", color = "blue", hjust = -0.2) +
annotate("text", x = negatively_skewed_mode, y = negatively_skewed_peak_y * 1, label = "Mode", color = "black", hjust = -0.2) +
ggtitle("Negatively Skewed Distribution") + theme_minimal()
# Arrange the plots in a two-column grid
grid.arrange(normal_plot, positively_skewed_plot, negatively_skewed_plot, ncol = 2)
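# The skewed modes above are rough offsets from the mean. A more direct estimate
# (a sketch) takes the x-value at the peak of the kernel density:
estimate_mode <- function(data) {
density_data <- density(data)
density_data$x[which.max(density_data$y)]
}
estimate_mode(positively_skewed_data) # Compare with positively_skewed_mode above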
#Figure 2.4: Mesokurtic, platykurtic and leptokurtic distributions
# Load required libraries
library(ggplot2)
library(gridExtra)
library(MASS) # For generating distributions
# Set up the number of data points
n <- 1000
# Create data for the distributions
mesokurtic_data <- rnorm(n, mean = 5, sd = 1) # Normal distribution (Mesokurtic)
platykurtic_data <- runif(n, min = 0, max = 10) # Uniform distribution (Platykurtic)
leptokurtic_data <- rt(n, df = 3) # T-distribution with 3 degrees of freedom (Leptokurtic)
# Function to calculate the peak of the density curve for mode label placement
get_peak_y <- function(data) {
density_data <- density(data, bw = 1) # Larger bandwidth for "fatter" curves
peak_y <- max(density_data$y)
return(peak_y)
}
# Mesokurtic Distribution Plot (Normal Distribution)
mesokurtic_mean <- mean(mesokurtic_data)
mesokurtic_median <- median(mesokurtic_data)
mesokurtic_mode <- mesokurtic_mean # Mode is exactly the mean for normal distribution
mesokurtic_peak_y <- get_peak_y(mesokurtic_data)
mesokurtic_plot <- ggplot(data.frame(x = mesokurtic_data), aes(x = x)) +
geom_density(fill = "skyblue", color = "black", adjust = 1.5) +
geom_vline(aes(xintercept = mesokurtic_mean), color = "red", linetype = "dashed") +
geom_vline(aes(xintercept = mesokurtic_median), color = "blue", linetype = "dotted") +
geom_vline(aes(xintercept = mesokurtic_mode), color = "black", linetype = "solid") +
ggtitle("Mesokurtic Distribution") + theme_minimal()
# Platykurtic Distribution Plot (Uniform Distribution)
platykurtic_mean <- mean(platykurtic_data)
platykurtic_median <- median(platykurtic_data)
platykurtic_mode <- platykurtic_mean # A uniform distribution has no unique mode; the mean serves as a placeholder
platykurtic_peak_y <- get_peak_y(platykurtic_data)
platykurtic_plot <- ggplot(data.frame(x = platykurtic_data), aes(x = x)) +
geom_density(fill = "lightgreen", color = "black", adjust = 1.5) +
geom_vline(aes(xintercept = platykurtic_mean), color = "red", linetype = "dashed") +
geom_vline(aes(xintercept = platykurtic_median), color = "blue", linetype = "dotted") +
geom_vline(aes(xintercept = platykurtic_mode), color = "black", linetype = "solid") +
ggtitle("Platykurtic Distribution") + theme_minimal()
# Leptokurtic Distribution Plot (T-distribution with 3 degrees of freedom)
leptokurtic_mean <- mean(leptokurtic_data)
leptokurtic_median <- median(leptokurtic_data)
leptokurtic_mode <- leptokurtic_mean # Mode is approximately the mean for symmetric distributions
leptokurtic_peak_y <- get_peak_y(leptokurtic_data)
leptokurtic_plot <- ggplot(data.frame(x = leptokurtic_data), aes(x = x)) +
geom_density(fill = "orange", color = "black", adjust = 1) +
geom_vline(aes(xintercept = leptokurtic_mean), color = "red", linetype = "dashed") +
geom_vline(aes(xintercept = leptokurtic_median), color = "blue", linetype = "dotted") +
geom_vline(aes(xintercept = leptokurtic_mode), color = "black", linetype = "solid") +
ggtitle("Leptokurtic Distribution") + theme_minimal()
# Arrange the plots in a 2-column grid
grid.arrange(mesokurtic_plot, platykurtic_plot, leptokurtic_plot, ncol = 2)
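# Numeric check (illustrative): psych::kurtosi() should be near 0 for the
# mesokurtic sample, negative for the platykurtic sample, and positive for the
# leptokurtic sample.
kurtosi(mesokurtic_data)
kurtosi(platykurtic_data)
kurtosi(leptokurtic_data)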
#Figure 2.5: The normal distribution given percentile and Z-score (number of standard deviations)
#Figure 2.6:Distributions of other standardized scores
# Load necessary libraries
library(ggplot2)
library(gridExtra)
# Define parameters for each test
tests <- list(
IQ = list(mean = 100, sd = 15, x_range = c(40, 160)),
T_score = list(mean = 50, sd = 10, x_range = c(10, 90)),
Stanine = list(mean = 5, sd = 2, x_range = c(0, 10))
)
# Function to create bell curve data
create_bell_curve <- function(mean, sd, x_range, test_name) {
x <- seq(x_range[1], x_range[2], length.out = 1000)
y <- dnorm(x, mean = mean, sd = sd)
data.frame(x = x, y = y, test = test_name)
}
# Generate data for all tests
bell_curves <- do.call(rbind, lapply(names(tests), function(test_name) {
params <- tests[[test_name]]
create_bell_curve(params$mean, params$sd, params$x_range, test_name)
}))
# Find the maximum density value across all distributions
max_density <- max(bell_curves$y)
# Create individual plots with consistent density scaling
plots <- lapply(unique(bell_curves$test), function(test_name) {
# Extract test parameters
params <- tests[[test_name]]
# Adjust density values to be comparable across distributions
bell_curves_scaled <- bell_curves[bell_curves$test == test_name, ]
bell_curves_scaled$y <- bell_curves_scaled$y / max(bell_curves_scaled$y) # Scale the y-values
ggplot(bell_curves_scaled, aes(x = x, y = y)) +
geom_line(color = "blue", size = 1) +
ggtitle(paste(test_name, "Distribution")) +
theme_minimal() +
labs(x = "Score", y = "Density") +
scale_x_continuous(
breaks = c(seq(params$mean - 3 * params$sd, params$mean + 3 * params$sd, by = params$sd), params$mean),
labels = function(x) ifelse(x == params$mean, paste0(x, " (Mean)"), x)
) # Customize x-axis ticks to include the mean
})
# Arrange plots in a grid
grid.arrange(grobs = plots, ncol = 1)
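# Worked example (a sketch, not part of Figure 2.6): any z-score converts to each
# standardized metric via score = mean + z * sd.
z <- 1
c(IQ = 100 + z * 15, T_score = 50 + z * 10, Stanine = 5 + z * 2) # 115, 60, 7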
#Figure 4.1: A hypothetical experimental study on reading to establish causality
library(DiagrammeR)
library(DiagrammeRsvg)
library(rsvg)
diagram <-grViz("
digraph experiment {
graph [layout = dot, rankdir = LR]
# Nodes for general steps
Population [label = 'Population', shape = rectangle, style = filled, fillcolor = lightblue, fontcolor = black, width = 1]
Sample [label = 'Randomly\\nSample\\nPopulation', shape = rectangle, style = filled, fillcolor = palegreen, fontcolor = black, width = 1]
Split [label = 'Randomly\\nSplit into\\nGroups', shape = rectangle, style = filled, fillcolor = palegreen, fontcolor = black, width = 1]
# Nodes for Group A (Curriculum A)
PreA [label = 'Pre-Test:\\nGroup A', shape = rectangle, style = filled, fillcolor = mistyrose, fontcolor = black, width = 1]
ImplementA [label = 'Implement\\nCurriculum A', shape = rectangle, style = filled, fillcolor = mistyrose, fontcolor = black, width = 1]
PostA [label = 'Post-Test:\\nGroup A', shape = rectangle, style = filled, fillcolor = mistyrose, fontcolor = black, width = 1]
FollowUpA [label = 'Follow-Up\\nTest:\\nGroup A', shape = rectangle, style = filled, fillcolor = mistyrose, fontcolor = black, width = 1]
# Nodes for Group B (Curriculum B)
PreB [label = 'Pre-Test:\\nGroup B', shape = rectangle, style = filled, fillcolor = lemonchiffon, fontcolor = black, width = 1]
ImplementB [label = 'Implement\\nCurriculum B', shape = rectangle, style = filled, fillcolor = lemonchiffon, fontcolor = black, width = 1]
PostB [label = 'Post-Test:\\nGroup B', shape = rectangle, style = filled, fillcolor = lemonchiffon, fontcolor = black, width = 1]
FollowUpB [label = 'Follow-Up\\nTest:\\nGroup B', shape = rectangle, style = filled, fillcolor = lemonchiffon, fontcolor = black, width = 1]
# Final analysis node
Analyze [label = 'Run\\nStatistical\\nTests', shape = rectangle, style = filled, fillcolor = lightblue, fontcolor = black, width = 1]
# Edges for general steps
Population -> Sample
Sample -> Split
# Edges for Group A
Split -> PreA
PreA -> ImplementA
ImplementA -> PostA
PostA -> FollowUpA
FollowUpA -> Analyze
# Edges for Group B
Split -> PreB
PreB -> ImplementB
ImplementB -> PostB
PostB -> FollowUpB
FollowUpB -> Analyze
# Alignment for lanes
{rank = same; PreA; PreB}
{rank = same; ImplementA; ImplementB}
{rank = same; PostA; PostB}
{rank = same; FollowUpA; FollowUpB}
}
")
# Convert to SVG
svg <- export_svg(diagram)
rsvg_png(charToRaw(svg), "diagram.png")
#Figure 4.2: Probabilistic and non-probabilistic sampling methods
# Load required libraries
library(DiagrammeR)
library(rsvg)
library(magick)
# Create the Probabilistic Sampling Diagram
prob_diagram <- grViz("
digraph probabilistic_sampling {
graph [rankdir=TB] // Top to bottom layout for Probabilistic Sampling
node [shape=rectangle, style=filled, fontname=Helvetica, fontsize=10]
// Main category
probabilistic [label='Probabilistic Sampling\n(High External Validity)', shape=ellipse, fillcolor=palegreen, width=2.5]
// Sampling strategies
true_random [label='True Random\nSampling\n(Highest External\nValidity)']
stratified_random [label='Stratified Random\nSampling']
systematic [label='Systematic Sampling']
cluster [label='Cluster Sampling']
// Connections
probabilistic -> true_random
probabilistic -> stratified_random
probabilistic -> systematic
probabilistic -> cluster
}
")
# Create the Non-Probabilistic Sampling Diagram
non_prob_diagram <- grViz("
digraph non_probabilistic_sampling {
graph [rankdir=TB] // Top to bottom layout for Non-Probabilistic Sampling
node [shape=rectangle, style=filled, fontname=Helvetica, fontsize=10]
// Main category
non_probabilistic [label='Non-Probabilistic Sampling\n(Low External Validity)', shape=ellipse, fillcolor=lightcoral, width=2.5]
// Sampling strategies
convenience [label='Convenience Sampling']
voluntary [label='Voluntary Sampling']
purposive [label='Purposive Sampling']
// Connections
non_probabilistic -> convenience
non_probabilistic -> voluntary
non_probabilistic -> purposive
}
")
# Export the diagrams to SVG
prob_svg <- export_svg(prob_diagram)
non_prob_svg <- export_svg(non_prob_diagram)
# Convert SVG to PNG with rsvg
rsvg_png(charToRaw(prob_svg), "probabilistic_sampling.png", width = 1920, height = 1080)
rsvg_png(charToRaw(non_prob_svg), "non_probabilistic_sampling.png", width = 1920, height = 1080)
# Read the PNGs with magick
prob_image <- image_read("probabilistic_sampling.png")
non_prob_image <- image_read("non_probabilistic_sampling.png")
# Ensure both images have the same width
max_width <- max(image_info(prob_image)$width, image_info(non_prob_image)$width)
prob_image <- image_extent(prob_image, geometry = paste0(max_width, "x"), gravity = "center")
non_prob_image <- image_extent(non_prob_image, geometry = paste0(max_width, "x"), gravity = "center")
# Add spacing between diagrams (optional)
prob_image <- image_border(prob_image, color = "white", geometry = "0x10")
non_prob_image <- image_border(non_prob_image, color = "white", geometry = "0x10")
# Stack the diagrams vertically
combined_image <- image_append(c(prob_image, non_prob_image), stack = TRUE)
# Save the final combined image
image_write(combined_image, "combined_sampling_highres.png", density = 300)
### Side-by-side image
# Load required libraries
library(DiagrammeR)
library(rsvg)
library(magick)
# Create the Probabilistic Sampling Diagram
prob_diagram <- grViz("
digraph probabilistic_sampling {
graph [rankdir=TB] // Top to bottom layout for Probabilistic Sampling
node [shape=rectangle, style=filled, fontname=Helvetica, fontsize=10]
// Main category
probabilistic [label='Probabilistic Sampling\n(High External Validity)', shape=ellipse, fillcolor=palegreen, width=2.5]
// Sampling strategies
true_random [label='True Random\nSampling\n(Highest External\nValidity)']
stratified_random [label='Stratified Random\nSampling']
systematic [label='Systematic Sampling']
cluster [label='Cluster Sampling']
// Connections
probabilistic -> true_random
probabilistic -> stratified_random
probabilistic -> systematic
probabilistic -> cluster
}
")
# Create the Non-Probabilistic Sampling Diagram
non_prob_diagram <- grViz("
digraph non_probabilistic_sampling {
graph [rankdir=TB] // Top to bottom layout for Non-Probabilistic Sampling
node [shape=rectangle, style=filled, fontname=Helvetica, fontsize=10]
// Main category
non_probabilistic [label='Non-Probabilistic Sampling\n(Low External Validity)', shape=ellipse, fillcolor=lightcoral, width=2.5]
// Sampling strategies
convenience [label='Convenience Sampling']
voluntary [label='Voluntary Sampling']
purposive [label='Purposive Sampling']
// Connections
non_probabilistic -> convenience
non_probabilistic -> voluntary
non_probabilistic -> purposive
}
")
# Export the diagrams to SVG
prob_svg <- export_svg(prob_diagram)
non_prob_svg <- export_svg(non_prob_diagram)
# Convert SVG to PNG with rsvg
rsvg_png(charToRaw(prob_svg), "probabilistic_sampling_side.png", width = 1920, height = 1080)
rsvg_png(charToRaw(non_prob_svg), "non_probabilistic_sampling_side.png", width = 1920, height = 1080)
# Read the PNGs with magick
prob_image <- image_read("probabilistic_sampling_side.png")
non_prob_image <- image_read("non_probabilistic_sampling_side.png")
# Ensure both images have the same height
max_height <- max(image_info(prob_image)$height, image_info(non_prob_image)$height)
prob_image <- image_extent(prob_image, geometry = paste0("x", max_height), gravity = "center")
non_prob_image <- image_extent(non_prob_image, geometry = paste0("x", max_height), gravity = "center")
# Add spacing between diagrams (optional)
prob_image <- image_border(prob_image, color = "white", geometry = "10x0")
non_prob_image <- image_border(non_prob_image, color = "white", geometry = "10x0")
# Combine images side-by-side
combined_image_side <- image_append(c(prob_image, non_prob_image), stack = FALSE)
# Save the final combined image
image_write(combined_image_side, "combined_sampling_side_by_side.png", density = 300)
#Figure 4.3: Factor analysis chart
library(DiagrammeR)
grViz("
digraph internal_structure {
graph [rankdir = TB, nodesep=0.4]
node [shape = rectangle, style = filled, fillcolor = lightblue, fontname = Helvetica]
Scale [label = 'Entire Scale', shape = box, fillcolor = lightcoral]
node [shape = ellipse, fillcolor = lightgoldenrodyellow]
Factor1 [label = 'Factor 1']
Factor2 [label = 'Factor 2']
node [shape = rectangle, fillcolor = lightgray]
Item1_F1 [label = 'Item 1']
Item2_F1 [label = 'Item 2']
Item3_F1 [label = 'Item 3']
Item1_F2 [label = 'Item 4']
Item2_F2 [label = 'Item 5']
Item3_F2 [label = 'Item 6']
# Define edges
Scale -> Factor1
Scale -> Factor2
Factor1 -> Item1_F1
Factor1 -> Item2_F1
Factor1 -> Item3_F1
Factor2 -> Item1_F2
Factor2 -> Item2_F2
Factor2 -> Item3_F2
}
")
#One-tailed and two-tailed tests
# Load necessary libraries
library(ggplot2)
library(patchwork)
library(gridExtra)
# Define the critical values for two-tailed test
alpha <- 0.05 # significance level
z_critical_two_tailed <- qnorm(1 - alpha / 2) # Two-tailed critical value (±1.96 for alpha = 0.05)
# Generate a sequence of Z-scores
z_scores <- seq(-4, 4, length.out = 1000)
# Compute the normal distribution density values for the null hypothesis (normal distribution)
z_density_null <- dnorm(z_scores)
# Compute the normal distribution density values for the alternative hypothesis (shifted to the right)
z_density_alt <- dnorm(z_scores, mean = 2) # Shift the distribution to the right
# Create a data frame for plotting
data_null <- data.frame(z_scores, z_density_null)
data_alt <- data.frame(z_scores, z_density_alt)
# Plot for two-tailed test with shifted distribution
plot1 <- ggplot() +
# Plot the null hypothesis curve (standard normal distribution)
geom_line(data = data_null, aes(x = z_scores, y = z_density_null), color = "black") +
# Plot the alternative hypothesis curve (shifted normal distribution)
geom_line(data = data_alt, aes(x = z_scores, y = z_density_alt), color = "blue") +
# Shade the rejection region for the two-tailed test (both left and right tails)
geom_ribbon(data = subset(data_null, z_scores <= -z_critical_two_tailed),
aes(x = z_scores, ymin = 0, ymax = z_density_null), fill = "red", alpha = 0.5) + # Left tail
geom_ribbon(data = subset(data_null, z_scores >= z_critical_two_tailed),
aes(x = z_scores, ymin = 0, ymax = z_density_null), fill = "red", alpha = 0.5) + # Right tail
# Add vertical lines for critical values
geom_vline(xintercept = c(-z_critical_two_tailed, z_critical_two_tailed), color = "black", linetype = "dashed") +
# Labels and annotations
annotate("text", x = -z_critical_two_tailed, y = 0.07, label = "Two-Tailed\nRejection Region", color = "black",
hjust = 1.2, size = 3) + # Move text up and decrease font size
annotate("text", x = z_critical_two_tailed, y = 0.07, label = "Two-Tailed\nRejection Region", color = "black",
hjust = -0.2, size = 3) +
labs(title = "P1: Two-Tailed, Martians > Earthlings", x = "Z-Score", y = "Density") +
theme_minimal()
# Define the significance level (alpha) and critical values
alpha <- 0.05
z_critical_two_tailed <- qnorm(1 - alpha / 2) # Two-tailed critical value (~±1.96 for alpha = 0.05)
# Generate a sequence of Z-scores
z_scores <- seq(-4, 4, length.out = 1000)
# Compute density values for null and alternative hypotheses
z_density_null <- dnorm(z_scores) # Standard normal distribution
z_density_alt <- dnorm(z_scores, mean = -2) # Shifted left
# Create data frames for plotting
data_null <- data.frame(z_scores, z_density_null)
data_alt <- data.frame(z_scores, z_density_alt)
# Create the plot
plot2<- ggplot() +
# Plot the null hypothesis curve
geom_line(data = data_null, aes(x = z_scores, y = z_density_null), color = "black") +
# Plot the alternative hypothesis curve
geom_line(data = data_alt, aes(x = z_scores, y = z_density_alt), color = "blue") +
# Shade the left tail rejection region
geom_ribbon(data = subset(data_null, z_scores <= -z_critical_two_tailed),
aes(x = z_scores, ymin = 0, ymax = z_density_null), fill = "red", alpha = 0.5) +
# Shade the right tail rejection region
geom_ribbon(data = subset(data_null, z_scores >= z_critical_two_tailed),
aes(x = z_scores, ymin = 0, ymax = z_density_null), fill = "red", alpha = 0.5) +
# Add vertical dashed lines for critical values
geom_vline(xintercept = c(-z_critical_two_tailed, z_critical_two_tailed), color = "black", linetype = "dashed") +
# Add annotations for rejection regions
annotate("text", x = -z_critical_two_tailed, y = 0.07, label = "Two-Tailed\nRejection Region", color = "black",
hjust = 1.2, size = 3) + # Move text up and decrease font size
annotate("text", x = z_critical_two_tailed, y = 0.07, label = "Two-Tailed\nRejection Region", color = "black",
hjust = -0.2, size = 3) +
# Labels and styling
labs(title = "P2: Two-Tailed, Martians < Earthlings", x = "Z-Score", y = "Density") +
theme_minimal()
# Define the critical value for one-tailed test
alpha <- 0.05 # significance level
z_critical_one_tailed <- qnorm(1 - alpha) # One-tailed critical value (z_critical = 1.645 for alpha = 0.05)
# Generate a sequence of Z-scores
z_scores <- seq(-4, 4, length.out = 1000)
# Compute the normal distribution density values for the null hypothesis (normal distribution)
z_density_null <- dnorm(z_scores)
# Compute the normal distribution density values for the alternative hypothesis (shifted to the right)
z_density_alt <- dnorm(z_scores, mean = 2) # Shift the distribution to the right
# Create a data frame for plotting
data_null <- data.frame(z_scores, z_density_null)
data_alt <- data.frame(z_scores, z_density_alt)
# Plot for one-tailed test with shifted distribution to the right
plot3 <- ggplot() +
# Plot the null hypothesis curve (standard normal distribution)
geom_line(data = data_null, aes(x = z_scores, y = z_density_null), color = "black") +
# Plot the alternative hypothesis curve (shifted normal distribution to the right)
geom_line(data = data_alt, aes(x = z_scores, y = z_density_alt), color = "blue") +
# Shade the rejection region for the one-tailed test (right tail)
geom_ribbon(data = subset(data_null, z_scores >= z_critical_one_tailed),
aes(x = z_scores, ymin = 0, ymax = z_density_null), fill = "blue", alpha = 0.5) + # Right tail
# Add vertical line for the one-tailed critical value
geom_vline(xintercept = z_critical_one_tailed, color = "black", linetype = "dashed") +
# Labels and annotations
annotate("text", x = z_critical_one_tailed, y = 0.07, label = "One-Tailed\nRejection Region", color = "black", hjust = -0.2, size = 3) +
labs(title = "P3: One-Tailed, Martians > Earthlings", x = "Z-Score", y = "Density") +
theme_minimal()
# Define the critical values for two-tailed test
alpha <- 0.05 # significance level
z_critical_two_tailed <- qnorm(1 - alpha / 2) # Two-tailed critical value (±1.96 for alpha = 0.05)
# Generate a sequence of Z-scores
z_scores <- seq(-4, 4, length.out = 1000)
# Compute the normal distribution density values for the null hypothesis (normal distribution)
z_density_null <- dnorm(z_scores)
# Compute the normal distribution density values for the alternative hypothesis (shifted to the right)
# Decrease the shift to increase the overlap
z_density_alt <- dnorm(z_scores, mean = 0.3) # Slightly shifted distribution
# Create a data frame for plotting
data_null <- data.frame(z_scores, z_density_null)
data_alt <- data.frame(z_scores, z_density_alt)
# Plot for more overlap between null and alternative distributions
plot4 <- ggplot() +
# Plot the null hypothesis curve (standard normal distribution)
geom_line(data = data_null, aes(x = z_scores, y = z_density_null), color = "black", alpha = 0.6) + # Slight transparency for overlap
# Plot the alternative hypothesis curve (shifted normal distribution)
geom_line(data = data_alt, aes(x = z_scores, y = z_density_alt), color = "blue", alpha = 0.6) + # Slight transparency for overlap
# Add vertical lines for critical values
geom_vline(xintercept = c(-z_critical_two_tailed, z_critical_two_tailed), color = "black", linetype = "dashed") +
# Add a label indicating no statistical significance
annotate("text", x = 0, y = 0.15, label = "Null Hypothesis Accepted", color = "black", size = 4, fontface = "bold") + # Centered annotation
labs(title = "P4: Earthlings = Martians", x = "Z-Score", y = "Density") +
theme_minimal() +
theme(
plot.title = element_text(hjust = 0.5), # Center the title
axis.title.x = element_text(size = 12),
axis.title.y = element_text(size = 12)
)
# Create a list of plots
plots <- list(plot1, plot2, plot3, plot4)
# Use grid.arrange to arrange the plots with specified heights
grid.arrange(grobs = plots, ncol = 2, nrow = 2, heights = c(1, 1), widths = c(.5, .5))
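# Companion calculation (a sketch): one- and two-tailed p-values for an observed
# z statistic, matching the rejection regions drawn above.
z_obs <- 2.1 # Hypothetical observed z
p_two_tailed <- 2 * pnorm(-abs(z_obs)) # Area in both tails
p_one_tailed <- pnorm(z_obs, lower.tail = FALSE) # Area in the right tail only
cat("Two-tailed p:", p_two_tailed, "One-tailed p:", p_one_tailed, "\n")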
# Load necessary libraries
library(ggplot2)
library(gridExtra)
# Simulated data for non-significant difference (Earthlings vs Martians)
nonsig_data <- data.frame(
ACT_Score = c(rnorm(100, mean = 23, sd = 4), rnorm(100, mean = 22, sd = 4)),
Group = rep(c("Earthlings", "Martians"), each = 100)
)
# Simulated data for significant difference (Earthlings vs Martians)
sig_data <- data.frame(
HS_GPA = c(rnorm(100, mean = 3.5, sd = 0.3), rnorm(100, mean = 2.5, sd = 0.3)),
Group = rep(c("Earthlings", "Martians"), each = 100)
)
# Boxplot non-significant difference (ACT scores)
p1 <- ggplot(nonsig_data, aes(x = Group, y = ACT_Score, fill = Group)) +
geom_boxplot(alpha = 0.6) +
ggtitle("Non-Significant Difference (Earthlings vs Martians)") +
ylab("ACT Score") +
xlab("Group") +
theme_minimal()
# Boxplot significant difference (HS GPA)
p2 <- ggplot(sig_data, aes(x = Group, y = HS_GPA, fill = Group)) +
geom_boxplot(alpha = 0.6) +
ggtitle("Significant Difference (Earthlings vs Martians)") +
ylab("HS GPA") +
xlab("Group") +
theme_minimal()
# Density plot non-significant difference (ACT scores)
p3 <- ggplot(nonsig_data, aes(x = ACT_Score, fill = Group)) +
geom_density(alpha = 0.6) +
ggtitle("Non-Significant Difference (ACT Scores - Earthlings vs Martians)") +
xlab("ACT Score") +
theme_minimal()
# Density plot significant difference (HS GPA)
p4 <- ggplot(sig_data, aes(x = HS_GPA, fill = Group)) +
geom_density(alpha = 0.6) +
ggtitle("Significant Difference (HS GPA - Earthlings vs Martians)") +
xlab("HS GPA") +
theme_minimal()
# Arrange boxplot and density plot for non-significant difference
grid.arrange(p1, p3, ncol = 1)
# Arrange boxplot and density plot for significant difference
grid.arrange(p2, p4, ncol = 1)
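# The plots can be backed by the actual tests (illustrative; exact values vary
# with the simulated data):
t.test(ACT_Score ~ Group, data = nonsig_data) # Expect p > .05
t.test(HS_GPA ~ Group, data = sig_data) # Expect p < .05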
# One-Way ANOVA boxplots and bell curves
library(ggplot2)
library(gridExtra)
library(psych)
set.seed(123)
# Simulated data for non-significant difference (Earthlings vs Venusians vs Martians)
nonsig_data <- data.frame(
ACT_Score = c(rnorm(100, mean = 23, sd = 4), rnorm(100, mean = 24, sd = 4), rnorm(100, mean = 22, sd = 4)),
Group = rep(c("Earthlings", "Venusians", "Martians"), each = 100)
)
# Simulated data for significant difference (Earthlings vs Martians vs Venusians)
sig_data <- data.frame(
HS_GPA = c(rnorm(100, mean = 3.5, sd = 0.3), rnorm(100, mean = 3.0, sd = 0.3), rnorm(100, mean = 2.5, sd = 0.3)),
Group = rep(c("Earthlings", "Venusians", "Martians"), each = 100)
)
# Boxplot non-significant difference (ACT scores)
p1 <- ggplot(nonsig_data, aes(x = Group, y = ACT_Score, fill = Group)) +
geom_boxplot(alpha = 0.6) +
ggtitle("Non-Significant Difference (Earthlings, Venusians, Martians)") +
ylab("ACT Score") +
xlab("Group") +
theme_minimal()
# Boxplot significant difference (HS GPA)
p2 <- ggplot(sig_data, aes(x = Group, y = HS_GPA, fill = Group)) +
geom_boxplot(alpha = 0.6) +
ggtitle("Significant Difference (Earthlings, Venusians, Martians)") +
ylab("HS GPA") +
xlab("Group") +
theme_minimal()
# Density plot non-significant difference (ACT scores)
p3 <- ggplot(nonsig_data, aes(x = ACT_Score, fill = Group)) +
geom_density(alpha = 0.6) +
ggtitle("Non-Significant Difference (ACT Scores - Earthlings, Venusians, Martians)") +
xlab("ACT Score") +
theme_minimal()
# Density plot significant difference (HS GPA)
p4 <- ggplot(sig_data, aes(x = HS_GPA, fill = Group)) +
geom_density(alpha = 0.6) +
ggtitle("Significant Difference (HS GPA - Earthlings, Venusians, Martians)") +
xlab("HS GPA") +
theme_minimal()
# Arrange boxplot and density plot for non-significant difference
grid.arrange(p1, p3, ncol = 1)
# Arrange boxplot and density plot for significant difference
grid.arrange(p2, p4, ncol = 1)
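# Corresponding one-way ANOVAs (a sketch to accompany the plots):
summary(aov(ACT_Score ~ Group, data = nonsig_data)) # Expect p > .05
summary(aov(HS_GPA ~ Group, data = sig_data)) # Expect p < .05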
# Dependent samples t-test from book
##
## Paired t-test
##
## data: data$Pretest and data$Posttest
## t = -21.456, df = 19, p-value = 8.846e-15
## alternative hypothesis: true mean difference is not equal to 0
## 95 percent confidence interval:
## -15.1462 -12.4538
## sample estimates:
## mean difference
## -13.8
# Descriptives for the pretest:
## vars n mean sd median trimmed mad min max range skew kurtosis se
## X1 1 20 73.15 7.73 73 73.06 8.15 59 87 28 0.02 -0.79 1.73
# Descriptives for the posttest:
## vars n mean sd median trimmed mad min max range skew kurtosis se
## X1 1 20 86.95 7.85 86.5 87.25 8.9 70 98 28 -0.23 -0.95 1.76
# Descriptives for the gain scores (posttest - pretest):
## vars n mean sd median trimmed mad min max range skew kurtosis se
## X1 1 20 13.8 2.88 14 13.62 2.22 9 20 11 0.37 -0.57 0.64
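# A minimal sketch of the call that produces output like the above. The book's
# data are not reproduced here, so hypothetical pre/post scores stand in:
pretest <- c(70, 65, 80, 75, 72) # Hypothetical scores
posttest <- c(84, 80, 93, 88, 86) # Hypothetical scores
t.test(pretest, posttest, paired = TRUE)
describe(posttest - pretest) # psych descriptives of the gain scores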
# Creating the data frame of three groups with the same scores
data <- data.frame(
Student = factor(rep(1:5, each=3)), # 5 students, 3 conditions
Condition = factor(rep(c("Pre", "Post", "Post-Post"), times=5)),
Score = c(80, 75, 67, 90, 85, 85, 78, 72, 74, 85, 80, 78, 88, 83, 91)
)
# 1. Run repeated measures ANOVA with ezANOVA (handling assumptions like sphericity)
library(ez)
anova_result <- ezANOVA(data = data,
dv = .(Score),
wid = .(Student),
within = .(Condition),
detailed = TRUE)
anova_result
## $ANOVA
## Effect DFn DFd SSn SSd F p p<.05
## 1 (Intercept) 1 4 97768.06667 482.93333 809.785202 9.074958e-06 *
## 2 Condition 2 8 90.13333 89.86667 4.011869 6.213045e-02
## ges
## 1 0.9941754
## 2 0.1359614
##
## $`Mauchly's Test for Sphericity`
## Effect W p p<.05
## 2 Condition 0.01730226 0.002275905 *
##
## $`Sphericity Corrections`
## Effect GGe p[GG] p[GG]<.05 HFe p[HF] p[HF]<.05
## 2 Condition 0.5043633 0.1150838 0.5087521 0.1144435
# 2. Normality Test for Each Condition
library(dplyr)
data %>%
group_by(Condition) %>%
summarise(shapiro_p = shapiro.test(Score)$p.value)
## # A tibble: 3 × 2
## Condition shapiro_p
## <fct> <dbl>
## 1 Post 0.716
## 2 Post-Post 0.977
## 3 Pre 0.641
# 3. Mauchly's Test of Sphericity
# Using ezANOVA, sphericity testing is automatically handled and included in the output.
# If needed, you can manually extract the sphericity correction from the results.
# 4. Post-hoc pairwise comparisons if the ANOVA is significant
# Note: p[1] indexes the Intercept row of the ANOVA table; the Condition effect is
# anova_result$ANOVA$p[2] (p = .062 above), so the pairwise table below is illustrative.
if (anova_result$ANOVA$p[1] < 0.05) {
pairwise_result <- pairwise.t.test(data$Score, data$Condition, paired = TRUE)
print(pairwise_result)
}
##
## Pairwise comparisons using paired t tests
##
## data: data$Score and data$Condition
##
## Post Post-Post
## Post-Post 1.00 -
## Pre 3.9e-05 0.23
##
## P value adjustment method: holm
# Load necessary library
library(ggplot2)
# Sample data for height and shoe size; the first pair (height 128 cm, shoe size 0) anchors the y-intercept illustration
height <- c(128,170, 175, 180, 165, 160, 185, 178, 182, 173, 168,
180, 169, 177, 165, 174, 186, 178, 165, 172, 180)
shoe_size <- c(0,9, 10, 11, 8, 7, 12, 10, 11, 9, 8,
11, 8, 10, 8, 9, 12, 11, 8, 9, 10)
# Create a data frame
height_shoe_size <- data.frame(Height = height, Shoe_Size = shoe_size)
# Fit the linear model
model <- lm(Height ~ Shoe_Size, data = height_shoe_size)
# Get the model's coefficients (intercept and slope)
intercept <- coef(model)[1]
slope <- coef(model)[2]
# Calculate the intercept at x = 0
y_intercept_at_x_0 <- intercept # When x = 0, the y-intercept is the model's intercept
cat("The y-axis intercept when x = 0 is:", y_intercept_at_x_0, "cm\n")
## The y-axis intercept when x = 0 is: 128.1263 cm
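# Prediction sketch using the fitted model: the expected height for a given shoe size.
predict(model, newdata = data.frame(Shoe_Size = 10))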
# Check whether the point (Height = 160, Shoe_Size = 70) exists in the dataset (it does not)
subset(height_shoe_size, Height == 160 & Shoe_Size == 70)
## [1] Height Shoe_Size
## <0 rows> (or 0-length row.names)
# Create the scatterplot with the regression line, highlight the specific point (with tolerance), and add more x-axis ticks
ggplot(height_shoe_size, aes(x = Shoe_Size, y = Height)) +
geom_point(color = "blue") + # Scatterplot points
geom_smooth(method = "lm", se = FALSE, color = "red") + # Fit line with intercept
# Highlight the specific point (height = 160, shoe size = 70) with a tolerance
geom_point(data = subset(height_shoe_size, abs(Height - 160) < 1 & abs(Shoe_Size - 70) < 1),
aes(x = Shoe_Size, y = Height),
color = "black", shape = 1, size = 5, stroke = 2) + # Circle around the point
labs(title = "Scatterplot of Height vs Shoe Size",
x = "Shoe Size (US Men)",
y = "Height (cm)") +
theme_minimal() +
ylim(120, max(height_shoe_size$Height) + 1) + # Set y-axis limits
scale_x_continuous(limits = c(0, max(height_shoe_size$Shoe_Size) + 1),
breaks = seq(0, max(height_shoe_size$Shoe_Size) + 1, by = 1)) # Set x-axis limits and tick marks (avoids the xlim/scale_x_continuous conflict)
# Load required libraries
library(ggplot2)
library(gridExtra)
# Set a seed for reproducibility
set.seed(42)
# Create a function to generate correlated data
generate_data <- function(r) {
x <- rnorm(100)
y <- r * x + sqrt(1 - r^2) * rnorm(100)
data.frame(x, y)
}
# Create the plots for each correlation
correlations <- c(0.2, -0.2, 0.5, -0.5, 0.9, -0.9, 0)
labels <- c("Weak Positive", "Weak Negative", "Moderate Positive", "Moderate Negative", "Strong Positive", "Strong Negative", "No Correlation")
plots <- lapply(1:7, function(i) {
data <- generate_data(correlations[i])
ggplot(data, aes(x, y)) +
geom_point() +
geom_smooth(method = "lm", se = FALSE, color = "blue") +
ggtitle(paste0(labels[i], " (r = ", correlations[i], ")")) +
theme_minimal() +
theme(axis.title.x = element_blank(), axis.title.y = element_blank())
})
# Arrange the plots in a grid
grid.arrange(grobs = plots, ncol = 3)
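# Verification sketch: the realized sample correlation differs slightly from the
# target r; cor() and cor.test() make that concrete.
example_data <- generate_data(0.9)
cor(example_data$x, example_data$y)
cor.test(example_data$x, example_data$y) # Adds a p-value and confidence interval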
# Load necessary library
library(ggplot2)
library(gridExtra)
library(apaTables)
library(lmtest)
# Create a data frame with all variables
multheight <- data.frame(
height = c(170, 175, 180, 165, 160, 185, 178, 182, 173, 168,
180, 169, 177, 165, 174, 186, 178, 165, 172, 180),
shoe_size = c(9, 10, 11, 8, 7, 12, 10, 11, 9, 8,
11, 8, 10, 8, 9, 12, 11, 8, 9, 10),
biom= c(160, 162, 155, 167, 170, 158, 165, 172, 168, 160,
163, 155, 174, 165, 170, 168, 167, 160, 162, 159),
biof= c(175, 180, 185, 170, 168, 177, 180, 172, 185, 173,
169, 178, 174, 181, 179, 176, 182, 173, 170, 177)
)
# View the first few rows
head(multheight)
## height shoe_size biom biof
## 1 170 9 160 175
## 2 175 10 162 180
## 3 180 11 155 185
## 4 165 8 167 170
## 5 160 7 170 168
## 6 185 12 158 177
# Run multiple regression analysis
model <- lm(height ~ biom + biof, data = multheight)
# Plot to check the linearity assumption for biom
p1<- ggplot(multheight, aes(x = biom, y = height)) +
geom_point() +
geom_smooth(method = "lm", color = "blue") +
labs(title = "Height vs. Biom (Biological Mother)", x = "Biological Mother Height", y = "Child Height")
# Plot to check the linearity assumption for biof
p2<- ggplot(multheight, aes(x = biof, y = height)) +
geom_point() +
geom_smooth(method = "lm", color = "blue") +
labs(title = "Height vs. Biof (Biological Father)", x = "Biological Father Height", y = "Child Height")
# Residuals vs fitted values plot to check for independence of errors
p3<-ggplot(data = multheight, aes(x = model$fitted.values, y = residuals(model))) +
geom_point() +
geom_hline(yintercept = 0, color = "red") +
labs(title = "Residuals vs Fitted Values", x = "Fitted Values", y = "Residuals")
# Residuals vs fitted values plot to check homoscedasticity
p4<-ggplot(data = multheight, aes(x = model$fitted.values, y = residuals(model))) +
geom_point() +
geom_hline(yintercept = 0, color = "red") +
labs(title = "Residuals vs Fitted Values (Homoscedasticity)", x = "Fitted Values", y = "Residuals") +
theme_minimal()
# Histogram of residuals to check normality
p5<-ggplot(data = multheight, aes(x = residuals(model))) +
geom_histogram(bins = 10, fill = "skyblue", color = "black") +
labs(title = "Histogram of Residuals", x = "Residuals", y = "Frequency") +
theme_minimal()
# Q-Q plot to check normality
p6<- ggplot(data = multheight, aes(sample = residuals(model))) +
geom_qq() +
geom_qq_line(color = "red") +
labs(title = "Q-Q Plot of Residuals") +
theme_minimal()
grid.arrange(p1, p2, p3, p4, p5, p6, ncol = 2)
# Check for multicollinearity using VIF
library(car)
vif(model)
## biom biof
## 1.043446 1.043446
# Durbin-Watson test to check for autocorrelation in residuals
dwtest(model)
##
## Durbin-Watson test
##
## data: model
## DW = 2.3376, p-value = 0.7756
## alternative hypothesis: true autocorrelation is greater than 0
# BP test homoscedacity
bptest(model)
##
## studentized Breusch-Pagan test
##
## data: model
## BP = 2.0091, df = 2, p-value = 0.3662
# Summarize the model
summary(model)
##
## Call:
## lm(formula = height ~ biom + biof, data = multheight)
##
## Residuals:
## Min 1Q Median 3Q Max
## -11.1899 -5.1111 0.0648 3.7485 11.6584
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 87.03025 86.40189 1.007 0.328
## biom 0.08133 0.31480 0.258 0.799
## biof 0.41846 0.33782 1.239 0.232
##
## Residual standard error: 7.345 on 17 degrees of freedom
## Multiple R-squared: 0.08279, Adjusted R-squared: -0.02512
## F-statistic: 0.7672 on 2 and 17 DF, p-value: 0.4797
# Create the APA-style regression table
apa.reg.table(model, filename = "multreg.doc")
##
##
## Regression results using height as the criterion
##
##
## Predictor b b_95%_CI beta beta_95%_CI sr2 sr2_95%_CI r
## (Intercept) 87.03 [-95.26, 269.32]
## biom 0.08 [-0.58, 0.75] 0.06 [-0.44, 0.56] .00 [-.05, .05] .00
## biof 0.42 [-0.29, 1.13] 0.29 [-0.21, 0.79] .08 [-.15, .31] .28
##
##
##
## Fit
##
##
##
## R2 = .083
## 95% CI[.00,.30]
##
##
## Note. A significant b-weight indicates the beta-weight and semi-partial correlation are also significant.
## b represents unstandardized regression weights. beta indicates the standardized regression weights.
## sr2 represents the semi-partial correlation squared. r represents the zero-order correlation.
## Square brackets are used to enclose the lower and upper limits of a confidence interval.
## * indicates p < .05. ** indicates p < .01.
##
# Check assumptions
library(ggplot2)
library(car)
library(dplyr)
library(gridExtra)
# Simulate dataset
n <- 20
data <- data.frame(
High_School_GPA = round(runif(n, 2.5, 4.0), 2),
College_GPA = round(runif(n, 2.5, 4.0), 2),
Doctorate = sample(0:1, n, replace = TRUE, prob = c(0.5, 0.5)) # Equal chance of 0 and 1
)
# Fit the logistic model first so the residual plot below references the correct model
model <- glm(Doctorate ~ High_School_GPA + College_GPA, data = data, family = binomial)
grid.arrange(
ggplot(data, aes(x = High_School_GPA)) +
geom_histogram(binwidth = 0.2, fill = "blue", alpha = 0.6) +
labs(title = "Distribution of High School GPA"),
ggplot(data, aes(x = College_GPA)) +
geom_histogram(binwidth = 0.2, fill = "red", alpha = 0.6) +
labs(title = "Distribution of College GPA"),
ggplot(data, aes(y = High_School_GPA)) +
geom_boxplot(fill = "blue", alpha = 0.6) +
labs(title = "Boxplot of High School GPA"),
ggplot(data, aes(y = College_GPA)) +
geom_boxplot(fill = "red", alpha = 0.6) +
labs(title = "Boxplot of College GPA"),
ggplot(data, aes(x = High_School_GPA, y = College_GPA, color = as.factor(Doctorate))) +
geom_jitter(width = 0.1, height = 0.1) +
labs(title = "Scatterplot of GPAs by Doctorate Status") +
scale_color_manual(values = c("blue", "red"), labels = c("No", "Yes")),
# Check residuals
ggplot(data, aes(x = fitted(model), y = residuals(model, type = "pearson"))) +
geom_point() +
geom_smooth(method = "lm", color = "blue") +
ggtitle("Residuals vs Fitted Values"),
ncol = 2
)
# Log-transform continuous predictors for Box-Tidwell test
data$log_High_School_GPA <- log(data$High_School_GPA)
data$log_College_GPA <- log(data$College_GPA)
# Box-Tidwell test: Adding interaction terms between original and log-transformed predictors
box_tidwell_model <- glm(Doctorate ~ High_School_GPA + College_GPA +
High_School_GPA:log_High_School_GPA + College_GPA:log_College_GPA,
data = data, family = binomial)
# Check the summary of the Box-Tidwell model
summary(box_tidwell_model)
##
## Call:
## glm(formula = Doctorate ~ High_School_GPA + College_GPA + High_School_GPA:log_High_School_GPA +
## College_GPA:log_College_GPA, family = binomial, data = data)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -318.09 187.62 -1.695 0.090 .
## High_School_GPA 40.05 46.47 0.862 0.389
## College_GPA 173.92 111.04 1.566 0.117
## High_School_GPA:log_High_School_GPA -19.14 21.43 -0.893 0.372
## College_GPA:log_College_GPA -78.97 50.45 -1.565 0.118
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 27.726 on 19 degrees of freedom
## Residual deviance: 21.933 on 15 degrees of freedom
## AIC: 31.933
##
## Number of Fisher Scoring iterations: 4
# Check multicollinearity
vif(box_tidwell_model)
## High_School_GPA College_GPA
## 1986.623 2797.653
## High_School_GPA:log_High_School_GPA College_GPA:log_College_GPA
## 1991.941 2790.169
# Fit logistic regression model
model <- glm(Doctorate ~ High_School_GPA + College_GPA, data = data, family = binomial)
summary(model)
##
## Call:
## glm(formula = Doctorate ~ High_School_GPA + College_GPA, family = binomial,
## data = data)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 5.1370 6.1004 0.842 0.400
## High_School_GPA -1.2857 0.9839 -1.307 0.191
## College_GPA -0.2914 1.8128 -0.161 0.872
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 27.726 on 19 degrees of freedom
## Residual deviance: 25.628 on 17 degrees of freedom
## AIC: 31.628
##
## Number of Fisher Scoring iterations: 4
# Fit logistic regression model without Box-Tidwell terms first
model <- glm(Doctorate ~ High_School_GPA + College_GPA, data = data, family = binomial)
model_null <- glm(Doctorate ~ 1, data = data, family = binomial)
anova(model_null, model, test = "Chisq")
## Analysis of Deviance Table
##
## Model 1: Doctorate ~ 1
## Model 2: Doctorate ~ High_School_GPA + College_GPA
## Resid. Df Resid. Dev Df Deviance Pr(>Chi)
## 1 19 27.726
## 2 17 25.628 2 2.0976 0.3504
summary(model)
##
## Call:
## glm(formula = Doctorate ~ High_School_GPA + College_GPA, family = binomial,
## data = data)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 5.1370 6.1004 0.842 0.400
## High_School_GPA -1.2857 0.9839 -1.307 0.191
## College_GPA -0.2914 1.8128 -0.161 0.872
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 27.726 on 19 degrees of freedom
## Residual deviance: 25.628 on 17 degrees of freedom
## AIC: 31.628
##
## Number of Fisher Scoring iterations: 4
# Check multicollinearity
vif(model)
## High_School_GPA College_GPA
## 1.056721 1.056721
# APA-style interpretation
exp(coef(model)) # Convert log-odds to odds ratios
## (Intercept) High_School_GPA College_GPA
## 170.2024177 0.2764435 0.7472186
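# Odds ratios with confidence intervals (a sketch; confint() profiles the
# likelihood for glm objects, via MASS):
exp(cbind(OR = coef(model), confint(model)))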
write.csv(data, "logisticdata.csv")
library(ggplot2)
set.seed(42)
n <- 20
x <- seq(1, 10, length.out = n)
a <- 2
b <- 1.5
e <- rnorm(n, mean = 0, sd = 1)
y <- a + b * x + e
model <- lm(y ~ x)
predicted_y <- predict(model)
df <- data.frame(x, y, predicted_y)
intercept <- coef(model)[1]
x_pred_annotate <- 9
y_pred_annotate <- predict(model, newdata = data.frame(x = x_pred_annotate))
error_mid <- (df$predicted_y[1] + df$y[1]) / 2
p <- ggplot(df, aes(x, y)) +
geom_point(color = "blue", size = 3) +
geom_smooth(method = "lm", color = "red", se = FALSE) +
geom_segment(aes(x = x, xend = x, y = predicted_y, yend = y),
arrow = arrow(length = unit(0.2, "cm")), color = "black") +
geom_vline(xintercept = 0, color = "black", linewidth = 1) +
scale_x_continuous(limits = c(-1, max(x) + 1), breaks = seq(-1, max(x) + 1, by = 1)) + # X-axis ticks at increments of 1
labs(title = "Scatterplot with y = a + bx + e where y = # of interviews and x = # of shoes",
x = "Number of Shoes",
y = "Number of Interviews") +
theme_minimal() +
theme(axis.line = element_line(color = "black", linewidth = 1)) +
annotate("segment", x = -0.5, xend = 0,
y = intercept + 0.5, yend = intercept,
arrow = arrow(length = unit(0.2, "cm")), color = "black") +
annotate("text", x = -0.6, y = intercept + 0.9,
label = "a, or intercept", hjust = .5, vjust = 0.5, color = "black") +
annotate("segment", x = x_pred_annotate - 1, xend = x_pred_annotate - .2, # Shift left
y = y_pred_annotate + 1, yend = y_pred_annotate, # Arrow pointing down
arrow = arrow(length = unit(0.2, "cm")),
linewidth = .5, # Arrow line width
color = "red") +
annotate("text", x = x_pred_annotate - 0.3, y = y_pred_annotate + 1.2,
label = "prediction line, or bx", hjust = 1, color = "red") + # Shift left
annotate("segment", x = df$x[1] + 0.5, xend = df$x[1],
y = error_mid + 0.5, yend = error_mid,
arrow = arrow(length = unit(0.2, "cm")), color = "black") +
annotate("text", x = df$x[1] + 0.6, y = error_mid + 0.1,
label = "error, or e", hjust = 0, color = "black")
print(p)