Simpson’s Paradox

This chunk of code will 5 groups of correlated data

correlatedValues = function(x, r = 0.9){
  r2 = r**2
  ve = 1-r2
  SD = sqrt(ve)
  e  = rnorm(length(x), mean=0, sd=SD)
  y  = r*x + e
  return(y)
}

x1 = rnorm(100, mean = -6, sd = 1)
y1 = correlatedValues(x1, r = 0.75) + 6
x2 = rnorm(100, mean = -3, sd = 1)
y2 = correlatedValues(x2, r = 0.75) + 3
x3 = rnorm(100, mean = 0, sd = 1)
y3 = correlatedValues(x3, r = 0.75)
x4 = rnorm(100, mean = 3, sd = 1)
y4 = correlatedValues(x4, r = 0.75) - 3
x5 = rnorm(100, mean = 6, sd = 1)
y5 = correlatedValues(x5, r = 0.75) - 6

df1 <- data.frame(x1, y1, "group 1")
df2 <- data.frame(x2, y2, "group 2")
df3 <- data.frame(x3, y3, "group 3")
df4 <- data.frame(x4, y4, "group 4")
df5 <- data.frame(x5, y5, "group 5")
names(df1) <- c("xdata", "ydata", "group")
names(df2) <- c("xdata", "ydata", "group")
names(df3) <- c("xdata", "ydata", "group")
names(df4) <- c("xdata", "ydata", "group")
names(df5) <- c("xdata", "ydata", "group")
main_df <- rbind(df1, df2, df3, df4, df5)

Now we will visualize the data.

library("ggplot2")
library("tidyverse")

ggplot(main_df, aes(x = xdata, y = ydata)) +
  geom_point() +
  ggtitle("All data together")

#find correlation value
main_df %>%
  summarize(correlation = cor(xdata, ydata))
##   correlation
## 1  -0.5395027

Now let us treat the groups separately.

ggplot(main_df, aes(x = xdata, y = ydata)) +
  geom_point(aes(color = group)) +
  ggtitle("All data together")

#find correlation values
main_df %>%
  group_by(group) %>%
  summarize(correlation = cor(xdata, ydata))
## # A tibble: 5 x 2
##   group   correlation
##   <fct>         <dbl>
## 1 group 1       0.754
## 2 group 2       0.691
## 3 group 3       0.705
## 4 group 4       0.805
## 5 group 5       0.608

Combining variables may make trends disappear or reverse. This phenomenon is called Simpson’s Paradox.

Example: Crab People

“I collected the amphipod crustacean Platorchestia platensis on a beach near Stony Brook, Long Island, in April, 1987, removed and counted the number of eggs each female was carrying, then freeze-dried and weighed the mothers.” (Source)

weight <- c(5.38, 7.36, 6.13, 4.75, 8.10, 8.62, 6.30, 7.44, 7.26, 7.17, 
             7.78, 6.23, 5.42, 7.87, 5.25, 7.37, 8.01, 4.92, 7.03, 6.45,
             5.06, 6.72, 7.00, 9.39, 6.49, 6.34, 6.16, 5.74)
eggs <- c(29, 23, 22, 20, 25, 25, 17, 24, 20, 27,
          24, 21, 22, 22, 23, 35, 27, 23, 25, 24,
          19, 21, 20, 33, 17, 21, 25, 22)
df <- data.frame(weight, eggs)
  1. Use ggplot to make a scatterplot of the data, and draw a linear regression line (with se = FALSE).
ggplot(df, aes(x = weight, y = eggs)) +
  geom_point() + 
  geom_smooth(method = "lm", se = FALSE) + #linear regression
  ggtitle("Platorchestia platensis") +
  theme_minimal() + #removes gray background
  xlab("Weight (mg)") +
  ylab("Eggs")

  1. Use ggplot to make a scatterplot of the data, and draw a linear regression line (with se = TRUE).
ggplot(df, aes(x = weight, y = eggs)) +
  geom_point() + 
  geom_smooth(method = "lm", se = TRUE) + #linear regression
  ggtitle("Platorchestia platensis") +
  theme_minimal() + #removes gray background
  xlab("Weight (mg)") +
  ylab("Eggs")

  1. Predict how many eggs a crab will have if its weight is 8 mg.
#linear model
linear_fit <- lm(eggs ~ weight, data = df)

#make a prediction
prediction <- predict(linear_fit, newdata = data.frame(weight = 8))

Our linear model predicts that the crab has 25.5028021 eggs.

Example: Reaction Rates

The reaction rates of the reaction S \(\rightarrow\) P catalyzed by enzyme E were determined under conditions such that only very little product was formed. Compute the maximum reaction velocity asymptote \(V_{\text{max}}\) and the Michaelis-Menton constant \(K_{m}\)

#substrate concentration (mircomolars)
S <- c(0.08, 0.12, 0.54, 1.23, 1.82, 2.72, 4.94, 10.00)
rS <- 1/S #reciprocal

#reaction rate (micromolars per minute)
v <- c(0.15, 0.21, 0.7, 1.1, 1.3, 1.5, 1.7, 1.8)
rv <- 1/v #reciprocal

df <- data.frame(S, v, rS, rv)
ggplot(df, aes(x = S, y = v)) +
  geom_point() +
  geom_smooth(se = FALSE) +
  ggtitle("Michaelis-Menton Experiment") +
  xlab("substrate concentration (micromolars)") +
  ylab("reaction rate (micromolars per minute)")

ggplot(df, aes(x = rS, y = rv)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE) +
  ggtitle("Double Reciprocal Plot") +
  xlab("1/[S] (1/micromoles)") +
  ylab("1/v (minutes per micromolar)")

linear_fit <- lm(rv ~ rS, data = df)

slope <- linear_fit$coefficients[2]
intercept <- linear_fit$coefficients[1]

Vmax <- 1/intercept
Km <- Vmax*slope

print(paste("The Vmax asymptote is ", Vmax))
## [1] "The Vmax asymptote is  1.98946533065081"
print(paste("The Michaelis-Menton constant is ", Km))
## [1] "The Michaelis-Menton constant is  0.991986524114477"