This chunk of code will 5 groups of correlated data
correlatedValues = function(x, r = 0.9){
r2 = r**2
ve = 1-r2
SD = sqrt(ve)
e = rnorm(length(x), mean=0, sd=SD)
y = r*x + e
return(y)
}
x1 = rnorm(100, mean = -6, sd = 1)
y1 = correlatedValues(x1, r = 0.75) + 6
x2 = rnorm(100, mean = -3, sd = 1)
y2 = correlatedValues(x2, r = 0.75) + 3
x3 = rnorm(100, mean = 0, sd = 1)
y3 = correlatedValues(x3, r = 0.75)
x4 = rnorm(100, mean = 3, sd = 1)
y4 = correlatedValues(x4, r = 0.75) - 3
x5 = rnorm(100, mean = 6, sd = 1)
y5 = correlatedValues(x5, r = 0.75) - 6
df1 <- data.frame(x1, y1, "group 1")
df2 <- data.frame(x2, y2, "group 2")
df3 <- data.frame(x3, y3, "group 3")
df4 <- data.frame(x4, y4, "group 4")
df5 <- data.frame(x5, y5, "group 5")
names(df1) <- c("xdata", "ydata", "group")
names(df2) <- c("xdata", "ydata", "group")
names(df3) <- c("xdata", "ydata", "group")
names(df4) <- c("xdata", "ydata", "group")
names(df5) <- c("xdata", "ydata", "group")
main_df <- rbind(df1, df2, df3, df4, df5)
Now we will visualize the data.
library("ggplot2")
library("tidyverse")
ggplot(main_df, aes(x = xdata, y = ydata)) +
geom_point() +
ggtitle("All data together")
#find correlation value
main_df %>%
summarize(correlation = cor(xdata, ydata))
## correlation
## 1 -0.5395027
Now let us treat the groups separately.
ggplot(main_df, aes(x = xdata, y = ydata)) +
geom_point(aes(color = group)) +
ggtitle("All data together")
#find correlation values
main_df %>%
group_by(group) %>%
summarize(correlation = cor(xdata, ydata))
## # A tibble: 5 x 2
## group correlation
## <fct> <dbl>
## 1 group 1 0.754
## 2 group 2 0.691
## 3 group 3 0.705
## 4 group 4 0.805
## 5 group 5 0.608
Combining variables may make trends disappear or reverse. This phenomenon is called Simpson’s Paradox.
“I collected the amphipod crustacean Platorchestia platensis on a beach near Stony Brook, Long Island, in April, 1987, removed and counted the number of eggs each female was carrying, then freeze-dried and weighed the mothers.” (Source)
weight <- c(5.38, 7.36, 6.13, 4.75, 8.10, 8.62, 6.30, 7.44, 7.26, 7.17,
7.78, 6.23, 5.42, 7.87, 5.25, 7.37, 8.01, 4.92, 7.03, 6.45,
5.06, 6.72, 7.00, 9.39, 6.49, 6.34, 6.16, 5.74)
eggs <- c(29, 23, 22, 20, 25, 25, 17, 24, 20, 27,
24, 21, 22, 22, 23, 35, 27, 23, 25, 24,
19, 21, 20, 33, 17, 21, 25, 22)
df <- data.frame(weight, eggs)
ggplot to make a scatterplot of the data, and draw a linear regression line (with se = FALSE).ggplot(df, aes(x = weight, y = eggs)) +
geom_point() +
geom_smooth(method = "lm", se = FALSE) + #linear regression
ggtitle("Platorchestia platensis") +
theme_minimal() + #removes gray background
xlab("Weight (mg)") +
ylab("Eggs")
ggplot to make a scatterplot of the data, and draw a linear regression line (with se = TRUE).ggplot(df, aes(x = weight, y = eggs)) +
geom_point() +
geom_smooth(method = "lm", se = TRUE) + #linear regression
ggtitle("Platorchestia platensis") +
theme_minimal() + #removes gray background
xlab("Weight (mg)") +
ylab("Eggs")
#linear model
linear_fit <- lm(eggs ~ weight, data = df)
#make a prediction
prediction <- predict(linear_fit, newdata = data.frame(weight = 8))
Our linear model predicts that the crab has 25.5028021 eggs.
The reaction rates of the reaction S \(\rightarrow\) P catalyzed by enzyme E were determined under conditions such that only very little product was formed. Compute the maximum reaction velocity asymptote \(V_{\text{max}}\) and the Michaelis-Menton constant \(K_{m}\)
#substrate concentration (mircomolars)
S <- c(0.08, 0.12, 0.54, 1.23, 1.82, 2.72, 4.94, 10.00)
rS <- 1/S #reciprocal
#reaction rate (micromolars per minute)
v <- c(0.15, 0.21, 0.7, 1.1, 1.3, 1.5, 1.7, 1.8)
rv <- 1/v #reciprocal
df <- data.frame(S, v, rS, rv)
ggplot(df, aes(x = S, y = v)) +
geom_point() +
geom_smooth(se = FALSE) +
ggtitle("Michaelis-Menton Experiment") +
xlab("substrate concentration (micromolars)") +
ylab("reaction rate (micromolars per minute)")
ggplot(df, aes(x = rS, y = rv)) +
geom_point() +
geom_smooth(method = "lm", se = FALSE) +
ggtitle("Double Reciprocal Plot") +
xlab("1/[S] (1/micromoles)") +
ylab("1/v (minutes per micromolar)")
linear_fit <- lm(rv ~ rS, data = df)
slope <- linear_fit$coefficients[2]
intercept <- linear_fit$coefficients[1]
Vmax <- 1/intercept
Km <- Vmax*slope
print(paste("The Vmax asymptote is ", Vmax))
## [1] "The Vmax asymptote is 1.98946533065081"
print(paste("The Michaelis-Menton constant is ", Km))
## [1] "The Michaelis-Menton constant is 0.991986524114477"