ddirt <- read.csv("ddirt.csv")
#subset to the double litter (DL) treatment plots
dl_all <- ddirt[ddirt$litter_treatment == "DL", ]
#explore the C~N relationship by litter treatment
ggplot(ddirt, aes(percentOrgC_2014, percentN_2014, color = litter_treatment)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE)
I think I will just go with organic carbon as a predictor for nitrogen using the 2014 data. I will isolate the double litter plots, since the relationship looks steepest there.
xbar <- mean(dl_all$percentOrgC_2014, na.rm = TRUE)
ybar <- mean(dl_all$percentN_2014, na.rm = TRUE)
#find the difference between each data point and its mean
x_diff <- dl_all$percentOrgC_2014 - xbar
y_diff <- dl_all$percentN_2014 - ybar
#find the numerator and denominator of the least-squares slope
numerator <- sum(x_diff*y_diff, na.rm = TRUE)
denominator <- sum(x_diff^2, na.rm = TRUE)
#calculate the slope
b1 <- numerator/denominator
#find the intercept from Y = b0 + b1(X)
b0 <- ybar - b1*xbar
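As a quick sanity check (my addition, not part of the original workflow; check_dl is a throwaway name), lm() should recover essentially the same slope and intercept. Small differences can appear if NAs occur in only one of the two columns, since lm() drops incomplete rows.
#sanity check: lm() should match the hand-computed b0 and b1
check_dl <- lm(percentN_2014 ~ percentOrgC_2014, data = dl_all)
coef(check_dl) #compare to c(b0, b1)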
#check the spread of N to pick a reasonable sigma for the simulation
sd(dl_all$percentN_2014, na.rm = TRUE)
## [1] 0.03391587
n <- 30
simulated_carbon <- runif(n, min=0, max=1) #choose range for x data
In this case, I am using the parameters estimated from the DDIRT double litter data…but I could easily just choose any parameters.
b0 #intercept
## [1] 0.008990332
b1 #slope
## [1] 0.1051602
mu <- b0 + b1*simulated_carbon #the TRUE mean response along the line
sigma <- .03 #close to the observed sd of percentN_2014 (~0.034)
simulated_nitrogen <- rnorm(n, mean = mu, sd = sigma)
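One note I'm adding: runif() and rnorm() draw new random numbers every run, so the exact values below won't reproduce unless a seed is set before those calls (the seed value itself is arbitrary):
#a fixed seed before runif()/rnorm() makes the simulation repeatable
set.seed(1)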
#name model for line
model_nc <- lm(simulated_nitrogen~simulated_carbon)
summary(model_nc)
##
## Call:
## lm(formula = simulated_nitrogen ~ simulated_carbon)
##
## Residuals:
##       Min        1Q    Median        3Q       Max
## -0.045421 -0.018955 -0.000841  0.020077  0.058650
##
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)
## (Intercept)       0.01441    0.01093   1.319    0.198
## simulated_carbon  0.09380    0.01832   5.119    2e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.02747 on 28 degrees of freedom
## Multiple R-squared: 0.4835, Adjusted R-squared: 0.465
## F-statistic: 26.21 on 1 and 28 DF, p-value: 2.002e-05
plot(simulated_nitrogen ~ simulated_carbon,
     main = "Simulated double litter plots",
     xlab = "% Organic C",
     ylab = "% N",
     ylim = c(0, .2),
     xlim = c(0, 1))
#abline() is a separate call in base R; "+" only chains ggplot layers
abline(model_nc, col = "red", lwd = 2)
Fig. 1: HW1 simulated data for the relationship between % organic carbon and % nitrogen in double litter plots for the DDIRT project. For every 1% increase in organic carbon, percent nitrogen increased by roughly 0.1% in the simulated double litter plots (true slope = 0.105; fitted slope = 0.094).
model_nc$coefficients
## (Intercept) simulated_carbon
## 0.01441212 0.09380411
#OG slope and int
b0
## [1] 0.008990332
b1
## [1] 0.1051602
The fitted linear model closely recovered the true parameters used to generate the data, with an estimated slope (0.094) and intercept (0.014) very similar to the specified values (slope = 0.105, intercept = 0.009), indicating good model performance despite the added noise.
predicted_cn<- predict(model_nc)
fake_DDIRT <- data.frame(simulated_carbon, simulated_nitrogen, predicted_cn)
#create a function for the r^2 equation (work from the inside out!)
#ingredient 1: ss_res = sum((y-y_hat)^2)
#ingredient 2: ss_tot = sum((y-mean(y))^2)
#ingredient 3: 1 - (ingredient 1/ingredient 2)
r_squared <- function(y, y_hat) {
  ss_res <- sum((y - y_hat)^2)
  ss_tot <- sum((y - mean(y))^2)
  1 - (ss_res/ss_tot)
}
#use the function on the data:
r_squared(fake_DDIRT$simulated_nitrogen, fake_DDIRT$predicted_cn)
## [1] 0.4834554
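As a cross-check (my addition), lm() reports the same quantity, so the hand-rolled function can be verified directly:
#should match both the Multiple R-squared in summary(model_nc) and the function above
summary(model_nc)$r.squared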
roaches <- read.csv("cockroachneurons.csv")
#preview the first rows as a formatted table (gt package)
headroach <- head(roaches)
headroach |>
  gt()
| temperature | rate |
|---|---|
| 11.9 | 155.03 |
| 14.4 | 142.33 |
| 15.3 | 140.11 |
| 16.4 | 142.65 |
| 17.9 | 135.24 |
| 20.4 | 130.16 |
ggplot(roaches, aes(temperature, rate))+
geom_point(color="#00B8E7")+
geom_smooth(color="#ED68ED", method="lm", se=F)+
labs(title="Cockroach Neural Activity ~ Temperature", x="Temperature (C)", y=expression("Neural Firing Rate (s" ^-1 ~ ")")
)
## `geom_smooth()` using formula = 'y ~ x'
xbar <- mean(roaches$temperature)
ybar <- mean(roaches$rate)
#find the difference between each data point and its mean
x_diff <- roaches$temperature - xbar
y_diff <- roaches$rate - ybar
#find the numerator and denominator of the least-squares slope
numerator <- sum(x_diff*y_diff)
denominator <- sum(x_diff^2)
#calculate the slope
b1 <- numerator/denominator
#find the intercept from Y = b0 + b1(X)
b0 <- ybar - b1*xbar
#when x is 0, Y = b0: the intercept is the predicted firing rate at 0 degrees C
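Equivalently (a quick check I'm adding, not in the original), the least-squares slope is just the sample covariance of x and y divided by the sample variance of x, since both use the same n-1 denominator:
#equivalent one-liner for the slope: b1 = cov(x, y)/var(x)
cov(roaches$temperature, roaches$rate)/var(roaches$temperature)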
n <- 140
predictor <- runif(n, min=10, max=35) #choose range for x data
In this case, I am using the parameters from the roach data…but I could easily just choose any parameters.
b0 #intercept
## [1] 133.9533
b1 #slope
## [1] -2.058026
mu <- b0 + b1*predictor #the TRUE mean firing rate along the line
sigma <- 3 #amount of noise, in firing-rate units
response <- rnorm(n, mean = mu, sd = sigma)
plot(response ~ predictor,
main="Cockroach neural activity as function of temperature",
xlab = expression(Temperature~degree*C),
ylab = expression("Firing Activity (s" ^-1 ~ ")"))
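Mirroring the DDIRT example above (my addition; model_roach is just a throwaway name), the true generating line and a fitted line can be overlaid to judge how well the simulation recovers the parameters:
#overlay the TRUE line and the fitted line, as in the DDIRT example
abline(b0, b1, col = "red", lwd = 2)
model_roach <- lm(response ~ predictor)
abline(model_roach, col = "blue", lwd = 2, lty = 2)
coef(model_roach) #should land near c(b0, b1)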
lions <- read.csv("lionnose.csv")
plot(ageInYears ~ proportionBlack, data=lions)
#equation for a line: Y = B0 +B1(X)
xBar<-mean(lions$proportionBlack)
yBar <- mean(lions$ageInYears)
#calculating slope and intercept using the least squares regression:
x_diff <- lions$proportionBlack-xBar
y_diff <- lions$ageInYears-yBar
numerator <- sum(x_diff * y_diff)
denominator <- sum(x_diff^2) #make sure square is on inside of parentheses!
#Slope pg 544
b1 <- numerator/denominator
#find intercept using b1
# Y = b0 + b1X
b0 <- yBar-b1*xBar
numerator
## [1] 13.01234
denominator
## [1] 1.222147
b1
## [1] 10.64712
xBar
## [1] 0.3221875
yBar
## [1] 4.309375
b0
## [1] 0.8790062
#the equation can be written as Age = 0.88 + 10.65(proportion black) aka Y = 0.88 + 10.65X
#now you can add the regression line to the plot using abline function: abline(a = intercept, b = slope)
plot(ageInYears ~ proportionBlack, lions, main="Lion nose pigmentation", ylab="Age (years)", xlab="Proportion black")
abline(a=b0, b=b1, col="coral")
#Ok now using this example, how do I generate my own linear data using a normal distribution?
#If I already know slope and intercept, how do I generate data that follow that line with noise?
#you choose a slope and intercept, compute the true line, then add normally distributed error around it.
#now generate some data that follow this example using the lion nose slope and intercept.
#STEP 1: choose sample size
n <- 40
#STEP 2: Generate the predictor (X), which, btw, DOES NOT NEED TO BE NORMAL. The approach works the same whether the predictor is drawn from a uniform or a normal distribution
predictor <- runif(n, min=0, max=1)
#STEP 3: Choose the true parameters for slope and intercept(in this case use the lion calculated data)
b0
## [1] 0.8790062
b1
## [1] 10.64712
#in this case I am just using the calculated slope and intercept from the data
#### The equation looks like this: Y = b0 + b1X, where the first step is predicting a mean Y for each X
#### Y = 0.9 + 10.6X
# STEP 4: Compute the TRUE mean (average response), i.e., the average Y for your data
mu <- b0 + b1*predictor #mu represents the PREDICTED values, not the actual values -- therefore, it isn't Ybar
# STEP 5: Add normally distributed error to the predicted values (since environmental data always has it!)
sigma <- 10 #this needs to be reasonable. And it depends on the data units...
# STEP 6: Create the generated data to follow the normal distribution
response <- rnorm(n, mean = mu, sd = sigma) #when mean=mu, there is no longer ONE PEAK. there is a peak around each predicted value, each following a normal distribution. if mean=10 in this equation, there would be one center that every y value scattered around, but we have assigned mean a vector, so each point is drawn randomly within its own allowed space!
#and then sigma here is giving it the amount of noise...100 isn't exactly reasonable...10 is kinda reasonable, 2-5 might be most reasonable!
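A tiny illustration of that vectorized mean (my own aside, with made-up numbers): each draw is centered on its own mean.
#three draws, each centered on a different mean
rnorm(3, mean = c(0, 10, 100), sd = 1) #one value near 0, one near 10, one near 100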
# STEP 7: plot the simulation of response given predictor
plot(response ~ predictor) #see how the noise is a bit ridiculous?
#refine
sigma <- 4 #play around with it
response<- rnorm(n, mean=mu, sd=sigma)
plot(response~predictor)
#okay make it look better
plot(response~predictor, main="practice lions", xlab="Proportion of black noses", ylab="Age (years)")
#add the TRUE line
abline(b0, b1, col="lightblue", lwd = 2)
#add the model line (see if the model is a good fit or not)
model1 <- lm(response~predictor)
#add the model line to visualize if its a good fit
abline(model1, col="purple", lwd=2, lty = 2)
#STEP 8: Compare estimation to true or actual
coef(model1)
## (Intercept) predictor
## 1.036222 10.261343
# 1.036222 (y-int) 10.261343 (estimated slope)
# actual b0 given was 0.8790062 (y-intercept)
# actual b1 given was 10.64712 (actual slope)
#slope is off by
10.65 - 10.26
## [1] 0.39
#about 0.4 units...the slope in this model is more informative because the model was EXTRAPOLATING the intercept...since none of your data sat exactly on the zero mark for x...
#if you wanted your data to look closer to the line, you could decrease sigma and increase sample size.
##### SENTENCE: While the fitted model recovered the general magnitude of the true slope, the intercept estimate differed from the true value, reflecting sampling noise and the sensitivity of intercept estimates when extrapolating beyond the central range of the predictor.
#### TAKEAWAY: slopes are learned from patterns. Intercepts are learned from boundaries. Where does something begin? Boundaries are often weakly informed (think of humans trying to set boundaries. They can be hard to find when not explicit and depending on family of origin! lol.)
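One concrete way to see this (my addition, a sketch reusing the simulated lion data above; model_centered is a throwaway name): center the predictor at its mean and refit. The intercept then sits in the middle of the data, where it is well informed, instead of out at the x = 0 boundary.
#centering moves the intercept from the boundary (x = 0) into the middle of the data
centered <- predictor - mean(predictor)
model_centered <- lm(response ~ centered)
coef(model_centered) #the intercept now estimates mean(response) and is much better constrained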