# Import the Pearce et al. (2013) data set; the first row of the CSV holds the
# column names.
Pearce <- read.csv(
  file = "C:/Users/kelse/OneDrive/Documents/Research Design Analysis/Files to Import/Pearce_etal_2013.csv",
  header = TRUE
)

Question 1

a.

# Just to note- I missed lecture this week so hopefully my interpretations of the slides are correct. I got notes from some classmates, but please leave me a lot of feedback if I am misunderstanding. 

# Number of rows (observations) in the imported data frame.
nrow(Pearce)
## [1] 286
# Number of columns (variables) in the imported data frame.
ncol(Pearce)
## [1] 15

b.

# Histogram of the raw home range sizes, with missing values dropped before
# plotting.
with(
  Pearce,
  hist(
    home_range[!is.na(home_range)],
    breaks = 20,
    col = "darksalmon",
    border = "brown3",
    main = "Distribution of Home Range Size",
    xlab = "Home Range Size (hectares)",
    ylab = "Frequency"
  )
)

# The data are right-skewed: most home ranges are small (clustered in the tall bar on the left), while a few very large ones stretch out to the right. It very clearly needs to be transformed; I don't need to add a density curve to see that!


# Natural-log transform of home range. Kept as a standalone vector because the
# location/spread summaries that follow reuse it.
log_home_range <- log(Pearce$home_range)

# Bin the data without drawing yet, and estimate the kernel density, so that
# the y-axis limit can be chosen to fit both the bars and the curve.
hist_info <- hist(log_home_range, breaks = 20, plot = FALSE)
density_curve <- density(log_home_range, na.rm = TRUE)

# Draw the histogram on the density scale (freq = FALSE). A kernel density
# estimate integrates to 1, so on a count-scale histogram it hugs the x-axis
# and looks like a flat line. With the bars on the density scale the curve can
# be overlaid directly; no manual rescaling (which only matched the two peaks)
# is needed.
plot(hist_info,
     freq = FALSE,
     main = "Log-Transformed Distribution of Home Range",
     xlab = "Log(Home Range)",
     col = "darksalmon",
     ylim = c(0, max(hist_info$density, density_curve$y)))
lines(density_curve, col = "black", lwd = 2)

# Location (I am using the location and spread measures that were suggested in the PowerPoint for non-normal data. I am assuming that even though I transformed it, the data is still considered non-normal.)
# NOTE(review): the two location values below are on different scales.
# exp(mean(log(x))) back-transforms the mean of the logs, so this is the geometric mean of home range in the original units.
exp(mean(log_home_range, na.rm = TRUE))
## [1] 51.64006
# This median is not back-transformed, so it is in log units; exp(3.465736) is about 32 in the original units.
median(log_home_range, na.rm = TRUE)
## [1] 3.465736
# Spread
# NOTE(review): again one back-transformed value and one log-scale value.
# exp(sd(log(x))) is a multiplicative (geometric) standard-deviation factor, not a standard deviation in the original units.
exp(sd(log_home_range, na.rm = TRUE))
## [1] 7.311191
# Interquartile range of the log values, reported in log units.
IQR(log_home_range, na.rm = TRUE)
## [1] 2.732112

Question 2

a.

# Scatter plot of home range against body mass on the raw scales.
# The y-axis unit is given as hectares so that it agrees with the x-axis label
# of the home range histogram drawn for Question 1.
# NOTE(review): confirm the unit against the data dictionary.
plot(Pearce$mass, Pearce$home_range,
     main = "Home Range as a Function of Body Mass",
     xlab = "Mass (grams)",
     ylab = "Home Range (hectares)")

b.

# Same raw-scale scatter plot, redrawn so the fitted line can be added to it.
# The y-axis unit is given as hectares so that it agrees with the x-axis label
# of the home range histogram drawn for Question 1.
# NOTE(review): confirm the unit against the data dictionary.
plot(Pearce$mass, Pearce$home_range,
     main = "Home Range as a Function of Body Mass",
     xlab = "Mass (grams)",
     ylab = "Home Range (hectares)")

# Simple linear regression of home range on body mass (untransformed), with
# its fitted line overlaid on the scatter plot.
lm.res <- lm(home_range ~ mass, data = Pearce)
abline(lm.res, col = "purple")

c.

# Standard lm diagnostics for the untransformed model: residuals vs. fitted,
# normal Q-Q, scale-location, and residuals vs. leverage.
plot(lm.res, which = c(1, 2, 3, 5))

# Yes, assumptions are violated.
# Residuals vs. Fitted plot - the first assumption (the relationship is linear) is violated. The trend line should be straight, not rise and fall the way this one does. The third assumption (error terms are normally distributed with a mean of zero) is also violated: the points do not follow the line as closely as they should.

# Q-Q plot- the third assumption is also violated here (error terms are normally distributed with a mean of zero.). Both ends of the data are not near the line and don't show normal distribution. 

# Scale-location plot - the fourth assumption (errors have constant variance) is violated. The spread shrinks to very little variation as the values increase, so the variance is definitely not constant.

# Residuals vs. Leverage plot- I don't notice any violations here. 

Question 3

a.

# Add natural-log versions of home range and body mass to the data frame so
# that later models can refer to them by name.
Pearce$log_home_range <- log(Pearce$home_range)
Pearce$log_mass <- log(Pearce$mass)

# Scatter plot on the log-log scale. The units are written inside the log
# because they describe the variable being logged, not the logged value.
# Home range is labelled in hectares so that it agrees with the home range
# histogram drawn for Question 1.
# NOTE(review): confirm the unit against the data dictionary.
plot(Pearce$log_mass, Pearce$log_home_range,
     main = "Log(Home Range) as a Function of Log(Mass)",
     xlab = "Log(Mass in grams)",
     ylab = "Log(Home Range in hectares)")

b.

# Same log-log scatter plot, redrawn so the fitted line can be added to it.
# The units are written inside the log because they describe the variable
# being logged. Home range is labelled in hectares so that it agrees with the
# home range histogram drawn for Question 1.
# NOTE(review): confirm the unit against the data dictionary.
plot(Pearce$log_mass, Pearce$log_home_range,
     main = "Log(Home Range) as a Function of Log(Mass)",
     xlab = "Log(Mass in grams)",
     ylab = "Log(Home Range in hectares)")

# Simple linear regression of log home range on log body mass, with its fitted
# line overlaid on the scatter plot.
log_model <- lm(log_home_range ~ log_mass, data = Pearce)
abline(log_model, col = "purple")

c. 

# Standard lm diagnostics for the log-log model: residuals vs. fitted,
# normal Q-Q, scale-location, and residuals vs. leverage.
plot(log_model, which = c(1, 2, 3, 5))

# Yes, assumptions are still being violated. The violations are less severe after the transformations, though: none of them is fully resolved, just improved.

d. 

# Going forward I am assuming that I am using the transformed data. Please let me know if I should have done the non-transformed data instead. I would love to do both for good measure, but I don't have the time this week. 

# Summarise the log-log model once and read every reported number from that
# single summary. Rows and columns of the coefficient table are picked by
# name rather than by position.
log_model_summary <- summary(log_model)
coef_table <- coef(log_model_summary)

intercept <- coef_table["(Intercept)", "Estimate"]
slope <- coef_table["log_mass", "Estimate"]
r_squared <- log_model_summary[["r.squared"]]
p_value_slope <- coef_table["log_mass", "Pr(>|t|)"]

intercept
## [1] -3.595846
slope
## [1] 0.9009145
r_squared
## [1] 0.4131651
p_value_slope
## [1] 9.862139e-35
# Interpreting the slope - Because both variables are log-transformed, a slope of 0.9009145 means that when body mass increases by 1%, home range size increases by about 0.9%. That is slightly lower than a 1:1 ratio, so I'm not sure whether we would round up and treat it as 1:1 or not.

# What does the p-value say about our tested hypothesis - We can reject the null hypothesis because the p-value is below the alpha of 0.05. This means that we can confidently say that the relationship between body mass and home range is statistically significant.

e.

# I am going to use Pearson here because I am using two variables that have a linear relationship and the data became more normally distributed when I transformed it. I would not use that here if I was using the non-transformed data. 
# Pearson correlation between the two log-transformed variables.
with(Pearce, cor(log_mass, log_home_range, method = "pearson"))
## [1] 0.6427792
# p-value of the log_mass slope from the log-log regression. In a simple
# regression the slope test and the Pearson correlation test share the same
# t statistic, so this p-value also applies to the correlation.
coef(summary(log_model))["log_mass", "Pr(>|t|)"]
## [1] 9.862139e-35

Question 4

a.

# I plotted these untransformed first and saw that they needed log transformations to make the relationships more linear. I'm deleting the original plots because it was becoming too many plots for my brain to look at.

# (Re)create the log-transformed columns used by the plot and model below.
Pearce[["log_home_range"]] <- log(Pearce[["home_range"]])
Pearce[["log_mass"]] <- log(Pearce[["mass"]])

# Log home range against log body mass.
with(
  Pearce,
  plot(
    log_mass, log_home_range,
    main = "Home Range vs. Body Mass",
    xlab = "Body Mass (Log Transformed)",
    ylab = "Home Range (Log Transformed)"
  )
)

# Overlay the least-squares line for log home range on log mass.
abline(reg = lm(log_home_range ~ log_mass, data = Pearce), col = "darksalmon")

# Natural-log transform of group size, stored alongside the other log columns.
Pearce[["log_group_size"]] <- log(Pearce$group_size)

# Log home range against log group size.
with(
  Pearce,
  plot(
    log_group_size, log_home_range,
    main = "Home Range vs. Group Size",
    xlab = "Log(Group Size)",
    ylab = "Log(Home Range)"
  )
)

# Overlay the least-squares line for log home range on log group size.
abline(reg = lm(log_home_range ~ log_group_size, data = Pearce), col = "purple")

b.

# To compare mass and group size, we need to center and scale them. This allows us to put both variables on the same scale, making them comparable despite having different unit measurements. There is no need to transform the data again. Since the slope of the mass is larger, it means that it has a greater impact/effect on home range size than the group size does. 

# Fit one simple regression per predictor, each with the predictor centred and
# scaled by scale(), so the two slopes are expressed per standard deviation of
# the predictor.
# NOTE(review): the two models are fitted separately; if group_size has missing
# values they may be based on different rows - confirm before comparing.
lm_mass_scaled <- lm(log_home_range ~ scale(log_mass), data = Pearce)
lm_group_size_scaled <- lm(log_home_range ~ scale(log_group_size), data = Pearce)

# Standardised slope for body mass.
coef(lm_mass_scaled)[2]
## scale(log_mass) 
##        1.278749
# Standardised slope for group size.
coef(lm_group_size_scaled)[2]
## scale(log_group_size) 
##              1.137276