# Read the AnAge longevity export; the first row of the file holds the column names.
AnAge <- read.csv(
  file = "C:/Users/kelse/OneDrive/Documents/Research Design Analysis/Files to Import/anage_longevity.csv",
  header = TRUE
)

# Keep only the mammal records. which() discards rows whose Class is NA,
# which is also what subset() does.
AnAge_mammalia <- AnAge[which(AnAge$Class == "Mammalia"), , drop = FALSE]

# Preview the first six mammal rows.
head(AnAge_mammalia, n = 6L)
## HAGRID Kingdom Phylum Class Order Family Genus
## 2327 2340 Animalia Chordata Mammalia Afrosoricida Tenrecidae Echinops
## 2328 2341 Animalia Chordata Mammalia Afrosoricida Tenrecidae Geogale
## 2329 2342 Animalia Chordata Mammalia Afrosoricida Tenrecidae Hemicentetes
## 2330 2343 Animalia Chordata Mammalia Afrosoricida Tenrecidae Microgale
## 2331 2344 Animalia Chordata Mammalia Afrosoricida Tenrecidae Microgale
## 2332 2345 Animalia Chordata Mammalia Afrosoricida Tenrecidae Setifer
## Species FemaleMaturity_days MaleMaturity_days
## 2327 telfairi 365 365
## 2328 aurita NA NA
## 2329 semispinosus 35 NA
## 2330 dobsoni 669 669
## 2331 talazaci 639 639
## 2332 setosus 198 258
## Gestation.Incubation_days Weaning_days Litter.Clutch_size
## 2327 55 29 6.00
## 2328 63 37 3.90
## 2329 59 21 4.00
## 2330 61 29 2.67
## 2331 61 29 2.00
## 2332 54 24 3.00
## Litters.ClutchesPerYear Interlitter.Interbirth_interval BirthWeight_g
## 2327 1 NA 7.67
## 2328 2 NA 0.70
## 2329 2 NA 11.50
## 2330 NA NA 3.95
## 2331 NA NA 3.60
## 2332 1 NA 24.70
## WeaningWeight_g AdultWeight_g GrowthRate_1overDays MaxLongevity_yrs Source
## 2327 50 180.0 0.0301 19.0 671
## 2328 NA 6.7 NA NA <NA>
## 2329 NA 180.0 NA 2.7 671
## 2330 NA 37.8 0.0266 5.6 434
## 2331 NA 50.0 0.0298 5.8 434
## 2332 NA 225.0 0.0419 14.1 671
## SpecimenOrigin SampleSize DataQuality IMR_perYr MRDT_yrs MetabolicRate_W
## 2327 captivity medium acceptable NA NA 0.750
## 2328 unknown small low NA NA 0.043
## 2329 captivity small questionable NA NA 0.380
## 2330 captivity small acceptable NA NA 0.315
## 2331 captivity small acceptable NA NA 0.243
## 2332 captivity medium acceptable NA NA 0.573
## BodyMass_g Temperature_K
## 2327 116.4 307.85
## 2328 6.9 303.95
## 2329 116.4 308.15
## 2330 44.6 304.05
## 2331 44.0 303.95
## 2332 427.6 305.35
# Sanity check: after subsetting, "Mammalia" should be the only class left.
unique(AnAge_mammalia[["Class"]])
## [1] "Mammalia"
# List the variables (columns) available for the mammal records.
names(AnAge_mammalia)
## [1] "HAGRID" "Kingdom"
## [3] "Phylum" "Class"
## [5] "Order" "Family"
## [7] "Genus" "Species"
## [9] "FemaleMaturity_days" "MaleMaturity_days"
## [11] "Gestation.Incubation_days" "Weaning_days"
## [13] "Litter.Clutch_size" "Litters.ClutchesPerYear"
## [15] "Interlitter.Interbirth_interval" "BirthWeight_g"
## [17] "WeaningWeight_g" "AdultWeight_g"
## [19] "GrowthRate_1overDays" "MaxLongevity_yrs"
## [21] "Source" "SpecimenOrigin"
## [23] "SampleSize" "DataQuality"
## [25] "IMR_perYr" "MRDT_yrs"
## [27] "MetabolicRate_W" "BodyMass_g"
## [29] "Temperature_K"
# Count the distinct values in every column (unique() keeps NA as a value of its own).
# vapply() pins the result to exactly one integer per column, so the return type
# cannot silently change the way sapply()'s can (e.g. for a zero-column data frame).
vapply(AnAge_mammalia, function(column) length(unique(column)), integer(1))
## HAGRID Kingdom
## 1327 1
## Phylum Class
## 1 1
## Order Family
## 28 130
## Genus Species
## 643 1111
## FemaleMaturity_days MaleMaturity_days
## 382 249
## Gestation.Incubation_days Weaning_days
## 279 254
## Litter.Clutch_size Litters.ClutchesPerYear
## 120 72
## Interlitter.Interbirth_interval BirthWeight_g
## 262 625
## WeaningWeight_g AdultWeight_g
## 322 861
## GrowthRate_1overDays MaxLongevity_yrs
## 200 355
## Source SpecimenOrigin
## 48 3
## SampleSize DataQuality
## 5 4
## IMR_perYr MRDT_yrs
## 12 13
## MetabolicRate_W BodyMass_g
## 396 392
## Temperature_K
## 86
# I guess I am struggling to understand this question. My instinct is to say species, because there are far more unique values in that column? I am just not understanding, because there are other columns that represent other variables.
# Number of mammal records (rows).
dim(AnAge_mammalia)[1L]
## [1] 1327
# Number of variables (columns).
dim(AnAge_mammalia)[2L]
## [1] 29
# Histogram of the raw interlitter/interbirth intervals, with missing values removed.
hist(
  x = with(
    AnAge_mammalia,
    Interlitter.Interbirth_interval[!is.na(Interlitter.Interbirth_interval)]
  ),
  breaks = 10,
  col = "orange",
  border = "orange4",
  main = "Frequency Distribution of Interlitter and Interbirth Interval",
  xlab = "Interlitter and Interbirth Interval (Days)",
  ylab = "Frequency"
)
# I can see that the data is skewed right (this seems to be the case so far in all of our datasets), so I am going to log transform it. Also, I know you said that I don't have to keep using !is.na once I have already done it once, but my brain refuses to trust the process, so I am probably going to keep putting it in every time.
# Natural log of the non-missing interlitter/interbirth intervals.
log_inter <- log(
  AnAge_mammalia$Interlitter.Interbirth_interval[
    !is.na(AnAge_mammalia$Interlitter.Interbirth_interval)
  ]
)

# Histogram of the log-transformed intervals; the returned histogram object
# (breaks, counts, ...) is kept in hist_info.
# The x-axis label now closes both parentheses.
hist_info <- hist(
  log_inter,
  main = "Log-Transformed Freq Distribution of Interlitter and Interbirth Interval",
  xlab = "Log(Interlitter and Interbirth Interval (Days))",
  col = "orange",
  breaks = 10
)

# Geometric mean: the mean of the logs, back-transformed to the original scale.
exp(mean(log_inter, na.rm = TRUE))
## [1] 249.6819
# Geometric standard deviation: the SD of the logs, back-transformed. It is a
# multiplicative factor, not a value in the original units.
exp(sd(log_inter, na.rm = TRUE))
## [1] 2.6388
# Pull out the response (interbirth interval) and the three candidate predictors.
AnAge_subset <- AnAge_mammalia[c(
  "Interlitter.Interbirth_interval",
  "Gestation.Incubation_days",
  "Litters.ClutchesPerYear",
  "MaxLongevity_yrs"
)]

# Keep only the rows that have no missing value in any of the four columns.
AnAge_clean <- AnAge_subset[rowSums(is.na(AnAge_subset)) == 0, ]
# Upper-panel function for pairs(): writes the correlation between x and y
# (cor() default, i.e. Pearson) in the centre of the panel.
#
# Arguments:
#   x, y     numeric vectors handed over by pairs() for the current panel.
#   digits   number of significant digits used to format the coefficient.
#   prefix   text placed in front of the coefficient (default: nothing).
#   cex.cor  text magnification; 2.0 is used when it is not supplied.
#   ...      accepted so extra arguments forwarded by pairs() do not cause
#            an error; they are not used.
#
# Why par("usr") is needed: the panel's user coordinates are switched to the
# unit square so that (0.5, 0.5) is the middle of the panel whatever the data
# range is; without this the text lands in a corner and gets clipped.
#
# Why the earlier version warned: par("usr") returns a bare numeric vector, so
# it must be restored by name with par(usr = ...). Calling par(usr) with the
# vector restores nothing and raises "argument 1 does not name a graphical
# parameter" once per panel (the warnings recorded after the pairs() call came
# from that earlier version).
panel.cor <- function(x, y, digits = 2, prefix = "", cex.cor, ...) {
  old_usr <- par("usr")
  on.exit(par(usr = old_usr), add = TRUE)
  par(usr = c(0, 1, 0, 1))

  r <- cor(x, y, use = "complete.obs")
  # r is formatted together with a reference decimal so both share one format;
  # only the first (r) is kept. Same idiom as the example in ?pairs.
  txt <- format(c(r, 0.123456789), digits = digits)[1]
  txt <- paste0(prefix, txt)

  if (missing(cex.cor)) {
    cex.cor <- 2.0
  }
  text(0.5, 0.5, txt, cex = cex.cor, col = "darkgreen", adj = c(0.5, 0.5))
}
# Scatter plot matrix of the four model variables: scatter plots below the
# diagonal, correlation coefficients (via panel.cor) above it.
pairs(
  x = AnAge_clean,
  upper.panel = panel.cor,
  col = "darkgreen",
  pch = 16,
  main = "Scatter Plot Matrix w/ Pearson"
)
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
# Multiple linear regression of the interbirth interval on gestation length,
# litters per year and maximum longevity (complete cases only).
multi_regression_model <- lm(
  formula = Interlitter.Interbirth_interval ~
    Gestation.Incubation_days +
    Litters.ClutchesPerYear +
    MaxLongevity_yrs,
  data = AnAge_clean
)

# Coefficient table, R-squared and overall F-test.
summary(object = multi_regression_model)
##
## Call:
## lm(formula = Interlitter.Interbirth_interval ~ Gestation.Incubation_days +
## Litters.ClutchesPerYear + MaxLongevity_yrs, data = AnAge_clean)
##
## Residuals:
## Min 1Q Median 3Q Max
## -547.16 -107.76 -21.99 68.70 1096.92
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 235.8290 22.7145 10.382 < 2e-16 ***
## Gestation.Incubation_days 0.7340 0.1021 7.191 2.82e-12 ***
## Litters.ClutchesPerYear -66.1178 7.5775 -8.726 < 2e-16 ***
## MaxLongevity_yrs 6.0942 0.6202 9.826 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 175.3 on 437 degrees of freedom
## Multiple R-squared: 0.6575, Adjusted R-squared: 0.6552
## F-statistic: 279.7 on 3 and 437 DF, p-value: < 2.2e-16
# The R2 value is 0.6575 (65.75%). Yes, the coefficient estimates are significant because their p-values are smaller than 5%
# Draw the standard lm diagnostic plots for the first model.
plot(x = multi_regression_model)
# Residuals vs. Fitted plot- First assumption is violated (linear relationship)
# Q-Q plot- 3rd assumption is violated (errors have mean of 0 & are normally distributed)
# Scale-location plot- I think the 4th assumption might be violated (error terms). I can't visualize a cone/fan shape that would indicate clear heteroscedasticity, but it also doesn't really seem to me to have constant variance.
# Add the natural log of litters per year as a new predictor column.
AnAge_clean[["Log_Litters.ClutchesPerYear"]] <-
  log(AnAge_clean[["Litters.ClutchesPerYear"]])

# Refit the regression with the log-transformed litters-per-year predictor.
AnAge_clean_log <- lm(
  formula = Interlitter.Interbirth_interval ~
    Gestation.Incubation_days +
    Log_Litters.ClutchesPerYear +
    MaxLongevity_yrs,
  data = AnAge_clean
)

summary(object = AnAge_clean_log)
##
## Call:
## lm(formula = Interlitter.Interbirth_interval ~ Gestation.Incubation_days +
## Log_Litters.ClutchesPerYear + MaxLongevity_yrs, data = AnAge_clean)
##
## Residuals:
## Min 1Q Median 3Q Max
## -475.74 -85.26 -16.06 48.68 853.94
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 290.45875 15.69995 18.501 < 2e-16 ***
## Gestation.Incubation_days 0.33517 0.08566 3.913 0.000106 ***
## Log_Litters.ClutchesPerYear -257.17397 13.50163 -19.048 < 2e-16 ***
## MaxLongevity_yrs 3.43514 0.52510 6.542 1.7e-10 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 140.4 on 437 degrees of freedom
## Multiple R-squared: 0.7803, Adjusted R-squared: 0.7788
## F-statistic: 517.3 on 3 and 437 DF, p-value: < 2.2e-16
# I am not going to drop any of my independent variables, because their p-values are all below 0.05 and therefore they are all statistically significant.
# Same model, now with a gestation x log(litters per year) interaction.
# The `*` operator expands to both main effects plus their interaction term.
AnAge_clean_log_interaction_term <- lm(
  formula = Interlitter.Interbirth_interval ~
    Gestation.Incubation_days * Log_Litters.ClutchesPerYear +
    MaxLongevity_yrs,
  data = AnAge_clean
)

summary(object = AnAge_clean_log_interaction_term)
##
## Call:
## lm(formula = Interlitter.Interbirth_interval ~ Gestation.Incubation_days *
## Log_Litters.ClutchesPerYear + MaxLongevity_yrs, data = AnAge_clean)
##
## Residuals:
## Min 1Q Median 3Q Max
## -481.43 -59.15 -12.57 34.46 795.98
##
## Coefficients:
## Estimate Std. Error
## (Intercept) 312.38870 14.98132
## Gestation.Incubation_days 0.08427 0.08645
## Log_Litters.ClutchesPerYear -210.73120 13.97737
## MaxLongevity_yrs 2.84969 0.49788
## Gestation.Incubation_days:Log_Litters.ClutchesPerYear -0.53883 0.06881
## t value Pr(>|t|)
## (Intercept) 20.852 < 2e-16 ***
## Gestation.Incubation_days 0.975 0.33
## Log_Litters.ClutchesPerYear -15.077 < 2e-16 ***
## MaxLongevity_yrs 5.724 1.94e-08 ***
## Gestation.Incubation_days:Log_Litters.ClutchesPerYear -7.830 3.74e-14 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 131.7 on 436 degrees of freedom
## Multiple R-squared: 0.8074, Adjusted R-squared: 0.8056
## F-statistic: 456.9 on 4 and 436 DF, p-value: < 2.2e-16
# The main effect of gestation incubation days is no longer significant (p = 0.33), but max longevity, log clutches per year, and the interaction term are all significant (p < 0.001).
# Draw the standard lm diagnostic plots for the interaction model.
plot(x = AnAge_clean_log_interaction_term)
# Residuals vs. Fitted plot- There is super minimal improvement and there is still a violation.
# Q-Q plot- Same thing, improvement but still violation.
# Scale-location plot- Same as the previous two.
# Residual vs. Leverage- There is a lot of difference from before, but I honestly have no idea what violations I should be looking for with this specific plot (I wasn't in class the day we discussed violations).
# The interaction term shows that both the gestation incubation days and the clutches per year affect the interlitter/interbirth interval. When the number of clutches per year is higher, the effect of gestation time on the birth interval becomes smaller. So, when these mammals have more clutches per year, the length of gestation won’t matter as much for the time between births. We can tell this because the interaction term has a negative coefficient, meaning that when clutches per year increases, the effect of gestation incubation days on birth intervals decreases. Since the interaction term is also statistically significant, we can assume that this is not due to chance.
library(car)
## Loading required package: carData
# Variance inflation factors for the additive (no-interaction) log model.
car::vif(mod = AnAge_clean_log)
## Gestation.Incubation_days Log_Litters.ClutchesPerYear
## 2.219035 2.103391
## MaxLongevity_yrs
## 2.269946
# Since the VIF values of all of the independent variables are well below 10, we can say that the independent variables are not strongly collinear.
# R-squared of a linear model fitted to AnAge_clean for the given formula.
fit_r2 <- function(model_formula) {
  summary(lm(model_formula, data = AnAge_clean))$r.squared
}

# Two-predictor model and the two single-predictor models.
full_r2 <- fit_r2(
  Interlitter.Interbirth_interval ~ Gestation.Incubation_days + Log_Litters.ClutchesPerYear
)
gestation_r2 <- fit_r2(Interlitter.Interbirth_interval ~ Gestation.Incubation_days)
litters_r2 <- fit_r2(Interlitter.Interbirth_interval ~ Log_Litters.ClutchesPerYear)

# Unique share of each predictor: the R-squared lost when that predictor is
# removed from the two-predictor model.
gestation_variance <- full_r2 - litters_r2
litters_variance <- full_r2 - gestation_r2
# Shared share: what gestation explains on its own minus its unique share.
joint_variance <- gestation_r2 - gestation_variance

print(gestation_variance)
## [1] 0.02929968
print(litters_variance)
## [1] 0.2782203
print(joint_variance)
## [1] 0.4512559