# Load the AnAge longevity table from a local CSV file (first row = headers).
# NOTE(review): this is an absolute, machine-specific path; the script only
# runs on a computer where this exact file location exists.
AnAge <- read.csv(
  file = "C:/Users/kelse/OneDrive/Documents/Research Design Analysis/Files to Import/anage_longevity.csv",
  header = TRUE
)

Question 1

a.

# Keep only the rows whose Class is "Mammalia". Rows with a missing Class are
# dropped as well, which is what subset() does. Then preview the first six rows.
AnAge_mammalia <- AnAge[
  !is.na(AnAge$Class) & AnAge$Class == "Mammalia", ,
  drop = FALSE
]
head(AnAge_mammalia, n = 6L)
##      HAGRID  Kingdom   Phylum    Class        Order     Family        Genus
## 2327   2340 Animalia Chordata Mammalia Afrosoricida Tenrecidae     Echinops
## 2328   2341 Animalia Chordata Mammalia Afrosoricida Tenrecidae      Geogale
## 2329   2342 Animalia Chordata Mammalia Afrosoricida Tenrecidae Hemicentetes
## 2330   2343 Animalia Chordata Mammalia Afrosoricida Tenrecidae    Microgale
## 2331   2344 Animalia Chordata Mammalia Afrosoricida Tenrecidae    Microgale
## 2332   2345 Animalia Chordata Mammalia Afrosoricida Tenrecidae      Setifer
##           Species FemaleMaturity_days MaleMaturity_days
## 2327     telfairi                 365               365
## 2328       aurita                  NA                NA
## 2329 semispinosus                  35                NA
## 2330      dobsoni                 669               669
## 2331     talazaci                 639               639
## 2332      setosus                 198               258
##      Gestation.Incubation_days Weaning_days Litter.Clutch_size
## 2327                        55           29               6.00
## 2328                        63           37               3.90
## 2329                        59           21               4.00
## 2330                        61           29               2.67
## 2331                        61           29               2.00
## 2332                        54           24               3.00
##      Litters.ClutchesPerYear Interlitter.Interbirth_interval BirthWeight_g
## 2327                       1                              NA          7.67
## 2328                       2                              NA          0.70
## 2329                       2                              NA         11.50
## 2330                      NA                              NA          3.95
## 2331                      NA                              NA          3.60
## 2332                       1                              NA         24.70
##      WeaningWeight_g AdultWeight_g GrowthRate_1overDays MaxLongevity_yrs Source
## 2327              50         180.0               0.0301             19.0    671
## 2328              NA           6.7                   NA               NA   <NA>
## 2329              NA         180.0                   NA              2.7    671
## 2330              NA          37.8               0.0266              5.6    434
## 2331              NA          50.0               0.0298              5.8    434
## 2332              NA         225.0               0.0419             14.1    671
##      SpecimenOrigin SampleSize  DataQuality IMR_perYr MRDT_yrs MetabolicRate_W
## 2327      captivity     medium   acceptable        NA       NA           0.750
## 2328        unknown      small          low        NA       NA           0.043
## 2329      captivity      small questionable        NA       NA           0.380
## 2330      captivity      small   acceptable        NA       NA           0.315
## 2331      captivity      small   acceptable        NA       NA           0.243
## 2332      captivity     medium   acceptable        NA       NA           0.573
##      BodyMass_g Temperature_K
## 2327      116.4        307.85
## 2328        6.9        303.95
## 2329      116.4        308.15
## 2330       44.6        304.05
## 2331       44.0        303.95
## 2332      427.6        305.35
# List the distinct Class values that remain after subsetting.
unique(AnAge_mammalia[["Class"]])
## [1] "Mammalia"
# Just double checking that only Mammalia comes up now that I subset it. 

b.

# Show the column (variable) names of the mammal subset.
names(AnAge_mammalia)
##  [1] "HAGRID"                          "Kingdom"                        
##  [3] "Phylum"                          "Class"                          
##  [5] "Order"                           "Family"                         
##  [7] "Genus"                           "Species"                        
##  [9] "FemaleMaturity_days"             "MaleMaturity_days"              
## [11] "Gestation.Incubation_days"       "Weaning_days"                   
## [13] "Litter.Clutch_size"              "Litters.ClutchesPerYear"        
## [15] "Interlitter.Interbirth_interval" "BirthWeight_g"                  
## [17] "WeaningWeight_g"                 "AdultWeight_g"                  
## [19] "GrowthRate_1overDays"            "MaxLongevity_yrs"               
## [21] "Source"                          "SpecimenOrigin"                 
## [23] "SampleSize"                      "DataQuality"                    
## [25] "IMR_perYr"                       "MRDT_yrs"                       
## [27] "MetabolicRate_W"                 "BodyMass_g"                     
## [29] "Temperature_K"
# Count the distinct values in every column (unique() keeps NA as one value).
# vapply() guarantees a named integer vector with exactly one count per
# column, whereas sapply() silently changes its return type on unusual input.
vapply(AnAge_mammalia, function(x) length(unique(x)), integer(1L))
##                          HAGRID                         Kingdom 
##                            1327                               1 
##                          Phylum                           Class 
##                               1                               1 
##                           Order                          Family 
##                              28                             130 
##                           Genus                         Species 
##                             643                            1111 
##             FemaleMaturity_days               MaleMaturity_days 
##                             382                             249 
##       Gestation.Incubation_days                    Weaning_days 
##                             279                             254 
##              Litter.Clutch_size         Litters.ClutchesPerYear 
##                             120                              72 
## Interlitter.Interbirth_interval                   BirthWeight_g 
##                             262                             625 
##                 WeaningWeight_g                   AdultWeight_g 
##                             322                             861 
##            GrowthRate_1overDays                MaxLongevity_yrs 
##                             200                             355 
##                          Source                  SpecimenOrigin 
##                              48                               3 
##                      SampleSize                     DataQuality 
##                               5                               4 
##                       IMR_perYr                        MRDT_yrs 
##                              12                              13 
##                 MetabolicRate_W                      BodyMass_g 
##                             396                             392 
##                   Temperature_K 
##                              86
# I guess I am struggling to understand this question. My instinct is to say species, because that column has far more unique values than most of the others? I am just not understanding, because there are other columns that represent other variables.

# Size of the mammal subset: number of rows first, then number of columns.
dim(AnAge_mammalia)[1L]
## [1] 1327
dim(AnAge_mammalia)[2L]
## [1] 29

c. 

# Histogram of the raw interlitter/interbirth interval. Missing values are
# removed before the vector is handed to hist().
hist(
  x = with(
    AnAge_mammalia,
    Interlitter.Interbirth_interval[!is.na(Interlitter.Interbirth_interval)]
  ),
  breaks = 10,
  col = "orange",
  border = "orange4",
  main = "Frequency Distribution of Interlitter and Interbirth Interval",
  xlab = "Interlitter and Interbirth Interval (Days)",
  ylab = "Frequency"
)

# I can see that the data is skewed right (this seems to be the case so far in all of our datasets), so I am going to log transform it. Also, I know you said that I don't have to keep using !is.na once I have already done it once, but my brain refuses to trust the process, so I am probably going to keep putting it in every time.

# Natural-log transform of the non-missing interlitter/interbirth intervals.
# NAs are removed before taking the log, so log_inter contains no NA.
log_inter <- with(
  AnAge_mammalia,
  log(Interlitter.Interbirth_interval[!is.na(Interlitter.Interbirth_interval)])
)

# Histogram of the log-transformed values. The returned histogram object is
# kept in hist_info. The x-axis label now has balanced parentheses.
hist_info <- hist(
  log_inter,
  main = "Log-Transformed Freq Distribution of Interlitter and Interbirth Interval",
  xlab = "Log(Interlitter and Interbirth Interval (Days))",
  col = "orange",
  breaks = 10
)

# Back-transform to the original scale: exp(mean(log x)) is the geometric mean.
exp(mean(log_inter, na.rm = TRUE))
## [1] 249.6819
# exp(sd(log x)) is the geometric standard deviation, a multiplicative factor
# rather than a number of days.
exp(sd(log_inter, na.rm = TRUE))
## [1] 2.6388

Question 2

a.

# Keep only the four variables used in the regression analysis.
AnAge_subset <- AnAge_mammalia[c(
  "Interlitter.Interbirth_interval",
  "Gestation.Incubation_days",
  "Litters.ClutchesPerYear",
  "MaxLongevity_yrs"
)]

# Drop every row that has a missing value in any of those four variables.
AnAge_clean <- AnAge_subset[stats::complete.cases(AnAge_subset), , drop = FALSE]

# Upper-panel function for pairs(): writes the correlation of the two panel
# variables in the centre of the panel instead of drawing a scatter plot.
#
# Note on the earlier "argument 1 does not name a graphical parameter"
# warnings: they came from restoring the coordinates with par(usr), which
# hands par() a bare numeric vector instead of a named setting, so nothing was
# restored and a warning was raised for every panel. par(usr = ...) returns
# the previous setting as a named list; passing that list back to par() in
# on.exit() restores it cleanly. Temporarily switching the panel to the unit
# square is what lets the label sit at (0.5, 0.5) without being cut off.
#
# Arguments:
#   x, y     Numeric vectors supplied by pairs() for the current panel.
#   digits   Number of significant digits used to format the correlation.
#   prefix   Text prepended to the formatted correlation (default: none).
#   cex.cor  Character expansion of the label; 2.0 is used when not supplied.
#   ...      Absorbs extra arguments that pairs() forwards to every panel
#            (e.g. pch, col); they are deliberately not passed on to text().
panel.cor <- function(x, y, digits = 2, prefix = "", cex.cor, ...) {
  # Switch to unit-square coordinates and guarantee they are put back on exit.
  old_par <- par(usr = c(0, 1, 0, 1))
  on.exit(par(old_par), add = TRUE)

  r <- cor(x, y, use = "complete.obs")
  # Formatting r together with a reference number fixes the digits shown.
  txt <- paste0(prefix, format(c(r, 0.123456789), digits = digits)[1])
  if (missing(cex.cor)) cex.cor <- 2.0
  text(0.5, 0.5, txt, cex = cex.cor, col = "darkgreen", adj = c(0.5, 0.5))
}

# Scatter-plot matrix of the complete-case data. The upper triangle is drawn
# by panel.cor, so it shows correlation values instead of scatter plots.
pairs(
  x = AnAge_clean,
  upper.panel = panel.cor,
  pch = 16,
  col = "darkgreen",
  main = "Scatter Plot Matrix w/ Pearson"
)
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter

b.

# Additive multiple regression: interbirth interval explained by gestation
# length, litters per year and maximum longevity (complete cases only).
multi_regression_model <- lm(
  formula = Interlitter.Interbirth_interval ~ Gestation.Incubation_days +
    Litters.ClutchesPerYear + MaxLongevity_yrs,
  data = AnAge_clean
)
summary(object = multi_regression_model)
## 
## Call:
## lm(formula = Interlitter.Interbirth_interval ~ Gestation.Incubation_days + 
##     Litters.ClutchesPerYear + MaxLongevity_yrs, data = AnAge_clean)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -547.16 -107.76  -21.99   68.70 1096.92 
## 
## Coefficients:
##                           Estimate Std. Error t value Pr(>|t|)    
## (Intercept)               235.8290    22.7145  10.382  < 2e-16 ***
## Gestation.Incubation_days   0.7340     0.1021   7.191 2.82e-12 ***
## Litters.ClutchesPerYear   -66.1178     7.5775  -8.726  < 2e-16 ***
## MaxLongevity_yrs            6.0942     0.6202   9.826  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 175.3 on 437 degrees of freedom
## Multiple R-squared:  0.6575, Adjusted R-squared:  0.6552 
## F-statistic: 279.7 on 3 and 437 DF,  p-value: < 2.2e-16
# The R2 value is 0.6575 (65.75%). Yes, the coefficient estimates are significant because their p-values are smaller than 5%

c. 

# Draw the diagnostic plots for the additive regression model.
plot(x = multi_regression_model)

# Residuals vs. Fitted plot- First assumption is violated (linear relationship) 
# Q-Q plot- 3rd assumption is violated (errors have mean of 0 & are normally distributed)
# Scale-location plot- I think the 4th assumption might be violated (error terms). I can't visualize a cone/fan shape that would indicate clear heteroscedasticity, but it also doesn't really seem to me to have constant variance.

Question 3

a.

# Add the natural log of litters/clutches per year as a new column, then refit
# the additive model with the logged predictor in place of the raw one.
AnAge_clean[["Log_Litters.ClutchesPerYear"]] <-
  log(AnAge_clean[["Litters.ClutchesPerYear"]])
AnAge_clean_log <- lm(
  formula = Interlitter.Interbirth_interval ~ Gestation.Incubation_days +
    Log_Litters.ClutchesPerYear + MaxLongevity_yrs,
  data = AnAge_clean
)
summary(object = AnAge_clean_log)
## 
## Call:
## lm(formula = Interlitter.Interbirth_interval ~ Gestation.Incubation_days + 
##     Log_Litters.ClutchesPerYear + MaxLongevity_yrs, data = AnAge_clean)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -475.74  -85.26  -16.06   48.68  853.94 
## 
## Coefficients:
##                               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                  290.45875   15.69995  18.501  < 2e-16 ***
## Gestation.Incubation_days      0.33517    0.08566   3.913 0.000106 ***
## Log_Litters.ClutchesPerYear -257.17397   13.50163 -19.048  < 2e-16 ***
## MaxLongevity_yrs               3.43514    0.52510   6.542  1.7e-10 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 140.4 on 437 degrees of freedom
## Multiple R-squared:  0.7803, Adjusted R-squared:  0.7788 
## F-statistic: 517.3 on 3 and 437 DF,  p-value: < 2.2e-16

b.

# I am not going to drop any of my independent variables, because their p-values are all below 0.05 and therefore they are all statistically significant.

# Same model as before, now also including the interaction between gestation
# length and log litters per year (the `*` adds both main effects and the
# interaction term).
AnAge_clean_log_interaction_term <- lm(
  formula = Interlitter.Interbirth_interval ~ Gestation.Incubation_days *
    Log_Litters.ClutchesPerYear + MaxLongevity_yrs,
  data = AnAge_clean
)
summary(object = AnAge_clean_log_interaction_term)
## 
## Call:
## lm(formula = Interlitter.Interbirth_interval ~ Gestation.Incubation_days * 
##     Log_Litters.ClutchesPerYear + MaxLongevity_yrs, data = AnAge_clean)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -481.43  -59.15  -12.57   34.46  795.98 
## 
## Coefficients:
##                                                         Estimate Std. Error
## (Intercept)                                            312.38870   14.98132
## Gestation.Incubation_days                                0.08427    0.08645
## Log_Litters.ClutchesPerYear                           -210.73120   13.97737
## MaxLongevity_yrs                                         2.84969    0.49788
## Gestation.Incubation_days:Log_Litters.ClutchesPerYear   -0.53883    0.06881
##                                                       t value Pr(>|t|)    
## (Intercept)                                            20.852  < 2e-16 ***
## Gestation.Incubation_days                               0.975     0.33    
## Log_Litters.ClutchesPerYear                           -15.077  < 2e-16 ***
## MaxLongevity_yrs                                        5.724 1.94e-08 ***
## Gestation.Incubation_days:Log_Litters.ClutchesPerYear  -7.830 3.74e-14 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 131.7 on 436 degrees of freedom
## Multiple R-squared:  0.8074, Adjusted R-squared:  0.8056 
## F-statistic: 456.9 on 4 and 436 DF,  p-value: < 2.2e-16
# Gestation incubation days is not significant (p = 0.33), but max longevity, log clutches per year, and the interaction term all are.

c. 

# Draw the diagnostic plots for the model with the interaction term.
plot(x = AnAge_clean_log_interaction_term)

# Residuals vs. Fitted plot- There is super minimal improvement and there is still a violation. 
# Q-Q plot- Same thing, improvement but still violation. 
# Scale-location plot- Same as the previous two. 
# Residual vs. Leverage- There is a lot of difference from before, but I honestly have no idea what violations I should be looking for with this specific plot (I wasn't in class the day we discussed violations). 

d. 

# The interaction term shows that both the gestation incubation days and the clutches per year affect the interlitter/interbirth interval. When the number of clutches per year is higher, the effect of gestation time on the birth interval becomes smaller. So, when these mammals have more clutches per year, the length of gestation won’t matter as much for the time between births. We can tell this because the interaction term has a negative coefficient, meaning that when clutches per year increases, the effect of gestation incubation days on birth intervals decreases. Since the interaction term is also statistically significant, we can assume that this is not due to chance. 

Question 4

a.

library(car)
## Loading required package: carData
# Variance inflation factors for the additive model with the logged predictor.
car::vif(AnAge_clean_log)
##   Gestation.Incubation_days Log_Litters.ClutchesPerYear 
##                    2.219035                    2.103391 
##            MaxLongevity_yrs 
##                    2.269946
# Since the VIF values of all of the independent variables are well below 10, we can say that the independent variables are not collinear.

b.

# Variance partitioning for gestation length and log litters per year, based
# on the R-squared of each single-predictor model and of the model with both.
gestation_r2 <- summary(lm(
  formula = Interlitter.Interbirth_interval ~ Gestation.Incubation_days,
  data = AnAge_clean
))[["r.squared"]]
litters_r2 <- summary(lm(
  formula = Interlitter.Interbirth_interval ~ Log_Litters.ClutchesPerYear,
  data = AnAge_clean
))[["r.squared"]]
full_r2 <- summary(lm(
  formula = Interlitter.Interbirth_interval ~ Gestation.Incubation_days +
    Log_Litters.ClutchesPerYear,
  data = AnAge_clean
))[["r.squared"]]

# Unique share of a predictor = R-squared of the two-predictor model minus the
# R-squared of the model that leaves that predictor out.
litters_variance <- full_r2 - gestation_r2
gestation_variance <- full_r2 - litters_r2
# Shared (joint) share = gestation's total R-squared minus its unique share.
joint_variance <- gestation_r2 - gestation_variance


print(gestation_variance)
## [1] 0.02929968
print(litters_variance)
## [1] 0.2782203
print(joint_variance)
## [1] 0.4512559