IP_selection

# Selection on Phenotypic traits in Impatiens capensis 
# Evolution Course 2020

# First written: 2018 10 09
# Author: Vince Formica

#This line will erase EVERYTHING in your working environment.  Ignore for now : )
rm(list=ls())

# GOALS ####
# 1) Conduct a selection analysis on Impatiens data for 2108 and 2014
#   1.A) Conduct linear and non-linear selection 
# 2) Make graphs and interpret stats for selection analyses


# Data and prep ####

#Read in the data for both 2018 and 2014 
data_2018<-read.csv("/home/biol034f20/2018.csv")
data_2014<-read.csv("/home/biol034f20/2014.csv")


library(lme4)  # statistics we might need

## Loading required package: Matrix

library(car)   # let's us conduct a type III sums of squares biologists use

## Loading required package: carData

library(lattice) # plotting 
library(latticeExtra) #extra plotting package

## Loading required package: RColorBrewer

library(ggplot2) #the best plotting package

## 
## Attaching package: 'ggplot2'

## The following object is masked from 'package:latticeExtra':
## 
##     layer

library(visreg) # makes easy figures from a regression model-----> what's cool about this package is that it makes figures with the "marginal means" in other words it is showing you the values adjusted for the other variables in your model.



#Let's look at the data in the spreadsheet form
#View(data_2018)

#Take a look at the structure of the data
str(data_2018)

## 'data.frame':    66 obs. of  23 variables:
##  $ ID                                           : Factor w/ 7 levels "CC","CESJ","ESE",..: 5 5 7 2 3 3 5 6 7 7 ...
##  $ Lab.section                                  : Factor w/ 2 levels "Tuesday","Wednesday": 2 2 2 1 1 1 2 1 2 2 ...
##  $ Plant..                                      : int  25 32 8 22 10 12 23 11 18 2 ...
##  $ Height                                       : num  177 206 200 125 114 ...
##  $ Hypo.length                                  : num  6.63 7.66 5.97 5.44 10.8 5.6 6.48 4 5.24 3.77 ...
##  $ Internode                                    : num  7.98 4.52 12.1 14.29 7.4 ...
##  $ Calax1                                       : num  23.1 22.4 NA 22.7 10.8 20.7 15.5 18.6 NA 13.2 ...
##  $ Petal.h1                                     : num  16.7 16.9 NA 15.9 14.1 9.5 16.7 15.9 NA 19 ...
##  $ Calax2                                       : num  17.1 NA NA 20.1 17.8 NA 16.2 20.3 NA 13.6 ...
##  $ Petal.h2                                     : num  9.4 NA NA 15.2 13.1 NA 16.9 13.5 NA 15.1 ...
##  $ Calax3                                       : num  NA NA NA 22.4 18.2 NA NA 17.7 NA 11.1 ...
##  $ Petal.h3                                     : num  NA NA NA 21 11.7 NA NA 19.1 NA 15.9 ...
##  $ Seed.no..1                                   : int  3 1 2 1 3 2 5 3 1 1 ...
##  $ Seed.no..2                                   : int  3 4 1 2 1 3 2 2 4 1 ...
##  $ Seed.no..3                                   : int  3 1 3 3 2 3 4 3 1 4 ...
##  $ No.flowers                                   : int  4 1 1 69 3 1 3 16 0 3 ...
##  $ FRUIT...pedicels..empty.fruit.stalks....fruit: int  626 723 591 523 427 266 193 258 339 322 ...
##  $ lifetime.fruit...flowers                     : int  630 724 592 592 430 267 196 274 339 325 ...
##  $ Notes                                        : Factor w/ 7 levels "","shade","Shade",..: 2 5 2 5 5 5 2 6 2 5 ...
##  $ Ave.calax                                    : num  20.1 22.4 NA 21.7 15.6 ...
##  $ Ave.petal                                    : num  16.7 16.9 NA 15.9 14.1 ...
##  $ Mean.seeds.per.fruit                         : num  3 2 2 2 2 ...
##  $ Total.est.seeds                              : num  1878 1446 1182 1046 854 ...

#str(data_2014)

#Make some quick plots of seeds and traits 
## You should always 
qplot(x=Ave.calax, y=Total.est.seeds, data=data_2018) +theme_classic()

## Warning: Removed 24 rows containing missing values (geom_point).

qplot(x=Height, y=Total.est.seeds, data=data_2018) +theme_classic()

## Warning: Removed 2 rows containing missing values (geom_point).

qplot(x=Ave.petal, y=Total.est.seeds, data=data_2018) +theme_classic()

## Warning: Removed 24 rows containing missing values (geom_point).

qplot(x=Hypo.length, y=Total.est.seeds, data=data_2018) +theme_classic()

## Warning: Removed 2 rows containing missing values (geom_point).

#Histogram of raw data
histogram(~Total.est.seeds, data=data_2018, breaks=20)

#Histogram of log transformed data
histogram(~log10(Total.est.seeds), data=data_2018,breaks=20)

#So we log transform the data
data_2018$log.total.seeds<-log10(data_2018$Total.est.seeds)

#Remember that non-linear selection asks if traits closer or farther from the mean, so we need to center all of the traits from their mean.  


#Make a histogram of Ave.calax in 2018

histogram(~Ave.calax, data=data_2018, breaks=20)

#The mean function does NOT work if you have NAs (missing values) in your column ANYWHERE.  UNLESS you feed it the na.rm=TRUE argument.  Try it:

mean(data_2018$Ave.calax)  # Do you get an error?

## [1] NA

# Now try:
mean(data_2018$Ave.calax, na.rm=TRUE)  #--> This tells R to ignore the NAs (blanks) in the data set.

## [1] 17.00426

#To center our traits we simply subtract the mean from each value.  You could this the long way  (for each variable):

data_2018$center.calax<-data_2018$Ave.calax-mean(data_2018$Ave.calax, na.rm=TRUE)

#Now Make a histogram of the centered trait:

histogram(~center.calax, data=data_2018, breaks=20)

# The MUCH easier way is to use the within function, that lest us do all of the calculations in one line of code.

#center variables (subtract mean from each value for all traits)

data_2018<-within(data_2018,{
  center.calax=Ave.calax-mean(Ave.calax, na.rm=TRUE)
  center.petal=Ave.petal-mean(Ave.petal, na.rm=TRUE)
  center.height=Height-mean(Height, na.rm=TRUE)
  center.hypo=Hypo.length-mean(Hypo.length, na.rm=TRUE)
})



# Selection analysis ####


#Differentials (one trait)
diff.height.2018<-lm(log.total.seeds~center.height, data=data_2018)

# Get P Values with type III SS (sums of squares)
Anova(diff.height.2018, type=3)

## Anova Table (Type III tests)
## 
## Response: log.total.seeds
##               Sum Sq Df  F value    Pr(>F)    
## (Intercept)   343.31  1 2061.012 < 2.2e-16 ***
## center.height   2.34  1   14.027 0.0003971 ***
## Residuals      10.33 62                       
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

# Get slope or ("estimate" of trait)  IGNORE THIS P VALUE!! It will be WRONG some times!!!  Use the Anova function from the car package
summary(diff.height.2018)

## 
## Call:
## lm(formula = log.total.seeds ~ center.height, data = data_2018)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1.0534 -0.1182  0.0303  0.2053  0.8294 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   2.316230   0.051020  45.398  < 2e-16 ***
## center.height 0.004070   0.001087   3.745 0.000397 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.4081 on 62 degrees of freedom
##   (2 observations deleted due to missingness)
## Multiple R-squared:  0.1845, Adjusted R-squared:  0.1714 
## F-statistic: 14.03 on 1 and 62 DF,  p-value: 0.0003971

# Make a graph of the model.
visreg(diff.height.2018)

# Just to be VERY specific.  You get the beta values from summary function (estimates) and the P values (and df, F value) from the Anova fucntion (with a CAPITAL A for Anova)

# What does the slope of the center.height value MEAN in words?  Especially given that total seeds is on a log scale?  Remember the slope is the DIFFERENTIAL or S.


10^0

## [1] 1

10^1

## [1] 10

10^2

## [1] 100

10^3

## [1] 1000

10^0.004070

## [1] 1.009416

# Calculate the differential for Calax
diff.calax.2018<-lm(log.total.seeds~center.calax, data=data_2018)

Anova(diff.calax.2018, type=3)

## Anova Table (Type III tests)
## 
## Response: log.total.seeds
##               Sum Sq Df   F value  Pr(>F)    
## (Intercept)  234.447  1 1388.5104 < 2e-16 ***
## center.calax   0.685  1    4.0598 0.05067 .  
## Residuals      6.754 40                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

summary(diff.calax.2018)

## 
## Call:
## lm(formula = log.total.seeds ~ center.calax, data = data_2018)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.24502 -0.22149 -0.04188  0.26790  0.80076 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   2.36285    0.06341  37.263   <2e-16 ***
## center.calax  0.03556    0.01765   2.015   0.0507 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.4109 on 40 degrees of freedom
##   (24 observations deleted due to missingness)
## Multiple R-squared:  0.09214,    Adjusted R-squared:  0.06945 
## F-statistic:  4.06 on 1 and 40 DF,  p-value: 0.05067

visreg(diff.calax.2018)

# Now conduct a multiple regression for those two together.
mod.grad.2018<-lm(log.total.seeds~center.height+center.calax, data=data_2018)

Anova(mod.grad.2018, type=3)

## Anova Table (Type III tests)
## 
## Response: log.total.seeds
##                Sum Sq Df   F value  Pr(>F)    
## (Intercept)   224.279  1 1493.2744 < 2e-16 ***
## center.height   0.896  1    5.9684 0.01920 *  
## center.calax    0.972  1    6.4732 0.01502 *  
## Residuals       5.858 39                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

#Which trait will change more in the next generation if they were both equally heritable?
summary(mod.grad.2018)

## 
## Call:
## lm(formula = log.total.seeds ~ center.height + center.calax, 
##     data = data_2018)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.15191 -0.19993  0.00459  0.22820  0.78207 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   2.404912   0.062234  38.643   <2e-16 ***
## center.height 0.003954   0.001619   2.443   0.0192 *  
## center.calax  0.043064   0.016926   2.544   0.0150 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3875 on 39 degrees of freedom
##   (24 observations deleted due to missingness)
## Multiple R-squared:  0.2126, Adjusted R-squared:  0.1723 
## F-statistic: 5.266 on 2 and 39 DF,  p-value: 0.009449

par(mfrow=c(1,2)) # this just makes the two plots next to each other.  TO turn this off use dev.off()
visreg(mod.grad.2018, "center.calax")

xyplot(center.calax~center.height, data=data_2018)

# Remember Log = 10^estimate = # of seeds per unit of trait (mm or cm)


# non-linear for 2018  - you need to put the squared terms in an I and ()s  to tell R you want them considered separately
mod.non.linear.2018<-lm(log.total.seeds~center.height+center.calax+I(center.height^2)+I(center.calax^2), data=data_2018)

Anova(mod.non.linear.2018, type=3)

## Anova Table (Type III tests)
## 
## Response: log.total.seeds
##                    Sum Sq Df  F value  Pr(>F)    
## (Intercept)        73.565  1 529.9734 < 2e-16 ***
## center.height       0.813  1   5.8569 0.02054 *  
## center.calax        0.885  1   6.3786 0.01596 *  
## I(center.height^2)  0.453  1   3.2652 0.07890 .  
## I(center.calax^2)   0.303  1   2.1848 0.14784    
## Residuals           5.136 37                     
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

summary(mod.non.linear.2018)

## 
## Call:
## lm(formula = log.total.seeds ~ center.height + center.calax + 
##     I(center.height^2) + I(center.calax^2), data = data_2018)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1.2147 -0.1441 -0.0015  0.2183  0.6468 
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)         2.398e+00  1.042e-01  23.021   <2e-16 ***
## center.height       4.114e-03  1.700e-03   2.420   0.0205 *  
## center.calax        4.164e-02  1.649e-02   2.526   0.0160 *  
## I(center.height^2)  6.335e-05  3.506e-05   1.807   0.0789 .  
## I(center.calax^2)  -6.823e-03  4.616e-03  -1.478   0.1478    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3726 on 37 degrees of freedom
##   (24 observations deleted due to missingness)
## Multiple R-squared:  0.3096, Adjusted R-squared:  0.235 
## F-statistic: 4.149 on 4 and 37 DF,  p-value: 0.007094

par(mfrow=c(1,2)) # this just makes the two plots next to each other.  TO turn this off use dev.off()

visreg(mod.non.linear.2018, main="2018")

#dev.off()


### Visualize your data to make it look like how we were talking about quadratic (non-linear selection in class...in other words, no curves but actually squared terms)


#instead of using the I() terms we are going to actually square the terms, long hand.  This will let use visualize the data in the 
#Step one, make new columns where you square the terms:

#WHOA BIG MISTAKE BELOW!! I've commented it out so you could see it!

#data_2018$center.calax2<-data_2018$center.height^2
data_2018$center.height2<-data_2018$center.height^2
data_2018$center.calax2<-data_2018$center.calax^2


#Step 2, rerun the analysis and give the model a new name (I'll attach on a B).  THis is the same model above.  I've just gotten rid of the I() terms and replaced them with our new squared columns
mod.non.linear.2018B<-lm(log.total.seeds~center.height+center.calax+center.height2+center.calax2, data=data_2018)

#The results won't change....But how Visreg displays the data will.  Instead of mapping those curved lines, it will show you what the math is ACTUALLY doing: fitting straight lines to the converted (quadratic data):

par(mfrow=c(2,2)) # this just makes the two plots next to each other and 2 below.  The first two is how many rows of figures you want and the second row is how many columns you want.  TO turn this off use dev.off()
visreg(mod.non.linear.2018B, main="2018")

# I'm not sure why this isn't display on the RPUBS page but if you try it, it should work for you.


#dev.off()


# # Correlational selection for 2014  (optional and fun)  This assumes that you have data for 2014 loaded and ready to go.  If you don't then go ahead and write the code for 2014 (just like I did for 2018).  The reason I'm doing 2014 data here is that 2018 just isn't interesting in 3D.
# #I've had to #it out since I don't have hte 2014 data loaded, but try the code to make fun 3D graphs.
# 
# mod.cor.linear.2018<-lm(log.total.seeds~center.height+center.calax+I(center.height^2)+I(center.calax^2)+center.height:center.calax, data=data_2018)
# 
# Anova(mod.cor.linear.2018, type=3)
# dev.off()
# visreg2d(mod.non.linear.2014, x="center.calax", y= "center.height", plot.type="persp", main="2014", theta = 37, phi =20) #--->change the theta to rotate the graph
# 
# 
# par(mfrow=c(2,2)) # this just makes the two plots next to each other.  TO turn this off use dev.off()
# visreg(mod.non.linear.2014, main="2014")
# visreg(mod.non.linear.2018, main="2018")
# dev.off()

IP_selection_2020.R

vformic1

2020-10-15