PRACTICE PROBLEMS – R language
library(ISwR)
#call ISwR package built-in dataset "thuesen"
write.table(thuesen, file = "results", quote = F, col.names = T, row.names = T, sep = "\t")
#open in a text editor, manually change the NA value to ".", and resave as "resultsfinal.txt"
resultsfinal<-read.table("resultsfinal.txt", header = T)
print(resultsfinal)
## blood.glucose short.velocity
## 1 15.3 1.76
## 2 10.8 1.34
## 3 8.1 1.27
## 4 19.5 1.47
## 5 7.2 1.27
## 6 5.3 1.49
## 7 9.3 1.31
## 8 11.1 1.09
## 9 7.5 1.18
## 10 12.2 1.22
## 11 6.7 1.25
## 12 5.2 1.19
## 13 19.0 1.95
## 14 15.1 1.28
## 15 6.7 1.52
## 16 8.6 .
## 17 4.2 1.12
## 18 10.3 1.37
## 19 12.5 1.19
## 20 16.1 1.05
## 21 13.3 1.32
## 22 4.9 1.03
## 23 8.8 1.12
## 24 9.5 1.7
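As an alternative to hand-editing the file and re-reading it, read.table can convert the "." placeholder to NA on import via its na.strings argument; a minimal sketch (resultsfinal2 is a hypothetical name):
#alternative: let read.table translate "." to NA on import
resultsfinal2 <- read.table("resultsfinal.txt", header = T, na.strings = ".")
#short.velocity is then numeric, with a true NA in row 16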
t<-1:20
r1<-(0.5)
r2<-(0.8)
r3<-(-0.1)
line0.5 <-(10*exp(r1*t))
line0.8 <-(10*exp(r2*t))
lineneg0.1 <-(10*exp(r3*t))
# plot the first curve by calling plot() function
# First curve is plotted
plot(t, line0.5, type="o", col="blue", pch="o", lty=1, ylim=c(0,1000), xlab = "Time (days)", ylab = "Population Total" )
# Add second curve to the same plot by calling points() and lines()
# Use symbol '*' for points.
points(t, line0.8, col="red", pch="*")
lines(t, line0.8, col="red",lty=2)
# Add Third curve to the same plot by calling points() and lines()
# Use symbol '+' for points.
points(t, lineneg0.1, col="green",pch="+")
lines(t, lineneg0.1, col="green", lty=3)
t<-1:20
r1<-(0.5)
r2<-(0.8)
r3<-(0.4)
line0.5 <-(1000*10)/(10+(1000-10)*exp(-r1*t))
line0.8 <-(1000*10)/(10+(1000-10)*exp(-r2*t))
line0.4 <-(1000*10)/(10+(1000-10)*exp(-r3*t))
# plot the first curve by calling plot() function
# First curve is plotted
plot(t, line0.5, type="o", col="blue", pch="o", lty=1, ylim=c(0,1100), xlab = "Time (days)", ylab = "Population Total" )
# Add second curve to the same plot by calling points() and lines()
# Use symbol '*' for points.
points(t, line0.8, col="red", pch="*")
lines(t, line0.8, col="red",lty=2)
# Add Third curve to the same plot by calling points() and lines()
# Use symbol '+' for points.
points(t, line0.4, col="green",pch="+")
lines(t, line0.4, col="green", lty=3)
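The three logistic curves can also be drawn in a single call with matplot() instead of plot()/points()/lines(); a minimal sketch using the vectors defined above:
#draw all three logistic curves at once
matplot(t, cbind(line0.5, line0.8, line0.4), type = "o",
        pch = c("o", "*", "+"), lty = 1:3, col = c("blue", "red", "green"),
        xlab = "Time (days)", ylab = "Population Total", ylim = c(0, 1100))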
n=5000
sum_n <- sum(1:n)
print(sum_n)
## [1] 12502500
x = 5
sqrt_round <- round(sqrt(x), 0)
print(sqrt_round)
## [1] 2
PRACTICE PROBLEMS – Fundamentals of statistics
library(vegan)
## Loading required package: permute
## Loading required package: lattice
## This is vegan 2.5-6
library(car)
## Loading required package: carData
library(plyr)
#create vector of measurements
UndRates <- c(0.9, 1.4, 1.2, 1.2, 1.3, 2.0, 1.4, 1.6)
#graph histogram of measurements
hist(UndRates, xlab = "Undulation Rate (Hertz)", col = "green", main = "Histogram of Undulation Rates" )
#Mean of rates
mean(UndRates)
## [1] 1.375
#Minimum rate
min(UndRates)
## [1] 0.9
#Maximum rate
max(UndRates)
## [1] 2
#Range of rates
range(UndRates)
## [1] 0.9 2.0
#Standard deviation of rates
sd(UndRates)
## [1] 0.324037
#function - calculate coeff of variation
cveq <- function(x){sd(x)/mean(x)}
#run function on measurements
cveq(UndRates)
## [1] 0.2356633
#Create a vector of Blood pressure measurements
BlPr <- c(112, 128, 108, 129, 125, 153, 155, 132, 137)
#Number of Measurements/individuals
length(BlPr)
## [1] 9
There are 9 individuals in the sample
#Calculate mean of sample
mean(BlPr)
## [1] 131
#Calculate variance of sample
var(BlPr)
## [1] 254.5
#Calculate standard deviation of sample
sd(BlPr)
## [1] 15.95306
#Use function (cveq <- function(x){sd(x)/mean(x)}) to calculate coefficient of variation
cveq(BlPr)
## [1] 0.1217791
#call dataset "DesertBirdAbundance.csv"
DesertBird <- read.table("DesertBirdAbundance.csv", header = T, sep = ",")
#Create histogram of DesertBird dataset
hist(DesertBird$abundance, xlab = "Abundance", main = "Histogram of Desert Bird Abundance", col = "lavender")
#mean of Desert Bird Abundance
mean(DesertBird$abundance)
## [1] 74.76744
#median of Desert Bird Abundance
median(DesertBird$abundance)
## [1] 18
The median is a better representation of the middle because the data are skewed to the right (there appears to be at least one high outlier).
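A quick numerical check of that skew, assuming the DesertBird data frame loaded above:
#the largest abundances sit far above the median of 18
sort(DesertBird$abundance, decreasing = TRUE)[1:5]
#a 10% trimmed mean discounts the extremes and moves toward the median
mean(DesertBird$abundance, trim = 0.1)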
#Calculate Range of Desert Bird Abundance data
range(DesertBird$abundance)
## [1] 1 625
#Calculate Standard Deviation of Desert Bird Abundance data
sd(DesertBird$abundance)
## [1] 121.9473
#Calculate Variance of Desert Bird Abundance data
var(DesertBird$abundance)
## [1] 14871.14
#Use function (cveq <- function(x){sd(x)/mean(x)}) to calculate coefficient of variation of Desert Bird Abundance
cveq(DesertBird$abundance)
## [1] 1.631021
#P(Z > 3) for a standard normal distribution
1-pnorm(3)
## [1] 0.001349898
#P(X > 42) for X ~ Normal(mean = 35, sd = 6)
1-pnorm(42, mean=35, sd=6, lower.tail=T)
## [1] 0.1216725
#P(exactly 10 successes in 10 trials, success probability 0.8)
dbinom(10, size=10, prob=0.8)
## [1] 0.1073742
#P(chi-squared with 2 df exceeds 6.5)
pchisq(6.5, df=2, lower.tail=F)
## [1] 0.03877421
#Central Limit Theorem, probability of 0.9, 5 samples, 10 trials, 10 iterations
clt1 <-numeric(10)
for (i in 1:10) {
clt1[i]<-mean(rbinom(5, size=10, prob=0.9))
}
hist(clt1, main = "Central Limit Theorem, Probability of 0.9, 5 Samples, 10 trials, 10 iterations", col="blue", xlab = "samples")
#Central Limit Theorem, probability of 0.9, 5 samples, 10 trials, 100 iterations
clt100 <-numeric(100)
for (i in 1:100) {
clt100[i]<-mean(rbinom(5, size=10, prob=0.9))
}
hist(clt100, main = "Central Limit Theorem, Probability of 0.9, 5 Samples, 10 trials, 100 iterations", col="red", xlab = "samples")
#Central Limit Theorem, probability of 0.9, 5 samples, 10 trials, 1000 iterations
clt1000 <-numeric(1000)
for (i in 1:1000) {
clt1000[i]<-mean(rbinom(5, size=10, prob=0.9))
}
hist(clt1000, main = "Central Limit Theorem, Probability of 0.9, 5 Samples, 10 trials, 1000 iterations", col="red", xlab = "samples")
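The same simulations can be written without explicit for loops using replicate(); a minimal sketch equivalent to the 1000-iteration version (clt1000_alt is a hypothetical name):
#one-line equivalent of the loop above
clt1000_alt <- replicate(1000, mean(rbinom(5, size = 10, prob = 0.9)))
hist(clt1000_alt, main = "CLT via replicate(), 1000 iterations", col = "red", xlab = "samples")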
PRACTICE PROBLEMS – Statistical tests
#Create vector of temperature measurements
Temp <- c(98.4, 98.6, 97.8, 98.8, 97.9, 99.0, 98.2, 98.8, 98.8, 99.0, 98.0, 99.2, 99.5, 99.4, 98.4, 99.1, 98.4, 97.6, 97.4, 97.5, 97.5, 98.8, 98.6, 100.0, 98.4)
# Histogram of Temperatures
temphist<-hist(Temp, col="orange", xlab = "Body Temperature",
main="Histogram of Body Temps", freq = FALSE)
#add normal curve to histogram
curve(dnorm(x, mean = mean(Temp), sd = sd(Temp)), add = T)
qqnorm(Temp)
qqline(Temp)
shapiro.test(Temp)
##
## Shapiro-Wilk normality test
##
## data: Temp
## W = 0.97216, p-value = 0.7001
Are the data normally distributed? > Yes, the data are normally distributed: the Shapiro-Wilk test was not significant (P = 0.7001) and the histogram visually approximates a normal distribution
Are these measurements consistent with a population mean of 98.6 F?
#run t-test to compare with provided and accepted population mean temp of 98.6
t.test(Temp, mu = 98.6)
##
## One Sample t-test
##
## data: Temp
## t = -0.56065, df = 24, p-value = 0.5802
## alternative hypothesis: true mean is not equal to 98.6
## 95 percent confidence interval:
## 98.24422 98.80378
## sample estimates:
## mean of x
## 98.524
The P-value is not significant; therefore this sample does not differ from the population mean temperature of 98.6
#run binomial test to evaluate whether there is a preference between the two groups
binom.test(31, 41, p = 0.5)
##
## Exact binomial test
##
## data: 31 and 41
## number of successes = 31, number of trials = 41, p-value = 0.00145
## alternative hypothesis: true probability of success is not equal to 0.5
## 95 percent confidence interval:
## 0.5969538 0.8763675
## sample estimates:
## probability of success
## 0.7560976
Yes, in this sample the brown recluse spiders significantly prefer dead crickets over live crickets (P = 0.00145)
Patient  Placebo  Drug
1        37       5
2        52       23
3        68       40
4        4        3
5        29       38
6        32       19
7        19       9
8        52       24
9        19       17
10       12       14
# create dataframe of patients and the number of seizures observed while on placebo and on drug
sz <- data.frame("placebo"= c(37, 52, 68, 4, 29, 32, 19, 52, 19, 12), "drug" = c(5, 23, 40, 3, 38, 19, 9, 24, 17, 14))
# make box plot of data frame
boxplot(sz, ylab = "Frequency of Seizures", col = "green")
#perform t-test to compare groups
t.test(sz$placebo, sz$drug, paired = T)
##
## Paired t-test
##
## data: sz$placebo and sz$drug
## t = 2.7661, df = 9, p-value = 0.02189
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 2.404666 23.995334
## sample estimates:
## mean of the differences
## 13.2
The P-value (0.02189) is significant: patients had fewer seizures on the drug than on placebo (mean difference of 13.2)
#Make a data frame with tree species and bee colony frequency
tree <- data.frame("Habitat"= c("Oak", "Hickory", "Maple", "Red Cedar", "Poplar"), "Bee Colonies"= c(33, 30, 29, 4, 4))
#Create bar plot of data frame
barplot(tree$Bee.Colonies, names.arg = tree$Habitat, ylab = "Number of Bee Colonies", xlab = "Tree Species", main = "Bee colonies by Tree Species")
#perform chi-square test to compare groups
chisq.test(tree$Bee.Colonies)
##
## Chi-squared test for given probabilities
##
## data: tree$Bee.Colonies
## X-squared = 43.1, df = 4, p-value = 9.865e-09
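By default chisq.test tests against equal expected frequencies across the five tree species; the expected counts can be inspected directly, a minimal sketch:
#expected counts under the equal-frequency null (100 colonies / 5 species = 20 each)
chisq.test(tree$Bee.Colonies)$expected
The very small P-value (9.865e-09) indicates the colonies are far from evenly distributed across tree species.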
#Perform 10 one-sample t-tests for mu=0 on simulated standard normally distributed data of 25 observations each and get the P-value.
replicate (10, t.test(rnorm(25), mu=0)$p.value)
## [1] 0.3726329 0.6570938 0.9974094 0.2001194 0.5574921 0.1309028 0.8195544
## [8] 0.8165301 0.4001241 0.3747476
#Perform 10 one-sample t-tests for mu=0 on simulated data of 25 observations in a t-distribution with 2 degrees of freedom each and get the P-value.
replicate (10, t.test(rt(25, df=2), mu=0)$p.value)
## [1] 0.1027295 0.4741522 0.8977442 0.3046655 0.5437520 0.5487903 0.6155922
## [8] 0.7997388 0.7640467 0.7961501
#Perform 10 one-sample t-tests for mu=0 on simulated standard normally distributed data of 25 observations each and get the P-value for a total of 1000 times
sig <- replicate (1000, t.test(rnorm(25), mu=0)$p.value)
#Show significant values
sig[sig<0.05]
## [1] 0.013660418 0.016343708 0.038256308 0.043179446 0.045698514 0.038456695
## [7] 0.005935105 0.034511392 0.034828781 0.019271671 0.013396259 0.008987165
## [13] 0.021548387 0.017490234 0.039381411 0.019962753 0.008498529 0.046630800
## [19] 0.015732225 0.007834513 0.047386549 0.021892836 0.025580903 0.010473671
## [25] 0.046105644 0.016111132 0.047991389 0.007658295 0.035491113 0.046562902
## [31] 0.045448154 0.039770350 0.001074726 0.014192020 0.020237756 0.031399027
## [37] 0.026727954 0.043569962 0.022093199 0.029405842 0.022435887 0.032791205
## [43] 0.035284457 0.023866484
44 tests are significant, close to the roughly 50 (5% of 1000) expected by chance alone
#Apply a false discovery rate correction to the P-values
sig.correct <- p.adjust(sig, method = "fdr")
#Show significant values
sig.correct[sig.correct<0.05]
## numeric(0)
After adjusting, there are no significant tests
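Counting significant tests directly avoids tallying the printed vector by eye; a minimal sketch using the sig and sig.correct objects above:
#count significant raw and FDR-adjusted P-values
sum(sig < 0.05)          #44 here; about 50 (5% of 1000) expected by chance
sum(sig.correct < 0.05)  #0 after the FDR correction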
PRACTICE PROBLEMS – ANOVA
library(readxl)
library(tidyr)
#Call agriculture dataset "data_ANOVA.xlsx"
agri <- read_excel("data_ANOVA.xlsx")
#Convert to long format
agri.long <- gather(agri, Additive, Values, Glyphosate:Control, factor_key = T)
#Attach dataset "agri.long"
attach(agri.long)
#Create boxplot of agri.long dataset
boxplot(Values~Additive, xlab="Additive", ylab="Kudzu Value", main="Kudzu Experiment", col=c("green", "purple", "yellow"))
anova(lm(Values~Additive))
## Analysis of Variance Table
##
## Response: Values
## Df Sum Sq Mean Sq F value Pr(>F)
## Additive 2 763 381.5 54.5 1.318e-07 ***
## Residuals 15 105 7.0
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summary(lm(Values~Additive))
##
## Call:
## lm(formula = Values ~ Additive)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.500 -1.875 0.000 1.875 4.500
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 19.500 1.080 18.053 1.38e-11 ***
## AdditiveTriclopyr -4.500 1.528 -2.946 0.01 *
## AdditiveControl 11.000 1.528 7.201 3.07e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.646 on 15 degrees of freedom
## Multiple R-squared: 0.879, Adjusted R-squared: 0.8629
## F-statistic: 54.5 on 2 and 15 DF, p-value: 1.318e-07
R^2 = 0.879
#Perform Tukey post hoc test
TukeyHSD(aov(lm(Values~Additive)))
## Tukey multiple comparisons of means
## 95% family-wise confidence level
##
## Fit: aov(formula = lm(Values ~ Additive))
##
## $Additive
## diff lwr upr p adj
## Triclopyr-Glyphosate -4.5 -8.467701 -0.5322987 0.0255594
## Control-Glyphosate 11.0 7.032299 14.9677013 0.0000087
## Control-Triclopyr 15.5 11.532299 19.4677013 0.0000001
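Before trusting the ANOVA, the usual residual checks can be run on the fitted model; a minimal sketch while agri.long is still attached (agri.lm is a hypothetical name):
#residual diagnostics: residuals vs fitted and normal Q-Q
agri.lm <- lm(Values ~ Additive)
par(mfrow = c(1, 2))
plot(agri.lm, which = 1:2)
par(mfrow = c(1, 1))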
#detach dataset
detach(agri.long)
#Call dataset growth.txt
farmdiet <- read.table("growth.txt", header = T)
head(farmdiet)
## supplement diet gain
## 1 supergain wheat 17.37125
## 2 supergain wheat 16.81489
## 3 supergain wheat 18.08184
## 4 supergain wheat 15.78175
## 5 control wheat 17.70656
## 6 control wheat 18.22717
boxplot(gain~diet, data = farmdiet, col=c("red", "orange", "yellow"))
boxplot(gain~supplement, data = farmdiet, col=c("light blue", "blue", "purple", "mediumpurple1"))
boxplot(gain~diet:supplement, data = farmdiet)
anova(lm(gain~diet*supplement, data = farmdiet))
## Analysis of Variance Table
##
## Response: gain
## Df Sum Sq Mean Sq F value Pr(>F)
## diet 2 287.171 143.586 83.5201 2.999e-14 ***
## supplement 3 91.881 30.627 17.8150 2.952e-07 ***
## diet:supplement 6 3.406 0.568 0.3302 0.9166
## Residuals 36 61.890 1.719
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#create interaction plots for farmdiet dataset
interaction.plot(farmdiet$supplement, farmdiet$diet, farmdiet$gain, col=c("red","blue","green"), main = "Interaction between Diet and Supplement", xlab = "Supplement", ylab = "Gain", trace.label = "Diet")
interaction.plot(farmdiet$diet, farmdiet$supplement, farmdiet$gain, col=c("red","blue","green", "purple"), main = "Interaction between Supplement and Diet", xlab = "Diet", ylab = "Gain", trace.label = "Supplement")
> No interaction is apparent in the interaction plots, consistent with the non-significant diet:supplement term (P = 0.9166)
#As they are not interacting, consider the variables together
anova(lm(gain~diet+supplement, data=farmdiet))
## Analysis of Variance Table
##
## Response: gain
## Df Sum Sq Mean Sq F value Pr(>F)
## diet 2 287.171 143.586 92.358 4.199e-16 ***
## supplement 3 91.881 30.627 19.700 3.980e-08 ***
## Residuals 42 65.296 1.555
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summary(lm(gain~diet+supplement, data=farmdiet))
##
## Call:
## lm(formula = gain ~ diet + supplement, data = farmdiet)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.30792 -0.85929 -0.07713 0.92052 2.90615
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 26.1230 0.4408 59.258 < 2e-16 ***
## dietoats -3.0928 0.4408 -7.016 1.38e-08 ***
## dietwheat -5.9903 0.4408 -13.589 < 2e-16 ***
## supplementcontrol -2.6967 0.5090 -5.298 4.03e-06 ***
## supplementsupergain -3.3815 0.5090 -6.643 4.72e-08 ***
## supplementsupersupp -0.7274 0.5090 -1.429 0.16
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.247 on 42 degrees of freedom
## Multiple R-squared: 0.8531, Adjusted R-squared: 0.8356
## F-statistic: 48.76 on 5 and 42 DF, p-value: < 2.2e-16
R^2 = 0.8531
TukeyHSD(aov(gain~diet+supplement, data=farmdiet))
## Tukey multiple comparisons of means
## 95% family-wise confidence level
##
## Fit: aov(formula = gain ~ diet + supplement, data = farmdiet)
##
## $diet
## diff lwr upr p adj
## oats-barley -3.092817 -4.163817 -2.021817 0e+00
## wheat-barley -5.990298 -7.061298 -4.919297 0e+00
## wheat-oats -2.897481 -3.968481 -1.826481 2e-07
##
## $supplement
## diff lwr upr p adj
## control-agrimore -2.6967005 -4.0583332 -1.3350677 0.0000234
## supergain-agrimore -3.3814586 -4.7430914 -2.0198259 0.0000003
## supersupp-agrimore -0.7273521 -2.0889849 0.6342806 0.4888738
## supergain-control -0.6847581 -2.0463909 0.6768746 0.5400389
## supersupp-control 1.9693484 0.6077156 3.3309811 0.0020484
## supersupp-supergain 2.6541065 1.2924737 4.0157392 0.0000307
PRACTICE PROBLEMS – Correlation
#Create data frame of hyena laughter sample
hyena <- data.frame("age"=c(2,2,2,6,9,10,13,10,14,14,12,7,11,11,14,20), "Frequency"=c(840,670,580,470,540,660,510,520,500,480,400,650,460,500,580,500))
#plot data frame of hyena laughter
plot(hyena$age, hyena$Frequency, main = "Hyena Laughter", xlab = "Age (years)", ylab = "Frequency (Hz)")
#perform correlation test between hyena age and laughter frequency
cor.test(hyena$age, hyena$Frequency, method="pearson")
##
## Pearson's product-moment correlation
##
## data: hyena$age and hyena$Frequency
## t = -2.8194, df = 14, p-value = 0.01365
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.8453293 -0.1511968
## sample estimates:
## cor
## -0.601798
cor(hyena, method = "pearson")
## age Frequency
## age 1.000000 -0.601798
## Frequency -0.601798 1.000000
r = -0.60
#Perform non-parametric test Spearman
cor(hyena, method = "spearman")
## age Frequency
## age 1.0000000 -0.5397807
## Frequency -0.5397807 1.0000000
ρ = -0.54
#Perform non-parametric test Kendall
cor(hyena, method = "kendall")
## age Frequency
## age 1.0000000 -0.4211174
## Frequency -0.4211174 1.0000000
τ = -0.42
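cor() returns only the coefficient; cor.test() also supplies a P-value for the non-parametric measures, a minimal sketch (a ties warning is normal for the exact P-values):
cor.test(hyena$age, hyena$Frequency, method = "spearman")
cor.test(hyena$age, hyena$Frequency, method = "kendall")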
PRACTICE PROBLEMS – Regression
#call dataset "face.txt"
testost <- read.table("face.txt", sep="\t", header=T)
#Calculate the estimate of the intercept
fit <- lm(testost$Penalty ~ testost$Face)
summary(fit)
##
## Call:
## lm(formula = testost$Penalty ~ testost$Face)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.9338 -0.3883 -0.1260 0.3381 1.0711
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -4.505 2.089 -2.157 0.0440 *
## testost$Face 3.189 1.150 2.774 0.0121 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.6129 on 19 degrees of freedom
## Multiple R-squared: 0.2882, Adjusted R-squared: 0.2508
## F-statistic: 7.694 on 1 and 19 DF, p-value: 0.0121
For every unit increase in face ratio, penalty minutes increase by 3.189. The y-intercept is -4.505 and the slope is 3.189; the equation is y = 3.189x - 4.505
How many degrees of freedom does this analysis have? > 19 degrees of freedom
What is the t-statistic? > 2.774
What is your final conclusion about the slope? > The slope is significantly positive (t = 2.774, P = 0.0121): as face ratio increases, penalties increase.
What is the variation explained, R2? > R^2 = 0.2882
Make a scatterplot of the data and include a linear regression line
plot(testost$Face, testost$Penalty, xlab="Face ratio", ylab = "Number of penalties", main = "Face Ratio and Number of Penalties - Hockey Players")
abline(fit, col="purple", lwd=3)
#Call dataset "respiratoryrate_bodymass.txt"
resp<-read.table("respiratoryrate_bodymass.txt", header = T, sep = "\t")
#Make scatterplot of dataset
plot(resp, main="Body Mass x Respirations")
#use a log10 transformation to linearize the data
plot(log10(resp$BodyMass),log10(resp$Respiration), xlab="Body mass", ylab="Respiration rate", main = "Body Mass x Respirations - Transformed")
> a log10 transformation of both variables linearizes the data
#perform linear regression on transformed data
reg <- lm(log10(resp$BodyMass) ~ log10(resp$Respiration))
summary(reg)
##
## Call:
## lm(formula = log10(resp$BodyMass) ~ log10(resp$Respiration))
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.24453 -0.15388 0.01637 0.11046 0.36318
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.1627 0.5779 0.282 0.785401
## log10(resp$Respiration) 0.9196 0.1684 5.461 0.000601 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.2001 on 8 degrees of freedom
## Multiple R-squared: 0.7885, Adjusted R-squared: 0.762
## F-statistic: 29.82 on 1 and 8 DF, p-value: 0.000601
The slope estimate (beta) is 0.9196; the intercept is 0.1627
Carry out a formal test of the null hypothesis of beta=0 > The P-value for the slope is 0.000601 (below 0.05), so we reject the null hypothesis that beta = 0; the slope differs significantly from zero.
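As a cross-check on that test, the 95% confidence interval for the slope should exclude zero; a minimal sketch using the fitted model reg:
#the slope interval excluding 0 agrees with rejecting beta = 0
confint(reg)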
What is the variation explained, R2? > R^2 = 0.7885
PRACTICE PROBLEMS – Advanced regression
#Call dataset "ozone"
ozone<-read.table("ozone.txt",header=T, sep = "\t")
1a. Make a multiple panel bivariate scatterplot
#make individual plots mapping variables "radiation", "temperature", "Wind" and "ozone"
pairs( ~ ozone + rad + temp + wind, data = ozone, row1attop=TRUE)
1b. Perform a multiple linear regression
#make linear model with ozone by radiation, wind and temperature
ozonereg <- lm(ozone ~ rad + temp + wind, data = ozone)
summary(ozonereg)
##
## Call:
## lm(formula = ozone ~ rad + temp + wind, data = ozone)
##
## Residuals:
## Min 1Q Median 3Q Max
## -40.485 -14.210 -3.556 10.124 95.600
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -64.23208 23.04204 -2.788 0.00628 **
## rad 0.05980 0.02318 2.580 0.01124 *
## temp 1.65121 0.25341 6.516 2.43e-09 ***
## wind -3.33760 0.65384 -5.105 1.45e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 21.17 on 107 degrees of freedom
## Multiple R-squared: 0.6062, Adjusted R-squared: 0.5952
## F-statistic: 54.91 on 3 and 107 DF, p-value: < 2.2e-16
1c. What is the variation explained, R2? > R^2 = 0.6062
1d. Assess the collinearity of the explanatory variables using the variance inflation factor
library(car)
vif(ozonereg )
## rad temp wind
## 1.095241 1.431201 1.328979
Values higher than 5 are worrisome, and values higher than 10 need to be addressed; all VIFs here are near 1, so collinearity is not a concern.
#Call dataset diminish
diminish<-read.table("diminish.txt", sep="\t", header=T)
2a. Perform a simple linear regression
#make linear regression model with variable yv and xv
dimreg <- lm(diminish$yv ~ diminish$xv)
summary(dimreg)
##
## Call:
## lm(formula = diminish$yv ~ diminish$xv)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.3584 -2.1206 -0.3218 1.8763 4.1782
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 27.61438 1.17315 23.54 7.66e-14 ***
## diminish$xv 0.22683 0.02168 10.46 1.45e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.386 on 16 degrees of freedom
## Multiple R-squared: 0.8725, Adjusted R-squared: 0.8646
## F-statistic: 109.5 on 1 and 16 DF, p-value: 1.455e-08
2b. Perform a polynomial (second-degree) regression
#Perform a polynomial (second-degree) regression
dimpoly <- lm(diminish$yv ~ diminish$xv + I(diminish$xv^2))
summary(dimpoly)
##
## Call:
## lm(formula = diminish$yv ~ diminish$xv + I(diminish$xv^2))
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.0490 -1.2890 0.6018 1.1829 2.9610
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 24.6323529 1.6916139 14.561 2.95e-10 ***
## diminish$xv 0.4057534 0.0819860 4.949 0.000175 ***
## I(diminish$xv^2) -0.0018834 0.0008386 -2.246 0.040203 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.131 on 15 degrees of freedom
## Multiple R-squared: 0.9046, Adjusted R-squared: 0.8919
## F-statistic: 71.12 on 2 and 15 DF, p-value: 2.222e-08
2c. Compare both models with Akaike’s Information Criterion (AIC). Which model is better?
#compare using Akaike’s Information Criterion (AIC)
AIC(dimreg, dimpoly)
## df AIC
## dimreg 3 86.26193
## dimpoly 4 83.04403
The polynomial (second-degree) model is better, as it has the lower AIC
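Because the linear model is nested within the polynomial one, an F-test gives a comparison complementary to AIC; a minimal sketch:
#nested-model F-test; a significant result favors keeping the quadratic term
anova(dimreg, dimpoly)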
2d. Make a scatterplot of the data and include both regression lines
#Make plot
plot(diminish$xv, diminish$yv, xlab = "xv", ylab = "yv")
#add linear regression line in blue
abline(dimreg, col="blue", lwd=3)
#add polynomial regression line in red
lines(diminish$xv, predict(dimpoly), col="red", lwd=3)
library(vegan)
#call in stork dataset
stork<-read.table("stork.txt", sep="\t", header=T)
3a. Make a scatterplot of the data
#plot data "stork"
plot(stork$Corticosterone,stork$Survival, xlab = "Corticosterone", ylab = "Survival")
3b. Which type of regression model is suitable for these data? > The response is binary (0/1), so logistic regression is suitable
3c. Perform an appropriate regression to predict survival from corticosterone
#attach dataset for analysis
attach (stork)
#perform logistic regression
storkglm<-glm(Survival~Corticosterone, family=binomial)
summary(storkglm)
##
## Call:
## glm(formula = Survival ~ Corticosterone, family = binomial)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.4316 -0.8724 -0.6703 1.2125 1.8211
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 2.70304 1.74725 1.547 0.1219
## Corticosterone -0.07980 0.04368 -1.827 0.0677 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 45.234 on 33 degrees of freedom
## Residual deviance: 41.396 on 32 degrees of freedom
## AIC: 45.396
##
## Number of Fisher Scoring iterations: 4
3d. What is the pseudo-R2 of the model?
#Pseudo R^2
1-(storkglm$deviance/storkglm$null.deviance)
## [1] 0.084852
3e. Include the predicted curve in the scatterplot
#plot data "stork"
plot(Corticosterone,Survival, xlab="Corticosterone", ylab="Probability of survival")
#add regression line in red
lines(25:61,predict(storkglm, list(Corticosterone=25:61), type="response"), col="red", lwd=3)
detach(stork)
#Call in "cluster" dataset
cluster<-read.table("clusters.txt", sep="\t", header=T)
4a. Make a scatterplot of the data
#plot clusters dataset
plot(cluster$Distance, cluster$Cancers, xlab = "Distance", ylab = "Cancers")
4b. Which regression is the more appropriate for these data? > The response is a count, so Poisson regression is appropriate
4c. Given your choice, is the trend significant?
#Perform poisson regression
clusterreg<-glm(cluster$Cancers ~ cluster$Distance, family=poisson)
summary(clusterreg)
##
## Call:
## glm(formula = cluster$Cancers ~ cluster$Distance, family = poisson)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.5504 -1.3491 -1.1553 0.3877 3.1304
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.186865 0.188728 0.990 0.3221
## cluster$Distance -0.006138 0.003667 -1.674 0.0941 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for poisson family taken to be 1)
##
## Null deviance: 149.48 on 93 degrees of freedom
## Residual deviance: 146.64 on 92 degrees of freedom
## AIC: 262.41
##
## Number of Fisher Scoring iterations: 5
No, it is not significant (P = 0.0941)
4d. Include the predicted relationship from the model in the scatterplot
#compute pseudo-R^2 from the cluster Poisson regression
cluster_r2<-1-(clusterreg$deviance/clusterreg$null.deviance)
plot(cluster$Distance, cluster$Cancers, xlab = "Distance", ylab = "Cancers")
#add the fitted Poisson curve on the response scale (abline would wrongly draw the linear predictor)
lines(sort(cluster$Distance), fitted(clusterreg)[order(cluster$Distance)], col="red", lwd=3)
#call in dataset "jaws"
jaws<-read.table("jaws.txt", sep="\t", header=T)
#attach dataset for graph
attach(jaws)
6a. Make a scatterplot of the data (age explanatory, bone response)
#plot "jaws" dataset
plot(age, bone, xlab='Age (explanatory)', ylab='Bone (response)')
6b. Perform a non-linear regression assuming an asymptotic exponential relationship: > non-linear regression formula y=a(1-e^(-cx))
#choose starting values based on knowledge of the data (asymptote a near 100, rate c small)
#perform non-linear regression on "jaws"
jawsnls<-nls(bone~(a*(1-exp(-1*c*age))), start = list(a=100, c=0.1),jaws)
summary(jawsnls)
##
## Formula: bone ~ (a * (1 - exp(-1 * c * age)))
##
## Parameters:
## Estimate Std. Error t value Pr(>|t|)
## a 115.58059 2.84365 40.645 < 2e-16 ***
## c 0.11882 0.01233 9.635 3.69e-13 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 13.1 on 52 degrees of freedom
##
## Number of iterations to convergence: 4
## Achieved convergence tolerance: 4.025e-06
6c. Perform a non-linear regression assuming a Michaelis-Menten model: > non-linear regression assuming a Michaelis-Menten model y=ax/(1+bx)
#Perform a non-linear regression assuming a Michaelis-Menten model
jawsmmm<-nls(bone~(a*age/(1+b*age)), start = list(a=100, b=0.1),jaws)
summary(jawsmmm)
##
## Formula: bone ~ (a * age/(1 + b * age))
##
## Parameters:
## Estimate Std. Error t value Pr(>|t|)
## a 18.72524 2.52585 7.413 1.09e-09 ***
## b 0.13596 0.02339 5.814 3.79e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 13.77 on 52 degrees of freedom
##
## Number of iterations to convergence: 6
## Achieved convergence tolerance: 7.785e-06
6d. Compare both models with Akaike’s Information Criterion (AIC). Which model is better?
AIC(jawsnls, jawsmmm)
## df AIC
## jawsnls 3 435.0823
## jawsmmm 3 440.4066
The asymptotic exponential model is better, as its AIC is lower
6e. Make a scatterplot of the data and include both regression lines
#plot "jaws"
plot(jaws)
#plot NLR line
lines(seq(0, 50, 0.1), predict(jawsnls, list(age=seq(0, 50, 0.1)), type="response"), col="red", lwd=3)
#plot NLR with MMM model
lines(seq(0, 50, 0.1), predict(jawsmmm, list(age=seq(0, 50, 0.1)), type="response"), col="blue", lwd=3)
detach(jaws)
#load dragons dataset
load("dragons.RData")
#attach dragons for analysis
attach(dragons)
7a. Perform a simple linear regression with testScore as response and bodyLength as explanatory
dragonlm <- lm(testScore ~ bodyLength)
summary(dragonlm)
##
## Call:
## lm(formula = testScore ~ bodyLength)
##
## Residuals:
## Min 1Q Median 3Q Max
## -56.962 -16.411 -0.783 15.193 55.200
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -61.31783 12.06694 -5.081 5.38e-07 ***
## bodyLength 0.55487 0.05975 9.287 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 21.2 on 478 degrees of freedom
## Multiple R-squared: 0.1529, Adjusted R-squared: 0.1511
## F-statistic: 86.25 on 1 and 478 DF, p-value: < 2.2e-16
7b. Plot the data with ggplot2 and add a linear regression line with confidence intervals. Hint: geom_smooth()
library(ggplot2)
#Plot the dragon with ggplot2 and add a linear regression line with confidence intervals
ggplot(dragons, aes(x=bodyLength, y=testScore)) +
geom_point(shape=1) + # Use hollow circles
geom_smooth(method=lm) # Add linear regression line
## `geom_smooth()` using formula 'y ~ x'
# (by default includes 95% confidence region)
7c. We collected multiple samples from 8 mountain ranges. Generate a boxplot using (or not) ggplot2 to explore this new explanatory variable
# make a grouped boxplot
ggplot(dragons, aes(x=mountainRange, y=bodyLength)) +
geom_boxplot()
7d. Now repeat the scatterplot in b. but coloring by mountain range and without linear regression line
#plot "dragons" using ggplot and color by mountain range
ggplot(dragons, aes(x=bodyLength, y=testScore)) +
geom_point(aes(color=mountainRange))
7e. Instead of coloring, use facet_wrap() to separate by mountain range
#plot "dragons" using ggplot and perarate plots by mountain range
ggplot(data=dragons, aes(x=bodyLength, y=testScore)) +
geom_point() +
facet_wrap(~mountainRange)
7f. Perform a new linear model adding mountain range and assuming that it is a random effect
library(lme4)
## Loading required package: Matrix
##
## Attaching package: 'Matrix'
## The following objects are masked from 'package:tidyr':
##
## expand, pack, unpack
## Registered S3 methods overwritten by 'lme4':
## method from
## cooks.distance.influence.merMod car
## influence.merMod car
## dfbeta.influence.merMod car
## dfbetas.influence.merMod car
#perform linear mixed-effects model with mountain range as a random effect
dragonlmer<-lmer(dragons$bodyLength~dragons$testScore+(1|dragons$mountainRange))
summary(dragonlmer)
## Linear mixed model fit by REML ['lmerMod']
## Formula: dragons$bodyLength ~ dragons$testScore + (1 | dragons$mountainRange)
##
## REML criterion at convergence: 3472.3
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -2.44503 -0.75468 -0.06233 0.72124 2.56768
##
## Random effects:
## Groups Name Variance Std.Dev.
## dragons$mountainRange (Intercept) 212.33 14.571
## Residual 74.73 8.645
## Number of obs: 480, groups: dragons$mountainRange, 8
##
## Fixed effects:
## Estimate Std. Error t value
## (Intercept) 2.009e+02 5.337e+00 37.646
## dragons$testScore 8.006e-03 2.652e-02 0.302
##
## Correlation of Fixed Effects:
## (Intr)
## drgns$tstSc -0.250
#perform ANOVA on model
anova(dragonlmer)
## Analysis of Variance Table
## npar Sum Sq Mean Sq F value
## dragons$testScore 1 6.8123 6.8123 0.0912
7g. Perform the same linear model as before but now assuming mountain range is a fixed effect
#perform linear model with mountain range as a fixed effect
dragonlm2 <- lm(dragons$testScore ~ dragons$bodyLength+dragons$mountainRange)
summary(dragonlm2)
##
## Call:
## lm(formula = dragons$testScore ~ dragons$bodyLength + dragons$mountainRange)
##
## Residuals:
## Min 1Q Median 3Q Max
## -52.263 -9.926 0.361 9.994 44.488
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 20.83051 14.47218 1.439 0.15072
## dragons$bodyLength 0.01267 0.07974 0.159 0.87379
## dragons$mountainRangeCentral 36.58277 3.59929 10.164 < 2e-16 ***
## dragons$mountainRangeEmmental 16.20923 3.69665 4.385 1.43e-05 ***
## dragons$mountainRangeJulian 45.11469 4.19012 10.767 < 2e-16 ***
## dragons$mountainRangeLigurian 17.74779 3.67363 4.831 1.84e-06 ***
## dragons$mountainRangeMaritime 49.88133 3.13924 15.890 < 2e-16 ***
## dragons$mountainRangeSarntal 41.97841 3.19717 13.130 < 2e-16 ***
## dragons$mountainRangeSouthern 8.51961 2.73128 3.119 0.00192 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 14.96 on 471 degrees of freedom
## Multiple R-squared: 0.5843, Adjusted R-squared: 0.5773
## F-statistic: 82.76 on 8 and 471 DF, p-value: < 2.2e-16
7h. How much of the variation in test scores is explained by the random effect (mountain range)
# variation explained by mountain range: the R^2 of the model including mountain range (0.5843) minus the R^2 of the model with body length alone (0.1529)
0.5843-.1529
## [1] 0.4314
43.14%
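The same difference can be computed from the model objects instead of by hand; a minimal sketch using dragonlm from 7a and dragonlm2 from 7g:
#R^2 with mountain range included minus R^2 from body length alone
summary(dragonlm2)$r.squared - summary(dragonlm)$r.squared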
library(lme4)
#call in data (wish I could)
#combine site and date into a single grouping factor (assuming both are factors)
site<-site:date
#Perform a linear mixed-effects model with "dailygrowth" as the outcome; "turbidity", "temperature", "tide", and "waveaction" as predictors; and "site" as a random effect
reeflme<-lmer(dailygrowth~turbidity+temperature+tide+waveaction+(1|site))
summary(reeflme)
#Perform ANOVA on model
anova(reeflme)