library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

Exercises

  1. Make up a vector of 50 random Legolas actors, with mean height of 195cm, and a standard deviation of 15cm. Run a t-test to compare this sample of actors to the set of Aragorns and then the set of Gimlis. Do you find evidence for significant differences?
# Simulate 50 actor heights (cm) per character from normal distributions.
# The draw order (Aragorn, Gimli, Legolas) is kept so the random number
# stream is consumed in the same sequence.
aragorn <- rnorm(n = 50, mean = 180, sd = 10)
gimli <- rnorm(n = 50, mean = 132, sd = 15)
legolas <- rnorm(n = 50, mean = 195, sd = 15)

# compare means for Aragorns and Legolases
# Two-sided Welch t-test: do Aragorn and Legolas actors differ in mean height?
# The argument must be spelled `alternative`; a misspelled name is swallowed
# by `...` and silently ignored, leaving the default in effect.
tAL <- t.test(aragorn, legolas, alternative = "two.sided")
tAL
## 
##  Welch Two Sample t-test
## 
## data:  aragorn and legolas
## t = -5.08, df = 87.9, p-value = 2.1e-06
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -18.7696  -8.2135
## sample estimates:
## mean of x mean of y 
##    181.65    195.14

The p-value for the t-test between Aragorn and Legolas actors is $2.09939 \times 10^{-6}$. This value is very low, suggesting that there is a statistically significant difference between the mean heights of the two groups.

# One-sided Welch t-test; "less" puts mean(aragorn) - mean(legolas) < 0
# as the alternative hypothesis.
tAL2 <- t.test(x = aragorn, y = legolas, alternative = "less")
tAL2
## 
##  Welch Two Sample t-test
## 
## data:  aragorn and legolas
## t = -5.08, df = 87.9, p-value = 1e-06
## alternative hypothesis: true difference in means is less than 0
## 95 percent confidence interval:
##     -Inf -9.0765
## sample estimates:
## mean of x mean of y 
##    181.65    195.14

Performing a one-sided t-test, with the alternative hypothesis that the Legolas actors are taller than the Aragorn actors, gives a p-value of $1.0497 \times 10^{-6}$. This value is also very low, so there is strong evidence that the Legolas actors are taller than the Aragorn actors.

# Two-sided Welch t-test comparing mean heights of Gimli and Legolas actors.
tGL <- t.test(x = gimli, y = legolas, alternative = "two.sided")
tGL
## 
##  Welch Two Sample t-test
## 
## data:  gimli and legolas
## t = -21.6, df = 96, p-value <2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -67.689 -56.288
## sample estimates:
## mean of x mean of y 
##    133.15    195.14

Conducting the same analysis between the Gimli and Legolas actors gives an even smaller p-value than before, $1.3074 \times 10^{-38}$. We can conclude that there is a significant difference between the mean heights of the two groups.

# One-sided Welch t-test; "less" puts mean(gimli) - mean(legolas) < 0
# as the alternative hypothesis.
tGL2 <- t.test(x = gimli, y = legolas, alternative = "less")
tGL2
## 
##  Welch Two Sample t-test
## 
## data:  gimli and legolas
## t = -21.6, df = 96, p-value <2e-16
## alternative hypothesis: true difference in means is less than 0
## 95 percent confidence interval:
##     -Inf -57.219
## sample estimates:
## mean of x mean of y 
##    133.15    195.14

Using a one-sided test with the alternative hypothesis that the Gimli actors are shorter than the Legolas actors returns a p-value of $6.53702 \times 10^{-39}$. This value is almost zero, indicating, once again, that there is a significant difference between the two groups of actors and that the Gimli actors are shorter than the Legolas actors.


  2. Re-run the variance test (F-test) to compare the group of Gimli and Legolas actors. Do these groups have different variance?
# F-test for equality of variances between Legolas and Gimli heights
# (the ratio tested is var(legolas) / var(gimli)).
vLG <- var.test(x = legolas, y = gimli)
vLG
## 
##  F test to compare two variances
## 
## data:  legolas and gimli
## F = 1.34, num df = 49, denom df = 49, p-value = 0.31
## alternative hypothesis: true ratio of variances is not equal to 1
## 95 percent confidence interval:
##  0.75939 2.35813
## sample estimates:
## ratio of variances 
##             1.3382

The F-test comparing the height variances of the Legolas and Gimli actors returns a p-value of 0.31124. This is well above the usual 0.05 threshold, so we cannot reject the null hypothesis of equal variances: there is no evidence of a significant difference between the variances of the two groups.


  3. Redo the correlation for the Sepal Length and Sepal Width for the Iris dataset, but for the three individual species. Are these correlated?
# Test the Sepal.Length / Sepal.Width correlation separately for each species.
iSpecies <- levels(iris$Species)

# One cor.test() p-value per species. vapply() enforces a single numeric
# result per species and avoids the 1:length() indexing pitfall; names are
# dropped so the resulting data frame keeps default integer row names.
iSCor <- vapply(
  iSpecies,
  function(sp) {
    rows <- iris[iris$Species == sp, ]
    cor.test(rows$Sepal.Length, rows$Sepal.Width)$p.value
  },
  numeric(1),
  USE.NAMES = FALSE
)

# check.names = FALSE keeps the space in the "Cor pVal" column header.
irisDatabySpecies <- data.frame(
  Species = iSpecies,
  `Cor pVal` = iSCor,
  check.names = FALSE
)
irisDatabySpecies
##      Species   Cor pVal
## 1     setosa 6.7098e-10
## 2 versicolor 8.7719e-05
## 3  virginica 8.4346e-04

Previously, the p-value for the correlation across all irises pooled together was high, and we had to conclude that there was no evidence of a relationship between sepal length and width.

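However, after separating the iris data into the different species, the p-values for the correlations between sepal length and width are all very low. This implies that there is a statistically significant correlation between sepal length and width within each species. Note that a low p-value shows the correlation is unlikely to be zero; the strength of the relationship is given by the correlation coefficient itself, not by the p-value.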


  4. Using the deer dataset and the chisq.test() function, test:

4a. If there are significant differences in the number of deer caught per month

# Load the deer records, then run a chi-squared goodness-of-fit test on the
# per-month counts (no probabilities given, so equal ones are assumed).
deer <- read.csv(file = "Deer.csv")
deerMonth <- chisq.test(x = table(deer$Month))
deerMonth
## 
##  Chi-squared test for given probabilities
## 
## data:  table(deer$Month)
## X-squared = 997, df = 11, p-value <2e-16

The null hypothesis is that deer are caught in equal numbers in every month (i.e. there is no relationship between the month and the number of deer captured). The p-value for the chi-squared test is $8.19963 \times 10^{-207}$. This extremely low value means that we reject the null hypothesis and conclude that the number of deer caught differs significantly between months.

4b. If the cases of tuberculosis are uniformly distributed across all farms

# Keep only the deer with a recorded Tb value. The column is referenced
# bare (`Tb`) so filter() resolves it inside `deer`, instead of reaching
# back out to the global data frame via `deer$Tb`.
goodTbData <- filter(deer, !is.na(Tb))
# Cross-tabulate Tb (rows) against Farm (columns).
table(goodTbData$Tb, goodTbData$Farm)
##    
##      AL  AU  BA  BE  CB CRC  HB LCV  LN MAN  MB  MO  NC  NV  PA  PN  QM  RF  RN
##   0  10  23  67   7  88   4  22   0  28  27  16 186  24  18  11  39  67  23  21
##   1   3   0   5   0   3   0   1   1   6  24   5  31   4   1   0   0   7   1   0
##    
##      RO SAL SAU  SE  TI  TN VISO  VY
##   0  31   0   3  16   9  16   13  15
##   1   0   1   0  10   0   2    1   4
# Pearson chi-squared test on the Tb-by-Farm contingency table.
deerTBFarm <- chisq.test(table(goodTbData$Tb, goodTbData$Farm))
## Warning in chisq.test(table(goodTbData$Tb, goodTbData$Farm)): Chi-squared
## approximation may be incorrect
# Print the stored test result.
deerTBFarm
## 
##  Pearson's Chi-squared test
## 
## data:  table(goodTbData$Tb, goodTbData$Farm)
## X-squared = 129, df = 26, p-value = 1.2e-15

The null hypothesis is that there is no relationship between TB status and farm, i.e. the proportion of TB cases is the same on every farm. The p-value from the chi-squared test between the number of deer with TB and the farms is $1.24253 \times 10^{-15}$. This value is very low, therefore we reject the null hypothesis and conclude that there is a relationship between the farm and the number of TB cases. Note that R warns the chi-squared approximation may be incorrect: several farms have very few deer, so the p-value should be treated as approximate.