A. Use the default data set InsectSprays: The counts of insects in agricultural experimental units treated with different insecticides. Column 1: Insect count Column 2: The type of spray
InsectSprays = InsectSprays
count = InsectSprays$count
spray = InsectSprays$spray
\(H_0\) : Mean of insect counts are same corresponding to every spray.
There are 6 different spray categories in the data set: A,B,C,D,E, and F. Each spray type has a total of 12 rows out of the entire data set. We can check this with the following code.
sum((InsectSprays$spray == "A")) #Spray A
## [1] 12
sum((InsectSprays$spray == "B")) #Spray B
## [1] 12
sum((InsectSprays$spray == "C")) #Spray C
## [1] 12
sum((InsectSprays$spray == "D")) #Spray D
## [1] 12
sum((InsectSprays$spray == "E")) #Spray E
## [1] 12
sum((InsectSprays$spray == "F")) #Spray F
## [1] 12
Category A has a mean count of 14.5 insects.
mean(count[spray == "A"])
## [1] 14.5
mean = tapply(count, spray, mean, na.rm=TRUE)
mean
## A B C D E F
## 14.500000 15.333333 2.083333 4.916667 3.500000 16.666667
SD = tapply(count, spray, var, na.rm=TRUE)
SD
## A B C D E F
## 22.272727 18.242424 3.901515 6.265152 3.000000 38.606061
CountA = count[spray == "A"]
CountB = count[spray == "B"]
CountC = count[spray == "C"]
CountD = count[spray == "D"]
CountE = count[spray == "E"]
CountF = count[spray == "F"]
par(mfrow = c(2,3))
qqnorm(CountA, main= "Spray vs Insect Counts", xlab = "Spray A", ylab = "Count")
qqline(CountA, datax = FALSE, distribution = qnorm)
qqnorm(CountB, main= "Spray vs Insect Counts", xlab = "Spray B", ylab = "Count")
qqline(CountB, datax = FALSE, distribution = qnorm)
qqnorm(CountC, main= "Spray vs Insect Counts", xlab = "Spray C", ylab = "Count")
qqline(CountC, datax = FALSE, distribution = qnorm)
qqnorm(CountD, main= "Spray vs Insect Counts", xlab = "Spray D", ylab = "Count")
qqline(CountD, datax = FALSE, distribution = qnorm)
qqnorm(CountE, main= "Spray vs Insect Counts", xlab = "Spray E", ylab = "Count")
qqline(CountE, datax = FALSE, distribution = qnorm)
qqnorm(CountF, main= "Spray vs Insect Counts", xlab = "Spray F", ylab = "Count")
qqline(CountF, datax = FALSE, distribution = qnorm)
bartlett.test(count ~ spray, data = InsectSprays)
##
## Bartlett test of homogeneity of variances
##
## data: count by spray
## Bartlett's K-squared = 25.96, df = 5, p-value = 9.085e-05
Basing our Bartlett on our p-value, we can conclude that the there is no homogeneity of the variance. If comparing it to a p value of .05 > 9.085e-05, which is why we conclude no homogeneity of variance.
model1 = aov(count~ spray, data=InsectSprays)
model1
## Call:
## aov(formula = count ~ spray, data = InsectSprays)
##
## Terms:
## spray Residuals
## Sum of Squares 2668.833 1015.167
## Deg. of Freedom 5 66
##
## Residual standard error: 3.921902
## Estimated effects may be unbalanced
summary(model1)
## Df Sum Sq Mean Sq F value Pr(>F)
## spray 5 2669 533.8 34.7 <2e-16 ***
## Residuals 66 1015 15.4
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
\[df_{between} = k - 1 = 5 \] \[df_{within} = N - k = 66 \]
\[SS_{between} = \sum n (\bar{x} - \bar{X}_{GM})^2 = 2669\]
\[SS_{within}$ = $\sum df*s^2 = 1015\]
\[MS_{within}= \frac{SS_{within}}{df_{within}} = \frac{1015}{66} = 15\]
\[MS_{between}= \frac{SS_{Between}}{df_{between}} = \frac{2669}{5} = 534\]
F Value = 34.7
Using the Anova, the p-value we examine is <2e-16 which is smaller than .01. With this being known, we can say with 99% confidence that we can reject the null that states, mean of insects counts are same corresponding to every spray.
Tukey = TukeyHSD(model1)
Tukey
## Tukey multiple comparisons of means
## 95% family-wise confidence level
##
## Fit: aov(formula = count ~ spray, data = InsectSprays)
##
## $spray
## diff lwr upr p adj
## B-A 0.8333333 -3.866075 5.532742 0.9951810
## C-A -12.4166667 -17.116075 -7.717258 0.0000000
## D-A -9.5833333 -14.282742 -4.883925 0.0000014
## E-A -11.0000000 -15.699409 -6.300591 0.0000000
## F-A 2.1666667 -2.532742 6.866075 0.7542147
## C-B -13.2500000 -17.949409 -8.550591 0.0000000
## D-B -10.4166667 -15.116075 -5.717258 0.0000002
## E-B -11.8333333 -16.532742 -7.133925 0.0000000
## F-B 1.3333333 -3.366075 6.032742 0.9603075
## D-C 2.8333333 -1.866075 7.532742 0.4920707
## E-C 1.4166667 -3.282742 6.116075 0.9488669
## F-C 14.5833333 9.883925 19.282742 0.0000000
## E-D -1.4166667 -6.116075 3.282742 0.9488669
## F-D 11.7500000 7.050591 16.449409 0.0000000
## F-E 13.1666667 8.467258 17.866075 0.0000000
Visualize the results with boxplot().
boxplot(InsectSprays$count ~ InsectSprays$spray, InsectSprays)
boxplot(InsectSprays$count ~ InsectSprays$spray, InsectSprays, col = c("blue", "purple", "green", "yellow", "red", "orange"), ylab = "Count", xlab = "Spray")
There are a few outliers in the data with spray bottle C and D.
library("ggplot2")
ggplot(InsectSprays, aes(x=spray, y = count)) + geom_boxplot(fill =c("blue", "purple", "green", "yellow", "red", "orange"), colour = "black") + scale_x_discrete() + xlab("Type of Spray")+ ylab("Count")
B. Consider the default data set mtcars for the next two problems.
round(cor(mtcars, method = c("pearson")), 4)
## mpg cyl disp hp drat wt qsec vs
## mpg 1.0000 -0.8522 -0.8476 -0.7762 0.6812 -0.8677 0.4187 0.6640
## cyl -0.8522 1.0000 0.9020 0.8324 -0.6999 0.7825 -0.5912 -0.8108
## disp -0.8476 0.9020 1.0000 0.7909 -0.7102 0.8880 -0.4337 -0.7104
## hp -0.7762 0.8324 0.7909 1.0000 -0.4488 0.6587 -0.7082 -0.7231
## drat 0.6812 -0.6999 -0.7102 -0.4488 1.0000 -0.7124 0.0912 0.4403
## wt -0.8677 0.7825 0.8880 0.6587 -0.7124 1.0000 -0.1747 -0.5549
## qsec 0.4187 -0.5912 -0.4337 -0.7082 0.0912 -0.1747 1.0000 0.7445
## vs 0.6640 -0.8108 -0.7104 -0.7231 0.4403 -0.5549 0.7445 1.0000
## am 0.5998 -0.5226 -0.5912 -0.2432 0.7127 -0.6925 -0.2299 0.1683
## gear 0.4803 -0.4927 -0.5556 -0.1257 0.6996 -0.5833 -0.2127 0.2060
## carb -0.5509 0.5270 0.3950 0.7498 -0.0908 0.4276 -0.6562 -0.5696
## am gear carb
## mpg 0.5998 0.4803 -0.5509
## cyl -0.5226 -0.4927 0.5270
## disp -0.5912 -0.5556 0.3950
## hp -0.2432 -0.1257 0.7498
## drat 0.7127 0.6996 -0.0908
## wt -0.6925 -0.5833 0.4276
## qsec -0.2299 -0.2127 -0.6562
## vs 0.1683 0.2060 -0.5696
## am 1.0000 0.7941 0.0575
## gear 0.7941 1.0000 0.2741
## carb 0.0575 0.2741 1.0000
options(digits=2)
Kendall = cor(mtcars, use = "complete.obs", method = c("kendall"))
round(Kendall, 2)
## mpg cyl disp hp drat wt qsec vs am gear carb
## mpg 1.00 -0.80 -0.77 -0.74 0.46 -0.73 0.32 0.59 0.47 0.43 -0.50
## cyl -0.80 1.00 0.81 0.79 -0.55 0.73 -0.45 -0.77 -0.49 -0.51 0.47
## disp -0.77 0.81 1.00 0.67 -0.50 0.74 -0.30 -0.60 -0.52 -0.48 0.41
## hp -0.74 0.79 0.67 1.00 -0.38 0.61 -0.47 -0.63 -0.30 -0.28 0.60
## drat 0.46 -0.55 -0.50 -0.38 1.00 -0.55 0.03 0.38 0.58 0.58 -0.10
## wt -0.73 0.73 0.74 0.61 -0.55 1.00 -0.14 -0.49 -0.61 -0.54 0.37
## qsec 0.32 -0.45 -0.30 -0.47 0.03 -0.14 1.00 0.66 -0.17 -0.09 -0.51
## vs 0.59 -0.77 -0.60 -0.63 0.38 -0.49 0.66 1.00 0.17 0.27 -0.58
## am 0.47 -0.49 -0.52 -0.30 0.58 -0.61 -0.17 0.17 1.00 0.77 -0.06
## gear 0.43 -0.51 -0.48 -0.28 0.58 -0.54 -0.09 0.27 0.77 1.00 0.10
## carb -0.50 0.47 0.41 0.60 -0.10 0.37 -0.51 -0.58 -0.06 0.10 1.00
library(psych)
pairs.panels(Kendall)