Consider the radiation data in Example 4.10.
Construct a Q-Q plot for the natural logarithms of these data. Do the natural logarithms appear to be normally distributed?
# Radiation measurements, stored as a one-column matrix named "Radiation".
# NOTE(review): the second value (0.9) looks like it may be a typo for 0.09 --
# verify against the source table before relying on the results.
radiation <- matrix(
  c(
    0.15, 0.9, 0.18, 0.1, 0.05, 0.12, 0.08, 0.05, 0.08, 0.1,
    0.07, 0.02, 0.01, 0.1, 0.1, 0.1, 0.02, 0.1, 0.01, 0.4,
    0.1, 0.05, 0.03, 0.05, 0.15, 0.1, 0.15, 0.09, 0.08, 0.18,
    0.1, 0.2, 0.11, 0.3, 0.02, 0.2, 0.2, 0.3, 0.3, 0.4,
    0.3, 0.05
  ),
  ncol = 1,
  dimnames = list(NULL, "Radiation")
)
# normal Q-Q plot of the log-transformed values, with a reference line
qqnorm(log(radiation))
qqline(log(radiation))
# Shapiro-Wilk test of normality for the log-transformed values
shapiro.test(log(radiation))
##
## Shapiro-Wilk normality test
##
## data: log(radiation)
## W = 0.9615, p-value = 0.1665
The Q-Q plot of the log-transformed data appears to be approximately straight, which is consistent with normality. In addition, the Shapiro-Wilk test gives a p-value of 0.1665 (> 0.05), so the hypothesis that the natural logarithms are normally distributed is not rejected.
For lambda = 1/4, the coefficient of normality is greater than for lambda = 0.
Given the air pollution data.
# load the air pollution table from disk
pollution_data <- read.table(file = "table15.txt")
# keep only the fifth and sixth columns and label them NO2 and O3
data2 <- pollution_data[, c(5, 6)]
names(data2) <- c("NO2", "O3")
# calculate the sample mean vector
cm <- colMeans(data2)
cm
## NO2 O3
## 10.047619 9.404762
# calculate the sample variance-covariance matrix
S <- cov(data2)
S
## NO2 O3
## NO2 11.363531 3.126597
## O3 3.126597 30.978513
# calculate the squared statistical (Mahalanobis) distance of every
# observation from the mean vector: (x - cm)' S^{-1} (x - cm).
# mahalanobis() inverts S once and handles all rows together, instead of
# calling solve(S) again for every row inside apply().
d <- mahalanobis(data2, center = cm, cov = S)
d
## [1] 0.4606524 0.6592206 2.3770610 1.6282902 0.4135364 0.4760726
## [7] 1.1848895 10.6391792 0.1388339 0.8162468 1.3566301 0.6228096
## [13] 5.6494392 0.3159498 0.4135364 0.1224973 0.8987982 4.7646873
## [19] 3.0089122 0.6592206 2.7741416 1.0360061 0.7874152 3.4437748
## [25] 6.1488606 1.0360061 0.1388339 0.8856041 0.1379719 2.2488867
## [31] 0.1901188 0.4606524 1.1471939 7.0857237 1.4584229 0.1224973
## [37] 1.8984708 2.7782596 8.4730649 0.6370218 0.7032485 1.8013611
# construct a chi-square plot: ordered squared distances against the
# chi-square (df = 2) quantiles at probability levels (j - 1/2) / n
par(mfrow = c(1, 1))
# plotmath paste() inserts no separator, so the space is part of the string
plot(qchisq(ppoints(nrow(data2), a = 1 / 2), df = 2), sort(d),
     xlab = expression(paste(chi[2]^2, " Quantile")),
     ylab = "Ordered Distances",
     main = "Chi-square Plot for (NO2, O3)")
# reference line through the origin with slope 1
abline(a = 0, b = 1)
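The number of observations whose squared distance lies within the estimated 50% contour (squared distance at most the 0.50 quantile of the chi-square distribution with 2 degrees of freedom, about 1.39) is 26 out of 42. The proportion = 62%.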
Examine the marginal normality of the observations for the data in Table 4.3.
# load the table and label its five columns X1 through X5
data3 <- read.table(file = "table43.txt")
names(data3) <- c("X1", "X2", "X3", "X4", "X5")
library(MVN)
## Warning: package 'MVN' was built under R version 3.1.3
# Shapiro-Wilk test for each column (type = "SW"), with descriptive statistics
# NOTE(review): uniNorm() may not exist in newer MVN releases (mvn() appears
# to be its replacement) -- confirm against the installed version
uniNorm(data3, type = "SW", desc = TRUE)
## $`Descriptive Statistics`
## n Mean Std.Dev Median Min Max 25th 75th Skew
## X1 30 1906.100 324.987 1863.00 1325.00 2983.00 1715.250 2057.250 1.038
## X2 30 1749.533 318.607 1680.00 1170.00 2794.00 1595.500 1888.750 1.144
## X3 30 1509.133 303.178 1466.00 1002.00 2412.00 1295.750 1623.750 0.980
## X4 30 1724.967 322.844 1674.50 1176.00 2581.00 1520.250 1880.750 0.598
## X5 30 3.867 3.734 2.64 0.13 16.85 1.415 5.043 1.786
## Kurtosis
## X1 2.036
## X2 1.950
## X3 0.997
## X4 -0.046
## X5 3.140
##
## $`Shapiro-Wilk's Normality Test`
## Variable Statistic p-value Normality
## 1 X1 0.9307 0.0512 YES
## 2 X2 0.9127 0.0175 NO
## 3 X3 0.9326 0.0575 YES
## 4 X4 0.9613 0.3337 YES
## 5 X5 0.7999 0.0001 NO
Examine the bivariate normality of the observations for the data.
# keep columns one up to four (drop the fifth column)
data3 <- data3[, -5]
# construct the bivariate chi-square plot for X1 and X2
data3_X1x2 <- data3[, c("X1", "X2")]
cm3_X1X2 <- colMeans(data3_X1x2)
S3_X1X2 <- cov(data3_X1x2)
# squared statistical (Mahalanobis) distances (x - mean)' S^{-1} (x - mean);
# mahalanobis() inverts the covariance matrix once for all rows
d3_X1X2 <- mahalanobis(data3_X1x2, center = cm3_X1X2, cov = S3_X1X2)
# 3 x 2 plotting grid for the pairwise chi-square plots
par(mfrow = c(3, 2))
# plotmath paste() inserts no separator, so the space is part of the string
plot(qchisq(ppoints(nrow(data3_X1x2), a = 1 / 2), df = 2), sort(d3_X1X2),
     xlab = expression(paste(chi[2]^2, " Quantile")),
     ylab = "Ordered Distances",
     main = "Chi-Square Plot for (X1,X2)")
# reference line through the origin with slope 1
abline(a = 0, b = 1)
# construct the bivariate chi-square plot for X1 and X3
data3_X1x3 <- data3[, c("X1", "X3")]
cm3_X1X3 <- colMeans(data3_X1x3)
S3_X1X3 <- cov(data3_X1x3)
# squared statistical (Mahalanobis) distances (x - mean)' S^{-1} (x - mean);
# mahalanobis() inverts the covariance matrix once for all rows
d3_X1X3 <- mahalanobis(data3_X1x3, center = cm3_X1X3, cov = S3_X1X3)
# plotmath paste() inserts no separator, so the space is part of the string
plot(qchisq(ppoints(nrow(data3_X1x3), a = 1 / 2), df = 2), sort(d3_X1X3),
     xlab = expression(paste(chi[2]^2, " Quantile")),
     ylab = "Ordered Distances",
     main = "Chi-Square Plot for (X1,X3)")
# reference line through the origin with slope 1
abline(a = 0, b = 1)
# construct the bivariate chi-square plot for X1 and X4
data3_X1x4 <- data3[, c("X1", "X4")]
cm3_X1X4 <- colMeans(data3_X1x4)
S3_X1X4 <- cov(data3_X1x4)
# squared statistical (Mahalanobis) distances (x - mean)' S^{-1} (x - mean);
# mahalanobis() inverts the covariance matrix once for all rows
d3_X1X4 <- mahalanobis(data3_X1x4, center = cm3_X1X4, cov = S3_X1X4)
# plotmath paste() inserts no separator, so the space is part of the string
plot(qchisq(ppoints(nrow(data3_X1x4), a = 1 / 2), df = 2), sort(d3_X1X4),
     xlab = expression(paste(chi[2]^2, " Quantile")),
     ylab = "Ordered Distances",
     main = "Chi-Square Plot for (X1,X4)")
# reference line through the origin with slope 1
abline(a = 0, b = 1)
# construct the bivariate chi-square plot for X2 and X3
data3_X2x3 <- data3[, c("X2", "X3")]
cm3_X2x3 <- colMeans(data3_X2x3)
S3_X2x3 <- cov(data3_X2x3)
# squared statistical (Mahalanobis) distances (x - mean)' S^{-1} (x - mean);
# mahalanobis() inverts the covariance matrix once for all rows
d3_X2x3 <- mahalanobis(data3_X2x3, center = cm3_X2x3, cov = S3_X2x3)
# plotmath paste() inserts no separator, so the space is part of the string
plot(qchisq(ppoints(nrow(data3_X2x3), a = 1 / 2), df = 2), sort(d3_X2x3),
     xlab = expression(paste(chi[2]^2, " Quantile")),
     ylab = "Ordered Distances",
     main = "Chi-Square Plot for (X2,X3)")
# reference line through the origin with slope 1
abline(a = 0, b = 1)
# construct the bivariate chi-square plot for X2 and X4
data3_X2x4 <- data3[, c("X2", "X4")]
cm3_X2x4 <- colMeans(data3_X2x4)
S3_X2x4 <- cov(data3_X2x4)
# squared statistical (Mahalanobis) distances (x - mean)' S^{-1} (x - mean);
# mahalanobis() inverts the covariance matrix once for all rows
d3_X2x4 <- mahalanobis(data3_X2x4, center = cm3_X2x4, cov = S3_X2x4)
# plotmath paste() inserts no separator, so the space is part of the string
plot(qchisq(ppoints(nrow(data3_X2x4), a = 1 / 2), df = 2), sort(d3_X2x4),
     xlab = expression(paste(chi[2]^2, " Quantile")),
     ylab = "Ordered Distances",
     main = "Chi-Square Plot for (X2,X4)")
# reference line through the origin with slope 1
abline(a = 0, b = 1)
# construct the bivariate chi-square plot for X3 and X4
data3_X3x4 <- data3[, c("X3", "X4")]
cm3_X3x4 <- colMeans(data3_X3x4)
S3_X3x4 <- cov(data3_X3x4)
# squared statistical (Mahalanobis) distances (x - mean)' S^{-1} (x - mean);
# mahalanobis() inverts the covariance matrix once for all rows
d3_X3x4 <- mahalanobis(data3_X3x4, center = cm3_X3x4, cov = S3_X3x4)
# plotmath paste() inserts no separator, so the space is part of the string
plot(qchisq(ppoints(nrow(data3_X3x4), a = 1 / 2), df = 2), sort(d3_X3x4),
     xlab = expression(paste(chi[2]^2, " Quantile")),
     ylab = "Ordered Distances",
     main = "Chi-Square Plot for (X3,X4)")
# reference line through the origin with slope 1
abline(a = 0, b = 1)
The chi-square plots for (x1,x2), (x1,x3), and (x3,x4) are straight lines while (x1,x4), (x2,x3) and (x2,x4) do not appear to be straight.
Using the data in Table 4.6, examine each of the variables independence, support, benevolence, conformity, and leadership for marginal normality.
# load the table from disk
data4 <- read.table(file = "table46.txt")
# keep the first five columns and name them
data4 <- data4[, seq_len(5)]
names(data4) <- c("independence", "support", "benevolence", "conformity", "leadership")
# Shapiro-Wilk test of marginal normality for each variable (type = "SW"),
# with descriptive statistics
uniNorm(data4, type = "SW", desc = TRUE)
## $`Descriptive Statistics`
## n Mean Std.Dev Median Min Max 25th 75th Skew Kurtosis
## independence 130 15.669 5.895 15 3 31 11.25 19 0.427 -0.371
## support 130 17.077 4.185 18 6 27 14.00 20 -0.310 -0.429
## benevolence 130 18.785 5.463 19 2 29 15.00 22 -0.299 -0.215
## conformity 130 15.500 5.748 16 1 27 11.25 19 -0.161 -0.628
## leadership 130 11.731 5.192 11 2 29 8.00 15 0.669 0.124
##
## $`Shapiro-Wilk's Normality Test`
## Variable Statistic p-value Normality
## 1 independence 0.9749 0.0162 NO
## 2 support 0.9779 0.0320 NO
## 3 benevolence 0.9839 0.1269 YES
## 4 conformity 0.9846 0.1500 YES
## 5 leadership 0.9626 0.0012 NO
Only two variables are normally distributed at the 5% level, namely benevolence and conformity.
Using all five variables, check for multivariate normality
# calculate the sample mean vector
cm4 <- colMeans(data4)
# calculate the sample variance-covariance matrix
S4 <- cov(data4)
# calculate the squared statistical (Mahalanobis) distances
# (x - cm4)' S4^{-1} (x - cm4); mahalanobis() inverts S4 once for all rows
# instead of calling solve(S4) again for every row inside apply()
d4 <- mahalanobis(data4, center = cm4, cov = S4)
# construct the chi-square plot: ordered distances against chi-square (df = 5)
# quantiles at probability levels (j - 1/2) / n
# plotmath paste() inserts no separator, so the space is part of the string
plot(qchisq(ppoints(nrow(data4), a = 1 / 2), df = 5), sort(d4),
     xlab = expression(paste(chi[5]^2, " Quantile")),
     ylab = "Ordered Distances",
     main = "The Chi-Squared Plot for all Variables in Psychological Data")
# reference line through the origin with slope 1
abline(a = 0, b = 1)
The chi-square plot is not straight, which indicates that the five variables are not jointly multivariate normal.
For those variables that are not normal, determine the transformation that makes them nearly normal
# replace independence and leadership by their square roots (in place)
data4[c("independence", "leadership")] <-
  lapply(data4[c("independence", "leadership")], sqrt)
# replace support by its natural log (in place)
data4[["support"]] <- log(data4[["support"]])
# rerun the Shapiro-Wilk tests on the transformed data
uniNorm(data4, type = "SW", desc = TRUE)
## $`Descriptive Statistics`
## n Mean Std.Dev Median Min Max 25th 75th Skew
## independence 130 3.886 0.755 3.873 1.732 5.568 3.353 4.359 -0.041
## support 130 2.803 0.276 2.890 1.792 3.296 2.639 2.996 -1.018
## benevolence 130 18.785 5.463 19.000 2.000 29.000 15.000 22.000 -0.299
## conformity 130 15.500 5.748 16.000 1.000 27.000 11.250 19.000 -0.161
## leadership 130 3.341 0.758 3.317 1.414 5.385 2.828 3.873 0.126
## Kurtosis
## independence -0.316
## support 1.087
## benevolence -0.215
## conformity -0.628
## leadership -0.344
##
## $`Shapiro-Wilk's Normality Test`
## Variable Statistic p-value Normality
## 1 independence 0.9891 0.3944 YES
## 2 support 0.9264 0.0000 NO
## 3 benevolence 0.9839 0.1269 YES
## 4 conformity 0.9846 0.1500 YES
## 5 leadership 0.9914 0.6053 YES
Now all the variables are normally distributed except for support. The log transformation did not help support: its Shapiro-Wilk p-value fell from 0.0320 to 0.0000.
Consider the data on snow removal.
Comment on any outliers in a scatter plot of the original variables
# load the snow removal table and name its two columns
data5 <- read.table(file = "table32.txt")
names(data5) <- c("Duration", "Machine_Time")
# scatter plot of Duration (y axis) against Machine_Time (x axis)
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 3.1.3
ggplot(data = data5, mapping = aes(x = Machine_Time, y = Duration)) +
  geom_point(color = "blue", size = 4) +
  ggtitle("Scatter Plot of Original Variables in the Snow Data")
There appears to be one outlier at position (42.3, 17.5)
Determine the power transformation that makes “Duration” approximately normal. Construct a QQ plot of the transformed observations
# first check the untransformed Duration values with a Shapiro-Wilk test
shapiro.test(data5$Duration)
##
## Shapiro-Wilk normality test
##
## data: data5$Duration
## W = 0.917, p-value = 0.04382
# transform by taking the natural log of Duration, i.e. the lambda = 0 case
# of the power transformation (the only lambda tried here), and test again
shapiro.test(log(data5$Duration))
##
## Shapiro-Wilk normality test
##
## data: log(data5$Duration)
## W = 0.9779, p-value = 0.8416
# construct a Q-Q plot of the log-transformed Duration observations,
# then add the reference line with qqline()
qqnorm(log(data5$Duration))
qqline(log(data5$Duration))
Determine the power transformation that makes “Machine_Time” approximately normal. Construct a QQ plot of the transformed observations
# first check the untransformed Machine_Time values with a Shapiro-Wilk test
shapiro.test(data5$Machine_Time)
##
## Shapiro-Wilk normality test
##
## data: data5$Machine_Time
## W = 0.8819, p-value = 0.00755
# transform by taking the natural log of Machine_Time, i.e. the lambda = 0
# case of the power transformation (the only lambda tried here), and test again
shapiro.test(log(data5$Machine_Time))
##
## Shapiro-Wilk normality test
##
## data: log(data5$Machine_Time)
## W = 0.9589, p-value = 0.3921
# construct a Q-Q plot of the log-transformed Machine_Time observations,
# then add the reference line with qqline()
qqnorm(log(data5$Machine_Time))
qqline(log(data5$Machine_Time))
Determine the power transformation for approximate bivariate normality using (4-40)
# replace both snow variables by their natural logs (lambda = 0 for each);
# this overwrites data5 with the transformed values
# NOTE(review): no joint search over lambda is carried out here -- confirm
# whether the exercise expects the maximization in (4-40)
data5 <- log(data5)
# calculate the sample mean vector of the transformed data
cm5 <- colMeans(data5)
# calculate the sample variance-covariance matrix
S5 <- cov(data5)
# calculate the squared statistical (Mahalanobis) distances
# (x - cm5)' S5^{-1} (x - cm5); mahalanobis() inverts S5 once for all rows
# instead of calling solve(S5) again for every row inside apply()
d5 <- mahalanobis(data5, center = cm5, cov = S5)
# construct the chi-square plot: ordered distances against chi-square (df = 2)
# quantiles at probability levels (j - 1/2) / n
# plotmath paste() inserts no separator, so the space is part of the string
plot(qchisq(ppoints(nrow(data5), a = 1 / 2), df = 2), sort(d5),
     xlab = expression(paste(chi[2]^2, " Quantile")),
     ylab = "Ordered Distances",
     main = "The Chi-Squared Plot for Transformed Snow Data")
# reference line through the origin with slope 1
abline(a = 0, b = 1)