Problem 4.27

Consider the radiation data in Example 4.10.

Construct a Q-Q plot for the natural logarithms of these data. Do the natural logarithms appear to be normally distributed?

# create a data matrix
radiation <- matrix(c(0.15,0.9,0.18,0.1,0.05,0.12,0.08,0.05,0.08,0.1,0.07,0.02,0.01,0.1,0.1,0.1,0.02,0.1,0.01,0.4,0.1,0.05,0.03,0.05,0.15,0.1,0.15,0.09,0.08,0.18,0.1,0.2,0.11,0.3,0.02,0.2,0.2,0.3,0.3,0.4,0.3,0.05))

colnames(radiation) <-c("Radiation")

# construct a q-q plot
qqnorm(log(radiation))
qqline(log(radiation))

# compute the marginal normality
shapiro.test(log(radiation))
## 
##  Shapiro-Wilk normality test
## 
## data:  log(radiation)
## W = 0.9615, p-value = 0.1665

The Q-Q plot is approximately linear, so the natural logarithms of the radiation measurements appear to be normally distributed. The Shapiro-Wilk test agrees: with a p-value of 0.1665, normality is not rejected at the 5% level.

For lambda = 1/4, the Q-Q plot correlation coefficient used to assess normality is greater than it is for lambda = 0, so the fourth-root transformation conforms to normality at least as well as the natural log.
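
As a rough cross-check of that claim (not part of the original solution), the Q-Q plot correlation coefficient can be computed directly for both transformations; rq() below is a small helper defined here for that purpose.

# helper (defined here, not in the original code): correlation between the
# ordered observations and the corresponding standard normal quantiles
rq <- function(x) {
  n <- length(x)
  q <- qnorm((1:n - 0.5) / n)
  cor(sort(x), q)
}
rq(radiation^(1/4))   # lambda = 1/4 (fourth root)
rq(log(radiation))    # lambda = 0 (natural log)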

Problem 4.29

Given the air-pollution data in Table 1.5, examine the pair (NO2, O3) for bivariate normality.

# read the data into R
pollution_data <- read.table("table15.txt")

# examine columns five and six
data2 <- (pollution_data[,5:6])
colnames(data2) <- c("NO2", "O3")

# calculate the sample mean vector
cm <- colMeans(data2)
cm
##       NO2        O3 
## 10.047619  9.404762
# calculate the variance-covariance Matrix
S <- cov(data2)
S
##           NO2        O3
## NO2 11.363531  3.126597
## O3   3.126597 30.978513
# calculate the statistical squared distances
d <- apply(data2, MARGIN = 1, function(x)
  t(x - cm) %*% solve(S) %*% (x - cm))
d
##  [1]  0.4606524  0.6592206  2.3770610  1.6282902  0.4135364  0.4760726
##  [7]  1.1848895 10.6391792  0.1388339  0.8162468  1.3566301  0.6228096
## [13]  5.6494392  0.3159498  0.4135364  0.1224973  0.8987982  4.7646873
## [19]  3.0089122  0.6592206  2.7741416  1.0360061  0.7874152  3.4437748
## [25]  6.1488606  1.0360061  0.1388339  0.8856041  0.1379719  2.2488867
## [31]  0.1901188  0.4606524  1.1471939  7.0857237  1.4584229  0.1224973
## [37]  1.8984708  2.7782596  8.4730649  0.6370218  0.7032485  1.8013611
# construct a chi-square plot of the ordered distances
par(mfrow = c(1,1))
plot(qchisq((1:nrow(data2)-1/2)/nrow(data2),df=2), sort(d),
     xlab = expression(paste(chi[2]^2, "Quantile")),
     ylab = "Ordered Distances",
     main = "Chi-square Plot for (NO2, O3)"); abline(a=0, b=1)

Of the 42 squared distances, 26 are less than or equal to the chi-square 50th percentile, qchisq(0.50, df = 2) = 1.39, so the proportion of observations falling within the approximate 50% probability contour is 26/42, or about 62%, somewhat larger than the expected 50%.
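
A quick numerical check of those figures (a sketch, reusing the squared distances d computed above):

# observations inside the approximate 50% probability contour satisfy
# d <= qchisq(0.50, df = 2), which is about 1.39
sum(d <= qchisq(0.50, df = 2))    # number of such observations
mean(d <= qchisq(0.50, df = 2))   # proportion (26/42, about 0.62)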

Problem 4.33

(a)

Examine the marginal normality of the observations for the data in Table 4.3.

# read the data into R
data3 <- read.table("table43.txt")
colnames(data3)<-c("X1", "X2", "X3", "X4","X5")
library(MVN)
## Warning: package 'MVN' was built under R version 3.1.3
uniNorm(data3, type = "SW", desc = TRUE)
## $`Descriptive Statistics`
##     n     Mean Std.Dev  Median     Min     Max     25th     75th  Skew
## X1 30 1906.100 324.987 1863.00 1325.00 2983.00 1715.250 2057.250 1.038
## X2 30 1749.533 318.607 1680.00 1170.00 2794.00 1595.500 1888.750 1.144
## X3 30 1509.133 303.178 1466.00 1002.00 2412.00 1295.750 1623.750 0.980
## X4 30 1724.967 322.844 1674.50 1176.00 2581.00 1520.250 1880.750 0.598
## X5 30    3.867   3.734    2.64    0.13   16.85    1.415    5.043 1.786
##    Kurtosis
## X1    2.036
## X2    1.950
## X3    0.997
## X4   -0.046
## X5    3.140
## 
## $`Shapiro-Wilk's Normality Test`
##    Variable Statistic   p-value Normality
## 1    X1        0.9307    0.0512    YES   
## 2    X2        0.9127    0.0175    NO    
## 3    X3        0.9326    0.0575    YES   
## 4    X4        0.9613    0.3337    YES   
## 5    X5        0.7999    0.0001    NO

At the 5% level, X1, X3, and X4 appear marginally normal, while X2 and X5 do not.
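
As a supplementary check (not part of the original output), marginal normal Q-Q plots for the five variables can be drawn in one pass:

# one normal Q-Q plot per variable, arranged on a 2 x 3 grid
par(mfrow = c(2, 3))
invisible(lapply(colnames(data3), function(v) {
  qqnorm(data3[[v]], main = paste("Normal Q-Q Plot:", v))
  qqline(data3[[v]])
}))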

(b)

Examine the observations for bivariate normality, pair by pair, for the variables X1 through X4.

# keep columns one to four (drop X5)
data3 <- (data3[,-5])

# construct the bivariate chi-square plot for X1 and X2
data3_X1x2 <- (data3[,c("X1","X2")])
cm3_X1X2 <- colMeans(data3_X1x2)
S3_X1X2 <- cov(data3_X1x2)
d3_X1X2 <- apply(data3_X1x2, MARGIN = 1, function(x)
  t(x - cm3_X1X2) %*% solve(S3_X1X2) %*% (x - cm3_X1X2))

par(mfrow = c(3,2))
plot(qchisq((1:nrow(data3_X1x2)-1/2)/nrow(data3_X1x2),df=2), sort(d3_X1X2),
     xlab = expression(paste(chi[2]^2, "Quantile")),
     ylab = "Ordered Distances",
     main = "Chi-Square Plot for (X1,X2)"); abline(a=0, b=1)

# construct the bivariate chi-square plot for X1 and X3
data3_X1x3 <- (data3[,c("X1","X3")])
cm3_X1X3 <- colMeans(data3_X1x3)
S3_X1X3 <- cov(data3_X1x3)
d3_X1X3 <- apply(data3_X1x3, MARGIN = 1, function(x)
  t(x - cm3_X1X3) %*% solve(S3_X1X3) %*% (x - cm3_X1X3))

plot(qchisq((1:nrow(data3_X1x3)-1/2)/nrow(data3_X1x3),df=2), sort(d3_X1X3),
     xlab = expression(paste(chi[2]^2, "Quantile")),
     ylab = "Ordered Distances",
     main = "Chi-Square Plot for (X1,X3)"); abline(a=0, b=1)

# construct the bivariate chi-square plot for X1 and X4
data3_X1x4 <- (data3[,c("X1","X4")])
cm3_X1X4 <- colMeans(data3_X1x4)
S3_X1X4 <- cov(data3_X1x4)
d3_X1X4 <- apply(data3_X1x4, MARGIN = 1, function(x)
  t(x - cm3_X1X4) %*% solve(S3_X1X4) %*% (x - cm3_X1X4))

plot(qchisq((1:nrow(data3_X1x4)-1/2)/nrow(data3_X1x4),df=2), sort(d3_X1X4),
     xlab = expression(paste(chi[2]^2, "Quantile")),
     ylab = "Ordered Distances",
     main = "Chi-Square Plot for (X1,X4)"); abline(a=0, b=1)

# construct the bivariate chi-square plot for X2 and X3
data3_X2x3 <- (data3[,c("X2","X3")])
cm3_X2x3 <- colMeans(data3_X2x3)
S3_X2x3 <- cov(data3_X2x3)
d3_X2x3 <- apply(data3_X2x3, MARGIN = 1, function(x)
  t(x - cm3_X2x3) %*% solve(S3_X2x3) %*% (x - cm3_X2x3))


plot(qchisq((1:nrow(data3_X2x3)-1/2)/nrow(data3_X2x3),df=2), sort(d3_X2x3),
     xlab = expression(paste(chi[2]^2, "Quantile")),
     ylab = "Ordered Distances",
     main = "Chi-Square Plot for (X2,X3)"); abline(a=0, b=1)

# construct the bivariate chi-square plot for X2 and X4
data3_X2x4 <- (data3[,c("X2","X4")])
cm3_X2x4 <- colMeans(data3_X2x4)
S3_X2x4 <- cov(data3_X2x4)
d3_X2x4 <- apply(data3_X2x4, MARGIN = 1, function(x)
  t(x - cm3_X2x4) %*% solve(S3_X2x4) %*% (x - cm3_X2x4))


plot(qchisq((1:nrow(data3_X2x4)-1/2)/nrow(data3_X2x4),df=2), sort(d3_X2x4),
     xlab = expression(paste(chi[2]^2, "Quantile")),
     ylab = "Ordered Distances",
     main = "Chi-Square Plot for (X2,X4)"); abline(a=0, b=1)

# construct the bivariate chi-square plot for X3 and X4
data3_X3x4 <- (data3[,c("X3","X4")])
cm3_X3x4 <- colMeans(data3_X3x4)
S3_X3x4 <- cov(data3_X3x4)
d3_X3x4 <- apply(data3_X3x4, MARGIN = 1, function(x)
  t(x - cm3_X3x4) %*% solve(S3_X3x4) %*% (x - cm3_X3x4))


plot(qchisq((1:nrow(data3_X3x4)-1/2)/nrow(data3_X3x4),df=2), sort(d3_X3x4),
     xlab = expression(paste(chi[2]^2, "Quantile")),
     ylab = "Ordered Distances",
     main = "Chi-Square Plot for (X3,X4)"); abline(a=0, b=1)

The chi-square plots for (X1,X2), (X1,X3), and (X3,X4) are approximately linear, suggesting bivariate normality for those pairs, while the plots for (X1,X4), (X2,X3), and (X2,X4) depart noticeably from a straight line.
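
The six plots above can also be produced in a single loop. The sketch below uses the base mahalanobis() function, which computes the same squared generalized distances as the explicit matrix products, and should reproduce the plots shown above.

# loop over all pairs of the four remaining variables with combn()
par(mfrow = c(3, 2))
invisible(apply(combn(colnames(data3), 2), 2, function(p) {
  x  <- as.matrix(data3[, p])
  d2 <- mahalanobis(x, colMeans(x), cov(x))   # (x - xbar)' S^{-1} (x - xbar)
  n  <- nrow(x)
  plot(qchisq((1:n - 0.5) / n, df = 2), sort(d2),
       xlab = expression(paste(chi[2]^2, " Quantile")),
       ylab = "Ordered Distances",
       main = paste0("Chi-Square Plot for (", p[1], ",", p[2], ")"))
  abline(a = 0, b = 1)
}))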

Problem 4.39

(a)

Using the data in Table 4.6, examine each of the variables independence, support, benevolence, conformity, and leadership for marginal normality.

# read the data into R
data4 <- read.table("table46.txt")

# keep columns one to five
data4 <- (data4[,1:5])
colnames(data4) <- c("independence", "support", "benevolence", "conformity", "leadership")

# test the marginal normality for each variable
uniNorm(data4, type = "SW", desc = TRUE)
## $`Descriptive Statistics`
##                n   Mean Std.Dev Median Min Max  25th 75th   Skew Kurtosis
## independence 130 15.669   5.895     15   3  31 11.25   19  0.427   -0.371
## support      130 17.077   4.185     18   6  27 14.00   20 -0.310   -0.429
## benevolence  130 18.785   5.463     19   2  29 15.00   22 -0.299   -0.215
## conformity   130 15.500   5.748     16   1  27 11.25   19 -0.161   -0.628
## leadership   130 11.731   5.192     11   2  29  8.00   15  0.669    0.124
## 
## $`Shapiro-Wilk's Normality Test`
##       Variable Statistic   p-value Normality
## 1 independence    0.9749    0.0162    NO    
## 2   support       0.9779    0.0320    NO    
## 3 benevolence     0.9839    0.1269    YES   
## 4  conformity     0.9846    0.1500    YES   
## 5  leadership     0.9626    0.0012    NO

Only two of the variables, benevolence and conformity, appear marginally normal at the 5% level; independence, support, and leadership do not.

(b)

Using all five variables, check for multivariate normality.

# calculate the sample mean vector
cm4 <- colMeans(data4)

# calculate the variance-covariance Matrix
S4 <- cov(data4)

# calculate the statistical squared distances
d4 <- apply(data4, MARGIN = 1, function(x)
  t(x - cm4) %*% solve(S4) %*% (x - cm4))

# construct the chi-square plots
plot(qchisq((1:nrow(data4)-1/2)/nrow(data4),df=5), sort(d4),
     xlab = expression(paste(chi[5]^2, "Quantile")),
     ylab = "Ordered Distances",
     main = "The Chi-Squared Plot for all Variables in Psychological Data"); abline(a=0, b=1)

The chi-square plot deviates noticeably from the 45-degree line, indicating that the five variables are not jointly multivariate normal.
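
A quick numerical companion to the plot (a sketch): under joint normality, roughly half of the squared distances should fall below the chi-square 50th percentile.

# proportion of squared distances inside the approximate 50% contour
mean(d4 <= qchisq(0.50, df = 5))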

(c)

For those variables that are not normal, determine the transformation that makes them nearly normal.
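
Before applying the transformations chosen below, the Box-Cox lambda for each non-normal variable can be estimated as a guide. This is a sketch, not part of the original solution; it assumes the MASS package is available, and best_lambda() is a helper defined here. An estimated lambda near 0.5 supports a square-root transformation, and one near 0 supports a log.

# estimate the Box-Cox lambda for each variable that failed the Shapiro-Wilk test
library(MASS)
best_lambda <- function(y) {
  bc <- boxcox(y ~ 1, lambda = seq(-2, 2, 0.05), plotit = FALSE)
  bc$x[which.max(bc$y)]   # lambda maximizing the profile log-likelihood
}
sapply(data4[, c("independence", "support", "leadership")], best_lambda)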

# take the square root of the variables independence and leadership
data4$independence <- sqrt(data4$independence)
data4$leadership <- sqrt(data4$leadership)

# take the natural log of variable support
data4$support <- log(data4$support)

# test for normality
uniNorm(data4, type = "SW", desc = TRUE)
## $`Descriptive Statistics`
##                n   Mean Std.Dev Median   Min    Max   25th   75th   Skew
## independence 130  3.886   0.755  3.873 1.732  5.568  3.353  4.359 -0.041
## support      130  2.803   0.276  2.890 1.792  3.296  2.639  2.996 -1.018
## benevolence  130 18.785   5.463 19.000 2.000 29.000 15.000 22.000 -0.299
## conformity   130 15.500   5.748 16.000 1.000 27.000 11.250 19.000 -0.161
## leadership   130  3.341   0.758  3.317 1.414  5.385  2.828  3.873  0.126
##              Kurtosis
## independence   -0.316
## support         1.087
## benevolence    -0.215
## conformity     -0.628
## leadership     -0.344
## 
## $`Shapiro-Wilk's Normality Test`
##       Variable Statistic   p-value Normality
## 1 independence    0.9891    0.3944    YES   
## 2   support       0.9264    0.0000    NO    
## 3 benevolence     0.9839    0.1269    YES   
## 4  conformity     0.9846    0.1500    YES   
## 5  leadership     0.9914    0.6053    YES

After these transformations, all of the variables except support appear normally distributed; the natural log does not bring support to normality (p-value 0.0000).

Problem 4.41

Consider the data on snow removal.

(a)

Comment on any outliers in a scatter plot of the original variables.

# read the data into R
data5 <- read.table("table32.txt")
colnames(data5) <- c("Duration", "Machine_Time")

# scatter plot of the data
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 3.1.3
ggplot(data5, aes(Machine_Time, Duration)) +
  geom_point(color = "blue", size = 4) +
  ggtitle("Scatter Plot of Original Variables in the Snow Data")

There appears to be one outlier, at roughly (Machine_Time, Duration) = (42.3, 17.5).
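
To pull out that observation (a sketch, assuming it is the only point with Machine_Time above 40):

# the row corresponding to the apparent outlier
subset(data5, Machine_Time > 40)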

(b)

Determine the power transformation that makes “Duration” approximately normal, and construct a Q-Q plot of the transformed observations.

# first check for normality before transformation
shapiro.test(data5$Duration)
## 
##  Shapiro-Wilk normality test
## 
## data:  data5$Duration
## W = 0.917, p-value = 0.04382
# transform by taking the natural log of Duration, i.e. lambda = 0
shapiro.test(log(data5$Duration))
## 
##  Shapiro-Wilk normality test
## 
## data:  log(data5$Duration)
## W = 0.9779, p-value = 0.8416
# construct a QQ plot of the transformed Duration observations 
qqnorm(log(data5$Duration))
qqline(log(data5$Duration))

The Shapiro-Wilk p-value rises from 0.04382 to 0.8416 after the log transformation, so ln(Duration) is approximately normal.
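
As a cross-check of the choice lambda = 0 (a sketch, not part of the original solution), a small grid of Box-Cox lambdas can be scanned and the Shapiro-Wilk statistics compared; the same scan applies to Machine_Time in part (c).

# Shapiro-Wilk W for Duration under several Box-Cox transformations
lambdas <- c(-1, -0.5, 0, 0.25, 0.5, 1)
W <- sapply(lambdas, function(l) {
  y <- if (l == 0) log(data5$Duration) else (data5$Duration^l - 1) / l
  shapiro.test(y)$statistic
})
names(W) <- lambdas
W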

(c)

Determine the power transformation that makes “Machine_Time” approximately normal, and construct a Q-Q plot of the transformed observations.

# first check for normality before transformation
shapiro.test(data5$Machine_Time)
## 
##  Shapiro-Wilk normality test
## 
## data:  data5$Machine_Time
## W = 0.8819, p-value = 0.00755
# transform by taking the natural log of Machine_Time, i.e. lambda = 0
shapiro.test(log(data5$Machine_Time))
## 
##  Shapiro-Wilk normality test
## 
## data:  log(data5$Machine_Time)
## W = 0.9589, p-value = 0.3921
# construct a QQ plot of the transformed Machine_Time data
qqnorm(log(data5$Machine_Time))
qqline(log(data5$Machine_Time))

The Shapiro-Wilk p-value rises from 0.00755 to 0.3921 after the log transformation, so ln(Machine_Time) is approximately normal.

(d)

Determine the power transformation for approximate bivariate normality using (4-40).
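
Equation (4-40) can also be maximized numerically. The sketch below uses car::powerTransform for this, assuming the car package is installed (it is not loaded elsewhere in this document); estimated lambdas near zero would support the log transformation applied next.

# jointly estimate the power transformations for both variables
library(car)
summary(powerTransform(as.matrix(data5)))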

# apply the log transformation (lambda = 0 for both variables)
data5 <- log(data5)

# calculate the sample mean vector
cm5 <- colMeans(data5)

# calculate the variance-covariance Matrix
S5 <- cov(data5)

# calculate the statistical squared distances
d5 <- apply(data5, MARGIN = 1, function(x)
  t(x - cm5) %*% solve(S5) %*% (x - cm5))

# construct the chi-square plots
plot(qchisq((1:nrow(data5)-1/2)/nrow(data5),df=2), sort(d5),
     xlab = expression(paste(chi[2]^2, "Quantile")),
     ylab = "Ordered Distances",
     main = "The Chi-Squared Plot for Transformed Snow Data"); abline(a=0, b=1)