Source: Jeremy Miles: Applying Regression and Correlation: A Guide for Students and Researchers
Conservative interpretation
A more liberal approach
See the book, pp. 61f.; this approach allows rating scales to be treated as interval data.
# Simulate 1000 draws from a standard normal distribution to serve as a
# reference sample for the normality checks below.
x <- rnorm(n = 1000)

# Graphical checks: histogram, boxplot, and a normal Q-Q plot with its
# reference line drawn in red.
hist(x)
boxplot(x)
qqnorm(x)
qqline(x, col = "red")

# Numeric summary. NOTE(review): describe() is not defined in this section;
# presumably psych::describe -- confirm the package is loaded beforehand.
describe(x)
## vars n mean sd median trimmed mad min max range skew kurtosis
## 1 1 1000 -0.02 1.01 -0.02 -0.02 1.03 -2.93 3.44 6.37 0 -0.13
## se
## 1 0.03
# library() stops with an error when pastecs is not installed; require() would
# only warn and return FALSE, so the failure would surface one line later as
# "could not find function stat.desc".
library(pastecs)

# basic = FALSE omits the basic statistics block; norm = TRUE adds the
# skewness, kurtosis and normality-test columns seen in the output below.
stat.desc(x, basic = FALSE, norm = TRUE)
## median mean SE.mean CI.mean.0.95 var
## -1.636978e-02 -2.213358e-02 3.195339e-02 6.270346e-02 1.021019e+00
## std.dev coef.var skewness skew.2SE kurtosis
## 1.010455e+00 -4.565257e+01 -4.871872e-04 -3.149490e-03 -1.267754e-01
## kurt.2SE normtest.W normtest.p
## -4.101853e-01 9.991895e-01 9.517207e-01
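If the absolute value of skew or kurtosis is greater than twice its standard error, then the distribution differs significantly from a normal distribution. In the stat.desc() output this corresponds to an absolute value of skew.2SE or kurt.2SE greater than 1.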
# Example salaries (13 cases).
salary <- c(
  10.4, 11.3, 12.9, 13.2, 14.6, 15.8, 15.5,
  16.1, 17, 22.7, 23.5, 24.6, 28.9
)

# Identical series except that the 8th case (16.1) is replaced by an extreme
# value of 250.6.
salaryOutlier <- replace(salary, 8, 250.6)

# Raw values next to their ordinary z-scores.
cbind(salary, scale(salary))
## salary
## [1,] 10.4 -1.23296804
## [2,] 11.3 -1.07496447
## [3,] 12.9 -0.79406923
## [4,] 13.2 -0.74140137
## [5,] 14.6 -0.49561804
## [6,] 15.8 -0.28494661
## [7,] 15.5 -0.33761447
## [8,] 16.1 -0.23227875
## [9,] 17.0 -0.07427518
## [10,] 22.7 0.92641410
## [11,] 23.5 1.06686172
## [12,] 24.6 1.25997720
## [13,] 28.9 2.01488315
# Raw values and ordinary z-scores for the series containing the 250.6 value.
cbind(salaryOutlier, scale(salaryOutlier))
## salaryOutlier
## [1,] 10.4 -0.3862138
## [2,] 11.3 -0.3723442
## [3,] 12.9 -0.3476872
## [4,] 13.2 -0.3430641
## [5,] 14.6 -0.3214892
## [6,] 15.8 -0.3029965
## [7,] 15.5 -0.3076196
## [8,] 250.6 3.3154166
## [9,] 17.0 -0.2845037
## [10,] 22.7 -0.1966632
## [11,] 23.5 -0.1843347
## [12,] 24.6 -0.1673830
## [13,] 28.9 -0.1011174
# function to calculate deleted z-scores; x = variable
# Deleted (leave-one-out) z-scores.
#
# For each element x[i], the mean and standard deviation are computed from all
# other elements (x[-i]) and then used to standardise x[i]. An extreme value
# therefore cannot inflate the mean and sd it is judged against.
#
# x: numeric vector.
# Returns: numeric vector of the same length as x (numeric(0) for empty
#   input). An element is NA when fewer than two other values remain, because
#   sd() of fewer than two values is NA.
delZ <- function(x) {
  vapply(
    seq_along(x),  # seq_along() yields no iterations for empty x, unlike 1:0
    function(i) {
      rest <- x[-i]
      (x[[i]] - mean(rest)) / sd(rest)
    },
    numeric(1)
  )
}
# Raw salary, ordinary z-score (2nd column) and deleted z-score (3rd column).
cbind(salary, scale(salary), delZ(salary))
## salary
## [1,] 10.4 -1.23296804 -1.37681296
## [2,] 11.3 -1.07496447 -1.17810930
## [3,] 12.9 -0.79406923 -0.84811143
## [4,] 13.2 -0.74140137 -0.78881176
## [5,] 14.6 -0.49561804 -0.51985772
## [6,] 15.8 -0.28494661 -0.29663944
## [7,] 15.5 -0.33761447 -0.35199374
## [8,] 16.1 -0.23227875 -0.24151137
## [9,] 17.0 -0.07427518 -0.07705835
## [10,] 22.7 0.92641410 1.00042642
## [11,] 23.5 1.06686172 1.16820780
## [12,] 24.6 1.25997720 1.41195596
## [13,] 28.9 2.01488315 2.62570720
# Raw value, ordinary z-score (2nd column) and deleted z-score (3rd column)
# for the series containing the 250.6 value.
cbind(salaryOutlier, scale(salaryOutlier), delZ(salaryOutlier))
## salaryOutlier
## [1,] 10.4 -0.3862138 -0.4033105
## [2,] 11.3 -0.3723442 -0.3886400
## [3,] 12.9 -0.3476872 -0.3626097
## [4,] 13.2 -0.3430641 -0.3577360
## [5,] 14.6 -0.3214892 -0.3350193
## [6,] 15.8 -0.3029965 -0.3155823
## [7,] 15.5 -0.3076196 -0.3204387
## [8,] 250.6 3.3154166 39.2708718
## [9,] 17.0 -0.2845037 -0.2961750
## [10,] 22.7 -0.1966632 -0.2043386
## [11,] 23.5 -0.1843347 -0.1914882
## [12,] 24.6 -0.1673830 -0.1738318
## [13,] 28.9 -0.1011174 -0.1049287
When there are outliers, ordinary z-scores are misleading, because the outlier itself inflates the mean and standard deviation that are used to calculate them. A better method is to use the deleted z-score. To calculate the deleted z-score, the first case is deleted, the mean and standard deviation for the rest of the data are calculated, and this mean and standard deviation are then used to calculate the z-score for the deleted case. The process is then repeated, removing each case, one at a time. In the example above, the salary of 250.6 has an ordinary z-score of only 3.32 but a deleted z-score of 39.27.
How much does each individual data point influence the model parameters?
# Influence of each case on the mean.
#
# For each element x[i], the mean is recomputed without that element and
# subtracted from the mean of all elements. A positive value means the case
# pulls the overall mean upwards.
#
# x: numeric vector.
# Returns: numeric vector of the same length as x (numeric(0) for empty
#   input) holding mean(x) - mean(x[-i]) for each i.
infl <- function(x) {
  # parameter estimate based on all cases; invariant across iterations
  totMean <- mean(x)
  vapply(
    seq_along(x),  # seq_along() yields no iterations for empty x, unlike 1:0
    function(i) totMean - mean(x[-i]),
    numeric(1)
  )
}
# Each salary next to the overall mean and the change in the mean that is
# attributable to that case (overall mean minus mean without the case).
cbind(salary, "mean" = mean(salary), "influence" = infl(salary))
## salary mean influence
## [1,] 10.4 17.42308 -0.58525641
## [2,] 11.3 17.42308 -0.51025641
## [3,] 12.9 17.42308 -0.37692308
## [4,] 13.2 17.42308 -0.35192308
## [5,] 14.6 17.42308 -0.23525641
## [6,] 15.8 17.42308 -0.13525641
## [7,] 15.5 17.42308 -0.16025641
## [8,] 16.1 17.42308 -0.11025641
## [9,] 17.0 17.42308 -0.03525641
## [10,] 22.7 17.42308 0.43974359
## [11,] 23.5 17.42308 0.50641026
## [12,] 24.6 17.42308 0.59807692
## [13,] 28.9 17.42308 0.95641026
# Overall mean and per-case influence on the mean for the series containing
# the 250.6 value.
cbind(salaryOutlier, "mean" = mean(salaryOutlier), "influence" = infl(salaryOutlier))
## salaryOutlier mean influence
## [1,] 10.4 35.46154 -2.0884615
## [2,] 11.3 35.46154 -2.0134615
## [3,] 12.9 35.46154 -1.8801282
## [4,] 13.2 35.46154 -1.8551282
## [5,] 14.6 35.46154 -1.7384615
## [6,] 15.8 35.46154 -1.6384615
## [7,] 15.5 35.46154 -1.6634615
## [8,] 250.6 35.46154 17.9282051
## [9,] 17.0 35.46154 -1.5384615
## [10,] 22.7 35.46154 -1.0634615
## [11,] 23.5 35.46154 -0.9967949
## [12,] 24.6 35.46154 -0.9051282
## [13,] 28.9 35.46154 -0.5467949
# Tabulate the influence of each case on the mean.
#
# x: numeric vector.
# Returns: data frame with one row per element of x (zero rows for empty
#   input) and the columns
#     x            - the case value,
#     totMean      - mean of all cases (repeated on every row),
#     meanCaseExcl - mean recomputed with that case left out,
#     diffMeans    - totMean - meanCaseExcl.
f.infl <- function(x) {
  totMean <- mean(x)
  # leave-one-out means; seq_along() yields no iterations for empty x
  excMean <- vapply(seq_along(x), function(i) mean(x[-i]), numeric(1))
  # Build the table in one step instead of seeding a dummy NA row and growing
  # the data frame row by row. unname() keeps the default 1..n row names even
  # when x carries element names.
  data.frame(
    x = unname(x),
    totMean = rep(totMean, length(x)),
    meanCaseExcl = excMean,
    diffMeans = totMean - excMean
  )
}
# Per-case table for salary: value, overall mean, mean without the case, and
# their difference.
f.infl(salary)
## x totMean meanCaseExcl diffMeans
## 1 10.4 17.42308 18.00833 -0.58525641
## 2 11.3 17.42308 17.93333 -0.51025641
## 3 12.9 17.42308 17.80000 -0.37692308
## 4 13.2 17.42308 17.77500 -0.35192308
## 5 14.6 17.42308 17.65833 -0.23525641
## 6 15.8 17.42308 17.55833 -0.13525641
## 7 15.5 17.42308 17.58333 -0.16025641
## 8 16.1 17.42308 17.53333 -0.11025641
## 9 17.0 17.42308 17.45833 -0.03525641
## 10 22.7 17.42308 16.98333 0.43974359
## 11 23.5 17.42308 16.91667 0.50641026
## 12 24.6 17.42308 16.82500 0.59807692
## 13 28.9 17.42308 16.46667 0.95641026
# Per-case table for the series containing the 250.6 value: value, overall
# mean, mean without the case, and their difference.
f.infl(salaryOutlier)
## x totMean meanCaseExcl diffMeans
## 1 10.4 35.46154 37.55000 -2.0884615
## 2 11.3 35.46154 37.47500 -2.0134615
## 3 12.9 35.46154 37.34167 -1.8801282
## 4 13.2 35.46154 37.31667 -1.8551282
## 5 14.6 35.46154 37.20000 -1.7384615
## 6 15.8 35.46154 37.10000 -1.6384615
## 7 15.5 35.46154 37.12500 -1.6634615
## 8 250.6 35.46154 17.53333 17.9282051
## 9 17.0 35.46154 37.00000 -1.5384615
## 10 22.7 35.46154 36.52500 -1.0634615
## 11 23.5 35.46154 36.45833 -0.9967949
## 12 24.6 35.46154 36.36667 -0.9051282
## 13 28.9 35.46154 36.00833 -0.5467949