# Reset the R session so the script starts from a clean state
rm(list = ls()) # Remove every object that ls() lists in the current environment
gc() # Run garbage collection; the memory-usage table it returns is echoed below
## used (Mb) gc trigger (Mb) limit (Mb) max used (Mb)
## Ncells 525594 28.1 1167051 62.4 NA 669291 35.8
## Vcells 967507 7.4 8388608 64.0 32768 1840395 14.1
cat("\f") # Emit a form feed; presumably intended to clear the RStudio console -- confirm for other front ends
graphics.off() # Close all open graphics devices (clears plots). Can use par(mfrow=c(1,1))
# load packages
library(psych)
library(e1071)
library(ggplot2)
##
## Attaching package: 'ggplot2'
## The following objects are masked from 'package:psych':
##
## %+%, alpha
library(gridExtra)
library(patchwork)
The default value for skewness is often set to 0 because skewness is a measure of the asymmetry in the distribution of data. A skewness value of 0 indicates that the data is perfectly symmetrical, with a balanced distribution of values on both sides of the mean (or median), and there is no pronounced skew in either direction.
In a symmetric distribution:
The mean, median, and mode are all equal.
The left and right tails of the distribution have similar lengths.
The data is evenly spread around the central point, forming a bell-shaped curve in the case of a normal distribution.
By setting the default skewness value to 0, it provides a convenient reference point for understanding the degree of asymmetry in a dataset:
If the skewness value is greater than 0, it indicates a right-skewed (positively skewed) distribution, where the tail on the right side (positive side) of the central point is longer or more pronounced.
If the skewness value is less than 0, it indicates a left-skewed (negatively skewed) distribution, where the tail on the left side (negative side) of the central point is longer or more pronounced.
If the skewness value is exactly 0, it indicates a symmetric distribution, as there is no skew in either direction.
Setting the default skewness value to 0 makes it easier to interpret and compare skewness values across different datasets. It provides a clear baseline for understanding whether the data is skewed or not. Researchers and analysts can then focus on deviations from this baseline to assess the degree and direction of skewness in the data.
# Right-skewed example: most values are small and a few very large values
# stretch the right-hand tail.
right_skewed_data <- c(5, 10, 15, 20, 30, 50, 80, 150, 300, 500)

# Summary statistics, including the skew and kurtosis columns
describe(right_skewed_data)
## vars n mean sd median trimmed mad min max range skew kurtosis se
## X1 1 10 116 162.78 40 81.88 48.18 5 500 495 1.33 0.34 51.48

# Plot the kernel density estimate. The y-axis of a density() plot is the
# estimated density, not a count, so it is labelled "Density".
plot(
  x = density(right_skewed_data),
  main = "Right-Skewed Data",
  xlab = "Value",
  ylab = "Density"
)

# Calculate skewness; a positive value indicates a longer right tail
skewness_right <- skewness(right_skewed_data)
cat("Skewness (Right-Skewed):", skewness_right, "\n")
## Skewness (Right-Skewed): 1.331783
Strong positive skewness is visible in the density plot, and the skewness statistic (1.33) is greater than 1.
# Left-skewed example: most values are large and a few small values stretch
# the left-hand tail.
left_skewed_data <- c(500, 490, 450, 390, 380, 370, 310, 280, 210, 130, 5)

# Summary statistics, including the skew and kurtosis columns
describe(left_skewed_data)
## vars n mean sd median trimmed mad min max range skew kurtosis
## X1 1 11 319.55 154.38 370 334.44 133.43 5 500 495 -0.63 -0.84
## se
## X1 46.55

# Plot the kernel density estimate. The y-axis of a density() plot is the
# estimated density, not a count, so it is labelled "Density".
plot(
  x = density(left_skewed_data),
  main = "Left-Skewed Data",
  xlab = "Value",
  ylab = "Density"
)

# Calculate skewness; a negative value indicates a longer left tail
skewness_left <- skewness(left_skewed_data)
cat("Skewness (Left-Skewed):", skewness_left, "\n")
## Skewness (Left-Skewed): -0.631565
Moderate negative skewness is visible in the density plot, and the skewness statistic (-0.63) lies between -1 and -0.5.
# Symmetric example: the values mirror each other around the centre (20),
# so mean and median coincide and the skewness is 0.
symmetric_data <- c(10, 15, 20, 25, 30, 20, 20, 30, 25, 20, 15, 10)

# Summary statistics, including the skew and kurtosis columns
describe(symmetric_data)
## vars n mean sd median trimmed mad min max range skew kurtosis se
## X1 1 12 20 6.74 20 20 7.41 10 30 20 0 -1.29 1.95

# Plot the kernel density estimate. The y-axis of a density() plot is the
# estimated density, not a count, so it is labelled "Density".
# xlim is fixed to 0-40 so the curve is shown centred around 20.
plot(
  x = density(symmetric_data),
  main = "Symmetric Data",
  xlab = "Value",
  ylab = "Density",
  xlim = c(0, 40)
)

# Calculate skewness; 0 indicates no asymmetry in either direction
skewness_symmetric <- skewness(symmetric_data)
cat("Skewness (Symmetric Data):", skewness_symmetric, "\n")
## Skewness (Symmetric Data): 0
Skewness is a measure of the asymmetry in the distribution of data. The interpretation of skewness as “low,” “average,” or “high” depends on the magnitude of the skewness value:
Low Skewness (Near Zero):
Skewness values that are close to zero, typically within the range of -0.5 to 0.5, are considered to indicate low or negligible skewness.
In a dataset with low skewness, the distribution is relatively symmetric, with a roughly balanced distribution of values on both sides of the mean.
Moderate Skewness (Between -0.5 and -1 or 0.5 and 1):
Skewness values between -0.5 and -1 (negative skewness) or between 0.5 and 1 (positive skewness) are considered to indicate moderate skewness.
In this case, the distribution is somewhat skewed, but the skewness is not extreme. There is a noticeable asymmetry in the distribution, but it may still be reasonably interpretable.
High Skewness (Beyond -1 or 1):
Skewness values that are significantly below -1 (strong negative skewness) or significantly above 1 (strong positive skewness) indicate a high degree of skewness.
In datasets with high skewness, the distribution is highly asymmetric, with a long tail on one side. Extreme skewness can make the data challenging to analyze, and it may require special consideration in statistical analyses.
It’s important to note that the interpretation of skewness can vary depending on the context and the specific field of study. What is considered “low,” “average,” or “high” skewness can be somewhat subjective and may depend on the goals of the analysis and the data distribution’s characteristics. Additionally, in some cases, the interpretation of skewness may be relative to the skewness of other datasets being compared.
Kurtosis is a statistical measure that describes the distribution of data in terms of its tails or outliers compared to a normal distribution. Kurtosis values can be categorized as follows:
High Kurtosis (Leptokurtic):
A high kurtosis value indicates a distribution with a peak that is more pronounced and sharper than a normal distribution.
It implies that the data is more concentrated around the mean.
This can result in a distribution that appears more peaked and less spread out.
Low Kurtosis (Platykurtic):
A low kurtosis value indicates a distribution with a flatter peak compared to a normal distribution.
It suggests that the data is more spread out.
This can result in a distribution that looks flatter and less peaked than a normal distribution.
Mesokurtic (Zero Kurtosis):
A kurtosis value of 0 (or close to 0) suggests a distribution that is very close to a normal distribution (bell-shaped).
The data has tails and a peak similar to those of a normal distribution.
This is sometimes referred to as “mesokurtic” and indicates that the data does not exhibit excessive outliers or extreme behavior in terms of its tails.
A kurtosis of greater than 2 indicates a distribution that is too peaked.
# Open the help page for rep(), which is used below to repeat the central value
?rep

# Generate data with high kurtosis: 150 identical central values plus a few
# far-out observations on either side
high_kurtosis_data <- c(30, 40, rep(x = 51, times = 150), 60, 70)
describe(high_kurtosis_data)
## vars n mean sd median trimmed mad min max range skew kurtosis se
## X1 1 154 50.97 2.56 51 51 0 30 70 40 -1.13 49.14 0.21

# Estimate the density once and reuse it for both plots
high_kurtosis_density <- density(high_kurtosis_data)

# Plot the data with base graphics. The y-axis of a density plot is the
# estimated density, not a count, so it is labelled "Density".
plot(
  x = high_kurtosis_density,
  main = "High Kurtosis Data (Leptokurtic)",
  xlab = "Value",
  ylab = "Density"
)

# Plot the data using ggplot2. The axis limits are the same as those of the
# other kurtosis plots so that the three shapes can be compared directly.
lepto <-
  ggplot(
    data = data.frame(
      x = high_kurtosis_density$x,
      y = high_kurtosis_density$y
    ),
    mapping = aes(x = x, y = y)
  ) +
  geom_line() +
  labs(
    title = "High Kurtosis Data (Leptokurtic)",
    x = "Value",
    y = "Density"
  ) +
  ylim(0, .5) +
  xlim(-50, 150)
lepto

# Calculate kurtosis (the same value as the kurtosis column of describe() above)
kurtosis_high <- kurtosis(high_kurtosis_data)
cat("Kurtosis (High Kurtosis Data):", kurtosis_high, "\n")
## Kurtosis (High Kurtosis Data): 49.14095
A kurtosis of less than −2 indicates a distribution that is too flat.
# Generate data with low kurtosis: the mass sits in three separated clumps
# (2, 51 and 100) instead of one central peak
low_kurtosis_data <- c(
  2, 2, 2, 2, 2, 2, 2,
  30, 40,
  51, 51, 51, 51, 51,
  60, 70,
  100, 100, 100, 100, 100, 100, 100
)
describe(low_kurtosis_data)
## vars n mean sd median trimmed mad min max range skew kurtosis se
## X1 1 23 50.83 39.67 51 50.79 72.65 2 100 98 0.01 -1.58 8.27

# Estimate the density once and reuse it for both plots
low_kurtosis_density <- density(low_kurtosis_data)

# Plot the data with base graphics. The y-axis of a density plot is the
# estimated density, not a count, so it is labelled "Density".
plot(
  x = low_kurtosis_density,
  main = "Low Kurtosis Data (Platykurtic)",
  xlab = "Value",
  ylab = "Density"
)

# Build the ggplot2 version (stored only; it is displayed later in the script).
# The axis limits are the same as those of the other kurtosis plots.
# NOTE(review): xlim() discards density points that fall outside [-50, 150];
# this looks like the source of the "Removed ... rows" warning shown whenever
# platy is printed.
platy <-
  ggplot(
    data = data.frame(
      x = low_kurtosis_density$x,
      y = low_kurtosis_density$y
    ),
    mapping = aes(x = x, y = y)
  ) +
  geom_line() +
  labs(
    title = "Low Kurtosis Data (Platykurtic)",
    x = "Value",
    y = "Density"
  ) +
  ylim(0, .5) +
  xlim(-50, 150)

# Calculate kurtosis (the same value as the kurtosis column of describe() above)
kurtosis_low <- kurtosis(low_kurtosis_data)
cat("Kurtosis (Low Kurtosis Data):", kurtosis_low, "\n")
## Kurtosis (Low Kurtosis Data): -1.57661
The (excess) kurtosis is almost 0. Note that 3 is already subtracted when we specify type = 1, so a normal distribution scores approximately 0 rather than 3.
# Generate data with zero kurtosis (approximately normal): 10,000 draws from
# a normal distribution with mean 51 and standard deviation 1.
# NOTE(review): no set.seed() is called, so the sample -- and therefore the
# printed statistics below -- will differ slightly from run to run.
normal_data <- rnorm(n = 10000, mean = 51, sd = 1)
describe(normal_data)
## vars n mean sd median trimmed mad min max range skew kurtosis se
## X1 1 10000 51 1 51 51 0.99 46.91 54.69 7.77 -0.03 0 0.01

# Estimate the density once and reuse it for both plots
normal_density <- density(normal_data)

# Plot the data with base graphics. The y-axis of a density plot is the
# estimated density, not a count, so it is labelled "Density".
plot(
  x = normal_density,
  main = "Mesokurtic (Normal) Data",
  xlab = "Value",
  ylab = "Density"
)

# Create a density plot using ggplot2. The axis limits are the same as those
# of the other kurtosis plots so that the three shapes can be compared directly.
meso <-
  ggplot(
    data = data.frame(x = normal_density$x, y = normal_density$y),
    mapping = aes(x = x, y = y)
  ) +
  geom_line() +
  labs(
    title = "Mesokurtic (Normal) Data",
    x = "Value",
    y = "Density"
  ) +
  ylim(0, .5) +
  xlim(-50, 150)
meso

# Calculate kurtosis
kurtosis_normal <- e1071::kurtosis(
  x = normal_data,
  type = 1 # excess kurtosis - 3 is already subtracted
)
cat("Kurtosis (Mesokurtic Data):", kurtosis_normal, "\n")
## Kurtosis (Mesokurtic Data): 0.001769622
Let's display the three charts generated with the ggplot2 package.
# Show each stored ggplot object on its own
lepto
platy
## Warning: Removed 31 rows containing missing values (`geom_line()`).
meso
# Arrange the three plots in a single row of three columns with gridExtra
grid.arrange(grobs = list(lepto, platy, meso), ncol = 3)
## Warning: Removed 31 rows containing missing values (`geom_line()`).
# Compose the same three plots into one figure with patchwork's `+` operator
combined_plot <- lepto + platy + meso
combined_plot
## Warning: Removed 31 rows containing missing values (`geom_line()`).