library(ggplot2)
## Warning: package 'ggplot2' was built under R version 3.6.3
library(moments)
# Print the whole trees data frame. It is not defined in this script;
# presumably it is R's built-in dataset of the same name.
trees
# First six rows of the data frame
head(trees, n = 6L)
# Last six rows of the data frame
tail(trees, n = 6L)
# Compact structure: dimensions, column names, types and leading values
str(object = trees)
## 'data.frame': 31 obs. of 3 variables:
## $ Girth : num 8.3 8.6 8.8 10.5 10.7 10.8 11 11 11.1 11.2 ...
## $ Height: num 70 65 63 72 81 83 66 75 80 75 ...
## $ Volume: num 10.3 10.3 10.2 16.4 18.8 19.7 15.6 18.2 22.6 19.9 ...
# Minimum, quartiles, median, mean and maximum of every column
summary(object = trees)
## Girth Height Volume
## Min. : 8.30 Min. :63 Min. :10.20
## 1st Qu.:11.05 1st Qu.:72 1st Qu.:19.40
## Median :12.90 Median :76 Median :24.20
## Mean :13.25 Mean :76 Mean :30.17
## 3rd Qu.:15.25 3rd Qu.:80 3rd Qu.:37.30
## Max. :20.60 Max. :87 Max. :77.00
Alternatively, to get the number of variables and observations together with a detailed summary of each variable:
# Attach Hmisc, which supplies describe(). The "##" lines that follow are the
# recorded console messages emitted while attaching it (its dependencies and
# the base functions it masks).
library(Hmisc)
## Loading required package: lattice
## Loading required package: survival
## Loading required package: Formula
##
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:base':
##
## format.pval, units
# Per-variable report for the whole data frame. As the recorded output shows,
# each variable gets its count, missing/distinct counts, mean, Gmd, a set of
# quantiles (.05 to .95) and the five lowest and highest values.
describe(trees)
## trees
##
## 3 Variables 31 Observations
## --------------------------------------------------------------------------------
## Girth
## n missing distinct Info Mean Gmd .05 .10
## 31 0 27 0.999 13.25 3.561 8.70 10.50
## .25 .50 .75 .90 .95
## 11.05 12.90 15.25 17.90 18.00
##
## lowest : 8.3 8.6 8.8 10.5 10.7, highest: 17.3 17.5 17.9 18.0 20.6
## --------------------------------------------------------------------------------
## Height
## n missing distinct Info Mean Gmd .05 .10
## 31 0 21 0.994 76 7.312 64.5 66.0
## .25 .50 .75 .90 .95
## 72.0 76.0 80.0 83.0 85.5
##
## lowest : 63 64 65 66 69, highest: 82 83 85 86 87
## --------------------------------------------------------------------------------
## Volume
## n missing distinct Info Mean Gmd .05 .10
## 31 0 30 1 30.17 17.89 10.3 15.6
## .25 .50 .75 .90 .95
## 19.4 24.2 37.3 55.4 57.0
##
## lowest : 10.2 10.3 15.6 16.4 18.2, highest: 51.5 55.4 55.7 58.3 77.0
## --------------------------------------------------------------------------------
Alternatively, display a detailed summary of each variable individually:
# Girth: minimum, quartiles, median, mean and maximum, kept in trees_girth
# and then echoed
trees_girth <- summary(trees[["Girth"]])
trees_girth
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 8.30 11.05 12.90 13.25 15.25 20.60
# Spread of Girth: variance and standard deviation, formatted to 2 decimals
sprintf(
  "Variance and standard deviation of trees$Girth is %#.2f and %#.2f",
  var(trees[["Girth"]]),
  sd(trees[["Girth"]])
)
## [1] "Variance and standard deviation of trees$Girth is 9.85 and 3.14"
# Shape of Girth: skewness() and kurtosis() are expected to come from the
# moments package attached at the top of the script
sprintf(
  "Skewness and Kurtosis of trees$Girth is %#.2f and %#.2f",
  skewness(trees[["Girth"]]),
  kurtosis(trees[["Girth"]])
)
## [1] "Skewness and Kurtosis of trees$Girth is 0.53 and 2.44"
# Height: minimum, quartiles, median, mean and maximum, kept in trees_height
# and then echoed
trees_height <- summary(trees[["Height"]])
trees_height
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 63 72 76 76 80 87
# Spread of Height: variance and standard deviation, formatted to 2 decimals
sprintf(
  "Variance and standard deviation of trees$Height is %#.2f and %#.2f",
  var(trees[["Height"]]),
  sd(trees[["Height"]])
)
## [1] "Variance and standard deviation of trees$Height is 40.60 and 6.37"
# Shape of Height: skewness() and kurtosis() are expected to come from the
# moments package attached at the top of the script
sprintf(
  "Skewness and Kurtosis of trees$Height is %#.2f and %#.2f",
  skewness(trees[["Height"]]),
  kurtosis(trees[["Height"]])
)
## [1] "Skewness and Kurtosis of trees$Height is -0.37 and 2.43"
# Volume: minimum, quartiles, median, mean and maximum, kept in trees_volume
# and then echoed
trees_volume <- summary(trees$Volume)
trees_volume
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 10.20 19.40 24.20 30.17 37.30 77.00
# Spread of Volume: variance and standard deviation, formatted to 2 decimals.
# The message is labelled trees$Volume to match the column actually measured
# (it previously said trees$Height, a copy-paste slip).
sprintf("Variance and standard deviation of trees$Volume is %#.2f and %#.2f", var(trees$Volume), sd(trees$Volume))
## [1] "Variance and standard deviation of trees$Volume is 270.20 and 16.44"
# Shape of Volume: skewness() and kurtosis() are expected to come from the
# moments package attached at the top of the script. Label corrected to
# trees$Volume for the same reason as above.
sprintf("Skewness and Kurtosis of trees$Volume is %#.2f and %#.2f", skewness(trees$Volume), kurtosis(trees$Volume))
## [1] "Skewness and Kurtosis of trees$Volume is 1.06 and 3.47"
# Base-graphics histogram of Girth with default breaks and labels
hist(trees$Girth)
# The same distribution drawn with ggplot2, using 1.5-wide bins
library(ggplot2)
ggplot(data = trees, mapping = aes(x = Girth)) +
  geom_histogram(fill = "steelblue", colour = "darkblue", binwidth = 1.5)
# Histogram with a dashed red vertical line marking the mean of Girth
ggplot(data = trees, mapping = aes(x = Girth)) +
  geom_histogram(fill = "steelblue", colour = "black", binwidth = 1.5) +
  geom_vline(
    mapping = aes(xintercept = mean(Girth)),
    linetype = "dashed", colour = "red", size = 1
  )
# Histogram on the density scale, overlaid with a translucent density curve
ggplot(data = trees, mapping = aes(x = Girth)) +
  geom_histogram(
    mapping = aes(y = ..density..),
    fill = "steelblue", colour = "black", binwidth = 1.5
  ) +
  geom_density(fill = "blue", alpha = 0.2)
# Base-graphics histogram of Height with default breaks and labels
hist(trees$Height)
# The same distribution drawn with ggplot2.
# Height appears to be recorded in whole numbers (see the str()/describe()
# output earlier), so the 1.5-wide bins copied from the Girth plots would
# alternately cover one and two integer values and produce an artificial
# saw-tooth. Use 2-wide bins whose edges sit on half-integers
# (boundary = 0.5) so every bin covers exactly two whole values and no
# observation falls on a bin edge.
library(ggplot2)
ggplot(trees, aes(x = Height)) +
  geom_histogram(binwidth = 2, boundary = 0.5,
                 colour = "darkblue", fill = "steelblue")
# Histogram with a dashed red vertical line marking the mean of Height
ggplot(trees, aes(x = Height)) +
  geom_histogram(binwidth = 2, boundary = 0.5,
                 colour = "black", fill = "steelblue") +
  geom_vline(aes(xintercept = mean(Height)),
             color = "red", linetype = "dashed", size = 1)
# Histogram on the density scale, overlaid with a translucent density curve
ggplot(trees, aes(x = Height)) +
  geom_histogram(aes(y = ..density..), binwidth = 2, boundary = 0.5,
                 colour = "black", fill = "steelblue") +
  geom_density(alpha = 0.2, fill = "blue")
# Base-graphics histogram of Volume with default breaks and labels
hist(trees$Volume)
# The same distribution drawn with ggplot2.
# Volume spans 10.2 to 77.0 across only 31 observations, so the 1.5-wide bins
# copied from the Girth plots would give roughly 45 bins, most of them empty
# or holding a single tree. 5-wide bins (about 14 bins) show the shape of the
# distribution instead.
library(ggplot2)
ggplot(trees, aes(x = Volume)) +
  geom_histogram(binwidth = 5, colour = "darkblue", fill = "steelblue")
# Histogram with a dashed red vertical line marking the mean of Volume
ggplot(trees, aes(x = Volume)) +
  geom_histogram(binwidth = 5, colour = "black", fill = "steelblue") +
  geom_vline(aes(xintercept = mean(Volume)),
             color = "red", linetype = "dashed", size = 1)
# Histogram on the density scale, overlaid with a translucent density curve
ggplot(trees, aes(x = Volume)) +
  geom_histogram(aes(y = ..density..), binwidth = 5,
                 colour = "black", fill = "steelblue") +
  geom_density(alpha = 0.2, fill = "blue")
Density plots are used to view the distribution of each variable.
# ---- Girth ----
# Kernel density estimate drawn with the default title
plot(density(trees$Girth))
# Keep the estimate in d, redraw it with a custom title, then shade the area
# under the curve in blue with a black outline
d <- density(trees$Girth)
plot(d, main = "Kernel Density of Girth")
polygon(d, border = "black", col = "blue")

# ---- Height ----
# Kernel density estimate drawn with the default title
plot(density(trees$Height))
# Same three steps as above; d is overwritten with the Height estimate
d <- density(trees$Height)
plot(d, main = "Kernel Density of Height")
polygon(d, border = "black", col = "blue")

# ---- Volume ----
# Kernel density estimate drawn with the default title
plot(density(trees$Volume))
# Same three steps as above; d is overwritten with the Volume estimate
d <- density(trees$Volume)
plot(d, main = "Kernel Density of Volume")
polygon(d, border = "black", col = "blue")
A boxplot shows the shape of a distribution, its central value, and its variability.
A rug shows the individual data points alongside the boxplot; it adds a one-dimensional plot of the data to the existing plot.
# Girth: horizontal yellow boxplot, with a rug along the bottom axis marking
# each individual observation
boxplot(trees$Girth, main = "Boxplot of Girth", col = "yellow",
        horizontal = TRUE)
rug(x = trees$Girth, side = 1)

# Height: horizontal green boxplot with the same rug treatment
boxplot(trees$Height, main = "Boxplot of Height", col = "green",
        horizontal = TRUE)
rug(x = trees$Height, side = 1)

# Volume: horizontal red boxplot with the same rug treatment
boxplot(trees$Volume, main = "Boxplot of Volume", col = "red",
        horizontal = TRUE)
rug(x = trees$Volume, side = 1)
Combined Boxplot of all the 3 variables
The boxplot of Height suggests an approximately normal distribution, Girth is slightly positively skewed, and Volume is positively skewed.
# One figure with a horizontal box per variable, drawn on a shared axis and
# using the same colours as the individual boxplots
boxplot(
  trees$Girth, trees$Height, trees$Volume,
  names = c("Girth", "Height", "Volume"),
  col = c("yellow", "green", "red"),
  main = "Combined Boxplot of all the 3 variables",
  horizontal = TRUE
)
Normal probability plots are used to assess whether a dataset is normally distributed and, more specifically, to identify outliers or other unusual values.
# Normal Q-Q plot of Girth, with the reference line drawn in colour 2
qqnorm(trees$Girth, main = "Girth of trees", xlab = "", ylab = "Girth")
qqline(trees$Girth, col = 2)
# Normal Q-Q plot of Volume, with the reference line drawn in colour 2
qqnorm(trees$Volume, main = "Volume of trees", xlab = "", ylab = "Volume")
qqline(trees$Volume, col = 2)
From the visualizations above, we can see that the normal probability plots of Girth and Volume are skewed right, i.e., each plot shows a strongly non-linear pattern. Specifically, each shows a quadratic pattern in which all the points lie below a reference line drawn between the first and last points.
# Normal Q-Q plot of Height, with the reference line drawn in colour 2
qqnorm(trees$Height, main = "Height of trees", xlab = "", ylab = "Height")
qqline(trees$Height, col = 2)
The normal probability plot of Height is short-tailed, i.e., it shows a non-linear pattern.