(a) Print the trees dataset

library(ggplot2)
## Warning: package 'ggplot2' was built under R version 3.6.3
library(moments)
print(trees) # print the complete built-in trees dataset
head(trees, n = 6L) # first 6 rows of the trees dataset
tail(trees, n = 6L) # last 6 rows of the trees dataset
str(trees) # compact structure: dimensions, column types and leading values
## 'data.frame':    31 obs. of  3 variables:
##  $ Girth : num  8.3 8.6 8.8 10.5 10.7 10.8 11 11 11.1 11.2 ...
##  $ Height: num  70 65 63 72 81 83 66 75 80 75 ...
##  $ Volume: num  10.3 10.3 10.2 16.4 18.8 19.7 15.6 18.2 22.6 19.9 ...

(b) Print the summary measures for each variable in the dataset

print(summary(trees)) # min, quartiles, median, mean and max for every column
##      Girth           Height       Volume     
##  Min.   : 8.30   Min.   :63   Min.   :10.20  
##  1st Qu.:11.05   1st Qu.:72   1st Qu.:19.40  
##  Median :12.90   Median :76   Median :24.20  
##  Mean   :13.25   Mean   :76   Mean   :30.17  
##  3rd Qu.:15.25   3rd Qu.:80   3rd Qu.:37.30  
##  Max.   :20.60   Max.   :87   Max.   :77.00

Alternatively, to get the count of variables and observations together with the appropriate summaries for each variable, use describe() from the Hmisc package:

library(Hmisc)
## Loading required package: lattice
## Loading required package: survival
## Loading required package: Formula
## 
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:base':
## 
##     format.pval, units
# Per-variable report: n, missing, distinct values, mean and selected quantiles
Hmisc::describe(trees)
## trees 
## 
##  3  Variables      31  Observations
## --------------------------------------------------------------------------------
## Girth 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##       31        0       27    0.999    13.25    3.561     8.70    10.50 
##      .25      .50      .75      .90      .95 
##    11.05    12.90    15.25    17.90    18.00 
## 
## lowest :  8.3  8.6  8.8 10.5 10.7, highest: 17.3 17.5 17.9 18.0 20.6
## --------------------------------------------------------------------------------
## Height 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##       31        0       21    0.994       76    7.312     64.5     66.0 
##      .25      .50      .75      .90      .95 
##     72.0     76.0     80.0     83.0     85.5 
## 
## lowest : 63 64 65 66 69, highest: 82 83 85 86 87
## --------------------------------------------------------------------------------
## Volume 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##       31        0       30        1    30.17    17.89     10.3     15.6 
##      .25      .50      .75      .90      .95 
##     19.4     24.2     37.3     55.4     57.0 
## 
## lowest : 10.2 10.3 15.6 16.4 18.2, highest: 51.5 55.4 55.7 58.3 77.0
## --------------------------------------------------------------------------------

Alternatively, display a detailed summary of each variable:

# Detailed summary of Girth: minimum, quartiles, median, mean and maximum
trees_girth <- summary(trees$Girth)
print(trees_girth)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    8.30   11.05   12.90   13.25   15.25   20.60
# Spread of Girth: sample variance and standard deviation
with(trees, sprintf(
  "Variance and standard deviation of trees$Girth is %#.2f and %#.2f",
  var(Girth), sd(Girth)
))
## [1] "Variance and standard deviation of trees$Girth is 9.85 and 3.14"
# Shape of Girth: skewness() and kurtosis() come from the moments package
with(trees, sprintf(
  "Skewness and Kurtosis of trees$Girth is %#.2f and %#.2f",
  skewness(Girth), kurtosis(Girth)
))
## [1] "Skewness and Kurtosis of trees$Girth is 0.53 and 2.44"
# Detailed summary of Height: minimum, quartiles, median, mean and maximum
trees_height <- summary(trees$Height)
print(trees_height)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##      63      72      76      76      80      87
# Spread of Height: sample variance and standard deviation
with(trees, sprintf(
  "Variance and standard deviation of trees$Height is %#.2f and %#.2f",
  var(Height), sd(Height)
))
## [1] "Variance and standard deviation of trees$Height is 40.60 and 6.37"
# Shape of Height: skewness() and kurtosis() come from the moments package
with(trees, sprintf(
  "Skewness and Kurtosis of trees$Height is %#.2f and %#.2f",
  skewness(Height), kurtosis(Height)
))
## [1] "Skewness and Kurtosis of trees$Height is -0.37 and 2.43"
# Detailed summary of Volume: minimum, quartiles, median, mean and maximum
trees_volume <- summary(trees$Volume)
trees_volume
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   10.20   19.40   24.20   30.17   37.30   77.00
# Spread of Volume: sample variance and standard deviation.
# The message names trees$Volume so the label matches the values it reports.
sprintf(
  "Variance and standard deviation of trees$Volume is %#.2f and %#.2f",
  var(trees$Volume), sd(trees$Volume)
)
## [1] "Variance and standard deviation of trees$Volume is 270.20 and 16.44"
# Shape of Volume: skewness() and kurtosis() come from the moments package
sprintf(
  "Skewness and Kurtosis of trees$Volume is %#.2f and %#.2f",
  skewness(trees$Volume), kurtosis(trees$Volume)
)
## [1] "Skewness and Kurtosis of trees$Volume is 1.06 and 3.47"

(c) Create Histograms and Density Plots

# Base-graphics histogram of Girth
hist(trees$Girth)

# The same histogram drawn with ggplot2, which gives explicit control over
# bin width and colours
library(ggplot2)

ggplot(data = trees, mapping = aes(x = Girth)) +
  geom_histogram(
    binwidth = 1.5,
    colour = "darkblue",
    fill = "steelblue"
  )

# Histogram with a dashed red vertical line at the mean Girth
ggplot(data = trees, mapping = aes(x = Girth)) +
  geom_histogram(
    binwidth = 1.5,
    colour = "black",
    fill = "steelblue"
  ) +
  geom_vline(
    mapping = aes(xintercept = mean(Girth)),
    colour = "red",
    linetype = "dashed",
    size = 1
  )

# Histogram on the density scale with a kernel density curve overlaid
ggplot(data = trees, mapping = aes(x = Girth)) +
  geom_histogram(
    mapping = aes(y = ..density..),
    binwidth = 1.5,
    colour = "black",
    fill = "steelblue"
  ) +
  geom_density(alpha = 0.2, fill = "blue")

# Base-graphics histogram of Height
hist(trees$Height)

# The same histogram drawn with ggplot2, which gives explicit control over
# bin width and colours
library(ggplot2)

ggplot(data = trees, mapping = aes(x = Height)) +
  geom_histogram(
    binwidth = 1.5,
    colour = "darkblue",
    fill = "steelblue"
  )

# Histogram with a dashed red vertical line at the mean Height
ggplot(data = trees, mapping = aes(x = Height)) +
  geom_histogram(
    binwidth = 1.5,
    colour = "black",
    fill = "steelblue"
  ) +
  geom_vline(
    mapping = aes(xintercept = mean(Height)),
    colour = "red",
    linetype = "dashed",
    size = 1
  )

# Histogram on the density scale with a kernel density curve overlaid
ggplot(data = trees, mapping = aes(x = Height)) +
  geom_histogram(
    mapping = aes(y = ..density..),
    binwidth = 1.5,
    colour = "black",
    fill = "steelblue"
  ) +
  geom_density(alpha = 0.2, fill = "blue")

# Base-graphics histogram of Volume
hist(trees$Volume)

# The same histogram drawn with ggplot2, which gives explicit control over
# bin width and colours
library(ggplot2)

# NOTE(review): binwidth = 1.5 matches the Girth and Height plots, but Volume
# spans 10.2 to 77.0, so the bins are narrow for 31 observations -- consider
# whether a wider bin would read better.
ggplot(data = trees, mapping = aes(x = Volume)) +
  geom_histogram(
    binwidth = 1.5,
    colour = "darkblue",
    fill = "steelblue"
  )

# Histogram with a dashed red vertical line at the mean Volume
ggplot(data = trees, mapping = aes(x = Volume)) +
  geom_histogram(
    binwidth = 1.5,
    colour = "black",
    fill = "steelblue"
  ) +
  geom_vline(
    mapping = aes(xintercept = mean(Volume)),
    colour = "red",
    linetype = "dashed",
    size = 1
  )

# Histogram on the density scale with a kernel density curve overlaid
ggplot(data = trees, mapping = aes(x = Volume)) +
  geom_histogram(
    mapping = aes(y = ..density..),
    binwidth = 1.5,
    colour = "black",
    fill = "steelblue"
  ) +
  geom_density(alpha = 0.2, fill = "blue")

Density Plots

Density Plots are used to view the distribution of each variable

# Density plot of Girth

# Plain kernel density curve with default title and labels
plot(density(trees$Girth))

# Same estimate kept in `d`, plotted with a custom title and then filled
d <- density(trees$Girth)
plot(x = d, main = "Kernel Density of Girth")
polygon(x = d, border = "black", col = "blue")

# Density plot of Height

# Plain kernel density curve with default title and labels
plot(density(trees$Height))

# Same estimate kept in `d`, plotted with a custom title and then filled
d <- density(trees$Height)
plot(x = d, main = "Kernel Density of Height")
polygon(x = d, border = "black", col = "blue")

# Density plot of Volume

# Plain kernel density curve with default title and labels
plot(density(trees$Volume))

# Same estimate kept in `d`, plotted with a custom title and then filled
d <- density(trees$Volume)
plot(x = d, main = "Kernel Density of Volume")
polygon(x = d, border = "black", col = "blue")

(d) Create Boxplots

Boxplot is used to show the shape of the distribution, its central value, and its variability

Rug is used to see the data points across the box plot; it adds a 1-d plot of the data to the plot.

# Horizontal boxplot of Girth
boxplot(
  trees$Girth,
  main = "Boxplot of Girth",
  col = "yellow",
  horizontal = TRUE
)
# Tick marks along the bottom axis show the individual observations
rug(trees$Girth, side = 1)

# Horizontal boxplot of Height
boxplot(
  trees$Height,
  main = "Boxplot of Height",
  col = "green",
  horizontal = TRUE
)
# Tick marks along the bottom axis show the individual observations
rug(trees$Height, side = 1)

# Horizontal boxplot of Volume
boxplot(
  trees$Volume,
  main = "Boxplot of Volume",
  col = "red",
  horizontal = TRUE
)
# Tick marks along the bottom axis show the individual observations
rug(trees$Volume, side = 1)

Combined Boxplot of all the 3 variables

The boxplot of Height shows an approximately normal (symmetric) distribution, Girth is slightly positively skewed, and Volume is positively skewed.

# Combined boxplot: Girth, Height and Volume drawn together in one plot
boxplot(
  trees$Girth, trees$Height, trees$Volume,
  names = c("Girth", "Height", "Volume"),
  col = c("yellow", "green", "red"),
  horizontal = TRUE,
  main = "Combined Boxplot of all the 3 variables"
)

(e) Normal Probability Plots

Normal probability plots are used to assess whether a dataset is normally distributed and, more specifically, to identify outliers or other unusual values.

# Normal Q-Q plot of Girth with a reference line in palette colour 2
qqnorm(trees$Girth, main = "Girth of trees", xlab = "", ylab = "Girth")
qqline(trees$Girth, col = 2)

# Normal Q-Q plot of Volume with a reference line in palette colour 2
qqnorm(trees$Volume, main = "Volume of trees", xlab = "", ylab = "Volume")
qqline(trees$Volume, col = 2)

From the visualizations above, we can see that the normal probability plots of Girth and Volume are skewed right, i.e., the plots show a strongly non-linear pattern. Specifically, they show a quadratic pattern in which all the points are below a reference line drawn between the first and last points.

# Normal Q-Q plot of Height with a reference line in palette colour 2
qqnorm(trees$Height, main = "Height of trees", xlab = "", ylab = "Height")
qqline(trees$Height, col = 2)

The normal probability plot of Height is short-tailed, i.e., it shows a non-linear pattern.