# Session housekeeping before the analysis starts.
#
# `rm(list = ls())` was removed on purpose: it only deletes global variables
# (attached packages, options and the working directory are left as they
# were), so it does not produce a clean session, and it silently wipes the
# workspace of anyone who source()s this file. Restart R for a fresh session.
if (interactive()) {
  cat("\014") # clear Console (form feed); skipped in batch runs
}
# Close every open graphics device, not just the current one. This is a no-op
# when no device is open, so the explicit `dev.cur() != 1` check is not needed.
graphics.off()
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyr)
library(ggplot2)
library(ggfortify)
library(cluster)
library(FactoMineR)
library(factoextra)
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
library(MASS)
##
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
##
## select
library(ggpubr)
# Exercise 4. In this task, you are required to analyze the Animals dataset from the MASS package.
# This dataset contains brain weight (in grams) and body weight (in kilograms) for 28 different
# animal species. The three largest animals are dinosaurs, whose measurements are obviously the
# result of scientific modeling rather than precise measurements.
# A scatter plot given below fails to describe any obvious relationship between brain weight and
# body weight variables. You are required to apply appropriate power transformations to the
# variables to obtain more interpretable plot and describe the obtained relationship.
# To this end, undertake the following tasks.
# Task-1. Check whether each of the variables has normal distribution. Your response should be based
# on an appropriate statistical test as well as smoothed histogram plots.
# Work on a local copy of MASS::Animals (body weight in kg, brain weight in g).
animals <- Animals

# Raw histograms of the untransformed variables.
hist(animals$body)
hist(animals$brain)
# Smoothed histograms (kernel density estimates), as requested by the task.
plot(density(animals$body), main = "Smoothed histogram of Animal Body data")
plot(density(animals$brain), main = "Smoothed histogram of Animal Brain data")
# The histograms of both body and brain weight are highly skewed to one side.

# Shapiro-Wilk test and normal QQ plot for body weight.
shapiro.test(animals$body)
##
## Shapiro-Wilk normality test
##
## data: animals$body
## W = 0.27831, p-value = 1.115e-10
qqnorm(animals$body, main = "QQ plot of Animal Body data", pch = 19)
qqline(animals$body)

# Shapiro-Wilk test and normal QQ plot for brain weight.
shapiro.test(animals$brain)
##
## Shapiro-Wilk normality test
##
## data: animals$brain
## W = 0.45173, p-value = 3.763e-09
# The title now names the brain data; it previously repeated the body title.
qqnorm(animals$brain, main = "QQ plot of Animal Brain data", pch = 19)
qqline(animals$brain)
# For both variables the p-value is < 0.05, so the distribution of the data is
# significantly different from a normal distribution. In other words, we
# cannot assume normality. The QQ plots and histograms confirm this too.
# Task-2. A power transformation of a variable X consists of raising X to the power lambda.
# Using an appropriate statistical test and/or plot, find best lambda values needed for
# transforming each of the variables requiring power transformation.
# Estimate the power-transformation parameter lambda for each variable from
# the Box-Cox profile log-likelihood of an intercept-only model. Each call
# draws the profile plot and returns the lambda grid (`x`) together with the
# log-likelihood values (`y`).
bc_body <- MASS::boxcox(body ~ 1, data = animals)
bc_brain <- MASS::boxcox(brain ~ 1, data = animals)
# Grid value that maximises the profile log-likelihood for each variable.
lambda_body <- bc_body$x[which.max(bc_body$y)]
lambda_brain <- bc_brain$x[which.max(bc_brain$y)]
c(body = lambda_body, brain = lambda_brain)
# In the Box-Cox family lambda = 0 denotes the log transform. The log scale
# used below is justified only if the estimates above are close to 0 (i.e. 0
# lies inside the interval marked on each profile plot) -- TODO confirm from
# the printed values when running the script.
hist(log10(animals$body))
hist(log2(animals$brain))
# Task-3. Apply power transformation and verify whether transformed variables have a normal
# distribution through statistical test as well as smoothed histogram plots.
# Smoothed histograms (kernel density estimates) of the transformed variables,
# as requested by the task.
plot(density(log10(animals$body)), main = "Smoothed histogram of log10(body)")
plot(density(log2(animals$brain)), main = "Smoothed histogram of log2(brain)")

# Shapiro-Wilk normality test on the log-transformed body weight.
shapiro.test(log10(animals$body))
##
## Shapiro-Wilk normality test
##
## data: log10(animals$body)
## W = 0.98465, p-value = 0.9433

# Shapiro-Wilk normality test on the log-transformed brain weight.
shapiro.test(log2(animals$brain))
##
## Shapiro-Wilk normality test
##
## data: log2(animals$brain)
## W = 0.95787, p-value = 0.31
# Both p-values (0.9433 and 0.31) are above 0.05, so normality is not rejected
# for either transformed variable.
# Task-4. Create a scatter plot of the transformed data. Based on the visual inspection of the plot,
# provide your interpretation of the relationship between brain weight and body weight variables.
# You may like to add an appropriate smoothed line curve to your plot to help in interpretation.
# Quick base-graphics scatter plot of the two log-transformed variables.
plot(x = log10(animals$body), y = log2(animals$brain))

# Store the transformed values as new columns next to the raw measurements.
animals_transformed <- dplyr::mutate(
  animals,
  Body = log10(body),
  Brain = log2(brain)
)

# Pearson correlation between the transformed variables.
with(animals_transformed, cor(Body, Brain))
## [1] 0.7794935

# Scatter plot with a fitted regression line, its confidence band and the
# Pearson correlation coefficient printed on the panel.
ggscatter(
  animals_transformed,
  x = "Body", y = "Brain",
  title = "Plot of Brain vs Body (transformed)",
  xlab = "Body (transformed)", ylab = "Brain (transformed)",
  add = "reg.line", conf.int = TRUE,
  cor.coef = TRUE, cor.method = "pearson"
)
## `geom_smooth()` using formula 'y ~ x'
# On the transformed scale, log2(brain) shows a fairly strong positive linear
# relationship with log10(body) in the Animals dataset.
# The correlation coefficient between them is 0.78.