##########################################################################
#                                                                        #
#  R example code to calculate summary statistics for multivariate data #
#                                                                        #
##########################################################################


# load 'HSAUR3' package:
library(HSAUR3)
## Warning: package 'HSAUR3' was built under R version 4.4.1
## Loading required package: tools
# load USairpollution data set:
data(USairpollution)
# See the first 6 data rows:
head(USairpollution)
##             SO2 temp manu popul wind precip predays
## Albany       46 47.6   44   116  8.8  33.36     135
## Albuquerque  11 56.8   46   244  8.9   7.77      58
## Atlanta      24 61.5  368   497  9.1  48.34     115
## Baltimore    47 55.0  625   905  9.6  41.31     111
## Buffalo      11 47.1  391   463 12.4  36.11     166
## Charleston   31 55.2   35    71  6.5  40.75     148
# The observations are 41 cities (data were collected in 1981).
# These are the 7 variables:
# SO2: SO2 (sulphur dioxide) content of air in micrograms per cubic metre;
# temp: average annual temperature in degrees Fahrenheit;
# manu: number of manufacturing enterprises employing 20 or more workers;
# popul: population size (1970 census) in thousands;
# wind: average annual wind speed in miles per hour;
# precip: average annual precipitation in inches;
# predays: average number of days with precipitation per year.
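
# A quick look at the structure of the data frame (dimensions and variable types):

str(USairpollution)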

#attach(USairpollution) # attaching the data frame


############################################
############################################

# Getting the sample mean vector:

colMeans(USairpollution)
##        SO2       temp       manu      popul       wind     precip    predays 
##  30.048780  55.763415 463.097561 608.609756   9.443902  36.769024 113.902439
# Getting the sample covariance matrix:

var(USairpollution)
##                 SO2        temp        manu       popul        wind
## SO2      550.947561  -73.560671   8527.7201   6711.9945   3.1753049
## temp     -73.560671   52.239878   -773.9713   -262.3496  -3.6113537
## manu    8527.720122 -773.971341 317502.8902 311718.8140 191.5481098
## popul   6711.994512 -262.349634 311718.8140 335371.8939 175.9300610
## wind       3.175305   -3.611354    191.5481    175.9301   2.0410244
## precip    15.001799   32.862988   -215.0199   -178.0529  -0.2185311
## predays  229.929878  -82.426159   1968.9598    645.9860   6.2143902
##               precip    predays
## SO2       15.0017988  229.92988
## temp      32.8629884  -82.42616
## manu    -215.0199024 1968.95976
## popul   -178.0528902  645.98598
## wind      -0.2185311    6.21439
## precip   138.5693840  154.79290
## predays  154.7929024  702.59024
# Rounding off:  round(var(USairpollution),digits=2)

# Note the diagonal elements of this are the sample variances for the 7 variables.
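
# A quick sketch of that check: the diagonal of the covariance matrix should
# match the variable-by-variable sample variances:

all.equal(diag(var(USairpollution)), sapply(USairpollution, var))  # should be TRUE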

############################################
############################################

# Getting the sample correlation matrix:

cor(USairpollution)
##                 SO2        temp        manu       popul        wind      precip
## SO2      1.00000000 -0.43360020  0.64476873  0.49377958  0.09469045  0.05429434
## temp    -0.43360020  1.00000000 -0.19004216 -0.06267813 -0.34973963  0.38625342
## manu     0.64476873 -0.19004216  1.00000000  0.95526935  0.23794683 -0.03241688
## popul    0.49377958 -0.06267813  0.95526935  1.00000000  0.21264375 -0.02611873
## wind     0.09469045 -0.34973963  0.23794683  0.21264375  1.00000000 -0.01299438
## precip   0.05429434  0.38625342 -0.03241688 -0.02611873 -0.01299438  1.00000000
## predays  0.36956363 -0.43024212  0.13182930  0.04208319  0.16410559  0.49609671
##             predays
## SO2      0.36956363
## temp    -0.43024212
## manu     0.13182930
## popul    0.04208319
## wind     0.16410559
## precip   0.49609671
## predays  1.00000000
# Rounding off:  round(cor(USairpollution),digits=2)

############################################
############################################

# Calculations to show the relationship between the covariance and correlation matrices:

# This is the long way...

my.S <- var(USairpollution)

D.minus.12 <- diag( 1/sqrt(diag(my.S)) )  # D^(-1/2): diagonal matrix of 1/(standard deviation) values

my.R <- D.minus.12 %*% my.S %*% D.minus.12  # R = D^(-1/2) S D^(-1/2)

my.R
##             [,1]        [,2]        [,3]        [,4]        [,5]        [,6]
## [1,]  1.00000000 -0.43360020  0.64476873  0.49377958  0.09469045  0.05429434
## [2,] -0.43360020  1.00000000 -0.19004216 -0.06267813 -0.34973963  0.38625342
## [3,]  0.64476873 -0.19004216  1.00000000  0.95526935  0.23794683 -0.03241688
## [4,]  0.49377958 -0.06267813  0.95526935  1.00000000  0.21264375 -0.02611873
## [5,]  0.09469045 -0.34973963  0.23794683  0.21264375  1.00000000 -0.01299438
## [6,]  0.05429434  0.38625342 -0.03241688 -0.02611873 -0.01299438  1.00000000
## [7,]  0.36956363 -0.43024212  0.13182930  0.04208319  0.16410559  0.49609671
##             [,7]
## [1,]  0.36956363
## [2,] -0.43024212
## [3,]  0.13182930
## [4,]  0.04208319
## [5,]  0.16410559
## [6,]  0.49609671
## [7,]  1.00000000
# Getting the correlation matrix (if you are given the covariance matrix) the quick way:

cov2cor(my.S)
##                 SO2        temp        manu       popul        wind      precip
## SO2      1.00000000 -0.43360020  0.64476873  0.49377958  0.09469045  0.05429434
## temp    -0.43360020  1.00000000 -0.19004216 -0.06267813 -0.34973963  0.38625342
## manu     0.64476873 -0.19004216  1.00000000  0.95526935  0.23794683 -0.03241688
## popul    0.49377958 -0.06267813  0.95526935  1.00000000  0.21264375 -0.02611873
## wind     0.09469045 -0.34973963  0.23794683  0.21264375  1.00000000 -0.01299438
## precip   0.05429434  0.38625342 -0.03241688 -0.02611873 -0.01299438  1.00000000
## predays  0.36956363 -0.43024212  0.13182930  0.04208319  0.16410559  0.49609671
##             predays
## SO2      0.36956363
## temp    -0.43024212
## manu     0.13182930
## popul    0.04208319
## wind     0.16410559
## precip   0.49609671
## predays  1.00000000
## Be careful about the difference between the 'cor' and 'cov2cor' functions!

# NOTE:  If you have the DATA matrix and you want to get the sample correlation matrix, then use
# the 'cor' function, e.g.:
# cor(data_matrix)

# If you have the COVARIANCE matrix and you want to calculate the correlation matrix from this, then use
# the 'cov2cor' function, e.g.:
# cov2cor(covar_matrix)
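
# A quick sketch verifying that the two routes agree (the "long way" result my.R
# lacks dimnames, so we strip them from cov2cor's output before that comparison):

all.equal(cov2cor(my.S), cor(USairpollution))   # should be TRUE
all.equal(unname(cov2cor(my.S)), my.R)          # should be TRUE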

Distances

############################################
############################################

# Getting distance matrix (Euclidean distances) between raw observations

dis <- dist(USairpollution)

dist2full <- function(ds){
  n <- attr(ds, "Size")
  full <- matrix(0, n, n)
  full[lower.tri(full)] <- ds
  dist.mat <- full + t(full)
  rownames(dist.mat) <- colnames(dist.mat) <- attributes(ds)$Labels
  return(dist.mat)
}
dis.matrix <- dist2full(dis)
D <- round(dis.matrix, digits=2)

# Getting distance matrix (Euclidean distances) between SCALED observations

std <- sapply(USairpollution, sd)  # finding standard deviations of variables

USairpollution.std <- sweep(USairpollution,2,std,FUN="/")  # dividing each variable by its standard deviation

dis <- dist(USairpollution.std)

dis.matrix <- dist2full(dis)
DD <- round(dis.matrix, digits=2)
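
# An equivalent sketch using base R's scale():  centering does not change
# Euclidean distances, so the default scale() (center, then divide by sd)
# yields the same distance matrix as the sweep() approach above:

all.equal(as.vector(dist(scale(USairpollution))), as.vector(dis))  # should be TRUE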

# Finding the smallest and largest distances within a distance matrix or distance object,
# and also printing which pairs of objects these correspond to:
# (Make sure dist2full function above has been copied into workspace first)

# Copy and paste the following pick.smallest.largest function into R:

############
##
#
pick.smallest.largest <- function(ds, howmany=1){
  if (is.matrix(ds)==TRUE) {
    if (is.vector(rownames(ds))==TRUE) {
      ds.Labels <- rownames(ds)
    } else {
      ds.Labels <- 1:nrow(ds)
    }
    #print(ds.Labels)
    disLT <- ds[lower.tri(ds)]
    ds2 <- ds
  } else {
    if (is.vector(attributes(ds)$Labels)==TRUE) {
      ds.Labels <- attributes(ds)$Labels
    } else {
      ds.Labels <- 1:attributes(ds)$Size
    }
    ds2 <- dist2full(ds)
    disLT <- ds2[lower.tri(ds2)]
  }
  smallest.few <- sort(disLT)[1:howmany]
  largest.few <- sort(disLT, decreasing=T)[1:howmany]
  print(paste("smallest", howmany, "distances are: ")); print(smallest.few)
  object.pairs <- object.pairs2 <- matrix(0, nrow=howmany, ncol=2)
  for (i in 1:howmany){
    object.pairs[i,] <- which(ds2==smallest.few[i], arr.ind=T)[2,]
  }
  print("these smallest distances correspond to the pairs of objects given in the rows below:"); print(object.pairs)
  object.pairs.labels <- matrix(ds.Labels[object.pairs], ncol=2)
  print("the labels for these pairs of objects are given in the rows below:"); print(object.pairs.labels)
  print(paste("largest", howmany, "distances are: ")); print(largest.few)
  for (i in 1:howmany){
    object.pairs2[i,] <- which(ds2==largest.few[i], arr.ind=T)[2,]
  }
  print("these largest distances correspond to the pairs of objects given in the rows below:"); print(object.pairs2)
  object.pairs2.labels <- matrix(ds.Labels[object.pairs2], ncol=2)
  print("the labels for these pairs of objects are given in the rows below:"); print(object.pairs2.labels)
}
#
##
############

# Using the pick.smallest.largest function:
pick.smallest.largest(dis,3)
## [1] "smallest 3 distances are: "
## [1] 0.5231313 0.5487501 0.6334920
## [1] "these smallest distances correspond to the pairs of objects given in the rows below:"
##      [,1] [,2]
## [1,]   18   27
## [2,]   13   29
## [3,]   26   34
## [1] "the labels for these pairs of objects given in the rows below:"
##      [,1]           [,2]         
## [1,] "Jacksonville" "New Orleans"
## [2,] "Des Moines"   "Omaha"      
## [3,] "Nashville"    "Richmond"   
## [1] "largest 3 distances are: "
## [1] 10.249000  9.701221  9.693640
## [1] "these largest distances correspond to the pairs of objects given in the rows below:"
##      [,1] [,2]
## [1,]    7   31
## [2,]    7   23
## [3,]    2    7
## [1] "the labels for these pairs of objects are given in the rows below:"
##      [,1]          [,2]     
## [1,] "Chicago"     "Phoenix"
## [2,] "Chicago"     "Miami"  
## [3,] "Albuquerque" "Chicago"
pick.smallest.largest(dis.matrix,3)
## [1] "smallest 3 distances are: "
## [1] 0.5231313 0.5487501 0.6334920
## [1] "these smallest distances correspond to the pairs of objects given in the rows below:"
##      [,1] [,2]
## [1,]   18   27
## [2,]   13   29
## [3,]   26   34
## [1] "the labels for these pairs of objects given in the rows below:"
##      [,1]           [,2]         
## [1,] "Jacksonville" "New Orleans"
## [2,] "Des Moines"   "Omaha"      
## [3,] "Nashville"    "Richmond"   
## [1] "largest 3 distances are: "
## [1] 10.249000  9.701221  9.693640
## [1] "these largest distances correspond to the pairs of objects given in the rows below:"
##      [,1] [,2]
## [1,]    7   31
## [2,]    7   23
## [3,]    2    7
## [1] "the labels for these pairs of objects are given in the rows below:"
##      [,1]          [,2]     
## [1,] "Chicago"     "Phoenix"
## [2,] "Chicago"     "Miami"  
## [3,] "Albuquerque" "Chicago"
# A measure of distance between the variables:

1 - abs(cor(USairpollution))
##               SO2      temp       manu      popul      wind    precip   predays
## SO2     0.0000000 0.5663998 0.35523127 0.50622042 0.9053095 0.9457057 0.6304364
## temp    0.5663998 0.0000000 0.80995784 0.93732187 0.6502604 0.6137466 0.5697579
## manu    0.3552313 0.8099578 0.00000000 0.04473065 0.7620532 0.9675831 0.8681707
## popul   0.5062204 0.9373219 0.04473065 0.00000000 0.7873562 0.9738813 0.9579168
## wind    0.9053095 0.6502604 0.76205317 0.78735625 0.0000000 0.9870056 0.8358944
## precip  0.9457057 0.6137466 0.96758312 0.97388127 0.9870056 0.0000000 0.5039033
## predays 0.6304364 0.5697579 0.86817070 0.95791681 0.8358944 0.5039033 0.0000000
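
# If you want to treat this as a formal distance matrix (e.g., for clustering
# the variables later), a sketch converting it to a 'dist' object:

var.dis <- as.dist(1 - abs(cor(USairpollution)))
# plot(hclust(var.dis))   # e.g., a hierarchical clustering of the 7 variables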

############################################
############################################

# Normal Q-Q plots for the first four variables in the data set, considered separately:
# This checks normality of the marginal distributions

par(mfrow=c(2,2))  # Setting up for a 2 by 2 array of plots

qqnorm(USairpollution[,1], ylab="Ordered Observations", main = "Normal Q-Q Plot, Sulphur Dioxide")
qqline(USairpollution[,1])
qqnorm(USairpollution[,2], ylab="Ordered Observations", main = "Normal Q-Q Plot, Temperature")
qqline(USairpollution[,2])
qqnorm(USairpollution[,3], ylab="Ordered Observations", main = "Normal Q-Q Plot, Manufacturing")
qqline(USairpollution[,3])
qqnorm(USairpollution[,4], ylab="Ordered Observations", main = "Normal Q-Q Plot, Population")
qqline(USairpollution[,4])

par(mfrow=c(1,1))


# Using Everitt's chisplot function to check multivariate normality of entire data set:

# Copy the chisplot function into R:
###############
###
chisplot <- function(x) {
    if (!is.matrix(x)) stop("x is not a matrix")

    ### determine dimensions
    n <- nrow(x)
    p <- ncol(x)
    #
    xbar <- apply(x, 2, mean)               # sample mean vector
    S <- var(x)                             # sample covariance matrix
    S <- solve(S, tol=1e-40)                # ... inverted
    index <- (1:n)/(n+1)                    # plotting positions
    #
    xcent <- t(t(x) - xbar)                 # centered data
    di <- apply(xcent, 1, function(x,S) x %*% S %*% x, S)  # squared generalized distances
    #
    quant <- qchisq(index,p)
    plot(quant, sort(di), ylab = "Ordered distances",
         xlab = "Chi-square quantile", lwd=2,pch=1)
   
}
###
###############

# Use the chisplot function:

chisplot(as.matrix(USairpollution))
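
# A minimal sketch of the same diagnostic using base R's built-in 'mahalanobis'
# function, which computes the same squared generalized distances as chisplot
# (the plotting positions from ppoints() differ slightly from chisplot's (1:n)/(n+1)):

X <- as.matrix(USairpollution)
d2 <- mahalanobis(X, center = colMeans(X), cov = var(X))
qqplot(qchisq(ppoints(nrow(X)), df = ncol(X)), d2,
       xlab = "Chi-square quantile", ylab = "Ordered distances")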

# Try it on a bigger data set (Fisher's iris data set, which is a built-in data set in R):

fisher.iris <- iris[,1:4]
chisplot(as.matrix(fisher.iris))

# For the three types of irises, separately:

fisher.iris.setosa <- iris[1:50,1:4]
chisplot(as.matrix(fisher.iris.setosa))

fisher.iris.versicolor <- iris[51:100,1:4]
chisplot(as.matrix(fisher.iris.versicolor))

fisher.iris.virginica <- iris[101:150,1:4]
chisplot(as.matrix(fisher.iris.virginica))

# To get basically the same data set as described on pages 10-13 of the textbook:

library(MASS)

set.seed(1203)

mymat1 <- mvrnorm(200, mu=c(0,0), Sigma=matrix(c(1,0.5,0.5,1),ncol=2))

mymat2 <- log(abs(mymat1)) + 12

# mymat2 can be thought of as the observed data matrix which is not multivariate normal.

chisplot(mymat2)

boxcox(lm(mymat2[,1]~1),lambda=seq(-10,10,1/5))

boxcox(lm(mymat2[,2]~1),lambda=seq(-10,10,1/5))

# To see the actual lambda values:
box.x <- boxcox(lm(mymat2[,1]~1),lambda=seq(-10,10,1/5),plotit=F)$x
box.y <- boxcox(lm(mymat2[,1]~1),lambda=seq(-10,10,1/5),plotit=F)$y
XY = cbind(box.x, box.y)

box.x <- boxcox(lm(mymat2[,2]~1),lambda=seq(-10,10,1/5),plotit=F)$x
box.y <- boxcox(lm(mymat2[,2]~1),lambda=seq(-10,10,1/5),plotit=F)$y
XYXY = cbind(box.x, box.y)
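
# A sketch of extracting the lambda that maximizes the profile log-likelihood
# for each column (using the XY and XYXY grids computed above):

XY[which.max(XY[,"box.y"]), "box.x"]      # best lambda for column 1
XYXY[which.max(XYXY[,"box.y"]), "box.x"]  # best lambda for column 2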


# Good choices for the lambdas:

lambda <- c(5.5, 6)

mymat3 <- cbind( ((mymat2[,1])^5.5 - 1)/5.5,  ((mymat2[,2])^6 - 1)/6 )

# mymat3 is the transformed data matrix

#### A more efficient way to do the transformations of each column of mymat2.
#### This assumes all the lambda_j's are in a vector called "lambda", as above.

if(F){
  newmat <- NULL
  for (j in 1:length(lambda)){
    if (lambda[j] != 0){
      newcol <- (mymat2[,j]^lambda[j] - 1)/lambda[j]
    } else {
      newcol <- log(mymat2[,j])
    }
    newmat <- cbind(newmat, newcol)
  }
  mymat3 <- newmat
  colnames(mymat3) <- colnames(mymat2)

  # mymat3 is the transformed data matrix
  ####
  ####

  chisplot(mymat3)
}
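
# A loop-free sketch of the same transformations, assuming (as above) that the
# vector 'lambda' holds one value per column of mymat2, none of them zero here:

mymat3.alt <- mapply(function(col, l) if (l != 0) (col^l - 1)/l else log(col),
                     as.data.frame(mymat2), lambda)
# all.equal(unname(mymat3.alt), unname(mymat3))  # should be TRUE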

Plots

#############################################################
#                                                           #
#  R example code to display / visualize multivariate data  #
#                                                           #
#############################################################

# The built-in R data set called state contains a multivariate data matrix.
# For information, type:

help(state)
## starting httpd help server ... done
# To see the data matrix (containing variables for the 50 states, as of 1977):

state.x77[1:5,]
##            Population Income Illiteracy Life Exp Murder HS Grad Frost   Area
## Alabama          3615   3624        2.1    69.05   15.1    41.3    20  50708
## Alaska            365   6315        1.5    69.31   11.3    66.7   152 566432
## Arizona          2212   4530        1.8    70.55    7.8    58.1    15 113417
## Arkansas         2110   3378        1.9    70.66   10.1    39.9    65  51945
## California      21198   5114        1.1    71.71   10.3    62.6    20 156361
# Creating a data frame out of a matrix:

state.x77.df <- data.frame(state.x77)

names(state.x77.df) <- c("Popul", "Income", "Illit", "LifeExp", "Murder", "HSGrad", "Frost", "Area")

attach(state.x77.df)

############################################
############################################

# A simple scatterplot of Life Expectancy against Income:

plot(Income, LifeExp)

# A simple scatterplot of HS Graduation Rate against Income:

plot(Income, HSGrad)

# With regression line added:

plot(Income, HSGrad)
abline(lm(HSGrad~Income), lwd=2)

# With lowess regression curve added:

plot(Income, HSGrad)
lines(lowess(Income,HSGrad), lwd=2)

# With "rug plots" added to display both marginal distributions:

plot(Income, HSGrad)
rug(jitter(Income), side=1)
rug(jitter(HSGrad), side=2)

# With histograms added to display both marginal distributions:

#install.packages("psych")
library(psych)
## Warning: package 'psych' was built under R version 4.4.1

scatterHist(x=Income,y=HSGrad)

# Changing some default options, for a little cleaner look:
scatterHist(x=Income,y=HSGrad,xlab="Income",ylab="High School Graduation Rate",ellipse=F,smooth=F,density=F)

# Labeling individual observations (the states):

st.name <- abbreviate(row.names(state.x77.df) )
plot(Income, HSGrad, type='n')
text(Income, HSGrad, labels=st.name, lwd=2)

# Even better here:  R has a built-in state.abb vector, which we can use:

plot(Income, HSGrad, type='n')
text(Income, HSGrad, labels=state.abb, lwd=2)

# Symbolic scatterplots, with separate symbols for each level of some grouping variable:

# Creating a data frame that adds the 'state.region' variable to the others:
more.state.data <- data.frame(state.x77.df,state.region)

library(lattice)
xyplot(HSGrad ~ Income, group=factor(state.region), data=more.state.data)

# install.packages("ggplot2") # run this if 'ggplot2' package is not already installed
library(ggplot2)
## 
## Attaching package: 'ggplot2'
## 
## The following objects are masked from 'package:psych':
## 
##     %+%, alpha
qplot(Income, HSGrad, col=factor(state.region), shape=factor(state.region), data=more.state.data)
## Warning: `qplot()` was deprecated in ggplot2 3.4.0.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
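
# Since qplot() is deprecated, here is the equivalent call written with ggplot() itself:

ggplot(more.state.data, aes(x = Income, y = HSGrad,
                            color = factor(state.region),
                            shape = factor(state.region))) +
  geom_point()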

# A scatterplot with multiple y axes (functions courtesy of the Horton and Kleinman (2015) book):

# First function plots one y-variable against x, (by default) adds lowess curve through data, 
# then calls 'addsecondy' function:
plottwoy <- function(x,y1,y2,xname="X",y1name="Y1",y2name="Y2",lowess.curve="yes"){
  plot(x,y1,ylab=y1name,xlab=xname)
  if (lowess.curve=="yes") {
    lines(lowess(x,y1),lwd=3)
    addsecondy(x,y2,y1,yname=y2name,lowess.curve="yes")
  } else {
    addsecondy(x,y2,y1,yname=y2name,lowess.curve="no")
  }
}

# Second function rescales range of second y variable to that of the first, adds the right axis
addsecondy <- function(x,y,origy,yname="Y2",lowess.curve="yes"){
  prevlimits=range(origy)
  axislimits=range(y)
  axis(side=4,at=prevlimits[1]+diff(prevlimits)*c(0:5)/5, labels=round(axislimits[1]+diff(axislimits)*c(0:5)/5,1))
  mtext(yname,side=4)
  newy=(y-axislimits[1])/(diff(axislimits)/diff(prevlimits))+prevlimits[1]
  points(x,newy,pch=2)
  if (lowess.curve=="yes") lines(lowess(x,newy),lty=2,lwd=3)
}

with(state.x77.df, plottwoy(x=HSGrad,y1=Illit,y2=Murder,xname="High School Graduation Rate",y1name="Illiteracy Rate",y2name="Murder Rate"))
legend(x='topright',legend=c("Illiteracy","Murder"),pch=c(1,2),bty='n',lty=c(1,2))

# Without the lowess curves:

with(state.x77.df, plottwoy(x=HSGrad,y1=Illit,y2=Murder,xname="High School Graduation Rate",y1name="Illiteracy Rate",y2name="Murder Rate",lowess.curve='no'))
legend(x='topright',legend=c("Illiteracy","Murder"),pch=c(1,2),bty='n') # no 'lty' argument now

############################################
############################################


# Finding and plotting the "convex hull" of the data:

conv.hull <- chull(Income, HSGrad)
plot(Income, HSGrad)
polygon(Income[conv.hull], HSGrad[conv.hull], density=15, angle=30)

# The ordinary sample correlation coefficient between HS Graduation Rate and Income:
cor(Income, HSGrad)
## [1] 0.6199323
# A "robust" sample correlation coefficient between HS Graduation Rate and Income:
cor(Income[-conv.hull], HSGrad[-conv.hull])
## [1] 0.6010696
## An example where removing the convex hull makes more of a difference:

conv.hull <- chull(Area, Popul)
plot(Area, Popul)
polygon(Area[conv.hull], Popul[conv.hull], density=15, angle=30)

# The ordinary sample correlation coefficient between Area and Population:
cor(Area, Popul)
## [1] 0.02254384
# A "robust" sample correlation coefficient between Area and Population:
cor(Area[-conv.hull], Popul[-conv.hull])
## [1] 0.115327
############################################
############################################


# Copy Everitt's chiplot function:
######################
#####
chiplot<-function(x,y,vlabs=c("X","Y"),matrix="NO") {
  n<-length(x); ind<-numeric(length=n)
  for(i in 1:n) {
    for(j in (1:n)[-i]) if(x[i]>x[j]&y[i]>y[j]) ind[i]<-ind[i]+1
  }
  ind<-ind/(n-1); ind1<-numeric(length=n)
  for(i in 1:n) {
    for(j in (1:n)[-i]) if(x[i]>x[j]) ind1[i]<-ind1[i]+1
  }
  ind1<-ind1/(n-1); ind2<-numeric(length=n)
  for(i in 1:n) {
    for(j in (1:n)[-i]) if(y[i]>y[j]) ind2[i]<-ind2[i]+1
  }
  ind2<-ind2/(n-1)
  s<-sign((ind1-0.5)*(ind2-0.5))
  chi<-(ind-ind1*ind2)/sqrt(ind1*(1-ind1)*ind2*(1-ind2))
  lambda<-4*s*pmax((ind1-0.5)^2,(ind2-0.5)^2)
  thresh<-4*(1/(n-1)-0.5)^2
  if(matrix=="NO") {
    par(mfrow=c(1,2))
    plot(x,y,xlab=vlabs[1],ylab=vlabs[2])
    plot(lambda[abs(lambda)<thresh],chi[abs(lambda)<thresh],ylim=c(-1,1),xlab="lambda",
         ylab="Chi"); abline(h=1.78/sqrt(n)); abline(h=-1.78/sqrt(n))}
  if(matrix=="YES") {
    plot(lambda[abs(lambda)<thresh],chi[abs(lambda)<thresh],ylim=c(-1,1))
    abline(h=1.78/sqrt(n)); abline(h=-1.78/sqrt(n))}
}
#####
######################

# Use it to create a chi-plot to check for independence of HS Graduation Rate and Income:

chiplot(Income, HSGrad, vlabs=c("Income", "HSGrad"))

# chi-plot to check for independence of Area and Population:
chiplot(Area, Popul, vlabs=c("Area", "Popul"))

############################################
############################################


# Copy Everitt's biweight function:
# (It will be needed to construct the robust version of the bivariate boxplot)
######################
#####
biweight<-function(a,const1=9,const2=36,err=0.0001) {
  #
  #a is data matrix with two cols.
  #const1=common tuning constant
  #const2=bivariate tuning constant
  #err=convergence criterion.
  x<-a[,1]; y<-a[,2]; n<-length(x); mx<-median(x); my<-median(y); madx<-median(abs(x-mx)); mady<-median(abs(y-my))
  if(madx != 0) { ux<-(x-mx)/(const1*madx)
  ux1<-ux[abs(ux)<1]; tx<-mx+(sum((x[abs(ux)<1]-mx)*(1-ux1*ux1)^2)/sum((1-ux1^2)^2))
  sx<- sqrt(n)*sqrt(sum((x[abs(ux)<1]-mx)^2*(1-ux1*ux1)^4))/abs(sum((1-ux1*ux1)*(1-5*ux1*ux1)))
  }
  else { tx<-mx
  sx<-sum(abs(x-mx))/n
  }
  if(mady != 0) { uy<-(y-my)/(const1*mady)
  uy1<-uy[abs(uy)<1]; ty<-my+(sum((y[abs(uy)<1]-my)*(1-uy1*uy1)^2)/sum((1-uy1^2)^2))
  sy<- sqrt(n)*sqrt(sum((y[abs(uy)<1]-my)^2*(1-uy1*uy1)^4))/abs(sum((1-uy1*uy1)*(1-5*uy1*uy1)))
  }
  else { ty<-my
  sy<-sum(abs(y-my))/n
  }
  z1<-(y-ty)/sy+(x-tx)/sx; z2<-(y-ty)/sy-(x-tx)/sx; mz1<-median(z1); mz2<-median(z2); 
  madz1<-median(abs(z1-mz1)); madz2<-median(abs(z2-mz2))
  if(madz1 != 0) { uz1<-(z1-mz1)/(const1*madz1)
  uz11<-uz1[abs(uz1)<1]; tz1<-mz1+(sum((z1[abs(uz1)<1]-mz1)*(1-uz11*uz11)^2)/sum((1-uz11^2)^2))
  sz1<- sqrt(n)*sqrt(sum((z1[abs(uz1)<1]-mz1)^2* (1-uz11*uz11)^4))/abs(sum((1-uz11*uz11)*(1-5*uz11*uz11)))
  }
  else { tz1<-mz1
  sz1<-sum(abs(z1-mz1))/n
  }
  if(madz2 != 0) { uz2<-(z2-mz2)/(const1*madz2)
  uz21<-uz2[abs(uz2)<1]; tz2<-mz2+(sum((z2[abs(uz2)<1]-mz2)*(1-uz21*uz21)^2)/sum((1-uz21^2)^2))
  sz2<- sqrt(n)*sqrt(sum((z2[abs(uz2)<1]-mz2)^2*(1-uz21*uz21)^4))/abs(sum((1-uz21*uz21)*(1-5*uz21*uz21)))
  }
  else { tz2<-mz2
  sz2<-sum(abs(z2-mz2))/n
  }
  esq<-((z1-tz1)/sz1)^2+((z2-tz2)/sz2)^2; w<-numeric(length=n); c2<-const2
  for(i in 1:10) {
    w[esq<const2]<-(1-esq[esq<const2]/const2)^2; w[esq>=const2]<-0; l<-length(w[w==0])
    if(l<0.5*n) break
    else const2<-2*const2
  }
  tx<-sum(w*x)/sum(w); sx<-sqrt(sum(w*(x-tx)^2)/sum(w)); ty<-sum(w*y)/sum(w); 
  sy<-sqrt(sum(w*(y-ty)^2)/sum(w)); r<-sum(w*(x-tx)*(y-ty))/(sx*sy*sum(w)); const2<-c2; wold<-w
  for(i in 1:100) {
    z1<-((y-ty)/sy+(x-tx)/sx)/sqrt(2*(1+r)); z2<-((y-ty)/sy-(x-tx)/sx)/sqrt(2*(1+r)); esq<-z1*z1+z2*z2
    for(j in 1:10) {
      w[esq<const2]<-(1-esq[esq<const2]/const2)^2; w[esq>=const2]<-0; l<-length(w[w==0])
      if(l<0.5*n) break
      else const2<-2*const2
    }
    tx<-sum(w*x)/sum(w); sx<-sqrt(sum(w*(x-tx)^2)/sum(w)); ty<-sum(w*y)/sum(w); sy<-sqrt(sum(w*(y-ty)^2)/sum(w))
    r<-sum(w*(x-tx)*(y-ty))/(sx*sy*sum(w)); term<-sum((w-wold)^2)/(sum(w)/n)^2
    if(term < err) break
    else {wold<-w
    const2<-c2
    }
  }
  param<-c(tx,ty,sx,sy,r)
  param
}
#####
######################


# Copy Everitt's bivbox function:
######################
#####
bivbox<-function(a, d = 7, mtitle = "Bivariate Boxplot",
                 method = "robust",xlab="X",ylab="Y")
{
  #a is data matrix
  #d is constant(usually 7)
  p <- length(a[1,  ])
  if(method == "robust") {
    param <- biweight(a[, 1:2]); m1 <- param[1]; m2 <- param[2]
    s1 <- param[3]; s2 <- param[4]; r <- param[5]
  }
  else {
    m1 <- mean(a[, 1]); m2 <- mean(a[, 2]); 
    s1 <- sqrt(var(a[, 1])); s2 <- sqrt(var(a[, 2])); r <- cor(a[, 1:2])[1, 2]
  }
  x <- (a[, 1] - m1)/s1; y <- (a[, 2] - m2)/s2
  e <- sqrt((x * x + y * y - 2 * r * x * y)/(1 - r * r))
  e2 <- e * e; em <- median(e); emax <- max(e[e2 < d * em * em])
  r1 <- em * sqrt((1 + r)/2); r2 <- em * sqrt((1 - r)/2); theta <- ((2 * pi)/360) * seq(0, 360, 3)
  xp <- m1 + (r1 * cos(theta) + r2 * sin(theta)) * s1; yp <- m2 + (r1 * cos(theta) - r2 * sin(theta)) * s2
  r1 <- emax * sqrt((1 + r)/2); r2 <- emax * sqrt((1 - r)/2); theta <- ((2 * pi)/360) * seq(0, 360, 3)
  xpp <- m1 + (r1 * cos(theta) + r2 * sin(theta)) * s1; ypp <- m2 + (r1 * cos(theta) - r2 * sin(theta)) * s2
  maxxl <- max(xpp); minxl <- min(xpp); maxyl <- max(ypp); minyl <- min(ypp)
  b1 <- (r * s2)/s1; a1 <- m2 - b1 * m1; y1 <- a1 + b1 * minxl; y2 <- a1 + b1 * maxxl
  b2 <- (r * s1)/s2; a2 <- m1 - b2 * m2; x1 <- a2 + b2 * minyl; x2 <- a2 + b2 * maxyl
  maxx <- max(c(a[, 1], xp, xpp, x1, x2)); minx <- min(c(a[, 1], xp, xpp, x1, x2))
  maxy <- max(c(a[, 2], yp, ypp, y1, y2)); miny <- min(c(a[, 2], yp, ypp, y1, y2))
  plot(a[, 1], a[, 2], xlim = c(minx, maxx), ylim = c(miny, maxy), xlab =xlab, ylab =ylab,
       lwd = 2, pch = 1)
  lines(xp, yp, lwd = 2); lines(xpp, ypp, lty = 2, lwd = 2)
  segments(minxl, y1, maxxl, y2, lty = 3, lwd = 2); segments(x1, minyl, x2, maxyl, lty = 4, lwd = 2)
}
#####
######################

# Use it to create a bivariate boxplot of HS Graduation Rate and Income:
# The regular method:

bivbox(cbind(Income, HSGrad), xlab = "Income", ylab = "HSGrad", method ="O")

# The robust method (which is the default):

bivbox(cbind(Income, HSGrad), xlab = "Income", ylab = "HSGrad", method ="robust")

############################################
############################################

# Copy Everitt's bivden function:
######################
#####
bivden<-function(x, y, ngridx = 30, ngridy = 30, constant.x = 1, constant.y = 1) {
  #x and y are vectors containing the bivariate data
  #ngridx and ngridy are the number of points in the grid
  mx <- mean(x); sdx <- sqrt(var(x)); my <- mean(y); sdy <- sqrt(var(y))
  #scale x and y before estimation
  x <- scale(x); y <- scale(y); den <- matrix(0, ngridx, ngridy)
  #find possible value for bandwidth
  n <- length(x); hx <- constant.x * n^(-0.2); hy <- constant.y * n^(-0.2)
  h <- hx * hy; hsqrt <- sqrt(h)
  seqx <- seq(range(x)[1], range(x)[2], length = ngridx); seqy <- seq(range(y)[1], range(y)[2], length = ngridy)
  for(i in 1:n) {
    X <- x[i]; Y <- y[i]; xx <- (seqx - X)/hsqrt; yy <- (seqy - Y)/hsqrt
    den <- den + outer(xx, yy, function(x, y)
      exp(-0.5 * (x^2 + y^2)))
  }
  den <- den/(n * 2 * pi * h); seqx <- sdx * seqx + mx; seqy <- sdy * seqy + my
  result <- list(seqx = seqx, seqy = seqy, den = den)
  result
}
#####
######################

# Use it to create a bivariate density of HS Graduation Rate and Income,
# and then plot it with the persp function:

den1 <- bivden(Income, HSGrad)
persp(den1$seqx, den1$seqy, den1$den, xlab="Income", ylab="HSGrad", zlab="Density", lwd=2, ticktype="detailed", theta=35)

# Try different values of theta (between 0 and 360) to get the best perspective.


# A contour plot of the bivariate density plotted on top of the ordinary scatterplot:

plot(Income, HSGrad)
contour(den1$seqx, den1$seqy, den1$den, lwd=2, nlevels=15, add=T)

# You can show more or fewer contours by varying the value of nlevels in this code.

############################################
############################################


# Producing a bubble plot of Income, HS Graduation Rate, and Murder Rate
# (Here Murder Rate is represented by the size of the bubbles)

plot(Income, HSGrad, pch='.', lwd=2, ylim=c(35,70), xlim=c(3000,6400),cex=2)
# The x limits and y limits should be adjusted so that all the bubbles fit within the plotting window
symbols(Income, HSGrad, circles=Murder, inches=0.4, add=T, lwd=2)
# labeling the state names:
text(Income, HSGrad, labels=state.abb, lwd=2, cex=0.6)

# Producing a bubble plot of Income, HS Graduation Rate, and Illiteracy Rate
# (Here Illiteracy Rate is represented by the size of the bubbles)
# type='n' will remove the dot in the middle of the bubbles

plot(Income, HSGrad, type='n', lwd=2, ylim=c(35,70), xlim=c(3000,6400),cex=2)
# The x limits and y limits should be adjusted so that all the bubbles fit within the plotting window
symbols(Income, HSGrad, circles=Illit, inches=0.4, add=T, lwd=2)
# labeling the state names:
text(Income, HSGrad, labels=state.abb, lwd=2, cex=0.6)

# What if we deleted the Income-HSGrad "outliers"?

conv.hull <- chull(Income, HSGrad)  # recomputing the hull for the Income-HSGrad pair
plot(Income[-conv.hull], HSGrad[-conv.hull], pch='.', lwd=2, ylim=c(35,70), xlim=c(3000,6400),cex=2)
# The x limits and y limits should be adjusted so that all the bubbles fit within the plotting window
symbols(Income[-conv.hull], HSGrad[-conv.hull], circles=Illit[-conv.hull], inches=0.4, add=T, lwd=2)

## By adding colors, you can include a 4th variable, which is categorical, to the bubble plot:

library(ggplot2)
library(carData)
## Warning: package 'carData' was built under R version 4.4.1
data(Salaries, package="carData")
# plot salary against experience for a sample of faculty
# (color represents rank and size represents service)
ggplot(Salaries, 
       aes(x = yrs.since.phd, 
           y = salary, 
           color = rank, 
           size = yrs.service)) +
  geom_point(alpha = .6) +
  labs(title = "Academic salary by rank, years of service, and years since degree")

# This example is from 
# https://rkabacoff.github.io/datavis/Multivariate.html
# See lots of other cool graphs there...

## Making a ggplot "interactive" with the 'plotly' package:

#install.packages("plotly") # Need to install the package first if you have not already done so...
library(plotly)
## Warning: package 'plotly' was built under R version 4.4.1
## 
## Attaching package: 'plotly'
## 
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## 
## The following object is masked from 'package:MASS':
## 
##     select
## 
## The following object is masked from 'package:stats':
## 
##     filter
## 
## The following object is masked from 'package:graphics':
## 
##     layout
## Use the usual code, but save the plot as an object in R:

my_salary_plot <- ggplot(Salaries, 
                         aes(x = yrs.since.phd, 
                             y = salary, 
                             color = rank, 
                             size = yrs.service)) +
  geom_point(alpha = .6) +
  labs(title = "Academic salary by rank, years of service, and years since degree")

ggplotly(my_salary_plot)
# This will open up an html file with the plot on it.
# Now use the mouse to hover over each point!


## Another example:  This is the 'spotify' data set from the 'bayesrules' package:
## It contains data on 350 songs.  It's a subset of a larger data set collected by Kaylin Pavlik.

#install.packages("bayesrules")
#library(bayesrules)
#data(spotify)

spotify <- read.csv(file="http://people.stat.sc.edu/hitchcock/spotify_bayesrules.csv",header=T)

# If you want to see the whole data set:
# print(as.data.frame(spotify))

# Creating a character variable that combines the artist and title:
spotify$artist_title <- paste(spotify$artist,':',spotify$title)

# A 4-variable bubble plot that also has a "labels" element containing the artist/title name:
my_song_plot <- ggplot(spotify, 
                       aes(x = energy, 
                           y = danceability, 
                           color = genre, 
                           size = duration_ms, labels=artist_title)) +
  geom_point(alpha = .6) +
  labs(title = "Song data")

# my_song_plot

#ggplotly(my_song_plot)

## If you only want the interactive tool to show one or two variable values, use the 'tooltip' argument:

#ggplotly(my_song_plot, tooltip=c("labels","size"))


#
############################################
############################################

# Producing a scatterplot matrix of all variables in this data set:

pairs(state.x77.df)

# If there are a lot of observations (not really here), you can make the plotting characters smaller:
pairs(state.x77.df, pch='.', cex=1.5)

# Which pair of variables has the strongest positive association?
# Which pair of variables has the strongest negative association?

# Why are the scatterplots involving "Area" somewhat useless-looking here?

# With regression lines overlain on each plot:

pairs(state.x77.df, panel=function(x,y) {abline(lm(y~x),lwd=2)
  points(x,y)})

# Maximize the R graphics window to get a better presentation of these plots.

# The 'ggpairs' function in the 'GGally' package works with continuous and categorical variables.
# It also includes histograms for each continuous variable's marginal distribution,
# bar plots for each categorical variable's marginal distribution,
# and side-by-side box plots to show the association between continuous and categorical variables.

#install.packages("GGally")
library(GGally)
## Warning: package 'GGally' was built under R version 4.4.1
## Registered S3 method overwritten by 'GGally':
##   method from   
##   +.gg   ggplot2
# Creating a data frame that adds the 'state.region' variable to the others:
more.state.data <- data.frame(state.x77.df,state.region)

# Making a ggpairs plot with just some of the columns in the data frame:
ggpairs(more.state.data[,c(3,4,5,6,9)], axisLabels="show", diag=list(continuous="bar",discrete="bar"),
        upper=list(continuous="points",combo="box"), lower=list(continuous="cor",combo="facethist"))
## Warning in check_and_set_ggpairs_defaults("diag", diag, continuous =
## "densityDiag", : Changing diag$continuous from 'bar' to 'barDiag'
## Warning in check_and_set_ggpairs_defaults("diag", diag, continuous =
## "densityDiag", : Changing diag$discrete from 'bar' to 'barDiag'
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# You could also save this plot as an R object and use 'ggplotly' to make it interactive, 
# as in the previous example.


############################################
############################################


# Producing a 3-D scatterplot of Income, HS Graduation Rate, and Murder Rate

library(lattice)  # loading the lattice package

cloud(Murder ~ Income * HSGrad, xlim=range(Income), ylim=range(HSGrad), zlim=range(Murder), 
      scales = list(distance = rep(1, 3), arrows = FALSE))

# 3-D scatterplot of Income, HS Graduation Rate, and Murder Rate with "drop lines":

cloud(Murder ~ Income * HSGrad, xlim=range(Income), ylim=range(HSGrad), zlim=range(Murder), type="h",
      scales = list(distance = rep(1, 3), arrows = FALSE))

#install.packages("scatterplot3d")
library(scatterplot3d)  # loading the scatterplot3d package
with(more.state.data, scatterplot3d(Income,HSGrad,Murder, pch = (1:4)[state.region], type = "h", angle = 55))

# The 'rgl' package produces 3-D scatterplots that are rotatable with the mouse:
#install.packages('rgl')
library(rgl)
## Warning: package 'rgl' was built under R version 4.4.1
open3d()
## wgl 
##   1
plot3d(Income,HSGrad,Murder, col = rainbow(length(Income)))

# While the plot is open, you can interactively identify points:
# identify3d(Income,HSGrad,Murder)
# Right-click on plot to show ID numbers

# Doing it all at once with state abbreviations as identifiers:
plot3d(Income,HSGrad,Murder,type="s",radius=0.025)
text3d(Income,HSGrad,Murder,text=state.abb, col = rainbow(length(Income)))

############################################
############################################

# A Conditioning Plot of HS Graduation Rate against Income, Separately for each Region:
# Note that state.region is a categorical vector in this built-in R data set

coplot(HSGrad ~ Income | state.region)

# I like the xyplot function (in the lattice package) better than the coplot function:

dev.new() # Opens another graphics window while still keeping the old graph up

library(lattice)
xyplot(HSGrad ~ Income | state.region)

# You can also condition on a continuous variable with the 'cut' argument:

xyplot(LifeExp ~ Murder| cut(Income, 2), data = state.x77.df, layout = c(2,1), xlab = "Murder Rate", ylab = "Life Expectancy")

## Here's an approach to improve the labeling of the above plot:

IncCat <- cut(Income, 2)
levels(IncCat) <- c("Low Income", "High Income")
xyplot(LifeExp ~ Murder| IncCat, data = state.x77.df, layout = c(2,1), xlab = "Murder Rate", ylab = "Life Expectancy")

############################################
############################################


# Creating a star plot to graphically present all variables of the state data set:

stars(state.x77.df)

# The first variable is denoted by the edge at the 3 o'clock (rightmost) position,
# and the other variables go counterclockwise around the star.

# Note how the states of Washington and Oregon resemble each other...
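
# A sketch adding a key that labels the segments (the 'key.loc' argument places
# the key in the plot's coordinate system; the values below are a starting
# guess, to be adjusted so the key sits in empty space):

stars(state.x77.df, key.loc = c(15, 1.5))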

# Kind of a cool plotting option in 'stars':

stars(state.x77.df,locations=cbind(state.center$x,state.center$y))

# Plotting by location only makes sense for some data sets, not all...

############################################
############################################

# Creating Chernoff Faces to graphically present all variables of the state data set:

library(TeachingDemos)
## Warning: package 'TeachingDemos' was built under R version 4.4.1
## 
## Attaching package: 'TeachingDemos'
## 
## The following object is masked from 'package:plotly':
## 
##     subplot
# May need to install the TeachingDemos package first?
# If so, type at the command line:  install.packages("TeachingDemos", dependencies=T)
# while plugged in to the internet.

faces( as.matrix(state.x77.df), fill=T)

# For this function, the variables correspond to facial features in this order:

# 1-height of face, 2-width of face, 3-shape of face, 4-height of mouth, 5-width of mouth, 
# 6-curve of smile, 7-height of eyes, 8-width of eyes, 9-height of hair, 10-width of hair, 
# 11-styling of hair, 12-height of nose, 13-width of nose, 14-width of ears, 15-height of ears. 

# Here's a way to generate a "key":

faces.features <- c("Face height", "Face width", "Face shape", "Mouth height", "Mouth width", 
                    "Smile curve", "Eyes height", "Eyes width", "Hair height", "Hair width", 
                    "Hair styling", "Nose height", "Nose width", "Ears width", "Ears height")

cbind( names(state.x77.df), faces.features[1:length( names(state.x77.df) )])
##      [,1]      [,2]          
## [1,] "Popul"   "Face height" 
## [2,] "Income"  "Face width"  
## [3,] "Illit"   "Face shape"  
## [4,] "LifeExp" "Mouth height"
## [5,] "Murder"  "Mouth width" 
## [6,] "HSGrad"  "Smile curve" 
## [7,] "Frost"   "Eyes height" 
## [8,] "Area"    "Eyes width"
# To switch which columns correspond to which features, just reorder the columns in the data matrix:

faces( as.matrix(state.x77.df[c(8,6,5,2,4,1,3,7)]), fill=T)
cbind( names(state.x77.df)[c(8,6,5,2,4,1,3,7)], faces.features[1:length( names(state.x77.df) )])
##      [,1]      [,2]          
## [1,] "Area"    "Face height" 
## [2,] "HSGrad"  "Face width"  
## [3,] "Murder"  "Face shape"  
## [4,] "Income"  "Mouth height"
## [5,] "LifeExp" "Mouth width" 
## [6,] "Popul"   "Smile curve" 
## [7,] "Illit"   "Eyes height" 
## [8,] "Frost"   "Eyes width"
# Now the 8th column (Area) corresponds to "Face height", etc.

# Note Alaska has the tallest face and the "frowniest" smile here:  Why?

# Or a similar function:

faces2(as.matrix(state.x77.df), scale="center")

# For this function, the variables correspond to facial features in this order:

# 1 Width of center 2 Top vs. Bottom width (height of split) 3 Height of Face 4 Width of top half of face 
# 5 Width of bottom half of face 6 Length of Nose 7 Height of Mouth 8 Curvature of Mouth (abs < 9) 
# 9 Width of Mouth 10 Height of Eyes 11 Distance between Eyes (.5-.9) 12 Angle of Eyes/Eyebrows 
# 13 Circle/Ellipse of Eyes 14 Size of Eyes 15 Position Left/Right of Eyeballs/Eyebrows 
# 16 Height of Eyebrows 17 Angle of Eyebrows 18 Width of Eyebrows 

# Radar Charts:

# Important to scale the data first:

scaled.state.x77.df <- scale(state.x77.df)
selected.obs.indices <- c(11,38,40,44)

some.states <- scaled.state.x77.df[selected.obs.indices, c("LifeExp","HSGrad","Popul","Illit","Murder","Frost","Income")]

# Note I am arranging the variables so the "good" indicators will be on the top of the radar plot
# and the "bad" indicators will be on the bottom of the radar plot.

colnames(some.states) <- c("LifeExp","HSGrad","Popul","Illit","Murder","Frost","Income")
rownames(some.states) <- rownames(state.x77.df)[selected.obs.indices]
some.states <- as.data.frame(some.states)

#install.packages("fmsb")
library(fmsb)
## Warning: package 'fmsb' was built under R version 4.4.1
# Put max and min values to be plotted for each variable.
# 3 and -3 usually work well if the data are pre-standardized as they are here.
#some.states <- as.data.frame(rbind(rep(3,ncol(some.states)) , rep(-3,ncol(some.states)) , some.states))

radarchart(some.states,maxmin=F,plwd=4,plty=1)
# Add a legend
legend(x=0.8, y=1.3, legend = rownames(state.x77.df)[selected.obs.indices], bty = "n", pch=20, col=1:length(rownames(state.x77.df)[selected.obs.indices]), cex=0.8, pt.cex=3)

# With more than a couple observations, it may be better to show the radar plots separately:
par(mfrow=c(2,2))  # set up a 2-by-2 array of plots...
for (i in 1:length(rownames(state.x77.df)[selected.obs.indices])){
  # Below, the 3 and -3 are the max and min values shown on each radar plot,
  # works well for pre-standardized data such as these.
  radarchart(as.data.frame(rbind(3,-3,some.states[i,])),plwd=4,plty=1)
  title(rownames(state.x77.df)[selected.obs.indices][i])
}
par(mfrow=c(1,1))

# Pirate Plots:
# A more advanced extension of side-by-side boxplots for seeing the distribution of a continuous variable at
# the levels of one or more categorical variables:
# It shows the density estimate, mean, 95% interval estimate for the mean, and the raw data values all in one plot.

#install.packages("yarrr")
library(yarrr)
## Warning: package 'yarrr' was built under R version 4.4.1
## Loading required package: jpeg
## Loading required package: BayesFactor
## Warning: package 'BayesFactor' was built under R version 4.4.1
## Loading required package: coda
## Warning: package 'coda' was built under R version 4.4.1
## Loading required package: Matrix
## ************
## Welcome to BayesFactor 0.9.12-4.7. If you have questions, please contact Richard Morey (richarddmorey@gmail.com).
## 
## Type BFManual() to open the manual.
## ************
## Loading required package: circlize
## Warning: package 'circlize' was built under R version 4.4.1
## ========================================
## circlize version 0.4.16
## CRAN page: https://cran.r-project.org/package=circlize
## Github page: https://github.com/jokergoo/circlize
## Documentation: https://jokergoo.github.io/circlize_book/book/
## 
## If you use it in published research, please cite:
## Gu, Z. circlize implements and enhances circular visualization
##   in R. Bioinformatics 2014.
## 
## This message can be suppressed by:
##   suppressPackageStartupMessages(library(circlize))
## ========================================
## 
## yarrr v0.1.5. Citation info at citation('yarrr'). Package guide at yarrr.guide()
## Email me at Nathaniel.D.Phillips.is@gmail.com
## 
## Attaching package: 'yarrr'
## 
## The following object is masked from 'package:ggplot2':
## 
##     diamonds
# Creating a data frame that adds the 'state.region' variable to the others:
more.state.data <- data.frame(state.x77.df,state.region)

# dv (continuous) is Income, iv (categorical) is state.region
yarrr::pirateplot(formula = Income ~ state.region, 
                  data = more.state.data,
                  main = "Pirateplot of Incomes",
                  xlab = "state.region",
                  ylab = "Income", theme=1)

# Try changing theme to 2, 3, or 4 ...

# Grid of Plots:  

# For base R plots:

par(mfrow=c(3,2))
boxplot(Popul~state.region)
boxplot(Area~state.region)
boxplot(Illit~state.region)
boxplot(LifeExp~state.region)
boxplot(Murder~state.region)
boxplot(Income~state.region)
par(mfrow=c(1,1)) # resetting

# The 'patchwork' package does this incredibly well for 'ggplot2'-type plots.
# For examples, see:  https://patchwork.data-imaginist.com/articles/guides/assembly.html


#################################################

## Animation Plots:

#install.packages('gganimate')
library(gganimate)
## Warning: package 'gganimate' was built under R version 4.4.1
## No renderer backend detected. gganimate will default to writing frames to separate files
## Consider installing:
## - the `gifski` package for gif output
## - the `av` package for video output
## and restarting the R session
theme_set(theme_bw())

# These examples are from this website:
# https://www.datanovia.com/en/blog/gganimate-how-to-create-plots-with-beautiful-animation-in-r/

# The 'gapminder' data set has some cool demographic data collected over time:

#install.packages('gapminder')
library(gapminder)
## Warning: package 'gapminder' was built under R version 4.4.1
head(gapminder)
## # A tibble: 6 × 6
##   country     continent  year lifeExp      pop gdpPercap
##   <fct>       <fct>     <int>   <dbl>    <int>     <dbl>
## 1 Afghanistan Asia       1952    28.8  8425333      779.
## 2 Afghanistan Asia       1957    30.3  9240934      821.
## 3 Afghanistan Asia       1962    32.0 10267083      853.
## 4 Afghanistan Asia       1967    34.0 11537966      836.
## 5 Afghanistan Asia       1972    36.1 13079460      740.
## 6 Afghanistan Asia       1977    38.4 14880372      786.
# Note the variable names...

# Start with a static plot (we've seen a 4-variable bubble plot kind of like this before):

my_plot <- ggplot(
  gapminder, 
  aes(x = gdpPercap, y=lifeExp, size = pop, colour = country)
) +
  geom_point(show.legend = FALSE, alpha = 0.7) +
  scale_color_viridis_d() +
  scale_size(range = c(2, 12)) +
  scale_x_log10() +
  labs(x = "GDP per capita", y = "Life expectancy")
my_plot

# This plots each country-year combination as a separate observation (that's why there are a LOT of bubbles)

# The transition_time variable specifies which variable you want the dynamic plot to change over
# (typically this would be a variable that measures time).
# The 'labs' function with 'frame_time' allows the title to reflect
# the changing values of the transition_time variable.

my_plot + transition_time(year) +
  labs(title = "Year: {frame_time}")
## Warning: No renderer available. Please install the gifski, av, or magick package to
## create animated output
## NULL
# The dynamic plot appears as a gif in a separate window.

# Doing separate panels by Continent with facet_wrap:

my_plot + facet_wrap(~continent) +
  transition_time(year) +
  labs(title = "Year: {frame_time}")
## Warning: No renderer available. Please install the gifski, av, or magick package to
## create animated output
## NULL
# Look at the bouncing ball in the Africa plot.  What's going on there?

# Can use 'ggplotly' to identify the country on the static plot:

ggplotly(my_plot)
# To see all the data:
# print(gapminder,n=Inf)

# To just see the data for one country:

gapminder %>% filter(country == "Rwanda")
## # A tibble: 12 × 6
##    country continent  year lifeExp     pop gdpPercap
##    <fct>   <fct>     <int>   <dbl>   <int>     <dbl>
##  1 Rwanda  Africa     1952    40   2534927      493.
##  2 Rwanda  Africa     1957    41.5 2822082      540.
##  3 Rwanda  Africa     1962    43   3051242      597.
##  4 Rwanda  Africa     1967    44.1 3451079      511.
##  5 Rwanda  Africa     1972    44.6 3992121      591.
##  6 Rwanda  Africa     1977    45   4657072      670.
##  7 Rwanda  Africa     1982    46.2 5507565      882.
##  8 Rwanda  Africa     1987    44.0 6349365      848.
##  9 Rwanda  Africa     1992    23.6 7290203      737.
## 10 Rwanda  Africa     1997    36.1 7212583      590.
## 11 Rwanda  Africa     2002    43.4 7852401      786.
## 12 Rwanda  Africa     2007    46.2 8860588      863.
# Also notice the country on the right side of the Asia plot.

# You can let one or more of the axes change over time as well:

my_plot + transition_time(year) +
  labs(title = "Year: {frame_time}") +
  view_follow(fixed_y = TRUE)
## Warning: No renderer available. Please install the gifski, av, or magick package to
## create animated output
## NULL
# See many more cool options at: 
# https://www.datanovia.com/en/blog/gganimate-how-to-create-plots-with-beautiful-animation-in-r/
# Some other ways of displaying several variables at a time:

# A function to standardize a data vector:

zstd <- function(vec){
  out <- (vec-mean(vec))/sd(vec)
  return(out)
}

# Let's read in the husband-wife data set:

my.datafile <- tempfile()
cat(file=my.datafile, "  
49 1809 43 1590 25
25 1841 28 1560 19
40 1659 30 1620 38
52 1779 57 1540 26
58 1616 52 1420 30
32 1695 27 1660 23
43 1730 52 1610 33
47 1740 43 1580 26
31 1685 23 1610 26
26 1735 25 1590 23
", sep=" ")

options(scipen=999) # suppressing scientific notation

huswif <- read.table(my.datafile, header=FALSE, col.names=c("Hage", "Hheight", "Wage", "Wheight", "Hagefm"))

attach(huswif)  # attaching the data frame

# The observations are 10 married couples
# The first variable is the husband's age (in years).
# The second variable is the husband's height (in mm).
# The third variable is the wife's age (in years).
# The fourth variable is the wife's height (in mm).
# The fifth variable is the husband's age (in years) at first marriage.


###############################################################################

## Profile curves of the 10 observations based on the 5 variables:

# First order the variables from smallest (in mean) to largest:

ord.vec <- order(apply(huswif,2,mean))

huswif.order <- huswif[, ord.vec]


plot(ord.vec, seq(min(huswif.order),max(huswif.order),length=length(ord.vec)), type='n', 
     xlab="Hagefm                   Wage                   Hage                   Wheight                   Hheight"  , ylab="Value")

for (ii in 1:nrow(huswif.order)){
  lines((1:length(ord.vec)), huswif.order[ii,], col=(1:nrow(huswif.order))[ii], lty=(1:nrow(huswif.order))[ii] )
}

# Better to put y-axis on a log scale here:

plot(ord.vec, seq(min(huswif.order),max(huswif.order),length=length(ord.vec)), type='n', log="y",
     xlab="Hagefm                   Wage                   Hage                   Wheight                   Hheight"  , ylab="Value")

for (ii in 1:nrow(huswif.order)){
  # The log scale is already set by the plot() call above, so no extra argument is needed here:
  lines((1:length(ord.vec)), huswif.order[ii,], col=(1:nrow(huswif.order))[ii], lty=(1:nrow(huswif.order))[ii] )
}

legend("topleft", legend=1:nrow(huswif.order), fill=1:nrow(huswif.order), lty = (1:nrow(huswif.order)) )

## Maybe better to convert heights to inches (from mm) here?
##
## huswif.order[,4:5] <- huswif.order[,4:5]/25.4
##

###############################################################################

## Profile bars of the 10 observations based on the 5 variables:


barplot(t(as.matrix(huswif.order)), beside=T, log="y", names.arg=1:nrow(huswif.order), col=1:5)

legend("topleft", legend=names(huswif.order), fill=1:5 , cex=0.5)

###############################################################################

# Andrews Plots

# It is recommended to standardize the observations before doing Andrews Plots:

huswif.std <- apply(huswif,2,zstd)  # Using our zstd function from above

my.grid.length<- 500
t.vals <- seq(-pi,pi,length=my.grid.length)
basis.fcn.1 <- rep( (1/sqrt(2)), times = my.grid.length)
basis.fcn.2 <- sin(t.vals)
basis.fcn.3 <- cos(t.vals)
basis.fcn.4 <- sin(2*t.vals)
basis.fcn.5 <- cos(2*t.vals)

# If we had more than 5 variables, we would continue with
# basis.fcn.6 <- sin(3*t.vals)
# basis.fcn.7 <- cos(3*t.vals)
# and so on.
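
# A sketch of a small helper (hypothetical, not from the text) that builds the
# Fourier basis matrix for any number of variables p >= 2, following this pattern:

andrews.basis <- function(p, t.vals){
  B <- matrix(0, nrow = p, ncol = length(t.vals))
  B[1,] <- 1/sqrt(2)
  for (k in seq_len(p)[-1]){
    m <- k %/% 2                  # frequency 1,1,2,2,3,3,...
    B[k,] <- if (k %% 2 == 0) sin(m * t.vals) else cos(m * t.vals)
  }
  B
}
# andrews.basis(5, t.vals) reproduces the fourier.matrix built below.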

fourier.matrix <- rbind(basis.fcn.1, basis.fcn.2, basis.fcn.3, basis.fcn.4, basis.fcn.5)

my.Andrews.curves <- huswif.std %*% fourier.matrix

plot(c(-pi,pi), c(min(my.Andrews.curves),max(my.Andrews.curves)) , type='n', xlab='t', ylab='Fourier series')
for (jj in 1:nrow(huswif.std)){
  lines(t.vals, my.Andrews.curves[jj,], col=(1:nrow(huswif.std))[jj], lty=(1:nrow(huswif.std))[jj] )
}
legend("topleft", legend=1:nrow(huswif.std), fill=1:nrow(huswif.std), lty=(1:nrow(huswif.std)), cex=0.7)

# This graphical presentation of the data values preserves the relative pairwise distances:

dist(huswif.std)
##           1        2        3        4        5        6        7        8
## 2  2.725315                                                               
## 3  3.507198 4.662010                                                      
## 4  1.443297 3.641078 3.865915                                             
## 5  4.097448 5.600133 4.188873 3.171566                                    
## 6  2.800491 2.800462 2.955585 3.713186 5.074846                           
## 7  2.081379 3.970204 2.222621 2.024355 3.666068 2.988463                  
## 8  1.048628 2.997644 2.828334 1.445258 3.370065 2.355273 1.578504         
## 9  2.883217 2.799552 2.430380 3.668488 4.571880 1.013093 2.878059 2.291979
## 10 2.706807 1.789020 3.260408 3.566634 4.885015 1.347130 3.177160 2.384186
##           9
## 2          
## 3          
## 4          
## 5          
## 6          
## 7          
## 8          
## 9          
## 10 1.069791
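
# Because the basis functions are (nearly) orthogonal over the t grid, the L2
# distances between the Andrews curves are roughly proportional to these
# Euclidean distances. A rough numerical check (the ratios should all be close
# to a common constant):

round(as.vector(dist(my.Andrews.curves)) / as.vector(dist(huswif.std)), 2)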