library(tidyverse)
library(kableExtra)
library(ggplot2)
library(mdatools)
library(ggbiplot)
library(plotly)
library(shiny)
df = read.csv('Alcohol_consumption_cleaned.csv', sep = ';', row.names = 'Country')
df = df[1:6]
df %>% kable() %>% kable_styling(full_width = FALSE)
| Continent | Liquor | Wine | Beer | LifeEx | HeartD | |
|---|---|---|---|---|---|---|
| France | Europe | 2.5 | 63.5 | 40.1 | 78 | 61.1 |
| Italy | Europe | 0.9 | 58.0 | 25.1 | 78 | 94.1 |
| Switz | Europe | 1.7 | 46.0 | 65.0 | 78 | 106.4 |
| Austria | Europe | 1.2 | 15.7 | 102.1 | 78 | 173.0 |
| U.K. | Other | 1.5 | 12.2 | 100.0 | 77 | 199.7 |
| U.S.A. | America | 2.0 | 8.9 | 87.8 | 76 | 176.0 |
| Russia | Asia | 3.8 | 2.7 | 17.1 | 69 | 373.6 |
| CzechRep | Europe | 1.0 | 1.7 | 140.0 | 73 | 283.7 |
| Japan | Asia | 2.1 | 1.0 | 55.0 | 79 | 34.7 |
| Mexico | America | 0.8 | 0.2 | 50.4 | 73 | 36.4 |
Performing PCA
dfScaled = scale(df[2:6], scale = T) %>% data.frame()
Pc = pca(dfScaled, alpha = 0.05, lim.type = 'jm', method= 'svd', ncomp = 2)
pc = prcomp(dfScaled)
summary(Pc)
##
## Summary for PCA model (class pca)
## Type of limits: jm
## Alpha: 0.05
## Gamma: 0.01
##
## Eigenvals Expvar Cumexpvar Nq Nh
## Comp 1 2.301 46.03 46.03 10 10
## Comp 2 1.606 32.11 78.14 10 10
pca1 = summary(pc)
pca1
## Importance of components:
## PC1 PC2 PC3 PC4 PC5
## Standard deviation 1.5170 1.2672 0.7644 0.64978 0.29399
## Proportion of Variance 0.4603 0.3211 0.1168 0.08444 0.01729
## Cumulative Proportion 0.4603 0.7814 0.8983 0.98271 1.00000
var = paste(round(pca1$importance[2,]*100, digits = 2), '%')
ggscreeplot(pc)+
geom_col(fill = 'cornflowerblue') +
geom_line()+
theme_bw()+
geom_text(label = var, nudge_y = 0.03, nudge_x = 0.2 )
scoredf = Pc$calres$scores
eigenvalues = Pc$eigenvals*diag(Pc$ncomp)
tnorm = scoredf %*% solve(eigenvalues)
tsquared = c()
for (i in 1:nrow(tnorm)) {
for (j in 1:ncol(tnorm)) {
tsquared = c(tsquared, tnorm[i,1:j] %*% scoredf[i,1:j])
}
}
tsquared = tsquared %>% matrix(nrow = 10, byrow = T)
rownames(tsquared) = rownames(tnorm)
colnames(tsquared) = colnames(scoredf)
tsquared
## Comp 1 Comp 2
## France 0.846010400 2.4786891
## Italy 1.345417138 1.7523651
## Switz 0.527539249 0.6135703
## Austria 0.047799072 0.8294527
## U.K. 0.011387880 0.5512178
## U.S.A. 0.086134519 0.1884672
## Russia 5.048273778 7.6798452
## CzechRep 0.855615710 3.5396848
## Japan 0.226754801 0.2366353
## Mexico 0.005067455 0.1300725
hotelling = data.frame('T2_Hotelling' = Pc$calres$T2[,2],'Sample' = c(1:nrow(df)), 'Country' = rownames(df), 'Continent' = df$Continent)
hotplot = ggplot(hotelling, aes(x= Sample, y= T2_Hotelling, text = Country, color = Continent))+
geom_point()+
geom_line()+
theme_bw()+
geom_hline(yintercept = Pc$T2lim[1,2], alpha = 0.3, color = 'orange')+
geom_hline(yintercept = Pc$T2lim[2,2], alpha = 0.3, color = 'darkred')
div(
ggplotly(hotplot, width = 700, height = 500), align = 'center')
russia = scoredf[7,1:2] %*% t(Pc$loadings)
barplot(russia, col = 'blue', main = 'Russia')
qdf = data.frame('Q_residuals' = Pc$calres$Q[,2],'Sample' = c(1:nrow(df)), 'Country' = hotelling$Country, 'Continent' = df$Continent)
qplot = ggplot(qdf, aes(x= Sample, y= Q_residuals, text = Country, color = Continent))+
geom_point()+
geom_line()+
theme_bw()+
geom_hline(yintercept = Pc$Qlim[1,2], alpha = 0.3, color = 'orange')+
geom_hline(yintercept = Pc$Qlim[2,2], alpha = 0.3, color = 'darkred')
div(
ggplotly(qplot, width = 700, height = 500), align = 'center')
barplot(Pc$calres$residuals[10,],col = 'blue', main = 'Mexico')
Pc = selectCompNum(Pc, ncomp = 2)
selectedcomp = Pc$calres$ncomp.selected
plotResiduals(Pc, show.limits = T,
ncomp = selectedcomp,
labels=rownames(df),
show.labels= T,
norm = FALSE ,
lab.col='black',
xlab = 'T^2 Hotelling',
ylab = 'Q residuals',
cgroup = factor(df$Continent)
)
scoreplot = ggbiplot(pc, groups = df$Continent, ellipse = F, var.axes = F)+
theme_bw()+
geom_text(label= rownames(df), size= 3, nudge_y = 0.1, nudge_x = 0.2)+
labs(color='Continent')+
xlim(-3,2)+
geom_hline(yintercept = 0, alpha = 0.3, linetype = 'longdash')+
geom_vline(xintercept = 0, alpha = 0.3, linetype = 'longdash')
div(
ggplotly(scoreplot, width = 700, height = 500), align = 'center')
loadingplot = ggbiplot(pc)+
geom_point(color='white')+
theme_bw()+
xlim(-2.5,2.5)+
geom_hline(yintercept = 0, alpha = 0.3, linetype = 'longdash')+
geom_vline(xintercept = 0, alpha = 0.3, linetype = 'longdash')
loadingplot
biplo = ggbiplot(pc, groups = df$Continent, ellipse = T)+
theme_bw()+
geom_text(label= rownames(df), size= 3, nudge_y = 0.1, nudge_x = 0.2)+
labs(color='Continent')+
geom_hline(yintercept = 0, alpha = 0.3, linetype = 'longdash')+
geom_vline(xintercept = 0, alpha = 0.3, linetype = 'longdash')
div(
ggplotly(biplo, width = 700, height = 500), align = 'center')