PCA_NMDS

cep %>%
  str()

## Classes 'tbl_df', 'tbl' and 'data.frame':    290 obs. of  19 variables:
##  $ municipality    : chr  "Ale" "Alingsås" "Alvesta" "Aneby" ...
##  $ region          : chr  "Götaland" "Götaland" "Götaland" "Götaland" ...
##  $ county          : chr  "Västra Götalands län" "Västra Götalands län" "Kronobergs län" "Jönköpings län" ...
##  $ income          : chr  "Middle" "Middle" "Low" "Low" ...
##  $ pop.size        : num  30223 40390 20026 6776 13934 ...
##  $ area            : num  317 472 974 518 325 ...
##  $ mean.age        : num  39.5 42.1 41.8 42.4 44.2 47 45.3 44.6 46.2 43.9 ...
##  $ mortality       : num  0.84 1.005 0.939 0.945 1.184 ...
##  $ natality        : num  1.18 1.05 1.31 1.45 1.08 ...
##  $ pop.change      : num  2.23 0.854 0.879 2.553 0.222 ...
##  $ immigration     : num  7.41 4.88 6.82 8.09 5.96 ...
##  $ emigration      : num  5.52 4.12 6.31 6.02 5.76 ...
##  $ tax.capacity    : num  197627 199056 174595 181317 177804 ...
##  $ tax.equal       : num  -462 -1025 2357 -837 -885 ...
##  $ unemployment    : num  3.9 5.1 8.5 5.5 9.4 4.8 7.6 6.6 5.2 11.8 ...
##  $ foreign.origin  : num  21.8 14.6 24.3 14.8 18.2 ...
##  $ higher.edu      : num  21 24.2 17.2 17.7 18.9 ...
##  $ greenhouse.gases: num  103773 105709 119691 50356 62066 ...
##  $ dioxin.mg       : num  29.5 69.2 69.7 18.5 35 ...

cep <- na.omit(cep) #remove missing values

PCA.model<-princomp(~area+mean.age+mortality+natality+pop.change+
                      immigration+emigration+tax.capacity+unemployment+foreign.origin+
                      higher.edu+greenhouse.gases+dioxin.mg, cor=TRUE, data=cep, scores=TRUE)

#the number of PCs to show can be set with ncps. 
pca.scatter <- screeplot(PCA.model, npcs = 13, main = "Importance of components")

cep_pc <<- within(cep, {
  PC3 <- PCA.model$scores[,3]
  PC2 <- PCA.model$scores[,2]
  PC1 <- PCA.model$scores[,1] })

autoplot(PCA.model, data = cep, colour = 'income',  addEllipses=TRUE, ellipse.level=0.95) + 
  theme_light() + scale_colour_discrete("income")

autoplot(PCA.model, data = cep, colour = 'region') + 
  theme_light() + scale_colour_discrete("region")

autoplot(PCA.model, data = cep, colour = 'county') + 
  theme_light() + scale_colour_discrete("county")

p3<-autoplot(PCA.model, data = cep, colour = 'income', loadings = TRUE, 
             loadings.colour = 'blue', loadings.label = TRUE, loadings.label.size = 3) 

p3 + theme_light() + 
  scale_colour_discrete("income") + 
  geom_text(label=cep$municipality, vjust = -1, size=2)

p4<-ggplot(cep_pc) +
  aes(x = mean.age, y = PC1, colour = income) +
  geom_point(size = 2L) +
  scale_color_hue() +
  xlab('Mean Age')+
  theme_light() +
  theme(legend.position ="none")
p5<-ggplot(cep_pc) +
  aes(x = emigration, y = PC2, colour = income) +
  geom_point(size = 2L) +
  scale_color_hue() +
  xlab('Emigration')+
  theme_light() +
  theme(legend.position ="none")
grid.arrange(p4,p5, ncol=2)

df<-subset(cep, select = -c(county,income,region,municipality,pop.size,tax.equal))
PCA.model2<-PCA(df, graph=FALSE)
get_eig(PCA.model2)

##          eigenvalue variance.percent cumulative.variance.percent
## Dim.1  4.5061448557     34.662652736                    34.66265
## Dim.2  2.5582835242     19.679104032                    54.34176
## Dim.3  1.8160902012     13.969924624                    68.31168
## Dim.4  1.1538428253      8.875714041                    77.18740
## Dim.5  0.8077968685      6.213822065                    83.40122
## Dim.6  0.6016893918      4.628379937                    88.02960
## Dim.7  0.4079067353      3.137744118                    91.16734
## Dim.8  0.3603945414      2.772265703                    93.93961
## Dim.9  0.2905023827      2.234633713                    96.17424
## Dim.10 0.2409524099      1.853480076                    98.02772
## Dim.11 0.1597052843      1.228502187                    99.25622
## Dim.12 0.0962585005      0.740450003                    99.99667
## Dim.13 0.0004324794      0.003326765                   100.00000

fviz_screeplot(PCA.model2, addlabels = TRUE, main = 'Importance of principal components', xlab = 'Component')

fviz_contrib(PCA.model2, choice = "var", axes = 1)

fviz_contrib(PCA.model2, choice = "var", axes = 2)

fviz_pca_var(PCA.model2, col.var="contrib",
             gradient.cols = c("#00AFBB", "#E7B800", "#FC4E07"),
             repel = TRUE 
)

fviz_pca_biplot(PCA.model, label="var", habillage=cep$income, addEllipses=TRUE, ellipse.level=0.95)

otu_table(enter)[1:10,1:9]

## OTU Table:          [10 taxa and 9 samples]
##                      taxa are rows
##                    AM.AD.1    AM.AD.2  DA.AD.1    DA.AD.2    DA.AD.3    DA.AD.4
## -1               0.4265035 0.33226194 0.533604 0.63470800 0.77980981 0.76409898
## Bacteria         0.0000000 0.00000000 0.000000 0.00001328 0.00000000 0.00000000
## Prosthecochloris 0.0000000 0.00000000 0.000000 0.00000000 0.00000000 0.00000000
## Chloroflexus     0.0000000 0.00000000 0.000000 0.00000000 0.00000000 0.00000000
## Dehalococcoides  0.0000000 0.00026835 0.000000 0.00002030 0.00000000 0.00008728
## Thermus          0.0000000 0.00000000 0.000000 0.00000000 0.00000000 0.00000000
## Listeria         0.0000000 0.00000000 0.000000 0.00000000 0.00000000 0.00000000
## Fusobacterium    0.0000000 0.00001508 0.000000 0.00000852 0.00000000 0.00000000
## Streptobacillus  0.0000000 0.00000000 0.000000 0.00000000 0.00005138 0.00000000
## Brevibacillus    0.0000000 0.00000000 0.000000 0.00000237 0.00000000 0.00000000
##                     ES.AD.1   ES.AD.2   ES.AD.3
## -1               0.25166833 0.4271269 0.3912698
## Bacteria         0.00000000 0.0000000 0.0000000
## Prosthecochloris 0.00000000 0.0000000 0.0000000
## Chloroflexus     0.00000000 0.0000000 0.0000000
## Dehalococcoides  0.00004637 0.0000000 0.0000000
## Thermus          0.00000000 0.0000000 0.0000000
## Listeria         0.00000000 0.0000000 0.0000000
## Fusobacterium    0.00000000 0.0000000 0.0000000
## Streptobacillus  0.00000000 0.0000000 0.0000000
## Brevibacillus    0.00000000 0.0000000 0.0000000

apply(otu_table(enter),2,sum)

##   AM.AD.1   AM.AD.2   DA.AD.1   DA.AD.2   DA.AD.3   DA.AD.4   ES.AD.1   ES.AD.2 
## 0.9999999 1.0000000 1.0000000 1.0000000 1.0000000 1.0000000 1.0000000 1.0000000 
##   ES.AD.3   ES.AD.4   FR.AD.1   FR.AD.2   FR.AD.3   FR.AD.4   FR.AD.5   FR.AD.6 
## 1.0000000 1.0000000 1.0000000 1.0000000 0.9999999 1.0000000 1.0000000 1.0000000 
##   FR.AD.7   FR.AD.8   IT.AD.1   IT.AD.2   IT.AD.3   IT.AD.4   IT.AD.5   IT.AD.6 
## 1.0000000 1.0000000 1.0000000 1.0000000 1.0000000 1.0000000 1.0000001 0.9999999 
##   JP.AD.1   JP.AD.2   JP.AD.3   JP.AD.4   JP.AD.5   JP.AD.6   JP.AD.7   JP.AD.8 
## 1.0000000 1.0000000 1.0000000 1.0000000 1.0000000 1.0000000 1.0000000 1.0000000 
##   JP.AD.9 
## 1.0000000

newsampledata=sample_data(sventer)
newsampledata$Enterotype=addNA(newsampledata$Enterotype)
sample_data(enter)=newsampledata
table(sample_data(enter)$Enterotype)

## 
##    1    2    3 <NA> 
##    8    6   17    2

levels(sample_data(enter)$Enterotype) =c("1","2","3","4")
table(sample_data(enter)$Enterotype)

## 
##  1  2  3  4 
##  8  6 17  2

data = as(otu_table(enter), "matrix")
data = data[-1, ] ####Take out the data missing an otu classification.

myTaxa <- names(sort(taxa_sums(enter), decreasing = TRUE)[1:1000])
GP3 <- prune_taxa(myTaxa, enter)
GP3

## phyloseq-class experiment-level object
## otu_table()   OTU Table:         [ 553 taxa and 33 samples ]
## sample_data() Sample Data:       [ 33 samples by 9 sample variables ]
## tax_table()   Taxonomy Table:    [ 553 taxa by 1 taxonomic ranks ]

require(ggplot2)
#Bray-Curtis
nmds.bray <- ordinate(GP3, method="NMDS", distance="bray")

## Run 0 stress 0.1508618 
## Run 1 stress 0.1735976 
## Run 2 stress 0.1530483 
## Run 3 stress 0.155411 
## Run 4 stress 0.1530476 
## Run 5 stress 0.1925618 
## Run 6 stress 0.1508605 
## ... New best solution
## ... Procrustes: rmse 0.0006311593  max resid 0.0024577 
## ... Similar to previous best
## Run 7 stress 0.1504053 
## ... New best solution
## ... Procrustes: rmse 0.03488855  max resid 0.1655918 
## Run 8 stress 0.1584142 
## Run 9 stress 0.1634679 
## Run 10 stress 0.1859829 
## Run 11 stress 0.1715057 
## Run 12 stress 0.1514534 
## Run 13 stress 0.1937191 
## Run 14 stress 0.1810915 
## Run 15 stress 0.1541248 
## Run 16 stress 0.1579322 
## Run 17 stress 0.1508605 
## ... Procrustes: rmse 0.03492331  max resid 0.1663783 
## Run 18 stress 0.1551209 
## Run 19 stress 0.1530478 
## Run 20 stress 0.1512936 
## *** No convergence -- monoMDS stopping criteria:
##     20: stress ratio > sratmax

print(nmds.bray)

## 
## Call:
## metaMDS(comm = veganifyOTU(physeq), distance = distance) 
## 
## global Multidimensional Scaling using monoMDS
## 
## Data:     veganifyOTU(physeq) 
## Distance: bray 
## 
## Dimensions: 2 
## Stress:     0.1504053 
## Stress type 1, weak ties
## No convergent solutions - best solution after 20 tries
## Scaling: centring, PC rotation, halfchange scaling 
## Species: expanded scores based on 'veganifyOTU(physeq)'

# Create plot, store as temp variable, p
p <- plot_ordination(GP3, nmds.bray, color = "Enterotype", shape = "Nationality")
# Add title to each plot
p <- p + ggtitle("Bray Curtis Distance, NonMetric MDS")
p + geom_point(size = 4)+ theme_bw () + scale_colour_brewer(type="qual", palette="Set1")

veganotu = function(physeq) {
  require("vegan")
  OTU = otu_table(physeq)
  if (taxa_are_rows(OTU)) {
    OTU = t(OTU)
  }
  return(as(OTU, "matrix"))
}

#export data from phyloseq to vegan-compatible object
GP3_vegan<-veganotu(GP3)

# make a data frame that can be used in vegan from the sample_data in the phyloseq object
sampledf <- data.frame(sample_data(GP3))
#create a dissimilarity matrix similar to the one used in the analysis in phyloseq specify the same transformation and standardisation as used earlier.
GP3_BC<-vegdist((GP3_vegan), method = "bray")

# testing for homogeneity of group dispersions
betadisp_GP3<-betadisper(GP3_BC, sampledf$Nationality)
betadisp_GP3

## 
##  Homogeneity of multivariate dispersions
## 
## Call: betadisper(d = GP3_BC, group = sampledf$Nationality)
## 
## No. of Positive Eigenvalues: 21
## No. of Negative Eigenvalues: 11
## 
## Average distance to median:
## american   danish   french  italian japanese  spanish 
##   0.1770   0.1596   0.1631   0.2161   0.2269   0.2249 
## 
## Eigenvalues for PCoA axes:
## (Showing 8 of 32 eigenvalues)
##   PCoA1   PCoA2   PCoA3   PCoA4   PCoA5   PCoA6   PCoA7   PCoA8 
## 0.94156 0.40317 0.33686 0.20205 0.14594 0.13778 0.08504 0.06803

boxplot(betadisp_GP3, xlab="", las=2, cex.axis=0.8)

anova(betadisp_GP3)

## Analysis of Variance Table
## 
## Response: Distances
##           Df   Sum Sq   Mean Sq F value Pr(>F)
## Groups     5 0.028893 0.0057786  0.7874 0.5679
## Residuals 27 0.198156 0.0073391

adonis(GP3_BC ~ Nationality, data = sampledf)

## 
## Call:
## adonis(formula = GP3_BC ~ Nationality, data = sampledf) 
## 
## Permutation: free
## Number of permutations: 999
## 
## Terms added sequentially (first to last)
## 
##             Df SumsOfSqs  MeanSqs F.Model     R2 Pr(>F)    
## Nationality  5   0.91239 0.182479  3.3139 0.3803  0.001 ***
## Residuals   27   1.48675 0.055065         0.6197           
## Total       32   2.39914                  1.0000           
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

GP3_mds <- metaMDS(venter, trace = FALSE)
GP3_mds

## 
## Call:
## metaMDS(comm = venter, trace = FALSE) 
## 
## global Multidimensional Scaling using monoMDS
## 
## Data:     venter 
## Distance: bray 
## 
## Dimensions: 2 
## Stress:     0.1490281 
## Stress type 1, weak ties
## Two convergent solutions found after 20 tries
## Scaling: centring, PC rotation, halfchange scaling 
## Species: expanded scores based on 'venter'

# Build a color palette, 9 (number of sample types) colours coming from the Set1 color palette (which was used in the original NMDS above)
colpal <- brewer.pal(9, "Set2")

## Warning in brewer.pal(9, "Set2"): n too large, allowed maximum for palette Set2 is 8
## Returning the palette you asked for with that many colors

plot(GP3_mds,type="n", dis="sp") #ordiplot can also be used. 
with(sampledf, points(GP3_mds, display = "species", col = colpal[Nationality],
                      pch = 16, cex=2))
text(sampledf, cex = 0.65, col = "black",pos=1) 
with(sampledf, legend(2.4, 1.5, legend = levels(Nationality), bty = "n",
                      col = colpal, pch = 16, cex = 1))
text(2.4, -1.5, "Stress = 0.149", cex=1)

PCA_NMDS

Theodorou Sofia

3/31/2020