# Contenido del segundo tab
#Paquetes
#install.packages("corrr")
#install.packages("corrplot")
#install.packages("FactoMineR")
#install.packages("factoextra")
#install.packages("rworldmap")
#install.packages("rnaturalearth")
#install.packages("rnaturalearthdata")
#install.packages("sf")
#Matriz de correlación
library(corrr)
library(corrplot)
## corrplot 0.94 loaded
#PCA
library(FactoMineR)
library(factoextra)
## Cargando paquete requerido: ggplot2
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
#graficos
library(ggplot2)
library(plotly)
##
## Adjuntando el paquete: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
#manejo
library(dplyr)
##
## Adjuntando el paquete: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
#Mapa
library(rworldmap)
## Cargando paquete requerido: sp
## ### Welcome to rworldmap ###
## For a short introduction type : vignette('rworldmap')
library(rnaturalearth)
library(rnaturalearthdata)
##
## Adjuntando el paquete: 'rnaturalearthdata'
## The following object is masked from 'package:rnaturalearth':
##
## countries110
library(sf)
## Linking to GEOS 3.12.1, GDAL 3.8.4, PROJ 9.3.1; sf_use_s2() is TRUE
# Load the protein consumption dataset from the relative path "protein.csv"
# and display its structure (column names, types and first values).
protein_data <- read.csv(file = "protein.csv")
str(object = protein_data)
## 'data.frame': 25 obs. of 11 variables:
## $ Country : chr "Albania" "Austria" "Belgium" "Bulgaria" ...
## $ Red_Meat : int 10 9 14 8 10 11 8 10 18 10 ...
## $ White_Meat : int 1 14 9 6 11 11 12 5 10 3 ...
## $ Eggs : int 1 4 4 2 3 4 4 3 3 3 ...
## $ Milk : int 9 20 18 8 13 25 11 34 20 18 ...
## $ Fish : int 0 2 5 1 2 10 5 6 6 6 ...
## $ Cereals : int 42 28 27 57 34 22 25 26 28 42 ...
## $ Starchy_Foods : int 1 4 6 1 5 5 7 5 5 2 ...
## $ Pulses_nuts_oilseeds: int 6 1 2 4 1 1 1 1 2 8 ...
## $ Fruits_Vegetables : int 2 4 4 4 4 2 4 1 7 7 ...
## $ Total : int 72 86 89 91 83 91 77 91 99 99 ...
# Count the missing values (NA) in every column of the dataset.
vapply(protein_data, function(column) sum(is.na(column)), numeric(1))
## Country Red_Meat White_Meat
## 0 0 0
## Eggs Milk Fish
## 0 0 0
## Cereals Starchy_Foods Pulses_nuts_oilseeds
## 0 0 0
## Fruits_Vegetables Total
## 0 0
# Keep only the numeric food-group columns (positions 2 to 10); this leaves
# out the first column (Country) and the last one (Total).
numerical_data <- protein_data[2:10]
# Mean of each column.
vapply(numerical_data, mean, numeric(1))
## Red_Meat White_Meat Eggs
## 9.80 7.92 3.08
## Milk Fish Cereals
## 17.28 4.28 32.32
## Starchy_Foods Pulses_nuts_oilseeds Fruits_Vegetables
## 4.36 3.08 4.20
# Variance of each column.
vapply(numerical_data, var, numeric(1))
## Red_Meat White_Meat Eggs
## 11.583333 13.993333 1.243333
## Milk Fish Cereals
## 50.376667 12.043333 121.226667
## Starchy_Foods Pulses_nuts_oilseeds Fruits_Vegetables
## 2.740000 4.076667 3.666667
# Standard deviation of each column.
vapply(numerical_data, sd, numeric(1))
## Red_Meat White_Meat Eggs
## 3.403430 3.740766 1.115049
## Milk Fish Cereals
## 7.097652 3.470351 11.010298
## Starchy_Foods Pulses_nuts_oilseeds Fruits_Vegetables
## 1.655295 2.019076 1.914854
# Standardise the data: each column has its mean subtracted and is then
# divided by its standard deviation.
data_normalized <- scale(numerical_data, center = TRUE, scale = TRUE)
# Verify the standardisation: column means should be zero up to
# floating-point error (MARGIN = 2 applies the function column by column).
apply(data_normalized, 2, mean)
## Red_Meat White_Meat Eggs
## -2.192951e-16 1.097646e-17 -6.438426e-17
## Milk Fish Cereals
## -1.701417e-16 -6.438426e-17 -1.774622e-17
## Starchy_Foods Pulses_nuts_oilseeds Fruits_Vegetables
## -1.976479e-16 -1.833169e-17 -1.021492e-16
# ... and every column standard deviation should be 1.
apply(data_normalized, 2, sd)
## Red_Meat White_Meat Eggs
## 1 1 1
## Milk Fish Cereals
## 1 1 1
## Starchy_Foods Pulses_nuts_oilseeds Fruits_Vegetables
## 1 1 1
# Principal component analysis (PCA) on the standardised data.
data.cpa <- princomp(data_normalized)
# Centering values stored in the fitted object (effectively zero because the
# input was already standardised).
data.cpa[["center"]]
## Red_Meat White_Meat Eggs
## -2.272488e-16 2.233456e-17 -7.806256e-17
## Milk Fish Cereals
## -1.806281e-16 -5.290907e-17 -1.734723e-17
## Starchy_Foods Pulses_nuts_oilseeds Fruits_Vegetables
## -1.873501e-16 -1.821460e-17 -9.714451e-17
# Loadings: weight of each original variable on each component.
loadings(data.cpa)
##
## Loadings:
## Comp.1 Comp.2 Comp.3 Comp.4 Comp.5 Comp.6 Comp.7 Comp.8
## Red_Meat 0.311 0.355 0.597 0.397 0.377 0.228
## White_Meat 0.316 0.215 -0.628 -0.311 0.146
## Eggs 0.421 0.255 -0.665 0.467
## Milk 0.379 0.169 0.404 -0.318 -0.718 -0.102
## Fish 0.134 -0.652 0.300 -0.235 -0.304 0.237 0.441
## Cereals -0.430 0.254 0.185 0.194 -0.343 0.721
## Starchy_Foods 0.296 -0.389 -0.281 -0.305 0.673 -0.326
## Pulses_nuts_oilseeds -0.422 -0.129 0.140 0.251 -0.587 -0.218
## Fruits_Vegetables -0.122 -0.504 -0.340 0.604 -0.228 0.158 -0.359
## Comp.9
## Red_Meat 0.251
## White_Meat 0.577
## Eggs -0.275
## Milk 0.190
## Fish 0.260
## Cereals 0.192
## Starchy_Foods 0.150
## Pulses_nuts_oilseeds 0.567
## Fruits_Vegetables -0.211
##
## Comp.1 Comp.2 Comp.3 Comp.4 Comp.5 Comp.6 Comp.7 Comp.8 Comp.9
## SS loadings 1.000 1.000 1.000 1.000 1.000 1.000 1.000 1.000 1.000
## Proportion Var 0.111 0.111 0.111 0.111 0.111 0.111 0.111 0.111 0.111
## Cumulative Var 0.111 0.222 0.333 0.444 0.556 0.667 0.778 0.889 1.000
# Standard deviation and (cumulative) proportion of variance per component.
summary(object = data.cpa)
## Importance of components:
## Comp.1 Comp.2 Comp.3 Comp.4 Comp.5
## Standard deviation 1.9828553 1.2489623 1.0207403 0.9321032 0.6400533
## Proportion of Variance 0.4550596 0.1805448 0.1205915 0.1005574 0.0474153
## Cumulative Proportion 0.4550596 0.6356044 0.7561959 0.8567534 0.9041687
## Comp.6 Comp.7 Comp.8 Comp.9
## Standard deviation 0.57711577 0.50866787 0.35936288 0.32716279
## Proportion of Variance 0.03854891 0.02994711 0.01494695 0.01238837
## Cumulative Proportion 0.94271757 0.97266468 0.98761163 1.00000000
# The first four principal components can be regarded as the most
# significant ones: together they hold almost 86% of the information.
# Correlation matrix of the standardised variables (computed and printed).
corr_matrix <- cor(x = data_normalized)
corr_matrix
## Red_Meat White_Meat Eggs Milk Fish
## Red_Meat 1.00000000 0.18850977 0.57532001 0.5440251 0.06491072
## White_Meat 0.18850977 1.00000000 0.60095535 0.2974816 -0.19719960
## Eggs 0.57532001 0.60095535 1.00000000 0.6130310 0.04780844
## Milk 0.54402512 0.29748163 0.61303102 1.0000000 0.16246239
## Fish 0.06491072 -0.19719960 0.04780844 0.1624624 1.00000000
## Cereals -0.50970337 -0.43941908 -0.70131040 -0.5924925 -0.51714759
## Starchy_Foods 0.15383673 0.33456770 0.41266333 0.2144917 0.43868411
## Pulses_nuts_oilseeds -0.40988882 -0.67214885 -0.59519381 -0.6238357 -0.12226043
## Fruits_Vegetables -0.06393465 -0.07329308 -0.16392249 -0.3997753 0.22948842
## Cereals Starchy_Foods Pulses_nuts_oilseeds
## Red_Meat -0.50970337 0.1538367 -0.4098888
## White_Meat -0.43941908 0.3345677 -0.6721488
## Eggs -0.70131040 0.4126633 -0.5951938
## Milk -0.59249246 0.2144917 -0.6238357
## Fish -0.51714759 0.4386841 -0.1222604
## Cereals 1.00000000 -0.5781345 0.6360595
## Starchy_Foods -0.57813449 1.0000000 -0.4951880
## Pulses_nuts_oilseeds 0.63605948 -0.4951880 1.0000000
## Fruits_Vegetables 0.04229293 0.0683567 0.3513323
## Fruits_Vegetables
## Red_Meat -0.06393465
## White_Meat -0.07329308
## Eggs -0.16392249
## Milk -0.39977527
## Fish 0.22948842
## Cereals 0.04229293
## Starchy_Foods 0.06835670
## Pulses_nuts_oilseeds 0.35133227
## Fruits_Vegetables 1.00000000
# Heat map of the correlation matrix.
ggcorrplot::ggcorrplot(corr = corr_matrix)
# Scree plot: share of variance explained by each component, with labels.
fviz_eig(X = data.cpa, addlabels = TRUE)
# Variable plot for the PCA fit.
fviz_pca_var(X = data.cpa)
# cos2 of the variables over components 1 to 4.
fviz_cos2(X = data.cpa, choice = "var", axes = 1:4)
# K-means clustering ----
# Requires the fitted PCA object `data.cpa` (principal components) and the
# `protein_data` data frame created earlier in this script.

# Scores of the first four principal components (they accumulate about 86%
# of the variance according to summary(data.cpa)).
cpa_scores <- data.cpa$scores[, 1:4]

# K-means on the 25 observations (one per country).
set.seed(123) # fixed seed so the cluster assignment is reproducible
kmeans_result <- kmeans(cpa_scores, centers = 3, nstart = 25)

# Attach the assigned cluster to each country in the original data frame.
protein_data$Cluster <- as.factor(kmeans_result$cluster)

# Dedicated plotting table: keeps the aesthetics mapped to real columns
# instead of indexing an external matrix inside aes(), and leaves
# `protein_data` with no extra columns besides `Cluster`.
cluster_plot_data <- data.frame(
  Country = protein_data$Country,
  Cluster = protein_data$Cluster,
  PC1 = cpa_scores[, 1],
  PC2 = cpa_scores[, 2]
)

# Clusters in the plane of the first two principal components.
# Points are labelled with the country name; the previous version used
# rownames(protein_data), which are just the row numbers "1".."25".
plot1 <- ggplot(
  cluster_plot_data,
  aes(x = PC1, y = PC2, color = Cluster, label = Country)
) +
  geom_point(size = 5) + # scatter points, size 5
  geom_text(vjust = 1.5) + # shift labels vertically so they clear the points
  labs(
    title = "Clustering de Países basado en el Consumo de Proteínas",
    x = "Componente Principal 1", # x-axis label
    y = "Componente Principal 2"  # y-axis label
  ) +
  theme_minimal() # minimal theme for the chart

# Interactive (plotly) version of the chart.
ggplotly(plot1)