Tabs

Análisis de componentes principales

Dataset

Clusterización k-means

contenido del segundo tab

Análisis de regresión

#Paquetes
#install.packages("corrr")
#install.packages("corrplot")
#install.packages("FactoMineR")
#install.packages("factoextra")
#install.packages("rworldmap")
#install.packages("rnaturalearth")
#install.packages("rnaturalearthdata")
#install.packages("sf")
#Matriz de correlación
library(corrr)
library(corrplot)

## corrplot 0.94 loaded

#PCA
library(FactoMineR)
library(factoextra)

## Cargando paquete requerido: ggplot2

## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa

#graficos
library(ggplot2)
library(plotly)

## 
## Adjuntando el paquete: 'plotly'

## The following object is masked from 'package:ggplot2':
## 
##     last_plot

## The following object is masked from 'package:stats':
## 
##     filter

## The following object is masked from 'package:graphics':
## 
##     layout

#manejo
library(dplyr)

## 
## Adjuntando el paquete: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

#Mapa
library(rworldmap)

## Cargando paquete requerido: sp

## ### Welcome to rworldmap ###

## For a short introduction type :   vignette('rworldmap')

library(rnaturalearth)
library(rnaturalearthdata)

## 
## Adjuntando el paquete: 'rnaturalearthdata'

## The following object is masked from 'package:rnaturalearth':
## 
##     countries110

library(sf)

## Linking to GEOS 3.12.1, GDAL 3.8.4, PROJ 9.3.1; sf_use_s2() is TRUE

# Load the protein consumption data set and inspect its structure
protein_data <- read.csv(file = "protein.csv")
str(object = protein_data)

## 'data.frame':    25 obs. of  11 variables:
##  $ Country             : chr  "Albania" "Austria" "Belgium" "Bulgaria" ...
##  $ Red_Meat            : int  10 9 14 8 10 11 8 10 18 10 ...
##  $ White_Meat          : int  1 14 9 6 11 11 12 5 10 3 ...
##  $ Eggs                : int  1 4 4 2 3 4 4 3 3 3 ...
##  $ Milk                : int  9 20 18 8 13 25 11 34 20 18 ...
##  $ Fish                : int  0 2 5 1 2 10 5 6 6 6 ...
##  $ Cereals             : int  42 28 27 57 34 22 25 26 28 42 ...
##  $ Starchy_Foods       : int  1 4 6 1 5 5 7 5 5 2 ...
##  $ Pulses_nuts_oilseeds: int  6 1 2 4 1 1 1 1 2 8 ...
##  $ Fruits_Vegetables   : int  2 4 4 4 4 2 4 1 7 7 ...
##  $ Total               : int  72 86 89 91 83 91 77 91 99 99 ...

# Count the missing values (NA) in every column
colSums(x = is.na(protein_data))

##              Country             Red_Meat           White_Meat 
##                    0                    0                    0 
##                 Eggs                 Milk                 Fish 
##                    0                    0                    0 
##              Cereals        Starchy_Foods Pulses_nuts_oilseeds 
##                    0                    0                    0 
##    Fruits_Vegetables                Total 
##                    0                    0

# Keep only the numeric consumption variables (columns 2 to 10); this leaves
# out Country (column 1) and Total (column 11).

numerical_data <- protein_data[2:10]

# Mean of every column

vapply(numerical_data, mean, numeric(1))

##             Red_Meat           White_Meat                 Eggs 
##                 9.80                 7.92                 3.08 
##                 Milk                 Fish              Cereals 
##                17.28                 4.28                32.32 
##        Starchy_Foods Pulses_nuts_oilseeds    Fruits_Vegetables 
##                 4.36                 3.08                 4.20

# Compute the variance

# A data frame is a list of columns, so vapply() visits one column at a time
# and returns one number per column.
vapply(numerical_data, var, numeric(1))

##             Red_Meat           White_Meat                 Eggs 
##            11.583333            13.993333             1.243333 
##                 Milk                 Fish              Cereals 
##            50.376667            12.043333           121.226667 
##        Starchy_Foods Pulses_nuts_oilseeds    Fruits_Vegetables 
##             2.740000             4.076667             3.666667

# Standard deviation of every column
vapply(numerical_data, sd, numeric(1))

##             Red_Meat           White_Meat                 Eggs 
##             3.403430             3.740766             1.115049 
##                 Milk                 Fish              Cereals 
##             7.097652             3.470351            11.010298 
##        Starchy_Foods Pulses_nuts_oilseeds    Fruits_Vegetables 
##             1.655295             2.019076             1.914854

# Standardize the data

# scale() subtracts each column's mean and divides by its standard deviation
data_normalized <- scale(numerical_data, center = TRUE, scale = TRUE)

# Check the standardization: every column mean should be numerically zero

apply(data_normalized, MARGIN = 2, FUN = mean)

##             Red_Meat           White_Meat                 Eggs 
##        -2.192951e-16         1.097646e-17        -6.438426e-17 
##                 Milk                 Fish              Cereals 
##        -1.701417e-16        -6.438426e-17        -1.774622e-17 
##        Starchy_Foods Pulses_nuts_oilseeds    Fruits_Vegetables 
##        -1.976479e-16        -1.833169e-17        -1.021492e-16

# Check the standardization: every column should have standard deviation 1
apply(data_normalized, MARGIN = 2, FUN = sd)

##             Red_Meat           White_Meat                 Eggs 
##                    1                    1                    1 
##                 Milk                 Fish              Cereals 
##                    1                    1                    1 
##        Starchy_Foods Pulses_nuts_oilseeds    Fruits_Vegetables 
##                    1                    1                    1

# Run the principal component analysis (PCA) on the standardized data

data.cpa <- princomp(x = data_normalized)
# Centering values used by princomp(); the input was already centered by
# scale(), so these are numerically zero.
data.cpa[["center"]]

##             Red_Meat           White_Meat                 Eggs 
##        -2.272488e-16         2.233456e-17        -7.806256e-17 
##                 Milk                 Fish              Cereals 
##        -1.806281e-16        -5.290907e-17        -1.734723e-17 
##        Starchy_Foods Pulses_nuts_oilseeds    Fruits_Vegetables 
##        -1.873501e-16        -1.821460e-17        -9.714451e-17

# Loadings of each variable on each component (some cells are left blank in
# the printed table). Note: the "Proportion Var" rows printed under the
# loadings are 0.111 (1/9) for every component; they are not the variance
# explained by each component -- see summary(data.cpa) for that.
data.cpa$loadings

## 
## Loadings:
##                      Comp.1 Comp.2 Comp.3 Comp.4 Comp.5 Comp.6 Comp.7 Comp.8
## Red_Meat              0.311         0.355  0.597  0.397  0.377  0.228       
## White_Meat            0.316  0.215 -0.628        -0.311         0.146       
## Eggs                  0.421                0.255        -0.665         0.467
## Milk                  0.379  0.169  0.404        -0.318        -0.718 -0.102
## Fish                  0.134 -0.652  0.300 -0.235 -0.304         0.237  0.441
## Cereals              -0.430  0.254                0.185  0.194 -0.343  0.721
## Starchy_Foods         0.296 -0.389 -0.281 -0.305  0.673        -0.326       
## Pulses_nuts_oilseeds -0.422 -0.129  0.140  0.251        -0.587        -0.218
## Fruits_Vegetables    -0.122 -0.504 -0.340  0.604 -0.228  0.158 -0.359       
##                      Comp.9
## Red_Meat              0.251
## White_Meat            0.577
## Eggs                 -0.275
## Milk                  0.190
## Fish                  0.260
## Cereals               0.192
## Starchy_Foods         0.150
## Pulses_nuts_oilseeds  0.567
## Fruits_Vegetables    -0.211
## 
##                Comp.1 Comp.2 Comp.3 Comp.4 Comp.5 Comp.6 Comp.7 Comp.8 Comp.9
## SS loadings     1.000  1.000  1.000  1.000  1.000  1.000  1.000  1.000  1.000
## Proportion Var  0.111  0.111  0.111  0.111  0.111  0.111  0.111  0.111  0.111
## Cumulative Var  0.111  0.222  0.333  0.444  0.556  0.667  0.778  0.889  1.000

# Standard deviation, proportion of variance and cumulative proportion of
# variance of every component.
summary(data.cpa)

## Importance of components:
##                           Comp.1    Comp.2    Comp.3    Comp.4    Comp.5
## Standard deviation     1.9828553 1.2489623 1.0207403 0.9321032 0.6400533
## Proportion of Variance 0.4550596 0.1805448 0.1205915 0.1005574 0.0474153
## Cumulative Proportion  0.4550596 0.6356044 0.7561959 0.8567534 0.9041687
##                            Comp.6     Comp.7     Comp.8     Comp.9
## Standard deviation     0.57711577 0.50866787 0.35936288 0.32716279
## Proportion of Variance 0.03854891 0.02994711 0.01494695 0.01238837
## Cumulative Proportion  0.94271757 0.97266468 0.98761163 1.00000000

#Los cuatro primeros componentes principales pueden considerarse los más significativos, ya que explican casi el 86% de la varianza total.

# Pearson correlation matrix of the standardized variables
corr_matrix <- cor(x = data_normalized, method = "pearson")
corr_matrix

##                         Red_Meat  White_Meat        Eggs       Milk        Fish
## Red_Meat              1.00000000  0.18850977  0.57532001  0.5440251  0.06491072
## White_Meat            0.18850977  1.00000000  0.60095535  0.2974816 -0.19719960
## Eggs                  0.57532001  0.60095535  1.00000000  0.6130310  0.04780844
## Milk                  0.54402512  0.29748163  0.61303102  1.0000000  0.16246239
## Fish                  0.06491072 -0.19719960  0.04780844  0.1624624  1.00000000
## Cereals              -0.50970337 -0.43941908 -0.70131040 -0.5924925 -0.51714759
## Starchy_Foods         0.15383673  0.33456770  0.41266333  0.2144917  0.43868411
## Pulses_nuts_oilseeds -0.40988882 -0.67214885 -0.59519381 -0.6238357 -0.12226043
## Fruits_Vegetables    -0.06393465 -0.07329308 -0.16392249 -0.3997753  0.22948842
##                          Cereals Starchy_Foods Pulses_nuts_oilseeds
## Red_Meat             -0.50970337     0.1538367           -0.4098888
## White_Meat           -0.43941908     0.3345677           -0.6721488
## Eggs                 -0.70131040     0.4126633           -0.5951938
## Milk                 -0.59249246     0.2144917           -0.6238357
## Fish                 -0.51714759     0.4386841           -0.1222604
## Cereals               1.00000000    -0.5781345            0.6360595
## Starchy_Foods        -0.57813449     1.0000000           -0.4951880
## Pulses_nuts_oilseeds  0.63605948    -0.4951880            1.0000000
## Fruits_Vegetables     0.04229293     0.0683567            0.3513323
##                      Fruits_Vegetables
## Red_Meat                   -0.06393465
## White_Meat                 -0.07329308
## Eggs                       -0.16392249
## Milk                       -0.39977527
## Fish                        0.22948842
## Cereals                     0.04229293
## Starchy_Foods               0.06835670
## Pulses_nuts_oilseeds        0.35133227
## Fruits_Vegetables           1.00000000

# Plot the correlation matrix.
# NOTE(review): ggcorrplot is called through its namespace and is not among
# the packages listed in the install/library section visible in this file;
# it must be installed separately.
ggcorrplot::ggcorrplot(corr_matrix)

# Scree plot of the PCA result, with value labels on the bars

fviz_eig(data.cpa,addlabels = TRUE)

# Plot of the variables from the PCA result
fviz_pca_var(data.cpa)

# cos2 of the variables over components 1 to 4
# (presumably the quality of representation -- confirm in the factoextra docs)
fviz_cos2(data.cpa, choice = "var",axes = 1:4)

## Clusterización k-means: se asume que ya existe el objeto data.cpa con los componentes principales

# Scores of the first four principal components; these are the features
# used for the clustering.
cpa_scores <- data.cpa$scores[, 1:4]

# K-means clustering of the 25 observations (one per country) into 3 groups
set.seed(123) # Fixed seed so the result is reproducible
kmeans_result <- kmeans(cpa_scores, centers = 3, nstart = 25)

# Attach the assigned cluster to each country in the original data frame
protein_data$Cluster <- as.factor(kmeans_result$cluster)

# Data for the plot: country, cluster and the scores on the first two
# components. Using real columns (instead of indexing cpa_scores inside
# aes()) keeps every aesthetic tied to the rows of the plotted data and
# gives readable variable names in the plotly tooltips.
cluster_plot_data <- data.frame(
  Country = protein_data$Country,
  Cluster = protein_data$Cluster,
  PC1 = cpa_scores[, 1],
  PC2 = cpa_scores[, 2]
)

# Clusters shown in the space of the first two principal components.
# Points are labelled with the Country column: rownames(protein_data) only
# holds the automatic row numbers, not the country names.
plot1 <- ggplot(
  cluster_plot_data,
  aes(x = PC1, y = PC2, color = Cluster, label = Country)
) +
  geom_point(size = 5) +    # scatter points, size 5
  geom_text(vjust = 1.5) +  # country names, offset vertically from the points
  labs(
    title = "Clustering de Países basado en el Consumo de Proteínas",
    x = "Componente Principal 1", # X axis label
    y = "Componente Principal 2"  # Y axis label
  ) +
  theme_minimal()           # minimal theme for the plot

# Interactive version of the plot
ggplotly(plot1)

Clase 13

Juan Pablo Bolaños

2024-08-27

Tabs

Análisis de componentes principales

Dataset

Clusterización k-means

Análisis de regresión