Chcolates_PROYECTO

## Cargamos las librerías. Nota: las siguientes librerías se pueden instalar con `install.packages()`:

library(ggplot2) #paquete de gráficas

## Warning: package 'ggplot2' was built under R version 4.2.2

library(tidyverse) #Paquete que nos ayuda a ocnectar con más paquetes

## Warning: package 'tidyverse' was built under R version 4.2.2

## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ tibble  3.1.8      ✔ dplyr   1.0.10
## ✔ tidyr   1.2.1      ✔ stringr 1.5.0 
## ✔ readr   2.1.3      ✔ forcats 0.5.2 
## ✔ purrr   0.3.5

## Warning: package 'tibble' was built under R version 4.2.2

## Warning: package 'tidyr' was built under R version 4.2.2

## Warning: package 'readr' was built under R version 4.2.2

## Warning: package 'purrr' was built under R version 4.2.2

## Warning: package 'dplyr' was built under R version 4.2.2

## Warning: package 'stringr' was built under R version 4.2.2

## Warning: package 'forcats' was built under R version 4.2.2

## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()

library(rmarkdown) #paquete que nos ayuda a cargar un  informrte en HTTML, word, etc

## Warning: package 'rmarkdown' was built under R version 4.2.2

library(skimr) #para variables estadisticas

## Warning: package 'skimr' was built under R version 4.2.2

library(dplyr) #para editar los datos
library(janitor) #funciones para la limpieza de datos

## Warning: package 'janitor' was built under R version 4.2.2

## 
## Attaching package: 'janitor'
## 
## The following objects are masked from 'package:stats':
## 
##     chisq.test, fisher.test

library("here")  #Este paquete facilita la consulta de los archivos

## Warning: package 'here' was built under R version 4.2.2

## here() starts at C:/Users/moren/OneDrive/Documents/Google_certifid

library(readr) #para leer datos

##Datos a analizar

Para poder cargar un documento CSV usamos la siguiente función de R:

# Read the chocolate ratings CSV into `flavors_of_cacao`.
# NOTE(review): the path is absolute and machine-specific, so this line only
# works where that exact file exists — consider a project-relative path.
flavors_of_cacao <- read_csv("C:/Users/moren/OneDrive/Escritorio/Proyectos/Proyecto_Chocolate/flavors_of_cacao.csv")

## Rows: 1795 Columns: 10
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (5): Creador_empresa_si_exite, Origen_FRIJOL_BARRA, Empresa_localidad, F...
## dbl (5): Id_d, REF, Revisar, Porcentaje_Cocoa, Popularidad
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

# Open the data in the spreadsheet-style viewer only in an interactive
# session; when the script is rendered or run in batch mode there is no
# viewer to open, so the call is skipped.
if (interactive()) {
  View(flavors_of_cacao)
}

## Datos con clasificación de popularidad

# Read the CSV variant that also carries the `Popularidad_Class` column
# into `flavors_of_cacao_V3`.
# NOTE(review): the path is absolute and machine-specific, so this line only
# works where that exact file exists — consider a project-relative path.
flavors_of_cacao_V3 <- read_csv("C:/Users/moren/OneDrive/Escritorio/Proyectos/Proyecto_Chocolate/flavors_of_cacao_V3.csv")

## Rows: 1795 Columns: 11
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (6): Creador_empresa_si_exite, Origen_FRIJOL_BARRA, Empresa_localidad, F...
## dbl (5): Id_d, REF, Revisar, Porcentaje_Cocoa, Popularidad
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

# Open the data in the spreadsheet-style viewer only in an interactive
# session; when the script is rendered or run in batch mode there is no
# viewer to open, so the call is skipped.
if (interactive()) {
  View(flavors_of_cacao_V3)
}

##Reporte de datos

Usamos las siguientes funciones para que nos den un resumen de los datos que estamos usando.

skim_without_charts(flavors_of_cacao_V3) # detailed summary of the data

Data summary
Name	flavors_of_cacao_V3
Number of rows	1795
Number of columns	11
_______________________
Column type frequency:
character	6
numeric	5
________________________
Group variables	None

Variable type: character

skim_variable	n_missing	complete_rate	min	max	n_unique
Creador_empresa_si_exite	0	1.00	2	39	416
Origen_FRIJOL_BARRA	0	1.00	3	45	1039
Empresa_localidad	0	1.00	4	17	60
Frijo_tipo	888	0.51	3	23	39
Haba_origen	74	0.96	4	29	99
Popularidad_Class	0	1.00	5	14	5

Variable type: numeric

skim_variable	complete_rate	mean	sd	p0	p25	p50	p75	p100
Id_d	1	898.00	518.32	1.00	449.50	898.00	1346.50	1795
REF	1	1035.90	552.89	5.00	576.00	1069.00	1502.00	1952
Revisar	1	2012.33	2.93	2006.00	2010.00	2013.00	2015.00	2017
Porcentaje_Cocoa	1	0.72	0.06	0.42	0.70	0.70	0.75	1
Popularidad	1	3.19	0.48	1.00	2.88	3.25	3.50	5

# Column-by-column overview: name, type and first values of each column.
flavors_of_cacao_V3 %>%
  glimpse()

## Rows: 1,795
## Columns: 11
## $ Id_d                     <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14…
## $ Creador_empresa_si_exite <chr> "A. Morin", "A. Morin", "A. Morin", "A. Morin…
## $ Origen_FRIJOL_BARRA      <chr> "Agua Grande", "Kpime", "Atsane", "Akata", "Q…
## $ REF                      <dbl> 1876, 1676, 1676, 1680, 1704, 1315, 1315, 131…
## $ Revisar                  <dbl> 2016, 2015, 2015, 2015, 2015, 2014, 2014, 201…
## $ Porcentaje_Cocoa         <dbl> 0.63, 0.70, 0.70, 0.70, 0.70, 0.70, 0.70, 0.7…
## $ Empresa_localidad        <chr> "France", "France", "France", "France", "Fran…
## $ Popularidad              <dbl> 3.75, 2.75, 3.00, 3.50, 3.50, 2.75, 3.50, 3.5…
## $ Frijo_tipo               <chr> NA, NA, NA, NA, NA, "Criollo", NA, "Criollo",…
## $ Haba_origen              <chr> "Sao Tome", "Togo", "Togo", "Togo", "Peru", "…
## $ Popularidad_Class        <chr> "Sastisfactorio", "Decepcionante", "Sastisfac…

# Print the first rows of the data set.
flavors_of_cacao_V3 %>%
  head()

## # A tibble: 6 × 11
##    Id_d Creador_…¹ Orige…²   REF Revisar Porce…³ Empre…⁴ Popul…⁵ Frijo…⁶ Haba_…⁷
##   <dbl> <chr>      <chr>   <dbl>   <dbl>   <dbl> <chr>     <dbl> <chr>   <chr>  
## 1     1 A. Morin   Agua G…  1876    2016    0.63 France     3.75 <NA>    Sao To…
## 2     2 A. Morin   Kpime    1676    2015    0.7  France     2.75 <NA>    Togo   
## 3     3 A. Morin   Atsane   1676    2015    0.7  France     3    <NA>    Togo   
## 4     4 A. Morin   Akata    1680    2015    0.7  France     3.5  <NA>    Togo   
## 5     5 A. Morin   Quilla   1704    2015    0.7  France     3.5  <NA>    Peru   
## 6     6 A. Morin   Carene…  1315    2014    0.7  France     2.75 Criollo Venezu…
## # … with 1 more variable: Popularidad_Class <chr>, and abbreviated variable
## #   names ¹Creador_empresa_si_exite, ²Origen_FRIJOL_BARRA, ³Porcentaje_Cocoa,
## #   ⁴Empresa_localidad, ⁵Popularidad, ⁶Frijo_tipo, ⁷Haba_origen

## Gráficas. En el diagrama de dispersión vemos la popularidad, desde Desagradable hasta Elite, sus niveles y cómo se comportan.

# Scatter plot of Porcentaje_Cocoa (x) against Popularidad (y), with points
# coloured by Popularidad_Class.
# `mapping` is passed as a plain named argument: wrapping it in an extra pair
# of parentheses would make `mapping = aes(...)` an assignment that creates a
# global `mapping` object and reaches geom_point() only positionally.
ggplot(data = flavors_of_cacao_V3) +
  geom_point(
    mapping = aes(
      x = Porcentaje_Cocoa,
      y = Popularidad,
      color = Popularidad_Class
    )
  ) +
  labs(
    title = "Porcentaje de cocoa y popularidad por clase",
    caption = "@RACHAEL TATMAN conjunto de datos Kaggle"
  )

Ahora vemos que el porcentaje de cocoa con mayor número de conteos se encuentra en el nivel satisfactorio.

# Bar chart counting rows per Porcentaje_Cocoa value, with each bar split by
# Popularidad_Class through the fill colour.
# `mapping` is passed as a plain named argument: wrapping it in an extra pair
# of parentheses would make `mapping = aes(...)` an assignment that creates a
# global `mapping` object and reaches geom_bar() only positionally.
ggplot(data = flavors_of_cacao_V3) +
  geom_bar(
    mapping = aes(
      x = Porcentaje_Cocoa,
      fill = Popularidad_Class
    )
  ) +
  labs(
    title = "Porcentaje de cocoa y conteo color por popularidad Clase",
    caption = "@RACHAEL TATMAN conjunto de datos Kaggle"
  )

Gráfico de porcentaje de cocoa vs. popularidad: con un porcentaje de cocoa de 0.7, la popularidad es la más alta.

# Smoothed trend of Popularidad (y) as a function of Porcentaje_Cocoa (x).
# `mapping` is passed as a plain named argument: wrapping it in an extra pair
# of parentheses would make `mapping = aes(...)` an assignment that creates a
# global `mapping` object and reaches geom_smooth() only positionally.
ggplot(data = flavors_of_cacao) +
  geom_smooth(
    mapping = aes(
      x = Porcentaje_Cocoa,
      y = Popularidad
    )
  ) +
  labs(
    title = "Cocoa vs popularidad",
    caption = "@RACHAEL TATMAN conjunto de datos Kaggle"
  )

## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'

Popularidad y sus reseñas a lo largo del tiempo, en un gráfico por año.

# Bar chart counting rows per Popularidad value, one panel per Revisar value.
# Revisar is numeric, so mapping it directly to `fill` produces a continuous
# colour bar; factor() gives each value its own discrete legend key. The
# legend title is set explicitly so it still reads "Revisar".
ggplot(data = flavors_of_cacao) +
  geom_bar(mapping = aes(x = Popularidad, fill = factor(Revisar))) +
  facet_wrap(~Revisar) +
  labs(
    title = "Popularidad y reseñas",
    caption = "@RACHAEL TATMAN conjunto de datos Kaggle",
    fill = "Revisar"
  )

Año de revisión (Revisar) y el aumento del valor REF.

# Smoothed trend of REF (y) against Revisar (x).
ggplot(flavors_of_cacao, aes(x = Revisar, y = REF)) +
  geom_smooth() +
  labs(
    caption = "@RACHAEL TATMAN conjunto de datos Kaggle",
    title = "Revisars vs REF"
  )

## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'

Popularidad y porcentaje de Cocoa por localidad

# Jittered scatter of Popularidad (x) against Porcentaje_Cocoa (y), coloured
# by Empresa_localidad, plus a single overall linear trend line.
# The trend is fitted once on all rows: fitting one smoother per
# Empresa_localidad group leaves groups with too few points, so the loess
# fits emit singularity warnings and `stat_smooth()` fails.
ggplot(
  data = flavors_of_cacao,
  mapping = aes(x = Popularidad, y = Porcentaje_Cocoa)
) +
  # Colour is mapped only on the points so the trend line is not split
  # into one group per country.
  geom_jitter(mapping = aes(color = Empresa_localidad)) +
  geom_smooth(method = "lm", formula = y ~ x, color = "black") +
  labs(
    title = "Popularidad y porcentaje de cocoa por Localidad",
    caption = "@RACHAEL TATMAN conjunto de datos Kaggle"
  )

## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'

## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : span too small. fewer data values than degrees of freedom.

## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : pseudoinverse used at 3.2475

## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : neighborhood radius 0.2525

## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : reciprocal condition number 0

## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : There are other near singularities as well. 0.063756

## Warning in predLoess(object$y, object$x, newx = if
## (is.null(newdata)) object$x else if (is.data.frame(newdata))
## as.matrix(model.frame(delete.response(terms(object)), : span too small. fewer
## data values than degrees of freedom.

## Warning in predLoess(object$y, object$x, newx = if
## (is.null(newdata)) object$x else if (is.data.frame(newdata))
## as.matrix(model.frame(delete.response(terms(object)), : pseudoinverse used at
## 3.2475

## Warning in predLoess(object$y, object$x, newx = if
## (is.null(newdata)) object$x else if (is.data.frame(newdata))
## as.matrix(model.frame(delete.response(terms(object)), : neighborhood radius
## 0.2525

## Warning in predLoess(object$y, object$x, newx = if
## (is.null(newdata)) object$x else if (is.data.frame(newdata))
## as.matrix(model.frame(delete.response(terms(object)), : reciprocal condition
## number 0

## Warning in predLoess(object$y, object$x, newx = if
## (is.null(newdata)) object$x else if (is.data.frame(newdata))
## as.matrix(model.frame(delete.response(terms(object)), : There are other near
## singularities as well. 0.063756

## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : pseudoinverse used at 2.745

## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : neighborhood radius 0.755

## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : reciprocal condition number 1.4316e-16

## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : There are other near singularities as well. 0.25502

## Warning in predLoess(object$y, object$x, newx = if
## (is.null(newdata)) object$x else if (is.data.frame(newdata))
## as.matrix(model.frame(delete.response(terms(object)), : pseudoinverse used at
## 2.745

## Warning in predLoess(object$y, object$x, newx = if
## (is.null(newdata)) object$x else if (is.data.frame(newdata))
## as.matrix(model.frame(delete.response(terms(object)), : neighborhood radius
## 0.755

## Warning in predLoess(object$y, object$x, newx = if
## (is.null(newdata)) object$x else if (is.data.frame(newdata))
## as.matrix(model.frame(delete.response(terms(object)), : reciprocal condition
## number 1.4316e-16

## Warning in predLoess(object$y, object$x, newx = if
## (is.null(newdata)) object$x else if (is.data.frame(newdata))
## as.matrix(model.frame(delete.response(terms(object)), : There are other near
## singularities as well. 0.25502

## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : pseudoinverse used at 3.25

## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : neighborhood radius 0.25

## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : reciprocal condition number 0

## Warning in predLoess(object$y, object$x, newx = if
## (is.null(newdata)) object$x else if (is.data.frame(newdata))
## as.matrix(model.frame(delete.response(terms(object)), : pseudoinverse used at
## 3.25

## Warning in predLoess(object$y, object$x, newx = if
## (is.null(newdata)) object$x else if (is.data.frame(newdata))
## as.matrix(model.frame(delete.response(terms(object)), : neighborhood radius 0.25

## Warning in predLoess(object$y, object$x, newx = if
## (is.null(newdata)) object$x else if (is.data.frame(newdata))
## as.matrix(model.frame(delete.response(terms(object)), : reciprocal condition
## number 0

## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : span too small. fewer data values than degrees of freedom.

## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : at 2.745

## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : radius 2.5e-05

## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : all data on boundary of neighborhood. make span bigger

## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : pseudoinverse used at 2.745

## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : neighborhood radius 0.005

## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : reciprocal condition number 1

## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : at 3.755

## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : radius 2.5e-05

## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : all data on boundary of neighborhood. make span bigger

## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : There are other near singularities as well. 2.5e-05

## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : zero-width neighborhood. make span bigger

## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : zero-width neighborhood. make span bigger

## Warning: Computation failed in `stat_smooth()`
## Caused by error in `predLoess()`:
## ! NA/NaN/Inf en llamada a una función externa (arg 5)

Clase de popularidad y conteo: número de datos que hay por clase.

# Bar chart counting the rows in each Popularidad_Class, one fill colour
# per class.
ggplot(
  flavors_of_cacao_V3,
  aes(x = Popularidad_Class, fill = Popularidad_Class)
) +
  geom_bar() +
  labs(
    caption = "@RACHAEL TATMAN conjunto de datos Kaggle",
    title = "Clase y conteo"
  )

##Estadísticas

Teniendo los datos de los chocolates, en cuestión de estadísticas, las columnas de porcentaje de cocoa y popularidad no tienen relación alguna; podemos verlo en las siguientes estadísticas y gráficas.

# Per Popularidad_Class: mean and standard deviation of Popularidad and
# Porcentaje_Cocoa, plus the correlation between the two columns.
# Every summary column gets an explicit name so the result is easy to read
# and reference. The correlation is reported as NA, without a warning, when
# either column has zero (or undefined) spread inside a class, because cor()
# is not defined there.
flavors_of_cacao_V3 %>%
  group_by(Popularidad_Class) %>%
  summarise(
    media_popularidad = mean(Popularidad),
    sd_cocoa = sd(Porcentaje_Cocoa),
    media_cocoa = mean(Porcentaje_Cocoa),
    sd_popularidad = sd(Popularidad),
    # isTRUE() also covers sd() returning NA for single-row classes.
    cor_popularidad_cocoa = if (
      isTRUE(sd(Popularidad) > 0) && isTRUE(sd(Porcentaje_Cocoa) > 0)
    ) {
      cor(Popularidad, Porcentaje_Cocoa)
    } else {
      NA_real_
    }
  )

## Warning in cor(Popularidad, Porcentaje_Cocoa): the standard deviation is zero

## Warning in cor(Popularidad, Porcentaje_Cocoa): the standard deviation is zero

## # A tibble: 5 × 6
##   Popularidad_Class `mean(Popularidad)` sd(Porcentaje_…¹ mean(…² sd(Po…³ cor(P…⁴
##   <chr>                           <dbl>            <dbl>   <dbl>   <dbl>   <dbl>
## 1 Decepcionante                    2.60           0.0777   0.725   0.217 -0.124 
## 2 Desagradable                     1.43           0.149    0.843   0.262  0.271 
## 3 Elite                            5              0        0.7     0     NA     
## 4 Premiun                          4              0.0388   0.708   0     NA     
## 5 Sastisfactorio                   3.34           0.0549   0.713   0.265 -0.0585
## # … with abbreviated variable names ¹`sd(Porcentaje_Cocoa)`,
## #   ²`mean(Porcentaje_Cocoa)`, ³`sd(Popularidad)`,
## #   ⁴`cor(Popularidad, Porcentaje_Cocoa)`

Gráfica de las estadísticas anteriores.

# Scatter of Porcentaje_Cocoa (x) against Popularidad (y) with a fitted
# linear-model line drawn without its confidence band.
ggplot(
  data = flavors_of_cacao_V3,
  mapping = aes(x = Porcentaje_Cocoa, y = Popularidad)
) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE)

## `geom_smooth()` using formula = 'y ~ x'

## Conclusión. En los datos de chocolates, los países con mayor porcentaje y popularidad son U.S.A., Venezuela, U.K. y Spain y, en azul y en menor medida, Nicaragua. Con el tiempo han aumentado las reseñas de los chocolates y la actualización del conjunto de datos. También tenemos la popularidad y su relación con las reseñas por año, que de igual manera han ido aumentando. Los chocolates que tienen un porcentaje de cocoa mayor al 70 % y menor al 80 % tienen una popularidad mayor a 3; esto quiere decir que el porcentaje de cocoa es bueno, pero falta más producción o más reseñas. Podemos decir que el conjunto de datos está bien, pero faltan datos más cuantitativos para un análisis más completo.

Chcolates_PROYECTO

Juan Mario

2023-01-28