Evidencia

#Bases

library(ggplot2)
library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(readxl)
library(stringr)
library(openxlsx)
library(tidyverse)

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats   1.0.0     ✔ readr     2.1.5
## ✔ lubridate 1.9.3     ✔ tibble    3.2.1
## ✔ purrr     1.0.2     ✔ tidyr     1.3.1

## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(treemapify)

## Warning: package 'treemapify' was built under R version 4.3.3

#Data Cleaning

VentasUS = read.xlsx("SP_Endeavor_Base1 Papas_VF_CLEAN.xlsx")
VentasMex = read.xlsx("SP_Endeavor_Base2 Papas_VF_CLEAN.xlsx")
WageMex = read.xlsx("WagesMex.xlsx")
WageUS = read.xlsx("WagesUS.xlsx")
OrdMonth = c("January", "February", "March", "April", "May", "June", "July", "August", "September", "October", "November", "December")
OrdMonthEs = c("Enero", "Febrero", "Marzo", "Abril", "Mayo", "Junio", "Julio", "Agosto", "Septiembre", "Octubre", "Noviembre", "Diciembre")

apply(is.na(VentasMex), 2, sum)>=1

##     Month    Region  CrujiCos  PicanCos SaladoCos    BBQCos  SalsaCos 
##     FALSE     FALSE      TRUE     FALSE      TRUE      TRUE     FALSE

Media_CC= mean(VentasMex$CrujiCos,na.rm=T)
VentasMex$CrujiCos[is.na(VentasMex$CrujiCos)] = Media_CC

Media_SC= mean(VentasMex$SaladoCos,na.rm=T)
VentasMex$SaladoCos[is.na(VentasMex$SaladoCos)] = Media_SC

Media_BC= mean(VentasMex$BBQCos,na.rm=T)
VentasMex$BBQCos[is.na(VentasMex$BBQCos)] = Media_BC

#The product that sells the most

VentasMex$RM = paste(VentasMex$Month, VentasMex$Region)
meses_en_df = VentasMex$meses
numeros_meses = match(meses_en_df, VentasMex$Month)
borrar= c("Region", "Month")
#Mex
vm_transpose= VentasMex[ , !(names(VentasMex) %in% borrar)]
dM_transpose = data.frame(t(vm_transpose[-6]))
colnames(dM_transpose) = vm_transpose[, 6]
dM_transpose$Sumative= rowSums((dM_transpose[, 1:36]))
dM_transpose$Sabores =rownames(dM_transpose)

#US
VentasUS$RM =paste(VentasUS$Month, VentasUS$Region)
vu_transpose= VentasUS[ , !(names(VentasUS) %in% borrar)]
dU_transpose = data.frame(t(vu_transpose[-6]))
colnames(dU_transpose) = vu_transpose[, 6]
dU_transpose$Sumative= rowSums((dU_transpose[, 1:36]))
dU_transpose$Sabores =rownames(dU_transpose)

##US

ggplot2::ggplot(data = dU_transpose, mapping = aes(x= Sabores,y = Sumative))+
geom_col(color ="blue", fill ="red") +  theme_classic() + 
  labs(title = "US Sales by Product",
       x = "Products",
       y = "Sales")

SpicyCo is the best-selling flavor of chips in the US ##MEX

ggplot2::ggplot(data = dM_transpose, mapping = aes(x= Sabores,y = Sumative))+
geom_col(color ="red", fill ="green") +  theme_classic() +
  labs(title = "Mex Sales by Product",
       x = "Products",
       y = "Sales")

PicanCos is the best-selling flavor of chips in the US

#the month with more sells

VentasMexTotales= VentasMex

VentasMexTotales$MesesNum = match(VentasMex$Month,OrdMonthEs)

VentasMexTotales$VentasTotal = VentasMex$CrujiCos + VentasMex$PicanCos + VentasMex$SaladoCos + VentasMex$BBQCos + VentasMex$SalsaCos

ventas_por_mesMex = VentasMexTotales %>%
  group_by(MesesNum) %>%
  summarise(Ventas = sum(VentasTotal))

VentasUSTotales= VentasUS

VentasUSTotales$MesesNum = match(VentasUS$Month,OrdMonth)

  
VentasUSTotales$VentasTotal = VentasUS$CrunchyCo + VentasUS$SpicyCo + VentasUS$SaltCo + VentasUS$BBQCo + VentasUS$SalsaCo

ventas_por_mesUS = VentasUSTotales %>%
  group_by(MesesNum) %>%
  summarise(Ventas = sum(VentasTotal))

ventas_por_mesUS$Variable = "US"
ventas_por_mesMex$Variable = "MEX"

datos_combinados = rbind(ventas_por_mesUS, ventas_por_mesMex)

##US

ggplot2::ggplot(data = ventas_por_mesUS, mapping = aes(x= MesesNum,y = Ventas))+
geom_line(color ="blue") + geom_point(color ="red") +  theme_classic()+ scale_x_continuous(limits = c(1, 12), breaks = seq(1, 12, by = 1)) + labs(title = "US Sales by Month",
       x = "Months",
       y = "Sales")

July is when most chips are sale in the US

##MEX

ggplot2::ggplot(data = ventas_por_mesMex, mapping = aes(x= MesesNum,y = Ventas))+
geom_line(color ="green") + geom_point(color ="red") + theme_classic() + scale_x_continuous(limits = c(1, 12), breaks = seq(1, 12, by = 1)) + labs(title = "Mexico Sales by Month",
       x = "Months",
       y = "Sales")

August is when most chips are sale in Mexico

#The months where each products sells the most

ventas_producto_mesUS = VentasUSTotales %>%
  group_by(MesesNum) %>%
  reframe(Venta = c(sum(CrunchyCo), sum(SaltCo), sum(BBQCo), sum(SalsaCo), sum(SpicyCo)))
num_filas = (12*5)
Marca = c("CrunchyCo", "SaltCo", "BBQCo","SalsaCo", "SpicyCo")
ventas_producto_mesUS$Sabores =  rep(Marca, length.out = num_filas)

ventas_producto_mesMEX = VentasMexTotales %>%
  group_by(MesesNum) %>%
  reframe(Venta =c(sum(CrujiCos),sum(SaladoCos), sum(BBQCos), sum(SalsaCos), sum(PicanCos)))
Marca = c("CrujiCos", "SaladoCos", "BBQCos","SalsaCos", "PicanCos")
ventas_producto_mesMEX$Sabores =  rep(Marca, length.out = num_filas)

0.1 US

ggplot(ventas_producto_mesUS, aes(x = factor(MesesNum), y = Venta, fill = Sabores)) +
  geom_bar(stat = "identity") +
  facet_wrap(~Sabores) +
  labs(
    title = "Mensual Sales by Flavor",
    x = "Month",
    y = "Sales"
  )

In the previous graph you can see the monthly sales of each flavor of potatoes. Highlighting January as the month that SpicyCo sells the most and December as CrunchyCo ##MEX

ggplot(ventas_producto_mesMEX, aes(x = factor(MesesNum), y = Venta, fill = Sabores)) +
  geom_bar(stat = "identity") +
  facet_wrap(~Sabores) +
  labs(
    title = "Mensual Sales by Flavor",
    x = "Month",
    y = "Sales")

In the previous graph you can see the monthly sales of each flavor of potatoes. Highlighting August as the month that PicanCos sells the most. #Patterns in wages

WageUS_R = WageUS %>%
  group_by(Wage) %>%
  slice(rep(1:n(), Workers))

WUpromedio = mean(WageUS_R$Wage)
WUsd = sd(WageUS_R$Wage)
WUA1sd = WUpromedio + WUsd
WUS1sd = WUpromedio - WUsd
WUA2sd = WUpromedio + (2*WUsd)
WUS2sd = WUpromedio - (2*WUsd)
WUA3sd = WUpromedio + (3*WUsd)
WUS3sd = WUpromedio - (3*WUsd)

WageMEX_R = WageMex %>%
  group_by(Wage) %>%
  slice(rep(1:n(), Workers))

WMpromedio = mean(WageMEX_R$Wage)
WMsd = sd(WageMEX_R$Wage)
WMA1sd = WMpromedio + WMsd
WMS1sd = WMpromedio - WMsd
WMA2sd = WMpromedio + (2*WMsd)
WMS2sd = WMpromedio - (2*WMsd)
WMA3sd = WMpromedio + (3*WMsd)
WMS3sd = WMpromedio - (3*WMsd)

##US

ggplot2::ggplot(WageUS_R, aes(x = Wage)) +
  geom_histogram(binwidth = 500, fill = "blue", color = "yellow") +
  geom_vline(xintercept = WUpromedio, linetype = "dashed", color = "red", size = 1) +
  geom_vline(xintercept = WUA1sd, linetype = "dashed", color = "red", size = 1) +
  geom_vline(xintercept = WUS1sd, linetype = "dashed", color = "red", size = 1)+   geom_vline(xintercept = WUA2sd, linetype = "dashed", color = "red", size = 1) +
  geom_vline(xintercept = WUS2sd, linetype = "dashed", color = "red", size = 1)+
  theme_classic()+
  
  labs(title = "Histogram US WAGE",
       x = "Wage",
       y = "Frequency")

## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

There is a normal distribution among the Wages of the United States.

##MEX

ggplot2::ggplot(WageMEX_R, aes(x = Wage)) +
  geom_histogram(binwidth = 500, fill = "green", color = "yellow") +
  geom_vline(xintercept = WMpromedio, linetype = "dashed", color = "red", size = 1) +
  geom_vline(xintercept = WMA1sd, linetype = "dashed", color = "red", size = 1) +
  geom_vline(xintercept = WMS1sd, linetype = "dashed", color = "red", size = 1)+   geom_vline(xintercept = WMA2sd, linetype = "dashed", color = "red", size = 1) +
  geom_vline(xintercept = WMS2sd, linetype = "dashed", color = "red", size = 1)+ theme_classic()+
  
  labs(title = "Histogram MEX WAGE",
       x = "Wage",
       y = "Frequency")

Mexican taxes look like a normal distribution. Even so, there are inconsistencies that call into question the veracity of the data.

Note: In the “Wages Mex” document, Mexican salaries were highly higher than those in the United States, we assume that this is an error. Since there is no sense in earning more in an emerging market that has not been present for more than a year. The error can be due to different causes such as not specifying the time period of the salaries (weekly, biweekly or monthly) or an error in the currency exchange rate. Verification and correction of the database is necessary to have the most accurate information. The graphs above were made from the database without changes.

#Patterns in products

ventas_producto_mesUS = VentasUSTotales %>%
  group_by(MesesNum) %>%
  reframe(Venta = c(sum(CrunchyCo), sum(SaltCo), sum(BBQCo), sum(SalsaCo), sum(SpicyCo)))
num_filas = (12*5)
Marca = c("CrunchyCo", "SaltCo", "BBQCo","SalsaCo", "SpicyCo")
ventas_producto_mesUS$Sabores =  rep(Marca, length.out = num_filas)

ventas_producto_mesMEX = VentasMexTotales %>%
  group_by(MesesNum) %>%
  reframe(Venta =c(sum(CrujiCos),sum(SaladoCos), sum(BBQCos), sum(SalsaCos), sum(PicanCos)))
Marca = c("CrujiCos", "SaladoCos", "BBQCos","SalsaCos", "PicanCos")
ventas_producto_mesMEX$Sabores =  rep(Marca, length.out = num_filas)

0.2 US

ggplot2::ggplot(data = ventas_producto_mesUS, mapping = aes(x= MesesNum,y = Venta, color = Sabores))+
geom_line() +  theme_classic() + scale_x_continuous(limits = c(1, 12), breaks = seq(1, 12, by = 1))

Most flavors in the United States reach a peak in sales in the middle of the year, after which they usually decrease, to increase sales in December.

##MEX

ggplot2::ggplot(data = ventas_producto_mesMEX, mapping = aes(x= MesesNum,y = Venta, color = Sabores))+
geom_line() +  theme_classic() + scale_x_continuous(limits = c(1, 12), breaks = seq(1, 12, by = 1))

Sales in Mexico start the year with their maximum in sales, after that they usually drop considerably.

#Information about the regions

ventas_regionUS = VentasUSTotales %>%
  group_by(Region) %>%
  summarise(Venta = sum(VentasTotal))
ventas_regionMEX = VentasMexTotales %>%
  group_by(Region) %>%
  summarise(Venta = sum(VentasTotal))

##US

ggplot2::ggplot(data = ventas_regionUS, mapping = aes(area=Venta, fill= Region))+
geom_treemap() +  theme_classic() + 
  labs(title = "US Sales by Region",
       x = "Region",
       y = "Sales")

The 3 regions usually have similar sales numbers. ##MEX

ggplot2::ggplot(data = ventas_regionMEX, mapping = aes(area=Venta, fill= Region))+
geom_treemap() +  theme_classic() + 
  labs(title = "MEX Sales by Region",
       x = "Region",
       y = "Sales")

The 3 regions usually have similar sales numbers.

#Explain in a small paragraph the steps you followed to clean the database To clean the data, it was necessary to delete unnecessary information, fill in blank spaces according to your needs, correct spelling and separate the information into different Excels. From that, the sales presented with some blank spaces that were filled with the average sales according to their flavor.

#Explain in 3 paragraphs the information about sales and wages that you find

Despite the significant difference in sales volume between the American and Mexican markets, they behave similarly on a different scale. One of the similarities found is the popularity of flavors, with spicy products standing out as favorites among the public.

Month by month, sales between the two markets differ; in the United States, the middle of the year stands out as the period with the highest sales, while in Mexico, the beginning of the year sees the highest sales.

Regarding wages, the United States shows a normal distribution of salary and number of workers, indicating greater budget control in salaries. Meanwhile, Mexico’s distribution appears to resemble a normal distribution but has certain inconsistencies that cast doubt on the accuracy of the data, as the Mexican average considerably exceeds the American average, despite being an emerging market. Therefore, verification of the information is necessary, and if necessary, correction as well.

#Compare the mexican sales with the american ones

##MEX AND US

ggplot2::ggplot(data = datos_combinados, mapping = aes(x= MesesNum,y = Ventas, color = Variable))+
geom_line() +  theme_classic() + scale_x_continuous(limits = c(1, 12), breaks = seq(1, 12, by = 1))

The US market, being older, surpasses the Mexican market by a large amount. Even so, sales trends are similar in that the Spicy product is the most popular among consumers.

Evidencia

Osvaldo López Treviño A01285699

2024-04-19

0.1 US

0.2 US