#Bases
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats 1.0.0 ✔ readr 2.1.5
## ✔ lubridate 1.9.3 ✔ tibble 3.2.1
## ✔ purrr 1.0.2 ✔ tidyr 1.3.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
## Warning: package 'treemapify' was built under R version 4.3.3
#Data Cleaning
VentasUS = read.xlsx("SP_Endeavor_Base1 Papas_VF_CLEAN.xlsx")
VentasMex = read.xlsx("SP_Endeavor_Base2 Papas_VF_CLEAN.xlsx")
WageMex = read.xlsx("WagesMex.xlsx")
WageUS = read.xlsx("WagesUS.xlsx")
OrdMonth = c("January", "February", "March", "April", "May", "June", "July", "August", "September", "October", "November", "December")
OrdMonthEs = c("Enero", "Febrero", "Marzo", "Abril", "Mayo", "Junio", "Julio", "Agosto", "Septiembre", "Octubre", "Noviembre", "Diciembre")## Month Region CrujiCos PicanCos SaladoCos BBQCos SalsaCos
## FALSE FALSE TRUE FALSE TRUE TRUE FALSE
Media_CC= mean(VentasMex$CrujiCos,na.rm=T)
VentasMex$CrujiCos[is.na(VentasMex$CrujiCos)] = Media_CC
Media_SC= mean(VentasMex$SaladoCos,na.rm=T)
VentasMex$SaladoCos[is.na(VentasMex$SaladoCos)] = Media_SC
Media_BC= mean(VentasMex$BBQCos,na.rm=T)
VentasMex$BBQCos[is.na(VentasMex$BBQCos)] = Media_BC#The product that sells the most
VentasMex$RM = paste(VentasMex$Month, VentasMex$Region)
meses_en_df = VentasMex$meses
numeros_meses = match(meses_en_df, VentasMex$Month)
borrar= c("Region", "Month")
#Mex
vm_transpose= VentasMex[ , !(names(VentasMex) %in% borrar)]
dM_transpose = data.frame(t(vm_transpose[-6]))
colnames(dM_transpose) = vm_transpose[, 6]
dM_transpose$Sumative= rowSums((dM_transpose[, 1:36]))
dM_transpose$Sabores =rownames(dM_transpose)
#US
VentasUS$RM =paste(VentasUS$Month, VentasUS$Region)
vu_transpose= VentasUS[ , !(names(VentasUS) %in% borrar)]
dU_transpose = data.frame(t(vu_transpose[-6]))
colnames(dU_transpose) = vu_transpose[, 6]
dU_transpose$Sumative= rowSums((dU_transpose[, 1:36]))
dU_transpose$Sabores =rownames(dU_transpose)##US
ggplot2::ggplot(data = dU_transpose, mapping = aes(x= Sabores,y = Sumative))+
geom_col(color ="blue", fill ="red") + theme_classic() +
labs(title = "US Sales by Product",
x = "Products",
y = "Sales")
SpicyCo is the best-selling flavor of chips in the US ##MEX
ggplot2::ggplot(data = dM_transpose, mapping = aes(x= Sabores,y = Sumative))+
geom_col(color ="red", fill ="green") + theme_classic() +
labs(title = "Mex Sales by Product",
x = "Products",
y = "Sales")
PicanCos is the best-selling flavor of chips in the US
#the month with more sells
VentasMexTotales= VentasMex
VentasMexTotales$MesesNum = match(VentasMex$Month,OrdMonthEs)
VentasMexTotales$VentasTotal = VentasMex$CrujiCos + VentasMex$PicanCos + VentasMex$SaladoCos + VentasMex$BBQCos + VentasMex$SalsaCos
ventas_por_mesMex = VentasMexTotales %>%
group_by(MesesNum) %>%
summarise(Ventas = sum(VentasTotal))
VentasUSTotales= VentasUS
VentasUSTotales$MesesNum = match(VentasUS$Month,OrdMonth)
VentasUSTotales$VentasTotal = VentasUS$CrunchyCo + VentasUS$SpicyCo + VentasUS$SaltCo + VentasUS$BBQCo + VentasUS$SalsaCo
ventas_por_mesUS = VentasUSTotales %>%
group_by(MesesNum) %>%
summarise(Ventas = sum(VentasTotal))
ventas_por_mesUS$Variable = "US"
ventas_por_mesMex$Variable = "MEX"
datos_combinados = rbind(ventas_por_mesUS, ventas_por_mesMex)##US
ggplot2::ggplot(data = ventas_por_mesUS, mapping = aes(x= MesesNum,y = Ventas))+
geom_line(color ="blue") + geom_point(color ="red") + theme_classic()+ scale_x_continuous(limits = c(1, 12), breaks = seq(1, 12, by = 1)) + labs(title = "US Sales by Month",
x = "Months",
y = "Sales")
July is when most chips are sale in the US
##MEX
ggplot2::ggplot(data = ventas_por_mesMex, mapping = aes(x= MesesNum,y = Ventas))+
geom_line(color ="green") + geom_point(color ="red") + theme_classic() + scale_x_continuous(limits = c(1, 12), breaks = seq(1, 12, by = 1)) + labs(title = "Mexico Sales by Month",
x = "Months",
y = "Sales")
August is when most chips are sale in Mexico
#The months where each products sells the most
ventas_producto_mesUS = VentasUSTotales %>%
group_by(MesesNum) %>%
reframe(Venta = c(sum(CrunchyCo), sum(SaltCo), sum(BBQCo), sum(SalsaCo), sum(SpicyCo)))
num_filas = (12*5)
Marca = c("CrunchyCo", "SaltCo", "BBQCo","SalsaCo", "SpicyCo")
ventas_producto_mesUS$Sabores = rep(Marca, length.out = num_filas)
ventas_producto_mesMEX = VentasMexTotales %>%
group_by(MesesNum) %>%
reframe(Venta =c(sum(CrujiCos),sum(SaladoCos), sum(BBQCos), sum(SalsaCos), sum(PicanCos)))
Marca = c("CrujiCos", "SaladoCos", "BBQCos","SalsaCos", "PicanCos")
ventas_producto_mesMEX$Sabores = rep(Marca, length.out = num_filas)ggplot(ventas_producto_mesUS, aes(x = factor(MesesNum), y = Venta, fill = Sabores)) +
geom_bar(stat = "identity") +
facet_wrap(~Sabores) +
labs(
title = "Mensual Sales by Flavor",
x = "Month",
y = "Sales"
)
In the previous graph you can see the monthly sales of each flavor of
potatoes. Highlighting January as the month that SpicyCo sells the most
and December as CrunchyCo ##MEX
ggplot(ventas_producto_mesMEX, aes(x = factor(MesesNum), y = Venta, fill = Sabores)) +
geom_bar(stat = "identity") +
facet_wrap(~Sabores) +
labs(
title = "Mensual Sales by Flavor",
x = "Month",
y = "Sales")
In the previous graph you can see the monthly sales of each flavor of
potatoes. Highlighting August as the month that PicanCos sells the most.
#Patterns in wages
WageUS_R = WageUS %>%
group_by(Wage) %>%
slice(rep(1:n(), Workers))
WUpromedio = mean(WageUS_R$Wage)
WUsd = sd(WageUS_R$Wage)
WUA1sd = WUpromedio + WUsd
WUS1sd = WUpromedio - WUsd
WUA2sd = WUpromedio + (2*WUsd)
WUS2sd = WUpromedio - (2*WUsd)
WUA3sd = WUpromedio + (3*WUsd)
WUS3sd = WUpromedio - (3*WUsd)
WageMEX_R = WageMex %>%
group_by(Wage) %>%
slice(rep(1:n(), Workers))
WMpromedio = mean(WageMEX_R$Wage)
WMsd = sd(WageMEX_R$Wage)
WMA1sd = WMpromedio + WMsd
WMS1sd = WMpromedio - WMsd
WMA2sd = WMpromedio + (2*WMsd)
WMS2sd = WMpromedio - (2*WMsd)
WMA3sd = WMpromedio + (3*WMsd)
WMS3sd = WMpromedio - (3*WMsd)##US
ggplot2::ggplot(WageUS_R, aes(x = Wage)) +
geom_histogram(binwidth = 500, fill = "blue", color = "yellow") +
geom_vline(xintercept = WUpromedio, linetype = "dashed", color = "red", size = 1) +
geom_vline(xintercept = WUA1sd, linetype = "dashed", color = "red", size = 1) +
geom_vline(xintercept = WUS1sd, linetype = "dashed", color = "red", size = 1)+ geom_vline(xintercept = WUA2sd, linetype = "dashed", color = "red", size = 1) +
geom_vline(xintercept = WUS2sd, linetype = "dashed", color = "red", size = 1)+
theme_classic()+
labs(title = "Histogram US WAGE",
x = "Wage",
y = "Frequency")## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
There is a normal distribution among the Wages of the United States.
##MEX
ggplot2::ggplot(WageMEX_R, aes(x = Wage)) +
geom_histogram(binwidth = 500, fill = "green", color = "yellow") +
geom_vline(xintercept = WMpromedio, linetype = "dashed", color = "red", size = 1) +
geom_vline(xintercept = WMA1sd, linetype = "dashed", color = "red", size = 1) +
geom_vline(xintercept = WMS1sd, linetype = "dashed", color = "red", size = 1)+ geom_vline(xintercept = WMA2sd, linetype = "dashed", color = "red", size = 1) +
geom_vline(xintercept = WMS2sd, linetype = "dashed", color = "red", size = 1)+ theme_classic()+
labs(title = "Histogram MEX WAGE",
x = "Wage",
y = "Frequency")Mexican taxes look like a normal distribution. Even so, there are inconsistencies that call into question the veracity of the data.
Note: In the “Wages Mex” document, Mexican salaries were highly higher than those in the United States, we assume that this is an error. Since there is no sense in earning more in an emerging market that has not been present for more than a year. The error can be due to different causes such as not specifying the time period of the salaries (weekly, biweekly or monthly) or an error in the currency exchange rate. Verification and correction of the database is necessary to have the most accurate information. The graphs above were made from the database without changes.
#Patterns in products
ventas_producto_mesUS = VentasUSTotales %>%
group_by(MesesNum) %>%
reframe(Venta = c(sum(CrunchyCo), sum(SaltCo), sum(BBQCo), sum(SalsaCo), sum(SpicyCo)))
num_filas = (12*5)
Marca = c("CrunchyCo", "SaltCo", "BBQCo","SalsaCo", "SpicyCo")
ventas_producto_mesUS$Sabores = rep(Marca, length.out = num_filas)
ventas_producto_mesMEX = VentasMexTotales %>%
group_by(MesesNum) %>%
reframe(Venta =c(sum(CrujiCos),sum(SaladoCos), sum(BBQCos), sum(SalsaCos), sum(PicanCos)))
Marca = c("CrujiCos", "SaladoCos", "BBQCos","SalsaCos", "PicanCos")
ventas_producto_mesMEX$Sabores = rep(Marca, length.out = num_filas)ggplot2::ggplot(data = ventas_producto_mesUS, mapping = aes(x= MesesNum,y = Venta, color = Sabores))+
geom_line() + theme_classic() + scale_x_continuous(limits = c(1, 12), breaks = seq(1, 12, by = 1))
Most flavors in the United States reach a peak in sales in the middle of
the year, after which they usually decrease, to increase sales in
December.
##MEX
ggplot2::ggplot(data = ventas_producto_mesMEX, mapping = aes(x= MesesNum,y = Venta, color = Sabores))+
geom_line() + theme_classic() + scale_x_continuous(limits = c(1, 12), breaks = seq(1, 12, by = 1))
Sales in Mexico start the year with their maximum in sales, after that
they usually drop considerably.
#Information about the regions
ventas_regionUS = VentasUSTotales %>%
group_by(Region) %>%
summarise(Venta = sum(VentasTotal))
ventas_regionMEX = VentasMexTotales %>%
group_by(Region) %>%
summarise(Venta = sum(VentasTotal))##US
ggplot2::ggplot(data = ventas_regionUS, mapping = aes(area=Venta, fill= Region))+
geom_treemap() + theme_classic() +
labs(title = "US Sales by Region",
x = "Region",
y = "Sales")
The 3 regions usually have similar sales numbers. ##MEX
ggplot2::ggplot(data = ventas_regionMEX, mapping = aes(area=Venta, fill= Region))+
geom_treemap() + theme_classic() +
labs(title = "MEX Sales by Region",
x = "Region",
y = "Sales")
The 3 regions usually have similar sales numbers.
#Explain in a small paragraph the steps you followed to clean the database To clean the data, it was necessary to delete unnecessary information, fill in blank spaces according to your needs, correct spelling and separate the information into different Excels. From that, the sales presented with some blank spaces that were filled with the average sales according to their flavor.
#Explain in 3 paragraphs the information about sales and wages that you find
Despite the significant difference in sales volume between the American and Mexican markets, they behave similarly on a different scale. One of the similarities found is the popularity of flavors, with spicy products standing out as favorites among the public.
Month by month, sales between the two markets differ; in the United States, the middle of the year stands out as the period with the highest sales, while in Mexico, the beginning of the year sees the highest sales.
Regarding wages, the United States shows a normal distribution of salary and number of workers, indicating greater budget control in salaries. Meanwhile, Mexico’s distribution appears to resemble a normal distribution but has certain inconsistencies that cast doubt on the accuracy of the data, as the Mexican average considerably exceeds the American average, despite being an emerging market. Therefore, verification of the information is necessary, and if necessary, correction as well.
#Compare the mexican sales with the american ones
##MEX AND US
ggplot2::ggplot(data = datos_combinados, mapping = aes(x= MesesNum,y = Ventas, color = Variable))+
geom_line() + theme_classic() + scale_x_continuous(limits = c(1, 12), breaks = seq(1, 12, by = 1))The US market, being older, surpasses the Mexican market by a large amount. Even so, sales trends are similar in that the Spicy product is the most popular among consumers.