# ---- Setup and data import ----
# Attach the packages used by the import and plotting code below.
library(readr)
library(ggplot2)

# Folder that holds the data file. The path is built explicitly instead of
# calling setwd(), so running this script no longer changes the working
# directory of the R session.
data_dir <- path.expand("~/R Training")

# One spelling of the file name is used for every read. The script previously
# mixed "Chocolate Sales.csv" and "chocolate sales.csv", which only name the
# same file on a case-insensitive file system.
# NOTE(review): the spelling is taken from the first import call; confirm it
# matches the file name on disk.
data_file <- file.path(data_dir, "Chocolate Sales.csv")
if (!file.exists(data_file)) {
  stop("Data file not found: ", data_file, call. = FALSE)
}

# Tibble copy of the data, read once. It keeps the original headers
# (e.g. `Sales Person`) and is only used to preview the raw import.
Chocolate_Sales <- read_csv(data_file)
## Rows: 1094 Columns: 6
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (5): Sales Person, Country, Product, Date, Amount
## dbl (1): Boxes Shipped
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

# View() opens a data viewer window, so only call it when a user is present.
if (interactive()) {
  View(Chocolate_Sales)
}

# Base data.frame copy used by the rest of the analysis. read.csv() turns the
# headers into syntactic names (Sales.Person, Boxes.Shipped).
chocolate <- read.csv(data_file, stringsAsFactors = FALSE)

# Preview the first rows of the tibble.
head(Chocolate_Sales)
## # A tibble: 6 × 6
## `Sales Person` Country Product Date Amount `Boxes Shipped`
## <chr> <chr> <chr> <chr> <chr> <dbl>
## 1 Jehu Rudeforth UK Mint Chip Choco 04-Jan-22 $5,320 180
## 2 Van Tuxwell India 85% Dark Bars 01-Aug-22 $7,896 94
## 3 Gigi Bohling India Peanut Butter Cubes 07-Jul-22 $4,501 91
## 4 Jan Morforth Australia Peanut Butter Cubes 27-Apr-22 $12,726 342
## 5 Jehu Rudeforth UK Peanut Butter Cubes 24-Feb-22 $13,685 184
## 6 Van Tuxwell India Smooth Sliky Salty 06-Jun-22 $5,376 38
# Column names of the data frame created by read.csv().
names(chocolate)
## [1] "Sales.Person" "Country" "Product" "Date"
## [5] "Amount" "Boxes.Shipped"

# Convert the currency text in Amount (e.g. "$5,320") to numbers:
# drop the dollar sign, then the thousands separator, then coerce.
chocolate[["Amount"]] <- as.numeric(
  gsub(",", "", gsub("\\$", "", as.character(chocolate[["Amount"]])))
)

# First few cleaned values.
head(chocolate[["Amount"]])
## [1] 5320 7896 4501 12726 13685 5376

# Five-number summary plus mean of the cleaned column.
summary(chocolate[["Amount"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 7 2390 4868 5652 8027 22050

# Individual statistics, ignoring any missing values.
mean(chocolate[["Amount"]], na.rm = TRUE)
## [1] 5652.308
max(chocolate[["Amount"]], na.rm = TRUE)
## [1] 22050
min(chocolate[["Amount"]], na.rm = TRUE)
## [1] 7
# ---- 5. Data visualisation (base graphics) ----

# Histogram of the cleaned sales amounts.
hist(
  chocolate[["Amount"]],
  col = "skyblue",
  border = "white",
  main = "Distribution of Sales Amounts",
  xlab = "Amount ($)"
)

# Box plot of the same column, to show spread and outlying sales.
boxplot(
  chocolate[["Amount"]],
  col = "lightgreen",
  main = "Boxplot of Sales Amounts",
  ylab = "Amount ($)"
)
# ---- Statistical test: mean sale amount, USA vs UK ----
# Two-sample t-test of Amount by Country, restricted through `subset` to the
# rows whose Country is "USA" or "UK".
t.test(
  Amount ~ Country,
  subset = Country %in% c("USA", "UK"),
  data = chocolate
)
##
## Welch Two Sample t-test
##
## data: Amount by Country
## t = 0.27294, df = 353.68, p-value = 0.7851
## alternative hypothesis: true difference in means between group UK and group USA is not equal to 0
## 95 percent confidence interval:
## -774.9133 1024.6557
## sample estimates:
## mean in group UK mean in group USA
## 5908.944 5784.073
# ---- One-way ANOVA: Amount across all countries ----
# Extends the two-country comparison to every Country level at once.
anova_model <- aov(formula = Amount ~ Country, data = chocolate)
summary(anova_model)
## Df Sum Sq Mean Sq F value Pr(>F)
## Country 5 2.553e+07 5105165 0.302 0.912
## Residuals 1088 1.837e+10 16883913
# ---- Chi-squared test: Country vs Product ----
# Cross-tabulate the number of sales rows per Country/Product combination,
# then test the two classifications for independence.
table_country_product <- table(
  chocolate[["Country"]],
  chocolate[["Product"]]
)
chisq.test(table_country_product)
##
## Pearson's Chi-squared test
##
## data: table_country_product
## X-squared = 71.967, df = 105, p-value = 0.9943
# ---- Simple linear regression ----
# Amount modelled as a linear function of Boxes.Shipped only.
model1 <- lm(formula = Amount ~ Boxes.Shipped, data = chocolate)
summary(model1)
##
## Call:
## lm(formula = Amount ~ Boxes.Shipped, data = chocolate)
##
## Residuals:
## Min 1Q Median 3Q Max
## -5694.7 -3246.1 -769.4 2345.9 16427.1
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5755.1237 206.6254 27.853 <2e-16 ***
## Boxes.Shipped -0.6355 1.0212 -0.622 0.534
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4104 on 1092 degrees of freedom
## Multiple R-squared: 0.0003545, Adjusted R-squared: -0.000561
## F-statistic: 0.3872 on 1 and 1092 DF, p-value: 0.5339
# Scatter plot of Amount against Boxes.Shipped, overlaid with a fitted
# straight line (method = "lm") and its confidence band (se = TRUE).
library(ggplot2)
ggplot(data = chocolate, mapping = aes(x = Boxes.Shipped, y = Amount)) +
  geom_point(colour = "steelblue") +
  geom_smooth(method = "lm", se = TRUE, colour = "red") +
  labs(
    title = "Sales Amount vs. Boxes Shipped",
    x = "Boxes Shipped",
    y = "Sales Amount ($)"
  ) +
  theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'
# ---- Multiple linear regression ----
# Amount modelled on Boxes.Shipped plus the categorical predictors Country
# and Product.
model2 <- lm(
  formula = Amount ~ Boxes.Shipped + Country + Product,
  data = chocolate
)
summary(model2)
##
## Call:
## lm(formula = Amount ~ Boxes.Shipped + Country + Product, data = chocolate)
##
## Residuals:
## Min 1Q Median 3Q Max
## -6577.5 -3304.1 -776.1 2413.8 15415.5
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5704.8784 606.2296 9.410 <2e-16 ***
## Boxes.Shipped -0.6913 1.0370 -0.667 0.505
## CountryCanada -54.9176 427.4668 -0.128 0.898
## CountryIndia 114.0555 421.8329 0.270 0.787
## CountryNew Zealand -123.6476 429.3784 -0.288 0.773
## CountryUK 314.6186 424.1602 0.742 0.458
## CountryUSA 245.0671 423.3937 0.579 0.563
## Product70% Dark Bites -605.4607 830.9975 -0.729 0.466
## Product85% Dark Bars 321.3178 791.6908 0.406 0.685
## Product99% Dark & Pure 418.1462 794.1699 0.527 0.599
## ProductAfter Nines -425.1172 792.3547 -0.537 0.592
## ProductAlmond Choco 86.6070 799.5354 0.108 0.914
## ProductBaker's Choco Chips 419.2033 836.2477 0.501 0.616
## ProductCaramel Stuffed Bars -277.9705 825.9262 -0.337 0.737
## ProductChoco Coated Almonds 507.0318 850.7821 0.596 0.551
## ProductDrinking Coco -1115.8715 766.6311 -1.456 0.146
## ProductEclairs -477.0039 755.0259 -0.632 0.528
## ProductFruit & Nut Bars -488.1493 791.5440 -0.617 0.538
## ProductManuka Honey Choco 454.4505 814.3194 0.558 0.577
## ProductMilk Bars -191.2801 794.2014 -0.241 0.810
## ProductMint Chip Choco 685.8939 817.6288 0.839 0.402
## ProductOrange Choco -228.0227 804.5002 -0.283 0.777
## ProductOrganic Choco Syrup 17.6606 783.0957 0.023 0.982
## ProductPeanut Butter Cubes 959.3548 796.1073 1.205 0.228
## ProductRaspberry Choco -174.2150 799.2329 -0.218 0.827
## ProductSmooth Sliky Salty 242.2114 758.2268 0.319 0.749
## ProductSpicy Special Slims -224.8624 776.7148 -0.290 0.772
## ProductWhite Choc -25.1203 760.2394 -0.033 0.974
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4123 on 1066 degrees of freedom
## Multiple R-squared: 0.01481, Adjusted R-squared: -0.01015
## F-statistic: 0.5934 on 27 and 1066 DF, p-value: 0.9512
# Diagnostic plots for the multiple regression: plot() on the fitted lm object
# draws its standard series of residual diagnostics.
plot(model2)
# Histogram of the model residuals, as a quick look at their distribution.
hist(residuals(model2))
# Scatter plot of Amount against Boxes.Shipped with a straight-line fit.
# Note: geom_smooth() fits Amount on Boxes.Shipped alone here, so the line
# does not include the Country and Product terms of model2.
ggplot(data = chocolate, mapping = aes(x = Boxes.Shipped, y = Amount)) +
  geom_point(colour = "steelblue") +
  geom_smooth(method = "lm", se = TRUE, colour = "red") +
  labs(
    title = "Sales Amount vs. Boxes Shipped",
    x = "Boxes Shipped",
    y = "Sales Amount ($)"
  ) +
  theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'
# Same relationship drawn as one panel per Country: points are coloured by
# Country and each panel gets its own straight-line fit, without a
# confidence band (se = FALSE).
ggplot(data = chocolate, mapping = aes(x = Boxes.Shipped, y = Amount)) +
  geom_point(mapping = aes(colour = Country)) +
  geom_smooth(method = "lm", se = FALSE) +
  facet_wrap(~ Country) +
  labs(
    title = "Sales vs. Boxes Shipped by Country",
    x = "Boxes Shipped",
    y = "Sales Amount"
  ) +
  theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'
# Histogram of Amount drawn with ggplot2, using bins 1000 wide.
ggplot(data = chocolate, mapping = aes(x = Amount)) +
  geom_histogram(binwidth = 1000, colour = "white", fill = "green") +
  labs(
    title = "Distribution of Sales Amount",
    x = "Sales Amount ($)",
    y = "Frequency"
  ) +
  theme_classic()
# Box plot of Amount across ranges of Boxes.Shipped.
# Boxes.Shipped is a numeric count, so converting it with as.factor() drew one
# box for every distinct value, which is unreadable when there are many
# distinct counts. cut_number() (ggplot2) instead splits the counts into five
# groups holding roughly the same number of rows, giving one box per range.
ggplot(
  data = chocolate,
  mapping = aes(x = cut_number(Boxes.Shipped, n = 5), y = Amount)
) +
  geom_boxplot(fill = "blue") +
  labs(
    title = "Sales Amount by Boxes Shipped",
    x = "Boxes Shipped (range)",
    y = "Sales Amount"
  ) +
  theme_light()
# Bar plot of Amount for each distinct Boxes.Shipped value. Bar heights come
# straight from the Amount column (rows sharing an x value are stacked);
# geom_col() is the shorthand for geom_bar(stat = "identity").
ggplot(
  data = chocolate,
  mapping = aes(x = as.factor(Boxes.Shipped), y = Amount)
) +
  geom_col(fill = "red") +
  labs(
    title = "Sales Amount by Boxes Shipped",
    x = "Boxes Shipped",
    y = "Sales Amount"
  ) +
  theme_classic()
# Labels: point plot coloured by Product, with title, axis labels and a
# caption; the final theme() call centres the title (hjust = 0.5).
ggplot(
  data = chocolate,
  mapping = aes(x = as.factor(Boxes.Shipped), y = Amount, colour = Product)
) +
  geom_point() +
  labs(
    title = "Sales Amount by Boxes Shipped",
    x = "Boxes Shipped",
    y = "Sales Amount",
    caption = "kemboi,2025"
  ) +
  theme_classic() +
  theme(plot.title = element_text(hjust = 0.5))
# Facets: the same point plot split into one panel per Country. The panel
# variable is given to facet_wrap() as a one-sided formula using `~`.
ggplot(
  data = chocolate,
  mapping = aes(x = as.factor(Boxes.Shipped), y = Amount, colour = Product)
) +
  geom_point() +
  facet_wrap(~ Country) +
  labs(
    title = "Sales Amount by Boxes Shipped",
    x = "Boxes Shipped",
    y = "Sales Amount",
    caption = "kemboi,2025"
  ) +
  theme_classic()
# ---- Combine several plots into one figure with patchwork ----
library(patchwork)

# Panel 1: points coloured by Product, with a centred title and a caption.
p1 <- ggplot(
  data = chocolate,
  mapping = aes(x = as.factor(Boxes.Shipped), y = Amount, colour = Product)
) +
  geom_point() +
  labs(
    title = "Sales Amount by Boxes Shipped",
    x = "Boxes Shipped",
    y = "Sales Amount",
    caption = "kemboi,2025"
  ) +
  theme_classic() +
  theme(plot.title = element_text(hjust = 0.5))

# Panel 2: histogram of Amount with bins 1000 wide.
p2 <- ggplot(data = chocolate, mapping = aes(x = Amount)) +
  geom_histogram(binwidth = 1000, colour = "white", fill = "green") +
  labs(
    title = "Distribution of Sales Amount",
    x = "Sales Amount ($)",
    y = "Frequency"
  ) +
  theme_classic()

# Panel 3: bars whose heights are taken directly from Amount.
p3 <- ggplot(
  data = chocolate,
  mapping = aes(x = as.factor(Boxes.Shipped), y = Amount)
) +
  geom_col(fill = "red") +
  labs(
    title = "Sales Amount by Boxes Shipped",
    x = "Boxes Shipped",
    y = "Sales Amount"
  ) +
  theme_classic()

# Place the three panels side by side and add an overall title.
combine <- (p1 | p2 | p3) +
  plot_annotation(title = "combine plots Example")
print(combine)