R Markdown

set a working directory

  # Make the training folder the working directory so the relative CSV path
  # below resolves against it.
  setwd(dir = "~/R Training")
# readr supplies read_csv(), which returns the data as a tibble.
library(readr)
Chocolate_Sales <- read_csv(file = "Chocolate Sales.csv")
## Rows: 1094 Columns: 6
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (5): Sales Person, Country, Product, Date, Amount
## dbl (1): Boxes Shipped
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# View() opens a data viewer window, which is only useful in an interactive
# session, so skip it when the script is run or rendered non-interactively.
if (interactive()) {
  View(Chocolate_Sales)
}
# Base-R copy of the same file; this data frame is the one used by the
# analysis below. The file name is spelled exactly as in the read_csv() call
# above so the read also works on case-sensitive file systems.
chocolate <- read.csv("Chocolate Sales.csv", stringsAsFactors = FALSE)

load data set

library(readr)
library(ggplot2)
# Re-import with readr. The file name uses the same spelling as the first
# import, so this read does not depend on a case-insensitive file system.
Chocolate_Sales <- read_csv("Chocolate Sales.csv")
## Rows: 1094 Columns: 6
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (5): Sales Person, Country, Product, Date, Amount
## dbl (1): Boxes Shipped
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Preview the first six rows of the imported tibble.
head(Chocolate_Sales, n = 6)
## # A tibble: 6 × 6
##   `Sales Person` Country   Product             Date      Amount  `Boxes Shipped`
##   <chr>          <chr>     <chr>               <chr>     <chr>             <dbl>
## 1 Jehu Rudeforth UK        Mint Chip Choco     04-Jan-22 $5,320              180
## 2 Van Tuxwell    India     85% Dark Bars       01-Aug-22 $7,896               94
## 3 Gigi Bohling   India     Peanut Butter Cubes 07-Jul-22 $4,501               91
## 4 Jan Morforth   Australia Peanut Butter Cubes 27-Apr-22 $12,726             342
## 5 Jehu Rudeforth UK        Peanut Butter Cubes 24-Feb-22 $13,685             184
## 6 Van Tuxwell    India     Smooth Sliky Salty  06-Jun-22 $5,376               38

clean data

Remove the dollar sign and commas from the Amount column

# List the column names of the base-R data frame (read.csv() replaced the
# spaces in the headers with dots).
names(chocolate)
## [1] "Sales.Person"  "Country"       "Product"       "Date"         
## [5] "Amount"        "Boxes.Shipped"

2. Convert the Amount column to character

# Force Amount to character so that text substitution can be applied to it.
chocolate[["Amount"]] <- as.character(chocolate[["Amount"]])

3. Remove the dollar sign and commas

# Strip the currency formatting: the inner call removes every "$", the outer
# call removes every ",".
chocolate$Amount <- gsub(",", "", gsub("\\$", "", chocolate$Amount))

4. Convert to numeric

# Parse the Amount strings as numbers.
chocolate[["Amount"]] <- as.numeric(chocolate[["Amount"]])

 # Quick checks on the numeric Amount column: first values, summary(), then
 # the mean, maximum and minimum individually (ignoring any NA).
 head(chocolate[["Amount"]])
## [1]  5320  7896  4501 12726 13685  5376
 summary(chocolate[["Amount"]])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##       7    2390    4868    5652    8027   22050
 mean(chocolate[["Amount"]], na.rm = TRUE)
## [1] 5652.308
 max(chocolate[["Amount"]], na.rm = TRUE)
## [1] 22050
 min(chocolate[["Amount"]], na.rm = TRUE)
## [1] 7

#5 Data visualization # Histogram

 # Base-graphics histogram of the sales amounts.
 hist(
   x = chocolate$Amount,
   col = "skyblue",
   border = "white",
   main = "Distribution of Sales Amounts",
   xlab = "Amount ($)"
 )

# Boxplot

 # Base-graphics boxplot of the same column.
 boxplot(
   x = chocolate$Amount,
   col = "lightgreen",
   main = "Boxplot of Sales Amounts",
   ylab = "Amount ($)"
 )

# statistical test # compare sales in USA vs UK

# Two-sample t-test of Amount between the UK and USA rows only; rows from the
# other countries are excluded through `subset`.
t.test(
  Amount ~ Country,
  data = chocolate,
  subset = Country %in% c("UK", "USA")
)
## 
##  Welch Two Sample t-test
## 
## data:  Amount by Country
## t = 0.27294, df = 353.68, p-value = 0.7851
## alternative hypothesis: true difference in means between group UK and group USA is not equal to 0
## 95 percent confidence interval:
##  -774.9133 1024.6557
## sample estimates:
##  mean in group UK mean in group USA 
##          5908.944          5784.073

# ANOVA # compare sales across more than two countries

 # One-way ANOVA of Amount with Country as the grouping factor.
 anova_model <- aov(formula = Amount ~ Country, data = chocolate)
 summary(object = anova_model)
##               Df    Sum Sq  Mean Sq F value Pr(>F)
## Country        5 2.553e+07  5105165   0.302  0.912
## Residuals   1088 1.837e+10 16883913

# chi-square test # products sold by country

# Cross-tabulate Country against Product, then run a chi-squared test on the
# resulting contingency table.
table_country_product <- table(chocolate[["Country"]], chocolate[["Product"]])
chisq.test(x = table_country_product)
## 
##  Pearson's Chi-squared test
## 
## data:  table_country_product
## X-squared = 71.967, df = 105, p-value = 0.9943

# simple linear regression

# Simple linear regression: Amount explained by Boxes.Shipped alone.
model1 <- lm(formula = Amount ~ Boxes.Shipped, data = chocolate)
summary(object = model1)
## 
## Call:
## lm(formula = Amount ~ Boxes.Shipped, data = chocolate)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -5694.7 -3246.1  -769.4  2345.9 16427.1 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   5755.1237   206.6254  27.853   <2e-16 ***
## Boxes.Shipped   -0.6355     1.0212  -0.622    0.534    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4104 on 1092 degrees of freedom
## Multiple R-squared:  0.0003545,  Adjusted R-squared:  -0.000561 
## F-statistic: 0.3872 on 1 and 1092 DF,  p-value: 0.5339

# Scatter plot + regression line

library(ggplot2)
 # Scatter of Amount against Boxes.Shipped with a fitted straight line and
 # its confidence band (se = TRUE).
 ggplot(data = chocolate, mapping = aes(x = Boxes.Shipped, y = Amount)) +
   geom_point(color = "steelblue") +
   geom_smooth(method = "lm", se = TRUE, color = "red") +
   labs(
     title = "Sales Amount vs. Boxes Shipped",
     x = "Boxes Shipped",
     y = "Sales Amount ($)"
   ) +
   theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'

# multiple linear regression

 # Multiple linear regression: Amount on Boxes.Shipped plus the Country and
 # Product columns.
 model2 <- lm(
   formula = Amount ~ Boxes.Shipped + Country + Product,
   data = chocolate
 )
 summary(object = model2)
## 
## Call:
## lm(formula = Amount ~ Boxes.Shipped + Country + Product, data = chocolate)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -6577.5 -3304.1  -776.1  2413.8 15415.5 
## 
## Coefficients:
##                               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                  5704.8784   606.2296   9.410   <2e-16 ***
## Boxes.Shipped                  -0.6913     1.0370  -0.667    0.505    
## CountryCanada                 -54.9176   427.4668  -0.128    0.898    
## CountryIndia                  114.0555   421.8329   0.270    0.787    
## CountryNew Zealand           -123.6476   429.3784  -0.288    0.773    
## CountryUK                     314.6186   424.1602   0.742    0.458    
## CountryUSA                    245.0671   423.3937   0.579    0.563    
## Product70% Dark Bites        -605.4607   830.9975  -0.729    0.466    
## Product85% Dark Bars          321.3178   791.6908   0.406    0.685    
## Product99% Dark & Pure        418.1462   794.1699   0.527    0.599    
## ProductAfter Nines           -425.1172   792.3547  -0.537    0.592    
## ProductAlmond Choco            86.6070   799.5354   0.108    0.914    
## ProductBaker's Choco Chips    419.2033   836.2477   0.501    0.616    
## ProductCaramel Stuffed Bars  -277.9705   825.9262  -0.337    0.737    
## ProductChoco Coated Almonds   507.0318   850.7821   0.596    0.551    
## ProductDrinking Coco        -1115.8715   766.6311  -1.456    0.146    
## ProductEclairs               -477.0039   755.0259  -0.632    0.528    
## ProductFruit & Nut Bars      -488.1493   791.5440  -0.617    0.538    
## ProductManuka Honey Choco     454.4505   814.3194   0.558    0.577    
## ProductMilk Bars             -191.2801   794.2014  -0.241    0.810    
## ProductMint Chip Choco        685.8939   817.6288   0.839    0.402    
## ProductOrange Choco          -228.0227   804.5002  -0.283    0.777    
## ProductOrganic Choco Syrup     17.6606   783.0957   0.023    0.982    
## ProductPeanut Butter Cubes    959.3548   796.1073   1.205    0.228    
## ProductRaspberry Choco       -174.2150   799.2329  -0.218    0.827    
## ProductSmooth Sliky Salty     242.2114   758.2268   0.319    0.749    
## ProductSpicy Special Slims   -224.8624   776.7148  -0.290    0.772    
## ProductWhite Choc             -25.1203   760.2394  -0.033    0.974    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4123 on 1066 degrees of freedom
## Multiple R-squared:  0.01481,    Adjusted R-squared:  -0.01015 
## F-statistic: 0.5934 on 27 and 1066 DF,  p-value: 0.9512
 # Draw the diagnostic plots that plot() produces for the fitted lm object.
 plot(model2)

 # Histogram of the model residuals. The argument is left exactly as written
 # because hist() builds its default title and x-axis label from it.
 hist(residuals(model2))

 # Scatter plot of Amount against Boxes.Shipped with a straight-line fit.
 # Note: geom_smooth() fits Amount ~ Boxes.Shipped on its own here; the
 # Country and Product terms of model2 are not part of this line.
 ggplot(data = chocolate, mapping = aes(x = Boxes.Shipped, y = Amount)) +
   geom_point(color = "steelblue") +
   geom_smooth(method = "lm", se = TRUE, color = "red") +
   labs(
     title = "Sales Amount vs. Boxes Shipped",
     x = "Boxes Shipped",
     y = "Sales Amount ($)"
   ) +
   theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'

multiple linear regression: Amount ~ Boxes.Shipped + Country

 # One panel per country: points coloured by Country, each panel with its own
 # straight-line fit and no confidence band (se = FALSE).
 ggplot(data = chocolate, mapping = aes(x = Boxes.Shipped, y = Amount)) +
   geom_point(mapping = aes(color = Country)) +
   geom_smooth(method = "lm", se = FALSE) +
   facet_wrap(~ Country) +
   labs(
     title = "Sales vs. Boxes Shipped by Country",
     x = "Boxes Shipped",
     y = "Sales Amount"
   ) +
   theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'

 # histogram of sales
  # ggplot2 histogram of Amount using bins that are 1000 wide.
  ggplot(data = chocolate, mapping = aes(x = Amount)) +
    geom_histogram(binwidth = 1000, fill = "green", color = "white") +
    labs(
      title = "Distribution of Sales Amount",
      x = "Sales Amount ($)",
      y = "Frequency"
    ) +
    theme_classic()

# box plot

# One box of Amount for each distinct Boxes.Shipped value; as.factor() turns
# the numeric column into a discrete x axis.
ggplot(
  data = chocolate,
  mapping = aes(x = as.factor(Boxes.Shipped), y = Amount)
) +
  geom_boxplot(fill = "blue") +
  labs(
    title = "Sales Amount by Boxes Shipped",
    x = "Boxes Shipped",
    y = "Sales Amount"
  ) +
  theme_light()

# Bar plot of ‘sales’

 # Bars whose heights come straight from Amount (no counting), one x position
 # per distinct Boxes.Shipped value.
 ggplot(
   data = chocolate,
   mapping = aes(x = as.factor(Boxes.Shipped), y = Amount)
 ) +
   geom_col(fill = "red") +
   labs(
     title = "Sales Amount by Boxes Shipped",
     x = "Boxes Shipped",
     y = "Sales Amount"
   ) +
   theme_classic()

# Labels

 # Points coloured by Product, with a caption and a horizontally centred
 # title (hjust = 0.5).
 ggplot(
   data = chocolate,
   mapping = aes(x = as.factor(Boxes.Shipped), y = Amount, colour = Product)
 ) +
   geom_point() +
   labs(
     title = "Sales Amount by Boxes Shipped",
     x = "Boxes Shipped",
     y = "Sales Amount",
     caption = "kemboi,2025"
   ) +
   theme_classic() +
   theme(plot.title = element_text(hjust = 0.5))

# Facets ** make use of the ~ operator

# Same coloured points as the labelled plot, split into one panel per Country
# through the formula passed to facet_wrap().
ggplot(
  data = chocolate,
  mapping = aes(x = as.factor(Boxes.Shipped), y = Amount, colour = Product)
) +
  geom_point() +
  facet_wrap(~ Country) +
  labs(
    title = "Sales Amount by Boxes Shipped",
    x = "Boxes Shipped",
    y = "Sales Amount",
    caption = "kemboi,2025"
  ) +
  theme_classic()

# combine the plots

 library(patchwork)
 
# Panel 1: points coloured by Product, with a caption and a centred title.
p1 <- ggplot(
  data = chocolate,
  mapping = aes(x = as.factor(Boxes.Shipped), y = Amount, colour = Product)
) +
  geom_point() +
  labs(
    title = "Sales Amount by Boxes Shipped",
    x = "Boxes Shipped",
    y = "Sales Amount",
    caption = "kemboi,2025"
  ) +
  theme_classic() +
  theme(plot.title = element_text(hjust = 0.5))

# Panel 2: histogram of Amount using bins that are 1000 wide.
p2 <- ggplot(data = chocolate, mapping = aes(x = Amount)) +
  geom_histogram(binwidth = 1000, fill = "green", color = "white") +
  labs(
    title = "Distribution of Sales Amount",
    x = "Sales Amount ($)",
    y = "Frequency"
  ) +
  theme_classic()

# Panel 3: bars whose heights come straight from Amount.
p3 <- ggplot(
  data = chocolate,
  mapping = aes(x = as.factor(Boxes.Shipped), y = Amount)
) +
  geom_col(fill = "red") +
  labs(
    title = "Sales Amount by Boxes Shipped",
    x = "Boxes Shipped",
    y = "Sales Amount"
  ) +
  theme_classic()

# Join the three panels with `|`. The parentheses are required because `+`
# binds tighter than `|`, so without them the annotation would be added to p3
# before the panels are joined.
combine <- (p1 | p2 | p3) +
  plot_annotation(title = "combine plots Example")
print(combine)