# Global knitr chunk defaults: echo the code of every chunk, but keep
# messages and warnings out of the rendered document.
knitr::opts_chunk$set(echo = TRUE, message = FALSE, warning = FALSE)

Import údajov z .csv alebo .xls

# Load the purchase data set. The file is comma-separated, so read.csv()
# (sep = ",", dec = ".") is the matching reader. read.csv2() assumes
# dec = ",", which cannot work together with a comma separator and would
# leave any value written with a decimal point as character.
udaje <- read.csv("data.csv", header = TRUE)

# List the available columns.
colnames(udaje)
 [1] "Customer.ID"           "Purchase.Date"         "Product.Category"     
 [4] "Product.Price"         "Quantity"              "Total.Purchase.Amount"
 [7] "Payment.Method"        "Customer.Age"          "Returns"              
[10] "Customer.Name"         "Age"                   "Gender"               
[13] "Churn"                

Grafy

ggplot2 - knižnica pre grafy

Výber a následné triedenie

library(dplyr)

# Keep only the rows whose Purchase.Date equals 2020 and reduce the data
# to the listed subset of columns (in this order).
udaje.2020 <- udaje %>%
  filter(Purchase.Date == 2020) %>%
  select(
    Product.Category,
    Product.Price,
    Quantity,
    Total.Purchase.Amount,
    Customer.Age,
    Gender,
    Purchase.Date
  )

Scatter plot

library(ggplot2)

# Scatter plot of total purchase amount against customer age for the 2020
# subset, with points coloured by gender. The points are drawn
# semi-transparent (alpha = 0.6).
ggplot(
  data = udaje.2020,
  mapping = aes(x = Customer.Age, y = Total.Purchase.Amount, color = Gender)
) +
  geom_point(size = 2, alpha = 0.6) +
  labs(
    title = "Vzťah veku zákazníka a celkovej sumy nákupu v roku 2020",
    x = "Vek zákazníka",
    y = "Celková suma nákupu",
    color = "Pohlavie"
  ) +
  theme_minimal()

Zákazníci pokrývajú široké vekové rozpätie – približne od 18 do 70 rokov. Počet nákupov je pomerne rovnomerne rozložený vo všetkých vekových kategóriách – žiadna skupina výrazne nedominuje. Väčšina nákupov sa pohybuje v dolnej polovici grafu (do približne 4000), čo naznačuje, že menšie nákupy sú oveľa častejšie. Červené (Female) a tyrkysové (Male) body sú rozmiestnené veľmi podobne. Muži aj ženy nakupujú v podobných objemoch naprieč vekovými kategóriami.

Boxplot

library(ggplot2)

# Box plot of total purchase amount per product category for the 2020
# subset; each category gets its own fill colour.
ggplot(
  udaje.2020,
  aes(
    x = Product.Category,
    y = Total.Purchase.Amount,
    fill = Product.Category
  )
) +
  geom_boxplot() +
  labs(
    title = "Distribúcia celkovej sumy nákupu podľa kategórie produktu",
    x = "Kategória produktu",
    y = "Celková suma nákupu"
  ) +
  theme_minimal() +
  # Rotate the x-axis labels by 45 degrees and right-align them.
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

Distribúcia celkovej sumy nákupu je veľmi podobná naprieč všetkými kategóriami produktov (Books, Clothing, Electronics, Home). Neexistujú veľké rozdiely v mediáne ani v rozsahu nákupov, čo znamená, že žiadna kategória výrazne nevyniká v tom, koľko ľudia utrácajú.

Základné štatistiky.

knitr - tabuľka

library(dplyr)
library(knitr)

# Per-year descriptive statistics of Total.Purchase.Amount for the years
# 2020-2023: row count, mean, standard deviation, minimum, quartiles and
# maximum. Missing amounts are dropped (na.rm = TRUE) by every statistic
# except the row count.
purchase.stats <- udaje %>%
  filter(Purchase.Date %in% 2020:2023) %>%
  group_by(Purchase.Date) %>%
  summarise(
    n = n(),
    mean = mean(Total.Purchase.Amount, na.rm = TRUE),
    sd = sd(Total.Purchase.Amount, na.rm = TRUE),
    min = min(Total.Purchase.Amount, na.rm = TRUE),
    q25 = quantile(Total.Purchase.Amount, probs = 0.25, na.rm = TRUE),
    median = median(Total.Purchase.Amount, na.rm = TRUE),
    q75 = quantile(Total.Purchase.Amount, probs = 0.75, na.rm = TRUE),
    max = max(Total.Purchase.Amount, na.rm = TRUE),
    .groups = "drop"
  )

# Render the summary as a plain knitr table, rounded to two decimals.
kable(
  purchase.stats,
  digits = 2,
  caption = "Základné štatistiky Total Purchase Amount podľa roku"
)
Základné štatistiky Total Purchase Amount podľa roku
Purchase.Date n mean sd min q25 median q75 max
2020 66473 2714.75 1445.93 108 1467 2707 3967.00 5345
2021 66295 2732.49 1440.76 100 1482 2737 3976.00 5350
2022 69732 2730.42 1442.80 101 1481 2731 3978.25 5350
2023 47500 2722.90 1441.92 100 1475 2717 3972.00 5350

alebo krajšie tabuľky s pomocou balíka kableExtra:

library(dplyr)
library(knitr)
library(kableExtra)

# Per-year descriptive statistics of Total.Purchase.Amount (2020-2023),
# computed inside this chunk so it does not rely on earlier results.
purchase.stats <- udaje %>%
  filter(Purchase.Date %in% 2020:2023) %>%
  group_by(Purchase.Date) %>%
  summarise(
    n = n(),
    mean = mean(Total.Purchase.Amount, na.rm = TRUE),
    sd = sd(Total.Purchase.Amount, na.rm = TRUE),
    min = min(Total.Purchase.Amount, na.rm = TRUE),
    q25 = quantile(Total.Purchase.Amount, probs = 0.25, na.rm = TRUE),
    median = median(Total.Purchase.Amount, na.rm = TRUE),
    q75 = quantile(Total.Purchase.Amount, probs = 0.75, na.rm = TRUE),
    max = max(Total.Purchase.Amount, na.rm = TRUE),
    .groups = "drop"
  )

# Same table as a styled kableExtra table.
purchase.stats %>%
  kable(
    digits = 2,
    caption = "Základné štatistiky Total Purchase Amount podľa roku"
  ) %>%
  kable_styling(
    bootstrap_options = c("striped", "hover", "condensed"),
    full_width = FALSE
  ) %>%
  # Bold first column and a bold, grey-shaded header row (row 0).
  column_spec(1, bold = TRUE) %>%
  row_spec(0, bold = TRUE, background = "#f2f2f2") %>%
  # Spanning header: blank over the first two columns, a title over the
  # remaining seven statistic columns.
  add_header_above(c(" " = 2, "Total Purchase Amount Statistics" = 7))
Základné štatistiky Total Purchase Amount podľa roku
Total Purchase Amount Statistics
Purchase.Date n mean sd min q25 median q75 max
2020 66473 2714.75 1445.93 108 1467 2707 3967.00 5345
2021 66295 2732.49 1440.76 100 1482 2737 3976.00 5350
2022 69732 2730.42 1442.80 101 1481 2731 3978.25 5350
2023 47500 2722.90 1441.92 100 1475 2717 3972.00 5350

Testovanie hypotéz

# Two-sample t-test comparing the total purchase amounts of 2020 and 2021.
# var.equal is left at its default, so the Welch variant is used. The two
# samples are passed as inline expressions so the printed "data:" label
# stays descriptive.
t.test.result <- t.test(
  x = udaje$Total.Purchase.Amount[udaje$Purchase.Date == 2020],
  y = udaje$Total.Purchase.Amount[udaje$Purchase.Date == 2021]
)

print(t.test.result)

    Welch Two Sample t-test

data:  udaje$Total.Purchase.Amount[udaje$Purchase.Date == 2020] and udaje$Total.Purchase.Amount[udaje$Purchase.Date == 2021]
t = -2.2399, df = 132766, p-value = 0.0251
alternative hypothesis: true difference in means is not equal to 0
95 percent confidence interval:
 -33.272512  -2.217213
sample estimates:
mean of x mean of y 
 2714.745  2732.490 

ANOVA: Comparing Total Purchase Amount Across Years (2020–2023)

# One-way ANOVA of total purchase amount by year. Purchase.Date is wrapped
# in factor() so it enters the model as a grouping variable rather than as
# a numeric predictor.
anova.result <- aov(
  formula = Total.Purchase.Amount ~ factor(Purchase.Date),
  data = udaje
)
summary(anova.result)
                          Df    Sum Sq Mean Sq F value Pr(>F)
factor(Purchase.Date)      3 1.293e+07 4310257    2.07  0.102
Residuals             249996 5.205e+11 2082031               

Linear Regression: Predicting Total Purchase Amount

# Ordinary least squares fit of total purchase amount on product price,
# customer age and quantity.
model <- lm(
  formula = Total.Purchase.Amount ~ Product.Price + Customer.Age + Quantity,
  data = udaje
)

summary(model)

Call:
lm(formula = Total.Purchase.Amount ~ Product.Price + Customer.Age + 
    Quantity, data = udaje)

Residuals:
     Min       1Q   Median       3Q      Max 
-2503.19 -1245.94     0.78  1248.80  2502.01 

Coefficients:
                Estimate Std. Error t value Pr(>|t|)    
(Intercept)   2517.10048   11.87068 212.044   <2e-16 ***
Product.Price   -0.02177    0.02036  -1.069    0.285    
Customer.Age     4.87284    0.18775  25.954   <2e-16 ***
Quantity        -0.10037    2.03719  -0.049    0.961    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 1441 on 249996 degrees of freedom
Multiple R-squared:  0.002693,  Adjusted R-squared:  0.002681 
F-statistic:   225 on 3 and 249996 DF,  p-value: < 2.2e-16
library(broom)
library(dplyr)
library(kableExtra)
library(stringr)

# Fit the regression whose coefficients are tabulated below.
model <- lm(
  Total.Purchase.Amount ~ Product.Price + Customer.Age + Quantity,
  data = udaje
)

# Tidy coefficient table: readable term labels, significance markers and
# a formatted 95% confidence interval.
coef.tbl <- tidy(model, conf.int = TRUE) %>%
  mutate(
    term = recode(term,
      "(Intercept)" = "Intercept",
      "Product.Price" = "Product Price",
      "Customer.Age" = "Customer Age",
      "Quantity" = "Quantity"
    ),
    # Significance markers; thresholds match the table footnote.
    stars = case_when(
      p.value < 0.001 ~ "***",
      p.value < 0.01  ~ "**",
      p.value < 0.05  ~ "*",
      p.value < 0.1   ~ "·",
      TRUE            ~ ""
    )
  ) %>%
  transmute(
    Term = term,
    Estimate = estimate,
    `Std. Error` = std.error,
    `t value` = statistic,
    `p value` = p.value,
    # sprintf("%.3f") always prints three decimals; round() would drop
    # trailing zeros (e.g. "4.5" instead of "4.500") and make the interval
    # bounds inconsistent with the other columns.
    `95% CI` = str_c(
      "[", sprintf("%.3f", conf.low), ", ", sprintf("%.3f", conf.high), "]"
    ),
    Sig = stars
  )

# Render the coefficient table with a significance-code footnote.
coef.tbl %>%
  kable(
    digits = 3,
    caption = "OLS Regression Coefficients (Total Purchase Amount ~ Product Price + Customer Age + Quantity)"
  ) %>%
  kable_styling(
    full_width = FALSE,
    bootstrap_options = c("striped", "hover", "condensed")
  ) %>%
  column_spec(1, bold = TRUE) %>%
  row_spec(0, bold = TRUE, background = "#f2f2f2") %>%
  footnote(
    general = "Signif. codes: *** p<0.001, ** p<0.01, * p<0.05, · p<0.1.",
    threeparttable = TRUE
  )
OLS Regression Coefficients (Total Purchase Amount ~ Product Price + Customer Age + Quantity)
Term Estimate Std. Error t value p value 95% CI Sig
Intercept 2517.100 11.871 212.044 0.000 [2493.834, 2540.367] ***
Product Price -0.022 0.020 -1.069 0.285 [-0.062, 0.018]
Customer Age 4.873 0.188 25.954 0.000 [4.505, 5.241] ***
Quantity -0.100 2.037 -0.049 0.961 [-4.093, 3.892]
Note:
Signif. codes: *** p<0.001, ** p<0.01, * p<0.05, · p<0.1.
# One-row summary of overall model fit taken from glance(): R-squared,
# adjusted R-squared, F statistic with its p-value, AIC, BIC and the
# number of observations.
fit.tbl <- glance(model) %>%
  transmute(
    `R-squared` = r.squared,
    `Adj. R-squared` = adj.r.squared,
    `F-statistic` = statistic,
    `F p-value` = p.value,
    AIC = AIC,
    BIC = BIC,
    `Num. obs.` = nobs
  )

# Render the fit statistics as a compact table, rounded to three decimals.
fit.tbl %>%
  kable(digits = 3, caption = "Model Fit Statistics") %>%
  kable_styling(bootstrap_options = "condensed", full_width = FALSE)
Model Fit Statistics
R-squared Adj. R-squared F-statistic F p-value AIC BIC Num. obs.
0.003 0.003 224.989 0 4346021 4346073 250000
