L7_2

Trực quan hóa dữ liệu 2025 - P2

# Load the packages shared by the whole lesson
pacman::p_load(ggplot2, GGally, gganimate, tidyverse)

# Read the "Data" sheet of the supermarket transactions workbook into X
X <- openxlsx::read.xlsx(
  xlsxFile = "~/Library/CloudStorage/GoogleDrive-chuonghn90@gmail.com/Other computers/My Laptop/Lectures_Quant using R/Supermarket Transactions.xlsx",
  sheet = "Data"
)

# Show the first five rows
X |> head(n = 5)

  Transaction Purchase.Date Customer.ID Gender Marital.Status Homeowner
1           1         40895        7223      F              M         Y
2           2         40897        7841      M              M         Y
3           3         40898        8374      F              M         N
4           4         40898        9619      M              M         Y
5           5         40899        1900      F              S         Y
  Children Annual.Income          City State.or.Province Country Product.Family
1        2   $30K - $50K   Los Angeles                CA     USA           Food
2        5   $70K - $90K   Los Angeles                CA     USA           Food
3        2   $50K - $70K     Bremerton                WA     USA           Food
4        3   $30K - $50K      Portland                OR     USA           Food
5        3 $130K - $150K Beverly Hills                CA     USA          Drink
  Product.Department     Product.Category Units.Sold Revenue
1        Snack Foods          Snack Foods          5   27.38
2            Produce           Vegetables          5   14.90
3        Snack Foods          Snack Foods          3    5.52
4             Snacks                Candy          4    4.44
5          Beverages Carbonated Beverages          4   14.00

# List the column names of X
X |> names()

 [1] "Transaction"        "Purchase.Date"      "Customer.ID"       
 [4] "Gender"             "Marital.Status"     "Homeowner"         
 [7] "Children"           "Annual.Income"      "City"              
[10] "State.or.Province"  "Country"            "Product.Family"    
[13] "Product.Department" "Product.Category"   "Units.Sold"        
[16] "Revenue"

# Summary plots for pairs of categorical variables (GGally helpers) ----

# Gender against Marital.Status
X |>
  ggally_colbar(aes(Gender, Marital.Status))

# Marital.Status against Gender, coloured by Homeowner
X |>
  ggally_count(aes(Marital.Status, Gender, colour = Homeowner))

# Homeowner against Gender, restricted to rows with Marital.Status == "M"
ggally_cross(
  filter(X, Marital.Status == "M"),
  aes(Homeowner, Gender, colour = Gender)
)

# Cross table of Marital.Status and Gender
X |>
  ggally_crosstable(
    aes(Marital.Status, Gender),
    cells = "col.prop",
    fill = "std.resid"
  )

# Annual.Income against Children
ggally_facetbar(X, aes(Annual.Income, Children))

# Marital.Status against Gender, coloured by Product.Family
X |>
  ggally_facetbar(aes(Marital.Status, Gender, colour = Product.Family))

# Gender against Product.Category, restricted to Marital.Status == "M"
ggally_rowbar(
  filter(X, Marital.Status == "M"),
  aes(Gender, Product.Category)
)

Biểu diễn thông tin doanh thu theo địa phương và giới tính

BD1

# Bar chart of Revenue by City, bar heights taken directly from the data
X |>
  ggplot(aes(x = City, y = Revenue)) +
  geom_col()

Chuyển dạng hướng biểu đồ

# Rotate the coordinate axes so the cities run along the vertical axis
X |>
  ggplot(aes(x = City, y = Revenue)) +
  geom_col() +
  coord_flip()

Phân biệt theo giới tính

# Fill each City's bar by Gender (default bar position)
X |>
  ggplot(aes(x = City, y = Revenue, fill = Gender)) +
  geom_col() +
  coord_flip()

# Same chart with the Gender bars placed side by side.
# NOTE(review): X has one row per transaction, so with position = "dodge" the
# bars of a City/Gender pair appear to be drawn over one another rather than
# summed -- confirm this is intended; the aggregated charts further down in
# the file avoid it.
X |>
  ggplot(aes(x = City, y = Revenue, fill = Gender)) +
  geom_col(position = "dodge") +
  coord_flip()

# Side-by-side version, one panel per Gender
X |>
  ggplot(aes(x = City, y = Revenue, fill = Gender)) +
  geom_col(position = "dodge") +
  coord_flip() +
  facet_wrap(~Gender)

# Total Revenue for every City / Gender pair (the result stays grouped by City)
summarise(
  group_by(X, City, Gender),
  Revenue = sum(Revenue)
)

`summarise()` has grouped output by 'City'. You can override using the
`.groups` argument.

# A tibble: 46 × 3
# Groups:   City [23]
   City          Gender Revenue
   <chr>         <chr>    <dbl>
 1 Acapulco      F        2566.
 2 Acapulco      M        2596.
 3 Bellingham    F         453.
 4 Bellingham    M         539.
 5 Beverly Hills F        5050.
 6 Beverly Hills M        5270.
 7 Bremerton     F        5270.
 8 Bremerton     M        5705.
 9 Camacho       F        3643.
10 Camacho       M        2154.
# ℹ 36 more rows

# Total revenue per City and Gender, drawn as one coloured point per pair
X |>
  group_by(City, Gender) |>
  summarise(Revenue = sum(Revenue, na.rm = TRUE)) |>
  ungroup() |>
  ggplot(aes(x = Revenue, y = City)) +
  geom_point(aes(colour = Gender))

`summarise()` has grouped output by 'City'. You can override using the
`.groups` argument.

# Aggregated totals as horizontal bars, one panel per Gender
X |>
  group_by(City, Gender) |>
  summarise(Revenue = sum(Revenue, na.rm = TRUE)) |>
  ungroup() |>
  ggplot(aes(x = Revenue, y = City, fill = Gender)) +
  geom_col(position = "dodge") +
  facet_wrap(~Gender)

`summarise()` has grouped output by 'City'. You can override using the
`.groups` argument.

# Points per City / Gender total, joined by a line within each City
X |>
  group_by(City, Gender) |>
  summarise(Revenue = sum(Revenue, na.rm = TRUE)) |>
  ungroup() |>
  ggplot(aes(x = Revenue, y = City)) +
  geom_point(aes(colour = Gender)) +
  geom_line(aes(group = City))

`summarise()` has grouped output by 'City'. You can override using the
`.groups` argument.

# Same connected-point chart, with every point labelled by its rounded total
X |>
  group_by(City, Gender) |>
  summarise(Revenue = sum(Revenue, na.rm = TRUE)) |>
  ungroup() |>
  ggplot(aes(x = Revenue, y = City, label = round(Revenue, 0))) +
  geom_point(aes(colour = Gender)) +
  geom_line(aes(group = City)) +
  geom_text()

`summarise()` has grouped output by 'City'. You can override using the
`.groups` argument.

# Total revenue for every City / Gender pair, stored ungrouped as X2.
# `TRUE` is spelled out because `T` is an ordinary variable that can be
# reassigned, and the target of the assignment is put first so it is easy to
# find.
X2 <- X |>
  group_by(City, Gender) |>
  summarise(Revenue = sum(Revenue, na.rm = TRUE)) |>
  ungroup()

`summarise()` has grouped output by 'City'. You can override using the
`.groups` argument.

# For each City, keep the row with the largest Revenue
# (sort first, then take the first row of every City group)
X2 |>
  arrange(desc(Revenue)) |>
  group_by(City) |>
  slice(1)

# A tibble: 23 × 3
# Groups:   City [23]
   City          Gender Revenue
   <chr>         <chr>    <dbl>
 1 Acapulco      M        2596.
 2 Bellingham    M         539.
 3 Beverly Hills M        5270.
 4 Bremerton     M        5705.
 5 Camacho       F        3643.
 6 Guadalajara   F         291.
 7 Hidalgo       F        7361.
 8 Los Angeles   M        6282.
 9 Merida        F        4770.
10 Mexico City   F        1255.
# ℹ 13 more rows

# Label data for the chart: within every City,
#   right_label = the row with the highest Revenue,
#   left_label  = the row with the second-highest Revenue.
right_label <- X2 |>
  arrange(desc(Revenue)) |>
  group_by(City) |>
  slice(1)

left_label <- X2 |>
  arrange(desc(Revenue)) |>
  group_by(City) |>
  slice(2)

# One line per City joining its Gender totals; the larger value is labelled
# to the right of its point and the smaller one to the left.
# The x limits are widened so the labels have room at both ends.
ggplot(X2, aes(x = Revenue, y = City)) +
  geom_line(aes(group = City)) +
  geom_point(aes(colour = Gender), size = 1.5) +
  geom_text(
    data = right_label,
    mapping = aes(colour = Gender, label = round(Revenue, 0)),
    size = 3,
    hjust = -0.5
  ) +
  geom_text(
    data = left_label,
    mapping = aes(colour = Gender, label = round(Revenue, 0)),
    size = 3,
    hjust = 1.5
  ) +
  scale_x_continuous(limits = c(-500, 10500))

# Cities ranked by their overall revenue (ascending); that ranking becomes the
# level order of the City factor.
Thubac <- X2 |>
  group_by(City) |>
  summarise(Revenue = sum(Revenue, na.rm = TRUE)) |>
  arrange(Revenue) |>
  mutate(City = factor(City, levels = City))

# Apply the same level order to X2 so plots list the cities by revenue
X2 <- mutate(X2, City = factor(City, levels = Thubac$City))

# Labelled chart again, now drawn from X2 with City as an ordered factor
ggplot(X2, aes(x = Revenue, y = City)) +
  geom_line(aes(group = City)) +
  geom_point(aes(colour = Gender), size = 1.5) +
  geom_text(
    data = right_label,
    mapping = aes(colour = Gender, label = round(Revenue, 0)),
    size = 3,
    hjust = -0.5
  ) +
  geom_text(
    data = left_label,
    mapping = aes(colour = Gender, label = round(Revenue, 0)),
    size = 3,
    hjust = 1.5
  )

# Inspect the first rows of X2
X2 |> head()

# A tibble: 6 × 3
  City          Gender Revenue
  <fct>         <chr>    <dbl>
1 Acapulco      F        2566.
2 Acapulco      M        2596.
3 Bellingham    F         453.
4 Bellingham    M         539.
5 Beverly Hills F        5050.
6 Beverly Hills M        5270.

# D = largest / smallest Revenue within each City; show the ten rows with the
# highest D
X2 |>
  group_by(City) |>
  mutate(D = max(Revenue) / min(Revenue)) |>
  arrange(desc(D)) |>
  head(n = 10)

# A tibble: 10 × 4
# Groups:   City [5]
   City        Gender Revenue     D
   <fct>       <chr>    <dbl> <dbl>
 1 Hidalgo     F        7361.  1.86
 2 Hidalgo     M        3952.  1.86
 3 Camacho     F        3643.  1.69
 4 Camacho     M        2154.  1.69
 5 Walla Walla F         488.  1.38
 6 Walla Walla M         676.  1.38
 7 San Andres  F        3459.  1.37
 8 San Andres  M        4748.  1.37
 9 Yakima      F        2159.  1.35
10 Yakima      M        2911.  1.35

# Keep the ten rows with the highest within-City revenue ratio D as BigD
BigD <- X2 |>
  group_by(City) |>
  mutate(D = max(Revenue) / min(Revenue)) |>
  arrange(desc(D)) |>
  head(n = 10)

# Rows of X2 belonging to the cities present in BigD
filter(X2, City %in% BigD$City)

# A tibble: 10 × 3
   City        Gender Revenue
   <fct>       <chr>    <dbl>
 1 Camacho     F        3643.
 2 Camacho     M        2154.
 3 Hidalgo     F        7361.
 4 Hidalgo     M        3952.
 5 San Andres  F        3459.
 6 San Andres  M        4748.
 7 Walla Walla F         488.
 8 Walla Walla M         676.
 9 Yakima      F        2159.
10 Yakima      M        2911.

# Rows of X2 for the cities listed in BigD; these are emphasised in the plot
highlight <- filter(X2, City %in% BigD$City)

# All cities in gray with small points, highlighted cities drawn on top with
# a default-coloured line and larger points
ggplot(X2, aes(x = Revenue, y = City)) +
  geom_line(aes(group = City), colour = "gray") +
  geom_point(aes(colour = Gender), size = 0.5) +
  geom_line(data = highlight, mapping = aes(group = City)) +
  geom_point(data = highlight, mapping = aes(colour = Gender), size = 2)

# Restrict both label tables to the cities listed in BigD
right_label1 <- right_label |> filter(City %in% BigD$City)
left_label1 <- left_label |> filter(City %in% BigD$City)

# Highlighted chart with value labels on the emphasised cities only
ggplot(X2, aes(x = Revenue, y = City)) +
  geom_line(aes(group = City), colour = "gray") +
  geom_point(aes(colour = Gender), size = 1) +
  geom_line(data = highlight, mapping = aes(group = City)) +
  geom_point(data = highlight, mapping = aes(colour = Gender), size = 5) +
  geom_text(
    data = right_label1,
    mapping = aes(colour = Gender, label = round(Revenue, 0)),
    size = 3,
    hjust = -0.5
  ) +
  geom_text(
    data = left_label1,
    mapping = aes(colour = Gender, label = round(Revenue, 0)),
    size = 3,
    hjust = 1.5
  )

Loại biểu đồ tương tác

Biểu đồ động

library(htmlwidgets)
library(plotly)


Attaching package: 'plotly'

The following object is masked from 'package:ggplot2':

    last_plot

The following object is masked from 'package:stats':

    filter

The following object is masked from 'package:graphics':

    layout

# Show the first rows of X
X |> head()

  Transaction Purchase.Date Customer.ID Gender Marital.Status Homeowner
1           1         40895        7223      F              M         Y
2           2         40897        7841      M              M         Y
3           3         40898        8374      F              M         N
4           4         40898        9619      M              M         Y
5           5         40899        1900      F              S         Y
6           6         40899        6696      F              M         Y
  Children Annual.Income          City State.or.Province Country Product.Family
1        2   $30K - $50K   Los Angeles                CA     USA           Food
2        5   $70K - $90K   Los Angeles                CA     USA           Food
3        2   $50K - $70K     Bremerton                WA     USA           Food
4        3   $30K - $50K      Portland                OR     USA           Food
5        3 $130K - $150K Beverly Hills                CA     USA          Drink
6        3   $10K - $30K Beverly Hills                CA     USA           Food
  Product.Department     Product.Category Units.Sold Revenue
1        Snack Foods          Snack Foods          5   27.38
2            Produce           Vegetables          5   14.90
3        Snack Foods          Snack Foods          3    5.52
4             Snacks                Candy          4    4.44
5          Beverages Carbonated Beverages          4   14.00
6               Deli          Side Dishes          3    4.37

  # Total revenue (R) for every City / Gender / Product.Family combination.
  # summarise() keeps a single row per combination, so each total is drawn
  # once instead of once per transaction. The result gets its own name so the
  # City / Gender table X2, which the plot P below reads, is not overwritten.
  rev_by_family <- X |>
    group_by(City, Gender, Product.Family) |>
    summarise(R = sum(Revenue, na.rm = TRUE), .groups = "drop")

  # Bubble chart: position and size show the total, colour the product family
  p1 <- ggplot(
    rev_by_family,
    aes(R, City, size = R, color = Product.Family)
  ) +
    geom_point()
  p1

  # Interactive version of the same chart
  ggplotly(p1)

  # Highlighted City / Gender chart with labels, stored as P.
  # It reads X2, highlight, right_label1 and left_label1 as defined earlier.
  P <- ggplot(X2, aes(Revenue, City)) +
    geom_line(aes(group = City), color = "gray") +
    geom_point(aes(color = Gender), size = 1) +
    geom_line(data = highlight, aes(group = City)) +
    geom_point(data = highlight, aes(color = Gender), size = 5) +
    geom_text(
      data = right_label1,
      aes(color = Gender, label = round(Revenue, 0)),
      size = 3, hjust = -.5
    ) +
    geom_text(
      data = left_label1,
      aes(color = Gender, label = round(Revenue, 0)),
      size = 3, hjust = 1.5
    )

# Rebuild X2 as the ungrouped table of total revenue per City and Gender
X2 <- X |>
  group_by(City, Gender) |>
  summarise(Revenue = sum(Revenue, na.rm = TRUE)) |>
  ungroup()

`summarise()` has grouped output by 'City'. You can override using the
`.groups` argument.

# Highlighted and labelled City / Gender chart, stored as P2
P2 <- ggplot(X2, aes(x = Revenue, y = City)) +
  geom_line(aes(group = City), colour = "gray") +
  geom_point(aes(colour = Gender), size = 0.5) +
  geom_line(data = highlight, mapping = aes(group = City)) +
  geom_point(data = highlight, mapping = aes(colour = Gender), size = 2) +
  geom_text(
    data = right_label1,
    mapping = aes(colour = Gender, label = round(Revenue, 0)),
    size = 3,
    hjust = -0.5
  ) +
  geom_text(
    data = left_label1,
    mapping = aes(colour = Gender, label = round(Revenue, 0)),
    size = 3,
    hjust = 1.5
  )

# Convert the stored ggplot into an interactive plotly widget
ggplotly(P2)

# Interactive bar chart of total revenue per City, one panel per Gender.
# The rows are summed per City / Gender first: X holds one row per
# transaction, and dodged bars are not stacked, so plotting X directly would
# overlay one bar per transaction (showing only the largest single sale) and
# hand every one of those bars to plotly.
X0 <- X |>
  group_by(City, Gender) |>
  summarise(Revenue = sum(Revenue, na.rm = TRUE), .groups = "drop") |>
  ggplot(aes(City, Revenue, fill = Gender)) +
  geom_col(position = "dodge") +
  coord_flip() +
  facet_wrap(~Gender)
ggplotly(X0)

Biểu diễn dữ liệu thông qua 1 app trung gian từ R

# Install GWalkR when it is not available yet, then attach it.
# requireNamespace() only checks availability; library() does the attaching
# and stops with an error if the package still cannot be loaded.
if (!requireNamespace("GWalkR", quietly = TRUE)) {
  install.packages("GWalkR")
}
library(GWalkR)

Loading required package: GWalkR

# !formatR
library(GWalkR)
library(shiny)

# Data set handed to the GWalkR widget
Dat <- X

# Shiny app with a title panel and a single GWalkR output named "mygraph"
app <- shinyApp(
  ui = fluidPage(
    titlePanel("Explore the data here: "),
    gwalkrOutput("mygraph")
  ),
  server = function(input, output, session) {
    output$mygraph <- renderGwalkr(gwalkr(Dat))
  }
)

# Only start the app when R is running interactively
if (interactive()) {
  app
}