1. Packages và đọc dữ liệu

library(haven)
library(tidyverse)

## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──

## ✓ ggplot2 3.3.5     ✓ purrr   0.3.4
## ✓ tibble  3.1.6     ✓ dplyr   1.0.8
## ✓ tidyr   1.2.0     ✓ stringr 1.4.0
## ✓ readr   2.1.1     ✓ forcats 0.5.1

## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()

library(labelled)

# Read the survey responses from the SPSS file.
dat <- read_sav("./Data2.sav")

# Lookup table with one row per column of `dat`: the column name and the
# variable label returned by labelled::var_label().
var.label <- data.frame(
  var   = colnames(dat),
  label = var_label(dat, unlist = TRUE)
)

2. Ngành hoạt động chính

# Horizontal bar chart: number of surveyed firms per main industry (c4).
dat %>%
  mutate(
    # Recode the numeric codes 1-6 of c4 into named industry categories.
    c4_1 = factor(
      c4,
      levels = 1:6,
      labels = c(
        "Công nghiệp",
        "Thương mại",
        "Thông tin truyền thông",
        "Công nghệ thông tin",
        "Tài chính – bảo hiểm",
        "Khác"
      )
    ),
    .after = c4
  ) %>%
  # One row per industry, with the number of firms in `count`.
  count(c4_1, name = "count") %>%
  # Bars are ordered by their count rather than by the factor level order.
  ggplot(aes(x = reorder(c4_1, count), y = count)) +
  geom_col(width = 0.6, fill = "#124b92") +
  coord_flip() +
  theme_minimal() +
  labs(
    title = "Số lượng doanh nghiệp được khảo sát phân theo ngành",
    x = "Ngành hoạt động chính của doanh nghiệp",
    y = "Số lượng doanh nghiệp"
  )

  # theme(
  #   axis.title.y = element_text(margin = margin(r = 20), vjust = 0.2),
  #   axis.title.x = element_text(margin = margin(t = 20), hjust = 0.3),
  #   plot.title = element_text(face = 'bold', hjust = -0.3, margin = margin(b = 10))
  # )

3. Các câu hỏi Có/Không (c5-c8, c11-c13, c18)

Tỷ lệ phần trăm (%) trả lời Có/Không của từng câu hỏi

# 100% stacked bar chart of the answers to items c5-c8, c11-c13 and c18.
dat %>%
  select(c5:c8, c11:c13, c18) %>%
  # Long format: `var` holds the question code, `value1` the answer code.
  gather(key = "var", value = "value1", everything()) %>%
  mutate(
    # Replace question codes with their wording; the level order below is
    # the order in which the bars are laid out.
    var = factor(
      var,
      levels = c("c12", "c18", "c11", "c13", "c6", "c5", "c8", "c7"),
      labels = c(
        "Sử dụng chữ ký điện tử, chữ ký số trong giao dịch",
        "Triển khai các hoạt động nhằm bảo đảm an toàn, an ninh mạng",
        "Sử dụng các nền tảng thanh toán trực tuyến, di động",
        "Sử dụng mạng xã hội phục vụ hoạt động sản xuất kinh doanh",
        "Bố trí tổ chức, nhân sự chuyên trách cho hoạt động Chuyển đổi số",
        "Nhận được các tài liệu tuyên truyền (quyển, tờ rơi, video clip) về Chuyển đổi số",
        "Xây dựng kế hoạch, chương trình hành động cụ thể cho hoạt động Chuyển đổi số",
        "Phân bổ ngân sách riêng biệt cho hoạt động Chuyển đổi số"
      ),
      ordered = TRUE
    ),
    # Answer code 2 is labelled "Không" and code 1 is labelled "Có".
    value1 = factor(
      value1,
      levels = c(2, 1),
      labels = c("Không", "Có"),
      ordered = TRUE
    )
  ) %>%
  ggplot(aes(x = var, fill = value1)) +
  # position = "fill" rescales every bar to the same total height.
  geom_bar(position = "fill") +
  scale_fill_manual(values = c("#dddddd", "#124b92")) +
  guides(fill = guide_legend(title = "")) +
  coord_flip() +
  labs(y = "Tỷ lệ phần trăm") +
  theme_minimal() +
  theme(
    axis.title.y = element_blank()
    # axis.title.x = element_text(margin = margin(t = 20))
  )

## Warning: attributes are not identical across measure variables;
## they will be dropped

4. Câu hỏi cụm

Câu 16_1 - 16_4

# 100% stacked bar chart of the answers to the c16_* items.
dat %>%
  select(starts_with("c16")) %>%
  # Long format: `var` holds the item code, `value1` the answer code.
  gather(key = "var", value = "value1", everything()) %>%
  mutate(
    # Replace item codes with their wording; the level order below is the
    # order in which the bars are laid out.
    var = factor(
      var,
      levels = c("c16_4", "c16_1", "c16_3", "c16_2"),
      labels = c(
        "Thực hiện qua các hình thức khác",
        "Thực hiện trực tiếp thông qua hệ thống website tự xây dựng",
        "Thực hiện qua mạng xã hội",
        "Thực hiện qua các sàn thương mại điện tử"
      ),
      ordered = TRUE
    ),
    # Answer code 0 is labelled "Không" and code 1 is labelled "Có".
    value1 = factor(
      value1,
      levels = c(0, 1),
      labels = c("Không", "Có"),
      ordered = TRUE
    )
  ) %>%
  ggplot(aes(x = var, fill = value1)) +
  # position = "fill" rescales every bar to the same total height.
  geom_bar(position = "fill", width = 0.5) +
  scale_fill_manual(values = c("#dddddd", "#124b92")) +
  guides(fill = guide_legend(title = "")) +
  coord_flip() +
  labs(y = "Tỷ lệ phần trăm") +
  theme_minimal() +
  theme(
    axis.title.y = element_blank()
    # axis.title.x = element_text(margin = margin(t = 20))
  )

## Warning: attributes are not identical across measure variables;
## they will be dropped

Câu 17_1 - 17_4

# 100% stacked bar chart of the answers to the c17_* items.
dat %>%
  select(starts_with("c17")) %>%
  # Long format: `var` holds the item code, `value1` the answer code.
  gather(key = "var", value = "value1", everything()) %>%
  mutate(
    # Replace item codes with their wording; the level order below is the
    # order in which the bars are laid out.
    var = factor(
      var,
      levels = c("c17_4", "c17_1", "c17_2", "c17_3"),
      labels = c(
        "Thanh toán bằng hình thức khác",
        "Chuyển khoản qua dịch vụ ngân hàng điện tử",
        "Thanh toán bằng thẻ tín dụng",
        "Thanh toán bằng ví điện tử"
      ),
      ordered = TRUE
    ),
    # Answer code 0 is labelled "Không" and code 1 is labelled "Có".
    value1 = factor(
      value1,
      levels = c(0, 1),
      labels = c("Không", "Có"),
      ordered = TRUE
    )
  ) %>%
  ggplot(aes(x = var, fill = value1)) +
  # position = "fill" rescales every bar to the same total height.
  geom_bar(position = "fill", width = 0.5) +
  scale_fill_manual(values = c("#dddddd", "#124b92")) +
  guides(fill = guide_legend(title = "")) +
  coord_flip() +
  labs(y = "Tỷ lệ phần trăm") +
  theme_minimal() +
  theme(
    axis.title.x = element_text(margin = margin(t = 20)),
    axis.title.y = element_blank()
  )

## Warning: attributes are not identical across measure variables;
## they will be dropped

5. Xử lý trực quan trên bản đồ Việt Nam

library(readxl)

# Load the registry workbook, discard every row that has a missing value in
# any column, and keep only the rows whose `tinh_thanh` is "Hà Nội".
mst <- read_excel("./data.json.xlsx") %>%
  drop_na() %>%
  filter(tinh_thanh == "Hà Nội")

# Build a text join key from c2 and attach the matching registry row.
# Firms without a match in `mst` are dropped by the inner join.
dat <- dat %>%
  mutate(
    # Prefix every non-empty code with "0" so it can be compared with the
    # text key `mst`. Presumably the leading zero is lost because c2 is
    # stored as a number -- TODO confirm against the raw data.
    # The explicit `> 0` test and else-branch replace an ifelse() call that
    # had no `no` argument and therefore failed on any empty code.
    c2_1 = ifelse(
      nchar(as.character(c2)) > 0,
      paste0("0", as.character(c2)),
      as.character(c2)
    ),
    .after = c2
  ) %>%
  inner_join(mst, by = c("c2_1" = "mst"))

# Strip the first word and the single whitespace character after it from
# each district name, so the names can later be joined to the map data.
# NOTE(review): a two-word prefix would lose only its first word -- confirm
# that no such value occurs in `quan_huyen`.
dat <- mutate(dat, quan_huyen = sub("^\\w+\\s", "", quan_huyen))

library(stringi)

# Fetch the level-2 (district) boundaries of Vietnam from GADM, a database
# of global administrative boundaries (see ?getData).
vietnam_dis <- raster::getData("GADM", country = "Vietnam", level = 2)

# Restrict the boundaries to the rows whose NAME_1 is "Hà Nội".
hanoi <- vietnam_dis[vietnam_dis$NAME_1 == "Hà Nội", ]

# Flatten the spatial object into a plain data frame for ggplot2.
# `region` names the variable used to split up the regions; inspect
# as.data.frame(hanoi) to find the right variable name.
hanoi_df <- fortify(hanoi, region = "NAME_2")
# Optional ASCII transliteration of the region ids (see ?stri_trans_general):
# hanoi_df <- mutate(hanoi_df, id = stri_trans_general(id, "Latin-ASCII"))

# some_districts <- c("Hoan Kiem", "Hai Ba Trung", "Ba Dinh", "Tay Ho", 
#                     "Hoang Mai", "Thanh Xuan", "Nam Tu Liem", "Bac Tu Liem",
#                     "Cau Giay", "Dong Da", "Long Bien")

# Choropleth of the Hanoi districts, filled by the district mean of c14.
dat %>%
  group_by(quan_huyen) %>%
  # summarise() yields exactly one row per district. The previous
  # transmute() kept one row per firm, so the join below repeated every
  # polygon vertex once for each firm in the district.
  # NOTE(review): mean() returns NA for a district as soon as one of its
  # firms has a missing c14 -- confirm whether na.rm = TRUE is wanted.
  summarise(mean_c14 = mean(c14)) %>%
  # right_join keeps every polygon vertex, including districts without
  # any surveyed firm (their mean is NA).
  right_join(hanoi_df, by = c("quan_huyen" = "id")) %>%
  ggplot(aes(fill = mean_c14, x = long, y = lat, group = group)) +
  geom_polygon(color = "gray85", size = .5) +
  labs(
    fill = "%",
    title = "Tỷ lệ (%) lao động được tham gia ít nhất một khoá đào tạo, tập huấn về Chuyển đổi số, Kỹ năng số"
  ) +
  # Hide the coordinate axes.
  theme(
    axis.text = element_blank(),
    axis.ticks = element_blank(),
    axis.title = element_blank()
  )

# Choropleth of the Hanoi districts, filled by the district mean of c15.
dat %>%
  group_by(quan_huyen) %>%
  # summarise() yields exactly one row per district. The previous
  # transmute() kept one row per firm, so the join below repeated every
  # polygon vertex once for each firm in the district.
  # NOTE(review): mean() returns NA for a district as soon as one of its
  # firms has a missing c15 -- confirm whether na.rm = TRUE is wanted.
  summarise(mean_c15 = mean(c15)) %>%
  # right_join keeps every polygon vertex, including districts without
  # any surveyed firm (their mean is NA).
  right_join(hanoi_df, by = c("quan_huyen" = "id")) %>%
  ggplot(aes(fill = mean_c15, x = long, y = lat, group = group)) +
  geom_polygon(color = "gray85", size = .5) +
  labs(
    fill = "%",
    title = "Tỷ lệ (%) lao động trong doanh nghiệp hoạt động trong các vị trí liên quan đến công nghệ số"
  ) +
  # Hide the coordinate axes.
  theme(
    axis.text = element_blank(),
    axis.ticks = element_blank(),
    axis.title = element_blank()
  )

Báo cáo AI và Digitalization

Khánh

2022-09-17

1. Packages và đọc dữ liệu

2. Ngành hoạt động chính

3. Các câu hỏi Có/Không (c5-c8, c11-c13, c18)

4. Câu hỏi cụm

Câu 16_1 - 16_4

Câu 17_1 - 17_4

5. Xử lý trực quan trên bản đồ Việt Nam