R Notebook

頤珍公司資料個案讀書會

第一周

開會時間：2021/03/30（二）
主要目標：探索性資料分析

統計性分析
分群：統計分群、人格特質分群

資料集：

1. 人格特質 BIG FIVE 問卷
   + BIGFIVE_doula.xlsx
   + BIGFIVE_customer.xlsx
2. BIG FIVE 問卷對應
   + Big5_item.csv
3. 月嫂個人基本資料
   + doula_BasicInfo.xlsx

五大人格特質介紹：
+ Surgency 外向性
+ Agreeableness 親和性
+ Adjustment 適應性（情緒穩定性）
+ Conscientiousness 盡責性
+ Openness_to_Experience 開放性

基本參數設定

# Switch every locale category to Traditional Chinese (UTF-8) so that Chinese
# text is not garbled. Sys.setlocale() returns "" when the requested locale
# cannot be honoured on the current system.
Sys.setlocale("LC_ALL", "zh_TW.UTF-8")

## [1] ""

pacman::p_load(readr,dplyr,readxl,tidyr,stringr)
pacman::p_load(FactoMineR, factoextra)

讀入基本資料

# Load the raw workbooks ----
# List every Excel workbook under ../data. The pattern is anchored so that only
# real "*.xlsx" files match, and names starting with "~" are skipped so that
# the lock files Excel creates while a workbook is open cannot shift the
# positional indices used below.
d <- dir("../data", pattern = "^[^~].*\\.xlsx$", full.names = TRUE)

# The assignments below depend on the sorted order of the file names.
# NOTE(review): confirm which workbook lands in each position.
stopifnot(length(d) >= 4)
c5 <- read_xlsx(d[1])   # customers' questionnaire answers
m5 <- read_xlsx(d[2])   # doulas' questionnaire answers
sat <- read_xlsx(d[3])  # basic information
item <- read_xlsx(d[4]) # NOTE(review): presumably the item-to-category mapping; confirm
item <- item[, c(1, 4)]

# Columns 1-2 of every sheet are renamed to time / name.
names(c5)[1:2] <- c("time", "name")
names(m5)[1:2] <- c("time", "name")
names(sat)[1:2] <- c("time", "name")

# Columns 3-27 (25 columns) are relabelled from item$category; fail loudly
# instead of recycling or truncating when the mapping has a different length.
stopifnot(length(item$category) == 25)
names(c5)[3:27] <- item$category
names(m5)[3:27] <- item$category

# Doulas that have basic information but never filled in the questionnaire
# (names present in sat but missing from m5):
# a = setdiff(sat$name, m5$name);
# which(a %in% c5$name) # check that these people are not in the customer list either

# Keep only the first row of anyone who submitted the form more than once.
c5_a <- c5[!duplicated(c5$name), ]
m5_a <- m5[!duplicated(m5$name), ]

算pca要用的matrix

# Centre each respondent's item scores on that respondent's own mean, so that
# people who answer uniformly high or uniformly low become comparable.
# scale() works column-wise, hence the transpose before and after. Only the
# mean is removed (center = TRUE, scale = FALSE); the spread is left untouched.
center_rows <- function(scores) {
  scores %>%
    t() %>%
    scale(center = TRUE, scale = FALSE) %>%
    t() %>%
    data.frame()
}
c5_x <- center_rows(c5_a[, 3:27])
m5_x <- center_rows(m5_a[, 3:27])

# Alternative kept for reference: min-max rescale of every row to [0, 1].
# c5_x = t(apply(c5_a[,3:27], 1, function(x)(x-min(x))/(max(x)-min(x))))
# m5_x = t(apply(m5_a[,3:27], 1, function(x)(x-min(x))/(max(x)-min(x))))

# Category labels: the item column names with any trailing "_<number>" removed.
# sub() strips the whole numeric suffix, so multi-digit suffixes leave no stray
# fragments behind.
prefix <- unique(sub("_[0-9]+$", "", names(c5)[3:27]))

# Average the centred item scores of each category; the result has one row per
# respondent and one column per category.
#   scores     data frame of centred item scores
#   categories character vector of category labels, matched literally against
#              the column names of `scores`
category_row_means <- function(scores, categories) {
  vapply(
    categories,
    function(category) {
      in_category <- grepl(category, names(scores), fixed = TRUE)
      # drop = FALSE keeps a data frame even when only one column matches
      rowMeans(scores[, in_category, drop = FALSE])
    },
    numeric(nrow(scores))
  )
}
c5_rM <- category_row_means(c5_x, prefix)
m5_rM <- category_row_means(m5_x, prefix)

顧客（c5）的pca

# K-means with 5 clusters on the standardised category means in c5_rM.
# The seed is fixed so the random starting centres are reproducible.
set.seed(123)
c5_kmg <- factor(kmeans(scale(c5_rM), centers = 5)$cluster)
# Number of respondents assigned to each cluster
table(c5_kmg)

## c5_kmg
##  1  2  3  4  5 
## 10 13 14  7 15

# Principal component analysis of the category means in c5_rM
pca <- PCA(c5_rM)

# Alternative kept for reference
# c5_pca <- princomp(c5_rM) #

# Biplot on components 1 and 2: variables in black, individuals coloured by
# their k-means cluster with one ellipse per cluster, legend hidden.
fviz_pca_biplot(
  pca,
  axes = c(1, 2),
  col.var = "black",
  labelsize = 3,
  col.ind = c5_kmg,
  alpha.ind = 0.6,
  pointshape = 16,
  addEllipses = TRUE,
  ellipse.level = 0.65,
  mean.point = FALSE
) +
  theme(legend.position = "none")

月嫂（m5）的pca

# K-means with 5 clusters on the standardised category means in m5_rM.
# The seed is fixed so the random starting centres are reproducible.
set.seed(123)
m5_kmg <- factor(kmeans(scale(m5_rM), centers = 5)$cluster)
# Number of respondents assigned to each cluster
table(m5_kmg)

## m5_kmg
##  1  2  3  4  5 
## 21 23 26 35 27

# Principal component analysis of the category means in m5_rM
# (this overwrites the `pca` object of the previous section)
pca <- PCA(m5_rM)

# Biplot on components 1 and 2: variables in black, individuals coloured by
# their k-means cluster with one ellipse per cluster, legend hidden.
fviz_pca_biplot(
  pca,
  axes = c(1, 2),
  col.var = "black",
  labelsize = 3,
  col.ind = m5_kmg,
  alpha.ind = 0.6,
  pointshape = 16,
  addEllipses = TRUE,
  ellipse.level = 0.65,
  mean.point = FALSE
) +
  theme(legend.position = "none")

以princomp降維

看一下各主成份所解釋的變異比例。loadings：變數負荷量矩陣（the matrix of variable loadings），每一欄是一個特徵向量（eigenvector）。由於每位受測者的題目分數已先減去個人平均，五個構面平均數的總和恆為 0，所以第五主成份的變異數為 0（各負荷量皆為 1/√5 ≈ 0.447）。

# Customers: princomp() on the category means, using the covariance matrix
# (cor = FALSE) rather than the correlation matrix.
c5_pca <- princomp(c5_rM, cor = FALSE)
# Standard deviation and share of variance of each component
summary(c5_pca)

## Importance of components:
##                           Comp.1    Comp.2    Comp.3     Comp.4       Comp.5
## Standard deviation     0.7336369 0.6483998 0.4852933 0.33087691 3.332001e-08
## Proportion of Variance 0.4128635 0.3225001 0.1806562 0.08398023 8.516367e-16
## Cumulative Proportion  0.4128635 0.7353636 0.9160198 1.00000000 1.000000e+00

# Show every loading at 8 significant digits; cutoff = 0 stops print() from
# blanking out the small ones.
print(loadings(c5_pca), digits = 8, cutoff = 0)

## 
## Loadings:
##                        Comp.1      Comp.2      Comp.3      Comp.4     
## Surgency                0.41726151  0.60276630  0.47229816  0.19874622
## Agreeableness           0.65616413 -0.50225957 -0.33710369 -0.05954045
## Adjustment             -0.38886935 -0.51012036  0.61468914 -0.10351378
## Conscientiousness      -0.41232613  0.06286730 -0.42125644  0.66975957
## Openness_to_Experience -0.27223016  0.34674634 -0.32862717 -0.70545156
##                        Comp.5     
## Surgency                0.44721360
## Agreeableness           0.44721360
## Adjustment              0.44721360
## Conscientiousness       0.44721360
## Openness_to_Experience  0.44721360
## 
##                Comp.1 Comp.2 Comp.3 Comp.4 Comp.5
## SS loadings       1.0    1.0    1.0    1.0    1.0
## Proportion Var    0.2    0.2    0.2    0.2    0.2
## Cumulative Var    0.2    0.4    0.6    0.8    1.0

# Base-graphics biplot of the customer princomp fit
stats::biplot(c5_pca)

# Doulas: princomp() on the category means, using the covariance matrix
# (cor = FALSE) rather than the correlation matrix.
m5_pca <- princomp(m5_rM, cor = FALSE)
# Standard deviation and share of variance of each component
summary(m5_pca)

## Importance of components:
##                           Comp.1    Comp.2    Comp.3     Comp.4 Comp.5
## Standard deviation     0.7288911 0.6176855 0.5052674 0.33092539      0
## Proportion of Variance 0.4158361 0.2986287 0.1998202 0.08571503      0
## Cumulative Proportion  0.4158361 0.7144648 0.9142850 1.00000000      1

# Show every loading at 8 significant digits; cutoff = 0 stops print() from
# blanking out the small ones.
print(loadings(m5_pca), digits = 8, cutoff = 0)

## 
## Loadings:
##                        Comp.1      Comp.2      Comp.3      Comp.4     
## Surgency                0.73619841  0.45631418  0.22312395  0.00223143
## Agreeableness           0.30383180 -0.77556909 -0.31178171 -0.09471530
## Adjustment             -0.37184036 -0.21560330  0.76996261  0.14969148
## Conscientiousness      -0.37988821  0.28677486 -0.22216762 -0.72393831
## Openness_to_Experience -0.28830164  0.24808336 -0.45913723  0.66673069
##                        Comp.5     
## Surgency                0.44721360
## Agreeableness           0.44721360
## Adjustment              0.44721360
## Conscientiousness       0.44721360
## Openness_to_Experience  0.44721360
## 
##                Comp.1 Comp.2 Comp.3 Comp.4 Comp.5
## SS loadings       1.0    1.0    1.0    1.0    1.0
## Proportion Var    0.2    0.2    0.2    0.2    0.2
## Cumulative Var    0.2    0.4    0.6    0.8    1.0

# Base-graphics biplot of the doula princomp fit
stats::biplot(m5_pca)

# Record which cluster each customer (c5) and each doula (m5) was assigned to.
# cbind() discards the factor class and would fall back on the internal
# integer codes, so the cluster labels are converted explicitly instead.
# The result is still a two-column character matrix whose first column keeps
# its original name; the second column is now labelled "name".
c5_kmg <- cbind(c5_kmg = as.character(c5_kmg), name = c5_a$name)
m5_kmg <- cbind(m5_kmg = as.character(m5_kmg), name = m5_a$name)

R Notebook

劉岱宜

算pca要用的matrix

顧客（c5）的pca

月嫂（m5）的pca

以princomp降維