ティーンのSNSページのテキストからクラスタリングする
30,000人分のアメリカの高校生の, 性別・年齢・SNSの友達数・関心分野
teens <- read.csv("snsdata.csv", header = T, stringsAsFactors = T)
str(teens)
## 'data.frame': 30000 obs. of 40 variables:
## $ gradyear : int 2006 2006 2006 2006 2006 2006 2006 2006 2006 2006 ...
## $ gender : Factor w/ 2 levels "F","M": 2 1 2 1 NA 1 1 2 1 1 ...
## $ age : num 19 18.8 18.3 18.9 19 ...
## $ friends : int 7 0 69 0 10 142 72 17 52 39 ...
## $ basketball : int 0 0 0 0 0 0 0 0 0 0 ...
## $ football : int 0 1 1 0 0 0 0 0 0 0 ...
## $ soccer : int 0 0 0 0 0 0 0 0 0 0 ...
## $ softball : int 0 0 0 0 0 0 0 1 0 0 ...
## $ volleyball : int 0 0 0 0 0 0 0 0 0 0 ...
## $ swimming : int 0 0 0 0 0 0 0 0 0 0 ...
## $ cheerleading: int 0 0 0 0 0 0 0 0 0 0 ...
## $ baseball : int 0 0 0 0 0 0 0 0 0 0 ...
## $ tennis : int 0 0 0 0 0 0 0 0 0 0 ...
## $ sports : int 0 0 0 0 0 0 0 0 0 0 ...
## $ cute : int 0 1 0 1 0 0 0 0 0 1 ...
## $ sex : int 0 0 0 0 1 1 0 2 0 0 ...
## $ sexy : int 0 0 0 0 0 0 0 1 0 0 ...
## $ hot : int 0 0 0 0 0 0 0 0 0 1 ...
## $ kissed : int 0 0 0 0 5 0 0 0 0 0 ...
## $ dance : int 1 0 0 0 1 0 0 0 0 0 ...
## $ band : int 0 0 2 0 1 0 1 0 0 0 ...
## $ marching : int 0 0 0 0 0 1 1 0 0 0 ...
## $ music : int 0 2 1 0 3 2 0 1 0 1 ...
## $ rock : int 0 2 0 1 0 0 0 1 0 1 ...
## $ god : int 0 1 0 0 1 0 0 0 0 6 ...
## $ church : int 0 0 0 0 0 0 0 0 0 0 ...
## $ jesus : int 0 0 0 0 0 0 0 0 0 2 ...
## $ bible : int 0 0 0 0 0 0 0 0 0 0 ...
## $ hair : int 0 6 0 0 1 0 0 0 0 1 ...
## $ dress : int 0 4 0 0 0 1 0 0 0 0 ...
## $ blonde : int 0 0 0 0 0 0 0 0 0 0 ...
## $ mall : int 0 1 0 0 0 0 2 0 0 0 ...
## $ shopping : int 0 0 0 0 2 1 0 0 0 1 ...
## $ clothes : int 0 0 0 0 0 0 0 0 0 0 ...
## $ hollister : int 0 0 0 0 0 0 2 0 0 0 ...
## $ abercrombie : int 0 0 0 0 0 0 0 0 0 0 ...
## $ die : int 0 0 0 0 0 0 0 0 0 0 ...
## $ death : int 0 0 1 0 0 0 0 0 0 0 ...
## $ drunk : int 0 0 0 0 1 1 0 0 0 0 ...
## $ drugs : int 0 0 0 0 1 0 0 0 0 0 ...
table(teens$gender, useNA = "ifany")
##
## F M <NA>
## 22054 5222 2724
summary(teens$age)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 3.086 16.312 17.287 17.994 18.259 106.927 5086
高校生の年齢としておかしいものは欠損値に変える
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.0 ──
## ✓ ggplot2 3.3.3 ✓ purrr 0.3.4
## ✓ tibble 3.1.0 ✓ dplyr 1.0.5
## ✓ tidyr 1.1.3 ✓ stringr 1.4.0
## ✓ readr 1.4.0 ✓ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
# 欠損値処理
teens %>%
## 年齢
mutate(age = ifelse(age < 20 & age >= 13, age, NA)) -> teens
## 性別
summary(teens$age)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 13.03 16.30 17.27 17.25 18.22 20.00 5523
## ダミーコーディング
### 女性と性別不明ダミー
teens %>%
mutate(female = if_else(gender == "F" & !is.na(gender), 1, 0),
no_gender = if_else(is.na(gender), 1, 0)) -> teens
table(teens$gender, useNA = "ifany")
##
## F M <NA>
## 22054 5222 2724
table(teens$female, useNA = "ifany")
##
## 0 1
## 7946 22054
table(teens$no_gender, useNA = "ifany")
##
## 0 1
## 27276 2724
## 欠損値の補完
### 年齢を補完する
mean(teens$age, na.rm = T)
## [1] 17.25243
aggregate(x = teens[c("age")], by = list(teens$gradyear),
FUN = mean, na.rm = T)
## Group.1 age
## 1 2006 18.65586
## 2 2007 17.70617
## 3 2008 16.76770
## 4 2009 15.81957
## 卒業年で年齢を推定する
## 卒業年ごとの平均年齢を代入する
teens %>%
group_by(gradyear) %>%
mutate(ave_age = mean(age, na.rm = T)) %>%
mutate(age = if_else(is.na(age), ave_age, age)) -> teens
## 欠損値が消えた
summary(teens$age)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 13.03 16.28 17.24 17.24 18.21 20.00
statsパッケージでk-means法を実施する
library(stats)
## 特徴量だけのデータフレームを作成
interests <- teens[5:40]
## 正規化
interests_z <- scale(interests) %>%
as.data.frame()
summary(interests_z$basketball)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -0.3322 -0.3322 -0.3322 0.0000 -0.3322 29.4923
# k=5で分割
teen_clusters <- kmeans(interests_z, 5)
## 各クラスタのサイズ
teen_clusters$size
## [1] 20541 986 5087 868 2518
## 各クラスタのセントロイドの座標
### +であれば関心が強く, -であれば関心が弱め
teen_clusters$centers
## basketball football soccer softball volleyball swimming
## 1 -0.18471786 -0.18845642 -0.08040292 -0.13447193 -0.13077930 -0.1080758
## 2 0.34332617 0.36267270 0.11427889 0.13033068 0.08696035 0.2598940
## 3 -0.04346466 0.02913756 0.05139085 -0.07096261 -0.06331635 0.2929004
## 4 0.14167017 0.22213753 0.10379744 0.02815663 0.16699415 0.2392009
## 5 1.41139990 1.25990865 0.47154728 1.17959841 1.10315092 0.1056866
## cheerleading baseball tennis sports cute sex
## 1 -0.10670753 -0.13851514 -0.04453516 -0.15243746 -0.197719775 -0.0984615843
## 2 0.16930409 0.22070549 0.11749380 0.77073031 0.483632533 2.1056013259
## 3 0.28074961 -0.08583049 0.07633162 -0.09139197 0.639285999 0.0004868018
## 4 0.39745979 0.01969240 0.13637546 0.09662301 0.377235569 0.0217809425
## 5 0.09999096 1.21014720 0.11607423 1.09305795 0.001990418 -0.0297878165
## sexy hot kissed dance band marching
## 1 -0.096165349 -0.14280363 -0.13599590 -0.173755162 -0.12284436 -0.08918994
## 2 0.546791238 0.30728406 3.10478416 0.421169428 0.42478899 0.05994389
## 3 0.257430315 0.43898448 -0.02121897 0.585926434 0.46579024 0.40088720
## 4 0.118735199 0.41623474 0.06655066 0.224905488 -0.08861319 -0.09326362
## 5 0.009366997 0.01427383 -0.08643781 -0.008734715 -0.07468411 -0.07363566
## music rock god church jesus bible
## 1 -0.16563106 -0.13099155 -0.129959068 -0.16382375 -0.092055529 -0.07712600
## 2 1.13496832 1.17568053 0.408642171 0.17061756 0.105484524 0.06921067
## 3 0.39832473 0.20934747 0.435244108 0.58184060 0.352437384 0.32033414
## 4 0.14586276 0.06842745 0.035267938 -0.01161520 0.015301587 -0.03669010
## 5 0.05173231 0.16168816 0.008684947 0.09814675 -0.007635373 -0.03244217
## hair dress blonde mall shopping clothes
## 1 -0.21092142 -0.15916229 -0.02917974 -0.19057189 -0.24173679 -0.19615172
## 2 2.54884945 0.51808596 0.36574690 0.61694756 0.26313226 1.20933731
## 3 0.27096137 0.54783787 0.01464754 0.54015363 0.77023285 0.45192850
## 4 0.44616998 0.15298455 0.06176156 0.60870187 0.79614983 0.57306801
## 5 0.02133253 -0.06398806 0.04393711 0.01195874 0.03845691 0.01602965
## hollister abercrombie die death drunk drugs
## 1 -0.15598772 -0.1485480 -0.10029465 -0.09490037 -0.09110576 -0.11445965
## 2 0.16410482 0.2595749 1.75304512 0.93425619 1.89121987 2.85135813
## 3 -0.06485679 -0.0761444 0.09300957 0.20112474 0.02832696 -0.05201458
## 4 4.14729831 3.9833955 0.04833458 0.09981960 0.03938611 0.03849076
## 5 -0.09038606 -0.1091567 -0.07285312 -0.03240395 -0.06815955 -0.09099888
浮かび上がった数字からパターンを見つけていく
## それぞれのクラスター分けの結果を元のデータにくっつける
teens$cluster <- teen_clusters$cluster
クラスターごとの集計が可能になる
teens %>%
group_by(cluster) %>%
select(cluster, gender, age, friends) %>%
head()
## # A tibble: 6 x 4
## # Groups: cluster [3]
## cluster gender age friends
## <int> <fct> <dbl> <int>
## 1 1 M 19.0 7
## 2 3 F 18.8 0
## 3 1 M 18.3 69
## 4 1 F 18.9 0
## 5 2 <NA> 19.0 10
## 6 3 F 18.7 142
teens %>%
group_by(cluster) %>%
select(cluster, gender, age, friends) %>%
summarise(mean_age = mean(age),
mean_friends = mean(friends))
## # A tibble: 5 x 3
## cluster mean_age mean_friends
## <int> <dbl> <dbl>
## 1 1 17.3 27.5
## 2 2 17.1 30.7
## 3 3 17.2 36.2
## 4 4 16.9 41.3
## 5 5 17.0 35.5
teens %>%
group_by(cluster) %>%
select(cluster, female, no_gender) %>%
summarise(mean_female = mean(female),
mean_no_gender = mean(no_gender))
## # A tibble: 5 x 3
## cluster mean_female mean_no_gender
## <int> <dbl> <dbl>
## 1 1 0.701 0.104
## 2 2 0.800 0.0568
## 3 3 0.865 0.0664
## 4 4 0.836 0.0772
## 5 5 0.693 0.0500