alisonlo — Apr 30, 2014, 12:03 PM
# Teens_SNS program uses SNS data "teenSNS.csv" to classify 1000 teenagers into 5 segments
# read the data set
teenager = read.csv("teenager.csv")
str(teenager)
'data.frame': 1000 obs. of 44 variables:
$ X : int 8628 23649 12269 26488 28211 1367 15840 26767 16539 13695 ...
$ X.1 : int 8628 23649 12269 26488 28211 1367 15840 26767 16539 13695 ...
$ gradyear : int 2007 2009 2007 2009 2009 2006 2008 2009 2008 2007 ...
$ gender : Factor w/ 2 levels "F","M": 1 1 1 1 1 NA NA 1 1 NA ...
$ age : num 17.2 16.2 17.2 16.5 15.2 ...
$ friends : int 10 37 0 16 71 26 22 56 74 1 ...
$ basketball : int 0 1 0 0 0 0 0 0 0 0 ...
$ football : int 0 0 0 0 0 0 0 0 0 0 ...
$ soccer : int 0 0 0 1 0 0 0 0 0 0 ...
$ softball : int 0 0 0 0 0 0 0 0 0 0 ...
$ volleyball : int 0 0 0 0 0 0 0 0 0 0 ...
$ swimming : int 0 0 0 0 0 0 0 0 0 0 ...
$ cheerleading: int 0 0 0 0 1 0 0 0 0 0 ...
$ baseball : int 0 0 0 0 0 0 0 0 0 0 ...
$ tennis : int 0 0 0 0 0 0 0 0 0 0 ...
$ sports : int 1 0 0 1 1 0 0 0 0 0 ...
$ cute : int 0 0 0 7 0 1 0 0 0 0 ...
$ sex : int 0 0 0 0 0 0 0 0 0 0 ...
$ sexy : int 0 0 0 0 1 0 0 0 0 0 ...
$ hot : int 0 0 0 0 0 0 0 0 0 0 ...
$ kissed : int 0 0 0 0 0 0 0 0 0 0 ...
$ dance : int 0 1 0 2 0 0 0 0 4 0 ...
$ band : int 0 0 0 0 0 0 0 0 0 0 ...
$ marching : int 0 0 0 0 0 0 0 0 0 0 ...
$ music : int 1 0 0 0 0 0 0 1 1 0 ...
$ rock : int 0 1 0 0 1 0 0 0 0 0 ...
$ god : int 0 1 0 2 0 0 0 1 1 0 ...
$ church : int 0 0 0 0 0 0 0 0 0 0 ...
$ jesus : int 0 0 0 0 0 0 0 0 0 0 ...
$ bible : int 0 0 0 0 0 0 0 0 0 0 ...
$ hair : int 0 0 0 0 0 0 0 1 1 0 ...
$ dress : int 0 0 0 0 0 0 0 0 0 0 ...
$ blonde : int 0 0 0 0 0 0 0 0 0 0 ...
$ mall : int 0 0 0 0 0 0 0 0 0 0 ...
$ shopping : int 0 1 0 1 0 0 0 1 1 0 ...
$ clothes : int 0 1 0 0 0 0 0 0 0 0 ...
$ hollister : int 0 0 0 1 0 0 0 0 0 0 ...
$ abercrombie : int 0 0 0 0 0 0 0 0 0 0 ...
$ die : int 0 0 0 2 0 0 0 0 0 0 ...
$ death : int 0 0 0 0 0 0 0 0 0 0 ...
$ drunk : int 0 0 0 0 0 0 0 0 0 0 ...
$ drugs : int 0 0 0 0 0 0 0 0 0 0 ...
$ female : int 1 1 1 1 1 0 0 1 1 0 ...
$ no_gender : int 0 0 0 0 0 1 1 0 0 1 ...
# We will be using the the words they frequently use in texting to segment the 1000 teenagers
# variables 7 to 42 are the keywords that reflect their interests
summary(teenager[7:10])
basketball football soccer softball
Min. : 0.000 Min. :0.00 Min. : 0.000 Min. :0.000
1st Qu.: 0.000 1st Qu.:0.00 1st Qu.: 0.000 1st Qu.:0.000
Median : 0.000 Median :0.00 Median : 0.000 Median :0.000
Mean : 0.272 Mean :0.27 Mean : 0.223 Mean :0.185
3rd Qu.: 0.000 3rd Qu.:0.00 3rd Qu.: 0.000 3rd Qu.:0.000
Max. :10.000 Max. :7.00 Max. :12.000 Max. :8.000
table(teenager$basketball)
0 1 2 3 4 5 6 10
852 85 30 19 8 2 3 1
# Notice that some words have higher means than others. Therefore without standardization it is
# difficult to compare whether mentioning 'music' or 'church' 3 times is considered frequent or not.
# Standardization converts the variable to z-score that has a mean of zero, and a value of 1 means it is
# at the 50+34=84 percentile.
# We take out all the interest keywords, standarize them as save as interests_z.
interests_z = as.data.frame(lapply(teenager[7:42], scale))
summary(interests_z[1:4])
basketball football soccer softball
Min. :-0.328 Min. :-0.372 Min. :-0.237 Min. :-0.24
1st Qu.:-0.328 1st Qu.:-0.372 1st Qu.:-0.237 1st Qu.:-0.24
Median :-0.328 Median :-0.372 Median :-0.237 Median :-0.24
Mean : 0.000 Mean : 0.000 Mean : 0.000 Mean : 0.00
3rd Qu.:-0.328 3rd Qu.:-0.372 3rd Qu.:-0.237 3rd Qu.:-0.24
Max. :11.722 Max. : 9.265 Max. :12.497 Max. :10.14
table(interests_z[1])
-0.327757304395905 0.877232785294923 2.08222287498575
852 85 30
3.28721296467658 4.49220305436741 5.69719314405824
19 8 2
6.90218323374906 11.7221435925124
3 1
# now the interests_z variables represent how often the word is mentioned relative to everybody (mean=0)
# We apply k-means clustering to these standardized interest variables
interest_cluster = kmeans(interests_z, centers=5)
# the result of the k-means clustering is contained in the object interest_cluster
names(interest_cluster)
[1] "cluster" "centers" "totss" "withinss"
[5] "tot.withinss" "betweenss" "size" "iter"
[9] "ifault"
# the sizes of the different clusters (segments)
interest_cluster$size
[1] 28 9 802 137 24
# the centroid of the segments (the average mention of the keywords)
round(interest_cluster$centers,2)
basketball football soccer softball volleyball swimming cheerleading
1 0.62 0.27 0.60 -0.05 0.03 0.62 -0.01
2 1.15 0.85 -0.24 0.48 -0.20 0.37 -0.22
3 -0.14 -0.09 -0.05 -0.12 -0.11 -0.05 -0.13
4 0.62 0.42 0.24 0.71 0.66 0.12 0.80
5 -0.18 -0.20 -0.24 -0.02 -0.20 0.06 -0.22
baseball tennis sports cute sex sexy hot kissed dance band
1 0.26 0.30 0.39 0.88 3.23 0.89 1.07 2.24 0.94 0.42
2 0.16 0.42 0.24 0.00 0.03 -0.29 -0.28 -0.19 0.01 0.42
3 -0.04 -0.03 -0.10 -0.14 -0.10 -0.06 -0.08 -0.07 -0.14 -0.11
4 0.18 0.07 0.51 0.66 -0.05 0.16 0.27 -0.02 0.54 -0.03
5 -0.20 0.15 -0.19 -0.05 -0.17 0.07 0.09 -0.19 0.36 3.21
marching music rock god church jesus bible hair dress blonde mall
1 -0.14 1.77 2.80 0.27 -0.14 0.09 -0.10 2.14 0.68 1.11 0.08
2 -0.14 0.61 0.29 6.44 1.76 6.62 4.94 0.45 -0.24 0.96 -0.02
3 -0.14 -0.13 -0.14 -0.11 -0.13 -0.08 -0.04 -0.15 -0.10 -0.09 -0.18
4 -0.11 0.28 0.18 0.18 0.68 -0.05 -0.06 0.47 0.46 0.28 1.06
5 5.35 0.45 0.25 -0.02 0.03 0.24 -0.10 -0.17 0.15 -0.20 0.02
shopping clothes hollister abercrombie die death drunk drugs
1 0.32 0.91 0.21 -0.03 1.57 0.68 2.15 2.80
2 -0.05 0.09 -0.23 -0.17 0.35 0.81 0.46 0.98
3 -0.17 -0.16 -0.17 -0.16 -0.07 -0.06 -0.06 -0.09
4 0.96 0.74 1.03 0.98 0.05 0.13 -0.14 -0.08
5 0.01 0.15 -0.23 -0.17 0.10 0.11 0.05 -0.18
# copy the cluster membership to the original data set in order to compare them on demographics
teenager$cluster = interest_cluster$cluster
# to make sure the copying is done properly (the frequency should match the previous 'size' command)
table(teenager$cluster)
1 2 3 4 5
28 9 802 137 24
# the proportion of females in each segment
tapply(teenager$female, teenager$cluster, mean)
1 2 3 4 5
0.8214 0.5556 0.7057 0.8905 0.5833
# a graphical representation (stacked bar plot) of the proportion of females
counts= table(teenager$female,teenager$cluster )
barplot(counts, main="Stacked Bar Plot", xlab="Segment", ylab="Number of females", legend=rownames(counts))
# a graphical representation as Spinogram
install.packages("vcd")
Error: trying to use CRAN without setting a mirror
library(vcd)
Loading required package: grid
counts2= table(teenager$cluster, teenager$female)
spine(counts2, main = "Spinogram", xlab="Segment", ylab="Proportions of females(=1)")
# the average number of friends for each segment
tapply(teenager$friends, teenager$cluster, mean)
1 2 3 4 5
41.39 27.33 26.93 37.34 26.54
# boxplot of the number of friends
boxplot(teenager$friends ~ teenager$cluster)
# boxplot removing the outliers
boxplot(teenager$friends ~ teenager$cluster, outline=FALSE)