teens_SNS.R

alisonlo — Apr 30, 2014, 12:03 PM

# Teens_SNS program uses SNS data "teenSNS.csv" to classify 1000 teenagers into 5 segments
# read the data set
teenager = read.csv("teenager.csv")
str(teenager)
'data.frame':   1000 obs. of  44 variables:
 $ X           : int  8628 23649 12269 26488 28211 1367 15840 26767 16539 13695 ...
 $ X.1         : int  8628 23649 12269 26488 28211 1367 15840 26767 16539 13695 ...
 $ gradyear    : int  2007 2009 2007 2009 2009 2006 2008 2009 2008 2007 ...
 $ gender      : Factor w/ 2 levels "F","M": 1 1 1 1 1 NA NA 1 1 NA ...
 $ age         : num  17.2 16.2 17.2 16.5 15.2 ...
 $ friends     : int  10 37 0 16 71 26 22 56 74 1 ...
 $ basketball  : int  0 1 0 0 0 0 0 0 0 0 ...
 $ football    : int  0 0 0 0 0 0 0 0 0 0 ...
 $ soccer      : int  0 0 0 1 0 0 0 0 0 0 ...
 $ softball    : int  0 0 0 0 0 0 0 0 0 0 ...
 $ volleyball  : int  0 0 0 0 0 0 0 0 0 0 ...
 $ swimming    : int  0 0 0 0 0 0 0 0 0 0 ...
 $ cheerleading: int  0 0 0 0 1 0 0 0 0 0 ...
 $ baseball    : int  0 0 0 0 0 0 0 0 0 0 ...
 $ tennis      : int  0 0 0 0 0 0 0 0 0 0 ...
 $ sports      : int  1 0 0 1 1 0 0 0 0 0 ...
 $ cute        : int  0 0 0 7 0 1 0 0 0 0 ...
 $ sex         : int  0 0 0 0 0 0 0 0 0 0 ...
 $ sexy        : int  0 0 0 0 1 0 0 0 0 0 ...
 $ hot         : int  0 0 0 0 0 0 0 0 0 0 ...
 $ kissed      : int  0 0 0 0 0 0 0 0 0 0 ...
 $ dance       : int  0 1 0 2 0 0 0 0 4 0 ...
 $ band        : int  0 0 0 0 0 0 0 0 0 0 ...
 $ marching    : int  0 0 0 0 0 0 0 0 0 0 ...
 $ music       : int  1 0 0 0 0 0 0 1 1 0 ...
 $ rock        : int  0 1 0 0 1 0 0 0 0 0 ...
 $ god         : int  0 1 0 2 0 0 0 1 1 0 ...
 $ church      : int  0 0 0 0 0 0 0 0 0 0 ...
 $ jesus       : int  0 0 0 0 0 0 0 0 0 0 ...
 $ bible       : int  0 0 0 0 0 0 0 0 0 0 ...
 $ hair        : int  0 0 0 0 0 0 0 1 1 0 ...
 $ dress       : int  0 0 0 0 0 0 0 0 0 0 ...
 $ blonde      : int  0 0 0 0 0 0 0 0 0 0 ...
 $ mall        : int  0 0 0 0 0 0 0 0 0 0 ...
 $ shopping    : int  0 1 0 1 0 0 0 1 1 0 ...
 $ clothes     : int  0 1 0 0 0 0 0 0 0 0 ...
 $ hollister   : int  0 0 0 1 0 0 0 0 0 0 ...
 $ abercrombie : int  0 0 0 0 0 0 0 0 0 0 ...
 $ die         : int  0 0 0 2 0 0 0 0 0 0 ...
 $ death       : int  0 0 0 0 0 0 0 0 0 0 ...
 $ drunk       : int  0 0 0 0 0 0 0 0 0 0 ...
 $ drugs       : int  0 0 0 0 0 0 0 0 0 0 ...
 $ female      : int  1 1 1 1 1 0 0 1 1 0 ...
 $ no_gender   : int  0 0 0 0 0 1 1 0 0 1 ...

# We will be using the the words they frequently use in texting to segment the 1000 teenagers
# variables 7 to 42 are the keywords that reflect their interests
summary(teenager[7:10])
   basketball        football        soccer          softball    
 Min.   : 0.000   Min.   :0.00   Min.   : 0.000   Min.   :0.000  
 1st Qu.: 0.000   1st Qu.:0.00   1st Qu.: 0.000   1st Qu.:0.000  
 Median : 0.000   Median :0.00   Median : 0.000   Median :0.000  
 Mean   : 0.272   Mean   :0.27   Mean   : 0.223   Mean   :0.185  
 3rd Qu.: 0.000   3rd Qu.:0.00   3rd Qu.: 0.000   3rd Qu.:0.000  
 Max.   :10.000   Max.   :7.00   Max.   :12.000   Max.   :8.000  
table(teenager$basketball)

  0   1   2   3   4   5   6  10 
852  85  30  19   8   2   3   1 

# Notice that some words have higher means than others. Therefore without standardization it is 
# difficult to compare whether mentioning 'music' or 'church' 3 times is considered frequent or not.
# Standardization converts the variable to z-score that has a mean of zero, and a value of 1 means it is 
# at the 50+34=84 percentile. 

# We take out all the interest keywords, standarize them as save as interests_z. 
interests_z = as.data.frame(lapply(teenager[7:42], scale))
summary(interests_z[1:4])
   basketball        football          soccer          softball    
 Min.   :-0.328   Min.   :-0.372   Min.   :-0.237   Min.   :-0.24  
 1st Qu.:-0.328   1st Qu.:-0.372   1st Qu.:-0.237   1st Qu.:-0.24  
 Median :-0.328   Median :-0.372   Median :-0.237   Median :-0.24  
 Mean   : 0.000   Mean   : 0.000   Mean   : 0.000   Mean   : 0.00  
 3rd Qu.:-0.328   3rd Qu.:-0.372   3rd Qu.:-0.237   3rd Qu.:-0.24  
 Max.   :11.722   Max.   : 9.265   Max.   :12.497   Max.   :10.14  
table(interests_z[1])

-0.327757304395905  0.877232785294923   2.08222287498575 
               852                 85                 30 
  3.28721296467658   4.49220305436741   5.69719314405824 
                19                  8                  2 
  6.90218323374906   11.7221435925124 
                 3                  1 

# now the interests_z variables represent how often the word is mentioned relative to everybody (mean=0)
# We apply k-means clustering to these standardized interest variables 
interest_cluster = kmeans(interests_z, centers=5)

# the result of the k-means clustering is contained in the object interest_cluster
names(interest_cluster)
[1] "cluster"      "centers"      "totss"        "withinss"    
[5] "tot.withinss" "betweenss"    "size"         "iter"        
[9] "ifault"      

# the sizes of the different clusters (segments)
interest_cluster$size
[1]  28   9 802 137  24

# the centroid of the segments (the average mention of the keywords)
round(interest_cluster$centers,2)
  basketball football soccer softball volleyball swimming cheerleading
1       0.62     0.27   0.60    -0.05       0.03     0.62        -0.01
2       1.15     0.85  -0.24     0.48      -0.20     0.37        -0.22
3      -0.14    -0.09  -0.05    -0.12      -0.11    -0.05        -0.13
4       0.62     0.42   0.24     0.71       0.66     0.12         0.80
5      -0.18    -0.20  -0.24    -0.02      -0.20     0.06        -0.22
  baseball tennis sports  cute   sex  sexy   hot kissed dance  band
1     0.26   0.30   0.39  0.88  3.23  0.89  1.07   2.24  0.94  0.42
2     0.16   0.42   0.24  0.00  0.03 -0.29 -0.28  -0.19  0.01  0.42
3    -0.04  -0.03  -0.10 -0.14 -0.10 -0.06 -0.08  -0.07 -0.14 -0.11
4     0.18   0.07   0.51  0.66 -0.05  0.16  0.27  -0.02  0.54 -0.03
5    -0.20   0.15  -0.19 -0.05 -0.17  0.07  0.09  -0.19  0.36  3.21
  marching music  rock   god church jesus bible  hair dress blonde  mall
1    -0.14  1.77  2.80  0.27  -0.14  0.09 -0.10  2.14  0.68   1.11  0.08
2    -0.14  0.61  0.29  6.44   1.76  6.62  4.94  0.45 -0.24   0.96 -0.02
3    -0.14 -0.13 -0.14 -0.11  -0.13 -0.08 -0.04 -0.15 -0.10  -0.09 -0.18
4    -0.11  0.28  0.18  0.18   0.68 -0.05 -0.06  0.47  0.46   0.28  1.06
5     5.35  0.45  0.25 -0.02   0.03  0.24 -0.10 -0.17  0.15  -0.20  0.02
  shopping clothes hollister abercrombie   die death drunk drugs
1     0.32    0.91      0.21       -0.03  1.57  0.68  2.15  2.80
2    -0.05    0.09     -0.23       -0.17  0.35  0.81  0.46  0.98
3    -0.17   -0.16     -0.17       -0.16 -0.07 -0.06 -0.06 -0.09
4     0.96    0.74      1.03        0.98  0.05  0.13 -0.14 -0.08
5     0.01    0.15     -0.23       -0.17  0.10  0.11  0.05 -0.18

# copy the cluster membership to the original data set in order to compare them on demographics
teenager$cluster = interest_cluster$cluster

# to make sure the copying is done properly (the frequency should match the previous 'size' command)
table(teenager$cluster)

  1   2   3   4   5 
 28   9 802 137  24 

# the proportion of females in each segment
tapply(teenager$female, teenager$cluster, mean)
     1      2      3      4      5 
0.8214 0.5556 0.7057 0.8905 0.5833 

# a graphical representation (stacked bar plot) of the proportion of females
counts= table(teenager$female,teenager$cluster )
barplot(counts, main="Stacked Bar Plot", xlab="Segment", ylab="Number of females", legend=rownames(counts)) 

# a graphical representation as Spinogram
install.packages("vcd")
Error: trying to use CRAN without setting a mirror
library(vcd)
Loading required package: grid

plot of chunk unnamed-chunk-1

counts2= table(teenager$cluster, teenager$female)
spine(counts2, main = "Spinogram", xlab="Segment", ylab="Proportions of females(=1)")

plot of chunk unnamed-chunk-1


# the average number of friends for each segment
tapply(teenager$friends, teenager$cluster, mean)
    1     2     3     4     5 
41.39 27.33 26.93 37.34 26.54 

# boxplot of the number of friends 
boxplot(teenager$friends ~ teenager$cluster)

plot of chunk unnamed-chunk-1


# boxplot removing the outliers
boxplot(teenager$friends ~ teenager$cluster, outline=FALSE)

plot of chunk unnamed-chunk-1