Lately, I’ve been interested in analyzing and visualizing some of my social media data. I began doing this last year with my Facebook data. Here are a few different things I looked at last year when I was playing around with this stuff.

First, I downloaded my Facebook data following the steps here: http://thinktostart.com/analyzing-facebook-with-r/

Then, I just loaded the data frame and got started.

library(ggplot2)
library(reshape2)
library(RColorBrewer)
library(dplyr)

fb <- read.csv("~/Desktop/fb.csv")
attach(fb)

Gender and relationship status

To start, I wanted to look at my friends relationship status and gender breakdown (N= 495). First I need to tabulate the gender/status data.

# Tabulate gender and relationship data. 
tab_gen <- table(gender)
rel <- as.vector(relationship_status)
rel<-ifelse(is.na(rel),"N/A",rel)
frel<-as.factor(rel) 
frel <- factor(frel, levels=names(sort(table(frel),decreasing=FALSE)))
frela <- factor(frel, levels=names(sort(table(frel),decreasing=TRUE)))

# Order factors 
order <- c("N/A","Married","In a relationship","Single","Engauged","It's complicated","In an open relationship","Divorced")

# rename
gen <- as.vector(gender)
gen<-ifelse(is.na(gender),"N/A",gen)
gen<-ifelse(gen=="male","Male",gen)
gen<-ifelse(gen=="female","Female",gen)

Here is the breakdown of my friend’s genders.

# Plot gender
g_sex <- ggplot(fb,aes(gen,fill=gen)) + scale_fill_brewer(palette="Dark2",name="") + 
  geom_bar() + xlab("") + ylab("") + ggtitle("Sex") + theme(axis.text.x = element_text(angle= 315,hjust = 0)) 
g_sex + geom_bar(colour="black", show_guide=FALSE)

plot of chunk unnamed-chunk-3

Here is their relationship status breakdown.

# Plot relationship Status
g_relstat2 <- ggplot(fb,aes(frel,fill=frel)) + scale_fill_brewer(palette="Spectral",name="") +  geom_bar() + xlab("") + ylab("") + ggtitle("Relationship Status") + theme(axis.text.x = element_text(angle= 315,hjust = 0)) 
g_relstat2 +  geom_bar(colour="black", show_guide=FALSE)

plot of chunk unnamed-chunk-4

Finally let’s put them both together to see how they look side by side.

# Plot both gender and relationship status
g_both2 <- ggplot(fb,aes(frela,fill=gen)) + scale_fill_brewer(palette="Set1",name="") + 
  geom_bar(position="dodge") + xlab("") + ylab("") + ggtitle("Relationship Status by Sex") + theme(axis.text.x = element_text(angle= 315,hjust = 0))

g_both2 + geom_bar(colour="black",position="dodge", show_guide=FALSE)

plot of chunk unnamed-chunk-5

Results:

Birthdays

Though many people hide their birthdays on Facebook, others really want to let you know about it. I wanted to get a bird’s eye view of where my friend’s birthdays fell over time and throughout the year.

First, I had to parse the text in the birthday column to use in the analysis.

# Parse text
birthmonth <- vector()
for(i in 1:length(birthday)){
  birthmonth[i] <- strsplit(as.matrix(birthday),"/")[[i]][1]
}

bd <- vector()
for(i in 1:length(birthday)){
  bd[i] <- strsplit(as.matrix(birthday),"/")[[i]][2]
}

birthyear <- vector()
for(i in 1:length(birthday)){
  birthyear[i] <- strsplit(as.matrix(birthday),"/")[[i]][3]
}

First, I wanted to see the distribution of my friends’ birth years relative to my own. I was born in 1985, and, perhaps unsurprisingly, my friends birth year centered around that year.

ggplot(na.omit(fb),aes(na.omit(birthyear))) + geom_bar(aes(fill=..count..)) +
  theme(axis.text.x = element_text(angle= 315,hjust = 0),legend.position="none") + 
  xlab("Year of Birth") + ylab("Number of people") + ggtitle("My friend's birth years")

plot of chunk unnamed-chunk-7

Now, let’s look at something more interesting. I want to make a heatmap of all of my friends’ birthdays throughout the year. However, first we need to wrangle the data a bit.

# Creat matrix
birthmatrix <- cbind(as.numeric(birthmonth),as.numeric(bd))
 
birthmatrix[,1] <- ifelse(birthmatrix[,1]==1,"January",birthmatrix[,1])
birthmatrix[,1] <- ifelse(birthmatrix[,1]==2,"February",birthmatrix[,1])
birthmatrix[,1] <- ifelse(birthmatrix[,1]==3,"March",birthmatrix[,1])
birthmatrix[,1] <- ifelse(birthmatrix[,1]==4,"April",birthmatrix[,1])
birthmatrix[,1] <- ifelse(birthmatrix[,1]==5,"May",birthmatrix[,1])
birthmatrix[,1] <- ifelse(birthmatrix[,1]==6,"June",birthmatrix[,1])
birthmatrix[,1] <- ifelse(birthmatrix[,1]==7,"July",birthmatrix[,1])
birthmatrix[,1] <- ifelse(birthmatrix[,1]==8,"August",birthmatrix[,1])
birthmatrix[,1] <- ifelse(birthmatrix[,1]==9,"September",birthmatrix[,1])
birthmatrix[,1] <- ifelse(birthmatrix[,1]==10,"October",birthmatrix[,1])
birthmatrix[,1] <- ifelse(birthmatrix[,1]==11,"November",birthmatrix[,1])
birthmatrix[,1] <- ifelse(birthmatrix[,1]==12,"December",birthmatrix[,1])

birthmatrix[,2] <- as.numeric(birthmatrix[,2])

# Convert to data frame for plotting.
bdf <- as.data.frame(birthmatrix)
bdf <- select(bdf, Month = V1, Day = V2)
bdf  <- na.omit(bdf)
bdf$Month<-as.factor(bdf$Month)
bdf$Month <- ordered(bdf$Month, levels=c("January","February","March","April","May","June","July","August","September","October","November","December"))
x <- as.character(1:31)
bdf$Day <- factor(bdf$Day,levels=x)
birthday_mat <- as.data.frame(table(bdf))
y <- 1:31
y <- sort(y,decreasing=TRUE)
y <- as.character(y)

birthday_mat$Day <- ordered(birthday_mat$Day,levels=y)

Finally, here’s the heatmap:

myPalette <- colorRampPalette(rev(brewer.pal(5, "Spectral")))
ggplot(birthday_mat,aes(Month,Day,fill=Freq)) + geom_tile(color="white") + 
  scale_fill_gradientn(colours = myPalette(10),name="Number of \nPeople") + ylab("Day") + xlab("Month") + ggtitle("My Friend's Birthdays") +
  theme_minimal() + theme(axis.text.x = element_text(angle= 315,hjust = 0))

plot of chunk smallplot

Results:

Where are they now?

A recent Gallup poll showed that states differ in their residents’ desire to live somewhere else (but it’s not always the in places you might expect: http://www.pixable.com/article/half-connecticut-illinois-residents-want-move-different-state-map). This got me wondering how many of my friends currently live in their home state. To investigate, I took my Friends’ Facebook data and looked at the difference between the the number of people in their ‘home state’ vs. their ‘current location’ to get an idea of how the people I know are moving around. (Note: this is a difference between ‘current’ and ‘home’ totals in the aggregate. Thus, if I have a friend who was born in Missouri and moved away but a different friend was born elsewhere and moved into Missouri, Missouri would have a value of 0.)

# Make blank state map  
us_state_map <- map_data('state')
states <- c("alabama","arizona","arkansas","california",
           "colorado","connecticut","delaware","district of columbia",
           "florida","georgia","idaho","illinois",
           "indiana","iowa","kansas","kentucky",
           "louisiana","maine","maryland","massachusetts",
           "michigan","minnesota","mississippi","missouri",
           "montana","nebraska","nevada","new hampshire",
           "new jersey","new mexico","new york","north carolina",
           "north dakota","ohio","oklahoma","oregon",
           "pennsylvania","rhode island","south carolina","south dakota",
           "tennessee","texas","utah","vermont",
           "virginia","washington","west virginia","wisconsin",
           "wyoming")

# Calculate state difference
loc_state <- vector()
home_state <- vector()
for(i in 1:length(location)){
  loc_state[i] <- strsplit(as.vector(location),", ")[[i]][2]
  home_state[i] <- strsplit(as.vector(hometown),", ")[[i]][2]
}

loc_state <- tolower(loc_state[c(1:length(loc_state))])
home_state <- tolower(home_state[c(1:length(home_state))])
df_states <- as.data.frame(cbind(states))
df_states[,1] <- as.vector(df_states[,1])
df_states$pos <- as.numeric(rep(0,49)) 
df_states$neg <- as.numeric(rep(0,49))

df_home <- as.data.frame(table(home_state))
df_home[,1] <- as.vector(df_home[,1])
df_loc <- as.data.frame(table(loc_state))
df_loc[,1] <- as.vector(df_loc[,1])

for(i in 1:length(df_states[,1])){
  for(k in 1:length(df_home[,1]))
  if(df_states[i,1] == df_home[k,1]){ 
  df_states[i,2] <- df_home[k,2]
  }
  
    for(j in 1:length(df_loc[,1]))
  if(df_states[i,1] == df_loc[j,1]){ 
  df_states[i,3] <- df_loc[j,2]
  }
    
}

df_states$change <- df_states[,3]-df_states[,2]
df_states$region <- df_states$states

# Make plot 
map_data <- merge(us_state_map, df_states, by='region', all=T)
map_data <- map_data[order(map_data$order), ]

c1 <- qplot(long, lat, data=map_data, geom="polygon", group=group, fill=change) +
  theme_minimal() + scale_fill_gradient2(low=c("blue4","blue3","blue2","blue"), mid=c("gray75"), high=c("red","red2","red3","red4"), na.value = "gray75", name="Total Change \nin Population") +
  xlab("Longitude") + ylab("Latitude") + ggtitle("Home State to Current State \nPopulation Change")

c1

plot of chunk unnamed-chunk-9

Results: