1.Input File

# Read the tab-separated user data (read.delim defaults to sep = "\t")
# and list the column names of the resulting data frame.
pf <- read.delim("pseudo_facebook.tsv")
colnames(pf)
##  [1] "userid"                "age"                  
##  [3] "dob_day"               "dob_year"             
##  [5] "dob_month"             "gender"               
##  [7] "tenure"                "friend_count"         
##  [9] "friendships_initiated" "likes"                
## [11] "likes_received"        "mobile_likes"         
## [13] "mobile_likes_received" "www_likes"            
## [15] "www_likes_received"

2.1 Create Histogram to Visualize Birthday Distribution

library(ggthemes)
## Loading required package: ggplot2
# Attach ggplot2 explicitly: every plotting function used below (ggplot,
# theme_set, theme_minimal, geom_histogram, ...) comes from ggplot2, so the
# script should not depend on ggthemes attaching it as a side effect.
library(ggplot2)
theme_set(theme_minimal(12))
# dob_day is numeric, so it needs a continuous x scale; binwidth = 1 gives each
# day of the month its own bar (the default of 30 bins cannot separate 31 days,
# which merges two days into one bar).
ggplot(aes(x = dob_day), data = pf) +
  geom_histogram(binwidth = 1) +
  scale_x_continuous(breaks = 1:31) +
  xlab('Date of Birth - Day') +
  ylab('Counts') +
  ggtitle('Distribution of Date of Birth')

# Hypothesis: The number of users with a birthday on the 1st is about double that of any other day of the month, which is implausible. This suggests some user birthdays may not be real, probably because of input convenience during sign-up.

# Potential Fix: change the birthday input layout to calendar view and observe the change in birthday distribution

2.2 Drill-down Birthday Analysis by Breaking Down by Month

library(ggthemes)
theme_set(theme_minimal(12))
# One panel per birth month. binwidth = 1 gives every day of the month its own
# bar; the default of 30 bins cannot separate 31 days, so two adjacent days
# would be merged into a single, artificially tall bar.
ggplot(aes(x = dob_day), data = pf) +
  geom_histogram(binwidth = 1) +
  facet_wrap(~dob_month, ncol = 3) +
  xlab('Date of Birth - Day') +
  ylab('Counts by Month') +
  ggtitle('Distribution of Date of Birth')

# This detailed histogram shows that the amount of users with birthday on Jan 01 is ~15x of average amount of birthdays in any other day of the year!
# This further suggests that many users do not register with their real birthday. One potential reason is the layout of the registration sheet.

3.1 Friends Count Analysis to Identify Outlier(s)

library(ggthemes)
theme_set(theme_minimal(12))
# Histogram of friend_count using the default binning; all labels are set in a
# single labs() call.
ggplot(pf, aes(x = friend_count)) +
  geom_histogram() +
  labs(
    x = "User Friend Count",
    y = "Counts",
    title = "Histogram of User Friend Count"
  )

# The histogram shows the majority of users have around 350 friends on average. There are a few users with close to 5000 friends.

3.2 Adjust the Scale to Focus On Areas of Interest

library(ggthemes)
theme_set(theme_minimal(12))
# Same histogram with 25-wide bins, the x scale limited to 0-1000 and a tick
# every 50 friends.
ggplot(pf, aes(x = friend_count)) +
  geom_histogram(binwidth = 25) +
  scale_x_continuous(
    limits = c(0, 1000),
    breaks = seq(from = 0, to = 1000, by = 50)
  ) +
  labs(
    x = "User Friend Count",
    y = "Counts",
    title = "Histogram of User Friend Count"
  )

# By zooming into the 0-1000 friend count range and setting the bin width to 25, the graph shows that most users have around 0-50 friends. This suggests that these users could be new.

3.3 Further Analyze the Friend Counts by Gender

library(ggthemes)
theme_set(theme_minimal(12))
# Zoomed friend_count histogram, one panel per gender value (rows with a
# missing gender are still included here and get their own panel).
ggplot(pf, aes(x = friend_count)) +
  geom_histogram(binwidth = 25) +
  scale_x_continuous(
    limits = c(0, 1000),
    breaks = seq(from = 0, to = 1000, by = 50)
  ) +
  facet_wrap(~gender, ncol = 3) +
  labs(
    x = "User Friend Count",
    y = "Counts",
    title = "Histogram of User Friend Count"
  )

# Gender = NA is not important in this analysis and it occupies display real estate

3.4 Further Analyze the Friend Counts by Gender - omit gender = NA

library(ggthemes)
theme_set(theme_minimal(12))
# Same faceted histogram, restricted to rows whose gender is not missing.
ggplot(pf[!is.na(pf$gender), ], aes(x = friend_count)) +
  geom_histogram(binwidth = 25) +
  scale_x_continuous(
    limits = c(0, 1000),
    breaks = seq(from = 0, to = 1000, by = 50)
  ) +
  facet_wrap(~gender, ncol = 2) +
  labs(
    x = "User Friend Count",
    y = "Counts",
    title = "Histogram of User Friend Count"
  )

# This histogram breakdown shows that there are many more male users than female users in the 0-100 friend count range. Both genders have a similar distribution for friend counts > 100.
# Hypothesis: there are more male users than female users. 

3.5 Test the hypothesis

# Number of users per (non-missing) gender value.
table(pf[["gender"]])
## 
## female   male 
##  40254  58574
# summary() of friend_count computed separately for each gender; pf$gender is
# kept as the index expression so the printed group headers are unchanged.
by(pf[["friend_count"]], pf$gender, summary)
## pf$gender: female
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##       0      37      96     242     244    4923 
## -------------------------------------------------------- 
## pf$gender: male
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##       0      27      74     165     182    4917
# The result shows there are more male users.
# However, female users have more friends in general based on the median. Note: the median is the more robust measure because it is the midpoint of the data. The mean, on the other hand, is pulled upward by the small number of users with very high friend counts, so it is not a fair comparison.

4. Create histogram to analyze Tenure

library(ggthemes)
theme_set(theme_minimal(12))

# Divide tenure by 365 to convert it to years, then bin in quarter-year steps.
ggplot(pf, aes(x = tenure / 365)) +
  geom_histogram(binwidth = 0.25, colour = "black", fill = "#F79420") +
  scale_x_continuous(
    breaks = seq(from = 1, to = 7, by = 1),
    limits = c(0, 7)
  ) +
  labs(
    x = "Number of Years Using Facebook",
    y = "User Count",
    title = "User Tenure in Year"
  )

# Observation: most users have been using Facebook less than 2 years

5. Create histogram to analyze user age

library(ggthemes)
theme_set(theme_minimal(12))

# One bin per year of age for a fine-grained view; x ticks every 10 years.
ggplot(pf, aes(x = age)) +
  geom_histogram(binwidth = 1, colour = "black", fill = "#F79420") +
  scale_x_continuous(breaks = seq(from = 10, to = 133, by = 10)) +
  labs(
    x = "User Age",
    y = "Number of Users in the Age Bucket",
    title = "User Age"
  )

# Observation: most users are in their 20s - 30s; some users exaggerate their ages (ie. most likely the ones in 100 ranges)

6. Create histogram to analyze friend count with scale transformation and multiple plots on the same page

#install.packages('gridExtra')
library(gridExtra)
## Loading required package: grid

# Three views of friend_count that share the same bar styling and differ only
# in the x scale and bin width.

# p1: untransformed scale, one bin per friend count.
p1 <- ggplot(pf, aes(x = friend_count)) +
  geom_histogram(binwidth = 1, colour = "black", fill = "#F79420") +
  labs(
    x = "Friend Count",
    y = "Number of Users",
    title = "Friend Count with Normal Scale"
  )

# p2: log10 scale; 1 is added so that users with zero friends stay finite
# after the log transform.
p2 <- ggplot(pf, aes(x = friend_count + 1)) +
  geom_histogram(binwidth = 0.1, colour = "black", fill = "#F79420") +
  scale_x_log10() +
  labs(
    x = "Friend Count (log10 scale)",
    y = "Number of Users",
    title = "Friend Count with log10 Scale"
  )

# p3: square-root scale with a tick every 500 friends.
p3 <- ggplot(pf, aes(x = friend_count)) +
  geom_histogram(binwidth = 5, colour = "black", fill = "#F79420") +
  scale_x_sqrt(breaks = seq(from = 0, to = 5000, by = 500)) +
  labs(
    x = "Friend Count (sqrt scale)",
    y = "Number of Users",
    title = "Friend Count with Square Root Scale"
  )

# Stack the three plots in a single column on one page.
grid.arrange(p1, p2, p3, ncol = 1)

# Summary: it's easier to see the distribution with the log10 scale. In addition, the log10 transformation gives a roughly bell-shaped distribution, which is usually better suited to linear regression: linear regression assumes normally distributed residuals, and heavily skewed variables often violate that assumption.

7.1 Use frequency polygon to determine the Like feature usage pattern by gender

# Frequency polygon of www_likes per gender on a log10 x axis, for rows with a
# known gender. y is each bin's count divided by the summed bin counts, i.e. a
# proportion between 0 and 1 rather than a percentage, and www_likes is a
# different column from www_likes_received, so the y label describes it as a
# proportion of users with that number of likes.
# NOTE(review): `..count..` is the older ggplot2 spelling for computed
# variables; newer releases prefer after_stat(count) - confirm the installed
# version before switching.
ggplot(aes(x = www_likes, y = ..count../sum(..count..)), data = subset(pf, !is.na(gender))) +
  geom_freqpoly(aes(color = gender)) +
  scale_x_log10() +
  xlab('Number of Like Feature Used') +
  ylab('Proportion of users with that number of Likes') +
  ggtitle('Like Usage Analysis by Gender')

# Insight: There are more male users in the lower like usage range. But there are more female users in the higher like usage range

7.2 Use numerical summary to determine which gender use the like feature more in general

# Total www_likes per gender; pf$gender is kept as the index expression so the
# printed group headers are unchanged.
by(pf[["www_likes"]], pf$gender, sum)
## pf$gender: female
## [1] 3507665
## -------------------------------------------------------- 
## pf$gender: male
## [1] 1430175
# Distribution summary of www_likes per gender.
by(pf[["www_likes"]], pf$gender, summary)
## pf$gender: female
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
##     0.00     0.00     0.00    87.14    25.00 14860.00 
## -------------------------------------------------------- 
## pf$gender: male
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
##     0.00     0.00     0.00    24.42     2.00 12900.00
# Insight: Female users use the like feature more than male users (about 2.5x in total and about 3.6x per user on average). This can help companies make design and marketing decisions.

8. Use boxplot to understand more about friend count distribution

# use coord_cartesian() to limit the y-axis without removing records, so the statistics won't be skewed

# Boxplot of friend_count per gender for rows with a known gender; the y view
# is limited to 0-1000 through the coordinate system.
ggplot(pf[!is.na(pf$gender), ], aes(x = gender, y = friend_count)) +
  geom_boxplot() +
  coord_cartesian(ylim = c(0, 1000)) +
  labs(
    x = "Gender",
    y = "Friend Count",
    title = "Boxplot of Friend Count by Gender"
  )

# Numeric summary of friend_count per gender to read alongside the boxplot.
by(pf[["friend_count"]], pf$gender, summary)
## pf$gender: female
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##       0      37      96     242     244    4923 
## -------------------------------------------------------- 
## pf$gender: male
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##       0      27      74     165     182    4917

9. Understand which gender initiate friendship more on average

# Boxplot of friendships_initiated per gender for rows with a known gender;
# the y view is limited to 0-150 through the coordinate system.
ggplot(pf[!is.na(pf$gender), ], aes(x = gender, y = friendships_initiated)) +
  geom_boxplot() +
  coord_cartesian(ylim = c(0, 150)) +
  labs(
    x = "Gender",
    y = "Count of Friendship Initiated",
    title = "Boxplot of Friendship Initiation by Gender"
  )

# Numeric summary of friendships_initiated per gender.
by(pf[["friendships_initiated"]], pf$gender, summary)
## pf$gender: female
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     0.0    19.0    49.0   113.9   124.8  3654.0 
## -------------------------------------------------------- 
## pf$gender: male
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     0.0    15.0    44.0   103.1   111.0  4144.0
#Insight: female users initiate more friendships on average than male users (median 49 vs. 44; mean 113.9 vs. 103.1)

10. Understand mobile usage

# Collapse mobile_likes into a 0/1 indicator (1 = at least one mobile like)
# instead of working with the raw counts.
mobile_check_in <- as.numeric(pf$mobile_likes > 0)
summary(mobile_check_in)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.0000  0.0000  1.0000  0.6459  1.0000  1.0000
# Convert the indicator to a factor so that summary() reports a count per level.
mobile_check_in <- factor(mobile_check_in)
summary(mobile_check_in)
##     0     1 
## 35056 63947
# Share of users whose indicator is 1.
percentage_mobile_check_in <- sum(mobile_check_in == "1") / length(mobile_check_in)
percentage_mobile_check_in
## [1] 0.6459097
#Insight: about 65% of the users have used a mobile device at least once (mobile_likes > 0); it may be worth the investment to continue developing the mobile experience
#Reflection: it's important to think about the type of data needed for the analysis. Sometimes raw counts are not enough and a binary True/False view is more useful. Data transformation is more than just applying a function (e.g. log10 or sqrt)