# Load main libs that will be used during the assignment
suppressMessages(library(ggplot2))
suppressMessages(library(dplyr))
suppressMessages(library(scales))
suppressMessages(library(xlsx))
suppressMessages(library(tidyr))
suppressMessages(library(lubridate))
suppressMessages(library(ggthemes))
suppressMessages(library(gridExtra))
# Create a histogram of diamond prices.
# Facet the histogram by diamond color
# and use cut to color the histogram bars.
# The plot should look something like this.
# http://i.imgur.com/b5xyrOu.jpg
# Note: In the link, a color palette of type
# 'qual' was used to color the histogram using
# scale_fill_brewer(type = 'qual')
# Histogram of price on a log10 x-axis with dollar-formatted tick labels,
# one panel per diamond color; bars are filled by cut using a qualitative
# Brewer palette.
ggplot(data = diamonds, mapping = aes(x = price)) +
  geom_histogram(mapping = aes(fill = cut)) +
  scale_x_log10(
    name = expression(paste(Log[10], " of Price")),
    labels = dollar
  ) +
  scale_fill_brewer(type = "qual") +
  labs(y = "Count") +
  facet_wrap(~ color)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
#Q5.2
# Create a scatterplot of diamond price vs.
# table and color the points by the cut of
# the diamond.
# The plot should look something like this.
# http://i.imgur.com/rQF9jQr.jpg
# Note: In the link, a color palette of type
# 'qual' was used to color the scatterplot using
# scale_color_brewer(type = 'qual')
# Jittered scatter of price against table, colored by cut (qualitative
# Brewer palette). The x-axis is limited to 50-80 with a tick every 2 units;
# observations outside those limits are not drawn.
ggplot(data = diamonds, mapping = aes(x = table, y = price)) +
  geom_jitter(mapping = aes(color = cut), size = 3) +
  scale_color_brewer(type = "qual") +
  scale_x_continuous(
    limits = c(50, 80),
    breaks = seq(50, 80, by = 2)
  ) +
  theme_minimal()
## Warning: Removed 6 rows containing missing values (geom_point).
#Q5.3
## What is the typical table range for the majority of diamonds of ideal cut?
# 54 to 57
## What is the typical table range for the majority of diamonds of premium cut?
# 58 to 60
## Use the graph that you created from the previous exercise to see the answer. You do not need to run summaries.
# Create a scatterplot of diamond price vs.
# volume (x * y * z) and color the points by
# the clarity of diamonds. Use scale on the y-axis
# to take the log10 of price. You should also
# omit the top 1% of diamond volumes from the plot.
# Note: Volume is a very rough approximation of
# a diamond's actual volume.
# The plot should look something like this.
# http://i.imgur.com/excUpea.jpg
# Note: In the link, a color palette of type
# 'div' was used to color the scatterplot using
# scale_color_brewer(type = 'div')
# Add a rough volume estimate (x * y * z) as a new column of diamonds.
diamonds <- mutate(diamonds, volume = x * y * z)

# Jittered scatter of price (log10 y-axis, dollar labels) against volume,
# colored by clarity with a diverging Brewer palette. Rows with zero volume
# and rows above the 99th percentile of volume are left out.
ggplot(
  data = subset(
    diamonds,
    volume > 0 & volume <= quantile(volume, probs = 0.99)
  ),
  mapping = aes(x = volume, y = price)
) +
  geom_jitter(mapping = aes(color = clarity), size = 3) +
  scale_color_brewer(type = "div") +
  scale_y_log10(labels = dollar) +
  theme_minimal()
#Q5.5
# Many interesting variables are derived from two or more others.
# For example, we might wonder how much of a person's network on
# a service like Facebook the user actively initiated. Two users
# with the same degree (or number of friends) might be very
# different if one initiated most of those connections on the
# service, while the other initiated very few. So it could be
# useful to consider this proportion of existing friendships that
# the user initiated. This might be a good predictor of how active
# a user is compared with their peers, or other traits, such as
# personality (i.e., is this person an extrovert?).
# Your task is to create a new variable called 'prop_initiated'
# in the Pseudo-Facebook data set. The variable should contain
# the proportion of friendships that the user initiated.
# Load the Pseudo-Facebook data set as a tibble.
# as_tibble() is used because tbl_df() is deprecated as of dplyr 1.0.0.
fb <- as_tibble(read.table("pseudo_facebook.tsv", header = TRUE))

# prop_initiated: proportion of a user's friendships that the user initiated.
# The plain division is used on purpose: when friend_count is 0 the result is
# a missing (non-finite) value rather than 0, and later exercises that ask for
# averages give different answers if those values are replaced by 0.
# The zero-substituting alternative is kept here for reference:
# fb <- fb %>%
#   mutate(prop_initiated = ifelse(friend_count > 0, friendships_initiated/friend_count, 0))
fb <- fb %>%
  mutate(prop_initiated = friendships_initiated / friend_count)
# Create a line graph of the proportion of
# friendships initiated ('prop_initiated') vs.
# tenure and color the line segment by
# year_joined.bucket.
# Recall, we created year_joined.bucket in Lesson 5
# by first creating year_joined from the variable tenure.
# Then, we used the cut function on year_joined to create
# four bins or cohorts of users.
# (2004, 2009]
# (2009, 2011]
# (2011, 2012]
# (2012, 2014]
# The plot should look something like this.
# http://i.imgur.com/vNjPtDh.jpg
# OR this
# http://i.imgur.com/IBN1ufQ.jpg
# Derive the year each user joined by counting tenure (in units of 1/365
# year) back from 2014, then bin the years into the four cohorts
# (2004,2009], (2009,2011], (2011,2012] and (2012,2014].
fb <- mutate(
  fb,
  year_joined = floor(2014 - tenure / 365),
  year_joined_bucket = cut(
    year_joined,
    breaks = c(2004, 2009, 2011, 2012, 2014)
  )
)
## Adding in 0's for the mean function instead of leaving out NA's gives a better approximation
## for the proportion of friendships initiated for this plot.
# fb2: copy of fb in which prop_initiated is 0 (instead of a missing value)
# for users whose friend_count is not above 0, so that those users still
# contribute to the per-tenure mean drawn below.
fb2 <- fb %>%
  mutate(prop_initiated = ifelse(friend_count > 0,
                                 friendships_initiated / friend_count,
                                 0))

# Line plot of the mean prop_initiated at each tenure value (tenure > 0 only),
# with one colored line per year_joined_bucket cohort.
# `fun` is used because the `fun.y` argument is deprecated as of ggplot2 3.3.0.
ggplot(subset(fb2, tenure > 0), aes(x = tenure, y = prop_initiated)) +
  geom_line(aes(color = year_joined_bucket), stat = "summary", fun = mean) +
  theme_minimal()
#Q5.7
# Smooth the last plot you created of
# prop_initiated vs tenure colored by
# year_joined.bucket. You can use larger
# bins for tenure or add a smoother to the plot.
# Smoothed prop_initiated vs. tenure (tenure > 0 only), one smoother per
# year_joined_bucket cohort. This uses fb, not fb2, so rows whose
# prop_initiated is non-finite are dropped by the smoother.
# A geom_smooth() layer stands in for the summary geom_line() of the
# previous plot.
ggplot(
  data = subset(fb, tenure > 0),
  mapping = aes(x = tenure, y = prop_initiated, color = year_joined_bucket)
) +
  geom_smooth()
## Warning: Removed 1897 rows containing non-finite values (stat_smooth).
#Q5.8
## On average, which group initiated the greatest
## proportion of its Facebook friendships? The plot with
## the smoother that you created in the last exercise can
## help you answer this question.
## People who joined after 2012.
## For the group with the largest proportion of
## friendships initiated, what is the group's average
## (mean) proportion of friendships initiated?
# Mean prop_initiated for the (2012,2014] cohort, skipping missing values.
summarise(
  filter(fb, year_joined_bucket == "(2012,2014]"),
  avg = mean(prop_initiated, na.rm = TRUE)
)
## Source: local data frame [1 x 1]
##
## avg
## (dbl)
## 1 0.6653892
# They have newer accounts. To get started after first creating an account,
# you have to build your online network of friends based on your existing network of friends.
# After this initial population, someone may add/acquire new friends
# at a lower rate allowing the proportion to even out slightly.
# The younger cohort may also have fundamentally different patterns of acquisition;
# 2005 wasn't too long ago and Facebook was first targeted at a college-age demographic,
# who are now in their late twenties and early thirties.
# Rates might have already been lower in general for this group,
# who were never in their mid-teens at a time when starting a Facebook account was ubiquitous.
# Create a scatter plot of the price/carat ratio
# of diamonds. The variable x should be
# assigned to cut. The points should be colored
# by diamond color, and the plot should be
# faceted by clarity.
# The plot should look something like this.
# http://i.imgur.com/YzbWkHT.jpg.
# Note: In the link, a color palette of type
# 'div' was used to color the histogram using
# scale_color_brewer(type = 'div')
# Jittered scatter of the price-per-carat ratio for each cut, colored by
# diamond color (diverging Brewer palette), with one panel per clarity level.
ggplot(data = diamonds, mapping = aes(x = cut, y = price / carat)) +
  geom_jitter(mapping = aes(color = color)) +
  scale_color_brewer(type = "div") +
  facet_wrap(~ clarity)