Notes:
# Create a histogram of diamond prices.
# Facet the histogram by diamond color
# and use cut to color the histogram bars.
# The plot should look something like this.
# http://i.imgur.com/b5xyrOu.jpg
# Note: In the link, a color palette of type
# 'qual' was used to color the histogram using
# scale_fill_brewer(type = 'qual')
# This assignment is not graded and
# will be marked as correct when you submit.
# ENTER YOUR CODE BELOW THIS LINE
# ===========================================
# Import Libraries
library(ggplot2)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Load the dataset
data("diamonds")
#Spot Check
head(diamonds)
## Source: local data frame [6 x 10]
##
## carat cut color clarity depth table price x y z
## (dbl) (fctr) (fctr) (fctr) (dbl) (dbl) (int) (dbl) (dbl) (dbl)
## 1 0.23 Ideal E SI2 61.5 55 326 3.95 3.98 2.43
## 2 0.21 Premium E SI1 59.8 61 326 3.89 3.84 2.31
## 3 0.23 Good E VS1 56.9 65 327 4.05 4.07 2.31
## 4 0.29 Premium I VS2 62.4 58 334 4.20 4.23 2.63
## 5 0.31 Good J SI2 63.3 58 335 4.34 4.35 2.75
## 6 0.24 Very Good J VVS2 62.8 57 336 3.94 3.96 2.48
# Check the Names
names(diamonds)
## [1] "carat" "cut" "color" "clarity" "depth" "table" "price"
## [8] "x" "y" "z"
# Inspiration
# ggplot(aes(x = time, y = price),
# data = subset(yo, id %in% sample.ids)) +
# facet_wrap( ~ id) +
# geom_line() +
# geom_point(aes(size = all.purchases), pch = 1)
# Inspiration
# ggplot(data=yo, aes(all.purchases)) + geom_histogram(binwidth = 1, col="red",
# fill="green",
# alpha = .2)
# Inspiration
qplot(x = price, data = diamonds) + facet_wrap(~cut)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Inspiration
# ggplot( aes(x = price/carat), data = diamonds, binwidth = 10) + geom_histogram() + scale_x_log10() + facet_wrap(~cut, # scales="free_y")
# EXPERIMENTS:
# dsamp <- diamonds[sample(nrow(diamonds), 1000), ]
# (d <- ggplot(dsamp, aes(carat, price)) +
# geom_point(aes(colour = clarity)))
# d + scale_colour_brewer()
# d + scale_colour_brewer("Diamond\nclarity")
# d + scale_colour_brewer(palette = "Greens")
# d + scale_colour_brewer(palette = "Set1")
# EXPERIMENTS:
p <- ggplot(diamonds, aes(x = price, fill = cut)) +
geom_histogram(position = "dodge", binwidth = 1000)
p + scale_fill_brewer()
# EXPERIMENTS:
p <- ggplot(data=diamonds, aes(price)) + facet_wrap(~color) +
geom_histogram(binwidth = 400)
p + scale_fill_brewer(type = 'qual')
# Actual
ggplot(aes(x = price, fill = cut), data=diamonds) +
geom_histogram(binwidth = .1) +
facet_wrap(~color) +
scale_fill_brewer(type = 'qual') +
scale_x_log10()
Notes:
# Create a scatterplot of diamond price vs.
# table and color the points by the cut of
# the diamond.
# The plot should look something like this.
# http://i.imgur.com/rQF9jQr.jpg
# Note: In the link, a color palette of type
# 'qual' was used to color the scatterplot using
# scale_color_brewer(type = 'qual')
# This assignment is not graded and
# will be marked as correct when you submit.
# ENTER YOUR CODE BELOW THIS LINE
# ===========================================
# Inspiration
ggplot(aes(x = table, y = price), data = diamonds) +
geom_point(aes(color = cut)) +
scale_fill_brewer(type='qual') +
coord_cartesian(xlim = c(50,80)) +
scale_x_discrete(breaks = seq(50,80,2))
# Actual
ggplot(aes(x = table, y = price), data = diamonds) +
geom_point(aes(colour = cut)) +
scale_color_brewer(type = 'qual') +
coord_cartesian(xlim = c(50,80))
** Q1:** - What is the typical table range for the majority of diamonds of IDEAL cut?
53 - 57
** Q2:** -What is the typical table range for the majority of thamonds of PREMIUM cut?
58 - 62
Notes:
# Create a scatterplot of diamond price vs.
# volume (x * y * z) and color the points by
# the clarity of diamonds. Use scale on the y-axis
# to take the log10 of price. You should also
# omit the top 1% of diamond volumes from the plot.
# Note: Volume is a very rough approximation of
# a diamond's actual volume.
# The plot should look something like this.
# http://i.imgur.com/excUpea.jpg
# Note: In the link, a color palette of type
# 'div' was used to color the scatterplot using
# scale_color_brewer(type = 'div')
# This assignment is not graded and
# will be marked as correct when you submit.
# ENTER YOUR CODE BELOW THIS LINE
# ===========================================
# Inspiration
# diamonds.no_outliers_volume <- subset(diamonds, volume > 0 & volume <= 800)
# Create scatterplot - using GGPlot
# ggplot(aes(x = volume, y = price), data = diamonds.no_outliers_volume) +
# geom_point(alpha = .05, position = position_jitter(h = 0),color = 'orange') +
# geom_smooth()
# Experiments
# ggplot(aes(x = (x * y * z), y = price), data = diamonds) +
# scale_y_log10() +
# geom_point()
# Actual
#
# Create a diamond variable called volume = (x * y * z)
#
diamonds$diamond_volume <- diamonds$x * diamonds$y * diamonds$z
# Create plot
#
ggplot(aes(x = diamond_volume, y = price), data = diamonds) +
scale_y_log10() +
geom_point(aes(color = clarity)) +
coord_cartesian(xlim=c(0,quantile(diamonds$diamond_volume,0.99))) +
scale_color_brewer(type = 'div')
Notes:
# Many interesting variables are derived from two or more others.
# For example, we might wonder how much of a person's network on
# a service like Facebook the user actively initiated. Two users
# with the same degree (or number of friends) might be very
# different if one initiated most of those connections on the
# service, while the other initiated very few. So it could be
# useful to consider this proportion of existing friendships that
# the user initiated. This might be a good predictor of how active
# a user is compared with their peers, or other traits, such as
# personality (i.e., is this person an extrovert?).
# Your task is to create a new variable called 'prop_initiated'
# in the Pseudo-Facebook data set. The variable should contain
# the proportion of friendships that the user initiated.
# This programming assignment WILL BE automatically graded.
# DO NOT DELETE THIS NEXT LINE OF CODE
# ========================================================================
# pf <- read.delim('/datasets/ud651/pseudo_facebook.tsv')
# ENTER YOUR CODE BELOW THIS LINE
# ========================================================================
# Read in the data
pf <- read.delim('pseudo_facebook.tsv')
# Create variable
pf$prop_initiated <- ifelse(is.nan(pf$friendships_initiated / pf$friend_count), 0, pf$friendships_initiated / pf$friend_count)
# Spot Check on the variable
summary(pf$prop_initiated)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 0.4400 0.6184 0.5958 0.7795 1.0000
Notes:
# Create a line graph of the median proportion of
# friendships initiated ('prop_initiated') vs.
# tenure and color the line segment by
# year_joined.bucket.
# Recall, we created year_joined.bucket in Lesson 5
# by first creating year_joined from the variable tenure.
# Then, we used the cut function on year_joined to create
# four bins or cohorts of users.
# (2004, 2009]
# (2009, 2011]
# (2011, 2012]
# (2012, 2014]
# The plot should look something like this.
# http://i.imgur.com/vNjPtDh.jpg
# OR this
# http://i.imgur.com/IBN1ufQ.jpg
# This assignment is not graded and
# will be marked as correct when you submit.
# ENTER YOUR CODE BELOW THIS LINE
# ===========================================================
pf$year_joined <- floor( 2014 - (pf$tenure / 365))
pf$year_joined.bucket <- cut(pf$year_joined,
breaks = c(2004, 2009, 2011, 2012, 2014))
ggplot(aes(x = tenure, y = prop_initiated),
data = pf) +
geom_line(aes(color = year_joined.bucket), stat = 'summary', fun.y = median)
## Warning: Removed 2 rows containing non-finite values (stat_summary).
Notes:
# Smooth the last plot you created of
# of prop_initiated vs tenure colored by
# year_joined.bucket. You can bin together ranges
# of tenure or add a smoother to the plot.
# There won't be a solution image for this exercise.
# You will answer some questions about your plot in
# the next two exercises.
# This assignment is not graded and
# will be marked as correct when you submit.
# ENTER YOUR CODE BELOW THIS LINE
# ====================================================
ggplot(aes(x = tenure, y = prop_initiated), data = pf) +
geom_line(aes(color = year_joined.bucket), stat = 'summary', fun.y = median) +
geom_smooth() +
geom_smooth(method='lm', color='purple')
## Warning: Removed 2 rows containing non-finite values (stat_summary).
## Warning: Removed 2 rows containing non-finite values (stat_smooth).
## Warning: Removed 2 rows containing non-finite values (stat_smooth).
Qestion: - On average, which group initiated the greatest proportion of it’s FB friendships? The plot with the smoother can help answer that question.
** Answer** - People who joineed after 2012.
Qestion: - For the group with the largest proportion of friendships initiated, what is the groups average (mean) proportion of friendships initiated?
** Answer** - .67 (taken by eyeballing plot)
Qestion: - Why do you think this groups proportion of friendships initiated is higher that the others?
** Answer** - More of their friends were already on the service, and those who have been using the service for a long time tend to not initiate friendship requests as much, so the onus was on them to find friends.
Notes:
# Create a scatter plot of the price/carat ratio
# of diamonds. The variable x should be
# assigned to cut. The points should be colored
# by diamond color, and the plot should be
# faceted by clarity.
# The plot should look something like this.
# http://i.imgur.com/YzbWkHT.jpg.
# Note: In the link, a color palette of type
# 'div' was used to color the histogram using
# scale_color_brewer(type = 'div')
# This assignment is not graded and
# will be marked as correct when you submit.
# ENTER YOUR CODE BELOW THIS LINE
# ===========================================
# inspiration
# ggplot(aes(x = table, y = price), data = diamonds) +
# geom_point(aes(colour = cut)) +
# scale_color_brewer(type = 'qual') +
# coord_cartesian(xlim = c(50,80))
# inspiration
# ggplot(aes(x = price, fill = cut), data=diamonds) +
# geom_histogram(binwidth = .1) +
# facet_wrap(~color) +
# scale_fill_brewer(type = 'qual') +
# scale_x_log10()
# Experiments
# diamonds$price_per_carat <- diamonds$price/ diamonds$carat
#
# ggplot(data=diamonds, aes(x = cut, y = price/carat, color = color)) +
# geom_point(position=position_jitter(), alpha=0.33) +
# facet_wrap(~clarity) +
# scale_fill_brewer(type = 'div')
ggplot(data = diamonds, aes(x = cut, y = price/carat, color = color)) +
geom_point(position=position_jitter(), alpha=0.33) +
facet_wrap(~clarity) +
scale_color_brewer(type = 'div')
Notes:
# The Gapminder website contains over 500 data sets with information about
# the world's population. Your task is to continue the investigation you did at the
# end of Problem Set 4 or you can start fresh and choose a different
# data set from Gapminder.
# If you're feeling adventurous or want to try some data munging see if you can
# find a data set or scrape one from the web.
# In your investigation, examine 3 or more variables and create 2-5 plots that make
# use of the techniques from Lesson 5.
# You can find a link to the Gapminder website in the Instructor Notes.
# Once you've completed your investigation, create a post in the discussions that includes:
# 1. the variable(s) you investigated, your observations, and any summary statistics
# 2. snippets of code that created the plots
# 3. links to the images of your plots
# Copy and paste all of the code that you used for
# your investigation, and submit it when you are ready.
# ============================================================================================
### Loading activities
# Load dplyr
library(dplyr)
# Load the Plot Library
library(ggplot2)
# Read the CSV file, create the dataframe
cpi_df <- read.csv("corruption_perception.csv", header=TRUE, row.names = 1, check.names = T)
# Experiments
# regions_df <- read.csv("regions.csv")
### Data Wrangling
# Drop the 3 columns we do not want.
# cpi_df <- subset(cpi_df, select = -c(X.1, X.2, X.3))
# Convert Dataset Rows names to explicit variable, name the column "Countries"
cpi_df <- cpi_df %>% add_rownames(var = "COUNTRIES")
# Convert to Local DataFrame
cpi_df.ldf <- tbl_df(cpi_df)
# Rename Columns
# Inspiration: colnames(df)[colnames(df) == 'oldName'] <- 'newName'
colnames(cpi_df.ldf)[colnames(cpi_df.ldf) == 'X2008'] <- 'CPI_2008'
colnames(cpi_df.ldf)[colnames(cpi_df.ldf) == 'X2009'] <- 'CPI_2009'
trustLevel <- function(cpi)
{
if (cpi >= 9.0) {
designation = "HT"
} else if (cpi >= 8.0 & cpi < 9.0) {
designation = "T"
} else if (cpi >= 7.0 & cpi < 8.0) {
designation = "ST"
} else if (cpi >= 6.0 & cpi < 7.0) {
designation = "Q"
} else if (cpi >= 5.0 & cpi < 6.0) {
designation = "S"
} else if (cpi >= 4.0 & cpi < 5.0) {
designation = "UT"
} else if (cpi >= 3.0 & cpi < 4.0) {
designation = "HUT"
} else if (cpi >= 2.0 & cpi < 3.0) {
designation = "CUT"
} else if ( cpi < 2.0) {
designation = "CPT"
} else {
designation = "NA"
}
return(designation)
}
# Map a function across every value of a column to populate another
cpi_df.ldf$TRUST <- sapply(cpi_df.ldf$CPI_2008,trustLevel)
### Experiments
# Barchart
qplot(x = friend_count, data = subset(pf, !is.na(gender)), binwidth = 10) +
scale_x_continuous(limits = c(0, 1000),
breaks = seq(0, 1000, 50)) + facet_wrap(~gender)
## Warning: Removed 2949 rows containing non-finite values (stat_bin).
# Create scatterplot - CPI_2008 vs Trust
ggplot(aes(x = TRUST, y = CPI_2008), data = cpi_df.ldf) + geom_point(position=position_jitter(), alpha=0.33) +
xlab('Trust') +
ylab('CPI Index 2008') +
facet_wrap(~TRUST)
# Create scatterplot - Region vs Trust
ggplot(aes(x = TRUST, y = Region), data = cpi_df.ldf) + geom_point(position=position_jitter(), alpha=0.33) +
xlab('Trues') +
ylab('Regions') +
facet_wrap(~TRUST)
ggplot(aes(x = Region, y = TRUST), data = cpi_df.ldf) + geom_point(position=position_jitter(), alpha=0.33) +
xlab('Region') +
ylab('Trust') +
facet_wrap(~Region)
# Which region is the MOST trustworthy?
ggplot(aes(x = Region, y = TRUST), data = subset(cpi_df.ldf, TRUST == 'HT' | TRUST == 'T') ) + geom_point(position=position_jitter(), alpha=0.33) +
xlab('Trues') +
ylab('Regions') +
facet_wrap(~TRUST)
# Which region is the LEAST trustworthy?
ggplot(aes(x = Region, y = TRUST), data = subset(cpi_df.ldf, TRUST == 'CUT' | TRUST == 'CPT') ) + geom_point(position=position_jitter(), alpha=0.33) +
xlab('Trues') +
ylab('Regions') +
facet_wrap(~TRUST)
# Buckets via CUT
cpi_df.ldf$trust_bucket <- cut(cpi_df.ldf$CPI_2008,
breaks = c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10))
# Spot Check
table(cpi_df.ldf$trust_bucket, useNA = 'ifany')
##
## (1,2] (2,3] (3,4] (4,5] (5,6] (6,7] (7,8] (8,9] (9,10]
## 23 59 30 20 15 12 8 9 4
ggplot(aes(x = CPI_2008, y = Region),
data = subset(cpi_df.ldf, !is.na(cpi_df.ldf$trust_bucket))) +
geom_line(aes(color = trust_bucket),
stat = 'summary',
fun.y = mean)