Notes:
More then 3 variables …
Notes:
Adding color was a failute - it just didn’t work.
Notes:
# Import GGPlot2
library(ggplot2)
# Import dplyr
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Get Data
pf <- read.csv('pseudo_facebook.tsv',sep = '\t')
# Plot Gender vs Friend count - histogram
qplot(x = friend_count, data = subset(pf, !is.na(gender)), binwidth = 10) +
scale_x_continuous(limits = c(0, 1000),
breaks = seq(0, 1000, 50)) + facet_wrap(~gender)
## Warning: Removed 2949 rows containing non-finite values (stat_bin).
# Plot age vs gender - boxplot
ggplot(aes(x = gender, y = age),
data = subset(pf, !is.na(gender))) + geom_boxplot()
# Plot age vs gender - boxplot - add in MEAN using 'statsummary'
# Note: Averages are marked by X shince shape = 4.
ggplot(aes(x = gender, y = age),
data = subset(pf, !is.na(gender))) + geom_boxplot() +
stat_summary(fun.y = mean, geom = 'point', shape = 4)
# Plot age vs gender - Lineplot
# Note: Averages are marked by X shince shape = 4.
ggplot(aes(x = age, y = friend_count),
data = subset(pf, !is.na(gender))) +
geom_line(aes(color = gender), stat = 'summary', fun.y = median)
Notes:
# Write code to create a new data frame,
# called 'pf.fc_by_age_gender', that contains
# information on each age AND gender group.
# The data frame should contain the following variables:
# mean_friend_count,
# median_friend_count,
# n (the number of users in each age and gender grouping)
# Here is an example of the structure of your data frame. Your
# data values will be different. Note that if you are grouping by
# more than one variable, you will probably need to call the
# ungroup() function.
# age gender mean_friend_count median_friend_count n
# 1 13 female 247.2953 150 207
# 2 13 male 184.2342 61 265
# 3 14 female 329.1938 245 834
# 4 14 male 157.1204 88 1201
# See the Instructor Note for two hints.
# DO NOT DELETE THESE NEXT TWO LINES OF CODE
# ==============================================================
#pf <- read.delim('/datasets/ud651/pseudo_facebook.tsv')
#suppressMessages(library(dplyr))
# ENTER YOUR CODE BELOW THIS LINE.
# ==============================================================
# Import dplyr
library(dplyr)
# Inspiration
# pf.fc_by_age_months <- pf %>%
# group_by(age_with_months) %>%
# summarize( friend_count_mean = mean(friend_count),
# friend_count_median = median(friend_count),
# n = n()) %>%
# arrange(age_with_months)
# NOTE: Summarize will REMOVE One layer of grouping when it runs, so it will remove the gender layer
# However we'll need to ungroup one more time to remove the age layer.
pf.fc_by_age_gender <- pf %>%
filter(!is.na(gender)) %>%
group_by(age, gender) %>%
summarize( friend_count_mean = mean(friend_count),
friend_count_median = median(friend_count),
n = n()) %>%
ungroup() %>%
arrange(age)
# print out head of dataframe
head(pf.fc_by_age_gender)
## Source: local data frame [6 x 5]
##
## age gender friend_count_mean friend_count_median n
## (int) (fctr) (dbl) (dbl) (int)
## 1 13 female 259.1606 148.0 193
## 2 13 male 102.1340 55.0 291
## 3 14 female 362.4286 224.0 847
## 4 14 male 164.1456 92.5 1078
## 5 15 female 538.6813 276.0 1139
## 6 15 male 200.6658 106.5 1478
Notes:
# Create a line graph showing the
# median friend count over the ages
# for each gender. Be sure to use
# the data frame you just created,
# pf.fc_by_age_gender.
# See the Instructor Notes for a hint.
# This assignment is not graded and
# will be marked as correct when you submit.
# ENTER YOUR CODE BELOW THIS LINE
# =================================================
# Plot age vs gender - Lineplot
# Note: Averages are marked by X shince shape = 4.
ggplot(aes(x = age, y = friend_count_median),
data = subset(pf.fc_by_age_gender, !is.na(gender))) +
geom_line(aes(color = gender), stat = 'summary', fun.y = median)
# Install tidyr
# install.packages("tidyr")
Notes:
Variables we have grouped over have been repeated for each year in our listing: head(pf.fc_by_age_gender).
Notes:
# print out head of dataframe to see LONG format
head(pf.fc_by_age_gender)
## Source: local data frame [6 x 5]
##
## age gender friend_count_mean friend_count_median n
## (int) (fctr) (dbl) (dbl) (int)
## 1 13 female 259.1606 148.0 193
## 2 13 male 102.1340 55.0 291
## 3 14 female 362.4286 224.0 847
## 4 14 male 164.1456 92.5 1078
## 5 15 female 538.6813 276.0 1139
## 6 15 male 200.6658 106.5 1478
Notes:
Read these notes at the pause:
* An Introduction to reshape2 by Sean Anderson
* Converting Between Long and Wide Format
* Melt Data Frames
Notes:
# install.packages('reshape2')
library(reshape2)
# You can also restructure the data using the tidyr package. You can review examples and how to use the package in the Data Wrangling with R pdf.
# I think you will find the tidyr package easier to use than the reshape2 package. Both packages can get the job done.
library(tidyr)
library(dplyr)
# Reshape2 Example
# NOTES:
# Use save variable name just add WIDE to the end
# Use DCAST() FN, specify the dataset to change and then put in a formula
# Variables to the LEFT of the "~" are variables you want to KEEP with an "+" between them
# To the right of the "~" we use the gender variable since we want both MALE and FEMALE to have
# their own columns for median_friend_count in the DF.
# set value.var = median.friend, which holds the key measurements of the values in our DF
pf.fc_by_age_gender.wide <- dcast(pf.fc_by_age_gender,
age ~ gender,
value.var = 'friend_count_median')
# Spot Check DF
head(pf.fc_by_age_gender.wide)
## age female male
## 1 13 148.0 55.0
## 2 14 224.0 92.5
## 3 15 276.0 106.5
## 4 16 258.5 136.0
## 5 17 245.5 125.0
## 6 18 243.0 122.0
# Tidyr Example:
# Note: Not quite right
#
#
pf.fc_by_age_gender %>%
group_by(gender) %>%
select(friend_count_median)
## Source: local data frame [202 x 2]
## Groups: gender [2]
##
## gender friend_count_median
## (fctr) (dbl)
## 1 female 148.0
## 2 male 55.0
## 3 female 224.0
## 4 male 92.5
## 5 female 276.0
## 6 male 106.5
## 7 female 258.5
## 8 male 136.0
## 9 female 245.5
## 10 male 125.0
## .. ... ...
Notes:
# Plot the ratio of the female to male median
# friend counts using the data frame
# pf.fc_by_age_gender.wide.
# Think about what geom you should use.
# Add a horizontal line to the plot with
# a y intercept of 1, which will be the
# base line. Look up the documentation
# for geom_hline to do that. Use the parameter
# linetype in geom_hline to make the
# line dashed.
# The linetype parameter can take the values 0-6:
# 0 = blank, 1 = solid, 2 = dashed
# 3 = dotted, 4 = dotdash, 5 = longdash
# 6 = twodash
# This assignment is not graded and
# will be marked as correct when you submit.
# ENTER YOUR CODE BELOW THIS LINE
# =================================================
# Spot Check DF
#
head(pf.fc_by_age_gender.wide)
## age female male
## 1 13 148.0 55.0
## 2 14 224.0 92.5
## 3 15 276.0 106.5
## 4 16 258.5 136.0
## 5 17 245.5 125.0
## 6 18 243.0 122.0
# Inspiration
#
# ggplot(aes(x = age, y = friend_count_median),
# data = subset(pf.fc_by_age_gender, !is.na(gender))) +
# geom_line(aes(color = gender), stat = 'summary', fun.y = median)
# Actual - from video
ggplot(aes(x = age, y = female / male),
data =pf.fc_by_age_gender.wide) +
geom_line() +
geom_hline(yintercept = 1, alpha = 0.3, linetype = 2)
From the plot, we can easily see that for very young users the median female user has over 2 1/2 times the median male user.
It was helpful to condition on age and understand the relationship between gender and friend count.
This help assure us that this pattern was robust for users of many different ages, and highlighted where this difference was most striking.
NOTE: - ONE way to explore all 4 variables (Friend Count, Age, Gender and Tenure) is to use a 2 dimmensional display like a scatterplot. Wee can bin One of the quantative variables and compare those bins.
In this case we can group users by the year that they joined.
Notes:
# Create a variable called year_joined
# in the pf data frame using the variable
# tenure and 2014 as the reference year.
# The variable year joined should contain the year
# that a user joined facebook.
# See the Instructor Notes for three hints if you get
# stuck. Scroll down slowly to see one hint at a time
# if you would like some guidance.
# This programming exercise WILL BE automatically graded.
# DO NOT ALTER THE CODE BELOW THIS LINE
# ========================================================
pf <- read.delim('pseudo_facebook.tsv')
# ENTER YOUR CODE BELOW THIS LINE.
# ========================================================
# Note: Tenure is measured in days
pf$year_joined <- floor( 2014 - (pf$tenure / 365))
# Spot Check on the variable
summary(pf$year_joined)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 2005 2012 2012 2012 2013 2014 2
# Table Variable
table(pf$year_joined)
##
## 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014
## 9 15 581 1507 4557 5448 9860 33366 43588 70
# ?cut
Notes:
# Create a new variable in the data frame
# called year_joined.bucket by using
# the cut function on the variable year_joined.
# You need to create the following buckets for the
# new variable, year_joined.bucket
# (2004, 2009]
# (2009, 2011]
# (2011, 2012]
# (2012, 2014]
# Note that a parenthesis means exclude the year and a
# bracket means include the year.
# Look up the documentation for cut or try the link
# in the Instructor Notes to accomplish this task.
# DO NOT DELETE THE TWO LINES OF CODE BELOW THIS LINE
# ========================================================================
# pf <- read.delim('/datasets/ud651/pseudo_facebook.tsv')
# pf$year_joined <- floor(2014 - pf$tenure / 365)
# ENTER YOUR CODE BELOW THIS LINE
# ========================================================================
pf$year_joined.bucket <- cut(pf$year_joined,
breaks = c(2004, 2009, 2011, 2012, 2014))
Notes:
table(pf$year_joined.bucket, useNA = 'ifany')
##
## (2004,2009] (2009,2011] (2011,2012] (2012,2014] <NA>
## 6669 15308 33366 43658 2
# Create a line graph of friend_count vs. age
# so that each year_joined.bucket is a line
# tracking the median user friend_count across
# age. This means you should have four different
# lines on your plot.
# You should subset the data to exclude the users
# whose year_joined.bucket is NA.
# If you need a hint, see the Instructor Notes.
# This assignment is not graded and
# will be marked as correct when you submit.
# ENTER YOUR CODE BELOW THIS LINE
# ===================================================
ggplot(aes(x = age, y = friend_count),
data = subset(pf, !is.na(pf$year_joined.bucket))) +
geom_line(aes(color = year_joined.bucket),
stat = 'summary',
fun.y = median)
Notes:
# Write code to do the following:
# (1) Add another geom_line to code below
# to plot the grand mean of the friend count vs age.
# (2) Exclude any users whose year_joined.bucket is NA.
# (3) Use a different line type for the grand mean.
# As a reminder, the parameter linetype can take the values 0-6:
# 0 = blank, 1 = solid, 2 = dashed
# 3 = dotted, 4 = dotdash, 5 = longdash
# 6 = twodash
# This assignment is not graded and
# will be marked as correct when you submit.
# The code from the last programming exercise should
# be your starter code!
# ENTER YOUR CODE BELOW THIS LINE
# ==================================================================
ggplot(aes(x = age, y = friend_count),
data = subset(pf, !is.na(pf$year_joined.bucket))) +
geom_line(aes(color = year_joined.bucket),
stat = 'summary',
fun.y = mean) +
geom_line(stat = 'summary', fun.y = mean, linetype = 2)
Plotting the GRAND MEAN is a good reminder that much of the data is about RECENT cohorts.
This is the type of high level obersation that you want to make as you explore data.
We can also look at tenure and friend count as a rate instead.
How many friends does a user have for EACH day they have been using the service.
Notes:
# Subset the data so that you only consider users with at least ONE day of tenure
# How many friends does a user have for EACH DAY of using the service?
# Here we want a summary of the friend rate.
#
with(subset(pf, tenure >= 1), summary(friend_count / tenure))
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 0.0775 0.2205 0.6096 0.5658 417.0000
with(subset(pf, tenure >= 1), summary(friend_count / tenure))
Min. 1st Qu. Median Mean 3rd Qu. Max.
0.0000 0.0775 0.2205 0.6096 0.5658 417.0000
What is the median friend rate per day? 0.2205
What is the maximum friend rate per day ? 417.0000 <= OUTLIER, considering the 3rd Quartile is only about 0.5.
Notes:
# Create a line graph of mean of friendships_initiated per day (of tenure)
# vs. tenure colored by year_joined.bucket.
# You need to make use of the variables tenure,
# friendships_initiated, and year_joined.bucket.
# You also need to subset the data to only consider user with at least
# one day of tenure.
# This assignment is not graded and
# will be marked as correct when you submit.
# ENTER YOUR CODE BELOW THIS LINE
# ========================================================================
ggplot(aes(x = tenure, y = friendships_initiated / tenure),
data = subset(pf, tenure >= 1)) +
geom_line(aes(color = year_joined.bucket))
There is a LOT of noise on our graph since we are plotting the mean of Y (friendships_initiated / tenure) for every possible X (tenure) value.
Recall from lesson 4 that we can adjust this noise bu fitting our X axis differently.
Examine ONE of those with changes in the bin width.
NOTE: The code changing the binning is substituting x = tenure in the plotting expressions with x = 7 * round(tenure / 7), etc., binning values by the denominator in the round function and then transforming back to the natural scale with the constant in front.
Notes:
# Original Plot which will be used as a base:
ggplot(aes(x = tenure, y = friendships_initiated / tenure),
data = subset(pf, tenure >= 1)) +
geom_line(aes(color = year_joined.bucket),
stat = 'summary',
fun.y = mean)
# MOD # (1) - Use the Number 7 - Higher variance, but lesser bias.
# Substitute a formuls for X so that some of the tenures can be BINNED together:
ggplot(aes(x = 7 * round(tenure / 7), y = friendships_initiated / tenure),
data = subset(pf, tenure > 0)) +
geom_line(aes(color = year_joined.bucket),
stat = "summary",
fun.y = mean)
# ANALYSIS: This gives us slightly LESS noise in the plot, there still is some of the same peaks as prior, but much smoother in general.
# MOD # (2) - use the number 30
#
ggplot(aes(x = 30 * round(tenure / 30), y = friendships_initiated / tenure),
data = subset(pf, tenure > 0)) +
geom_line(aes(color = year_joined.bucket),
stat = "summary",
fun.y = mean)
# MOD # (2) - use the number 90 : Very High Bias and Much less variance
#
ggplot(aes(x = 90 * round(tenure / 90), y = friendships_initiated / tenure),
data = subset(pf, tenure > 0)) +
geom_line(aes(color = year_joined.bucket),
stat = "summary",
fun.y = mean)
Bias vs Variance Combo Chart
NOTICE - that as the bin size increase, we see less noise on the plot.
Our estimates of the means are adjusted since we have more data points for our new value of tenure.
Lesson 4 introduced smoothers as one tool for Analysts to use in these types of situations.
Instead of using geom_line, try using geom_smooth.
Notes:
# Instead of geom_line(), use geom_smooth() to add a smoother to the plot.
# You can use the defaults for geom_smooth() but do color the line
# by year_joined.bucket
# ALTER THE CODE BELOW THIS LINE
# ==============================================================================
# Original Plot
#
ggplot(aes(x = 7 * round(tenure / 7), y = friendships_initiated / tenure),
data = subset(pf, tenure > 0)) +
geom_line(aes(color = year_joined.bucket),
stat = "summary",
fun.y = mean)
# Plot with Smoother added
#
ggplot(aes(x = tenure, y = friendships_initiated / tenure),
data = subset(pf, tenure > 1)) +
geom_smooth(aes(color = year_joined.bucket))
We keep color = year_joined.bucket so we see the segments in our graph
Here: geom_smooth(aes(color = year_joined.bucket)) we are using the defaults* for the smoother so that R will automatically* choose the best statistical methods for our data.
Analysis - here again in the smooth version, we see that friendships initiated decline as tenure increases.
Notes:
NFL GameDay Chart
Notes:
Notes:
# Read in the dataset
yo <- read.csv('yogurt.csv')
# Peruse the structure of the data
glimpse(yo)
## Observations: 2,380
## Variables: 9
## $ obs (int) 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,...
## $ id (int) 2100081, 2100081, 2100081, 2100081, 2100081, 21000...
## $ time (int) 9678, 9697, 9825, 9999, 10015, 10029, 10036, 10042...
## $ strawberry (int) 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,...
## $ blueberry (int) 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ pina.colada (int) 0, 0, 0, 0, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ plain (int) 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ mixed.berry (int) 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3, 1, 2, 2, 0, 3, 3,...
## $ price (dbl) 58.96, 58.96, 65.04, 65.04, 48.96, 65.04, 65.04, 6...
str(yo)
## 'data.frame': 2380 obs. of 9 variables:
## $ obs : int 1 2 3 4 5 6 7 8 9 10 ...
## $ id : int 2100081 2100081 2100081 2100081 2100081 2100081 2100081 2100081 2100081 2100081 ...
## $ time : int 9678 9697 9825 9999 10015 10029 10036 10042 10083 10091 ...
## $ strawberry : int 0 0 0 0 1 1 0 0 0 0 ...
## $ blueberry : int 0 0 0 0 0 0 0 0 0 0 ...
## $ pina.colada: int 0 0 0 0 1 2 0 0 0 0 ...
## $ plain : int 0 0 0 0 0 0 0 0 0 0 ...
## $ mixed.berry: int 1 1 1 1 1 1 1 1 1 1 ...
## $ price : num 59 59 65 65 49 ...
# Change the ID from an **INT** to a **FACTOR**
#
yo$id <- factor(yo$id)
# Peruse the structure of the data
glimpse(yo)
## Observations: 2,380
## Variables: 9
## $ obs (int) 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,...
## $ id (fctr) 2100081, 2100081, 2100081, 2100081, 2100081, 2100...
## $ time (int) 9678, 9697, 9825, 9999, 10015, 10029, 10036, 10042...
## $ strawberry (int) 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,...
## $ blueberry (int) 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ pina.colada (int) 0, 0, 0, 0, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ plain (int) 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ mixed.berry (int) 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3, 1, 2, 2, 0, 3, 3,...
## $ price (dbl) 58.96, 58.96, 65.04, 65.04, 48.96, 65.04, 65.04, 6...
str(yo)
## 'data.frame': 2380 obs. of 9 variables:
## $ obs : int 1 2 3 4 5 6 7 8 9 10 ...
## $ id : Factor w/ 332 levels "2100081","2100370",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ time : int 9678 9697 9825 9999 10015 10029 10036 10042 10083 10091 ...
## $ strawberry : int 0 0 0 0 1 1 0 0 0 0 ...
## $ blueberry : int 0 0 0 0 0 0 0 0 0 0 ...
## $ pina.colada: int 0 0 0 0 1 2 0 0 0 0 ...
## $ plain : int 0 0 0 0 0 0 0 0 0 0 ...
## $ mixed.berry: int 1 1 1 1 1 1 1 1 1 1 ...
## $ price : num 59 59 65 65 49 ...
# Make sure the library is loaded
library(ggplot2)
# Inspiration
# ggplot(data=chol, aes(chol$AGE)) + geom_histogram()
# My answer
ggplot(data=yo, aes(yo$price)) + geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Graders Answer
qplot(data = yo, x = price, fill = I('#F79420'))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# NOTE: Grader marked everything as wrong.
** Analysis**: This dataframes conatins over 2000 observations over 9 different variables for households that buy dannon yogurt over time.
data.frame’: 2380 obs. of 9 variables:
Higher priced yougurt were shown to sell much more (2 to 4X)
There are some empty spaces for adjacent prices - this promts a conversation of price sensitivity.
If we chose a different bin width, we might have **obscured* this discreetness - say a binwidth = 10.
Notes:
summary(yo)
## obs id time strawberry
## Min. : 1.0 2132290: 74 Min. : 9662 Min. : 0.0000
## 1st Qu.: 696.5 2130583: 59 1st Qu.: 9843 1st Qu.: 0.0000
## Median :1369.5 2124073: 50 Median :10045 Median : 0.0000
## Mean :1367.8 2149500: 50 Mean :10050 Mean : 0.6492
## 3rd Qu.:2044.2 2101790: 47 3rd Qu.:10255 3rd Qu.: 1.0000
## Max. :2743.0 2129528: 39 Max. :10459 Max. :11.0000
## (Other):2061
## blueberry pina.colada plain mixed.berry
## Min. : 0.0000 Min. : 0.0000 Min. :0.0000 Min. :0.0000
## 1st Qu.: 0.0000 1st Qu.: 0.0000 1st Qu.:0.0000 1st Qu.:0.0000
## Median : 0.0000 Median : 0.0000 Median :0.0000 Median :0.0000
## Mean : 0.3571 Mean : 0.3584 Mean :0.2176 Mean :0.3887
## 3rd Qu.: 0.0000 3rd Qu.: 0.0000 3rd Qu.:0.0000 3rd Qu.:0.0000
## Max. :12.0000 Max. :10.0000 Max. :6.0000 Max. :8.0000
##
## price
## Min. :20.00
## 1st Qu.:50.00
## Median :65.04
## Mean :59.25
## 3rd Qu.:68.96
## Max. :68.96
##
unique(yo$price)
## [1] 58.96 65.04 48.96 68.96 39.04 24.96 50.00 45.04 33.04 44.00 33.36
## [12] 55.04 62.00 20.00 49.60 49.52 33.28 63.04 33.20 33.52
length(unique(yo$price))
## [1] 20
table(yo$price)
##
## 20 24.96 33.04 33.2 33.28 33.36 33.52 39.04 44 45.04 48.96 49.52
## 2 11 54 1 1 22 1 234 21 11 81 1
## 49.6 50 55.04 58.96 62 63.04 65.04 68.96
## 1 205 6 303 15 2 799 609
# Create a new variable called all.purchases,
# which gives the total counts of yogurt for
# each observation or household.
# One way to do this is using the transform
# function. You can look up the function transform
# and run the examples of code at the bottom of the
# documentation to figure out what it does.
# The transform function produces a data frame
# so if you use it then save the result to 'yo'!
# OR you can figure out another way to create the
# variable.
# DO NOT ALTER THE CODE BELOW THIS LINE
# ========================================================
yo <- read.csv('yogurt.csv')
# ENTER YOUR CODE BELOW THIS LINE
# ========================================================
# Inspiration : dataFrame <- transform(dataFrame, newColumnName = some equation)
yo <- transform(yo, all.purchases = strawberry + blueberry + pina.colada + plain + mixed.berry)
summary(yo$all.purchases)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.000 1.000 2.000 1.971 2.000 21.000
On a different purchase ocassion, how many 8oz yogurts does a household purchase?
TO answer this we need to combine the different yogurt flavors into one variable.
NOTE: - Need the transform function.
Notes:
# Graders Answer
qplot(data = yo, x = all.purchases, binwidth = 1, fill = I('#F79420'))
# ggplot2 version
ggplot(data=yo, aes(all.purchases)) + geom_histogram(binwidth = 1, col="red",
fill="green",
alpha = .2)
# Inspiration
# ggplot(mtc, aes(x = hp, y = mpg))
#
ggplot(aes(x = time, y = price), data = yo) + geom_jitter(alpha = 1/4, shape = 21, fill = I('#F79420') )
Analysis - the histogram reveals that most households buy one or two yogurts at a time.
Analysis - scatterplot - the model of the most common prices seem to be increasing over time.
Notes:
library(ggplot2)
library(dplyr)
# Set the random seed do the results are reproducible
# set.seed(4230)
# Sample 16 of the households
# NOTE: levels provides access to the levels attribute of a variable.
# NOTE: the "sample"" function can be used to return a random permutation of a vector - thus a "sample".
# Notice - the sampling was taken from the levels, because those are all of the "different" households available.
#
# sample.ids <- sample(levels(yo$id), 16)
# Spot Check
# length(sample.ids)
# Plot each purchase ocassion, for each of the households that have been sampled
# HAVE: (1) Time of pruchase, (2) price per item of yogurt, and (3) the number of items.
# USE: Using the "Size"" parameter to add more 'detail' to the plot.
# Note: passing size "all.purchases" so that I can consider the number of items in terms of the size of the 'point' on the plot.
# Note: %in% - means to loop through all of the ID's and create a pannel plot for each ID of our households
#
# ggplot(aes(x = time, y = price),
# data = subset(yo, id %in% sample.ids)) +
# facet_wrap( ~ id) +
# geom_line() +
# geom_point(aes(size = all.purchases), pch = 1)
# My seed number and observations:
#
# Set the random seed do the results are reproducible
# set.seed(6401)
# Sample Households
# sample.ids <- sample(levels(yo$id), 16)
# Plot
# ggplot(aes(x = time, y = price),
# data = subset(yo, id %in% sample.ids)) +
# facet_wrap( ~ id) +
# geom_line() +
# geom_point(aes(size = all.purchases), pch = 1)
Notes:
Notes:
Notes:
# Install GGally
# install.packages("GGally")
# Load the Library
# library(GGally)
# Set the Theme
# theme_set(theme_minimal(20))
# Set the seend for reproducible results
# set.seed(1836)
# pf_subset <- pf[, c(2:15)]
# names(pf_subset)
# NOTE: This could take upwards to an hour to run
# ggpairs(pf_subset[sample.int(nrows(pf_subset), 1000), ])
Notes:
Notes:
# read in the NCI data
nci <- read.table("nci.tsv")
# Changing the columns to produce a nicer plot
colnames(nci) <- c(1:64)
Notes:
# Load reshape2
library(reshape2)
# Melt our data to a LONG format:
nci.long.samp <- melt(as.matrix(nci[1:200,]))
names(nci.long.samp) <- c("gene", "case", "value")
head(nci.long.samp)
## gene case value
## 1 1 1 0.300
## 2 2 1 1.180
## 3 3 1 0.550
## 4 4 1 1.140
## 5 5 1 -0.265
## 6 6 1 -0.070
# Create the HeatMap
#
ggplot(aes(y = gene, x = case, fill = value),
data = nci.long.samp) +
geom_tile() +
scale_fill_gradientn(colours = colorRampPalette(c("blue", "red"))(100))
Reflection:
Click KnitHTML to see all of your hard work and to have an html page of this lesson, your answers, and your notes!