Notes:
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 3.2.1
library(gridExtra)
## Warning: package 'gridExtra' was built under R version 3.2.1
library(dplyr)
## Warning: package 'dplyr' was built under R version 3.2.1
##
## Attaching package: 'dplyr'
##
## The following objects are masked from 'package:stats':
##
## filter, lag
##
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
getwd()
## [1] "C:/Users/amackay/Documents/R Scripts"
#setwd("~/R Datasources")
list.files()
## [1] "Basic commands.R"
## [2] "Data Analysis with R - Problem Set 3 - Gapminder dataset analysis.rmd"
## [3] "Data Analysis with R - Problem Set 3.rmd"
## [4] "Data Analysis with R - Problem Set 4.rmd"
## [5] "Data Analysis with R - Problem Set 5.rmd"
## [6] "Data_Analysis_with_R_-_Problem_Set_3.html"
## [7] "Data_Analysis_with_R_-_Problem_Set_3_-_Gapminder_dataset_analysis.html"
## [8] "Data_Analysis_with_R_-_Problem_Set_4.html"
## [9] "demystifying.R"
## [10] "demystifyingR2_v3.Rmd"
## [11] "lesson3_student.html"
## [12] "lesson3_student.rmd"
## [13] "lesson4_student.html"
## [14] "lesson4_student.rmd"
## [15] "lesson5_student.rmd"
## [16] "lesson5_student_files"
## [17] "rsconnect"
pf <- read.csv('../R Datasources/pseudo_facebook.tsv',sep = '\t')
names(pf)
## [1] "userid" "age"
## [3] "dob_day" "dob_year"
## [5] "dob_month" "gender"
## [7] "tenure" "friend_count"
## [9] "friendships_initiated" "likes"
## [11] "likes_received" "mobile_likes"
## [13] "mobile_likes_received" "www_likes"
## [15] "www_likes_received"
Notes:
Notes: She added color by age to see if age made a difference to percieved audience size.However, due to overplotting she could not infer any more insights. ***
Notes:
ggplot(aes(x = gender, y = age),
data = subset(pf, !is.na(gender))) + geom_histogram(stat = "identity")
#create new data frame of age and gender summaries
age_by_gender_groups <- group_by(subset(pf, !is.na(gender)), age, gender)
pf.fc_by_age_gender <- summarise(age_by_gender_groups,
mean_friend_count = mean(friend_count),
median_friend_count = median(as.numeric(friend_count)),
n = n())
#summarise removes one layer of grouping.Need to remove the other
pf.fc_by_age_gender <- ungroup(pf.fc_by_age_gender)
head(pf.fc_by_age_gender)
## Source: local data frame [6 x 5]
##
## age gender mean_friend_count median_friend_count n
## 1 13 female 259.1606 148.0 193
## 2 13 male 102.1340 55.0 291
## 3 14 female 362.4286 224.0 847
## 4 14 male 164.1456 92.5 1078
## 5 15 female 538.6813 276.0 1139
## 6 15 male 200.6658 106.5 1478
#alternative syntax using chaining %>%
pf.fc_by_age_gender2 <- pf %>%
filter(!is.na(gender)) %>% #could have used subset on pf as well
group_by(age, gender) %>%
summarise(mean_friend_count = mean(friend_count),
median_friend_count = median(as.numeric(friend_count)),
n = n()) %>%
ungroup() %>%
arrange(age)
Notes:
ggplot(aes(x = age, y = median_friend_count), data = pf.fc_by_age_gender, color = gender) +
geom_line(aes(color = gender))
Notes: How many times more on average does a female have more friends than a male? ***
Notes:
#this dataset is in a long format
head(pf.fc_by_age_gender)
## Source: local data frame [6 x 5]
##
## age gender mean_friend_count median_friend_count n
## 1 13 female 259.1606 148.0 193
## 2 13 male 102.1340 55.0 291
## 3 14 female 362.4286 224.0 847
## 4 14 male 164.1456 92.5 1078
## 5 15 female 538.6813 276.0 1139
## 6 15 male 200.6658 106.5 1478
#to covert to wide would be to transpose the male and female to columns along wth median_age
Notes:
#install.packages('reshape2')
library(reshape2)
## Warning: package 'reshape2' was built under R version 3.2.2
#use the dcast function of reshape2 to create a wide dataset
#in dcast, the columns to keep are to the left of the tilde and to the
#right is the column to transpose and lastly the numercial column
pf.fc_by_age_gender.wide <- dcast(pf.fc_by_age_gender,
age ~ gender,
value.var = 'median_friend_count')
head(pf.fc_by_age_gender.wide)
## age female male
## 1 13 148.0 55.0
## 2 14 224.0 92.5
## 3 15 276.0 106.5
## 4 16 258.5 136.0
## 5 17 245.5 125.0
## 6 18 243.0 122.0
#alternative syntax using tidyr package
#install.packages("tidyr")
library(tidyr)
## Warning: package 'tidyr' was built under R version 3.2.2
library(dplyr)
#long to wide using tidyr
#subset the data to include only the columns we need
pf.fc_by_age_gender.wide2 <- spread(subset(pf.fc_by_age_gender[c('age','gender','median_friend_count')]), gender, median_friend_count)
#use mutate to calc. the ratio of male to female median friend count
pf.fc_by_age_gender.wide2 <- mutate(pf.fc_by_age_gender.wide2, ratio = female/male)
head(pf.fc_by_age_gender.wide2)
## Source: local data frame [6 x 4]
##
## age female male ratio
## 1 13 148.0 55.0 2.690909
## 2 14 224.0 92.5 2.421622
## 3 15 276.0 106.5 2.591549
## 4 16 258.5 136.0 1.900735
## 5 17 245.5 125.0 1.964000
## 6 18 243.0 122.0 1.991803
#alternative syntax using chaining
pf.fc_by_age_gender.wide3 <- subset(pf.fc_by_age_gender[c('age', 'gender', 'median_friend_count')], !is.na(gender)) %>%
spread(gender, median_friend_count) %>%
mutate(ratio = female / male)
head(pf.fc_by_age_gender.wide3)
## Source: local data frame [6 x 4]
##
## age female male ratio
## 1 13 148.0 55.0 2.690909
## 2 14 224.0 92.5 2.421622
## 3 15 276.0 106.5 2.591549
## 4 16 258.5 136.0 1.900735
## 5 17 245.5 125.0 1.964000
## 6 18 243.0 122.0 1.991803
Notes:
#plot the line and add the hline geom at y-intercept 1
ggplot(aes(x = age, y = ratio), data = pf.fc_by_age_gender.wide2) +
geom_line() +
geom_hline(color = 'blue', linetype = 2, alpha = 0.6, yintercept = 1)
Notes: Add tenure in years (rounded to the nearest year) to the dataframe
#use the floor() function to round down to the nearest integer
pf$year_joined <- floor(2014 - pf$tenure / 365)
Notes:
summary(pf$year_joined)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 2005 2012 2012 2012 2013 2014 2
table(pf$year_joined)
##
## 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014
## 9 15 581 1507 4557 5448 9860 33366 43588 70
#use the cut function to bin the categorical variables
pf$year_joined.bucket <- cut(pf$year_joined, c(2004,2009,2011,2012,2014))
table(pf$year_joined.bucket, useNA = 'ifany')
##
## (2004,2009] (2009,2011] (2011,2012] (2012,2014] <NA>
## 6669 15308 33366 43658 2
Notes: Plot the friend count vs age with the age agroups created above ie pf$year_joined.bucket
ggplot(aes(x = age, y = friend_count), data = subset(pf,!is.na(year_joined.bucket))) +
geom_line(aes(color = year_joined.bucket), stat = 'summary', fun.y = median)
Notes:
ggplot(aes(x = age, y = friend_count), data = subset(pf,!is.na(year_joined.bucket))) +
geom_line(aes(color = year_joined.bucket), stat = 'summary', fun.y = mean) +
geom_line(stat = 'summary', fun.y = mean, linetype = 2)
Notes: examine how many friends per day a user had since joining. Subset to exclude tenure =0
pf$friending_rate <- NA
pf.tenure_morethanzero <- subset(pf,tenure >0)
pf.tenure_morethanzero$friending_rate = pf.tenure_morethanzero$friend_count / pf.tenure_morethanzero$tenure
summary(pf.tenure_morethanzero$friending_rate)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 0.0775 0.2205 0.6096 0.5658 417.0000
#alernative syntax
with(subset(pf,tenure >0), summary(friend_count / tenure))
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 0.0775 0.2205 0.6096 0.5658 417.0000
Notes:
What is the median friend rate? 0.2205 What is the maximum friend rate? 417.0000
ggplot(aes(x=tenure, y = friendships_initiated / tenure), data = subset(pf, tenure >0)) +
geom_line(aes(color = year_joined.bucket), stat = 'summary', fun.y = mean)
Notes:
ggplot(aes(x = tenure, y = friendships_initiated / tenure),
data = subset(pf, tenure >= 1)) +
geom_line(aes(color = year_joined.bucket),
stat = 'summary',
fun.y = mean)
ggplot(aes(x = 7 * round(tenure / 7), y = friendships_initiated / tenure),
data = subset(pf, tenure > 0)) +
geom_line(aes(color = year_joined.bucket),
stat = "summary",
fun.y = mean)
ggplot(aes(x = 30 * round(tenure / 30), y = friendships_initiated / tenure),
data = subset(pf, tenure > 0)) +
geom_line(aes(color = year_joined.bucket),
stat = "summary",
fun.y = mean)
ggplot(aes(x = 90 * round(tenure / 90), y = friendships_initiated / tenure),
data = subset(pf, tenure > 0)) +
geom_line(aes(color = year_joined.bucket),
stat = "summary",
fun.y = mean)
ggplot(aes(x=tenure, y = friendships_initiated / tenure), data = subset(pf, tenure >0)) +
geom_smooth(aes(color = year_joined.bucket))
## geom_smooth: method="auto" and size of largest group is >=1000, so using gam with formula: y ~ s(x, bs = "cs"). Use 'method = x' to change the smoothing method.
Notes:
Notes:
Notes:
yo <- read.csv('../R Datasources/yogurt.csv')
str(yo)
## 'data.frame': 2380 obs. of 9 variables:
## $ obs : int 1 2 3 4 5 6 7 8 9 10 ...
## $ id : int 2100081 2100081 2100081 2100081 2100081 2100081 2100081 2100081 2100081 2100081 ...
## $ time : int 9678 9697 9825 9999 10015 10029 10036 10042 10083 10091 ...
## $ strawberry : int 0 0 0 0 1 1 0 0 0 0 ...
## $ blueberry : int 0 0 0 0 0 0 0 0 0 0 ...
## $ pina.colada: int 0 0 0 0 1 2 0 0 0 0 ...
## $ plain : int 0 0 0 0 0 0 0 0 0 0 ...
## $ mixed.berry: int 1 1 1 1 1 1 1 1 1 1 ...
## $ price : num 59 59 65 65 49 ...
#most variables are ints. Need to convert id to a factor
yo$id <- factor(yo$id)
str(yo)
## 'data.frame': 2380 obs. of 9 variables:
## $ obs : int 1 2 3 4 5 6 7 8 9 10 ...
## $ id : Factor w/ 332 levels "2100081","2100370",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ time : int 9678 9697 9825 9999 10015 10029 10036 10042 10083 10091 ...
## $ strawberry : int 0 0 0 0 1 1 0 0 0 0 ...
## $ blueberry : int 0 0 0 0 0 0 0 0 0 0 ...
## $ pina.colada: int 0 0 0 0 1 2 0 0 0 0 ...
## $ plain : int 0 0 0 0 0 0 0 0 0 0 ...
## $ mixed.berry: int 1 1 1 1 1 1 1 1 1 1 ...
## $ price : num 59 59 65 65 49 ...
#plot a histogram. Observe the discretnees of the distribution.
qplot(x = price, data=yo, fill=I('orange'))
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
#set bin width to 10. In this plot you owuld miss the discretness of the distribution
#This would be a very biased model
qplot(x = price, data=yo, fill=I('orange'),binwidth = 10)
Notes:
summary(yo)
## obs id time strawberry
## Min. : 1.0 2132290: 74 Min. : 9662 Min. : 0.0000
## 1st Qu.: 696.5 2130583: 59 1st Qu.: 9843 1st Qu.: 0.0000
## Median :1369.5 2124073: 50 Median :10045 Median : 0.0000
## Mean :1367.8 2149500: 50 Mean :10050 Mean : 0.6492
## 3rd Qu.:2044.2 2101790: 47 3rd Qu.:10255 3rd Qu.: 1.0000
## Max. :2743.0 2129528: 39 Max. :10459 Max. :11.0000
## (Other):2061
## blueberry pina.colada plain mixed.berry
## Min. : 0.0000 Min. : 0.0000 Min. :0.0000 Min. :0.0000
## 1st Qu.: 0.0000 1st Qu.: 0.0000 1st Qu.:0.0000 1st Qu.:0.0000
## Median : 0.0000 Median : 0.0000 Median :0.0000 Median :0.0000
## Mean : 0.3571 Mean : 0.3584 Mean :0.2176 Mean :0.3887
## 3rd Qu.: 0.0000 3rd Qu.: 0.0000 3rd Qu.:0.0000 3rd Qu.:0.0000
## Max. :12.0000 Max. :10.0000 Max. :6.0000 Max. :8.0000
##
## price
## Min. :20.00
## 1st Qu.:50.00
## Median :65.04
## Mean :59.25
## 3rd Qu.:68.96
## Max. :68.96
##
length(unique(yo$price))
## [1] 20
table(yo$price)
##
## 20 24.96 33.04 33.2 33.28 33.36 33.52 39.04 44 45.04 48.96 49.52
## 2 11 54 1 1 22 1 234 21 11 81 1
## 49.6 50 55.04 58.96 62 63.04 65.04 68.96
## 1 205 6 303 15 2 799 609
library(tidyr)
library(dplyr)
#spread(subset(pf.fc_by_age_gender[c('age','gender','median_friend_count')]), gender, median_friend_count)
yo$all.purchases <- yo$strawberry + yo$blueberry + yo$pina.colada + yo$plain + yo$mixed.berry
#alternative syntax using transform
yo <- transform(yo, all.purchases2 = strawberry + blueberry + pina.colada + plain + mixed.berry)
summary(yo)
## obs id time strawberry
## Min. : 1.0 2132290: 74 Min. : 9662 Min. : 0.0000
## 1st Qu.: 696.5 2130583: 59 1st Qu.: 9843 1st Qu.: 0.0000
## Median :1369.5 2124073: 50 Median :10045 Median : 0.0000
## Mean :1367.8 2149500: 50 Mean :10050 Mean : 0.6492
## 3rd Qu.:2044.2 2101790: 47 3rd Qu.:10255 3rd Qu.: 1.0000
## Max. :2743.0 2129528: 39 Max. :10459 Max. :11.0000
## (Other):2061
## blueberry pina.colada plain mixed.berry
## Min. : 0.0000 Min. : 0.0000 Min. :0.0000 Min. :0.0000
## 1st Qu.: 0.0000 1st Qu.: 0.0000 1st Qu.:0.0000 1st Qu.:0.0000
## Median : 0.0000 Median : 0.0000 Median :0.0000 Median :0.0000
## Mean : 0.3571 Mean : 0.3584 Mean :0.2176 Mean :0.3887
## 3rd Qu.: 0.0000 3rd Qu.: 0.0000 3rd Qu.:0.0000 3rd Qu.:0.0000
## Max. :12.0000 Max. :10.0000 Max. :6.0000 Max. :8.0000
##
## price all.purchases all.purchases2
## Min. :20.00 Min. : 1.000 Min. : 1.000
## 1st Qu.:50.00 1st Qu.: 1.000 1st Qu.: 1.000
## Median :65.04 Median : 2.000 Median : 2.000
## Mean :59.25 Mean : 1.971 Mean : 1.971
## 3rd Qu.:68.96 3rd Qu.: 2.000 3rd Qu.: 2.000
## Max. :68.96 Max. :21.000 Max. :21.000
##
Notes:
qplot(x = all.purchases,data = yo, fill=I('blue'), binwidth = 1)
#plot a scatter of prices va time
ggplot(aes(x = time, y = price), data = yo)+
geom_point(alpha = .4)
ggplot(aes(x = time, y = price), data = yo)+
geom_jitter(alpha = .4)
Notes:
Notes: Note: x %in% y returns a logical (boolean) vector the same length as x that says whether each entry in x appears in y. That is, for each entry in x, it checks to see whether it is in y.
This allows us to subset the data so we get all the purchases occasions for the households in the sample. Then, we create scatterplots of price vs. time and facet by the sample id.
Use the pch or shape parameter to specify the symbol when plotting points
#Set the seed for reproducible results
set.seed(1230)
sample.ids <- sample(levels(yo$id), 16)
ggplot(aes(x = time, y = price), data = subset(yo, id %in% sample.ids)) +
facet_wrap(~id) +
geom_line() +
geom_point(aes(size = all.purchases), pch = 1) #pch is for the shape of the point
Notes:
Notes:
Notes:
use this pdf as reference for the generated scatterplot https://s3.amazonaws.com/udacity-hosted-downloads/ud651/scatterplotMatrix.pdf ***
Notes:
Notes:
nci <- read.table("../R Datasources/nci.tsv")
colnames(nci) <- c(1:64)
library(reshape2)
nci.long.samp <- melt(as.matrix(nci[1:200,]))
names(nci.long.samp) <- c("gene", "case", "value")
head(nci.long.samp)
## gene case value
## 1 1 1 0.300
## 2 2 1 1.180
## 3 3 1 0.550
## 4 4 1 1.140
## 5 5 1 -0.265
## 6 6 1 -0.070
ggplot(aes(y = gene, x = case, fill = value),
data = nci.long.samp) +
geom_tile() +
scale_fill_gradientn(colours = colorRampPalette(c("blue", "red"))(100))
Reflection: 1. Third Qualitative Variable 1.1 Why do we need to use ungroup ? 2. what are cohorts? ***
Click KnitHTML to see all of your hard work and to have an html page of this lesson, your answers, and your notes!