Notes: Most people underestimated their actual audience size, and their guesses tended to be round numbers, which show up as horizontal stripes in the scatter plot.
Notes:
# Work from the directory that holds the downloaded data file and echo it.
setwd('~/Downloads')
getwd()
## [1] "/Users/jacob/Downloads"

library(ggplot2)

# The data file is tab-separated, so the separator is set explicitly.
pf <- read.csv(file = 'pseudo_facebook.tsv', sep = '\t')

# The same scatter plot of friend_count against age, drawn twice on purpose:
# qplot() takes x and y either by position or by name.
qplot(age, friend_count, data = pf)
qplot(x = age, y = friend_count, data = pf)
Response: There are vertical stripes on age 69, and a few over 90 which are likely fake. People under 30 have a lot more friends than other ages.
Notes:
# Scatter plot of friend_count against age with the x axis zoomed to 13-90.
# The zoom is done by the coordinate system, not by filtering the data.
ggplot(pf, aes(age, friend_count)) +
  geom_point() +
  coord_cartesian(xlim = c(13, 90))

# Min / quartiles / median / mean / max of the age column.
summary(pf$age)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 13.00 20.00 28.00 37.28 50.00 113.00
Notes:
# Jittered, mostly transparent points (alpha = 1/20) so dense regions of the
# age / friend_count cloud remain readable; x axis zoomed to 13-90.
ggplot(pf, aes(age, friend_count)) +
  geom_jitter(alpha = 1/20) +
  coord_cartesian(xlim = c(13, 90))
Response: the bulk of people under age 25 have under 1000 friends.
Notes:
# Baseline for the transformed plot that follows: jittered points at
# alpha = 1/20 on a linear y axis, x axis zoomed to 13-90.
ggplot(data = pf, mapping = aes(x = age, y = friend_count)) +
  geom_jitter(alpha = 1/20) +
  coord_cartesian(xlim = c(13, 90))
# Friend count against age on a square-root y axis, x axis limited to 13-90.
#
# A plot has exactly one coordinate system: adding coord_trans() after
# coord_cartesian() replaces the latter, which silently dropped the 13-90
# zoom. The x limits are therefore passed to coord_trans() itself.
# NOTE(review): ggplot2 >= 3.3.0 names this argument `xlim` (and deprecates
# `limx`) - confirm against the installed version.
ggplot(aes(x = age, y = friend_count), data = pf) +
  # h = 0: jitter horizontally only, so friend_count itself is never moved
  # (and so never pushed below zero, where sqrt is undefined).
  geom_point(alpha = 1/20, position = position_jitter(h = 0)) +
  coord_trans(y = 'sqrt', limx = c(13, 90))
Notes:
# Friendships initiated against age on a square-root y axis, x axis limited
# to 13-90.
#
# A plot has exactly one coordinate system: adding coord_trans() after
# coord_cartesian() replaces the latter, which silently dropped the 13-90
# zoom. The x limits are therefore passed to coord_trans() itself.
# NOTE(review): ggplot2 >= 3.3.0 names this argument `xlim` (and deprecates
# `limx`) - confirm against the installed version.
ggplot(aes(x = age, y = friendships_initiated), data = pf) +
  # h = 0: jitter horizontally only, leaving the y values untouched.
  geom_point(alpha = 1/20, position = position_jitter(h = 0)) +
  coord_trans(y = 'sqrt', limx = c(13, 90))
Notes:
# `repos` must be the base URL of a CRAN-like repository. The previous call
# passed the URL of one binary archive (.../dplyr_0.5.0.tgz), so R looked for
# a package index underneath that file, could not access it, and reported
# "package 'dplyr' is not available" without installing anything.
install.packages('dplyr', repos = 'https://cran.rstudio.com')
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# One row per age: mean and median friend_count plus the number of users.
age_groups <- group_by(pf, age)
pf.fc_by_age <- summarise(
  age_groups,
  Mean = mean(friend_count),
  Median = median(friend_count),
  Count = n()
)
# The grouping column comes back as "age"; capitalise it to match the others.
names(pf.fc_by_age)[1] <- "Age"
head(pf.fc_by_age, n = 20)
## # A tibble: 20 × 4
## Age Mean Median Count
## <int> <dbl> <dbl> <int>
## 1 13 164.7500 74.0 484
## 2 14 251.3901 132.0 1925
## 3 15 347.6921 161.0 2618
## 4 16 351.9371 171.5 3086
## 5 17 350.3006 156.0 3283
## 6 18 331.1663 162.0 5196
## 7 19 333.6921 157.0 4391
## 8 20 283.4991 135.0 3769
## 9 21 235.9412 121.0 3671
## 10 22 211.3948 106.0 3032
## 11 23 202.8426 93.0 4404
## 12 24 185.7121 92.0 2827
## 13 25 131.0211 62.0 3641
## 14 26 144.0082 75.0 2815
## 15 27 134.1473 72.0 2240
## 16 28 125.8354 66.0 2364
## 17 29 120.8182 66.0 1936
## 18 30 115.2080 67.5 1716
## 19 31 118.4599 63.0 1694
## 20 32 114.2800 63.0 1443
Create your plot!
# Mean friend count per age, drawn as a line from the per-age summary table.
ggplot(pf.fc_by_age, aes(Age, Mean)) +
  geom_line()
Notes:
# Raw points (orange) overlaid with per-age summary lines of friend_count on
# a square-root y axis, x axis limited to 13-90.
#
# A plot has exactly one coordinate system: adding coord_trans() after
# coord_cartesian() replaces the latter, which silently dropped the 13-90
# zoom. The x limits are therefore passed to coord_trans() itself.
# NOTE(review): ggplot2 >= 3.3.0 names this argument `xlim` (deprecating
# `limx`) and renames `fun.y` to `fun` - confirm against the installed version.
ggplot(aes(x = age, y = friend_count), data = pf) +
  # h = 0: jitter horizontally only, leaving friend_count untouched.
  geom_point(alpha = 1/20,
             position = position_jitter(h = 0),
             color = 'orange') +
  coord_trans(y = 'sqrt', limx = c(13, 90)) +
  # Mean friend_count at each age.
  geom_line(stat = 'summary', fun.y = mean,
            color = 'black') +
  # 10th percentile at each age.
  geom_line(stat = 'summary', fun.y = quantile,
            fun.args = list(probs = 0.1),
            color = 'blue',
            linetype = 'dashed') +
  # Median at each age.
  geom_line(stat = 'summary', fun.y = quantile,
            fun.args = list(probs = 0.5),
            color = 'yellow',
            linetype = 'dashed') +
  # 90th percentile at each age.
  geom_line(stat = 'summary', fun.y = quantile,
            fun.args = list(probs = 0.9),
            color = 'red',
            linetype = 'dashed')
See the Instructor Notes of this video to download Moira’s paper on perceived audience size and to see the final plot.
Notes:
Notes:
# Pearson correlation test between age and friend_count over every row of pf,
# passing the two columns directly.
cor.test(pf$age, pf$friend_count, method = 'pearson')
##
## Pearson's product-moment correlation
##
## data: pf$age and pf$friend_count
## t = -8.6268, df = 99001, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.03363072 -0.02118189
## sample estimates:
## cor
## -0.02740737
# Pearson correlation test between age and friend_count; with() evaluates the
# call inside pf so the columns can be named without the pf$ prefix.
with(pf, cor.test(age, friend_count, method = 'pearson'))
##
## Pearson's product-moment correlation
##
## data: age and friend_count
## t = -8.6268, df = 99001, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.03363072 -0.02118189
## sample estimates:
## cor
## -0.02740737
Look up the documentation for the cor.test function.
What’s the correlation between age and friend count? Round to three decimal places. Response: -0.027
Notes:
# Correlation test between age and friend_count restricted to rows with
# age <= 70; no method is given, so cor.test() uses its default.
with(subset(pf, age <= 70), cor.test(age, friend_count))
##
## Pearson's product-moment correlation
##
## data: age and friend_count
## t = -52.592, df = 91029, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.1780220 -0.1654129
## sample estimates:
## cor
## -0.1717245
Notes:
# likes_received against www_likes_received, zoomed to x in [0, 1000] and
# y in [0, 2500] through the coordinate system (no rows are removed).
ggplot(pf, aes(www_likes_received, likes_received)) +
  geom_point(alpha = 1/15) +
  coord_cartesian(xlim = c(0, 1000), ylim = c(0, 2500))
Notes:
# likes_received against www_likes_received with a red linear-model smoother.
# xlim()/ylim() cap each axis at that variable's 95th percentile; unlike a
# coordinate zoom, rows outside the limits are removed before drawing and
# before the smoother is fitted (hence the "Removed ... rows" warnings).
ggplot(pf, aes(www_likes_received, likes_received)) +
  geom_point() +
  geom_smooth(method = 'lm', color = 'red') +
  xlim(0, quantile(pf$www_likes_received, 0.95)) +
  ylim(0, quantile(pf$likes_received, 0.95))
## Warning: Removed 6075 rows containing non-finite values (stat_smooth).
## Warning: Removed 6075 rows containing missing values (geom_point).
What’s the correlation between the two variables? Include the top 5% of values for the variable in the calculation and round to 3 decimal places.
# Correlation test between www_likes_received and likes_received using every
# row of pf (no percentile trimming, unlike the plot limits).
with(pf, cor.test(www_likes_received, likes_received))
##
## Pearson's product-moment correlation
##
## data: www_likes_received and likes_received
## t = 937.1, df = 99001, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.9473553 0.9486176
## sample estimates:
## cor
## 0.9479902
Response: 0.948
Notes: Use correlation to see the relationship between two things.
Notes:
# `repos` must be the base URL of a CRAN-like repository. The previous call
# passed the URL of one binary archive (.../alr3_2.0.5.tgz), so R looked for
# a package index underneath that file, could not access it, and reported
# "package 'alr3' is not available" without installing anything.
# NOTE(review): confirm alr3 is still published on CRAN for your R version.
install.packages('alr3', repos = 'https://cran.rstudio.com')
library(alr3)
## Loading required package: car
##
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
##
## recode
# Load the Mitchell data set into the workspace and open its help page.
data(Mitchell)
help(Mitchell)
Create your plot!
# Temp against Month for the Mitchell data, one point per row.
ggplot(Mitchell, aes(Month, Temp)) +
  geom_point()
Take a guess for the correlation coefficient for the scatterplot.
What is the actual correlation of the two variables? (Round to the thousandths place) 0.057
# Linear-model trend of Temp over Month (smoother only, no points).
ggplot(Mitchell, aes(Month, Temp)) +
  geom_smooth(method = 'lm', color = 'red')

# Correlation test between Month and Temp. The columns are already qualified
# with Mitchell$, so no with() wrapper is needed around the call.
cor.test(Mitchell$Month, Mitchell$Temp)
##
## Pearson's product-moment correlation
##
## data: Mitchell$Month and Mitchell$Temp
## t = 0.81816, df = 202, p-value = 0.4142
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.08053637 0.19331562
## sample estimates:
## cor
## 0.05747063
Notes: Use range to see the max and min limits.
# Smallest and largest Month value; these bound the x axis below.
range(Mitchell$Month)
## [1] 0 203

# Same scatter plot with an axis break every 12 months across 0-203.
ggplot(Mitchell, aes(Month, Temp)) +
  geom_point() +
  scale_x_continuous(limits = c(0, 203), breaks = seq(0, 203, by = 12))
Notes: This way you can see patterns every so often, such as years in this case.
# Fold Month onto a 0-11 cycle with the modulo operator so that every
# 12-month period is overlaid on the same x positions.
ggplot(Mitchell, aes((Month %% 12), Temp)) +
  geom_point()
What do you notice? Response: A wave such as sin or cos.
Watch the solution video and check out the Instructor Notes! Notes: You can use the modulus operator (%%) to reveal patterns that repeat at a fixed interval.
Notes:
# Mean friend count per age as a line, followed by the first ten summary rows.
ggplot(pf.fc_by_age, aes(Age, Mean)) +
  geom_line()
head(pf.fc_by_age, n = 10)
## # A tibble: 10 × 4
## Age Mean Median Count
## <int> <dbl> <dbl> <int>
## 1 13 164.7500 74.0 484
## 2 14 251.3901 132.0 1925
## 3 15 347.6921 161.0 2618
## 4 16 351.9371 171.5 3086
## 5 17 350.3006 156.0 3283
## 6 18 331.1663 162.0 5196
## 7 19 333.6921 157.0 4391
## 8 20 283.4991 135.0 3769
## 9 21 235.9412 121.0 3671
## 10 22 211.3948 106.0 3032
# Rows 17 to 19 of the per-age summary, all columns.
pf.fc_by_age[c(17L, 18L, 19L), ]
## # A tibble: 3 × 4
## Age Mean Median Count
## <int> <dbl> <dbl> <int>
## 1 29 120.8182 66.0 1936
## 2 30 115.2080 67.5 1716
## 3 31 118.4599 63.0 1694
# Age refined by birth month: add the fraction (1 - dob_month / 12) to the
# whole-year age. The arithmetic order is kept as is, because the resulting
# doubles are later used as grouping keys.
pf$age_with_months <- with(pf, age + (1 - dob_month / 12))
Programming Assignment
# One row per distinct age_with_months value: mean and median friend_count
# plus the group size, sorted by age_with_months.
age_groups_with_months <- group_by(pf, age_with_months)
pf.fc_by_age_months <- arrange(
  summarise(
    age_groups_with_months,
    friend_count_mean = mean(friend_count),
    friend_count_median = median(friend_count),
    n = n()
  ),
  age_with_months
)
head(pf.fc_by_age_months, n = 10)
## # A tibble: 10 × 4
## age_with_months friend_count_mean friend_count_median n
## <dbl> <dbl> <dbl> <int>
## 1 13.16667 46.33333 30.5 6
## 2 13.25000 115.07143 23.5 14
## 3 13.33333 136.20000 44.0 25
## 4 13.41667 164.24242 72.0 33
## 5 13.50000 131.17778 66.0 45
## 6 13.58333 156.81481 64.0 54
## 7 13.66667 130.06522 75.5 46
## 8 13.75000 205.82609 122.0 69
## 9 13.83333 215.67742 111.0 62
## 10 13.91667 162.28462 71.0 130
# Mean friend count per age-in-months, keeping only groups below age 71.
ggplot(subset(pf.fc_by_age_months, age_with_months < 71),
       aes(age_with_months, friend_count_mean)) +
  geom_line()
Notes:
# Three views of mean friend count by age (ages below 71), stacked in one
# column so the effect of the bin width can be compared.

# p1: one point per year of age, with a smoother on top.
p1 <- ggplot(subset(pf.fc_by_age, Age < 71), aes(Age, Mean)) +
  geom_line() +
  geom_smooth()

# p2: one point per month of age, with a smoother on top.
p2 <- ggplot(subset(pf.fc_by_age_months, age_with_months < 71),
             aes(age_with_months, friend_count_mean)) +
  geom_line() +
  geom_smooth()

# p3: ages rounded to the nearest multiple of 5, mean computed per bin.
p3 <- ggplot(subset(pf, age < 71),
             aes(round(age / 5) * 5, friend_count)) +
  geom_line(stat = 'summary', fun.y = mean)

library(gridExtra)
##
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
##
## combine
grid.arrange(p1, p2, p3, ncol = 1)
Notes: All of them. They each reveal something unique about the data.