Lesson 4


Scatterplots and Perceived Audience Size

Notes: Most people guessed lower than their actual audience size, and the guesses were usually round numbers, as seen in the horizontal stripes in the scatter plot.


Scatterplots

Notes:

# Switch to the data folder so the relative file path below resolves.
setwd('~/Downloads')
getwd()
## [1] "/Users/jacob/Downloads"
library(ggplot2)
# read.delim() is read.csv() with the separator defaulting to a tab.
pf <- read.delim('pseudo_facebook.tsv')

# The same scatterplot twice: named arguments first, then positional ones.
qplot(x = age, y = friend_count, data = pf)

qplot(age, friend_count, data = pf)


What are some things that you notice right away?

Response: There are vertical stripes at age 69 and at a few ages over 90, which are likely fake. People under 30 have many more friends than people of other ages.


ggplot Syntax

Notes:

# Age vs. friend count, zoomed to ages 13-90 (zooming keeps every row).
ggplot(pf, aes(x = age, y = friend_count)) +
  geom_point() +
  coord_cartesian(xlim = c(13, 90))

# Five-number summary (plus mean) of the age column.
summary(pf$age)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   13.00   20.00   28.00   37.28   50.00  113.00

Overplotting

Notes:

# Jittered, mostly transparent points (20 overlapping points give full
# opacity) to reduce overplotting.
ggplot(pf, aes(x = age, y = friend_count)) +
  geom_jitter(alpha = 0.05) +
  coord_cartesian(xlim = c(13, 90))

What do you notice in the plot?

Response: the bulk of people under age 25 have under 1000 friends.


Coord_trans()

Notes:

# Baseline plot before transforming the y axis: geom_jitter() written out as
# geom_point() with the default jitter position.
ggplot(pf, aes(x = age, y = friend_count)) +
  geom_point(alpha = 1/20, position = position_jitter()) +
  coord_cartesian(xlim = c(13, 90))

Look up the documentation for coord_trans() and add a layer to the plot that transforms friend_count using the square root function. Create your plot!

# Friend count on a square-root y axis.
# A plot has exactly one coordinate system: adding coord_trans() after
# coord_cartesian() replaces it, so the xlim given there was never applied.
# The age range is set on the x scale with xlim() instead; rows outside
# 13-90 are dropped (ggplot2 reports them in a warning).
# Jitter is horizontal only (h = 0) so no count is pushed below zero before
# the square root is taken.
ggplot(aes(x = age, y = friend_count), data = pf) +
  geom_point(alpha = 1/20, position = position_jitter(h = 0)) +
  xlim(13, 90) +
  coord_trans(y = 'sqrt')

Alpha and Jitter

Notes:

# Friendships initiated vs. age on a square-root y axis.
# coord_trans() replaces any coordinate system added before it, so a
# preceding coord_cartesian(xlim = ...) has no effect; the 13-90 age range is
# set with xlim() on the x scale instead (rows outside it are dropped with a
# warning). Horizontal-only jitter (h = 0) keeps counts non-negative.
ggplot(aes(x = age, y = friendships_initiated), data = pf) +
  geom_point(alpha = 1/20, position = position_jitter(h = 0)) +
  xlim(13, 90) +
  coord_trans(y = 'sqrt')


Conditional Means

Notes:

# Install dplyr only when it is missing, so knitting does not reinstall it
# every time. `repos` must be the base URL of a CRAN mirror; pointing it at a
# package archive (.tgz) makes the index lookup fail and nothing is installed.
if (!requireNamespace('dplyr', quietly = TRUE)) {
  install.packages('dplyr', repos = 'https://cran.rstudio.com')
}
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# One row per age: mean and median friend count plus the number of users,
# with the columns renamed for display.
age_groups <- group_by(pf, age)
pf.fc_by_age <- setNames(
  summarise(age_groups,
            friend_count_mean = mean(friend_count),
            friend_count_median = median(friend_count),
            n = n()),
  c("Age", "Mean", "Median", "Count")
)

head(pf.fc_by_age, 20)
## # A tibble: 20 × 4
##      Age     Mean Median Count
##    <int>    <dbl>  <dbl> <int>
## 1     13 164.7500   74.0   484
## 2     14 251.3901  132.0  1925
## 3     15 347.6921  161.0  2618
## 4     16 351.9371  171.5  3086
## 5     17 350.3006  156.0  3283
## 6     18 331.1663  162.0  5196
## 7     19 333.6921  157.0  4391
## 8     20 283.4991  135.0  3769
## 9     21 235.9412  121.0  3671
## 10    22 211.3948  106.0  3032
## 11    23 202.8426   93.0  4404
## 12    24 185.7121   92.0  2827
## 13    25 131.0211   62.0  3641
## 14    26 144.0082   75.0  2815
## 15    27 134.1473   72.0  2240
## 16    28 125.8354   66.0  2364
## 17    29 120.8182   66.0  1936
## 18    30 115.2080   67.5  1716
## 19    31 118.4599   63.0  1694
## 20    32 114.2800   63.0  1443

Create your plot!

# Mean friend count as a function of age.
ggplot(pf.fc_by_age, aes(x = Age, y = Mean)) +
  geom_line()


Overlaying Summaries with Raw Data

Notes:

# Raw points (orange) overlaid with per-age summary lines on a square-root
# y axis: the mean (solid black) and the 10th, 50th and 90th percentiles
# (dashed blue, yellow and red).
# A plot has exactly one coordinate system: coord_trans() replaces a
# preceding coord_cartesian(), so the xlim given there was never applied.
# Ages are limited with xlim() on the x scale instead; rows outside 13-90 are
# dropped (with a warning) before the per-age summaries are computed, which
# leaves the summaries for the remaining ages unchanged.
ggplot(aes(x = age, y = friend_count), data = pf) +
  geom_point(alpha = 1/20,
             position = position_jitter(h = 0),
             color = 'orange') +
  xlim(13, 90) +
  coord_trans(y = 'sqrt') +
  geom_line(stat = 'summary', fun.y = mean,
            color = 'black') +
  geom_line(stat = 'summary', fun.y = quantile,
            fun.args = list(probs = 0.1),
            color = 'blue',
            linetype = 'dashed') +
  geom_line(stat = 'summary', fun.y = quantile,
            fun.args = list(probs = 0.5),
            color = 'yellow',
            linetype = 'dashed') +
  geom_line(stat = 'summary', fun.y = quantile,
            fun.args = list(probs = 0.9),
            color = 'red',
            linetype = 'dashed')

Moira: Histogram Summary and Scatterplot

See the Instructor Notes of this video to download Moira’s paper on perceived audience size and to see the final plot.

Notes:


Correlation

Notes:

# Pearson correlation between age and friend count, written two ways:
# first with explicit pf$ columns, then with with() so bare column names can
# be used. Only the `data:` label in the printout differs.
cor.test(pf$age, pf$friend_count, method = 'pearson')
## 
##  Pearson's product-moment correlation
## 
## data:  pf$age and pf$friend_count
## t = -8.6268, df = 99001, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.03363072 -0.02118189
## sample estimates:
##         cor 
## -0.02740737
with(pf, cor.test(age, friend_count, method = 'pearson'))
## 
##  Pearson's product-moment correlation
## 
## data:  age and friend_count
## t = -8.6268, df = 99001, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.03363072 -0.02118189
## sample estimates:
##         cor 
## -0.02740737

Look up the documentation for the cor.test function.

What’s the correlation between age and friend count? Round to three decimal places. Response: -0.027


Correlation on Subsets

Notes:

# Same correlation test restricted to users aged 70 or younger; no method is
# given, and the printout shows the Pearson test was used.
with(subset(pf, age <= 70), cor.test(age, friend_count))
## 
##  Pearson's product-moment correlation
## 
## data:  age and friend_count
## t = -52.592, df = 91029, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.1780220 -0.1654129
## sample estimates:
##        cor 
## -0.1717245

Create Scatterplots

Notes:

# Likes received via the website vs. total likes received, zoomed to the
# region x in [0, 1000], y in [0, 2500].
ggplot(pf, aes(x = www_likes_received, y = likes_received)) +
  geom_point(alpha = 1/15) +
  coord_cartesian(xlim = c(0, 1000), ylim = c(0, 2500))


Strong Correlations

Notes:

# Cap both axes at their 95th percentile and add a linear fit in red.
# xlim()/ylim() discard the rows beyond the limits, hence the warnings below.
ggplot(pf, aes(x = www_likes_received, y = likes_received)) +
  geom_point() +
  xlim(0, quantile(pf$www_likes_received, probs = 0.95)) +
  ylim(0, quantile(pf$likes_received, probs = 0.95)) +
  geom_smooth(method = 'lm', color = 'red')
## Warning: Removed 6075 rows containing non-finite values (stat_smooth).
## Warning: Removed 6075 rows containing missing values (geom_point).

What’s the correlation between the two variables? Include the top 5% of values for the variable in the calculation and round to 3 decimal places.

# Correlation between website likes and total likes over all rows of pf
# (no subsetting, so the top 5% of values are included).
with(pf, cor.test(www_likes_received, likes_received))
## 
##  Pearson's product-moment correlation
## 
## data:  www_likes_received and likes_received
## t = 937.1, df = 99001, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.9473553 0.9486176
## sample estimates:
##       cor 
## 0.9479902

Response: 0.948


Moira on Correlation

Notes: Use correlation to see the relationship between two things.


More Caution with Correlation

Notes:

# Install alr3 only when it is missing. `repos` must be the base URL of a
# CRAN mirror; pointing it at a package archive (.tgz) makes the index lookup
# fail and nothing is installed.
if (!requireNamespace('alr3', quietly = TRUE)) {
  install.packages('alr3', repos = 'https://cran.rstudio.com')
}
library(alr3)
## Loading required package: car
## 
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
## 
##     recode
# Load the Mitchell data set and open its help page.
data("Mitchell")
?Mitchell

Create your plot!

# Temperature against month index for the Mitchell data.
ggplot(Mitchell, aes(x = Month, y = Temp)) +
  geom_point()


Noisy Scatterplots

  1. Take a guess for the correlation coefficient for the scatterplot.

  2. What is the actual correlation of the two variables? (Round to the thousandths place) 0.057

# Linear fit of Temp on Month, drawn on its own (no points layer).
ggplot(Mitchell, aes(x = Month, y = Temp)) +
  geom_smooth(method = 'lm', color = 'red')

# Correlation between Month and Temp. The columns are referenced through
# Mitchell$ directly, so no with() wrapper is needed.
cor.test(Mitchell$Month, Mitchell$Temp)
## 
##  Pearson's product-moment correlation
## 
## data:  Mitchell$Month and Mitchell$Temp
## t = 0.81816, df = 202, p-value = 0.4142
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.08053637  0.19331562
## sample estimates:
##        cor 
## 0.05747063

Making Sense of Data

Notes: Use range to see the max and min limits.

# Smallest and largest month index.
range(Mitchell$Month)
## [1]   0 203
# Axis break every 12 months so each tick marks one year.
ggplot(Mitchell, aes(x = Month, y = Temp)) +
  geom_point() +
  scale_x_continuous(limits = c(0, 203), breaks = seq(0, 203, by = 12))

Notes: This way you can see patterns every so often, such as years in this case.

# Fold the month index onto a single year (0-11) so all years overlap.
ggplot(Mitchell, aes(x = (Month %% 12), y = Temp)) +
  geom_point()


A New Perspective

What do you notice? Response: A wave such as sin or cos.

Watch the solution video and check out the Instructor Notes! Notes: You can use the modulo operator (%%) to reveal repeating patterns.


Understanding Noise: Age to Age Months

Notes:

# Mean friend count by age (in whole years).
ggplot(pf.fc_by_age, aes(x = Age, y = Mean)) +
  geom_line()

# First ten ages, then rows 17-19 (ages 29-31).
head(pf.fc_by_age, n = 10)
## # A tibble: 10 × 4
##      Age     Mean Median Count
##    <int>    <dbl>  <dbl> <int>
## 1     13 164.7500   74.0   484
## 2     14 251.3901  132.0  1925
## 3     15 347.6921  161.0  2618
## 4     16 351.9371  171.5  3086
## 5     17 350.3006  156.0  3283
## 6     18 331.1663  162.0  5196
## 7     19 333.6921  157.0  4391
## 8     20 283.4991  135.0  3769
## 9     21 235.9412  121.0  3671
## 10    22 211.3948  106.0  3032
pf.fc_by_age[17:19, ]
## # A tibble: 3 × 4
##     Age     Mean Median Count
##   <int>    <dbl>  <dbl> <int>
## 1    29 120.8182   66.0  1936
## 2    30 115.2080   67.5  1716
## 3    31 118.4599   63.0  1694

Age with Months Means

# Fractional age: whole years plus the share of a year left after the birth
# month. The arithmetic order is kept as-is because the result is used as a
# grouping key later.
pf$age_with_months <- with(pf, age + (1 - dob_month / 12))

Programming Assignment

# One row per fractional age: mean and median friend count plus the number
# of users, sorted by age.
age_groups_with_months <- group_by(pf, age_with_months)

pf.fc_by_age_months <- arrange(
  summarise(age_groups_with_months,
            friend_count_mean = mean(friend_count),
            friend_count_median = median(friend_count),
            n = n()),
  age_with_months
)

head(pf.fc_by_age_months, n = 10)
## # A tibble: 10 × 4
##    age_with_months friend_count_mean friend_count_median     n
##              <dbl>             <dbl>               <dbl> <int>
## 1         13.16667          46.33333                30.5     6
## 2         13.25000         115.07143                23.5    14
## 3         13.33333         136.20000                44.0    25
## 4         13.41667         164.24242                72.0    33
## 5         13.50000         131.17778                66.0    45
## 6         13.58333         156.81481                64.0    54
## 7         13.66667         130.06522                75.5    46
## 8         13.75000         205.82609               122.0    69
## 9         13.83333         215.67742               111.0    62
## 10        13.91667         162.28462                71.0   130

Noise in Conditional Means

# Mean friend count by fractional age, for ages below 71.
ggplot(subset(pf.fc_by_age_months, age_with_months < 71),
       aes(x = age_with_months, y = friend_count_mean)) +
  geom_line()


Smoothing Conditional Means

Notes:

# p1: mean friend count by whole-year age (< 71) with a smoother on top.
p1 <- ggplot(subset(pf.fc_by_age, Age < 71), aes(x = Age, y = Mean)) +
  geom_line() +
  geom_smooth()

# p2: the same view using fractional ages.
p2 <- ggplot(subset(pf.fc_by_age_months, age_with_months < 71),
             aes(x = age_with_months, y = friend_count_mean)) +
  geom_line() +
  geom_smooth()

# p3: mean friend count with age rounded to the nearest multiple of 5.
p3 <- ggplot(subset(pf, age < 71),
             aes(x = round(age / 5) * 5, y = friend_count)) +
  geom_line(stat = 'summary', fun.y = mean)

library(gridExtra)
## 
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
## 
##     combine
# Stack the three plots in a single column for comparison.
grid.arrange(p1, p2, p3, ncol = 1)


Which Plot to Choose?

Notes: All of them. They each reveal something unique about the data.