Lesson 4

# Machine-specific working directory; the relative data path read later in
# this script ('pseudo_facebook.tsv') is resolved against it.
setwd("D:/R/Udacity/EDA_Course_Materials/lesson4")
library(dplyr)
## 
## Attaching package: 'dplyr'
## 
## The following object is masked from 'package:stats':
## 
##     filter
## 
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

Scatterplots

Notes:

library(ggplot2)
# Load the tab-separated pseudo-Facebook user data.
pf <- read.csv(file = "pseudo_facebook.tsv", sep = "\t")

# Quick scatterplot of friend count against age using the qplot() shorthand.
qplot(age, friend_count, data = pf)

# Equivalent plot written with the full ggplot() syntax.
ggplot(data = pf, mapping = aes(x = age, y = friend_count)) +
  geom_point()


What are some things that you notice right away?

Response: Users under 25 mostly have the largest numbers of friends, but users who report being around 75 or over 100 years old also show a fairly high density of high friend counts. These are probably fake ages that young people entered for fun. ***

ggplot Syntax

Notes:

# Shorthand form: qplot() builds the scatterplot in a single call.
qplot(age, friend_count, data = pf)

# Full ggplot() form, with the x axis limited to ages 13 through 90.
ggplot(data = pf, mapping = aes(x = age, y = friend_count)) +
  geom_point() +
  xlim(13, 90)
## Warning: Removed 4906 rows containing missing values (geom_point).

# Quartiles, mean and range of the reported ages.
summary(pf$age)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   13.00   20.00   28.00   37.28   50.00  113.00

Overplotting

Notes:

# Jitter the points and draw them at 5% opacity to reduce overplotting.
ggplot(data = pf, mapping = aes(x = age, y = friend_count)) +
  geom_jitter(alpha = 0.05) +
  xlim(13, 90)
## Warning: Removed 5183 rows containing missing values (geom_point).

# Quartiles, mean and range of the reported ages.
summary(pf$age)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   13.00   20.00   28.00   37.28   50.00  113.00

What do you notice in the plot?

Response: On average, people don’t have more than 200 friends, but people younger than 30 have friend counts of around 500. The “line” around 65-year-old users looks like the “line” for 25-year-old users. ***

Coord_trans()

Notes:

# Semi-transparent scatterplot with the y axis drawn on a square-root scale.
ggplot(data = pf, mapping = aes(x = age, y = friend_count)) +
  geom_point(alpha = 0.05) +
  xlim(13, 90) +
  coord_trans(y = "sqrt")
## Warning: Removed 4906 rows containing missing values (geom_point).

Look up the documentation for coord_trans() and add a layer to the plot that transforms friend_count using the square root function. Create your plot!

# Same square-root plot, now with jitter. height = 0 leaves the y values
# unjittered (the square root of a negative count would be undefined), so
# points are only spread horizontally.
ggplot(data = pf, mapping = aes(x = age, y = friend_count)) +
  geom_point(alpha = 0.05, position = position_jitter(height = 0)) +
  xlim(13, 90) +
  coord_trans(y = "sqrt")
## Warning: Removed 5184 rows containing missing values (geom_point).

What do you notice?

There are just a few users above the 1000-friend threshold. ***

Alpha and Jitter

Notes: Let’s examine the relationship between friendships_initiated (y) and age (x) using the ggplot syntax.

# Friendships initiated against age: horizontally jittered points at 5%
# opacity, with the y axis on a square-root scale.
ggplot(data = pf, mapping = aes(x = age, y = friendships_initiated)) +
  geom_point(alpha = 0.05, position = position_jitter(height = 0)) +
  xlim(13, 90) +
  coord_trans(y = "sqrt")
## Warning: Removed 5178 rows containing missing values (geom_point).


Overplotting and Domain Knowledge

Notes:


Conditional Means

Notes:

library(dplyr)

# Step-by-step (non-piped) version: group users by age, compute the mean and
# median friend count plus the number of users per age, then sort by age.
age_groups <- group_by(pf, age)
pf.fc_by_age <- arrange(
  summarise(
    age_groups,
    friend_count_mean = mean(friend_count),
    friend_count_median = median(friend_count),
    n = n()
  ),
  age
)

head(pf.fc_by_age)
## Source: local data frame [6 x 4]
## 
##   age friend_count_mean friend_count_median    n
## 1  13          164.7500                74.0  484
## 2  14          251.3901               132.0 1925
## 3  15          347.6921               161.0 2618
## 4  16          351.9371               171.5 3086
## 5  17          350.3006               156.0 3283
## 6  18          331.1663               162.0 5196
# Same per-age summary table, built as a single pipeline.
pf.fc_by_age <- pf %>%
  group_by(age) %>%
  summarise(
    friend_count_mean = mean(friend_count),
    friend_count_median = median(friend_count),
    n = n()
  ) %>%
  arrange(age)

head(pf.fc_by_age)
## Source: local data frame [6 x 4]
## 
##   age friend_count_mean friend_count_median    n
## 1  13          164.7500                74.0  484
## 2  14          251.3901               132.0 1925
## 3  15          347.6921               161.0 2618
## 4  16          351.9371               171.5 3086
## 5  17          350.3006               156.0 3283
## 6  18          331.1663               162.0 5196

Create your plot!

# Column names available in the per-age summary table.
colnames(pf.fc_by_age)
## [1] "age"                 "friend_count_mean"   "friend_count_median"
## [4] "n"
# Mean friend count as a line over age.
ggplot(data = pf.fc_by_age, mapping = aes(x = age, y = friend_count_mean)) +
  geom_line()


Overlaying Summaries with Raw Data

Notes:

# Overlay per-age summaries on the raw scatterplot: the mean (default line
# colour), the median (solid blue) and the 10th/90th percentiles (dashed blue).
# NOTE(review): `fun.y` and passing `probs` straight through are kept exactly
# as the file was written; ggplot2 >= 3.3.0 deprecates `fun.y` in favour of
# `fun` and expects extra arguments in `fun.args` -- confirm the installed
# version before modernising.
ggplot(aes(x = age, y = friend_count), data = pf) +
  geom_point(alpha = 0.05, position = position_jitter(height = 0),
             color = "orange") +
  geom_line(stat = 'summary', fun.y = mean) +
  geom_line(stat = 'summary', fun.y = quantile, probs = .1,
            linetype = 2, color = "blue") +
  geom_line(stat = 'summary', fun.y = quantile, probs = .5,
            color = "blue") +
  geom_line(stat = 'summary', fun.y = quantile, probs = .9,
            linetype = 2, color = "blue") +
  # Zoom in on the plot. The y range starts at 0: the previous lower bound
  # of 13 was the age limit reused on the wrong axis and cut off users with
  # fewer than 13 friends.
  coord_cartesian(xlim = c(13, 70), ylim = c(0, 1000))

What are some of your observations of the plot?

Response: Almost nobody has over 1000 friends, even among young users. 90% of users are below 1000. 90% of users between 35 and 65 have fewer than 250 friends. ***

Moira: Histogram Summary and Scatterplot

See the Instructor Notes of this video to download Moira’s paper on perceived audience size and to see the final plot.

Notes:


Correlation

Notes:

# Pearson correlation test between age and friend count, passing the two
# columns directly.
cor.test(pf$age, pf$friend_count, method = 'pearson')
## 
##  Pearson's product-moment correlation
## 
## data:  pf$age and pf$friend_count
## t = -8.6268, df = 99001, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.03363072 -0.02118189
## sample estimates:
##         cor 
## -0.02740737
# or, equivalently, evaluate the call inside the data frame using with()
with(pf, cor.test(age, friend_count, method = 'pearson'))
## 
##  Pearson's product-moment correlation
## 
## data:  age and friend_count
## t = -8.6268, df = 99001, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.03363072 -0.02118189
## sample estimates:
##         cor 
## -0.02740737

Look up the documentation for the cor.test function.

What’s the correlation between age and friend count? Round to three decimal places. Response: -0.027 (unrounded: -0.02740737) ***

Correlation on Subsets

Notes:

# Repeat the correlation test using only users aged 70 or younger.
pf %>%
  subset(age <= 70) %>%
  with(cor.test(age, friend_count))
## 
##  Pearson's product-moment correlation
## 
## data:  age and friend_count
## t = -52.5923, df = 91029, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.1780220 -0.1654129
## sample estimates:
##        cor 
## -0.1717245

Correlation Methods

Notes:


Create Scatterplots

Notes:

# Likes received overall against likes received via the web, zoomed to the
# region where most points lie.
ggplot(
  data = pf,
  mapping = aes(x = www_likes_received, y = likes_received)
) +
  geom_point(colour = "blue", alpha = 1 / 20) +
  coord_cartesian(xlim = c(0, 3000), ylim = c(0, 5000))


Strong Correlations

Notes:

# Same scatterplot, with both axes limited to the 95th percentile of each
# variable and a linear-model fit drawn in red.
ggplot(
  data = pf,
  mapping = aes(x = www_likes_received, y = likes_received)
) +
  geom_point(colour = "blue", alpha = 1 / 20) +
  xlim(0, quantile(pf$www_likes_received, probs = 0.95)) +
  ylim(0, quantile(pf$likes_received, probs = 0.95)) +
  geom_smooth(method = "lm", colour = "red")
## Warning: Removed 6075 rows containing missing values (stat_smooth).
## Warning: Removed 6075 rows containing missing values (geom_point).

What’s the correlation between the two variables? Include the top 5% of values for the variable in the calculation and round to 3 decimal places.

# Correlation test between web likes received and total likes received,
# computed on all rows of pf (no percentile cut-off is applied here).
cor.test(pf$www_likes_received, pf$likes_received)
## 
##  Pearson's product-moment correlation
## 
## data:  pf$www_likes_received and pf$likes_received
## t = 937.1035, df = 99001, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.9473553 0.9486176
## sample estimates:
##       cor 
## 0.9479902

Response:

Pearson’s product-moment correlation

data: pf$www_likes_received and pf$likes_received; t = 937.1035, df = 99001, p-value < 2.2e-16; alternative hypothesis: true correlation is not equal to 0; 95 percent confidence interval: 0.9473553 0.9486176; sample estimates: cor 0.9479902 (0.948 rounded to three decimal places) ***

More Caution with Correlation

Notes:

library(alr3)
## Loading required package: car

Create your plot!

# Temperature against month for the Mitchell data set.
ggplot(data = Mitchell, mapping = aes(x = Month, y = Temp)) +
  geom_point()


Noisy Scatterplots

  1. Take a guess for the correlation coefficient for the scatterplot.

Looks like there is no correlation between Month and Temp

  2. What is the actual correlation of the two variables? (Round to the thousandths place)
# Correlation test between the Month and Temp columns of Mitchell.
cor.test(Mitchell$Month, Mitchell$Temp)
## 
##  Pearson's product-moment correlation
## 
## data:  Mitchell$Month and Mitchell$Temp
## t = 0.8182, df = 202, p-value = 0.4142
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.08053637  0.19331562
## sample estimates:
##        cor 
## 0.05747063

Making Sense of Data

Notes:

# Month is a numeric variable, so the x axis needs a continuous scale; a
# discrete scale on numeric data leaves the axis without usable breaks in
# current ggplot2. A break every 12 months marks the start of each year.
ggplot(aes(x = Month, y = Temp), data = Mitchell) +
  geom_point() +
  scale_x_continuous(breaks = seq(0, 203, 12))


A New Perspective

What do you notice? Response: The same wave-like cyclical pattern repeats each year.

Watch the solution video and check out the Instructor Notes! Notes:


Understanding Noise: Age to Age Months

Notes:

# Mean friend count per whole-year age, drawn as a line.
ggplot(data = pf.fc_by_age, mapping = aes(x = age, y = friend_count_mean)) +
  geom_line()

# First ten rows of the per-age summary table.
head(pf.fc_by_age, n = 10)
## Source: local data frame [10 x 4]
## 
##    age friend_count_mean friend_count_median    n
## 1   13          164.7500                74.0  484
## 2   14          251.3901               132.0 1925
## 3   15          347.6921               161.0 2618
## 4   16          351.9371               171.5 3086
## 5   17          350.3006               156.0 3283
## 6   18          331.1663               162.0 5196
## 7   19          333.6921               157.0 4391
## 8   20          283.4991               135.0 3769
## 9   21          235.9412               121.0 3671
## 10  22          211.3948               106.0 3032
# Rows 17 to 19 of the per-age summary (ages 29 to 31 in the output below).
pf.fc_by_age[seq(17, 19), ]
## Source: local data frame [3 x 4]
## 
##   age friend_count_mean friend_count_median    n
## 1  29          120.8182                66.0 1936
## 2  30          115.2080                67.5 1716
## 3  31          118.4599                63.0 1694
# Fractional age: add (12 - dob_month) / 12 of a year to the whole-year age.
pf$age_with_months <- with(pf, age + (12 - dob_month) / 12)

Age with Months Means

# Mean and median friend count and number of users for every distinct
# age_with_months value, sorted by that value.
pf.fc_by_age_month <- arrange(
  summarise(
    group_by(pf, age_with_months),
    friend_count_mean = mean(friend_count),
    friend_count_median = median(friend_count),
    n = n()
  ),
  age_with_months
)

Noise in Conditional Means

# Mean friend count by age measured in months, for ages below 71.
ggplot(
  data = subset(pf.fc_by_age_month, age_with_months < 71),
  mapping = aes(x = age_with_months, y = friend_count_mean)
) +
  geom_line()

Smoothing Conditional Means

Notes:

# Yearly conditional means (ages below 71) with a smoother on top.
p1 <- ggplot(
  data = subset(pf.fc_by_age, age < 71),
  mapping = aes(x = age, y = friend_count_mean)
) +
  geom_line() +
  geom_smooth()

# Monthly conditional means (ages below 71) with a smoother on top.
p2 <- ggplot(
  data = subset(pf.fc_by_age_month, age_with_months < 71),
  mapping = aes(x = age_with_months, y = friend_count_mean)
) +
  geom_line() +
  geom_smooth()

# Mean friend count after rounding age to a multiple of five.
p3 <- ggplot(
  data = subset(pf, age < 71),
  mapping = aes(x = round(age / 5) * 5, y = friend_count)
) +
  geom_line(stat = "summary", fun.y = mean)

library(gridExtra)
## Loading required package: grid
# Stack the plots in one column: p2 on top, then p1, then p3.
grid.arrange(p2, p1, p3, ncol = 1)
## geom_smooth: method="auto" and size of largest group is <1000, so using loess. Use 'method = x' to change the smoothing method.
## geom_smooth: method="auto" and size of largest group is <1000, so using loess. Use 'method = x' to change the smoothing method.


Which Plot to Choose?

Notes: We don’t need to choose one plot. Different visualisations tell us different details about the data. ***