# NOTE(review): hard-coded, machine-specific working directory; the relative
# 'pseudo_facebook.tsv' path read later in this file resolves against it.
setwd("D:/R/Udacity/EDA_Course_Materials/lesson4")
library(dplyr)
##
## Attaching package: 'dplyr'
##
## The following object is masked from 'package:stats':
##
## filter
##
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
Notes:
library(ggplot2)
# Load the tab-separated pseudo-Facebook data set into `pf`.
pf <- read.csv(file = "pseudo_facebook.tsv", sep = "\t")
# Scatter plot of friend_count against age via the qplot shorthand ...
qplot(age, friend_count, data = pf)
# ... and the equivalent full ggplot() call.
ggplot(pf, aes(x = age, y = friend_count)) +
  geom_point()
Response: Mostly people under 25 have the largest numbers of friends, but people who said they are around 75 or over 100 also show a fairly high density of large friend counts. These are probably fake ages that young people entered for fun. ***
Notes:
# The same scatter plot via qplot ...
qplot(age, friend_count, data = pf)
# ... and via ggplot() with the x axis limited to 13-90; the recorded warning
# below shows that rows outside those limits are removed.
ggplot(pf, aes(x = age, y = friend_count)) +
  geom_point() +
  xlim(13, 90)
## Warning: Removed 4906 rows containing missing values (geom_point).
# Summary statistics of the age column.
summary(pf$age)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 13.00 20.00 28.00 37.28 50.00 113.00
Notes:
# Jittered, highly transparent points (alpha = 1/20) to reduce overplotting,
# with the x axis limited to 13-90.
ggplot(pf, aes(x = age, y = friend_count)) +
  geom_jitter(alpha = 0.05) +
  xlim(13, 90)
## Warning: Removed 5183 rows containing missing values (geom_point).
# Summary statistics of the age column.
summary(pf$age)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 13.00 20.00 28.00 37.28 50.00 113.00
Response: On average, people don’t have more than 200 friends, but people younger than 30 have friend counts of around 500. The “line” around 65-year-old users looks like the “line” for 25-year-old users. ***
Notes:
# Transparent scatter of friend_count by age (x limited to 13-90) with the
# y coordinate drawn on a square-root scale.
ggplot(pf, aes(x = age, y = friend_count)) +
  geom_point(alpha = 0.05) +
  xlim(13, 90) +
  coord_trans(y = "sqrt")
## Warning: Removed 4906 rows containing missing values (geom_point).
# Same square-root-scaled scatter, now jittered horizontally only.
# `height = 0` is spelled out in full (the original `h = 0` relied on partial
# argument matching); it keeps friend_count unjittered.
ggplot(pf, aes(x = age, y = friend_count)) +
  geom_point(alpha = 1 / 20, position = position_jitter(height = 0)) +
  xlim(13, 90) +
  coord_trans(y = "sqrt")
## Warning: Removed 5184 rows containing missing values (geom_point).
There are just a few users above the 1000-friend threshold. ***
Notes: Let’s examine the relationship between friendships_initiated (y) and age (x) using the ggplot syntax.
# friendships_initiated by age: transparent points jittered horizontally only,
# x limited to 13-90, y drawn on a square-root scale.
# `height = 0` is spelled out in full (the original `h = 0` relied on partial
# argument matching); it keeps friendships_initiated unjittered.
ggplot(pf, aes(x = age, y = friendships_initiated)) +
  geom_jitter(alpha = 1 / 20, position = position_jitter(height = 0)) +
  xlim(13, 90) +
  coord_trans(y = "sqrt")
## Warning: Removed 5178 rows containing missing values (geom_point).
Notes:
Notes:
library(dplyr)
# Per-age friend-count statistics (mean, median, group size), built with
# nested dplyr calls instead of a pipe, then ordered by age.
age_groups <- group_by(pf, age)
pf.fc_by_age <- arrange(
  summarise(
    age_groups,
    friend_count_mean = mean(friend_count),
    friend_count_median = median(friend_count),
    n = n()
  ),
  age
)
head(pf.fc_by_age)
## Source: local data frame [6 x 4]
##
## age friend_count_mean friend_count_median n
## 1 13 164.7500 74.0 484
## 2 14 251.3901 132.0 1925
## 3 15 347.6921 161.0 2618
## 4 16 351.9371 171.5 3086
## 5 17 350.3006 156.0 3283
## 6 18 331.1663 162.0 5196
# The same per-age summary table, expressed as a single %>% pipeline:
# group by age, compute mean / median / count, then order by age.
pf.fc_by_age <- pf %>%
  group_by(age) %>%
  summarise(
    friend_count_mean = mean(friend_count),
    friend_count_median = median(friend_count),
    n = n()
  ) %>%
  arrange(age)
head(pf.fc_by_age)
## Source: local data frame [6 x 4]
##
## age friend_count_mean friend_count_median n
## 1 13 164.7500 74.0 484
## 2 14 251.3901 132.0 1925
## 3 15 347.6921 161.0 2618
## 4 16 351.9371 171.5 3086
## 5 17 350.3006 156.0 3283
## 6 18 331.1663 162.0 5196
Create your plot!
# List the summary table's columns before plotting.
names(pf.fc_by_age)
## [1] "age" "friend_count_mean" "friend_count_median"
## [4] "n"
# Mean friend count at each age, drawn as a line.
ggplot(pf.fc_by_age, aes(x = age, y = friend_count_mean)) +
  geom_line()
Notes:
# Overlay per-age summary lines on the jittered scatter of friend_count by age:
# the mean (default colour) plus the 10th, 50th and 90th percentiles (blue;
# the 10th and 90th are dashed).
#
# The quantile probability is bound inside each summary function instead of
# being passed as a stray `probs =` argument to geom_line(): ggplot2 >= 2.0.0
# does not forward unknown layer arguments to the summary function.
# NOTE(review): `fun.y` is kept for compatibility with the ggplot2 version this
# transcript was run on; ggplot2 >= 3.3.0 names this argument `fun`.
#
# The y window starts at 0 rather than 13 (13 is the minimum age, not a
# meaningful friend-count bound), so low friend counts are not cut off.
# `height = 0` is spelled out in full instead of the partially matched `h = 0`.
ggplot(pf, aes(x = age, y = friend_count)) +
  geom_point(
    alpha = 0.05,
    position = position_jitter(height = 0),
    color = "orange"
  ) +
  geom_line(stat = "summary", fun.y = mean) +
  geom_line(
    stat = "summary",
    fun.y = function(v) quantile(v, probs = 0.1),
    linetype = 2,
    color = "blue"
  ) +
  geom_line(
    stat = "summary",
    fun.y = function(v) quantile(v, probs = 0.5),
    color = "blue"
  ) +
  geom_line(
    stat = "summary",
    fun.y = function(v) quantile(v, probs = 0.9),
    linetype = 2,
    color = "blue"
  ) +
  coord_cartesian(xlim = c(13, 70), ylim = c(0, 1000))
Response: Almost nobody has over 1000 friends, even among young users. 90% of users are below 1000 friends. 90% of users between 35 and 65 have fewer than 250 friends. ***
See the Instructor Notes of this video to download Moira’s paper on perceived audience size and to see the final plot.
Notes:
Notes:
# Pearson correlation test between age and friend_count, passing the two
# columns of pf explicitly.
cor.test(pf$age, pf$friend_count, method = 'pearson')
##
## Pearson's product-moment correlation
##
## data: pf$age and pf$friend_count
## t = -8.6268, df = 99001, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.03363072 -0.02118189
## sample estimates:
## cor
## -0.02740737
# Or: run the same test inside with(), so the columns of pf can be referred
# to by bare name.
with(pf, cor.test(age, friend_count, method = 'pearson'))
##
## Pearson's product-moment correlation
##
## data: age and friend_count
## t = -8.6268, df = 99001, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.03363072 -0.02118189
## sample estimates:
## cor
## -0.02740737
Look up the documentation for the cor.test function.
What’s the correlation between age and friend count? Round to three decimal places. Response: -0.027 (unrounded: -0.02740737) ***
Notes:
# Repeat the age / friend_count correlation test using only rows with age <= 70.
with(
  subset(pf, subset = age <= 70),
  cor.test(x = age, y = friend_count)
)
##
## Pearson's product-moment correlation
##
## data: age and friend_count
## t = -52.5923, df = 91029, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.1780220 -0.1654129
## sample estimates:
## cor
## -0.1717245
Notes:
Notes:
# likes_received against www_likes_received, zoomed (without dropping data)
# to x in [0, 3000] and y in [0, 5000].
ggplot(pf, aes(x = www_likes_received, y = likes_received)) +
  geom_point(alpha = 0.05, color = "blue") +
  coord_cartesian(xlim = c(0, 3000), ylim = c(0, 5000))
Notes:
# likes_received against www_likes_received with both axes limited to the
# 95th percentile of the respective variable, plus a red linear-model fit.
ggplot(pf, aes(x = www_likes_received, y = likes_received)) +
  geom_point(alpha = 0.05, color = "blue") +
  xlim(0, quantile(pf$www_likes_received, probs = 0.95)) +
  ylim(0, quantile(pf$likes_received, probs = 0.95)) +
  geom_smooth(method = "lm", color = "red")
## Warning: Removed 6075 rows containing missing values (stat_smooth).
## Warning: Removed 6075 rows containing missing values (geom_point).
What’s the correlation between the two variables? Include the top 5% of values for the variable in the calculation and round to 3 decimal places.
# Correlation test between www_likes_received and likes_received on the full
# data (no percentile cut-off), using cor.test()'s default method.
cor.test(pf$www_likes_received, pf$likes_received)
##
## Pearson's product-moment correlation
##
## data: pf$www_likes_received and pf$likes_received
## t = 937.1035, df = 99001, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.9473553 0.9486176
## sample estimates:
## cor
## 0.9479902
Response:
Pearson’s product-moment correlation
data: pf$www_likes_received and pf$likes_received; t = 937.1035, df = 99001, p-value < 2.2e-16; alternative hypothesis: true correlation is not equal to 0; 95 percent confidence interval: 0.9473553 0.9486176; sample estimates: cor 0.9479902 (0.948 to three decimal places) ***
Notes:
library(alr3)
## Loading required package: car
Create your plot!
# Scatter plot of Temp against Month from the Mitchell data set.
ggplot(Mitchell, aes(x = Month, y = Temp)) +
  geom_point()
It looks like there is no correlation between Month and Temp.
# Correlation test between Month and Temp in the Mitchell data set.
cor.test(Mitchell$Month, Mitchell$Temp)
##
## Pearson's product-moment correlation
##
## data: Mitchell$Month and Mitchell$Temp
## t = 0.8182, df = 202, p-value = 0.4142
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.08053637 0.19331562
## sample estimates:
## cor
## 0.05747063
Notes:
# Temp against Month with an x-axis break every 12 months, so that each
# interval between breaks spans one year.
# Month is numeric, so the breaks are set on a continuous scale; the original
# scale_x_discrete() does not apply numeric breaks to a continuous variable in
# current ggplot2 releases.
ggplot(Mitchell, aes(x = Month, y = Temp)) +
  geom_point() +
  scale_x_continuous(breaks = seq(0, 203, 12))
What do you notice? Response: The same wave-like cyclical pattern repeats each year.
Watch the solution video and check out the Instructor Notes! Notes:
Notes:
# Mean friend count at each age as a line, followed by the first ten rows of
# the per-age summary table.
ggplot(pf.fc_by_age, aes(x = age, y = friend_count_mean)) +
  geom_line()
head(pf.fc_by_age, n = 10)
## Source: local data frame [10 x 4]
##
## age friend_count_mean friend_count_median n
## 1 13 164.7500 74.0 484
## 2 14 251.3901 132.0 1925
## 3 15 347.6921 161.0 2618
## 4 16 351.9371 171.5 3086
## 5 17 350.3006 156.0 3283
## 6 18 331.1663 162.0 5196
## 7 19 333.6921 157.0 4391
## 8 20 283.4991 135.0 3769
## 9 21 235.9412 121.0 3671
## 10 22 211.3948 106.0 3032
# Rows 17 through 19 of the per-age summary table.
pf.fc_by_age[seq(17, 19), ]
## Source: local data frame [3 x 4]
##
## age friend_count_mean friend_count_median n
## 1 29 120.8182 66.0 1936
## 2 30 115.2080 67.5 1716
## 3 31 118.4599 63.0 1694
# Fractional age: whole years plus (12 - dob_month) / 12.
pf$age_with_months <- with(pf, age + (12 - dob_month) / 12)

# Friend-count statistics (mean, median, group size) per fractional age,
# ordered by that age.
pf.fc_by_age_month <- pf %>%
  group_by(age_with_months) %>%
  summarise(
    friend_count_mean = mean(friend_count),
    friend_count_median = median(friend_count),
    n = n()
  ) %>%
  arrange(age_with_months)

# Mean friend count by fractional age, restricted to ages below 71.
ggplot(
  subset(pf.fc_by_age_month, age_with_months < 71),
  aes(x = age_with_months, y = friend_count_mean)
) +
  geom_line()
Notes:
# Three views of mean friend count by age (ages below 71), stacked in a
# single column for comparison.

# p1: per-year means with a smoother on top.
p1 <- ggplot(
  subset(pf.fc_by_age, age < 71),
  aes(x = age, y = friend_count_mean)
) +
  geom_line() +
  geom_smooth()

# p2: per-fractional-age means with a smoother on top.
p2 <- ggplot(
  subset(pf.fc_by_age_month, age_with_months < 71),
  aes(x = age_with_months, y = friend_count_mean)
) +
  geom_line() +
  geom_smooth()

# p3: mean friend count with age rounded to the nearest multiple of 5.
p3 <- ggplot(
  subset(pf, age < 71),
  aes(x = round(age / 5) * 5, y = friend_count)
) +
  geom_line(stat = "summary", fun.y = mean)

library(gridExtra)
## Loading required package: grid
grid.arrange(p2, p1, p3, ncol = 1)
## geom_smooth: method="auto" and size of largest group is <1000, so using loess. Use 'method = x' to change the smoothing method.
## geom_smooth: method="auto" and size of largest group is <1000, so using loess. Use 'method = x' to change the smoothing method.
Notes: We don’t need to choose one plot. Different visualisations tell us different details about the data. ***