Notes:
Notes:
# Point the session at the folder that holds the data file.
# NOTE(review): hard-coded, machine-specific path; the script only runs where
# this directory exists.
setwd("C:/Dersler/Udacity_R/W3")
library(ggplot2)
# Read the tab-separated data set into `pf` and print its dimensions.
pf <- read.csv('pseudo_facebook.tsv', sep = '\t')
dim(pf)
## [1] 99003 15
# Show the column names and types of `pf`.
str(pf)
## 'data.frame': 99003 obs. of 15 variables:
## $ userid : int 2094382 1192601 2083884 1203168 1733186 1524765 1136133 1680361 1365174 1712567 ...
## $ age : int 14 14 14 14 14 14 13 13 13 13 ...
## $ dob_day : int 19 2 16 25 4 1 14 4 1 2 ...
## $ dob_year : int 1999 1999 1999 1999 1999 1999 2000 2000 2000 2000 ...
## $ dob_month : int 11 11 11 12 12 12 1 1 1 2 ...
## $ gender : Factor w/ 2 levels "female","male": 2 1 2 1 2 2 2 1 2 2 ...
## $ tenure : int 266 6 13 93 82 15 12 0 81 171 ...
## $ friend_count : int 0 0 0 0 0 0 0 0 0 0 ...
## $ friendships_initiated: int 0 0 0 0 0 0 0 0 0 0 ...
## $ likes : int 0 0 0 0 0 0 0 0 0 0 ...
## $ likes_received : int 0 0 0 0 0 0 0 0 0 0 ...
## $ mobile_likes : int 0 0 0 0 0 0 0 0 0 0 ...
## $ mobile_likes_received: int 0 0 0 0 0 0 0 0 0 0 ...
## $ www_likes : int 0 0 0 0 0 0 0 0 0 0 ...
## $ www_likes_received : int 0 0 0 0 0 0 0 0 0 0 ...
#qplot(x = age, y = friend_count, data = pf)
Response: Most people have low friend counts; however, younger people (below 30) tend to have higher friend counts. There are interesting spikes around age 70 and above 100. Users over 100 are probably fake.
Notes:
# Scatter plot of friend count against age, x axis limited to ages 13-90.
ggplot(data = pf, mapping = aes(x = age, y = friend_count)) +
  geom_point() +
  xlim(13, 90)
## Warning: Removed 4906 rows containing missing values (geom_point).
# Min / quartiles / mean / max of the age column.
summary(pf$age)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 13.00 20.00 28.00 37.28 50.00 113.00
Notes:
# Same scatter plot with transparent points (20 overlapping points make one
# fully opaque point) so dense regions stand out.
ggplot(data = pf, mapping = aes(x = age, y = friend_count)) +
  geom_point(alpha = 1 / 20) +
  xlim(13, 90)
## Warning: Removed 4906 rows containing missing values (geom_point).
# Add some noise: jitter the transparent points to spread them out.
ggplot(data = pf, mapping = aes(x = age, y = friend_count)) +
  geom_jitter(alpha = 1 / 20) +
  xlim(13, 90)
## Warning: Removed 5202 rows containing missing values (geom_point).
Response: Even though young users tend to have lots of friends, this plot shows that most young users have relatively few friends. The alpha channel lets us see the density of friend counts better when there is overplotting.
Notes:
# Friend count against age with a square-root transformed y axis.
# The transformations are given through the `x`/`y` arguments of
# coord_trans(); the older `xtrans`/`ytrans` spellings have been superseded.
ggplot(data = pf, mapping = aes(x = age, y = friend_count)) +
  geom_point(alpha = 1 / 20) +
  xlim(13, 90) +
  coord_trans(x = "identity", y = "sqrt")
## Warning: Removed 4906 rows containing missing values (geom_point).
# Square-root y axis again, now with jittered points.
# Jitter is horizontal only (height = 0): vertical noise could move a zero
# count below zero, which the sqrt transformation cannot represent.
# coord_trans() takes the transformations through `x`/`y`; the older
# `xtrans`/`ytrans` spellings have been superseded.
ggplot(data = pf, mapping = aes(x = age, y = friend_count)) +
  geom_point(alpha = 1 / 20, position = position_jitter(height = 0)) +
  xlim(13, 90) +
  coord_trans(x = "identity", y = "sqrt")
## Warning: Removed 5172 rows containing missing values (geom_point).
Looks like older people tend to have a narrow band of friend counts. With sqrt we squeezed the y axis a bit, so outliers came much closer to the median values.
Notes:
# Friendships initiated against age: transparent, horizontally jittered
# points on a square-root y axis.
# coord_trans() takes the transformation through `y`; the older `ytrans`
# spelling has been superseded.
ggplot(data = pf, mapping = aes(x = age, y = friendships_initiated)) +
  geom_point(alpha = 1 / 15, position = position_jitter(height = 0)) +
  xlim(13, 90) +
  coord_trans(y = "sqrt")
## Warning: Removed 5196 rows containing missing values (geom_point).
Notes:
Notes:
library(dplyr)
##
## Attaching package: 'dplyr'
##
## The following object is masked from 'package:stats':
##
## filter
##
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Per-age summary of friend counts (mean, median, number of users),
# ordered by age.
age_groups <- group_by(pf, age)
pf.fc_by_age <- arrange(
  summarise(
    age_groups,
    friend_count_mean = mean(friend_count),
    friend_count_median = median(friend_count),
    n = n()
  ),
  age
)
head(pf.fc_by_age)
## Source: local data frame [6 x 4]
##
## age friend_count_mean friend_count_median n
## 1 13 164.7500 74.0 484
## 2 14 251.3901 132.0 1925
## 3 15 347.6921 161.0 2618
## 4 16 351.9371 171.5 3086
## 5 17 350.3006 156.0 3283
## 6 18 331.1663 162.0 5196
# Same per-age summary written as a pipeline.
# Uses `%>%` (the pipe used elsewhere in this file) and stores the result so
# that the head() call below shows what the pipeline produced.
library(dplyr)
pf.fc_by_age <- pf %>%
  group_by(age) %>%
  summarise(
    friend_count_mean = mean(friend_count),
    friend_count_median = median(friend_count),
    n = n()
  ) %>%
  arrange(age)
head(pf.fc_by_age)
Create your plot!
# Line plot of the mean friend count for each age.
ggplot(data = pf.fc_by_age, mapping = aes(x = age, y = friend_count_mean)) +
  geom_line()
Notes:
# Friendships initiated against age, overlaid with per-age summary lines:
# mean (default line), median (solid blue) and the 10th / 90th percentiles
# (dashed blue).
# The summary function is supplied through `fun` (the name used since
# ggplot2 3.3.0, replacing `fun.y`). The percentile lines wrap quantile() in
# a small function so `probs` is spelled out in full and does not depend on
# extra layer arguments being forwarded to the summary function.
ggplot(data = pf, mapping = aes(x = age, y = friendships_initiated)) +
  geom_point(
    alpha = 0.05,
    position = position_jitter(height = 0),
    color = "orange"
  ) +
  geom_line(stat = "summary", fun = mean) +
  geom_line(
    stat = "summary",
    fun = function(v) quantile(v, probs = 0.1),
    linetype = 2,
    color = "blue"
  ) +
  geom_line(
    stat = "summary",
    fun = function(v) quantile(v, probs = 0.9),
    linetype = 2,
    color = "blue"
  ) +
  geom_line(stat = "summary", fun = median, color = "blue") +
  # Zoom without dropping data, so the summaries still use every row.
  coord_cartesian(xlim = c(13, 70), ylim = c(0, 1000))
Response: The median is less than the mean since friend count has a right-skewed distribution. Age 69 and ages over 80 have a lot of fake data. Young people still have higher friend counts, but most of them have fewer than 1K.
See the Instructor Notes of this video to download Moira’s paper on perceived audience size and to see the final plot.
Notes:
Notes:
# Pearson correlation test between age and friend count over all users.
cor.test(pf$age, pf$friend_count, method = "pearson")
##
## Pearson's product-moment correlation
##
## data: pf$age and pf$friend_count
## t = -8.6268, df = 99001, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.03363072 -0.02118189
## sample estimates:
## cor
## -0.02740737
Look up the documentation for the cor.test function.
What’s the correlation between age and friend count? Round to three decimal places. Response: -0.027
Notes:
# Pearson correlation test restricted to users aged 70 or younger.
with(
  subset(pf, age <= 70),
  cor.test(age, friend_count, method = "pearson")
)
##
## Pearson's product-moment correlation
##
## data: age and friend_count
## t = -52.5923, df = 91029, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.1780220 -0.1654129
## sample estimates:
## cor
## -0.1717245
Notes:
Notes:
# Total likes received against likes received on the web, zoomed to
# 0-1000 on x and 0-2000 on y, with transparent, horizontally jittered points.
ggplot(
  data = pf,
  mapping = aes(x = www_likes_received, y = likes_received)
) +
  geom_point(
    alpha = 0.05,
    position = position_jitter(height = 0),
    color = "red"
  ) +
  coord_cartesian(xlim = c(0, 1000), ylim = c(0, 2000))
Notes:
# Same pair of variables with both axes cut at their 95th percentile and a
# red linear-model fit drawn over the points.
ggplot(
  data = pf,
  mapping = aes(x = www_likes_received, y = likes_received)
) +
  geom_point() +
  xlim(0, quantile(pf$www_likes_received, 0.95)) +
  ylim(0, quantile(pf$likes_received, 0.95)) +
  geom_smooth(method = "lm", color = "red")
## Warning: Removed 6075 rows containing missing values (stat_smooth).
## Warning: Removed 6075 rows containing missing values (geom_point).
What’s the correlation between the two variables? Include the top 5% of values for the variable in the calculation and round to 3 decimal places.
# Pearson correlation test between web likes received and total likes
# received, using every row of `pf`.
# `method` belongs to cor.test(); it was previously handed to with().
with(pf, cor.test(www_likes_received, likes_received, method = "pearson"))
##
## Pearson's product-moment correlation
##
## data: www_likes_received and likes_received
## t = 937.1035, df = 99001, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.9473553 0.9486176
## sample estimates:
## cor
## 0.9479902
Response:
Notes:
Notes:
library(alr3)
## Loading required package: car
# Load the Mitchell data set (used below through its Month and Temp columns).
data(Mitchell)
Create your plot!
# Scatter plot of temperature against month index.
ggplot(data = Mitchell, mapping = aes(x = Month, y = Temp)) +
  geom_point()
Take a guess for the correlation coefficient for the scatterplot. = 0
What is the actual correlation of the two variables? (Round to the thousandths place) =0.057
# Pearson correlation test between Month and Temp.
with(Mitchell, cor.test(Month, Temp, method = "pearson"))
##
## Pearson's product-moment correlation
##
## data: Month and Temp
## t = 0.8182, df = 202, p-value = 0.4142
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.08053637 0.19331562
## sample estimates:
## cor
## 0.05747063
Notes:
# Temperature against month index with an axis break every 12 months, so
# each gap between breaks covers one year.
# Month is numeric, so the breaks are set on a continuous scale rather than
# a discrete one.
ggplot(data = Mitchell, mapping = aes(x = Month, y = Temp)) +
  geom_point() +
  scale_x_continuous(breaks = seq(0, 203, 12))
What do you notice? Response: a sinusoidal (sine-like) shape
Watch the solution video and check out the Instructor Notes! Notes:
Notes:
# Fractional age: whole years plus the share of a year given by
# (12 - dob_month) / 12.
pf$age_with_months <- with(pf, age + (12 - dob_month) / 12)
head(pf, 20)
## userid age dob_day dob_year dob_month gender tenure friend_count
## 1 2094382 14 19 1999 11 male 266 0
## 2 1192601 14 2 1999 11 female 6 0
## 3 2083884 14 16 1999 11 male 13 0
## 4 1203168 14 25 1999 12 female 93 0
## 5 1733186 14 4 1999 12 male 82 0
## 6 1524765 14 1 1999 12 male 15 0
## 7 1136133 13 14 2000 1 male 12 0
## 8 1680361 13 4 2000 1 female 0 0
## 9 1365174 13 1 2000 1 male 81 0
## 10 1712567 13 2 2000 2 male 171 0
## 11 1612453 13 22 2000 2 male 98 0
## 12 2104073 13 1 2000 2 male 55 0
## 13 1918584 13 5 2000 3 male 106 0
## 14 1704433 13 21 2000 3 male 61 0
## 15 1932519 13 28 2000 3 female 0 0
## 16 1751722 13 7 2000 4 female 16 0
## 17 1470850 13 30 2000 5 female 34 0
## 18 1001768 13 23 2000 5 female 25 0
## 19 1537661 13 16 2000 5 female 4 0
## 20 1020296 13 13 2000 8 male 9 0
## friendships_initiated likes likes_received mobile_likes
## 1 0 0 0 0
## 2 0 0 0 0
## 3 0 0 0 0
## 4 0 0 0 0
## 5 0 0 0 0
## 6 0 0 0 0
## 7 0 0 0 0
## 8 0 0 0 0
## 9 0 0 0 0
## 10 0 0 0 0
## 11 0 0 0 0
## 12 0 0 0 0
## 13 0 0 0 0
## 14 0 0 0 0
## 15 0 0 0 0
## 16 0 0 0 0
## 17 0 0 0 0
## 18 0 0 0 0
## 19 0 0 0 0
## 20 0 0 0 0
## mobile_likes_received www_likes www_likes_received age_with_months
## 1 0 0 0 14.08333
## 2 0 0 0 14.08333
## 3 0 0 0 14.08333
## 4 0 0 0 14.00000
## 5 0 0 0 14.00000
## 6 0 0 0 14.00000
## 7 0 0 0 13.91667
## 8 0 0 0 13.91667
## 9 0 0 0 13.91667
## 10 0 0 0 13.83333
## 11 0 0 0 13.83333
## 12 0 0 0 13.83333
## 13 0 0 0 13.75000
## 14 0 0 0 13.75000
## 15 0 0 0 13.75000
## 16 0 0 0 13.66667
## 17 0 0 0 13.58333
## 18 0 0 0 13.58333
## 19 0 0 0 13.58333
## 20 0 0 0 13.33333
# Alternatively:
# pf <- mutate(pf, age_with_months = age + (12 - dob_month) / 12)

# Mean, median and user count for every distinct age_with_months value.
pf.fc_by_age_months <- summarise(
  group_by(pf, age_with_months),
  friend_count_mean = mean(friend_count),
  friend_count_median = median(friend_count),
  n = n()
)
Programming Assignment
# Mean friend count by fractional age, for ages below 71.
ggplot(
  data = subset(pf.fc_by_age_months, age_with_months < 71),
  mapping = aes(x = age_with_months, y = friend_count_mean)
) +
  geom_line()
Notes:
# p2: mean friend count by whole-year age (below 71) with a smoother on top.
p2 <- ggplot(
  data = subset(pf.fc_by_age, age < 71),
  mapping = aes(x = age, y = friend_count_mean)
) +
  geom_line() +
  geom_smooth()
# p1: mean friend count by fractional age (below 71) with a smoother on top.
p1 <- ggplot(
  data = subset(pf.fc_by_age_months, age_with_months < 71),
  mapping = aes(x = age_with_months, y = friend_count_mean)
) +
  geom_line() +
  geom_smooth()
# p3: mean friend count with ages rounded to the nearest multiple of 5,
# for users younger than 71.
# The summary function is supplied through `fun` (the name used since
# ggplot2 3.3.0, replacing `fun.y`).
p3 <- ggplot(
  data = subset(pf, age < 71),
  mapping = aes(x = round(age / 5) * 5, y = friend_count)
) +
  geom_line(stat = "summary", fun = mean)
library(gridExtra)
## Loading required package: grid
# Stack the three plots in a single column for comparison.
grid.arrange(p1, p2, p3, ncol = 1)
## geom_smooth: method="auto" and size of largest group is <1000, so using loess. Use 'method = x' to change the smoothing method.
## geom_smooth: method="auto" and size of largest group is <1000, so using loess. Use 'method = x' to change the smoothing method.
Notes:
Reflection:
Click KnitHTML to see all of your hard work and to have an html page of this lesson, your answers, and your notes!